PostgreSQL Source Code  git master
bufmgr.c File Reference
#include "postgres.h"
#include <sys/file.h>
#include <unistd.h>
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
#include "catalog/catalog.h"
#include "catalog/storage.h"
#include "catalog/storage_xlog.h"
#include "executor/instrument.h"
#include "lib/binaryheap.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "storage/standby.h"
#include "utils/memdebug.h"
#include "utils/ps_status.h"
#include "utils/rel.h"
#include "utils/resowner_private.h"
#include "utils/timestamp.h"
#include <lib/sort_template.h>
Include dependency graph for bufmgr.c:

Go to the source code of this file.

Data Structures

struct  PrivateRefCountEntry
 
struct  CkptTsStatus
 
struct  SMgrSortArray
 

Macros

#define BufHdrGetBlock(bufHdr)   ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
 
#define BufferGetLSN(bufHdr)   (PageGetLSN(BufHdrGetBlock(bufHdr)))
 
#define LocalBufHdrGetBlock(bufHdr)    LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
 
#define BUF_WRITTEN   0x01
 
#define BUF_REUSABLE   0x02
 
#define RELS_BSEARCH_THRESHOLD   20
 
#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)
 
#define REFCOUNT_ARRAY_ENTRIES   8
 
#define BufferIsPinned(bufnum)
 
#define ST_SORT   sort_checkpoint_bufferids
 
#define ST_ELEMENT_TYPE   CkptSortItem
 
#define ST_COMPARE(a, b)   ckpt_buforder_comparator(a, b)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 
#define ST_SORT   sort_pending_writebacks
 
#define ST_ELEMENT_TYPE   PendingWriteback
 
#define ST_COMPARE(a, b)   buffertag_comparator(&a->tag, &b->tag)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 

Typedefs

typedef struct PrivateRefCountEntry PrivateRefCountEntry
 
typedef struct CkptTsStatus CkptTsStatus
 
typedef struct SMgrSortArray SMgrSortArray
 

Functions

static void ReservePrivateRefCountEntry (void)
 
static PrivateRefCountEntry * NewPrivateRefCountEntry (Buffer buffer)
 
static PrivateRefCountEntry * GetPrivateRefCountEntry (Buffer buffer, bool do_move)
 
static int32 GetPrivateRefCount (Buffer buffer)
 
static void ForgetPrivateRefCountEntry (PrivateRefCountEntry *ref)
 
static Buffer ReadBuffer_common (SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool *hit)
 
static bool PinBuffer (BufferDesc *buf, BufferAccessStrategy strategy)
 
static void PinBuffer_Locked (BufferDesc *buf)
 
static void UnpinBuffer (BufferDesc *buf)
 
static void BufferSync (int flags)
 
static uint32 WaitBufHdrUnlocked (BufferDesc *buf)
 
static int SyncOneBuffer (int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 
static void WaitIO (BufferDesc *buf)
 
static bool StartBufferIO (BufferDesc *buf, bool forInput)
 
static void TerminateBufferIO (BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
 
static void shared_buffer_write_error_callback (void *arg)
 
static void local_buffer_write_error_callback (void *arg)
 
static BufferDesc * BufferAlloc (SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext *io_context)
 
static void FlushBuffer (BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
 
static void FindAndDropRelationBuffers (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
 
static void RelationCopyStorageUsingBuffer (RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
 
static void AtProcExit_Buffers (int code, Datum arg)
 
static void CheckForBufferLeaks (void)
 
static int rlocator_comparator (const void *p1, const void *p2)
 
static int buffertag_comparator (const BufferTag *ba, const BufferTag *bb)
 
static int ckpt_buforder_comparator (const CkptSortItem *a, const CkptSortItem *b)
 
static int ts_ckpt_progress_comparator (Datum a, Datum b, void *arg)
 
PrefetchBufferResult PrefetchSharedBuffer (SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
 
PrefetchBufferResult PrefetchBuffer (Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 
bool ReadRecentBuffer (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
 
Buffer ReadBuffer (Relation reln, BlockNumber blockNum)
 
Buffer ReadBufferExtended (Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
Buffer ReadBufferWithoutRelcache (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
 
static void InvalidateBuffer (BufferDesc *buf)
 
void MarkBufferDirty (Buffer buffer)
 
Buffer ReleaseAndReadBuffer (Buffer buffer, Relation relation, BlockNumber blockNum)
 
bool BgBufferSync (WritebackContext *wb_context)
 
void AtEOXact_Buffers (bool isCommit)
 
void InitBufferPoolAccess (void)
 
void PrintBufferLeakWarning (Buffer buffer)
 
void CheckPointBuffers (int flags)
 
void BufmgrCommit (void)
 
BlockNumber BufferGetBlockNumber (Buffer buffer)
 
void BufferGetTag (Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
 
BlockNumber RelationGetNumberOfBlocksInFork (Relation relation, ForkNumber forkNum)
 
bool BufferIsPermanent (Buffer buffer)
 
XLogRecPtr BufferGetLSNAtomic (Buffer buffer)
 
void DropRelationBuffers (SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
 
void DropRelationsAllBuffers (SMgrRelation *smgr_reln, int nlocators)
 
void DropDatabaseBuffers (Oid dbid)
 
void FlushRelationBuffers (Relation rel)
 
void FlushRelationsAllBuffers (SMgrRelation *smgrs, int nrels)
 
void CreateAndCopyRelationData (RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
 
void FlushDatabaseBuffers (Oid dbid)
 
void FlushOneBuffer (Buffer buffer)
 
void ReleaseBuffer (Buffer buffer)
 
void UnlockReleaseBuffer (Buffer buffer)
 
void IncrBufferRefCount (Buffer buffer)
 
void MarkBufferDirtyHint (Buffer buffer, bool buffer_std)
 
void UnlockBuffers (void)
 
void LockBuffer (Buffer buffer, int mode)
 
bool ConditionalLockBuffer (Buffer buffer)
 
void LockBufferForCleanup (Buffer buffer)
 
bool HoldingBufferPinThatDelaysRecovery (void)
 
bool ConditionalLockBufferForCleanup (Buffer buffer)
 
bool IsBufferCleanupOK (Buffer buffer)
 
void AbortBufferIO (void)
 
uint32 LockBufHdr (BufferDesc *desc)
 
void WritebackContextInit (WritebackContext *context, int *max_pending)
 
void ScheduleBufferTagForWriteback (WritebackContext *context, BufferTag *tag)
 
void IssuePendingWritebacks (WritebackContext *context)
 
void TestForOldSnapshot_impl (Snapshot snapshot, Relation relation)
 

Variables

bool zero_damaged_pages = false
 
int bgwriter_lru_maxpages = 100
 
double bgwriter_lru_multiplier = 2.0
 
bool track_io_timing = false
 
int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY
 
int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY
 
int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER
 
int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER
 
int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER
 
static BufferDesc * InProgressBuf = NULL
 
static bool IsForInput
 
static BufferDesc * PinCountWaitBuf = NULL
 
static struct PrivateRefCountEntry PrivateRefCountArray [REFCOUNT_ARRAY_ENTRIES]
 
static HTAB * PrivateRefCountHash = NULL
 
static int32 PrivateRefCountOverflowed = 0
 
static uint32 PrivateRefCountClock = 0
 
static PrivateRefCountEntry * ReservedRefCountEntry = NULL
 

Macro Definition Documentation

◆ BUF_DROP_FULL_SCAN_THRESHOLD

#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)

Definition at line 81 of file bufmgr.c.

◆ BUF_REUSABLE

#define BUF_REUSABLE   0x02

Definition at line 71 of file bufmgr.c.

◆ BUF_WRITTEN

#define BUF_WRITTEN   0x01

Definition at line 70 of file bufmgr.c.

◆ BufferGetLSN

#define BufferGetLSN (   bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))

Definition at line 63 of file bufmgr.c.

◆ BufferIsPinned

#define BufferIsPinned (   bufnum)
Value:
( \
!BufferIsValid(bufnum) ? \
false \
: \
BufferIsLocal(bufnum) ? \
(LocalRefCount[-(bufnum) - 1] > 0) \
: \
(GetPrivateRefCount(bufnum) > 0) \
)
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:383
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:233
int32 * LocalRefCount
Definition: localbuf.c:46

Definition at line 441 of file bufmgr.c.
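
Note that the macro tests only this backend's own reference count (LocalRefCount for temp-relation buffers, the private refcount machinery for shared buffers); it says nothing about pins held by other backends. For illustration only, the same check written as a function might look like the sketch below; the real definition is the macro above, which is private to bufmgr.c.

/*
 * Illustrative function form of BufferIsPinned(); not part of bufmgr.c.
 * Reports whether *this backend* holds at least one pin on the buffer.
 */
static inline bool
BufferIsPinnedByMe(Buffer bufnum)
{
    if (!BufferIsValid(bufnum))
        return false;                           /* InvalidBuffer is never pinned */
    if (BufferIsLocal(bufnum))
        return LocalRefCount[-bufnum - 1] > 0;  /* temp-relation buffer */
    return GetPrivateRefCount(bufnum) > 0;      /* shared buffer */
}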

◆ BufHdrGetBlock

#define BufHdrGetBlock (   bufHdr)    ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))

Definition at line 62 of file bufmgr.c.

◆ LocalBufHdrGetBlock

#define LocalBufHdrGetBlock (   bufHdr)     LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]

Definition at line 66 of file bufmgr.c.
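
Both macros are plain address arithmetic: shared buffers occupy one contiguous BufferBlocks arena, so descriptor buf_id N maps to byte offset N * BLCKSZ, while local (temp-relation) descriptors carry negative buf_id values such that -(buf_id + 2) is the index into LocalBufferBlockPointers. A combined sketch, for illustration only (sketch_get_block is not a real function):

/* Illustration of the address arithmetic behind the two macros. */
static Block
sketch_get_block(BufferDesc *bufHdr)
{
    if (bufHdr->buf_id >= 0)
        /* shared buffer: the N'th BLCKSZ-sized slot of the shared arena */
        return (Block) (BufferBlocks + (Size) bufHdr->buf_id * BLCKSZ);
    else
        /* local buffer: recover the local array index from the buf_id */
        return LocalBufferBlockPointers[-(bufHdr->buf_id + 2)];
}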

◆ REFCOUNT_ARRAY_ENTRIES

#define REFCOUNT_ARRAY_ENTRIES   8

Definition at line 90 of file bufmgr.c.

◆ RELS_BSEARCH_THRESHOLD

#define RELS_BSEARCH_THRESHOLD   20

Definition at line 73 of file bufmgr.c.

◆ ST_COMPARE [1/2]

#define ST_COMPARE (   a,
  b 
)    ckpt_buforder_comparator(a, b)

Definition at line 5002 of file bufmgr.c.

◆ ST_COMPARE [2/2]

#define ST_COMPARE (   a,
  b 
)    buffertag_comparator(&a->tag, &b->tag)

Definition at line 5002 of file bufmgr.c.

◆ ST_DEFINE [1/2]

#define ST_DEFINE

Definition at line 5004 of file bufmgr.c.

◆ ST_DEFINE [2/2]

#define ST_DEFINE

Definition at line 5004 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [1/2]

#define ST_ELEMENT_TYPE   CkptSortItem

Definition at line 5001 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [2/2]

#define ST_ELEMENT_TYPE   PendingWriteback

Definition at line 5001 of file bufmgr.c.

◆ ST_SCOPE [1/2]

#define ST_SCOPE   static

Definition at line 5003 of file bufmgr.c.

◆ ST_SCOPE [2/2]

#define ST_SCOPE   static

Definition at line 5003 of file bufmgr.c.

◆ ST_SORT [1/2]

#define ST_SORT   sort_checkpoint_bufferids

Definition at line 5000 of file bufmgr.c.

◆ ST_SORT [2/2]

#define ST_SORT   sort_pending_writebacks

Definition at line 5000 of file bufmgr.c.

Typedef Documentation

◆ CkptTsStatus

typedef struct CkptTsStatus CkptTsStatus

◆ PrivateRefCountEntry

typedef struct PrivateRefCountEntry PrivateRefCountEntry

◆ SMgrSortArray

typedef struct SMgrSortArray SMgrSortArray

Function Documentation

◆ AbortBufferIO()

void AbortBufferIO ( void  )

Definition at line 4709 of file bufmgr.c.

4710 {
4711  BufferDesc *buf = InProgressBuf;
4712 
4713  if (buf)
4714  {
4715  uint32 buf_state;
4716 
4717  buf_state = LockBufHdr(buf);
4718  Assert(buf_state & BM_IO_IN_PROGRESS);
4719  if (IsForInput)
4720  {
4721  Assert(!(buf_state & BM_DIRTY));
4722 
4723  /* We'd better not think buffer is valid yet */
4724  Assert(!(buf_state & BM_VALID));
4725  UnlockBufHdr(buf, buf_state);
4726  }
4727  else
4728  {
4729  Assert(buf_state & BM_DIRTY);
4730  UnlockBufHdr(buf, buf_state);
4731  /* Issue notice if this is not the first failure... */
4732  if (buf_state & BM_IO_ERROR)
4733  {
4734  /* Buffer is pinned, so we can read tag without spinlock */
4735  char *path;
4736 
4737  path = relpathperm(BufTagGetRelFileLocator(&buf->tag),
4738  BufTagGetForkNum(&buf->tag));
4739  ereport(WARNING,
4740  (errcode(ERRCODE_IO_ERROR),
4741  errmsg("could not write block %u of %s",
4742  buf->tag.blockNum, path),
4743  errdetail("Multiple failures --- write error might be permanent.")));
4744  pfree(path);
4745  }
4746  }
4747  TerminateBufferIO(buf, false, BM_IO_ERROR);
4748  }
4749 }
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
static void UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
#define BM_DIRTY
Definition: buf_internals.h:60
#define BM_IO_IN_PROGRESS
Definition: buf_internals.h:63
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
Definition: buf_internals.h:61
#define BM_IO_ERROR
Definition: buf_internals.h:64
static BufferDesc * InProgressBuf
Definition: bufmgr.c:163
static bool IsForInput
Definition: bufmgr.c:164
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4822
static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
Definition: bufmgr.c:4677
unsigned int uint32
Definition: c.h:490
int errdetail(const char *fmt,...)
Definition: elog.c:1202
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define WARNING
Definition: elog.h:36
#define ereport(elevel,...)
Definition: elog.h:149
Assert(fmt[strlen(fmt) - 1] !='\n')
void pfree(void *pointer)
Definition: mcxt.c:1436
static char * buf
Definition: pg_test_fsync.c:67
#define relpathperm(rlocator, forknum)
Definition: relpath.h:90

References Assert(), BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_VALID, buf, BufTagGetForkNum(), BufTagGetRelFileLocator(), ereport, errcode(), errdetail(), errmsg(), InProgressBuf, IsForInput, LockBufHdr(), pfree(), relpathperm, TerminateBufferIO(), UnlockBufHdr(), and WARNING.

Referenced by AbortSubTransaction(), AbortTransaction(), AtProcExit_Buffers(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), and WalWriterMain().

◆ AtEOXact_Buffers()

void AtEOXact_Buffers ( bool  isCommit)

Definition at line 2642 of file bufmgr.c.

2643 {
2644  CheckForBufferLeaks();
2645 
2646  AtEOXact_LocalBuffers(isCommit);
2647 
2648  Assert(PrivateRefCountOverflowed == 0);
2649 }
static void CheckForBufferLeaks(void)
Definition: bufmgr.c:2703
static int32 PrivateRefCountOverflowed
Definition: bufmgr.c:201
void AtEOXact_LocalBuffers(bool isCommit)
Definition: localbuf.c:606

References Assert(), AtEOXact_LocalBuffers(), CheckForBufferLeaks(), and PrivateRefCountOverflowed.

Referenced by AbortTransaction(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), CommitTransaction(), PrepareTransaction(), and WalWriterMain().

◆ AtProcExit_Buffers()

static void AtProcExit_Buffers ( int  code,
Datum  arg 
)
static

Definition at line 2684 of file bufmgr.c.

2685 {
2686  AbortBufferIO();
2687  UnlockBuffers();
2688 
2689  CheckForBufferLeaks();
2690 
2691  /* localbuf.c needs a chance too */
2692  AtProcExit_LocalBuffers();
2693 }
void UnlockBuffers(void)
Definition: bufmgr.c:4218
void AbortBufferIO(void)
Definition: bufmgr.c:4709
void AtProcExit_LocalBuffers(void)
Definition: localbuf.c:617

References AbortBufferIO(), AtProcExit_LocalBuffers(), CheckForBufferLeaks(), and UnlockBuffers().

Referenced by InitBufferPoolAccess().

◆ BgBufferSync()

bool BgBufferSync ( WritebackContext *  wb_context)

Definition at line 2272 of file bufmgr.c.

2273 {
2274  /* info obtained from freelist.c */
2275  int strategy_buf_id;
2276  uint32 strategy_passes;
2277  uint32 recent_alloc;
2278 
2279  /*
2280  * Information saved between calls so we can determine the strategy
2281  * point's advance rate and avoid scanning already-cleaned buffers.
2282  */
2283  static bool saved_info_valid = false;
2284  static int prev_strategy_buf_id;
2285  static uint32 prev_strategy_passes;
2286  static int next_to_clean;
2287  static uint32 next_passes;
2288 
2289  /* Moving averages of allocation rate and clean-buffer density */
2290  static float smoothed_alloc = 0;
2291  static float smoothed_density = 10.0;
2292 
2293  /* Potentially these could be tunables, but for now, not */
2294  float smoothing_samples = 16;
2295  float scan_whole_pool_milliseconds = 120000.0;
2296 
2297  /* Used to compute how far we scan ahead */
2298  long strategy_delta;
2299  int bufs_to_lap;
2300  int bufs_ahead;
2301  float scans_per_alloc;
2302  int reusable_buffers_est;
2303  int upcoming_alloc_est;
2304  int min_scan_buffers;
2305 
2306  /* Variables for the scanning loop proper */
2307  int num_to_scan;
2308  int num_written;
2309  int reusable_buffers;
2310 
2311  /* Variables for final smoothed_density update */
2312  long new_strategy_delta;
2313  uint32 new_recent_alloc;
2314 
2315  /*
2316  * Find out where the freelist clock sweep currently is, and how many
2317  * buffer allocations have happened since our last call.
2318  */
2319  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
2320 
2321  /* Report buffer alloc counts to pgstat */
2322  PendingBgWriterStats.buf_alloc += recent_alloc;
2323 
2324  /*
2325  * If we're not running the LRU scan, just stop after doing the stats
2326  * stuff. We mark the saved state invalid so that we can recover sanely
2327  * if LRU scan is turned back on later.
2328  */
2329  if (bgwriter_lru_maxpages <= 0)
2330  {
2331  saved_info_valid = false;
2332  return true;
2333  }
2334 
2335  /*
2336  * Compute strategy_delta = how many buffers have been scanned by the
2337  * clock sweep since last time. If first time through, assume none. Then
2338  * see if we are still ahead of the clock sweep, and if so, how many
2339  * buffers we could scan before we'd catch up with it and "lap" it. Note:
2340  * weird-looking coding of xxx_passes comparisons are to avoid bogus
2341  * behavior when the passes counts wrap around.
2342  */
2343  if (saved_info_valid)
2344  {
2345  int32 passes_delta = strategy_passes - prev_strategy_passes;
2346 
2347  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2348  strategy_delta += (long) passes_delta * NBuffers;
2349 
2350  Assert(strategy_delta >= 0);
2351 
2352  if ((int32) (next_passes - strategy_passes) > 0)
2353  {
2354  /* we're one pass ahead of the strategy point */
2355  bufs_to_lap = strategy_buf_id - next_to_clean;
2356 #ifdef BGW_DEBUG
2357  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2358  next_passes, next_to_clean,
2359  strategy_passes, strategy_buf_id,
2360  strategy_delta, bufs_to_lap);
2361 #endif
2362  }
2363  else if (next_passes == strategy_passes &&
2364  next_to_clean >= strategy_buf_id)
2365  {
2366  /* on same pass, but ahead or at least not behind */
2367  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
2368 #ifdef BGW_DEBUG
2369  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2370  next_passes, next_to_clean,
2371  strategy_passes, strategy_buf_id,
2372  strategy_delta, bufs_to_lap);
2373 #endif
2374  }
2375  else
2376  {
2377  /*
2378  * We're behind, so skip forward to the strategy point and start
2379  * cleaning from there.
2380  */
2381 #ifdef BGW_DEBUG
2382  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
2383  next_passes, next_to_clean,
2384  strategy_passes, strategy_buf_id,
2385  strategy_delta);
2386 #endif
2387  next_to_clean = strategy_buf_id;
2388  next_passes = strategy_passes;
2389  bufs_to_lap = NBuffers;
2390  }
2391  }
2392  else
2393  {
2394  /*
2395  * Initializing at startup or after LRU scanning had been off. Always
2396  * start at the strategy point.
2397  */
2398 #ifdef BGW_DEBUG
2399  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
2400  strategy_passes, strategy_buf_id);
2401 #endif
2402  strategy_delta = 0;
2403  next_to_clean = strategy_buf_id;
2404  next_passes = strategy_passes;
2405  bufs_to_lap = NBuffers;
2406  }
2407 
2408  /* Update saved info for next time */
2409  prev_strategy_buf_id = strategy_buf_id;
2410  prev_strategy_passes = strategy_passes;
2411  saved_info_valid = true;
2412 
2413  /*
2414  * Compute how many buffers had to be scanned for each new allocation, ie,
2415  * 1/density of reusable buffers, and track a moving average of that.
2416  *
2417  * If the strategy point didn't move, we don't update the density estimate
2418  */
2419  if (strategy_delta > 0 && recent_alloc > 0)
2420  {
2421  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
2422  smoothed_density += (scans_per_alloc - smoothed_density) /
2423  smoothing_samples;
2424  }
2425 
2426  /*
2427  * Estimate how many reusable buffers there are between the current
2428  * strategy point and where we've scanned ahead to, based on the smoothed
2429  * density estimate.
2430  */
2431  bufs_ahead = NBuffers - bufs_to_lap;
2432  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
2433 
2434  /*
2435  * Track a moving average of recent buffer allocations. Here, rather than
2436  * a true average we want a fast-attack, slow-decline behavior: we
2437  * immediately follow any increase.
2438  */
2439  if (smoothed_alloc <= (float) recent_alloc)
2440  smoothed_alloc = recent_alloc;
2441  else
2442  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
2443  smoothing_samples;
2444 
2445  /* Scale the estimate by a GUC to allow more aggressive tuning. */
2446  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
2447 
2448  /*
2449  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
2450  * eventually underflow to zero, and the underflows produce annoying
2451  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
2452  * zero, there's no point in tracking smaller and smaller values of
2453  * smoothed_alloc, so just reset it to exactly zero to avoid this
2454  * syndrome. It will pop back up as soon as recent_alloc increases.
2455  */
2456  if (upcoming_alloc_est == 0)
2457  smoothed_alloc = 0;
2458 
2459  /*
2460  * Even in cases where there's been little or no buffer allocation
2461  * activity, we want to make a small amount of progress through the buffer
2462  * cache so that as many reusable buffers as possible are clean after an
2463  * idle period.
2464  *
2465  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
2466  * the BGW will be called during the scan_whole_pool time; slice the
2467  * buffer pool into that many sections.
2468  */
2469  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
2470 
2471  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
2472  {
2473 #ifdef BGW_DEBUG
2474  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
2475  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
2476 #endif
2477  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
2478  }
2479 
2480  /*
2481  * Now write out dirty reusable buffers, working forward from the
2482  * next_to_clean point, until we have lapped the strategy scan, or cleaned
2483  * enough buffers to match our estimate of the next cycle's allocation
2484  * requirements, or hit the bgwriter_lru_maxpages limit.
2485  */
2486 
2487  /* Make sure we can handle the pin inside SyncOneBuffer */
2488  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2489 
2490  num_to_scan = bufs_to_lap;
2491  num_written = 0;
2492  reusable_buffers = reusable_buffers_est;
2493 
2494  /* Execute the LRU scan */
2495  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
2496  {
2497  int sync_state = SyncOneBuffer(next_to_clean, true,
2498  wb_context);
2499 
2500  if (++next_to_clean >= NBuffers)
2501  {
2502  next_to_clean = 0;
2503  next_passes++;
2504  }
2505  num_to_scan--;
2506 
2507  if (sync_state & BUF_WRITTEN)
2508  {
2509  reusable_buffers++;
2510  if (++num_written >= bgwriter_lru_maxpages)
2511  {
2512  PendingBgWriterStats.maxwritten_clean++;
2513  break;
2514  }
2515  }
2516  else if (sync_state & BUF_REUSABLE)
2517  reusable_buffers++;
2518  }
2519 
2520  PendingBgWriterStats.buf_written_clean += num_written;
2521 
2522 #ifdef BGW_DEBUG
2523  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
2524  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
2525  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
2526  bufs_to_lap - num_to_scan,
2527  num_written,
2528  reusable_buffers - reusable_buffers_est);
2529 #endif
2530 
2531  /*
2532  * Consider the above scan as being like a new allocation scan.
2533  * Characterize its density and update the smoothed one based on it. This
2534  * effectively halves the moving average period in cases where both the
2535  * strategy and the background writer are doing some useful scanning,
2536  * which is helpful because a long memory isn't as desirable on the
2537  * density estimates.
2538  */
2539  new_strategy_delta = bufs_to_lap - num_to_scan;
2540  new_recent_alloc = reusable_buffers - reusable_buffers_est;
2541  if (new_strategy_delta > 0 && new_recent_alloc > 0)
2542  {
2543  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
2544  smoothed_density += (scans_per_alloc - smoothed_density) /
2545  smoothing_samples;
2546 
2547 #ifdef BGW_DEBUG
2548  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
2549  new_recent_alloc, new_strategy_delta,
2550  scans_per_alloc, smoothed_density);
2551 #endif
2552  }
2553 
2554  /* Return true if OK to hibernate */
2555  return (bufs_to_lap == 0 && recent_alloc == 0);
2556 }
int BgWriterDelay
Definition: bgwriter.c:61
#define BUF_REUSABLE
Definition: bufmgr.c:71
double bgwriter_lru_multiplier
Definition: bufmgr.c:136
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition: bufmgr.c:2575
int bgwriter_lru_maxpages
Definition: bufmgr.c:135
#define BUF_WRITTEN
Definition: bufmgr.c:70
signed int int32
Definition: c.h:478
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition: freelist.c:394
int NBuffers
Definition: globals.c:136
PgStat_BgWriterStats PendingBgWriterStats
ResourceOwner CurrentResourceOwner
Definition: resowner.c:146
void ResourceOwnerEnlargeBuffers(ResourceOwner owner)
Definition: resowner.c:950
PgStat_Counter buf_written_clean
Definition: pgstat.h:254
PgStat_Counter maxwritten_clean
Definition: pgstat.h:255
PgStat_Counter buf_alloc
Definition: pgstat.h:256

References Assert(), bgwriter_lru_maxpages, bgwriter_lru_multiplier, BgWriterDelay, PgStat_BgWriterStats::buf_alloc, BUF_REUSABLE, BUF_WRITTEN, PgStat_BgWriterStats::buf_written_clean, CurrentResourceOwner, DEBUG1, DEBUG2, elog(), PgStat_BgWriterStats::maxwritten_clean, NBuffers, PendingBgWriterStats, ResourceOwnerEnlargeBuffers(), StrategySyncStart(), and SyncOneBuffer().

Referenced by BackgroundWriterMain().
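
The smoothed_density and smoothed_alloc state variables are exponential moving averages with weight 1/smoothing_samples, with the twist that smoothed_alloc is fast-attack: it jumps immediately to any larger recent_alloc value and only decays gradually. A standalone sketch of just that arithmetic (not code from bufmgr.c; SMOOTHING_SAMPLES mirrors the local smoothing_samples = 16):

#define SMOOTHING_SAMPLES 16

/* Ordinary exponential moving average, as used for smoothed_density. */
static float
ema_update(float smoothed, float sample)
{
    return smoothed + (sample - smoothed) / SMOOTHING_SAMPLES;
}

/*
 * Fast-attack variant, as used for smoothed_alloc: rises to the sample
 * immediately, declines toward it only gradually.
 */
static float
ema_fast_attack(float smoothed, float sample)
{
    if (smoothed <= sample)
        return sample;
    return ema_update(smoothed, sample);
}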

◆ BufferAlloc()

static BufferDesc * BufferAlloc ( SMgrRelation  smgr,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool *  foundPtr,
IOContext *  io_context 
)
static

Definition at line 1139 of file bufmgr.c.

1143 {
1144  bool from_ring;
1145  BufferTag newTag; /* identity of requested block */
1146  uint32 newHash; /* hash value for newTag */
1147  LWLock *newPartitionLock; /* buffer partition lock for it */
1148  BufferTag oldTag; /* previous identity of selected buffer */
1149  uint32 oldHash; /* hash value for oldTag */
1150  LWLock *oldPartitionLock; /* buffer partition lock for it */
1151  uint32 oldFlags;
1152  int buf_id;
1153  BufferDesc *buf;
1154  bool valid;
1155  uint32 buf_state;
1156 
1157  /* create a tag so we can lookup the buffer */
1158  InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
1159 
1160  /* determine its hash code and partition lock ID */
1161  newHash = BufTableHashCode(&newTag);
1162  newPartitionLock = BufMappingPartitionLock(newHash);
1163 
1164  /* see if the block is in the buffer pool already */
1165  LWLockAcquire(newPartitionLock, LW_SHARED);
1166  buf_id = BufTableLookup(&newTag, newHash);
1167  if (buf_id >= 0)
1168  {
1169  /*
1170  * Found it. Now, pin the buffer so no one can steal it from the
1171  * buffer pool, and check to see if the correct data has been loaded
1172  * into the buffer.
1173  */
1174  buf = GetBufferDescriptor(buf_id);
1175 
1176  valid = PinBuffer(buf, strategy);
1177 
1178  /* Can release the mapping lock as soon as we've pinned it */
1179  LWLockRelease(newPartitionLock);
1180 
1181  *foundPtr = true;
1182 
1183  if (!valid)
1184  {
1185  /*
1186  * We can only get here if (a) someone else is still reading in
1187  * the page, or (b) a previous read attempt failed. We have to
1188  * wait for any active read attempt to finish, and then set up our
1189  * own read attempt if the page is still not BM_VALID.
1190  * StartBufferIO does it all.
1191  */
1192  if (StartBufferIO(buf, true))
1193  {
1194  /*
1195  * If we get here, previous attempts to read the buffer must
1196  * have failed ... but we shall bravely try again. Set
1197  * io_context since we will in fact need to count an IO
1198  * Operation.
1199  */
1200  *io_context = IOContextForStrategy(strategy);
1201  *foundPtr = false;
1202  }
1203  }
1204 
1205  return buf;
1206  }
1207 
1208  /*
1209  * Didn't find it in the buffer pool. We'll have to initialize a new
1210  * buffer. Remember to unlock the mapping lock while doing the work.
1211  */
1212  LWLockRelease(newPartitionLock);
1213 
1214  *io_context = IOContextForStrategy(strategy);
1215 
1216  /* Loop here in case we have to try another victim buffer */
1217  for (;;)
1218  {
1219  /*
1220  * Ensure, while the spinlock's not yet held, that there's a free
1221  * refcount entry.
1222  */
1223  ReservePrivateRefCountEntry();
1224 
1225  /*
1226  * Select a victim buffer. The buffer is returned with its header
1227  * spinlock still held!
1228  */
1229  buf = StrategyGetBuffer(strategy, &buf_state, &from_ring);
1230 
1231  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1232 
1233  /* Must copy buffer flags while we still hold the spinlock */
1234  oldFlags = buf_state & BUF_FLAG_MASK;
1235 
1236  /* Pin the buffer and then release the buffer spinlock */
1237  PinBuffer_Locked(buf);
1238 
1239  /*
1240  * If the buffer was dirty, try to write it out. There is a race
1241  * condition here, in that someone might dirty it after we released it
1242  * above, or even while we are writing it out (since our share-lock
1243  * won't prevent hint-bit updates). We will recheck the dirty bit
1244  * after re-locking the buffer header.
1245  */
1246  if (oldFlags & BM_DIRTY)
1247  {
1248  /*
1249  * We need a share-lock on the buffer contents to write it out
1250  * (else we might write invalid data, eg because someone else is
1251  * compacting the page contents while we write). We must use a
1252  * conditional lock acquisition here to avoid deadlock. Even
1253  * though the buffer was not pinned (and therefore surely not
1254  * locked) when StrategyGetBuffer returned it, someone else could
1255  * have pinned and exclusive-locked it by the time we get here. If
1256  * we try to get the lock unconditionally, we'd block waiting for
1257  * them; if they later block waiting for us, deadlock ensues.
1258  * (This has been observed to happen when two backends are both
1259  * trying to split btree index pages, and the second one just
1260  * happens to be trying to split the page the first one got from
1261  * StrategyGetBuffer.)
1262  */
1263  if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
1264  LW_SHARED))
1265  {
1266  /*
1267  * If using a nondefault strategy, and writing the buffer
1268  * would require a WAL flush, let the strategy decide whether
1269  * to go ahead and write/reuse the buffer or to choose another
1270  * victim. We need lock to inspect the page LSN, so this
1271  * can't be done inside StrategyGetBuffer.
1272  */
1273  if (strategy != NULL)
1274  {
1275  XLogRecPtr lsn;
1276 
1277  /* Read the LSN while holding buffer header lock */
1278  buf_state = LockBufHdr(buf);
1279  lsn = BufferGetLSN(buf);
1280  UnlockBufHdr(buf, buf_state);
1281 
1282  if (XLogNeedsFlush(lsn) &&
1283  StrategyRejectBuffer(strategy, buf, from_ring))
1284  {
1285  /* Drop lock/pin and loop around for another buffer */
1286  LWLockRelease(BufferDescriptorGetContentLock(buf));
1287  UnpinBuffer(buf);
1288  continue;
1289  }
1290  }
1291 
1292  /* OK, do the I/O */
1293  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
1294  smgr->smgr_rlocator.locator.spcOid,
1295  smgr->smgr_rlocator.locator.dbOid,
1296  smgr->smgr_rlocator.locator.relNumber);
1297 
1298  FlushBuffer(buf, NULL, IOOBJECT_RELATION, *io_context);
1299  LWLockRelease(BufferDescriptorGetContentLock(buf));
1300 
1301  ScheduleBufferTagForWriteback(&BackendWritebackContext,
1302  &buf->tag);
1303 
1304  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
1305  smgr->smgr_rlocator.locator.spcOid,
1306  smgr->smgr_rlocator.locator.dbOid,
1307  smgr->smgr_rlocator.locator.relNumber);
1308  }
1309  else
1310  {
1311  /*
1312  * Someone else has locked the buffer, so give it up and loop
1313  * back to get another one.
1314  */
1315  UnpinBuffer(buf);
1316  continue;
1317  }
1318  }
1319 
1320  /*
1321  * To change the association of a valid buffer, we'll need to have
1322  * exclusive lock on both the old and new mapping partitions.
1323  */
1324  if (oldFlags & BM_TAG_VALID)
1325  {
1326  /*
1327  * Need to compute the old tag's hashcode and partition lock ID.
1328  * XXX is it worth storing the hashcode in BufferDesc so we need
1329  * not recompute it here? Probably not.
1330  */
1331  oldTag = buf->tag;
1332  oldHash = BufTableHashCode(&oldTag);
1333  oldPartitionLock = BufMappingPartitionLock(oldHash);
1334 
1335  /*
1336  * Must lock the lower-numbered partition first to avoid
1337  * deadlocks.
1338  */
1339  if (oldPartitionLock < newPartitionLock)
1340  {
1341  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1342  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1343  }
1344  else if (oldPartitionLock > newPartitionLock)
1345  {
1346  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1347  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1348  }
1349  else
1350  {
1351  /* only one partition, only one lock */
1352  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1353  }
1354  }
1355  else
1356  {
1357  /* if it wasn't valid, we need only the new partition */
1358  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1359  /* remember we have no old-partition lock or tag */
1360  oldPartitionLock = NULL;
1361  /* keep the compiler quiet about uninitialized variables */
1362  oldHash = 0;
1363  }
1364 
1365  /*
1366  * Try to make a hashtable entry for the buffer under its new tag.
1367  * This could fail because while we were writing someone else
1368  * allocated another buffer for the same block we want to read in.
1369  * Note that we have not yet removed the hashtable entry for the old
1370  * tag.
1371  */
1372  buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
1373 
1374  if (buf_id >= 0)
1375  {
1376  /*
1377  * Got a collision. Someone has already done what we were about to
1378  * do. We'll just handle this as if it were found in the buffer
1379  * pool in the first place. First, give up the buffer we were
1380  * planning to use.
1381  */
1382  UnpinBuffer(buf);
1383 
1384  /* Can give up that buffer's mapping partition lock now */
1385  if (oldPartitionLock != NULL &&
1386  oldPartitionLock != newPartitionLock)
1387  LWLockRelease(oldPartitionLock);
1388 
1389  /* remaining code should match code at top of routine */
1390 
1391  buf = GetBufferDescriptor(buf_id);
1392 
1393  valid = PinBuffer(buf, strategy);
1394 
1395  /* Can release the mapping lock as soon as we've pinned it */
1396  LWLockRelease(newPartitionLock);
1397 
1398  *foundPtr = true;
1399 
1400  if (!valid)
1401  {
1402  /*
1403  * We can only get here if (a) someone else is still reading
1404  * in the page, or (b) a previous read attempt failed. We
1405  * have to wait for any active read attempt to finish, and
1406  * then set up our own read attempt if the page is still not
1407  * BM_VALID. StartBufferIO does it all.
1408  */
1409  if (StartBufferIO(buf, true))
1410  {
1411  /*
1412  * If we get here, previous attempts to read the buffer
1413  * must have failed ... but we shall bravely try again.
1414  */
1415  *foundPtr = false;
1416  }
1417  }
1418 
1419  return buf;
1420  }
1421 
1422  /*
1423  * Need to lock the buffer header too in order to change its tag.
1424  */
1425  buf_state = LockBufHdr(buf);
1426 
1427  /*
1428  * Somebody could have pinned or re-dirtied the buffer while we were
1429  * doing the I/O and making the new hashtable entry. If so, we can't
1430  * recycle this buffer; we must undo everything we've done and start
1431  * over with a new victim buffer.
1432  */
1433  oldFlags = buf_state & BUF_FLAG_MASK;
1434  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1 && !(oldFlags & BM_DIRTY))
1435  break;
1436 
1437  UnlockBufHdr(buf, buf_state);
1438  BufTableDelete(&newTag, newHash);
1439  if (oldPartitionLock != NULL &&
1440  oldPartitionLock != newPartitionLock)
1441  LWLockRelease(oldPartitionLock);
1442  LWLockRelease(newPartitionLock);
1443  UnpinBuffer(buf);
1444  }
1445 
1446  /*
1447  * Okay, it's finally safe to rename the buffer.
1448  *
1449  * Clearing BM_VALID here is necessary, clearing the dirtybits is just
1450  * paranoia. We also reset the usage_count since any recency of use of
1451  * the old content is no longer relevant. (The usage_count starts out at
1452  * 1 so that the buffer can survive one clock-sweep pass.)
1453  *
1454  * Make sure BM_PERMANENT is set for buffers that must be written at every
1455  * checkpoint. Unlogged buffers only need to be written at shutdown
1456  * checkpoints, except for their "init" forks, which need to be treated
1457  * just like permanent relations.
1458  */
1459  buf->tag = newTag;
1460  buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
1461  BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
1462  BUF_USAGECOUNT_MASK);
1463  if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1464  buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
1465  else
1466  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1467 
1468  UnlockBufHdr(buf, buf_state);
1469 
1470  if (oldPartitionLock != NULL)
1471  {
1472  BufTableDelete(&oldTag, oldHash);
1473  if (oldPartitionLock != newPartitionLock)
1474  LWLockRelease(oldPartitionLock);
1475  }
1476 
1477  LWLockRelease(newPartitionLock);
1478 
1479  if (oldFlags & BM_VALID)
1480  {
1481  /*
1482  * When a BufferAccessStrategy is in use, blocks evicted from shared
1483  * buffers are counted as IOOP_EVICT in the corresponding context
1484  * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
1485  * strategy in two cases: 1) while initially claiming buffers for the
1486  * strategy ring 2) to replace an existing strategy ring buffer
1487  * because it is pinned or in use and cannot be reused.
1488  *
1489  * Blocks evicted from buffers already in the strategy ring are
1490  * counted as IOOP_REUSE in the corresponding strategy context.
1491  *
1492  * At this point, we can accurately count evictions and reuses,
1493  * because we have successfully claimed the valid buffer. Previously,
1494  * we may have been forced to release the buffer due to concurrent
1495  * pinners or erroring out.
1496  */
1497  pgstat_count_io_op(IOOBJECT_RELATION, *io_context,
1498  from_ring ? IOOP_REUSE : IOOP_EVICT);
1499  }
1500 
1501  /*
1502  * Buffer contents are currently invalid. Try to obtain the right to
1503  * start I/O. If StartBufferIO returns false, then someone else managed
1504  * to read it before we did, so there's nothing left for BufferAlloc() to
1505  * do.
1506  */
1507  if (StartBufferIO(buf, true))
1508  *foundPtr = false;
1509  else
1510  *foundPtr = true;
1511 
1512  return buf;
1513 }
WritebackContext BackendWritebackContext
Definition: buf_init.c:24
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_TAG_VALID
Definition: buf_internals.h:62
#define BM_PERMANENT
Definition: buf_internals.h:68
#define BUF_USAGECOUNT_MASK
Definition: buf_internals.h:44
static BufferDesc * GetBufferDescriptor(uint32 id)
#define BUF_FLAG_MASK
Definition: buf_internals.h:47
static LWLock * BufMappingPartitionLock(uint32 hashcode)
#define BM_JUST_DIRTIED
Definition: buf_internals.h:65
#define BUF_USAGECOUNT_ONE
Definition: buf_internals.h:45
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:50
#define BM_CHECKPOINT_NEEDED
Definition: buf_internals.h:67
static LWLock * BufferDescriptorGetContentLock(const BufferDesc *bdesc)
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:149
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:91
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:79
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition: buf_table.c:119
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition: bufmgr.c:2871
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
Definition: bufmgr.c:1752
static void PinBuffer_Locked(BufferDesc *buf)
Definition: bufmgr.c:1855
#define BufferGetLSN(bufHdr)
Definition: bufmgr.c:63
void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
Definition: bufmgr.c:4974
static bool StartBufferIO(BufferDesc *buf, bool forInput)
Definition: bufmgr.c:4626
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:217
static void UnpinBuffer(BufferDesc *buf)
Definition: bufmgr.c:1898
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
Definition: freelist.c:196
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:673
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition: freelist.c:713
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1195
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1803
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1366
@ LW_SHARED
Definition: lwlock.h:116
@ LW_EXCLUSIVE
Definition: lwlock.h:115
@ IOOBJECT_RELATION
Definition: pgstat.h:277
@ IOOP_EVICT
Definition: pgstat.h:295
@ IOOP_REUSE
Definition: pgstat.h:299
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op)
Definition: pgstat_io.c:66
@ INIT_FORKNUM
Definition: relpath.h:53
Definition: lwlock.h:40
RelFileLocator locator
RelFileNumber relNumber
RelFileLocatorBackend smgr_rlocator
Definition: smgr.h:42
bool XLogNeedsFlush(XLogRecPtr record)
Definition: xlog.c:2843
uint64 XLogRecPtr
Definition: xlogdefs.h:21

References Assert(), BackendWritebackContext, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_IO_ERROR, BM_JUST_DIRTIED, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BUF_USAGECOUNT_ONE, BufferDescriptorGetContentLock(), BufferGetLSN, BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), BufTableInsert(), BufTableLookup(), RelFileLocator::dbOid, FlushBuffer(), GetBufferDescriptor(), INIT_FORKNUM, InitBufferTag(), IOContextForStrategy(), IOOBJECT_RELATION, IOOP_EVICT, IOOP_REUSE, RelFileLocatorBackend::locator, LockBufHdr(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockConditionalAcquire(), LWLockRelease(), pgstat_count_io_op(), PinBuffer(), PinBuffer_Locked(), RelFileLocator::relNumber, ReservePrivateRefCountEntry(), ScheduleBufferTagForWriteback(), SMgrRelationData::smgr_rlocator, RelFileLocator::spcOid, StartBufferIO(), StrategyGetBuffer(), StrategyRejectBuffer(), UnlockBufHdr(), UnpinBuffer(), and XLogNeedsFlush().

Referenced by ReadBuffer_common().
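
The contract with the caller is carried by *foundPtr and StartBufferIO(): when *foundPtr comes back true the block is already valid (or another backend is reading it in), otherwise this backend now owns the pending I/O and must read the block into BufHdrGetBlock(buf) and finish with TerminateBufferIO(). A heavily simplified caller sketch under those assumptions; this is not the real ReadBuffer_common(), and it omits error handling, zero-fill modes, and I/O statistics:

/* Illustrative caller of BufferAlloc(); not the actual ReadBuffer_common(). */
static Buffer
sketch_read_block(SMgrRelation smgr, char relpersistence,
                  ForkNumber forkNum, BlockNumber blockNum,
                  BufferAccessStrategy strategy)
{
    bool        found;
    IOContext   io_context;
    BufferDesc *bufHdr;

    bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
                         strategy, &found, &io_context);
    if (!found)
    {
        /* We won the right to do the I/O: read the block ourselves. */
        smgrread(smgr, forkNum, blockNum, BufHdrGetBlock(bufHdr));

        /* Mark the I/O finished and the buffer contents valid. */
        TerminateBufferIO(bufHdr, false, BM_VALID);
    }
    return BufferDescriptorGetBuffer(bufHdr);
}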

◆ BufferGetBlockNumber()

BlockNumber BufferGetBlockNumber ( Buffer  buffer)

Definition at line 2811 of file bufmgr.c.

2812 {
2813  BufferDesc *bufHdr;
2814 
2815  Assert(BufferIsPinned(buffer));
2816 
2817  if (BufferIsLocal(buffer))
2818  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2819  else
2820  bufHdr = GetBufferDescriptor(buffer - 1);
2821 
2822  /* pinned, so OK to read tag without spinlock */
2823  return bufHdr->tag.blockNum;
2824 }
#define BufferIsLocal(buffer)
Definition: buf.h:37
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:441
BufferTag tag
BlockNumber blockNum
Definition: buf_internals.h:97

References Assert(), buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), GetLocalBufferDescriptor(), and BufferDesc::tag.

Referenced by _bt_binsrch_insert(), _bt_bottomupdel_pass(), _bt_check_unique(), _bt_checkpage(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_doinsert(), _bt_endpoint(), _bt_finish_split(), _bt_first(), _bt_getroot(), _bt_insert_parent(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_moveright(), _bt_newroot(), _bt_pagedel(), _bt_readnextpage(), _bt_readpage(), _bt_restore_meta(), _bt_search(), _bt_simpledel_pass(), _bt_split(), _bt_unlink_halfdead_page(), _bt_walk_left(), _hash_addovflpage(), _hash_checkpage(), _hash_doinsert(), _hash_first(), _hash_freeovflpage(), _hash_getnewbuf(), _hash_readnext(), _hash_readpage(), _hash_splitbucket(), allocNewBuffer(), blinsert(), BloomInitMetapage(), brin_doinsert(), brin_doupdate(), brin_getinsertbuffer(), brin_initialize_empty_new_buffer(), brin_page_cleanup(), brin_xlog_insert_update(), brinbuild(), brinGetTupleForHeapBlock(), createPostingTree(), dataBeginPlaceToPageLeaf(), dataPrepareDownlink(), doPickSplit(), entryPrepareDownlink(), fill_seq_fork_with_data(), ginEntryInsert(), ginFindParents(), ginFinishSplit(), ginPlaceToPage(), ginRedoDeleteListPages(), ginRedoUpdateMetapage(), ginScanToDelete(), gistbufferinginserttuples(), gistbuild(), gistcheckpage(), gistdeletepage(), gistformdownlink(), gistinserttuples(), gistMemorizeAllDownlinks(), gistplacetopage(), gistRelocateBuildBuffersOnSplit(), gistScanPage(), hash_xlog_add_ovfl_page(), heap_delete(), heap_hot_search_buffer(), heap_insert(), heap_multi_insert(), heap_page_is_all_visible(), heap_page_prune(), heap_prune_chain(), heap_update(), heap_xlog_confirm(), heap_xlog_lock(), index_compute_xid_horizon_for_tuples(), lazy_scan_noprune(), lazy_scan_prune(), makeSublist(), moveLeafs(), moveRightIfItNeeded(), pgstathashindex(), ReadBufferBI(), RelationAddExtraBlocks(), RelationGetBufferForTuple(), RelationPutHeapTuple(), revmap_get_buffer(), revmap_physical_extend(), ScanSourceDatabasePgClassPage(), spgAddNodeAction(), spgbuild(), spgdoinsert(), SpGistSetLastUsedPage(), spgSplitNodeAction(), spgWalk(), startScanEntry(), terminate_brin_buildstate(), vacuumLeafPage(), visibilitymap_clear(), visibilitymap_get_status(), visibilitymap_pin(), visibilitymap_pin_ok(), visibilitymap_set(), and XLogReadBufferExtended().
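
Buffer numbers are 1-based handles: shared buffer N corresponds to shared descriptor index N - 1, and local (temp-relation) buffers use negative numbers, so buffer -N corresponds to local descriptor index N - 1. For illustration only, the mapping used above written on its own:

/* Illustrative only: Buffer handle -> descriptor array index. */
static int
sketch_descriptor_index(Buffer buffer)
{
    Assert(BufferIsValid(buffer));

    if (BufferIsLocal(buffer))
        return -buffer - 1;     /* index into the local descriptor array */
    return buffer - 1;          /* index into the shared descriptor array */
}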

◆ BufferGetLSNAtomic()

XLogRecPtr BufferGetLSNAtomic ( Buffer  buffer)

Definition at line 3083 of file bufmgr.c.

3084 {
3085  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
3086  char *page = BufferGetPage(buffer);
3087  XLogRecPtr lsn;
3088  uint32 buf_state;
3089 
3090  /*
3091  * If we don't need locking for correctness, fastpath out.
3092  */
3093  if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
3094  return PageGetLSN(page);
3095 
3096  /* Make sure we've got a real buffer, and that we hold a pin on it. */
3097  Assert(BufferIsValid(buffer));
3098  Assert(BufferIsPinned(buffer));
3099 
3100  buf_state = LockBufHdr(bufHdr);
3101  lsn = PageGetLSN(page);
3102  UnlockBufHdr(bufHdr, buf_state);
3103 
3104  return lsn;
3105 }
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:285
static XLogRecPtr PageGetLSN(Page page)
Definition: bufpage.h:383
#define XLogHintBitIsNeeded()
Definition: xlog.h:115

References Assert(), PrivateRefCountEntry::buffer, BufferGetPage(), BufferIsLocal, BufferIsPinned, BufferIsValid(), GetBufferDescriptor(), LockBufHdr(), PageGetLSN(), UnlockBufHdr(), and XLogHintBitIsNeeded.

Referenced by _bt_killitems(), _bt_readpage(), gistdoinsert(), gistFindPath(), gistkillitems(), gistScanPage(), SetHintBits(), and XLogSaveBufferForHint().

◆ BufferGetTag()

void BufferGetTag ( Buffer  buffer,
RelFileLocator *  rlocator,
ForkNumber *  forknum,
BlockNumber *  blknum 
)

Definition at line 2832 of file bufmgr.c.

2834 {
2835  BufferDesc *bufHdr;
2836 
2837  /* Do the same checks as BufferGetBlockNumber. */
2838  Assert(BufferIsPinned(buffer));
2839 
2840  if (BufferIsLocal(buffer))
2841  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2842  else
2843  bufHdr = GetBufferDescriptor(buffer - 1);
2844 
2845  /* pinned, so OK to read tag without spinlock */
2846  *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
2847  *forknum = BufTagGetForkNum(&bufHdr->tag);
2848  *blknum = bufHdr->tag.blockNum;
2849 }

References Assert(), buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufTagGetForkNum(), BufTagGetRelFileLocator(), GetBufferDescriptor(), GetLocalBufferDescriptor(), and BufferDesc::tag.

Referenced by fsm_search_avail(), ginRedoInsertEntry(), log_newpage_buffer(), ResolveCminCmaxDuringDecoding(), XLogRegisterBuffer(), and XLogSaveBufferForHint().
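
A typical use is to recover the on-disk identity of a pinned buffer, for example to build a WAL record or a log message. A minimal hypothetical example (report_buffer_identity is illustrative, not part of bufmgr.c):

/* Hypothetical helper: log what a pinned buffer currently holds. */
static void
report_buffer_identity(Buffer buf)
{
    RelFileLocator rlocator;
    ForkNumber  forknum;
    BlockNumber blknum;

    BufferGetTag(buf, &rlocator, &forknum, &blknum);
    elog(DEBUG1, "buffer %d holds block %u of fork %d of relation %u",
         buf, blknum, forknum, rlocator.relNumber);
}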

◆ BufferIsPermanent()

bool BufferIsPermanent ( Buffer  buffer)

Definition at line 3053 of file bufmgr.c.

3054 {
3055  BufferDesc *bufHdr;
3056 
3057  /* Local buffers are used only for temp relations. */
3058  if (BufferIsLocal(buffer))
3059  return false;
3060 
3061  /* Make sure we've got a real buffer, and that we hold a pin on it. */
3062  Assert(BufferIsValid(buffer));
3063  Assert(BufferIsPinned(buffer));
3064 
3065  /*
3066  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
3067  * need not bother with the buffer header spinlock. Even if someone else
3068  * changes the buffer header state while we're doing this, the state is
3069  * changed atomically, so we'll read the old value or the new value, but
3070  * not random garbage.
3071  */
3072  bufHdr = GetBufferDescriptor(buffer - 1);
3073  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
3074 }
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:236
pg_atomic_uint32 state

References Assert(), BM_PERMANENT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferIsValid(), GetBufferDescriptor(), pg_atomic_read_u32(), and BufferDesc::state.

Referenced by SetHintBits().

◆ BufferSync()

static void BufferSync ( int  flags)
static

Definition at line 1996 of file bufmgr.c.

1997 {
1998  uint32 buf_state;
1999  int buf_id;
2000  int num_to_scan;
2001  int num_spaces;
2002  int num_processed;
2003  int num_written;
2004  CkptTsStatus *per_ts_stat = NULL;
2005  Oid last_tsid;
2006  binaryheap *ts_heap;
2007  int i;
2008  int mask = BM_DIRTY;
2009  WritebackContext wb_context;
2010 
2011  /* Make sure we can handle the pin inside SyncOneBuffer */
2012  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2013 
2014  /*
2015  * Unless this is a shutdown checkpoint or we have been explicitly told,
2016  * we write only permanent, dirty buffers. But at shutdown or end of
2017  * recovery, we write all dirty buffers.
2018  */
2019  if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
2020  CHECKPOINT_FLUSH_ALL))))
2021  mask |= BM_PERMANENT;
2022 
2023  /*
2024  * Loop over all buffers, and mark the ones that need to be written with
2025  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
2026  * can estimate how much work needs to be done.
2027  *
2028  * This allows us to write only those pages that were dirty when the
2029  * checkpoint began, and not those that get dirtied while it proceeds.
2030  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
2031  * later in this function, or by normal backends or the bgwriter cleaning
2032  * scan, the flag is cleared. Any buffer dirtied after this point won't
2033  * have the flag set.
2034  *
2035  * Note that if we fail to write some buffer, we may leave buffers with
2036  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
2037  * certainly need to be written for the next checkpoint attempt, too.
2038  */
2039  num_to_scan = 0;
2040  for (buf_id = 0; buf_id < NBuffers; buf_id++)
2041  {
2042  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2043 
2044  /*
2045  * Header spinlock is enough to examine BM_DIRTY, see comment in
2046  * SyncOneBuffer.
2047  */
2048  buf_state = LockBufHdr(bufHdr);
2049 
2050  if ((buf_state & mask) == mask)
2051  {
2052  CkptSortItem *item;
2053 
2054  buf_state |= BM_CHECKPOINT_NEEDED;
2055 
2056  item = &CkptBufferIds[num_to_scan++];
2057  item->buf_id = buf_id;
2058  item->tsId = bufHdr->tag.spcOid;
2059  item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
2060  item->forkNum = BufTagGetForkNum(&bufHdr->tag);
2061  item->blockNum = bufHdr->tag.blockNum;
2062  }
2063 
2064  UnlockBufHdr(bufHdr, buf_state);
2065 
2066  /* Check for barrier events in case NBuffers is large. */
2067  if (ProcSignalBarrierPending)
2068  ProcessProcSignalBarrier();
2069  }
2070 
2071  if (num_to_scan == 0)
2072  return; /* nothing to do */
2073 
2074  WritebackContextInit(&wb_context, &checkpoint_flush_after);
2075 
2076  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
2077 
2078  /*
2079  * Sort buffers that need to be written to reduce the likelihood of random
2080  * IO. The sorting is also important for the implementation of balancing
2081  * writes between tablespaces. Without balancing writes we'd potentially
2082  * end up writing to the tablespaces one-by-one; possibly overloading the
2083  * underlying system.
2084  */
2085  sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
2086 
2087  num_spaces = 0;
2088 
2089  /*
2090  * Allocate progress status for each tablespace with buffers that need to
2091  * be flushed. This requires the to-be-flushed array to be sorted.
2092  */
2093  last_tsid = InvalidOid;
2094  for (i = 0; i < num_to_scan; i++)
2095  {
2096  CkptTsStatus *s;
2097  Oid cur_tsid;
2098 
2099  cur_tsid = CkptBufferIds[i].tsId;
2100 
2101  /*
2102  * Grow array of per-tablespace status structs, every time a new
2103  * tablespace is found.
2104  */
2105  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
2106  {
2107  Size sz;
2108 
2109  num_spaces++;
2110 
2111  /*
2112  * Not worth adding grow-by-power-of-2 logic here - even with a
2113  * few hundred tablespaces this should be fine.
2114  */
2115  sz = sizeof(CkptTsStatus) * num_spaces;
2116 
2117  if (per_ts_stat == NULL)
2118  per_ts_stat = (CkptTsStatus *) palloc(sz);
2119  else
2120  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
2121 
2122  s = &per_ts_stat[num_spaces - 1];
2123  memset(s, 0, sizeof(*s));
2124  s->tsId = cur_tsid;
2125 
2126  /*
2127  * The first buffer in this tablespace. As CkptBufferIds is sorted
2128  * by tablespace all (s->num_to_scan) buffers in this tablespace
2129  * will follow afterwards.
2130  */
2131  s->index = i;
2132 
2133  /*
2134  * progress_slice will be determined once we know how many buffers
2135  * are in each tablespace, i.e. after this loop.
2136  */
2137 
2138  last_tsid = cur_tsid;
2139  }
2140  else
2141  {
2142  s = &per_ts_stat[num_spaces - 1];
2143  }
2144 
2145  s->num_to_scan++;
2146 
2147  /* Check for barrier events. */
2148  if (ProcSignalBarrierPending)
2149  ProcessProcSignalBarrier();
2150  }
2151 
2152  Assert(num_spaces > 0);
2153 
2154  /*
2155  * Build a min-heap over the write-progress in the individual tablespaces,
2156  * and compute how large a portion of the total progress a single
2157  * processed buffer is.
2158  */
2159  ts_heap = binaryheap_allocate(num_spaces,
2160  ts_ckpt_progress_comparator,
2161  NULL);
2162 
2163  for (i = 0; i < num_spaces; i++)
2164  {
2165  CkptTsStatus *ts_stat = &per_ts_stat[i];
2166 
2167  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
2168 
2169  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
2170  }
2171 
2172  binaryheap_build(ts_heap);
2173 
2174  /*
2175  * Iterate through to-be-checkpointed buffers and write the ones (still)
2176  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
2177  * tablespaces; otherwise the sorting would lead to only one tablespace
2178  * receiving writes at a time, making inefficient use of the hardware.
2179  */
2180  num_processed = 0;
2181  num_written = 0;
2182  while (!binaryheap_empty(ts_heap))
2183  {
2184  BufferDesc *bufHdr = NULL;
2185  CkptTsStatus *ts_stat = (CkptTsStatus *)
2186  DatumGetPointer(binaryheap_first(ts_heap));
2187 
2188  buf_id = CkptBufferIds[ts_stat->index].buf_id;
2189  Assert(buf_id != -1);
2190 
2191  bufHdr = GetBufferDescriptor(buf_id);
2192 
2193  num_processed++;
2194 
2195  /*
2196  * We don't need to acquire the lock here, because we're only looking
2197  * at a single bit. It's possible that someone else writes the buffer
2198  * and clears the flag right after we check, but that doesn't matter
2199  * since SyncOneBuffer will then do nothing. However, there is a
2200  * further race condition: it's conceivable that between the time we
2201  * examine the bit here and the time SyncOneBuffer acquires the lock,
2202  * someone else not only wrote the buffer but replaced it with another
2203  * page and dirtied it. In that improbable case, SyncOneBuffer will
2204  * write the buffer though we didn't need to. It doesn't seem worth
2205  * guarding against this, though.
2206  */
2207  if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
2208  {
2209  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
2210  {
2211  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
2212  PendingCheckpointerStats.buf_written_checkpoints++;
2213  num_written++;
2214  }
2215  }
2216 
2217  /*
2218  * Measure progress independent of actually having to flush the buffer
2219  * - otherwise writing become unbalanced.
2220  */
2221  ts_stat->progress += ts_stat->progress_slice;
2222  ts_stat->num_scanned++;
2223  ts_stat->index++;
2224 
2225  /* Have all the buffers from the tablespace been processed? */
2226  if (ts_stat->num_scanned == ts_stat->num_to_scan)
2227  {
2228  binaryheap_remove_first(ts_heap);
2229  }
2230  else
2231  {
2232  /* update heap with the new progress */
2233  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
2234  }
2235 
2236  /*
2237  * Sleep to throttle our I/O rate.
2238  *
2239  * (This will check for barrier events even if it doesn't sleep.)
2240  */
2241  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
2242  }
2243 
2244  /* issue all pending flushes */
2245  IssuePendingWritebacks(&wb_context);
2246 
2247  pfree(per_ts_stat);
2248  per_ts_stat = NULL;
2249  binaryheap_free(ts_heap);
2250 
2251  /*
2252  * Update checkpoint statistics. As noted above, this doesn't include
2253  * buffers written by other backends or bgwriter scan.
2254  */
2255  CheckpointStats.ckpt_bufs_written += num_written;
2256 
2257  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
2258 }
void binaryheap_build(binaryheap *heap)
Definition: binaryheap.c:125
void binaryheap_add_unordered(binaryheap *heap, Datum d)
Definition: binaryheap.c:109
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition: binaryheap.c:32
Datum binaryheap_remove_first(binaryheap *heap)
Definition: binaryheap.c:173
void binaryheap_free(binaryheap *heap)
Definition: binaryheap.c:68
void binaryheap_replace_first(binaryheap *heap, Datum d)
Definition: binaryheap.c:207
Datum binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:158
#define binaryheap_empty(h)
Definition: binaryheap.h:52
CkptSortItem * CkptBufferIds
Definition: buf_init.c:25
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition: bufmgr.c:4939
int checkpoint_flush_after
Definition: bufmgr.c:158
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition: bufmgr.c:4962
struct CkptTsStatus CkptTsStatus
void IssuePendingWritebacks(WritebackContext *context)
Definition: bufmgr.c:5015
double float8
Definition: c.h:614
size_t Size
Definition: c.h:589
void CheckpointWriteDelay(int flags, double progress)
Definition: checkpointer.c:697
volatile sig_atomic_t ProcSignalBarrierPending
Definition: globals.c:37
int i
Definition: isn.c:73
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1456
void * palloc(Size size)
Definition: mcxt.c:1210
PgStat_CheckpointerStats PendingCheckpointerStats
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:322
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
void ProcessProcSignalBarrier(void)
Definition: procsignal.c:468
int ckpt_bufs_written
Definition: xlog.h:162
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition: bufmgr.c:109
int index
Definition: bufmgr.c:117
int num_scanned
Definition: bufmgr.c:114
float8 progress
Definition: bufmgr.c:108
int num_to_scan
Definition: bufmgr.c:112
Oid tsId
Definition: bufmgr.c:99
PgStat_Counter buf_written_checkpoints
Definition: pgstat.h:266
Oid spcOid
Definition: buf_internals.h:93
CheckpointStatsData CheckpointStats
Definition: xlog.c:212
#define CHECKPOINT_END_OF_RECOVERY
Definition: xlog.h:135
#define CHECKPOINT_FLUSH_ALL
Definition: xlog.h:138
#define CHECKPOINT_IS_SHUTDOWN
Definition: xlog.h:134

References Assert(), binaryheap_add_unordered(), binaryheap_allocate(), binaryheap_build(), binaryheap_empty, binaryheap_first(), binaryheap_free(), binaryheap_remove_first(), binaryheap_replace_first(), buftag::blockNum, CkptSortItem::blockNum, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_PERMANENT, CkptSortItem::buf_id, BUF_WRITTEN, PgStat_CheckpointerStats::buf_written_checkpoints, BufTagGetForkNum(), BufTagGetRelNumber(), CHECKPOINT_END_OF_RECOVERY, checkpoint_flush_after, CHECKPOINT_FLUSH_ALL, CHECKPOINT_IS_SHUTDOWN, CheckpointStats, CheckpointWriteDelay(), CheckpointStatsData::ckpt_bufs_written, CkptBufferIds, CurrentResourceOwner, DatumGetPointer(), CkptSortItem::forkNum, GetBufferDescriptor(), i, CkptTsStatus::index, InvalidOid, IssuePendingWritebacks(), LockBufHdr(), NBuffers, CkptTsStatus::num_scanned, CkptTsStatus::num_to_scan, palloc(), PendingCheckpointerStats, pfree(), pg_atomic_read_u32(), PointerGetDatum(), ProcessProcSignalBarrier(), ProcSignalBarrierPending, CkptTsStatus::progress, CkptTsStatus::progress_slice, CkptSortItem::relNumber, repalloc(), ResourceOwnerEnlargeBuffers(), buftag::spcOid, BufferDesc::state, SyncOneBuffer(), BufferDesc::tag, ts_ckpt_progress_comparator(), CkptTsStatus::tsId, CkptSortItem::tsId, UnlockBufHdr(), and WritebackContextInit().

Referenced by CheckPointBuffers().
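To make the balancing concrete, here is a minimal standalone sketch (plain C, not PostgreSQL code) of the progress_slice idea: with three hypothetical tablespaces holding 6, 2 and 4 to-be-written buffers, each write advances the owning tablespace's progress by total / num_to_scan, and the next write always goes to the tablespace that is furthest behind. A linear minimum search stands in for the binary heap and ts_ckpt_progress_comparator.

/* Standalone sketch, not PostgreSQL code: progress_slice balancing. */
#include <stdio.h>

typedef struct
{
	const char *name;
	int			num_to_scan;	/* buffers belonging to this tablespace */
	int			num_scanned;
	double		progress;		/* num_scanned * progress_slice */
	double		progress_slice;
} TsProgress;

int
main(void)
{
	TsProgress	ts[] = {
		{"ts_a", 6, 0, 0.0, 0.0},
		{"ts_b", 2, 0, 0.0, 0.0},
		{"ts_c", 4, 0, 0.0, 0.0},
	};
	int			nts = 3;
	int			total = 0;

	for (int i = 0; i < nts; i++)
		total += ts[i].num_to_scan;
	for (int i = 0; i < nts; i++)
		ts[i].progress_slice = (double) total / ts[i].num_to_scan;

	/* one "write" per iteration, always from the least-advanced tablespace */
	for (int written = 0; written < total; written++)
	{
		TsProgress *min = NULL;

		for (int i = 0; i < nts; i++)
		{
			if (ts[i].num_scanned == ts[i].num_to_scan)
				continue;		/* this tablespace is finished */
			if (min == NULL || ts[i].progress < min->progress)
				min = &ts[i];
		}

		printf("write #%2d -> %s\n", written + 1, min->name);
		min->progress += min->progress_slice;
		min->num_scanned++;
	}
	return 0;
}

Running it interleaves the writes roughly in a 6:2:4 pattern across the three tablespaces instead of draining them one after another, which is exactly the effect the sorted checkpoint scan would otherwise produce.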

◆ buffertag_comparator()

static int buffertag_comparator ( const BufferTag ba,
const BufferTag bb 
)
inlinestatic

Definition at line 4874 of file bufmgr.c.

4875 {
4876  int ret;
4877  RelFileLocator rlocatora;
4878  RelFileLocator rlocatorb;
4879 
4880  rlocatora = BufTagGetRelFileLocator(ba);
4881  rlocatorb = BufTagGetRelFileLocator(bb);
4882 
4883  ret = rlocator_comparator(&rlocatora, &rlocatorb);
4884 
4885  if (ret != 0)
4886  return ret;
4887 
4888  if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
4889  return -1;
4890  if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
4891  return 1;
4892 
4893  if (ba->blockNum < bb->blockNum)
4894  return -1;
4895  if (ba->blockNum > bb->blockNum)
4896  return 1;
4897 
4898  return 0;
4899 }
static int rlocator_comparator(const void *p1, const void *p2)
Definition: bufmgr.c:4795

References buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), and rlocator_comparator().

◆ BufmgrCommit()

void BufmgrCommit ( void  )

Definition at line 2797 of file bufmgr.c.

2798 {
2799  /* Nothing to do in bufmgr anymore... */
2800 }

Referenced by PrepareTransaction(), and RecordTransactionCommit().

◆ CheckForBufferLeaks()

static void CheckForBufferLeaks ( void  )
static

Definition at line 2703 of file bufmgr.c.

2704 {
2705 #ifdef USE_ASSERT_CHECKING
2706  int RefCountErrors = 0;
2707  PrivateRefCountEntry *res;
2708  int i;
2709 
2710  /* check the array */
2711  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
2712  {
2713  res = &PrivateRefCountArray[i];
2714 
2715  if (res->buffer != InvalidBuffer)
2716  {
2717  PrintBufferLeakWarning(res->buffer);
2718  RefCountErrors++;
2719  }
2720  }
2721 
2722  /* if necessary search the hash */
2723  if (PrivateRefCountOverflowed)
2724  {
2725  HASH_SEQ_STATUS hstat;
2726 
2727  hash_seq_init(&hstat, PrivateRefCountHash);
2728  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
2729  {
2730  PrintBufferLeakWarning(res->buffer);
2731  RefCountErrors++;
2732  }
2733  }
2734 
2735  Assert(RefCountErrors == 0);
2736 #endif
2737 }
#define InvalidBuffer
Definition: buf.h:25
#define REFCOUNT_ARRAY_ENTRIES
Definition: bufmgr.c:90
void PrintBufferLeakWarning(Buffer buffer)
Definition: bufmgr.c:2743
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition: bufmgr.c:199
static HTAB * PrivateRefCountHash
Definition: bufmgr.c:200
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1431
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1421

References Assert(), hash_seq_init(), hash_seq_search(), i, InvalidBuffer, PrintBufferLeakWarning(), PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, and res.

Referenced by AtEOXact_Buffers(), and AtProcExit_Buffers().

◆ CheckPointBuffers()

void CheckPointBuffers ( int  flags)

Definition at line 2787 of file bufmgr.c.

2788 {
2789  BufferSync(flags);
2790 }
static void BufferSync(int flags)
Definition: bufmgr.c:1996

References BufferSync().

Referenced by CheckPointGuts().

◆ ckpt_buforder_comparator()

static int ckpt_buforder_comparator ( const CkptSortItem a,
const CkptSortItem b 
)
inlinestatic

Definition at line 4908 of file bufmgr.c.

4909 {
4910  /* compare tablespace */
4911  if (a->tsId < b->tsId)
4912  return -1;
4913  else if (a->tsId > b->tsId)
4914  return 1;
4915  /* compare relation */
4916  if (a->relNumber < b->relNumber)
4917  return -1;
4918  else if (a->relNumber > b->relNumber)
4919  return 1;
4920  /* compare fork */
4921  else if (a->forkNum < b->forkNum)
4922  return -1;
4923  else if (a->forkNum > b->forkNum)
4924  return 1;
4925  /* compare block number */
4926  else if (a->blockNum < b->blockNum)
4927  return -1;
4928  else if (a->blockNum > b->blockNum)
4929  return 1;
4930  /* equal page IDs are unlikely, but not impossible */
4931  return 0;
4932 }
int b
Definition: isn.c:70
int a
Definition: isn.c:69

References a, and b.
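A minimal standalone illustration of this ordering (a simplified struct and plain qsort() stand in for CkptSortItem and the sort_checkpoint_bufferids specialization; the OIDs and block numbers are made up):

/* Standalone sketch, not PostgreSQL code: checkpoint buffer sort order. */
#include <stdio.h>
#include <stdlib.h>

typedef struct
{
	unsigned int tsId;
	unsigned int relNumber;
	int			forkNum;
	unsigned int blockNum;
} Item;

static int
item_cmp(const void *pa, const void *pb)
{
	const Item *a = pa;
	const Item *b = pb;

	/* compare tablespace, then relation, then fork, then block number */
	if (a->tsId != b->tsId)
		return a->tsId < b->tsId ? -1 : 1;
	if (a->relNumber != b->relNumber)
		return a->relNumber < b->relNumber ? -1 : 1;
	if (a->forkNum != b->forkNum)
		return a->forkNum < b->forkNum ? -1 : 1;
	if (a->blockNum != b->blockNum)
		return a->blockNum < b->blockNum ? -1 : 1;
	return 0;
}

int
main(void)
{
	Item		items[] = {
		{1664, 16400, 0, 7},
		{1663, 16384, 0, 3},
		{1663, 16384, 0, 1},
		{1663, 16390, 1, 0},
	};
	int			n = sizeof(items) / sizeof(items[0]);

	qsort(items, n, sizeof(Item), item_cmp);

	for (int i = 0; i < n; i++)
		printf("ts=%u rel=%u fork=%d block=%u\n",
			   items[i].tsId, items[i].relNumber,
			   items[i].forkNum, items[i].blockNum);
	return 0;
}

Sorting by tablespace first is what lets BufferSync slice the sorted array per tablespace, and sorting by block number last is what makes the eventual writes largely sequential within each relation fork.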

◆ ConditionalLockBuffer()

◆ ConditionalLockBufferForCleanup()

bool ConditionalLockBufferForCleanup ( Buffer  buffer)

Definition at line 4473 of file bufmgr.c.

4474 {
4475  BufferDesc *bufHdr;
4476  uint32 buf_state,
4477  refcount;
4478 
4479  Assert(BufferIsValid(buffer));
4480 
4481  if (BufferIsLocal(buffer))
4482  {
4483  refcount = LocalRefCount[-buffer - 1];
4484  /* There should be exactly one pin */
4485  Assert(refcount > 0);
4486  if (refcount != 1)
4487  return false;
4488  /* Nobody else to wait for */
4489  return true;
4490  }
4491 
4492  /* There should be exactly one local pin */
4493  refcount = GetPrivateRefCount(buffer);
4494  Assert(refcount);
4495  if (refcount != 1)
4496  return false;
4497 
4498  /* Try to acquire lock */
4499  if (!ConditionalLockBuffer(buffer))
4500  return false;
4501 
4502  bufHdr = GetBufferDescriptor(buffer - 1);
4503  buf_state = LockBufHdr(bufHdr);
4504  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
4505 
4506  Assert(refcount > 0);
4507  if (refcount == 1)
4508  {
4509  /* Successfully acquired exclusive lock with pincount 1 */
4510  UnlockBufHdr(bufHdr, buf_state);
4511  return true;
4512  }
4513 
4514  /* Failed, so release the lock */
4515  UnlockBufHdr(bufHdr, buf_state);
4516  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4517  return false;
4518 }
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:4272
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:4246
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:110

References Assert(), BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid(), ConditionalLockBuffer(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBuffer(), LockBufHdr(), PrivateRefCountEntry::refcount, and UnlockBufHdr().

Referenced by _hash_finish_split(), _hash_getbuf_with_condlock_cleanup(), heap_page_prune_opt(), and lazy_scan_heap().
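A hedged usage sketch of the opportunistic pattern these callers follow (assumes a backend compilation context and a buffer the caller has already pinned, as in heap_page_prune_opt(); do_cleanup_work() is a hypothetical placeholder):

/* Hedged usage sketch, not code from bufmgr.c. */
#include "postgres.h"

#include "storage/bufmgr.h"

extern void do_cleanup_work(Buffer buffer); /* hypothetical helper */

static void
maybe_cleanup(Buffer buffer)
{
	/*
	 * Opportunistic: bail out at once if anyone else holds a pin, instead
	 * of blocking the way LockBufferForCleanup() would.
	 */
	if (!ConditionalLockBufferForCleanup(buffer))
		return;

	do_cleanup_work(buffer);

	/* release the cleanup (exclusive content) lock; the pin is kept */
	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
}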

◆ CreateAndCopyRelationData()

void CreateAndCopyRelationData ( RelFileLocator  src_rlocator,
RelFileLocator  dst_rlocator,
bool  permanent 
)

Definition at line 3873 of file bufmgr.c.

3875 {
3876  RelFileLocatorBackend rlocator;
3877  char relpersistence;
3878 
3879  /* Set the relpersistence. */
3880  relpersistence = permanent ?
3881  RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
3882 
3883  /*
3884  * Create and copy all forks of the relation. During create database we
3885  * have a separate cleanup mechanism which deletes complete database
3886  * directory. Therefore, each individual relation doesn't need to be
3887  * registered for cleanup.
3888  */
3889  RelationCreateStorage(dst_rlocator, relpersistence, false);
3890 
3891  /* copy main fork. */
3892  RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
3893  permanent);
3894 
3895  /* copy those extra forks that exist */
3896  for (ForkNumber forkNum = MAIN_FORKNUM + 1;
3897  forkNum <= MAX_FORKNUM; forkNum++)
3898  {
3899  if (smgrexists(smgropen(src_rlocator, InvalidBackendId), forkNum))
3900  {
3901  smgrcreate(smgropen(dst_rlocator, InvalidBackendId), forkNum, false);
3902 
3903  /*
3904  * WAL log creation if the relation is persistent, or this is the
3905  * init fork of an unlogged relation.
3906  */
3907  if (permanent || forkNum == INIT_FORKNUM)
3908  log_smgrcreate(&dst_rlocator, forkNum);
3909 
3910  /* Copy a fork's data, block by block. */
3911  RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
3912  permanent);
3913  }
3914  }
3915 
3916  /* close source and destination smgr if they exist. */
3917  rlocator.backend = InvalidBackendId;
3918 
3919  rlocator.locator = src_rlocator;
3920  smgrcloserellocator(rlocator);
3921 
3922  rlocator.locator = dst_rlocator;
3923  smgrcloserellocator(rlocator);
3924 }
#define InvalidBackendId
Definition: backendid.h:23
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition: bufmgr.c:3782
ForkNumber
Definition: relpath.h:48
@ MAIN_FORKNUM
Definition: relpath.h:50
#define MAX_FORKNUM
Definition: relpath.h:62
void smgrcloserellocator(RelFileLocatorBackend rlocator)
Definition: smgr.c:346
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition: smgr.c:369
SMgrRelation smgropen(RelFileLocator rlocator, BackendId backend)
Definition: smgr.c:146
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:247
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition: storage.c:120
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition: storage.c:185

References RelFileLocatorBackend::backend, INIT_FORKNUM, InvalidBackendId, RelFileLocatorBackend::locator, log_smgrcreate(), MAIN_FORKNUM, MAX_FORKNUM, RelationCopyStorageUsingBuffer(), RelationCreateStorage(), smgrcloserellocator(), smgrcreate(), smgrexists(), and smgropen().

Referenced by CreateDatabaseUsingWalLog().

◆ DropDatabaseBuffers()

void DropDatabaseBuffers ( Oid  dbid)

Definition at line 3484 of file bufmgr.c.

3485 {
3486  int i;
3487 
3488  /*
3489  * We needn't consider local buffers, since by assumption the target
3490  * database isn't our own.
3491  */
3492 
3493  for (i = 0; i < NBuffers; i++)
3494  {
3495  BufferDesc *bufHdr = GetBufferDescriptor(i);
3496  uint32 buf_state;
3497 
3498  /*
3499  * As in DropRelationBuffers, an unlocked precheck should be safe and
3500  * saves some cycles.
3501  */
3502  if (bufHdr->tag.dbOid != dbid)
3503  continue;
3504 
3505  buf_state = LockBufHdr(bufHdr);
3506  if (bufHdr->tag.dbOid == dbid)
3507  InvalidateBuffer(bufHdr); /* releases spinlock */
3508  else
3509  UnlockBufHdr(bufHdr, buf_state);
3510  }
3511 }
static void InvalidateBuffer(BufferDesc *buf)
Definition: bufmgr.c:1533
Oid dbOid
Definition: buf_internals.h:94

References buftag::dbOid, GetBufferDescriptor(), i, InvalidateBuffer(), LockBufHdr(), NBuffers, BufferDesc::tag, and UnlockBufHdr().

Referenced by createdb_failure_callback(), dbase_redo(), dropdb(), and movedb().

◆ DropRelationBuffers()

void DropRelationBuffers ( SMgrRelation  smgr_reln,
ForkNumber forkNum,
int  nforks,
BlockNumber firstDelBlock 
)

Definition at line 3129 of file bufmgr.c.

3131 {
3132  int i;
3133  int j;
3134  RelFileLocatorBackend rlocator;
3135  BlockNumber nForkBlock[MAX_FORKNUM];
3136  uint64 nBlocksToInvalidate = 0;
3137 
3138  rlocator = smgr_reln->smgr_rlocator;
3139 
3140  /* If it's a local relation, it's localbuf.c's problem. */
3141  if (RelFileLocatorBackendIsTemp(rlocator))
3142  {
3143  if (rlocator.backend == MyBackendId)
3144  {
3145  for (j = 0; j < nforks; j++)
3146  DropRelationLocalBuffers(rlocator.locator, forkNum[j],
3147  firstDelBlock[j]);
3148  }
3149  return;
3150  }
3151 
3152  /*
3153  * To remove all the pages of the specified relation forks from the buffer
3154  * pool, we need to scan the entire buffer pool but we can optimize it by
3155  * finding the buffers from BufMapping table provided we know the exact
3156  * size of each fork of the relation. The exact size is required to ensure
3157  * that we don't leave any buffer for the relation being dropped as
3158  * otherwise the background writer or checkpointer can lead to a PANIC
3159  * error while flushing buffers corresponding to files that don't exist.
3160  *
3161  * To know the exact size, we rely on the size cached for each fork by us
3162  * during recovery which limits the optimization to recovery and on
3163  * standbys but we can easily extend it once we have shared cache for
3164  * relation size.
3165  *
3166  * In recovery, we cache the value returned by the first lseek(SEEK_END)
3167  * and future writes keep the cached value up-to-date. See
3168  * smgrextend. It is possible that the value of the first lseek is smaller
3169  * than the actual number of existing blocks in the file due to buggy
3170  * Linux kernels that might not have accounted for the recent write. But
3171  * that should be fine because there must not be any buffers after that
3172  * file size.
3173  */
3174  for (i = 0; i < nforks; i++)
3175  {
3176  /* Get the number of blocks for a relation's fork */
3177  nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
3178 
3179  if (nForkBlock[i] == InvalidBlockNumber)
3180  {
3181  nBlocksToInvalidate = InvalidBlockNumber;
3182  break;
3183  }
3184 
3185  /* calculate the number of blocks to be invalidated */
3186  nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
3187  }
3188 
3189  /*
3190  * We apply the optimization iff the total number of blocks to invalidate
3191  * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
3192  */
3193  if (BlockNumberIsValid(nBlocksToInvalidate) &&
3194  nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
3195  {
3196  for (j = 0; j < nforks; j++)
3197  FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
3198  nForkBlock[j], firstDelBlock[j]);
3199  return;
3200  }
3201 
3202  for (i = 0; i < NBuffers; i++)
3203  {
3204  BufferDesc *bufHdr = GetBufferDescriptor(i);
3205  uint32 buf_state;
3206 
3207  /*
3208  * We can make this a tad faster by prechecking the buffer tag before
3209  * we attempt to lock the buffer; this saves a lot of lock
3210  * acquisitions in typical cases. It should be safe because the
3211  * caller must have AccessExclusiveLock on the relation, or some other
3212  * reason to be certain that no one is loading new pages of the rel
3213  * into the buffer pool. (Otherwise we might well miss such pages
3214  * entirely.) Therefore, while the tag might be changing while we
3215  * look at it, it can't be changing *to* a value we care about, only
3216  * *away* from such a value. So false negatives are impossible, and
3217  * false positives are safe because we'll recheck after getting the
3218  * buffer lock.
3219  *
3220  * We could check forkNum and blockNum as well as the rlocator, but
3221  * the incremental win from doing so seems small.
3222  */
3223  if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
3224  continue;
3225 
3226  buf_state = LockBufHdr(bufHdr);
3227 
3228  for (j = 0; j < nforks; j++)
3229  {
3230  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
3231  BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
3232  bufHdr->tag.blockNum >= firstDelBlock[j])
3233  {
3234  InvalidateBuffer(bufHdr); /* releases spinlock */
3235  break;
3236  }
3237  }
3238  if (j >= nforks)
3239  UnlockBufHdr(bufHdr, buf_state);
3240  }
3241 }
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition: bufmgr.c:81
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition: bufmgr.c:3423
BackendId MyBackendId
Definition: globals.c:85
int j
Definition: isn.c:74
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber firstDelBlock)
Definition: localbuf.c:336
#define RelFileLocatorBackendIsTemp(rlocator)
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:603

References RelFileLocatorBackend::backend, buftag::blockNum, BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetForkNum(), BufTagMatchesRelFileLocator(), DropRelationLocalBuffers(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, InvalidateBuffer(), InvalidBlockNumber, j, RelFileLocatorBackend::locator, LockBufHdr(), MAX_FORKNUM, MyBackendId, NBuffers, RelFileLocatorBackendIsTemp, SMgrRelationData::smgr_rlocator, smgrnblocks_cached(), BufferDesc::tag, and UnlockBufHdr().

Referenced by smgrtruncate().
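The "unlocked precheck, then lock and recheck" idiom described in the scan-loop comment above can be sketched outside PostgreSQL as follows (standalone C; a pthread mutex stands in for the buffer-header spinlock, and the tags and slots are made up). The point is that the racy read only filters candidates: nothing is invalidated until the match is confirmed under the lock.

/* Standalone sketch, not PostgreSQL code: unlocked precheck + locked recheck. */
#include <pthread.h>
#include <stdio.h>

typedef struct
{
	pthread_mutex_t lock;
	int			tag;			/* stand-in for the buffer tag */
} Slot;

static int
drop_matching(Slot *slots, int nslots, int target)
{
	int			dropped = 0;

	for (int i = 0; i < nslots; i++)
	{
		/* unlocked precheck: may be stale, but only filters, never decides */
		if (slots[i].tag != target)
			continue;

		pthread_mutex_lock(&slots[i].lock);
		if (slots[i].tag == target) /* recheck under the lock */
		{
			slots[i].tag = -1;		/* "invalidate" */
			dropped++;
		}
		pthread_mutex_unlock(&slots[i].lock);
	}
	return dropped;
}

int
main(void)
{
	Slot		slots[4];

	for (int i = 0; i < 4; i++)
	{
		pthread_mutex_init(&slots[i].lock, NULL);
		slots[i].tag = (i % 2) ? 42 : 7;
	}
	printf("dropped %d slots tagged 42\n", drop_matching(slots, 4, 42));
	return 0;
}

As the comment explains, the precheck is only safe here because the caller guarantees (via AccessExclusiveLock on the relation) that no concurrent process can change a tag *to* a value we care about while the scan runs.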

◆ DropRelationsAllBuffers()

void DropRelationsAllBuffers ( SMgrRelation smgr_reln,
int  nlocators 
)

Definition at line 3252 of file bufmgr.c.

3253 {
3254  int i;
3255  int n = 0;
3256  SMgrRelation *rels;
3257  BlockNumber (*block)[MAX_FORKNUM + 1];
3258  uint64 nBlocksToInvalidate = 0;
3259  RelFileLocator *locators;
3260  bool cached = true;
3261  bool use_bsearch;
3262 
3263  if (nlocators == 0)
3264  return;
3265 
3266  rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
3267 
3268  /* If it's a local relation, it's localbuf.c's problem. */
3269  for (i = 0; i < nlocators; i++)
3270  {
3271  if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
3272  {
3273  if (smgr_reln[i]->smgr_rlocator.backend == MyBackendId)
3274  DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
3275  }
3276  else
3277  rels[n++] = smgr_reln[i];
3278  }
3279 
3280  /*
3281  * If there are no non-local relations, then we're done. Release the
3282  * memory and return.
3283  */
3284  if (n == 0)
3285  {
3286  pfree(rels);
3287  return;
3288  }
3289 
3290  /*
3291  * This is used to remember the number of blocks for all the relations
3292  * forks.
3293  */
3294  block = (BlockNumber (*)[MAX_FORKNUM + 1])
3295  palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
3296 
3297  /*
3298  * We can avoid scanning the entire buffer pool if we know the exact size
3299  * of each of the given relation forks. See DropRelationBuffers.
3300  */
3301  for (i = 0; i < n && cached; i++)
3302  {
3303  for (int j = 0; j <= MAX_FORKNUM; j++)
3304  {
3305  /* Get the number of blocks for a relation's fork. */
3306  block[i][j] = smgrnblocks_cached(rels[i], j);
3307 
3308  /* We need to only consider the relation forks that exist. */
3309  if (block[i][j] == InvalidBlockNumber)
3310  {
3311  if (!smgrexists(rels[i], j))
3312  continue;
3313  cached = false;
3314  break;
3315  }
3316 
3317  /* calculate the total number of blocks to be invalidated */
3318  nBlocksToInvalidate += block[i][j];
3319  }
3320  }
3321 
3322  /*
3323  * We apply the optimization iff the total number of blocks to invalidate
3324  * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
3325  */
3326  if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
3327  {
3328  for (i = 0; i < n; i++)
3329  {
3330  for (int j = 0; j <= MAX_FORKNUM; j++)
3331  {
3332  /* ignore relation forks that don't exist */
3333  if (!BlockNumberIsValid(block[i][j]))
3334  continue;
3335 
3336  /* drop all the buffers for a particular relation fork */
3337  FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
3338  j, block[i][j], 0);
3339  }
3340  }
3341 
3342  pfree(block);
3343  pfree(rels);
3344  return;
3345  }
3346 
3347  pfree(block);
3348  locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
3349  for (i = 0; i < n; i++)
3350  locators[i] = rels[i]->smgr_rlocator.locator;
3351 
3352  /*
3353  * For low number of relations to drop just use a simple walk through, to
3354  * save the bsearch overhead. The threshold to use is rather a guess than
3355  * an exactly determined value, as it depends on many factors (CPU and RAM
3356  * speeds, amount of shared buffers etc.).
3357  */
3358  use_bsearch = n > RELS_BSEARCH_THRESHOLD;
3359 
3360  /* sort the list of rlocators if necessary */
3361  if (use_bsearch)
3362  pg_qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
3363 
3364  for (i = 0; i < NBuffers; i++)
3365  {
3366  RelFileLocator *rlocator = NULL;
3367  BufferDesc *bufHdr = GetBufferDescriptor(i);
3368  uint32 buf_state;
3369 
3370  /*
3371  * As in DropRelationBuffers, an unlocked precheck should be safe and
3372  * saves some cycles.
3373  */
3374 
3375  if (!use_bsearch)
3376  {
3377  int j;
3378 
3379  for (j = 0; j < n; j++)
3380  {
3381  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
3382  {
3383  rlocator = &locators[j];
3384  break;
3385  }
3386  }
3387  }
3388  else
3389  {
3390  RelFileLocator locator;
3391 
3392  locator = BufTagGetRelFileLocator(&bufHdr->tag);
3393  rlocator = bsearch((const void *) &(locator),
3394  locators, n, sizeof(RelFileLocator),
3395  rlocator_comparator);
3396  }
3397 
3398  /* buffer doesn't belong to any of the given relfilelocators; skip it */
3399  if (rlocator == NULL)
3400  continue;
3401 
3402  buf_state = LockBufHdr(bufHdr);
3403  if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
3404  InvalidateBuffer(bufHdr); /* releases spinlock */
3405  else
3406  UnlockBufHdr(bufHdr, buf_state);
3407  }
3408 
3409  pfree(locators);
3410  pfree(rels);
3411 }
#define RELS_BSEARCH_THRESHOLD
Definition: bufmgr.c:73
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition: localbuf.c:384
void pg_qsort(void *base, size_t nel, size_t elsize, int(*cmp)(const void *, const void *))

References BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), DropRelationAllLocalBuffers(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, if(), InvalidateBuffer(), InvalidBlockNumber, j, LockBufHdr(), MAX_FORKNUM, MyBackendId, NBuffers, palloc(), pfree(), pg_qsort(), RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, rlocator_comparator(), smgrexists(), smgrnblocks_cached(), BufferDesc::tag, and UnlockBufHdr().

Referenced by smgrdounlinkall().

◆ FindAndDropRelationBuffers()

static void FindAndDropRelationBuffers ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  nForkBlock,
BlockNumber  firstDelBlock 
)
static

Definition at line 3423 of file bufmgr.c.

3426 {
3427  BlockNumber curBlock;
3428 
3429  for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
3430  {
3431  uint32 bufHash; /* hash value for tag */
3432  BufferTag bufTag; /* identity of requested block */
3433  LWLock *bufPartitionLock; /* buffer partition lock for it */
3434  int buf_id;
3435  BufferDesc *bufHdr;
3436  uint32 buf_state;
3437 
3438  /* create a tag so we can lookup the buffer */
3439  InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
3440 
3441  /* determine its hash code and partition lock ID */
3442  bufHash = BufTableHashCode(&bufTag);
3443  bufPartitionLock = BufMappingPartitionLock(bufHash);
3444 
3445  /* Check that it is in the buffer pool. If not, do nothing. */
3446  LWLockAcquire(bufPartitionLock, LW_SHARED);
3447  buf_id = BufTableLookup(&bufTag, bufHash);
3448  LWLockRelease(bufPartitionLock);
3449 
3450  if (buf_id < 0)
3451  continue;
3452 
3453  bufHdr = GetBufferDescriptor(buf_id);
3454 
3455  /*
3456  * We need to lock the buffer header and recheck if the buffer is
3457  * still associated with the same block because the buffer could be
3458  * evicted by some other backend loading blocks for a different
3459  * relation after we release lock on the BufMapping table.
3460  */
3461  buf_state = LockBufHdr(bufHdr);
3462 
3463  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
3464  BufTagGetForkNum(&bufHdr->tag) == forkNum &&
3465  bufHdr->tag.blockNum >= firstDelBlock)
3466  InvalidateBuffer(bufHdr); /* releases spinlock */
3467  else
3468  UnlockBufHdr(bufHdr, buf_state);
3469  }
3470 }

References buftag::blockNum, BufMappingPartitionLock(), BufTableHashCode(), BufTableLookup(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), GetBufferDescriptor(), InitBufferTag(), InvalidateBuffer(), LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), BufferDesc::tag, and UnlockBufHdr().

Referenced by DropRelationBuffers(), and DropRelationsAllBuffers().

◆ FlushBuffer()

static void FlushBuffer ( BufferDesc buf,
SMgrRelation  reln,
IOObject  io_object,
IOContext  io_context 
)
static

Definition at line 2871 of file bufmgr.c.

2873 {
2874  XLogRecPtr recptr;
2875  ErrorContextCallback errcallback;
2876  instr_time io_start,
2877  io_time;
2878  Block bufBlock;
2879  char *bufToWrite;
2880  uint32 buf_state;
2881 
2882  /*
2883  * Try to start an I/O operation. If StartBufferIO returns false, then
2884  * someone else flushed the buffer before we could, so we need not do
2885  * anything.
2886  */
2887  if (!StartBufferIO(buf, false))
2888  return;
2889 
2890  /* Setup error traceback support for ereport() */
2891  errcallback.callback = shared_buffer_write_error_callback;
2892  errcallback.arg = (void *) buf;
2893  errcallback.previous = error_context_stack;
2894  error_context_stack = &errcallback;
2895 
2896  /* Find smgr relation for buffer */
2897  if (reln == NULL)
2898  reln = smgropen(BufTagGetRelFileLocator(&buf->tag), InvalidBackendId);
2899 
2900  TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
2901  buf->tag.blockNum,
2902  reln->smgr_rlocator.locator.spcOid,
2903  reln->smgr_rlocator.locator.dbOid,
2904  reln->smgr_rlocator.locator.relNumber);
2905 
2906  buf_state = LockBufHdr(buf);
2907 
2908  /*
2909  * Run PageGetLSN while holding header lock, since we don't have the
2910  * buffer locked exclusively in all cases.
2911  */
2912  recptr = BufferGetLSN(buf);
2913 
2914  /* To check if block content changes while flushing. - vadim 01/17/97 */
2915  buf_state &= ~BM_JUST_DIRTIED;
2916  UnlockBufHdr(buf, buf_state);
2917 
2918  /*
2919  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
2920  * rule that log updates must hit disk before any of the data-file changes
2921  * they describe do.
2922  *
2923  * However, this rule does not apply to unlogged relations, which will be
2924  * lost after a crash anyway. Most unlogged relation pages do not bear
2925  * LSNs since we never emit WAL records for them, and therefore flushing
2926  * up through the buffer LSN would be useless, but harmless. However,
2927  * GiST indexes use LSNs internally to track page-splits, and therefore
2928  * unlogged GiST pages bear "fake" LSNs generated by
2929  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
2930  * LSN counter could advance past the WAL insertion point; and if it did
2931  * happen, attempting to flush WAL through that location would fail, with
2932  * disastrous system-wide consequences. To make sure that can't happen,
2933  * skip the flush if the buffer isn't permanent.
2934  */
2935  if (buf_state & BM_PERMANENT)
2936  XLogFlush(recptr);
2937 
2938  /*
2939  * Now it's safe to write buffer to disk. Note that no one else should
2940  * have been able to write it while we were busy with log flushing because
2941  * only one process at a time can set the BM_IO_IN_PROGRESS bit.
2942  */
2943  bufBlock = BufHdrGetBlock(buf);
2944 
2945  /*
2946  * Update page checksum if desired. Since we have only shared lock on the
2947  * buffer, other processes might be updating hint bits in it, so we must
2948  * copy the page to private storage if we do checksumming.
2949  */
2950  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
2951 
2952  if (track_io_timing)
2953  INSTR_TIME_SET_CURRENT(io_start);
2954  else
2955  INSTR_TIME_SET_ZERO(io_start);
2956 
2957  /*
2958  * bufToWrite is either the shared buffer or a copy, as appropriate.
2959  */
2960  smgrwrite(reln,
2961  BufTagGetForkNum(&buf->tag),
2962  buf->tag.blockNum,
2963  bufToWrite,
2964  false);
2965 
2966  /*
2967  * When a strategy is in use, only flushes of dirty buffers already in the
2968  * strategy ring are counted as strategy writes (IOCONTEXT
2969  * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
2970  * statistics tracking.
2971  *
2972  * If a shared buffer initially added to the ring must be flushed before
2973  * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
2974  *
2975  * If a shared buffer which was added to the ring later because the
2976  * current strategy buffer is pinned or in use or because all strategy
2977  * buffers were dirty and rejected (for BAS_BULKREAD operations only)
2978  * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
2979  * (from_ring will be false).
2980  *
2981  * When a strategy is not in use, the write can only be a "regular" write
2982  * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
2983  */
2984  pgstat_count_io_op(IOOBJECT_RELATION, io_context, IOOP_WRITE);
2985 
2986  if (track_io_timing)
2987  {
2988  INSTR_TIME_SET_CURRENT(io_time);
2989  INSTR_TIME_SUBTRACT(io_time, io_start);
2990  pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
2991  INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
2992  }
2993 
2994  pgBufferUsage.shared_blks_written++;
2995 
2996  /*
2997  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
2998  * end the BM_IO_IN_PROGRESS state.
2999  */
3000  TerminateBufferIO(buf, true, 0);
3001 
3002  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
3003  buf->tag.blockNum,
3004  reln->smgr_rlocator.locator.spcOid,
3005  reln->smgr_rlocator.locator.dbOid,
3006  reln->smgr_rlocator.locator.relNumber);
3007 
3008  /* Pop the error context stack */
3009  error_context_stack = errcallback.previous;
3010 }
bool track_io_timing
Definition: bufmgr.c:137
#define BufHdrGetBlock(bufHdr)
Definition: bufmgr.c:62
static void shared_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:4755
void * Block
Definition: bufmgr.h:24
char * PageSetChecksumCopy(Page page, BlockNumber blkno)
Definition: bufpage.c:1510
Pointer Page
Definition: bufpage.h:78
ErrorContextCallback * error_context_stack
Definition: elog.c:95
#define INSTR_TIME_SET_CURRENT(t)
Definition: instr_time.h:122
#define INSTR_TIME_ADD(x, y)
Definition: instr_time.h:178
#define INSTR_TIME_SUBTRACT(x, y)
Definition: instr_time.h:181
#define INSTR_TIME_GET_MICROSEC(t)
Definition: instr_time.h:194
#define INSTR_TIME_SET_ZERO(t)
Definition: instr_time.h:172
BufferUsage pgBufferUsage
Definition: instrument.c:20
#define pgstat_count_buffer_write_time(n)
Definition: pgstat.h:526
@ IOOP_WRITE
Definition: pgstat.h:300
void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.c:554
instr_time blk_write_time
Definition: instrument.h:37
int64 shared_blks_written
Definition: instrument.h:29
struct ErrorContextCallback * previous
Definition: elog.h:295
void(* callback)(void *arg)
Definition: elog.h:296
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2514

References ErrorContextCallback::arg, BufferUsage::blk_write_time, BM_JUST_DIRTIED, BM_PERMANENT, buf, BufferGetLSN, BufHdrGetBlock, BufTagGetForkNum(), BufTagGetRelFileLocator(), ErrorContextCallback::callback, RelFileLocator::dbOid, error_context_stack, INSTR_TIME_ADD, INSTR_TIME_GET_MICROSEC, INSTR_TIME_SET_CURRENT, INSTR_TIME_SET_ZERO, INSTR_TIME_SUBTRACT, InvalidBackendId, IOOBJECT_RELATION, IOOP_WRITE, RelFileLocatorBackend::locator, LockBufHdr(), PageSetChecksumCopy(), pgBufferUsage, pgstat_count_buffer_write_time, pgstat_count_io_op(), ErrorContextCallback::previous, RelFileLocator::relNumber, BufferUsage::shared_blks_written, shared_buffer_write_error_callback(), SMgrRelationData::smgr_rlocator, smgropen(), smgrwrite(), RelFileLocator::spcOid, StartBufferIO(), TerminateBufferIO(), track_io_timing, UnlockBufHdr(), and XLogFlush().

Referenced by BufferAlloc(), FlushDatabaseBuffers(), FlushOneBuffer(), FlushRelationBuffers(), FlushRelationsAllBuffers(), and SyncOneBuffer().

◆ FlushDatabaseBuffers()

void FlushDatabaseBuffers ( Oid  dbid)

Definition at line 3942 of file bufmgr.c.

3943 {
3944  int i;
3945  BufferDesc *bufHdr;
3946 
3947  /* Make sure we can handle the pin inside the loop */
3948  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3949 
3950  for (i = 0; i < NBuffers; i++)
3951  {
3952  uint32 buf_state;
3953 
3954  bufHdr = GetBufferDescriptor(i);
3955 
3956  /*
3957  * As in DropRelationBuffers, an unlocked precheck should be safe and
3958  * saves some cycles.
3959  */
3960  if (bufHdr->tag.dbOid != dbid)
3961  continue;
3962 
3963  ReservePrivateRefCountEntry();
3964 
3965  buf_state = LockBufHdr(bufHdr);
3966  if (bufHdr->tag.dbOid == dbid &&
3967  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3968  {
3969  PinBuffer_Locked(bufHdr);
3970  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3971  FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
3972  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3973  UnpinBuffer(bufHdr);
3974  }
3975  else
3976  UnlockBufHdr(bufHdr, buf_state);
3977  }
3978 }
@ IOCONTEXT_NORMAL
Definition: pgstat.h:287

References BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock(), CurrentResourceOwner, buftag::dbOid, FlushBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by dbase_redo().

◆ FlushOneBuffer()

void FlushOneBuffer ( Buffer  buffer)

Definition at line 3985 of file bufmgr.c.

3986 {
3987  BufferDesc *bufHdr;
3988 
3989  /* currently not needed, but no fundamental reason not to support */
3990  Assert(!BufferIsLocal(buffer));
3991 
3992  Assert(BufferIsPinned(buffer));
3993 
3994  bufHdr = GetBufferDescriptor(buffer - 1);
3995 
3997 
3999 }
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1919

References Assert(), PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, FlushBuffer(), GetBufferDescriptor(), IOCONTEXT_NORMAL, IOOBJECT_RELATION, and LWLockHeldByMe().

Referenced by hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), and XLogReadBufferForRedoExtended().

◆ FlushRelationBuffers()

void FlushRelationBuffers ( Relation  rel)

Definition at line 3590 of file bufmgr.c.

3591 {
3592  int i;
3593  BufferDesc *bufHdr;
3594 
3595  if (RelationUsesLocalBuffers(rel))
3596  {
3597  for (i = 0; i < NLocBuffer; i++)
3598  {
3599  uint32 buf_state;
3600 
3601  bufHdr = GetLocalBufferDescriptor(i);
3602  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
3603  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
3604  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3605  {
3606  ErrorContextCallback errcallback;
3607  Page localpage;
3608 
3609  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
3610 
3611  /* Setup error traceback support for ereport() */
3612  errcallback.callback = local_buffer_write_error_callback;
3613  errcallback.arg = (void *) bufHdr;
3614  errcallback.previous = error_context_stack;
3615  error_context_stack = &errcallback;
3616 
3617  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
3618 
3619  smgrwrite(RelationGetSmgr(rel),
3620  BufTagGetForkNum(&bufHdr->tag),
3621  bufHdr->tag.blockNum,
3622  localpage,
3623  false);
3624 
3625  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
3626  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
3627 
3628  pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE);
3629 
3630  /* Pop the error context stack */
3631  error_context_stack = errcallback.previous;
3632  }
3633  }
3634 
3635  return;
3636  }
3637 
3638  /* Make sure we can handle the pin inside the loop */
3639  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3640 
3641  for (i = 0; i < NBuffers; i++)
3642  {
3643  uint32 buf_state;
3644 
3645  bufHdr = GetBufferDescriptor(i);
3646 
3647  /*
3648  * As in DropRelationBuffers, an unlocked precheck should be safe and
3649  * saves some cycles.
3650  */
3651  if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
3652  continue;
3653 
3654  ReservePrivateRefCountEntry();
3655 
3656  buf_state = LockBufHdr(bufHdr);
3657  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
3658  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3659  {
3660  PinBuffer_Locked(bufHdr);
3661  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3662  FlushBuffer(bufHdr, RelationGetSmgr(rel), IOOBJECT_RELATION, IOCONTEXT_NORMAL);
3663  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3664  UnpinBuffer(bufHdr);
3665  }
3666  else
3667  UnlockBufHdr(bufHdr, buf_state);
3668  }
3669 }
static void pg_atomic_unlocked_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:272
#define LocalBufHdrGetBlock(bufHdr)
Definition: bufmgr.c:66
static void local_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:4775
void PageSetChecksumInplace(Page page, BlockNumber blkno)
Definition: bufpage.c:1539
int NLocBuffer
Definition: localbuf.c:42
@ IOOBJECT_TEMP_RELATION
Definition: pgstat.h:278
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:571
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:637
RelFileLocator rd_locator
Definition: rel.h:56

References ErrorContextCallback::arg, buftag::blockNum, BM_DIRTY, BM_JUST_DIRTIED, BM_VALID, BufferDescriptorGetContentLock(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), ErrorContextCallback::callback, CurrentResourceOwner, error_context_stack, FlushBuffer(), GetBufferDescriptor(), GetLocalBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_WRITE, local_buffer_write_error_callback(), LocalBufHdrGetBlock, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, NLocBuffer, PageSetChecksumInplace(), pg_atomic_read_u32(), pg_atomic_unlocked_write_u32(), pgstat_count_io_op(), PinBuffer_Locked(), ErrorContextCallback::previous, RelationData::rd_locator, RelationGetSmgr(), RelationUsesLocalBuffers, ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), smgrwrite(), BufferDesc::state, BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by fill_seq_with_data(), heapam_relation_copy_data(), and index_copy_data().

◆ FlushRelationsAllBuffers()

void FlushRelationsAllBuffers ( SMgrRelation smgrs,
int  nrels 
)

Definition at line 3681 of file bufmgr.c.

3682 {
3683  int i;
3684  SMgrSortArray *srels;
3685  bool use_bsearch;
3686 
3687  if (nrels == 0)
3688  return;
3689 
3690  /* fill-in array for qsort */
3691  srels = palloc(sizeof(SMgrSortArray) * nrels);
3692 
3693  for (i = 0; i < nrels; i++)
3694  {
3695  Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
3696 
3697  srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
3698  srels[i].srel = smgrs[i];
3699  }
3700 
3701  /*
3702  * Save the bsearch overhead for low number of relations to sync. See
3703  * DropRelationsAllBuffers for details.
3704  */
3705  use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
3706 
3707  /* sort the list of SMgrRelations if necessary */
3708  if (use_bsearch)
3709  pg_qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
3710 
3711  /* Make sure we can handle the pin inside the loop */
3712  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3713 
3714  for (i = 0; i < NBuffers; i++)
3715  {
3716  SMgrSortArray *srelent = NULL;
3717  BufferDesc *bufHdr = GetBufferDescriptor(i);
3718  uint32 buf_state;
3719 
3720  /*
3721  * As in DropRelationBuffers, an unlocked precheck should be safe and
3722  * saves some cycles.
3723  */
3724 
3725  if (!use_bsearch)
3726  {
3727  int j;
3728 
3729  for (j = 0; j < nrels; j++)
3730  {
3731  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
3732  {
3733  srelent = &srels[j];
3734  break;
3735  }
3736  }
3737  }
3738  else
3739  {
3740  RelFileLocator rlocator;
3741 
3742  rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
3743  srelent = bsearch((const void *) &(rlocator),
3744  srels, nrels, sizeof(SMgrSortArray),
3745  rlocator_comparator);
3746  }
3747 
3748  /* buffer doesn't belong to any of the given relfilelocators; skip it */
3749  if (srelent == NULL)
3750  continue;
3751 
3752  ReservePrivateRefCountEntry();
3753 
3754  buf_state = LockBufHdr(bufHdr);
3755  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
3756  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3757  {
3758  PinBuffer_Locked(bufHdr);
3759  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3760  FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
3761  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3762  UnpinBuffer(bufHdr);
3763  }
3764  else
3765  UnlockBufHdr(bufHdr, buf_state);
3766  }
3767 
3768  pfree(srels);
3769 }
SMgrRelation srel
Definition: bufmgr.c:130
RelFileLocator rlocator
Definition: bufmgr.c:129

References Assert(), BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock(), BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), CurrentResourceOwner, FlushBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, j, RelFileLocatorBackend::locator, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, palloc(), pfree(), pg_qsort(), PinBuffer_Locked(), RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), SMgrSortArray::rlocator, rlocator_comparator(), SMgrRelationData::smgr_rlocator, SMgrSortArray::srel, BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by smgrdosyncall().

◆ ForgetPrivateRefCountEntry()

static void ForgetPrivateRefCountEntry ( PrivateRefCountEntry ref)
static

Definition at line 406 of file bufmgr.c.

407 {
408  Assert(ref->refcount == 0);
409 
410  if (ref >= &PrivateRefCountArray[0] &&
411  ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
412  {
413  ref->buffer = InvalidBuffer;
414 
415  /*
416  * Mark the just used entry as reserved - in many scenarios that
417  * allows us to avoid ever having to search the array/hash for free
418  * entries.
419  */
420  ReservedRefCountEntry = ref;
421  }
422  else
423  {
424  bool found;
425  Buffer buffer = ref->buffer;
426 
427  hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
428  Assert(found);
429  Assert(PrivateRefCountOverflowed > 0);
430  PrivateRefCountOverflowed--;
431  }
432 }
int Buffer
Definition: buf.h:23
static PrivateRefCountEntry * ReservedRefCountEntry
Definition: bufmgr.c:203
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:953
@ HASH_REMOVE
Definition: hsearch.h:115

References Assert(), PrivateRefCountEntry::buffer, HASH_REMOVE, hash_search(), InvalidBuffer, PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, REFCOUNT_ARRAY_ENTRIES, and ReservedRefCountEntry.

Referenced by UnpinBuffer().

◆ GetPrivateRefCount()

static int32 GetPrivateRefCount ( Buffer  buffer)
inlinestatic

Definition at line 383 of file bufmgr.c.

384 {
385  PrivateRefCountEntry *ref;
386 
387  Assert(BufferIsValid(buffer));
388  Assert(!BufferIsLocal(buffer));
389 
390  /*
391  * Not moving the entry - that's ok for the current users, but we might
392  * want to change this one day.
393  */
394  ref = GetPrivateRefCountEntry(buffer, false);
395 
396  if (ref == NULL)
397  return 0;
398  return ref->refcount;
399 }
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition: bufmgr.c:309

References Assert(), PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), GetPrivateRefCountEntry(), and PrivateRefCountEntry::refcount.

Referenced by ConditionalLockBufferForCleanup(), HoldingBufferPinThatDelaysRecovery(), InvalidateBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkBufferDirtyHint(), PrintBufferLeakWarning(), and ReadRecentBuffer().

◆ GetPrivateRefCountEntry()

static PrivateRefCountEntry * GetPrivateRefCountEntry ( Buffer  buffer,
bool  do_move 
)
static

Definition at line 309 of file bufmgr.c.

310 {
311  PrivateRefCountEntry *res;
312  int i;
313 
314  Assert(BufferIsValid(buffer));
315  Assert(!BufferIsLocal(buffer));
316 
317  /*
318  * First search for references in the array, that'll be sufficient in the
319  * majority of cases.
320  */
321  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
322  {
323  res = &PrivateRefCountArray[i];
324 
325  if (res->buffer == buffer)
326  return res;
327  }
328 
329  /*
330  * By here we know that the buffer, if already pinned, isn't residing in
331  * the array.
332  *
333  * Only look up the buffer in the hashtable if we've previously overflowed
334  * into it.
335  */
336  if (PrivateRefCountOverflowed == 0)
337  return NULL;
338 
339  res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
340 
341  if (res == NULL)
342  return NULL;
343  else if (!do_move)
344  {
345  /* caller doesn't want us to move the hash entry into the array */
346  return res;
347  }
348  else
349  {
350  /* move buffer from hashtable into the free array slot */
351  bool found;
352  PrivateRefCountEntry *free;
353 
354  /* Ensure there's a free array slot */
355  ReservePrivateRefCountEntry();
356 
357  /* Use up the reserved slot */
358  Assert(ReservedRefCountEntry != NULL);
359  free = ReservedRefCountEntry;
360  ReservedRefCountEntry = NULL;
361  Assert(free->buffer == InvalidBuffer);
362 
363  /* and fill it */
364  free->buffer = buffer;
365  free->refcount = res->refcount;
366 
367  /* delete from hashtable */
368  hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
369  Assert(found);
370  Assert(PrivateRefCountOverflowed > 0);
371  PrivateRefCountOverflowed--;
372 
373  return free;
374  }
375 }
#define free(a)
Definition: header.h:65
@ HASH_FIND
Definition: hsearch.h:113

References Assert(), PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), free, HASH_FIND, HASH_REMOVE, hash_search(), i, InvalidBuffer, PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, res, ReservedRefCountEntry, and ReservePrivateRefCountEntry().

Referenced by GetPrivateRefCount(), IncrBufferRefCount(), PinBuffer(), PinBuffer_Locked(), and UnpinBuffer().
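A standalone sketch (not PostgreSQL code) of the same two-level idea: a small fixed array covers the common case of few simultaneously pinned buffers, and the slower overflow structure is consulted only when the overflow counter says something was ever pushed there. A linearly scanned list stands in for the dynahash table, and the spill policy is simplified (no reservation or move-back into the array).

/* Standalone sketch, not PostgreSQL code: array-first refcount lookup. */
#include <stdio.h>

#define ARRAY_ENTRIES 8
#define MAX_OVERFLOW 64

typedef struct
{
	int			buffer;			/* 0 means "unused slot" */
	int			refcount;
} Entry;

static Entry array_entries[ARRAY_ENTRIES];
static Entry overflow[MAX_OVERFLOW];
static int	overflowed;			/* entries pushed to the overflow area */

static Entry *
get_entry(int buffer)
{
	/* fast path: the small array is sufficient for most callers */
	for (int i = 0; i < ARRAY_ENTRIES; i++)
		if (array_entries[i].buffer == buffer)
			return &array_entries[i];

	/* consult the slow structure only if something ever overflowed into it */
	if (overflowed == 0)
		return NULL;
	for (int i = 0; i < overflowed; i++)
		if (overflow[i].buffer == buffer)
			return &overflow[i];
	return NULL;
}

static void
pin(int buffer)
{
	Entry	   *e = get_entry(buffer);

	if (e == NULL)
	{
		for (int i = 0; i < ARRAY_ENTRIES; i++)
			if (array_entries[i].buffer == 0)
			{
				e = &array_entries[i];
				break;
			}
		if (e == NULL)
			e = &overflow[overflowed++];	/* array full: spill */
		e->buffer = buffer;
		e->refcount = 0;
	}
	e->refcount++;
}

int
main(void)
{
	for (int b = 1; b <= 10; b++)
		pin(b);					/* buffers 9 and 10 spill past the array */
	printf("buffer 3 refcount=%d, buffer 10 refcount=%d, overflowed=%d\n",
		   get_entry(3)->refcount, get_entry(10)->refcount, overflowed);
	return 0;
}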

◆ HoldingBufferPinThatDelaysRecovery()

bool HoldingBufferPinThatDelaysRecovery ( void  )

Definition at line 4447 of file bufmgr.c.

4448 {
4449  int bufid = GetStartupBufferPinWaitBufId();
4450 
4451  /*
4452  * If we get woken slowly then it's possible that the Startup process was
4453  * already woken by other backends before we got here. Also possible that
4454  * we get here by multiple interrupts or interrupts at inappropriate
4455  * times, so make sure we do nothing if the bufid is not set.
4456  */
4457  if (bufid < 0)
4458  return false;
4459 
4460  if (GetPrivateRefCount(bufid + 1) > 0)
4461  return true;
4462 
4463  return false;
4464 }
int GetStartupBufferPinWaitBufId(void)
Definition: proc.c:639

References GetPrivateRefCount(), and GetStartupBufferPinWaitBufId().

Referenced by CheckRecoveryConflictDeadlock(), and RecoveryConflictInterrupt().

◆ IncrBufferRefCount()

void IncrBufferRefCount ( Buffer  buffer)

Definition at line 4043 of file bufmgr.c.

4044 {
4045  Assert(BufferIsPinned(buffer));
4046  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
4047  if (BufferIsLocal(buffer))
4048  LocalRefCount[-buffer - 1]++;
4049  else
4050  {
4051  PrivateRefCountEntry *ref;
4052 
4053  ref = GetPrivateRefCountEntry(buffer, true);
4054  Assert(ref != NULL);
4055  ref->refcount++;
4056  }
4057  ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
4058 }
void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
Definition: resowner.c:963

References Assert(), PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, CurrentResourceOwner, GetPrivateRefCountEntry(), LocalRefCount, PrivateRefCountEntry::refcount, ResourceOwnerEnlargeBuffers(), and ResourceOwnerRememberBuffer().

Referenced by _bt_steppage(), btrestrpos(), entryLoadMoreItems(), ReadBufferBI(), scanPostingTree(), startScanEntry(), and tts_buffer_heap_store_tuple().

◆ InitBufferPoolAccess()

void InitBufferPoolAccess ( void  )

Definition at line 2659 of file bufmgr.c.

2660 {
2661  HASHCTL hash_ctl;
2662 
2663  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
2664 
2665  hash_ctl.keysize = sizeof(int32);
2666  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
2667 
2668  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
2669  HASH_ELEM | HASH_BLOBS);
2670 
2671  /*
2672  * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
2673  * the corresponding phase of backend shutdown.
2674  */
2675  Assert(MyProc != NULL);
2676  on_shmem_exit(AtProcExit_Buffers, 0);
2677 }
static void AtProcExit_Buffers(int code, Datum arg)
Definition: bufmgr.c:2684
struct PrivateRefCountEntry PrivateRefCountEntry
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:350
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:361
PGPROC * MyProc
Definition: proc.c:66
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76

References Assert(), AtProcExit_Buffers(), HASHCTL::entrysize, HASH_BLOBS, hash_create(), HASH_ELEM, HASHCTL::keysize, MyProc, on_shmem_exit(), PrivateRefCountArray, and PrivateRefCountHash.

Referenced by BaseInit().

◆ InvalidateBuffer()

static void InvalidateBuffer ( BufferDesc buf)
static

Definition at line 1533 of file bufmgr.c.

1534 {
1535  BufferTag oldTag;
1536  uint32 oldHash; /* hash value for oldTag */
1537  LWLock *oldPartitionLock; /* buffer partition lock for it */
1538  uint32 oldFlags;
1539  uint32 buf_state;
1540 
1541  /* Save the original buffer tag before dropping the spinlock */
1542  oldTag = buf->tag;
1543 
1544  buf_state = pg_atomic_read_u32(&buf->state);
1545  Assert(buf_state & BM_LOCKED);
1546  UnlockBufHdr(buf, buf_state);
1547 
1548  /*
1549  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1550  * worth storing the hashcode in BufferDesc so we need not recompute it
1551  * here? Probably not.
1552  */
1553  oldHash = BufTableHashCode(&oldTag);
1554  oldPartitionLock = BufMappingPartitionLock(oldHash);
1555 
1556 retry:
1557 
1558  /*
1559  * Acquire exclusive mapping lock in preparation for changing the buffer's
1560  * association.
1561  */
1562  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1563 
1564  /* Re-lock the buffer header */
1565  buf_state = LockBufHdr(buf);
1566 
1567  /* If it's changed while we were waiting for lock, do nothing */
1568  if (!BufferTagsEqual(&buf->tag, &oldTag))
1569  {
1570  UnlockBufHdr(buf, buf_state);
1571  LWLockRelease(oldPartitionLock);
1572  return;
1573  }
1574 
1575  /*
1576  * We assume the only reason for it to be pinned is that someone else is
1577  * flushing the page out. Wait for them to finish. (This could be an
1578  * infinite loop if the refcount is messed up... it would be nice to time
1579  * out after awhile, but there seems no way to be sure how many loops may
1580  * be needed. Note that if the other guy has pinned the buffer but not
1581  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1582  * be busy-looping here.)
1583  */
1584  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1585  {
1586  UnlockBufHdr(buf, buf_state);
1587  LWLockRelease(oldPartitionLock);
1588  /* safety check: should definitely not be our *own* pin */
1589  if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
1590  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1591  WaitIO(buf);
1592  goto retry;
1593  }
1594 
1595  /*
1596  * Clear out the buffer's tag and flags. We must do this to ensure that
1597  * linear scans of the buffer array don't think the buffer is valid.
1598  */
1599  oldFlags = buf_state & BUF_FLAG_MASK;
1600  ClearBufferTag(&buf->tag);
1601  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1602  UnlockBufHdr(buf, buf_state);
1603 
1604  /*
1605  * Remove the buffer from the lookup hashtable, if it was in there.
1606  */
1607  if (oldFlags & BM_TAG_VALID)
1608  BufTableDelete(&oldTag, oldHash);
1609 
1610  /*
1611  * Done with mapping lock.
1612  */
1613  LWLockRelease(oldPartitionLock);
1614 
1615  /*
1616  * Insert the buffer at the head of the list of free buffers.
1617  */
1618  StrategyFreeBuffer(buf);
1619 }
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
#define BM_LOCKED
Definition: buf_internals.h:59
static void ClearBufferTag(BufferTag *tag)
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
static void WaitIO(BufferDesc *buf)
Definition: bufmgr.c:4583
#define ERROR
Definition: elog.h:39
void StrategyFreeBuffer(BufferDesc *buf)
Definition: freelist.c:363

References Assert(), BM_LOCKED, BM_TAG_VALID, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer(), BufferTagsEqual(), BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), ClearBufferTag(), elog(), ERROR, GetPrivateRefCount(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u32(), StrategyFreeBuffer(), UnlockBufHdr(), and WaitIO().

Referenced by DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), and FindAndDropRelationBuffers().

◆ IsBufferCleanupOK()

bool IsBufferCleanupOK ( Buffer  buffer)

Definition at line 4529 of file bufmgr.c.

4530 {
4531  BufferDesc *bufHdr;
4532  uint32 buf_state;
4533 
4534  Assert(BufferIsValid(buffer));
4535 
4536  if (BufferIsLocal(buffer))
4537  {
4538  /* There should be exactly one pin */
4539  if (LocalRefCount[-buffer - 1] != 1)
4540  return false;
4541  /* Nobody else to wait for */
4542  return true;
4543  }
4544 
4545  /* There should be exactly one local pin */
4546  if (GetPrivateRefCount(buffer) != 1)
4547  return false;
4548 
4549  bufHdr = GetBufferDescriptor(buffer - 1);
4550 
4551  /* caller must hold exclusive lock on buffer */
4552  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
4553  LW_EXCLUSIVE));
4554 
4555  buf_state = LockBufHdr(bufHdr);
4556 
4557  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4558  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
4559  {
4560  /* pincount is OK. */
4561  UnlockBufHdr(bufHdr, buf_state);
4562  return true;
4563  }
4564 
4565  UnlockBufHdr(bufHdr, buf_state);
4566  return false;
4567 }
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1963

References Assert(), BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsValid(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBufHdr(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), and UnlockBufHdr().

Referenced by _hash_doinsert(), _hash_expandtable(), _hash_splitbucket(), and hashbucketcleanup().

◆ IssuePendingWritebacks()

void IssuePendingWritebacks ( WritebackContext context)

Definition at line 5015 of file bufmgr.c.

5016 {
5017  int i;
5018 
5019  if (context->nr_pending == 0)
5020  return;
5021 
5022  /*
5023  * Executing the writes in-order can make them a lot faster, and allows us to
5024  * merge writeback requests to consecutive blocks into larger writebacks.
5025  */
5026  sort_pending_writebacks(context->pending_writebacks, context->nr_pending);
5027 
5028  /*
5029  * Coalesce neighbouring writes, but nothing else. For that we iterate
5030  * through the, now sorted, array of pending flushes, and look forward to
5031  * find all neighbouring (or identical) writes.
5032  */
5033  for (i = 0; i < context->nr_pending; i++)
5034  {
5037  SMgrRelation reln;
5038  int ahead;
5039  BufferTag tag;
5040  RelFileLocator currlocator;
5041  Size nblocks = 1;
5042 
5043  cur = &context->pending_writebacks[i];
5044  tag = cur->tag;
5045  currlocator = BufTagGetRelFileLocator(&tag);
5046 
5047  /*
5048  * Peek ahead, into following writeback requests, to see if they can
5049  * be combined with the current one.
5050  */
5051  for (ahead = 0; i + ahead + 1 < context->nr_pending; ahead++)
5052  {
5053 
5054  next = &context->pending_writebacks[i + ahead + 1];
5055 
5056  /* different file, stop */
5057  if (!RelFileLocatorEquals(currlocator,
5058  BufTagGetRelFileLocator(&next->tag)) ||
5059  BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
5060  break;
5061 
5062  /* ok, block queued twice, skip */
5063  if (cur->tag.blockNum == next->tag.blockNum)
5064  continue;
5065 
5066  /* only merge consecutive writes */
5067  if (cur->tag.blockNum + 1 != next->tag.blockNum)
5068  break;
5069 
5070  nblocks++;
5071  cur = next;
5072  }
5073 
5074  i += ahead;
5075 
5076  /* and finally tell the kernel to write the data to storage */
5077  reln = smgropen(currlocator, InvalidBackendId);
5078  smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
5079  }
5080 
5081  context->nr_pending = 0;
5082 }
#define RelFileLocatorEquals(locator1, locator2)
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition: smgr.c:567
PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]

References buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), cur, i, InvalidBackendId, next, WritebackContext::nr_pending, WritebackContext::pending_writebacks, RelFileLocatorEquals, smgropen(), and smgrwriteback().

Referenced by BufferSync(), and ScheduleBufferTagForWriteback().
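
A sketch of the flush-side caller pattern, modeled on BufferSync(); the ellipsis stands for the caller's own flush loop and is an assumption, not code from this page.

  WritebackContext wb_context;

  WritebackContextInit(&wb_context, &checkpoint_flush_after);

  /* ... flush candidate buffers; SyncOneBuffer() queues writeback
   * requests into wb_context as it goes ... */

  IssuePendingWritebacks(&wb_context);   /* sort, coalesce, smgrwriteback() */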

◆ local_buffer_write_error_callback()

static void local_buffer_write_error_callback ( void *  arg)
static

Definition at line 4775 of file bufmgr.c.

4776 {
4777  BufferDesc *bufHdr = (BufferDesc *) arg;
4778 
4779  if (bufHdr != NULL)
4780  {
4781  char *path = relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
4782  MyBackendId,
4783  BufTagGetForkNum(&bufHdr->tag));
4784 
4785  errcontext("writing block %u of relation %s",
4786  bufHdr->tag.blockNum, path);
4787  pfree(path);
4788  }
4789 }
#define errcontext
Definition: elog.h:196
void * arg
#define relpathbackend(rlocator, backend, forknum)
Definition: relpath.h:85

References arg, buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), errcontext, MyBackendId, pfree(), relpathbackend, and BufferDesc::tag.

Referenced by FlushRelationBuffers().
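
An abridged sketch of how FlushRelationBuffers() installs this callback around the local-buffer write, so any error raised during the write reports the affected block; the elided write step is an assumption here.

  ErrorContextCallback errcallback;

  /* push the error context callback */
  errcallback.callback = local_buffer_write_error_callback;
  errcallback.arg = (void *) bufHdr;
  errcallback.previous = error_context_stack;
  error_context_stack = &errcallback;

  /* ... write out the local buffer ... */

  /* pop the error context callback */
  error_context_stack = errcallback.previous;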

◆ LockBuffer()

void LockBuffer ( Buffer  buffer,
int  mode 
)

Definition at line 4246 of file bufmgr.c.

4247 {
4248  BufferDesc *buf;
4249 
4250  Assert(BufferIsPinned(buffer));
4251  if (BufferIsLocal(buffer))
4252  return; /* local buffers need no lock */
4253 
4254  buf = GetBufferDescriptor(buffer - 1);
4255 
4256  if (mode == BUFFER_LOCK_UNLOCK)
4257  LWLockRelease(BufferDescriptorGetContentLock(buf));
4258  else if (mode == BUFFER_LOCK_SHARE)
4259  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
4260  else if (mode == BUFFER_LOCK_EXCLUSIVE)
4261  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
4262  else
4263  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
4264 }
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:111
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:112
static PgChecksumMode mode
Definition: pg_checksums.c:65

References Assert(), buf, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, elog(), ERROR, GetBufferDescriptor(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), and mode.

Referenced by _bt_lockbuf(), _bt_unlockbuf(), _bt_upgradelockbufcleanup(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_finish_split(), _hash_first(), _hash_freeovflpage(), _hash_getbuf(), _hash_getbuf_with_strategy(), _hash_getcachedmetap(), _hash_getnewbuf(), _hash_init(), _hash_kill_items(), _hash_readnext(), _hash_readpage(), _hash_readprev(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), blbulkdelete(), blgetbitmap(), blinsert(), BloomNewBuffer(), blvacuumcleanup(), brin_doinsert(), brin_doupdate(), brin_evacuate_page(), brin_getinsertbuffer(), brin_page_cleanup(), brinbuild(), brinbuildempty(), bringetbitmap(), brinGetStats(), brinGetTupleForHeapBlock(), brininsert(), brinLockRevmapPageForUpdate(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), brinsummarize(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), bt_recheck_sibling_links(), collect_corrupt_items(), collect_visibility_data(), collectMatchBitmap(), ConditionalLockBufferForCleanup(), count_nondeletable_pages(), entryLoadMoreItems(), fill_seq_fork_with_data(), FreeSpaceMapPrepareTruncateRel(), fsm_readbuf(), fsm_search(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), get_raw_page_internal(), GetVisibilityMapPins(), ginbuildempty(), ginbulkdelete(), ginEntryInsert(), ginFindLeafPage(), ginFindParents(), ginFinishSplit(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), ginInsertValue(), GinNewBuffer(), ginScanToDelete(), ginStepRight(), ginTraverseLock(), ginUpdateStats(), ginvacuumcleanup(), ginVacuumPostingTreeLeaves(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistbuildempty(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfinishsplit(), gistfixsplit(), gistformdownlink(), gistGetMaxLevel(), gistinserttuples(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_inplace_update(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_page_prune_opt(), heap_update(), heap_xlog_visible(), heapam_index_build_range_scan(), heapam_index_fetch_tuple(), heapam_index_validate_scan(), heapam_relation_copy_for_cluster(), heapam_scan_analyze_next_block(), heapam_scan_bitmap_next_block(), heapam_scan_sample_next_tuple(), heapam_tuple_satisfies_snapshot(), heapgetpage(), heapgettup(), initBloomState(), lazy_scan_heap(), lazy_scan_new_or_empty(), lazy_vacuum_heap_rel(), LockBufferForCleanup(), log_newpage_range(), palloc_btree_page(), pg_visibility(), pgrowlocks(), pgstat_btree_page(), pgstat_gist_page(), pgstat_heap(), pgstatginindex_internal(), pgstathashindex(), pgstatindex_impl(), read_seq_tuple(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), ScanSourceDatabasePgClass(), shiftList(), spgdoinsert(), spgGetCache(), SpGistNewBuffer(), spgprocesspending(), spgvacuumpage(), spgWalk(), startScanEntry(), statapprox_heap(), summarize_range(), UnlockReleaseBuffer(), verify_heapam(), verifyBackupPageConsistency(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), vm_readbuf(), XLogReadBufferExtended(), XLogReadBufferForRedoExtended(), and XLogRecordPageWithFreeSpace().
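
A minimal caller-side sketch (rel and blkno are hypothetical): the content lock can be taken, dropped, and retaken in a different mode while the pin obtained by ReadBuffer() is held throughout.

  Buffer      buf = ReadBuffer(rel, blkno);       /* pin */
  Page        page;

  LockBuffer(buf, BUFFER_LOCK_SHARE);             /* shared content lock */
  page = BufferGetPage(buf);
  /* ... read-only inspection of the page ... */
  LockBuffer(buf, BUFFER_LOCK_UNLOCK);            /* drop the lock, keep the pin */

  LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);         /* re-lock for modification */
  /* ... */
  UnlockReleaseBuffer(buf);                       /* unlock and unpin */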

◆ LockBufferForCleanup()

void LockBufferForCleanup ( Buffer  buffer)

Definition at line 4303 of file bufmgr.c.

4304 {
4305  BufferDesc *bufHdr;
4306  TimestampTz waitStart = 0;
4307  bool waiting = false;
4308  bool logged_recovery_conflict = false;
4309 
4310  Assert(BufferIsPinned(buffer));
4311  Assert(PinCountWaitBuf == NULL);
4312 
4313  if (BufferIsLocal(buffer))
4314  {
4315  /* There should be exactly one pin */
4316  if (LocalRefCount[-buffer - 1] != 1)
4317  elog(ERROR, "incorrect local pin count: %d",
4318  LocalRefCount[-buffer - 1]);
4319  /* Nobody else to wait for */
4320  return;
4321  }
4322 
4323  /* There should be exactly one local pin */
4324  if (GetPrivateRefCount(buffer) != 1)
4325  elog(ERROR, "incorrect local pin count: %d",
4326  GetPrivateRefCount(buffer));
4327 
4328  bufHdr = GetBufferDescriptor(buffer - 1);
4329 
4330  for (;;)
4331  {
4332  uint32 buf_state;
4333 
4334  /* Try to acquire lock */
4335  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4336  buf_state = LockBufHdr(bufHdr);
4337 
4338  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4339  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
4340  {
4341  /* Successfully acquired exclusive lock with pincount 1 */
4342  UnlockBufHdr(bufHdr, buf_state);
4343 
4344  /*
4345  * Emit the log message if recovery conflict on buffer pin was
4346  * resolved but the startup process waited longer than
4347  * deadlock_timeout for it.
4348  */
4349  if (logged_recovery_conflict)
4350  LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
4351  waitStart, GetCurrentTimestamp(),
4352  NULL, false);
4353 
4354  if (waiting)
4355  {
4356  /* reset ps display to remove the suffix if we added one */
4357  set_ps_display_remove_suffix();
4358  waiting = false;
4359  }
4360  return;
4361  }
4362  /* Failed, so mark myself as waiting for pincount 1 */
4363  if (buf_state & BM_PIN_COUNT_WAITER)
4364  {
4365  UnlockBufHdr(bufHdr, buf_state);
4366  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4367  elog(ERROR, "multiple backends attempting to wait for pincount 1");
4368  }
4369  bufHdr->wait_backend_pgprocno = MyProc->pgprocno;
4370  PinCountWaitBuf = bufHdr;
4371  buf_state |= BM_PIN_COUNT_WAITER;
4372  UnlockBufHdr(bufHdr, buf_state);
4373  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4374 
4375  /* Wait to be signaled by UnpinBuffer() */
4376  if (InHotStandby)
4377  {
4378  if (!waiting)
4379  {
4380  /* adjust the process title to indicate that it's waiting */
4381  set_ps_display_suffix("waiting");
4382  waiting = true;
4383  }
4384 
4385  /*
4386  * Emit the log message if the startup process is waiting longer
4387  * than deadlock_timeout for recovery conflict on buffer pin.
4388  *
4389  * Skip this if first time through because the startup process has
4390  * not started waiting yet in this case. So, the wait start
4391  * timestamp is set after this logic.
4392  */
4393  if (waitStart != 0 && !logged_recovery_conflict)
4394  {
4395  TimestampTz now = GetCurrentTimestamp();
4396 
4397  if (TimestampDifferenceExceeds(waitStart, now,
4398  DeadlockTimeout))
4399  {
4400  LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
4401  waitStart, now, NULL, true);
4402  logged_recovery_conflict = true;
4403  }
4404  }
4405 
4406  /*
4407  * Set the wait start timestamp if logging is enabled and first
4408  * time through.
4409  */
4410  if (log_recovery_conflict_waits && waitStart == 0)
4411  waitStart = GetCurrentTimestamp();
4412 
4413  /* Publish the bufid that Startup process waits on */
4414  SetStartupBufferPinWaitBufId(buffer - 1);
4415  /* Set alarm and then wait to be signaled by UnpinBuffer() */
4416  ResolveRecoveryConflictWithBufferPin();
4417  /* Reset the published bufid */
4418  SetStartupBufferPinWaitBufId(-1);
4419  }
4420  else
4421  ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
4422 
4423  /*
4424  * Remove flag marking us as waiter. Normally this will not be set
4425  * anymore, but ProcWaitForSignal() can return for other signals as
4426  * well. We take care to only reset the flag if we're the waiter, as
4427  * theoretically another backend could have started waiting. That's
4428  * impossible with the current usages due to table level locking, but
4429  * better be safe.
4430  */
4431  buf_state = LockBufHdr(bufHdr);
4432  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
4433  bufHdr->wait_backend_pgprocno == MyProc->pgprocno)
4434  buf_state &= ~BM_PIN_COUNT_WAITER;
4435  UnlockBufHdr(bufHdr, buf_state);
4436 
4437  PinCountWaitBuf = NULL;
4438  /* Loop back and try again */
4439  }
4440 }
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1727
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1582
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1546
#define BM_PIN_COUNT_WAITER
Definition: buf_internals.h:66
static BufferDesc * PinCountWaitBuf
Definition: bufmgr.c:167
int64 TimestampTz
Definition: timestamp.h:39
static volatile sig_atomic_t waiting
Definition: latch.c:162
@ PROCSIG_RECOVERY_CONFLICT_BUFFERPIN
Definition: procsignal.h:45
void set_ps_display_remove_suffix(void)
Definition: ps_status.c:396
void set_ps_display_suffix(const char *suffix)
Definition: ps_status.c:344
int DeadlockTimeout
Definition: proc.c:58
void SetStartupBufferPinWaitBufId(int bufid)
Definition: proc.c:627
void ProcWaitForSignal(uint32 wait_event_info)
Definition: proc.c:1797
void ResolveRecoveryConflictWithBufferPin(void)
Definition: standby.c:779
bool log_recovery_conflict_waits
Definition: standby.c:43
void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition: standby.c:274
int wait_backend_pgprocno
int pgprocno
Definition: proc.h:191
#define PG_WAIT_BUFFER_PIN
Definition: wait_event.h:20
#define InHotStandby
Definition: xlogutils.h:57

References Assert(), BM_PIN_COUNT_WAITER, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsPinned, DeadlockTimeout, elog(), ERROR, GetBufferDescriptor(), GetCurrentTimestamp(), GetPrivateRefCount(), InHotStandby, LocalRefCount, LockBuffer(), LockBufHdr(), log_recovery_conflict_waits, LogRecoveryConflict(), MyProc, now(), PG_WAIT_BUFFER_PIN, PGPROC::pgprocno, PinCountWaitBuf, PROCSIG_RECOVERY_CONFLICT_BUFFERPIN, ProcWaitForSignal(), ResolveRecoveryConflictWithBufferPin(), set_ps_display_remove_suffix(), set_ps_display_suffix(), SetStartupBufferPinWaitBufId(), TimestampDifferenceExceeds(), UnlockBufHdr(), BufferDesc::wait_backend_pgprocno, and waiting.

Referenced by _bt_upgradelockbufcleanup(), ginVacuumPostingTree(), hashbulkdelete(), heap_force_common(), lazy_scan_heap(), ReadBuffer_common(), and XLogReadBufferForRedoExtended().
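
A vacuum-style sketch (rel, blkno, and vac_strategy are assumed placeholders): the buffer is pinned first, then the call blocks until the caller holds the exclusive content lock and the only pin.

  Buffer      buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                       RBM_NORMAL, vac_strategy);

  LockBufferForCleanup(buf);      /* waits until we hold the only pin */
  /* ... prune / defragment the page ... */
  UnlockReleaseBuffer(buf);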

◆ LockBufHdr()

uint32 LockBufHdr ( BufferDesc *  desc)

Definition at line 4822 of file bufmgr.c.

4823 {
4824  SpinDelayStatus delayStatus;
4825  uint32 old_buf_state;
4826 
4827  init_local_spin_delay(&delayStatus);
4828 
4829  while (true)
4830  {
4831  /* set BM_LOCKED flag */
4832  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
4833  /* if it wasn't set before we're OK */
4834  if (!(old_buf_state & BM_LOCKED))
4835  break;
4836  perform_spin_delay(&delayStatus);
4837  }
4838  finish_spin_delay(&delayStatus);
4839  return old_buf_state | BM_LOCKED;
4840 }
static uint32 pg_atomic_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_)
Definition: atomics.h:367
void perform_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:126
void finish_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:186
#define init_local_spin_delay(status)
Definition: s_lock.h:863

References BM_LOCKED, finish_spin_delay(), init_local_spin_delay, perform_spin_delay(), pg_atomic_fetch_or_u32(), and BufferDesc::state.

Referenced by AbortBufferIO(), apw_dump_now(), BufferAlloc(), BufferGetLSNAtomic(), BufferSync(), ConditionalLockBufferForCleanup(), DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), FindAndDropRelationBuffers(), FlushBuffer(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetBufferFromRing(), InvalidateBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkBufferDirtyHint(), pg_buffercache_pages(), ReadBuffer_common(), ReadRecentBuffer(), StartBufferIO(), StrategyGetBuffer(), SyncOneBuffer(), TerminateBufferIO(), UnlockBuffers(), UnpinBuffer(), and WaitIO().
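
A sketch of the usual pattern around the header spinlock (bufHdr and the inspected condition are assumptions): take the lock, examine or adjust the state word, then write it back with UnlockBufHdr().

  uint32      buf_state = LockBufHdr(bufHdr);    /* returned value has BM_LOCKED set */

  if ((buf_state & BM_DIRTY) && BUF_STATE_GET_REFCOUNT(buf_state) == 0)
  {
      /* ... e.g. treat the buffer as a flush candidate ... */
  }
  UnlockBufHdr(bufHdr, buf_state);               /* store state with BM_LOCKED cleared */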

◆ MarkBufferDirty()

void MarkBufferDirty ( Buffer  buffer)

Definition at line 1631 of file bufmgr.c.

1632 {
1633  BufferDesc *bufHdr;
1634  uint32 buf_state;
1635  uint32 old_buf_state;
1636 
1637  if (!BufferIsValid(buffer))
1638  elog(ERROR, "bad buffer ID: %d", buffer);
1639 
1640  if (BufferIsLocal(buffer))
1641  {
1642  MarkLocalBufferDirty(buffer);
1643  return;
1644  }
1645 
1646  bufHdr = GetBufferDescriptor(buffer - 1);
1647 
1648  Assert(BufferIsPinned(buffer));
1649  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
1650  LW_EXCLUSIVE));
1651 
1652  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
1653  for (;;)
1654  {
1655  if (old_buf_state & BM_LOCKED)
1656  old_buf_state = WaitBufHdrUnlocked(bufHdr);
1657 
1658  buf_state = old_buf_state;
1659 
1660  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1661  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
1662 
1663  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
1664  buf_state))
1665  break;
1666  }
1667 
1668  /*
1669  * If the buffer was not dirty already, do vacuum accounting.
1670  */
1671  if (!(old_buf_state & BM_DIRTY))
1672  {
1673  VacuumPageDirty++;
1674  pgBufferUsage.shared_blks_dirtied++;
1675  if (VacuumCostActive)
1676  VacuumCostBalance += VacuumCostPageDirty;
1677  }
1678 }
static bool pg_atomic_compare_exchange_u32(volatile pg_atomic_uint32 *ptr, uint32 *expected, uint32 newval)
Definition: atomics.h:306
static uint32 WaitBufHdrUnlocked(BufferDesc *buf)
Definition: bufmgr.c:4850
bool VacuumCostActive
Definition: globals.c:153
int64 VacuumPageDirty
Definition: globals.c:150
int VacuumCostBalance
Definition: globals.c:152
int VacuumCostPageDirty
Definition: globals.c:144
void MarkLocalBufferDirty(Buffer buffer)
Definition: localbuf.c:296
int64 shared_blks_dirtied
Definition: instrument.h:28

References Assert(), BM_DIRTY, BM_JUST_DIRTIED, BM_LOCKED, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, BufferIsValid(), elog(), ERROR, GetBufferDescriptor(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), MarkLocalBufferDirty(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), pgBufferUsage, BufferUsage::shared_blks_dirtied, BufferDesc::state, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, VacuumPageDirty, and WaitBufHdrUnlocked().

Referenced by _bt_clear_incomplete_split(), _bt_dedup_pass(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_getroot(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_newroot(), _bt_restore_meta(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_freeovflpage(), _hash_init(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), addLeafTuple(), brin_doinsert(), brin_doupdate(), brin_initialize_empty_new_buffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinRevmapDesummarizeRange(), btree_xlog_dedup(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), createPostingTree(), do_setval(), doPickSplit(), fill_seq_fork_with_data(), FreeSpaceMapPrepareTruncateRel(), generic_redo(), GenericXLogFinish(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginDeletePage(), ginHeapTupleFastInsert(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginUpdateStats(), ginVacuumPostingTreeLeaf(), gistbuild(), gistbuildempty(), gistdeletepage(), gistplacetopage(), gistprunepage(), gistRedoClearFollowRight(), gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_squeeze_page(), hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_finish_speculative(), heap_force_common(), heap_freeze_execute_prepared(), heap_inplace_update(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_page_prune(), heap_update(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_freeze_page(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_prune(), heap_xlog_update(), heap_xlog_vacuum(), heap_xlog_visible(), lazy_scan_heap(), lazy_scan_new_or_empty(), lazy_vacuum_heap_page(), log_newpage_range(), moveLeafs(), nextval_internal(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), revmap_physical_extend(), saveNodeLink(), seq_redo(), shiftList(), spgAddNodeAction(), spgbuild(), SpGistUpdateMetaPage(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), vacuumLeafPage(), vacuumLeafRoot(), vacuumRedirectAndPlaceholder(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), writeListPage(), and XLogReadBufferForRedoExtended().
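
A caller-side sketch of the usual modify-then-dirty sequence (not from bufmgr.c; buffer is assumed pinned, and the WAL step is left abstract because it depends on the access method): the change and MarkBufferDirty() happen inside a critical section, before the WAL record's LSN is stamped on the page.

  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

  START_CRIT_SECTION();

  /* ... apply the change to BufferGetPage(buffer) ... */
  MarkBufferDirty(buffer);

  /* WAL-log the change (access-method specific), then stamp the page:
   *   PageSetLSN(BufferGetPage(buffer), recptr);
   */

  END_CRIT_SECTION();

  UnlockReleaseBuffer(buffer);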

◆ MarkBufferDirtyHint()

void MarkBufferDirtyHint ( Buffer  buffer,
bool  buffer_std 
)

Definition at line 4075 of file bufmgr.c.

4076 {
4077  BufferDesc *bufHdr;
4078  Page page = BufferGetPage(buffer);
4079 
4080  if (!BufferIsValid(buffer))
4081  elog(ERROR, "bad buffer ID: %d", buffer);
4082 
4083  if (BufferIsLocal(buffer))
4084  {
4085  MarkLocalBufferDirty(buffer);
4086  return;
4087  }
4088 
4089  bufHdr = GetBufferDescriptor(buffer - 1);
4090 
4091  Assert(GetPrivateRefCount(buffer) > 0);
4092  /* here, either share or exclusive lock is OK */
4093  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
4094 
4095  /*
4096  * This routine might get called many times on the same page, if we are
4097  * making the first scan after commit of an xact that added/deleted many
4098  * tuples. So, be as quick as we can if the buffer is already dirty. We
4099  * do this by not acquiring spinlock if it looks like the status bits are
4100  * already set. Since we make this test unlocked, there's a chance we
4101  * might fail to notice that the flags have just been cleared, and failed
4102  * to reset them, due to memory-ordering issues. But since this function
4103  * is only intended to be used in cases where failing to write out the
4104  * data would be harmless anyway, it doesn't really matter.
4105  */
4106  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
4107  (BM_DIRTY | BM_JUST_DIRTIED))
4108  {
4109  XLogRecPtr lsn = InvalidXLogRecPtr;
4110  bool dirtied = false;
4111  bool delayChkptFlags = false;
4112  uint32 buf_state;
4113 
4114  /*
4115  * If we need to protect hint bit updates from torn writes, WAL-log a
4116  * full page image of the page. This full page image is only necessary
4117  * if the hint bit update is the first change to the page since the
4118  * last checkpoint.
4119  *
4120  * We don't check full_page_writes here because that logic is included
4121  * when we call XLogInsert() since the value changes dynamically.
4122  */
4123  if (XLogHintBitIsNeeded() &&
4124  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
4125  {
4126  /*
4127  * If we must not write WAL, due to a relfilelocator-specific
4128  * condition or being in recovery, don't dirty the page. We can
4129  * set the hint, just not dirty the page as a result so the hint
4130  * is lost when we evict the page or shutdown.
4131  *
4132  * See src/backend/storage/page/README for longer discussion.
4133  */
4134  if (RecoveryInProgress() ||
4135  RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
4136  return;
4137 
4138  /*
4139  * If the block is already dirty because we either made a change
4140  * or set a hint already, then we don't need to write a full page
4141  * image. Note that aggressive cleaning of blocks dirtied by hint
4142  * bit setting would increase the call rate. Bulk setting of hint
4143  * bits would reduce the call rate...
4144  *
4145  * We must issue the WAL record before we mark the buffer dirty.
4146  * Otherwise we might write the page before we write the WAL. That
4147  * causes a race condition, since a checkpoint might occur between
4148  * writing the WAL record and marking the buffer dirty. We solve
4149  * that with a kluge, but one that is already in use during
4150  * transaction commit to prevent race conditions. Basically, we
4151  * simply prevent the checkpoint WAL record from being written
4152  * until we have marked the buffer dirty. We don't start the
4153  * checkpoint flush until we have marked dirty, so our checkpoint
4154  * must flush the change to disk successfully or the checkpoint
4155  * never gets written, so crash recovery will fix.
4156  *
4157  * It's possible we may enter here without an xid, so it is
4158  * essential that CreateCheckPoint waits for virtual transactions
4159  * rather than full transactionids.
4160  */
4161  Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
4162  MyProc->delayChkptFlags |= DELAY_CHKPT_START;
4163  delayChkptFlags = true;
4164  lsn = XLogSaveBufferForHint(buffer, buffer_std);
4165  }
4166 
4167  buf_state = LockBufHdr(bufHdr);
4168 
4169  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4170 
4171  if (!(buf_state & BM_DIRTY))
4172  {
4173  dirtied = true; /* Means "will be dirtied by this action" */
4174 
4175  /*
4176  * Set the page LSN if we wrote a backup block. We aren't supposed
4177  * to set this when only holding a share lock but as long as we
4178  * serialise it somehow we're OK. We choose to set LSN while
4179  * holding the buffer header lock, which causes any reader of an
4180  * LSN who holds only a share lock to also obtain a buffer header
4181  * lock before using PageGetLSN(), which is enforced in
4182  * BufferGetLSNAtomic().
4183  *
4184  * If checksums are enabled, you might think we should reset the
4185  * checksum here. That will happen when the page is written
4186  * sometime later in this checkpoint cycle.
4187  */
4188  if (!XLogRecPtrIsInvalid(lsn))
4189  PageSetLSN(page, lsn);
4190  }
4191 
4192  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
4193  UnlockBufHdr(bufHdr, buf_state);
4194 
4195  if (delayChkptFlags)
4196  MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
4197 
4198  if (dirtied)
4199  {
4200  VacuumPageDirty++;
4201  pgBufferUsage.shared_blks_dirtied++;
4202  if (VacuumCostActive)
4203  VacuumCostBalance += VacuumCostPageDirty;
4204  }
4205  }
4206 }
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:388
#define DELAY_CHKPT_START
Definition: proc.h:119
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition: storage.c:550
int delayChkptFlags
Definition: proc.h:231
bool RecoveryInProgress(void)
Definition: xlog.c:5908
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
Definition: xloginsert.c:1019

References Assert(), BM_DIRTY, BM_JUST_DIRTIED, BM_PERMANENT, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferGetPage(), BufferIsLocal, BufferIsValid(), BufTagGetRelFileLocator(), DELAY_CHKPT_START, PGPROC::delayChkptFlags, elog(), ERROR, GetBufferDescriptor(), GetPrivateRefCount(), InvalidXLogRecPtr, LockBufHdr(), LWLockHeldByMe(), MarkLocalBufferDirty(), MyProc, PageSetLSN(), pg_atomic_read_u32(), pgBufferUsage, RecoveryInProgress(), RelFileLocatorSkippingWAL(), BufferUsage::shared_blks_dirtied, BufferDesc::state, BufferDesc::tag, UnlockBufHdr(), VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, VacuumPageDirty, XLogHintBitIsNeeded, XLogRecPtrIsInvalid, and XLogSaveBufferForHint().

Referenced by _bt_check_unique(), _bt_killitems(), _hash_kill_items(), brin_start_evacuating_page(), btvacuumpage(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), gistkillitems(), heap_page_prune(), read_seq_tuple(), SetHintBits(), and XLogRecordPageWithFreeSpace().
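
A sketch modeled on SetHintBits() (tuple, the infomask bit, and buffer are assumptions): the caller sets a hint whose loss is harmless and then calls MarkBufferDirtyHint() instead of MarkBufferDirty().

  /* record a tuple hint bit; losing the update on crash is harmless */
  tuple->t_infomask |= HEAP_XMIN_COMMITTED;    /* hypothetical hint bit */
  MarkBufferDirtyHint(buffer, true);           /* true: page has a standard layout */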

◆ NewPrivateRefCountEntry()

static PrivateRefCountEntry * NewPrivateRefCountEntry ( Buffer  buffer)
static

Definition at line 283 of file bufmgr.c.

284 {
285  PrivateRefCountEntry *res;
286 
287  /* only allowed to be called when a reservation has been made */
288  Assert(ReservedRefCountEntry != NULL);
289 
290  /* use up the reserved entry */
291  res = ReservedRefCountEntry;
292  ReservedRefCountEntry = NULL;
293 
294  /* and fill it */
295  res->buffer = buffer;
296  res->refcount = 0;
297 
298  return res;
299 }

References Assert(), PrivateRefCountEntry::buffer, res, and ReservedRefCountEntry.

Referenced by PinBuffer(), and PinBuffer_Locked().

◆ PinBuffer()

static bool PinBuffer ( BufferDesc *  buf,
BufferAccessStrategy  strategy 
)
static

Definition at line 1752 of file bufmgr.c.

1753 {
1754  Buffer b = BufferDescriptorGetBuffer(buf);
1755  bool result;
1756  PrivateRefCountEntry *ref;
1757 
1758  ref = GetPrivateRefCountEntry(b, true);
1759 
1760  if (ref == NULL)
1761  {
1762  uint32 buf_state;
1763  uint32 old_buf_state;
1764 
1766  ref = NewPrivateRefCountEntry(b);
1767 
1768  old_buf_state = pg_atomic_read_u32(&buf->state);
1769  for (;;)
1770  {
1771  if (old_buf_state & BM_LOCKED)
1772  old_buf_state = WaitBufHdrUnlocked(buf);
1773 
1774  buf_state = old_buf_state;
1775 
1776  /* increase refcount */
1777  buf_state += BUF_REFCOUNT_ONE;
1778 
1779  if (strategy == NULL)
1780  {
1781  /* Default case: increase usagecount unless already max. */
1782  if (BUF_STATE_GET_USAGECOUNT(buf_state) != BM_MAX_USAGE_COUNT)
1783  buf_state += BUF_USAGECOUNT_ONE;
1784  }
1785  else
1786  {
1787  /*
1788  * Ring buffers shouldn't evict others from pool. Thus we
1789  * don't make usagecount more than 1.
1790  */
1791  if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
1792  buf_state += BUF_USAGECOUNT_ONE;
1793  }
1794 
1795  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1796  buf_state))
1797  {
1798  result = (buf_state & BM_VALID) != 0;
1799 
1800  /*
1801  * Assume that we acquired a buffer pin for the purposes of
1802  * Valgrind buffer client checks (even in !result case) to
1803  * keep things simple. Buffers that are unsafe to access are
1804  * not generally guaranteed to be marked undefined or
1805  * non-accessible in any case.
1806  */
1807  VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
1808  break;
1809  }
1810  }
1811  }
1812  else
1813  {
1814  /*
1815  * If we previously pinned the buffer, it must surely be valid.
1816  *
1817  * Note: We deliberately avoid a Valgrind client request here.
1818  * Individual access methods can optionally superimpose buffer page
1819  * client requests on top of our client requests to enforce that
1820  * buffers are only accessed while locked (and pinned). It's possible
1821  * that the buffer page is legitimately non-accessible here. We
1822  * cannot meddle with that.
1823  */
1824  result = true;
1825  }
1826 
1827  ref->refcount++;
1828  Assert(ref->refcount > 0);
1829  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1830  return result;
1831 }
#define BM_MAX_USAGE_COUNT
Definition: buf_internals.h:77
#define BUF_REFCOUNT_ONE
Definition: buf_internals.h:42
#define BUF_STATE_GET_USAGECOUNT(state)
Definition: buf_internals.h:51
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition: bufmgr.c:283
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26

References Assert(), b, BM_LOCKED, BM_MAX_USAGE_COUNT, BM_VALID, buf, BUF_REFCOUNT_ONE, BUF_STATE_GET_USAGECOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer(), BufHdrGetBlock, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ReservePrivateRefCountEntry(), ResourceOwnerRememberBuffer(), VALGRIND_MAKE_MEM_DEFINED, and WaitBufHdrUnlocked().

Referenced by BufferAlloc(), and ReadRecentBuffer().

◆ PinBuffer_Locked()

static void PinBuffer_Locked ( BufferDesc *  buf)
static

Definition at line 1855 of file bufmgr.c.

1856 {
1857  Buffer b;
1858  PrivateRefCountEntry *ref;
1859  uint32 buf_state;
1860 
1861  /*
1862  * As explained, We don't expect any preexisting pins. That allows us to
1863  * manipulate the PrivateRefCount after releasing the spinlock
1864  */
1865  Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
1866 
1867  /*
1868  * Buffer can't have a preexisting pin, so mark its page as defined to
1869  * Valgrind (this is similar to the PinBuffer() case where the backend
1870  * doesn't already have a buffer pin)
1871  */
1872  VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
1873 
1874  /*
1875  * Since we hold the buffer spinlock, we can update the buffer state and
1876  * release the lock in one operation.
1877  */
1878  buf_state = pg_atomic_read_u32(&buf->state);
1879  Assert(buf_state & BM_LOCKED);
1880  buf_state += BUF_REFCOUNT_ONE;
1881  UnlockBufHdr(buf, buf_state);
1882 
1883  b = BufferDescriptorGetBuffer(buf);
1884 
1885  ref = NewPrivateRefCountEntry(b);
1886  ref->refcount++;
1887 
1888  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1889 }

References Assert(), b, BM_LOCKED, buf, BUF_REFCOUNT_ONE, BufferDescriptorGetBuffer(), BufHdrGetBlock, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ResourceOwnerRememberBuffer(), UnlockBufHdr(), and VALGRIND_MAKE_MEM_DEFINED.

Referenced by BufferAlloc(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), ReadRecentBuffer(), and SyncOneBuffer().

◆ PrefetchBuffer()

PrefetchBufferResult PrefetchBuffer ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 584 of file bufmgr.c.

585 {
586  Assert(RelationIsValid(reln));
587  Assert(BlockNumberIsValid(blockNum));
588 
589  if (RelationUsesLocalBuffers(reln))
590  {
591  /* see comments in ReadBufferExtended */
592  if (RELATION_IS_OTHER_TEMP(reln))
593  ereport(ERROR,
594  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
595  errmsg("cannot access temporary tables of other sessions")));
596 
597  /* pass it off to localbuf.c */
598  return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
599  }
600  else
601  {
602  /* pass it to the shared buffer version */
603  return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
604  }
605 }
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:497
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition: localbuf.c:65
#define RELATION_IS_OTHER_TEMP(relation)
Definition: rel.h:658
#define RelationIsValid(relation)
Definition: rel.h:476

References Assert(), BlockNumberIsValid(), ereport, errcode(), errmsg(), ERROR, PrefetchLocalBuffer(), PrefetchSharedBuffer(), RELATION_IS_OTHER_TEMP, RelationGetSmgr(), RelationIsValid, and RelationUsesLocalBuffers.

Referenced by acquire_sample_rows(), BitmapPrefetch(), count_nondeletable_pages(), and pg_prewarm().
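
A sketch of the look-ahead pattern used by callers such as BitmapPrefetch() (rel, target_blkno, and buf are assumptions): hint the kernel about an upcoming block, then read it through the buffer manager as usual.

  #ifdef USE_PREFETCH
      /* hint the kernel about a block we expect to need shortly */
      PrefetchBuffer(rel, MAIN_FORKNUM, target_blkno + 1);
  #endif

      /* the actual access still goes through the buffer manager */
      buf = ReadBuffer(rel, target_blkno);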

◆ PrefetchSharedBuffer()

PrefetchBufferResult PrefetchSharedBuffer ( SMgrRelation  smgr_reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 497 of file bufmgr.c.

500 {
501  PrefetchBufferResult result = {InvalidBuffer, false};
502  BufferTag newTag; /* identity of requested block */
503  uint32 newHash; /* hash value for newTag */
504  LWLock *newPartitionLock; /* buffer partition lock for it */
505  int buf_id;
506 
507  Assert(BlockNumberIsValid(blockNum));
508 
509  /* create a tag so we can lookup the buffer */
510  InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
511  forkNum, blockNum);
512 
513  /* determine its hash code and partition lock ID */
514  newHash = BufTableHashCode(&newTag);
515  newPartitionLock = BufMappingPartitionLock(newHash);
516 
517  /* see if the block is in the buffer pool already */
518  LWLockAcquire(newPartitionLock, LW_SHARED);
519  buf_id = BufTableLookup(&newTag, newHash);
520  LWLockRelease(newPartitionLock);
521 
522  /* If not in buffers, initiate prefetch */
523  if (buf_id < 0)
524  {
525 #ifdef USE_PREFETCH
526  /*
527  * Try to initiate an asynchronous read. This returns false in
528  * recovery if the relation file doesn't exist.
529  */
530  if (smgrprefetch(smgr_reln, forkNum, blockNum))
531  result.initiated_io = true;
532 #endif /* USE_PREFETCH */
533  }
534  else
535  {
536  /*
537  * Report the buffer it was in at that time. The caller may be able
538  * to avoid a buffer table lookup, but it's not pinned and it must be
539  * rechecked!
540  */
541  result.recent_buffer = buf_id + 1;
542  }
543 
544  /*
545  * If the block *is* in buffers, we do nothing. This is not really ideal:
546  * the block might be just about to be evicted, which would be stupid
547  * since we know we are going to need it soon. But the only easy answer
548  * is to bump the usage_count, which does not seem like a great solution:
549  * when the caller does ultimately touch the block, usage_count would get
550  * bumped again, resulting in too much favoritism for blocks that are
551  * involved in a prefetch sequence. A real fix would involve some
552  * additional per-buffer state, and it's not clear that there's enough of
553  * a problem to justify that.
554  */
555 
556  return result;
557 }
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition: smgr.c:518
Buffer recent_buffer
Definition: bufmgr.h:59

References Assert(), BlockNumberIsValid(), BufMappingPartitionLock(), BufTableHashCode(), BufTableLookup(), InitBufferTag(), PrefetchBufferResult::initiated_io, InvalidBuffer, RelFileLocatorBackend::locator, LW_SHARED, LWLockAcquire(), LWLockRelease(), PrefetchBufferResult::recent_buffer, SMgrRelationData::smgr_rlocator, and smgrprefetch().

Referenced by PrefetchBuffer(), and XLogPrefetcherNextBlock().

◆ PrintBufferLeakWarning()

void PrintBufferLeakWarning ( Buffer  buffer)

Definition at line 2743 of file bufmgr.c.

2744 {
2745  BufferDesc *buf;
2746  int32 loccount;
2747  char *path;
2748  BackendId backend;
2749  uint32 buf_state;
2750 
2751  Assert(BufferIsValid(buffer));
2752  if (BufferIsLocal(buffer))
2753  {
2754  buf = GetLocalBufferDescriptor(-buffer - 1);
2755  loccount = LocalRefCount[-buffer - 1];
2756  backend = MyBackendId;
2757  }
2758  else
2759  {
2760  buf = GetBufferDescriptor(buffer - 1);
2761  loccount = GetPrivateRefCount(buffer);
2762  backend = InvalidBackendId;
2763  }
2764 
2765  /* theoretically we should lock the bufhdr here */
2766  path = relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
2767  BufTagGetForkNum(&buf->tag));
2768  buf_state = pg_atomic_read_u32(&buf->state);
2769  elog(WARNING,
2770  "buffer refcount leak: [%03d] "
2771  "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
2772  buffer, path,
2773  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
2774  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
2775  pfree(path);
2776 }
int BackendId
Definition: backendid.h:21

References Assert(), buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), BufTagGetForkNum(), BufTagGetRelFileLocator(), elog(), GetBufferDescriptor(), GetLocalBufferDescriptor(), GetPrivateRefCount(), InvalidBackendId, LocalRefCount, MyBackendId, pfree(), pg_atomic_read_u32(), relpathbackend, and WARNING.

Referenced by CheckForBufferLeaks(), CheckForLocalBufferLeaks(), and ResourceOwnerReleaseInternal().

◆ ReadBuffer()

Buffer ReadBuffer ( Relation  reln,
BlockNumber  blockNum 
)

Definition at line 704 of file bufmgr.c.

705 {
706  return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
707 }
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:751
@ RBM_NORMAL
Definition: bufmgr.h:44

References MAIN_FORKNUM, RBM_NORMAL, and ReadBufferExtended().

Referenced by _bt_getbuf(), _bt_search_insert(), _hash_getbuf(), _hash_getbuf_with_condlock_cleanup(), blbulkdelete(), blinsert(), BloomNewBuffer(), brin_getinsertbuffer(), brinbuild(), brinGetStats(), brinGetTupleForHeapBlock(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), ginFindLeafPage(), ginFindParents(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), GinNewBuffer(), ginStepRight(), ginUpdateStats(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfixsplit(), gistGetMaxLevel(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), heap_abort_speculative(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_force_common(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_inplace_update(), heap_lock_tuple(), heap_update(), initBloomState(), pg_visibility(), pgstatginindex_internal(), read_seq_tuple(), RelationGetBufferForTuple(), ReleaseAndReadBuffer(), revmap_get_buffer(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), shiftList(), spgdoinsert(), spgGetCache(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), and spgWalk().
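
A caller-side sketch (rel, blkno, buf, and newbuf are assumptions): reading an existing block, and extending the relation by passing P_NEW as the block number.

  /* read an existing block */
  buf = ReadBuffer(rel, blkno);

  /* or extend the relation by one page */
  newbuf = ReadBuffer(rel, P_NEW);
  LockBuffer(newbuf, BUFFER_LOCK_EXCLUSIVE);
  PageInit(BufferGetPage(newbuf), BufferGetPageSize(newbuf), 0);
  MarkBufferDirty(newbuf);
  UnlockReleaseBuffer(newbuf);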

◆ ReadBuffer_common()

static Buffer ReadBuffer_common ( SMgrRelation  smgr,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy,
bool *  hit 
)
static

Definition at line 811 of file bufmgr.c.

814 {
815  BufferDesc *bufHdr;
816  Block bufBlock;
817  bool found;
818  IOContext io_context;
819  IOObject io_object;
820  bool isExtend;
821  bool isLocalBuf = SmgrIsTemp(smgr);
822 
823  *hit = false;
824 
825  /* Make sure we will have room to remember the buffer pin */
826  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
827 
828  isExtend = (blockNum == P_NEW);
829 
830  TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
834  smgr->smgr_rlocator.backend,
835  isExtend);
836 
837  /* Substitute proper block number if caller asked for P_NEW */
838  if (isExtend)
839  {
840  blockNum = smgrnblocks(smgr, forkNum);
841  /* Fail if relation is already at maximum possible length */
842  if (blockNum == P_NEW)
843  ereport(ERROR,
844  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
845  errmsg("cannot extend relation %s beyond %u blocks",
846  relpath(smgr->smgr_rlocator, forkNum),
847  P_NEW)));
848  }
849 
850  if (isLocalBuf)
851  {
852  /*
853  * LocalBufferAlloc() will set the io_context to IOCONTEXT_NORMAL. We
854  * do not use a BufferAccessStrategy for I/O of temporary tables.
855  * However, in some cases, the "strategy" may not be NULL, so we can't
856  * rely on IOContextForStrategy() to set the right IOContext for us.
857  * This may happen in cases like CREATE TEMPORARY TABLE AS...
858  */
859  bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found, &io_context);
860  if (found)
861  pgBufferUsage.local_blks_hit++;
862  else if (isExtend)
863  pgBufferUsage.local_blks_written++;
864  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
865  mode == RBM_ZERO_ON_ERROR)
866  pgBufferUsage.local_blks_read++;
867  }
868  else
869  {
870  /*
871  * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
872  * not currently in memory.
873  */
874  bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
875  strategy, &found, &io_context);
876  if (found)
877  pgBufferUsage.shared_blks_hit++;
878  else if (isExtend)
879  pgBufferUsage.shared_blks_written++;
880  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
881  mode == RBM_ZERO_ON_ERROR)
882  pgBufferUsage.shared_blks_read++;
883  }
884 
885  /* At this point we do NOT hold any locks. */
886 
887  /* if it was already in the buffer pool, we're done */
888  if (found)
889  {
890  if (!isExtend)
891  {
892  /* Just need to update stats before we exit */
893  *hit = true;
894  VacuumPageHit++;
895 
896  if (VacuumCostActive)
897  VacuumCostBalance += VacuumCostPageHit;
898 
899  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
903  smgr->smgr_rlocator.backend,
904  isExtend,
905  found);
906 
907  /*
908  * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
909  * locked on return.
910  */
911  if (!isLocalBuf)
912  {
913  if (mode == RBM_ZERO_AND_LOCK)
914  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
915  LW_EXCLUSIVE);
916  else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
917  LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
918  }
919 
920  return BufferDescriptorGetBuffer(bufHdr);
921  }
922 
923  /*
924  * We get here only in the corner case where we are trying to extend
925  * the relation but we found a pre-existing buffer marked BM_VALID.
926  * This can happen because mdread doesn't complain about reads beyond
927  * EOF (when zero_damaged_pages is ON) and so a previous attempt to
928  * read a block beyond EOF could have left a "valid" zero-filled
929  * buffer. Unfortunately, we have also seen this case occurring
930  * because of buggy Linux kernels that sometimes return an
931  * lseek(SEEK_END) result that doesn't account for a recent write. In
932  * that situation, the pre-existing buffer would contain valid data
933  * that we don't want to overwrite. Since the legitimate case should
934  * always have left a zero-filled buffer, complain if not PageIsNew.
935  */
936  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
937  if (!PageIsNew((Page) bufBlock))
938  ereport(ERROR,
939  (errmsg("unexpected data beyond EOF in block %u of relation %s",
940  blockNum, relpath(smgr->smgr_rlocator, forkNum)),
941  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
942 
943  /*
944  * We *must* do smgrextend before succeeding, else the page will not
945  * be reserved by the kernel, and the next P_NEW call will decide to
946  * return the same page. Clear the BM_VALID bit, do the StartBufferIO
947  * call that BufferAlloc didn't, and proceed.
948  */
949  if (isLocalBuf)
950  {
951  /* Only need to adjust flags */
952  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
953 
954  Assert(buf_state & BM_VALID);
955  buf_state &= ~BM_VALID;
956  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
957  }
958  else
959  {
960  /*
961  * Loop to handle the very small possibility that someone re-sets
962  * BM_VALID between our clearing it and StartBufferIO inspecting
963  * it.
964  */
965  do
966  {
967  uint32 buf_state = LockBufHdr(bufHdr);
968 
969  Assert(buf_state & BM_VALID);
970  buf_state &= ~BM_VALID;
971  UnlockBufHdr(bufHdr, buf_state);
972  } while (!StartBufferIO(bufHdr, true));
973  }
974  }
975 
976  /*
977  * if we have gotten to this point, we have allocated a buffer for the
978  * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
979  * if it's a shared buffer.
980  *
981  * Note: if smgrextend fails, we will end up with a buffer that is
982  * allocated but not marked BM_VALID. P_NEW will still select the same
983  * block number (because the relation didn't get any longer on disk) and
984  * so future attempts to extend the relation will find the same buffer (if
985  * it's not been recycled) but come right back here to try smgrextend
986  * again.
987  */
988  Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
989 
990  if (isLocalBuf)
991  {
992  bufBlock = LocalBufHdrGetBlock(bufHdr);
993  io_object = IOOBJECT_TEMP_RELATION;
994  }
995  else
996  {
997  bufBlock = BufHdrGetBlock(bufHdr);
998  io_object = IOOBJECT_RELATION;
999  }
1000 
1001  if (isExtend)
1002  {
1003  /* new buffers are zero-filled */
1004  MemSet((char *) bufBlock, 0, BLCKSZ);
1005  /* don't set checksum for all-zero page */
1006  smgrextend(smgr, forkNum, blockNum, bufBlock, false);
1007 
1008  pgstat_count_io_op(io_object, io_context, IOOP_EXTEND);
1009 
1010  /*
1011  * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
1012  * although we're essentially performing a write. At least on linux
1013  * doing so defeats the 'delayed allocation' mechanism, leading to
1014  * increased file fragmentation.
1015  */
1016  }
1017  else
1018  {
1019  /*
1020  * Read in the page, unless the caller intends to overwrite it and
1021  * just wants us to allocate a buffer.
1022  */
1023  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
1024  MemSet((char *) bufBlock, 0, BLCKSZ);
1025  else
1026  {
1027  instr_time io_start,
1028  io_time;
1029 
1030  if (track_io_timing)
1031  INSTR_TIME_SET_CURRENT(io_start);
1032  else
1033  INSTR_TIME_SET_ZERO(io_start);
1034 
1035  smgrread(smgr, forkNum, blockNum, bufBlock);
1036 
1037  pgstat_count_io_op(io_object, io_context, IOOP_READ);
1038 
1039  if (track_io_timing)
1040  {
1041  INSTR_TIME_SET_CURRENT(io_time);
1042  INSTR_TIME_SUBTRACT(io_time, io_start);
1043  pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
1044  INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
1045  }
1046 
1047  /* check for garbage data */
1048  if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
1049  PIV_LOG_WARNING | PIV_REPORT_STAT))
1050  {
1051  if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
1052  {
1053  ereport(WARNING,
1054  (errcode(ERRCODE_DATA_CORRUPTED),
1055  errmsg("invalid page in block %u of relation %s; zeroing out page",
1056  blockNum,
1057  relpath(smgr->smgr_rlocator, forkNum))));
1058  MemSet((char *) bufBlock, 0, BLCKSZ);
1059  }
1060  else
1061  ereport(ERROR,
1062  (errcode(ERRCODE_DATA_CORRUPTED),
1063  errmsg("invalid page in block %u of relation %s",
1064  blockNum,
1065  relpath(smgr->smgr_rlocator, forkNum))));
1066  }
1067  }
1068  }
1069 
1070  /*
1071  * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
1072  * the page as valid, to make sure that no other backend sees the zeroed
1073  * page before the caller has had a chance to initialize it.
1074  *
1075  * Since no-one else can be looking at the page contents yet, there is no
1076  * difference between an exclusive lock and a cleanup-strength lock. (Note
1077  * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
1078  * they assert that the buffer is already valid.)
1079  */
1080  if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
1081  !isLocalBuf)
1082  {
1083  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
1084  }
1085 
1086  if (isLocalBuf)
1087  {
1088  /* Only need to adjust flags */
1089  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
1090 
1091  buf_state |= BM_VALID;
1092  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
1093  }
1094  else
1095  {
1096  /* Set BM_VALID, terminate IO, and wake up any waiters */
1097  TerminateBufferIO(bufHdr, false, BM_VALID);
1098  }
1099 
1100  VacuumPageMiss++;
1101  if (VacuumCostActive)
1102  VacuumCostBalance += VacuumCostPageMiss;
1103 
1104  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1106  smgr->smgr_rlocator.locator.dbOid,
1108  smgr->smgr_rlocator.backend,
1109  isExtend,
1110  found);
1111 
1112  return BufferDescriptorGetBuffer(bufHdr);
1113 }
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext *io_context)
Definition: bufmgr.c:1139
bool zero_damaged_pages
Definition: bufmgr.c:134
void LockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:4303
#define P_NEW
Definition: bufmgr.h:105
@ RBM_ZERO_ON_ERROR
Definition: bufmgr.h:49
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition: bufmgr.h:47
@ RBM_ZERO_AND_LOCK
Definition: bufmgr.h:45
@ RBM_NORMAL_NO_LOG
Definition: bufmgr.h:50
bool PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags)
Definition: bufpage.c:88
#define PIV_LOG_WARNING
Definition: bufpage.h:465
static bool PageIsNew(Page page)
Definition: bufpage.h:230
#define PIV_REPORT_STAT
Definition: bufpage.h:466
#define MemSet(start, val, len)
Definition: c.h:1004
int errhint(const char *fmt,...)
Definition: elog.c:1316
int64 VacuumPageHit
Definition: globals.c:148
int VacuumCostPageMiss
Definition: globals.c:143
int64 VacuumPageMiss
Definition: globals.c:149
int VacuumCostPageHit
Definition: globals.c:142
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr, IOContext *io_context)
Definition: localbuf.c:110
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
#define pgstat_count_buffer_read_time(n)
Definition: pgstat.h:524
IOObject
Definition: pgstat.h:276
IOContext
Definition: pgstat.h:284
@ IOOP_EXTEND
Definition: pgstat.h:296
@ IOOP_READ
Definition: pgstat.h:298
#define relpath(rlocator, forknum)
Definition: relpath.h:94
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:579
void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer)
Definition: smgr.c:532
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.c:493
#define SmgrIsTemp(smgr)
Definition: smgr.h:77
int64 local_blks_hit
Definition: instrument.h:30
int64 local_blks_written
Definition: instrument.h:33
int64 shared_blks_read
Definition: instrument.h:27
instr_time blk_read_time
Definition: instrument.h:36
int64 local_blks_read
Definition: instrument.h:31
int64 shared_blks_hit
Definition: instrument.h:26

References Assert(), RelFileLocatorBackend::backend, BufferUsage::blk_read_time, BM_VALID, BufferAlloc(), BufferDescriptorGetBuffer(), BufferDescriptorGetContentLock(), BufHdrGetBlock, CurrentResourceOwner, RelFileLocator::dbOid, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errhint(), errmsg(), ERROR, INSTR_TIME_ADD, INSTR_TIME_GET_MICROSEC, INSTR_TIME_SET_CURRENT, INSTR_TIME_SET_ZERO, INSTR_TIME_SUBTRACT, IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_EXTEND, IOOP_READ, BufferUsage::local_blks_hit, BufferUsage::local_blks_read, BufferUsage::local_blks_written, LocalBufferAlloc(), LocalBufHdrGetBlock, RelFileLocatorBackend::locator, LockBufferForCleanup(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), MemSet, mode, P_NEW, PageIsNew(), PageIsVerifiedExtended(), pg_atomic_read_u32(), pg_atomic_unlocked_write_u32(), pgBufferUsage, pgstat_count_buffer_read_time, pgstat_count_io_op(), PIV_LOG_WARNING, PIV_REPORT_STAT, RBM_NORMAL, RBM_NORMAL_NO_LOG, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, RBM_ZERO_ON_ERROR, RelFileLocator::relNumber, relpath, ResourceOwnerEnlargeBuffers(), BufferUsage::shared_blks_hit, BufferUsage::shared_blks_read, BufferUsage::shared_blks_written, SMgrRelationData::smgr_rlocator, smgrextend(), SmgrIsTemp, smgrnblocks(), smgrread(), RelFileLocator::spcOid, StartBufferIO(), BufferDesc::state, TerminateBufferIO(), track_io_timing, UnlockBufHdr(), VacuumCostActive, VacuumCostBalance, VacuumCostPageHit, VacuumCostPageMiss, VacuumPageHit, VacuumPageMiss, WARNING, and zero_damaged_pages.

Referenced by ReadBufferExtended(), and ReadBufferWithoutRelcache().

◆ ReadBufferExtended()

Buffer ReadBufferExtended ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy 
)

Definition at line 751 of file bufmgr.c.

753 {
754  bool hit;
755  Buffer buf;
756 
757  /*
758  * Reject attempts to read non-local temporary relations; we would be
759  * likely to get wrong data since we have no visibility into the owning
760  * session's local buffers.
761  */
762  if (RELATION_IS_OTHER_TEMP(reln))
763  ereport(ERROR,
764  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
765  errmsg("cannot access temporary tables of other sessions")));
766 
767  /*
768  * Read the buffer, and update pgstat counters to reflect a cache hit or
769  * miss.
770  */
771  pgstat_count_buffer_read(reln);
772  buf = ReadBuffer_common(RelationGetSmgr(reln), reln->rd_rel->relpersistence,
773  forkNum, blockNum, mode, strategy, &hit);
774  if (hit)
775  pgstat_count_buffer_hit(reln);
776  return buf;
777 }
static Buffer ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool *hit)
Definition: bufmgr.c:811
#define pgstat_count_buffer_read(rel)
Definition: pgstat.h:607
#define pgstat_count_buffer_hit(rel)
Definition: pgstat.h:612
Form_pg_class rd_rel
Definition: rel.h:110

References buf, ereport, errcode(), errmsg(), ERROR, mode, pgstat_count_buffer_hit, pgstat_count_buffer_read, RelationData::rd_rel, ReadBuffer_common(), RELATION_IS_OTHER_TEMP, and RelationGetSmgr().

Referenced by _hash_getbuf_with_strategy(), _hash_getinitbuf(), _hash_getnewbuf(), autoprewarm_database_main(), blbulkdelete(), blgetbitmap(), blvacuumcleanup(), brin_vacuum_scan(), brinbuildempty(), bt_recheck_sibling_links(), btvacuumpage(), collect_corrupt_items(), collect_visibility_data(), count_nondeletable_pages(), fill_seq_fork_with_data(), fsm_readbuf(), get_raw_page_internal(), ginbuildempty(), ginbulkdelete(), ginDeletePage(), ginScanToDelete(), ginvacuumcleanup(), ginVacuumPostingTree(), ginVacuumPostingTreeLeaves(), gistbuildempty(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hashbulkdelete(), heapam_scan_analyze_next_block(), heapgetpage(), lazy_scan_heap(), lazy_vacuum_heap_rel(), log_newpage_range(), palloc_btree_page(), pg_prewarm(), pgstat_btree_page(), pgstat_gist_page(), pgstat_heap(), pgstathashindex(), pgstatindex_impl(), ReadBuffer(), ReadBufferBI(), spgprocesspending(), spgvacuumpage(), statapprox_heap(), verify_heapam(), and vm_readbuf().
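
A bulk-scan sketch (rel and the per-page processing are assumptions): using a BAS_BULKREAD strategy keeps a large sequential read from evicting the whole buffer pool.

  BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
  BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
  BlockNumber blkno;

  for (blkno = 0; blkno < nblocks; blkno++)
  {
      Buffer  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                       RBM_NORMAL, bstrategy);

      LockBuffer(buf, BUFFER_LOCK_SHARE);
      /* ... process the page ... */
      UnlockReleaseBuffer(buf);
  }
  FreeAccessStrategy(bstrategy);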

◆ ReadBufferWithoutRelcache()

Buffer ReadBufferWithoutRelcache ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy,
bool  permanent 
)

Definition at line 791 of file bufmgr.c.

794 {
795  bool hit;
796 
797  SMgrRelation smgr = smgropen(rlocator, InvalidBackendId);
798 
799  return ReadBuffer_common(smgr, permanent ? RELPERSISTENCE_PERMANENT :
800  RELPERSISTENCE_UNLOGGED, forkNum, blockNum,
801  mode, strategy, &hit);
802 }

References InvalidBackendId, mode, ReadBuffer_common(), and smgropen().

Referenced by RelationCopyStorageUsingBuffer(), ScanSourceDatabasePgClass(), and XLogReadBufferExtended().

◆ ReadRecentBuffer()

bool ReadRecentBuffer ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  blockNum,
Buffer  recent_buffer 
)

Definition at line 615 of file bufmgr.c.

617 {
618  BufferDesc *bufHdr;
619  BufferTag tag;
620  uint32 buf_state;
621  bool have_private_ref;
622 
623  Assert(BufferIsValid(recent_buffer));
624 
627  InitBufferTag(&tag, &rlocator, forkNum, blockNum);
628 
629  if (BufferIsLocal(recent_buffer))
630  {
631  int b = -recent_buffer - 1;
632 
633  bufHdr = GetLocalBufferDescriptor(b);
634  buf_state = pg_atomic_read_u32(&bufHdr->state);
635 
636  /* Is it still valid and holding the right tag? */
637  if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
638  {
639  /*
640  * Bump buffer's ref and usage counts. This is equivalent of
641  * PinBuffer for a shared buffer.
642  */
643  if (LocalRefCount[b] == 0)
644  {
645  if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
646  {
647  buf_state += BUF_USAGECOUNT_ONE;
648  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
649  }
650  }
651  LocalRefCount[b]++;
653 
655 
656  return true;
657  }
658  }
659  else
660  {
661  bufHdr = GetBufferDescriptor(recent_buffer - 1);
662  have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
663 
664  /*
665  * Do we already have this buffer pinned with a private reference? If
666  * so, it must be valid and it is safe to check the tag without
667  * locking. If not, we have to lock the header first and then check.
668  */
669  if (have_private_ref)
670  buf_state = pg_atomic_read_u32(&bufHdr->state);
671  else
672  buf_state = LockBufHdr(bufHdr);
673 
674  if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
675  {
676  /*
677  * It's now safe to pin the buffer. We can't pin first and ask
678  * questions later, because it might confuse code paths like
679  * InvalidateBuffer() if we pinned a random non-matching buffer.
680  */
681  if (have_private_ref)
682  PinBuffer(bufHdr, NULL); /* bump pin count */
683  else
684  PinBuffer_Locked(bufHdr); /* pin for first time */
685 
686  pgBufferUsage.shared_blks_hit++;
687 
688  return true;
689  }
690 
691  /* If we locked the header above, now unlock. */
692  if (!have_private_ref)
693  UnlockBufHdr(bufHdr, buf_state);
694  }
695 
696  return false;
697 }

References Assert(), b, BM_MAX_USAGE_COUNT, BM_VALID, BUF_STATE_GET_USAGECOUNT, BUF_USAGECOUNT_ONE, BufferIsLocal, BufferIsValid(), BufferTagsEqual(), CurrentResourceOwner, GetBufferDescriptor(), GetLocalBufferDescriptor(), GetPrivateRefCount(), InitBufferTag(), BufferUsage::local_blks_hit, LocalRefCount, LockBufHdr(), pg_atomic_read_u32(), pg_atomic_unlocked_write_u32(), pgBufferUsage, PinBuffer(), PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), ResourceOwnerRememberBuffer(), BufferUsage::shared_blks_hit, BufferDesc::state, BufferDesc::tag, and UnlockBufHdr().

Referenced by XLogReadBufferExtended().
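
The intended pattern, sketched here with hypothetical variables (recent_buf, rlocator, blkno) rather than the exact XLogReadBufferExtended() code: remember the Buffer obtained last time for a block, try to re-pin it cheaply with ReadRecentBuffer(), and fall back to a normal read if the buffer no longer holds that block.

/* Hypothetical fast path around the buffer mapping table. */
Buffer		buf;

if (BufferIsValid(recent_buf) &&
	ReadRecentBuffer(rlocator, MAIN_FORKNUM, blkno, recent_buf))
	buf = recent_buf;			/* still there; now pinned again */
else
	buf = ReadBufferWithoutRelcache(rlocator, MAIN_FORKNUM, blkno,
									RBM_NORMAL, NULL, true);
recent_buf = buf;				/* remember for the next call */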

◆ RelationCopyStorageUsingBuffer()

static void RelationCopyStorageUsingBuffer ( RelFileLocator  srclocator,
RelFileLocator  dstlocator,
ForkNumber  forkNum,
bool  permanent 
)
static

Definition at line 3782 of file bufmgr.c.

3785 {
3786  Buffer srcBuf;
3787  Buffer dstBuf;
3788  Page srcPage;
3789  Page dstPage;
3790  bool use_wal;
3791  BlockNumber nblocks;
3792  BlockNumber blkno;
3793  PGAlignedBlock buf;
3794  BufferAccessStrategy bstrategy_src;
3795  BufferAccessStrategy bstrategy_dst;
3796 
3797  /*
3798  * In general, we want to write WAL whenever wal_level > 'minimal', but we
3799  * can skip it when copying any fork of an unlogged relation other than
3800  * the init fork.
3801  */
3802  use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
3803 
3804  /* Get number of blocks in the source relation. */
3805  nblocks = smgrnblocks(smgropen(srclocator, InvalidBackendId),
3806  forkNum);
3807 
3808  /* Nothing to copy; just return. */
3809  if (nblocks == 0)
3810  return;
3811 
3812  /*
3813  * Bulk extend the destination relation of the same size as the source
3814  * relation before starting to copy block by block.
3815  */
3816  memset(buf.data, 0, BLCKSZ);
3817  smgrextend(smgropen(dstlocator, InvalidBackendId), forkNum, nblocks - 1,
3818  buf.data, true);
3819 
3820  /* This is a bulk operation, so use buffer access strategies. */
3821  bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
3822  bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
3823 
3824  /* Iterate over each block of the source relation file. */
3825  for (blkno = 0; blkno < nblocks; blkno++)
3826  {
3827  CHECK_FOR_INTERRUPTS();
3828 
3829  /* Read block from source relation. */
3830  srcBuf = ReadBufferWithoutRelcache(srclocator, forkNum, blkno,
3831  RBM_NORMAL, bstrategy_src,
3832  permanent);
3833  LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
3834  srcPage = BufferGetPage(srcBuf);
3835 
3836  dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum, blkno,
3837  RBM_ZERO_AND_LOCK, bstrategy_dst,
3838  permanent);
3839  dstPage = BufferGetPage(dstBuf);
3840 
3841  START_CRIT_SECTION();
3842 
3843  /* Copy page data from the source to the destination. */
3844  memcpy(dstPage, srcPage, BLCKSZ);
3845  MarkBufferDirty(dstBuf);
3846 
3847  /* WAL-log the copied page. */
3848  if (use_wal)
3849  log_newpage_buffer(dstBuf, true);
3850 
3851  END_CRIT_SECTION();
3852 
3853  UnlockReleaseBuffer(dstBuf);
3854  UnlockReleaseBuffer(srcBuf);
3855  }
3856 
3857  FreeAccessStrategy(bstrategy_src);
3858  FreeAccessStrategy(bstrategy_dst);
3859 }
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4028
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1631
Buffer ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
Definition: bufmgr.c:791
@ BAS_BULKREAD
Definition: bufmgr.h:35
@ BAS_BULKWRITE
Definition: bufmgr.h:37
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition: freelist.c:541
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:596
#define START_CRIT_SECTION()
Definition: miscadmin.h:148
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:121
#define END_CRIT_SECTION()
Definition: miscadmin.h:150
#define XLogIsNeeded()
Definition: xlog.h:104
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
Definition: xloginsert.c:1191

References BAS_BULKREAD, BAS_BULKWRITE, buf, BUFFER_LOCK_SHARE, BufferGetPage(), CHECK_FOR_INTERRUPTS, END_CRIT_SECTION, FreeAccessStrategy(), GetAccessStrategy(), INIT_FORKNUM, InvalidBackendId, LockBuffer(), log_newpage_buffer(), MarkBufferDirty(), RBM_NORMAL, RBM_ZERO_AND_LOCK, ReadBufferWithoutRelcache(), smgrextend(), smgrnblocks(), smgropen(), START_CRIT_SECTION, UnlockReleaseBuffer(), and XLogIsNeeded.

Referenced by CreateAndCopyRelationData().
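
Since this helper is static, it is only reachable through CreateAndCopyRelationData(). Roughly, that caller copies the main fork and then each additional fork that exists in the source, along the lines of this simplified, hypothetical sketch (WAL-logging of fork creation and other details omitted; src_rlocator, dst_rlocator and permanent are placeholders):

/* Simplified sketch of a per-fork caller; not the verbatim caller code. */
RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
							   permanent);

for (ForkNumber forkNum = MAIN_FORKNUM + 1; forkNum <= MAX_FORKNUM; forkNum++)
{
	if (smgrexists(smgropen(src_rlocator, InvalidBackendId), forkNum))
	{
		smgrcreate(smgropen(dst_rlocator, InvalidBackendId), forkNum, false);
		RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
									   permanent);
	}
}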

◆ RelationGetNumberOfBlocksInFork()

BlockNumber RelationGetNumberOfBlocksInFork ( Relation  relation,
ForkNumber  forkNum 
)

Definition at line 3021 of file bufmgr.c.

3022 {
3023  if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
3024  {
3025  /*
3026  * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
3027  * tableam returns the size in bytes - but for the purpose of this
3028  * routine, we want the number of blocks. Therefore divide, rounding
3029  * up.
3030  */
3031  uint64 szbytes;
3032 
3033  szbytes = table_relation_size(relation, forkNum);
3034 
3035  return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
3036  }
3037  else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
3038  {
3039  return smgrnblocks(RelationGetSmgr(relation), forkNum);
3040  }
3041  else
3042  Assert(false);
3043 
3044  return 0; /* keep compiler quiet */
3045 }
static uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
Definition: tableam.h:1875

References Assert(), RelationData::rd_rel, RelationGetSmgr(), smgrnblocks(), and table_relation_size().

Referenced by _hash_getnewbuf(), _hash_init(), autoprewarm_database_main(), get_raw_page_internal(), and pg_prewarm().
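
Because table AMs report their size in bytes, the result is rounded up: with BLCKSZ = 8192, an AM reporting 8193 bytes yields (8193 + 8191) / 8192 = 2 blocks. A small, hypothetical usage sketch (rel is a placeholder), in the spirit of callers such as pg_prewarm():

/* Hypothetical: walk every block of the relation's main fork. */
BlockNumber nblocks = RelationGetNumberOfBlocksInFork(rel, MAIN_FORKNUM);

for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
{
	/* ... read and process block blkno ... */
}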

◆ ReleaseAndReadBuffer()

Buffer ReleaseAndReadBuffer ( Buffer  buffer,
Relation  relation,
BlockNumber  blockNum 
)

Definition at line 1694 of file bufmgr.c.

1697 {
1698  ForkNumber forkNum = MAIN_FORKNUM;
1699  BufferDesc *bufHdr;
1700 
1701  if (BufferIsValid(buffer))
1702  {
1703  Assert(BufferIsPinned(buffer));
1704  if (BufferIsLocal(buffer))
1705  {
1706  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1707  if (bufHdr->tag.blockNum == blockNum &&
1708  BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
1709  BufTagGetForkNum(&bufHdr->tag) == forkNum)
1710  return buffer;
1711  ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
1712  LocalRefCount[-buffer - 1]--;
1713  }
1714  else
1715  {
1716  bufHdr = GetBufferDescriptor(buffer - 1);
1717  /* we have pin, so it's ok to examine tag without spinlock */
1718  if (bufHdr->tag.blockNum == blockNum &&
1719  BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
1720  BufTagGetForkNum(&bufHdr->tag) == forkNum)
1721  return buffer;
1722  UnpinBuffer(bufHdr);
1723  }
1724  }
1725 
1726  return ReadBuffer(relation, blockNum);
1727 }
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:704
void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
Definition: resowner.c:972

References Assert(), buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferIsValid(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), CurrentResourceOwner, GetBufferDescriptor(), GetLocalBufferDescriptor(),