PostgreSQL Source Code  git master
bufmgr.c File Reference
#include "postgres.h"
#include <sys/file.h>
#include <unistd.h>
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
#include "catalog/catalog.h"
#include "catalog/storage.h"
#include "catalog/storage_xlog.h"
#include "executor/instrument.h"
#include "lib/binaryheap.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "storage/standby.h"
#include "utils/memdebug.h"
#include "utils/ps_status.h"
#include "utils/rel.h"
#include "utils/resowner_private.h"
#include "utils/timestamp.h"
#include "lib/sort_template.h"
Include dependency graph for bufmgr.c

Go to the source code of this file.

Data Structures

struct  PrivateRefCountEntry
 
struct  CkptTsStatus
 
struct  SMgrSortArray
 

Macros

#define BufHdrGetBlock(bufHdr)   ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
 
#define BufferGetLSN(bufHdr)   (PageGetLSN(BufHdrGetBlock(bufHdr)))
 
#define LocalBufHdrGetBlock(bufHdr)    LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
 
#define BUF_WRITTEN   0x01
 
#define BUF_REUSABLE   0x02
 
#define RELS_BSEARCH_THRESHOLD   20
 
#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)
 
#define REFCOUNT_ARRAY_ENTRIES   8
 
#define BufferIsPinned(bufnum)
 
#define ST_SORT   sort_checkpoint_bufferids
 
#define ST_ELEMENT_TYPE   CkptSortItem
 
#define ST_COMPARE(a, b)   ckpt_buforder_comparator(a, b)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 
#define ST_SORT   sort_pending_writebacks
 
#define ST_ELEMENT_TYPE   PendingWriteback
 
#define ST_COMPARE(a, b)   buffertag_comparator(&a->tag, &b->tag)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 

Typedefs

typedef struct PrivateRefCountEntry PrivateRefCountEntry
 
typedef struct CkptTsStatus CkptTsStatus
 
typedef struct SMgrSortArray SMgrSortArray
 

Functions

static void ReservePrivateRefCountEntry (void)
 
static PrivateRefCountEntry * NewPrivateRefCountEntry (Buffer buffer)
 
static PrivateRefCountEntry * GetPrivateRefCountEntry (Buffer buffer, bool do_move)
 
static int32 GetPrivateRefCount (Buffer buffer)
 
static void ForgetPrivateRefCountEntry (PrivateRefCountEntry *ref)
 
static Buffer ReadBuffer_common (SMgrRelation reln, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool *hit)
 
static bool PinBuffer (BufferDesc *buf, BufferAccessStrategy strategy)
 
static void PinBuffer_Locked (BufferDesc *buf)
 
static void UnpinBuffer (BufferDesc *buf, bool fixOwner)
 
static void BufferSync (int flags)
 
static uint32 WaitBufHdrUnlocked (BufferDesc *buf)
 
static int SyncOneBuffer (int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 
static void WaitIO (BufferDesc *buf)
 
static bool StartBufferIO (BufferDesc *buf, bool forInput)
 
static void TerminateBufferIO (BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
 
static void shared_buffer_write_error_callback (void *arg)
 
static void local_buffer_write_error_callback (void *arg)
 
static BufferDesc * BufferAlloc (SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
 
static void FlushBuffer (BufferDesc *buf, SMgrRelation reln)
 
static void FindAndDropRelFileNodeBuffers (RelFileNode rnode, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
 
static void RelationCopyStorageUsingBuffer (Relation src, Relation dst, ForkNumber forkNum, bool isunlogged)
 
static void AtProcExit_Buffers (int code, Datum arg)
 
static void CheckForBufferLeaks (void)
 
static int rnode_comparator (const void *p1, const void *p2)
 
static int buffertag_comparator (const BufferTag *a, const BufferTag *b)
 
static int ckpt_buforder_comparator (const CkptSortItem *a, const CkptSortItem *b)
 
static int ts_ckpt_progress_comparator (Datum a, Datum b, void *arg)
 
PrefetchBufferResult PrefetchSharedBuffer (SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
 
PrefetchBufferResult PrefetchBuffer (Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 
bool ReadRecentBuffer (RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
 
Buffer ReadBuffer (Relation reln, BlockNumber blockNum)
 
Buffer ReadBufferExtended (Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
Buffer ReadBufferWithoutRelcache (RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
 
static void InvalidateBuffer (BufferDesc *buf)
 
void MarkBufferDirty (Buffer buffer)
 
Buffer ReleaseAndReadBuffer (Buffer buffer, Relation relation, BlockNumber blockNum)
 
bool BgBufferSync (WritebackContext *wb_context)
 
void AtEOXact_Buffers (bool isCommit)
 
void InitBufferPoolAccess (void)
 
void PrintBufferLeakWarning (Buffer buffer)
 
void CheckPointBuffers (int flags)
 
void BufmgrCommit (void)
 
BlockNumber BufferGetBlockNumber (Buffer buffer)
 
void BufferGetTag (Buffer buffer, RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blknum)
 
BlockNumber RelationGetNumberOfBlocksInFork (Relation relation, ForkNumber forkNum)
 
bool BufferIsPermanent (Buffer buffer)
 
XLogRecPtr BufferGetLSNAtomic (Buffer buffer)
 
void DropRelFileNodeBuffers (SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
 
void DropRelFileNodesAllBuffers (SMgrRelation *smgr_reln, int nnodes)
 
void DropDatabaseBuffers (Oid dbid)
 
void FlushRelationBuffers (Relation rel)
 
void FlushRelationsAllBuffers (SMgrRelation *smgrs, int nrels)
 
void CreateAndCopyRelationData (RelFileNode src_rnode, RelFileNode dst_rnode, bool permanent)
 
void FlushDatabaseBuffers (Oid dbid)
 
void FlushOneBuffer (Buffer buffer)
 
void ReleaseBuffer (Buffer buffer)
 
void UnlockReleaseBuffer (Buffer buffer)
 
void IncrBufferRefCount (Buffer buffer)
 
void MarkBufferDirtyHint (Buffer buffer, bool buffer_std)
 
void UnlockBuffers (void)
 
void LockBuffer (Buffer buffer, int mode)
 
bool ConditionalLockBuffer (Buffer buffer)
 
void LockBufferForCleanup (Buffer buffer)
 
bool HoldingBufferPinThatDelaysRecovery (void)
 
bool ConditionalLockBufferForCleanup (Buffer buffer)
 
bool IsBufferCleanupOK (Buffer buffer)
 
void AbortBufferIO (void)
 
uint32 LockBufHdr (BufferDesc *desc)
 
void WritebackContextInit (WritebackContext *context, int *max_pending)
 
void ScheduleBufferTagForWriteback (WritebackContext *context, BufferTag *tag)
 
void IssuePendingWritebacks (WritebackContext *context)
 
void TestForOldSnapshot_impl (Snapshot snapshot, Relation relation)
 

Variables

bool zero_damaged_pages = false
 
int bgwriter_lru_maxpages = 100
 
double bgwriter_lru_multiplier = 2.0
 
bool track_io_timing = false
 
int effective_io_concurrency = 0
 
int maintenance_io_concurrency = 0
 
int checkpoint_flush_after = 0
 
int bgwriter_flush_after = 0
 
int backend_flush_after = 0
 
static BufferDesc * InProgressBuf = NULL
 
static bool IsForInput
 
static BufferDesc * PinCountWaitBuf = NULL
 
static struct PrivateRefCountEntry PrivateRefCountArray [REFCOUNT_ARRAY_ENTRIES]
 
static HTAB * PrivateRefCountHash = NULL
 
static int32 PrivateRefCountOverflowed = 0
 
static uint32 PrivateRefCountClock = 0
 
static PrivateRefCountEntry * ReservedRefCountEntry = NULL
 

Macro Definition Documentation

◆ BUF_DROP_FULL_SCAN_THRESHOLD

#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)

Definition at line 81 of file bufmgr.c.

◆ BUF_REUSABLE

#define BUF_REUSABLE   0x02

Definition at line 71 of file bufmgr.c.

◆ BUF_WRITTEN

#define BUF_WRITTEN   0x01

Definition at line 70 of file bufmgr.c.

◆ BufferGetLSN

#define BufferGetLSN (   bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))

Definition at line 63 of file bufmgr.c.

◆ BufferIsPinned

#define BufferIsPinned (   bufnum)
Value:
( \
!BufferIsValid(bufnum) ? \
false \
: \
BufferIsLocal(bufnum) ? \
(LocalRefCount[-(bufnum) - 1] > 0) \
: \
(GetPrivateRefCount(bufnum) > 0) \
)
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:389
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123
int32 * LocalRefCount
Definition: localbuf.c:45

Definition at line 450 of file bufmgr.c.
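
BufferIsPinned() tests this backend's own pin count (LocalRefCount for temp-table buffers, the private refcount machinery for shared buffers); it says nothing about pins held by other backends. A minimal, hypothetical sketch of how a backend's private pin count rises and falls (the helper name is illustrative, not from bufmgr.c):

#include "postgres.h"
#include "storage/bufmgr.h"

/* Hypothetical helper: take a second, independent pin on a buffer the
 * caller has already pinned, then drop it again.  BufferIsPinned()
 * would report true for this backend the whole time. */
static void
hold_extra_pin(Buffer buf)
{
    IncrBufferRefCount(buf);    /* bump this backend's private refcount */
    /* ... pass 'buf' to code that expects its own pin ... */
    ReleaseBuffer(buf);         /* drop the extra pin we took */
}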

◆ BufHdrGetBlock

#define BufHdrGetBlock (   bufHdr)    ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))

Definition at line 62 of file bufmgr.c.

◆ LocalBufHdrGetBlock

#define LocalBufHdrGetBlock (   bufHdr)     LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]

Definition at line 66 of file bufmgr.c.

◆ REFCOUNT_ARRAY_ENTRIES

#define REFCOUNT_ARRAY_ENTRIES   8

Definition at line 90 of file bufmgr.c.

◆ RELS_BSEARCH_THRESHOLD

#define RELS_BSEARCH_THRESHOLD   20

Definition at line 73 of file bufmgr.c.

◆ ST_COMPARE [1/2]

#define ST_COMPARE (   a,
  b 
)    ckpt_buforder_comparator(a, b)

Definition at line 4911 of file bufmgr.c.

◆ ST_COMPARE [2/2]

#define ST_COMPARE (   a,
  b 
)    buffertag_comparator(&a->tag, &b->tag)

Definition at line 4911 of file bufmgr.c.

◆ ST_DEFINE [1/2]

#define ST_DEFINE

Definition at line 4913 of file bufmgr.c.

◆ ST_DEFINE [2/2]

#define ST_DEFINE

Definition at line 4913 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [1/2]

#define ST_ELEMENT_TYPE   CkptSortItem

Definition at line 4910 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [2/2]

#define ST_ELEMENT_TYPE   PendingWriteback

Definition at line 4910 of file bufmgr.c.

◆ ST_SCOPE [1/2]

#define ST_SCOPE   static

Definition at line 4912 of file bufmgr.c.

◆ ST_SCOPE [2/2]

#define ST_SCOPE   static

Definition at line 4912 of file bufmgr.c.

◆ ST_SORT [1/2]

#define ST_SORT   sort_checkpoint_bufferids

Definition at line 4909 of file bufmgr.c.

◆ ST_SORT [2/2]

#define ST_SORT   sort_pending_writebacks

Definition at line 4909 of file bufmgr.c.
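
The two groups of ST_* definitions instantiate lib/sort_template.h twice: once to generate sort_checkpoint_bufferids() over CkptSortItem, and once to generate sort_pending_writebacks() over PendingWriteback. A hedged sketch of the same pattern for an unrelated, made-up element type (all names below are illustrative, not from bufmgr.c):

#include "postgres.h"

/* Illustrative instantiation of the sort_template.h pattern. */
typedef struct MyItem
{
    int         key;
} MyItem;

static int
my_item_cmp(const MyItem *a, const MyItem *b)
{
    return (a->key > b->key) - (a->key < b->key);
}

#define ST_SORT sort_my_items
#define ST_ELEMENT_TYPE MyItem
#define ST_COMPARE(a, b) my_item_cmp(a, b)
#define ST_SCOPE static
#define ST_DEFINE
#include "lib/sort_template.h"

/* Usage: sort_my_items(array, nitems); */

sort_template.h #undefs its parameter macros after generating the sort function, which is why bufmgr.c can define the same ST_* names a second time for the pending-writeback sort.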

Typedef Documentation

◆ CkptTsStatus

typedef struct CkptTsStatus CkptTsStatus

◆ PrivateRefCountEntry

typedef struct PrivateRefCountEntry PrivateRefCountEntry

◆ SMgrSortArray

typedef struct SMgrSortArray SMgrSortArray

Function Documentation

◆ AbortBufferIO()

void AbortBufferIO ( void  )

Definition at line 4626 of file bufmgr.c.

4627 {
4628  BufferDesc *buf = InProgressBuf;
4629 
4630  if (buf)
4631  {
4632  uint32 buf_state;
4633 
4634  buf_state = LockBufHdr(buf);
4635  Assert(buf_state & BM_IO_IN_PROGRESS);
4636  if (IsForInput)
4637  {
4638  Assert(!(buf_state & BM_DIRTY));
4639 
4640  /* We'd better not think buffer is valid yet */
4641  Assert(!(buf_state & BM_VALID));
4642  UnlockBufHdr(buf, buf_state);
4643  }
4644  else
4645  {
4646  Assert(buf_state & BM_DIRTY);
4647  UnlockBufHdr(buf, buf_state);
4648  /* Issue notice if this is not the first failure... */
4649  if (buf_state & BM_IO_ERROR)
4650  {
4651  /* Buffer is pinned, so we can read tag without spinlock */
4652  char *path;
4653 
4654  path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
4655  ereport(WARNING,
4656  (errcode(ERRCODE_IO_ERROR),
4657  errmsg("could not write block %u of %s",
4658  buf->tag.blockNum, path),
4659  errdetail("Multiple failures --- write error might be permanent.")));
4660  pfree(path);
4661  }
4662  }
4663  TerminateBufferIO(buf, false, BM_IO_ERROR);
4664  }
4665 }
#define BM_DIRTY
Definition: buf_internals.h:59
#define BM_IO_IN_PROGRESS
Definition: buf_internals.h:62
#define UnlockBufHdr(desc, s)
#define BM_VALID
Definition: buf_internals.h:60
#define BM_IO_ERROR
Definition: buf_internals.h:63
static BufferDesc * InProgressBuf
Definition: bufmgr.c:163
static bool IsForInput
Definition: bufmgr.c:164
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4736
static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
Definition: bufmgr.c:4594
unsigned int uint32
Definition: c.h:441
int errdetail(const char *fmt,...)
Definition: elog.c:1037
int errcode(int sqlerrcode)
Definition: elog.c:693
int errmsg(const char *fmt,...)
Definition: elog.c:904
#define WARNING
Definition: elog.h:30
#define ereport(elevel,...)
Definition: elog.h:143
Assert(fmt[strlen(fmt) - 1] !='\n')
void pfree(void *pointer)
Definition: mcxt.c:1175
static char * buf
Definition: pg_test_fsync.c:67
#define relpathperm(rnode, forknum)
Definition: relpath.h:83

References Assert(), BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_VALID, buf, ereport, errcode(), errdetail(), errmsg(), InProgressBuf, IsForInput, LockBufHdr(), pfree(), relpathperm, TerminateBufferIO(), UnlockBufHdr, and WARNING.

Referenced by AbortSubTransaction(), AbortTransaction(), AtProcExit_Buffers(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), and WalWriterMain().

◆ AtEOXact_Buffers()

void AtEOXact_Buffers ( bool  isCommit)

Definition at line 2587 of file bufmgr.c.

2588 {
2589  CheckForBufferLeaks();
2590 
2591  AtEOXact_LocalBuffers(isCommit);
2592 
2593  Assert(PrivateRefCountOverflowed == 0);
2594 }
static void CheckForBufferLeaks(void)
Definition: bufmgr.c:2648
static int32 PrivateRefCountOverflowed
Definition: bufmgr.c:201
void AtEOXact_LocalBuffers(bool isCommit)
Definition: localbuf.c:577

References Assert(), AtEOXact_LocalBuffers(), CheckForBufferLeaks(), and PrivateRefCountOverflowed.

Referenced by AbortTransaction(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), CommitTransaction(), PrepareTransaction(), and WalWriterMain().

◆ AtProcExit_Buffers()

static void AtProcExit_Buffers ( int  code,
Datum  arg 
)
static

Definition at line 2629 of file bufmgr.c.

2630 {
2631  AbortBufferIO();
2632  UnlockBuffers();
2633 
2634  CheckForBufferLeaks();
2635 
2636  /* localbuf.c needs a chance too */
2637  AtProcExit_LocalBuffers();
2638 }
void UnlockBuffers(void)
Definition: bufmgr.c:4128
void AbortBufferIO(void)
Definition: bufmgr.c:4626
void AtProcExit_LocalBuffers(void)
Definition: localbuf.c:588

References AbortBufferIO(), AtProcExit_LocalBuffers(), CheckForBufferLeaks(), and UnlockBuffers().

Referenced by InitBufferPoolAccess().

◆ BgBufferSync()

bool BgBufferSync ( WritebackContext *  wb_context)

Definition at line 2217 of file bufmgr.c.

2218 {
2219  /* info obtained from freelist.c */
2220  int strategy_buf_id;
2221  uint32 strategy_passes;
2222  uint32 recent_alloc;
2223 
2224  /*
2225  * Information saved between calls so we can determine the strategy
2226  * point's advance rate and avoid scanning already-cleaned buffers.
2227  */
2228  static bool saved_info_valid = false;
2229  static int prev_strategy_buf_id;
2230  static uint32 prev_strategy_passes;
2231  static int next_to_clean;
2232  static uint32 next_passes;
2233 
2234  /* Moving averages of allocation rate and clean-buffer density */
2235  static float smoothed_alloc = 0;
2236  static float smoothed_density = 10.0;
2237 
2238  /* Potentially these could be tunables, but for now, not */
2239  float smoothing_samples = 16;
2240  float scan_whole_pool_milliseconds = 120000.0;
2241 
2242  /* Used to compute how far we scan ahead */
2243  long strategy_delta;
2244  int bufs_to_lap;
2245  int bufs_ahead;
2246  float scans_per_alloc;
2247  int reusable_buffers_est;
2248  int upcoming_alloc_est;
2249  int min_scan_buffers;
2250 
2251  /* Variables for the scanning loop proper */
2252  int num_to_scan;
2253  int num_written;
2254  int reusable_buffers;
2255 
2256  /* Variables for final smoothed_density update */
2257  long new_strategy_delta;
2258  uint32 new_recent_alloc;
2259 
2260  /*
2261  * Find out where the freelist clock sweep currently is, and how many
2262  * buffer allocations have happened since our last call.
2263  */
2264  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
2265 
2266  /* Report buffer alloc counts to pgstat */
2267  PendingBgWriterStats.buf_alloc += recent_alloc;
2268 
2269  /*
2270  * If we're not running the LRU scan, just stop after doing the stats
2271  * stuff. We mark the saved state invalid so that we can recover sanely
2272  * if LRU scan is turned back on later.
2273  */
2274  if (bgwriter_lru_maxpages <= 0)
2275  {
2276  saved_info_valid = false;
2277  return true;
2278  }
2279 
2280  /*
2281  * Compute strategy_delta = how many buffers have been scanned by the
2282  * clock sweep since last time. If first time through, assume none. Then
2283  * see if we are still ahead of the clock sweep, and if so, how many
2284  * buffers we could scan before we'd catch up with it and "lap" it. Note:
2285  * weird-looking coding of xxx_passes comparisons are to avoid bogus
2286  * behavior when the passes counts wrap around.
2287  */
2288  if (saved_info_valid)
2289  {
2290  int32 passes_delta = strategy_passes - prev_strategy_passes;
2291 
2292  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2293  strategy_delta += (long) passes_delta * NBuffers;
2294 
2295  Assert(strategy_delta >= 0);
2296 
2297  if ((int32) (next_passes - strategy_passes) > 0)
2298  {
2299  /* we're one pass ahead of the strategy point */
2300  bufs_to_lap = strategy_buf_id - next_to_clean;
2301 #ifdef BGW_DEBUG
2302  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2303  next_passes, next_to_clean,
2304  strategy_passes, strategy_buf_id,
2305  strategy_delta, bufs_to_lap);
2306 #endif
2307  }
2308  else if (next_passes == strategy_passes &&
2309  next_to_clean >= strategy_buf_id)
2310  {
2311  /* on same pass, but ahead or at least not behind */
2312  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
2313 #ifdef BGW_DEBUG
2314  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2315  next_passes, next_to_clean,
2316  strategy_passes, strategy_buf_id,
2317  strategy_delta, bufs_to_lap);
2318 #endif
2319  }
2320  else
2321  {
2322  /*
2323  * We're behind, so skip forward to the strategy point and start
2324  * cleaning from there.
2325  */
2326 #ifdef BGW_DEBUG
2327  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
2328  next_passes, next_to_clean,
2329  strategy_passes, strategy_buf_id,
2330  strategy_delta);
2331 #endif
2332  next_to_clean = strategy_buf_id;
2333  next_passes = strategy_passes;
2334  bufs_to_lap = NBuffers;
2335  }
2336  }
2337  else
2338  {
2339  /*
2340  * Initializing at startup or after LRU scanning had been off. Always
2341  * start at the strategy point.
2342  */
2343 #ifdef BGW_DEBUG
2344  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
2345  strategy_passes, strategy_buf_id);
2346 #endif
2347  strategy_delta = 0;
2348  next_to_clean = strategy_buf_id;
2349  next_passes = strategy_passes;
2350  bufs_to_lap = NBuffers;
2351  }
2352 
2353  /* Update saved info for next time */
2354  prev_strategy_buf_id = strategy_buf_id;
2355  prev_strategy_passes = strategy_passes;
2356  saved_info_valid = true;
2357 
2358  /*
2359  * Compute how many buffers had to be scanned for each new allocation, ie,
2360  * 1/density of reusable buffers, and track a moving average of that.
2361  *
2362  * If the strategy point didn't move, we don't update the density estimate
2363  */
2364  if (strategy_delta > 0 && recent_alloc > 0)
2365  {
2366  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
2367  smoothed_density += (scans_per_alloc - smoothed_density) /
2368  smoothing_samples;
2369  }
2370 
2371  /*
2372  * Estimate how many reusable buffers there are between the current
2373  * strategy point and where we've scanned ahead to, based on the smoothed
2374  * density estimate.
2375  */
2376  bufs_ahead = NBuffers - bufs_to_lap;
2377  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
2378 
2379  /*
2380  * Track a moving average of recent buffer allocations. Here, rather than
2381  * a true average we want a fast-attack, slow-decline behavior: we
2382  * immediately follow any increase.
2383  */
2384  if (smoothed_alloc <= (float) recent_alloc)
2385  smoothed_alloc = recent_alloc;
2386  else
2387  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
2388  smoothing_samples;
2389 
2390  /* Scale the estimate by a GUC to allow more aggressive tuning. */
2391  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
2392 
2393  /*
2394  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
2395  * eventually underflow to zero, and the underflows produce annoying
2396  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
2397  * zero, there's no point in tracking smaller and smaller values of
2398  * smoothed_alloc, so just reset it to exactly zero to avoid this
2399  * syndrome. It will pop back up as soon as recent_alloc increases.
2400  */
2401  if (upcoming_alloc_est == 0)
2402  smoothed_alloc = 0;
2403 
2404  /*
2405  * Even in cases where there's been little or no buffer allocation
2406  * activity, we want to make a small amount of progress through the buffer
2407  * cache so that as many reusable buffers as possible are clean after an
2408  * idle period.
2409  *
2410  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
2411  * the BGW will be called during the scan_whole_pool time; slice the
2412  * buffer pool into that many sections.
2413  */
2414  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
2415 
2416  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
2417  {
2418 #ifdef BGW_DEBUG
2419  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
2420  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
2421 #endif
2422  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
2423  }
2424 
2425  /*
2426  * Now write out dirty reusable buffers, working forward from the
2427  * next_to_clean point, until we have lapped the strategy scan, or cleaned
2428  * enough buffers to match our estimate of the next cycle's allocation
2429  * requirements, or hit the bgwriter_lru_maxpages limit.
2430  */
2431 
2432  /* Make sure we can handle the pin inside SyncOneBuffer */
2433  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2434 
2435  num_to_scan = bufs_to_lap;
2436  num_written = 0;
2437  reusable_buffers = reusable_buffers_est;
2438 
2439  /* Execute the LRU scan */
2440  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
2441  {
2442  int sync_state = SyncOneBuffer(next_to_clean, true,
2443  wb_context);
2444 
2445  if (++next_to_clean >= NBuffers)
2446  {
2447  next_to_clean = 0;
2448  next_passes++;
2449  }
2450  num_to_scan--;
2451 
2452  if (sync_state & BUF_WRITTEN)
2453  {
2454  reusable_buffers++;
2455  if (++num_written >= bgwriter_lru_maxpages)
2456  {
2457  PendingBgWriterStats.maxwritten_clean++;
2458  break;
2459  }
2460  }
2461  else if (sync_state & BUF_REUSABLE)
2462  reusable_buffers++;
2463  }
2464 
2465  PendingBgWriterStats.buf_written_clean += num_written;
2466 
2467 #ifdef BGW_DEBUG
2468  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
2469  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
2470  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
2471  bufs_to_lap - num_to_scan,
2472  num_written,
2473  reusable_buffers - reusable_buffers_est);
2474 #endif
2475 
2476  /*
2477  * Consider the above scan as being like a new allocation scan.
2478  * Characterize its density and update the smoothed one based on it. This
2479  * effectively halves the moving average period in cases where both the
2480  * strategy and the background writer are doing some useful scanning,
2481  * which is helpful because a long memory isn't as desirable on the
2482  * density estimates.
2483  */
2484  new_strategy_delta = bufs_to_lap - num_to_scan;
2485  new_recent_alloc = reusable_buffers - reusable_buffers_est;
2486  if (new_strategy_delta > 0 && new_recent_alloc > 0)
2487  {
2488  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
2489  smoothed_density += (scans_per_alloc - smoothed_density) /
2490  smoothing_samples;
2491 
2492 #ifdef BGW_DEBUG
2493  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
2494  new_recent_alloc, new_strategy_delta,
2495  scans_per_alloc, smoothed_density);
2496 #endif
2497  }
2498 
2499  /* Return true if OK to hibernate */
2500  return (bufs_to_lap == 0 && recent_alloc == 0);
2501 }
int BgWriterDelay
Definition: bgwriter.c:61
#define BUF_REUSABLE
Definition: bufmgr.c:71
double bgwriter_lru_multiplier
Definition: bufmgr.c:136
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition: bufmgr.c:2520
int bgwriter_lru_maxpages
Definition: bufmgr.c:135
#define BUF_WRITTEN
Definition: bufmgr.c:70
signed int int32
Definition: c.h:429
#define DEBUG2
Definition: elog.h:23
#define DEBUG1
Definition: elog.h:24
#define elog(elevel,...)
Definition: elog.h:218
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition: freelist.c:394
int NBuffers
Definition: globals.c:136
PgStat_BgWriterStats PendingBgWriterStats
ResourceOwner CurrentResourceOwner
Definition: resowner.c:146
void ResourceOwnerEnlargeBuffers(ResourceOwner owner)
Definition: resowner.c:945
PgStat_Counter buf_written_clean
Definition: pgstat.h:262
PgStat_Counter maxwritten_clean
Definition: pgstat.h:263
PgStat_Counter buf_alloc
Definition: pgstat.h:264

References Assert(), bgwriter_lru_maxpages, bgwriter_lru_multiplier, BgWriterDelay, PgStat_BgWriterStats::buf_alloc, BUF_REUSABLE, BUF_WRITTEN, PgStat_BgWriterStats::buf_written_clean, CurrentResourceOwner, DEBUG1, DEBUG2, elog, PgStat_BgWriterStats::maxwritten_clean, NBuffers, PendingBgWriterStats, ResourceOwnerEnlargeBuffers(), StrategySyncStart(), and SyncOneBuffer().

Referenced by BackgroundWriterMain().
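
The two moving averages above share one exponential-smoothing update, except that smoothed_alloc is "fast attack": it jumps immediately to any higher sample and only decays gradually, while the density estimate always converges gradually. A small standalone sketch of that update rule, outside any PostgreSQL API (the sample values are made up):

#include <stdio.h>

/* Exponential smoothing as BgBufferSync() uses it conceptually:
 * move 1/smoothing_samples of the way toward the new sample per call. */
static float
smooth(float current, float sample, float smoothing_samples)
{
    return current + (sample - current) / smoothing_samples;
}

int
main(void)
{
    float smoothed_alloc = 0.0f;
    float samples[] = {100.0f, 0.0f, 0.0f, 400.0f, 0.0f};

    for (int i = 0; i < 5; i++)
    {
        float recent_alloc = samples[i];

        /* fast attack: follow any increase immediately, decay slowly */
        if (smoothed_alloc <= recent_alloc)
            smoothed_alloc = recent_alloc;
        else
            smoothed_alloc = smooth(smoothed_alloc, recent_alloc, 16.0f);

        printf("recent=%.0f smoothed=%.2f\n", recent_alloc, smoothed_alloc);
    }
    return 0;
}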

◆ BufferAlloc()

static BufferDesc * BufferAlloc ( SMgrRelation  smgr,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool *  foundPtr 
)
static

Definition at line 1109 of file bufmgr.c.

1113 {
1114  BufferTag newTag; /* identity of requested block */
1115  uint32 newHash; /* hash value for newTag */
1116  LWLock *newPartitionLock; /* buffer partition lock for it */
1117  BufferTag oldTag; /* previous identity of selected buffer */
1118  uint32 oldHash; /* hash value for oldTag */
1119  LWLock *oldPartitionLock; /* buffer partition lock for it */
1120  uint32 oldFlags;
1121  int buf_id;
1122  BufferDesc *buf;
1123  bool valid;
1124  uint32 buf_state;
1125 
1126  /* create a tag so we can lookup the buffer */
1127  INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
1128 
1129  /* determine its hash code and partition lock ID */
1130  newHash = BufTableHashCode(&newTag);
1131  newPartitionLock = BufMappingPartitionLock(newHash);
1132 
1133  /* see if the block is in the buffer pool already */
1134  LWLockAcquire(newPartitionLock, LW_SHARED);
1135  buf_id = BufTableLookup(&newTag, newHash);
1136  if (buf_id >= 0)
1137  {
1138  /*
1139  * Found it. Now, pin the buffer so no one can steal it from the
1140  * buffer pool, and check to see if the correct data has been loaded
1141  * into the buffer.
1142  */
1143  buf = GetBufferDescriptor(buf_id);
1144 
1145  valid = PinBuffer(buf, strategy);
1146 
1147  /* Can release the mapping lock as soon as we've pinned it */
1148  LWLockRelease(newPartitionLock);
1149 
1150  *foundPtr = true;
1151 
1152  if (!valid)
1153  {
1154  /*
1155  * We can only get here if (a) someone else is still reading in
1156  * the page, or (b) a previous read attempt failed. We have to
1157  * wait for any active read attempt to finish, and then set up our
1158  * own read attempt if the page is still not BM_VALID.
1159  * StartBufferIO does it all.
1160  */
1161  if (StartBufferIO(buf, true))
1162  {
1163  /*
1164  * If we get here, previous attempts to read the buffer must
1165  * have failed ... but we shall bravely try again.
1166  */
1167  *foundPtr = false;
1168  }
1169  }
1170 
1171  return buf;
1172  }
1173 
1174  /*
1175  * Didn't find it in the buffer pool. We'll have to initialize a new
1176  * buffer. Remember to unlock the mapping lock while doing the work.
1177  */
1178  LWLockRelease(newPartitionLock);
1179 
1180  /* Loop here in case we have to try another victim buffer */
1181  for (;;)
1182  {
1183  /*
1184  * Ensure, while the spinlock's not yet held, that there's a free
1185  * refcount entry.
1186  */
1187  ReservePrivateRefCountEntry();
1188 
1189  /*
1190  * Select a victim buffer. The buffer is returned with its header
1191  * spinlock still held!
1192  */
1193  buf = StrategyGetBuffer(strategy, &buf_state);
1194 
1195  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1196 
1197  /* Must copy buffer flags while we still hold the spinlock */
1198  oldFlags = buf_state & BUF_FLAG_MASK;
1199 
1200  /* Pin the buffer and then release the buffer spinlock */
1201  PinBuffer_Locked(buf);
1202 
1203  /*
1204  * If the buffer was dirty, try to write it out. There is a race
1205  * condition here, in that someone might dirty it after we released it
1206  * above, or even while we are writing it out (since our share-lock
1207  * won't prevent hint-bit updates). We will recheck the dirty bit
1208  * after re-locking the buffer header.
1209  */
1210  if (oldFlags & BM_DIRTY)
1211  {
1212  /*
1213  * We need a share-lock on the buffer contents to write it out
1214  * (else we might write invalid data, eg because someone else is
1215  * compacting the page contents while we write). We must use a
1216  * conditional lock acquisition here to avoid deadlock. Even
1217  * though the buffer was not pinned (and therefore surely not
1218  * locked) when StrategyGetBuffer returned it, someone else could
1219  * have pinned and exclusive-locked it by the time we get here. If
1220  * we try to get the lock unconditionally, we'd block waiting for
1221  * them; if they later block waiting for us, deadlock ensues.
1222  * (This has been observed to happen when two backends are both
1223  * trying to split btree index pages, and the second one just
1224  * happens to be trying to split the page the first one got from
1225  * StrategyGetBuffer.)
1226  */
1227  if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
1228  LW_SHARED))
1229  {
1230  /*
1231  * If using a nondefault strategy, and writing the buffer
1232  * would require a WAL flush, let the strategy decide whether
1233  * to go ahead and write/reuse the buffer or to choose another
1234  * victim. We need lock to inspect the page LSN, so this
1235  * can't be done inside StrategyGetBuffer.
1236  */
1237  if (strategy != NULL)
1238  {
1239  XLogRecPtr lsn;
1240 
1241  /* Read the LSN while holding buffer header lock */
1242  buf_state = LockBufHdr(buf);
1243  lsn = BufferGetLSN(buf);
1244  UnlockBufHdr(buf, buf_state);
1245 
1246  if (XLogNeedsFlush(lsn) &&
1247  StrategyRejectBuffer(strategy, buf))
1248  {
1249  /* Drop lock/pin and loop around for another buffer */
1250  LWLockRelease(BufferDescriptorGetContentLock(buf));
1251  UnpinBuffer(buf, true);
1252  continue;
1253  }
1254  }
1255 
1256  /* OK, do the I/O */
1257  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
1258  smgr->smgr_rnode.node.spcNode,
1259  smgr->smgr_rnode.node.dbNode,
1260  smgr->smgr_rnode.node.relNode);
1261 
1262  FlushBuffer(buf, NULL);
1263  LWLockRelease(BufferDescriptorGetContentLock(buf));
1264 
1265  ScheduleBufferTagForWriteback(&BackendWritebackContext,
1266  &buf->tag);
1267 
1268  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
1269  smgr->smgr_rnode.node.spcNode,
1270  smgr->smgr_rnode.node.dbNode,
1271  smgr->smgr_rnode.node.relNode);
1272  }
1273  else
1274  {
1275  /*
1276  * Someone else has locked the buffer, so give it up and loop
1277  * back to get another one.
1278  */
1279  UnpinBuffer(buf, true);
1280  continue;
1281  }
1282  }
1283 
1284  /*
1285  * To change the association of a valid buffer, we'll need to have
1286  * exclusive lock on both the old and new mapping partitions.
1287  */
1288  if (oldFlags & BM_TAG_VALID)
1289  {
1290  /*
1291  * Need to compute the old tag's hashcode and partition lock ID.
1292  * XXX is it worth storing the hashcode in BufferDesc so we need
1293  * not recompute it here? Probably not.
1294  */
1295  oldTag = buf->tag;
1296  oldHash = BufTableHashCode(&oldTag);
1297  oldPartitionLock = BufMappingPartitionLock(oldHash);
1298 
1299  /*
1300  * Must lock the lower-numbered partition first to avoid
1301  * deadlocks.
1302  */
1303  if (oldPartitionLock < newPartitionLock)
1304  {
1305  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1306  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1307  }
1308  else if (oldPartitionLock > newPartitionLock)
1309  {
1310  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1311  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1312  }
1313  else
1314  {
1315  /* only one partition, only one lock */
1316  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1317  }
1318  }
1319  else
1320  {
1321  /* if it wasn't valid, we need only the new partition */
1322  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1323  /* remember we have no old-partition lock or tag */
1324  oldPartitionLock = NULL;
1325  /* keep the compiler quiet about uninitialized variables */
1326  oldHash = 0;
1327  }
1328 
1329  /*
1330  * Try to make a hashtable entry for the buffer under its new tag.
1331  * This could fail because while we were writing someone else
1332  * allocated another buffer for the same block we want to read in.
1333  * Note that we have not yet removed the hashtable entry for the old
1334  * tag.
1335  */
1336  buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
1337 
1338  if (buf_id >= 0)
1339  {
1340  /*
1341  * Got a collision. Someone has already done what we were about to
1342  * do. We'll just handle this as if it were found in the buffer
1343  * pool in the first place. First, give up the buffer we were
1344  * planning to use.
1345  */
1346  UnpinBuffer(buf, true);
1347 
1348  /* Can give up that buffer's mapping partition lock now */
1349  if (oldPartitionLock != NULL &&
1350  oldPartitionLock != newPartitionLock)
1351  LWLockRelease(oldPartitionLock);
1352 
1353  /* remaining code should match code at top of routine */
1354 
1355  buf = GetBufferDescriptor(buf_id);
1356 
1357  valid = PinBuffer(buf, strategy);
1358 
1359  /* Can release the mapping lock as soon as we've pinned it */
1360  LWLockRelease(newPartitionLock);
1361 
1362  *foundPtr = true;
1363 
1364  if (!valid)
1365  {
1366  /*
1367  * We can only get here if (a) someone else is still reading
1368  * in the page, or (b) a previous read attempt failed. We
1369  * have to wait for any active read attempt to finish, and
1370  * then set up our own read attempt if the page is still not
1371  * BM_VALID. StartBufferIO does it all.
1372  */
1373  if (StartBufferIO(buf, true))
1374  {
1375  /*
1376  * If we get here, previous attempts to read the buffer
1377  * must have failed ... but we shall bravely try again.
1378  */
1379  *foundPtr = false;
1380  }
1381  }
1382 
1383  return buf;
1384  }
1385 
1386  /*
1387  * Need to lock the buffer header too in order to change its tag.
1388  */
1389  buf_state = LockBufHdr(buf);
1390 
1391  /*
1392  * Somebody could have pinned or re-dirtied the buffer while we were
1393  * doing the I/O and making the new hashtable entry. If so, we can't
1394  * recycle this buffer; we must undo everything we've done and start
1395  * over with a new victim buffer.
1396  */
1397  oldFlags = buf_state & BUF_FLAG_MASK;
1398  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1 && !(oldFlags & BM_DIRTY))
1399  break;
1400 
1401  UnlockBufHdr(buf, buf_state);
1402  BufTableDelete(&newTag, newHash);
1403  if (oldPartitionLock != NULL &&
1404  oldPartitionLock != newPartitionLock)
1405  LWLockRelease(oldPartitionLock);
1406  LWLockRelease(newPartitionLock);
1407  UnpinBuffer(buf, true);
1408  }
1409 
1410  /*
1411  * Okay, it's finally safe to rename the buffer.
1412  *
1413  * Clearing BM_VALID here is necessary, clearing the dirtybits is just
1414  * paranoia. We also reset the usage_count since any recency of use of
1415  * the old content is no longer relevant. (The usage_count starts out at
1416  * 1 so that the buffer can survive one clock-sweep pass.)
1417  *
1418  * Make sure BM_PERMANENT is set for buffers that must be written at every
1419  * checkpoint. Unlogged buffers only need to be written at shutdown
1420  * checkpoints, except for their "init" forks, which need to be treated
1421  * just like permanent relations.
1422  */
1423  buf->tag = newTag;
1424  buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
1425  BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
1426  BUF_USAGECOUNT_MASK);
1427  if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1428  buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
1429  else
1430  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1431 
1432  UnlockBufHdr(buf, buf_state);
1433 
1434  if (oldPartitionLock != NULL)
1435  {
1436  BufTableDelete(&oldTag, oldHash);
1437  if (oldPartitionLock != newPartitionLock)
1438  LWLockRelease(oldPartitionLock);
1439  }
1440 
1441  LWLockRelease(newPartitionLock);
1442 
1443  /*
1444  * Buffer contents are currently invalid. Try to obtain the right to
1445  * start I/O. If StartBufferIO returns false, then someone else managed
1446  * to read it before we did, so there's nothing left for BufferAlloc() to
1447  * do.
1448  */
1449  if (StartBufferIO(buf, true))
1450  *foundPtr = false;
1451  else
1452  *foundPtr = true;
1453 
1454  return buf;
1455 }
WritebackContext BackendWritebackContext
Definition: buf_init.c:24
#define INIT_BUFFERTAG(a, xx_rnode, xx_forkNum, xx_blockNum)
#define BM_TAG_VALID
Definition: buf_internals.h:61
#define BM_PERMANENT
Definition: buf_internals.h:67
#define BUF_USAGECOUNT_MASK
Definition: buf_internals.h:43
#define GetBufferDescriptor(id)
#define BUF_FLAG_MASK
Definition: buf_internals.h:46
#define BufferDescriptorGetContentLock(bdesc)
#define BM_JUST_DIRTIED
Definition: buf_internals.h:64
#define BufMappingPartitionLock(hashcode)
#define BUF_USAGECOUNT_ONE
Definition: buf_internals.h:44
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:49
#define BM_CHECKPOINT_NEEDED
Definition: buf_internals.h:66
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:149
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:91
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:79
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition: buf_table.c:119
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
Definition: bufmgr.c:1694
static void PinBuffer_Locked(BufferDesc *buf)
Definition: bufmgr.c:1797
#define BufferGetLSN(bufHdr)
Definition: bufmgr.c:63
void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
Definition: bufmgr.c:4883
static bool StartBufferIO(BufferDesc *buf, bool forInput)
Definition: bufmgr.c:4543
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:217
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln)
Definition: bufmgr.c:2815
static void UnpinBuffer(BufferDesc *buf, bool fixOwner)
Definition: bufmgr.c:1842
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
Definition: freelist.c:201
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf)
Definition: freelist.c:685
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1196
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1800
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1367
@ LW_SHARED
Definition: lwlock.h:105
@ LW_EXCLUSIVE
Definition: lwlock.h:104
@ INIT_FORKNUM
Definition: relpath.h:46
Definition: lwlock.h:32
RelFileNode node
Definition: relfilenode.h:74
RelFileNodeBackend smgr_rnode
Definition: smgr.h:42
bool XLogNeedsFlush(XLogRecPtr record)
Definition: xlog.c:2838
uint64 XLogRecPtr
Definition: xlogdefs.h:21

References Assert(), BackendWritebackContext, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_IO_ERROR, BM_JUST_DIRTIED, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BUF_USAGECOUNT_ONE, BufferDescriptorGetContentLock, BufferGetLSN, BufMappingPartitionLock, BufTableDelete(), BufTableHashCode(), BufTableInsert(), BufTableLookup(), RelFileNode::dbNode, FlushBuffer(), GetBufferDescriptor, INIT_BUFFERTAG, INIT_FORKNUM, LockBufHdr(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockConditionalAcquire(), LWLockRelease(), RelFileNodeBackend::node, PinBuffer(), PinBuffer_Locked(), RelFileNode::relNode, ReservePrivateRefCountEntry(), ScheduleBufferTagForWriteback(), SMgrRelationData::smgr_rnode, RelFileNode::spcNode, StartBufferIO(), StrategyGetBuffer(), StrategyRejectBuffer(), UnlockBufHdr, UnpinBuffer(), and XLogNeedsFlush().

Referenced by ReadBuffer_common().
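
When the chosen victim buffer still carries a valid tag, the code above must hold both the old and the new mapping partition locks, and it avoids deadlock by always acquiring the lower-addressed lock first. A minimal sketch of that ordering discipline with generic locks (pthread mutexes stand in for LWLocks; everything here is illustrative, not bufmgr.c code):

#include <pthread.h>

/* Acquire two locks in a canonical (address) order so that concurrent
 * callers locking the same pair can never deadlock, mirroring the
 * old/new partition-lock handling in BufferAlloc(). */
static void
lock_pair_ordered(pthread_mutex_t *a, pthread_mutex_t *b)
{
    if (a == b)
    {
        pthread_mutex_lock(a);      /* only one distinct lock */
    }
    else if (a < b)
    {
        pthread_mutex_lock(a);
        pthread_mutex_lock(b);
    }
    else
    {
        pthread_mutex_lock(b);
        pthread_mutex_lock(a);
    }
}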

◆ BufferGetBlockNumber()

BlockNumber BufferGetBlockNumber ( Buffer  buffer)

Definition at line 2755 of file bufmgr.c.

2756 {
2757  BufferDesc *bufHdr;
2758 
2759  Assert(BufferIsPinned(buffer));
2760 
2761  if (BufferIsLocal(buffer))
2762  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2763  else
2764  bufHdr = GetBufferDescriptor(buffer - 1);
2765 
2766  /* pinned, so OK to read tag without spinlock */
2767  return bufHdr->tag.blockNum;
2768 }
#define BufferIsLocal(buffer)
Definition: buf.h:37
#define GetLocalBufferDescriptor(id)
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:450
BufferTag tag
BlockNumber blockNum
Definition: buf_internals.h:94

References Assert(), buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, GetBufferDescriptor, GetLocalBufferDescriptor, and BufferDesc::tag.

Referenced by _bt_binsrch_insert(), _bt_bottomupdel_pass(), _bt_check_unique(), _bt_checkpage(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_doinsert(), _bt_endpoint(), _bt_finish_split(), _bt_first(), _bt_getroot(), _bt_insert_parent(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_moveright(), _bt_newroot(), _bt_pagedel(), _bt_readnextpage(), _bt_readpage(), _bt_restore_meta(), _bt_search(), _bt_simpledel_pass(), _bt_split(), _bt_unlink_halfdead_page(), _bt_walk_left(), _hash_addovflpage(), _hash_checkpage(), _hash_doinsert(), _hash_first(), _hash_freeovflpage(), _hash_getnewbuf(), _hash_readnext(), _hash_readpage(), _hash_splitbucket(), allocNewBuffer(), blinsert(), BloomInitMetapage(), brin_doinsert(), brin_doupdate(), brin_getinsertbuffer(), brin_initialize_empty_new_buffer(), brin_page_cleanup(), brin_xlog_insert_update(), brinbuild(), brinGetTupleForHeapBlock(), createPostingTree(), dataBeginPlaceToPageLeaf(), dataPrepareDownlink(), doPickSplit(), entryPrepareDownlink(), fill_seq_fork_with_data(), ginEntryInsert(), ginFindParents(), ginFinishSplit(), ginPlaceToPage(), ginRedoDeleteListPages(), ginRedoUpdateMetapage(), ginScanToDelete(), gistbufferinginserttuples(), gistbuild(), gistcheckpage(), gistdeletepage(), gistformdownlink(), gistinserttuples(), gistMemorizeAllDownlinks(), gistplacetopage(), gistRelocateBuildBuffersOnSplit(), gistScanPage(), hash_xlog_add_ovfl_page(), heap_delete(), heap_hot_search_buffer(), heap_insert(), heap_multi_insert(), heap_page_is_all_visible(), heap_page_prune(), heap_prune_chain(), heap_update(), heap_xlog_confirm(), heap_xlog_lock(), index_compute_xid_horizon_for_tuples(), lazy_scan_noprune(), lazy_scan_prune(), makeSublist(), moveLeafs(), moveRightIfItNeeded(), pgstathashindex(), ReadBufferBI(), RelationAddExtraBlocks(), RelationGetBufferForTuple(), RelationPutHeapTuple(), revmap_get_buffer(), revmap_physical_extend(), ScanSourceDatabasePgClassPage(), spgAddNodeAction(), spgbuild(), spgdoinsert(), SpGistSetLastUsedPage(), spgSplitNodeAction(), spgWalk(), startScanEntry(), terminate_brin_buildstate(), vacuumLeafPage(), visibilitymap_clear(), visibilitymap_get_status(), visibilitymap_pin(), visibilitymap_pin_ok(), visibilitymap_set(), and XLogReadBufferExtended().
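
A minimal usage sketch: the buffer must already be pinned (here via ReadBuffer); the helper name and arguments are placeholders:

#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

/* Hypothetical helper: report which block a pinned buffer holds. */
static BlockNumber
pinned_block_number(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);
    BlockNumber result = BufferGetBlockNumber(buf);    /* == blkno */

    ReleaseBuffer(buf);
    return result;
}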

◆ BufferGetLSNAtomic()

XLogRecPtr BufferGetLSNAtomic ( Buffer  buffer)

Definition at line 3004 of file bufmgr.c.

3005 {
3006  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
3007  char *page = BufferGetPage(buffer);
3008  XLogRecPtr lsn;
3009  uint32 buf_state;
3010 
3011  /*
3012  * If we don't need locking for correctness, fastpath out.
3013  */
3014  if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
3015  return PageGetLSN(page);
3016 
3017  /* Make sure we've got a real buffer, and that we hold a pin on it. */
3018  Assert(BufferIsValid(buffer));
3019  Assert(BufferIsPinned(buffer));
3020 
3021  buf_state = LockBufHdr(bufHdr);
3022  lsn = PageGetLSN(page);
3023  UnlockBufHdr(bufHdr, buf_state);
3024 
3025  return lsn;
3026 }
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define PageGetLSN(page)
Definition: bufpage.h:365
#define XLogHintBitIsNeeded()
Definition: xlog.h:115

References Assert(), PrivateRefCountEntry::buffer, BufferGetPage, BufferIsLocal, BufferIsPinned, BufferIsValid, GetBufferDescriptor, LockBufHdr(), PageGetLSN, UnlockBufHdr, and XLogHintBitIsNeeded.

Referenced by _bt_killitems(), _bt_readpage(), gistdoinsert(), gistFindPath(), gistkillitems(), gistScanPage(), SetHintBits(), and XLogSaveBufferForHint().
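
A hedged sketch of the typical caller pattern: read the page LSN through this function while holding only a pin, then compare it against some flush horizon supplied by the caller (the helper and the flushed_upto parameter are placeholders):

#include "postgres.h"
#include "storage/bufmgr.h"

/* Hypothetical check: is this buffer's page LSN already durable,
 * given a caller-supplied flushed-up-to pointer? */
static bool
page_lsn_is_flushed(Buffer buf, XLogRecPtr flushed_upto)
{
    /* Safe with just a pin; the header spinlock is taken only when
     * XLogHintBitIsNeeded() (checksums or wal_log_hints) requires it. */
    XLogRecPtr  lsn = BufferGetLSNAtomic(buf);

    return lsn <= flushed_upto;
}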

◆ BufferGetTag()

void BufferGetTag ( Buffer  buffer,
RelFileNode *  rnode,
ForkNumber *  forknum,
BlockNumber *  blknum 
)

Definition at line 2776 of file bufmgr.c.

2778 {
2779  BufferDesc *bufHdr;
2780 
2781  /* Do the same checks as BufferGetBlockNumber. */
2782  Assert(BufferIsPinned(buffer));
2783 
2784  if (BufferIsLocal(buffer))
2785  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2786  else
2787  bufHdr = GetBufferDescriptor(buffer - 1);
2788 
2789  /* pinned, so OK to read tag without spinlock */
2790  *rnode = bufHdr->tag.rnode;
2791  *forknum = bufHdr->tag.forkNum;
2792  *blknum = bufHdr->tag.blockNum;
2793 }
ForkNumber forkNum
Definition: buf_internals.h:93
RelFileNode rnode
Definition: buf_internals.h:92

References Assert(), buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, buftag::forkNum, GetBufferDescriptor, GetLocalBufferDescriptor, buftag::rnode, and BufferDesc::tag.

Referenced by fsm_search_avail(), ginRedoInsertEntry(), log_newpage_buffer(), ResolveCminCmaxDuringDecoding(), XLogRegisterBuffer(), and XLogSaveBufferForHint().
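
A small sketch of pulling a pinned buffer's identity apart, e.g. for logging; the helper and the elog message are illustrative:

#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/relfilenode.h"

/* Hypothetical helper: log which physical block a pinned buffer maps to. */
static void
log_buffer_identity(Buffer buf)
{
    RelFileNode rnode;
    ForkNumber  forknum;
    BlockNumber blknum;

    BufferGetTag(buf, &rnode, &forknum, &blknum);
    elog(DEBUG1, "buffer %d holds rel %u/%u/%u fork %d block %u",
         buf, rnode.spcNode, rnode.dbNode, rnode.relNode,
         (int) forknum, blknum);
}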

◆ BufferIsPermanent()

bool BufferIsPermanent ( Buffer  buffer)

Definition at line 2974 of file bufmgr.c.

2975 {
2976  BufferDesc *bufHdr;
2977 
2978  /* Local buffers are used only for temp relations. */
2979  if (BufferIsLocal(buffer))
2980  return false;
2981 
2982  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2983  Assert(BufferIsValid(buffer));
2984  Assert(BufferIsPinned(buffer));
2985 
2986  /*
2987  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
2988  * need not bother with the buffer header spinlock. Even if someone else
2989  * changes the buffer header state while we're doing this, the state is
2990  * changed atomically, so we'll read the old value or the new value, but
2991  * not random garbage.
2992  */
2993  bufHdr = GetBufferDescriptor(buffer - 1);
2994  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
2995 }
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:241
pg_atomic_uint32 state

References Assert(), BM_PERMANENT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferIsValid, GetBufferDescriptor, pg_atomic_read_u32(), and BufferDesc::state.

Referenced by SetHintBits().
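
The in-tree caller, SetHintBits(), uses this to decide whether a hint-bit update must be throttled by the page LSN (only permanent, WAL-logged relations matter there). A hedged sketch of that decision shape; the helper and the flushed_upto horizon are placeholders, not the actual SetHintBits() logic:

#include "postgres.h"
#include "storage/bufmgr.h"

/* Hypothetical: is it safe to set a hint bit on this buffer right now? */
static bool
hint_bit_set_is_safe_now(Buffer buf, XLogRecPtr flushed_upto)
{
    if (!BufferIsPermanent(buf))
        return true;            /* unlogged/temp: never WAL-logged */

    /* Permanent relation: only proceed if the page's last WAL record
     * is already flushed; otherwise a caller would defer the update. */
    return BufferGetLSNAtomic(buf) <= flushed_upto;
}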

◆ BufferSync()

static void BufferSync ( int  flags)
static

Definition at line 1941 of file bufmgr.c.

1942 {
1943  uint32 buf_state;
1944  int buf_id;
1945  int num_to_scan;
1946  int num_spaces;
1947  int num_processed;
1948  int num_written;
1949  CkptTsStatus *per_ts_stat = NULL;
1950  Oid last_tsid;
1951  binaryheap *ts_heap;
1952  int i;
1953  int mask = BM_DIRTY;
1954  WritebackContext wb_context;
1955 
1956  /* Make sure we can handle the pin inside SyncOneBuffer */
1957  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1958 
1959  /*
1960  * Unless this is a shutdown checkpoint or we have been explicitly told,
1961  * we write only permanent, dirty buffers. But at shutdown or end of
1962  * recovery, we write all dirty buffers.
1963  */
1966  mask |= BM_PERMANENT;
1967 
1968  /*
1969  * Loop over all buffers, and mark the ones that need to be written with
1970  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
1971  * can estimate how much work needs to be done.
1972  *
1973  * This allows us to write only those pages that were dirty when the
1974  * checkpoint began, and not those that get dirtied while it proceeds.
1975  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
1976  * later in this function, or by normal backends or the bgwriter cleaning
1977  * scan, the flag is cleared. Any buffer dirtied after this point won't
1978  * have the flag set.
1979  *
1980  * Note that if we fail to write some buffer, we may leave buffers with
1981  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
1982  * certainly need to be written for the next checkpoint attempt, too.
1983  */
1984  num_to_scan = 0;
1985  for (buf_id = 0; buf_id < NBuffers; buf_id++)
1986  {
1987  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
1988 
1989  /*
1990  * Header spinlock is enough to examine BM_DIRTY, see comment in
1991  * SyncOneBuffer.
1992  */
1993  buf_state = LockBufHdr(bufHdr);
1994 
1995  if ((buf_state & mask) == mask)
1996  {
1997  CkptSortItem *item;
1998 
1999  buf_state |= BM_CHECKPOINT_NEEDED;
2000 
2001  item = &CkptBufferIds[num_to_scan++];
2002  item->buf_id = buf_id;
2003  item->tsId = bufHdr->tag.rnode.spcNode;
2004  item->relNode = bufHdr->tag.rnode.relNode;
2005  item->forkNum = bufHdr->tag.forkNum;
2006  item->blockNum = bufHdr->tag.blockNum;
2007  }
2008 
2009  UnlockBufHdr(bufHdr, buf_state);
2010 
2011  /* Check for barrier events in case NBuffers is large. */
2012  if (ProcSignalBarrierPending)
2013  ProcessProcSignalBarrier();
2014  }
2015 
2016  if (num_to_scan == 0)
2017  return; /* nothing to do */
2018 
2019  WritebackContextInit(&wb_context, &checkpoint_flush_after);
2020 
2021  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
2022 
2023  /*
2024  * Sort buffers that need to be written to reduce the likelihood of random
2025  * IO. The sorting is also important for the implementation of balancing
2026  * writes between tablespaces. Without balancing writes we'd potentially
2027  * end up writing to the tablespaces one-by-one; possibly overloading the
2028  * underlying system.
2029  */
2030  sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
2031 
2032  num_spaces = 0;
2033 
2034  /*
2035  * Allocate progress status for each tablespace with buffers that need to
2036  * be flushed. This requires the to-be-flushed array to be sorted.
2037  */
2038  last_tsid = InvalidOid;
2039  for (i = 0; i < num_to_scan; i++)
2040  {
2041  CkptTsStatus *s;
2042  Oid cur_tsid;
2043 
2044  cur_tsid = CkptBufferIds[i].tsId;
2045 
2046  /*
2047  * Grow array of per-tablespace status structs, every time a new
2048  * tablespace is found.
2049  */
2050  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
2051  {
2052  Size sz;
2053 
2054  num_spaces++;
2055 
2056  /*
2057  * Not worth adding grow-by-power-of-2 logic here - even with a
2058  * few hundred tablespaces this should be fine.
2059  */
2060  sz = sizeof(CkptTsStatus) * num_spaces;
2061 
2062  if (per_ts_stat == NULL)
2063  per_ts_stat = (CkptTsStatus *) palloc(sz);
2064  else
2065  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
2066 
2067  s = &per_ts_stat[num_spaces - 1];
2068  memset(s, 0, sizeof(*s));
2069  s->tsId = cur_tsid;
2070 
2071  /*
2072  * The first buffer in this tablespace. As CkptBufferIds is sorted
2073  * by tablespace all (s->num_to_scan) buffers in this tablespace
2074  * will follow afterwards.
2075  */
2076  s->index = i;
2077 
2078  /*
2079  * progress_slice will be determined once we know how many buffers
2080  * are in each tablespace, i.e. after this loop.
2081  */
2082 
2083  last_tsid = cur_tsid;
2084  }
2085  else
2086  {
2087  s = &per_ts_stat[num_spaces - 1];
2088  }
2089 
2090  s->num_to_scan++;
2091 
2092  /* Check for barrier events. */
2093  if (ProcSignalBarrierPending)
2094  ProcessProcSignalBarrier();
2095  }
2096 
2097  Assert(num_spaces > 0);
2098 
2099  /*
2100  * Build a min-heap over the write-progress in the individual tablespaces,
2101  * and compute how large a portion of the total progress a single
2102  * processed buffer is.
2103  */
2104  ts_heap = binaryheap_allocate(num_spaces,
2105  ts_ckpt_progress_comparator,
2106  NULL);
2107 
2108  for (i = 0; i < num_spaces; i++)
2109  {
2110  CkptTsStatus *ts_stat = &per_ts_stat[i];
2111 
2112  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
2113 
2114  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
2115  }
2116 
2117  binaryheap_build(ts_heap);
2118 
2119  /*
2120  * Iterate through to-be-checkpointed buffers and write the ones (still)
2121  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
2122  * tablespaces; otherwise the sorting would lead to only one tablespace
2123  * receiving writes at a time, making inefficient use of the hardware.
2124  */
2125  num_processed = 0;
2126  num_written = 0;
2127  while (!binaryheap_empty(ts_heap))
2128  {
2129  BufferDesc *bufHdr = NULL;
2130  CkptTsStatus *ts_stat = (CkptTsStatus *)
2131  DatumGetPointer(binaryheap_first(ts_heap));
2132 
2133  buf_id = CkptBufferIds[ts_stat->index].buf_id;
2134  Assert(buf_id != -1);
2135 
2136  bufHdr = GetBufferDescriptor(buf_id);
2137 
2138  num_processed++;
2139 
2140  /*
2141  * We don't need to acquire the lock here, because we're only looking
2142  * at a single bit. It's possible that someone else writes the buffer
2143  * and clears the flag right after we check, but that doesn't matter
2144  * since SyncOneBuffer will then do nothing. However, there is a
2145  * further race condition: it's conceivable that between the time we
2146  * examine the bit here and the time SyncOneBuffer acquires the lock,
2147  * someone else not only wrote the buffer but replaced it with another
2148  * page and dirtied it. In that improbable case, SyncOneBuffer will
2149  * write the buffer though we didn't need to. It doesn't seem worth
2150  * guarding against this, though.
2151  */
2152  if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
2153  {
2154  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
2155  {
2156  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
2157  PendingCheckpointerStats.buf_written_checkpoints++;
2158  num_written++;
2159  }
2160  }
2161 
2162  /*
2163  * Measure progress independent of actually having to flush the buffer
2164  * - otherwise writing become unbalanced.
2165  */
2166  ts_stat->progress += ts_stat->progress_slice;
2167  ts_stat->num_scanned++;
2168  ts_stat->index++;
2169 
2170  /* Have all the buffers from the tablespace been processed? */
2171  if (ts_stat->num_scanned == ts_stat->num_to_scan)
2172  {
2173  binaryheap_remove_first(ts_heap);
2174  }
2175  else
2176  {
2177  /* update heap with the new progress */
2178  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
2179  }
2180 
2181  /*
2182  * Sleep to throttle our I/O rate.
2183  *
2184  * (This will check for barrier events even if it doesn't sleep.)
2185  */
2186  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
2187  }
2188 
2189  /* issue all pending flushes */
2190  IssuePendingWritebacks(&wb_context);
2191 
2192  pfree(per_ts_stat);
2193  per_ts_stat = NULL;
2194  binaryheap_free(ts_heap);
2195 
2196  /*
2197  * Update checkpoint statistics. As noted above, this doesn't include
2198  * buffers written by other backends or bgwriter scan.
2199  */
2200  CheckpointStats.ckpt_bufs_written += num_written;
2201 
2202  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
2203 }
void binaryheap_build(binaryheap *heap)
Definition: binaryheap.c:125
void binaryheap_add_unordered(binaryheap *heap, Datum d)
Definition: binaryheap.c:109
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition: binaryheap.c:32
Datum binaryheap_remove_first(binaryheap *heap)
Definition: binaryheap.c:173
void binaryheap_free(binaryheap *heap)
Definition: binaryheap.c:68
void binaryheap_replace_first(binaryheap *heap, Datum d)
Definition: binaryheap.c:207
Datum binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:158
#define binaryheap_empty(h)
Definition: binaryheap.h:52
CkptSortItem * CkptBufferIds
Definition: buf_init.c:25
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition: bufmgr.c:4848
int checkpoint_flush_after
Definition: bufmgr.c:158
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition: bufmgr.c:4871
struct CkptTsStatus CkptTsStatus
void IssuePendingWritebacks(WritebackContext *context)
Definition: bufmgr.c:4924
double float8
Definition: c.h:565
size_t Size
Definition: c.h:540
void CheckpointWriteDelay(int flags, double progress)
Definition: checkpointer.c:697
volatile sig_atomic_t ProcSignalBarrierPending
Definition: globals.c:37
int i
Definition: isn.c:73
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1188
void * palloc(Size size)
Definition: mcxt.c:1068
PgStat_CheckpointerStats PendingCheckpointerStats
#define DatumGetPointer(X)
Definition: postgres.h:593
#define PointerGetDatum(X)
Definition: postgres.h:600
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
void ProcessProcSignalBarrier(void)
Definition: procsignal.c:467
int ckpt_bufs_written
Definition: xlog.h:162
ForkNumber forkNum
BlockNumber blockNum
float8 progress_slice
Definition: bufmgr.c:109
int index
Definition: bufmgr.c:117
int num_scanned
Definition: bufmgr.c:114
float8 progress
Definition: bufmgr.c:108
int num_to_scan
Definition: bufmgr.c:112
Oid tsId
Definition: bufmgr.c:99
PgStat_Counter buf_written_checkpoints
Definition: pgstat.h:274
CheckpointStatsData CheckpointStats
Definition: xlog.c:206
#define CHECKPOINT_END_OF_RECOVERY
Definition: xlog.h:135
#define CHECKPOINT_FLUSH_ALL
Definition: xlog.h:138
#define CHECKPOINT_IS_SHUTDOWN
Definition: xlog.h:134

References Assert(), binaryheap_add_unordered(), binaryheap_allocate(), binaryheap_build(), binaryheap_empty, binaryheap_first(), binaryheap_free(), binaryheap_remove_first(), binaryheap_replace_first(), buftag::blockNum, CkptSortItem::blockNum, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_PERMANENT, CkptSortItem::buf_id, BUF_WRITTEN, PgStat_CheckpointerStats::buf_written_checkpoints, CHECKPOINT_END_OF_RECOVERY, checkpoint_flush_after, CHECKPOINT_FLUSH_ALL, CHECKPOINT_IS_SHUTDOWN, CheckpointStats, CheckpointWriteDelay(), CheckpointStatsData::ckpt_bufs_written, CkptBufferIds, CurrentResourceOwner, DatumGetPointer, buftag::forkNum, CkptSortItem::forkNum, GetBufferDescriptor, i, CkptTsStatus::index, InvalidOid, IssuePendingWritebacks(), LockBufHdr(), NBuffers, CkptTsStatus::num_scanned, CkptTsStatus::num_to_scan, palloc(), PendingCheckpointerStats, pfree(), pg_atomic_read_u32(), PointerGetDatum, ProcessProcSignalBarrier(), ProcSignalBarrierPending, CkptTsStatus::progress, CkptTsStatus::progress_slice, CkptSortItem::relNode, RelFileNode::relNode, repalloc(), ResourceOwnerEnlargeBuffers(), buftag::rnode, RelFileNode::spcNode, BufferDesc::state, SyncOneBuffer(), BufferDesc::tag, ts_ckpt_progress_comparator(), CkptTsStatus::tsId, CkptSortItem::tsId, UnlockBufHdr, and WritebackContextInit().

Referenced by CheckPointBuffers().
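
The progress accounting above is what keeps checkpoint writes balanced across tablespaces: progress_slice is computed earlier in BufferSync() as roughly the total number of buffers to scan divided by this tablespace's own count, so every write advances the tablespace by a proportional amount, and the heap always yields the tablespace that is furthest behind. A standalone sketch of that idea, with invented buffer counts (this is illustration, not backend code):

/* Standalone sketch of the progress_slice balancing idea used in BufferSync().
 * Not backend code; the per-"tablespace" buffer counts are invented. */
#include <stdio.h>

int
main(void)
{
    double  num_to_scan[2] = {300.0, 100.0};    /* dirty buffers per tablespace */
    double  total = num_to_scan[0] + num_to_scan[1];
    double  progress[2] = {0.0, 0.0};
    double  slice[2];
    int     written[2] = {0, 0};

    /* progress_slice: how much progress one write in this tablespace represents */
    for (int ts = 0; ts < 2; ts++)
        slice[ts] = total / num_to_scan[ts];

    /* always write from the tablespace with the least accumulated progress,
     * as the binary heap does in BufferSync() */
    for (int i = 0; i < (int) total; i++)
    {
        int     ts = (progress[0] <= progress[1]) ? 0 : 1;

        progress[ts] += slice[ts];
        written[ts]++;
    }

    /* both tablespaces finish together, each written in proportion to its size */
    printf("ts0: %d writes, ts1: %d writes\n", written[0], written[1]);
    return 0;
}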

◆ buffertag_comparator()

static int buffertag_comparator ( const BufferTag *  ba,
const BufferTag *  bb 
)
inlinestatic

Definition at line 4788 of file bufmgr.c.

4789 {
4790  int ret;
4791 
4792  ret = rnode_comparator(&ba->rnode, &bb->rnode);
4793 
4794  if (ret != 0)
4795  return ret;
4796 
4797  if (ba->forkNum < bb->forkNum)
4798  return -1;
4799  if (ba->forkNum > bb->forkNum)
4800  return 1;
4801 
4802  if (ba->blockNum < bb->blockNum)
4803  return -1;
4804  if (ba->blockNum > bb->blockNum)
4805  return 1;
4806 
4807  return 0;
4808 }
static int rnode_comparator(const void *p1, const void *p2)
Definition: bufmgr.c:4709

References buftag::blockNum, buftag::forkNum, buftag::rnode, and rnode_comparator().
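
This comparator establishes the (relation, fork, block) ordering that later lets IssuePendingWritebacks() find runs of consecutive blocks after sorting. A standalone sketch of the same multi-key comparator pattern with qsort(); the struct and the values are invented for illustration:

/* Standalone illustration of the (relation, fork, block) ordering; not backend code. */
#include <stdio.h>
#include <stdlib.h>

typedef struct DemoTag
{
    unsigned    rel;            /* stands in for the RelFileNode */
    int         fork;           /* stands in for ForkNumber */
    unsigned    block;          /* stands in for BlockNumber */
} DemoTag;

static int
demo_tag_cmp(const void *p1, const void *p2)
{
    const DemoTag *a = p1;
    const DemoTag *b = p2;

    if (a->rel != b->rel)
        return (a->rel < b->rel) ? -1 : 1;
    if (a->fork != b->fork)
        return (a->fork < b->fork) ? -1 : 1;
    if (a->block != b->block)
        return (a->block < b->block) ? -1 : 1;
    return 0;
}

int
main(void)
{
    DemoTag     tags[] = {{16384, 0, 7}, {16384, 0, 5}, {12000, 1, 2}, {16384, 0, 6}};

    qsort(tags, 4, sizeof(DemoTag), demo_tag_cmp);

    /* blocks 5, 6, 7 of relation 16384 now sit next to each other */
    for (int i = 0; i < 4; i++)
        printf("rel=%u fork=%d block=%u\n", tags[i].rel, tags[i].fork, tags[i].block);
    return 0;
}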

◆ BufmgrCommit()

void BufmgrCommit ( void  )

Definition at line 2741 of file bufmgr.c.

2742 {
2743  /* Nothing to do in bufmgr anymore... */
2744 }

Referenced by PrepareTransaction(), and RecordTransactionCommit().

◆ CheckForBufferLeaks()

static void CheckForBufferLeaks ( void  )
static

Definition at line 2648 of file bufmgr.c.

2649 {
2650 #ifdef USE_ASSERT_CHECKING
2651  int RefCountErrors = 0;
2652  PrivateRefCountEntry *res;
2653  int i;
2654 
2655  /* check the array */
2656  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
2657  {
2658  res = &PrivateRefCountArray[i];
2659 
2660  if (res->buffer != InvalidBuffer)
2661  {
2662  PrintBufferLeakWarning(res->buffer);
2663  RefCountErrors++;
2664  }
2665  }
2666 
2667  /* if necessary search the hash */
2668  if (PrivateRefCountOverflowed)
2669  {
2670  HASH_SEQ_STATUS hstat;
2671 
2672  hash_seq_init(&hstat, PrivateRefCountHash);
2673  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
2674  {
2675  PrintBufferLeakWarning(res->buffer);
2676  RefCountErrors++;
2677  }
2678  }
2679 
2680  Assert(RefCountErrors == 0);
2681 #endif
2682 }
#define InvalidBuffer
Definition: buf.h:25
#define REFCOUNT_ARRAY_ENTRIES
Definition: bufmgr.c:90
void PrintBufferLeakWarning(Buffer buffer)
Definition: bufmgr.c:2688
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition: bufmgr.c:199
static HTAB * PrivateRefCountHash
Definition: bufmgr.c:200
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1436
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1426

References Assert(), hash_seq_init(), hash_seq_search(), i, InvalidBuffer, PrintBufferLeakWarning(), PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, and res.

Referenced by AtEOXact_Buffers(), and AtProcExit_Buffers().

◆ CheckPointBuffers()

void CheckPointBuffers ( int  flags)

Definition at line 2731 of file bufmgr.c.

2732 {
2733  BufferSync(flags);
2734 }
static void BufferSync(int flags)
Definition: bufmgr.c:1941

References BufferSync().

Referenced by CheckPointGuts().

◆ ckpt_buforder_comparator()

static int ckpt_buforder_comparator ( const CkptSortItem *  a,
const CkptSortItem *  b 
)
inlinestatic

Definition at line 4817 of file bufmgr.c.

4818 {
4819  /* compare tablespace */
4820  if (a->tsId < b->tsId)
4821  return -1;
4822  else if (a->tsId > b->tsId)
4823  return 1;
4824  /* compare relation */
4825  if (a->relNode < b->relNode)
4826  return -1;
4827  else if (a->relNode > b->relNode)
4828  return 1;
4829  /* compare fork */
4830  else if (a->forkNum < b->forkNum)
4831  return -1;
4832  else if (a->forkNum > b->forkNum)
4833  return 1;
4834  /* compare block number */
4835  else if (a->blockNum < b->blockNum)
4836  return -1;
4837  else if (a->blockNum > b->blockNum)
4838  return 1;
4839  /* equal page IDs are unlikely, but not impossible */
4840  return 0;
4841 }
int b
Definition: isn.c:70
int a
Definition: isn.c:69

References a, and b.

◆ ConditionalLockBuffer()

◆ ConditionalLockBufferForCleanup()

bool ConditionalLockBufferForCleanup ( Buffer  buffer)

Definition at line 4390 of file bufmgr.c.

4391 {
4392  BufferDesc *bufHdr;
4393  uint32 buf_state,
4394  refcount;
4395 
4396  Assert(BufferIsValid(buffer));
4397 
4398  if (BufferIsLocal(buffer))
4399  {
4400  refcount = LocalRefCount[-buffer - 1];
4401  /* There should be exactly one pin */
4402  Assert(refcount > 0);
4403  if (refcount != 1)
4404  return false;
4405  /* Nobody else to wait for */
4406  return true;
4407  }
4408 
4409  /* There should be exactly one local pin */
4410  refcount = GetPrivateRefCount(buffer);
4411  Assert(refcount);
4412  if (refcount != 1)
4413  return false;
4414 
4415  /* Try to acquire lock */
4416  if (!ConditionalLockBuffer(buffer))
4417  return false;
4418 
4419  bufHdr = GetBufferDescriptor(buffer - 1);
4420  buf_state = LockBufHdr(bufHdr);
4421  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
4422 
4423  Assert(refcount > 0);
4424  if (refcount == 1)
4425  {
4426  /* Successfully acquired exclusive lock with pincount 1 */
4427  UnlockBufHdr(bufHdr, buf_state);
4428  return true;
4429  }
4430 
4431  /* Failed, so release the lock */
4432  UnlockBufHdr(bufHdr, buf_state);
4433  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4434  return false;
4435 }
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:4182
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:4156
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:96

References Assert(), BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid, ConditionalLockBuffer(), GetBufferDescriptor, GetPrivateRefCount(), LocalRefCount, LockBuffer(), LockBufHdr(), PrivateRefCountEntry::refcount, and UnlockBufHdr.

Referenced by _hash_finish_split(), _hash_getbuf_with_condlock_cleanup(), heap_page_prune_opt(), and lazy_scan_heap().
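
A hedged sketch of the opportunistic caller pattern seen in heap_page_prune_opt() and lazy_scan_heap(): if the cleanup lock cannot be taken immediately, the page is simply skipped. The helper name is invented, the cleanup work is a placeholder, and the sketch assumes a normal backend context with the buffer already pinned.

#include "postgres.h"
#include "storage/bufmgr.h"

/* Hedged sketch: opportunistic cleanup of one already-pinned buffer. */
static void
maybe_cleanup_page_sketch(Buffer buf)
{
    if (!ConditionalLockBufferForCleanup(buf))
        return;                 /* someone else holds a pin or the lock; skip the page */

    /* exclusive content lock held and we are the sole pinner */
    /* ... page cleanup would go here (placeholder) ... */

    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}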

◆ CreateAndCopyRelationData()

void CreateAndCopyRelationData ( RelFileNode  src_rnode,
RelFileNode  dst_rnode,
bool  permanent 
)

Definition at line 3778 of file bufmgr.c.

3780 {
3781  Relation src_rel;
3782  Relation dst_rel;
3783  char relpersistence;
3784 
3785  /* Set the relpersistence. */
3786  relpersistence = permanent ?
3787  RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
3788 
3789  /*
3790  * We can't use a real relcache entry for a relation in some other
3791  * database, but since we're only going to access the fields related to
3792  * physical storage, a fake one is good enough. If we didn't do this and
3793  * used the smgr layer directly, we would have to worry about
3794  * invalidations.
3795  */
3796  src_rel = CreateFakeRelcacheEntry(src_rnode);
3797  dst_rel = CreateFakeRelcacheEntry(dst_rnode);
3798 
3799  /*
3800  * Create and copy all forks of the relation. During CREATE DATABASE we
3801  * have a separate cleanup mechanism which deletes the complete database
3802  * directory. Therefore, each individual relation doesn't need to be
3803  * registered for cleanup.
3804  */
3805  RelationCreateStorage(dst_rnode, relpersistence, false);
3806 
3807  /* copy main fork. */
3808  RelationCopyStorageUsingBuffer(src_rel, dst_rel, MAIN_FORKNUM, permanent);
3809 
3810  /* copy those extra forks that exist */
3811  for (ForkNumber forkNum = MAIN_FORKNUM + 1;
3812  forkNum <= MAX_FORKNUM; forkNum++)
3813  {
3814  if (smgrexists(RelationGetSmgr(src_rel), forkNum))
3815  {
3816  smgrcreate(RelationGetSmgr(dst_rel), forkNum, false);
3817 
3818  /*
3819  * WAL log creation if the relation is persistent, or this is the
3820  * init fork of an unlogged relation.
3821  */
3822  if (permanent || forkNum == INIT_FORKNUM)
3823  log_smgrcreate(&dst_rnode, forkNum);
3824 
3825  /* Copy a fork's data, block by block. */
3826  RelationCopyStorageUsingBuffer(src_rel, dst_rel, forkNum,
3827  permanent);
3828  }
3829  }
3830 
3831  /* Release fake relcache entries. */
3832  FreeFakeRelcacheEntry(src_rel);
3833  FreeFakeRelcacheEntry(dst_rel);
3834 }
static void RelationCopyStorageUsingBuffer(Relation src, Relation dst, ForkNumber forkNum, bool isunlogged)
Definition: bufmgr.c:3695
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:556
ForkNumber
Definition: relpath.h:41
@ MAIN_FORKNUM
Definition: relpath.h:43
#define MAX_FORKNUM
Definition: relpath.h:55
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition: smgr.c:369
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:247
SMgrRelation RelationCreateStorage(RelFileNode rnode, char relpersistence, bool register_delete)
Definition: storage.c:120
void log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum)
Definition: storage.c:185
void FreeFakeRelcacheEntry(Relation fakerel)
Definition: xlogutils.c:640
Relation CreateFakeRelcacheEntry(RelFileNode rnode)
Definition: xlogutils.c:597

References CreateFakeRelcacheEntry(), FreeFakeRelcacheEntry(), INIT_FORKNUM, log_smgrcreate(), MAIN_FORKNUM, MAX_FORKNUM, RelationCopyStorageUsingBuffer(), RelationCreateStorage(), RelationGetSmgr(), smgrcreate(), and smgrexists().

Referenced by CreateDatabaseUsingWalLog().

◆ DropDatabaseBuffers()

void DropDatabaseBuffers ( Oid  dbid)

Definition at line 3404 of file bufmgr.c.

3405 {
3406  int i;
3407 
3408  /*
3409  * We needn't consider local buffers, since by assumption the target
3410  * database isn't our own.
3411  */
3412 
3413  for (i = 0; i < NBuffers; i++)
3414  {
3415  BufferDesc *bufHdr = GetBufferDescriptor(i);
3416  uint32 buf_state;
3417 
3418  /*
3419  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3420  * and saves some cycles.
3421  */
3422  if (bufHdr->tag.rnode.dbNode != dbid)
3423  continue;
3424 
3425  buf_state = LockBufHdr(bufHdr);
3426  if (bufHdr->tag.rnode.dbNode == dbid)
3427  InvalidateBuffer(bufHdr); /* releases spinlock */
3428  else
3429  UnlockBufHdr(bufHdr, buf_state);
3430  }
3431 }
static void InvalidateBuffer(BufferDesc *buf)
Definition: bufmgr.c:1475

References RelFileNode::dbNode, GetBufferDescriptor, i, InvalidateBuffer(), LockBufHdr(), NBuffers, buftag::rnode, BufferDesc::tag, and UnlockBufHdr.

Referenced by createdb_failure_callback(), dbase_redo(), dropdb(), and movedb().

◆ DropRelFileNodeBuffers()

void DropRelFileNodeBuffers ( SMgrRelation  smgr_reln,
ForkNumber *  forkNum,
int  nforks,
BlockNumber *  firstDelBlock 
)

Definition at line 3050 of file bufmgr.c.

3052 {
3053  int i;
3054  int j;
3055  RelFileNodeBackend rnode;
3056  BlockNumber nForkBlock[MAX_FORKNUM];
3057  uint64 nBlocksToInvalidate = 0;
3058 
3059  rnode = smgr_reln->smgr_rnode;
3060 
3061  /* If it's a local relation, it's localbuf.c's problem. */
3062  if (RelFileNodeBackendIsTemp(rnode))
3063  {
3064  if (rnode.backend == MyBackendId)
3065  {
3066  for (j = 0; j < nforks; j++)
3067  DropRelFileNodeLocalBuffers(rnode.node, forkNum[j],
3068  firstDelBlock[j]);
3069  }
3070  return;
3071  }
3072 
3073  /*
3074  * To remove all the pages of the specified relation forks from the buffer
3075  * pool, we need to scan the entire buffer pool but we can optimize it by
3076  * finding the buffers from the BufMapping table, provided we know the exact
3077  * size of each fork of the relation. The exact size is required to ensure
3078  * that we don't leave any buffer for the relation being dropped as
3079  * otherwise the background writer or checkpointer can lead to a PANIC
3080  * error while flushing buffers corresponding to files that don't exist.
3081  *
3082  * To know the exact size, we rely on the size cached for each fork by us
3083  * during recovery, which limits the optimization to recovery and to
3084  * standbys, but we can easily extend it once we have a shared cache for
3085  * relation size.
3086  *
3087  * In recovery, we cache the value returned by the first lseek(SEEK_END)
3088  * and future writes keep the cached value up-to-date. See
3089  * smgrextend. It is possible that the value of the first lseek is smaller
3090  * than the actual number of existing blocks in the file due to buggy
3091  * Linux kernels that might not have accounted for the recent write. But
3092  * that should be fine because there must not be any buffers after that
3093  * file size.
3094  */
3095  for (i = 0; i < nforks; i++)
3096  {
3097  /* Get the number of blocks for a relation's fork */
3098  nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
3099 
3100  if (nForkBlock[i] == InvalidBlockNumber)
3101  {
3102  nBlocksToInvalidate = InvalidBlockNumber;
3103  break;
3104  }
3105 
3106  /* calculate the number of blocks to be invalidated */
3107  nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
3108  }
3109 
3110  /*
3111  * We apply the optimization iff the total number of blocks to invalidate
3112  * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
3113  */
3114  if (BlockNumberIsValid(nBlocksToInvalidate) &&
3115  nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
3116  {
3117  for (j = 0; j < nforks; j++)
3118  FindAndDropRelFileNodeBuffers(rnode.node, forkNum[j],
3119  nForkBlock[j], firstDelBlock[j]);
3120  return;
3121  }
3122 
3123  for (i = 0; i < NBuffers; i++)
3124  {
3125  BufferDesc *bufHdr = GetBufferDescriptor(i);
3126  uint32 buf_state;
3127 
3128  /*
3129  * We can make this a tad faster by prechecking the buffer tag before
3130  * we attempt to lock the buffer; this saves a lot of lock
3131  * acquisitions in typical cases. It should be safe because the
3132  * caller must have AccessExclusiveLock on the relation, or some other
3133  * reason to be certain that no one is loading new pages of the rel
3134  * into the buffer pool. (Otherwise we might well miss such pages
3135  * entirely.) Therefore, while the tag might be changing while we
3136  * look at it, it can't be changing *to* a value we care about, only
3137  * *away* from such a value. So false negatives are impossible, and
3138  * false positives are safe because we'll recheck after getting the
3139  * buffer lock.
3140  *
3141  * We could check forkNum and blockNum as well as the rnode, but the
3142  * incremental win from doing so seems small.
3143  */
3144  if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
3145  continue;
3146 
3147  buf_state = LockBufHdr(bufHdr);
3148 
3149  for (j = 0; j < nforks; j++)
3150  {
3151  if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
3152  bufHdr->tag.forkNum == forkNum[j] &&
3153  bufHdr->tag.blockNum >= firstDelBlock[j])
3154  {
3155  InvalidateBuffer(bufHdr); /* releases spinlock */
3156  break;
3157  }
3158  }
3159  if (j >= nforks)
3160  UnlockBufHdr(bufHdr, buf_state);
3161  }
3162 }
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition: bufmgr.c:81
static void FindAndDropRelFileNodeBuffers(RelFileNode rnode, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition: bufmgr.c:3343
BackendId MyBackendId
Definition: globals.c:85
int j
Definition: isn.c:74
void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum, BlockNumber firstDelBlock)
Definition: localbuf.c:326
#define RelFileNodeBackendIsTemp(rnode)
Definition: relfilenode.h:78
#define RelFileNodeEquals(node1, node2)
Definition: relfilenode.h:88
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:603
BackendId backend
Definition: relfilenode.h:75

References RelFileNodeBackend::backend, buftag::blockNum, BlockNumberIsValid, BUF_DROP_FULL_SCAN_THRESHOLD, DropRelFileNodeLocalBuffers(), FindAndDropRelFileNodeBuffers(), buftag::forkNum, GetBufferDescriptor, i, InvalidateBuffer(), InvalidBlockNumber, j, LockBufHdr(), MAX_FORKNUM, MyBackendId, NBuffers, RelFileNodeBackend::node, RelFileNodeBackendIsTemp, RelFileNodeEquals, buftag::rnode, SMgrRelationData::smgr_rnode, smgrnblocks_cached(), BufferDesc::tag, and UnlockBufHdr.

Referenced by smgrtruncate().
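
The decision above between targeted BufTable lookups and a full scan hinges on BUF_DROP_FULL_SCAN_THRESHOLD, which is NBuffers / 32. A standalone sketch of the arithmetic, using an assumed NBuffers of 16384 (i.e. shared_buffers = 128MB) and an invented block count:

/* Illustrative arithmetic only; NBuffers = 16384 corresponds to shared_buffers = 128MB. */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
    int         NBuffers = 16384;
    uint64_t    threshold = (uint64_t) (NBuffers / 32); /* BUF_DROP_FULL_SCAN_THRESHOLD */
    uint64_t    nBlocksToInvalidate = 300;              /* from cached smgrnblocks() values */

    if (nBlocksToInvalidate < threshold)
        printf("targeted BufTable lookups (%llu < %llu)\n",
               (unsigned long long) nBlocksToInvalidate,
               (unsigned long long) threshold);
    else
        printf("full scan of all %d buffer headers\n", NBuffers);
    return 0;
}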

◆ DropRelFileNodesAllBuffers()

void DropRelFileNodesAllBuffers ( SMgrRelation *  smgr_reln,
int  nnodes 
)

Definition at line 3174 of file bufmgr.c.

3175 {
3176  int i;
3177  int j;
3178  int n = 0;
3179  SMgrRelation *rels;
3180  BlockNumber (*block)[MAX_FORKNUM + 1];
3181  uint64 nBlocksToInvalidate = 0;
3182  RelFileNode *nodes;
3183  bool cached = true;
3184  bool use_bsearch;
3185 
3186  if (nnodes == 0)
3187  return;
3188 
3189  rels = palloc(sizeof(SMgrRelation) * nnodes); /* non-local relations */
3190 
3191  /* If it's a local relation, it's localbuf.c's problem. */
3192  for (i = 0; i < nnodes; i++)
3193  {
3194  if (RelFileNodeBackendIsTemp(smgr_reln[i]->smgr_rnode))
3195  {
3196  if (smgr_reln[i]->smgr_rnode.backend == MyBackendId)
3197  DropRelFileNodeAllLocalBuffers(smgr_reln[i]->smgr_rnode.node);
3198  }
3199  else
3200  rels[n++] = smgr_reln[i];
3201  }
3202 
3203  /*
3204  * If there are no non-local relations, then we're done. Release the
3205  * memory and return.
3206  */
3207  if (n == 0)
3208  {
3209  pfree(rels);
3210  return;
3211  }
3212 
3213  /*
3214  * This is used to remember the number of blocks for all the forks of the
3215  * given relations.
3216  */
3217  block = (BlockNumber (*)[MAX_FORKNUM + 1])
3218  palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
3219 
3220  /*
3221  * We can avoid scanning the entire buffer pool if we know the exact size
3222  * of each of the given relation forks. See DropRelFileNodeBuffers.
3223  */
3224  for (i = 0; i < n && cached; i++)
3225  {
3226  for (j = 0; j <= MAX_FORKNUM; j++)
3227  {
3228  /* Get the number of blocks for a relation's fork. */
3229  block[i][j] = smgrnblocks_cached(rels[i], j);
3230 
3231  /* We need to only consider the relation forks that exist. */
3232  if (block[i][j] == InvalidBlockNumber)
3233  {
3234  if (!smgrexists(rels[i], j))
3235  continue;
3236  cached = false;
3237  break;
3238  }
3239 
3240  /* calculate the total number of blocks to be invalidated */
3241  nBlocksToInvalidate += block[i][j];
3242  }
3243  }
3244 
3245  /*
3246  * We apply the optimization iff the total number of blocks to invalidate
3247  * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
3248  */
3249  if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
3250  {
3251  for (i = 0; i < n; i++)
3252  {
3253  for (j = 0; j <= MAX_FORKNUM; j++)
3254  {
3255  /* ignore relation forks that don't exist */
3256  if (!BlockNumberIsValid(block[i][j]))
3257  continue;
3258 
3259  /* drop all the buffers for a particular relation fork */
3260  FindAndDropRelFileNodeBuffers(rels[i]->smgr_rnode.node,
3261  j, block[i][j], 0);
3262  }
3263  }
3264 
3265  pfree(block);
3266  pfree(rels);
3267  return;
3268  }
3269 
3270  pfree(block);
3271  nodes = palloc(sizeof(RelFileNode) * n); /* non-local relations */
3272  for (i = 0; i < n; i++)
3273  nodes[i] = rels[i]->smgr_rnode.node;
3274 
3275  /*
3276  * For a low number of relations to drop, just use a simple walk-through to
3277  * save the bsearch overhead. The threshold to use is rather a guess than
3278  * an exactly determined value, as it depends on many factors (CPU and RAM
3279  * speeds, amount of shared buffers etc.).
3280  */
3281  use_bsearch = n > RELS_BSEARCH_THRESHOLD;
3282 
3283  /* sort the list of rnodes if necessary */
3284  if (use_bsearch)
3285  pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
3286 
3287  for (i = 0; i < NBuffers; i++)
3288  {
3289  RelFileNode *rnode = NULL;
3290  BufferDesc *bufHdr = GetBufferDescriptor(i);
3291  uint32 buf_state;
3292 
3293  /*
3294  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3295  * and saves some cycles.
3296  */
3297 
3298  if (!use_bsearch)
3299  {
3300  int j;
3301 
3302  for (j = 0; j < n; j++)
3303  {
3304  if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
3305  {
3306  rnode = &nodes[j];
3307  break;
3308  }
3309  }
3310  }
3311  else
3312  {
3313  rnode = bsearch((const void *) &(bufHdr->tag.rnode),
3314  nodes, n, sizeof(RelFileNode),
3315  rnode_comparator);
3316  }
3317 
3318  /* buffer doesn't belong to any of the given relfilenodes; skip it */
3319  if (rnode == NULL)
3320  continue;
3321 
3322  buf_state = LockBufHdr(bufHdr);
3323  if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
3324  InvalidateBuffer(bufHdr); /* releases spinlock */
3325  else
3326  UnlockBufHdr(bufHdr, buf_state);
3327  }
3328 
3329  pfree(nodes);
3330  pfree(rels);
3331 }
#define RELS_BSEARCH_THRESHOLD
Definition: bufmgr.c:73
void DropRelFileNodeAllLocalBuffers(RelFileNode rnode)
Definition: localbuf.c:373
void pg_qsort(void *base, size_t nel, size_t elsize, int(*cmp)(const void *, const void *))

References BlockNumberIsValid, BUF_DROP_FULL_SCAN_THRESHOLD, DropRelFileNodeAllLocalBuffers(), FindAndDropRelFileNodeBuffers(), GetBufferDescriptor, i, if(), InvalidateBuffer(), InvalidBlockNumber, j, LockBufHdr(), MAX_FORKNUM, MyBackendId, NBuffers, SMgrRelationData::node, palloc(), pfree(), pg_qsort(), RelFileNodeBackendIsTemp, RelFileNodeEquals, RELS_BSEARCH_THRESHOLD, buftag::rnode, rnode_comparator(), smgrexists(), smgrnblocks_cached(), BufferDesc::tag, and UnlockBufHdr.

Referenced by smgrdounlinkall().
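
Inside the fallback full scan, membership of a buffer's relfilenode in the drop list is tested either by a linear walk or, above RELS_BSEARCH_THRESHOLD (20) relations, by bsearch over a sorted copy. A standalone sketch of that pattern, with plain integers standing in for RelFileNodes:

/* Standalone sketch of "linear walk below a threshold, otherwise sort + bsearch". */
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

static int
int_cmp(const void *a, const void *b)
{
    int         x = *(const int *) a;
    int         y = *(const int *) b;

    return (x > y) - (x < y);
}

static bool
contains(const int *keys, int n, int value, bool use_bsearch)
{
    if (!use_bsearch)
    {
        for (int i = 0; i < n; i++)
            if (keys[i] == value)
                return true;
        return false;
    }
    return bsearch(&value, keys, n, sizeof(int), int_cmp) != NULL;
}

int
main(void)
{
    int         keys[] = {16384, 16390, 24576};    /* invented "relfilenode" numbers */
    int         n = 3;
    bool        use_bsearch = (n > 20);            /* RELS_BSEARCH_THRESHOLD analogue */

    if (use_bsearch)
        qsort(keys, n, sizeof(int), int_cmp);

    printf("16390 present: %d\n", contains(keys, n, 16390, use_bsearch));
    return 0;
}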

◆ FindAndDropRelFileNodeBuffers()

static void FindAndDropRelFileNodeBuffers ( RelFileNode  rnode,
ForkNumber  forkNum,
BlockNumber  nForkBlock,
BlockNumber  firstDelBlock 
)
static

Definition at line 3343 of file bufmgr.c.

3346 {
3347  BlockNumber curBlock;
3348 
3349  for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
3350  {
3351  uint32 bufHash; /* hash value for tag */
3352  BufferTag bufTag; /* identity of requested block */
3353  LWLock *bufPartitionLock; /* buffer partition lock for it */
3354  int buf_id;
3355  BufferDesc *bufHdr;
3356  uint32 buf_state;
3357 
3358  /* create a tag so we can lookup the buffer */
3359  INIT_BUFFERTAG(bufTag, rnode, forkNum, curBlock);
3360 
3361  /* determine its hash code and partition lock ID */
3362  bufHash = BufTableHashCode(&bufTag);
3363  bufPartitionLock = BufMappingPartitionLock(bufHash);
3364 
3365  /* Check that it is in the buffer pool. If not, do nothing. */
3366  LWLockAcquire(bufPartitionLock, LW_SHARED);
3367  buf_id = BufTableLookup(&bufTag, bufHash);
3368  LWLockRelease(bufPartitionLock);
3369 
3370  if (buf_id < 0)
3371  continue;
3372 
3373  bufHdr = GetBufferDescriptor(buf_id);
3374 
3375  /*
3376  * We need to lock the buffer header and recheck if the buffer is
3377  * still associated with the same block because the buffer could be
3378  * evicted by some other backend loading blocks for a different
3379  * relation after we release lock on the BufMapping table.
3380  */
3381  buf_state = LockBufHdr(bufHdr);
3382 
3383  if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
3384  bufHdr->tag.forkNum == forkNum &&
3385  bufHdr->tag.blockNum >= firstDelBlock)
3386  InvalidateBuffer(bufHdr); /* releases spinlock */
3387  else
3388  UnlockBufHdr(bufHdr, buf_state);
3389  }
3390 }

References buftag::blockNum, BufMappingPartitionLock, BufTableHashCode(), BufTableLookup(), buftag::forkNum, GetBufferDescriptor, INIT_BUFFERTAG, InvalidateBuffer(), LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), RelFileNodeEquals, buftag::rnode, BufferDesc::tag, and UnlockBufHdr.

Referenced by DropRelFileNodeBuffers(), and DropRelFileNodesAllBuffers().

◆ FlushBuffer()

static void FlushBuffer ( BufferDesc *  buf,
SMgrRelation  reln 
)
static

Definition at line 2815 of file bufmgr.c.

2816 {
2817  XLogRecPtr recptr;
2818  ErrorContextCallback errcallback;
2819  instr_time io_start,
2820  io_time;
2821  Block bufBlock;
2822  char *bufToWrite;
2823  uint32 buf_state;
2824 
2825  /*
2826  * Try to start an I/O operation. If StartBufferIO returns false, then
2827  * someone else flushed the buffer before we could, so we need not do
2828  * anything.
2829  */
2830  if (!StartBufferIO(buf, false))
2831  return;
2832 
2833  /* Setup error traceback support for ereport() */
2834  errcallback.callback = shared_buffer_write_error_callback;
2835  errcallback.arg = (void *) buf;
2836  errcallback.previous = error_context_stack;
2837  error_context_stack = &errcallback;
2838 
2839  /* Find smgr relation for buffer */
2840  if (reln == NULL)
2841  reln = smgropen(buf->tag.rnode, InvalidBackendId);
2842 
2843  TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
2844  buf->tag.blockNum,
2845  reln->smgr_rnode.node.spcNode,
2846  reln->smgr_rnode.node.dbNode,
2847  reln->smgr_rnode.node.relNode);
2848 
2849  buf_state = LockBufHdr(buf);
2850 
2851  /*
2852  * Run PageGetLSN while holding header lock, since we don't have the
2853  * buffer locked exclusively in all cases.
2854  */
2855  recptr = BufferGetLSN(buf);
2856 
2857  /* To check if block content changes while flushing. - vadim 01/17/97 */
2858  buf_state &= ~BM_JUST_DIRTIED;
2859  UnlockBufHdr(buf, buf_state);
2860 
2861  /*
2862  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
2863  * rule that log updates must hit disk before any of the data-file changes
2864  * they describe do.
2865  *
2866  * However, this rule does not apply to unlogged relations, which will be
2867  * lost after a crash anyway. Most unlogged relation pages do not bear
2868  * LSNs since we never emit WAL records for them, and therefore flushing
2869  * up through the buffer LSN would be useless, but harmless. However,
2870  * GiST indexes use LSNs internally to track page-splits, and therefore
2871  * unlogged GiST pages bear "fake" LSNs generated by
2872  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
2873  * LSN counter could advance past the WAL insertion point; and if it did
2874  * happen, attempting to flush WAL through that location would fail, with
2875  * disastrous system-wide consequences. To make sure that can't happen,
2876  * skip the flush if the buffer isn't permanent.
2877  */
2878  if (buf_state & BM_PERMANENT)
2879  XLogFlush(recptr);
2880 
2881  /*
2882  * Now it's safe to write buffer to disk. Note that no one else should
2883  * have been able to write it while we were busy with log flushing because
2884  * only one process at a time can set the BM_IO_IN_PROGRESS bit.
2885  */
2886  bufBlock = BufHdrGetBlock(buf);
2887 
2888  /*
2889  * Update page checksum if desired. Since we have only shared lock on the
2890  * buffer, other processes might be updating hint bits in it, so we must
2891  * copy the page to private storage if we do checksumming.
2892  */
2893  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
2894 
2895  if (track_io_timing)
2896  INSTR_TIME_SET_CURRENT(io_start);
2897 
2898  /*
2899  * bufToWrite is either the shared buffer or a copy, as appropriate.
2900  */
2901  smgrwrite(reln,
2902  buf->tag.forkNum,
2903  buf->tag.blockNum,
2904  bufToWrite,
2905  false);
2906 
2907  if (track_io_timing)
2908  {
2909  INSTR_TIME_SET_CURRENT(io_time);
2910  INSTR_TIME_SUBTRACT(io_time, io_start);
2911  pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
2912  INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
2913  }
2914 
2915  pgBufferUsage.shared_blks_written++;
2916 
2917  /*
2918  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
2919  * end the BM_IO_IN_PROGRESS state.
2920  */
2921  TerminateBufferIO(buf, true, 0);
2922 
2923  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
2924  buf->tag.blockNum,
2925  reln->smgr_rnode.node.spcNode,
2926  reln->smgr_rnode.node.dbNode,
2927  reln->smgr_rnode.node.relNode);
2928 
2929  /* Pop the error context stack */
2930  error_context_stack = errcallback.previous;
2931 }
#define InvalidBackendId
Definition: backendid.h:23
bool track_io_timing
Definition: bufmgr.c:137
#define BufHdrGetBlock(bufHdr)
Definition: bufmgr.c:62
static void shared_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:4671
void * Block
Definition: bufmgr.h:24
char * PageSetChecksumCopy(Page page, BlockNumber blkno)
Definition: bufpage.c:1510
Pointer Page
Definition: bufpage.h:78
ErrorContextCallback * error_context_stack
Definition: elog.c:93
#define INSTR_TIME_SET_CURRENT(t)
Definition: instr_time.h:156
#define INSTR_TIME_ADD(x, y)
Definition: instr_time.h:158
#define INSTR_TIME_SUBTRACT(x, y)
Definition: instr_time.h:170
struct timeval instr_time
Definition: instr_time.h:150
#define INSTR_TIME_GET_MICROSEC(t)
Definition: instr_time.h:205
BufferUsage pgBufferUsage
Definition: instrument.c:20
#define pgstat_count_buffer_write_time(n)
Definition: pgstat.h:470
void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync)
Definition: smgr.c:554
SMgrRelation smgropen(RelFileNode rnode, BackendId backend)
Definition: smgr.c:146
instr_time blk_write_time
Definition: instrument.h:37
int64 shared_blks_written
Definition: instrument.h:29
struct ErrorContextCallback * previous
Definition: elog.h:232
void(* callback)(void *arg)
Definition: elog.h:233
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2509

References ErrorContextCallback::arg, BufferUsage::blk_write_time, BM_JUST_DIRTIED, BM_PERMANENT, buf, BufferGetLSN, BufHdrGetBlock, ErrorContextCallback::callback, RelFileNode::dbNode, error_context_stack, INSTR_TIME_ADD, INSTR_TIME_GET_MICROSEC, INSTR_TIME_SET_CURRENT, INSTR_TIME_SUBTRACT, InvalidBackendId, LockBufHdr(), RelFileNodeBackend::node, PageSetChecksumCopy(), pgBufferUsage, pgstat_count_buffer_write_time, ErrorContextCallback::previous, RelFileNode::relNode, BufferUsage::shared_blks_written, shared_buffer_write_error_callback(), SMgrRelationData::smgr_rnode, smgropen(), smgrwrite(), RelFileNode::spcNode, StartBufferIO(), TerminateBufferIO(), track_io_timing, UnlockBufHdr, and XLogFlush().

Referenced by BufferAlloc(), FlushDatabaseBuffers(), FlushOneBuffer(), FlushRelationBuffers(), FlushRelationsAllBuffers(), and SyncOneBuffer().
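
Because FlushBuffer() holds only a shared content lock, other backends may still set hint bits in the shared page while it is being written; that is why PageSetChecksumCopy() checksums a private copy rather than the shared buffer. A standalone sketch of the copy-then-checksum idea; the checksum function here is a toy stand-in, not PostgreSQL's page checksum algorithm:

/* Toy illustration of checksumming a private copy; not the backend's algorithm. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define DEMO_BLCKSZ 8192

static uint16_t
toy_checksum(const uint8_t *page, size_t len)
{
    uint32_t    sum = 0;

    for (size_t i = 0; i < len; i++)
        sum = (sum * 31) + page[i];
    return (uint16_t) sum;
}

int
main(void)
{
    static uint8_t shared_page[DEMO_BLCKSZ];    /* hint bits may flip concurrently in reality */
    uint8_t     copy[DEMO_BLCKSZ];

    /* copy to private storage first, then checksum the stable copy */
    memcpy(copy, shared_page, DEMO_BLCKSZ);
    printf("checksum of stable copy: %u\n", toy_checksum(copy, DEMO_BLCKSZ));
    /* ... the copy, not the shared page, would then be handed to the write call ... */
    return 0;
}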

◆ FlushDatabaseBuffers()

void FlushDatabaseBuffers ( Oid  dbid)

Definition at line 3852 of file bufmgr.c.

3853 {
3854  int i;
3855  BufferDesc *bufHdr;
3856 
3857  /* Make sure we can handle the pin inside the loop */
3858  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3859 
3860  for (i = 0; i < NBuffers; i++)
3861  {
3862  uint32 buf_state;
3863 
3864  bufHdr = GetBufferDescriptor(i);
3865 
3866  /*
3867  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3868  * and saves some cycles.
3869  */
3870  if (bufHdr->tag.rnode.dbNode != dbid)
3871  continue;
3872 
3873  ReservePrivateRefCountEntry();
3874 
3875  buf_state = LockBufHdr(bufHdr);
3876  if (bufHdr->tag.rnode.dbNode == dbid &&
3877  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3878  {
3879  PinBuffer_Locked(bufHdr);
3880  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3881  FlushBuffer(bufHdr, NULL);
3882  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3883  UnpinBuffer(bufHdr, true);
3884  }
3885  else
3886  UnlockBufHdr(bufHdr, buf_state);
3887  }
3888 }

References BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock, CurrentResourceOwner, RelFileNode::dbNode, FlushBuffer(), GetBufferDescriptor, i, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), buftag::rnode, BufferDesc::tag, UnlockBufHdr, and UnpinBuffer().

Referenced by dbase_redo().

◆ FlushOneBuffer()

void FlushOneBuffer ( Buffer  buffer)

Definition at line 3895 of file bufmgr.c.

3896 {
3897  BufferDesc *bufHdr;
3898 
3899  /* currently not needed, but no fundamental reason not to support */
3900  Assert(!BufferIsLocal(buffer));
3901 
3902  Assert(BufferIsPinned(buffer));
3903 
3904  bufHdr = GetBufferDescriptor(buffer - 1);
3905 
3906  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
3907 
3908  FlushBuffer(bufHdr, NULL);
3909 }
bool LWLockHeldByMe(LWLock *l)
Definition: lwlock.c:1916

References Assert(), PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsPinned, FlushBuffer(), GetBufferDescriptor, and LWLockHeldByMe().

Referenced by hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), and XLogReadBufferForRedoExtended().

◆ FlushRelationBuffers()

void FlushRelationBuffers ( Relation  rel)

Definition at line 3508 of file bufmgr.c.

3509 {
3510  int i;
3511  BufferDesc *bufHdr;
3512 
3513  if (RelationUsesLocalBuffers(rel))
3514  {
3515  for (i = 0; i < NLocBuffer; i++)
3516  {
3517  uint32 buf_state;
3518 
3519  bufHdr = GetLocalBufferDescriptor(i);
3520  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3521  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
3522  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3523  {
3524  ErrorContextCallback errcallback;
3525  Page localpage;
3526 
3527  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
3528 
3529  /* Setup error traceback support for ereport() */
3530  errcallback.callback = local_buffer_write_error_callback;
3531  errcallback.arg = (void *) bufHdr;
3532  errcallback.previous = error_context_stack;
3533  error_context_stack = &errcallback;
3534 
3535  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
3536 
3537  smgrwrite(RelationGetSmgr(rel),
3538  bufHdr->tag.forkNum,
3539  bufHdr->tag.blockNum,
3540  localpage,
3541  false);
3542 
3543  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
3544  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
3545 
3546  /* Pop the error context stack */
3547  error_context_stack = errcallback.previous;
3548  }
3549  }
3550 
3551  return;
3552  }
3553 
3554  /* Make sure we can handle the pin inside the loop */
3555  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3556 
3557  for (i = 0; i < NBuffers; i++)
3558  {
3559  uint32 buf_state;
3560 
3561  bufHdr = GetBufferDescriptor(i);
3562 
3563  /*
3564  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3565  * and saves some cycles.
3566  */
3567  if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
3568  continue;
3569 
3570  ReservePrivateRefCountEntry();
3571 
3572  buf_state = LockBufHdr(bufHdr);
3573  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3574  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3575  {
3576  PinBuffer_Locked(bufHdr);
3577  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3578  FlushBuffer(bufHdr, RelationGetSmgr(rel));
3579  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3580  UnpinBuffer(bufHdr, true);
3581  }
3582  else
3583  UnlockBufHdr(bufHdr, buf_state);
3584  }
3585 }
static void pg_atomic_unlocked_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:277
#define LocalBufHdrGetBlock(bufHdr)
Definition: bufmgr.c:66
static void local_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:4690
void PageSetChecksumInplace(Page page, BlockNumber blkno)
Definition: bufpage.c:1539
int NLocBuffer
Definition: localbuf.c:41
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:622
RelFileNode rd_node
Definition: rel.h:56

References ErrorContextCallback::arg, buftag::blockNum, BM_DIRTY, BM_JUST_DIRTIED, BM_VALID, BufferDescriptorGetContentLock, ErrorContextCallback::callback, CurrentResourceOwner, error_context_stack, FlushBuffer(), buftag::forkNum, GetBufferDescriptor, GetLocalBufferDescriptor, i, local_buffer_write_error_callback(), LocalBufHdrGetBlock, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, NLocBuffer, PageSetChecksumInplace(), pg_atomic_read_u32(), pg_atomic_unlocked_write_u32(), PinBuffer_Locked(), ErrorContextCallback::previous, RelationData::rd_node, RelationGetSmgr(), RelationUsesLocalBuffers, RelFileNodeEquals, ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), buftag::rnode, smgrwrite(), BufferDesc::state, BufferDesc::tag, UnlockBufHdr, and UnpinBuffer().

Referenced by fill_seq_with_data(), heapam_relation_copy_data(), and index_copy_data().

◆ FlushRelationsAllBuffers()

void FlushRelationsAllBuffers ( SMgrRelation *  smgrs,
int  nrels 
)

Definition at line 3597 of file bufmgr.c.

3598 {
3599  int i;
3600  SMgrSortArray *srels;
3601  bool use_bsearch;
3602 
3603  if (nrels == 0)
3604  return;
3605 
3606  /* fill-in array for qsort */
3607  srels = palloc(sizeof(SMgrSortArray) * nrels);
3608 
3609  for (i = 0; i < nrels; i++)
3610  {
3611  Assert(!RelFileNodeBackendIsTemp(smgrs[i]->smgr_rnode));
3612 
3613  srels[i].rnode = smgrs[i]->smgr_rnode.node;
3614  srels[i].srel = smgrs[i];
3615  }
3616 
3617  /*
3618  * Save the bsearch overhead for a low number of relations to sync. See
3619  * DropRelFileNodesAllBuffers for details.
3620  */
3621  use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
3622 
3623  /* sort the list of SMgrRelations if necessary */
3624  if (use_bsearch)
3625  pg_qsort(srels, nrels, sizeof(SMgrSortArray), rnode_comparator);
3626 
3627  /* Make sure we can handle the pin inside the loop */
3628  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3629 
3630  for (i = 0; i < NBuffers; i++)
3631  {
3632  SMgrSortArray *srelent = NULL;
3633  BufferDesc *bufHdr = GetBufferDescriptor(i);
3634  uint32 buf_state;
3635 
3636  /*
3637  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3638  * and saves some cycles.
3639  */
3640 
3641  if (!use_bsearch)
3642  {
3643  int j;
3644 
3645  for (j = 0; j < nrels; j++)
3646  {
3647  if (RelFileNodeEquals(bufHdr->tag.rnode, srels[j].rnode))
3648  {
3649  srelent = &srels[j];
3650  break;
3651  }
3652  }
3653  }
3654  else
3655  {
3656  srelent = bsearch((const void *) &(bufHdr->tag.rnode),
3657  srels, nrels, sizeof(SMgrSortArray),
3658  rnode_comparator);
3659  }
3660 
3661  /* buffer doesn't belong to any of the given relfilenodes; skip it */
3662  if (srelent == NULL)
3663  continue;
3664 
3665  ReservePrivateRefCountEntry();
3666 
3667  buf_state = LockBufHdr(bufHdr);
3668  if (RelFileNodeEquals(bufHdr->tag.rnode, srelent->rnode) &&
3669  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3670  {
3671  PinBuffer_Locked(bufHdr);
3672  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3673  FlushBuffer(bufHdr, srelent->srel);
3674  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3675  UnpinBuffer(bufHdr, true);
3676  }
3677  else
3678  UnlockBufHdr(bufHdr, buf_state);
3679  }
3680 
3681  pfree(srels);
3682 }
RelFileNode rnode
Definition: bufmgr.c:129
SMgrRelation srel
Definition: bufmgr.c:130

References Assert(), BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock, CurrentResourceOwner, FlushBuffer(), GetBufferDescriptor, i, j, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, RelFileNodeBackend::node, palloc(), pfree(), pg_qsort(), PinBuffer_Locked(), RelFileNodeBackendIsTemp, RelFileNodeEquals, RELS_BSEARCH_THRESHOLD, ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), SMgrSortArray::rnode, buftag::rnode, rnode_comparator(), SMgrRelationData::smgr_rnode, SMgrSortArray::srel, BufferDesc::tag, UnlockBufHdr, and UnpinBuffer().

Referenced by smgrdosyncall().

◆ ForgetPrivateRefCountEntry()

static void ForgetPrivateRefCountEntry ( PrivateRefCountEntry *  ref)
static

Definition at line 412 of file bufmgr.c.

413 {
414  Assert(ref->refcount == 0);
415 
416  if (ref >= &PrivateRefCountArray[0] &&
417  ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
418  {
419  ref->buffer = InvalidBuffer;
420 
421  /*
422  * Mark the just used entry as reserved - in many scenarios that
423  * allows us to avoid ever having to search the array/hash for free
424  * entries.
425  */
426  ReservedRefCountEntry = ref;
427  }
428  else
429  {
430  bool found;
431  Buffer buffer = ref->buffer;
432 
433  hash_search(PrivateRefCountHash,
434  (void *) &buffer,
435  HASH_REMOVE,
436  &found);
437  Assert(found);
438  Assert(PrivateRefCountOverflowed > 0);
439  PrivateRefCountOverflowed--;
440  }
441 }
int Buffer
Definition: buf.h:23
static PrivateRefCountEntry * ReservedRefCountEntry
Definition: bufmgr.c:203
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:954
@ HASH_REMOVE
Definition: hsearch.h:115

References Assert(), PrivateRefCountEntry::buffer, HASH_REMOVE, hash_search(), InvalidBuffer, PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, REFCOUNT_ARRAY_ENTRIES, and ReservedRefCountEntry.

Referenced by UnpinBuffer().

◆ GetPrivateRefCount()

static int32 GetPrivateRefCount ( Buffer  buffer)
inlinestatic

Definition at line 389 of file bufmgr.c.

390 {
391  PrivateRefCountEntry *ref;
392 
393  Assert(BufferIsValid(buffer));
394  Assert(!BufferIsLocal(buffer));
395 
396  /*
397  * Not moving the entry - that's ok for the current users, but we might
398  * want to change this one day.
399  */
400  ref = GetPrivateRefCountEntry(buffer, false);
401 
402  if (ref == NULL)
403  return 0;
404  return ref->refcount;
405 }
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition: bufmgr.c:309

References Assert(), PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid, GetPrivateRefCountEntry(), and PrivateRefCountEntry::refcount.

Referenced by ConditionalLockBufferForCleanup(), HoldingBufferPinThatDelaysRecovery(), InvalidateBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkBufferDirtyHint(), PrintBufferLeakWarning(), and ReadRecentBuffer().

◆ GetPrivateRefCountEntry()

static PrivateRefCountEntry * GetPrivateRefCountEntry ( Buffer  buffer,
bool  do_move 
)
static

Definition at line 309 of file bufmgr.c.

310 {
311  PrivateRefCountEntry *res;
312  int i;
313 
314  Assert(BufferIsValid(buffer));
315  Assert(!BufferIsLocal(buffer));
316 
317  /*
318  * First search for references in the array, that'll be sufficient in the
319  * majority of cases.
320  */
321  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
322  {
323  res = &PrivateRefCountArray[i];
324 
325  if (res->buffer == buffer)
326  return res;
327  }
328 
329  /*
330  * By here we know that the buffer, if already pinned, isn't residing in
331  * the array.
332  *
333  * Only look up the buffer in the hashtable if we've previously overflowed
334  * into it.
335  */
336  if (PrivateRefCountOverflowed == 0)
337  return NULL;
338 
339  res = hash_search(PrivateRefCountHash,
340  (void *) &buffer,
341  HASH_FIND,
342  NULL);
343 
344  if (res == NULL)
345  return NULL;
346  else if (!do_move)
347  {
348  /* caller doesn't want us to move the hash entry into the array */
349  return res;
350  }
351  else
352  {
353  /* move buffer from hashtable into the free array slot */
354  bool found;
355  PrivateRefCountEntry *free;
356 
357  /* Ensure there's a free array slot */
358  ReservePrivateRefCountEntry();
359 
360  /* Use up the reserved slot */
361  Assert(ReservedRefCountEntry != NULL);
362  free = ReservedRefCountEntry;
363  ReservedRefCountEntry = NULL;
364  Assert(free->buffer == InvalidBuffer);
365 
366  /* and fill it */
367  free->buffer = buffer;
368  free->refcount = res->refcount;
369 
370  /* delete from hashtable */
371  hash_search(PrivateRefCountHash,
372  (void *) &buffer,
373  HASH_REMOVE,
374  &found);
375  Assert(found);
376  Assert(PrivateRefCountOverflowed > 0);
377  PrivateRefCountOverflowed--;
378 
379  return free;
380  }
381 }
#define free(a)
Definition: header.h:65
@ HASH_FIND
Definition: hsearch.h:113

References Assert(), PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid, free, HASH_FIND, HASH_REMOVE, hash_search(), i, InvalidBuffer, PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, res, ReservedRefCountEntry, and ReservePrivateRefCountEntry().

Referenced by GetPrivateRefCount(), IncrBufferRefCount(), PinBuffer(), PinBuffer_Locked(), and UnpinBuffer().
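
The private refcount machinery keeps up to REFCOUNT_ARRAY_ENTRIES (8) entries in a small array and spills the rest to a dynahash table, consulting the table only after it has overflowed. A standalone miniature of that two-tier lookup; the sizes are shrunk, the overflow store is a plain array standing in for the hash table, and the move-back-into-the-array step is omitted for brevity:

/* Standalone miniature of "small array first, overflow table second"; the overflow
 * store here is just a bigger array standing in for the backend's dynahash table. */
#include <stdio.h>

#define ARRAY_ENTRIES 4
#define OVERFLOW_ENTRIES 64

typedef struct RefEntry { int buffer; int refcount; } RefEntry;

static RefEntry fast[ARRAY_ENTRIES];        /* checked first, cheap */
static RefEntry overflow[OVERFLOW_ENTRIES]; /* consulted only if something spilled */
static int  overflowed = 0;

static RefEntry *
lookup(int buffer)
{
    /* buffer ids start at 1, so 0 marks an empty slot */
    for (int i = 0; i < ARRAY_ENTRIES; i++)
        if (fast[i].buffer == buffer)
            return &fast[i];
    if (overflowed == 0)
        return NULL;            /* nothing ever spilled: skip the slow path */
    for (int i = 0; i < OVERFLOW_ENTRIES; i++)
        if (overflow[i].buffer == buffer)
            return &overflow[i];
    return NULL;
}

static void
pin(int buffer)
{
    RefEntry   *e = lookup(buffer);

    if (e == NULL)
    {
        for (int i = 0; i < ARRAY_ENTRIES; i++)
            if (fast[i].buffer == 0) { e = &fast[i]; break; }
        if (e == NULL)          /* array full: spill to the overflow store */
        {
            for (int i = 0; i < OVERFLOW_ENTRIES; i++)
                if (overflow[i].buffer == 0) { e = &overflow[i]; break; }
            overflowed++;
        }
        e->buffer = buffer;
    }
    e->refcount++;
}

int
main(void)
{
    for (int b = 1; b <= 6; b++)
        pin(b);                 /* buffers 5 and 6 spill past the 4-entry array */
    printf("overflowed entries: %d, refcount(5)=%d\n", overflowed, lookup(5)->refcount);
    return 0;
}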

◆ HoldingBufferPinThatDelaysRecovery()

bool HoldingBufferPinThatDelaysRecovery ( void  )

Definition at line 4364 of file bufmgr.c.

4365 {
4366  int bufid = GetStartupBufferPinWaitBufId();
4367 
4368  /*
4369  * If we get woken slowly then it's possible that the Startup process was
4370  * already woken by other backends before we got here. Also possible that
4371  * we get here by multiple interrupts or interrupts at inappropriate
4372  * times, so make sure we do nothing if the bufid is not set.
4373  */
4374  if (bufid < 0)
4375  return false;
4376 
4377  if (GetPrivateRefCount(bufid + 1) > 0)
4378  return true;
4379 
4380  return false;
4381 }
int GetStartupBufferPinWaitBufId(void)
Definition: proc.c:645

References GetPrivateRefCount(), and GetStartupBufferPinWaitBufId().

Referenced by CheckRecoveryConflictDeadlock(), and RecoveryConflictInterrupt().

◆ IncrBufferRefCount()

void IncrBufferRefCount ( Buffer  buffer)

Definition at line 3953 of file bufmgr.c.

3954 {
3955  Assert(BufferIsPinned(buffer));
3956  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3957  if (BufferIsLocal(buffer))
3958  LocalRefCount[-buffer - 1]++;
3959  else
3960  {
3961  PrivateRefCountEntry *ref;
3962 
3963  ref = GetPrivateRefCountEntry(buffer, true);
3964  Assert(ref != NULL);
3965  ref->refcount++;
3966  }
3967  ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
3968 }
void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
Definition: resowner.c:958

References Assert(), PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, CurrentResourceOwner, GetPrivateRefCountEntry(), LocalRefCount, PrivateRefCountEntry::refcount, ResourceOwnerEnlargeBuffers(), and ResourceOwnerRememberBuffer().

Referenced by _bt_steppage(), btrestrpos(), entryLoadMoreItems(), ReadBufferBI(), scanPostingTree(), startScanEntry(), and tts_buffer_heap_store_tuple().

◆ InitBufferPoolAccess()

void InitBufferPoolAccess ( void  )

Definition at line 2604 of file bufmgr.c.

2605 {
2606  HASHCTL hash_ctl;
2607 
2608  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
2609 
2610  hash_ctl.keysize = sizeof(int32);
2611  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
2612 
2613  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
2614  HASH_ELEM | HASH_BLOBS);
2615 
2616  /*
2617  * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
2618  * the corresponding phase of backend shutdown.
2619  */
2620  Assert(MyProc != NULL);
2621  on_shmem_exit(AtProcExit_Buffers, 0);
2622 }
static void AtProcExit_Buffers(int code, Datum arg)
Definition: bufmgr.c:2629
struct PrivateRefCountEntry PrivateRefCountEntry
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:349
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:361
PGPROC * MyProc
Definition: proc.c:68
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76

References Assert(), AtProcExit_Buffers(), HASHCTL::entrysize, HASH_BLOBS, hash_create(), HASH_ELEM, HASHCTL::keysize, MyProc, on_shmem_exit(), PrivateRefCountArray, and PrivateRefCountHash.

Referenced by BaseInit().

◆ InvalidateBuffer()

static void InvalidateBuffer ( BufferDesc *  buf)
static

Definition at line 1475 of file bufmgr.c.

1476 {
1477  BufferTag oldTag;
1478  uint32 oldHash; /* hash value for oldTag */
1479  LWLock *oldPartitionLock; /* buffer partition lock for it */
1480  uint32 oldFlags;
1481  uint32 buf_state;
1482 
1483  /* Save the original buffer tag before dropping the spinlock */
1484  oldTag = buf->tag;
1485 
1486  buf_state = pg_atomic_read_u32(&buf->state);
1487  Assert(buf_state & BM_LOCKED);
1488  UnlockBufHdr(buf, buf_state);
1489 
1490  /*
1491  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1492  * worth storing the hashcode in BufferDesc so we need not recompute it
1493  * here? Probably not.
1494  */
1495  oldHash = BufTableHashCode(&oldTag);
1496  oldPartitionLock = BufMappingPartitionLock(oldHash);
1497 
1498 retry:
1499 
1500  /*
1501  * Acquire exclusive mapping lock in preparation for changing the buffer's
1502  * association.
1503  */
1504  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1505 
1506  /* Re-lock the buffer header */
1507  buf_state = LockBufHdr(buf);
1508 
1509  /* If it's changed while we were waiting for lock, do nothing */
1510  if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
1511  {
1512  UnlockBufHdr(buf, buf_state);
1513  LWLockRelease(oldPartitionLock);
1514  return;
1515  }
1516 
1517  /*
1518  * We assume the only reason for it to be pinned is that someone else is
1519  * flushing the page out. Wait for them to finish. (This could be an
1520  * infinite loop if the refcount is messed up... it would be nice to time
1521  * out after awhile, but there seems no way to be sure how many loops may
1522  * be needed. Note that if the other guy has pinned the buffer but not
1523  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1524  * be busy-looping here.)
1525  */
1526  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1527  {
1528  UnlockBufHdr(buf, buf_state);
1529  LWLockRelease(oldPartitionLock);
1530  /* safety check: should definitely not be our *own* pin */
1531  if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
1532  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1533  WaitIO(buf);
1534  goto retry;
1535  }
1536 
1537  /*
1538  * Clear out the buffer's tag and flags. We must do this to ensure that
1539  * linear scans of the buffer array don't think the buffer is valid.
1540  */
1541  oldFlags = buf_state & BUF_FLAG_MASK;
1542  CLEAR_BUFFERTAG(buf->tag);
1543  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1544  UnlockBufHdr(buf, buf_state);
1545 
1546  /*
1547  * Remove the buffer from the lookup hashtable, if it was in there.
1548  */
1549  if (oldFlags & BM_TAG_VALID)
1550  BufTableDelete(&oldTag, oldHash);
1551 
1552  /*
1553  * Done with mapping lock.
1554  */
1555  LWLockRelease(oldPartitionLock);
1556 
1557  /*
1558  * Insert the buffer at the head of the list of free buffers.
1559  */
1560  StrategyFreeBuffer(buf);
1561 }
#define BufferDescriptorGetBuffer(bdesc)
#define BUFFERTAGS_EQUAL(a, b)
#define BM_LOCKED
Definition: buf_internals.h:58
#define CLEAR_BUFFERTAG(a)
Definition: buf_internals.h:97
static void WaitIO(BufferDesc *buf)
Definition: bufmgr.c:4500
#define ERROR
Definition: elog.h:33
void StrategyFreeBuffer(BufferDesc *buf)
Definition: freelist.c:363

References Assert(), BM_LOCKED, BM_TAG_VALID, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer, BUFFERTAGS_EQUAL, BufMappingPartitionLock, BufTableDelete(), BufTableHashCode(), CLEAR_BUFFERTAG, elog, ERROR, GetPrivateRefCount(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u32(), StrategyFreeBuffer(), UnlockBufHdr, and WaitIO().

Referenced by DropDatabaseBuffers(), DropRelFileNodeBuffers(), DropRelFileNodesAllBuffers(), and FindAndDropRelFileNodeBuffers().

◆ IsBufferCleanupOK()

bool IsBufferCleanupOK ( Buffer  buffer)

Definition at line 4446 of file bufmgr.c.

4447 {
4448  BufferDesc *bufHdr;
4449  uint32 buf_state;
4450 
4451  Assert(BufferIsValid(buffer));
4452 
4453  if (BufferIsLocal(buffer))
4454  {
4455  /* There should be exactly one pin */
4456  if (LocalRefCount[-buffer - 1] != 1)
4457  return false;
4458  /* Nobody else to wait for */
4459  return true;
4460  }
4461 
4462  /* There should be exactly one local pin */
4463  if (GetPrivateRefCount(buffer) != 1)
4464  return false;
4465 
4466  bufHdr = GetBufferDescriptor(buffer - 1);
4467 
4468  /* caller must hold exclusive lock on buffer */
4469  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
4470  LW_EXCLUSIVE));
4471 
4472  buf_state = LockBufHdr(bufHdr);
4473 
4474  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4475  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
4476  {
4477  /* pincount is OK. */
4478  UnlockBufHdr(bufHdr, buf_state);
4479  return true;
4480  }
4481 
4482  UnlockBufHdr(bufHdr, buf_state);
4483  return false;
4484 }
bool LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
Definition: lwlock.c:1934

References Assert(), BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsValid, GetBufferDescriptor, GetPrivateRefCount(), LocalRefCount, LockBufHdr(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), and UnlockBufHdr.

Referenced by _hash_doinsert(), _hash_expandtable(), _hash_splitbucket(), hash_xlog_split_allocate_page(), and hashbucketcleanup().

◆ IssuePendingWritebacks()

void IssuePendingWritebacks ( WritebackContext *  context)

Definition at line 4924 of file bufmgr.c.

4925 {
4926  int i;
4927 
4928  if (context->nr_pending == 0)
4929  return;
4930 
4931  /*
4932  * Executing the writes in-order can make them a lot faster, and allows
4933  * writeback requests for consecutive blocks to be merged into larger writebacks.
4934  */
4935  sort_pending_writebacks(context->pending_writebacks, context->nr_pending);
4936 
4937  /*
4938  * Coalesce neighbouring writes, but nothing else. For that we iterate
4939  * through the, now sorted, array of pending flushes, and look forward to
4940  * find all neighbouring (or identical) writes.
4941  */
4942  for (i = 0; i < context->nr_pending; i++)
4943  {
4944  PendingWriteback *cur;
4945  PendingWriteback *next;
4946  SMgrRelation reln;
4947  int ahead;
4948  BufferTag tag;
4949  Size nblocks = 1;
4950 
4951  cur = &context->pending_writebacks[i];
4952  tag = cur->tag;
4953 
4954  /*
4955  * Peek ahead, into following writeback requests, to see if they can
4956  * be combined with the current one.
4957  */
4958  for (ahead = 0; i + ahead + 1 < context->nr_pending; ahead++)
4959  {
4960  next = &context->pending_writebacks[i + ahead + 1];
4961 
4962  /* different file, stop */
4963  if (!RelFileNodeEquals(cur->tag.rnode, next->tag.rnode) ||
4964  cur->tag.forkNum != next->tag.forkNum)
4965  break;
4966 
4967  /* ok, block queued twice, skip */
4968  if (cur->tag.blockNum == next->tag.blockNum)
4969  continue;
4970 
4971  /* only merge consecutive writes */
4972  if (cur->tag.blockNum + 1 != next->tag.blockNum)
4973  break;
4974 
4975  nblocks++;
4976  cur = next;
4977  }
4978 
4979  i += ahead;
4980 
4981  /* and finally tell the kernel to write the data to storage */
4982  reln = smgropen(tag.rnode, InvalidBackendId);
4983  smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
4984  }
4985 
4986  context->nr_pending = 0;
4987 }
static int32 next
Definition: blutils.c:219
struct cursor * cur
Definition: ecpg.c:28
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition: smgr.c:567
PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]

References buftag::blockNum, cur, buftag::forkNum, i, InvalidBackendId, next, WritebackContext::nr_pending, WritebackContext::pending_writebacks, RelFileNodeEquals, buftag::rnode, smgropen(), and smgrwriteback().

Referenced by BufferSync(), and ScheduleBufferTagForWriteback().
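
The loop above emits one smgrwriteback() call per run of consecutive blocks within a relation fork, folding duplicate requests into the run. A standalone sketch of the same coalescing over a sorted list of block numbers (values invented for the demo):

/* Standalone sketch of coalescing sorted, consecutive block numbers into ranges. */
#include <stdio.h>

int
main(void)
{
    unsigned    blocks[] = {5, 6, 7, 7, 9, 10, 42};    /* already sorted; 7 queued twice */
    int         n = 7;

    for (int i = 0; i < n; i++)
    {
        unsigned    start = blocks[i];
        unsigned    nblocks = 1;
        int         ahead;

        for (ahead = 0; i + ahead + 1 < n; ahead++)
        {
            unsigned    next = blocks[i + ahead + 1];

            if (next == blocks[i + ahead])
                continue;           /* duplicate request: fold it in */
            if (next != blocks[i + ahead] + 1)
                break;              /* not consecutive: start a new range */
            nblocks++;
        }
        i += ahead;

        /* one writeback request per coalesced range */
        printf("writeback start=%u nblocks=%u\n", start, nblocks);
    }
    return 0;
}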

◆ local_buffer_write_error_callback()

static void local_buffer_write_error_callback ( void *  arg)
static

Definition at line 4690 of file bufmgr.c.

4691 {
4692  BufferDesc *bufHdr = (BufferDesc *) arg;
4693 
4694  if (bufHdr != NULL)
4695  {
4696  char *path = relpathbackend(bufHdr->tag.rnode, MyBackendId,
4697  bufHdr->tag.forkNum);
4698 
4699  errcontext("writing block %u of relation %s",
4700  bufHdr->tag.blockNum, path);
4701  pfree(path);
4702  }
4703 }
#define errcontext
Definition: elog.h:190
void * arg
#define relpathbackend(rnode, backend, forknum)
Definition: relpath.h:78

References arg, buftag::blockNum, errcontext, buftag::forkNum, MyBackendId, pfree(), relpathbackend, buftag::rnode, and BufferDesc::tag.

Referenced by FlushRelationBuffers().

◆ LockBuffer()

void LockBuffer ( Buffer  buffer,
int  mode 
)

Definition at line 4156 of file bufmgr.c.

4157 {
4158  BufferDesc *buf;
4159 
4160  Assert(BufferIsPinned(buffer));
4161  if (BufferIsLocal(buffer))
4162  return; /* local buffers need no lock */
4163 
4164  buf = GetBufferDescriptor(buffer - 1);
4165 
4166  if (mode == BUFFER_LOCK_UNLOCK)
4167  LWLockRelease(BufferDescriptorGetContentLock(buf));
4168  else if (mode == BUFFER_LOCK_SHARE)
4169  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
4170  else if (mode == BUFFER_LOCK_EXCLUSIVE)
4171  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
4172  else
4173  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
4174 }
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:97
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:98
static PgChecksumMode mode
Definition: pg_checksums.c:65

References Assert(), buf, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsPinned, elog, ERROR, GetBufferDescriptor, LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), and mode.

Referenced by _bt_lockbuf(), _bt_unlockbuf(), _bt_upgradelockbufcleanup(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_finish_split(), _hash_first(), _hash_freeovflpage(), _hash_getbuf(), _hash_getbuf_with_strategy(), _hash_getcachedmetap(), _hash_getnewbuf(), _hash_init(), _hash_kill_items(), _hash_readnext(), _hash_readpage(), _hash_readprev(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), blbulkdelete(), blgetbitmap(), blinsert(), BloomNewBuffer(), blvacuumcleanup(), brin_doinsert(), brin_doupdate(), brin_evacuate_page(), brin_getinsertbuffer(), brin_page_cleanup(), brinbuild(), brinbuildempty(), bringetbitmap(), brinGetStats(), brinGetTupleForHeapBlock(), brininsert(), brinLockRevmapPageForUpdate(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), brinsummarize(), bt_metap(), bt_page_items_internal(), bt_page_stats_internal(), bt_recheck_sibling_links(), collect_corrupt_items(), collect_visibility_data(), collectMatchBitmap(), ConditionalLockBufferForCleanup(), count_nondeletable_pages(), entryLoadMoreItems(), fill_seq_fork_with_data(), FreeSpaceMapPrepareTruncateRel(), fsm_readbuf(), fsm_search(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), get_raw_page_internal(), GetVisibilityMapPins(), ginbuildempty(), ginbulkdelete(), ginEntryInsert(), ginFindLeafPage(), ginFindParents(), ginFinishSplit(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), ginInsertValue(), GinNewBuffer(), ginScanToDelete(), ginStepRight(), ginTraverseLock(), ginUpdateStats(), ginvacuumcleanup(), ginVacuumPostingTreeLeaves(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistbuildempty(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfinishsplit(), gistfixsplit(), gistformdownlink(), gistGetMaxLevel(), gistinserttuples(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_inplace_update(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_page_prune_opt(), heap_update(), heap_xlog_visible(), heapam_index_build_range_scan(), heapam_index_fetch_tuple(), heapam_index_validate_scan(), heapam_relation_copy_for_cluster(), heapam_scan_analyze_next_block(), heapam_scan_bitmap_next_block(), heapam_scan_sample_next_tuple(), heapam_tuple_satisfies_snapshot(), heapgetpage(), heapgettup(), initBloomState(), lazy_scan_heap(), lazy_scan_new_or_empty(), lazy_vacuum_heap_rel(), LockBufferForCleanup(), log_newpage_range(), palloc_btree_page(), pg_visibility(), pgrowlocks(), pgstat_btree_page(), pgstat_gist_page(), pgstat_heap(), pgstatginindex_internal(), pgstathashindex(), pgstatindex_impl(), read_seq_tuple(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), ScanSourceDatabasePgClass(), shiftList(), spgdoinsert(), spgGetCache(), SpGistNewBuffer(), spgprocesspending(), spgvacuumpage(), spgWalk(), startScanEntry(), statapprox_heap(), summarize_range(), UnlockReleaseBuffer(), verify_heapam(), verifyBackupPageConsistency(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), vm_readbuf(), XLogReadBufferExtended(), XLogReadBufferForRedoExtended(), and XLogRecordPageWithFreeSpace().
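
A hedged sketch of the ordinary pin, lock, unlock, unpin pattern around LockBuffer(). The helper name is invented; it assumes a normal backend context with an open Relation and a valid block number.

#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/relcache.h"

/* Hedged sketch: read one page of rel under a share lock. */
static void
examine_block_sketch(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);   /* pins the buffer */

    LockBuffer(buf, BUFFER_LOCK_SHARE);
    /* ... examine the page via BufferGetPage(buf) (read-only under a share lock) ... */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);

    ReleaseBuffer(buf);                         /* drop the pin */
}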

◆ LockBufferForCleanup()

void LockBufferForCleanup ( Buffer  buffer)

Definition at line 4213 of file bufmgr.c.

4214 {
4215  BufferDesc *bufHdr;
4216  char *new_status = NULL;
4217  TimestampTz waitStart = 0;
4218  bool logged_recovery_conflict = false;
4219 
4220  Assert(BufferIsPinned(buffer));
4221  Assert(PinCountWaitBuf == NULL);
4222 
4223  if (BufferIsLocal(buffer))
4224  {
4225  /* There should be exactly one pin */
4226  if (LocalRefCount[-buffer - 1] != 1)
4227  elog(ERROR, "incorrect local pin count: %d",
4228  LocalRefCount[-buffer - 1]);
4229  /* Nobody else to wait for */
4230  return;
4231  }
4232 
4233  /* There should be exactly one local pin */
4234  if (GetPrivateRefCount(buffer) != 1)
4235  elog(ERROR, "incorrect local pin count: %d",
4236  GetPrivateRefCount(buffer));
4237 
4238  bufHdr = GetBufferDescriptor(buffer - 1);
4239 
4240  for (;;)
4241  {
4242  uint32 buf_state;
4243 
4244  /* Try to acquire lock */
4245  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4246  buf_state = LockBufHdr(bufHdr);
4247 
4248  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4249  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
4250  {
4251  /* Successfully acquired exclusive lock with pincount 1 */
4252  UnlockBufHdr(bufHdr, buf_state);
4253 
4254  /*
4255  * Emit the log message if recovery conflict on buffer pin was
4256  * resolved but the startup process waited longer than
4257  * deadlock_timeout for it.
4258  */
4259  if (logged_recovery_conflict)
4260  LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
4261  waitStart, GetCurrentTimestamp(),
4262  NULL, false);
4263 
4264  /* Report change to non-waiting status */
4265  if (new_status)
4266  {
4267  set_ps_display(new_status);
4268  pfree(new_status);
4269  }
4270  return;
4271  }
4272  /* Failed, so mark myself as waiting for pincount 1 */
4273  if (buf_state & BM_PIN_COUNT_WAITER)
4274  {
4275  UnlockBufHdr(bufHdr, buf_state);
4276  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4277  elog(ERROR, "multiple backends attempting to wait for pincount 1");
4278  }
4279  bufHdr->wait_backend_pgprocno = MyProc->pgprocno;
4280  PinCountWaitBuf = bufHdr;
4281  buf_state |= BM_PIN_COUNT_WAITER;
4282  UnlockBufHdr(bufHdr, buf_state);
4283  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4284 
4285  /* Wait to be signaled by UnpinBuffer() */
4286  if (InHotStandby)
4287  {
4288  /* Report change to waiting status */
4289  if (update_process_title && new_status == NULL)
4290  {
4291  const char *old_status;
4292  int len;
4293 
4294  old_status = get_ps_display(&len);
4295  new_status = (char *) palloc(len + 8 + 1);
4296  memcpy(new_status, old_status, len);
4297  strcpy(new_status + len, " waiting");
4298  set_ps_display(new_status);
4299  new_status[len] = '\0'; /* truncate off " waiting" */
4300  }
4301 
4302  /*
4303  * Emit the log message if the startup process is waiting longer
4304  * than deadlock_timeout for recovery conflict on buffer pin.
4305  *
4306  * Skip this if first time through because the startup process has
4307  * not started waiting yet in this case. So, the wait start
4308  * timestamp is set after this logic.
4309  */
4310  if (waitStart != 0 && !logged_recovery_conflict)
4311  {
4312  TimestampTz now = GetCurrentTimestamp();
4313 
4314  if (TimestampDifferenceExceeds(waitStart, now,
4315  DeadlockTimeout))
4316  {
4317  LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
4318  waitStart, now, NULL, true);
4319  logged_recovery_conflict = true;
4320  }
4321  }
4322 
4323  /*
4324  * Set the wait start timestamp if logging is enabled and first
4325  * time through.
4326  */
4327  if (log_recovery_conflict_waits && waitStart == 0)
4328  waitStart = GetCurrentTimestamp();
4329 
4330  /* Publish the bufid that Startup process waits on */
4331  SetStartupBufferPinWaitBufId(buffer - 1);
4332  /* Set alarm and then wait to be signaled by UnpinBuffer() */
4333  ResolveRecoveryConflictWithBufferPin();
4334  /* Reset the published bufid */
4335  SetStartupBufferPinWaitBufId(-1);
4336  }
4337  else
4338  ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
4339 
4340  /*
4341  * Remove flag marking us as waiter. Normally this will not be set
4342  * anymore, but ProcWaitForSignal() can return for other signals as
4343  * well. We take care to only reset the flag if we're the waiter, as
4344  * theoretically another backend could have started waiting. That's
4345  * impossible with the current usages due to table level locking, but
4346  * better be safe.
4347  */
4348  buf_state = LockBufHdr(bufHdr);
4349  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
4350  bufHdr->wait_backend_pgprocno == MyProc->pgprocno)
4351  buf_state &= ~BM_PIN_COUNT_WAITER;
4352  UnlockBufHdr(bufHdr, buf_state);
4353 
4354  PinCountWaitBuf = NULL;
4355  /* Loop back and try again */
4356  }
4357 }
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1705
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1574
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1538
#define BM_PIN_COUNT_WAITER
Definition: buf_internals.h:65
static BufferDesc * PinCountWaitBuf
Definition: bufmgr.c:167
int64 TimestampTz
Definition: timestamp.h:39
const void size_t len
@ PROCSIG_RECOVERY_CONFLICT_BUFFERPIN
Definition: procsignal.h:44
const char * get_ps_display(int *displen)
Definition: ps_status.c:430
bool update_process_title
Definition: ps_status.c:36
void set_ps_display(const char *activity)
Definition: ps_status.c:349
int DeadlockTimeout
Definition: proc.c:60
void SetStartupBufferPinWaitBufId(int bufid)
Definition: proc.c:633
void ProcWaitForSignal(uint32 wait_event_info)
Definition: proc.c:1873
void ResolveRecoveryConflictWithBufferPin(void)
Definition: standby.c:755
bool log_recovery_conflict_waits
Definition: standby.c:43
void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition: standby.c:251
int wait_backend_pgprocno
int pgprocno
Definition: proc.h:188
#define PG_WAIT_BUFFER_PIN
Definition: wait_event.h:20
#define InHotStandby
Definition: xlogutils.h:57

References Assert(), BM_PIN_COUNT_WAITER, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsPinned, DeadlockTimeout, elog, ERROR, get_ps_display(), GetBufferDescriptor, GetCurrentTimestamp(), GetPrivateRefCount(), InHotStandby, len, LocalRefCount, LockBuffer(), LockBufHdr(), log_recovery_conflict_waits, LogRecoveryConflict(), MyProc, now(), palloc(), pfree(), PG_WAIT_BUFFER_PIN, PGPROC::pgprocno, PinCountWaitBuf, PROCSIG_RECOVERY_CONFLICT_BUFFERPIN, ProcWaitForSignal(), ResolveRecoveryConflictWithBufferPin(), set_ps_display(), SetStartupBufferPinWaitBufId(), TimestampDifferenceExceeds(), UnlockBufHdr, update_process_title, and BufferDesc::wait_backend_pgprocno.

Referenced by _bt_upgradelockbufcleanup(), ginVacuumPostingTree(), hashbulkdelete(), heap_force_common(), lazy_scan_heap(), ReadBuffer_common(), and XLogReadBufferForRedoExtended().
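
A hypothetical caller, for illustration only (not part of bufmgr.c): code such as VACUUM pins a block first and then upgrades to the cleanup lock before rewriting the page. The relation rel and block number blkno below are assumed.

Buffer		buf = ReadBuffer(rel, blkno);	/* acquires a pin */

LockBufferForCleanup(buf);		/* waits until ours is the only pin */
/* ... prune or defragment BufferGetPage(buf) here ... */
UnlockReleaseBuffer(buf);		/* releases lock and pin together */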

◆ LockBufHdr()

uint32 LockBufHdr ( BufferDesc desc)

Definition at line 4736 of file bufmgr.c.

4737 {
4738  SpinDelayStatus delayStatus;
4739  uint32 old_buf_state;
4740 
4741  init_local_spin_delay(&delayStatus);
4742 
4743  while (true)
4744  {
4745  /* set BM_LOCKED flag */
4746  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
4747  /* if it wasn't set before we're OK */
4748  if (!(old_buf_state & BM_LOCKED))
4749  break;
4750  perform_spin_delay(&delayStatus);
4751  }
4752  finish_spin_delay(&delayStatus);
4753  return old_buf_state | BM_LOCKED;
4754 }
static uint32 pg_atomic_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_)
Definition: atomics.h:372
void perform_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:125
void finish_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:175
#define init_local_spin_delay(status)
Definition: s_lock.h:1084

References BM_LOCKED, finish_spin_delay(), init_local_spin_delay, perform_spin_delay(), pg_atomic_fetch_or_u32(), and BufferDesc::state.

Referenced by AbortBufferIO(), apw_dump_now(), BufferAlloc(), BufferGetLSNAtomic(), BufferSync(), ConditionalLockBufferForCleanup(), DropDatabaseBuffers(), DropRelFileNodeBuffers(), DropRelFileNodesAllBuffers(), FindAndDropRelFileNodeBuffers(), FlushBuffer(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetBufferFromRing(), InvalidateBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkBufferDirtyHint(), pg_buffercache_pages(), ReadBuffer_common(), ReadRecentBuffer(), StartBufferIO(), StrategyGetBuffer(), SyncOneBuffer(), TerminateBufferIO(), UnlockBuffers(), UnpinBuffer(), and WaitIO().
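
For illustration (a sketch, not taken from the source): LockBufHdr() always pairs with UnlockBufHdr(), and the returned value, which already has BM_LOCKED set, is the starting point for any state update. Here bufHdr is assumed to be a valid shared BufferDesc pointer.

uint32		buf_state;

buf_state = LockBufHdr(bufHdr);		/* spin until we own BM_LOCKED */
/* ... examine or adjust refcount/usagecount/flag bits in buf_state ... */
UnlockBufHdr(bufHdr, buf_state);	/* stores the new state and clears BM_LOCKED */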

◆ MarkBufferDirty()

void MarkBufferDirty ( Buffer  buffer)

Definition at line 1573 of file bufmgr.c.

1574 {
1575  BufferDesc *bufHdr;
1576  uint32 buf_state;
1577  uint32 old_buf_state;
1578 
1579  if (!BufferIsValid(buffer))
1580  elog(ERROR, "bad buffer ID: %d", buffer);
1581 
1582  if (BufferIsLocal(buffer))
1583  {
1584  MarkLocalBufferDirty(buffer);
1585  return;
1586  }
1587 
1588  bufHdr = GetBufferDescriptor(buffer - 1);
1589 
1590  Assert(BufferIsPinned(buffer));
1591  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
1592  LW_EXCLUSIVE));
1593 
1594  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
1595  for (;;)
1596  {
1597  if (old_buf_state & BM_LOCKED)
1598  old_buf_state = WaitBufHdrUnlocked(bufHdr);
1599 
1600  buf_state = old_buf_state;
1601 
1602  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1603  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
1604 
1605  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
1606  buf_state))
1607  break;
1608  }
1609 
1610  /*
1611  * If the buffer was not dirty already, do vacuum accounting.
1612  */
1613  if (!(old_buf_state & BM_DIRTY))
1614  {
1615  VacuumPageDirty++;
1616  pgBufferUsage.shared_blks_dirtied++;
1617  if (VacuumCostActive)
1618  VacuumCostBalance += VacuumCostPageDirty;
1619  }
1620 }
static bool pg_atomic_compare_exchange_u32(volatile pg_atomic_uint32 *ptr, uint32 *expected, uint32 newval)
Definition: atomics.h:311
static uint32 WaitBufHdrUnlocked(BufferDesc *buf)
Definition: bufmgr.c:4764
bool VacuumCostActive
Definition: globals.c:153
int64 VacuumPageDirty
Definition: globals.c:150
int VacuumCostBalance
Definition: globals.c:152
int VacuumCostPageDirty
Definition: globals.c:144
void MarkLocalBufferDirty(Buffer buffer)
Definition: localbuf.c:286
int64 shared_blks_dirtied
Definition: instrument.h:28

References Assert(), BM_DIRTY, BM_JUST_DIRTIED, BM_LOCKED, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsPinned, BufferIsValid, elog, ERROR, GetBufferDescriptor, LW_EXCLUSIVE, LWLockHeldByMeInMode(), MarkLocalBufferDirty(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), pgBufferUsage, BufferUsage::shared_blks_dirtied, BufferDesc::state, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, VacuumPageDirty, and WaitBufHdrUnlocked().

Referenced by _bt_clear_incomplete_split(), _bt_dedup_pass(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_getroot(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_newroot(), _bt_restore_meta(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_freeovflpage(), _hash_init(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), addLeafTuple(), brin_doinsert(), brin_doupdate(), brin_initialize_empty_new_buffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinRevmapDesummarizeRange(), btree_xlog_dedup(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), createPostingTree(), do_setval(), doPickSplit(), fill_seq_fork_with_data(), FreeSpaceMapPrepareTruncateRel(), generic_redo(), GenericXLogFinish(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginDeletePage(), ginHeapTupleFastInsert(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginUpdateStats(), ginVacuumPostingTreeLeaf(), gistbuild(), gistbuildempty(), gistdeletepage(), gistplacetopage(), gistprunepage(), gistRedoClearFollowRight(), gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_squeeze_page(), hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_finish_speculative(), heap_force_common(), heap_inplace_update(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_page_prune(), heap_update(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_freeze_page(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_prune(), heap_xlog_update(), heap_xlog_vacuum(), heap_xlog_visible(), lazy_scan_heap(), lazy_scan_new_or_empty(), lazy_scan_prune(), lazy_vacuum_heap_page(), log_newpage_range(), moveLeafs(), nextval_internal(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), revmap_physical_extend(), saveNodeLink(), seq_redo(), shiftList(), spgAddNodeAction(), spgbuild(), SpGistUpdateMetaPage(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), vacuumLeafPage(), vacuumLeafRoot(), vacuumRedirectAndPlaceholder(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), writeListPage(), and XLogReadBufferForRedoExtended().
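
A minimal usage sketch (assumptions: an already pinned buffer buf, and WAL-logging via a full-page image as RelationCopyStorageUsingBuffer() does below; real access methods normally build their own WAL records instead):

LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

START_CRIT_SECTION();
/* ... modify BufferGetPage(buf) ... */
MarkBufferDirty(buf);			/* dirty the buffer before emitting WAL */
log_newpage_buffer(buf, true);	/* WAL-logs an FPI and sets the page LSN */
END_CRIT_SECTION();

UnlockReleaseBuffer(buf);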

◆ MarkBufferDirtyHint()

void MarkBufferDirtyHint ( Buffer  buffer,
bool  buffer_std 
)

Definition at line 3985 of file bufmgr.c.

3986 {
3987  BufferDesc *bufHdr;
3988  Page page = BufferGetPage(buffer);
3989 
3990  if (!BufferIsValid(buffer))
3991  elog(ERROR, "bad buffer ID: %d", buffer);
3992 
3993  if (BufferIsLocal(buffer))
3994  {
3995  MarkLocalBufferDirty(buffer);
3996  return;
3997  }
3998 
3999  bufHdr = GetBufferDescriptor(buffer - 1);
4000 
4001  Assert(GetPrivateRefCount(buffer) > 0);
4002  /* here, either share or exclusive lock is OK */
4003  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
4004 
4005  /*
4006  * This routine might get called many times on the same page, if we are
4007  * making the first scan after commit of an xact that added/deleted many
4008  * tuples. So, be as quick as we can if the buffer is already dirty. We
4009  * do this by not acquiring spinlock if it looks like the status bits are
4010  * already set. Since we make this test unlocked, there's a chance we
4011  * might fail to notice that the flags have just been cleared, and failed
4012  * to reset them, due to memory-ordering issues. But since this function
4013  * is only intended to be used in cases where failing to write out the
4014  * data would be harmless anyway, it doesn't really matter.
4015  */
4016  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
4017  (BM_DIRTY | BM_JUST_DIRTIED))
4018  {
4019  XLogRecPtr lsn = InvalidXLogRecPtr;
4020  bool dirtied = false;
4021  bool delayChkptFlags = false;
4022  uint32 buf_state;
4023 
4024  /*
4025  * If we need to protect hint bit updates from torn writes, WAL-log a
4026  * full page image of the page. This full page image is only necessary
4027  * if the hint bit update is the first change to the page since the
4028  * last checkpoint.
4029  *
4030  * We don't check full_page_writes here because that logic is included
4031  * when we call XLogInsert() since the value changes dynamically.
4032  */
4033  if (XLogHintBitIsNeeded() &&
4034  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
4035  {
4036  /*
4037  * If we must not write WAL, due to a relfilenode-specific
4038  * condition or being in recovery, don't dirty the page. We can
4039  * set the hint, just not dirty the page as a result so the hint
4040  * is lost when we evict the page or shutdown.
4041  *
4042  * See src/backend/storage/page/README for longer discussion.
4043  */
4044  if (RecoveryInProgress() ||
4045  RelFileNodeSkippingWAL(bufHdr->tag.rnode))
4046  return;
4047 
4048  /*
4049  * If the block is already dirty because we either made a change
4050  * or set a hint already, then we don't need to write a full page
4051  * image. Note that aggressive cleaning of blocks dirtied by hint
4052  * bit setting would increase the call rate. Bulk setting of hint
4053  * bits would reduce the call rate...
4054  *
4055  * We must issue the WAL record before we mark the buffer dirty.
4056  * Otherwise we might write the page before we write the WAL. That
4057  * causes a race condition, since a checkpoint might occur between
4058  * writing the WAL record and marking the buffer dirty. We solve
4059  * that with a kluge, but one that is already in use during
4060  * transaction commit to prevent race conditions. Basically, we
4061  * simply prevent the checkpoint WAL record from being written
4062  * until we have marked the buffer dirty. We don't start the
4063  * checkpoint flush until we have marked dirty, so our checkpoint
4064  * must flush the change to disk successfully or the checkpoint
4065  * never gets written, so crash recovery will fix.
4066  *
4067  * It's possible we may enter here without an xid, so it is
4068  * essential that CreateCheckPoint waits for virtual transactions
4069  * rather than full transactionids.
4070  */
4071  Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
4072  MyProc->delayChkptFlags |= DELAY_CHKPT_START;
4073  delayChkptFlags = true;
4074  lsn = XLogSaveBufferForHint(buffer, buffer_std);
4075  }
4076 
4077  buf_state = LockBufHdr(bufHdr);
4078 
4079  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4080 
4081  if (!(buf_state & BM_DIRTY))
4082  {
4083  dirtied = true; /* Means "will be dirtied by this action" */
4084 
4085  /*
4086  * Set the page LSN if we wrote a backup block. We aren't supposed
4087  * to set this when only holding a share lock but as long as we
4088  * serialise it somehow we're OK. We choose to set LSN while
4089  * holding the buffer header lock, which causes any reader of an
4090  * LSN who holds only a share lock to also obtain a buffer header
4091  * lock before using PageGetLSN(), which is enforced in
4092  * BufferGetLSNAtomic().
4093  *
4094  * If checksums are enabled, you might think we should reset the
4095  * checksum here. That will happen when the page is written
4096  * sometime later in this checkpoint cycle.
4097  */
4098  if (!XLogRecPtrIsInvalid(lsn))
4099  PageSetLSN(page, lsn);
4100  }
4101 
4102  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
4103  UnlockBufHdr(bufHdr, buf_state);
4104 
4105  if (delayChkptFlags)
4106  MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
4107 
4108  if (dirtied)
4109  {
4110  VacuumPageDirty++;
4111  pgBufferUsage.shared_blks_dirtied++;
4112  if (VacuumCostActive)
4113  VacuumCostBalance += VacuumCostPageDirty;
4114  }
4115  }
4116 }
#define PageSetLSN(page, lsn)
Definition: bufpage.h:367
#define DELAY_CHKPT_START
Definition: proc.h:117
bool RelFileNodeSkippingWAL(RelFileNode rnode)
Definition: storage.c:550
int delayChkptFlags
Definition: proc.h:225
bool RecoveryInProgress(void)
Definition: xlog.c:5753
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
Definition: xloginsert.c:1005

References Assert(), BM_DIRTY, BM_JUST_DIRTIED, BM_PERMANENT, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock, BufferGetPage, BufferIsLocal, BufferIsValid, DELAY_CHKPT_START, PGPROC::delayChkptFlags, elog, ERROR, GetBufferDescriptor, GetPrivateRefCount(), InvalidXLogRecPtr, LockBufHdr(), LWLockHeldByMe(), MarkLocalBufferDirty(), MyProc, PageSetLSN, pg_atomic_read_u32(), pgBufferUsage, RecoveryInProgress(), RelFileNodeSkippingWAL(), buftag::rnode, BufferUsage::shared_blks_dirtied, BufferDesc::state, BufferDesc::tag, UnlockBufHdr, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, VacuumPageDirty, XLogHintBitIsNeeded, XLogRecPtrIsInvalid, and XLogSaveBufferForHint().

Referenced by _bt_check_unique(), _bt_killitems(), _hash_kill_items(), brin_start_evacuating_page(), btvacuumpage(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), gistkillitems(), heap_page_prune(), read_seq_tuple(), SetHintBits(), and XLogRecordPageWithFreeSpace().
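
An illustrative sketch (not from the source) of the hint-bit style of update this function is meant for, assuming a pinned buffer buf holding a standard-format page; the change may be lost if the buffer is never written out, which must be acceptable to the caller:

LockBuffer(buf, BUFFER_LOCK_SHARE);
/* ... set a hint bit on a tuple in BufferGetPage(buf) ... */
MarkBufferDirtyHint(buf, true);		/* buffer_std = true: standard page layout */
LockBuffer(buf, BUFFER_LOCK_UNLOCK);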

◆ NewPrivateRefCountEntry()

static PrivateRefCountEntry * NewPrivateRefCountEntry ( Buffer  buffer)
static

Definition at line 283 of file bufmgr.c.

284 {
285  PrivateRefCountEntry *res;
286 
287  /* only allowed to be called when a reservation has been made */
288  Assert(ReservedRefCountEntry != NULL);
289 
290  /* use up the reserved entry */
291  res = ReservedRefCountEntry;
292  ReservedRefCountEntry = NULL;
293 
294  /* and fill it */
295  res->buffer = buffer;
296  res->refcount = 0;
297 
298  return res;
299 }

References Assert(), PrivateRefCountEntry::buffer, res, and ReservedRefCountEntry.

Referenced by PinBuffer(), and PinBuffer_Locked().

◆ PinBuffer()

static bool PinBuffer ( BufferDesc buf,
BufferAccessStrategy  strategy 
)
static

Definition at line 1694 of file bufmgr.c.

1695 {
1696  Buffer b = BufferDescriptorGetBuffer(buf);
1697  bool result;
1698  PrivateRefCountEntry *ref;
1699 
1700  ref = GetPrivateRefCountEntry(b, true);
1701 
1702  if (ref == NULL)
1703  {
1704  uint32 buf_state;
1705  uint32 old_buf_state;
1706 
1707  ReservePrivateRefCountEntry();
1708  ref = NewPrivateRefCountEntry(b);
1709 
1710  old_buf_state = pg_atomic_read_u32(&buf->state);
1711  for (;;)
1712  {
1713  if (old_buf_state & BM_LOCKED)
1714  old_buf_state = WaitBufHdrUnlocked(buf);
1715 
1716  buf_state = old_buf_state;
1717 
1718  /* increase refcount */
1719  buf_state += BUF_REFCOUNT_ONE;
1720 
1721  if (strategy == NULL)
1722  {
1723  /* Default case: increase usagecount unless already max. */
1724  if (BUF_STATE_GET_USAGECOUNT(buf_state) != BM_MAX_USAGE_COUNT)
1725  buf_state += BUF_USAGECOUNT_ONE;
1726  }
1727  else
1728  {
1729  /*
1730  * Ring buffers shouldn't evict others from pool. Thus we
1731  * don't make usagecount more than 1.
1732  */
1733  if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
1734  buf_state += BUF_USAGECOUNT_ONE;
1735  }
1736 
1737  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1738  buf_state))
1739  {
1740  result = (buf_state & BM_VALID) != 0;
1741 
1742  /*
1743  * Assume that we acquired a buffer pin for the purposes of
1744  * Valgrind buffer client checks (even in !result case) to
1745  * keep things simple. Buffers that are unsafe to access are
1746  * not generally guaranteed to be marked undefined or
1747  * non-accessible in any case.
1748  */
1749  VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
1750  break;
1751  }
1752  }
1753  }
1754  else
1755  {
1756  /*
1757  * If we previously pinned the buffer, it must surely be valid.
1758  *
1759  * Note: We deliberately avoid a Valgrind client request here.
1760  * Individual access methods can optionally superimpose buffer page
1761  * client requests on top of our client requests to enforce that
1762  * buffers are only accessed while locked (and pinned). It's possible
1763  * that the buffer page is legitimately non-accessible here. We
1764  * cannot meddle with that.
1765  */
1766  result = true;
1767  }
1768 
1769  ref->refcount++;
1770  Assert(ref->refcount > 0);
1771  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1772  return result;
1773 }
#define BM_MAX_USAGE_COUNT
Definition: buf_internals.h:76
#define BUF_REFCOUNT_ONE
Definition: buf_internals.h:41
#define BUF_STATE_GET_USAGECOUNT(state)
Definition: buf_internals.h:50
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition: bufmgr.c:283
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26

References Assert(), b, BM_LOCKED, BM_MAX_USAGE_COUNT, BM_VALID, buf, BUF_REFCOUNT_ONE, BUF_STATE_GET_USAGECOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer, BufHdrGetBlock, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ReservePrivateRefCountEntry(), ResourceOwnerRememberBuffer(), VALGRIND_MAKE_MEM_DEFINED, and WaitBufHdrUnlocked().

Referenced by BufferAlloc(), and ReadRecentBuffer().

◆ PinBuffer_Locked()

static void PinBuffer_Locked ( BufferDesc buf)
static

Definition at line 1797 of file bufmgr.c.

1798 {
1799  Buffer b;
1800  PrivateRefCountEntry *ref;
1801  uint32 buf_state;
1802 
1803  /*
1804  * As explained, We don't expect any preexisting pins. That allows us to
1805  * manipulate the PrivateRefCount after releasing the spinlock
1806  */
1807  Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
1808 
1809  /*
1810  * Buffer can't have a preexisting pin, so mark its page as defined to
1811  * Valgrind (this is similar to the PinBuffer() case where the backend
1812  * doesn't already have a buffer pin)
1813  */
1814  VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
1815 
1816  /*
1817  * Since we hold the buffer spinlock, we can update the buffer state and
1818  * release the lock in one operation.
1819  */
1820  buf_state = pg_atomic_read_u32(&buf->state);
1821  Assert(buf_state & BM_LOCKED);
1822  buf_state += BUF_REFCOUNT_ONE;
1823  UnlockBufHdr(buf, buf_state);
1824 
1825  b = BufferDescriptorGetBuffer(buf);
1826 
1827  ref = NewPrivateRefCountEntry(b);
1828  ref->refcount++;
1829 
1830  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1831 }

References Assert(), b, BM_LOCKED, buf, BUF_REFCOUNT_ONE, BufferDescriptorGetBuffer, BufHdrGetBlock, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ResourceOwnerRememberBuffer(), UnlockBufHdr, and VALGRIND_MAKE_MEM_DEFINED.

Referenced by BufferAlloc(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), ReadRecentBuffer(), and SyncOneBuffer().

◆ PrefetchBuffer()

PrefetchBufferResult PrefetchBuffer ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 592 of file bufmgr.c.

593 {
594  Assert(RelationIsValid(reln));
595  Assert(BlockNumberIsValid(blockNum));
596 
597  if (RelationUsesLocalBuffers(reln))
598  {
599  /* see comments in ReadBufferExtended */
600  if (RELATION_IS_OTHER_TEMP(reln))
601  ereport(ERROR,
602  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
603  errmsg("cannot access temporary tables of other sessions")));
604 
605  /* pass it off to localbuf.c */
606  return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
607  }
608  else
609  {
610  /* pass it to the shared buffer version */
611  return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
612  }
613 }
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:505
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition: localbuf.c:64
#define RELATION_IS_OTHER_TEMP(relation)
Definition: rel.h:643
#define RelationIsValid(relation)
Definition: rel.h:462

References Assert(), BlockNumberIsValid, ereport, errcode(), errmsg(), ERROR, PrefetchLocalBuffer(), PrefetchSharedBuffer(), RELATION_IS_OTHER_TEMP, RelationGetSmgr(), RelationIsValid, and RelationUsesLocalBuffers.

Referenced by acquire_sample_rows(), BitmapPrefetch(), count_nondeletable_pages(), and pg_prewarm().
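
A hypothetical read-ahead loop, for illustration (rel, start and end are assumed): issue the prefetch hints first, then read the blocks normally, so the asynchronous smgrprefetch() requests can overlap with useful work.

BlockNumber blkno;

for (blkno = start; blkno < end; blkno++)
	(void) PrefetchBuffer(rel, MAIN_FORKNUM, blkno);

for (blkno = start; blkno < end; blkno++)
{
	Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
										 RBM_NORMAL, NULL);

	/* ... process the page ... */
	ReleaseBuffer(buf);
}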

◆ PrefetchSharedBuffer()

PrefetchBufferResult PrefetchSharedBuffer ( SMgrRelation  smgr_reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 505 of file bufmgr.c.

508 {
509  PrefetchBufferResult result = {InvalidBuffer, false};
510  BufferTag newTag; /* identity of requested block */
511  uint32 newHash; /* hash value for newTag */
512  LWLock *newPartitionLock; /* buffer partition lock for it */
513  int buf_id;
514 
515  Assert(BlockNumberIsValid(blockNum));
516 
517  /* create a tag so we can lookup the buffer */
518  INIT_BUFFERTAG(newTag, smgr_reln->smgr_rnode.node,
519  forkNum, blockNum);
520 
521  /* determine its hash code and partition lock ID */
522  newHash = BufTableHashCode(&newTag);
523  newPartitionLock = BufMappingPartitionLock(newHash);
524 
525  /* see if the block is in the buffer pool already */
526  LWLockAcquire(newPartitionLock, LW_SHARED);
527  buf_id = BufTableLookup(&newTag, newHash);
528  LWLockRelease(newPartitionLock);
529 
530  /* If not in buffers, initiate prefetch */
531  if (buf_id < 0)
532  {
533 #ifdef USE_PREFETCH
534  /*
535  * Try to initiate an asynchronous read. This returns false in
536  * recovery if the relation file doesn't exist.
537  */
538  if (smgrprefetch(smgr_reln, forkNum, blockNum))
539  result.initiated_io = true;
540 #endif /* USE_PREFETCH */
541  }
542  else
543  {
544  /*
545  * Report the buffer it was in at that time. The caller may be able
546  * to avoid a buffer table lookup, but it's not pinned and it must be
547  * rechecked!
548  */
549  result.recent_buffer = buf_id + 1;
550  }
551 
552  /*
553  * If the block *is* in buffers, we do nothing. This is not really ideal:
554  * the block might be just about to be evicted, which would be stupid
555  * since we know we are going to need it soon. But the only easy answer
556  * is to bump the usage_count, which does not seem like a great solution:
557  * when the caller does ultimately touch the block, usage_count would get
558  * bumped again, resulting in too much favoritism for blocks that are
559  * involved in a prefetch sequence. A real fix would involve some
560  * additional per-buffer state, and it's not clear that there's enough of
561  * a problem to justify that.
562  */
563 
564  return result;
565 }
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition: smgr.c:518
Buffer recent_buffer
Definition: bufmgr.h:54

References Assert(), BlockNumberIsValid, BufMappingPartitionLock, BufTableHashCode(), BufTableLookup(), INIT_BUFFERTAG, PrefetchBufferResult::initiated_io, InvalidBuffer, LW_SHARED, LWLockAcquire(), LWLockRelease(), RelFileNodeBackend::node, PrefetchBufferResult::recent_buffer, SMgrRelationData::smgr_rnode, and smgrprefetch().

Referenced by PrefetchBuffer(), and XLogPrefetcherNextBlock().

◆ PrintBufferLeakWarning()

void PrintBufferLeakWarning ( Buffer  buffer)

Definition at line 2688 of file bufmgr.c.

2689 {
2690  BufferDesc *buf;
2691  int32 loccount;
2692  char *path;
2693  BackendId backend;
2694  uint32 buf_state;
2695 
2696  Assert(BufferIsValid(buffer));
2697  if (BufferIsLocal(buffer))
2698  {
2699  buf = GetLocalBufferDescriptor(-buffer - 1);
2700  loccount = LocalRefCount[-buffer - 1];
2701  backend = MyBackendId;
2702  }
2703  else
2704  {
2705  buf = GetBufferDescriptor(buffer - 1);
2706  loccount = GetPrivateRefCount(buffer);
2707  backend = InvalidBackendId;
2708  }
2709 
2710  /* theoretically we should lock the bufhdr here */
2711  path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
2712  buf_state = pg_atomic_read_u32(&buf->state);
2713  elog(WARNING,
2714  "buffer refcount leak: [%03d] "
2715  "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
2716  buffer, path,
2717  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
2718  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
2719  pfree(path);
2720 }
int BackendId
Definition: backendid.h:21

References Assert(), buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid, elog, GetBufferDescriptor, GetLocalBufferDescriptor, GetPrivateRefCount(), InvalidBackendId, LocalRefCount, MyBackendId, pfree(), pg_atomic_read_u32(), relpathbackend, and WARNING.

Referenced by CheckForBufferLeaks(), CheckForLocalBufferLeaks(), and ResourceOwnerReleaseInternal().

◆ ReadBuffer()

Buffer ReadBuffer ( Relation  reln,
BlockNumber  blockNum 
)

Definition at line 702 of file bufmgr.c.

703 {
704  return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
705 }
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:749
@ RBM_NORMAL
Definition: bufmgr.h:39

References MAIN_FORKNUM, RBM_NORMAL, and ReadBufferExtended().

Referenced by _bt_getbuf(), _bt_search_insert(), _hash_getbuf(), _hash_getbuf_with_condlock_cleanup(), blbulkdelete(), blinsert(), BloomNewBuffer(), brin_getinsertbuffer(), brinbuild(), brinGetStats(), brinGetTupleForHeapBlock(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), bt_metap(), bt_page_items_internal(), bt_page_stats_internal(), ginFindLeafPage(), ginFindParents(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), GinNewBuffer(), ginStepRight(), ginUpdateStats(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfixsplit(), gistGetMaxLevel(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), heap_abort_speculative(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_force_common(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_inplace_update(), heap_lock_tuple(), heap_update(), initBloomState(), pg_visibility(), pgstatginindex_internal(), read_seq_tuple(), RelationGetBufferForTuple(), ReleaseAndReadBuffer(), revmap_get_buffer(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), shiftList(), spgdoinsert(), spgGetCache(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), and spgWalk().
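
A minimal read-only sketch (rel and blkno are assumed): ReadBuffer() only pins the buffer; the caller still takes the content lock before looking at the page.

Buffer		buf = ReadBuffer(rel, blkno);	/* pinned, not locked */
Page		page;

LockBuffer(buf, BUFFER_LOCK_SHARE);
page = BufferGetPage(buf);
/* ... read tuples from page ... */
UnlockReleaseBuffer(buf);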

◆ ReadBuffer_common()

static Buffer ReadBuffer_common ( SMgrRelation  reln,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy,
bool hit 
)
static

Definition at line 809 of file bufmgr.c.

812 {
813  BufferDesc *bufHdr;
814  Block bufBlock;
815  bool found;
816  bool isExtend;
817  bool isLocalBuf = SmgrIsTemp(smgr);
818 
819  *hit = false;
820 
821  /* Make sure we will have room to remember the buffer pin */
822  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
823 
824  isExtend = (blockNum == P_NEW);
825 
826  TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
827  smgr->smgr_rnode.node.spcNode,
828  smgr->smgr_rnode.node.dbNode,
829  smgr->smgr_rnode.node.relNode,
830  smgr->smgr_rnode.backend,
831  isExtend);
832 
833  /* Substitute proper block number if caller asked for P_NEW */
834  if (isExtend)
835  {
836  blockNum = smgrnblocks(smgr, forkNum);
837  /* Fail if relation is already at maximum possible length */
838  if (blockNum == P_NEW)
839  ereport(ERROR,
840  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
841  errmsg("cannot extend relation %s beyond %u blocks",
842  relpath(smgr->smgr_rnode, forkNum),
843  P_NEW)));
844  }
845 
846  if (isLocalBuf)
847  {
848  bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
849  if (found)
850  pgBufferUsage.local_blks_hit++;
851  else if (isExtend)
852  pgBufferUsage.local_blks_written++;
853  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
854  mode == RBM_ZERO_ON_ERROR)
855  pgBufferUsage.local_blks_read++;
856  }
857  else
858  {
859  /*
860  * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
861  * not currently in memory.
862  */
863  bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
864  strategy, &found);
865  if (found)
866  pgBufferUsage.shared_blks_hit++;
867  else if (isExtend)
868  pgBufferUsage.shared_blks_written++;
869  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
870  mode == RBM_ZERO_ON_ERROR)
871  pgBufferUsage.shared_blks_read++;
872  }
873 
874  /* At this point we do NOT hold any locks. */
875 
876  /* if it was already in the buffer pool, we're done */
877  if (found)
878  {
879  if (!isExtend)
880  {
881  /* Just need to update stats before we exit */
882  *hit = true;
883  VacuumPageHit++;
884 
885  if (VacuumCostActive)
886  VacuumCostBalance += VacuumCostPageHit;
887 
888  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
889  smgr->smgr_rnode.node.spcNode,
890  smgr->smgr_rnode.node.dbNode,
891  smgr->smgr_rnode.node.relNode,
892  smgr->smgr_rnode.backend,
893  isExtend,
894  found);
895 
896  /*
897  * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
898  * locked on return.
899  */
900  if (!isLocalBuf)
901  {
902  if (mode == RBM_ZERO_AND_LOCK)
903  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
904  LW_EXCLUSIVE);
905  else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
906  LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
907  }
908 
909  return BufferDescriptorGetBuffer(bufHdr);
910  }
911 
912  /*
913  * We get here only in the corner case where we are trying to extend
914  * the relation but we found a pre-existing buffer marked BM_VALID.
915  * This can happen because mdread doesn't complain about reads beyond
916  * EOF (when zero_damaged_pages is ON) and so a previous attempt to
917  * read a block beyond EOF could have left a "valid" zero-filled
918  * buffer. Unfortunately, we have also seen this case occurring
919  * because of buggy Linux kernels that sometimes return an
920  * lseek(SEEK_END) result that doesn't account for a recent write. In
921  * that situation, the pre-existing buffer would contain valid data
922  * that we don't want to overwrite. Since the legitimate case should
923  * always have left a zero-filled buffer, complain if not PageIsNew.
924  */
925  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
926  if (!PageIsNew((Page) bufBlock))
927  ereport(ERROR,
928  (errmsg("unexpected data beyond EOF in block %u of relation %s",
929  blockNum, relpath(smgr->smgr_rnode, forkNum)),
930  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
931 
932  /*
933  * We *must* do smgrextend before succeeding, else the page will not
934  * be reserved by the kernel, and the next P_NEW call will decide to
935  * return the same page. Clear the BM_VALID bit, do the StartBufferIO
936  * call that BufferAlloc didn't, and proceed.
937  */
938  if (isLocalBuf)
939  {
940  /* Only need to adjust flags */
941  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
942 
943  Assert(buf_state & BM_VALID);
944  buf_state &= ~BM_VALID;
945  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
946  }
947  else
948  {
949  /*
950  * Loop to handle the very small possibility that someone re-sets
951  * BM_VALID between our clearing it and StartBufferIO inspecting
952  * it.
953  */
954  do
955  {
956  uint32 buf_state = LockBufHdr(bufHdr);
957 
958  Assert(buf_state & BM_VALID);
959  buf_state &= ~BM_VALID;
960  UnlockBufHdr(bufHdr, buf_state);
961  } while (!StartBufferIO(bufHdr, true));
962  }
963  }
964 
965  /*
966  * if we have gotten to this point, we have allocated a buffer for the
967  * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
968  * if it's a shared buffer.
969  *
970  * Note: if smgrextend fails, we will end up with a buffer that is
971  * allocated but not marked BM_VALID. P_NEW will still select the same
972  * block number (because the relation didn't get any longer on disk) and
973  * so future attempts to extend the relation will find the same buffer (if
974  * it's not been recycled) but come right back here to try smgrextend
975  * again.
976  */
977  Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
978 
979  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
980 
981  if (isExtend)
982  {
983  /* new buffers are zero-filled */
984  MemSet((char *) bufBlock, 0, BLCKSZ);
985  /* don't set checksum for all-zero page */
986  smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
987 
988  /*
989  * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
990  * although we're essentially performing a write. At least on linux
991  * doing so defeats the 'delayed allocation' mechanism, leading to
992  * increased file fragmentation.
993  */
994  }
995  else
996  {
997  /*
998  * Read in the page, unless the caller intends to overwrite it and
999  * just wants us to allocate a buffer.
1000  */
1001  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
1002  MemSet((char *) bufBlock, 0, BLCKSZ);
1003  else
1004  {
1005  instr_time io_start,
1006  io_time;
1007 
1008  if (track_io_timing)
1009  INSTR_TIME_SET_CURRENT(io_start);
1010 
1011  smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
1012 
1013  if (track_io_timing)
1014  {
1015  INSTR_TIME_SET_CURRENT(io_time);
1016  INSTR_TIME_SUBTRACT(io_time, io_start);
1017  pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
1018  INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
1019  }
1020 
1021  /* check for garbage data */
1022  if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
1023  PIV_LOG_WARNING | PIV_REPORT_STAT))
1024  {
1025  if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
1026  {
1027  ereport(WARNING,
1028  (errcode(ERRCODE_DATA_CORRUPTED),
1029  errmsg("invalid page in block %u of relation %s; zeroing out page",
1030  blockNum,
1031  relpath(smgr->smgr_rnode, forkNum))));
1032  MemSet((char *) bufBlock, 0, BLCKSZ);
1033  }
1034  else
1035  ereport(ERROR,
1036  (errcode(ERRCODE_DATA_CORRUPTED),
1037  errmsg("invalid page in block %u of relation %s",
1038  blockNum,
1039  relpath(smgr->smgr_rnode, forkNum))));
1040  }
1041  }
1042  }
1043 
1044  /*
1045  * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
1046  * the page as valid, to make sure that no other backend sees the zeroed
1047  * page before the caller has had a chance to initialize it.
1048  *
1049  * Since no-one else can be looking at the page contents yet, there is no
1050  * difference between an exclusive lock and a cleanup-strength lock. (Note
1051  * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
1052  * they assert that the buffer is already valid.)
1053  */
1054  if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
1055  !isLocalBuf)
1056  {
1057  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
1058  }
1059 
1060  if (isLocalBuf)
1061  {
1062  /* Only need to adjust flags */
1063  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
1064 
1065  buf_state |= BM_VALID;
1066  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
1067  }
1068  else
1069  {
1070  /* Set BM_VALID, terminate IO, and wake up any waiters */
1071  TerminateBufferIO(bufHdr, false, BM_VALID);
1072  }
1073 
1074  VacuumPageMiss++;
1075  if (VacuumCostActive)
1076  VacuumCostBalance += VacuumCostPageMiss;
1077 
1078  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1079  smgr->smgr_rnode.node.spcNode,
1080  smgr->smgr_rnode.node.dbNode,
1081  smgr->smgr_rnode.node.relNode,
1082  smgr->smgr_rnode.backend,
1083  isExtend,
1084  found);
1085 
1086  return BufferDescriptorGetBuffer(bufHdr);
1087 }
bool zero_damaged_pages
Definition: bufmgr.c:134
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
Definition: bufmgr.c:1109
void LockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:4213
#define P_NEW
Definition: bufmgr.h:91
@ RBM_ZERO_ON_ERROR
Definition: bufmgr.h:44
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition: bufmgr.h:42
@ RBM_ZERO_AND_LOCK
Definition: bufmgr.h:40
@ RBM_NORMAL_NO_LOG
Definition: bufmgr.h:45
bool PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags)
Definition: bufpage.c:88
#define PIV_LOG_WARNING
Definition: bufpage.h:412
#define PageIsNew(page)
Definition: bufpage.h:228
#define PIV_REPORT_STAT
Definition: bufpage.h:413
#define MemSet(start, val, len)
Definition: c.h:1008
int errhint(const char *fmt,...)
Definition: elog.c:1151
int64 VacuumPageHit
Definition: globals.c:148
int VacuumCostPageMiss
Definition: globals.c:143
int64 VacuumPageMiss
Definition: globals.c:149
int VacuumCostPageHit
Definition: globals.c:142
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition: localbuf.c:109
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:43
#define pgstat_count_buffer_read_time(n)
Definition: pgstat.h:468
#define relpath(rnode, forknum)
Definition: relpath.h:87
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:579
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync)
Definition: smgr.c:493
void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer)
Definition: smgr.c:532
#define SmgrIsTemp(smgr)
Definition: smgr.h:77
int64 local_blks_hit
Definition: instrument.h:30
int64 local_blks_written
Definition: instrument.h:33
int64 shared_blks_read
Definition: instrument.h:27
instr_time blk_read_time
Definition: instrument.h:36
int64 local_blks_read
Definition: instrument.h:31
int64 shared_blks_hit
Definition: instrument.h:26

References Assert(), RelFileNodeBackend::backend, BufferUsage::blk_read_time, BM_VALID, BufferAlloc(), BufferDescriptorGetBuffer, BufferDescriptorGetContentLock, BufHdrGetBlock, CurrentResourceOwner, RelFileNode::dbNode, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errhint(), errmsg(), ERROR, INSTR_TIME_ADD, INSTR_TIME_GET_MICROSEC, INSTR_TIME_SET_CURRENT, INSTR_TIME_SUBTRACT, BufferUsage::local_blks_hit, BufferUsage::local_blks_read, BufferUsage::local_blks_written, LocalBufferAlloc(), LocalBufHdrGetBlock, LockBufferForCleanup(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), MemSet, mode, RelFileNodeBackend::node, P_NEW, PageIsNew, PageIsVerifiedExtended(), pg_atomic_read_u32(), pg_atomic_unlocked_write_u32(), pgBufferUsage, pgstat_count_buffer_read_time, PIV_LOG_WARNING, PIV_REPORT_STAT, RBM_NORMAL, RBM_NORMAL_NO_LOG, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, RBM_ZERO_ON_ERROR, RelFileNode::relNode, relpath, ResourceOwnerEnlargeBuffers(), BufferUsage::shared_blks_hit, BufferUsage::shared_blks_read, BufferUsage::shared_blks_written, SMgrRelationData::smgr_rnode, smgrextend(), SmgrIsTemp, smgrnblocks(), smgrread(), RelFileNode::spcNode, StartBufferIO(), BufferDesc::state, TerminateBufferIO(), track_io_timing, UnlockBufHdr, VacuumCostActive, VacuumCostBalance, VacuumCostPageHit, VacuumCostPageMiss, VacuumPageHit, VacuumPageMiss, WARNING, and zero_damaged_pages.

Referenced by ReadBufferExtended(), and ReadBufferWithoutRelcache().

◆ ReadBufferExtended()

Buffer ReadBufferExtended ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy 
)

Definition at line 749 of file bufmgr.c.

751 {
752  bool hit;
753  Buffer buf;
754 
755  /*
756  * Reject attempts to read non-local temporary relations; we would be
757  * likely to get wrong data since we have no visibility into the owning
758  * session's local buffers.
759  */
760  if (RELATION_IS_OTHER_TEMP(reln))
761  ereport(ERROR,
762  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
763  errmsg("cannot access temporary tables of other sessions")));
764 
765  /*
766  * Read the buffer, and update pgstat counters to reflect a cache hit or
767  * miss.
768  */
769  pgstat_count_buffer_read(reln);
770  buf = ReadBuffer_common(RelationGetSmgr(reln), reln->rd_rel->relpersistence,
771  forkNum, blockNum, mode, strategy, &hit);
772  if (hit)
773  pgstat_count_buffer_hit(reln);
774  return buf;
775 }
static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool *hit)
Definition: bufmgr.c:809
#define pgstat_count_buffer_read(rel)
Definition: pgstat.h:550
#define pgstat_count_buffer_hit(rel)
Definition: pgstat.h:555
Form_pg_class rd_rel
Definition: rel.h:109

References buf, ereport, errcode(), errmsg(), ERROR, mode, pgstat_count_buffer_hit, pgstat_count_buffer_read, RelationData::rd_rel, ReadBuffer_common(), RELATION_IS_OTHER_TEMP, and RelationGetSmgr().

Referenced by _hash_getbuf_with_strategy(), _hash_getinitbuf(), _hash_getnewbuf(), autoprewarm_database_main(), blbulkdelete(), blgetbitmap(), blvacuumcleanup(), brin_vacuum_scan(), brinbuildempty(), bt_recheck_sibling_links(), btvacuumpage(), collect_corrupt_items(), collect_visibility_data(), count_nondeletable_pages(), fill_seq_fork_with_data(), fsm_readbuf(), get_raw_page_internal(), ginbuildempty(), ginbulkdelete(), ginDeletePage(), ginScanToDelete(), ginvacuumcleanup(), ginVacuumPostingTree(), ginVacuumPostingTreeLeaves(), gistbuildempty(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hashbulkdelete(), heapam_scan_analyze_next_block(), heapgetpage(), lazy_scan_heap(), lazy_vacuum_heap_rel(), log_newpage_range(), palloc_btree_page(), pg_prewarm(), pgstat_btree_page(), pgstat_gist_page(), pgstat_heap(), pgstathashindex(), pgstatindex_impl(), ReadBuffer(), ReadBufferBI(), spgprocesspending(), spgvacuumpage(), statapprox_heap(), verify_heapam(), and vm_readbuf().
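
An illustrative full-scan sketch (not from the source) using a BAS_BULKREAD ring buffer so the scan does not evict the whole shared buffer pool; rel is assumed:

BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
BlockNumber blkno;

for (blkno = 0; blkno < nblocks; blkno++)
{
	Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
										 RBM_NORMAL, strategy);

	LockBuffer(buf, BUFFER_LOCK_SHARE);
	/* ... examine BufferGetPage(buf) ... */
	UnlockReleaseBuffer(buf);
}
FreeAccessStrategy(strategy);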

◆ ReadBufferWithoutRelcache()

Buffer ReadBufferWithoutRelcache ( RelFileNode  rnode,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy,
bool  permanent 
)

Definition at line 789 of file bufmgr.c.

792 {
793  bool hit;
794 
795  SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
796 
797  return ReadBuffer_common(smgr, permanent ? RELPERSISTENCE_PERMANENT :
798  RELPERSISTENCE_UNLOGGED, forkNum, blockNum,
799  mode, strategy, &hit);
800 }

References InvalidBackendId, mode, ReadBuffer_common(), and smgropen().

Referenced by RelationCopyStorageUsingBuffer(), ScanSourceDatabasePgClass(), and XLogReadBufferExtended().

◆ ReadRecentBuffer()

bool ReadRecentBuffer ( RelFileNode  rnode,
ForkNumber  forkNum,
BlockNumber  blockNum,
Buffer  recent_buffer 
)

Definition at line 623 of file bufmgr.c.

625 {
626  BufferDesc *bufHdr;
627  BufferTag tag;
628  uint32 buf_state;
629  bool have_private_ref;
630 
631  Assert(BufferIsValid(recent_buffer));
632 
633  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
634  ReservePrivateRefCountEntry();
635  INIT_BUFFERTAG(tag, rnode, forkNum, blockNum);
636 
637  if (BufferIsLocal(recent_buffer))
638  {
639  bufHdr = GetBufferDescriptor(-recent_buffer - 1);
640  buf_state = pg_atomic_read_u32(&bufHdr->state);
641 
642  /* Is it still valid and holding the right tag? */
643  if ((buf_state & BM_VALID) && BUFFERTAGS_EQUAL(tag, bufHdr->tag))
644  {
645  /* Bump local buffer's ref and usage counts. */
646  ResourceOwnerRememberBuffer(CurrentResourceOwner, recent_buffer);
647  LocalRefCount[-recent_buffer - 1]++;
648  if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
649  pg_atomic_write_u32(&bufHdr->state,
650  buf_state + BUF_USAGECOUNT_ONE);
651 
652  pgBufferUsage.local_blks_hit++;
653 
654  return true;
655  }
656  }
657  else
658  {
659  bufHdr = GetBufferDescriptor(recent_buffer - 1);
660  have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
661 
662  /*
663  * Do we already have this buffer pinned with a private reference? If
664  * so, it must be valid and it is safe to check the tag without
665  * locking. If not, we have to lock the header first and then check.
666  */
667  if (have_private_ref)
668  buf_state = pg_atomic_read_u32(&bufHdr->state);
669  else
670  buf_state = LockBufHdr(bufHdr);
671 
672  if ((buf_state & BM_VALID) && BUFFERTAGS_EQUAL(tag, bufHdr->tag))
673  {
674  /*
675  * It's now safe to pin the buffer. We can't pin first and ask
676  * questions later, because it might confuse code paths like
677  * InvalidateBuffer() if we pinned a random non-matching buffer.
678  */
679  if (have_private_ref)
680  PinBuffer(bufHdr, NULL); /* bump pin count */
681  else
682  PinBuffer_Locked(bufHdr); /* pin for first time */
683 
684  pgBufferUsage.shared_blks_hit++;
685 
686  return true;
687  }
688 
689  /* If we locked the header above, now unlock. */
690  if (!have_private_ref)
691  UnlockBufHdr(bufHdr, buf_state);
692  }
693 
694  return false;
695 }
static void pg_atomic_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:258

References Assert(), BM_MAX_USAGE_COUNT, BM_VALID, BUF_STATE_GET_USAGECOUNT, BUF_USAGECOUNT_ONE, BufferIsLocal, BufferIsValid, BUFFERTAGS_EQUAL, CurrentResourceOwner, GetBufferDescriptor, GetPrivateRefCount(), INIT_BUFFERTAG, BufferUsage::local_blks_hit, LocalRefCount, LockBufHdr(), pg_atomic_read_u32(), pg_atomic_write_u32(), pgBufferUsage, PinBuffer(), PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), ResourceOwnerRememberBuffer(), BufferUsage::shared_blks_hit, BufferDesc::state, BufferDesc::tag, and UnlockBufHdr.

Referenced by XLogReadBufferExtended().
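
A sketch of the intended fast path (modeled on XLogReadBufferExtended(); rnode, forkNum, blkno and a remembered recent_buffer are assumed): try the remembered buffer first, and fall back to a normal read if the tag no longer matches.

Buffer		buf;

if (BufferIsValid(recent_buffer) &&
	ReadRecentBuffer(rnode, forkNum, blkno, recent_buffer))
	buf = recent_buffer;			/* tag still matched; buffer is now pinned */
else
	buf = ReadBufferWithoutRelcache(rnode, forkNum, blkno,
									RBM_NORMAL, NULL, true);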

◆ RelationCopyStorageUsingBuffer()

static void RelationCopyStorageUsingBuffer ( Relation  src,
Relation  dst,
ForkNumber  forkNum,
bool  isunlogged 
)
static

Definition at line 3695 of file bufmgr.c.

3697 {
3698  Buffer srcBuf;
3699  Buffer dstBuf;
3700  Page srcPage;
3701  Page dstPage;
3702  bool use_wal;
3703  BlockNumber nblocks;
3704  BlockNumber blkno;
3705  BufferAccessStrategy bstrategy_src;
3706  BufferAccessStrategy bstrategy_dst;
3707 
3708  /*
3709  * In general, we want to write WAL whenever wal_level > 'minimal', but we
3710  * can skip it when copying any fork of an unlogged relation other than
3711  * the init fork.
3712  */
3713  use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
3714 
3715  /* Get number of blocks in the source relation. */
3716  nblocks = smgrnblocks(RelationGetSmgr(src), forkNum);
3717 
3718  /* Nothing to copy; just return. */
3719  if (nblocks == 0)
3720  return;
3721 
3722  /* This is a bulk operation, so use buffer access strategies. */
3723  bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
3724  bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
3725 
3726  /* Iterate over each block of the source relation file. */
3727  for (blkno = 0; blkno < nblocks; blkno++)
3728  {
3729  CHECK_FOR_INTERRUPTS();
3730 
3731  /* Read block from source relation. */
3732  srcBuf = ReadBufferWithoutRelcache(src->rd_node, forkNum, blkno,
3733  RBM_NORMAL, bstrategy_src,
3734  permanent);
3735  srcPage = BufferGetPage(srcBuf);
3736  if (PageIsNew(srcPage) || PageIsEmpty(srcPage))
3737  {
3738  ReleaseBuffer(srcBuf);
3739  continue;
3740  }
3741 
3742  /* Use P_NEW to extend the destination relation. */
3743  dstBuf = ReadBufferWithoutRelcache(dst->rd_node, forkNum, P_NEW,
3744  RBM_NORMAL, bstrategy_dst,
3745  permanent);
3746  LockBuffer(dstBuf, BUFFER_LOCK_EXCLUSIVE);
3747 
3748  START_CRIT_SECTION();
3749 
3750  /* Copy page data from the source to the destination. */
3751  dstPage = BufferGetPage(dstBuf);
3752  memcpy(dstPage, srcPage, BLCKSZ);
3753  MarkBufferDirty(dstBuf);
3754 
3755  /* WAL-log the copied page. */
3756  if (use_wal)
3757  log_newpage_buffer(dstBuf, true);
3758 
3759  END_CRIT_SECTION();
3760 
3761  UnlockReleaseBuffer(dstBuf);
3762  ReleaseBuffer(srcBuf);
3763  }
3764 }
Buffer ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
Definition: bufmgr.c:789
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3915
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3938
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1573
@ BAS_BULKREAD
Definition: bufmgr.h:30
@ BAS_BULKWRITE
Definition: bufmgr.h:32
#define PageIsEmpty(page)
Definition: bufpage.h:221
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition: freelist.c:541
#define START_CRIT_SECTION()
Definition: miscadmin.h:148
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:121
#define END_CRIT_SECTION()
Definition: miscadmin.h:150
#define XLogIsNeeded()
Definition: xlog.h:104
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
Definition: xloginsert.c:1177

References BAS_BULKREAD, BAS_BULKWRITE, BUFFER_LOCK_EXCLUSIVE, BufferGetPage, CHECK_FOR_INTERRUPTS, END_CRIT_SECTION, GetAccessStrategy(), INIT_FORKNUM, LockBuffer(), log_newpage_buffer(), MarkBufferDirty(), P_NEW, PageIsEmpty, PageIsNew, RBM_NORMAL, RelationData::rd_node, ReadBufferWithoutRelcache(), RelationGetSmgr(), ReleaseBuffer(), smgrnblocks(), START_CRIT_SECTION, UnlockReleaseBuffer(), and XLogIsNeeded.

Referenced by CreateAndCopyRelationData().

◆ RelationGetNumberOfBlocksInFork()

BlockNumber RelationGetNumberOfBlocksInFork ( Relation  relation,
ForkNumber  forkNum 
)

Definition at line 2942 of file bufmgr.c.

2943 {
2944  if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
2945  {
2946  /*
2947  * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
2948  * tableam returns the size in bytes - but for the purpose of this
2949  * routine, we want the number of blocks. Therefore divide, rounding
2950  * up.
2951  */
2952  uint64 szbytes;
2953 
2954  szbytes = table_relation_size(relation, forkNum);
2955 
2956  return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
2957  }
2958  else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
2959  {
2960  return smgrnblocks(RelationGetSmgr(relation), forkNum);
2961  }
2962  else
2963  Assert(false);
2964 
2965  return 0; /* keep compiler quiet */
2966 }
static uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
Definition: tableam.h:1840

References Assert(), RelationData::rd_rel, RelationGetSmgr(), smgrnblocks(), and table_relation_size().

Referenced by _hash_getnewbuf(), _hash_init(), autoprewarm_database_main(), get_raw_page_internal(), and pg_prewarm().
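
For illustration (rel is assumed): the fork argument selects which physical file is measured, and RelationGetNumberOfBlocks() is the common shorthand for the main fork.

BlockNumber main_blocks = RelationGetNumberOfBlocks(rel);
BlockNumber fsm_blocks = RelationGetNumberOfBlocksInFork(rel, FSM_FORKNUM);
BlockNumber vm_blocks = RelationGetNumberOfBlocksInFork(rel, VISIBILITYMAP_FORKNUM);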

◆ ReleaseAndReadBuffer()

Buffer ReleaseAndReadBuffer ( Buffer  buffer,
Relation  relation,
BlockNumber  blockNum 
)

Definition at line 1636 of file bufmgr.c.

1639 {
1640  ForkNumber forkNum = MAIN_FORKNUM;
1641  BufferDesc *bufHdr;
1642 
1643  if (BufferIsValid(buffer))
1644  {
1645  Assert(BufferIsPinned(buffer));
1646  if (BufferIsLocal(buffer))
1647  {
1648  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1649  if (bufHdr->tag.blockNum == blockNum &&
1650  RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1651  bufHdr->tag.forkNum == forkNum)
1652  return buffer;
1653  ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
1654  LocalRefCount[-buffer - 1]--;
1655  }
1656  else
1657  {
1658  bufHdr = GetBufferDescriptor(buffer - 1);
1659  /* we have pin, so it's ok to examine tag without spinlock */
1660  if (bufHdr->tag.blockNum == blockNum &&
1661  RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1662  bufHdr->tag.forkNum == forkNum)
1663  return buffer;
1664  UnpinBuffer(bufHdr, true);
1665  }
1666  }
1667 
1668  return ReadBuffer(relation, blockNum);
1669 }
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:702
void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
Definition: resowner.c:967

References Assert(), buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferIsValid, CurrentResourceOwner, buftag::forkNum, GetBufferDescriptor, GetLocalBufferDescriptor, LocalRefCount, MAIN_FORKNUM, RelationData::rd_node, ReadBuffer(), RelFileNodeEquals, ResourceOwnerForgetBuffer(), buftag::rnode, BufferDesc::tag, and UnpinBuffer().

Referenced by _bt_relandgetbuf(), ginFindLeafPage(), heapam_index_fetch_tuple(), and heapam_scan_bitmap_next_block().
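
An illustrative sketch (not from the source) of walking an arbitrary list of blocks while holding at most one pin; rel, blocks[] and nblocks are assumed. Passing the previous buffer back in avoids an unpin/re-pin when the same block is requested twice in a row.

Buffer		buf = InvalidBuffer;
int			i;

for (i = 0; i < nblocks; i++)
{
	buf = ReleaseAndReadBuffer(buf, rel, blocks[i]);
	LockBuffer(buf, BUFFER_LOCK_SHARE);
	/* ... inspect the page ... */
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
if (BufferIsValid(buf))
	ReleaseBuffer(buf);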

◆ ReleaseBuffer()

void ReleaseBuffer ( Buffer  buffer)

Definition at line 3915 of file bufmgr.c.

3916 {
3917  if (!BufferIsValid(buffer))
3918  elog(ERROR, "bad buffer ID: %d", buffer);
3919 
3920  if (BufferIsLocal(buffer))
3921  {
 3922  ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
 3923 
3924  Assert(LocalRefCount[-buffer - 1] > 0);
3925  LocalRefCount[-buffer - 1]--;
3926  return;
3927  }
3928 
3929  UnpinBuffer(GetBufferDescriptor(buffer - 1), true);
3930 }

References Assert(), PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid, CurrentResourceOwner, elog, ERROR, GetBufferDescriptor, LocalRefCount, ResourceOwnerForgetBuffer(), and UnpinBuffer().

Referenced by _bt_drop_lock_and_maybe_pin(), _bt_getbuf(), _bt_pagedel(), _bt_relbuf(), _bt_search_insert(), _bt_unlink_halfdead_page(), _hash_dropbuf(), _hash_getbuf_with_condlock_cleanup(), autoprewarm_database_main(), blinsert(), BloomNewBuffer(), brin_getinsertbuffer(), brin_vacuum_scan(), bringetbitmap(), brinGetTupleForHeapBlock(), brininsert(), brinRevmapTerminate(), brinsummarize(), collect_corrupt_items(), collect_visibility_data(), entryLoadMoreItems(), ExecEndBitmapHeapScan(), ExecEndIndexOnlyScan(), ExecReScanBitmapHeapScan(), FreeBulkInsertState(), freeGinBtreeStack(), fsm_vacuum_page(), get_actual_variable_endpoint(), get_raw_page_internal(), GetRecordedFreeSpace(), ginDeletePage(), ginFindParents(), ginFinishSplit(), ginFreeScanKeys(), ginInsertCleanup(), GinNewBuffer(), ginScanToDelete(), gistdoinsert(), gistFindCorrectParent(), gistNewBuffer(), gistvacuum_delete_empty_pages(), heap_abort_speculative(), heap_delete(), heap_endscan(), heap_fetch(), heap_force_common(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_rescan(), heap_update(), heap_xlog_delete(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_update(), heap_xlog_visible(), heapam_index_fetch_reset(), heapam_scan_sample_next_block(), heapam_tuple_lock(), heapgetpage(), heapgettup(), heapgettup_pagemode(), lazy_scan_heap(), lazy_vacuum_heap_rel(), pg_prewarm(), pg_visibility(), pg_visibility_map(), pg_visibility_map_summary(), pgstatindex_impl(), ReadBufferBI(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), ReleaseBulkInsertStatePin(), ResourceOwnerReleaseInternal(), revmap_get_buffer(), revmap_physical_extend(), spgdoinsert(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), statapprox_heap(), summarize_range(), terminate_brin_buildstate(), tts_buffer_heap_clear(), tts_buffer_heap_materialize(), tts_buffer_heap_store_tuple(), UnlockReleaseBuffer(), verify_heapam(), visibilitymap_count(), visibilitymap_get_status(), visibilitymap_pin(), and XLogReadBufferExtended().
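
Minimal pin/lock/release sketch (illustrative, not from bufmgr.c), showing where ReleaseBuffer() fits in the usual read-only access pattern; "rel" and "blkno" are assumed to be supplied by the caller, with the relation open and locked.

    Buffer  buf;
    Page    page;

    buf = ReadBuffer(rel, blkno);           /* acquires a pin */
    LockBuffer(buf, BUFFER_LOCK_SHARE);     /* content lock for reading */
    page = BufferGetPage(buf);

    /* ... read-only inspection of "page" ... */

    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    ReleaseBuffer(buf);                     /* drop the pin; UnlockReleaseBuffer() does both */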

◆ ReservePrivateRefCountEntry()

static void ReservePrivateRefCountEntry ( void  )
static

Definition at line 217 of file bufmgr.c.

218 {
219  /* Already reserved (or freed), nothing to do */
220  if (ReservedRefCountEntry != NULL)
221  return;
222 
223  /*
224  * First search for a free entry the array, that'll be sufficient in the
225  * majority of cases.
226  */
227  {
228  int i;
229 
230  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
231  {
 232  PrivateRefCountEntry *res;
 233 
 234  res = &PrivateRefCountArray[i];
 235 
236  if (res->buffer == InvalidBuffer)
237  {
 238  ReservedRefCountEntry = res;
 239  return;
240  }
241  }
242  }
243 
244  /*
245  * No luck. All array entries are full. Move one array entry into the hash
246  * table.
247  */
248  {
249  /*
250  * Move entry from the current clock position in the array into the
251  * hashtable. Use that slot.
252  */
253  PrivateRefCountEntry *hashent;
254  bool found;
255 
256  /* select victim slot */
 257  ReservedRefCountEntry =
 258  &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
 259 
260  /* Better be used, otherwise we shouldn't get here. */
 261  Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
 262 
263  /* enter victim array entry into hashtable */
 264  hashent = hash_search(PrivateRefCountHash,
 265  (void *) &(ReservedRefCountEntry->buffer),
266  HASH_ENTER,
267  &found);
268  Assert(!found);
 269  hashent->refcount = ReservedRefCountEntry->refcount;
 270 
271  /* clear the now free array slot */
 272  ReservedRefCountEntry->buffer = InvalidBuffer;
 273  ReservedRefCountEntry->refcount = 0;
 274 
 275  PrivateRefCountOverflowed++;
 276  }
277 }
static uint32 PrivateRefCountClock
Definition: bufmgr.c:202
@ HASH_ENTER
Definition: hsearch.h:114

References Assert(), PrivateRefCountEntry::buffer, HASH_ENTER, hash_search(), i, InvalidBuffer, PrivateRefCountArray, PrivateRefCountClock, PrivateRefCountHash, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, REFCOUNT_ARRAY_ENTRIES, res, and ReservedRefCountEntry.

Referenced by BufferAlloc(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetPrivateRefCountEntry(), PinBuffer(), ReadRecentBuffer(), and SyncOneBuffer().
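
The reservation scheme (a small fixed array plus a hash table for overflow, evicting the clock victim when the array is full) can be illustrated with a stand-alone sketch. The code below is the editor's simplification, not PostgreSQL code: the dynahash table is replaced by a plain overflow array, but the control flow mirrors ReservePrivateRefCountEntry().

    #include <assert.h>
    #include <stdio.h>

    #define ARRAY_ENTRIES 8     /* mirrors REFCOUNT_ARRAY_ENTRIES */
    #define OVERFLOW_MAX  64    /* stand-in for the overflow hash table */

    typedef struct
    {
        int     buffer;         /* 0 means "slot free" */
        int     refcount;
    } Entry;

    static Entry    slots[ARRAY_ENTRIES];
    static Entry    overflow[OVERFLOW_MAX];
    static int      overflow_used;
    static unsigned clock_hand;
    static Entry   *reserved;

    /* Guarantee that "reserved" points at a free array slot. */
    static void
    reserve_slot(void)
    {
        if (reserved != NULL)
            return;             /* already reserved, nothing to do */

        /* first look for a free slot; usually sufficient */
        for (int i = 0; i < ARRAY_ENTRIES; i++)
        {
            if (slots[i].buffer == 0)
            {
                reserved = &slots[i];
                return;
            }
        }

        /* array full: evict the clock victim into the overflow store */
        reserved = &slots[clock_hand++ % ARRAY_ENTRIES];
        assert(reserved->buffer != 0);
        assert(overflow_used < OVERFLOW_MAX);
        overflow[overflow_used++] = *reserved;

        /* clear the now-free slot */
        reserved->buffer = 0;
        reserved->refcount = 0;
    }

    int
    main(void)
    {
        /* ten reservations against eight slots: the last two spill over */
        for (int b = 1; b <= 10; b++)
        {
            reserve_slot();
            reserved->buffer = b;
            reserved->refcount = 1;
            reserved = NULL;    /* the entry is now in use, no longer reserved */
        }
        printf("entries moved to overflow: %d\n", overflow_used);  /* prints 2 */
        return 0;
    }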

◆ rnode_comparator()

static int rnode_comparator ( const void *  p1,
const void *  p2 
)
static

Definition at line 4709 of file bufmgr.c.

4710 {
4711  RelFileNode n1 = *(const RelFileNode *) p1;
4712  RelFileNode n2 = *(const RelFileNode *) p2;
4713 
4714  if (n1.relNode < n2.relNode)
4715  return -1;
4716  else if (n1.relNode > n2.relNode)
4717  return 1;
4718 
4719  if (n1.dbNode < n2.dbNode)
4720  return -1;
4721  else if (n1.dbNode > n2.