PostgreSQL Source Code (git master)
bufmgr.c File Reference
#include "postgres.h"
#include <sys/file.h>
#include <unistd.h>
#include "access/tableam.h"
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "catalog/storage.h"
#include "executor/instrument.h"
#include "lib/binaryheap.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "storage/standby.h"
#include "utils/memdebug.h"
#include "utils/ps_status.h"
#include "utils/rel.h"
#include "utils/resowner_private.h"
#include "utils/timestamp.h"

Data Structures

struct  PrivateRefCountEntry
 
struct  CkptTsStatus
 
struct  SMgrSortArray
 

Macros

#define BufHdrGetBlock(bufHdr)   ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
 
#define BufferGetLSN(bufHdr)   (PageGetLSN(BufHdrGetBlock(bufHdr)))
 
#define LocalBufHdrGetBlock(bufHdr)   LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
 
#define BUF_WRITTEN   0x01
 
#define BUF_REUSABLE   0x02
 
#define RELS_BSEARCH_THRESHOLD   20
 
#define REFCOUNT_ARRAY_ENTRIES   8
 
#define BufferIsPinned(bufnum)
 

Typedefs

typedef struct PrivateRefCountEntry PrivateRefCountEntry
 
typedef struct CkptTsStatus CkptTsStatus
 
typedef struct SMgrSortArray SMgrSortArray
 

Functions

static void ReservePrivateRefCountEntry (void)
 
static PrivateRefCountEntry * NewPrivateRefCountEntry (Buffer buffer)
 
static PrivateRefCountEntry * GetPrivateRefCountEntry (Buffer buffer, bool do_move)
 
static int32 GetPrivateRefCount (Buffer buffer)
 
static void ForgetPrivateRefCountEntry (PrivateRefCountEntry *ref)
 
static Buffer ReadBuffer_common (SMgrRelation reln, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool *hit)
 
static bool PinBuffer (BufferDesc *buf, BufferAccessStrategy strategy)
 
static void PinBuffer_Locked (BufferDesc *buf)
 
static void UnpinBuffer (BufferDesc *buf, bool fixOwner)
 
static void BufferSync (int flags)
 
static uint32 WaitBufHdrUnlocked (BufferDesc *buf)
 
static int SyncOneBuffer (int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 
static void WaitIO (BufferDesc *buf)
 
static bool StartBufferIO (BufferDesc *buf, bool forInput)
 
static void TerminateBufferIO (BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
 
static void shared_buffer_write_error_callback (void *arg)
 
static void local_buffer_write_error_callback (void *arg)
 
static BufferDesc * BufferAlloc (SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
 
static void FlushBuffer (BufferDesc *buf, SMgrRelation reln)
 
static void AtProcExit_Buffers (int code, Datum arg)
 
static void CheckForBufferLeaks (void)
 
static int rnode_comparator (const void *p1, const void *p2)
 
static int buffertag_comparator (const void *p1, const void *p2)
 
static int ckpt_buforder_comparator (const void *pa, const void *pb)
 
static int ts_ckpt_progress_comparator (Datum a, Datum b, void *arg)
 
PrefetchBufferResult PrefetchSharedBuffer (SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
 
PrefetchBufferResult PrefetchBuffer (Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 
Buffer ReadBuffer (Relation reln, BlockNumber blockNum)
 
Buffer ReadBufferExtended (Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
Buffer ReadBufferWithoutRelcache (RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
static void InvalidateBuffer (BufferDesc *buf)
 
void MarkBufferDirty (Buffer buffer)
 
Buffer ReleaseAndReadBuffer (Buffer buffer, Relation relation, BlockNumber blockNum)
 
bool BgBufferSync (WritebackContext *wb_context)
 
void AtEOXact_Buffers (bool isCommit)
 
void InitBufferPoolAccess (void)
 
void InitBufferPoolBackend (void)
 
void PrintBufferLeakWarning (Buffer buffer)
 
void CheckPointBuffers (int flags)
 
void BufmgrCommit (void)
 
BlockNumber BufferGetBlockNumber (Buffer buffer)
 
void BufferGetTag (Buffer buffer, RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blknum)
 
BlockNumber RelationGetNumberOfBlocksInFork (Relation relation, ForkNumber forkNum)
 
bool BufferIsPermanent (Buffer buffer)
 
XLogRecPtr BufferGetLSNAtomic (Buffer buffer)
 
void DropRelFileNodeBuffers (RelFileNodeBackend rnode, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
 
void DropRelFileNodesAllBuffers (RelFileNodeBackend *rnodes, int nnodes)
 
void DropDatabaseBuffers (Oid dbid)
 
void FlushRelationBuffers (Relation rel)
 
void FlushRelationsAllBuffers (SMgrRelation *smgrs, int nrels)
 
void FlushDatabaseBuffers (Oid dbid)
 
void FlushOneBuffer (Buffer buffer)
 
void ReleaseBuffer (Buffer buffer)
 
void UnlockReleaseBuffer (Buffer buffer)
 
void IncrBufferRefCount (Buffer buffer)
 
void MarkBufferDirtyHint (Buffer buffer, bool buffer_std)
 
void UnlockBuffers (void)
 
void LockBuffer (Buffer buffer, int mode)
 
bool ConditionalLockBuffer (Buffer buffer)
 
void LockBufferForCleanup (Buffer buffer)
 
bool HoldingBufferPinThatDelaysRecovery (void)
 
bool ConditionalLockBufferForCleanup (Buffer buffer)
 
bool IsBufferCleanupOK (Buffer buffer)
 
void AbortBufferIO (void)
 
uint32 LockBufHdr (BufferDesc *desc)
 
void WritebackContextInit (WritebackContext *context, int *max_pending)
 
void ScheduleBufferTagForWriteback (WritebackContext *context, BufferTag *tag)
 
void IssuePendingWritebacks (WritebackContext *context)
 
void TestForOldSnapshot_impl (Snapshot snapshot, Relation relation)
 

Variables

bool zero_damaged_pages = false
 
int bgwriter_lru_maxpages = 100
 
double bgwriter_lru_multiplier = 2.0
 
bool track_io_timing = false
 
int effective_io_concurrency = 0
 
int maintenance_io_concurrency = 0
 
int checkpoint_flush_after = 0
 
int bgwriter_flush_after = 0
 
int backend_flush_after = 0
 
static BufferDesc * InProgressBuf = NULL
 
static bool IsForInput
 
static BufferDesc * PinCountWaitBuf = NULL
 
static struct PrivateRefCountEntry PrivateRefCountArray [REFCOUNT_ARRAY_ENTRIES]
 
static HTAB * PrivateRefCountHash = NULL
 
static int32 PrivateRefCountOverflowed = 0
 
static uint32 PrivateRefCountClock = 0
 
static PrivateRefCountEntry * ReservedRefCountEntry = NULL
 

Macro Definition Documentation

◆ BUF_REUSABLE

#define BUF_REUSABLE   0x02

Definition at line 69 of file bufmgr.c.

Referenced by BgBufferSync(), and SyncOneBuffer().

◆ BUF_WRITTEN

#define BUF_WRITTEN   0x01

Definition at line 68 of file bufmgr.c.

Referenced by BgBufferSync(), BufferSync(), and SyncOneBuffer().

◆ BufferGetLSN

#define BufferGetLSN(bufHdr)   (PageGetLSN(BufHdrGetBlock(bufHdr)))

Definition at line 61 of file bufmgr.c.

Referenced by BufferAlloc(), and FlushBuffer().

◆ BufferIsPinned

#define BufferIsPinned(bufnum)
Value:
( \
!BufferIsValid(bufnum) ? \
false \
: \
BufferIsLocal(bufnum) ? \
(LocalRefCount[-(bufnum) - 1] > 0) \
: \
(GetPrivateRefCount(bufnum) > 0) \
)

Definition at line 440 of file bufmgr.c.

Referenced by BufferGetBlockNumber(), BufferGetLSNAtomic(), BufferGetTag(), BufferIsPermanent(), ConditionalLockBuffer(), FlushOneBuffer(), IncrBufferRefCount(), LockBuffer(), LockBufferForCleanup(), MarkBufferDirty(), and ReleaseAndReadBuffer().
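
BufferIsPinned is a backend-local check: it consults LocalRefCount[] for local buffers and the private refcount machinery for shared buffers, so it never touches the buffer header spinlock. A minimal usage sketch (not PostgreSQL source; buffer is an assumed valid, pinned Buffer):

  /* Sketch: assert the pin before reading state that is only stable
   * while pinned, as BufferGetBlockNumber() does. */
  Assert(BufferIsPinned(buffer));
  BlockNumber blkno = BufferGetBlockNumber(buffer); /* tag stable while pinned */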

◆ BufHdrGetBlock

#define BufHdrGetBlock(bufHdr)   ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))

Definition at line 60 of file bufmgr.c.

Referenced by FlushBuffer(), PinBuffer(), PinBuffer_Locked(), ReadBuffer_common(), and UnpinBuffer().

◆ LocalBufHdrGetBlock

#define LocalBufHdrGetBlock(bufHdr)   LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]

Definition at line 64 of file bufmgr.c.

Referenced by FlushRelationBuffers(), and ReadBuffer_common().

◆ REFCOUNT_ARRAY_ENTRIES

#define REFCOUNT_ARRAY_ENTRIES   8

◆ RELS_BSEARCH_THRESHOLD

#define RELS_BSEARCH_THRESHOLD   20

Definition at line 71 of file bufmgr.c.

Referenced by DropRelFileNodesAllBuffers(), and FlushRelationsAllBuffers().
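
This constant is the cutoff at which those callers stop testing each buffer's relfilenode with a linear scan and instead sort the relation array once and bsearch per buffer. A condensed sketch of the guarded pattern (names loosely follow DropRelFileNodesAllBuffers; n and nodes are assumed inputs):

  /* Sketch: below the threshold a linear check per buffer is cheaper
   * than paying for the sort; above it, sort once and bsearch. */
  bool use_bsearch = false;

  if (n > RELS_BSEARCH_THRESHOLD)
  {
      pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
      use_bsearch = true;
  }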

Typedef Documentation

◆ CkptTsStatus

typedef struct CkptTsStatus CkptTsStatus

◆ PrivateRefCountEntry

typedef struct PrivateRefCountEntry PrivateRefCountEntry

◆ SMgrSortArray

typedef struct SMgrSortArray SMgrSortArray

Function Documentation

◆ AbortBufferIO()

void AbortBufferIO ( void  )

Definition at line 4200 of file bufmgr.c.

References Assert, buftag::blockNum, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_VALID, buf, BufferDescriptorGetIOLock, ereport, errcode(), errdetail(), errmsg(), buftag::forkNum, InProgressBuf, IsForInput, LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), pfree(), relpathperm, buftag::rnode, BufferDesc::tag, TerminateBufferIO(), UnlockBufHdr, and WARNING.

Referenced by AbortSubTransaction(), AbortTransaction(), AtProcExit_Buffers(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), and WalWriterMain().

4201 {
4202  BufferDesc *buf = InProgressBuf;
4203 
4204  if (buf)
4205  {
4206  uint32 buf_state;
4207 
4208  /*
4209  * Since LWLockReleaseAll has already been called, we're not holding
4210  * the buffer's io_in_progress_lock. We have to re-acquire it so that
4211  * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
4212  * buffer will be in a busy spin until we succeed in doing this.
4213  */
4214  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
4215 
4216  buf_state = LockBufHdr(buf);
4217  Assert(buf_state & BM_IO_IN_PROGRESS);
4218  if (IsForInput)
4219  {
4220  Assert(!(buf_state & BM_DIRTY));
4221 
4222  /* We'd better not think buffer is valid yet */
4223  Assert(!(buf_state & BM_VALID));
4224  UnlockBufHdr(buf, buf_state);
4225  }
4226  else
4227  {
4228  Assert(buf_state & BM_DIRTY);
4229  UnlockBufHdr(buf, buf_state);
4230  /* Issue notice if this is not the first failure... */
4231  if (buf_state & BM_IO_ERROR)
4232  {
4233  /* Buffer is pinned, so we can read tag without spinlock */
4234  char *path;
4235 
4236  path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
4237  ereport(WARNING,
4238  (errcode(ERRCODE_IO_ERROR),
4239  errmsg("could not write block %u of %s",
4240  buf->tag.blockNum, path),
4241  errdetail("Multiple failures --- write error might be permanent.")));
4242  pfree(path);
4243  }
4244  }
4245  TerminateBufferIO(buf, false, BM_IO_ERROR);
4246  }
4247 }
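
AbortBufferIO is the error-path counterpart of StartBufferIO/TerminateBufferIO. A hedged sketch of how the three calls bracket a physical read (condensed from ReadBuffer_common; smgr, forkNum, blockNum and bufBlock are assumed in scope):

  /* Sketch, not verbatim source: the buffer I/O protocol. */
  if (StartBufferIO(buf, true))   /* we won the right to perform the read */
  {
      smgrread(smgr, forkNum, blockNum, (char *) bufBlock);  /* may ERROR */
      TerminateBufferIO(buf, false, BM_VALID);  /* success: page now valid */
  }
  /* If ERROR is raised between StartBufferIO and TerminateBufferIO, the
   * abort path reaches AbortBufferIO(), which re-acquires the
   * io_in_progress lock and terminates the I/O with BM_IO_ERROR. */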

◆ AtEOXact_Buffers()

void AtEOXact_Buffers ( bool  isCommit)

Definition at line 2478 of file bufmgr.c.

References Assert, AtEOXact_LocalBuffers(), CheckForBufferLeaks(), and PrivateRefCountOverflowed.

Referenced by AbortTransaction(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), CommitTransaction(), PrepareTransaction(), and WalWriterMain().

2479 {
2480  CheckForBufferLeaks();
2481 
2482  AtEOXact_LocalBuffers(isCommit);
2483 
2484  Assert(PrivateRefCountOverflowed == 0);
2485 }

◆ AtProcExit_Buffers()

static void AtProcExit_Buffers ( int  code,
Datum  arg 
)
static

Definition at line 2534 of file bufmgr.c.

References AbortBufferIO(), AtProcExit_LocalBuffers(), CheckForBufferLeaks(), and UnlockBuffers().

Referenced by InitBufferPoolBackend().

2535 {
2536  AbortBufferIO();
2537  UnlockBuffers();
2538 
2539  CheckForBufferLeaks();
2540 
2541  /* localbuf.c needs a chance too */
2542  AtProcExit_LocalBuffers();
2543 }

◆ BgBufferSync()

bool BgBufferSync ( WritebackContext *  wb_context)

Definition at line 2108 of file bufmgr.c.

References Assert, bgwriter_lru_maxpages, bgwriter_lru_multiplier, BgWriterDelay, BgWriterStats, BUF_REUSABLE, BUF_WRITTEN, CurrentResourceOwner, DEBUG1, DEBUG2, elog, PgStat_MsgBgWriter::m_buf_alloc, PgStat_MsgBgWriter::m_buf_written_clean, PgStat_MsgBgWriter::m_maxwritten_clean, NBuffers, ResourceOwnerEnlargeBuffers(), StrategySyncStart(), and SyncOneBuffer().

Referenced by BackgroundWriterMain().

2109 {
2110  /* info obtained from freelist.c */
2111  int strategy_buf_id;
2112  uint32 strategy_passes;
2113  uint32 recent_alloc;
2114 
2115  /*
2116  * Information saved between calls so we can determine the strategy
2117  * point's advance rate and avoid scanning already-cleaned buffers.
2118  */
2119  static bool saved_info_valid = false;
2120  static int prev_strategy_buf_id;
2121  static uint32 prev_strategy_passes;
2122  static int next_to_clean;
2123  static uint32 next_passes;
2124 
2125  /* Moving averages of allocation rate and clean-buffer density */
2126  static float smoothed_alloc = 0;
2127  static float smoothed_density = 10.0;
2128 
2129  /* Potentially these could be tunables, but for now, not */
2130  float smoothing_samples = 16;
2131  float scan_whole_pool_milliseconds = 120000.0;
2132 
2133  /* Used to compute how far we scan ahead */
2134  long strategy_delta;
2135  int bufs_to_lap;
2136  int bufs_ahead;
2137  float scans_per_alloc;
2138  int reusable_buffers_est;
2139  int upcoming_alloc_est;
2140  int min_scan_buffers;
2141 
2142  /* Variables for the scanning loop proper */
2143  int num_to_scan;
2144  int num_written;
2145  int reusable_buffers;
2146 
2147  /* Variables for final smoothed_density update */
2148  long new_strategy_delta;
2149  uint32 new_recent_alloc;
2150 
2151  /*
2152  * Find out where the freelist clock sweep currently is, and how many
2153  * buffer allocations have happened since our last call.
2154  */
2155  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
2156 
2157  /* Report buffer alloc counts to pgstat */
2158  BgWriterStats.m_buf_alloc += recent_alloc;
2159 
2160  /*
2161  * If we're not running the LRU scan, just stop after doing the stats
2162  * stuff. We mark the saved state invalid so that we can recover sanely
2163  * if LRU scan is turned back on later.
2164  */
2165  if (bgwriter_lru_maxpages <= 0)
2166  {
2167  saved_info_valid = false;
2168  return true;
2169  }
2170 
2171  /*
2172  * Compute strategy_delta = how many buffers have been scanned by the
2173  * clock sweep since last time. If first time through, assume none. Then
2174  * see if we are still ahead of the clock sweep, and if so, how many
2175  * buffers we could scan before we'd catch up with it and "lap" it. Note:
2176  * weird-looking coding of xxx_passes comparisons are to avoid bogus
2177  * behavior when the passes counts wrap around.
2178  */
2179  if (saved_info_valid)
2180  {
2181  int32 passes_delta = strategy_passes - prev_strategy_passes;
2182 
2183  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2184  strategy_delta += (long) passes_delta * NBuffers;
2185 
2186  Assert(strategy_delta >= 0);
2187 
2188  if ((int32) (next_passes - strategy_passes) > 0)
2189  {
2190  /* we're one pass ahead of the strategy point */
2191  bufs_to_lap = strategy_buf_id - next_to_clean;
2192 #ifdef BGW_DEBUG
2193  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2194  next_passes, next_to_clean,
2195  strategy_passes, strategy_buf_id,
2196  strategy_delta, bufs_to_lap);
2197 #endif
2198  }
2199  else if (next_passes == strategy_passes &&
2200  next_to_clean >= strategy_buf_id)
2201  {
2202  /* on same pass, but ahead or at least not behind */
2203  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
2204 #ifdef BGW_DEBUG
2205  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2206  next_passes, next_to_clean,
2207  strategy_passes, strategy_buf_id,
2208  strategy_delta, bufs_to_lap);
2209 #endif
2210  }
2211  else
2212  {
2213  /*
2214  * We're behind, so skip forward to the strategy point and start
2215  * cleaning from there.
2216  */
2217 #ifdef BGW_DEBUG
2218  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
2219  next_passes, next_to_clean,
2220  strategy_passes, strategy_buf_id,
2221  strategy_delta);
2222 #endif
2223  next_to_clean = strategy_buf_id;
2224  next_passes = strategy_passes;
2225  bufs_to_lap = NBuffers;
2226  }
2227  }
2228  else
2229  {
2230  /*
2231  * Initializing at startup or after LRU scanning had been off. Always
2232  * start at the strategy point.
2233  */
2234 #ifdef BGW_DEBUG
2235  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
2236  strategy_passes, strategy_buf_id);
2237 #endif
2238  strategy_delta = 0;
2239  next_to_clean = strategy_buf_id;
2240  next_passes = strategy_passes;
2241  bufs_to_lap = NBuffers;
2242  }
2243 
2244  /* Update saved info for next time */
2245  prev_strategy_buf_id = strategy_buf_id;
2246  prev_strategy_passes = strategy_passes;
2247  saved_info_valid = true;
2248 
2249  /*
2250  * Compute how many buffers had to be scanned for each new allocation, ie,
2251  * 1/density of reusable buffers, and track a moving average of that.
2252  *
2253  * If the strategy point didn't move, we don't update the density estimate
2254  */
2255  if (strategy_delta > 0 && recent_alloc > 0)
2256  {
2257  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
2258  smoothed_density += (scans_per_alloc - smoothed_density) /
2259  smoothing_samples;
2260  }
2261 
2262  /*
2263  * Estimate how many reusable buffers there are between the current
2264  * strategy point and where we've scanned ahead to, based on the smoothed
2265  * density estimate.
2266  */
2267  bufs_ahead = NBuffers - bufs_to_lap;
2268  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
2269 
2270  /*
2271  * Track a moving average of recent buffer allocations. Here, rather than
2272  * a true average we want a fast-attack, slow-decline behavior: we
2273  * immediately follow any increase.
2274  */
2275  if (smoothed_alloc <= (float) recent_alloc)
2276  smoothed_alloc = recent_alloc;
2277  else
2278  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
2279  smoothing_samples;
2280 
2281  /* Scale the estimate by a GUC to allow more aggressive tuning. */
2282  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
2283 
2284  /*
2285  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
2286  * eventually underflow to zero, and the underflows produce annoying
2287  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
2288  * zero, there's no point in tracking smaller and smaller values of
2289  * smoothed_alloc, so just reset it to exactly zero to avoid this
2290  * syndrome. It will pop back up as soon as recent_alloc increases.
2291  */
2292  if (upcoming_alloc_est == 0)
2293  smoothed_alloc = 0;
2294 
2295  /*
2296  * Even in cases where there's been little or no buffer allocation
2297  * activity, we want to make a small amount of progress through the buffer
2298  * cache so that as many reusable buffers as possible are clean after an
2299  * idle period.
2300  *
2301  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
2302  * the BGW will be called during the scan_whole_pool time; slice the
2303  * buffer pool into that many sections.
2304  */
2305  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
2306 
2307  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
2308  {
2309 #ifdef BGW_DEBUG
2310  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
2311  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
2312 #endif
2313  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
2314  }
2315 
2316  /*
2317  * Now write out dirty reusable buffers, working forward from the
2318  * next_to_clean point, until we have lapped the strategy scan, or cleaned
2319  * enough buffers to match our estimate of the next cycle's allocation
2320  * requirements, or hit the bgwriter_lru_maxpages limit.
2321  */
2322 
2323  /* Make sure we can handle the pin inside SyncOneBuffer */
2324  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2325 
2326  num_to_scan = bufs_to_lap;
2327  num_written = 0;
2328  reusable_buffers = reusable_buffers_est;
2329 
2330  /* Execute the LRU scan */
2331  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
2332  {
2333  int sync_state = SyncOneBuffer(next_to_clean, true,
2334  wb_context);
2335 
2336  if (++next_to_clean >= NBuffers)
2337  {
2338  next_to_clean = 0;
2339  next_passes++;
2340  }
2341  num_to_scan--;
2342 
2343  if (sync_state & BUF_WRITTEN)
2344  {
2345  reusable_buffers++;
2346  if (++num_written >= bgwriter_lru_maxpages)
2347  {
2348  BgWriterStats.m_maxwritten_clean++;
2349  break;
2350  }
2351  }
2352  else if (sync_state & BUF_REUSABLE)
2353  reusable_buffers++;
2354  }
2355 
2356  BgWriterStats.m_buf_written_clean += num_written;
2357 
2358 #ifdef BGW_DEBUG
2359  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
2360  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
2361  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
2362  bufs_to_lap - num_to_scan,
2363  num_written,
2364  reusable_buffers - reusable_buffers_est);
2365 #endif
2366 
2367  /*
2368  * Consider the above scan as being like a new allocation scan.
2369  * Characterize its density and update the smoothed one based on it. This
2370  * effectively halves the moving average period in cases where both the
2371  * strategy and the background writer are doing some useful scanning,
2372  * which is helpful because a long memory isn't as desirable on the
2373  * density estimates.
2374  */
2375  new_strategy_delta = bufs_to_lap - num_to_scan;
2376  new_recent_alloc = reusable_buffers - reusable_buffers_est;
2377  if (new_strategy_delta > 0 && new_recent_alloc > 0)
2378  {
2379  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
2380  smoothed_density += (scans_per_alloc - smoothed_density) /
2381  smoothing_samples;
2382 
2383 #ifdef BGW_DEBUG
2384  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
2385  new_recent_alloc, new_strategy_delta,
2386  scans_per_alloc, smoothed_density);
2387 #endif
2388  }
2389 
2390  /* Return true if OK to hibernate */
2391  return (bufs_to_lap == 0 && recent_alloc == 0);
2392 }
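
To make the pacing formulas concrete, assume for illustration NBuffers = 16384 (128 MB of 8 kB buffers), the default bgwriter_delay of 200 ms, and smoothed_alloc = 500. Then min_scan_buffers = 16384 / (120000 / 200) = 16384 / 600 ≈ 27, so even an idle system cleans about 27 buffers per round and covers the whole pool in roughly two minutes; and with the default bgwriter_lru_multiplier of 2.0, upcoming_alloc_est = 500 * 2.0 = 1000, so the scan continues until about 1000 reusable buffers are known clean or bgwriter_lru_maxpages writes have been issued.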

◆ BufferAlloc()

static BufferDesc * BufferAlloc ( SMgrRelation  smgr,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool *  foundPtr 
)
static

Definition at line 1006 of file bufmgr.c.

References Assert, BackendWritebackContext, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_IO_ERROR, BM_JUST_DIRTIED, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BUF_FLAG_MASK, BufferDesc::buf_id, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BUF_USAGECOUNT_ONE, BufferDescriptorGetContentLock, BufferGetLSN, BufMappingPartitionLock, BufTableDelete(), BufTableHashCode(), BufTableInsert(), BufTableLookup(), RelFileNode::dbNode, FlushBuffer(), GetBufferDescriptor, INIT_BUFFERTAG, INIT_FORKNUM, LockBufHdr(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockConditionalAcquire(), LWLockRelease(), RelFileNodeBackend::node, PinBuffer(), PinBuffer_Locked(), RelFileNode::relNode, ReservePrivateRefCountEntry(), ScheduleBufferTagForWriteback(), SMgrRelationData::smgr_rnode, RelFileNode::spcNode, StartBufferIO(), StrategyGetBuffer(), StrategyRejectBuffer(), BufferDesc::tag, UnlockBufHdr, UnpinBuffer(), and XLogNeedsFlush().

Referenced by ReadBuffer_common().

1010 {
1011  BufferTag newTag; /* identity of requested block */
1012  uint32 newHash; /* hash value for newTag */
1013  LWLock *newPartitionLock; /* buffer partition lock for it */
1014  BufferTag oldTag; /* previous identity of selected buffer */
1015  uint32 oldHash; /* hash value for oldTag */
1016  LWLock *oldPartitionLock; /* buffer partition lock for it */
1017  uint32 oldFlags;
1018  int buf_id;
1019  BufferDesc *buf;
1020  bool valid;
1021  uint32 buf_state;
1022 
1023  /* create a tag so we can lookup the buffer */
1024  INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
1025 
1026  /* determine its hash code and partition lock ID */
1027  newHash = BufTableHashCode(&newTag);
1028  newPartitionLock = BufMappingPartitionLock(newHash);
1029 
1030  /* see if the block is in the buffer pool already */
1031  LWLockAcquire(newPartitionLock, LW_SHARED);
1032  buf_id = BufTableLookup(&newTag, newHash);
1033  if (buf_id >= 0)
1034  {
1035  /*
1036  * Found it. Now, pin the buffer so no one can steal it from the
1037  * buffer pool, and check to see if the correct data has been loaded
1038  * into the buffer.
1039  */
1040  buf = GetBufferDescriptor(buf_id);
1041 
1042  valid = PinBuffer(buf, strategy);
1043 
1044  /* Can release the mapping lock as soon as we've pinned it */
1045  LWLockRelease(newPartitionLock);
1046 
1047  *foundPtr = true;
1048 
1049  if (!valid)
1050  {
1051  /*
1052  * We can only get here if (a) someone else is still reading in
1053  * the page, or (b) a previous read attempt failed. We have to
1054  * wait for any active read attempt to finish, and then set up our
1055  * own read attempt if the page is still not BM_VALID.
1056  * StartBufferIO does it all.
1057  */
1058  if (StartBufferIO(buf, true))
1059  {
1060  /*
1061  * If we get here, previous attempts to read the buffer must
1062  * have failed ... but we shall bravely try again.
1063  */
1064  *foundPtr = false;
1065  }
1066  }
1067 
1068  return buf;
1069  }
1070 
1071  /*
1072  * Didn't find it in the buffer pool. We'll have to initialize a new
1073  * buffer. Remember to unlock the mapping lock while doing the work.
1074  */
1075  LWLockRelease(newPartitionLock);
1076 
1077  /* Loop here in case we have to try another victim buffer */
1078  for (;;)
1079  {
1080  /*
1081  * Ensure, while the spinlock's not yet held, that there's a free
1082  * refcount entry.
1083  */
1084  ReservePrivateRefCountEntry();
1085 
1086  /*
1087  * Select a victim buffer. The buffer is returned with its header
1088  * spinlock still held!
1089  */
1090  buf = StrategyGetBuffer(strategy, &buf_state);
1091 
1092  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1093 
1094  /* Must copy buffer flags while we still hold the spinlock */
1095  oldFlags = buf_state & BUF_FLAG_MASK;
1096 
1097  /* Pin the buffer and then release the buffer spinlock */
1098  PinBuffer_Locked(buf);
1099 
1100  /*
1101  * If the buffer was dirty, try to write it out. There is a race
1102  * condition here, in that someone might dirty it after we released it
1103  * above, or even while we are writing it out (since our share-lock
1104  * won't prevent hint-bit updates). We will recheck the dirty bit
1105  * after re-locking the buffer header.
1106  */
1107  if (oldFlags & BM_DIRTY)
1108  {
1109  /*
1110  * We need a share-lock on the buffer contents to write it out
1111  * (else we might write invalid data, eg because someone else is
1112  * compacting the page contents while we write). We must use a
1113  * conditional lock acquisition here to avoid deadlock. Even
1114  * though the buffer was not pinned (and therefore surely not
1115  * locked) when StrategyGetBuffer returned it, someone else could
1116  * have pinned and exclusive-locked it by the time we get here. If
1117  * we try to get the lock unconditionally, we'd block waiting for
1118  * them; if they later block waiting for us, deadlock ensues.
1119  * (This has been observed to happen when two backends are both
1120  * trying to split btree index pages, and the second one just
1121  * happens to be trying to split the page the first one got from
1122  * StrategyGetBuffer.)
1123  */
1124  if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
1125  LW_SHARED))
1126  {
1127  /*
1128  * If using a nondefault strategy, and writing the buffer
1129  * would require a WAL flush, let the strategy decide whether
1130  * to go ahead and write/reuse the buffer or to choose another
1131  * victim. We need lock to inspect the page LSN, so this
1132  * can't be done inside StrategyGetBuffer.
1133  */
1134  if (strategy != NULL)
1135  {
1136  XLogRecPtr lsn;
1137 
1138  /* Read the LSN while holding buffer header lock */
1139  buf_state = LockBufHdr(buf);
1140  lsn = BufferGetLSN(buf);
1141  UnlockBufHdr(buf, buf_state);
1142 
1143  if (XLogNeedsFlush(lsn) &&
1144  StrategyRejectBuffer(strategy, buf))
1145  {
1146  /* Drop lock/pin and loop around for another buffer */
1147  LWLockRelease(BufferDescriptorGetContentLock(buf));
1148  UnpinBuffer(buf, true);
1149  continue;
1150  }
1151  }
1152 
1153  /* OK, do the I/O */
1154  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
1155  smgr->smgr_rnode.node.spcNode,
1156  smgr->smgr_rnode.node.dbNode,
1157  smgr->smgr_rnode.node.relNode);
1158 
1159  FlushBuffer(buf, NULL);
1160  LWLockRelease(BufferDescriptorGetContentLock(buf));
1161 
1162  ScheduleBufferTagForWriteback(&BackendWritebackContext,
1163  &buf->tag);
1164 
1165  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
1166  smgr->smgr_rnode.node.spcNode,
1167  smgr->smgr_rnode.node.dbNode,
1168  smgr->smgr_rnode.node.relNode);
1169  }
1170  else
1171  {
1172  /*
1173  * Someone else has locked the buffer, so give it up and loop
1174  * back to get another one.
1175  */
1176  UnpinBuffer(buf, true);
1177  continue;
1178  }
1179  }
1180 
1181  /*
1182  * To change the association of a valid buffer, we'll need to have
1183  * exclusive lock on both the old and new mapping partitions.
1184  */
1185  if (oldFlags & BM_TAG_VALID)
1186  {
1187  /*
1188  * Need to compute the old tag's hashcode and partition lock ID.
1189  * XXX is it worth storing the hashcode in BufferDesc so we need
1190  * not recompute it here? Probably not.
1191  */
1192  oldTag = buf->tag;
1193  oldHash = BufTableHashCode(&oldTag);
1194  oldPartitionLock = BufMappingPartitionLock(oldHash);
1195 
1196  /*
1197  * Must lock the lower-numbered partition first to avoid
1198  * deadlocks.
1199  */
1200  if (oldPartitionLock < newPartitionLock)
1201  {
1202  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1203  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1204  }
1205  else if (oldPartitionLock > newPartitionLock)
1206  {
1207  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1208  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1209  }
1210  else
1211  {
1212  /* only one partition, only one lock */
1213  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1214  }
1215  }
1216  else
1217  {
1218  /* if it wasn't valid, we need only the new partition */
1219  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1220  /* remember we have no old-partition lock or tag */
1221  oldPartitionLock = NULL;
1222  /* keep the compiler quiet about uninitialized variables */
1223  oldHash = 0;
1224  }
1225 
1226  /*
1227  * Try to make a hashtable entry for the buffer under its new tag.
1228  * This could fail because while we were writing someone else
1229  * allocated another buffer for the same block we want to read in.
1230  * Note that we have not yet removed the hashtable entry for the old
1231  * tag.
1232  */
1233  buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
1234 
1235  if (buf_id >= 0)
1236  {
1237  /*
1238  * Got a collision. Someone has already done what we were about to
1239  * do. We'll just handle this as if it were found in the buffer
1240  * pool in the first place. First, give up the buffer we were
1241  * planning to use.
1242  */
1243  UnpinBuffer(buf, true);
1244 
1245  /* Can give up that buffer's mapping partition lock now */
1246  if (oldPartitionLock != NULL &&
1247  oldPartitionLock != newPartitionLock)
1248  LWLockRelease(oldPartitionLock);
1249 
1250  /* remaining code should match code at top of routine */
1251 
1252  buf = GetBufferDescriptor(buf_id);
1253 
1254  valid = PinBuffer(buf, strategy);
1255 
1256  /* Can release the mapping lock as soon as we've pinned it */
1257  LWLockRelease(newPartitionLock);
1258 
1259  *foundPtr = true;
1260 
1261  if (!valid)
1262  {
1263  /*
1264  * We can only get here if (a) someone else is still reading
1265  * in the page, or (b) a previous read attempt failed. We
1266  * have to wait for any active read attempt to finish, and
1267  * then set up our own read attempt if the page is still not
1268  * BM_VALID. StartBufferIO does it all.
1269  */
1270  if (StartBufferIO(buf, true))
1271  {
1272  /*
1273  * If we get here, previous attempts to read the buffer
1274  * must have failed ... but we shall bravely try again.
1275  */
1276  *foundPtr = false;
1277  }
1278  }
1279 
1280  return buf;
1281  }
1282 
1283  /*
1284  * Need to lock the buffer header too in order to change its tag.
1285  */
1286  buf_state = LockBufHdr(buf);
1287 
1288  /*
1289  * Somebody could have pinned or re-dirtied the buffer while we were
1290  * doing the I/O and making the new hashtable entry. If so, we can't
1291  * recycle this buffer; we must undo everything we've done and start
1292  * over with a new victim buffer.
1293  */
1294  oldFlags = buf_state & BUF_FLAG_MASK;
1295  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1 && !(oldFlags & BM_DIRTY))
1296  break;
1297 
1298  UnlockBufHdr(buf, buf_state);
1299  BufTableDelete(&newTag, newHash);
1300  if (oldPartitionLock != NULL &&
1301  oldPartitionLock != newPartitionLock)
1302  LWLockRelease(oldPartitionLock);
1303  LWLockRelease(newPartitionLock);
1304  UnpinBuffer(buf, true);
1305  }
1306 
1307  /*
1308  * Okay, it's finally safe to rename the buffer.
1309  *
1310  * Clearing BM_VALID here is necessary, clearing the dirtybits is just
1311  * paranoia. We also reset the usage_count since any recency of use of
1312  * the old content is no longer relevant. (The usage_count starts out at
1313  * 1 so that the buffer can survive one clock-sweep pass.)
1314  *
1315  * Make sure BM_PERMANENT is set for buffers that must be written at every
1316  * checkpoint. Unlogged buffers only need to be written at shutdown
1317  * checkpoints, except for their "init" forks, which need to be treated
1318  * just like permanent relations.
1319  */
1320  buf->tag = newTag;
1321  buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
1322  BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
1323  BUF_USAGECOUNT_MASK);
1324  if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1325  buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
1326  else
1327  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1328 
1329  UnlockBufHdr(buf, buf_state);
1330 
1331  if (oldPartitionLock != NULL)
1332  {
1333  BufTableDelete(&oldTag, oldHash);
1334  if (oldPartitionLock != newPartitionLock)
1335  LWLockRelease(oldPartitionLock);
1336  }
1337 
1338  LWLockRelease(newPartitionLock);
1339 
1340  /*
1341  * Buffer contents are currently invalid. Try to get the io_in_progress
1342  * lock. If StartBufferIO returns false, then someone else managed to
1343  * read it before we did, so there's nothing left for BufferAlloc() to do.
1344  */
1345  if (StartBufferIO(buf, true))
1346  *foundPtr = false;
1347  else
1348  *foundPtr = true;
1349 
1350  return buf;
1351 }

◆ BufferGetBlockNumber()

BlockNumber BufferGetBlockNumber ( Buffer  buffer)

Definition at line 2661 of file bufmgr.c.

References Assert, buftag::blockNum, BufferIsLocal, BufferIsPinned, GetBufferDescriptor, GetLocalBufferDescriptor, and BufferDesc::tag.

Referenced by _bt_check_unique(), _bt_checkpage(), _bt_delitems_vacuum(), _bt_doinsert(), _bt_endpoint(), _bt_finish_split(), _bt_first(), _bt_getroot(), _bt_insert_parent(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_moveright(), _bt_newroot(), _bt_pagedel(), _bt_readnextpage(), _bt_readpage(), _bt_restore_meta(), _bt_search(), _bt_split(), _bt_unlink_halfdead_page(), _bt_walk_left(), _hash_addovflpage(), _hash_checkpage(), _hash_doinsert(), _hash_first(), _hash_freeovflpage(), _hash_getnewbuf(), _hash_readnext(), _hash_readpage(), _hash_splitbucket(), allocNewBuffer(), blinsert(), BloomInitMetapage(), brin_doinsert(), brin_doupdate(), brin_getinsertbuffer(), brin_initialize_empty_new_buffer(), brin_page_cleanup(), brin_xlog_insert_update(), brinbuild(), brinGetTupleForHeapBlock(), createPostingTree(), dataBeginPlaceToPageLeaf(), dataPrepareDownlink(), doPickSplit(), entryPrepareDownlink(), fill_seq_with_data(), ginEntryInsert(), ginFindParents(), ginFinishSplit(), ginPlaceToPage(), ginRedoDeleteListPages(), ginRedoUpdateMetapage(), ginScanToDelete(), gistbufferinginserttuples(), gistbuild(), gistcheckpage(), gistdeletepage(), gistformdownlink(), gistinserttuples(), gistMemorizeAllDownlinks(), gistplacetopage(), gistRelocateBuildBuffersOnSplit(), gistScanPage(), hash_xlog_add_ovfl_page(), heap_delete(), heap_hot_search_buffer(), heap_insert(), heap_multi_insert(), heap_page_is_all_visible(), heap_prune_chain(), heap_update(), heap_xlog_confirm(), heap_xlog_lock(), makeSublist(), moveLeafs(), moveRightIfItNeeded(), pgstathashindex(), ReadBufferBI(), RelationAddExtraBlocks(), RelationGetBufferForTuple(), RelationPutHeapTuple(), revmap_get_buffer(), revmap_physical_extend(), spgAddNodeAction(), spgbuild(), spgdoinsert(), SpGistSetLastUsedPage(), spgSplitNodeAction(), spgWalk(), startScanEntry(), terminate_brin_buildstate(), vacuumLeafPage(), visibilitymap_clear(), visibilitymap_get_status(), visibilitymap_pin(), visibilitymap_pin_ok(), visibilitymap_set(), and XLogReadBufferExtended().

2662 {
2663  BufferDesc *bufHdr;
2664 
2665  Assert(BufferIsPinned(buffer));
2666 
2667  if (BufferIsLocal(buffer))
2668  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2669  else
2670  bufHdr = GetBufferDescriptor(buffer - 1);
2671 
2672  /* pinned, so OK to read tag without spinlock */
2673  return bufHdr->tag.blockNum;
2674 }
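
A hedged usage sketch (rel and blkno are assumed caller-supplied values):

  Buffer      buf = ReadBuffer(rel, blkno);       /* pins the page */
  BlockNumber check = BufferGetBlockNumber(buf);  /* == blkno while pinned */

  ReleaseBuffer(buf);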

◆ BufferGetLSNAtomic()

XLogRecPtr BufferGetLSNAtomic ( Buffer  buffer)

Definition at line 2924 of file bufmgr.c.

References Assert, BufferGetPage, BufferIsLocal, BufferIsPinned, BufferIsValid, GetBufferDescriptor, LockBufHdr(), PageGetLSN, UnlockBufHdr, and XLogHintBitIsNeeded.

Referenced by _bt_killitems(), _bt_readpage(), gistdoinsert(), gistFindPath(), gistkillitems(), gistScanPage(), SetHintBits(), and XLogSaveBufferForHint().

2925 {
2926  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
2927  char *page = BufferGetPage(buffer);
2928  XLogRecPtr lsn;
2929  uint32 buf_state;
2930 
2931  /*
2932  * If we don't need locking for correctness, fastpath out.
2933  */
2934  if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
2935  return PageGetLSN(page);
2936 
2937  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2938  Assert(BufferIsValid(buffer));
2939  Assert(BufferIsPinned(buffer));
2940 
2941  buf_state = LockBufHdr(bufHdr);
2942  lsn = PageGetLSN(page);
2943  UnlockBufHdr(bufHdr, buf_state);
2944 
2945  return lsn;
2946 }
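
The main consumer is the hint-bit path. A condensed sketch of the test in SetHintBits (xid is an assumed known-committed transaction ID):

  XLogRecPtr  commitLSN = TransactionIdGetCommitLSN(xid);

  if (BufferIsPermanent(buffer) && XLogNeedsFlush(commitLSN) &&
      BufferGetLSNAtomic(buffer) < commitLSN)
      return;   /* commit not yet flushed and page LSN gives no interlock */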

◆ BufferGetTag()

void BufferGetTag ( Buffer  buffer,
RelFileNode *  rnode,
ForkNumber *  forknum,
BlockNumber *  blknum 
)

Definition at line 2682 of file bufmgr.c.

References Assert, buftag::blockNum, BufferIsLocal, BufferIsPinned, buftag::forkNum, GetBufferDescriptor, GetLocalBufferDescriptor, buftag::rnode, and BufferDesc::tag.

Referenced by fsm_search_avail(), ginRedoInsertEntry(), log_newpage_buffer(), ResolveCminCmaxDuringDecoding(), XLogRegisterBuffer(), and XLogSaveBufferForHint().

2684 {
2685  BufferDesc *bufHdr;
2686 
2687  /* Do the same checks as BufferGetBlockNumber. */
2688  Assert(BufferIsPinned(buffer));
2689 
2690  if (BufferIsLocal(buffer))
2691  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2692  else
2693  bufHdr = GetBufferDescriptor(buffer - 1);
2694 
2695  /* pinned, so OK to read tag without spinlock */
2696  *rnode = bufHdr->tag.rnode;
2697  *forknum = bufHdr->tag.forkNum;
2698  *blknum = bufHdr->tag.blockNum;
2699 }
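
Usage sketch: recovering a pinned buffer's full identity, for example before registering it with WAL (buffer is assumed pinned):

  RelFileNode rnode;
  ForkNumber  forknum;
  BlockNumber blkno;

  BufferGetTag(buffer, &rnode, &forknum, &blkno);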

◆ BufferIsPermanent()

bool BufferIsPermanent ( Buffer  buffer)

Definition at line 2894 of file bufmgr.c.

References Assert, BM_PERMANENT, BufferIsLocal, BufferIsPinned, BufferIsValid, GetBufferDescriptor, pg_atomic_read_u32(), and BufferDesc::state.

Referenced by SetHintBits().

2895 {
2896  BufferDesc *bufHdr;
2897 
2898  /* Local buffers are used only for temp relations. */
2899  if (BufferIsLocal(buffer))
2900  return false;
2901 
2902  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2903  Assert(BufferIsValid(buffer));
2904  Assert(BufferIsPinned(buffer));
2905 
2906  /*
2907  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
2908  * need not bother with the buffer header spinlock. Even if someone else
2909  * changes the buffer header state while we're doing this, the state is
2910  * changed atomically, so we'll read the old value or the new value, but
2911  * not random garbage.
2912  */
2913  bufHdr = GetBufferDescriptor(buffer - 1);
2914  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
2915 }

◆ BufferSync()

static void BufferSync ( int  flags)
static

Definition at line 1831 of file bufmgr.c.

References Assert, BgWriterStats, binaryheap_add_unordered(), binaryheap_allocate(), binaryheap_build(), binaryheap_empty, binaryheap_first(), binaryheap_free(), binaryheap_remove_first(), binaryheap_replace_first(), buftag::blockNum, CkptSortItem::blockNum, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_PERMANENT, CkptSortItem::buf_id, BUF_WRITTEN, CHECKPOINT_END_OF_RECOVERY, checkpoint_flush_after, CHECKPOINT_FLUSH_ALL, CHECKPOINT_IS_SHUTDOWN, CheckpointStats, CheckpointWriteDelay(), ckpt_buforder_comparator(), CheckpointStatsData::ckpt_bufs_written, CkptBufferIds, CurrentResourceOwner, DatumGetPointer, buftag::forkNum, CkptSortItem::forkNum, GetBufferDescriptor, i, CkptTsStatus::index, InvalidOid, IssuePendingWritebacks(), LockBufHdr(), PgStat_MsgBgWriter::m_buf_written_checkpoints, NBuffers, CkptTsStatus::num_scanned, CkptTsStatus::num_to_scan, palloc(), pfree(), pg_atomic_read_u32(), PointerGetDatum, ProcessProcSignalBarrier(), ProcSignalBarrierPending, CkptTsStatus::progress, CkptTsStatus::progress_slice, qsort, RelFileNode::relNode, CkptSortItem::relNode, repalloc(), ResourceOwnerEnlargeBuffers(), buftag::rnode, RelFileNode::spcNode, BufferDesc::state, SyncOneBuffer(), BufferDesc::tag, ts_ckpt_progress_comparator(), CkptTsStatus::tsId, CkptSortItem::tsId, UnlockBufHdr, and WritebackContextInit().

Referenced by CheckPointBuffers().

1832 {
1833  uint32 buf_state;
1834  int buf_id;
1835  int num_to_scan;
1836  int num_spaces;
1837  int num_processed;
1838  int num_written;
1839  CkptTsStatus *per_ts_stat = NULL;
1840  Oid last_tsid;
1841  binaryheap *ts_heap;
1842  int i;
1843  int mask = BM_DIRTY;
1844  WritebackContext wb_context;
1845 
1846  /* Make sure we can handle the pin inside SyncOneBuffer */
1847  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1848 
1849  /*
1850  * Unless this is a shutdown checkpoint or we have been explicitly told,
1851  * we write only permanent, dirty buffers. But at shutdown or end of
1852  * recovery, we write all dirty buffers.
1853  */
1854  if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
1855  CHECKPOINT_FLUSH_ALL))))
1856  mask |= BM_PERMANENT;
1857 
1858  /*
1859  * Loop over all buffers, and mark the ones that need to be written with
1860  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
1861  * can estimate how much work needs to be done.
1862  *
1863  * This allows us to write only those pages that were dirty when the
1864  * checkpoint began, and not those that get dirtied while it proceeds.
1865  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
1866  * later in this function, or by normal backends or the bgwriter cleaning
1867  * scan, the flag is cleared. Any buffer dirtied after this point won't
1868  * have the flag set.
1869  *
1870  * Note that if we fail to write some buffer, we may leave buffers with
1871  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
1872  * certainly need to be written for the next checkpoint attempt, too.
1873  */
1874  num_to_scan = 0;
1875  for (buf_id = 0; buf_id < NBuffers; buf_id++)
1876  {
1877  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
1878 
1879  /*
1880  * Header spinlock is enough to examine BM_DIRTY, see comment in
1881  * SyncOneBuffer.
1882  */
1883  buf_state = LockBufHdr(bufHdr);
1884 
1885  if ((buf_state & mask) == mask)
1886  {
1887  CkptSortItem *item;
1888 
1889  buf_state |= BM_CHECKPOINT_NEEDED;
1890 
1891  item = &CkptBufferIds[num_to_scan++];
1892  item->buf_id = buf_id;
1893  item->tsId = bufHdr->tag.rnode.spcNode;
1894  item->relNode = bufHdr->tag.rnode.relNode;
1895  item->forkNum = bufHdr->tag.forkNum;
1896  item->blockNum = bufHdr->tag.blockNum;
1897  }
1898 
1899  UnlockBufHdr(bufHdr, buf_state);
1900 
1901  /* Check for barrier events in case NBuffers is large. */
1902  if (ProcSignalBarrierPending)
1903  ProcessProcSignalBarrier();
1904  }
1905 
1906  if (num_to_scan == 0)
1907  return; /* nothing to do */
1908 
1909  WritebackContextInit(&wb_context, &checkpoint_flush_after);
1910 
1911  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
1912 
1913  /*
1914  * Sort buffers that need to be written to reduce the likelihood of random
1915  * IO. The sorting is also important for the implementation of balancing
1916  * writes between tablespaces. Without balancing writes we'd potentially
1917  * end up writing to the tablespaces one-by-one; possibly overloading the
1918  * underlying system.
1919  */
1920  qsort(CkptBufferIds, num_to_scan, sizeof(CkptSortItem),
1921  ckpt_buforder_comparator);
1922 
1923  num_spaces = 0;
1924 
1925  /*
1926  * Allocate progress status for each tablespace with buffers that need to
1927  * be flushed. This requires the to-be-flushed array to be sorted.
1928  */
1929  last_tsid = InvalidOid;
1930  for (i = 0; i < num_to_scan; i++)
1931  {
1932  CkptTsStatus *s;
1933  Oid cur_tsid;
1934 
1935  cur_tsid = CkptBufferIds[i].tsId;
1936 
1937  /*
1938  * Grow array of per-tablespace status structs, every time a new
1939  * tablespace is found.
1940  */
1941  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
1942  {
1943  Size sz;
1944 
1945  num_spaces++;
1946 
1947  /*
1948  * Not worth adding grow-by-power-of-2 logic here - even with a
1949  * few hundred tablespaces this should be fine.
1950  */
1951  sz = sizeof(CkptTsStatus) * num_spaces;
1952 
1953  if (per_ts_stat == NULL)
1954  per_ts_stat = (CkptTsStatus *) palloc(sz);
1955  else
1956  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
1957 
1958  s = &per_ts_stat[num_spaces - 1];
1959  memset(s, 0, sizeof(*s));
1960  s->tsId = cur_tsid;
1961 
1962  /*
1963  * The first buffer in this tablespace. As CkptBufferIds is sorted
1964  * by tablespace all (s->num_to_scan) buffers in this tablespace
1965  * will follow afterwards.
1966  */
1967  s->index = i;
1968 
1969  /*
1970  * progress_slice will be determined once we know how many buffers
1971  * are in each tablespace, i.e. after this loop.
1972  */
1973 
1974  last_tsid = cur_tsid;
1975  }
1976  else
1977  {
1978  s = &per_ts_stat[num_spaces - 1];
1979  }
1980 
1981  s->num_to_scan++;
1982 
1983  /* Check for barrier events. */
1984  if (ProcSignalBarrierPending)
1985  ProcessProcSignalBarrier();
1986  }
1987 
1988  Assert(num_spaces > 0);
1989 
1990  /*
1991  * Build a min-heap over the write-progress in the individual tablespaces,
1992  * and compute how large a portion of the total progress a single
1993  * processed buffer is.
1994  */
1995  ts_heap = binaryheap_allocate(num_spaces,
1996  ts_ckpt_progress_comparator,
1997  NULL);
1998 
1999  for (i = 0; i < num_spaces; i++)
2000  {
2001  CkptTsStatus *ts_stat = &per_ts_stat[i];
2002 
2003  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
2004 
2005  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
2006  }
2007 
2008  binaryheap_build(ts_heap);
2009 
2010  /*
2011  * Iterate through to-be-checkpointed buffers and write the ones (still)
2012  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
2013  * tablespaces; otherwise the sorting would lead to only one tablespace
2014  * receiving writes at a time, making inefficient use of the hardware.
2015  */
2016  num_processed = 0;
2017  num_written = 0;
2018  while (!binaryheap_empty(ts_heap))
2019  {
2020  BufferDesc *bufHdr = NULL;
2021  CkptTsStatus *ts_stat = (CkptTsStatus *)
2022  DatumGetPointer(binaryheap_first(ts_heap));
2023 
2024  buf_id = CkptBufferIds[ts_stat->index].buf_id;
2025  Assert(buf_id != -1);
2026 
2027  bufHdr = GetBufferDescriptor(buf_id);
2028 
2029  num_processed++;
2030 
2031  /*
2032  * We don't need to acquire the lock here, because we're only looking
2033  * at a single bit. It's possible that someone else writes the buffer
2034  * and clears the flag right after we check, but that doesn't matter
2035  * since SyncOneBuffer will then do nothing. However, there is a
2036  * further race condition: it's conceivable that between the time we
2037  * examine the bit here and the time SyncOneBuffer acquires the lock,
2038  * someone else not only wrote the buffer but replaced it with another
2039  * page and dirtied it. In that improbable case, SyncOneBuffer will
2040  * write the buffer though we didn't need to. It doesn't seem worth
2041  * guarding against this, though.
2042  */
2043  if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
2044  {
2045  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
2046  {
2047  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
2048  BgWriterStats.m_buf_written_checkpoints++;
2049  num_written++;
2050  }
2051  }
2052 
2053  /*
2054  * Measure progress independent of actually having to flush the buffer
2055  * - otherwise writing become unbalanced.
2056  */
2057  ts_stat->progress += ts_stat->progress_slice;
2058  ts_stat->num_scanned++;
2059  ts_stat->index++;
2060 
2061  /* Have all the buffers from the tablespace been processed? */
2062  if (ts_stat->num_scanned == ts_stat->num_to_scan)
2063  {
2064  binaryheap_remove_first(ts_heap);
2065  }
2066  else
2067  {
2068  /* update heap with the new progress */
2069  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
2070  }
2071 
2072  /*
2073  * Sleep to throttle our I/O rate.
2074  *
2075  * (This will check for barrier events even if it doesn't sleep.)
2076  */
2077  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
2078  }
2079 
2080  /* issue all pending flushes */
2081  IssuePendingWritebacks(&wb_context);
2082 
2083  pfree(per_ts_stat);
2084  per_ts_stat = NULL;
2085  binaryheap_free(ts_heap);
2086 
2087  /*
2088  * Update checkpoint statistics. As noted above, this doesn't include
2089  * buffers written by other backends or bgwriter scan.
2090  */
2091  CheckpointStats.ckpt_bufs_written += num_written;
2092 
2093  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
2094 }
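
To see the balancing arithmetic at work, consider a small standalone sketch (hypothetical, not part of bufmgr.c). Suppose two tablespaces hold 800 and 200 of num_to_scan = 1000 to-be-written buffers, giving progress_slice values of 1000/800 = 1.25 and 1000/200 = 5.0. Always writing from the least-progressed tablespace, as the min-heap does, interleaves the writes roughly 4:1 and finishes both tablespaces at the same time:

    #include <stdio.h>

    int
    main(void)
    {
        double      num_to_scan = 1000.0;
        double      to_scan[2] = {800.0, 200.0};    /* buffers per tablespace */
        double      slice[2];
        double      progress[2] = {0.0, 0.0};
        int         written[2] = {0, 0};
        int         i;

        for (i = 0; i < 2; i++)
            slice[i] = num_to_scan / to_scan[i];    /* 1.25 and 5.0 */

        /* stand-in for the min-heap: pick the least-progressed tablespace */
        for (i = 0; i < 1000; i++)
        {
            int         ts = (progress[0] <= progress[1]) ? 0 : 1;

            progress[ts] += slice[ts];
            written[ts]++;
        }

        /* prints: ts0: 800 writes, ts1: 200 writes */
        printf("ts0: %d writes, ts1: %d writes\n", written[0], written[1]);
        return 0;
    }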

◆ buffertag_comparator()

static int buffertag_comparator ( const void *  p1,
const void *  p2 
)
static

Definition at line 4370 of file bufmgr.c.

References buftag::blockNum, buftag::forkNum, buftag::rnode, and rnode_comparator().

Referenced by IssuePendingWritebacks().

4371 {
4372  const BufferTag *ba = (const BufferTag *) a;
4373  const BufferTag *bb = (const BufferTag *) b;
4374  int ret;
4375 
4376  ret = rnode_comparator(&ba->rnode, &bb->rnode);
4377 
4378  if (ret != 0)
4379  return ret;
4380 
4381  if (ba->forkNum < bb->forkNum)
4382  return -1;
4383  if (ba->forkNum > bb->forkNum)
4384  return 1;
4385 
4386  if (ba->blockNum < bb->blockNum)
4387  return -1;
4388  if (ba->blockNum > bb->blockNum)
4389  return 1;
4390 
4391  return 0;
4392 }
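
As a usage sketch (illustrative only; the comparator is static, so this pattern is available only within bufmgr.c), an array of BufferTags is put into (rnode, fork, block) order with a plain qsort call, exactly as IssuePendingWritebacks() does for its pending-writeback queue:

    BufferTag   tags[WRITEBACK_MAX_PENDING_FLUSHES];
    int         ntags = 0;

    /* ... fill tags[0 .. ntags - 1] ... */
    qsort(tags, ntags, sizeof(BufferTag), buffertag_comparator);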

◆ BufmgrCommit()

void BufmgrCommit ( void  )

Definition at line 2647 of file bufmgr.c.

Referenced by PrepareTransaction(), and RecordTransactionCommit().

2648 {
2649  /* Nothing to do in bufmgr anymore... */
2650 }

◆ CheckForBufferLeaks()

static void CheckForBufferLeaks ( void  )
static

Definition at line 2553 of file bufmgr.c.

References Assert, PrivateRefCountEntry::buffer, hash_seq_init(), hash_seq_search(), i, InvalidBuffer, PrintBufferLeakWarning(), PrivateRefCountArray, PrivateRefCountOverflowed, and REFCOUNT_ARRAY_ENTRIES.

Referenced by AtEOXact_Buffers(), and AtProcExit_Buffers().

2554 {
2555 #ifdef USE_ASSERT_CHECKING
2556  int RefCountErrors = 0;
2557  PrivateRefCountEntry *res;
2558  int i;
2559 
2560  /* check the array */
2561  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
2562  {
2563  res = &PrivateRefCountArray[i];
2564 
2565  if (res->buffer != InvalidBuffer)
2566  {
2567  PrintBufferLeakWarning(res->buffer);
2568  RefCountErrors++;
2569  }
2570  }
2571 
2572  /* if necessary search the hash */
2573  if (PrivateRefCountOverflowed)
2574  {
2575  HASH_SEQ_STATUS hstat;
2576 
2577  hash_seq_init(&hstat, PrivateRefCountHash);
2578  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
2579  {
2580  PrintBufferLeakWarning(res->buffer);
2581  RefCountErrors++;
2582  }
2583 
2584  }
2585 
2586  Assert(RefCountErrors == 0);
2587 #endif
2588 }

◆ CheckPointBuffers()

void CheckPointBuffers ( int  flags)

Definition at line 2637 of file bufmgr.c.

References BufferSync().

Referenced by CheckPointGuts().

2638 {
2639  BufferSync(flags);
2640 }

◆ ckpt_buforder_comparator()

static int ckpt_buforder_comparator ( const void *  pa,
const void *  pb 
)
static

Definition at line 4401 of file bufmgr.c.

References CkptSortItem::blockNum, CkptSortItem::forkNum, CkptSortItem::relNode, and CkptSortItem::tsId.

Referenced by BufferSync().

4402 {
4403  const CkptSortItem *a = (const CkptSortItem *) pa;
4404  const CkptSortItem *b = (const CkptSortItem *) pb;
4405 
4406  /* compare tablespace */
4407  if (a->tsId < b->tsId)
4408  return -1;
4409  else if (a->tsId > b->tsId)
4410  return 1;
4411  /* compare relation */
4412  if (a->relNode < b->relNode)
4413  return -1;
4414  else if (a->relNode > b->relNode)
4415  return 1;
4416  /* compare fork */
4417  else if (a->forkNum < b->forkNum)
4418  return -1;
4419  else if (a->forkNum > b->forkNum)
4420  return 1;
4421  /* compare block number */
4422  else if (a->blockNum < b->blockNum)
4423  return -1;
4424  else if (a->blockNum > b->blockNum)
4425  return 1;
4426  /* equal page IDs are unlikely, but not impossible */
4427  return 0;
4428 }

◆ ConditionalLockBuffer()

bool ConditionalLockBuffer ( Buffer  buffer)

Definition at line 3776 of file bufmgr.c.

References Assert, buf, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsPinned, GetBufferDescriptor, LW_EXCLUSIVE, and LWLockConditionalAcquire().

Referenced by _bt_conditionallockbuf(), BloomNewBuffer(), ConditionalLockBufferForCleanup(), GinNewBuffer(), gistNewBuffer(), RelationGetBufferForTuple(), spgdoinsert(), SpGistGetBuffer(), SpGistNewBuffer(), and SpGistUpdateMetaPage().

3777 {
3778  BufferDesc *buf;
3779 
3780  Assert(BufferIsPinned(buffer));
3781  if (BufferIsLocal(buffer))
3782  return true; /* act as though we got it */
3783 
3784  buf = GetBufferDescriptor(buffer - 1);
3785 
3786  return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
3787  LW_EXCLUSIVE);
3788 }
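
A typical calling pattern, sketched on the assumption that the caller can simply skip the page on contention (as SpGistGetBuffer() and similar callers do). The buffer must already be pinned; a false return only means someone else holds the content lock right now:

    Buffer      buf = ReadBuffer(rel, blkno);   /* pin the page */

    if (ConditionalLockBuffer(buf))
    {
        /* got the exclusive content lock without waiting; use the page */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    }
    ReleaseBuffer(buf);     /* on contention, the caller tries another page */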

◆ ConditionalLockBufferForCleanup()

bool ConditionalLockBufferForCleanup ( Buffer  buffer)

Definition at line 3944 of file bufmgr.c.

References Assert, BUF_STATE_GET_REFCOUNT, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid, ConditionalLockBuffer(), GetBufferDescriptor, GetPrivateRefCount(), LocalRefCount, LockBuffer(), LockBufHdr(), PrivateRefCountEntry::refcount, and UnlockBufHdr.

Referenced by _hash_finish_split(), _hash_getbuf_with_condlock_cleanup(), heap_page_prune_opt(), lazy_scan_heap(), and lazy_vacuum_heap().

3945 {
3946  BufferDesc *bufHdr;
3947  uint32 buf_state,
3948  refcount;
3949 
3950  Assert(BufferIsValid(buffer));
3951 
3952  if (BufferIsLocal(buffer))
3953  {
3954  refcount = LocalRefCount[-buffer - 1];
3955  /* There should be exactly one pin */
3956  Assert(refcount > 0);
3957  if (refcount != 1)
3958  return false;
3959  /* Nobody else to wait for */
3960  return true;
3961  }
3962 
3963  /* There should be exactly one local pin */
3964  refcount = GetPrivateRefCount(buffer);
3965  Assert(refcount);
3966  if (refcount != 1)
3967  return false;
3968 
3969  /* Try to acquire lock */
3970  if (!ConditionalLockBuffer(buffer))
3971  return false;
3972 
3973  bufHdr = GetBufferDescriptor(buffer - 1);
3974  buf_state = LockBufHdr(bufHdr);
3975  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
3976 
3977  Assert(refcount > 0);
3978  if (refcount == 1)
3979  {
3980  /* Successfully acquired exclusive lock with pincount 1 */
3981  UnlockBufHdr(bufHdr, buf_state);
3982  return true;
3983  }
3984 
3985  /* Failed, so release the lock */
3986  UnlockBufHdr(bufHdr, buf_state);
3987  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3988  return false;
3989 }
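
A minimal sketch of the opportunistic pattern used by heap_page_prune_opt(): the cleanup is optional, so if the super-exclusive lock is not available immediately the caller just moves on:

    if (ConditionalLockBufferForCleanup(buffer))
    {
        /* sole pinner holding the exclusive lock: safe to prune the page */
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    }
    /* else: another backend is using the page; skip the optional cleanup */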

◆ DropDatabaseBuffers()

void DropDatabaseBuffers ( Oid  dbid)

Definition at line 3147 of file bufmgr.c.

References buftag::blockNum, buf, BufferDescriptorGetBuffer, RelFileNode::dbNode, elog, buftag::forkNum, BufferDesc::freeNext, GetBufferDescriptor, GetPrivateRefCount(), i, InvalidateBuffer(), InvalidBackendId, LockBufHdr(), LOG, NBuffers, relpathbackend, relpathperm, buftag::rnode, BufferDesc::tag, and UnlockBufHdr.

Referenced by dbase_redo(), dropdb(), and movedb().

3148 {
3149  int i;
3150 
3151  /*
3152  * We needn't consider local buffers, since by assumption the target
3153  * database isn't our own.
3154  */
3155 
3156  for (i = 0; i < NBuffers; i++)
3157  {
3158  BufferDesc *bufHdr = GetBufferDescriptor(i);
3159  uint32 buf_state;
3160 
3161  /*
3162  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3163  * and saves some cycles.
3164  */
3165  if (bufHdr->tag.rnode.dbNode != dbid)
3166  continue;
3167 
3168  buf_state = LockBufHdr(bufHdr);
3169  if (bufHdr->tag.rnode.dbNode == dbid)
3170  InvalidateBuffer(bufHdr); /* releases spinlock */
3171  else
3172  UnlockBufHdr(bufHdr, buf_state);
3173  }
3174 }

◆ DropRelFileNodeBuffers()

void DropRelFileNodeBuffers ( RelFileNodeBackend  rnode,
ForkNumber forkNum,
int  nforks,
BlockNumber firstDelBlock 
)

Definition at line 2975 of file bufmgr.c.

References RelFileNodeBackend::backend, buftag::blockNum, DropRelFileNodeLocalBuffers(), buftag::forkNum, GetBufferDescriptor, i, InvalidateBuffer(), LockBufHdr(), MyBackendId, NBuffers, RelFileNodeBackend::node, RelFileNodeBackendIsTemp, RelFileNodeEquals, buftag::rnode, BufferDesc::tag, and UnlockBufHdr.

Referenced by smgrtruncate().

2977 {
2978  int i;
2979  int j;
2980 
2981  /* If it's a local relation, it's localbuf.c's problem. */
2982  if (RelFileNodeBackendIsTemp(rnode))
2983  {
2984  if (rnode.backend == MyBackendId)
2985  {
2986  for (j = 0; j < nforks; j++)
2987  DropRelFileNodeLocalBuffers(rnode.node, forkNum[j],
2988  firstDelBlock[j]);
2989  }
2990  return;
2991  }
2992 
2993  for (i = 0; i < NBuffers; i++)
2994  {
2995  BufferDesc *bufHdr = GetBufferDescriptor(i);
2996  uint32 buf_state;
2997 
2998  /*
2999  * We can make this a tad faster by prechecking the buffer tag before
3000  * we attempt to lock the buffer; this saves a lot of lock
3001  * acquisitions in typical cases. It should be safe because the
3002  * caller must have AccessExclusiveLock on the relation, or some other
3003  * reason to be certain that no one is loading new pages of the rel
3004  * into the buffer pool. (Otherwise we might well miss such pages
3005  * entirely.) Therefore, while the tag might be changing while we
3006  * look at it, it can't be changing *to* a value we care about, only
3007  * *away* from such a value. So false negatives are impossible, and
3008  * false positives are safe because we'll recheck after getting the
3009  * buffer lock.
3010  *
3011  * We could check forkNum and blockNum as well as the rnode, but the
3012  * incremental win from doing so seems small.
3013  */
3014  if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
3015  continue;
3016 
3017  buf_state = LockBufHdr(bufHdr);
3018 
3019  for (j = 0; j < nforks; j++)
3020  {
3021  if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
3022  bufHdr->tag.forkNum == forkNum[j] &&
3023  bufHdr->tag.blockNum >= firstDelBlock[j])
3024  {
3025  InvalidateBuffer(bufHdr); /* releases spinlock */
3026  break;
3027  }
3028  }
3029  if (j >= nforks)
3030  UnlockBufHdr(bufHdr, buf_state);
3031  }
3032 }
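
The unlocked precheck above is a double-checked idiom that recurs throughout this file. Distilled into a minimal sketch (target_rnode is a hypothetical stand-in for whatever the caller is dropping), the shape is: a cheap, possibly stale read filters out almost every buffer, and the spinlocked recheck makes the authoritative decision:

    /*
     * Cheap precheck without the header spinlock. It may read a stale tag,
     * but under the caller's lock the tag can only change *away* from
     * target_rnode, so no buffer of interest is ever missed.
     */
    if (!RelFileNodeEquals(bufHdr->tag.rnode, target_rnode))
        continue;

    /* authoritative recheck with the header spinlock held */
    buf_state = LockBufHdr(bufHdr);
    if (RelFileNodeEquals(bufHdr->tag.rnode, target_rnode))
        InvalidateBuffer(bufHdr);   /* releases spinlock */
    else
        UnlockBufHdr(bufHdr, buf_state);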

◆ DropRelFileNodesAllBuffers()

void DropRelFileNodesAllBuffers ( RelFileNodeBackend rnodes,
int  nnodes 
)

Definition at line 3044 of file bufmgr.c.

References DropRelFileNodeAllLocalBuffers(), GetBufferDescriptor, i, InvalidateBuffer(), LockBufHdr(), MyBackendId, NBuffers, RelFileNodeBackend::node, palloc(), pfree(), pg_qsort(), RelFileNodeBackendIsTemp, RelFileNodeEquals, RELS_BSEARCH_THRESHOLD, buftag::rnode, rnode_comparator(), BufferDesc::tag, and UnlockBufHdr.

Referenced by smgrdounlinkall().

3045 {
3046  int i,
3047  n = 0;
3048  RelFileNode *nodes;
3049  bool use_bsearch;
3050 
3051  if (nnodes == 0)
3052  return;
3053 
3054  nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
3055 
3056  /* If it's a local relation, it's localbuf.c's problem. */
3057  for (i = 0; i < nnodes; i++)
3058  {
3059  if (RelFileNodeBackendIsTemp(rnodes[i]))
3060  {
3061  if (rnodes[i].backend == MyBackendId)
3062  DropRelFileNodeAllLocalBuffers(rnodes[i].node);
3063  }
3064  else
3065  nodes[n++] = rnodes[i].node;
3066  }
3067 
3068  /*
3069  * If there are no non-local relations, then we're done. Release the
3070  * memory and return.
3071  */
3072  if (n == 0)
3073  {
3074  pfree(nodes);
3075  return;
3076  }
3077 
3078  /*
3079  * For low number of relations to drop just use a simple walk through, to
3080  * save the bsearch overhead. The threshold to use is more of a guess than
3081  * an exactly determined value, as it depends on many factors (CPU and RAM
3082  * speeds, amount of shared buffers etc.).
3083  */
3084  use_bsearch = n > RELS_BSEARCH_THRESHOLD;
3085 
3086  /* sort the list of rnodes if necessary */
3087  if (use_bsearch)
3088  pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
3089 
3090  for (i = 0; i < NBuffers; i++)
3091  {
3092  RelFileNode *rnode = NULL;
3093  BufferDesc *bufHdr = GetBufferDescriptor(i);
3094  uint32 buf_state;
3095 
3096  /*
3097  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3098  * and saves some cycles.
3099  */
3100 
3101  if (!use_bsearch)
3102  {
3103  int j;
3104 
3105  for (j = 0; j < n; j++)
3106  {
3107  if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
3108  {
3109  rnode = &nodes[j];
3110  break;
3111  }
3112  }
3113  }
3114  else
3115  {
3116  rnode = bsearch((const void *) &(bufHdr->tag.rnode),
3117  nodes, n, sizeof(RelFileNode),
3118  rnode_comparator);
3119 
3120 
3121  /* buffer doesn't belong to any of the given relfilenodes; skip it */
3122  if (rnode == NULL)
3123  continue;
3124 
3125  buf_state = LockBufHdr(bufHdr);
3126  if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
3127  InvalidateBuffer(bufHdr); /* releases spinlock */
3128  else
3129  UnlockBufHdr(bufHdr, buf_state);
3130  }
3131 
3132  pfree(nodes);
3133 }

◆ FlushBuffer()

static void FlushBuffer ( BufferDesc buf,
SMgrRelation  reln 
)
static

Definition at line 2721 of file bufmgr.c.

References ErrorContextCallback::arg, BufferUsage::blk_write_time, buftag::blockNum, BM_JUST_DIRTIED, BM_PERMANENT, BufferGetLSN, BufHdrGetBlock, ErrorContextCallback::callback, RelFileNode::dbNode, error_context_stack, buftag::forkNum, INSTR_TIME_ADD, INSTR_TIME_GET_MICROSEC, INSTR_TIME_SET_CURRENT, INSTR_TIME_SUBTRACT, InvalidBackendId, LockBufHdr(), RelFileNodeBackend::node, PageSetChecksumCopy(), pgBufferUsage, pgstat_count_buffer_write_time, ErrorContextCallback::previous, RelFileNode::relNode, buftag::rnode, BufferUsage::shared_blks_written, shared_buffer_write_error_callback(), SMgrRelationData::smgr_rnode, smgropen(), smgrwrite(), RelFileNode::spcNode, StartBufferIO(), BufferDesc::tag, TerminateBufferIO(), track_io_timing, UnlockBufHdr, and XLogFlush().

Referenced by BufferAlloc(), FlushDatabaseBuffers(), FlushOneBuffer(), FlushRelationBuffers(), FlushRelationsAllBuffers(), and SyncOneBuffer().

2722 {
2723  XLogRecPtr recptr;
2724  ErrorContextCallback errcallback;
2725  instr_time io_start,
2726  io_time;
2727  Block bufBlock;
2728  char *bufToWrite;
2729  uint32 buf_state;
2730 
2731  /*
2732  * Acquire the buffer's io_in_progress lock. If StartBufferIO returns
2733  * false, then someone else flushed the buffer before we could, so we need
2734  * not do anything.
2735  */
2736  if (!StartBufferIO(buf, false))
2737  return;
2738 
2739  /* Setup error traceback support for ereport() */
2740  errcallback.callback = shared_buffer_write_error_callback;
2741  errcallback.arg = (void *) buf;
2742  errcallback.previous = error_context_stack;
2743  error_context_stack = &errcallback;
2744 
2745  /* Find smgr relation for buffer */
2746  if (reln == NULL)
2747  reln = smgropen(buf->tag.rnode, InvalidBackendId);
2748 
2749  TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
2750  buf->tag.blockNum,
2751  reln->smgr_rnode.node.spcNode,
2752  reln->smgr_rnode.node.dbNode,
2753  reln->smgr_rnode.node.relNode);
2754 
2755  buf_state = LockBufHdr(buf);
2756 
2757  /*
2758  * Run PageGetLSN while holding header lock, since we don't have the
2759  * buffer locked exclusively in all cases.
2760  */
2761  recptr = BufferGetLSN(buf);
2762 
2763  /* To check if block content changes while flushing. - vadim 01/17/97 */
2764  buf_state &= ~BM_JUST_DIRTIED;
2765  UnlockBufHdr(buf, buf_state);
2766 
2767  /*
2768  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
2769  * rule that log updates must hit disk before any of the data-file changes
2770  * they describe do.
2771  *
2772  * However, this rule does not apply to unlogged relations, which will be
2773  * lost after a crash anyway. Most unlogged relation pages do not bear
2774  * LSNs since we never emit WAL records for them, and therefore flushing
2775  * up through the buffer LSN would be useless, but harmless. However,
2776  * GiST indexes use LSNs internally to track page-splits, and therefore
2777  * unlogged GiST pages bear "fake" LSNs generated by
2778  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
2779  * LSN counter could advance past the WAL insertion point; and if it did
2780  * happen, attempting to flush WAL through that location would fail, with
2781  * disastrous system-wide consequences. To make sure that can't happen,
2782  * skip the flush if the buffer isn't permanent.
2783  */
2784  if (buf_state & BM_PERMANENT)
2785  XLogFlush(recptr);
2786 
2787  /*
2788  * Now it's safe to write buffer to disk. Note that no one else should
2789  * have been able to write it while we were busy with log flushing because
2790  * we have the io_in_progress lock.
2791  */
2792  bufBlock = BufHdrGetBlock(buf);
2793 
2794  /*
2795  * Update page checksum if desired. Since we have only shared lock on the
2796  * buffer, other processes might be updating hint bits in it, so we must
2797  * copy the page to private storage if we do checksumming.
2798  */
2799  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
2800 
2801  if (track_io_timing)
2802  INSTR_TIME_SET_CURRENT(io_start);
2803 
2804  /*
2805  * bufToWrite is either the shared buffer or a copy, as appropriate.
2806  */
2807  smgrwrite(reln,
2808  buf->tag.forkNum,
2809  buf->tag.blockNum,
2810  bufToWrite,
2811  false);
2812 
2813  if (track_io_timing)
2814  {
2815  INSTR_TIME_SET_CURRENT(io_time);
2816  INSTR_TIME_SUBTRACT(io_time, io_start);
2817  pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
2818  INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
2819  }
2820 
2821  pgBufferUsage.shared_blks_written++;
2822 
2823  /*
2824  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
2825  * end the io_in_progress state.
2826  */
2827  TerminateBufferIO(buf, true, 0);
2828 
2829  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
2830  buf->tag.blockNum,
2831  reln->smgr_rnode.node.spcNode,
2832  reln->smgr_rnode.node.dbNode,
2833  reln->smgr_rnode.node.relNode);
2834 
2835  /* Pop the error context stack */
2836  error_context_stack = errcallback.previous;
2837 }

◆ FlushDatabaseBuffers()

void FlushDatabaseBuffers ( Oid  dbid)

Definition at line 3448 of file bufmgr.c.

References BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock, CurrentResourceOwner, RelFileNode::dbNode, FlushBuffer(), GetBufferDescriptor, i, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), buftag::rnode, BufferDesc::tag, UnlockBufHdr, and UnpinBuffer().

Referenced by dbase_redo().

3449 {
3450  int i;
3451  BufferDesc *bufHdr;
3452 
3453  /* Make sure we can handle the pin inside the loop */
3454  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3455 
3456  for (i = 0; i < NBuffers; i++)
3457  {
3458  uint32 buf_state;
3459 
3460  bufHdr = GetBufferDescriptor(i);
3461 
3462  /*
3463  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3464  * and saves some cycles.
3465  */
3466  if (bufHdr->tag.rnode.dbNode != dbid)
3467  continue;
3468 
3469  ReservePrivateRefCountEntry();
3470 
3471  buf_state = LockBufHdr(bufHdr);
3472  if (bufHdr->tag.rnode.dbNode == dbid &&
3473  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3474  {
3475  PinBuffer_Locked(bufHdr);
3476  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3477  FlushBuffer(bufHdr, NULL);
3478  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3479  UnpinBuffer(bufHdr, true);
3480  }
3481  else
3482  UnlockBufHdr(bufHdr, buf_state);
3483  }
3484 }

◆ FlushOneBuffer()

void FlushOneBuffer ( Buffer  buffer)

Definition at line 3491 of file bufmgr.c.

References Assert, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsPinned, FlushBuffer(), GetBufferDescriptor, and LWLockHeldByMe().

Referenced by hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), and XLogReadBufferForRedoExtended().

3492 {
3493  BufferDesc *bufHdr;
3494 
3495  /* currently not needed, but no fundamental reason not to support */
3496  Assert(!BufferIsLocal(buffer));
3497 
3498  Assert(BufferIsPinned(buffer));
3499 
3500  bufHdr = GetBufferDescriptor(buffer - 1);
3501 
3502  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
3503 
3504  FlushBuffer(bufHdr, NULL);
3505 }

◆ FlushRelationBuffers()

void FlushRelationBuffers ( Relation  rel)

Definition at line 3252 of file bufmgr.c.

References ErrorContextCallback::arg, buftag::blockNum, BM_DIRTY, BM_JUST_DIRTIED, BM_VALID, BufferDescriptorGetContentLock, ErrorContextCallback::callback, CurrentResourceOwner, error_context_stack, FlushBuffer(), buftag::forkNum, GetBufferDescriptor, GetLocalBufferDescriptor, i, local_buffer_write_error_callback(), LocalBufHdrGetBlock, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, NLocBuffer, PageSetChecksumInplace(), pg_atomic_read_u32(), pg_atomic_unlocked_write_u32(), PinBuffer_Locked(), ErrorContextCallback::previous, RelationData::rd_node, RelationData::rd_smgr, RelationOpenSmgr, RelationUsesLocalBuffers, RelFileNodeEquals, ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), buftag::rnode, smgrwrite(), BufferDesc::state, BufferDesc::tag, UnlockBufHdr, and UnpinBuffer().

Referenced by heapam_relation_copy_data(), and index_copy_data().

3253 {
3254  int i;
3255  BufferDesc *bufHdr;
3256 
3257  /* Open rel at the smgr level if not already done */
3258  RelationOpenSmgr(rel);
3259 
3260  if (RelationUsesLocalBuffers(rel))
3261  {
3262  for (i = 0; i < NLocBuffer; i++)
3263  {
3264  uint32 buf_state;
3265 
3266  bufHdr = GetLocalBufferDescriptor(i);
3267  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3268  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
3269  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3270  {
3271  ErrorContextCallback errcallback;
3272  Page localpage;
3273 
3274  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
3275 
3276  /* Setup error traceback support for ereport() */
3277  errcallback.callback = local_buffer_write_error_callback;
3278  errcallback.arg = (void *) bufHdr;
3279  errcallback.previous = error_context_stack;
3280  error_context_stack = &errcallback;
3281 
3282  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
3283 
3284  smgrwrite(rel->rd_smgr,
3285  bufHdr->tag.forkNum,
3286  bufHdr->tag.blockNum,
3287  localpage,
3288  false);
3289 
3290  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
3291  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
3292 
3293  /* Pop the error context stack */
3294  error_context_stack = errcallback.previous;
3295  }
3296  }
3297 
3298  return;
3299  }
3300 
3301  /* Make sure we can handle the pin inside the loop */
3302  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3303 
3304  for (i = 0; i < NBuffers; i++)
3305  {
3306  uint32 buf_state;
3307 
3308  bufHdr = GetBufferDescriptor(i);
3309 
3310  /*
3311  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3312  * and saves some cycles.
3313  */
3314  if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
3315  continue;
3316 
3317  ReservePrivateRefCountEntry();
3318 
3319  buf_state = LockBufHdr(bufHdr);
3320  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3321  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3322  {
3323  PinBuffer_Locked(bufHdr);
3324  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3325  FlushBuffer(bufHdr, rel->rd_smgr);
3326  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3327  UnpinBuffer(bufHdr, true);
3328  }
3329  else
3330  UnlockBufHdr(bufHdr, buf_state);
3331  }
3332 }

◆ FlushRelationsAllBuffers()

void FlushRelationsAllBuffers ( SMgrRelation smgrs,
int  nrels 
)

Definition at line 3344 of file bufmgr.c.

References Assert, BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock, CurrentResourceOwner, FlushBuffer(), GetBufferDescriptor, i, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, RelFileNodeBackend::node, palloc(), pfree(), pg_qsort(), PinBuffer_Locked(), RelFileNodeBackendIsTemp, RelFileNodeEquals, RELS_BSEARCH_THRESHOLD, ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), buftag::rnode, SMgrSortArray::rnode, rnode_comparator(), SMgrRelationData::smgr_rnode, SMgrSortArray::srel, BufferDesc::tag, UnlockBufHdr, and UnpinBuffer().

Referenced by smgrdosyncall().

3345 {
3346  int i;
3347  SMgrSortArray *srels;
3348  bool use_bsearch;
3349 
3350  if (nrels == 0)
3351  return;
3352 
3353  /* fill-in array for qsort */
3354  srels = palloc(sizeof(SMgrSortArray) * nrels);
3355 
3356  for (i = 0; i < nrels; i++)
3357  {
3358  Assert(!RelFileNodeBackendIsTemp(smgrs[i]->smgr_rnode));
3359 
3360  srels[i].rnode = smgrs[i]->smgr_rnode.node;
3361  srels[i].srel = smgrs[i];
3362  }
3363 
3364  /*
3365  * Save the bsearch overhead for low number of relations to sync. See
3366  * DropRelFileNodesAllBuffers for details.
3367  */
3368  use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
3369 
3370  /* sort the list of SMgrRelations if necessary */
3371  if (use_bsearch)
3372  pg_qsort(srels, nrels, sizeof(SMgrSortArray), rnode_comparator);
3373 
3374  /* Make sure we can handle the pin inside the loop */
3375  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3376 
3377  for (i = 0; i < NBuffers; i++)
3378  {
3379  SMgrSortArray *srelent = NULL;
3380  BufferDesc *bufHdr = GetBufferDescriptor(i);
3381  uint32 buf_state;
3382 
3383  /*
3384  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3385  * and saves some cycles.
3386  */
3387 
3388  if (!use_bsearch)
3389  {
3390  int j;
3391 
3392  for (j = 0; j < nrels; j++)
3393  {
3394  if (RelFileNodeEquals(bufHdr->tag.rnode, srels[j].rnode))
3395  {
3396  srelent = &srels[j];
3397  break;
3398  }
3399  }
3400 
3401  }
3402  else
3403  {
3404  srelent = bsearch((const void *) &(bufHdr->tag.rnode),
3405  srels, nrels, sizeof(SMgrSortArray),
3406  rnode_comparator);
3407  }
3408 
3409  /* buffer doesn't belong to any of the given relfilenodes; skip it */
3410  if (srelent == NULL)
3411  continue;
3412 
3413  ReservePrivateRefCountEntry();
3414 
3415  buf_state = LockBufHdr(bufHdr);
3416  if (RelFileNodeEquals(bufHdr->tag.rnode, srelent->rnode) &&
3417  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3418  {
3419  PinBuffer_Locked(bufHdr);
3420  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3421  FlushBuffer(bufHdr, srelent->srel);
3422  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3423  UnpinBuffer(bufHdr, true);
3424  }
3425  else
3426  UnlockBufHdr(bufHdr, buf_state);
3427  }
3428 
3429  pfree(srels);
3430 }

◆ ForgetPrivateRefCountEntry()

static void ForgetPrivateRefCountEntry ( PrivateRefCountEntry ref)
static

Definition at line 402 of file bufmgr.c.

References Assert, PrivateRefCountEntry::buffer, HASH_REMOVE, hash_search(), InvalidBuffer, PrivateRefCountArray, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, and REFCOUNT_ARRAY_ENTRIES.

Referenced by UnpinBuffer().

403 {
404  Assert(ref->refcount == 0);
405 
406  if (ref >= &PrivateRefCountArray[0] &&
407  ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
408  {
409  ref->buffer = InvalidBuffer;
410 
411  /*
412  * Mark the just used entry as reserved - in many scenarios that
413  * allows us to avoid ever having to search the array/hash for free
414  * entries.
415  */
416  ReservedRefCountEntry = ref;
417  }
418  else
419  {
420  bool found;
421  Buffer buffer = ref->buffer;
422 
423  hash_search(PrivateRefCountHash,
424  (void *) &buffer,
425  HASH_REMOVE,
426  &found);
427  Assert(found);
428  Assert(PrivateRefCountOverflowed > 0);
429  PrivateRefCountOverflowed--;
430  }
431 }

◆ GetPrivateRefCount()

static int32 GetPrivateRefCount ( Buffer  buffer)
inlinestatic

Definition at line 379 of file bufmgr.c.

References Assert, BufferIsLocal, BufferIsValid, GetPrivateRefCountEntry(), and PrivateRefCountEntry::refcount.

Referenced by ConditionalLockBufferForCleanup(), DropDatabaseBuffers(), HoldingBufferPinThatDelaysRecovery(), InvalidateBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkBufferDirtyHint(), and PrintBufferLeakWarning().

380 {
381  PrivateRefCountEntry *ref;
382 
383  Assert(BufferIsValid(buffer));
384  Assert(!BufferIsLocal(buffer));
385 
386  /*
387  * Not moving the entry - that's ok for the current users, but we might
388  * want to change this one day.
389  */
390  ref = GetPrivateRefCountEntry(buffer, false);
391 
392  if (ref == NULL)
393  return 0;
394  return ref->refcount;
395 }

◆ GetPrivateRefCountEntry()

static PrivateRefCountEntry * GetPrivateRefCountEntry ( Buffer  buffer,
bool  do_move 
)
static

Definition at line 299 of file bufmgr.c.

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid, free, HASH_FIND, HASH_REMOVE, hash_search(), i, InvalidBuffer, PrivateRefCountArray, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, REFCOUNT_ARRAY_ENTRIES, ReservedRefCountEntry, and ReservePrivateRefCountEntry().

Referenced by GetPrivateRefCount(), IncrBufferRefCount(), PinBuffer(), PinBuffer_Locked(), and UnpinBuffer().

300 {
301  PrivateRefCountEntry *res;
302  int i;
303 
304  Assert(BufferIsValid(buffer));
305  Assert(!BufferIsLocal(buffer));
306 
307  /*
308  * First search for references in the array, that'll be sufficient in the
309  * majority of cases.
310  */
311  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
312  {
313  res = &PrivateRefCountArray[i];
314 
315  if (res->buffer == buffer)
316  return res;
317  }
318 
319  /*
320  * By here we know that the buffer, if already pinned, isn't residing in
321  * the array.
322  *
323  * Only look up the buffer in the hashtable if we've previously overflowed
324  * into it.
325  */
326  if (PrivateRefCountOverflowed == 0)
327  return NULL;
328 
329  res = hash_search(PrivateRefCountHash,
330  (void *) &buffer,
331  HASH_FIND,
332  NULL);
333 
334  if (res == NULL)
335  return NULL;
336  else if (!do_move)
337  {
338  /* caller doesn't want us to move the hash entry into the array */
339  return res;
340  }
341  else
342  {
343  /* move buffer from hashtable into the free array slot */
344  bool found;
345  PrivateRefCountEntry *free;
346 
347  /* Ensure there's a free array slot */
348  ReservePrivateRefCountEntry();
349 
350  /* Use up the reserved slot */
351  Assert(ReservedRefCountEntry != NULL);
352  free = ReservedRefCountEntry;
353  ReservedRefCountEntry = NULL;
354  Assert(free->buffer == InvalidBuffer);
355 
356  /* and fill it */
357  free->buffer = buffer;
358  free->refcount = res->refcount;
359 
360  /* delete from hashtable */
361  hash_search(PrivateRefCountHash,
362  (void *) &buffer,
363  HASH_REMOVE,
364  &found);
365  Assert(found);
366  Assert(PrivateRefCountOverflowed > 0);
367  PrivateRefCountOverflowed--;
368 
369  return free;
370  }
371 }
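
The array-then-hash arrangement is a generic fast-path/overflow design. Here is a standalone sketch (hypothetical names; a flat array stands in for the dynahash table) of the lookup order it implements:

    #define ARRAY_ENTRIES   8       /* mirrors REFCOUNT_ARRAY_ENTRIES */
    #define OVERFLOW_MAX    1024

    typedef struct RefEntry
    {
        int         buffer;         /* 0 means the slot is empty */
        int         refcount;
    } RefEntry;

    static RefEntry fastpath[ARRAY_ENTRIES];
    static RefEntry overflow[OVERFLOW_MAX];
    static int      n_overflowed;   /* plays the PrivateRefCountOverflowed role */

    static RefEntry *
    refcount_lookup(int buffer)
    {
        int         i;

        /* the tiny array satisfies nearly all lookups */
        for (i = 0; i < ARRAY_ENTRIES; i++)
            if (fastpath[i].buffer == buffer)
                return &fastpath[i];

        /* consult the overflow structure only if anything ever spilled */
        if (n_overflowed == 0)
            return NULL;

        for (i = 0; i < OVERFLOW_MAX; i++)
            if (overflow[i].buffer == buffer)
                return &overflow[i];
        return NULL;
    }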

◆ HoldingBufferPinThatDelaysRecovery()

bool HoldingBufferPinThatDelaysRecovery ( void  )

Definition at line 3918 of file bufmgr.c.

References GetPrivateRefCount(), and GetStartupBufferPinWaitBufId().

Referenced by CheckRecoveryConflictDeadlock(), and RecoveryConflictInterrupt().

3919 {
3920  int bufid = GetStartupBufferPinWaitBufId();
3921 
3922  /*
3923  * If we get woken slowly then it's possible that the Startup process was
3924  * already woken by other backends before we got here. It's also possible
3925  * that we get here via multiple interrupts or interrupts at inappropriate
3926  * times, so make sure we do nothing if the bufid is not set.
3927  */
3928  if (bufid < 0)
3929  return false;
3930 
3931  if (GetPrivateRefCount(bufid + 1) > 0)
3932  return true;
3933 
3934  return false;
3935 }

◆ IncrBufferRefCount()

void IncrBufferRefCount ( Buffer  buffer)

Definition at line 3549 of file bufmgr.c.

References Assert, BufferIsLocal, BufferIsPinned, CurrentResourceOwner, GetPrivateRefCountEntry(), LocalRefCount, PrivateRefCountEntry::refcount, ResourceOwnerEnlargeBuffers(), and ResourceOwnerRememberBuffer().

Referenced by _bt_steppage(), btrestrpos(), entryLoadMoreItems(), ReadBufferBI(), scanPostingTree(), startScanEntry(), and tts_buffer_heap_store_tuple().

3550 {
3551  Assert(BufferIsPinned(buffer));
3552  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3553  if (BufferIsLocal(buffer))
3554  LocalRefCount[-buffer - 1]++;
3555  else
3556  {
3557  PrivateRefCountEntry *ref;
3558 
3559  ref = GetPrivateRefCountEntry(buffer, true);
3560  Assert(ref != NULL);
3561  ref->refcount++;
3562  }
3563  ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
3564 }
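
A usage sketch (myslot is a hypothetical structure, modeled on tts_buffer_heap_store_tuple()): when a second, independently released reference to an already-pinned buffer is stored somewhere, the pin count is bumped so that each holder can later drop its own pin:

    IncrBufferRefCount(buffer);     /* caller already holds one pin */
    myslot->buffer = buffer;        /* hypothetical second reference */

    /* ... later, the second holder releases its own pin ... */
    ReleaseBuffer(myslot->buffer);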

◆ InitBufferPoolAccess()

void InitBufferPoolAccess ( void  )

Definition at line 2500 of file bufmgr.c.

References HASHCTL::entrysize, HASH_BLOBS, hash_create(), HASH_ELEM, HASHCTL::keysize, MemSet, and PrivateRefCountArray.

Referenced by BaseInit().

2501 {
2502  HASHCTL hash_ctl;
2503 
2504  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
2505 
2506  MemSet(&hash_ctl, 0, sizeof(hash_ctl));
2507  hash_ctl.keysize = sizeof(int32);
2508  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
2509 
2510  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
2511  HASH_ELEM | HASH_BLOBS);
2512 }

◆ InitBufferPoolBackend()

void InitBufferPoolBackend ( void  )

Definition at line 2524 of file bufmgr.c.

References AtProcExit_Buffers(), and on_shmem_exit().

Referenced by AuxiliaryProcessMain(), and InitPostgres().

2525 {
2526  on_shmem_exit(AtProcExit_Buffers, 0);
2527 }

◆ InvalidateBuffer()

static void InvalidateBuffer ( BufferDesc buf)
static

Definition at line 1371 of file bufmgr.c.

References Assert, BM_LOCKED, BM_TAG_VALID, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer, BUFFERTAGS_EQUAL, BufMappingPartitionLock, BufTableDelete(), BufTableHashCode(), CLEAR_BUFFERTAG, elog, ERROR, GetPrivateRefCount(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u32(), BufferDesc::state, StrategyFreeBuffer(), BufferDesc::tag, UnlockBufHdr, and WaitIO().

Referenced by DropDatabaseBuffers(), DropRelFileNodeBuffers(), and DropRelFileNodesAllBuffers().

1372 {
1373  BufferTag oldTag;
1374  uint32 oldHash; /* hash value for oldTag */
1375  LWLock *oldPartitionLock; /* buffer partition lock for it */
1376  uint32 oldFlags;
1377  uint32 buf_state;
1378 
1379  /* Save the original buffer tag before dropping the spinlock */
1380  oldTag = buf->tag;
1381 
1382  buf_state = pg_atomic_read_u32(&buf->state);
1383  Assert(buf_state & BM_LOCKED);
1384  UnlockBufHdr(buf, buf_state);
1385 
1386  /*
1387  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1388  * worth storing the hashcode in BufferDesc so we need not recompute it
1389  * here? Probably not.
1390  */
1391  oldHash = BufTableHashCode(&oldTag);
1392  oldPartitionLock = BufMappingPartitionLock(oldHash);
1393 
1394 retry:
1395 
1396  /*
1397  * Acquire exclusive mapping lock in preparation for changing the buffer's
1398  * association.
1399  */
1400  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1401 
1402  /* Re-lock the buffer header */
1403  buf_state = LockBufHdr(buf);
1404 
1405  /* If it's changed while we were waiting for lock, do nothing */
1406  if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
1407  {
1408  UnlockBufHdr(buf, buf_state);
1409  LWLockRelease(oldPartitionLock);
1410  return;
1411  }
1412 
1413  /*
1414  * We assume the only reason for it to be pinned is that someone else is
1415  * flushing the page out. Wait for them to finish. (This could be an
1416  * infinite loop if the refcount is messed up... it would be nice to time
1417  * out after awhile, but there seems no way to be sure how many loops may
1418  * be needed. Note that if the other guy has pinned the buffer but not
1419  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1420  * be busy-looping here.)
1421  */
1422  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1423  {
1424  UnlockBufHdr(buf, buf_state);
1425  LWLockRelease(oldPartitionLock);
1426  /* safety check: should definitely not be our *own* pin */
1427  if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
1428  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1429  WaitIO(buf);
1430  goto retry;
1431  }
1432 
1433  /*
1434  * Clear out the buffer's tag and flags. We must do this to ensure that
1435  * linear scans of the buffer array don't think the buffer is valid.
1436  */
1437  oldFlags = buf_state & BUF_FLAG_MASK;
1438  CLEAR_BUFFERTAG(buf->tag);
1439  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1440  UnlockBufHdr(buf, buf_state);
1441 
1442  /*
1443  * Remove the buffer from the lookup hashtable, if it was in there.
1444  */
1445  if (oldFlags & BM_TAG_VALID)
1446  BufTableDelete(&oldTag, oldHash);
1447 
1448  /*
1449  * Done with mapping lock.
1450  */
1451  LWLockRelease(oldPartitionLock);
1452 
1453  /*
1454  * Insert the buffer at the head of the list of free buffers.
1455  */
1456  StrategyFreeBuffer(buf);
1457 }

◆ IsBufferCleanupOK()

bool IsBufferCleanupOK ( Buffer  buffer)

Definition at line 4000 of file bufmgr.c.

References Assert, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsValid, GetBufferDescriptor, GetPrivateRefCount(), LocalRefCount, LockBufHdr(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), and UnlockBufHdr.

Referenced by _hash_doinsert(), _hash_expandtable(), _hash_splitbucket(), hash_xlog_split_allocate_page(), and hashbucketcleanup().

4001 {
4002  BufferDesc *bufHdr;
4003  uint32 buf_state;
4004 
4005  Assert(BufferIsValid(buffer));
4006 
4007  if (BufferIsLocal(buffer))
4008  {
4009  /* There should be exactly one pin */
4010  if (LocalRefCount[-buffer - 1] != 1)
4011  return false;
4012  /* Nobody else to wait for */
4013  return true;
4014  }
4015 
4016  /* There should be exactly one local pin */
4017  if (GetPrivateRefCount(buffer) != 1)
4018  return false;
4019 
4020  bufHdr = GetBufferDescriptor(buffer - 1);
4021 
4022  /* caller must hold exclusive lock on buffer */
4023  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
4024  LW_EXCLUSIVE));
4025 
4026  buf_state = LockBufHdr(bufHdr);
4027 
4028  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4029  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
4030  {
4031  /* pincount is OK. */
4032  UnlockBufHdr(bufHdr, buf_state);
4033  return true;
4034  }
4035 
4036  UnlockBufHdr(bufHdr, buf_state);
4037  return false;
4038 }
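
A sketch of the intended calling pattern (following hashbucketcleanup()): the caller already holds the exclusive content lock and asks whether that lock is, in effect, a cleanup lock before doing pin-sensitive page surgery:

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

    if (IsBufferCleanupOK(buf))
    {
        /* no other backend holds a pin: safe to move tuples around */
    }

    LockBuffer(buf, BUFFER_LOCK_UNLOCK);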

◆ IssuePendingWritebacks()

void IssuePendingWritebacks ( WritebackContext context)

Definition at line 4504 of file bufmgr.c.

References buftag::blockNum, buffertag_comparator(), cur, buftag::forkNum, i, InvalidBackendId, next, WritebackContext::nr_pending, WritebackContext::pending_writebacks, qsort, RelFileNodeEquals, buftag::rnode, smgropen(), smgrwriteback(), and PendingWriteback::tag.

Referenced by BufferSync(), and ScheduleBufferTagForWriteback().

4505 {
4506  int i;
4507 
4508  if (context->nr_pending == 0)
4509  return;
4510 
4511  /*
4512  * Executing the writes in-order can make them a lot faster, and allows us
4513  * to merge writeback requests to consecutive blocks into larger writebacks.
4514  */
4515  qsort(&context->pending_writebacks, context->nr_pending,
4516  sizeof(PendingWriteback), buffertag_comparator);
4517 
4518  /*
4519  * Coalesce neighbouring writes, but nothing else. For that we iterate
4520  * through the, now sorted, array of pending flushes, and look forward to
4521  * find all neighbouring (or identical) writes.
4522  */
4523  for (i = 0; i < context->nr_pending; i++)
4524  {
4525  PendingWriteback *cur;
4526  PendingWriteback *next;
4527  SMgrRelation reln;
4528  int ahead;
4529  BufferTag tag;
4530  Size nblocks = 1;
4531 
4532  cur = &context->pending_writebacks[i];
4533  tag = cur->tag;
4534 
4535  /*
4536  * Peek ahead, into following writeback requests, to see if they can
4537  * be combined with the current one.
4538  */
4539  for (ahead = 0; i + ahead + 1 < context->nr_pending; ahead++)
4540  {
4541  next = &context->pending_writebacks[i + ahead + 1];
4542 
4543  /* different file, stop */
4544  if (!RelFileNodeEquals(cur->tag.rnode, next->tag.rnode) ||
4545  cur->tag.forkNum != next->tag.forkNum)
4546  break;
4547 
4548  /* ok, block queued twice, skip */
4549  if (cur->tag.blockNum == next->tag.blockNum)
4550  continue;
4551 
4552  /* only merge consecutive writes */
4553  if (cur->tag.blockNum + 1 != next->tag.blockNum)
4554  break;
4555 
4556  nblocks++;
4557  cur = next;
4558  }
4559 
4560  i += ahead;
4561 
4562  /* and finally tell the kernel to write the data to storage */
4563  reln = smgropen(tag.rnode, InvalidBackendId);
4564  smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
4565  }
4566 
4567  context->nr_pending = 0;
4568 }
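
For context, a sketch of the surrounding writeback protocol (the function name flush_scheduled_writes and the cap of 32 are illustrative; in the server the cap comes from GUCs such as checkpoint_flush_after):

    #include "postgres.h"
    #include "storage/buf_internals.h"

    static void
    flush_scheduled_writes(BufferTag *tags, int ntags)
    {
        WritebackContext wb_context;
        int         flush_after = 32;   /* illustrative cap */
        int         i;

        WritebackContextInit(&wb_context, &flush_after);

        /* each call may itself call IssuePendingWritebacks() once full */
        for (i = 0; i < ntags; i++)
            ScheduleBufferTagForWriteback(&wb_context, &tags[i]);

        /* force out whatever is still queued */
        IssuePendingWritebacks(&wb_context);
    }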

◆ local_buffer_write_error_callback()

static void local_buffer_write_error_callback ( void *  arg)
static

Definition at line 4272 of file bufmgr.c.

References buftag::blockNum, errcontext, buftag::forkNum, MyBackendId, pfree(), relpathbackend, buftag::rnode, and BufferDesc::tag.

Referenced by FlushRelationBuffers().

4273 {
4274  BufferDesc *bufHdr = (BufferDesc *) arg;
4275 
4276  if (bufHdr != NULL)
4277  {
4278  char *path = relpathbackend(bufHdr->tag.rnode, MyBackendId,
4279  bufHdr->tag.forkNum);
4280 
4281  errcontext("writing block %u of relation %s",
4282  bufHdr->tag.blockNum, path);
4283  pfree(path);
4284  }
4285 }
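
For context, a sketch of the error-context protocol this callback serves (as it would appear inside bufmgr.c, since the callback is file-local; the write itself is elided to a comment):

    static void
    write_local_buffer_with_context(BufferDesc *bufHdr)
    {
        ErrorContextCallback errcallback;

        /* push: any error raised during the write reports the buffer */
        errcallback.callback = local_buffer_write_error_callback;
        errcallback.arg = (void *) bufHdr;
        errcallback.previous = error_context_stack;
        error_context_stack = &errcallback;

        /* ... write the block out via smgrwrite() here ... */

        /* pop */
        error_context_stack = errcallback.previous;
    }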

◆ LockBuffer()

void LockBuffer ( Buffer  buffer,
int  mode 
)

Definition at line 3750 of file bufmgr.c.

References Assert, buf, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsPinned, elog, ERROR, GetBufferDescriptor, LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), and LWLockRelease().

Referenced by _bt_lockbuf(), _bt_unlockbuf(), _bt_upgradelockbufcleanup(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_finish_split(), _hash_first(), _hash_freeovflpage(), _hash_getbuf(), _hash_getbuf_with_strategy(), _hash_getcachedmetap(), _hash_getnewbuf(), _hash_init(), _hash_kill_items(), _hash_readnext(), _hash_readpage(), _hash_readprev(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), blbulkdelete(), blgetbitmap(), blinsert(), BloomNewBuffer(), blvacuumcleanup(), brin_doinsert(), brin_doupdate(), brin_evacuate_page(), brin_getinsertbuffer(), brin_page_cleanup(), brinbuild(), brinbuildempty(), bringetbitmap(), brinGetStats(), brinGetTupleForHeapBlock(), brininsert(), brinLockRevmapPageForUpdate(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), brinsummarize(), bt_metap(), bt_page_items(), bt_page_stats(), bt_recheck_sibling_links(), checkXLogConsistency(), collect_corrupt_items(), collect_visibility_data(), collectMatchBitmap(), ConditionalLockBufferForCleanup(), count_nondeletable_pages(), entryLoadMoreItems(), fill_seq_with_data(), FreeSpaceMapPrepareTruncateRel(), fsm_readbuf(), fsm_search(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), get_raw_page_internal(), GetVisibilityMapPins(), ginbuildempty(), ginbulkdelete(), ginEntryInsert(), ginFindLeafPage(), ginFindParents(), ginFinishSplit(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), ginInsertValue(), GinNewBuffer(), ginScanToDelete(), ginStepRight(), ginTraverseLock(), ginUpdateStats(), ginvacuumcleanup(), ginVacuumPostingTreeLeaves(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistbuildempty(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfinishsplit(), gistfixsplit(), gistformdownlink(), gistGetMaxLevel(), gistinserttuples(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_compute_xid_horizon_for_tuples(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_get_latest_tid(), heap_inplace_update(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_page_prune_opt(), heap_update(), heap_xlog_visible(), heapam_index_build_range_scan(), heapam_index_fetch_tuple(), heapam_index_validate_scan(), heapam_relation_copy_for_cluster(), heapam_scan_analyze_next_block(), heapam_scan_bitmap_next_block(), heapam_scan_sample_next_tuple(), heapam_tuple_satisfies_snapshot(), heapgetpage(), heapgettup(), initBloomState(), lazy_scan_heap(), LockBufferForCleanup(), log_newpage_range(), palloc_btree_page(), pg_visibility(), pgrowlocks(), pgstat_btree_page(), pgstat_gist_page(), pgstat_heap(), pgstatginindex_internal(), pgstathashindex(), pgstatindex_impl(), read_seq_tuple(), RelationGetBufferForTuple(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), shiftList(), spgdoinsert(), spgGetCache(), SpGistNewBuffer(), spgprocesspending(), spgvacuumpage(), spgWalk(), startScanEntry(), statapprox_heap(), summarize_range(), UnlockReleaseBuffer(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), vm_readbuf(), XLogReadBufferExtended(), XLogReadBufferForRedoExtended(), and XLogRecordPageWithFreeSpace().

3751 {
3752  BufferDesc *buf;
3753 
3754  Assert(BufferIsPinned(buffer));
3755  if (BufferIsLocal(buffer))
3756  return; /* local buffers need no lock */
3757 
3758  buf = GetBufferDescriptor(buffer - 1);
3759 
3760  if (mode == BUFFER_LOCK_UNLOCK)
3761  LWLockRelease(BufferDescriptorGetContentLock(buf));
3762  else if (mode == BUFFER_LOCK_SHARE)
3763  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
3764  else if (mode == BUFFER_LOCK_EXCLUSIVE)
3765  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
3766  else
3767  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
3768 }
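
Typical usage, as a short sketch (the function inspect_block is hypothetical): pin with ReadBuffer(), take a share lock while reading the page, then drop the lock and the pin:

    #include "postgres.h"
    #include "storage/bufmgr.h"
    #include "utils/rel.h"

    static void
    inspect_block(Relation rel, BlockNumber blkno)
    {
        Buffer      buf = ReadBuffer(rel, blkno);

        LockBuffer(buf, BUFFER_LOCK_SHARE);     /* readers take LW_SHARED */
        /* ... examine BufferGetPage(buf) here ... */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);

        ReleaseBuffer(buf);                     /* drop the pin */
    }

UnlockReleaseBuffer(buf) combines the final unlock and release into one call.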

◆ LockBufferForCleanup()

void LockBufferForCleanup ( Buffer  buffer)

Definition at line 3807 of file bufmgr.c.

References Assert, BM_PIN_COUNT_WAITER, BUF_STATE_GET_REFCOUNT, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsPinned, elog, ERROR, get_ps_display(), GetBufferDescriptor, GetPrivateRefCount(), InHotStandby, LocalRefCount, LockBuffer(), LockBufHdr(), MyProcPid, palloc(), pfree(), PG_WAIT_BUFFER_PIN, ProcWaitForSignal(), ResolveRecoveryConflictWithBufferPin(), set_ps_display(), SetStartupBufferPinWaitBufId(), UnlockBufHdr, update_process_title, and BufferDesc::wait_backend_pid.

Referenced by _bt_upgradelockbufcleanup(), ginVacuumPostingTree(), hashbulkdelete(), heap_force_common(), lazy_scan_heap(), ReadBuffer_common(), and XLogReadBufferForRedoExtended().

3808 {
3809  BufferDesc *bufHdr;
3810  char *new_status = NULL;
3811 
3812  Assert(BufferIsPinned(buffer));
3813  Assert(PinCountWaitBuf == NULL);
3814 
3815  if (BufferIsLocal(buffer))
3816  {
3817  /* There should be exactly one pin */
3818  if (LocalRefCount[-buffer - 1] != 1)
3819  elog(ERROR, "incorrect local pin count: %d",
3820  LocalRefCount[-buffer - 1]);
3821  /* Nobody else to wait for */
3822  return;
3823  }
3824 
3825  /* There should be exactly one local pin */
3826  if (GetPrivateRefCount(buffer) != 1)
3827  elog(ERROR, "incorrect local pin count: %d",
3828  GetPrivateRefCount(buffer));
3829 
3830  bufHdr = GetBufferDescriptor(buffer - 1);
3831 
3832  for (;;)
3833  {
3834  uint32 buf_state;
3835 
3836  /* Try to acquire lock */
3837  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3838  buf_state = LockBufHdr(bufHdr);
3839 
3840  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3841  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3842  {
3843  /* Successfully acquired exclusive lock with pincount 1 */
3844  UnlockBufHdr(bufHdr, buf_state);
3845 
3846  /* Report change to non-waiting status */
3847  if (new_status)
3848  {
3849  set_ps_display(new_status);
3850  pfree(new_status);
3851  }
3852  return;
3853  }
3854  /* Failed, so mark myself as waiting for pincount 1 */
3855  if (buf_state & BM_PIN_COUNT_WAITER)
3856  {
3857  UnlockBufHdr(bufHdr, buf_state);
3858  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3859  elog(ERROR, "multiple backends attempting to wait for pincount 1");
3860  }
3861  bufHdr->wait_backend_pid = MyProcPid;
3862  PinCountWaitBuf = bufHdr;
3863  buf_state |= BM_PIN_COUNT_WAITER;
3864  UnlockBufHdr(bufHdr, buf_state);
3865  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3866 
3867  /* Wait to be signaled by UnpinBuffer() */
3868  if (InHotStandby)
3869  {
3870  /* Report change to waiting status */
3871  if (update_process_title && new_status == NULL)
3872  {
3873  const char *old_status;
3874  int len;
3875 
3876  old_status = get_ps_display(&len);
3877  new_status = (char *) palloc(len + 8 + 1);
3878  memcpy(new_status, old_status, len);
3879  strcpy(new_status + len, " waiting");
3880  set_ps_display(new_status);
3881  new_status[len] = '\0'; /* truncate off " waiting" */
3882  }
3883 
3884  /* Publish the bufid that Startup process waits on */
3885  SetStartupBufferPinWaitBufId(buffer - 1);
3886  /* Set alarm and then wait to be signaled by UnpinBuffer() */
3887  ResolveRecoveryConflictWithBufferPin();
3888  /* Reset the published bufid */
3889  SetStartupBufferPinWaitBufId(-1);
3890  }
3891  else
3892  ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
3893 
3894  /*
3895  * Remove flag marking us as waiter. Normally this will not be set
3896  * anymore, but ProcWaitForSignal() can return for other signals as
3897  * well. We take care to only reset the flag if we're the waiter, as
3898  * theoretically another backend could have started waiting. That's
3899  * impossible with the current usages due to table level locking, but
3900  * better be safe.
3901  */
3902  buf_state = LockBufHdr(bufHdr);
3903  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3904  bufHdr->wait_backend_pid == MyProcPid)
3905  buf_state &= ~BM_PIN_COUNT_WAITER;
3906  UnlockBufHdr(bufHdr, buf_state);
3907 
3908  PinCountWaitBuf = NULL;
3909  /* Loop back and try again */
3910  }
3911 }
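
A vacuum-style sketch (cleanup_block is hypothetical): the buffer must already be pinned exactly once by this backend, and the call blocks until that pin is the only one left:

    #include "postgres.h"
    #include "storage/bufmgr.h"
    #include "utils/rel.h"

    static void
    cleanup_block(Relation rel, BlockNumber blkno, BufferAccessStrategy strategy)
    {
        Buffer      buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                             RBM_NORMAL, strategy);

        /* blocks until we hold the exclusive lock and the sole pin */
        LockBufferForCleanup(buf);

        /* ... safe to move tuples or defragment the page here ... */

        UnlockReleaseBuffer(buf);
    }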

◆ LockBufHdr()

uint32 LockBufHdr ( BufferDesc *  desc)

Definition at line 4318 of file bufmgr.c.

References BM_LOCKED, finish_spin_delay(), init_local_spin_delay, perform_spin_delay(), pg_atomic_fetch_or_u32(), and BufferDesc::state.

Referenced by AbortBufferIO(), apw_dump_now(), BufferAlloc(), BufferGetLSNAtomic(), BufferSync(), ConditionalLockBufferForCleanup(), DropDatabaseBuffers(), DropRelFileNodeBuffers(), DropRelFileNodesAllBuffers(), FlushBuffer(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetBufferFromRing(), InvalidateBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkBufferDirtyHint(), pg_buffercache_pages(), ReadBuffer_common(), StartBufferIO(), StrategyGetBuffer(), SyncOneBuffer(), TerminateBufferIO(), UnlockBuffers(), UnpinBuffer(), and WaitIO().

4319 {
4320  SpinDelayStatus delayStatus;
4321  uint32 old_buf_state;
4322 
4323  init_local_spin_delay(&delayStatus);
4324 
4325  while (true)
4326  {
4327  /* set BM_LOCKED flag */
4328  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
4329  /* if it wasn't set before we're OK */
4330  if (!(old_buf_state & BM_LOCKED))
4331  break;
4332  perform_spin_delay(&delayStatus);
4333  }
4334  finish_spin_delay(&delayStatus);
4335  return old_buf_state | BM_LOCKED;
4336 }
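
A sketch of the lock/inspect/unlock discipline (is_dirty_and_unpinned is hypothetical; the shape mirrors SyncOneBuffer-style callers). Note that the returned word has BM_LOCKED set, and UnlockBufHdr() stores back whatever state the caller passes:

    #include "postgres.h"
    #include "storage/buf_internals.h"

    static bool
    is_dirty_and_unpinned(BufferDesc *bufHdr)
    {
        uint32      buf_state = LockBufHdr(bufHdr); /* spins until acquired */
        bool        result;

        result = (buf_state & BM_DIRTY) != 0 &&
            BUF_STATE_GET_REFCOUNT(buf_state) == 0;

        UnlockBufHdr(bufHdr, buf_state);            /* clears BM_LOCKED */
        return result;
    }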

◆ MarkBufferDirty()

void MarkBufferDirty ( Buffer  buffer)

Definition at line 1469 of file bufmgr.c.

References Assert, BM_DIRTY, BM_JUST_DIRTIED, BM_LOCKED, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsPinned, BufferIsValid, elog, ERROR, GetBufferDescriptor, LW_EXCLUSIVE, LWLockHeldByMeInMode(), MarkLocalBufferDirty(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), pgBufferUsage, BufferUsage::shared_blks_dirtied, BufferDesc::state, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, VacuumPageDirty, and WaitBufHdrUnlocked().

Referenced by _bt_clear_incomplete_split(), _bt_dedup_one_page(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_getroot(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_newroot(), _bt_restore_meta(), _bt_split(), _bt_unlink_halfdead_page(), _bt_update_meta_cleanup_info(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_freeovflpage(), _hash_init(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), addLeafTuple(), brin_doinsert(), brin_doupdate(), brin_initialize_empty_new_buffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinRevmapDesummarizeRange(), btree_xlog_dedup(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), createPostingTree(), do_setval(), doPickSplit(), fill_seq_with_data(), FreeSpaceMapPrepareTruncateRel(), generic_redo(), GenericXLogFinish(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginDeletePage(), ginHeapTupleFastInsert(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginUpdateStats(), ginVacuumPostingTreeLeaf(), gistbuild(), gistbuildempty(), gistdeletepage(), gistplacetopage(), gistprunepage(), gistRedoClearFollowRight(), gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_squeeze_page(), hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_finish_speculative(), heap_force_common(), heap_inplace_update(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_page_prune(), heap_update(), heap_xlog_clean(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_freeze_page(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_update(), heap_xlog_visible(), lazy_scan_heap(), lazy_vacuum_page(), log_newpage_range(), moveLeafs(), nextval_internal(), RelationGetBufferForTuple(), revmap_physical_extend(), saveNodeLink(), seq_redo(), shiftList(), spgAddNodeAction(), spgbuild(), SpGistUpdateMetaPage(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), vacuumLeafPage(), vacuumLeafRoot(), vacuumRedirectAndPlaceholder(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), writeListPage(), and XLogReadBufferForRedoExtended().

1470 {
1471  BufferDesc *bufHdr;
1472  uint32 buf_state;
1473  uint32 old_buf_state;
1474 
1475  if (!BufferIsValid(buffer))
1476  elog(ERROR, "bad buffer ID: %d", buffer);
1477 
1478  if (BufferIsLocal(buffer))
1479  {
1480  MarkLocalBufferDirty(buffer);
1481  return;
1482  }
1483 
1484  bufHdr = GetBufferDescriptor(buffer - 1);
1485 
1486  Assert(BufferIsPinned(buffer));
1487  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
1488  LW_EXCLUSIVE));
1489 
1490  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
1491  for (;;)
1492  {
1493  if (old_buf_state & BM_LOCKED)
1494  old_buf_state = WaitBufHdrUnlocked(bufHdr);
1495 
1496  buf_state = old_buf_state;
1497 
1498  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1499  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
1500 
1501  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
1502  buf_state))
1503  break;
1504  }
1505 
1506  /*
1507  * If the buffer was not dirty already, do vacuum accounting.
1508  */
1509  if (!(old_buf_state & BM_DIRTY))
1510  {
1511  VacuumPageDirty++;
1512  pgBufferUsage.shared_blks_dirtied++;
1513  if (VacuumCostActive)
1514  VacuumCostBalance += VacuumCostPageDirty;
1515  }
1516 }
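
The canonical modify-page sequence, as a hedged sketch (modify_page is hypothetical; WAL record construction is elided to comments): the buffer must be pinned and exclusively locked, and the modification plus MarkBufferDirty() happen inside a critical section before the WAL record's LSN is stamped on the page:

    #include "postgres.h"
    #include "miscadmin.h"
    #include "storage/bufmgr.h"

    static void
    modify_page(Buffer buf)
    {
        /* 'buf' is pinned by the caller; take the exclusive content lock */
        LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

        START_CRIT_SECTION();

        /* ... modify BufferGetPage(buf) here ... */
        MarkBufferDirty(buf);
        /* ... XLogInsert() the change, then PageSetLSN() the page ... */

        END_CRIT_SECTION();

        UnlockReleaseBuffer(buf);
    }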

◆ MarkBufferDirtyHint()

void MarkBufferDirtyHint ( Buffer  buffer,
bool  buffer_std 
)

Definition at line 3581 of file bufmgr.c.

References Assert, BM_DIRTY, BM_JUST_DIRTIED, BM_PERMANENT, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetContentLock, BufferGetPage, BufferIsLocal, BufferIsValid, PGPROC::delayChkpt, elog, ERROR, GetBufferDescriptor, GetPrivateRefCount(), InvalidXLogRecPtr, LockBufHdr(), LWLockHeldByMe(), MarkLocalBufferDirty(), MyProc, PageSetLSN, pg_atomic_read_u32(), pgBufferUsage, RecoveryInProgress(), RelFileNodeSkippingWAL(), buftag::rnode, BufferUsage::shared_blks_dirtied, BufferDesc::state, BufferDesc::tag, UnlockBufHdr, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, VacuumPageDirty, XLogHintBitIsNeeded, XLogRecPtrIsInvalid, and XLogSaveBufferForHint().

Referenced by _bt_check_unique(), _bt_killitems(), _hash_kill_items(), brin_start_evacuating_page(), btvacuumpage(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), gistkillitems(), heap_page_prune(), read_seq_tuple(), SetHintBits(), and XLogRecordPageWithFreeSpace().

3582 {
3583  BufferDesc *bufHdr;
3584  Page page = BufferGetPage(buffer);
3585 
3586  if (!BufferIsValid(buffer))
3587  elog(ERROR, "bad buffer ID: %d", buffer);
3588 
3589  if (BufferIsLocal(buffer))
3590  {
3591  MarkLocalBufferDirty(buffer);
3592  return;
3593  }
3594 
3595  bufHdr = GetBufferDescriptor(buffer - 1);
3596 
3597  Assert(GetPrivateRefCount(buffer) > 0);
3598  /* here, either share or exclusive lock is OK */
3599  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
3600 
3601  /*
3602  * This routine might get called many times on the same page, if we are
3603  * making the first scan after commit of an xact that added/deleted many
3604  * tuples. So, be as quick as we can if the buffer is already dirty. We
3605  * do this by not acquiring spinlock if it looks like the status bits are
3606  * already set. Since we make this test unlocked, there's a chance we
3607  * might fail to notice that the flags have just been cleared, and failed
3608  * to reset them, due to memory-ordering issues. But since this function
3609  * is only intended to be used in cases where failing to write out the
3610  * data would be harmless anyway, it doesn't really matter.
3611  */
3612  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
3613  (BM_DIRTY | BM_JUST_DIRTIED))
3614  {
3615  XLogRecPtr lsn = InvalidXLogRecPtr;
3616  bool dirtied = false;
3617  bool delayChkpt = false;
3618  uint32 buf_state;
3619 
3620  /*
3621  * If we need to protect hint bit updates from torn writes, WAL-log a
3622  * full page image of the page. This full page image is only necessary
3623  * if the hint bit update is the first change to the page since the
3624  * last checkpoint.
3625  *
3626  * We don't check full_page_writes here because that logic is included
3627  * when we call XLogInsert() since the value changes dynamically.
3628  */
3629  if (XLogHintBitIsNeeded() &&
3630  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
3631  {
3632  /*
3633  * If we must not write WAL, due to a relfilenode-specific
3634  * condition or being in recovery, don't dirty the page. We can
3635  * set the hint, just not dirty the page as a result so the hint
3636  * is lost when we evict the page or shutdown.
3637  *
3638  * See src/backend/storage/page/README for longer discussion.
3639  */
3640  if (RecoveryInProgress() ||
3641  RelFileNodeSkippingWAL(bufHdr->tag.rnode))
3642  return;
3643 
3644  /*
3645  * If the block is already dirty because we either made a change
3646  * or set a hint already, then we don't need to write a full page
3647  * image. Note that aggressive cleaning of blocks dirtied by hint
3648  * bit setting would increase the call rate. Bulk setting of hint
3649  * bits would reduce the call rate...
3650  *
3651  * We must issue the WAL record before we mark the buffer dirty.
3652  * Otherwise we might write the page before we write the WAL. That
3653  * causes a race condition, since a checkpoint might occur between
3654  * writing the WAL record and marking the buffer dirty. We solve
3655  * that with a kluge, but one that is already in use during
3656  * transaction commit to prevent race conditions. Basically, we
3657  * simply prevent the checkpoint WAL record from being written
3658  * until we have marked the buffer dirty. We don't start the
3659  * checkpoint flush until we have marked dirty, so our checkpoint
3660  * must flush the change to disk successfully or the checkpoint
3661  * never gets written, so crash recovery will fix.
3662  *
3663  * It's possible we may enter here without an xid, so it is
3664  * essential that CreateCheckpoint waits for virtual transactions
3665  * rather than full transactionids.
3666  */
3667  MyProc->delayChkpt = delayChkpt = true;
3668  lsn = XLogSaveBufferForHint(buffer, buffer_std);
3669  }
3670 
3671  buf_state = LockBufHdr(bufHdr);
3672 
3673  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3674 
3675  if (!(buf_state & BM_DIRTY))
3676  {
3677  dirtied = true; /* Means "will be dirtied by this action" */
3678 
3679  /*
3680  * Set the page LSN if we wrote a backup block. We aren't supposed
3681  * to set this when only holding a share lock but as long as we
3682  * serialise it somehow we're OK. We choose to set LSN while
3683  * holding the buffer header lock, which causes any reader of an
3684  * LSN who holds only a share lock to also obtain a buffer header
3685  * lock before using PageGetLSN(), which is enforced in
3686  * BufferGetLSNAtomic().
3687  *
3688  * If checksums are enabled, you might think we should reset the
3689  * checksum here. That will happen when the page is written
3690  * sometime later in this checkpoint cycle.
3691  */
3692  if (!XLogRecPtrIsInvalid(lsn))
3693  PageSetLSN(page, lsn);
3694  }
3695 
3696  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
3697  UnlockBufHdr(bufHdr, buf_state);
3698 
3699  if (delayChkpt)
3700  MyProc->delayChkpt = false;
3701 
3702  if (dirtied)
3703  {
3704  VacuumPageDirty++;
3705  pgBufferUsage.shared_blks_dirtied++;
3706  if (VacuumCostActive)
3707  VacuumCostBalance += VacuumCostPageDirty;
3708  }
3709  }
3710 }
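
A SetHintBits-style sketch (set_committed_hint is hypothetical and omits the visibility checks the real helper performs): hint bits are set under a mere share lock, so the page is dirtied through this relaxed path instead of MarkBufferDirty():

    #include "postgres.h"
    #include "access/htup_details.h"
    #include "storage/bufmgr.h"

    static void
    set_committed_hint(HeapTupleHeader tuple, Buffer buffer)
    {
        /* caller holds at least a share content lock on 'buffer' */
        tuple->t_infomask |= HEAP_XMIN_COMMITTED;

        /* buffer_std = true: the page has a standard page header */
        MarkBufferDirtyHint(buffer, true);
    }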

◆ NewPrivateRefCountEntry()

static PrivateRefCountEntry * NewPrivateRefCountEntry ( Buffer  buffer)
static

Definition at line 273 of file bufmgr.c.

References Assert, PrivateRefCountEntry::buffer, PrivateRefCountEntry::refcount, and ReservedRefCountEntry.

Referenced by PinBuffer(), and PinBuffer_Locked().

274 {
275  PrivateRefCountEntry *res;
276 
277  /* only allowed to be called when a reservation has been made */
278  Assert(ReservedRefCountEntry != NULL);
279 
280  /* use up the reserved entry */
281  res = ReservedRefCountEntry;
282  ReservedRefCountEntry = NULL;
283 
284  /* and fill it */
285  res->buffer = buffer;
286  res->refcount = 0;
287 
288  return res;
289 }
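
A comment-annotated sketch of the reserve-then-fill protocol (as it would appear inside bufmgr.c, since the entry type and helpers are file-local; pin_sketch is hypothetical):

    static void
    pin_sketch(Buffer b)
    {
        PrivateRefCountEntry *ref;

        /* guarantee a free slot first; this step may allocate and may error */
        ReservePrivateRefCountEntry();

        /* consume the reservation; by contract this step cannot fail */
        ref = NewPrivateRefCountEntry(b);

        ref->refcount++;        /* account for the caller's new pin */
    }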

◆ PinBuffer()

static bool PinBuffer ( BufferDesc *  buf,
BufferAccessStrategy  strategy 
)
static

Definition at line 1590 of file bufmgr.c.

References Assert, BM_LOCKED, BM_MAX_USAGE_COUNT, BM_VALID, BUF_REFCOUNT_ONE, BUF_STATE_GET_USAGECOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer, BufHdrGetBlock, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ReservePrivateRefCountEntry(), ResourceOwnerRememberBuffer(), BufferDesc::state, VALGRIND_MAKE_MEM_DEFINED, and WaitBufHdrUnlocked().

Referenced by BufferAlloc().

1591 {
1592  Buffer b = BufferDescriptorGetBuffer(buf);
1593  bool result;
1594  PrivateRefCountEntry *ref;
1595 
1596  ref = GetPrivateRefCountEntry(b, true);
1597 
1598  if (ref == NULL)
1599  {
1600  uint32 buf_state;
1601  uint32 old_buf_state;
1602 
1603  ReservePrivateRefCountEntry();
1604  ref = NewPrivateRefCountEntry(b);
1605 
1606  old_buf_state = pg_atomic_read_u32(&buf->state);
1607  for (;;)
1608  {
1609  if (old_buf_state & BM_LOCKED)
1610  old_buf_state = WaitBufHdrUnlocked(buf);
1611 
1612  buf_state = old_buf_state;
1613 
1614  /* increase refcount */
1615  buf_state += BUF_REFCOUNT_ONE;
1616 
1617  if (strategy == NULL)
1618  {
1619  /* Default case: increase usagecount unless already max. */
1620  if (BUF_STATE_GET_USAGECOUNT(buf_state) != BM_MAX_USAGE_COUNT)
1621  buf_state += BUF_USAGECOUNT_ONE;
1622  }
1623  else
1624  {
1625  /*
1626  * Ring buffers shouldn't evict others from pool. Thus we
1627  * don't make usagecount more than 1.
1628  */
1629  if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
1630  buf_state += BUF_USAGECOUNT_ONE;
1631  }
1632 
1633  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1634  buf_state))
1635  {
1636  result = (buf_state & BM_VALID) != 0;
1637 
1638  /*
1639  * Assume that we acquired a buffer pin for the purposes of
1640  * Valgrind buffer client checks (even in !result case) to
1641  * keep things simple. Buffers that are unsafe to access are
1642  * not generally guaranteed to be marked undefined or
1643  * non-accessible in any case.
1644  */
1645  VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
1646  break;
1647  }
1648  }
1649  }
1650  else
1651  {
1652  /*
1653  * If we previously pinned the buffer, it must surely be valid.
1654  *
1655  * Note: We deliberately avoid a Valgrind client request here.
1656  * Individual access methods can optionally superimpose buffer page
1657  * client requests on top of our client requests to enforce that
1658  * buffers are only accessed while locked (and pinned). It's possible
1659  * that the buffer page is legitimately non-accessible here. We
1660  * cannot meddle with that.
1661  */
1662  result = true;
1663  }
1664 
1665  ref->refcount++;
1666  Assert(ref->refcount > 0);
1667  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1668  return result;
1669 }
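
The usage-count rule the two branches implement, restated as a standalone model (illustrative only, not server code; BM_MAX_USAGE_COUNT is 5 in stock builds):

    #include <stdbool.h>

    static int
    next_usage_count(int current, bool have_strategy, int max)
    {
        if (!have_strategy)
            return current < max ? current + 1 : current;   /* saturate at max */
        return current == 0 ? 1 : current;  /* ring strategy: never above 1 */
    }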

◆ PinBuffer_Locked()

static void PinBuffer_Locked ( BufferDesc *  buf)
static

Definition at line 1693 of file bufmgr.c.

References Assert, BM_LOCKED, BUF_REFCOUNT_ONE, BufferDescriptorGetBuffer, BufHdrGetBlock, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ResourceOwnerRememberBuffer(), BufferDesc::state, UnlockBufHdr, and VALGRIND_MAKE_MEM_DEFINED.

Referenced by BufferAlloc(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), and SyncOneBuffer().

1694 {
1695  Buffer b;
1696  PrivateRefCountEntry *ref;
1697  uint32 buf_state;
1698 
1699  /*
1700  * As explained, We don't expect any preexisting pins. That allows us to
1701  * manipulate the PrivateRefCount after releasing the spinlock
1702  */
1703  Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
1704 
1705  /*
1706  * Buffer can't have a preexisting pin, so mark its page as defined to
1707  * Valgrind (this is similar to the PinBuffer() case where the backend
1708  * doesn't already have a buffer pin)
1709  */
1710  VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
1711 
1712  /*
1713  * Since we hold the buffer spinlock, we can update the buffer state and
1714  * release the lock in one operation.
1715  */
1716  buf_state = pg_atomic_read_u32(&buf->state);
1717  Assert(buf_state & BM_LOCKED);
1718  buf_state += BUF_REFCOUNT_ONE;
1719  UnlockBufHdr(buf, buf_state);
1720 
1721  b = BufferDescriptorGetBuffer(buf);
1722 
1723  ref = NewPrivateRefCountEntry(b);
1724  ref->refcount++;
1725 
1726  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1727 }
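
A call-site sketch (flush_if_dirty is hypothetical; the shape mirrors the FlushRelationBuffers-style callers, and these helpers are file-local to bufmgr.c): the caller takes the header spinlock, decides the buffer is interesting, and hands the still-locked header over; PinBuffer_Locked() releases the spinlock itself:

    static void
    flush_if_dirty(BufferDesc *bufHdr)
    {
        uint32      buf_state;

        ReservePrivateRefCountEntry();      /* required before any pin */

        buf_state = LockBufHdr(bufHdr);     /* header spinlock now held */
        if (buf_state & BM_DIRTY)
        {
            PinBuffer_Locked(bufHdr);       /* pins and drops the spinlock */
            /* ... FlushBuffer() under the content lock, then UnpinBuffer() ... */
        }
        else
            UnlockBufHdr(bufHdr, buf_state);
    }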

◆ PrefetchBuffer()

PrefetchBufferResult PrefetchBuffer ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 575 of file bufmgr.c.

References Assert, BlockNumberIsValid, ereport, errcode(), errmsg(), ERROR, PrefetchLocalBuffer(), PrefetchSharedBuffer(), RelationData::rd_smgr, RELATION_IS_OTHER_TEMP, RelationIsValid, RelationOpenSmgr, and RelationUsesLocalBuffers.

Referenced by BitmapPrefetch(), count_nondeletable_pages(), HeapTupleHeaderAdvanceLatestRemovedXid(), and pg_prewarm().

576 {
577  Assert(RelationIsValid(reln));
578  Assert(BlockNumberIsValid(blockNum));
579 
580  /* Open it at the smgr level if not already done */
581  RelationOpenSmgr(reln);
582 
583  if (RelationUsesLocalBuffers(reln))
584  {
585  /* see comments in ReadBufferExtended */
586  if (RELATION_IS_OTHER_TEMP(reln))
587  ereport(ERROR,
588  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
589  errmsg("cannot access temporary tables of other sessions")));
590 
591  /* pass it off to localbuf.c */
592  return PrefetchLocalBuffer(reln->rd_smgr, forkNum, blockNum);
593  }
594  else
595  {
596  /* pass it to the shared buffer version */
597  return PrefetchSharedBuffer(reln->rd_smgr, forkNum, blockNum);
598  }
599 }
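
A pg_prewarm-style usage sketch (prewarm_range and its bounds are illustrative): issue prefetches for a range of blocks, then read them once the asynchronous I/O has had time to complete:

    #include "postgres.h"
    #include "storage/bufmgr.h"
    #include "utils/rel.h"

    static void
    prewarm_range(Relation rel, BlockNumber first, BlockNumber last)
    {
        BlockNumber blkno;

        for (blkno = first; blkno <= last; blkno++)
            (void) PrefetchBuffer(rel, MAIN_FORKNUM, blkno);

        /* subsequent reads should now be satisfied from cache */
        for (blkno = first; blkno <= last; blkno++)
            ReleaseBuffer(ReadBuffer(rel, blkno));
    }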

◆ PrefetchSharedBuffer()

PrefetchBufferResult PrefetchSharedBuffer ( SMgrRelation  smgr_reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 488 of file bufmgr.c.

References Assert, BlockNumberIsValid, BufMappingPartitionLock, BufTableHashCode(), BufTableLookup(), INIT_BUFFERTAG, PrefetchBufferResult::initiated_io, InvalidBuffer, LW_SHARED, LWLockAcquire(), LWLockRelease(), RelFileNodeBackend::node, PrefetchBufferResult::recent_buffer, SMgrRelationData::smgr_rnode, and smgrprefetch().

Referenced by PrefetchBuffer().

491 {
492  PrefetchBufferResult result = {InvalidBuffer, false};
493  BufferTag newTag; /* identity of requested block */
494  uint32 newHash; /* hash value for newTag */
495  LWLock *newPartitionLock; /* buffer partition lock for it */
496  int buf_id;
497 
498  Assert(BlockNumberIsValid(blockNum));
499 
500  /* create a tag so we can lookup the buffer */
501  INIT_BUFFERTAG(newTag, smgr_reln->smgr_rnode.node,
502  forkNum, blockNum);
503 
504  /* determine its hash code and partition lock ID */
505  newHash = BufTableHashCode(&newTag);
506  newPartitionLock = BufMappingPartitionLock(newHash);
507 
508  /* see if the block is in the buffer pool already */
509  LWLockAcquire(newPartitionLock, LW_SHARED);
510  buf_id = BufTableLookup(&newTag, newHash);
511  LWLockRelease(newPartitionLock);
512 
513  /* If not in buffers, initiate prefetch */
514  if (buf_id < 0)
515  {
516 #ifdef USE_PREFETCH
517  /*
518  * Try to initiate an asynchronous read. This returns false in
519  * recovery if the relation file doesn't exist.
520  */
521  if (smgrprefetch(smgr_reln, forkNum, blockNum))
522  result.initiated_io = true;
523 #endif /* USE_PREFETCH */
524  }
525  else
526  {
527  /*
528  * Report the buffer it was in at that time. The caller may be able
529  * to avoid a buffer table lookup, but it's not pinned and it must be
530  * rechecked!
531  */
532  result.recent_buffer = buf_id + 1;
533  }
534 
535  /*
536  * If the block *is* in buffers, we do nothing. This is not really ideal:
537  * the block might be just about to be evicted, which would be stupid
538  * since we know we are going to need it soon. But the only easy answer
539  * is to bump the usage_count, which does not seem like a great solution:
540  * when the caller does ultimately touch the block, usage_count would get
541  * bumped again, resulting in too much favoritism for blocks that are
542  * involved in a prefetch sequence. A real fix would involve some
543  * additional per-buffer state, and it's not clear that there's enough of
544  * a problem to justify that.
545  */
546 
547  return result;
548 }
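
A caller-side sketch of interpreting the result (hinted_prefetch is hypothetical): recent_buffer is only a hint, because the buffer is not pinned and may be evicted at any time:

    #include "postgres.h"
    #include "storage/bufmgr.h"
    #include "storage/smgr.h"

    static void
    hinted_prefetch(SMgrRelation smgr, ForkNumber forknum, BlockNumber blkno)
    {
        PrefetchBufferResult r = PrefetchSharedBuffer(smgr, forknum, blkno);

        if (r.initiated_io)
        {
            /* an asynchronous read was started (USE_PREFETCH builds only) */
        }
        else if (BufferIsValid(r.recent_buffer))
        {
            /* the block was cached at lookup time; the hint is unpinned
             * and must be rechecked before it can be trusted */
        }
    }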

◆ PrintBufferLeakWarning()

void PrintBufferLeakWarning ( Buffer  buffer)

Definition at line 2594 of file bufmgr.c.

References Assert, buftag::blockNum, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BufferIsLocal, BufferIsValid, elog, buftag::forkNum, GetBufferDescriptor, GetLocalBufferDescriptor, GetPrivateRefCount(), InvalidBackendId, LocalRefCount, MyBackendId, pfree(), pg_atomic_read_u32(), relpathbackend, buftag::rnode, BufferDesc::state, BufferDesc::tag, and WARNING.

Referenced by CheckForBufferLeaks(), CheckForLocalBufferLeaks(), and ResourceOwnerReleaseInternal().

2595 {
2596  BufferDesc *buf;
2597  int32 loccount;
2598  char *path;
2599  BackendId backend;
2600  uint32 buf_state;
2601 
2602  Assert(BufferIsValid(buffer));
2603  if (BufferIsLocal(buffer))
2604  {
2605  buf = GetLocalBufferDescriptor(-buffer - 1);
2606  loccount = LocalRefCount[-buffer - 1];
2607  backend = MyBackendId;
2608  }
2609  else
2610  {
2611  buf = GetBufferDescriptor(buffer - 1);
2612  loccount = GetPrivateRefCount(buffer);
2613  backend = InvalidBackendId;
2614  }
2615 
2616  /* theoretically we should lock the bufhdr here */
2617  path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
2618  buf_state = pg_atomic_read_u32(&buf->state);
2619  elog(WARNING,
2620  "buffer refcount leak: [%03d] "
2621  "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
2622  buffer, path,
2623  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
2624  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
2625  pfree(path);
2626 }
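
A cleanup-path sketch (report_and_release is hypothetical; it mirrors the resource-owner release path named above): a still-registered pin is reported and then dropped:

    #include "postgres.h"
    #include "storage/bufmgr.h"

    static void
    report_and_release(Buffer buffer)
    {
        PrintBufferLeakWarning(buffer);     /* logs rel, block, flags, counts */
        ReleaseBuffer(buffer);              /* drop the leaked pin */
    }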

◆ ReadBuffer()

Buffer ReadBuffer ( Relation  reln,
BlockNumber  blockNum 
)

Definition at line 607 of file bufmgr.c.

References MAIN_FORKNUM, RBM_NORMAL, and ReadBufferExtended().

Referenced by _bt_getbuf(), _bt_search_insert(), _hash_getbuf(), _hash_getbuf_with_condlock_cleanup(), blbulkdelete(), blinsert(), BloomNewBuffer(), brin_getinsertbuffer(), brinbuild(), brinGetStats(), brinGetTupleForHeapBlock(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), bt_metap(), bt_page_items(), bt_page_stats(), fill_seq_with_data(), ginFindLeafPage(), ginFindParents(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), GinNewBuffer(), ginStepRight(), ginUpdateStats(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfixsplit(), gistGetMaxLevel(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), heap_abort_speculative(), heap_compute_xid_horizon_for_tuples(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_force_common(), heap_get_latest_tid(), heap_inplace_update(), heap_lock_tuple(), heap_update(), initBloomState(), pg_visibility(), pgstatginindex_internal(), read_seq_tuple(), RelationGetBufferForTuple(), ReleaseAndReadBuffer(), revmap_get_buffer(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), shiftList(), spgdoinsert(), spgGetCache(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), and spgWalk().

608 {
609  return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
610 }
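
A short worked example (page_free_space is hypothetical): pin one page of the main fork, share-lock it long enough to read it, and release:

    #include "postgres.h"
    #include "storage/bufmgr.h"
    #include "utils/rel.h"

    static Size
    page_free_space(Relation rel, BlockNumber blkno)
    {
        Buffer      buf = ReadBuffer(rel, blkno);
        Size        freespace;

        LockBuffer(buf, BUFFER_LOCK_SHARE);
        freespace = PageGetFreeSpace(BufferGetPage(buf));
        UnlockReleaseBuffer(buf);

        return freespace;
    }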

◆ ReadBuffer_common()

static Buffer ReadBuffer_common ( SMgrRelation  reln,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy,
bool *  hit 
)
static

Definition at line 716 of file bufmgr.c.

References Assert, RelFileNodeBackend::backend, BufferUsage::blk_read_time, BM_VALID, BufferAlloc(), BufferDescriptorGetBuffer, BufferDescriptorGetContentLock, BufHdrGetBlock, CurrentResourceOwner, RelFileNode::dbNode, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errhint(), errmsg(), ERROR, INSTR_TIME_ADD, INSTR_TIME_GET_MICROSEC, INSTR_TIME_SET_CURRENT, INSTR_TIME_SUBTRACT, BufferUsage::local_blks_hit, BufferUsage::local_blks_read, BufferUsage::local_blks_written, LocalBufferAlloc(), LocalBufHdrGetBlock, LockBufferForCleanup(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), MemSet, RelFileNodeBackend::node, P_NEW, PageIsNew, PageIsVerified(), pg_atomic_read_u32(), pg_atomic_unlocked_write_u32(), pgBufferUsage, pgstat_count_buffer_read_time, RBM_NORMAL, RBM_NORMAL_NO_LOG, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, RBM_ZERO_ON_ERROR, RelFileNode::relNode, relpath, ResourceOwnerEnlargeBuffers(), BufferUsage::shared_blks_hit, BufferUsage::shared_blks_read, BufferUsage::shared_blks_written, SMgrRelationData::smgr_rnode, smgrextend(), SmgrIsTemp, smgrnblocks(), smgrread(), RelFileNode::spcNode, StartBufferIO(), BufferDesc::state, TerminateBufferIO(), track_io_timing, UnlockBufHdr, VacuumCostActive, VacuumCostBalance, VacuumCostPageHit, VacuumCostPageMiss, VacuumPageHit, VacuumPageMiss, WARNING, and zero_damaged_pages.

Referenced by ReadBufferExtended(), and ReadBufferWithoutRelcache().

719 {
720  BufferDesc *bufHdr;
721  Block bufBlock;
722  bool found;
723  bool isExtend;
724  bool isLocalBuf = SmgrIsTemp(smgr);
725 
726  *hit = false;
727 
728  /* Make sure we will have room to remember the buffer pin */
729  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
730 
731  isExtend = (blockNum == P_NEW);
732 
733  TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
734  smgr->smgr_rnode.node.spcNode,
735  smgr->smgr_rnode.node.dbNode,
736  smgr->smgr_rnode.node.relNode,
737  smgr->smgr_rnode.backend,
738  isExtend);
739 
740  /* Substitute proper block number if caller asked for P_NEW */
741  if (isExtend)
742  blockNum = smgrnblocks(smgr, forkNum);
743 
744  if (isLocalBuf)
745  {
746  bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
747  if (found)
748  pgBufferUsage.local_blks_hit++;
749  else if (isExtend)
750  pgBufferUsage.local_blks_written++;
751  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
752  mode == RBM_ZERO_ON_ERROR)
753  pgBufferUsage.local_blks_read++;
754  }
755  else
756  {
757  /*
758  * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
759  * not currently in memory.
760  */
761  bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
762  strategy, &found);
763  if (found)
764  pgBufferUsage.shared_blks_hit++;
765  else if (isExtend)
766  pgBufferUsage.shared_blks_written++;
767  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
768  mode == RBM_ZERO_ON_ERROR)
769  pgBufferUsage.shared_blks_read++;
770  }
771 
772  /* At this point we do NOT hold any locks. */
773 
774  /* if it was already in the buffer pool, we're done */
775  if (found)
776  {
777  if (!isExtend)
778  {
779  /* Just need to update stats before we exit */
780  *hit = true;
781  VacuumPageHit++;
782 
783  if (VacuumCostActive)
784  VacuumCostBalance += VacuumCostPageHit;
785 
786  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
787  smgr->smgr_rnode.node.spcNode,
788  smgr->smgr_rnode.node.dbNode,
789  smgr->smgr_rnode.node.relNode,
790  smgr->smgr_rnode.backend,
791  isExtend,
792  found);
793 
794  /*
795  * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
796  * locked on return.
797  */
798  if (!isLocalBuf)
799  {
800  if (mode == RBM_ZERO_AND_LOCK)
801  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
802  LW_EXCLUSIVE);
803  else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
804  LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
805  }
806 
807  return BufferDescriptorGetBuffer(bufHdr);
808  }
809 
810  /*
811  * We get here only in the corner case where we are trying to extend
812  * the relation but we found a pre-existing buffer marked BM_VALID.
813  * This can happen because mdread doesn't complain about reads beyond
814  * EOF (when zero_damaged_pages is ON) and so a previous attempt to
815  * read a block beyond EOF could have left a "valid" zero-filled
816  * buffer. Unfortunately, we have also seen this case occurring
817  * because of buggy Linux kernels that sometimes return an
818  * lseek(SEEK_END) result that doesn't account for a recent write. In
819  * that situation, the pre-existing buffer would contain valid data
820  * that we don't want to overwrite. Since the legitimate case should
821  * always have left a zero-filled buffer, complain if not PageIsNew.
822  */
823  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
824  if (!PageIsNew((Page) bufBlock))
825  ereport(ERROR,
826  (errmsg("unexpected data beyond EOF in block %u of relation %s",
827  blockNum, relpath(smgr->smgr_rnode, forkNum)),
828  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
829 
830  /*
831  * We *must* do smgrextend before succeeding, else the page will not
832  * be reserved by the kernel, and the next P_NEW call will decide to
833  * return the same page. Clear the BM_VALID bit, do the StartBufferIO
834  * call that BufferAlloc didn't, and proceed.
835  */
836  if (isLocalBuf)
837  {
838  /* Only need to adjust flags */
839  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
840 
841  Assert(buf_state & BM_VALID);
842  buf_state &= ~BM_VALID;
843  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
844  }
845  else
846  {
847  /*
848  * Loop to handle the very small possibility that someone re-sets
849  * BM_VALID between our clearing it and StartBufferIO inspecting
850  * it.
851  */
852  do
853  {
854  uint32 buf_state = LockBufHdr(bufHdr);
855 
856  Assert(buf_state & BM_VALID);
857  buf_state &= ~BM_VALID;
858  UnlockBufHdr(bufHdr, buf_state);
859  } while (!StartBufferIO(bufHdr, true));
860  }
861  }
862 
863  /*
864  * if we have gotten to this point, we have allocated a buffer for the
865  * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
866  * if it's a shared buffer.
867  *
868  * Note: if smgrextend fails, we will end up with a buffer that is
869  * allocated but not marked BM_VALID. P_NEW will still select the same
870  * block number (because the relation didn't get any longer on disk) and
871  * so future attempts to extend the relation will find the same buffer (if
872  * it's not been recycled) but come right back here to try smgrextend
873  * again.
874  */
875  Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
876 
877  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
878 
879  if (isExtend)
880  {
881  /* new buffers are zero-filled */
882  MemSet((char *) bufBlock, 0, BLCKSZ);
883  /* don't set checksum for all-zero page */
884  smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
885 
886  /*
887  * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
888  * although we're essentially performing a write. At least on linux
889  * doing so defeats the 'delayed allocation' mechanism, leading to
890  * increased file fragmentation.
891  */
892  }
893  else
894  {
895  /*
896  * Read in the page, unless the caller intends to overwrite it and
897  * just wants us to allocate a buffer.
898  */
900  MemSet((char *) bufBlock, 0, BLCKSZ);
901  else
902  {
903  instr_time io_start,
904  io_time;
905 
906  if (track_io_timing)
907  INSTR_TIME_SET_CURRENT(io_start);
908 
909  smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
910 
911  if (track_io_timing)
912  {
913  INSTR_TIME_SET_CURRENT(io_time);
914  INSTR_TIME_SUBTRACT(io_time, io_start);
915  pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
916  INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
917  }
918 
919  /* check for garbage data */
920  if (!PageIsVerified((Page) bufBlock, blockNum))
921  {
922  if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
923  {
924  ereport(WARNING,
925  (errcode(ERRCODE_DATA_CORRUPTED),
926  errmsg("invalid page in block %u of relation %s; zeroing out page",
927  blockNum,
928  relpath(smgr->smgr_rnode, forkNum))));
929  MemSet((char *) bufBlock, 0, BLCKSZ);
930  }
931  else
932  ereport(ERROR,
933  (errcode(ERRCODE_DATA_CORRUPTED),
934  errmsg("invalid page in block %u of relation %s",
935  blockNum,
936  relpath(smgr->smgr_rnode, forkNum))));
937  }
938  }
939  }
940 
941  /*
942  * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
943  * the page as valid, to make sure that no other backend sees the zeroed
944  * page before the caller has had a chance to initialize it.
945  *
946  * Since no-one else can be looking at the page contents yet, there is no
947  * difference between an exclusive lock and a cleanup-strength lock. (Note
948  * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
949  * they assert that the buffer is already valid.)
950  */
951  if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
952  !isLocalBuf)
953  {
954  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
955  }
956 
957  if (isLocalBuf)
958  {
959  /* Only need to adjust flags */
960  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
961 
962  buf_state |= BM_VALID;
963  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
964  }
965  else
966  {
967  /* Set BM_VALID, terminate IO, and wake up any waiters */
968  TerminateBufferIO(bufHdr, false, BM_VALID);
969  }
970 
971  VacuumPageMiss++;
972  if (VacuumCostActive)
973  VacuumCostBalance += VacuumCostPageMiss;
974 
975  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
976  smgr->smgr_rnode.node.spcNode,
977  smgr->smgr_rnode.node.dbNode,
978  smgr->smgr_rnode.node.relNode,
979  smgr->smgr_rnode.backend,
980  isExtend,
981  found);
982 
983  return BufferDescriptorGetBuffer(bufHdr);
984 }
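
The extension path above (isExtend) is reached through P_NEW; a hedged caller-level sketch via ReadBufferExtended() (append_new_page is hypothetical):

    #include "postgres.h"
    #include "storage/bufmgr.h"
    #include "utils/rel.h"

    static Buffer
    append_new_page(Relation rel)
    {
        /* P_NEW extends the relation with smgrextend(); the new,
         * zero-filled block comes back pinned */
        Buffer      buf = ReadBufferExtended(rel, MAIN_FORKNUM, P_NEW,
                                             RBM_NORMAL, NULL);

        LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
        PageInit(BufferGetPage(buf), BufferGetPageSize(buf), 0);
        MarkBufferDirty(buf);
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);

        return buf;             /* still pinned; caller must release */
    }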

◆ ReadBufferExtended()