PostgreSQL Source Code  git master
bufmgr.c File Reference
#include "postgres.h"
#include <sys/file.h>
#include <unistd.h>
#include "access/tableam.h"
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "catalog/storage.h"
#include "executor/instrument.h"
#include "lib/binaryheap.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "storage/standby.h"
#include "utils/ps_status.h"
#include "utils/rel.h"
#include "utils/resowner_private.h"
#include "utils/timestamp.h"

Data Structures

struct  PrivateRefCountEntry
 
struct  CkptTsStatus
 
struct  SMgrSortArray
 

Macros

#define BufHdrGetBlock(bufHdr)   ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
 
#define BufferGetLSN(bufHdr)   (PageGetLSN(BufHdrGetBlock(bufHdr)))
 
#define LocalBufHdrGetBlock(bufHdr)   LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
 
#define BUF_WRITTEN   0x01
 
#define BUF_REUSABLE   0x02
 
#define RELS_BSEARCH_THRESHOLD   20
 
#define REFCOUNT_ARRAY_ENTRIES   8
 
#define BufferIsPinned(bufnum)
 

Typedefs

typedef struct PrivateRefCountEntry PrivateRefCountEntry
 
typedef struct CkptTsStatus CkptTsStatus
 
typedef struct SMgrSortArray SMgrSortArray
 

Functions

static void ReservePrivateRefCountEntry (void)
 
static PrivateRefCountEntry * NewPrivateRefCountEntry (Buffer buffer)
 
static PrivateRefCountEntry * GetPrivateRefCountEntry (Buffer buffer, bool do_move)
 
static int32 GetPrivateRefCount (Buffer buffer)
 
static void ForgetPrivateRefCountEntry (PrivateRefCountEntry *ref)
 
static Buffer ReadBuffer_common (SMgrRelation reln, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool *hit)
 
static bool PinBuffer (BufferDesc *buf, BufferAccessStrategy strategy)
 
static void PinBuffer_Locked (BufferDesc *buf)
 
static void UnpinBuffer (BufferDesc *buf, bool fixOwner)
 
static void BufferSync (int flags)
 
static uint32 WaitBufHdrUnlocked (BufferDesc *buf)
 
static int SyncOneBuffer (int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 
static void WaitIO (BufferDesc *buf)
 
static bool StartBufferIO (BufferDesc *buf, bool forInput)
 
static void TerminateBufferIO (BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
 
static void shared_buffer_write_error_callback (void *arg)
 
static void local_buffer_write_error_callback (void *arg)
 
static BufferDesc * BufferAlloc (SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
 
static void FlushBuffer (BufferDesc *buf, SMgrRelation reln)
 
static void AtProcExit_Buffers (int code, Datum arg)
 
static void CheckForBufferLeaks (void)
 
static int rnode_comparator (const void *p1, const void *p2)
 
static int buffertag_comparator (const void *p1, const void *p2)
 
static int ckpt_buforder_comparator (const void *pa, const void *pb)
 
static int ts_ckpt_progress_comparator (Datum a, Datum b, void *arg)
 
PrefetchBufferResult PrefetchSharedBuffer (SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
 
PrefetchBufferResult PrefetchBuffer (Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 
Buffer ReadBuffer (Relation reln, BlockNumber blockNum)
 
Buffer ReadBufferExtended (Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
Buffer ReadBufferWithoutRelcache (RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
static void InvalidateBuffer (BufferDesc *buf)
 
void MarkBufferDirty (Buffer buffer)
 
Buffer ReleaseAndReadBuffer (Buffer buffer, Relation relation, BlockNumber blockNum)
 
bool BgBufferSync (WritebackContext *wb_context)
 
void AtEOXact_Buffers (bool isCommit)
 
void InitBufferPoolAccess (void)
 
void InitBufferPoolBackend (void)
 
void PrintBufferLeakWarning (Buffer buffer)
 
void CheckPointBuffers (int flags)
 
void BufmgrCommit (void)
 
BlockNumber BufferGetBlockNumber (Buffer buffer)
 
void BufferGetTag (Buffer buffer, RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blknum)
 
BlockNumber RelationGetNumberOfBlocksInFork (Relation relation, ForkNumber forkNum)
 
bool BufferIsPermanent (Buffer buffer)
 
XLogRecPtr BufferGetLSNAtomic (Buffer buffer)
 
void DropRelFileNodeBuffers (RelFileNodeBackend rnode, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
 
void DropRelFileNodesAllBuffers (RelFileNodeBackend *rnodes, int nnodes)
 
void DropDatabaseBuffers (Oid dbid)
 
void FlushRelationBuffers (Relation rel)
 
void FlushRelationsAllBuffers (SMgrRelation *smgrs, int nrels)
 
void FlushDatabaseBuffers (Oid dbid)
 
void FlushOneBuffer (Buffer buffer)
 
void ReleaseBuffer (Buffer buffer)
 
void UnlockReleaseBuffer (Buffer buffer)
 
void IncrBufferRefCount (Buffer buffer)
 
void MarkBufferDirtyHint (Buffer buffer, bool buffer_std)
 
void UnlockBuffers (void)
 
void LockBuffer (Buffer buffer, int mode)
 
bool ConditionalLockBuffer (Buffer buffer)
 
void LockBufferForCleanup (Buffer buffer)
 
bool HoldingBufferPinThatDelaysRecovery (void)
 
bool ConditionalLockBufferForCleanup (Buffer buffer)
 
bool IsBufferCleanupOK (Buffer buffer)
 
void AbortBufferIO (void)
 
uint32 LockBufHdr (BufferDesc *desc)
 
void WritebackContextInit (WritebackContext *context, int *max_pending)
 
void ScheduleBufferTagForWriteback (WritebackContext *context, BufferTag *tag)
 
void IssuePendingWritebacks (WritebackContext *context)
 
void TestForOldSnapshot_impl (Snapshot snapshot, Relation relation)
 

Variables

bool zero_damaged_pages = false
 
int bgwriter_lru_maxpages = 100
 
double bgwriter_lru_multiplier = 2.0
 
bool track_io_timing = false
 
int effective_io_concurrency = 0
 
int maintenance_io_concurrency = 0
 
int checkpoint_flush_after = 0
 
int bgwriter_flush_after = 0
 
int backend_flush_after = 0
 
static BufferDesc * InProgressBuf = NULL
 
static bool IsForInput
 
static BufferDesc * PinCountWaitBuf = NULL
 
static struct PrivateRefCountEntry PrivateRefCountArray [REFCOUNT_ARRAY_ENTRIES]
 
static HTAB * PrivateRefCountHash = NULL
 
static int32 PrivateRefCountOverflowed = 0
 
static uint32 PrivateRefCountClock = 0
 
static PrivateRefCountEntry * ReservedRefCountEntry = NULL
 

Macro Definition Documentation

◆ BUF_REUSABLE

#define BUF_REUSABLE   0x02

Definition at line 68 of file bufmgr.c.

Referenced by BgBufferSync(), and SyncOneBuffer().

◆ BUF_WRITTEN

#define BUF_WRITTEN   0x01

Definition at line 67 of file bufmgr.c.

Referenced by BgBufferSync(), BufferSync(), and SyncOneBuffer().

◆ BufferGetLSN

#define BufferGetLSN (   bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))

Definition at line 60 of file bufmgr.c.

Referenced by BufferAlloc(), and FlushBuffer().

◆ BufferIsPinned

#define BufferIsPinned (   bufnum)
Value:
( \
!BufferIsValid(bufnum) ? \
false \
: \
BufferIsLocal(bufnum) ? \
(LocalRefCount[-(bufnum) - 1] > 0) \
: \
(GetPrivateRefCount(bufnum) > 0) \
)
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:378
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123
int32 * LocalRefCount
Definition: localbuf.c:45

Definition at line 439 of file bufmgr.c.

Referenced by BufferGetBlockNumber(), BufferGetLSNAtomic(), BufferGetTag(), BufferIsPermanent(), FlushOneBuffer(), IncrBufferRefCount(), MarkBufferDirty(), and ReleaseAndReadBuffer().
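
In words: an invalid Buffer is never pinned; for a local (temp-relation) buffer the backend's LocalRefCount[] array is consulted; for a shared buffer the backend-private refcount machinery is. A minimal usage sketch, assuming backend code that already holds a pin (the helper name is hypothetical):

/* Hypothetical helper: the caller must already hold a pin on "buffer". */
static BlockNumber
get_block_checked(Buffer buffer)
{
    Assert(BufferIsPinned(buffer)); /* a pin makes the tag safe to read */
    return BufferGetBlockNumber(buffer);
}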

◆ BufHdrGetBlock

#define BufHdrGetBlock (   bufHdr)    ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))

Definition at line 59 of file bufmgr.c.

Referenced by FlushBuffer(), and ReadBuffer_common().

◆ LocalBufHdrGetBlock

#define LocalBufHdrGetBlock (   bufHdr)    LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]

Definition at line 63 of file bufmgr.c.

Referenced by FlushRelationBuffers(), and ReadBuffer_common().

◆ REFCOUNT_ARRAY_ENTRIES

#define REFCOUNT_ARRAY_ENTRIES   8

◆ RELS_BSEARCH_THRESHOLD

#define RELS_BSEARCH_THRESHOLD   20

Definition at line 70 of file bufmgr.c.

Referenced by DropRelFileNodesAllBuffers(), and FlushRelationsAllBuffers().

Typedef Documentation

◆ CkptTsStatus

typedef struct CkptTsStatus CkptTsStatus

◆ PrivateRefCountEntry

typedef struct PrivateRefCountEntry PrivateRefCountEntry

◆ SMgrSortArray

typedef struct SMgrSortArray SMgrSortArray

Function Documentation

◆ AbortBufferIO()

void AbortBufferIO ( void  )

Definition at line 4172 of file bufmgr.c.

References Assert, buftag::blockNum, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_VALID, buf, BufferDescriptorGetIOLock, ereport, errcode(), errdetail(), errmsg(), buftag::forkNum, InProgressBuf, IsForInput, LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), pfree(), relpathperm, buftag::rnode, BufferDesc::tag, TerminateBufferIO(), UnlockBufHdr, and WARNING.

Referenced by AbortSubTransaction(), AbortTransaction(), AtProcExit_Buffers(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), and WalWriterMain().

4173 {
4174  BufferDesc *buf = InProgressBuf;
4175 
4176  if (buf)
4177  {
4178  uint32 buf_state;
4179 
4180  /*
4181  * Since LWLockReleaseAll has already been called, we're not holding
4182  * the buffer's io_in_progress_lock. We have to re-acquire it so that
4183  * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
4184  * buffer will be in a busy spin until we succeed in doing this.
4185  */
4186  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
4187 
4188  buf_state = LockBufHdr(buf);
4189  Assert(buf_state & BM_IO_IN_PROGRESS);
4190  if (IsForInput)
4191  {
4192  Assert(!(buf_state & BM_DIRTY));
4193 
4194  /* We'd better not think buffer is valid yet */
4195  Assert(!(buf_state & BM_VALID));
4196  UnlockBufHdr(buf, buf_state);
4197  }
4198  else
4199  {
4200  Assert(buf_state & BM_DIRTY);
4201  UnlockBufHdr(buf, buf_state);
4202  /* Issue notice if this is not the first failure... */
4203  if (buf_state & BM_IO_ERROR)
4204  {
4205  /* Buffer is pinned, so we can read tag without spinlock */
4206  char *path;
4207 
4208  path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
4209  ereport(WARNING,
4210  (errcode(ERRCODE_IO_ERROR),
4211  errmsg("could not write block %u of %s",
4212  buf->tag.blockNum, path),
4213  errdetail("Multiple failures --- write error might be permanent.")));
4214  pfree(path);
4215  }
4216  }
4217  TerminateBufferIO(buf, false, BM_IO_ERROR);
4218  }
4219 }
#define relpathperm(rnode, forknum)
Definition: relpath.h:83
ForkNumber forkNum
Definition: buf_internals.h:93
int errcode(int sqlerrcode)
Definition: elog.c:610
#define BM_DIRTY
Definition: buf_internals.h:58
#define BufferDescriptorGetIOLock(bdesc)
static BufferDesc * InProgressBuf
Definition: bufmgr.c:152
void pfree(void *pointer)
Definition: mcxt.c:1056
static char * buf
Definition: pg_test_fsync.c:67
int errdetail(const char *fmt,...)
Definition: elog.c:957
unsigned int uint32
Definition: c.h:367
static bool IsForInput
Definition: bufmgr.c:153
#define WARNING
Definition: elog.h:40
static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
Definition: bufmgr.c:4140
#define BM_VALID
Definition: buf_internals.h:59
#define ereport(elevel,...)
Definition: elog.h:144
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4290
#define Assert(condition)
Definition: c.h:738
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1208
BlockNumber blockNum
Definition: buf_internals.h:94
RelFileNode rnode
Definition: buf_internals.h:92
#define BM_IO_ERROR
Definition: buf_internals.h:62
BufferTag tag
int errmsg(const char *fmt,...)
Definition: elog.c:824
#define UnlockBufHdr(desc, s)
#define BM_IO_IN_PROGRESS
Definition: buf_internals.h:61
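
For orientation, a sketch of the cleanup ordering this function assumes in its callers; per the comment in the body, LWLockReleaseAll() has already run, which is why the io_in_progress lock must be re-acquired here (simplified; the exact surrounding code in xact.c differs):

/* Simplified error-cleanup sequence, as in AbortTransaction() and friends */
LWLockReleaseAll();     /* drops all LWLocks, including io_in_progress locks */
AbortBufferIO();        /* re-acquires the lock and marks the failed I/O */
UnlockBuffers();        /* clears any pin-count-wait state */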

◆ AtEOXact_Buffers()

void AtEOXact_Buffers ( bool  isCommit)

Definition at line 2443 of file bufmgr.c.

References Assert, AtEOXact_LocalBuffers(), CheckForBufferLeaks(), and PrivateRefCountOverflowed.

Referenced by AbortTransaction(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), CommitTransaction(), PrepareTransaction(), and WalWriterMain().

2444 {
2445  CheckForBufferLeaks();
2446 
2447  AtEOXact_LocalBuffers(isCommit);
2448 
2449  Assert(PrivateRefCountOverflowed == 0);
2450 }
static int32 PrivateRefCountOverflowed
Definition: bufmgr.c:190
#define Assert(condition)
Definition: c.h:738
static void CheckForBufferLeaks(void)
Definition: bufmgr.c:2518
void AtEOXact_LocalBuffers(bool isCommit)
Definition: localbuf.c:578

◆ AtProcExit_Buffers()

static void AtProcExit_Buffers ( int  code,
Datum  arg 
)
static

Definition at line 2499 of file bufmgr.c.

References AbortBufferIO(), AtProcExit_LocalBuffers(), CheckForBufferLeaks(), and UnlockBuffers().

Referenced by InitBufferPoolBackend().

2500 {
2501  AbortBufferIO();
2502  UnlockBuffers();
2503 
2504  CheckForBufferLeaks();
2505 
2506  /* localbuf.c needs a chance too */
2507  AtProcExit_LocalBuffers();
2508 }
void AtProcExit_LocalBuffers(void)
Definition: localbuf.c:589
void UnlockBuffers(void)
Definition: bufmgr.c:3694
static void CheckForBufferLeaks(void)
Definition: bufmgr.c:2518
void AbortBufferIO(void)
Definition: bufmgr.c:4172

◆ BgBufferSync()

bool BgBufferSync ( WritebackContext *  wb_context)

Definition at line 2073 of file bufmgr.c.

References Assert, bgwriter_lru_maxpages, bgwriter_lru_multiplier, BgWriterDelay, BgWriterStats, BUF_REUSABLE, BUF_WRITTEN, CurrentResourceOwner, DEBUG1, DEBUG2, elog, PgStat_MsgBgWriter::m_buf_alloc, PgStat_MsgBgWriter::m_buf_written_clean, PgStat_MsgBgWriter::m_maxwritten_clean, NBuffers, ResourceOwnerEnlargeBuffers(), StrategySyncStart(), and SyncOneBuffer().

Referenced by BackgroundWriterMain().

2074 {
2075  /* info obtained from freelist.c */
2076  int strategy_buf_id;
2077  uint32 strategy_passes;
2078  uint32 recent_alloc;
2079 
2080  /*
2081  * Information saved between calls so we can determine the strategy
2082  * point's advance rate and avoid scanning already-cleaned buffers.
2083  */
2084  static bool saved_info_valid = false;
2085  static int prev_strategy_buf_id;
2086  static uint32 prev_strategy_passes;
2087  static int next_to_clean;
2088  static uint32 next_passes;
2089 
2090  /* Moving averages of allocation rate and clean-buffer density */
2091  static float smoothed_alloc = 0;
2092  static float smoothed_density = 10.0;
2093 
2094  /* Potentially these could be tunables, but for now, not */
2095  float smoothing_samples = 16;
2096  float scan_whole_pool_milliseconds = 120000.0;
2097 
2098  /* Used to compute how far we scan ahead */
2099  long strategy_delta;
2100  int bufs_to_lap;
2101  int bufs_ahead;
2102  float scans_per_alloc;
2103  int reusable_buffers_est;
2104  int upcoming_alloc_est;
2105  int min_scan_buffers;
2106 
2107  /* Variables for the scanning loop proper */
2108  int num_to_scan;
2109  int num_written;
2110  int reusable_buffers;
2111 
2112  /* Variables for final smoothed_density update */
2113  long new_strategy_delta;
2114  uint32 new_recent_alloc;
2115 
2116  /*
2117  * Find out where the freelist clock sweep currently is, and how many
2118  * buffer allocations have happened since our last call.
2119  */
2120  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
2121 
2122  /* Report buffer alloc counts to pgstat */
2123  BgWriterStats.m_buf_alloc += recent_alloc;
2124 
2125  /*
2126  * If we're not running the LRU scan, just stop after doing the stats
2127  * stuff. We mark the saved state invalid so that we can recover sanely
2128  * if LRU scan is turned back on later.
2129  */
2130  if (bgwriter_lru_maxpages <= 0)
2131  {
2132  saved_info_valid = false;
2133  return true;
2134  }
2135 
2136  /*
2137  * Compute strategy_delta = how many buffers have been scanned by the
2138  * clock sweep since last time. If first time through, assume none. Then
2139  * see if we are still ahead of the clock sweep, and if so, how many
2140  * buffers we could scan before we'd catch up with it and "lap" it. Note:
2141  * weird-looking coding of xxx_passes comparisons are to avoid bogus
2142  * behavior when the passes counts wrap around.
2143  */
2144  if (saved_info_valid)
2145  {
2146  int32 passes_delta = strategy_passes - prev_strategy_passes;
2147 
2148  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2149  strategy_delta += (long) passes_delta * NBuffers;
2150 
2151  Assert(strategy_delta >= 0);
2152 
2153  if ((int32) (next_passes - strategy_passes) > 0)
2154  {
2155  /* we're one pass ahead of the strategy point */
2156  bufs_to_lap = strategy_buf_id - next_to_clean;
2157 #ifdef BGW_DEBUG
2158  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2159  next_passes, next_to_clean,
2160  strategy_passes, strategy_buf_id,
2161  strategy_delta, bufs_to_lap);
2162 #endif
2163  }
2164  else if (next_passes == strategy_passes &&
2165  next_to_clean >= strategy_buf_id)
2166  {
2167  /* on same pass, but ahead or at least not behind */
2168  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
2169 #ifdef BGW_DEBUG
2170  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2171  next_passes, next_to_clean,
2172  strategy_passes, strategy_buf_id,
2173  strategy_delta, bufs_to_lap);
2174 #endif
2175  }
2176  else
2177  {
2178  /*
2179  * We're behind, so skip forward to the strategy point and start
2180  * cleaning from there.
2181  */
2182 #ifdef BGW_DEBUG
2183  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
2184  next_passes, next_to_clean,
2185  strategy_passes, strategy_buf_id,
2186  strategy_delta);
2187 #endif
2188  next_to_clean = strategy_buf_id;
2189  next_passes = strategy_passes;
2190  bufs_to_lap = NBuffers;
2191  }
2192  }
2193  else
2194  {
2195  /*
2196  * Initializing at startup or after LRU scanning had been off. Always
2197  * start at the strategy point.
2198  */
2199 #ifdef BGW_DEBUG
2200  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
2201  strategy_passes, strategy_buf_id);
2202 #endif
2203  strategy_delta = 0;
2204  next_to_clean = strategy_buf_id;
2205  next_passes = strategy_passes;
2206  bufs_to_lap = NBuffers;
2207  }
2208 
2209  /* Update saved info for next time */
2210  prev_strategy_buf_id = strategy_buf_id;
2211  prev_strategy_passes = strategy_passes;
2212  saved_info_valid = true;
2213 
2214  /*
2215  * Compute how many buffers had to be scanned for each new allocation, ie,
2216  * 1/density of reusable buffers, and track a moving average of that.
2217  *
2218  * If the strategy point didn't move, we don't update the density estimate
2219  */
2220  if (strategy_delta > 0 && recent_alloc > 0)
2221  {
2222  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
2223  smoothed_density += (scans_per_alloc - smoothed_density) /
2224  smoothing_samples;
2225  }
2226 
2227  /*
2228  * Estimate how many reusable buffers there are between the current
2229  * strategy point and where we've scanned ahead to, based on the smoothed
2230  * density estimate.
2231  */
2232  bufs_ahead = NBuffers - bufs_to_lap;
2233  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
2234 
2235  /*
2236  * Track a moving average of recent buffer allocations. Here, rather than
2237  * a true average we want a fast-attack, slow-decline behavior: we
2238  * immediately follow any increase.
2239  */
2240  if (smoothed_alloc <= (float) recent_alloc)
2241  smoothed_alloc = recent_alloc;
2242  else
2243  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
2244  smoothing_samples;
2245 
2246  /* Scale the estimate by a GUC to allow more aggressive tuning. */
2247  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
2248 
2249  /*
2250  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
2251  * eventually underflow to zero, and the underflows produce annoying
2252  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
2253  * zero, there's no point in tracking smaller and smaller values of
2254  * smoothed_alloc, so just reset it to exactly zero to avoid this
2255  * syndrome. It will pop back up as soon as recent_alloc increases.
2256  */
2257  if (upcoming_alloc_est == 0)
2258  smoothed_alloc = 0;
2259 
2260  /*
2261  * Even in cases where there's been little or no buffer allocation
2262  * activity, we want to make a small amount of progress through the buffer
2263  * cache so that as many reusable buffers as possible are clean after an
2264  * idle period.
2265  *
2266  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
2267  * the BGW will be called during the scan_whole_pool time; slice the
2268  * buffer pool into that many sections.
2269  */
2270  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
2271 
2272  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
2273  {
2274 #ifdef BGW_DEBUG
2275  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
2276  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
2277 #endif
2278  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
2279  }
2280 
2281  /*
2282  * Now write out dirty reusable buffers, working forward from the
2283  * next_to_clean point, until we have lapped the strategy scan, or cleaned
2284  * enough buffers to match our estimate of the next cycle's allocation
2285  * requirements, or hit the bgwriter_lru_maxpages limit.
2286  */
2287 
2288  /* Make sure we can handle the pin inside SyncOneBuffer */
2289  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2290 
2291  num_to_scan = bufs_to_lap;
2292  num_written = 0;
2293  reusable_buffers = reusable_buffers_est;
2294 
2295  /* Execute the LRU scan */
2296  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
2297  {
2298  int sync_state = SyncOneBuffer(next_to_clean, true,
2299  wb_context);
2300 
2301  if (++next_to_clean >= NBuffers)
2302  {
2303  next_to_clean = 0;
2304  next_passes++;
2305  }
2306  num_to_scan--;
2307 
2308  if (sync_state & BUF_WRITTEN)
2309  {
2310  reusable_buffers++;
2311  if (++num_written >= bgwriter_lru_maxpages)
2312  {
2313  BgWriterStats.m_maxwritten_clean++;
2314  break;
2315  }
2316  }
2317  else if (sync_state & BUF_REUSABLE)
2318  reusable_buffers++;
2319  }
2320 
2321  BgWriterStats.m_buf_written_clean += num_written;
2322 
2323 #ifdef BGW_DEBUG
2324  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
2325  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
2326  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
2327  bufs_to_lap - num_to_scan,
2328  num_written,
2329  reusable_buffers - reusable_buffers_est);
2330 #endif
2331 
2332  /*
2333  * Consider the above scan as being like a new allocation scan.
2334  * Characterize its density and update the smoothed one based on it. This
2335  * effectively halves the moving average period in cases where both the
2336  * strategy and the background writer are doing some useful scanning,
2337  * which is helpful because a long memory isn't as desirable on the
2338  * density estimates.
2339  */
2340  new_strategy_delta = bufs_to_lap - num_to_scan;
2341  new_recent_alloc = reusable_buffers - reusable_buffers_est;
2342  if (new_strategy_delta > 0 && new_recent_alloc > 0)
2343  {
2344  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
2345  smoothed_density += (scans_per_alloc - smoothed_density) /
2346  smoothing_samples;
2347 
2348 #ifdef BGW_DEBUG
2349  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
2350  new_recent_alloc, new_strategy_delta,
2351  scans_per_alloc, smoothed_density);
2352 #endif
2353  }
2354 
2355  /* Return true if OK to hibernate */
2356  return (bufs_to_lap == 0 && recent_alloc == 0);
2357 }
PgStat_Counter m_buf_alloc
Definition: pgstat.h:434
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition: freelist.c:395
#define DEBUG1
Definition: elog.h:25
int BgWriterDelay
Definition: bgwriter.c:64
ResourceOwner CurrentResourceOwner
Definition: resowner.c:142
PgStat_Counter m_maxwritten_clean
Definition: pgstat.h:431
PgStat_Counter m_buf_written_clean
Definition: pgstat.h:430
PgStat_MsgBgWriter BgWriterStats
Definition: pgstat.c:142
double bgwriter_lru_multiplier
Definition: bufmgr.c:125
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition: bufmgr.c:2376
signed int int32
Definition: c.h:355
#define BUF_REUSABLE
Definition: bufmgr.c:68
int bgwriter_lru_maxpages
Definition: bufmgr.c:124
#define DEBUG2
Definition: elog.h:24
unsigned int uint32
Definition: c.h:367
#define BUF_WRITTEN
Definition: bufmgr.c:67
void ResourceOwnerEnlargeBuffers(ResourceOwner owner)
Definition: resowner.c:917
#define Assert(condition)
Definition: c.h:738
#define elog(elevel,...)
Definition: elog.h:214
int NBuffers
Definition: globals.c:131
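
Both smoothed_density and smoothed_alloc above follow the same recurrence, avg += (sample - avg) / smoothing_samples, an exponential moving average over roughly 16 samples; smoothed_alloc additionally jumps straight up to any larger sample (the fast-attack, slow-decline behavior described in the comments). A standalone sketch of that arithmetic (illustrative only, not part of bufmgr.c):

#include <stdio.h>

/* Moving average over ~n samples: avg += (sample - avg) / n. */
static float
ema_update(float avg, float sample, float n)
{
    return avg + (sample - avg) / n;
}

/* Fast-attack variant used for smoothed_alloc: follow increases at once. */
static float
alloc_update(float avg, float sample, float n)
{
    return (avg <= sample) ? sample : ema_update(avg, sample, n);
}

int
main(void)
{
    float   smoothed = 0.0f;
    float   samples[] = {100, 0, 0, 0, 400, 0};

    for (int i = 0; i < 6; i++)
    {
        smoothed = alloc_update(smoothed, samples[i], 16.0f);
        printf("sample=%3g smoothed=%.2f\n", samples[i], smoothed);
    }
    return 0;
}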

◆ BufferAlloc()

static BufferDesc * BufferAlloc ( SMgrRelation  smgr,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool *  foundPtr 
)
static

Definition at line 1005 of file bufmgr.c.

References Assert, BackendWritebackContext, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_IO_ERROR, BM_JUST_DIRTIED, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BUF_FLAG_MASK, BufferDesc::buf_id, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BUF_USAGECOUNT_ONE, BufferDescriptorGetContentLock, BufferGetLSN, BufMappingPartitionLock, BufTableDelete(), BufTableHashCode(), BufTableInsert(), BufTableLookup(), RelFileNode::dbNode, FlushBuffer(), GetBufferDescriptor, INIT_BUFFERTAG, INIT_FORKNUM, LockBufHdr(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockConditionalAcquire(), LWLockRelease(), RelFileNodeBackend::node, PinBuffer(), PinBuffer_Locked(), RelFileNode::relNode, ReservePrivateRefCountEntry(), ScheduleBufferTagForWriteback(), SMgrRelationData::smgr_rnode, RelFileNode::spcNode, StartBufferIO(), StrategyGetBuffer(), StrategyRejectBuffer(), BufferDesc::tag, UnlockBufHdr, UnpinBuffer(), and XLogNeedsFlush().

Referenced by ReadBuffer_common().

1009 {
1010  BufferTag newTag; /* identity of requested block */
1011  uint32 newHash; /* hash value for newTag */
1012  LWLock *newPartitionLock; /* buffer partition lock for it */
1013  BufferTag oldTag; /* previous identity of selected buffer */
1014  uint32 oldHash; /* hash value for oldTag */
1015  LWLock *oldPartitionLock; /* buffer partition lock for it */
1016  uint32 oldFlags;
1017  int buf_id;
1018  BufferDesc *buf;
1019  bool valid;
1020  uint32 buf_state;
1021 
1022  /* create a tag so we can lookup the buffer */
1023  INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
1024 
1025  /* determine its hash code and partition lock ID */
1026  newHash = BufTableHashCode(&newTag);
1027  newPartitionLock = BufMappingPartitionLock(newHash);
1028 
1029  /* see if the block is in the buffer pool already */
1030  LWLockAcquire(newPartitionLock, LW_SHARED);
1031  buf_id = BufTableLookup(&newTag, newHash);
1032  if (buf_id >= 0)
1033  {
1034  /*
1035  * Found it. Now, pin the buffer so no one can steal it from the
1036  * buffer pool, and check to see if the correct data has been loaded
1037  * into the buffer.
1038  */
1039  buf = GetBufferDescriptor(buf_id);
1040 
1041  valid = PinBuffer(buf, strategy);
1042 
1043  /* Can release the mapping lock as soon as we've pinned it */
1044  LWLockRelease(newPartitionLock);
1045 
1046  *foundPtr = true;
1047 
1048  if (!valid)
1049  {
1050  /*
1051  * We can only get here if (a) someone else is still reading in
1052  * the page, or (b) a previous read attempt failed. We have to
1053  * wait for any active read attempt to finish, and then set up our
1054  * own read attempt if the page is still not BM_VALID.
1055  * StartBufferIO does it all.
1056  */
1057  if (StartBufferIO(buf, true))
1058  {
1059  /*
1060  * If we get here, previous attempts to read the buffer must
1061  * have failed ... but we shall bravely try again.
1062  */
1063  *foundPtr = false;
1064  }
1065  }
1066 
1067  return buf;
1068  }
1069 
1070  /*
1071  * Didn't find it in the buffer pool. We'll have to initialize a new
1072  * buffer. Remember to unlock the mapping lock while doing the work.
1073  */
1074  LWLockRelease(newPartitionLock);
1075 
1076  /* Loop here in case we have to try another victim buffer */
1077  for (;;)
1078  {
1079  /*
1080  * Ensure, while the spinlock's not yet held, that there's a free
1081  * refcount entry.
1082  */
1083  ReservePrivateRefCountEntry();
1084 
1085  /*
1086  * Select a victim buffer. The buffer is returned with its header
1087  * spinlock still held!
1088  */
1089  buf = StrategyGetBuffer(strategy, &buf_state);
1090 
1091  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1092 
1093  /* Must copy buffer flags while we still hold the spinlock */
1094  oldFlags = buf_state & BUF_FLAG_MASK;
1095 
1096  /* Pin the buffer and then release the buffer spinlock */
1097  PinBuffer_Locked(buf);
1098 
1099  /*
1100  * If the buffer was dirty, try to write it out. There is a race
1101  * condition here, in that someone might dirty it after we released it
1102  * above, or even while we are writing it out (since our share-lock
1103  * won't prevent hint-bit updates). We will recheck the dirty bit
1104  * after re-locking the buffer header.
1105  */
1106  if (oldFlags & BM_DIRTY)
1107  {
1108  /*
1109  * We need a share-lock on the buffer contents to write it out
1110  * (else we might write invalid data, eg because someone else is
1111  * compacting the page contents while we write). We must use a
1112  * conditional lock acquisition here to avoid deadlock. Even
1113  * though the buffer was not pinned (and therefore surely not
1114  * locked) when StrategyGetBuffer returned it, someone else could
1115  * have pinned and exclusive-locked it by the time we get here. If
1116  * we try to get the lock unconditionally, we'd block waiting for
1117  * them; if they later block waiting for us, deadlock ensues.
1118  * (This has been observed to happen when two backends are both
1119  * trying to split btree index pages, and the second one just
1120  * happens to be trying to split the page the first one got from
1121  * StrategyGetBuffer.)
1122  */
1123  if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
1124  LW_SHARED))
1125  {
1126  /*
1127  * If using a nondefault strategy, and writing the buffer
1128  * would require a WAL flush, let the strategy decide whether
1129  * to go ahead and write/reuse the buffer or to choose another
1130  * victim. We need lock to inspect the page LSN, so this
1131  * can't be done inside StrategyGetBuffer.
1132  */
1133  if (strategy != NULL)
1134  {
1135  XLogRecPtr lsn;
1136 
1137  /* Read the LSN while holding buffer header lock */
1138  buf_state = LockBufHdr(buf);
1139  lsn = BufferGetLSN(buf);
1140  UnlockBufHdr(buf, buf_state);
1141 
1142  if (XLogNeedsFlush(lsn) &&
1143  StrategyRejectBuffer(strategy, buf))
1144  {
1145  /* Drop lock/pin and loop around for another buffer */
1146  LWLockRelease(BufferDescriptorGetContentLock(buf));
1147  UnpinBuffer(buf, true);
1148  continue;
1149  }
1150  }
1151 
1152  /* OK, do the I/O */
1153  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
1154  smgr->smgr_rnode.node.spcNode,
1155  smgr->smgr_rnode.node.dbNode,
1156  smgr->smgr_rnode.node.relNode);
1157 
1158  FlushBuffer(buf, NULL);
1159  LWLockRelease(BufferDescriptorGetContentLock(buf));
1160 
1161  ScheduleBufferTagForWriteback(&BackendWritebackContext,
1162  &buf->tag);
1163 
1164  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
1165  smgr->smgr_rnode.node.spcNode,
1166  smgr->smgr_rnode.node.dbNode,
1167  smgr->smgr_rnode.node.relNode);
1168  }
1169  else
1170  {
1171  /*
1172  * Someone else has locked the buffer, so give it up and loop
1173  * back to get another one.
1174  */
1175  UnpinBuffer(buf, true);
1176  continue;
1177  }
1178  }
1179 
1180  /*
1181  * To change the association of a valid buffer, we'll need to have
1182  * exclusive lock on both the old and new mapping partitions.
1183  */
1184  if (oldFlags & BM_TAG_VALID)
1185  {
1186  /*
1187  * Need to compute the old tag's hashcode and partition lock ID.
1188  * XXX is it worth storing the hashcode in BufferDesc so we need
1189  * not recompute it here? Probably not.
1190  */
1191  oldTag = buf->tag;
1192  oldHash = BufTableHashCode(&oldTag);
1193  oldPartitionLock = BufMappingPartitionLock(oldHash);
1194 
1195  /*
1196  * Must lock the lower-numbered partition first to avoid
1197  * deadlocks.
1198  */
1199  if (oldPartitionLock < newPartitionLock)
1200  {
1201  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1202  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1203  }
1204  else if (oldPartitionLock > newPartitionLock)
1205  {
1206  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1207  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1208  }
1209  else
1210  {
1211  /* only one partition, only one lock */
1212  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1213  }
1214  }
1215  else
1216  {
1217  /* if it wasn't valid, we need only the new partition */
1218  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1219  /* remember we have no old-partition lock or tag */
1220  oldPartitionLock = NULL;
1221  /* keep the compiler quiet about uninitialized variables */
1222  oldHash = 0;
1223  }
1224 
1225  /*
1226  * Try to make a hashtable entry for the buffer under its new tag.
1227  * This could fail because while we were writing someone else
1228  * allocated another buffer for the same block we want to read in.
1229  * Note that we have not yet removed the hashtable entry for the old
1230  * tag.
1231  */
1232  buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
1233 
1234  if (buf_id >= 0)
1235  {
1236  /*
1237  * Got a collision. Someone has already done what we were about to
1238  * do. We'll just handle this as if it were found in the buffer
1239  * pool in the first place. First, give up the buffer we were
1240  * planning to use.
1241  */
1242  UnpinBuffer(buf, true);
1243 
1244  /* Can give up that buffer's mapping partition lock now */
1245  if (oldPartitionLock != NULL &&
1246  oldPartitionLock != newPartitionLock)
1247  LWLockRelease(oldPartitionLock);
1248 
1249  /* remaining code should match code at top of routine */
1250 
1251  buf = GetBufferDescriptor(buf_id);
1252 
1253  valid = PinBuffer(buf, strategy);
1254 
1255  /* Can release the mapping lock as soon as we've pinned it */
1256  LWLockRelease(newPartitionLock);
1257 
1258  *foundPtr = true;
1259 
1260  if (!valid)
1261  {
1262  /*
1263  * We can only get here if (a) someone else is still reading
1264  * in the page, or (b) a previous read attempt failed. We
1265  * have to wait for any active read attempt to finish, and
1266  * then set up our own read attempt if the page is still not
1267  * BM_VALID. StartBufferIO does it all.
1268  */
1269  if (StartBufferIO(buf, true))
1270  {
1271  /*
1272  * If we get here, previous attempts to read the buffer
1273  * must have failed ... but we shall bravely try again.
1274  */
1275  *foundPtr = false;
1276  }
1277  }
1278 
1279  return buf;
1280  }
1281 
1282  /*
1283  * Need to lock the buffer header too in order to change its tag.
1284  */
1285  buf_state = LockBufHdr(buf);
1286 
1287  /*
1288  * Somebody could have pinned or re-dirtied the buffer while we were
1289  * doing the I/O and making the new hashtable entry. If so, we can't
1290  * recycle this buffer; we must undo everything we've done and start
1291  * over with a new victim buffer.
1292  */
1293  oldFlags = buf_state & BUF_FLAG_MASK;
1294  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1 && !(oldFlags & BM_DIRTY))
1295  break;
1296 
1297  UnlockBufHdr(buf, buf_state);
1298  BufTableDelete(&newTag, newHash);
1299  if (oldPartitionLock != NULL &&
1300  oldPartitionLock != newPartitionLock)
1301  LWLockRelease(oldPartitionLock);
1302  LWLockRelease(newPartitionLock);
1303  UnpinBuffer(buf, true);
1304  }
1305 
1306  /*
1307  * Okay, it's finally safe to rename the buffer.
1308  *
1309  * Clearing BM_VALID here is necessary, clearing the dirtybits is just
1310  * paranoia. We also reset the usage_count since any recency of use of
1311  * the old content is no longer relevant. (The usage_count starts out at
1312  * 1 so that the buffer can survive one clock-sweep pass.)
1313  *
1314  * Make sure BM_PERMANENT is set for buffers that must be written at every
1315  * checkpoint. Unlogged buffers only need to be written at shutdown
1316  * checkpoints, except for their "init" forks, which need to be treated
1317  * just like permanent relations.
1318  */
1319  buf->tag = newTag;
1320  buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
1321  BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
1322  BUF_USAGECOUNT_MASK);
1323  if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1324  buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
1325  else
1326  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1327 
1328  UnlockBufHdr(buf, buf_state);
1329 
1330  if (oldPartitionLock != NULL)
1331  {
1332  BufTableDelete(&oldTag, oldHash);
1333  if (oldPartitionLock != newPartitionLock)
1334  LWLockRelease(oldPartitionLock);
1335  }
1336 
1337  LWLockRelease(newPartitionLock);
1338 
1339  /*
1340  * Buffer contents are currently invalid. Try to get the io_in_progress
1341  * lock. If StartBufferIO returns false, then someone else managed to
1342  * read it before we did, so there's nothing left for BufferAlloc() to do.
1343  */
1344  if (StartBufferIO(buf, true))
1345  *foundPtr = false;
1346  else
1347  *foundPtr = true;
1348 
1349  return buf;
1350 }
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
Definition: bufmgr.c:1589
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
Definition: freelist.c:201
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:149
Definition: lwlock.h:31
#define BM_PERMANENT
Definition: buf_internals.h:66
#define BufMappingPartitionLock(hashcode)
#define BM_TAG_VALID
Definition: buf_internals.h:60
bool XLogNeedsFlush(XLogRecPtr record)
Definition: xlog.c:3168
#define BM_CHECKPOINT_NEEDED
Definition: buf_internals.h:65
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:79
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:91
#define BM_DIRTY
Definition: buf_internals.h:58
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln)
Definition: bufmgr.c:2693
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1812
static bool StartBufferIO(BufferDesc *buf, bool forInput)
Definition: bufmgr.c:4073
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition: buf_table.c:119
void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
Definition: bufmgr.c:4442
#define BUF_FLAG_MASK
Definition: buf_internals.h:45
RelFileNodeBackend smgr_rnode
Definition: smgr.h:42
WritebackContext BackendWritebackContext
Definition: buf_init.c:23
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1380
static char * buf
Definition: pg_test_fsync.c:67
#define BUF_USAGECOUNT_ONE
Definition: buf_internals.h:43
#define GetBufferDescriptor(id)
#define BM_JUST_DIRTIED
Definition: buf_internals.h:63
unsigned int uint32
Definition: c.h:367
static void UnpinBuffer(BufferDesc *buf, bool fixOwner)
Definition: bufmgr.c:1712
#define BM_VALID
Definition: buf_internals.h:59
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf)
Definition: freelist.c:686
RelFileNode node
Definition: relfilenode.h:74
#define BufferDescriptorGetContentLock(bdesc)
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4290
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:738
#define INIT_BUFFERTAG(a, xx_rnode, xx_forkNum, xx_blockNum)
static void PinBuffer_Locked(BufferDesc *buf)
Definition: bufmgr.c:1674
#define BUF_USAGECOUNT_MASK
Definition: buf_internals.h:42
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1208
#define BM_IO_ERROR
Definition: buf_internals.h:62
BufferTag tag
#define UnlockBufHdr(desc, s)
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:206
#define BufferGetLSN(bufHdr)
Definition: bufmgr.c:60
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:48

◆ BufferGetBlockNumber()

BlockNumber BufferGetBlockNumber ( Buffer  buffer)

Definition at line 2633 of file bufmgr.c.

References Assert, buftag::blockNum, BufferIsLocal, BufferIsPinned, GetBufferDescriptor, GetLocalBufferDescriptor, and BufferDesc::tag.

Referenced by _bt_check_unique(), _bt_checkpage(), _bt_delitems_vacuum(), _bt_doinsert(), _bt_endpoint(), _bt_finish_split(), _bt_first(), _bt_getroot(), _bt_insert_parent(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_moveright(), _bt_newroot(), _bt_pagedel(), _bt_readnextpage(), _bt_readpage(), _bt_restore_meta(), _bt_search(), _bt_split(), _bt_unlink_halfdead_page(), _bt_walk_left(), _hash_addovflpage(), _hash_checkpage(), _hash_doinsert(), _hash_first(), _hash_freeovflpage(), _hash_getnewbuf(), _hash_readnext(), _hash_readpage(), _hash_splitbucket(), allocNewBuffer(), blinsert(), BloomInitMetapage(), brin_doinsert(), brin_doupdate(), brin_getinsertbuffer(), brin_initialize_empty_new_buffer(), brin_page_cleanup(), brin_xlog_insert_update(), brinbuild(), brinGetTupleForHeapBlock(), createPostingTree(), dataBeginPlaceToPageLeaf(), dataPrepareDownlink(), doPickSplit(), entryPrepareDownlink(), fill_seq_with_data(), ginEntryInsert(), ginFindParents(), ginFinishSplit(), ginPlaceToPage(), ginRedoDeleteListPages(), ginRedoUpdateMetapage(), ginScanToDelete(), gistbufferinginserttuples(), gistbuild(), gistcheckpage(), gistdeletepage(), gistformdownlink(), gistinserttuples(), gistMemorizeAllDownlinks(), gistplacetopage(), gistRelocateBuildBuffersOnSplit(), gistScanPage(), hash_xlog_add_ovfl_page(), heap_delete(), heap_hot_search_buffer(), heap_insert(), heap_multi_insert(), heap_page_is_all_visible(), heap_prune_chain(), heap_update(), heap_xlog_confirm(), heap_xlog_lock(), makeSublist(), moveLeafs(), moveRightIfItNeeded(), pgstathashindex(), ReadBufferBI(), RelationAddExtraBlocks(), RelationGetBufferForTuple(), RelationPutHeapTuple(), revmap_get_buffer(), revmap_physical_extend(), spgAddNodeAction(), spgbuild(), spgdoinsert(), SpGistSetLastUsedPage(), spgSplitNodeAction(), spgWalk(), startScanEntry(), terminate_brin_buildstate(), vacuumLeafPage(), visibilitymap_clear(), visibilitymap_get_status(), visibilitymap_pin(), visibilitymap_pin_ok(), visibilitymap_set(), and XLogReadBufferExtended().

2634 {
2635  BufferDesc *bufHdr;
2636 
2637  Assert(BufferIsPinned(buffer));
2638 
2639  if (BufferIsLocal(buffer))
2640  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2641  else
2642  bufHdr = GetBufferDescriptor(buffer - 1);
2643 
2644  /* pinned, so OK to read tag without spinlock */
2645  return bufHdr->tag.blockNum;
2646 }
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:439
#define GetLocalBufferDescriptor(id)
#define GetBufferDescriptor(id)
#define Assert(condition)
Definition: c.h:738
#define BufferIsLocal(buffer)
Definition: buf.h:37
BlockNumber blockNum
Definition: buf_internals.h:94
BufferTag tag
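
A typical call pattern, assuming an already-open Relation rel (a sketch; ReadBuffer, LockBuffer, and UnlockReleaseBuffer are the public bufmgr API listed above):

Buffer      buf = ReadBuffer(rel, 0);       /* pins block 0 of rel */
BlockNumber blkno;

LockBuffer(buf, BUFFER_LOCK_SHARE);
blkno = BufferGetBlockNumber(buf);          /* safe: we hold the pin */
UnlockReleaseBuffer(buf);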

◆ BufferGetLSNAtomic()

XLogRecPtr BufferGetLSNAtomic ( Buffer  buffer)

Definition at line 2896 of file bufmgr.c.

References Assert, BufferGetPage, BufferIsLocal, BufferIsPinned, BufferIsValid, GetBufferDescriptor, LockBufHdr(), PageGetLSN, UnlockBufHdr, and XLogHintBitIsNeeded.

Referenced by _bt_killitems(), _bt_readpage(), gistdoinsert(), gistFindPath(), gistkillitems(), gistScanPage(), SetHintBits(), and XLogSaveBufferForHint().

2897 {
2898  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
2899  char *page = BufferGetPage(buffer);
2900  XLogRecPtr lsn;
2901  uint32 buf_state;
2902 
2903  /*
2904  * If we don't need locking for correctness, fastpath out.
2905  */
2906  if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
2907  return PageGetLSN(page);
2908 
2909  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2910  Assert(BufferIsValid(buffer));
2911  Assert(BufferIsPinned(buffer));
2912 
2913  buf_state = LockBufHdr(bufHdr);
2914  lsn = PageGetLSN(page);
2915  UnlockBufHdr(bufHdr, buf_state);
2916 
2917  return lsn;
2918 }
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:439
#define GetBufferDescriptor(id)
unsigned int uint32
Definition: c.h:367
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4290
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:738
#define BufferIsLocal(buffer)
Definition: buf.h:37
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123
#define PageGetLSN(page)
Definition: bufpage.h:366
#define UnlockBufHdr(desc, s)
#define XLogHintBitIsNeeded()
Definition: xlog.h:202

◆ BufferGetTag()

void BufferGetTag ( Buffer  buffer,
RelFileNode *  rnode,
ForkNumber *  forknum,
BlockNumber *  blknum 
)

Definition at line 2654 of file bufmgr.c.

References Assert, buftag::blockNum, BufferIsLocal, BufferIsPinned, buftag::forkNum, GetBufferDescriptor, GetLocalBufferDescriptor, buftag::rnode, and BufferDesc::tag.

Referenced by fsm_search_avail(), ginRedoInsertEntry(), log_newpage_buffer(), ResolveCminCmaxDuringDecoding(), XLogRegisterBuffer(), and XLogSaveBufferForHint().

2656 {
2657  BufferDesc *bufHdr;
2658 
2659  /* Do the same checks as BufferGetBlockNumber. */
2660  Assert(BufferIsPinned(buffer));
2661 
2662  if (BufferIsLocal(buffer))
2663  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2664  else
2665  bufHdr = GetBufferDescriptor(buffer - 1);
2666 
2667  /* pinned, so OK to read tag without spinlock */
2668  *rnode = bufHdr->tag.rnode;
2669  *forknum = bufHdr->tag.forkNum;
2670  *blknum = bufHdr->tag.blockNum;
2671 }
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:439
ForkNumber forkNum
Definition: buf_internals.h:93
#define GetLocalBufferDescriptor(id)
#define GetBufferDescriptor(id)
#define Assert(condition)
Definition: c.h:738
#define BufferIsLocal(buffer)
Definition: buf.h:37
BlockNumber blockNum
Definition: buf_internals.h:94
RelFileNode rnode
Definition: buf_internals.h:92
BufferTag tag

◆ BufferIsPermanent()

bool BufferIsPermanent ( Buffer  buffer)

Definition at line 2866 of file bufmgr.c.

References Assert, BM_PERMANENT, BufferIsLocal, BufferIsPinned, BufferIsValid, GetBufferDescriptor, pg_atomic_read_u32(), and BufferDesc::state.

Referenced by SetHintBits().

2867 {
2868  BufferDesc *bufHdr;
2869 
2870  /* Local buffers are used only for temp relations. */
2871  if (BufferIsLocal(buffer))
2872  return false;
2873 
2874  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2875  Assert(BufferIsValid(buffer));
2876  Assert(BufferIsPinned(buffer));
2877 
2878  /*
2879  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
2880  * need not bother with the buffer header spinlock. Even if someone else
2881  * changes the buffer header state while we're doing this, the state is
2882  * changed atomically, so we'll read the old value or the new value, but
2883  * not random garbage.
2884  */
2885  bufHdr = GetBufferDescriptor(buffer - 1);
2886  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
2887 }
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:439
#define BM_PERMANENT
Definition: buf_internals.h:66
#define GetBufferDescriptor(id)
#define Assert(condition)
Definition: c.h:738
#define BufferIsLocal(buffer)
Definition: buf.h:37
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123
pg_atomic_uint32 state
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:241
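
Together with BufferGetLSNAtomic(), this backs the safety check in SetHintBits(): when checksums or wal_log_hints require it, a hint bit may not be set until the commit record is flushed at least up to the page LSN. A simplified sketch of that guard (not the verbatim heapam source; commitLSN comes from the committing transaction):

if (BufferIsPermanent(buffer) &&
    XLogNeedsFlush(commitLSN) &&
    BufferGetLSNAtomic(buffer) < commitLSN)
    return;     /* commit record not durable yet: skip setting the hint */

tuple->t_infomask |= infomask;
MarkBufferDirtyHint(buffer, true);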

◆ BufferSync()

static void BufferSync ( int  flags)
static

Definition at line 1796 of file bufmgr.c.

References Assert, BgWriterStats, binaryheap_add_unordered(), binaryheap_allocate(), binaryheap_build(), binaryheap_empty, binaryheap_first(), binaryheap_free(), binaryheap_remove_first(), binaryheap_replace_first(), buftag::blockNum, CkptSortItem::blockNum, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_PERMANENT, CkptSortItem::buf_id, BUF_WRITTEN, CHECKPOINT_END_OF_RECOVERY, checkpoint_flush_after, CHECKPOINT_FLUSH_ALL, CHECKPOINT_IS_SHUTDOWN, CheckpointStats, CheckpointWriteDelay(), ckpt_buforder_comparator(), CheckpointStatsData::ckpt_bufs_written, CkptBufferIds, CurrentResourceOwner, DatumGetPointer, buftag::forkNum, CkptSortItem::forkNum, GetBufferDescriptor, i, CkptTsStatus::index, InvalidOid, IssuePendingWritebacks(), LockBufHdr(), PgStat_MsgBgWriter::m_buf_written_checkpoints, NBuffers, CkptTsStatus::num_scanned, CkptTsStatus::num_to_scan, palloc(), pfree(), pg_atomic_read_u32(), PointerGetDatum, ProcessProcSignalBarrier(), ProcSignalBarrierPending, CkptTsStatus::progress, CkptTsStatus::progress_slice, qsort, RelFileNode::relNode, CkptSortItem::relNode, repalloc(), ResourceOwnerEnlargeBuffers(), buftag::rnode, RelFileNode::spcNode, BufferDesc::state, SyncOneBuffer(), BufferDesc::tag, ts_ckpt_progress_comparator(), CkptTsStatus::tsId, CkptSortItem::tsId, UnlockBufHdr, and WritebackContextInit().

Referenced by CheckPointBuffers().

1797 {
1798  uint32 buf_state;
1799  int buf_id;
1800  int num_to_scan;
1801  int num_spaces;
1802  int num_processed;
1803  int num_written;
1804  CkptTsStatus *per_ts_stat = NULL;
1805  Oid last_tsid;
1806  binaryheap *ts_heap;
1807  int i;
1808  int mask = BM_DIRTY;
1809  WritebackContext wb_context;
1810 
1811  /* Make sure we can handle the pin inside SyncOneBuffer */
1812  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1813 
1814  /*
1815  * Unless this is a shutdown checkpoint or we have been explicitly told,
1816  * we write only permanent, dirty buffers. But at shutdown or end of
1817  * recovery, we write all dirty buffers.
1818  */
1819  if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
1820  CHECKPOINT_FLUSH_ALL))))
1821  mask |= BM_PERMANENT;
1822 
1823  /*
1824  * Loop over all buffers, and mark the ones that need to be written with
1825  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
1826  * can estimate how much work needs to be done.
1827  *
1828  * This allows us to write only those pages that were dirty when the
1829  * checkpoint began, and not those that get dirtied while it proceeds.
1830  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
1831  * later in this function, or by normal backends or the bgwriter cleaning
1832  * scan, the flag is cleared. Any buffer dirtied after this point won't
1833  * have the flag set.
1834  *
1835  * Note that if we fail to write some buffer, we may leave buffers with
1836  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
1837  * certainly need to be written for the next checkpoint attempt, too.
1838  */
1839  num_to_scan = 0;
1840  for (buf_id = 0; buf_id < NBuffers; buf_id++)
1841  {
1842  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
1843 
1844  /*
1845  * Header spinlock is enough to examine BM_DIRTY, see comment in
1846  * SyncOneBuffer.
1847  */
1848  buf_state = LockBufHdr(bufHdr);
1849 
1850  if ((buf_state & mask) == mask)
1851  {
1852  CkptSortItem *item;
1853 
1854  buf_state |= BM_CHECKPOINT_NEEDED;
1855 
1856  item = &CkptBufferIds[num_to_scan++];
1857  item->buf_id = buf_id;
1858  item->tsId = bufHdr->tag.rnode.spcNode;
1859  item->relNode = bufHdr->tag.rnode.relNode;
1860  item->forkNum = bufHdr->tag.forkNum;
1861  item->blockNum = bufHdr->tag.blockNum;
1862  }
1863 
1864  UnlockBufHdr(bufHdr, buf_state);
1865 
1866  /* Check for barrier events in case NBuffers is large. */
1867  if (ProcSignalBarrierPending)
1868  ProcessProcSignalBarrier();
1869  }
1870 
1871  if (num_to_scan == 0)
1872  return; /* nothing to do */
1873 
1874  WritebackContextInit(&wb_context, &checkpoint_flush_after);
1875 
1876  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
1877 
1878  /*
1879  * Sort buffers that need to be written to reduce the likelihood of random
1880  * IO. The sorting is also important for the implementation of balancing
1881  * writes between tablespaces. Without balancing writes we'd potentially
1882  * end up writing to the tablespaces one-by-one; possibly overloading the
1883  * underlying system.
1884  */
1885  qsort(CkptBufferIds, num_to_scan, sizeof(CkptSortItem),
1886  ckpt_buforder_comparator);
1887 
1888  num_spaces = 0;
1889 
1890  /*
1891  * Allocate progress status for each tablespace with buffers that need to
1892  * be flushed. This requires the to-be-flushed array to be sorted.
1893  */
1894  last_tsid = InvalidOid;
1895  for (i = 0; i < num_to_scan; i++)
1896  {
1897  CkptTsStatus *s;
1898  Oid cur_tsid;
1899 
1900  cur_tsid = CkptBufferIds[i].tsId;
1901 
1902  /*
1903  * Grow array of per-tablespace status structs, every time a new
1904  * tablespace is found.
1905  */
1906  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
1907  {
1908  Size sz;
1909 
1910  num_spaces++;
1911 
1912  /*
1913  * Not worth adding grow-by-power-of-2 logic here - even with a
1914  * few hundred tablespaces this should be fine.
1915  */
1916  sz = sizeof(CkptTsStatus) * num_spaces;
1917 
1918  if (per_ts_stat == NULL)
1919  per_ts_stat = (CkptTsStatus *) palloc(sz);
1920  else
1921  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
1922 
1923  s = &per_ts_stat[num_spaces - 1];
1924  memset(s, 0, sizeof(*s));
1925  s->tsId = cur_tsid;
1926 
1927  /*
1928  * The first buffer in this tablespace. As CkptBufferIds is sorted
1929  * by tablespace all (s->num_to_scan) buffers in this tablespace
1930  * will follow afterwards.
1931  */
1932  s->index = i;
1933 
1934  /*
1935  * progress_slice will be determined once we know how many buffers
1936  * are in each tablespace, i.e. after this loop.
1937  */
1938 
1939  last_tsid = cur_tsid;
1940  }
1941  else
1942  {
1943  s = &per_ts_stat[num_spaces - 1];
1944  }
1945 
1946  s->num_to_scan++;
1947 
1948  /* Check for barrier events. */
1949  if (ProcSignalBarrierPending)
1950  ProcessProcSignalBarrier();
1951  }
1952 
1953  Assert(num_spaces > 0);
1954 
1955  /*
1956  * Build a min-heap over the write-progress in the individual tablespaces,
1957  * and compute how large a portion of the total progress a single
1958  * processed buffer is.
1959  */
1960  ts_heap = binaryheap_allocate(num_spaces,
1961  ts_ckpt_progress_comparator,
1962  NULL);
1963 
1964  for (i = 0; i < num_spaces; i++)
1965  {
1966  CkptTsStatus *ts_stat = &per_ts_stat[i];
1967 
1968  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
1969 
1970  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
1971  }
1972 
1973  binaryheap_build(ts_heap);
1974 
1975  /*
1976  * Iterate through to-be-checkpointed buffers and write the ones (still)
1977  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
1978  * tablespaces; otherwise the sorting would lead to only one tablespace
1979  * receiving writes at a time, making inefficient use of the hardware.
1980  */
1981  num_processed = 0;
1982  num_written = 0;
1983  while (!binaryheap_empty(ts_heap))
1984  {
1985  BufferDesc *bufHdr = NULL;
1986  CkptTsStatus *ts_stat = (CkptTsStatus *)
1987  DatumGetPointer(binaryheap_first(ts_heap));
1988 
1989  buf_id = CkptBufferIds[ts_stat->index].buf_id;
1990  Assert(buf_id != -1);
1991 
1992  bufHdr = GetBufferDescriptor(buf_id);
1993 
1994  num_processed++;
1995 
1996  /*
1997  * We don't need to acquire the lock here, because we're only looking
1998  * at a single bit. It's possible that someone else writes the buffer
1999  * and clears the flag right after we check, but that doesn't matter
2000  * since SyncOneBuffer will then do nothing. However, there is a
2001  * further race condition: it's conceivable that between the time we
2002  * examine the bit here and the time SyncOneBuffer acquires the lock,
2003  * someone else not only wrote the buffer but replaced it with another
2004  * page and dirtied it. In that improbable case, SyncOneBuffer will
2005  * write the buffer though we didn't need to. It doesn't seem worth
2006  * guarding against this, though.
2007  */
2008  if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
2009  {
2010  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
2011  {
2012  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
2013  BgWriterStats.m_buf_written_checkpoints++;
2014  num_written++;
2015  }
2016  }
2017 
2018  /*
2019  * Measure progress independent of actually having to flush the buffer
2020  * - otherwise writing becomes unbalanced.
2021  */
2022  ts_stat->progress += ts_stat->progress_slice;
2023  ts_stat->num_scanned++;
2024  ts_stat->index++;
2025 
2026  /* Have all the buffers from the tablespace been processed? */
2027  if (ts_stat->num_scanned == ts_stat->num_to_scan)
2028  {
2029  binaryheap_remove_first(ts_heap);
2030  }
2031  else
2032  {
2033  /* update heap with the new progress */
2034  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
2035  }
2036 
2037  /*
2038  * Sleep to throttle our I/O rate.
2039  *
2040  * (This will check for barrier events even if it doesn't sleep.)
2041  */
2042  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
2043  }
2044 
2045  /* issue all pending flushes */
2046  IssuePendingWritebacks(&wb_context);
2047 
2048  pfree(per_ts_stat);
2049  per_ts_stat = NULL;
2050  binaryheap_free(ts_heap);
2051 
2052  /*
2053  * Update checkpoint statistics. As noted above, this doesn't include
2054  * buffers written by other backends or bgwriter scan.
2055  */
2056  CheckpointStats.ckpt_bufs_written += num_written;
2057 
2058  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
2059 }

◆ buffertag_comparator()

static int buffertag_comparator(const void *p1, const void *p2)

Definition at line 4342 of file bufmgr.c.

References buftag::blockNum, buftag::forkNum, buftag::rnode, and rnode_comparator().

Referenced by IssuePendingWritebacks().

4343 {
4344  const BufferTag *ba = (const BufferTag *) a;
4345  const BufferTag *bb = (const BufferTag *) b;
4346  int ret;
4347 
4348  ret = rnode_comparator(&ba->rnode, &bb->rnode);
4349 
4350  if (ret != 0)
4351  return ret;
4352 
4353  if (ba->forkNum < bb->forkNum)
4354  return -1;
4355  if (ba->forkNum > bb->forkNum)
4356  return 1;
4357 
4358  if (ba->blockNum < bb->blockNum)
4359  return -1;
4360  if (ba->blockNum > bb->blockNum)
4361  return 1;
4362 
4363  return 0;
4364 }

◆ BufmgrCommit()

void BufmgrCommit ( void  )

Definition at line 2619 of file bufmgr.c.

Referenced by PrepareTransaction(), and RecordTransactionCommit().

2620 {
2621  /* Nothing to do in bufmgr anymore... */
2622 }

◆ CheckForBufferLeaks()

static void CheckForBufferLeaks(void)

Definition at line 2518 of file bufmgr.c.

References Assert, PrivateRefCountEntry::buffer, hash_seq_init(), hash_seq_search(), i, InvalidBuffer, PrintBufferLeakWarning(), PrivateRefCountArray, PrivateRefCountOverflowed, and REFCOUNT_ARRAY_ENTRIES.

Referenced by AtEOXact_Buffers(), and AtProcExit_Buffers().

2519 {
2520 #ifdef USE_ASSERT_CHECKING
2521  int RefCountErrors = 0;
2522  PrivateRefCountEntry *res;
2523  int i;
2524 
2525  /* check the array */
2526  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
2527  {
2528  res = &PrivateRefCountArray[i];
2529 
2530  if (res->buffer != InvalidBuffer)
2531  {
2532  PrintBufferLeakWarning(res->buffer);
2533  RefCountErrors++;
2534  }
2535  }
2536 
2537  /* if necessary search the hash */
2538  if (PrivateRefCountOverflowed)
2539  {
2540  HASH_SEQ_STATUS hstat;
2541 
2542  hash_seq_init(&hstat, PrivateRefCountHash);
2543  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
2544  {
2545  PrintBufferLeakWarning(res->buffer);
2546  RefCountErrors++;
2547  }
2548 
2549  }
2550 
2551  Assert(RefCountErrors == 0);
2552 #endif
2553 }

◆ CheckPointBuffers()

void CheckPointBuffers ( int  flags)

Definition at line 2602 of file bufmgr.c.

References BufferSync(), CheckpointStats, CheckpointStatsData::ckpt_sync_end_t, CheckpointStatsData::ckpt_sync_t, CheckpointStatsData::ckpt_write_t, GetCurrentTimestamp(), and ProcessSyncRequests().

Referenced by CheckPointGuts().

2603 {
2604  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
2605  CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
2606  BufferSync(flags);
2607  CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
2608  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
2609  ProcessSyncRequests();
2610  CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
2611  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
2612 }

◆ ckpt_buforder_comparator()

static int ckpt_buforder_comparator(const void *pa, const void *pb)

Definition at line 4373 of file bufmgr.c.

References CkptSortItem::blockNum, CkptSortItem::forkNum, CkptSortItem::relNode, and CkptSortItem::tsId.

Referenced by BufferSync().

4374 {
4375  const CkptSortItem *a = (const CkptSortItem *) pa;
4376  const CkptSortItem *b = (const CkptSortItem *) pb;
4377 
4378  /* compare tablespace */
4379  if (a->tsId < b->tsId)
4380  return -1;
4381  else if (a->tsId > b->tsId)
4382  return 1;
4383  /* compare relation */
4384  if (a->relNode < b->relNode)
4385  return -1;
4386  else if (a->relNode > b->relNode)
4387  return 1;
4388  /* compare fork */
4389  else if (a->forkNum < b->forkNum)
4390  return -1;
4391  else if (a->forkNum > b->forkNum)
4392  return 1;
4393  /* compare block number */
4394  else if (a->blockNum < b->blockNum)
4395  return -1;
4396  else if (a->blockNum > b->blockNum)
4397  return 1;
4398  /* equal page IDs are unlikely, but not impossible */
4399  return 0;
4400 }

◆ ConditionalLockBuffer()

bool ConditionalLockBuffer ( Buffer  buffer)

Definition at line 3748 of file bufmgr.c.

References Assert, buf, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsValid, GetBufferDescriptor, LW_EXCLUSIVE, and LWLockConditionalAcquire().

Referenced by _bt_getbuf(), _bt_search_insert(), BloomNewBuffer(), ConditionalLockBufferForCleanup(), GinNewBuffer(), gistNewBuffer(), RelationGetBufferForTuple(), spgdoinsert(), SpGistGetBuffer(), SpGistNewBuffer(), and SpGistUpdateMetaPage().

3749 {
3750  BufferDesc *buf;
3751 
3752  Assert(BufferIsValid(buffer));
3753  if (BufferIsLocal(buffer))
3754  return true; /* act as though we got it */
3755 
3756  buf = GetBufferDescriptor(buffer - 1);
3757 
3758  return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
3759  LW_EXCLUSIVE);
3760 }
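A sketch of the usual caller pattern, hedged: try_use_page_opportunistically() is a hypothetical helper, and "buffer" is assumed to be a valid, pinned shared buffer. Callers such as RelationGetBufferForTuple() follow this shape, taking the lock only if it is free and otherwise moving on to another page.

/*
 * Hedged sketch of a typical ConditionalLockBuffer() caller.
 */
#include "postgres.h"
#include "storage/bufmgr.h"

static bool
try_use_page_opportunistically(Buffer buffer)
{
    if (!ConditionalLockBuffer(buffer))
        return false;           /* contended: caller tries another page */

    /* ... work on the page under the exclusive content lock ... */

    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    return true;
}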

◆ ConditionalLockBufferForCleanup()

bool ConditionalLockBufferForCleanup ( Buffer  buffer)

Definition at line 3916 of file bufmgr.c.

References Assert, BUF_STATE_GET_REFCOUNT, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid, ConditionalLockBuffer(), GetBufferDescriptor, GetPrivateRefCount(), LocalRefCount, LockBuffer(), LockBufHdr(), PrivateRefCountEntry::refcount, and UnlockBufHdr.

Referenced by _hash_finish_split(), _hash_getbuf_with_condlock_cleanup(), heap_page_prune_opt(), lazy_scan_heap(), and lazy_vacuum_heap().

3917 {
3918  BufferDesc *bufHdr;
3919  uint32 buf_state,
3920  refcount;
3921 
3922  Assert(BufferIsValid(buffer));
3923 
3924  if (BufferIsLocal(buffer))
3925  {
3926  refcount = LocalRefCount[-buffer - 1];
3927  /* There should be exactly one pin */
3928  Assert(refcount > 0);
3929  if (refcount != 1)
3930  return false;
3931  /* Nobody else to wait for */
3932  return true;
3933  }
3934 
3935  /* There should be exactly one local pin */
3936  refcount = GetPrivateRefCount(buffer);
3937  Assert(refcount);
3938  if (refcount != 1)
3939  return false;
3940 
3941  /* Try to acquire lock */
3942  if (!ConditionalLockBuffer(buffer))
3943  return false;
3944 
3945  bufHdr = GetBufferDescriptor(buffer - 1);
3946  buf_state = LockBufHdr(bufHdr);
3947  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
3948 
3949  Assert(refcount > 0);
3950  if (refcount == 1)
3951  {
3952  /* Successfully acquired exclusive lock with pincount 1 */
3953  UnlockBufHdr(bufHdr, buf_state);
3954  return true;
3955  }
3956 
3957  /* Failed, so release the lock */
3958  UnlockBufHdr(bufHdr, buf_state);
3959  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3960  return false;
3961 }
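A sketch of the vacuum-style call pattern: if the cleanup lock is not available immediately, skip the page rather than stall. maybe_clean_page() is a hypothetical helper; lazy_scan_heap() is the real caller to compare against.

/*
 * Hedged sketch: opportunistic page cleanup that skips contended pages.
 */
#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
maybe_clean_page(Relation rel, BlockNumber blkno, BufferAccessStrategy strategy)
{
    Buffer      buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                         RBM_NORMAL, strategy);

    if (!ConditionalLockBufferForCleanup(buf))
    {
        ReleaseBuffer(buf);     /* pinned elsewhere: try again next round */
        return;
    }

    /* ... prune or defragment the page under the cleanup lock ... */

    UnlockReleaseBuffer(buf);
}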

◆ DropDatabaseBuffers()

void DropDatabaseBuffers ( Oid  dbid)

Definition at line 3119 of file bufmgr.c.

References buftag::blockNum, buf, BufferDescriptorGetBuffer, RelFileNode::dbNode, elog, buftag::forkNum, BufferDesc::freeNext, GetBufferDescriptor, GetPrivateRefCount(), i, InvalidateBuffer(), InvalidBackendId, LockBufHdr(), LOG, NBuffers, relpathbackend, relpathperm, buftag::rnode, BufferDesc::tag, and UnlockBufHdr.

Referenced by dbase_redo(), dropdb(), and movedb().

3120 {
3121  int i;
3122 
3123  /*
3124  * We needn't consider local buffers, since by assumption the target
3125  * database isn't our own.
3126  */
3127 
3128  for (i = 0; i < NBuffers; i++)
3129  {
3130  BufferDesc *bufHdr = GetBufferDescriptor(i);
3131  uint32 buf_state;
3132 
3133  /*
3134  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3135  * and saves some cycles.
3136  */
3137  if (bufHdr->tag.rnode.dbNode != dbid)
3138  continue;
3139 
3140  buf_state = LockBufHdr(bufHdr);
3141  if (bufHdr->tag.rnode.dbNode == dbid)
3142  InvalidateBuffer(bufHdr); /* releases spinlock */
3143  else
3144  UnlockBufHdr(bufHdr, buf_state);
3145  }
3146 }
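The "unlocked precheck" the comment describes is a pattern independent of the buffer manager: read a tag field without the lock to skip obviously-unrelated entries cheaply, then recheck under the lock before acting. A minimal standalone C11 illustration with invented types (an atomic field and a mutex stand in for the tagged buffer header and its spinlock):

/*
 * Standalone sketch of the unlocked-precheck idiom.  A stale read can
 * only cause a spurious match here; the locked recheck filters it out.
 */
#include <pthread.h>
#include <stdatomic.h>

typedef struct
{
    _Atomic int owner_db;       /* which database the slot belongs to */
    pthread_mutex_t lock;
} Slot;

void
invalidate_database_slots(Slot *slots, int nslots, int dbid)
{
    for (int i = 0; i < nslots; i++)
    {
        /* cheap unlocked precheck: skip slots that cannot match */
        if (atomic_load(&slots[i].owner_db) != dbid)
            continue;

        /* recheck under the lock before acting, as the code above does */
        pthread_mutex_lock(&slots[i].lock);
        if (atomic_load(&slots[i].owner_db) == dbid)
            atomic_store(&slots[i].owner_db, -1);   /* invalidate */
        pthread_mutex_unlock(&slots[i].lock);
    }
}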

◆ DropRelFileNodeBuffers()

void DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)

Definition at line 2947 of file bufmgr.c.

References RelFileNodeBackend::backend, buftag::blockNum, DropRelFileNodeLocalBuffers(), buftag::forkNum, GetBufferDescriptor, i, InvalidateBuffer(), LockBufHdr(), MyBackendId, NBuffers, RelFileNodeBackend::node, RelFileNodeBackendIsTemp, RelFileNodeEquals, buftag::rnode, BufferDesc::tag, and UnlockBufHdr.

Referenced by smgrtruncate().

2949 {
2950  int i;
2951  int j;
2952 
2953  /* If it's a local relation, it's localbuf.c's problem. */
2954  if (RelFileNodeBackendIsTemp(rnode))
2955  {
2956  if (rnode.backend == MyBackendId)
2957  {
2958  for (j = 0; j < nforks; j++)
2959  DropRelFileNodeLocalBuffers(rnode.node, forkNum[j],
2960  firstDelBlock[j]);
2961  }
2962  return;
2963  }
2964 
2965  for (i = 0; i < NBuffers; i++)
2966  {
2967  BufferDesc *bufHdr = GetBufferDescriptor(i);
2968  uint32 buf_state;
2969 
2970  /*
2971  * We can make this a tad faster by prechecking the buffer tag before
2972  * we attempt to lock the buffer; this saves a lot of lock
2973  * acquisitions in typical cases. It should be safe because the
2974  * caller must have AccessExclusiveLock on the relation, or some other
2975  * reason to be certain that no one is loading new pages of the rel
2976  * into the buffer pool. (Otherwise we might well miss such pages
2977  * entirely.) Therefore, while the tag might be changing while we
2978  * look at it, it can't be changing *to* a value we care about, only
2979  * *away* from such a value. So false negatives are impossible, and
2980  * false positives are safe because we'll recheck after getting the
2981  * buffer lock.
2982  *
2983  * We could check forkNum and blockNum as well as the rnode, but the
2984  * incremental win from doing so seems small.
2985  */
2986  if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
2987  continue;
2988 
2989  buf_state = LockBufHdr(bufHdr);
2990 
2991  for (j = 0; j < nforks; j++)
2992  {
2993  if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
2994  bufHdr->tag.forkNum == forkNum[j] &&
2995  bufHdr->tag.blockNum >= firstDelBlock[j])
2996  {
2997  InvalidateBuffer(bufHdr); /* releases spinlock */
2998  break;
2999  }
3000  }
3001  if (j >= nforks)
3002  UnlockBufHdr(bufHdr, buf_state);
3003  }
3004 }

◆ DropRelFileNodesAllBuffers()

void DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)

Definition at line 3016 of file bufmgr.c.

References DropRelFileNodeAllLocalBuffers(), GetBufferDescriptor, i, InvalidateBuffer(), LockBufHdr(), MyBackendId, NBuffers, RelFileNodeBackend::node, palloc(), pfree(), pg_qsort(), RelFileNodeBackendIsTemp, RelFileNodeEquals, RELS_BSEARCH_THRESHOLD, buftag::rnode, rnode_comparator(), BufferDesc::tag, and UnlockBufHdr.

Referenced by smgrdounlinkall().

3017 {
3018  int i,
3019  n = 0;
3020  RelFileNode *nodes;
3021  bool use_bsearch;
3022 
3023  if (nnodes == 0)
3024  return;
3025 
3026  nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
3027 
3028  /* If it's a local relation, it's localbuf.c's problem. */
3029  for (i = 0; i < nnodes; i++)
3030  {
3031  if (RelFileNodeBackendIsTemp(rnodes[i]))
3032  {
3033  if (rnodes[i].backend == MyBackendId)
3034  DropRelFileNodeAllLocalBuffers(rnodes[i].node);
3035  }
3036  else
3037  nodes[n++] = rnodes[i].node;
3038  }
3039 
3040  /*
3041  * If there are no non-local relations, then we're done. Release the
3042  * memory and return.
3043  */
3044  if (n == 0)
3045  {
3046  pfree(nodes);
3047  return;
3048  }
3049 
3050  /*
3051  * For low number of relations to drop just use a simple walk through, to
3052  * save the bsearch overhead. The threshold to use is rather a guess than
3053  * an exactly determined value, as it depends on many factors (CPU and RAM
3054  * speeds, amount of shared buffers etc.).
3055  */
3056  use_bsearch = n > RELS_BSEARCH_THRESHOLD;
3057 
3058  /* sort the list of rnodes if necessary */
3059  if (use_bsearch)
3060  pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
3061 
3062  for (i = 0; i < NBuffers; i++)
3063  {
3064  RelFileNode *rnode = NULL;
3065  BufferDesc *bufHdr = GetBufferDescriptor(i);
3066  uint32 buf_state;
3067 
3068  /*
3069  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3070  * and saves some cycles.
3071  */
3072 
3073  if (!use_bsearch)
3074  {
3075  int j;
3076 
3077  for (j = 0; j < n; j++)
3078  {
3079  if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
3080  {
3081  rnode = &nodes[j];
3082  break;
3083  }
3084  }
3085  }
3086  else
3087  {
3088  rnode = bsearch((const void *) &(bufHdr->tag.rnode),
3089  nodes, n, sizeof(RelFileNode),
3090  rnode_comparator);
3091  }
3092 
3093  /* buffer doesn't belong to any of the given relfilenodes; skip it */
3094  if (rnode == NULL)
3095  continue;
3096 
3097  buf_state = LockBufHdr(bufHdr);
3098  if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
3099  InvalidateBuffer(bufHdr); /* releases spinlock */
3100  else
3101  UnlockBufHdr(bufHdr, buf_state);
3102  }
3103 
3104  pfree(nodes);
3105 }
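The linear-walk versus sort+bsearch decision distills into a small standalone sketch. The threshold is a guess in the same spirit as RELS_BSEARCH_THRESHOLD, and all names here are invented:

/*
 * Standalone sketch of the lookup-strategy tradeoff.  The real code
 * sorts once before scanning all buffers; the qsort is inline here
 * only for brevity.
 */
#include <stdbool.h>
#include <stdlib.h>

#define SEARCH_THRESHOLD 20     /* a guess, like RELS_BSEARCH_THRESHOLD */

static int
int_cmp(const void *a, const void *b)
{
    int         ia = *(const int *) a;
    int         ib = *(const int *) b;

    return (ia > ib) - (ia < ib);
}

bool
contains(int *keys, int nkeys, int value)
{
    if (nkeys <= SEARCH_THRESHOLD)
    {
        /* few keys: a plain walk beats the sorting overhead */
        for (int i = 0; i < nkeys; i++)
            if (keys[i] == value)
                return true;
        return false;
    }

    qsort(keys, nkeys, sizeof(int), int_cmp);
    return bsearch(&value, keys, nkeys, sizeof(int), int_cmp) != NULL;
}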

◆ FlushBuffer()

static void FlushBuffer(BufferDesc *buf, SMgrRelation reln)

Definition at line 2693 of file bufmgr.c.

References ErrorContextCallback::arg, BufferUsage::blk_write_time, buftag::blockNum, BM_JUST_DIRTIED, BM_PERMANENT, BufferGetLSN, BufHdrGetBlock, ErrorContextCallback::callback, RelFileNode::dbNode, error_context_stack, buftag::forkNum, INSTR_TIME_ADD, INSTR_TIME_GET_MICROSEC, INSTR_TIME_SET_CURRENT, INSTR_TIME_SUBTRACT, InvalidBackendId, LockBufHdr(), RelFileNodeBackend::node, PageSetChecksumCopy(), pgBufferUsage, pgstat_count_buffer_write_time, ErrorContextCallback::previous, RelFileNode::relNode, buftag::rnode, BufferUsage::shared_blks_written, shared_buffer_write_error_callback(), SMgrRelationData::smgr_rnode, smgropen(), smgrwrite(), RelFileNode::spcNode, StartBufferIO(), BufferDesc::tag, TerminateBufferIO(), track_io_timing, UnlockBufHdr, and XLogFlush().

Referenced by BufferAlloc(), FlushDatabaseBuffers(), FlushOneBuffer(), FlushRelationBuffers(), FlushRelationsAllBuffers(), and SyncOneBuffer().

2694 {
2695  XLogRecPtr recptr;
2696  ErrorContextCallback errcallback;
2697  instr_time io_start,
2698  io_time;
2699  Block bufBlock;
2700  char *bufToWrite;
2701  uint32 buf_state;
2702 
2703  /*
2704  * Acquire the buffer's io_in_progress lock. If StartBufferIO returns
2705  * false, then someone else flushed the buffer before we could, so we need
2706  * not do anything.
2707  */
2708  if (!StartBufferIO(buf, false))
2709  return;
2710 
2711  /* Setup error traceback support for ereport() */
2712  errcallback.callback = shared_buffer_write_error_callback;
2713  errcallback.arg = (void *) buf;
2714  errcallback.previous = error_context_stack;
2715  error_context_stack = &errcallback;
2716 
2717  /* Find smgr relation for buffer */
2718  if (reln == NULL)
2719  reln = smgropen(buf->tag.rnode, InvalidBackendId);
2720 
2721  TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
2722  buf->tag.blockNum,
2723  reln->smgr_rnode.node.spcNode,
2724  reln->smgr_rnode.node.dbNode,
2725  reln->smgr_rnode.node.relNode);
2726 
2727  buf_state = LockBufHdr(buf);
2728 
2729  /*
2730  * Run PageGetLSN while holding header lock, since we don't have the
2731  * buffer locked exclusively in all cases.
2732  */
2733  recptr = BufferGetLSN(buf);
2734 
2735  /* To check if block content changes while flushing. - vadim 01/17/97 */
2736  buf_state &= ~BM_JUST_DIRTIED;
2737  UnlockBufHdr(buf, buf_state);
2738 
2739  /*
2740  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
2741  * rule that log updates must hit disk before any of the data-file changes
2742  * they describe do.
2743  *
2744  * However, this rule does not apply to unlogged relations, which will be
2745  * lost after a crash anyway. Most unlogged relation pages do not bear
2746  * LSNs since we never emit WAL records for them, and therefore flushing
2747  * up through the buffer LSN would be useless, but harmless. However,
2748  * GiST indexes use LSNs internally to track page-splits, and therefore
2749  * unlogged GiST pages bear "fake" LSNs generated by
2750  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
2751  * LSN counter could advance past the WAL insertion point; and if it did
2752  * happen, attempting to flush WAL through that location would fail, with
2753  * disastrous system-wide consequences. To make sure that can't happen,
2754  * skip the flush if the buffer isn't permanent.
2755  */
2756  if (buf_state & BM_PERMANENT)
2757  XLogFlush(recptr);
2758 
2759  /*
2760  * Now it's safe to write buffer to disk. Note that no one else should
2761  * have been able to write it while we were busy with log flushing because
2762  * we have the io_in_progress lock.
2763  */
2764  bufBlock = BufHdrGetBlock(buf);
2765 
2766  /*
2767  * Update page checksum if desired. Since we have only shared lock on the
2768  * buffer, other processes might be updating hint bits in it, so we must
2769  * copy the page to private storage if we do checksumming.
2770  */
2771  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
2772 
2773  if (track_io_timing)
2774  INSTR_TIME_SET_CURRENT(io_start);
2775 
2776  /*
2777  * bufToWrite is either the shared buffer or a copy, as appropriate.
2778  */
2779  smgrwrite(reln,
2780  buf->tag.forkNum,
2781  buf->tag.blockNum,
2782  bufToWrite,
2783  false);
2784 
2785  if (track_io_timing)
2786  {
2787  INSTR_TIME_SET_CURRENT(io_time);
2788  INSTR_TIME_SUBTRACT(io_time, io_start);
2789  pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
2790  INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
2791  }
2792 
2793  pgBufferUsage.shared_blks_written++;
2794 
2795  /*
2796  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
2797  * end the io_in_progress state.
2798  */
2799  TerminateBufferIO(buf, true, 0);
2800 
2801  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
2802  buf->tag.blockNum,
2803  reln->smgr_rnode.node.spcNode,
2804  reln->smgr_rnode.node.dbNode,
2805  reln->smgr_rnode.node.relNode);
2806 
2807  /* Pop the error context stack */
2808  error_context_stack = errcallback.previous;
2809 }
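The track_io_timing instrumentation follows a reusable shape with the instr_time macros. A hedged sketch (do_write() is a hypothetical stand-in for the smgrwrite() call being timed):

/*
 * Hedged sketch of the conditional I/O-timing pattern.
 */
#include "postgres.h"
#include "portability/instr_time.h"

extern void do_write(void);     /* hypothetical I/O call */

static void
timed_write(bool timing_enabled)
{
    instr_time  io_start,
                io_time;

    if (timing_enabled)
        INSTR_TIME_SET_CURRENT(io_start);

    do_write();

    if (timing_enabled)
    {
        INSTR_TIME_SET_CURRENT(io_time);
        INSTR_TIME_SUBTRACT(io_time, io_start);     /* io_time -= io_start */
        elog(DEBUG1, "write took %ld us",
             (long) INSTR_TIME_GET_MICROSEC(io_time));
    }
}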

◆ FlushDatabaseBuffers()

void FlushDatabaseBuffers ( Oid  dbid)

Definition at line 3420 of file bufmgr.c.

References BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock, CurrentResourceOwner, RelFileNode::dbNode, FlushBuffer(), GetBufferDescriptor, i, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), buftag::rnode, BufferDesc::tag, UnlockBufHdr, and UnpinBuffer().

Referenced by dbase_redo().

3421 {
3422  int i;
3423  BufferDesc *bufHdr;
3424 
3425  /* Make sure we can handle the pin inside the loop */
3426  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3427 
3428  for (i = 0; i < NBuffers; i++)
3429  {
3430  uint32 buf_state;
3431 
3432  bufHdr = GetBufferDescriptor(i);
3433 
3434  /*
3435  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3436  * and saves some cycles.
3437  */
3438  if (bufHdr->tag.rnode.dbNode != dbid)
3439  continue;
3440 
3441  ReservePrivateRefCountEntry();
3442 
3443  buf_state = LockBufHdr(bufHdr);
3444  if (bufHdr->tag.rnode.dbNode == dbid &&
3445  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3446  {
3447  PinBuffer_Locked(bufHdr);
3448  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3449  FlushBuffer(bufHdr, NULL);
3450  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3451  UnpinBuffer(bufHdr, true);
3452  }
3453  else
3454  UnlockBufHdr(bufHdr, buf_state);
3455  }
3456 }

◆ FlushOneBuffer()

void FlushOneBuffer ( Buffer  buffer)

Definition at line 3463 of file bufmgr.c.

References Assert, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsPinned, FlushBuffer(), GetBufferDescriptor, and LWLockHeldByMe().

Referenced by hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), and XLogReadBufferForRedoExtended().

3464 {
3465  BufferDesc *bufHdr;
3466 
3467  /* currently not needed, but no fundamental reason not to support */
3468  Assert(!BufferIsLocal(buffer));
3469 
3470  Assert(BufferIsPinned(buffer));
3471 
3472  bufHdr = GetBufferDescriptor(buffer - 1);
3473 
3475 
3475  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
3476  FlushBuffer(bufHdr, NULL);
3477 }

◆ FlushRelationBuffers()

void FlushRelationBuffers ( Relation  rel)

Definition at line 3224 of file bufmgr.c.

References ErrorContextCallback::arg, buftag::blockNum, BM_DIRTY, BM_JUST_DIRTIED, BM_VALID, BufferDescriptorGetContentLock, ErrorContextCallback::callback, CurrentResourceOwner, error_context_stack, FlushBuffer(), buftag::forkNum, GetBufferDescriptor, GetLocalBufferDescriptor, i, local_buffer_write_error_callback(), LocalBufHdrGetBlock, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, NLocBuffer, PageSetChecksumInplace(), pg_atomic_read_u32(), pg_atomic_unlocked_write_u32(), PinBuffer_Locked(), ErrorContextCallback::previous, RelationData::rd_node, RelationData::rd_smgr, RelationOpenSmgr, RelationUsesLocalBuffers, RelFileNodeEquals, ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), buftag::rnode, smgrwrite(), BufferDesc::state, BufferDesc::tag, UnlockBufHdr, and UnpinBuffer().

Referenced by heapam_relation_copy_data(), and index_copy_data().

3225 {
3226  int i;
3227  BufferDesc *bufHdr;
3228 
3229  /* Open rel at the smgr level if not already done */
3230  RelationOpenSmgr(rel);
3231 
3232  if (RelationUsesLocalBuffers(rel))
3233  {
3234  for (i = 0; i < NLocBuffer; i++)
3235  {
3236  uint32 buf_state;
3237 
3238  bufHdr = GetLocalBufferDescriptor(i);
3239  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3240  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
3241  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3242  {
3243  ErrorContextCallback errcallback;
3244  Page localpage;
3245 
3246  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
3247 
3248  /* Setup error traceback support for ereport() */
3249  errcallback.callback = local_buffer_write_error_callback;
3250  errcallback.arg = (void *) bufHdr;
3251  errcallback.previous = error_context_stack;
3252  error_context_stack = &errcallback;
3253 
3254  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
3255 
3256  smgrwrite(rel->rd_smgr,
3257  bufHdr->tag.forkNum,
3258  bufHdr->tag.blockNum,
3259  localpage,
3260  false);
3261 
3262  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
3263  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
3264 
3265  /* Pop the error context stack */
3266  error_context_stack = errcallback.previous;
3267  }
3268  }
3269 
3270  return;
3271  }
3272 
3273  /* Make sure we can handle the pin inside the loop */
3274  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3275 
3276  for (i = 0; i < NBuffers; i++)
3277  {
3278  uint32 buf_state;
3279 
3280  bufHdr = GetBufferDescriptor(i);
3281 
3282  /*
3283  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3284  * and saves some cycles.
3285  */
3286  if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
3287  continue;
3288 
3289  ReservePrivateRefCountEntry();
3290 
3291  buf_state = LockBufHdr(bufHdr);
3292  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3293  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3294  {
3295  PinBuffer_Locked(bufHdr);
3296  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3297  FlushBuffer(bufHdr, rel->rd_smgr);
3298  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3299  UnpinBuffer(bufHdr, true);
3300  }
3301  else
3302  UnlockBufHdr(bufHdr, buf_state);
3303  }
3304 }

◆ FlushRelationsAllBuffers()

void FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)

Definition at line 3316 of file bufmgr.c.

References Assert, BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock, CurrentResourceOwner, FlushBuffer(), GetBufferDescriptor, i, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, RelFileNodeBackend::node, palloc(), pfree(), pg_qsort(), PinBuffer_Locked(), RelFileNodeBackendIsTemp, RelFileNodeEquals, RELS_BSEARCH_THRESHOLD, ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), buftag::rnode, SMgrSortArray::rnode, rnode_comparator(), SMgrRelationData::smgr_rnode, SMgrSortArray::srel, BufferDesc::tag, UnlockBufHdr, and UnpinBuffer().

Referenced by smgrdosyncall().

3317 {
3318  int i;
3319  SMgrSortArray *srels;
3320  bool use_bsearch;
3321 
3322  if (nrels == 0)
3323  return;
3324 
3325  /* fill-in array for qsort */
3326  srels = palloc(sizeof(SMgrSortArray) * nrels);
3327 
3328  for (i = 0; i < nrels; i++)
3329  {
3330  Assert(!RelFileNodeBackendIsTemp(smgrs[i]->smgr_rnode));
3331 
3332  srels[i].rnode = smgrs[i]->smgr_rnode.node;
3333  srels[i].srel = smgrs[i];
3334  }
3335 
3336  /*
3337  * Save the bsearch overhead for low number of relations to sync. See
3338  * DropRelFileNodesAllBuffers for details.
3339  */
3340  use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
3341 
3342  /* sort the list of SMgrRelations if necessary */
3343  if (use_bsearch)
3344  pg_qsort(srels, nrels, sizeof(SMgrSortArray), rnode_comparator);
3345 
3346  /* Make sure we can handle the pin inside the loop */
3347  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3348 
3349  for (i = 0; i < NBuffers; i++)
3350  {
3351  SMgrSortArray *srelent = NULL;
3352  BufferDesc *bufHdr = GetBufferDescriptor(i);
3353  uint32 buf_state;
3354 
3355  /*
3356  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3357  * and saves some cycles.
3358  */
3359 
3360  if (!use_bsearch)
3361  {
3362  int j;
3363 
3364  for (j = 0; j < nrels; j++)
3365  {
3366  if (RelFileNodeEquals(bufHdr->tag.rnode, srels[j].rnode))
3367  {
3368  srelent = &srels[j];
3369  break;
3370  }
3371  }
3372 
3373  }
3374  else
3375  {
3376  srelent = bsearch((const void *) &(bufHdr->tag.rnode),
3377  srels, nrels, sizeof(SMgrSortArray),
3378  rnode_comparator);
3379  }
3380 
3381  /* buffer doesn't belong to any of the given relfilenodes; skip it */
3382  if (srelent == NULL)
3383  continue;
3384 
3385  ReservePrivateRefCountEntry();
3386 
3387  buf_state = LockBufHdr(bufHdr);
3388  if (RelFileNodeEquals(bufHdr->tag.rnode, srelent->rnode) &&
3389  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3390  {
3391  PinBuffer_Locked(bufHdr);
3392  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3393  FlushBuffer(bufHdr, srelent->srel);
3394  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3395  UnpinBuffer(bufHdr, true);
3396  }
3397  else
3398  UnlockBufHdr(bufHdr, buf_state);
3399  }
3400 
3401  pfree(srels);
3402 }

◆ ForgetPrivateRefCountEntry()

static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)

Definition at line 401 of file bufmgr.c.

References Assert, PrivateRefCountEntry::buffer, HASH_REMOVE, hash_search(), InvalidBuffer, PrivateRefCountArray, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, and REFCOUNT_ARRAY_ENTRIES.

Referenced by UnpinBuffer().

402 {
403  Assert(ref->refcount == 0);
404 
405  if (ref >= &PrivateRefCountArray[0] &&
406  ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
407  {
408  ref->buffer = InvalidBuffer;
409 
410  /*
411  * Mark the just used entry as reserved - in many scenarios that
412  * allows us to avoid ever having to search the array/hash for free
413  * entries.
414  */
415  ReservedRefCountEntry = ref;
416  }
417  else
418  {
419  bool found;
420  Buffer buffer = ref->buffer;
421 
422  hash_search(PrivateRefCountHash,
423  (void *) &buffer,
424  HASH_REMOVE,
425  &found);
426  Assert(found);
427  Assert(PrivateRefCountOverflowed > 0);
428  PrivateRefCountOverflowed--;
429  }
430 }

◆ GetPrivateRefCount()

static inline int32 GetPrivateRefCount(Buffer buffer)

Definition at line 378 of file bufmgr.c.

References Assert, BufferIsLocal, BufferIsValid, GetPrivateRefCountEntry(), and PrivateRefCountEntry::refcount.

Referenced by ConditionalLockBufferForCleanup(), DropDatabaseBuffers(), HoldingBufferPinThatDelaysRecovery(), InvalidateBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkBufferDirtyHint(), and PrintBufferLeakWarning().

379 {
380  PrivateRefCountEntry *ref;
381 
382  Assert(BufferIsValid(buffer));
383  Assert(!BufferIsLocal(buffer));
384 
385  /*
386  * Not moving the entry - that's ok for the current users, but we might
387  * want to change this one day.
388  */
389  ref = GetPrivateRefCountEntry(buffer, false);
390 
391  if (ref == NULL)
392  return 0;
393  return ref->refcount;
394 }

◆ GetPrivateRefCountEntry()

static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move)

Definition at line 298 of file bufmgr.c.

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid, free, HASH_FIND, HASH_REMOVE, hash_search(), i, InvalidBuffer, PrivateRefCountArray, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, REFCOUNT_ARRAY_ENTRIES, ReservedRefCountEntry, and ReservePrivateRefCountEntry().

Referenced by GetPrivateRefCount(), IncrBufferRefCount(), PinBuffer(), PinBuffer_Locked(), and UnpinBuffer().

299 {
300  PrivateRefCountEntry *res;
301  int i;
302 
303  Assert(BufferIsValid(buffer));
304  Assert(!BufferIsLocal(buffer));
305 
306  /*
307  * First search for references in the array, that'll be sufficient in the
308  * majority of cases.
309  */
310  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
311  {
312  res = &PrivateRefCountArray[i];
313 
314  if (res->buffer == buffer)
315  return res;
316  }
317 
318  /*
319  * By here we know that the buffer, if already pinned, isn't residing in
320  * the array.
321  *
322  * Only look up the buffer in the hashtable if we've previously overflowed
323  * into it.
324  */
325  if (PrivateRefCountOverflowed == 0)
326  return NULL;
327 
328  res = hash_search(PrivateRefCountHash,
329  (void *) &buffer,
330  HASH_FIND,
331  NULL);
332 
333  if (res == NULL)
334  return NULL;
335  else if (!do_move)
336  {
337  /* caller doesn't want us to move the hash entry into the array */
338  return res;
339  }
340  else
341  {
342  /* move buffer from hashtable into the free array slot */
343  bool found;
344  PrivateRefCountEntry *free;
345 
346  /* Ensure there's a free array slot */
347  ReservePrivateRefCountEntry();
348 
349  /* Use up the reserved slot */
350  Assert(ReservedRefCountEntry != NULL);
351  free = ReservedRefCountEntry;
352  ReservedRefCountEntry = NULL;
353  Assert(free->buffer == InvalidBuffer);
354 
355  /* and fill it */
356  free->buffer = buffer;
357  free->refcount = res->refcount;
358 
359  /* delete from hashtable */
360  hash_search(PrivateRefCountHash,
361  (void *) &buffer,
362  HASH_REMOVE,
363  &found);
364  Assert(found);
365  Assert(PrivateRefCountOverflowed > 0);
366  PrivateRefCountOverflowed--;
367 
368  return free;
369  }
370 }
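A standalone sketch of the same two-tier lookup: a small fixed array serves the common case, and everything else spills into an overflow store. All names are invented; PostgreSQL's overflow store is a dynahash table, emulated here by a growable array with error handling omitted.

/*
 * Standalone sketch of "small array first, overflow store second".
 */
#include <stdlib.h>

#define ARRAY_ENTRIES 8

typedef struct
{
    int         buffer;         /* 0 means the entry is unused */
    int         refcount;
} RefEntry;

static RefEntry fast_tier[ARRAY_ENTRIES];
static RefEntry *overflow;
static int  overflow_used;
static int  overflow_cap;

static RefEntry *
lookup_entry(int buffer)
{
    /* fast path: almost all lookups end here */
    for (int i = 0; i < ARRAY_ENTRIES; i++)
        if (fast_tier[i].buffer == buffer)
            return &fast_tier[i];

    /* slow path: consulted only once the array has overflowed */
    for (int i = 0; i < overflow_used; i++)
        if (overflow[i].buffer == buffer)
            return &overflow[i];
    return NULL;
}

static RefEntry *
make_entry(int buffer)
{
    for (int i = 0; i < ARRAY_ENTRIES; i++)
    {
        if (fast_tier[i].buffer == 0)
        {
            fast_tier[i].buffer = buffer;
            fast_tier[i].refcount = 0;
            return &fast_tier[i];
        }
    }

    if (overflow_used == overflow_cap)
    {
        overflow_cap = overflow_cap ? overflow_cap * 2 : ARRAY_ENTRIES;
        overflow = realloc(overflow, overflow_cap * sizeof(RefEntry));
    }
    overflow[overflow_used].buffer = buffer;
    overflow[overflow_used].refcount = 0;
    return &overflow[overflow_used++];
}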

◆ HoldingBufferPinThatDelaysRecovery()

bool HoldingBufferPinThatDelaysRecovery ( void  )

Definition at line 3890 of file bufmgr.c.

References GetPrivateRefCount(), and GetStartupBufferPinWaitBufId().

Referenced by CheckRecoveryConflictDeadlock(), and RecoveryConflictInterrupt().

3891 {
3892  int bufid = GetStartupBufferPinWaitBufId();
3893 
3894  /*
3895  * If we get woken slowly then it's possible that the Startup process was
3896  * already woken by other backends before we got here. Also possible that
3897  * we get here by multiple interrupts or interrupts at inappropriate
3898  * times, so make sure we do nothing if the bufid is not set.
3899  */
3900  if (bufid < 0)
3901  return false;
3902 
3903  if (GetPrivateRefCount(bufid + 1) > 0)
3904  return true;
3905 
3906  return false;
3907 }

◆ IncrBufferRefCount()

void IncrBufferRefCount ( Buffer  buffer)

Definition at line 3521 of file bufmgr.c.

References Assert, BufferIsLocal, BufferIsPinned, CurrentResourceOwner, GetPrivateRefCountEntry(), LocalRefCount, PrivateRefCountEntry::refcount, ResourceOwnerEnlargeBuffers(), and ResourceOwnerRememberBuffer().

Referenced by _bt_steppage(), btrestrpos(), entryLoadMoreItems(), ReadBufferBI(), scanPostingTree(), startScanEntry(), and tts_buffer_heap_store_tuple().

3522 {
3523  Assert(BufferIsPinned(buffer));
3524  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3525  if (BufferIsLocal(buffer))
3526  LocalRefCount[-buffer - 1]++;
3527  else
3528  {
3529  PrivateRefCountEntry *ref;
3530 
3531  ref = GetPrivateRefCountEntry(buffer, true);
3532  Assert(ref != NULL);
3533  ref->refcount++;
3534  }
3535  ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
3536 }
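A hedged usage sketch: when a second data structure needs to hold its own reference to an already-pinned page (as tts_buffer_heap_store_tuple() does), it takes an extra pin that must later be released independently. keep_buffer_reference() is a hypothetical helper.

/*
 * Hedged sketch: each pin taken this way needs its own ReleaseBuffer().
 */
#include "postgres.h"
#include "storage/bufmgr.h"

static Buffer
keep_buffer_reference(Buffer buffer)
{
    IncrBufferRefCount(buffer); /* second pin, on top of the caller's */
    return buffer;              /* drop it later with ReleaseBuffer() */
}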

◆ InitBufferPoolAccess()

void InitBufferPoolAccess ( void  )

Definition at line 2465 of file bufmgr.c.

References HASHCTL::entrysize, HASH_BLOBS, hash_create(), HASH_ELEM, HASHCTL::keysize, MemSet, and PrivateRefCountArray.

Referenced by BaseInit().

2466 {
2467  HASHCTL hash_ctl;
2468 
2469  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
2470 
2471  MemSet(&hash_ctl, 0, sizeof(hash_ctl));
2472  hash_ctl.keysize = sizeof(int32);
2473  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
2474 
2475  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
2476  HASH_ELEM | HASH_BLOBS);
2477 }
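The same dynahash recipe works for any backend-local table: zero the HASHCTL, set keysize and entrysize, pass HASH_ELEM | HASH_BLOBS for binary keys, then find-or-create entries with hash_search(). A hedged sketch; MyEntry and the helper names are hypothetical.

/*
 * Hedged sketch of a backend-local dynahash table.
 */
#include "postgres.h"
#include "utils/hsearch.h"

typedef struct
{
    int32       key;            /* must match keysize below */
    int32       count;
} MyEntry;

static HTAB *my_hash;

static void
my_hash_init(void)
{
    HASHCTL     hash_ctl;

    MemSet(&hash_ctl, 0, sizeof(hash_ctl));
    hash_ctl.keysize = sizeof(int32);
    hash_ctl.entrysize = sizeof(MyEntry);

    my_hash = hash_create("MyHash", 100, &hash_ctl,
                          HASH_ELEM | HASH_BLOBS);
}

static void
my_hash_bump(int32 key)
{
    bool        found;
    MyEntry    *entry;

    entry = (MyEntry *) hash_search(my_hash, &key, HASH_ENTER, &found);
    if (!found)
        entry->count = 0;       /* HASH_ENTER created it; initialize */
    entry->count++;
}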

◆ InitBufferPoolBackend()

void InitBufferPoolBackend ( void  )

Definition at line 2489 of file bufmgr.c.

References AtProcExit_Buffers(), and on_shmem_exit().

Referenced by AuxiliaryProcessMain(), and InitPostgres().

2490 {
2491  on_shmem_exit(AtProcExit_Buffers, 0);
2492 }

◆ InvalidateBuffer()

static void InvalidateBuffer(BufferDesc *buf)

Definition at line 1370 of file bufmgr.c.

References Assert, BM_LOCKED, BM_TAG_VALID, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer, BUFFERTAGS_EQUAL, BufMappingPartitionLock, BufTableDelete(), BufTableHashCode(), CLEAR_BUFFERTAG, elog, ERROR, GetPrivateRefCount(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u32(), BufferDesc::state, StrategyFreeBuffer(), BufferDesc::tag, UnlockBufHdr, and WaitIO().

Referenced by DropDatabaseBuffers(), DropRelFileNodeBuffers(), and DropRelFileNodesAllBuffers().

1371 {
1372  BufferTag oldTag;
1373  uint32 oldHash; /* hash value for oldTag */
1374  LWLock *oldPartitionLock; /* buffer partition lock for it */
1375  uint32 oldFlags;
1376  uint32 buf_state;
1377 
1378  /* Save the original buffer tag before dropping the spinlock */
1379  oldTag = buf->tag;
1380 
1381  buf_state = pg_atomic_read_u32(&buf->state);
1382  Assert(buf_state & BM_LOCKED);
1383  UnlockBufHdr(buf, buf_state);
1384 
1385  /*
1386  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1387  * worth storing the hashcode in BufferDesc so we need not recompute it
1388  * here? Probably not.
1389  */
1390  oldHash = BufTableHashCode(&oldTag);
1391  oldPartitionLock = BufMappingPartitionLock(oldHash);
1392 
1393 retry:
1394 
1395  /*
1396  * Acquire exclusive mapping lock in preparation for changing the buffer's
1397  * association.
1398  */
1399  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1400 
1401  /* Re-lock the buffer header */
1402  buf_state = LockBufHdr(buf);
1403 
1404  /* If it's changed while we were waiting for lock, do nothing */
1405  if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
1406  {
1407  UnlockBufHdr(buf, buf_state);
1408  LWLockRelease(oldPartitionLock);
1409  return;
1410  }
1411 
1412  /*
1413  * We assume the only reason for it to be pinned is that someone else is
1414  * flushing the page out. Wait for them to finish. (This could be an
1415  * infinite loop if the refcount is messed up... it would be nice to time
1416  * out after awhile, but there seems no way to be sure how many loops may
1417  * be needed. Note that if the other guy has pinned the buffer but not
1418  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1419  * be busy-looping here.)
1420  */
1421  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1422  {
1423  UnlockBufHdr(buf, buf_state);
1424  LWLockRelease(oldPartitionLock);
1425  /* safety check: should definitely not be our *own* pin */
1426  if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
1427  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1428  WaitIO(buf);
1429  goto retry;
1430  }
1431 
1432  /*
1433  * Clear out the buffer's tag and flags. We must do this to ensure that
1434  * linear scans of the buffer array don't think the buffer is valid.
1435  */
1436  oldFlags = buf_state & BUF_FLAG_MASK;
1437  CLEAR_BUFFERTAG(buf->tag);
1438  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1439  UnlockBufHdr(buf, buf_state);
1440 
1441  /*
1442  * Remove the buffer from the lookup hashtable, if it was in there.
1443  */
1444  if (oldFlags & BM_TAG_VALID)
1445  BufTableDelete(&oldTag, oldHash);
1446 
1447  /*
1448  * Done with mapping lock.
1449  */
1450  LWLockRelease(oldPartitionLock);
1451 
1452  /*
1453  * Insert the buffer at the head of the list of free buffers.
1454  */
1455  StrategyFreeBuffer(buf);
1456 }

◆ IsBufferCleanupOK()

bool IsBufferCleanupOK ( Buffer  buffer)

Definition at line 3972 of file bufmgr.c.

References Assert, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsValid, GetBufferDescriptor, GetPrivateRefCount(), LocalRefCount, LockBufHdr(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), and UnlockBufHdr.

Referenced by _hash_doinsert(), _hash_expandtable(), _hash_splitbucket(), hash_xlog_split_allocate_page(), and hashbucketcleanup().

3973 {
3974  BufferDesc *bufHdr;
3975  uint32 buf_state;
3976 
3977  Assert(BufferIsValid(buffer));
3978 
3979  if (BufferIsLocal(buffer))
3980  {
3981  /* There should be exactly one pin */
3982  if (LocalRefCount[-buffer - 1] != 1)
3983  return false;
3984  /* Nobody else to wait for */
3985  return true;
3986  }
3987 
3988  /* There should be exactly one local pin */
3989  if (GetPrivateRefCount(buffer) != 1)
3990  return false;
3991 
3992  bufHdr = GetBufferDescriptor(buffer - 1);
3993 
3994  /* caller must hold exclusive lock on buffer */
3995  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
3996  LW_EXCLUSIVE));
3997 
3998  buf_state = LockBufHdr(bufHdr);
3999 
4000  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4001  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
4002  {
4003  /* pincount is OK. */
4004  UnlockBufHdr(bufHdr, buf_state);
4005  return true;
4006  }
4007 
4008  UnlockBufHdr(bufHdr, buf_state);
4009  return false;
4010 }

◆ IssuePendingWritebacks()

void IssuePendingWritebacks ( WritebackContext context)

Definition at line 4476 of file bufmgr.c.

References buftag::blockNum, buffertag_comparator(), cur, buftag::forkNum, i, InvalidBackendId, next, WritebackContext::nr_pending, WritebackContext::pending_writebacks, qsort, RelFileNodeEquals, buftag::rnode, smgropen(), smgrwriteback(), and PendingWriteback::tag.

Referenced by BufferSync(), and ScheduleBufferTagForWriteback().

4477 {
4478  int i;
4479 
4480  if (context->nr_pending == 0)
4481  return;
4482 
4483  /*
4484  * Executing the writes in-order can make them a lot faster, and allows to
4485  * merge writeback requests to consecutive blocks into larger writebacks.
4486  */
4487  qsort(&context->pending_writebacks, context->nr_pending,
 4488  sizeof(PendingWriteback), buffertag_comparator);
 4489 
4490  /*
4491  * Coalesce neighbouring writes, but nothing else. For that we iterate
4492  * through the, now sorted, array of pending flushes, and look forward to
4493  * find all neighbouring (or identical) writes.
4494  */
4495  for (i = 0; i < context->nr_pending; i++)
4496  {
 4497  PendingWriteback *cur;
 4498  PendingWriteback *next;
 4499  SMgrRelation reln;
4500  int ahead;
4501  BufferTag tag;
4502  Size nblocks = 1;
4503 
4504  cur = &context->pending_writebacks[i];
4505  tag = cur->tag;
4506 
4507  /*
4508  * Peek ahead, into following writeback requests, to see if they can
4509  * be combined with the current one.
4510  */
4511  for (ahead = 0; i + ahead + 1 < context->nr_pending; ahead++)
4512  {
4513  next = &context->pending_writebacks[i + ahead + 1];
4514 
4515  /* different file, stop */
4516  if (!RelFileNodeEquals(cur->tag.rnode, next->tag.rnode) ||
4517  cur->tag.forkNum != next->tag.forkNum)
4518  break;
4519 
4520  /* ok, block queued twice, skip */
4521  if (cur->tag.blockNum == next->tag.blockNum)
4522  continue;
4523 
4524  /* only merge consecutive writes */
4525  if (cur->tag.blockNum + 1 != next->tag.blockNum)
4526  break;
4527 
4528  nblocks++;
4529  cur = next;
4530  }
4531 
4532  i += ahead;
4533 
4534  /* and finally tell the kernel to write the data to storage */
4535  reln = smgropen(tag.rnode, InvalidBackendId);
4536  smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
4537  }
4538 
4539  context->nr_pending = 0;
4540 }
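The merging loop above only combines requests that belong to the same relation fork and are strictly consecutive. A stand-alone illustration of the same run-merging logic over a sorted array of block numbers (illustrative only, not part of bufmgr.c):

    #include <stdio.h>

    /* Emit (start, nblocks) runs from a sorted block-number array,
     * skipping duplicates and stopping at gaps, as the loop above
     * does within one relation fork. */
    static void
    emit_runs(const unsigned *blocks, int n)
    {
        for (int i = 0; i < n; i++)
        {
            unsigned start = blocks[i];
            unsigned nblocks = 1;
            int      ahead;

            for (ahead = 0; i + ahead + 1 < n; ahead++)
            {
                unsigned next = blocks[i + ahead + 1];

                if (next == start + nblocks - 1)
                    continue;       /* same block queued twice: skip it */
                if (next != start + nblocks)
                    break;          /* gap: this run is complete */
                nblocks++;
            }
            i += ahead;
            printf("writeback start=%u nblocks=%u\n", start, nblocks);
        }
    }

For the input {3, 4, 4, 5, 9} this issues two requests: start=3 nblocks=3, then start=9 nblocks=1.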

◆ local_buffer_write_error_callback()

static void local_buffer_write_error_callback ( void *  arg) [static]

Definition at line 4244 of file bufmgr.c.

References buftag::blockNum, errcontext, buftag::forkNum, MyBackendId, pfree(), relpathbackend, buftag::rnode, and BufferDesc::tag.

Referenced by FlushRelationBuffers().

4245 {
4246  BufferDesc *bufHdr = (BufferDesc *) arg;
4247 
4248  if (bufHdr != NULL)
4249  {
4250  char *path = relpathbackend(bufHdr->tag.rnode, MyBackendId,
4251  bufHdr->tag.forkNum);
4252 
4253  errcontext("writing block %u of relation %s",
4254  bufHdr->tag.blockNum, path);
4255  pfree(path);
4256  }
4257 }
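The callback only fires if a flush routine has pushed it onto error_context_stack around the write. A sketch of that registration pattern, as used by the callers of these error-context callbacks:

    ErrorContextCallback errcallback;

    /* push: any error raised below gets the block/relation context appended */
    errcallback.callback = local_buffer_write_error_callback;
    errcallback.arg = (void *) bufHdr;
    errcallback.previous = error_context_stack;
    error_context_stack = &errcallback;

    /* ... perform the write that might raise an error ... */

    /* pop the callback again */
    error_context_stack = errcallback.previous;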

◆ LockBuffer()

void LockBuffer ( Buffer  buffer,
int  mode 
)

Definition at line 3722 of file bufmgr.c.

References Assert, buf, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsValid, elog, ERROR, GetBufferDescriptor, LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), and LWLockRelease().

Referenced by _bt_drop_lock_and_maybe_pin(), _bt_endpoint(), _bt_first(), _bt_getbuf(), _bt_getroot(), _bt_killitems(), _bt_moveright(), _bt_pagedel(), _bt_readnextpage(), _bt_relandgetbuf(), _bt_search(), _bt_unlink_halfdead_page(), _bt_update_meta_cleanup_info(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_finish_split(), _hash_first(), _hash_freeovflpage(), _hash_getbuf(), _hash_getbuf_with_strategy(), _hash_getcachedmetap(), _hash_getnewbuf(), _hash_init(), _hash_kill_items(), _hash_readnext(), _hash_readpage(), _hash_readprev(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), blbulkdelete(), blgetbitmap(), blinsert(), BloomNewBuffer(), blvacuumcleanup(), brin_doinsert(), brin_doupdate(), brin_evacuate_page(), brin_getinsertbuffer(), brin_page_cleanup(), brinbuild(), brinbuildempty(), bringetbitmap(), brinGetStats(), brinGetTupleForHeapBlock(), brininsert(), brinLockRevmapPageForUpdate(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), brinsummarize(), bt_metap(), bt_page_items(), bt_page_stats(), btvacuumpage(), checkXLogConsistency(), collect_corrupt_items(), collect_visibility_data(), collectMatchBitmap(), ConditionalLockBufferForCleanup(), count_nondeletable_pages(), entryLoadMoreItems(), fill_seq_with_data(), FreeSpaceMapPrepareTruncateRel(), fsm_readbuf(), fsm_search(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), get_raw_page_internal(), GetVisibilityMapPins(), ginbuildempty(), ginbulkdelete(), ginEntryInsert(), ginFindLeafPage(), ginFindParents(), ginFinishSplit(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), ginInsertValue(), GinNewBuffer(), ginScanToDelete(), ginStepRight(), ginTraverseLock(), ginUpdateStats(), ginvacuumcleanup(), ginVacuumPostingTreeLeaves(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistbuildempty(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfinishsplit(), gistfixsplit(), gistformdownlink(), gistGetMaxLevel(), gistinserttuples(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_compute_xid_horizon_for_tuples(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_get_latest_tid(), heap_inplace_update(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_page_prune_opt(), heap_update(), heap_xlog_visible(), heapam_index_build_range_scan(), heapam_index_fetch_tuple(), heapam_index_validate_scan(), heapam_relation_copy_for_cluster(), heapam_scan_analyze_next_block(), heapam_scan_bitmap_next_block(), heapam_scan_sample_next_tuple(), heapam_tuple_satisfies_snapshot(), heapgetpage(), heapgettup(), initBloomState(), lazy_scan_heap(), LockBufferForCleanup(), log_newpage_range(), palloc_btree_page(), pg_visibility(), pgrowlocks(), pgstat_btree_page(), pgstat_gist_page(), pgstat_heap(), pgstatginindex_internal(), pgstathashindex(), pgstatindex_impl(), read_seq_tuple(), RelationGetBufferForTuple(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), shiftList(), spgdoinsert(), spgGetCache(), SpGistNewBuffer(), spgprocesspending(), spgvacuumpage(), spgWalk(), startScanEntry(), statapprox_heap(), summarize_range(), UnlockReleaseBuffer(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), vm_readbuf(), XLogReadBufferExtended(), XLogReadBufferForRedoExtended(), and XLogRecordPageWithFreeSpace().

3723 {
3724  BufferDesc *buf;
3725 
3726  Assert(BufferIsValid(buffer));
3727  if (BufferIsLocal(buffer))
3728  return; /* local buffers need no lock */
3729 
3730  buf = GetBufferDescriptor(buffer - 1);
3731 
 3732  if (mode == BUFFER_LOCK_UNLOCK)
 3733  LWLockRelease(BufferDescriptorGetContentLock(buf));
 3734  else if (mode == BUFFER_LOCK_SHARE)
 3735  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
 3736  else if (mode == BUFFER_LOCK_EXCLUSIVE)
 3737  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
 3738  else
3739  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
3740 }
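Content locks are plain LWLocks, so there is no in-place upgrade: a share lock must be released before an exclusive lock can be taken. A minimal sketch, assuming `buf` is already pinned:

    LockBuffer(buf, BUFFER_LOCK_SHARE);
    /* ... read the page ... */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);    /* re-acquire in stronger mode */
    /* ... modify the page; recheck anything read under the share lock ... */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);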

◆ LockBufferForCleanup()

void LockBufferForCleanup ( Buffer  buffer)

Definition at line 3779 of file bufmgr.c.

References Assert, BM_PIN_COUNT_WAITER, BUF_STATE_GET_REFCOUNT, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid, elog, ERROR, get_ps_display(), GetBufferDescriptor, GetPrivateRefCount(), InHotStandby, LocalRefCount, LockBuffer(), LockBufHdr(), MyProcPid, palloc(), pfree(), PG_WAIT_BUFFER_PIN, ProcWaitForSignal(), ResolveRecoveryConflictWithBufferPin(), set_ps_display(), SetStartupBufferPinWaitBufId(), UnlockBufHdr, update_process_title, and BufferDesc::wait_backend_pid.

Referenced by btvacuumpage(), ginVacuumPostingTree(), hashbulkdelete(), lazy_scan_heap(), ReadBuffer_common(), and XLogReadBufferForRedoExtended().

3780 {
3781  BufferDesc *bufHdr;
3782  char *new_status = NULL;
3783 
3784  Assert(BufferIsValid(buffer));
3785  Assert(PinCountWaitBuf == NULL);
3786 
3787  if (BufferIsLocal(buffer))
3788  {
3789  /* There should be exactly one pin */
3790  if (LocalRefCount[-buffer - 1] != 1)
3791  elog(ERROR, "incorrect local pin count: %d",
3792  LocalRefCount[-buffer - 1]);
3793  /* Nobody else to wait for */
3794  return;
3795  }
3796 
3797  /* There should be exactly one local pin */
3798  if (GetPrivateRefCount(buffer) != 1)
3799  elog(ERROR, "incorrect local pin count: %d",
3800  GetPrivateRefCount(buffer));
3801 
3802  bufHdr = GetBufferDescriptor(buffer - 1);
3803 
3804  for (;;)
3805  {
3806  uint32 buf_state;
3807 
3808  /* Try to acquire lock */
 3809  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 3810  buf_state = LockBufHdr(bufHdr);
3811 
3812  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3813  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3814  {
3815  /* Successfully acquired exclusive lock with pincount 1 */
3816  UnlockBufHdr(bufHdr, buf_state);
3817 
3818  /* Report change to non-waiting status */
3819  if (new_status)
3820  {
3821  set_ps_display(new_status);
3822  pfree(new_status);
3823  }
3824  return;
3825  }
3826  /* Failed, so mark myself as waiting for pincount 1 */
3827  if (buf_state & BM_PIN_COUNT_WAITER)
3828  {
3829  UnlockBufHdr(bufHdr, buf_state);
3830  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3831  elog(ERROR, "multiple backends attempting to wait for pincount 1");
3832  }
3833  bufHdr->wait_backend_pid = MyProcPid;
3834  PinCountWaitBuf = bufHdr;
3835  buf_state |= BM_PIN_COUNT_WAITER;
3836  UnlockBufHdr(bufHdr, buf_state);
3837  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3838 
3839  /* Wait to be signaled by UnpinBuffer() */
3840  if (InHotStandby)
3841  {
3842  /* Report change to waiting status */
3843  if (update_process_title && new_status == NULL)
3844  {
3845  const char *old_status;
3846  int len;
3847 
3848  old_status = get_ps_display(&len);
3849  new_status = (char *) palloc(len + 8 + 1);
3850  memcpy(new_status, old_status, len);
3851  strcpy(new_status + len, " waiting");
3852  set_ps_display(new_status);
3853  new_status[len] = '\0'; /* truncate off " waiting" */
3854  }
3855 
3856  /* Publish the bufid that Startup process waits on */
3857  SetStartupBufferPinWaitBufId(buffer - 1);
3858  /* Set alarm and then wait to be signaled by UnpinBuffer() */
 3859  ResolveRecoveryConflictWithBufferPin();
 3860  /* Reset the published bufid */
 3861  SetStartupBufferPinWaitBufId(-1);
 3862  }
 3863  else
 3864  ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
 3865 
3866  /*
3867  * Remove flag marking us as waiter. Normally this will not be set
3868  * anymore, but ProcWaitForSignal() can return for other signals as
3869  * well. We take care to only reset the flag if we're the waiter, as
3870  * theoretically another backend could have started waiting. That's
3871  * impossible with the current usages due to table level locking, but
3872  * better be safe.
3873  */
3874  buf_state = LockBufHdr(bufHdr);
3875  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3876  bufHdr->wait_backend_pid == MyProcPid)
3877  buf_state &= ~BM_PIN_COUNT_WAITER;
3878  UnlockBufHdr(bufHdr, buf_state);
3879 
3880  PinCountWaitBuf = NULL;
3881  /* Loop back and try again */
3882  }
3883 }
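Because this can block for a long time, callers such as lazy_scan_heap() usually try ConditionalLockBufferForCleanup() first and only insist when the block must be processed. A sketch, with `vac_strategy` a placeholder BufferAccessStrategy:

    buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, vac_strategy);

    if (!ConditionalLockBufferForCleanup(buf))
    {
        /* could skip the page here; otherwise wait for the cleanup lock */
        LockBufferForCleanup(buf);
    }
    /* ... prune and defragment the page ... */
    UnlockReleaseBuffer(buf);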

◆ LockBufHdr()

uint32 LockBufHdr ( BufferDesc *  desc)

Definition at line 4290 of file bufmgr.c.

References BM_LOCKED, finish_spin_delay(), init_local_spin_delay, perform_spin_delay(), pg_atomic_fetch_or_u32(), and BufferDesc::state.

Referenced by AbortBufferIO(), apw_dump_now(), BufferAlloc(), BufferGetLSNAtomic(), BufferSync(), ConditionalLockBufferForCleanup(), DropDatabaseBuffers(), DropRelFileNodeBuffers(), DropRelFileNodesAllBuffers(), FlushBuffer(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetBufferFromRing(), InvalidateBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkBufferDirtyHint(), pg_buffercache_pages(), ReadBuffer_common(), StartBufferIO(), StrategyGetBuffer(), SyncOneBuffer(), TerminateBufferIO(), UnlockBuffers(), UnpinBuffer(), and WaitIO().

4291 {
4292  SpinDelayStatus delayStatus;
4293  uint32 old_buf_state;
4294 
4295  init_local_spin_delay(&delayStatus);
4296 
4297  while (true)
4298  {
4299  /* set BM_LOCKED flag */
4300  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
4301  /* if it wasn't set before we're OK */
4302  if (!(old_buf_state & BM_LOCKED))
4303  break;
4304  perform_spin_delay(&delayStatus);
4305  }
4306  finish_spin_delay(&delayStatus);
4307  return old_buf_state | BM_LOCKED;
4308 }
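The returned value is the state word as it stood when the lock was taken (with BM_LOCKED set), so callers modify that copy and hand it back to UnlockBufHdr, which stores it and clears BM_LOCKED in one write. The usual pattern:

    uint32  buf_state = LockBufHdr(bufHdr);

    /* inspect or adjust header fields; keep this section very short,
     * since other backends spin while BM_LOCKED is set */
    UnlockBufHdr(bufHdr, buf_state);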

◆ MarkBufferDirty()

void MarkBufferDirty ( Buffer  buffer)

Definition at line 1468 of file bufmgr.c.

References Assert, BM_DIRTY, BM_JUST_DIRTIED, BM_LOCKED, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsPinned, BufferIsValid, elog, ERROR, GetBufferDescriptor, LW_EXCLUSIVE, LWLockHeldByMeInMode(), MarkLocalBufferDirty(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), pgBufferUsage, BufferUsage::shared_blks_dirtied, BufferDesc::state, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, VacuumPageDirty, and WaitBufHdrUnlocked().

Referenced by _bt_clear_incomplete_split(), _bt_dedup_one_page(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_getroot(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_newroot(), _bt_restore_meta(), _bt_split(), _bt_unlink_halfdead_page(), _bt_update_meta_cleanup_info(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_freeovflpage(), _hash_init(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), addLeafTuple(), brin_doinsert(), brin_doupdate(), brin_initialize_empty_new_buffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinRevmapDesummarizeRange(), btree_xlog_dedup(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), createPostingTree(), do_setval(), doPickSplit(), fill_seq_with_data(), FreeSpaceMapPrepareTruncateRel(), generic_redo(), GenericXLogFinish(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginDeletePage(), ginHeapTupleFastInsert(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginUpdateStats(), ginVacuumPostingTreeLeaf(), gistbuild(), gistbuildempty(), gistdeletepage(), gistplacetopage(), gistprunepage(), gistRedoClearFollowRight(), gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_squeeze_page(), hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_finish_speculative(), heap_inplace_update(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_page_prune(), heap_update(), heap_xlog_clean(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_freeze_page(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_update(), heap_xlog_visible(), lazy_scan_heap(), lazy_vacuum_page(), log_newpage_range(), moveLeafs(), nextval_internal(), RelationGetBufferForTuple(), revmap_physical_extend(), saveNodeLink(), seq_redo(), shiftList(), spgAddNodeAction(), spgbuild(), SpGistUpdateMetaPage(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), vacuumLeafPage(), vacuumLeafRoot(), vacuumRedirectAndPlaceholder(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), writeListPage(), and XLogReadBufferForRedoExtended().

1469 {
1470  BufferDesc *bufHdr;
1471  uint32 buf_state;
1472  uint32 old_buf_state;
1473 
1474  if (!BufferIsValid(buffer))
1475  elog(ERROR, "bad buffer ID: %d", buffer);
1476 
1477  if (BufferIsLocal(buffer))
1478  {
1479  MarkLocalBufferDirty(buffer);
1480  return;
1481  }
1482 
1483  bufHdr = GetBufferDescriptor(buffer - 1);
1484 
1485  Assert(BufferIsPinned(buffer));
 1486  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
 1487  LW_EXCLUSIVE));
1488 
1489  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
1490  for (;;)
1491  {
1492  if (old_buf_state & BM_LOCKED)
1493  old_buf_state = WaitBufHdrUnlocked(bufHdr);
1494 
1495  buf_state = old_buf_state;
1496 
1497  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1498  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
1499 
1500  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
1501  buf_state))
1502  break;
1503  }
1504 
1505  /*
1506  * If the buffer was not dirty already, do vacuum accounting.
1507  */
1508  if (!(old_buf_state & BM_DIRTY))
1509  {
1510  VacuumPageDirty++;
 1511  pgBufferUsage.shared_blks_dirtied++;
 1512  if (VacuumCostActive)
 1513  VacuumCostBalance += VacuumCostPageDirty;
 1514  }
1515 }
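Callers normally dirty the buffer inside a critical section, between modifying the page and writing WAL for the change. A sketch of that canonical sequence, where `rmid` and `info` stand in for the access method's actual resource-manager ID and record info (placeholders, not fixed values):

    XLogRecPtr  recptr;

    START_CRIT_SECTION();

    /* ... apply the change to the page, holding the exclusive lock ... */
    MarkBufferDirty(buf);

    XLogBeginInsert();
    XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
    recptr = XLogInsert(rmid, info);          /* placeholders */
    PageSetLSN(BufferGetPage(buf), recptr);

    END_CRIT_SECTION();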

◆ MarkBufferDirtyHint()

void MarkBufferDirtyHint ( Buffer  buffer,
bool  buffer_std 
)

Definition at line 3553 of file bufmgr.c.

References Assert, BM_DIRTY, BM_JUST_DIRTIED, BM_PERMANENT, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetContentLock, BufferGetPage, BufferIsLocal, BufferIsValid, PGPROC::delayChkpt, elog, ERROR, GetBufferDescriptor, GetPrivateRefCount(), InvalidXLogRecPtr, LockBufHdr(), LWLockHeldByMe(), MarkLocalBufferDirty(), MyProc, PageSetLSN, pg_atomic_read_u32(), pgBufferUsage, RecoveryInProgress(), RelFileNodeSkippingWAL(), buftag::rnode, BufferUsage::shared_blks_dirtied, BufferDesc::state, BufferDesc::tag, UnlockBufHdr, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, VacuumPageDirty, XLogHintBitIsNeeded, XLogRecPtrIsInvalid, and XLogSaveBufferForHint().

Referenced by _bt_check_unique(), _bt_killitems(), _hash_kill_items(), brin_start_evacuating_page(), btvacuumpage(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), gistkillitems(), heap_page_prune(), read_seq_tuple(), SetHintBits(), and XLogRecordPageWithFreeSpace().

3554 {
3555  BufferDesc *bufHdr;
3556  Page page = BufferGetPage(buffer);
3557 
3558  if (!BufferIsValid(buffer))
3559  elog(ERROR, "bad buffer ID: %d", buffer);
3560 
3561  if (BufferIsLocal(buffer))
3562  {
3563  MarkLocalBufferDirty(buffer);
3564  return;
3565  }
3566 
3567  bufHdr = GetBufferDescriptor(buffer - 1);
3568 
3569  Assert(GetPrivateRefCount(buffer) > 0);
3570  /* here, either share or exclusive lock is OK */
 3571  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
 3572 
3573  /*
3574  * This routine might get called many times on the same page, if we are
3575  * making the first scan after commit of an xact that added/deleted many
3576  * tuples. So, be as quick as we can if the buffer is already dirty. We
3577  * do this by not acquiring spinlock if it looks like the status bits are
3578  * already set. Since we make this test unlocked, there's a chance we
3579  * might fail to notice that the flags have just been cleared, and failed
3580  * to reset them, due to memory-ordering issues. But since this function
3581  * is only intended to be used in cases where failing to write out the
3582  * data would be harmless anyway, it doesn't really matter.
3583  */
3584  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
 3585  (BM_DIRTY | BM_JUST_DIRTIED))
 3586  {
 3587  XLogRecPtr lsn = InvalidXLogRecPtr;
 3588  bool dirtied = false;
3589  bool delayChkpt = false;
3590  uint32 buf_state;
3591 
3592  /*
3593  * If we need to protect hint bit updates from torn writes, WAL-log a
3594  * full page image of the page. This full page image is only necessary
3595  * if the hint bit update is the first change to the page since the
3596  * last checkpoint.
3597  *
3598  * We don't check full_page_writes here because that logic is included
3599  * when we call XLogInsert() since the value changes dynamically.
3600  */
3601  if (XLogHintBitIsNeeded() &&
3602  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
3603  {
3604  /*
3605  * If we must not write WAL, due to a relfilenode-specific
3606  * condition or being in recovery, don't dirty the page. We can
3607  * set the hint, just not dirty the page as a result so the hint
3608  * is lost when we evict the page or shutdown.
3609  *
3610  * See src/backend/storage/page/README for longer discussion.
3611  */
3612  if (RecoveryInProgress() ||
3613  RelFileNodeSkippingWAL(bufHdr->tag.rnode))
3614  return;
3615 
3616  /*
3617  * If the block is already dirty because we either made a change
3618  * or set a hint already, then we don't need to write a full page
3619  * image. Note that aggressive cleaning of blocks dirtied by hint
3620  * bit setting would increase the call rate. Bulk setting of hint
3621  * bits would reduce the call rate...
3622  *
3623  * We must issue the WAL record before we mark the buffer dirty.
3624  * Otherwise we might write the page before we write the WAL. That
3625  * causes a race condition, since a checkpoint might occur between
3626  * writing the WAL record and marking the buffer dirty. We solve
3627  * that with a kluge, but one that is already in use during
3628  * transaction commit to prevent race conditions. Basically, we
3629  * simply prevent the checkpoint WAL record from being written
3630  * until we have marked the buffer dirty. We don't start the
3631  * checkpoint flush until we have marked dirty, so our checkpoint
3632  * must flush the change to disk successfully or the checkpoint
3633  * never gets written, so crash recovery will fix.
3634  *
3635  * It's possible we may enter here without an xid, so it is
3636  * essential that CreateCheckpoint waits for virtual transactions
3637  * rather than full transactionids.
3638  */
3639  MyProc->delayChkpt = delayChkpt = true;
3640  lsn = XLogSaveBufferForHint(buffer, buffer_std);
3641  }
3642 
3643  buf_state = LockBufHdr(bufHdr);
3644 
3645  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3646 
3647  if (!(buf_state & BM_DIRTY))
3648  {
3649  dirtied = true; /* Means "will be dirtied by this action" */
3650 
3651  /*
3652  * Set the page LSN if we wrote a backup block. We aren't supposed
3653  * to set this when only holding a share lock but as long as we
3654  * serialise it somehow we're OK. We choose to set LSN while
3655  * holding the buffer header lock, which causes any reader of an
3656  * LSN who holds only a share lock to also obtain a buffer header
3657  * lock before using PageGetLSN(), which is enforced in
3658  * BufferGetLSNAtomic().
3659  *
3660  * If checksums are enabled, you might think we should reset the
3661  * checksum here. That will happen when the page is written
3662  * sometime later in this checkpoint cycle.
3663  */
3664  if (!XLogRecPtrIsInvalid(lsn))
3665  PageSetLSN(page, lsn);
3666  }
3667 
3668  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
3669  UnlockBufHdr(bufHdr, buf_state);
3670 
3671  if (delayChkpt)
3672  MyProc->delayChkpt = false;
3673 
3674  if (dirtied)
3675  {
3676  VacuumPageDirty++;
 3677  pgBufferUsage.shared_blks_dirtied++;
 3678  if (VacuumCostActive)
 3679  VacuumCostBalance += VacuumCostPageDirty;
 3680  }
3681  }
3682 }
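SetHintBits() is the archetypal caller: it flips an informational bit and then reports the page as hint-dirtied. A reduced sketch of that pattern (heap case, standard page layout; losing the hint on a crash is acceptable, which is what distinguishes this path from MarkBufferDirty):

    tuple->t_infomask |= HEAP_XMIN_COMMITTED;   /* example hint bit */
    MarkBufferDirtyHint(buffer, true);          /* true = standard page layout */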

◆ NewPrivateRefCountEntry()

static PrivateRefCountEntry * NewPrivateRefCountEntry ( Buffer  buffer) [static]

Definition at line 272 of file bufmgr.c.

References Assert, PrivateRefCountEntry::buffer, PrivateRefCountEntry::refcount, and ReservedRefCountEntry.

Referenced by PinBuffer(), and PinBuffer_Locked().

273 {
 274  PrivateRefCountEntry *res;
 275 
276  /* only allowed to be called when a reservation has been made */
277  Assert(ReservedRefCountEntry != NULL);
278 
279  /* use up the reserved entry */
280  res = ReservedRefCountEntry;
281  ReservedRefCountEntry = NULL;
282 
283  /* and fill it */
284  res->buffer = buffer;
285  res->refcount = 0;
286 
287  return res;
288 }
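The split between reservation and use exists because ReservePrivateRefCountEntry() may have to make room in the fixed-size entry array (a hash-table insertion that can allocate), which must not happen while a buffer-header spinlock is held. The shape of the protocol:

    ReservePrivateRefCountEntry();      /* may do real work; no spinlock held */
    buf_state = LockBufHdr(bufHdr);
    /* ... decide to take the pin ... */
    UnlockBufHdr(bufHdr, buf_state);
    ref = NewPrivateRefCountEntry(BufferDescriptorGetBuffer(bufHdr));  /* cannot fail */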

◆ PinBuffer()

static bool PinBuffer ( BufferDesc *  buf,
BufferAccessStrategy  strategy 
) [static]

Definition at line 1589 of file bufmgr.c.

References Assert, BM_LOCKED, BM_MAX_USAGE_COUNT, BM_VALID, BUF_REFCOUNT_ONE, BUF_STATE_GET_USAGECOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ReservePrivateRefCountEntry(), ResourceOwnerRememberBuffer(), BufferDesc::state, and WaitBufHdrUnlocked().

Referenced by BufferAlloc().

1590 {
 1591  Buffer b = BufferDescriptorGetBuffer(buf);
 1592  bool result;
1593  PrivateRefCountEntry *ref;
1594 
1595  ref = GetPrivateRefCountEntry(b, true);
1596 
1597  if (ref == NULL)
1598  {
1599  uint32 buf_state;
1600  uint32 old_buf_state;
 1601 
 1602  ReservePrivateRefCountEntry();
 1603  ref = NewPrivateRefCountEntry(b);
1604 
1605  old_buf_state = pg_atomic_read_u32(&buf->state);
1606  for (;;)
1607  {
1608  if (old_buf_state & BM_LOCKED)
1609  old_buf_state = WaitBufHdrUnlocked(buf);
1610 
1611  buf_state = old_buf_state;
1612 
1613  /* increase refcount */
1614  buf_state += BUF_REFCOUNT_ONE;
1615 
1616  if (strategy == NULL)
1617  {
1618  /* Default case: increase usagecount unless already max. */
 1619  if (BUF_STATE_GET_USAGECOUNT(buf_state) != BM_MAX_USAGE_COUNT)
 1620  buf_state += BUF_USAGECOUNT_ONE;
1621  }
1622  else
1623  {
1624  /*
1625  * Ring buffers shouldn't evict others from pool. Thus we
1626  * don't make usagecount more than 1.
1627  */
1628  if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
1629  buf_state += BUF_USAGECOUNT_ONE;
1630  }
1631 
1632  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1633  buf_state))
1634  {
1635  result = (buf_state & BM_VALID) != 0;
1636  break;
1637  }
1638  }
1639  }
1640  else
1641  {
1642  /* If we previously pinned the buffer, it must surely be valid */
1643  result = true;
1644  }
1645 
1646  ref->refcount++;
1647  Assert(ref->refcount > 0);
 1648  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
 1649  return result;
1650 }
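The strategy branch above is what keeps bulk scans from flushing the whole pool: with a ring strategy the usage count never rises past 1, so such pages remain easy eviction victims. From the caller's side (a sketch; `rel` and `blkno` are placeholders):

    BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
    Buffer  buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                     RBM_NORMAL, strategy);

    /* ... scan the page ... */
    ReleaseBuffer(buf);
    FreeAccessStrategy(strategy);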

◆ PinBuffer_Locked()

static void PinBuffer_Locked ( BufferDesc *  buf) [static]

Definition at line 1674 of file bufmgr.c.

References Assert, BM_LOCKED, BUF_REFCOUNT_ONE, BufferDescriptorGetBuffer, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ResourceOwnerRememberBuffer(), BufferDesc::state, and UnlockBufHdr.

Referenced by BufferAlloc(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), and SyncOneBuffer().

1675 {
1676  Buffer b;
1677  PrivateRefCountEntry *ref;
1678  uint32 buf_state;
1679 
1680  /*
1681  * As explained, We don't expect any preexisting pins. That allows us to
1682  * manipulate the PrivateRefCount after releasing the spinlock
1683  */
 1684  Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
 1685 
1686  /*
1687  * Since we hold the buffer spinlock, we can update the buffer state and
1688  * release the lock in one operation.
1689  */
1690  buf_state = pg_atomic_read_u32(&buf->state);
1691  Assert(buf_state & BM_LOCKED);
1692  buf_state += BUF_REFCOUNT_ONE;
1693  UnlockBufHdr(buf, buf_state);
1694 
1695  b = BufferDescriptorGetBuffer(buf);
1696 
1697  ref = NewPrivateRefCountEntry(b);
1698  ref->refcount++;
1699 
 1700  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
 1701 }

◆ PrefetchBuffer()

PrefetchBufferResult PrefetchBuffer ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 574 of file bufmgr.c.

References Assert, BlockNumberIsValid, ereport, errcode(), errmsg(), ERROR, PrefetchLocalBuffer(), PrefetchSharedBuffer(), RelationData::rd_smgr, RELATION_IS_OTHER_TEMP, RelationIsValid, RelationOpenSmgr, and RelationUsesLocalBuffers.

Referenced by BitmapPrefetch(), count_nondeletable_pages(), HeapTupleHeaderAdvanceLatestRemovedXid(), and pg_prewarm().

575 {
576  Assert(RelationIsValid(reln));
577  Assert(BlockNumberIsValid(blockNum));
578 
579  /* Open it at the smgr level if not already done */
580  RelationOpenSmgr(reln);
581 
582  if (RelationUsesLocalBuffers(reln))
583  {
584  /* see comments in ReadBufferExtended */
585  if (RELATION_IS_OTHER_TEMP(reln))
586  ereport(ERROR,
587  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
588  errmsg("cannot access temporary tables of other sessions")));
589 
590  /* pass it off to localbuf.c */
591  return PrefetchLocalBuffer(reln->rd_smgr, forkNum, blockNum);
592  }
593  else
594  {
595  /* pass it to the shared buffer version */
596  return PrefetchSharedBuffer(reln->rd_smgr, forkNum, blockNum);
597  }
598 }
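A caller that expects to read a block soon can overlap the I/O with other work; the result also reveals whether the block was already cached. A sketch:

    PrefetchBufferResult r = PrefetchBuffer(rel, MAIN_FORKNUM, blkno);

    if (r.initiated_io)
    {
        /* an async read was started; do useful work before ReadBuffer() */
    }
    else if (BufferIsValid(r.recent_buffer))
    {
        /* already cached; recent_buffer is an unpinned hint and must be
         * re-verified before it is used */
    }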

◆ PrefetchSharedBuffer()

PrefetchBufferResult PrefetchSharedBuffer ( SMgrRelation  smgr_reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 487 of file bufmgr.c.

References Assert, BlockNumberIsValid, BufMappingPartitionLock, BufTableHashCode(), BufTableLookup(), INIT_BUFFERTAG, PrefetchBufferResult::initiated_io, InvalidBuffer, LW_SHARED, LWLockAcquire(), LWLockRelease(), RelFileNodeBackend::node, PrefetchBufferResult::recent_buffer, SMgrRelationData::smgr_rnode, and smgrprefetch().

Referenced by PrefetchBuffer().

490 {
 491  PrefetchBufferResult result = {InvalidBuffer, false};
 492  BufferTag newTag; /* identity of requested block */
493  uint32 newHash; /* hash value for newTag */
494  LWLock *newPartitionLock; /* buffer partition lock for it */
495  int buf_id;
496 
497  Assert(BlockNumberIsValid(blockNum));
498 
499  /* create a tag so we can lookup the buffer */
500  INIT_BUFFERTAG(newTag, smgr_reln->smgr_rnode.node,
501  forkNum, blockNum);
502 
503  /* determine its hash code and partition lock ID */
504  newHash = BufTableHashCode(&newTag);
505  newPartitionLock = BufMappingPartitionLock(newHash);
506 
507  /* see if the block is in the buffer pool already */
508  LWLockAcquire(newPartitionLock, LW_SHARED);
509  buf_id = BufTableLookup(&newTag, newHash);
510  LWLockRelease(newPartitionLock);
511 
512  /* If not in buffers, initiate prefetch */
513  if (buf_id < 0)
514  {
515 #ifdef USE_PREFETCH
516  /*
517  * Try to initiate an asynchronous read. This returns false in
518  * recovery if the relation file doesn't exist.
519  */
520  if (smgrprefetch(smgr_reln, forkNum, blockNum))
521  result.initiated_io = true;
522 #endif /* USE_PREFETCH */
523  }
524  else
525  {
526  /*
527  * Report the buffer it was in at that time. The caller may be able
528  * to avoid a buffer table lookup, but it's not pinned and it must be
529  * rechecked!
530  */
531  result.recent_buffer = buf_id + 1;
532  }
533 
534  /*
535  * If the block *is* in buffers, we do nothing. This is not really ideal:
536  * the block might be just about to be evicted, which would be stupid
537  * since we know we are going to need it soon. But the only easy answer
538  * is to bump the usage_count, which does not seem like a great solution:
539  * when the caller does ultimately touch the block, usage_count would get
540  * bumped again, resulting in too much favoritism for blocks that are
541  * involved in a prefetch sequence. A real fix would involve some
542  * additional per-buffer state, and it's not clear that there's enough of
543  * a problem to justify that.
544  */
545 
546  return result;
547 }

◆ PrintBufferLeakWarning()

void PrintBufferLeakWarning ( Buffer  buffer)

Definition at line 2559 of file bufmgr.c.

References Assert, buftag::blockNum, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BufferIsLocal, BufferIsValid, elog, buftag::forkNum, GetBufferDescriptor, GetLocalBufferDescriptor, GetPrivateRefCount(), InvalidBackendId, LocalRefCount, MyBackendId, pfree(), pg_atomic_read_u32(), relpathbackend, buftag::rnode, BufferDesc::state, BufferDesc::tag, and WARNING.

Referenced by CheckForBufferLeaks(), CheckForLocalBufferLeaks(), and ResourceOwnerReleaseInternal().

2560 {
2561  BufferDesc *buf;
2562  int32 loccount;
2563  char *path;
2564  BackendId backend;
2565  uint32 buf_state;
2566 
2567  Assert(BufferIsValid(buffer));
2568  if (BufferIsLocal(buffer))
2569  {
2570  buf = GetLocalBufferDescriptor(-buffer - 1);
2571  loccount = LocalRefCount[-buffer - 1];
2572  backend = MyBackendId;
2573  }
2574  else
2575  {
2576  buf = GetBufferDescriptor(buffer - 1);
2577  loccount = GetPrivateRefCount(buffer);
2578  backend = InvalidBackendId;
2579  }
2580 
2581  /* theoretically we should lock the bufhdr here */
2582  path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
2583  buf_state = pg_atomic_read_u32(&buf->state);
2584  elog(WARNING,
2585  "buffer refcount leak: [%03d] "
2586  "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
2587  buffer, path,
2588  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
2589  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
2590  pfree(path);
2591 }

◆ ReadBuffer()

Buffer ReadBuffer ( Relation  reln,
BlockNumber  blockNum 
)

Definition at line 606 of file bufmgr.c.

References MAIN_FORKNUM, RBM_NORMAL, and ReadBufferExtended().

Referenced by _bt_getbuf(), _bt_search_insert(), _hash_getbuf(), _hash_getbuf_with_condlock_cleanup(), blbulkdelete(), blinsert(), BloomNewBuffer(), brin_getinsertbuffer(), brinbuild(), brinGetStats(), brinGetTupleForHeapBlock(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), bt_metap(), bt_page_items(), bt_page_stats(), fill_seq_with_data(), ginFindLeafPage(), ginFindParents(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), GinNewBuffer(), ginStepRight(), ginUpdateStats(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfixsplit(), gistGetMaxLevel(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), heap_abort_speculative(), heap_compute_xid_horizon_for_tuples(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_get_latest_tid(), heap_inplace_update(), heap_lock_tuple(), heap_update(), initBloomState(), pg_visibility(), pgstatginindex_internal(), read_seq_tuple(), RelationGetBufferForTuple(), ReleaseAndReadBuffer(), revmap_get_buffer(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), shiftList(), spgdoinsert(), spgGetCache(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), and spgWalk().

607 {
608  return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
609 }
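Note that the returned buffer is pinned but not locked. A minimal read-only access therefore looks like this:

    Buffer  buf = ReadBuffer(rel, blkno);   /* pin only */
    Page    page;

    LockBuffer(buf, BUFFER_LOCK_SHARE);
    page = BufferGetPage(buf);
    /* ... examine the page contents ... */
    UnlockReleaseBuffer(buf);               /* drops lock and pin together */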

◆ ReadBuffer_common()

static Buffer ReadBuffer_common ( SMgrRelation  reln,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy,
bool *  hit 
) [static]

Definition at line 715 of file bufmgr.c.

References Assert, RelFileNodeBackend::backend, BufferUsage::blk_read_time, BM_VALID, BufferAlloc(), BufferDescriptorGetBuffer, BufferDescriptorGetContentLock, BufHdrGetBlock, CurrentResourceOwner, RelFileNode::dbNode, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errhint(), errmsg(), ERROR, INSTR_TIME_ADD, INSTR_TIME_GET_MICROSEC, INSTR_TIME_SET_CURRENT, INSTR_TIME_SUBTRACT, BufferUsage::local_blks_hit, BufferUsage::local_blks_read, BufferUsage::local_blks_written, LocalBufferAlloc(), LocalBufHdrGetBlock, LockBufferForCleanup(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), MemSet, RelFileNodeBackend::node, P_NEW, PageIsNew, PageIsVerified(), pg_atomic_read_u32(), pg_atomic_unlocked_write_u32(), pgBufferUsage, pgstat_count_buffer_read_time, RBM_NORMAL, RBM_NORMAL_NO_LOG, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, RBM_ZERO_ON_ERROR, RelFileNode::relNode, relpath, ResourceOwnerEnlargeBuffers(), BufferUsage::shared_blks_hit, BufferUsage::shared_blks_read, BufferUsage::shared_blks_written, SMgrRelationData::smgr_rnode, smgrextend(), SmgrIsTemp, smgrnblocks(), smgrread(), RelFileNode::spcNode, StartBufferIO(), BufferDesc::state, TerminateBufferIO(), track_io_timing, UnlockBufHdr, VacuumCostActive, VacuumCostBalance, VacuumCostPageHit, VacuumCostPageMiss, VacuumPageHit, VacuumPageMiss, WARNING, and zero_damaged_pages.

Referenced by ReadBufferExtended(), and ReadBufferWithoutRelcache().

718 {
719  BufferDesc *bufHdr;
720  Block bufBlock;
721  bool found;
722  bool isExtend;
723  bool isLocalBuf = SmgrIsTemp(smgr);
724 
725  *hit = false;
726 
727  /* Make sure we will have room to remember the buffer pin */
 728  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
 729 
730  isExtend = (blockNum == P_NEW);
731 
732  TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
733  smgr->smgr_rnode.node.spcNode,
734  smgr->smgr_rnode.node.dbNode,
735  smgr->smgr_rnode.node.relNode,
736  smgr->smgr_rnode.backend,
737  isExtend);
738 
739  /* Substitute proper block number if caller asked for P_NEW */
740  if (isExtend)
741  blockNum = smgrnblocks(smgr, forkNum);
742 
743  if (isLocalBuf)
744  {
745  bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
 746  if (found)
 747  pgBufferUsage.local_blks_hit++;
 748  else if (isExtend)
 749  pgBufferUsage.local_blks_written++;
 750  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
 751  mode == RBM_ZERO_ON_ERROR)
 752  pgBufferUsage.local_blks_read++;
 753  }
754  else
755  {
756  /*
757  * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
758  * not currently in memory.
759  */
760  bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
761  strategy, &found);
 762  if (found)
 763  pgBufferUsage.shared_blks_hit++;
 764  else if (isExtend)
 765  pgBufferUsage.shared_blks_written++;
 766  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
 767  mode == RBM_ZERO_ON_ERROR)
 768  pgBufferUsage.shared_blks_read++;
 769  }
770 
771  /* At this point we do NOT hold any locks. */
772 
773  /* if it was already in the buffer pool, we're done */
774  if (found)
775  {
776  if (!isExtend)
777  {
778  /* Just need to update stats before we exit */
779  *hit = true;
780  VacuumPageHit++;
781 
782  if (VacuumCostActive)
 783  VacuumCostBalance += VacuumCostPageHit;
 784 
785  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
786  smgr->smgr_rnode.node.spcNode,
787  smgr->smgr_rnode.node.dbNode,
788  smgr->smgr_rnode.node.relNode,
789  smgr->smgr_rnode.backend,
790  isExtend,
791  found);
792 
793  /*
794  * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
795  * locked on return.
796  */
797  if (!isLocalBuf)
798  {
799  if (mode == RBM_ZERO_AND_LOCK)
 800  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
 801  LW_EXCLUSIVE);
 802  else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
 803  LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
 804  }
805 
806  return BufferDescriptorGetBuffer(bufHdr);
807  }
808 
809  /*
810  * We get here only in the corner case where we are trying to extend
811  * the relation but we found a pre-existing buffer marked BM_VALID.
812  * This can happen because mdread doesn't complain about reads beyond
813  * EOF (when zero_damaged_pages is ON) and so a previous attempt to
814  * read a block beyond EOF could have left a "valid" zero-filled
815  * buffer. Unfortunately, we have also seen this case occurring
816  * because of buggy Linux kernels that sometimes return an
817  * lseek(SEEK_END) result that doesn't account for a recent write. In
818  * that situation, the pre-existing buffer would contain valid data
819  * that we don't want to overwrite. Since the legitimate case should
820  * always have left a zero-filled buffer, complain if not PageIsNew.
821  */
822  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
823  if (!PageIsNew((Page) bufBlock))
824  ereport(ERROR,
825  (errmsg("unexpected data beyond EOF in block %u of relation %s",
826  blockNum, relpath(smgr->smgr_rnode, forkNum)),
827  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
828 
829  /*
830  * We *must* do smgrextend before succeeding, else the page will not
831  * be reserved by the kernel, and the next P_NEW call will decide to
832  * return the same page. Clear the BM_VALID bit, do the StartBufferIO
833  * call that BufferAlloc didn't, and proceed.
834  */
835  if (isLocalBuf)
836  {
837  /* Only need to adjust flags */
838  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
839 
840  Assert(buf_state & BM_VALID);
841  buf_state &= ~BM_VALID;
842  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
843  }
844  else
845  {
846  /*
847  * Loop to handle the very small possibility that someone re-sets
848  * BM_VALID between our clearing it and StartBufferIO inspecting
849  * it.
850  */
851  do
852  {
853  uint32 buf_state = LockBufHdr(bufHdr);
854 
855  Assert(buf_state & BM_VALID);
856  buf_state &= ~BM_VALID;
857  UnlockBufHdr(bufHdr, buf_state);
858  } while (!StartBufferIO(bufHdr, true));
859  }
860  }
861 
862  /*
863  * if we have gotten to this point, we have allocated a buffer for the
864  * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
865  * if it's a shared buffer.
866  *
867  * Note: if smgrextend fails, we will end up with a buffer that is
868  * allocated but not marked BM_VALID. P_NEW will still select the same
869  * block number (because the relation didn't get any longer on disk) and
870  * so future attempts to extend the relation will find the same buffer (if
871  * it's not been recycled) but come right back here to try smgrextend
872  * again.
873  */
874  Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
875 
876  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
877 
878  if (isExtend)
879  {
880  /* new buffers are zero-filled */
881  MemSet((char *) bufBlock, 0, BLCKSZ);
882  /* don't set checksum for all-zero page */
883  smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
884 
885  /*
886  * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
887  * although we're essentially performing a write. At least on linux
888  * doing so defeats the 'delayed allocation' mechanism, leading to
889  * increased file fragmentation.
890  */
891  }
892  else
893  {
894  /*
895  * Read in the page, unless the caller intends to overwrite it and
896  * just wants us to allocate a buffer.
897  */
 898  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
 899  MemSet((char *) bufBlock, 0, BLCKSZ);
900  else
901  {
902  instr_time io_start,
903  io_time;
904 
905  if (track_io_timing)
906  INSTR_TIME_SET_CURRENT(io_start);
907 
908  smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
909 
910  if (track_io_timing)
911  {
912  INSTR_TIME_SET_CURRENT(io_time);
913  INSTR_TIME_SUBTRACT(io_time, io_start);
 914  pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
 915  INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
 916  }
917 
918  /* check for garbage data */
919  if (!PageIsVerified((Page) bufBlock, blockNum))
920  {
 921  if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
 922  {
 923  ereport(WARNING,
 924  (errcode(ERRCODE_DATA_CORRUPTED),
 925  errmsg("invalid page in block %u of relation %s; zeroing out page",
926  blockNum,
927  relpath(smgr->smgr_rnode, forkNum))));
928  MemSet((char *) bufBlock, 0, BLCKSZ);
929  }
930  else
931  ereport(ERROR,
 932  (errcode(ERRCODE_DATA_CORRUPTED),
 933  errmsg("invalid page in block %u of relation %s",
934  blockNum,
935  relpath(smgr->smgr_rnode, forkNum))));
936  }
937  }
938  }
939 
940  /*
941  * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
942  * the page as valid, to make sure that no other backend sees the zeroed
943  * page before the caller has had a chance to initialize it.
944  *
945  * Since no-one else can be looking at the page contents yet, there is no
946  * difference between an exclusive lock and a cleanup-strength lock. (Note
947  * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
948  * they assert that the buffer is already valid.)
949  */
 950  if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
 951  !isLocalBuf)
 952  {
 953  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
 954  }
955 
956  if (isLocalBuf)
957  {
958  /* Only need to adjust flags */
959  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
960 
961  buf_state |= BM_VALID;
962  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
963  }
964  else
965  {
966  /* Set BM_VALID, terminate IO, and wake up any waiters */
967  TerminateBufferIO(bufHdr, false, BM_VALID);
968  }
969 
970  VacuumPageMiss++;
971  if (VacuumCostActive)
 972  VacuumCostBalance += VacuumCostPageMiss;
 973 
974  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
975  smgr->smgr_rnode.node.spcNode,
976  smgr->smgr_rnode.node.dbNode,
977  smgr->smgr_rnode.node.relNode,
978  smgr->smgr_rnode.backend,
979  isExtend,
980  found);
981 
982  return BufferDescriptorGetBuffer(bufHdr);
983 }
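Two of the modes handled above are worth illustrating from the caller's side: P_NEW extends the relation by one zero-filled block, and RBM_ZERO_AND_LOCK skips the read entirely when the caller intends to overwrite the page. A sketch (`rel` and `blkno` placeholders):

    /* extend: allocates the block just past the current EOF */
    Buffer  newbuf = ReadBufferExtended(rel, MAIN_FORKNUM, P_NEW,
                                        RBM_NORMAL, NULL);

    /* overwrite: returns a zeroed page, already exclusively locked */
    Buffer  zbuf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                      RBM_ZERO_AND_LOCK, NULL);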

◆ ReadBufferExtended()