PostgreSQL Source Code  git master
bufmgr.c File Reference
#include "postgres.h"
#include <sys/file.h>
#include <unistd.h>
#include "access/tableam.h"
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "catalog/storage.h"
#include "executor/instrument.h"
#include "lib/binaryheap.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "storage/standby.h"
#include "utils/rel.h"
#include "utils/resowner_private.h"
#include "utils/timestamp.h"
Include dependency graph for bufmgr.c:


Data Structures

struct  PrivateRefCountEntry
 
struct  CkptTsStatus
 

Macros

#define BufHdrGetBlock(bufHdr)   ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
 
#define BufferGetLSN(bufHdr)   (PageGetLSN(BufHdrGetBlock(bufHdr)))
 
#define LocalBufHdrGetBlock(bufHdr)   LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
 
#define BUF_WRITTEN   0x01
 
#define BUF_REUSABLE   0x02
 
#define DROP_RELS_BSEARCH_THRESHOLD   20
 
#define REFCOUNT_ARRAY_ENTRIES   8
 
#define BufferIsPinned(bufnum)
 

Typedefs

typedef struct PrivateRefCountEntry PrivateRefCountEntry
 
typedef struct CkptTsStatus CkptTsStatus
 

Functions

static void ReservePrivateRefCountEntry (void)
 
static PrivateRefCountEntry * NewPrivateRefCountEntry (Buffer buffer)
 
static PrivateRefCountEntry * GetPrivateRefCountEntry (Buffer buffer, bool do_move)
 
static int32 GetPrivateRefCount (Buffer buffer)
 
static void ForgetPrivateRefCountEntry (PrivateRefCountEntry *ref)
 
static Buffer ReadBuffer_common (SMgrRelation reln, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool *hit)
 
static bool PinBuffer (BufferDesc *buf, BufferAccessStrategy strategy)
 
static void PinBuffer_Locked (BufferDesc *buf)
 
static void UnpinBuffer (BufferDesc *buf, bool fixOwner)
 
static void BufferSync (int flags)
 
static uint32 WaitBufHdrUnlocked (BufferDesc *buf)
 
static int SyncOneBuffer (int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 
static void WaitIO (BufferDesc *buf)
 
static bool StartBufferIO (BufferDesc *buf, bool forInput)
 
static void TerminateBufferIO (BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
 
static void shared_buffer_write_error_callback (void *arg)
 
static void local_buffer_write_error_callback (void *arg)
 
static BufferDesc * BufferAlloc (SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
 
static void FlushBuffer (BufferDesc *buf, SMgrRelation reln)
 
static void AtProcExit_Buffers (int code, Datum arg)
 
static void CheckForBufferLeaks (void)
 
static int rnode_comparator (const void *p1, const void *p2)
 
static int buffertag_comparator (const void *p1, const void *p2)
 
static int ckpt_buforder_comparator (const void *pa, const void *pb)
 
static int ts_ckpt_progress_comparator (Datum a, Datum b, void *arg)
 
bool ComputeIoConcurrency (int io_concurrency, double *target)
 
void PrefetchBuffer (Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 
Buffer ReadBuffer (Relation reln, BlockNumber blockNum)
 
Buffer ReadBufferExtended (Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
Buffer ReadBufferWithoutRelcache (RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
static void InvalidateBuffer (BufferDesc *buf)
 
void MarkBufferDirty (Buffer buffer)
 
Buffer ReleaseAndReadBuffer (Buffer buffer, Relation relation, BlockNumber blockNum)
 
bool BgBufferSync (WritebackContext *wb_context)
 
void AtEOXact_Buffers (bool isCommit)
 
void InitBufferPoolAccess (void)
 
void InitBufferPoolBackend (void)
 
void PrintBufferLeakWarning (Buffer buffer)
 
void CheckPointBuffers (int flags)
 
void BufmgrCommit (void)
 
BlockNumber BufferGetBlockNumber (Buffer buffer)
 
void BufferGetTag (Buffer buffer, RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blknum)
 
BlockNumber RelationGetNumberOfBlocksInFork (Relation relation, ForkNumber forkNum)
 
bool BufferIsPermanent (Buffer buffer)
 
XLogRecPtr BufferGetLSNAtomic (Buffer buffer)
 
void DropRelFileNodeBuffers (RelFileNodeBackend rnode, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
 
void DropRelFileNodesAllBuffers (RelFileNodeBackend *rnodes, int nnodes)
 
void DropDatabaseBuffers (Oid dbid)
 
void FlushRelationBuffers (Relation rel)
 
void FlushDatabaseBuffers (Oid dbid)
 
void FlushOneBuffer (Buffer buffer)
 
void ReleaseBuffer (Buffer buffer)
 
void UnlockReleaseBuffer (Buffer buffer)
 
void IncrBufferRefCount (Buffer buffer)
 
void MarkBufferDirtyHint (Buffer buffer, bool buffer_std)
 
void UnlockBuffers (void)
 
void LockBuffer (Buffer buffer, int mode)
 
bool ConditionalLockBuffer (Buffer buffer)
 
void LockBufferForCleanup (Buffer buffer)
 
bool HoldingBufferPinThatDelaysRecovery (void)
 
bool ConditionalLockBufferForCleanup (Buffer buffer)
 
bool IsBufferCleanupOK (Buffer buffer)
 
void AbortBufferIO (void)
 
uint32 LockBufHdr (BufferDesc *desc)
 
void WritebackContextInit (WritebackContext *context, int *max_pending)
 
void ScheduleBufferTagForWriteback (WritebackContext *context, BufferTag *tag)
 
void IssuePendingWritebacks (WritebackContext *context)
 
void TestForOldSnapshot_impl (Snapshot snapshot, Relation relation)
 

Variables

bool zero_damaged_pages = false
 
int bgwriter_lru_maxpages = 100
 
double bgwriter_lru_multiplier = 2.0
 
bool track_io_timing = false
 
int effective_io_concurrency = 0
 
int checkpoint_flush_after = 0
 
int bgwriter_flush_after = 0
 
int backend_flush_after = 0
 
int target_prefetch_pages = 0
 
static BufferDesc * InProgressBuf = NULL
 
static bool IsForInput
 
static BufferDesc * PinCountWaitBuf = NULL
 
static struct PrivateRefCountEntry PrivateRefCountArray [REFCOUNT_ARRAY_ENTRIES]
 
static HTAB * PrivateRefCountHash = NULL
 
static int32 PrivateRefCountOverflowed = 0
 
static uint32 PrivateRefCountClock = 0
 
static PrivateRefCountEntry * ReservedRefCountEntry = NULL
 

Macro Definition Documentation

◆ BUF_REUSABLE

#define BUF_REUSABLE   0x02

Definition at line 67 of file bufmgr.c.

Referenced by BgBufferSync(), and SyncOneBuffer().

◆ BUF_WRITTEN

#define BUF_WRITTEN   0x01

Definition at line 66 of file bufmgr.c.

Referenced by BgBufferSync(), BufferSync(), and SyncOneBuffer().

◆ BufferGetLSN

#define BufferGetLSN (   bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))

Definition at line 59 of file bufmgr.c.

Referenced by BufferAlloc(), and FlushBuffer().

◆ BufferIsPinned

#define BufferIsPinned (   bufnum)
Value:
( \
!BufferIsValid(bufnum) ? \
false \
: \
BufferIsLocal(bufnum) ? \
(LocalRefCount[-(bufnum) - 1] > 0) \
: \
(GetPrivateRefCount(bufnum) > 0) \
)

Definition at line 420 of file bufmgr.c.

Referenced by BufferGetBlockNumber(), BufferGetLSNAtomic(), BufferGetTag(), BufferIsPermanent(), FlushOneBuffer(), IncrBufferRefCount(), MarkBufferDirty(), and ReleaseAndReadBuffer().
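
Note that the macro checks only this backend's own pin (via LocalRefCount for local buffers and the backend-private refcount table for shared ones), never whether some other backend holds a pin. A hedged sketch of the intended assertion pattern (hypothetical helper as it would appear inside bufmgr.c itself, since the macro is file-local):

/* true iff *this backend* holds at least one pin on the buffer */
static void
assert_my_pin(Buffer buffer)
{
    Assert(BufferIsPinned(buffer));
}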

◆ BufHdrGetBlock

#define BufHdrGetBlock (   bufHdr)    ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))

Definition at line 58 of file bufmgr.c.

Referenced by FlushBuffer(), and ReadBuffer_common().

◆ DROP_RELS_BSEARCH_THRESHOLD

#define DROP_RELS_BSEARCH_THRESHOLD   20

Definition at line 69 of file bufmgr.c.

Referenced by DropRelFileNodesAllBuffers().

◆ LocalBufHdrGetBlock

#define LocalBufHdrGetBlock (   bufHdr)    LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]

Definition at line 62 of file bufmgr.c.

Referenced by FlushRelationBuffers(), and ReadBuffer_common().

◆ REFCOUNT_ARRAY_ENTRIES

#define REFCOUNT_ARRAY_ENTRIES   8

Typedef Documentation

◆ CkptTsStatus

typedef struct CkptTsStatus CkptTsStatus

◆ PrivateRefCountEntry

typedef struct PrivateRefCountEntry PrivateRefCountEntry

Function Documentation

◆ AbortBufferIO()

void AbortBufferIO ( void  )

Definition at line 4030 of file bufmgr.c.

References Assert, buftag::blockNum, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_VALID, buf, BufferDescriptorGetIOLock, ereport, errcode(), errdetail(), errmsg(), buftag::forkNum, InProgressBuf, IsForInput, LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), pfree(), relpathperm, buftag::rnode, BufferDesc::tag, TerminateBufferIO(), UnlockBufHdr, and WARNING.

Referenced by AbortSubTransaction(), AbortTransaction(), AtProcExit_Buffers(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), and WalWriterMain().

4031 {
4032  BufferDesc *buf = InProgressBuf;
4033 
4034  if (buf)
4035  {
4036  uint32 buf_state;
4037 
4038  /*
4039  * Since LWLockReleaseAll has already been called, we're not holding
4040  * the buffer's io_in_progress_lock. We have to re-acquire it so that
4041  * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
4042  * buffer will be in a busy spin until we succeed in doing this.
4043  */
4044  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
4045 
4046  buf_state = LockBufHdr(buf);
4047  Assert(buf_state & BM_IO_IN_PROGRESS);
4048  if (IsForInput)
4049  {
4050  Assert(!(buf_state & BM_DIRTY));
4051 
4052  /* We'd better not think buffer is valid yet */
4053  Assert(!(buf_state & BM_VALID));
4054  UnlockBufHdr(buf, buf_state);
4055  }
4056  else
4057  {
4058  Assert(buf_state & BM_DIRTY);
4059  UnlockBufHdr(buf, buf_state);
4060  /* Issue notice if this is not the first failure... */
4061  if (buf_state & BM_IO_ERROR)
4062  {
4063  /* Buffer is pinned, so we can read tag without spinlock */
4064  char *path;
4065 
4066  path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
4067  ereport(WARNING,
4068  (errcode(ERRCODE_IO_ERROR),
4069  errmsg("could not write block %u of %s",
4070  buf->tag.blockNum, path),
4071  errdetail("Multiple failures --- write error might be permanent.")));
4072  pfree(path);
4073  }
4074  }
4075  TerminateBufferIO(buf, false, BM_IO_ERROR);
4076  }
4077 }

◆ AtEOXact_Buffers()

void AtEOXact_Buffers ( bool  isCommit)

Definition at line 2423 of file bufmgr.c.

References Assert, AtEOXact_LocalBuffers(), CheckForBufferLeaks(), and PrivateRefCountOverflowed.

Referenced by AbortTransaction(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), CommitTransaction(), PrepareTransaction(), and WalWriterMain().

2424 {
2425  CheckForBufferLeaks();
2426 
2427  AtEOXact_LocalBuffers(isCommit);
2428 
2429  Assert(PrivateRefCountOverflowed == 0);
2430 }

◆ AtProcExit_Buffers()

static void AtProcExit_Buffers ( int  code,
Datum  arg 
)
static

Definition at line 2479 of file bufmgr.c.

References AbortBufferIO(), AtProcExit_LocalBuffers(), CheckForBufferLeaks(), and UnlockBuffers().

Referenced by InitBufferPoolBackend().

2480 {
2481  AbortBufferIO();
2482  UnlockBuffers();
2483 
2484  CheckForBufferLeaks();
2485 
2486  /* localbuf.c needs a chance too */
2487  AtProcExit_LocalBuffers();
2488 }

◆ BgBufferSync()

bool BgBufferSync ( WritebackContext *  wb_context)

Definition at line 2053 of file bufmgr.c.

References Assert, bgwriter_lru_maxpages, bgwriter_lru_multiplier, BgWriterDelay, BgWriterStats, BUF_REUSABLE, BUF_WRITTEN, CurrentResourceOwner, DEBUG1, DEBUG2, elog, PgStat_MsgBgWriter::m_buf_alloc, PgStat_MsgBgWriter::m_buf_written_clean, PgStat_MsgBgWriter::m_maxwritten_clean, NBuffers, ResourceOwnerEnlargeBuffers(), StrategySyncStart(), and SyncOneBuffer().

Referenced by BackgroundWriterMain().

2054 {
2055  /* info obtained from freelist.c */
2056  int strategy_buf_id;
2057  uint32 strategy_passes;
2058  uint32 recent_alloc;
2059 
2060  /*
2061  * Information saved between calls so we can determine the strategy
2062  * point's advance rate and avoid scanning already-cleaned buffers.
2063  */
2064  static bool saved_info_valid = false;
2065  static int prev_strategy_buf_id;
2066  static uint32 prev_strategy_passes;
2067  static int next_to_clean;
2068  static uint32 next_passes;
2069 
2070  /* Moving averages of allocation rate and clean-buffer density */
2071  static float smoothed_alloc = 0;
2072  static float smoothed_density = 10.0;
2073 
2074  /* Potentially these could be tunables, but for now, not */
2075  float smoothing_samples = 16;
2076  float scan_whole_pool_milliseconds = 120000.0;
2077 
2078  /* Used to compute how far we scan ahead */
2079  long strategy_delta;
2080  int bufs_to_lap;
2081  int bufs_ahead;
2082  float scans_per_alloc;
2083  int reusable_buffers_est;
2084  int upcoming_alloc_est;
2085  int min_scan_buffers;
2086 
2087  /* Variables for the scanning loop proper */
2088  int num_to_scan;
2089  int num_written;
2090  int reusable_buffers;
2091 
2092  /* Variables for final smoothed_density update */
2093  long new_strategy_delta;
2094  uint32 new_recent_alloc;
2095 
2096  /*
2097  * Find out where the freelist clock sweep currently is, and how many
2098  * buffer allocations have happened since our last call.
2099  */
2100  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
2101 
2102  /* Report buffer alloc counts to pgstat */
2103  BgWriterStats.m_buf_alloc += recent_alloc;
2104 
2105  /*
2106  * If we're not running the LRU scan, just stop after doing the stats
2107  * stuff. We mark the saved state invalid so that we can recover sanely
2108  * if LRU scan is turned back on later.
2109  */
2110  if (bgwriter_lru_maxpages <= 0)
2111  {
2112  saved_info_valid = false;
2113  return true;
2114  }
2115 
2116  /*
2117  * Compute strategy_delta = how many buffers have been scanned by the
2118  * clock sweep since last time. If first time through, assume none. Then
2119  * see if we are still ahead of the clock sweep, and if so, how many
2120  * buffers we could scan before we'd catch up with it and "lap" it. Note:
2121  * weird-looking coding of xxx_passes comparisons are to avoid bogus
2122  * behavior when the passes counts wrap around.
2123  */
2124  if (saved_info_valid)
2125  {
2126  int32 passes_delta = strategy_passes - prev_strategy_passes;
2127 
2128  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2129  strategy_delta += (long) passes_delta * NBuffers;
2130 
2131  Assert(strategy_delta >= 0);
2132 
2133  if ((int32) (next_passes - strategy_passes) > 0)
2134  {
2135  /* we're one pass ahead of the strategy point */
2136  bufs_to_lap = strategy_buf_id - next_to_clean;
2137 #ifdef BGW_DEBUG
2138  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2139  next_passes, next_to_clean,
2140  strategy_passes, strategy_buf_id,
2141  strategy_delta, bufs_to_lap);
2142 #endif
2143  }
2144  else if (next_passes == strategy_passes &&
2145  next_to_clean >= strategy_buf_id)
2146  {
2147  /* on same pass, but ahead or at least not behind */
2148  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
2149 #ifdef BGW_DEBUG
2150  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2151  next_passes, next_to_clean,
2152  strategy_passes, strategy_buf_id,
2153  strategy_delta, bufs_to_lap);
2154 #endif
2155  }
2156  else
2157  {
2158  /*
2159  * We're behind, so skip forward to the strategy point and start
2160  * cleaning from there.
2161  */
2162 #ifdef BGW_DEBUG
2163  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
2164  next_passes, next_to_clean,
2165  strategy_passes, strategy_buf_id,
2166  strategy_delta);
2167 #endif
2168  next_to_clean = strategy_buf_id;
2169  next_passes = strategy_passes;
2170  bufs_to_lap = NBuffers;
2171  }
2172  }
2173  else
2174  {
2175  /*
2176  * Initializing at startup or after LRU scanning had been off. Always
2177  * start at the strategy point.
2178  */
2179 #ifdef BGW_DEBUG
2180  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
2181  strategy_passes, strategy_buf_id);
2182 #endif
2183  strategy_delta = 0;
2184  next_to_clean = strategy_buf_id;
2185  next_passes = strategy_passes;
2186  bufs_to_lap = NBuffers;
2187  }
2188 
2189  /* Update saved info for next time */
2190  prev_strategy_buf_id = strategy_buf_id;
2191  prev_strategy_passes = strategy_passes;
2192  saved_info_valid = true;
2193 
2194  /*
2195  * Compute how many buffers had to be scanned for each new allocation, ie,
2196  * 1/density of reusable buffers, and track a moving average of that.
2197  *
2198  * If the strategy point didn't move, we don't update the density estimate
2199  */
2200  if (strategy_delta > 0 && recent_alloc > 0)
2201  {
2202  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
2203  smoothed_density += (scans_per_alloc - smoothed_density) /
2204  smoothing_samples;
2205  }
2206 
2207  /*
2208  * Estimate how many reusable buffers there are between the current
2209  * strategy point and where we've scanned ahead to, based on the smoothed
2210  * density estimate.
2211  */
2212  bufs_ahead = NBuffers - bufs_to_lap;
2213  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
2214 
2215  /*
2216  * Track a moving average of recent buffer allocations. Here, rather than
2217  * a true average we want a fast-attack, slow-decline behavior: we
2218  * immediately follow any increase.
2219  */
2220  if (smoothed_alloc <= (float) recent_alloc)
2221  smoothed_alloc = recent_alloc;
2222  else
2223  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
2224  smoothing_samples;
2225 
2226  /* Scale the estimate by a GUC to allow more aggressive tuning. */
2227  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
2228 
2229  /*
2230  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
2231  * eventually underflow to zero, and the underflows produce annoying
2232  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
2233  * zero, there's no point in tracking smaller and smaller values of
2234  * smoothed_alloc, so just reset it to exactly zero to avoid this
2235  * syndrome. It will pop back up as soon as recent_alloc increases.
2236  */
2237  if (upcoming_alloc_est == 0)
2238  smoothed_alloc = 0;
2239 
2240  /*
2241  * Even in cases where there's been little or no buffer allocation
2242  * activity, we want to make a small amount of progress through the buffer
2243  * cache so that as many reusable buffers as possible are clean after an
2244  * idle period.
2245  *
2246  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
2247  * the BGW will be called during the scan_whole_pool time; slice the
2248  * buffer pool into that many sections.
2249  */
2250  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
2251 
2252  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
2253  {
2254 #ifdef BGW_DEBUG
2255  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
2256  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
2257 #endif
2258  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
2259  }
2260 
2261  /*
2262  * Now write out dirty reusable buffers, working forward from the
2263  * next_to_clean point, until we have lapped the strategy scan, or cleaned
2264  * enough buffers to match our estimate of the next cycle's allocation
2265  * requirements, or hit the bgwriter_lru_maxpages limit.
2266  */
2267 
2268  /* Make sure we can handle the pin inside SyncOneBuffer */
2269  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2270 
2271  num_to_scan = bufs_to_lap;
2272  num_written = 0;
2273  reusable_buffers = reusable_buffers_est;
2274 
2275  /* Execute the LRU scan */
2276  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
2277  {
2278  int sync_state = SyncOneBuffer(next_to_clean, true,
2279  wb_context);
2280 
2281  if (++next_to_clean >= NBuffers)
2282  {
2283  next_to_clean = 0;
2284  next_passes++;
2285  }
2286  num_to_scan--;
2287 
2288  if (sync_state & BUF_WRITTEN)
2289  {
2290  reusable_buffers++;
2291  if (++num_written >= bgwriter_lru_maxpages)
2292  {
2293  BgWriterStats.m_maxwritten_clean++;
2294  break;
2295  }
2296  }
2297  else if (sync_state & BUF_REUSABLE)
2298  reusable_buffers++;
2299  }
2300 
2301  BgWriterStats.m_buf_written_clean += num_written;
2302 
2303 #ifdef BGW_DEBUG
2304  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
2305  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
2306  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
2307  bufs_to_lap - num_to_scan,
2308  num_written,
2309  reusable_buffers - reusable_buffers_est);
2310 #endif
2311 
2312  /*
2313  * Consider the above scan as being like a new allocation scan.
2314  * Characterize its density and update the smoothed one based on it. This
2315  * effectively halves the moving average period in cases where both the
2316  * strategy and the background writer are doing some useful scanning,
2317  * which is helpful because a long memory isn't as desirable on the
2318  * density estimates.
2319  */
2320  new_strategy_delta = bufs_to_lap - num_to_scan;
2321  new_recent_alloc = reusable_buffers - reusable_buffers_est;
2322  if (new_strategy_delta > 0 && new_recent_alloc > 0)
2323  {
2324  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
2325  smoothed_density += (scans_per_alloc - smoothed_density) /
2326  smoothing_samples;
2327 
2328 #ifdef BGW_DEBUG
2329  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
2330  new_recent_alloc, new_strategy_delta,
2331  scans_per_alloc, smoothed_density);
2332 #endif
2333  }
2334 
2335  /* Return true if OK to hibernate */
2336  return (bufs_to_lap == 0 && recent_alloc == 0);
2337 }
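
The smoothed_alloc and smoothed_density updates above are exponential moving averages with smoothing_samples = 16. A standalone sketch of the same update rule with made-up inputs (not from the source):

#include <stdio.h>

/* same rule as "smoothed += (sample - smoothed) / smoothing_samples" */
static float
ema_update(float smoothed, float sample, float smoothing_samples)
{
    return smoothed + (sample - smoothed) / smoothing_samples;
}

int
main(void)
{
    float density = 10.0f;      /* initial smoothed_density */
    int   i;

    /* three scans_per_alloc samples of 4.0 pull the estimate down:
     * 9.625, then ~9.27, then ~8.94, decaying toward 4.0 */
    for (i = 0; i < 3; i++)
        density = ema_update(density, 4.0f, 16.0f);

    printf("%.3f\n", density);
    return 0;
}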

◆ BufferAlloc()

static BufferDesc * BufferAlloc ( SMgrRelation  smgr,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool *  foundPtr 
)
static

Definition at line 995 of file bufmgr.c.

References Assert, BackendWritebackContext, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_IO_ERROR, BM_JUST_DIRTIED, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BUF_FLAG_MASK, BufferDesc::buf_id, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BUF_USAGECOUNT_ONE, BufferDescriptorGetContentLock, BufferGetLSN, BufMappingPartitionLock, BufTableDelete(), BufTableHashCode(), BufTableInsert(), BufTableLookup(), RelFileNode::dbNode, FlushBuffer(), GetBufferDescriptor, INIT_BUFFERTAG, INIT_FORKNUM, LockBufHdr(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockConditionalAcquire(), LWLockRelease(), RelFileNodeBackend::node, PinBuffer(), PinBuffer_Locked(), RelFileNode::relNode, ReservePrivateRefCountEntry(), ScheduleBufferTagForWriteback(), SMgrRelationData::smgr_rnode, RelFileNode::spcNode, StartBufferIO(), StrategyGetBuffer(), StrategyRejectBuffer(), BufferDesc::tag, UnlockBufHdr, UnpinBuffer(), and XLogNeedsFlush().

Referenced by ReadBuffer_common().

999 {
1000  BufferTag newTag; /* identity of requested block */
1001  uint32 newHash; /* hash value for newTag */
1002  LWLock *newPartitionLock; /* buffer partition lock for it */
1003  BufferTag oldTag; /* previous identity of selected buffer */
1004  uint32 oldHash; /* hash value for oldTag */
1005  LWLock *oldPartitionLock; /* buffer partition lock for it */
1006  uint32 oldFlags;
1007  int buf_id;
1008  BufferDesc *buf;
1009  bool valid;
1010  uint32 buf_state;
1011 
1012  /* create a tag so we can lookup the buffer */
1013  INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
1014 
1015  /* determine its hash code and partition lock ID */
1016  newHash = BufTableHashCode(&newTag);
1017  newPartitionLock = BufMappingPartitionLock(newHash);
1018 
1019  /* see if the block is in the buffer pool already */
1020  LWLockAcquire(newPartitionLock, LW_SHARED);
1021  buf_id = BufTableLookup(&newTag, newHash);
1022  if (buf_id >= 0)
1023  {
1024  /*
1025  * Found it. Now, pin the buffer so no one can steal it from the
1026  * buffer pool, and check to see if the correct data has been loaded
1027  * into the buffer.
1028  */
1029  buf = GetBufferDescriptor(buf_id);
1030 
1031  valid = PinBuffer(buf, strategy);
1032 
1033  /* Can release the mapping lock as soon as we've pinned it */
1034  LWLockRelease(newPartitionLock);
1035 
1036  *foundPtr = true;
1037 
1038  if (!valid)
1039  {
1040  /*
1041  * We can only get here if (a) someone else is still reading in
1042  * the page, or (b) a previous read attempt failed. We have to
1043  * wait for any active read attempt to finish, and then set up our
1044  * own read attempt if the page is still not BM_VALID.
1045  * StartBufferIO does it all.
1046  */
1047  if (StartBufferIO(buf, true))
1048  {
1049  /*
1050  * If we get here, previous attempts to read the buffer must
1051  * have failed ... but we shall bravely try again.
1052  */
1053  *foundPtr = false;
1054  }
1055  }
1056 
1057  return buf;
1058  }
1059 
1060  /*
1061  * Didn't find it in the buffer pool. We'll have to initialize a new
1062  * buffer. Remember to unlock the mapping lock while doing the work.
1063  */
1064  LWLockRelease(newPartitionLock);
1065 
1066  /* Loop here in case we have to try another victim buffer */
1067  for (;;)
1068  {
1069  /*
1070  * Ensure, while the spinlock's not yet held, that there's a free
1071  * refcount entry.
1072  */
1073  ReservePrivateRefCountEntry();
1074 
1075  /*
1076  * Select a victim buffer. The buffer is returned with its header
1077  * spinlock still held!
1078  */
1079  buf = StrategyGetBuffer(strategy, &buf_state);
1080 
1081  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1082 
1083  /* Must copy buffer flags while we still hold the spinlock */
1084  oldFlags = buf_state & BUF_FLAG_MASK;
1085 
1086  /* Pin the buffer and then release the buffer spinlock */
1087  PinBuffer_Locked(buf);
1088 
1089  /*
1090  * If the buffer was dirty, try to write it out. There is a race
1091  * condition here, in that someone might dirty it after we released it
1092  * above, or even while we are writing it out (since our share-lock
1093  * won't prevent hint-bit updates). We will recheck the dirty bit
1094  * after re-locking the buffer header.
1095  */
1096  if (oldFlags & BM_DIRTY)
1097  {
1098  /*
1099  * We need a share-lock on the buffer contents to write it out
1100  * (else we might write invalid data, eg because someone else is
1101  * compacting the page contents while we write). We must use a
1102  * conditional lock acquisition here to avoid deadlock. Even
1103  * though the buffer was not pinned (and therefore surely not
1104  * locked) when StrategyGetBuffer returned it, someone else could
1105  * have pinned and exclusive-locked it by the time we get here. If
1106  * we try to get the lock unconditionally, we'd block waiting for
1107  * them; if they later block waiting for us, deadlock ensues.
1108  * (This has been observed to happen when two backends are both
1109  * trying to split btree index pages, and the second one just
1110  * happens to be trying to split the page the first one got from
1111  * StrategyGetBuffer.)
1112  */
1113  if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
1114  LW_SHARED))
1115  {
1116  /*
1117  * If using a nondefault strategy, and writing the buffer
1118  * would require a WAL flush, let the strategy decide whether
1119  * to go ahead and write/reuse the buffer or to choose another
1120  * victim. We need lock to inspect the page LSN, so this
1121  * can't be done inside StrategyGetBuffer.
1122  */
1123  if (strategy != NULL)
1124  {
1125  XLogRecPtr lsn;
1126 
1127  /* Read the LSN while holding buffer header lock */
1128  buf_state = LockBufHdr(buf);
1129  lsn = BufferGetLSN(buf);
1130  UnlockBufHdr(buf, buf_state);
1131 
1132  if (XLogNeedsFlush(lsn) &&
1133  StrategyRejectBuffer(strategy, buf))
1134  {
1135  /* Drop lock/pin and loop around for another buffer */
1136  LWLockRelease(BufferDescriptorGetContentLock(buf));
1137  UnpinBuffer(buf, true);
1138  continue;
1139  }
1140  }
1141 
1142  /* OK, do the I/O */
1143  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
1144  smgr->smgr_rnode.node.spcNode,
1145  smgr->smgr_rnode.node.dbNode,
1146  smgr->smgr_rnode.node.relNode);
1147 
1148  FlushBuffer(buf, NULL);
1149  LWLockRelease(BufferDescriptorGetContentLock(buf));
1150 
1151  ScheduleBufferTagForWriteback(&BackendWritebackContext,
1152  &buf->tag);
1153 
1154  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
1155  smgr->smgr_rnode.node.spcNode,
1156  smgr->smgr_rnode.node.dbNode,
1157  smgr->smgr_rnode.node.relNode);
1158  }
1159  else
1160  {
1161  /*
1162  * Someone else has locked the buffer, so give it up and loop
1163  * back to get another one.
1164  */
1165  UnpinBuffer(buf, true);
1166  continue;
1167  }
1168  }
1169 
1170  /*
1171  * To change the association of a valid buffer, we'll need to have
1172  * exclusive lock on both the old and new mapping partitions.
1173  */
1174  if (oldFlags & BM_TAG_VALID)
1175  {
1176  /*
1177  * Need to compute the old tag's hashcode and partition lock ID.
1178  * XXX is it worth storing the hashcode in BufferDesc so we need
1179  * not recompute it here? Probably not.
1180  */
1181  oldTag = buf->tag;
1182  oldHash = BufTableHashCode(&oldTag);
1183  oldPartitionLock = BufMappingPartitionLock(oldHash);
1184 
1185  /*
1186  * Must lock the lower-numbered partition first to avoid
1187  * deadlocks.
1188  */
1189  if (oldPartitionLock < newPartitionLock)
1190  {
1191  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1192  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1193  }
1194  else if (oldPartitionLock > newPartitionLock)
1195  {
1196  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1197  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1198  }
1199  else
1200  {
1201  /* only one partition, only one lock */
1202  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1203  }
1204  }
1205  else
1206  {
1207  /* if it wasn't valid, we need only the new partition */
1208  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1209  /* remember we have no old-partition lock or tag */
1210  oldPartitionLock = NULL;
1211  /* keep the compiler quiet about uninitialized variables */
1212  oldHash = 0;
1213  }
1214 
1215  /*
1216  * Try to make a hashtable entry for the buffer under its new tag.
1217  * This could fail because while we were writing someone else
1218  * allocated another buffer for the same block we want to read in.
1219  * Note that we have not yet removed the hashtable entry for the old
1220  * tag.
1221  */
1222  buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
1223 
1224  if (buf_id >= 0)
1225  {
1226  /*
1227  * Got a collision. Someone has already done what we were about to
1228  * do. We'll just handle this as if it were found in the buffer
1229  * pool in the first place. First, give up the buffer we were
1230  * planning to use.
1231  */
1232  UnpinBuffer(buf, true);
1233 
1234  /* Can give up that buffer's mapping partition lock now */
1235  if (oldPartitionLock != NULL &&
1236  oldPartitionLock != newPartitionLock)
1237  LWLockRelease(oldPartitionLock);
1238 
1239  /* remaining code should match code at top of routine */
1240 
1241  buf = GetBufferDescriptor(buf_id);
1242 
1243  valid = PinBuffer(buf, strategy);
1244 
1245  /* Can release the mapping lock as soon as we've pinned it */
1246  LWLockRelease(newPartitionLock);
1247 
1248  *foundPtr = true;
1249 
1250  if (!valid)
1251  {
1252  /*
1253  * We can only get here if (a) someone else is still reading
1254  * in the page, or (b) a previous read attempt failed. We
1255  * have to wait for any active read attempt to finish, and
1256  * then set up our own read attempt if the page is still not
1257  * BM_VALID. StartBufferIO does it all.
1258  */
1259  if (StartBufferIO(buf, true))
1260  {
1261  /*
1262  * If we get here, previous attempts to read the buffer
1263  * must have failed ... but we shall bravely try again.
1264  */
1265  *foundPtr = false;
1266  }
1267  }
1268 
1269  return buf;
1270  }
1271 
1272  /*
1273  * Need to lock the buffer header too in order to change its tag.
1274  */
1275  buf_state = LockBufHdr(buf);
1276 
1277  /*
1278  * Somebody could have pinned or re-dirtied the buffer while we were
1279  * doing the I/O and making the new hashtable entry. If so, we can't
1280  * recycle this buffer; we must undo everything we've done and start
1281  * over with a new victim buffer.
1282  */
1283  oldFlags = buf_state & BUF_FLAG_MASK;
1284  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1 && !(oldFlags & BM_DIRTY))
1285  break;
1286 
1287  UnlockBufHdr(buf, buf_state);
1288  BufTableDelete(&newTag, newHash);
1289  if (oldPartitionLock != NULL &&
1290  oldPartitionLock != newPartitionLock)
1291  LWLockRelease(oldPartitionLock);
1292  LWLockRelease(newPartitionLock);
1293  UnpinBuffer(buf, true);
1294  }
1295 
1296  /*
1297  * Okay, it's finally safe to rename the buffer.
1298  *
1299  * Clearing BM_VALID here is necessary, clearing the dirtybits is just
1300  * paranoia. We also reset the usage_count since any recency of use of
1301  * the old content is no longer relevant. (The usage_count starts out at
1302  * 1 so that the buffer can survive one clock-sweep pass.)
1303  *
1304  * Make sure BM_PERMANENT is set for buffers that must be written at every
1305  * checkpoint. Unlogged buffers only need to be written at shutdown
1306  * checkpoints, except for their "init" forks, which need to be treated
1307  * just like permanent relations.
1308  */
1309  buf->tag = newTag;
1310  buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
1311  BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
1312  BUF_USAGECOUNT_MASK);
1313  if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1314  buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
1315  else
1316  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1317 
1318  UnlockBufHdr(buf, buf_state);
1319 
1320  if (oldPartitionLock != NULL)
1321  {
1322  BufTableDelete(&oldTag, oldHash);
1323  if (oldPartitionLock != newPartitionLock)
1324  LWLockRelease(oldPartitionLock);
1325  }
1326 
1327  LWLockRelease(newPartitionLock);
1328 
1329  /*
1330  * Buffer contents are currently invalid. Try to get the io_in_progress
1331  * lock. If StartBufferIO returns false, then someone else managed to
1332  * read it before we did, so there's nothing left for BufferAlloc() to do.
1333  */
1334  if (StartBufferIO(buf, true))
1335  *foundPtr = false;
1336  else
1337  *foundPtr = true;
1338 
1339  return buf;
1340 }
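
The two-partition locking above follows the usual deadlock-avoidance rule: when both the old and new mapping partitions must be held, the lower-numbered lock is always taken first. A generic sketch of that rule (hypothetical lock pair, not the bufmgr code itself):

/* acquire two LWLocks in a globally consistent order */
static void
acquire_pair(LWLock *a, LWLock *b)
{
    if (a < b)
    {
        LWLockAcquire(a, LW_EXCLUSIVE);
        LWLockAcquire(b, LW_EXCLUSIVE);
    }
    else if (b < a)
    {
        LWLockAcquire(b, LW_EXCLUSIVE);
        LWLockAcquire(a, LW_EXCLUSIVE);
    }
    else
        LWLockAcquire(a, LW_EXCLUSIVE);     /* same partition: one lock */
}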

◆ BufferGetBlockNumber()

BlockNumber BufferGetBlockNumber ( Buffer  buffer)

Definition at line 2613 of file bufmgr.c.

References Assert, buftag::blockNum, BufferIsLocal, BufferIsPinned, GetBufferDescriptor, GetLocalBufferDescriptor, and BufferDesc::tag.

Referenced by _bt_checkpage(), _bt_endpoint(), _bt_finish_split(), _bt_first(), _bt_getroot(), _bt_insert_parent(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_moveright(), _bt_newroot(), _bt_pagedel(), _bt_readnextpage(), _bt_readpage(), _bt_restore_meta(), _bt_search(), _bt_split(), _bt_unlink_halfdead_page(), _bt_walk_left(), _hash_addovflpage(), _hash_checkpage(), _hash_first(), _hash_freeovflpage(), _hash_getnewbuf(), _hash_readnext(), _hash_readpage(), _hash_splitbucket(), allocNewBuffer(), blinsert(), BloomInitMetapage(), brin_doinsert(), brin_doupdate(), brin_getinsertbuffer(), brin_initialize_empty_new_buffer(), brin_page_cleanup(), brin_xlog_insert_update(), brinbuild(), brinGetTupleForHeapBlock(), CheckForSerializableConflictIn(), createPostingTree(), dataBeginPlaceToPageLeaf(), dataPrepareDownlink(), doPickSplit(), entryPrepareDownlink(), fill_seq_with_data(), ginFindParents(), ginFinishSplit(), ginPlaceToPage(), ginRedoDeleteListPages(), ginRedoUpdateMetapage(), ginScanToDelete(), gistbufferinginserttuples(), gistbuild(), gistcheckpage(), gistdeletepage(), gistformdownlink(), gistMemorizeAllDownlinks(), gistplacetopage(), gistRelocateBuildBuffersOnSplit(), gistScanPage(), hash_xlog_add_ovfl_page(), heap_delete(), heap_hot_search_buffer(), heap_insert(), heap_multi_insert(), heap_page_is_all_visible(), heap_prune_chain(), heap_update(), heap_xlog_confirm(), heap_xlog_lock(), makeSublist(), moveLeafs(), moveRightIfItNeeded(), pgstathashindex(), ReadBufferBI(), RelationAddExtraBlocks(), RelationGetBufferForTuple(), RelationPutHeapTuple(), revmap_get_buffer(), revmap_physical_extend(), spgAddNodeAction(), spgbuild(), spgdoinsert(), SpGistSetLastUsedPage(), spgSplitNodeAction(), spgWalk(), startScanEntry(), terminate_brin_buildstate(), vacuumLeafPage(), visibilitymap_clear(), visibilitymap_get_status(), visibilitymap_pin(), visibilitymap_pin_ok(), visibilitymap_set(), and XLogReadBufferExtended().

2614 {
2615  BufferDesc *bufHdr;
2616 
2617  Assert(BufferIsPinned(buffer));
2618 
2619  if (BufferIsLocal(buffer))
2620  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2621  else
2622  bufHdr = GetBufferDescriptor(buffer - 1);
2623 
2624  /* pinned, so OK to read tag without spinlock */
2625  return bufHdr->tag.blockNum;
2626 }
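
A minimal usage sketch (hypothetical caller; rel and blkno are assumed to be valid): the result is only meaningful while the buffer stays pinned, which is why the function asserts BufferIsPinned first.

/* pin the block, read its number back, drop the pin */
Buffer      buf = ReadBuffer(rel, blkno);
BlockNumber got = BufferGetBlockNumber(buf);    /* == blkno */

Assert(got == blkno);
ReleaseBuffer(buf);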
◆ BufferGetLSNAtomic()

XLogRecPtr BufferGetLSNAtomic ( Buffer  buffer)

Definition at line 2876 of file bufmgr.c.

References Assert, BufferGetPage, BufferIsLocal, BufferIsPinned, BufferIsValid, GetBufferDescriptor, LockBufHdr(), PageGetLSN, UnlockBufHdr, and XLogHintBitIsNeeded.

Referenced by _bt_killitems(), _bt_readpage(), gistdoinsert(), gistFindPath(), gistkillitems(), gistScanPage(), SetHintBits(), and XLogSaveBufferForHint().

2877 {
2878  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
2879  char *page = BufferGetPage(buffer);
2880  XLogRecPtr lsn;
2881  uint32 buf_state;
2882 
2883  /*
2884  * If we don't need locking for correctness, fastpath out.
2885  */
2886  if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
2887  return PageGetLSN(page);
2888 
2889  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2890  Assert(BufferIsValid(buffer));
2891  Assert(BufferIsPinned(buffer));
2892 
2893  buf_state = LockBufHdr(bufHdr);
2894  lsn = PageGetLSN(page);
2895  UnlockBufHdr(bufHdr, buf_state);
2896 
2897  return lsn;
2898 }
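
A hedged usage sketch (hypothetical caller holding only a pin on buf): the function takes the buffer header spinlock itself whenever XLogHintBitIsNeeded(), so, unlike the BufferGetLSN() macro, it never hands a torn LSN to a caller that lacks the content lock.

XLogRecPtr  lsn = BufferGetLSNAtomic(buf);

if (XLogNeedsFlush(lsn))
    XLogFlush(lsn);     /* e.g. make WAL durable before acting on the page */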

◆ BufferGetTag()

void BufferGetTag ( Buffer  buffer,
RelFileNode rnode,
ForkNumber forknum,
BlockNumber blknum 
)

Definition at line 2634 of file bufmgr.c.

References Assert, buftag::blockNum, BufferIsLocal, BufferIsPinned, buftag::forkNum, GetBufferDescriptor, GetLocalBufferDescriptor, buftag::rnode, and BufferDesc::tag.

Referenced by fsm_search_avail(), ginRedoInsertEntry(), log_newpage_buffer(), ResolveCminCmaxDuringDecoding(), XLogRegisterBuffer(), and XLogSaveBufferForHint().

2636 {
2637  BufferDesc *bufHdr;
2638 
2639  /* Do the same checks as BufferGetBlockNumber. */
2640  Assert(BufferIsPinned(buffer));
2641 
2642  if (BufferIsLocal(buffer))
2643  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2644  else
2645  bufHdr = GetBufferDescriptor(buffer - 1);
2646 
2647  /* pinned, so OK to read tag without spinlock */
2648  *rnode = bufHdr->tag.rnode;
2649  *forknum = bufHdr->tag.forkNum;
2650  *blknum = bufHdr->tag.blockNum;
2651 }
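
A minimal usage sketch (hypothetical caller with a pinned buffer): the three out-parameters together give the on-disk identity of the page.

RelFileNode rnode;
ForkNumber  forknum;
BlockNumber blkno;

BufferGetTag(buf, &rnode, &forknum, &blkno);
elog(DEBUG1, "buffer holds rel %u, fork %d, block %u",
     rnode.relNode, forknum, blkno);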

◆ BufferIsPermanent()

bool BufferIsPermanent ( Buffer  buffer)

Definition at line 2846 of file bufmgr.c.

References Assert, BM_PERMANENT, BufferIsLocal, BufferIsPinned, BufferIsValid, GetBufferDescriptor, pg_atomic_read_u32(), and BufferDesc::state.

Referenced by SetHintBits().

2847 {
2848  BufferDesc *bufHdr;
2849 
2850  /* Local buffers are used only for temp relations. */
2851  if (BufferIsLocal(buffer))
2852  return false;
2853 
2854  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2855  Assert(BufferIsValid(buffer));
2856  Assert(BufferIsPinned(buffer));
2857 
2858  /*
2859  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
2860  * need not bother with the buffer header spinlock. Even if someone else
2861  * changes the buffer header state while we're doing this, the state is
2862  * changed atomically, so we'll read the old value or the new value, but
2863  * not random garbage.
2864  */
2865  bufHdr = GetBufferDescriptor(buffer - 1);
2866  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
2867 }
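
A hedged sketch of the caller pattern (loosely modeled on the listed caller SetHintBits(); hypothetical helper, not the actual implementation): only permanent buffers are subject to the WAL hint-bit interlock.

/* does a hint-bit update on this buffer need the LSN interlock? */
static bool
hint_requires_lsn_check(Buffer buffer)
{
    /* permanent buffers are WAL-logged for hints when wal_log_hints or
     * data checksums are enabled; temp/unlogged buffers never are */
    return XLogHintBitIsNeeded() && BufferIsPermanent(buffer);
}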

◆ BufferSync()

static void BufferSync ( int  flags)
static

Definition at line 1786 of file bufmgr.c.

References Assert, BgWriterStats, binaryheap_add_unordered(), binaryheap_allocate(), binaryheap_build(), binaryheap_empty, binaryheap_first(), binaryheap_free(), binaryheap_remove_first(), binaryheap_replace_first(), buftag::blockNum, CkptSortItem::blockNum, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_PERMANENT, CkptSortItem::buf_id, BUF_WRITTEN, CHECKPOINT_END_OF_RECOVERY, checkpoint_flush_after, CHECKPOINT_FLUSH_ALL, CHECKPOINT_IS_SHUTDOWN, CheckpointStats, CheckpointWriteDelay(), ckpt_buforder_comparator(), CheckpointStatsData::ckpt_bufs_written, CkptBufferIds, CurrentResourceOwner, DatumGetPointer, buftag::forkNum, CkptSortItem::forkNum, GetBufferDescriptor, i, CkptTsStatus::index, InvalidOid, IssuePendingWritebacks(), LockBufHdr(), PgStat_MsgBgWriter::m_buf_written_checkpoints, NBuffers, CkptTsStatus::num_scanned, CkptTsStatus::num_to_scan, palloc(), pfree(), pg_atomic_read_u32(), PointerGetDatum, CkptTsStatus::progress, CkptTsStatus::progress_slice, qsort, RelFileNode::relNode, CkptSortItem::relNode, repalloc(), ResourceOwnerEnlargeBuffers(), buftag::rnode, RelFileNode::spcNode, BufferDesc::state, SyncOneBuffer(), BufferDesc::tag, ts_ckpt_progress_comparator(), CkptTsStatus::tsId, CkptSortItem::tsId, UnlockBufHdr, and WritebackContextInit().

Referenced by CheckPointBuffers().

1787 {
1788  uint32 buf_state;
1789  int buf_id;
1790  int num_to_scan;
1791  int num_spaces;
1792  int num_processed;
1793  int num_written;
1794  CkptTsStatus *per_ts_stat = NULL;
1795  Oid last_tsid;
1796  binaryheap *ts_heap;
1797  int i;
1798  int mask = BM_DIRTY;
1799  WritebackContext wb_context;
1800 
1801  /* Make sure we can handle the pin inside SyncOneBuffer */
1802  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1803 
1804  /*
1805  * Unless this is a shutdown checkpoint or we have been explicitly told,
1806  * we write only permanent, dirty buffers. But at shutdown or end of
1807  * recovery, we write all dirty buffers.
1808  */
1809  if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
1810  CHECKPOINT_FLUSH_ALL))))
1811  mask |= BM_PERMANENT;
1812 
1813  /*
1814  * Loop over all buffers, and mark the ones that need to be written with
1815  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
1816  * can estimate how much work needs to be done.
1817  *
1818  * This allows us to write only those pages that were dirty when the
1819  * checkpoint began, and not those that get dirtied while it proceeds.
1820  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
1821  * later in this function, or by normal backends or the bgwriter cleaning
1822  * scan, the flag is cleared. Any buffer dirtied after this point won't
1823  * have the flag set.
1824  *
1825  * Note that if we fail to write some buffer, we may leave buffers with
1826  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
1827  * certainly need to be written for the next checkpoint attempt, too.
1828  */
1829  num_to_scan = 0;
1830  for (buf_id = 0; buf_id < NBuffers; buf_id++)
1831  {
1832  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
1833 
1834  /*
1835  * Header spinlock is enough to examine BM_DIRTY, see comment in
1836  * SyncOneBuffer.
1837  */
1838  buf_state = LockBufHdr(bufHdr);
1839 
1840  if ((buf_state & mask) == mask)
1841  {
1842  CkptSortItem *item;
1843 
1844  buf_state |= BM_CHECKPOINT_NEEDED;
1845 
1846  item = &CkptBufferIds[num_to_scan++];
1847  item->buf_id = buf_id;
1848  item->tsId = bufHdr->tag.rnode.spcNode;
1849  item->relNode = bufHdr->tag.rnode.relNode;
1850  item->forkNum = bufHdr->tag.forkNum;
1851  item->blockNum = bufHdr->tag.blockNum;
1852  }
1853 
1854  UnlockBufHdr(bufHdr, buf_state);
1855  }
1856 
1857  if (num_to_scan == 0)
1858  return; /* nothing to do */
1859 
1860  WritebackContextInit(&wb_context, &checkpoint_flush_after);
1861 
1862  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
1863 
1864  /*
1865  * Sort buffers that need to be written to reduce the likelihood of random
1866  * IO. The sorting is also important for the implementation of balancing
1867  * writes between tablespaces. Without balancing writes we'd potentially
1868  * end up writing to the tablespaces one-by-one; possibly overloading the
1869  * underlying system.
1870  */
1871  qsort(CkptBufferIds, num_to_scan, sizeof(CkptSortItem),
1872  ckpt_buforder_comparator);
1873 
1874  num_spaces = 0;
1875 
1876  /*
1877  * Allocate progress status for each tablespace with buffers that need to
1878  * be flushed. This requires the to-be-flushed array to be sorted.
1879  */
1880  last_tsid = InvalidOid;
1881  for (i = 0; i < num_to_scan; i++)
1882  {
1883  CkptTsStatus *s;
1884  Oid cur_tsid;
1885 
1886  cur_tsid = CkptBufferIds[i].tsId;
1887 
1888  /*
1889  * Grow array of per-tablespace status structs, every time a new
1890  * tablespace is found.
1891  */
1892  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
1893  {
1894  Size sz;
1895 
1896  num_spaces++;
1897 
1898  /*
1899  * Not worth adding grow-by-power-of-2 logic here - even with a
1900  * few hundred tablespaces this should be fine.
1901  */
1902  sz = sizeof(CkptTsStatus) * num_spaces;
1903 
1904  if (per_ts_stat == NULL)
1905  per_ts_stat = (CkptTsStatus *) palloc(sz);
1906  else
1907  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
1908 
1909  s = &per_ts_stat[num_spaces - 1];
1910  memset(s, 0, sizeof(*s));
1911  s->tsId = cur_tsid;
1912 
1913  /*
1914  * The first buffer in this tablespace. As CkptBufferIds is sorted
1915  * by tablespace all (s->num_to_scan) buffers in this tablespace
1916  * will follow afterwards.
1917  */
1918  s->index = i;
1919 
1920  /*
1921  * progress_slice will be determined once we know how many buffers
1922  * are in each tablespace, i.e. after this loop.
1923  */
1924 
1925  last_tsid = cur_tsid;
1926  }
1927  else
1928  {
1929  s = &per_ts_stat[num_spaces - 1];
1930  }
1931 
1932  s->num_to_scan++;
1933  }
1934 
1935  Assert(num_spaces > 0);
1936 
1937  /*
1938  * Build a min-heap over the write-progress in the individual tablespaces,
1939  * and compute how large a portion of the total progress a single
1940  * processed buffer is.
1941  */
1942  ts_heap = binaryheap_allocate(num_spaces,
1943  ts_ckpt_progress_comparator,
1944  NULL);
1945 
1946  for (i = 0; i < num_spaces; i++)
1947  {
1948  CkptTsStatus *ts_stat = &per_ts_stat[i];
1949 
1950  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
1951 
1952  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
1953  }
1954 
1955  binaryheap_build(ts_heap);
1956 
1957  /*
1958  * Iterate through to-be-checkpointed buffers and write the ones (still)
1959  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
1960  * tablespaces; otherwise the sorting would lead to only one tablespace
1961  * receiving writes at a time, making inefficient use of the hardware.
1962  */
1963  num_processed = 0;
1964  num_written = 0;
1965  while (!binaryheap_empty(ts_heap))
1966  {
1967  BufferDesc *bufHdr = NULL;
1968  CkptTsStatus *ts_stat = (CkptTsStatus *)
1969  DatumGetPointer(binaryheap_first(ts_heap));
1970 
1971  buf_id = CkptBufferIds[ts_stat->index].buf_id;
1972  Assert(buf_id != -1);
1973 
1974  bufHdr = GetBufferDescriptor(buf_id);
1975 
1976  num_processed++;
1977 
1978  /*
1979  * We don't need to acquire the lock here, because we're only looking
1980  * at a single bit. It's possible that someone else writes the buffer
1981  * and clears the flag right after we check, but that doesn't matter
1982  * since SyncOneBuffer will then do nothing. However, there is a
1983  * further race condition: it's conceivable that between the time we
1984  * examine the bit here and the time SyncOneBuffer acquires the lock,
1985  * someone else not only wrote the buffer but replaced it with another
1986  * page and dirtied it. In that improbable case, SyncOneBuffer will
1987  * write the buffer though we didn't need to. It doesn't seem worth
1988  * guarding against this, though.
1989  */
1990  if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
1991  {
1992  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
1993  {
1994  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
1995  BgWriterStats.m_buf_written_checkpoints++;
1996  num_written++;
1997  }
1998  }
1999 
2000  /*
2001  * Measure progress independent of actually having to flush the buffer
2002  * - otherwise writing become unbalanced.
2003  */
2004  ts_stat->progress += ts_stat->progress_slice;
2005  ts_stat->num_scanned++;
2006  ts_stat->index++;
2007 
2008  /* Have all the buffers from the tablespace been processed? */
2009  if (ts_stat->num_scanned == ts_stat->num_to_scan)
2010  {
2011  binaryheap_remove_first(ts_heap);
2012  }
2013  else
2014  {
2015  /* update heap with the new progress */
2016  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
2017  }
2018 
2019  /*
2020  * Sleep to throttle our I/O rate.
2021  */
2022  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
2023  }
2024 
2025  /* issue all pending flushes */
2026  IssuePendingWritebacks(&wb_context);
2027 
2028  pfree(per_ts_stat);
2029  per_ts_stat = NULL;
2030  binaryheap_free(ts_heap);
2031 
2032  /*
2033  * Update checkpoint statistics. As noted above, this doesn't include
2034  * buffers written by other backends or bgwriter scan.
2035  */
2036  CheckpointStats.ckpt_bufs_written += num_written;
2037 
2038  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
2039 }
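
The cross-tablespace balancing above hinges on progress_slice = num_to_scan / per-tablespace count: each buffer written advances its tablespace's progress by that slice, so all tablespaces reach the shared total together. A toy illustration with made-up counts (not from the source):

#include <stdio.h>

int
main(void)
{
    double total = 300.0;           /* num_to_scan over all tablespaces */
    double slice_a = total / 200.0; /* tablespace A: 200 buffers -> 1.5 */
    double slice_b = total / 100.0; /* tablespace B: 100 buffers -> 3.0 */

    /* two A-writes per B-write keep both progress values equal, which
     * is exactly the interleaving the min-heap over progress produces */
    printf("%.1f == %.1f\n", 2 * slice_a, 1 * slice_b);
    return 0;
}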

◆ buffertag_comparator()

static int buffertag_comparator ( const void *  p1,
const void *  p2 
)
static

Definition at line 4200 of file bufmgr.c.

References buftag::blockNum, buftag::forkNum, buftag::rnode, and rnode_comparator().

Referenced by IssuePendingWritebacks().

4201 {
4202  const BufferTag *ba = (const BufferTag *) a;
4203  const BufferTag *bb = (const BufferTag *) b;
4204  int ret;
4205 
4206  ret = rnode_comparator(&ba->rnode, &bb->rnode);
4207 
4208  if (ret != 0)
4209  return ret;
4210 
4211  if (ba->forkNum < bb->forkNum)
4212  return -1;
4213  if (ba->forkNum > bb->forkNum)
4214  return 1;
4215 
4216  if (ba->blockNum < bb->blockNum)
4217  return -1;
4218  if (ba->blockNum > bb->blockNum)
4219  return 1;
4220 
4221  return 0;
4222 }
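
The comparator's three-level shape (relation, then fork, then block) is exactly what qsort() needs to group neighbouring pages of the same file together. A stand-alone sketch, with SimpleTag as an illustrative stand-in for BufferTag:

#include <stdio.h>
#include <stdlib.h>

typedef struct
{
    int         rel;        /* stands in for the rnode comparison */
    int         fork;
    unsigned    block;
} SimpleTag;

static int
tag_cmp(const void *p1, const void *p2)
{
    const SimpleTag *a = p1;
    const SimpleTag *b = p2;

    if (a->rel != b->rel)
        return (a->rel < b->rel) ? -1 : 1;
    if (a->fork != b->fork)
        return (a->fork < b->fork) ? -1 : 1;
    if (a->block != b->block)
        return (a->block < b->block) ? -1 : 1;
    return 0;
}

int
main(void)
{
    SimpleTag   tags[] = {{2, 0, 7}, {1, 0, 9}, {1, 0, 3}};

    qsort(tags, 3, sizeof(SimpleTag), tag_cmp);
    for (int i = 0; i < 3; i++)     /* prints (1,0,3) (1,0,9) (2,0,7) */
        printf("rel=%d fork=%d block=%u\n",
               tags[i].rel, tags[i].fork, tags[i].block);
    return 0;
}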

◆ BufmgrCommit()

void BufmgrCommit ( void  )

Definition at line 2599 of file bufmgr.c.

Referenced by PrepareTransaction(), and RecordTransactionCommit().

2600 {
2601  /* Nothing to do in bufmgr anymore... */
2602 }

◆ CheckForBufferLeaks()

static void CheckForBufferLeaks ( void  )
static

Definition at line 2498 of file bufmgr.c.

References Assert, PrivateRefCountEntry::buffer, hash_seq_init(), hash_seq_search(), i, InvalidBuffer, PrintBufferLeakWarning(), PrivateRefCountArray, PrivateRefCountOverflowed, and REFCOUNT_ARRAY_ENTRIES.

Referenced by AtEOXact_Buffers(), and AtProcExit_Buffers().

2499 {
2500 #ifdef USE_ASSERT_CHECKING
2501  int RefCountErrors = 0;
2502  PrivateRefCountEntry *res;
2503  int i;
2504 
2505  /* check the array */
2506  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
2507  {
2508  res = &PrivateRefCountArray[i];
2509 
2510  if (res->buffer != InvalidBuffer)
2511  {
2512  PrintBufferLeakWarning(res->buffer);
2513  RefCountErrors++;
2514  }
2515  }
2516 
2517  /* if necessary search the hash */
2518  if (PrivateRefCountOverflowed)
2519  {
2520  HASH_SEQ_STATUS hstat;
2521 
2522  hash_seq_init(&hstat, PrivateRefCountHash);
2523  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
2524  {
2525  PrintBufferLeakWarning(res->buffer);
2526  RefCountErrors++;
2527  }
2528 
2529  }
2530 
2531  Assert(RefCountErrors == 0);
2532 #endif
2533 }

◆ CheckPointBuffers()

void CheckPointBuffers ( int  flags)

Definition at line 2582 of file bufmgr.c.

References BufferSync(), CheckpointStats, CheckpointStatsData::ckpt_sync_end_t, CheckpointStatsData::ckpt_sync_t, CheckpointStatsData::ckpt_write_t, GetCurrentTimestamp(), and ProcessSyncRequests().

Referenced by CheckPointGuts().

2583 {
2584  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
2585  CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
2586  BufferSync(flags);
2587  CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
2588  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
2589  ProcessSyncRequests();
2590  CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
2591  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
2592 }

◆ ckpt_buforder_comparator()

static int ckpt_buforder_comparator ( const void *  pa,
const void *  pb 
)
static

Definition at line 4231 of file bufmgr.c.

References CkptSortItem::blockNum, CkptSortItem::forkNum, CkptSortItem::relNode, and CkptSortItem::tsId.

Referenced by BufferSync().

4232 {
4233  const CkptSortItem *a = (const CkptSortItem *) pa;
4234  const CkptSortItem *b = (const CkptSortItem *) pb;
4235 
4236  /* compare tablespace */
4237  if (a->tsId < b->tsId)
4238  return -1;
4239  else if (a->tsId > b->tsId)
4240  return 1;
4241  /* compare relation */
4242  if (a->relNode < b->relNode)
4243  return -1;
4244  else if (a->relNode > b->relNode)
4245  return 1;
4246  /* compare fork */
4247  else if (a->forkNum < b->forkNum)
4248  return -1;
4249  else if (a->forkNum > b->forkNum)
4250  return 1;
4251  /* compare block number */
4252  else if (a->blockNum < b->blockNum)
4253  return -1;
4254  else if (a->blockNum > b->blockNum)
4255  return 1;
4256  /* equal page IDs are unlikely, but not impossible */
4257  return 0;
4258 }

◆ ComputeIoConcurrency()

bool ComputeIoConcurrency ( int  io_concurrency,
double *  target 
)

Definition at line 469 of file bufmgr.c.

References i, Max, MAX_IO_CONCURRENCY, and Min.

Referenced by check_effective_io_concurrency(), and ExecInitBitmapHeapScan().

470 {
471  double new_prefetch_pages = 0.0;
472  int i;
473 
474  /*
475  * Make sure the io_concurrency value is within valid range; it may have
476  * been forced with a manual pg_tablespace update.
477  */
478  io_concurrency = Min(Max(io_concurrency, 0), MAX_IO_CONCURRENCY);
479 
480  /*----------
481  * The user-visible GUC parameter is the number of drives (spindles),
482  * which we need to translate to a number-of-pages-to-prefetch target.
483  * The target value is stashed in *extra and then assigned to the actual
484  * variable by assign_effective_io_concurrency.
485  *
486  * The expected number of prefetch pages needed to keep N drives busy is:
487  *
488  * drives | I/O requests
489  * -------+----------------
490  * 1 | 1
491  * 2 | 2/1 + 2/2 = 3
492  * 3 | 3/1 + 3/2 + 3/3 = 5 1/2
493  * 4 | 4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
494  * n | n * H(n)
495  *
496  * This is called the "coupon collector problem" and H(n) is called the
497  * harmonic series. This could be approximated by n * ln(n), but for
498  * reasonable numbers of drives we might as well just compute the series.
499  *
500  * Alternatively we could set the target to the number of pages necessary
501  * so that the expected number of active spindles is some arbitrary
502  * percentage of the total. This sounds the same but is actually slightly
503  * different. The result ends up being ln(1-P)/ln((n-1)/n) where P is
504  * that desired fraction.
505  *
506  * Experimental results show that both of these formulas aren't aggressive
507  * enough, but we don't really have any better proposals.
508  *
509  * Note that if io_concurrency = 0 (disabled), we must set target = 0.
510  *----------
511  */
512 
513  for (i = 1; i <= io_concurrency; i++)
514  new_prefetch_pages += (double) io_concurrency / (double) i;
515 
516  *target = new_prefetch_pages;
517 
518  /* This range check shouldn't fail, but let's be paranoid */
519  return (new_prefetch_pages >= 0.0 && new_prefetch_pages < (double) INT_MAX);
520 }
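
To make the table in the comment concrete, this small stand-alone program evaluates the n * H(n) series the same way the loop does; for io_concurrency = 4 it reproduces the 8 1/3 shown above:

#include <stdio.h>

int
main(void)
{
    for (int n = 1; n <= 4; n++)
    {
        double      target = 0.0;

        /* n * H(n), computed term by term as in ComputeIoConcurrency */
        for (int i = 1; i <= n; i++)
            target += (double) n / (double) i;
        printf("io_concurrency=%d -> prefetch target %.3f pages\n",
               n, target);
    }
    return 0;       /* prints 1.000, 3.000, 5.500, 8.333 */
}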

◆ ConditionalLockBuffer()

bool ConditionalLockBuffer ( Buffer  buffer)

Definition at line 3628 of file bufmgr.c.

References Assert, buf, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsValid, GetBufferDescriptor, LW_EXCLUSIVE, and LWLockConditionalAcquire().

Referenced by _bt_doinsert(), _bt_getbuf(), BloomNewBuffer(), ConditionalLockBufferForCleanup(), GinNewBuffer(), gistNewBuffer(), RelationGetBufferForTuple(), spgdoinsert(), SpGistGetBuffer(), SpGistNewBuffer(), and SpGistUpdateMetaPage().

3629 {
3630  BufferDesc *buf;
3631 
3632  Assert(BufferIsValid(buffer));
3633  if (BufferIsLocal(buffer))
3634  return true; /* act as though we got it */
3635 
3636  buf = GetBufferDescriptor(buffer - 1);
3637 
3638  return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
3639  LW_EXCLUSIVE);
3640 }

◆ ConditionalLockBufferForCleanup()

bool ConditionalLockBufferForCleanup ( Buffer  buffer)

Definition at line 3774 of file bufmgr.c.

References Assert, BUF_STATE_GET_REFCOUNT, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid, ConditionalLockBuffer(), GetBufferDescriptor, GetPrivateRefCount(), LocalRefCount, LockBuffer(), LockBufHdr(), PrivateRefCountEntry::refcount, and UnlockBufHdr.

Referenced by _hash_finish_split(), _hash_getbuf_with_condlock_cleanup(), heap_page_prune_opt(), lazy_scan_heap(), and lazy_vacuum_heap().

3775 {
3776  BufferDesc *bufHdr;
3777  uint32 buf_state,
3778  refcount;
3779 
3780  Assert(BufferIsValid(buffer));
3781 
3782  if (BufferIsLocal(buffer))
3783  {
3784  refcount = LocalRefCount[-buffer - 1];
3785  /* There should be exactly one pin */
3786  Assert(refcount > 0);
3787  if (refcount != 1)
3788  return false;
3789  /* Nobody else to wait for */
3790  return true;
3791  }
3792 
3793  /* There should be exactly one local pin */
3794  refcount = GetPrivateRefCount(buffer);
3795  Assert(refcount);
3796  if (refcount != 1)
3797  return false;
3798 
3799  /* Try to acquire lock */
3800  if (!ConditionalLockBuffer(buffer))
3801  return false;
3802 
3803  bufHdr = GetBufferDescriptor(buffer - 1);
3804  buf_state = LockBufHdr(bufHdr);
3805  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
3806 
3807  Assert(refcount > 0);
3808  if (refcount == 1)
3809  {
3810  /* Successfully acquired exclusive lock with pincount 1 */
3811  UnlockBufHdr(bufHdr, buf_state);
3812  return true;
3813  }
3814 
3815  /* Failed, so release the lock */
3816  UnlockBufHdr(bufHdr, buf_state);
3817  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3818  return false;
3819 }
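
Callers such as heap_page_prune_opt() treat a false return as "not now" rather than an error: the page is simply skipped and revisited later. A minimal sketch of that pattern, assuming the caller already holds a pin; prune_page() is a hypothetical callback standing in for the caller's real work:

#include "postgres.h"
#include "storage/bufmgr.h"

static void
maybe_cleanup_page(Buffer buffer, void (*prune_page) (Buffer))
{
    /* caller already holds a pin on the buffer */
    if (ConditionalLockBufferForCleanup(buffer))
    {
        /* exclusive lock held and we are the only pinner */
        prune_page(buffer);
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    }
    /* otherwise someone else is using the page; just skip it */
}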

◆ DropDatabaseBuffers()

void DropDatabaseBuffers ( Oid  dbid)

Definition at line 3099 of file bufmgr.c.

References buftag::blockNum, buf, BufferDescriptorGetBuffer, RelFileNode::dbNode, elog, buftag::forkNum, BufferDesc::freeNext, GetBufferDescriptor, GetPrivateRefCount(), i, InvalidateBuffer(), InvalidBackendId, LockBufHdr(), LOG, NBuffers, relpathbackend, relpathperm, buftag::rnode, BufferDesc::tag, and UnlockBufHdr.

Referenced by dbase_redo(), dropdb(), and movedb().

3100 {
3101  int i;
3102 
3103  /*
3104  * We needn't consider local buffers, since by assumption the target
3105  * database isn't our own.
3106  */
3107 
3108  for (i = 0; i < NBuffers; i++)
3109  {
3110  BufferDesc *bufHdr = GetBufferDescriptor(i);
3111  uint32 buf_state;
3112 
3113  /*
3114  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3115  * and saves some cycles.
3116  */
3117  if (bufHdr->tag.rnode.dbNode != dbid)
3118  continue;
3119 
3120  buf_state = LockBufHdr(bufHdr);
3121  if (bufHdr->tag.rnode.dbNode == dbid)
3122  InvalidateBuffer(bufHdr); /* releases spinlock */
3123  else
3124  UnlockBufHdr(bufHdr, buf_state);
3125  }
3126 }

◆ DropRelFileNodeBuffers()

void DropRelFileNodeBuffers ( RelFileNodeBackend  rnode,
ForkNumber forkNum,
int  nforks,
BlockNumber firstDelBlock 
)

Definition at line 2927 of file bufmgr.c.

References RelFileNodeBackend::backend, buftag::blockNum, DropRelFileNodeLocalBuffers(), buftag::forkNum, GetBufferDescriptor, i, InvalidateBuffer(), LockBufHdr(), MyBackendId, NBuffers, RelFileNodeBackend::node, RelFileNodeBackendIsTemp, RelFileNodeEquals, buftag::rnode, BufferDesc::tag, and UnlockBufHdr.

Referenced by smgrtruncate().

2929 {
2930  int i;
2931  int j;
2932 
2933  /* If it's a local relation, it's localbuf.c's problem. */
2934  if (RelFileNodeBackendIsTemp(rnode))
2935  {
2936  if (rnode.backend == MyBackendId)
2937  {
2938  for (j = 0; j < nforks; j++)
2939  DropRelFileNodeLocalBuffers(rnode.node, forkNum[j],
2940  firstDelBlock[j]);
2941  }
2942  return;
2943  }
2944 
2945  for (i = 0; i < NBuffers; i++)
2946  {
2947  BufferDesc *bufHdr = GetBufferDescriptor(i);
2948  uint32 buf_state;
2949 
2950  /*
2951  * We can make this a tad faster by prechecking the buffer tag before
2952  * we attempt to lock the buffer; this saves a lot of lock
2953  * acquisitions in typical cases. It should be safe because the
2954  * caller must have AccessExclusiveLock on the relation, or some other
2955  * reason to be certain that no one is loading new pages of the rel
2956  * into the buffer pool. (Otherwise we might well miss such pages
2957  * entirely.) Therefore, while the tag might be changing while we
2958  * look at it, it can't be changing *to* a value we care about, only
2959  * *away* from such a value. So false negatives are impossible, and
2960  * false positives are safe because we'll recheck after getting the
2961  * buffer lock.
2962  *
2963  * We could check forkNum and blockNum as well as the rnode, but the
2964  * incremental win from doing so seems small.
2965  */
2966  if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
2967  continue;
2968 
2969  buf_state = LockBufHdr(bufHdr);
2970 
2971  for (j = 0; j < nforks; j++)
2972  {
2973  if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
2974  bufHdr->tag.forkNum == forkNum[j] &&
2975  bufHdr->tag.blockNum >= firstDelBlock[j])
2976  {
2977  InvalidateBuffer(bufHdr); /* releases spinlock */
2978  break;
2979  }
2980  }
2981  if (j >= nforks)
2982  UnlockBufHdr(bufHdr, buf_state);
2983  }
2984 }
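
The unlocked-precheck idiom described in the comment above generalizes beyond buffer headers: a cheap, possibly stale read filters out non-matches, and only candidates pay for the lock and the authoritative recheck. A toy version against a mutex-protected record (illustrative names, not bufmgr.c code):

#include <pthread.h>
#include <stdio.h>

typedef struct
{
    pthread_mutex_t lock;
    int             owner;      /* stands in for the buffer tag */
} Slot;

static void
drop_if_owned_by(Slot *slot, int target)
{
    /*
     * Unlocked precheck: the read may be stale, but as in the comment
     * above, the value can only change *away* from the one we care about
     * while the caller blocks new references; false negatives are thus
     * impossible and false positives are caught by the recheck below.
     */
    if (slot->owner != target)
        return;

    pthread_mutex_lock(&slot->lock);
    if (slot->owner == target)  /* recheck under the lock */
    {
        slot->owner = -1;       /* "invalidate" the slot */
        printf("slot invalidated\n");
    }
    pthread_mutex_unlock(&slot->lock);
}

int
main(void)
{
    Slot        s = {PTHREAD_MUTEX_INITIALIZER, 42};

    drop_if_owned_by(&s, 42);
    return 0;
}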

◆ DropRelFileNodesAllBuffers()

void DropRelFileNodesAllBuffers ( RelFileNodeBackend rnodes,
int  nnodes 
)

Definition at line 2996 of file bufmgr.c.

References DROP_RELS_BSEARCH_THRESHOLD, DropRelFileNodeAllLocalBuffers(), GetBufferDescriptor, i, InvalidateBuffer(), LockBufHdr(), MyBackendId, NBuffers, RelFileNodeBackend::node, palloc(), pfree(), pg_qsort(), RelFileNodeBackendIsTemp, RelFileNodeEquals, buftag::rnode, rnode_comparator(), BufferDesc::tag, and UnlockBufHdr.

Referenced by smgrdounlink(), and smgrdounlinkall().

2997 {
2998  int i,
2999  n = 0;
3000  RelFileNode *nodes;
3001  bool use_bsearch;
3002 
3003  if (nnodes == 0)
3004  return;
3005 
3006  nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
3007 
3008  /* If it's a local relation, it's localbuf.c's problem. */
3009  for (i = 0; i < nnodes; i++)
3010  {
3011  if (RelFileNodeBackendIsTemp(rnodes[i]))
3012  {
3013  if (rnodes[i].backend == MyBackendId)
3014  DropRelFileNodeAllLocalBuffers(rnodes[i].node);
3015  }
3016  else
3017  nodes[n++] = rnodes[i].node;
3018  }
3019 
3020  /*
3021  * If there are no non-local relations, then we're done. Release the
3022  * memory and return.
3023  */
3024  if (n == 0)
3025  {
3026  pfree(nodes);
3027  return;
3028  }
3029 
3030  /*
3031  * For low number of relations to drop just use a simple walk through, to
3032  * save the bsearch overhead. The threshold to use is more a guess than
3033  * an exactly determined value, as it depends on many factors (CPU and RAM
3034  * speeds, amount of shared buffers etc.).
3035  */
3036  use_bsearch = n > DROP_RELS_BSEARCH_THRESHOLD;
3037 
3038  /* sort the list of rnodes if necessary */
3039  if (use_bsearch)
3040  pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
3041 
3042  for (i = 0; i < NBuffers; i++)
3043  {
3044  RelFileNode *rnode = NULL;
3045  BufferDesc *bufHdr = GetBufferDescriptor(i);
3046  uint32 buf_state;
3047 
3048  /*
3049  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3050  * and saves some cycles.
3051  */
3052 
3053  if (!use_bsearch)
3054  {
3055  int j;
3056 
3057  for (j = 0; j < n; j++)
3058  {
3059  if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
3060  {
3061  rnode = &nodes[j];
3062  break;
3063  }
3064  }
3065  }
3066  else
3067  {
3068  rnode = bsearch((const void *) &(bufHdr->tag.rnode),
3069  nodes, n, sizeof(RelFileNode),
3070  rnode_comparator);
3071 
3072 
3073  /* buffer doesn't belong to any of the given relfilenodes; skip it */
3074  if (rnode == NULL)
3075  continue;
3076 
3077  buf_state = LockBufHdr(bufHdr);
3078  if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
3079  InvalidateBuffer(bufHdr); /* releases spinlock */
3080  else
3081  UnlockBufHdr(bufHdr, buf_state);
3082  }
3083 
3084  pfree(nodes);
3085 }
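
The linear-walk-versus-bsearch choice is a small, self-contained decision. A sketch under the same assumptions (BSEARCH_THRESHOLD mirrors DROP_RELS_BSEARCH_THRESHOLD; plain ints stand in for RelFileNodes):

#include <stdio.h>
#include <stdlib.h>

#define BSEARCH_THRESHOLD 20    /* mirrors DROP_RELS_BSEARCH_THRESHOLD */

static int
int_cmp(const void *a, const void *b)
{
    int         ia = *(const int *) a;
    int         ib = *(const int *) b;

    return (ia > ib) - (ia < ib);
}

static int
contains(int key, const int *set, int n, int use_bsearch)
{
    if (use_bsearch)
        return bsearch(&key, set, n, sizeof(int), int_cmp) != NULL;
    for (int i = 0; i < n; i++)     /* cheap walk for small sets */
        if (set[i] == key)
            return 1;
    return 0;
}

int
main(void)
{
    int         rels[] = {42, 7, 13};
    int         n = 3;
    int         use_bsearch = n > BSEARCH_THRESHOLD;    /* false here */

    if (use_bsearch)                /* sort only when bsearch will be used */
        qsort(rels, n, sizeof(int), int_cmp);
    printf("drop buffer of rel 13? %d\n", contains(13, rels, n, use_bsearch));
    return 0;
}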

◆ FlushBuffer()

static void FlushBuffer ( BufferDesc buf,
SMgrRelation  reln 
)
static

Definition at line 2673 of file bufmgr.c.

References ErrorContextCallback::arg, BufferUsage::blk_write_time, buftag::blockNum, BM_JUST_DIRTIED, BM_PERMANENT, BufferGetLSN, BufHdrGetBlock, ErrorContextCallback::callback, RelFileNode::dbNode, error_context_stack, buftag::forkNum, INSTR_TIME_ADD, INSTR_TIME_GET_MICROSEC, INSTR_TIME_SET_CURRENT, INSTR_TIME_SUBTRACT, InvalidBackendId, LockBufHdr(), RelFileNodeBackend::node, PageSetChecksumCopy(), pgBufferUsage, pgstat_count_buffer_write_time, ErrorContextCallback::previous, RelFileNode::relNode, buftag::rnode, BufferUsage::shared_blks_written, shared_buffer_write_error_callback(), SMgrRelationData::smgr_rnode, smgropen(), smgrwrite(), RelFileNode::spcNode, StartBufferIO(), BufferDesc::tag, TerminateBufferIO(), track_io_timing, UnlockBufHdr, and XLogFlush().

Referenced by BufferAlloc(), FlushDatabaseBuffers(), FlushOneBuffer(), FlushRelationBuffers(), and SyncOneBuffer().

2674 {
2675  XLogRecPtr recptr;
2676  ErrorContextCallback errcallback;
2677  instr_time io_start,
2678  io_time;
2679  Block bufBlock;
2680  char *bufToWrite;
2681  uint32 buf_state;
2682 
2683  /*
2684  * Acquire the buffer's io_in_progress lock. If StartBufferIO returns
2685  * false, then someone else flushed the buffer before we could, so we need
2686  * not do anything.
2687  */
2688  if (!StartBufferIO(buf, false))
2689  return;
2690 
2691  /* Setup error traceback support for ereport() */
2692  errcallback.callback = shared_buffer_write_error_callback;
2693  errcallback.arg = (void *) buf;
2694  errcallback.previous = error_context_stack;
2695  error_context_stack = &errcallback;
2696 
2697  /* Find smgr relation for buffer */
2698  if (reln == NULL)
2699  reln = smgropen(buf->tag.rnode, InvalidBackendId);
2700 
2701  TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
2702  buf->tag.blockNum,
2703  reln->smgr_rnode.node.spcNode,
2704  reln->smgr_rnode.node.dbNode,
2705  reln->smgr_rnode.node.relNode);
2706 
2707  buf_state = LockBufHdr(buf);
2708 
2709  /*
2710  * Run PageGetLSN while holding header lock, since we don't have the
2711  * buffer locked exclusively in all cases.
2712  */
2713  recptr = BufferGetLSN(buf);
2714 
2715  /* To check if block content changes while flushing. - vadim 01/17/97 */
2716  buf_state &= ~BM_JUST_DIRTIED;
2717  UnlockBufHdr(buf, buf_state);
2718 
2719  /*
2720  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
2721  * rule that log updates must hit disk before any of the data-file changes
2722  * they describe do.
2723  *
2724  * However, this rule does not apply to unlogged relations, which will be
2725  * lost after a crash anyway. Most unlogged relation pages do not bear
2726  * LSNs since we never emit WAL records for them, and therefore flushing
2727  * up through the buffer LSN would be useless, but harmless. However,
2728  * GiST indexes use LSNs internally to track page-splits, and therefore
2729  * unlogged GiST pages bear "fake" LSNs generated by
2730  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
2731  * LSN counter could advance past the WAL insertion point; and if it did
2732  * happen, attempting to flush WAL through that location would fail, with
2733  * disastrous system-wide consequences. To make sure that can't happen,
2734  * skip the flush if the buffer isn't permanent.
2735  */
2736  if (buf_state & BM_PERMANENT)
2737  XLogFlush(recptr);
2738 
2739  /*
2740  * Now it's safe to write buffer to disk. Note that no one else should
2741  * have been able to write it while we were busy with log flushing because
2742  * we have the io_in_progress lock.
2743  */
2744  bufBlock = BufHdrGetBlock(buf);
2745 
2746  /*
2747  * Update page checksum if desired. Since we have only shared lock on the
2748  * buffer, other processes might be updating hint bits in it, so we must
2749  * copy the page to private storage if we do checksumming.
2750  */
2751  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
2752 
2753  if (track_io_timing)
2754  INSTR_TIME_SET_CURRENT(io_start);
2755 
2756  /*
2757  * bufToWrite is either the shared buffer or a copy, as appropriate.
2758  */
2759  smgrwrite(reln,
2760  buf->tag.forkNum,
2761  buf->tag.blockNum,
2762  bufToWrite,
2763  false);
2764 
2765  if (track_io_timing)
2766  {
2767  INSTR_TIME_SET_CURRENT(io_time);
2768  INSTR_TIME_SUBTRACT(io_time, io_start);
2769  pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
2770  INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
2771  }
2772 
2773  pgBufferUsage.shared_blks_written++;
2774 
2775  /*
2776  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
2777  * end the io_in_progress state.
2778  */
2779  TerminateBufferIO(buf, true, 0);
2780 
2781  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
2782  buf->tag.blockNum,
2783  reln->smgr_rnode.node.spcNode,
2784  reln->smgr_rnode.node.dbNode,
2785  reln->smgr_rnode.node.relNode);
2786 
2787  /* Pop the error context stack */
2788  error_context_stack = errcallback.previous;
2789 }
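
The WAL-before-data rule enforced here is compact enough to model on its own: a data page may only reach disk once the log is durable up to that page's LSN. A toy illustration (all names invented; real LSNs are XLogRecPtr values):

#include <assert.h>
#include <stdio.h>

static unsigned long flushed_lsn = 0;   /* how far the "WAL" is on disk */

static void
xlog_flush(unsigned long upto)
{
    if (upto > flushed_lsn)
        flushed_lsn = upto;             /* pretend we fsync'd the log */
}

static void
write_page(unsigned long page_lsn)
{
    xlog_flush(page_lsn);               /* log reaches disk first ... */
    assert(flushed_lsn >= page_lsn);    /* ... so the invariant holds ... */
    printf("page with LSN %lu written (log flushed to %lu)\n",
           page_lsn, flushed_lsn);      /* ... before the data write */
}

int
main(void)
{
    write_page(100);
    write_page(250);
    return 0;
}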

◆ FlushDatabaseBuffers()

void FlushDatabaseBuffers ( Oid  dbid)

Definition at line 3302 of file bufmgr.c.

References BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock, CurrentResourceOwner, RelFileNode::dbNode, FlushBuffer(), GetBufferDescriptor, i, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), buftag::rnode, BufferDesc::tag, UnlockBufHdr, and UnpinBuffer().

Referenced by dbase_redo().

3303 {
3304  int i;
3305  BufferDesc *bufHdr;
3306 
3307  /* Make sure we can handle the pin inside the loop */
3308  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3309 
3310  for (i = 0; i < NBuffers; i++)
3311  {
3312  uint32 buf_state;
3313 
3314  bufHdr = GetBufferDescriptor(i);
3315 
3316  /*
3317  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3318  * and saves some cycles.
3319  */
3320  if (bufHdr->tag.rnode.dbNode != dbid)
3321  continue;
3322 
3323  ReservePrivateRefCountEntry();
3324 
3325  buf_state = LockBufHdr(bufHdr);
3326  if (bufHdr->tag.rnode.dbNode == dbid &&
3327  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3328  {
3329  PinBuffer_Locked(bufHdr);
3330  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3331  FlushBuffer(bufHdr, NULL);
3332  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3333  UnpinBuffer(bufHdr, true);
3334  }
3335  else
3336  UnlockBufHdr(bufHdr, buf_state);
3337  }
3338 }

◆ FlushOneBuffer()

void FlushOneBuffer ( Buffer  buffer)

Definition at line 3345 of file bufmgr.c.

References Assert, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsPinned, FlushBuffer(), GetBufferDescriptor, and LWLockHeldByMe().

Referenced by hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), and XLogReadBufferForRedoExtended().

3346 {
3347  BufferDesc *bufHdr;
3348 
3349  /* currently not needed, but no fundamental reason not to support */
3350  Assert(!BufferIsLocal(buffer));
3351 
3352  Assert(BufferIsPinned(buffer));
3353 
3354  bufHdr = GetBufferDescriptor(buffer - 1);
3355 
3356  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
3357 
3358  FlushBuffer(bufHdr, NULL);
3359 }

◆ FlushRelationBuffers()

void FlushRelationBuffers ( Relation  rel)

Definition at line 3204 of file bufmgr.c.

References ErrorContextCallback::arg, buftag::blockNum, BM_DIRTY, BM_JUST_DIRTIED, BM_VALID, BufferDescriptorGetContentLock, ErrorContextCallback::callback, CurrentResourceOwner, error_context_stack, FlushBuffer(), buftag::forkNum, GetBufferDescriptor, GetLocalBufferDescriptor, i, local_buffer_write_error_callback(), LocalBufHdrGetBlock, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, NLocBuffer, PageSetChecksumInplace(), pg_atomic_read_u32(), pg_atomic_unlocked_write_u32(), PinBuffer_Locked(), ErrorContextCallback::previous, RelationData::rd_node, RelationData::rd_smgr, RelationOpenSmgr, RelationUsesLocalBuffers, RelFileNodeEquals, ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), buftag::rnode, smgrwrite(), BufferDesc::state, BufferDesc::tag, UnlockBufHdr, and UnpinBuffer().

Referenced by heap_sync(), heapam_relation_copy_data(), and index_copy_data().

3205 {
3206  int i;
3207  BufferDesc *bufHdr;
3208 
3209  /* Open rel at the smgr level if not already done */
3210  RelationOpenSmgr(rel);
3211 
3212  if (RelationUsesLocalBuffers(rel))
3213  {
3214  for (i = 0; i < NLocBuffer; i++)
3215  {
3216  uint32 buf_state;
3217 
3218  bufHdr = GetLocalBufferDescriptor(i);
3219  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3220  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
3221  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3222  {
3223  ErrorContextCallback errcallback;
3224  Page localpage;
3225 
3226  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
3227 
3228  /* Setup error traceback support for ereport() */
3229  errcallback.callback = local_buffer_write_error_callback;
3230  errcallback.arg = (void *) bufHdr;
3231  errcallback.previous = error_context_stack;
3232  error_context_stack = &errcallback;
3233 
3234  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
3235 
3236  smgrwrite(rel->rd_smgr,
3237  bufHdr->tag.forkNum,
3238  bufHdr->tag.blockNum,
3239  localpage,
3240  false);
3241 
3242  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
3243  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
3244 
3245  /* Pop the error context stack */
3246  error_context_stack = errcallback.previous;
3247  }
3248  }
3249 
3250  return;
3251  }
3252 
3253  /* Make sure we can handle the pin inside the loop */
3254  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3255 
3256  for (i = 0; i < NBuffers; i++)
3257  {
3258  uint32 buf_state;
3259 
3260  bufHdr = GetBufferDescriptor(i);
3261 
3262  /*
3263  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3264  * and saves some cycles.
3265  */
3266  if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
3267  continue;
3268 
3269  ReservePrivateRefCountEntry();
3270 
3271  buf_state = LockBufHdr(bufHdr);
3272  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3273  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3274  {
3275  PinBuffer_Locked(bufHdr);
3276  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3277  FlushBuffer(bufHdr, rel->rd_smgr);
3278  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3279  UnpinBuffer(bufHdr, true);
3280  }
3281  else
3282  UnlockBufHdr(bufHdr, buf_state);
3283  }
3284 }

◆ ForgetPrivateRefCountEntry()

static void ForgetPrivateRefCountEntry ( PrivateRefCountEntry ref)
static

Definition at line 382 of file bufmgr.c.

References Assert, PrivateRefCountEntry::buffer, HASH_REMOVE, hash_search(), InvalidBuffer, PrivateRefCountArray, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, and REFCOUNT_ARRAY_ENTRIES.

Referenced by UnpinBuffer().

383 {
384  Assert(ref->refcount == 0);
385 
386  if (ref >= &PrivateRefCountArray[0] &&
387  ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
388  {
389  ref->buffer = InvalidBuffer;
390 
391  /*
392  * Mark the just used entry as reserved - in many scenarios that
393  * allows us to avoid ever having to search the array/hash for free
394  * entries.
395  */
396  ReservedRefCountEntry = ref;
397  }
398  else
399  {
400  bool found;
401  Buffer buffer = ref->buffer;
402 
403  hash_search(PrivateRefCountHash,
404  (void *) &buffer,
405  HASH_REMOVE,
406  &found);
407  Assert(found);
408  Assert(PrivateRefCountOverflowed > 0);
409  PrivateRefCountOverflowed--;
410  }
411 }

◆ GetPrivateRefCount()

static int32 GetPrivateRefCount ( Buffer  buffer)
inlinestatic

Definition at line 359 of file bufmgr.c.

References Assert, BufferIsLocal, BufferIsValid, GetPrivateRefCountEntry(), and PrivateRefCountEntry::refcount.

Referenced by ConditionalLockBufferForCleanup(), DropDatabaseBuffers(), HoldingBufferPinThatDelaysRecovery(), InvalidateBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkBufferDirtyHint(), and PrintBufferLeakWarning().

360 {
361  PrivateRefCountEntry *ref;
362 
363  Assert(BufferIsValid(buffer));
364  Assert(!BufferIsLocal(buffer));
365 
366  /*
367  * Not moving the entry - that's ok for the current users, but we might
368  * want to change this one day.
369  */
370  ref = GetPrivateRefCountEntry(buffer, false);
371 
372  if (ref == NULL)
373  return 0;
374  return ref->refcount;
375 }

◆ GetPrivateRefCountEntry()

static PrivateRefCountEntry * GetPrivateRefCountEntry ( Buffer  buffer,
bool  do_move 
)
static

Definition at line 279 of file bufmgr.c.

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid, free, HASH_FIND, HASH_REMOVE, hash_search(), i, InvalidBuffer, PrivateRefCountArray, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, REFCOUNT_ARRAY_ENTRIES, ReservedRefCountEntry, and ReservePrivateRefCountEntry().

Referenced by GetPrivateRefCount(), IncrBufferRefCount(), PinBuffer(), PinBuffer_Locked(), and UnpinBuffer().

280 {
281  PrivateRefCountEntry *res;
282  int i;
283 
284  Assert(BufferIsValid(buffer));
285  Assert(!BufferIsLocal(buffer));
286 
287  /*
288  * First search for references in the array, that'll be sufficient in the
289  * majority of cases.
290  */
291  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
292  {
293  res = &PrivateRefCountArray[i];
294 
295  if (res->buffer == buffer)
296  return res;
297  }
298 
299  /*
300  * By here we know that the buffer, if already pinned, isn't residing in
301  * the array.
302  *
303  * Only look up the buffer in the hashtable if we've previously overflowed
304  * into it.
305  */
306  if (PrivateRefCountOverflowed == 0)
307  return NULL;
308 
309  res = hash_search(PrivateRefCountHash,
310  (void *) &buffer,
311  HASH_FIND,
312  NULL);
313 
314  if (res == NULL)
315  return NULL;
316  else if (!do_move)
317  {
318  /* caller doesn't want us to move the hash entry into the array */
319  return res;
320  }
321  else
322  {
323  /* move buffer from hashtable into the free array slot */
324  bool found;
325  PrivateRefCountEntry *free;
326 
327  /* Ensure there's a free array slot */
328  ReservePrivateRefCountEntry();
329 
330  /* Use up the reserved slot */
331  Assert(ReservedRefCountEntry != NULL);
332  free = ReservedRefCountEntry;
333  ReservedRefCountEntry = NULL;
334  Assert(free->buffer == InvalidBuffer);
335 
336  /* and fill it */
337  free->buffer = buffer;
338  free->refcount = res->refcount;
339 
340  /* delete from hashtable */
341  hash_search(PrivateRefCountHash,
342  (void *) &buffer,
343  HASH_REMOVE,
344  &found);
345  Assert(found);
346  Assert(PrivateRefCountOverflowed > 0);
347  PrivateRefCountOverflowed--;
348 
349  return free;
350  }
351 }
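
The two-tier lookup is the essence of this function: a tiny fixed array covers the common case of few concurrently pinned buffers, and the hash table is consulted only if entries have ever overflowed into it. A hedged, stand-alone sketch (the overflow "table" below is a plain array purely for illustration; bufmgr.c uses a real hash table):

#include <stdio.h>

#define ARRAY_ENTRIES   8           /* mirrors REFCOUNT_ARRAY_ENTRIES */

typedef struct
{
    int         buffer;             /* 0 marks an unused entry here */
    int         refcount;
} Entry;

static Entry array_tier[ARRAY_ENTRIES];     /* fast, fixed-size tier */
static Entry overflow_tier[64];             /* slow tier (hash in bufmgr.c) */
static int  overflowed = 0;                 /* entries in the slow tier */

static Entry *
lookup(int buffer)
{
    /* the array is sufficient in the majority of cases */
    for (int i = 0; i < ARRAY_ENTRIES; i++)
        if (array_tier[i].buffer == buffer)
            return &array_tier[i];

    if (overflowed == 0)            /* common case: skip the slow tier */
        return NULL;

    for (int i = 0; i < overflowed; i++)
        if (overflow_tier[i].buffer == buffer)
            return &overflow_tier[i];
    return NULL;
}

int
main(void)
{
    array_tier[0] = (Entry) {42, 1};
    printf("refcount(42) = %d\n", lookup(42) ? lookup(42)->refcount : 0);
    printf("refcount(7)  = %d\n", lookup(7) ? lookup(7)->refcount : 0);
    return 0;
}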

◆ HoldingBufferPinThatDelaysRecovery()

bool HoldingBufferPinThatDelaysRecovery ( void  )

Definition at line 3748 of file bufmgr.c.

References GetPrivateRefCount(), and GetStartupBufferPinWaitBufId().

Referenced by CheckRecoveryConflictDeadlock(), and RecoveryConflictInterrupt().

3749 {
3750  int bufid = GetStartupBufferPinWaitBufId();
3751 
3752  /*
3753  * If we get woken slowly then it's possible that the Startup process was
3754  * already woken by other backends before we got here. Also possible that
3755  * we get here by multiple interrupts or interrupts at inappropriate
3756  * times, so make sure we do nothing if the bufid is not set.
3757  */
3758  if (bufid < 0)
3759  return false;
3760 
3761  if (GetPrivateRefCount(bufid + 1) > 0)
3762  return true;
3763 
3764  return false;
3765 }

◆ IncrBufferRefCount()

void IncrBufferRefCount ( Buffer  buffer)

Definition at line 3403 of file bufmgr.c.

References Assert, BufferIsLocal, BufferIsPinned, CurrentResourceOwner, GetPrivateRefCountEntry(), LocalRefCount, PrivateRefCountEntry::refcount, ResourceOwnerEnlargeBuffers(), and ResourceOwnerRememberBuffer().

Referenced by _bt_steppage(), btrestrpos(), entryLoadMoreItems(), ReadBufferBI(), scanPostingTree(), startScanEntry(), and tts_buffer_heap_store_tuple().

3404 {
3405  Assert(BufferIsPinned(buffer));
3406  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
3407  if (BufferIsLocal(buffer))
3408  LocalRefCount[-buffer - 1]++;
3409  else
3410  {
3411  PrivateRefCountEntry *ref;
3412 
3413  ref = GetPrivateRefCountEntry(buffer, true);
3414  Assert(ref != NULL);
3415  ref->refcount++;
3416  }
3417  ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
3418 }

◆ InitBufferPoolAccess()

void InitBufferPoolAccess ( void  )

Definition at line 2445 of file bufmgr.c.

References HASHCTL::entrysize, HASH_BLOBS, hash_create(), HASH_ELEM, HASHCTL::keysize, MemSet, and PrivateRefCountArray.

Referenced by BaseInit().

2446 {
2447  HASHCTL hash_ctl;
2448 
2449  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
2450 
2451  MemSet(&hash_ctl, 0, sizeof(hash_ctl));
2452  hash_ctl.keysize = sizeof(int32);
2453  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
2454 
2455  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
2456  HASH_ELEM | HASH_BLOBS);
2457 }

◆ InitBufferPoolBackend()

void InitBufferPoolBackend ( void  )

Definition at line 2469 of file bufmgr.c.

References AtProcExit_Buffers(), and on_shmem_exit().

Referenced by AuxiliaryProcessMain(), and InitPostgres().

2470 {
2471  on_shmem_exit(AtProcExit_Buffers, 0);
2472 }

◆ InvalidateBuffer()

static void InvalidateBuffer ( BufferDesc buf)
static

Definition at line 1360 of file bufmgr.c.

References Assert, BM_LOCKED, BM_TAG_VALID, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer, BUFFERTAGS_EQUAL, BufMappingPartitionLock, BufTableDelete(), BufTableHashCode(), CLEAR_BUFFERTAG, elog, ERROR, GetPrivateRefCount(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u32(), BufferDesc::state, StrategyFreeBuffer(), BufferDesc::tag, UnlockBufHdr, and WaitIO().

Referenced by DropDatabaseBuffers(), DropRelFileNodeBuffers(), and DropRelFileNodesAllBuffers().

1361 {
1362  BufferTag oldTag;
1363  uint32 oldHash; /* hash value for oldTag */
1364  LWLock *oldPartitionLock; /* buffer partition lock for it */
1365  uint32 oldFlags;
1366  uint32 buf_state;
1367 
1368  /* Save the original buffer tag before dropping the spinlock */
1369  oldTag = buf->tag;
1370 
1371  buf_state = pg_atomic_read_u32(&buf->state);
1372  Assert(buf_state & BM_LOCKED);
1373  UnlockBufHdr(buf, buf_state);
1374 
1375  /*
1376  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1377  * worth storing the hashcode in BufferDesc so we need not recompute it
1378  * here? Probably not.
1379  */
1380  oldHash = BufTableHashCode(&oldTag);
1381  oldPartitionLock = BufMappingPartitionLock(oldHash);
1382 
1383 retry:
1384 
1385  /*
1386  * Acquire exclusive mapping lock in preparation for changing the buffer's
1387  * association.
1388  */
1389  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1390 
1391  /* Re-lock the buffer header */
1392  buf_state = LockBufHdr(buf);
1393 
1394  /* If it's changed while we were waiting for lock, do nothing */
1395  if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
1396  {
1397  UnlockBufHdr(buf, buf_state);
1398  LWLockRelease(oldPartitionLock);
1399  return;
1400  }
1401 
1402  /*
1403  * We assume the only reason for it to be pinned is that someone else is
1404  * flushing the page out. Wait for them to finish. (This could be an
1405  * infinite loop if the refcount is messed up... it would be nice to time
1406  * out after awhile, but there seems no way to be sure how many loops may
1407  * be needed. Note that if the other guy has pinned the buffer but not
1408  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1409  * be busy-looping here.)
1410  */
1411  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1412  {
1413  UnlockBufHdr(buf, buf_state);
1414  LWLockRelease(oldPartitionLock);
1415  /* safety check: should definitely not be our *own* pin */
1416  if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
1417  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1418  WaitIO(buf);
1419  goto retry;
1420  }
1421 
1422  /*
1423  * Clear out the buffer's tag and flags. We must do this to ensure that
1424  * linear scans of the buffer array don't think the buffer is valid.
1425  */
1426  oldFlags = buf_state & BUF_FLAG_MASK;
1427  CLEAR_BUFFERTAG(buf->tag);
1428  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1429  UnlockBufHdr(buf, buf_state);
1430 
1431  /*
1432  * Remove the buffer from the lookup hashtable, if it was in there.
1433  */
1434  if (oldFlags & BM_TAG_VALID)
1435  BufTableDelete(&oldTag, oldHash);
1436 
1437  /*
1438  * Done with mapping lock.
1439  */
1440  LWLockRelease(oldPartitionLock);
1441 
1442  /*
1443  * Insert the buffer at the head of the list of free buffers.
1444  */
1445  StrategyFreeBuffer(buf);
1446 }

◆ IsBufferCleanupOK()

bool IsBufferCleanupOK ( Buffer  buffer)

Definition at line 3830 of file bufmgr.c.

References Assert, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsValid, GetBufferDescriptor, GetPrivateRefCount(), LocalRefCount, LockBufHdr(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), and UnlockBufHdr.

Referenced by _hash_doinsert(), _hash_expandtable(), _hash_splitbucket(), hash_xlog_split_allocate_page(), and hashbucketcleanup().

3831 {
3832  BufferDesc *bufHdr;
3833  uint32 buf_state;
3834 
3835  Assert(BufferIsValid(buffer));
3836 
3837  if (BufferIsLocal(buffer))
3838  {
3839  /* There should be exactly one pin */
3840  if (LocalRefCount[-buffer - 1] != 1)
3841  return false;
3842  /* Nobody else to wait for */
3843  return true;
3844  }
3845 
3846  /* There should be exactly one local pin */
3847  if (GetPrivateRefCount(buffer) != 1)
3848  return false;
3849 
3850  bufHdr = GetBufferDescriptor(buffer - 1);
3851 
3852  /* caller must hold exclusive lock on buffer */
3853  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
3854  LW_EXCLUSIVE));
3855 
3856  buf_state = LockBufHdr(bufHdr);
3857 
3858  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3859  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3860  {
3861  /* pincount is OK. */
3862  UnlockBufHdr(bufHdr, buf_state);
3863  return true;
3864  }
3865 
3866  UnlockBufHdr(bufHdr, buf_state);
3867  return false;
3868 }

◆ IssuePendingWritebacks()

void IssuePendingWritebacks ( WritebackContext context)

Definition at line 4334 of file bufmgr.c.

References buftag::blockNum, buffertag_comparator(), cur, buftag::forkNum, i, InvalidBackendId, next, WritebackContext::nr_pending, WritebackContext::pending_writebacks, qsort, RelFileNodeEquals, buftag::rnode, smgropen(), smgrwriteback(), and PendingWriteback::tag.

Referenced by BufferSync(), and ScheduleBufferTagForWriteback().

4335 {
4336  int i;
4337 
4338  if (context->nr_pending == 0)
4339  return;
4340 
4341  /*
4342  * Executing the writes in-order can make them a lot faster, and allows us to
4343  * merge writeback requests to consecutive blocks into larger writebacks.
4344  */
4345  qsort(&context->pending_writebacks, context->nr_pending,
4346  sizeof(PendingWriteback), buffertag_comparator);
4347 
4348  /*
4349  * Coalesce neighbouring writes, but nothing else. For that we iterate
4350  * through the, now sorted, array of pending flushes, and look forward to
4351  * find all neighbouring (or identical) writes.
4352  */
4353  for (i = 0; i < context->nr_pending; i++)
4354  {
4355  PendingWriteback *cur;
4356  PendingWriteback *next;
4357  SMgrRelation reln;
4358  int ahead;
4359  BufferTag tag;
4360  Size nblocks = 1;
4361 
4362  cur = &context->pending_writebacks[i];
4363  tag = cur->tag;
4364 
4365  /*
4366  * Peek ahead, into following writeback requests, to see if they can
4367  * be combined with the current one.
4368  */
4369  for (ahead = 0; i + ahead + 1 < context->nr_pending; ahead++)
4370  {
4371  next = &context->pending_writebacks[i + ahead + 1];
4372 
4373  /* different file, stop */
4374  if (!RelFileNodeEquals(cur->tag.rnode, next->tag.rnode) ||
4375  cur->tag.forkNum != next->tag.forkNum)
4376  break;
4377 
4378  /* ok, block queued twice, skip */
4379  if (cur->tag.blockNum == next->tag.blockNum)
4380  continue;
4381 
4382  /* only merge consecutive writes */
4383  if (cur->tag.blockNum + 1 != next->tag.blockNum)
4384  break;
4385 
4386  nblocks++;
4387  cur = next;
4388  }
4389 
4390  i += ahead;
4391 
4392  /* and finally tell the kernel to write the data to storage */
4393  reln = smgropen(tag.rnode, InvalidBackendId);
4394  smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
4395  }
4396 
4397  context->nr_pending = 0;
4398 }
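
The peek-ahead merge can be exercised in isolation: in the toy below, six queued blocks collapse into two kernel requests. A stand-alone sketch assuming one relation and fork (illustrative only; real requests also carry the relation and fork):

#include <stdio.h>

int
main(void)
{
    /* sorted block numbers within one relation and fork */
    unsigned    blocks[] = {10, 11, 11, 12, 40, 41};
    int         n = 6;

    for (int i = 0; i < n; i++)
    {
        unsigned    start = blocks[i];
        unsigned    cur = blocks[i];
        unsigned    nblocks = 1;
        int         ahead;

        for (ahead = 0; i + ahead + 1 < n; ahead++)
        {
            unsigned    next = blocks[i + ahead + 1];

            if (next == cur)
                continue;           /* same block queued twice: skip */
            if (next != cur + 1)
                break;              /* not consecutive: stop the run */
            nblocks++;
            cur = next;
        }
        i += ahead;

        /* one combined request instead of nblocks separate ones */
        printf("writeback start=%u nblocks=%u\n", start, nblocks);
    }
    return 0;       /* prints start=10 nblocks=3, start=40 nblocks=2 */
}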

◆ local_buffer_write_error_callback()

static void local_buffer_write_error_callback ( void *  arg)
static

Definition at line 4102 of file bufmgr.c.

References buftag::blockNum, errcontext, buftag::forkNum, MyBackendId, pfree(), relpathbackend, buftag::rnode, and BufferDesc::tag.

Referenced by FlushRelationBuffers().

4103 {
4104  BufferDesc *bufHdr = (BufferDesc *) arg;
4105 
4106  if (bufHdr != NULL)
4107  {
4108  char *path = relpathbackend(bufHdr->tag.rnode, MyBackendId,
4109  bufHdr->tag.forkNum);
4110 
4111  errcontext("writing block %u of relation %s",
4112  bufHdr->tag.blockNum, path);
4113  pfree(path);
4114  }
4115 }
BackendId MyBackendId
Definition: globals.c:81
ForkNumber forkNum
Definition: buf_internals.h:93
void pfree(void *pointer)
Definition: mcxt.c:1056
BlockNumber blockNum
Definition: buf_internals.h:94
RelFileNode rnode
Definition: buf_internals.h:92
BufferTag tag
#define errcontext
Definition: elog.h:183
void * arg
#define relpathbackend(rnode, backend, forknum)
Definition: relpath.h:78
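
For orientation, the callback above only takes effect while it sits on the error context stack. Below is a minimal sketch of that plumbing, following the pattern FlushRelationBuffers() uses; write_with_context() is hypothetical, and because the callback is file-static this code would have to live inside bufmgr.c.

#include "postgres.h"
#include "storage/buf_internals.h"

static void
write_with_context(BufferDesc *bufHdr)
{
    ErrorContextCallback errcallback;

    /* Push our callback so any error report gains a "writing block ..." line */
    errcallback.callback = local_buffer_write_error_callback;
    errcallback.arg = (void *) bufHdr;
    errcallback.previous = error_context_stack;
    error_context_stack = &errcallback;

    /* ... perform the local-buffer write here ... */

    /* Pop the error context stack again */
    error_context_stack = errcallback.previous;
}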

◆ LockBuffer()

void LockBuffer ( Buffer  buffer,
int  mode 
)

Definition at line 3602 of file bufmgr.c.

References Assert, buf, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsValid, elog, ERROR, GetBufferDescriptor, LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), and LWLockRelease().

Referenced by _bt_drop_lock_and_maybe_pin(), _bt_endpoint(), _bt_first(), _bt_getbuf(), _bt_getroot(), _bt_killitems(), _bt_moveright(), _bt_pagedel(), _bt_readnextpage(), _bt_relandgetbuf(), _bt_search(), _bt_unlink_halfdead_page(), _bt_update_meta_cleanup_info(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_finish_split(), _hash_first(), _hash_freeovflpage(), _hash_getbuf(), _hash_getbuf_with_strategy(), _hash_getcachedmetap(), _hash_getnewbuf(), _hash_init(), _hash_kill_items(), _hash_readnext(), _hash_readpage(), _hash_readprev(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), blbulkdelete(), blgetbitmap(), blinsert(), BloomNewBuffer(), blvacuumcleanup(), brin_doinsert(), brin_doupdate(), brin_evacuate_page(), brin_getinsertbuffer(), brin_page_cleanup(), brinbuild(), brinbuildempty(), bringetbitmap(), brinGetStats(), brinGetTupleForHeapBlock(), brininsert(), brinLockRevmapPageForUpdate(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), brinsummarize(), bt_metap(), bt_page_items(), bt_page_stats(), btvacuumpage(), checkXLogConsistency(), collect_corrupt_items(), collect_visibility_data(), collectMatchBitmap(), ConditionalLockBufferForCleanup(), count_nondeletable_pages(), entryLoadMoreItems(), fill_seq_with_data(), FreeSpaceMapPrepareTruncateRel(), fsm_readbuf(), fsm_search(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), get_raw_page_internal(), GetVisibilityMapPins(), ginbuildempty(), ginbulkdelete(), ginEntryInsert(), ginFindLeafPage(), ginFindParents(), ginFinishSplit(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), ginInsertValue(), GinNewBuffer(), ginScanToDelete(), ginStepRight(), ginTraverseLock(), ginUpdateStats(), ginvacuumcleanup(), ginVacuumPostingTreeLeaves(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistbuildempty(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfinishsplit(), gistfixsplit(), gistformdownlink(), gistGetMaxLevel(), gistinserttuples(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_compute_xid_horizon_for_tuples(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_get_latest_tid(), heap_inplace_update(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_page_prune_opt(), heap_update(), heap_xlog_visible(), heapam_index_build_range_scan(), heapam_index_fetch_tuple(), heapam_index_validate_scan(), heapam_relation_copy_for_cluster(), heapam_scan_analyze_next_block(), heapam_scan_bitmap_next_block(), heapam_scan_sample_next_tuple(), heapam_tuple_satisfies_snapshot(), heapgetpage(), heapgettup(), initBloomState(), lazy_scan_heap(), LockBufferForCleanup(), log_newpage_range(), palloc_btree_page(), pg_visibility(), pgrowlocks(), pgstat_btree_page(), pgstat_gist_page(), pgstat_heap(), pgstatginindex_internal(), pgstathashindex(), pgstatindex_impl(), read_seq_tuple(), RelationGetBufferForTuple(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), shiftList(), spgdoinsert(), spgGetCache(), SpGistNewBuffer(), spgprocesspending(), spgvacuumpage(), spgWalk(), startScanEntry(), statapprox_heap(), summarize_range(), UnlockReleaseBuffer(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), vm_readbuf(), XLogReadBufferExtended(), XLogReadBufferForRedoExtended(), and XLogRecordPageWithFreeSpace().

3603 {
3604  BufferDesc *buf;
3605 
3606  Assert(BufferIsValid(buffer));
3607  if (BufferIsLocal(buffer))
3608  return; /* local buffers need no lock */
3609 
3610  buf = GetBufferDescriptor(buffer - 1);
3611 
3612  if (mode == BUFFER_LOCK_UNLOCK)
3613  LWLockRelease(BufferDescriptorGetContentLock(buf));
3614  else if (mode == BUFFER_LOCK_SHARE)
3615  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
3616  else if (mode == BUFFER_LOCK_EXCLUSIVE)
3617  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
3618  else
3619  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
3620 }
static PgChecksumMode mode
Definition: pg_checksums.c:61
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:86
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:88
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1726
#define ERROR
Definition: elog.h:43
static char * buf
Definition: pg_test_fsync.c:67
#define GetBufferDescriptor(id)
#define BufferDescriptorGetContentLock(bdesc)
#define Assert(condition)
Definition: c.h:739
#define BufferIsLocal(buffer)
Definition: buf.h:37
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1122
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
#define elog(elevel,...)
Definition: elog.h:228
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:87
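
A minimal sketch of the usual calling pattern, assuming a hypothetical inspect_block() helper: pin the page, take a share lock for reading, and release both together.

#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
inspect_block(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);
    Page        page;

    LockBuffer(buf, BUFFER_LOCK_SHARE);
    page = BufferGetPage(buf);

    /* ... examine the page while the content lock is held ... */
    (void) page;

    UnlockReleaseBuffer(buf);   /* releases the lock and the pin */
}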

◆ LockBufferForCleanup()

void LockBufferForCleanup ( Buffer  buffer)

Definition at line 3659 of file bufmgr.c.

References Assert, BM_PIN_COUNT_WAITER, BUF_STATE_GET_REFCOUNT, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid, elog, ERROR, GetBufferDescriptor, GetPrivateRefCount(), InHotStandby, LocalRefCount, LockBuffer(), LockBufHdr(), MyProcPid, PG_WAIT_BUFFER_PIN, ProcWaitForSignal(), ResolveRecoveryConflictWithBufferPin(), SetStartupBufferPinWaitBufId(), UnlockBufHdr, and BufferDesc::wait_backend_pid.

Referenced by btree_xlog_vacuum(), btvacuumpage(), btvacuumscan(), ginVacuumPostingTree(), hashbulkdelete(), lazy_scan_heap(), ReadBuffer_common(), and XLogReadBufferForRedoExtended().

3660 {
3661  BufferDesc *bufHdr;
3662 
3663  Assert(BufferIsValid(buffer));
3664  Assert(PinCountWaitBuf == NULL);
3665 
3666  if (BufferIsLocal(buffer))
3667  {
3668  /* There should be exactly one pin */
3669  if (LocalRefCount[-buffer - 1] != 1)
3670  elog(ERROR, "incorrect local pin count: %d",
3671  LocalRefCount[-buffer - 1]);
3672  /* Nobody else to wait for */
3673  return;
3674  }
3675 
3676  /* There should be exactly one local pin */
3677  if (GetPrivateRefCount(buffer) != 1)
3678  elog(ERROR, "incorrect local pin count: %d",
3679  GetPrivateRefCount(buffer));
3680 
3681  bufHdr = GetBufferDescriptor(buffer - 1);
3682 
3683  for (;;)
3684  {
3685  uint32 buf_state;
3686 
3687  /* Try to acquire lock */
3688  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3689  buf_state = LockBufHdr(bufHdr);
3690 
3691  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3692  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3693  {
3694  /* Successfully acquired exclusive lock with pincount 1 */
3695  UnlockBufHdr(bufHdr, buf_state);
3696  return;
3697  }
3698  /* Failed, so mark myself as waiting for pincount 1 */
3699  if (buf_state & BM_PIN_COUNT_WAITER)
3700  {
3701  UnlockBufHdr(bufHdr, buf_state);
3702  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3703  elog(ERROR, "multiple backends attempting to wait for pincount 1");
3704  }
3705  bufHdr->wait_backend_pid = MyProcPid;
3706  PinCountWaitBuf = bufHdr;
3707  buf_state |= BM_PIN_COUNT_WAITER;
3708  UnlockBufHdr(bufHdr, buf_state);
3709  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3710 
3711  /* Wait to be signaled by UnpinBuffer() */
3712  if (InHotStandby)
3713  {
3714  /* Publish the bufid that Startup process waits on */
3715  SetStartupBufferPinWaitBufId(buffer - 1);
3716  /* Set alarm and then wait to be signaled by UnpinBuffer() */
3717  ResolveRecoveryConflictWithBufferPin();
3718  /* Reset the published bufid */
3719  SetStartupBufferPinWaitBufId(-1);
3720  }
3721  else
3722  ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
3723 
3724  /*
3725  * Remove flag marking us as waiter. Normally this will not be set
3726  * anymore, but ProcWaitForSignal() can return for other signals as
3727  * well. We take care to only reset the flag if we're the waiter, as
3728  * theoretically another backend could have started waiting. That's
3729  * impossible with the current usages due to table level locking, but
3730  * better be safe.
3731  */
3732  buf_state = LockBufHdr(bufHdr);
3733  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3734  bufHdr->wait_backend_pid == MyProcPid)
3735  buf_state &= ~BM_PIN_COUNT_WAITER;
3736  UnlockBufHdr(bufHdr, buf_state);
3737 
3738  PinCountWaitBuf = NULL;
3739  /* Loop back and try again */
3740  }
3741 }
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:86
int MyProcPid
Definition: globals.c:40
int wait_backend_pid
#define InHotStandby
Definition: xlog.h:74
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:88
#define ERROR
Definition: elog.h:43
void ResolveRecoveryConflictWithBufferPin(void)
Definition: standby.c:461
void SetStartupBufferPinWaitBufId(int bufid)
Definition: proc.c:645
#define GetBufferDescriptor(id)
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:359
unsigned int uint32
Definition: c.h:359
void ProcWaitForSignal(uint32 wait_event_info)
Definition: proc.c:1798
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3602
#define PG_WAIT_BUFFER_PIN
Definition: pgstat.h:756
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4148
#define Assert(condition)
Definition: c.h:739
#define BufferIsLocal(buffer)
Definition: buf.h:37
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
#define UnlockBufHdr(desc, s)
#define elog(elevel,...)
Definition: elog.h:228
static BufferDesc * PinCountWaitBuf
Definition: bufmgr.c:137
int32 * LocalRefCount
Definition: localbuf.c:45
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:48
#define BM_PIN_COUNT_WAITER
Definition: buf_internals.h:64
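
A minimal sketch of the vacuum-style calling pattern, with a hypothetical cleanup_block() helper; trying ConditionalLockBufferForCleanup() first is the caller's choice, not a requirement.

#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
cleanup_block(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);

    if (!ConditionalLockBufferForCleanup(buf))
        LockBufferForCleanup(buf);  /* blocks until we hold the only pin */

    /* Exclusive content lock + sole pin: safe to rearrange the page */

    UnlockReleaseBuffer(buf);
}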

◆ LockBufHdr()

uint32 LockBufHdr ( BufferDesc *  desc)

Definition at line 4148 of file bufmgr.c.

References BM_LOCKED, finish_spin_delay(), init_local_spin_delay, perform_spin_delay(), pg_atomic_fetch_or_u32(), and BufferDesc::state.

Referenced by AbortBufferIO(), apw_dump_now(), BufferAlloc(), BufferGetLSNAtomic(), BufferSync(), ConditionalLockBufferForCleanup(), DropDatabaseBuffers(), DropRelFileNodeBuffers(), DropRelFileNodesAllBuffers(), FlushBuffer(), FlushDatabaseBuffers(), FlushRelationBuffers(), GetBufferFromRing(), InvalidateBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkBufferDirtyHint(), pg_buffercache_pages(), ReadBuffer_common(), StartBufferIO(), StrategyGetBuffer(), SyncOneBuffer(), TerminateBufferIO(), UnlockBuffers(), UnpinBuffer(), and WaitIO().

4149 {
4150  SpinDelayStatus delayStatus;
4151  uint32 old_buf_state;
4152 
4153  init_local_spin_delay(&delayStatus);
4154 
4155  while (true)
4156  {
4157  /* set BM_LOCKED flag */
4158  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
4159  /* if it wasn't set before we're OK */
4160  if (!(old_buf_state & BM_LOCKED))
4161  break;
4162  perform_spin_delay(&delayStatus);
4163  }
4164  finish_spin_delay(&delayStatus);
4165  return old_buf_state | BM_LOCKED;
4166 }
#define init_local_spin_delay(status)
Definition: s_lock.h:1043
void finish_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:174
unsigned int uint32
Definition: c.h:359
#define BM_LOCKED
Definition: buf_internals.h:57
pg_atomic_uint32 state
static uint32 pg_atomic_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_)
Definition: atomics.h:372
void perform_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:124
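
A minimal sketch of the lock/inspect/unlock pattern this function supports, as used by SyncOneBuffer(); buffer_is_dirty() is a hypothetical helper.

#include "postgres.h"
#include "storage/buf_internals.h"

static bool
buffer_is_dirty(BufferDesc *bufHdr)
{
    uint32      buf_state = LockBufHdr(bufHdr); /* returned state has BM_LOCKED set */
    bool        dirty = (buf_state & BM_DIRTY) != 0;

    UnlockBufHdr(bufHdr, buf_state);    /* writes state back, clearing BM_LOCKED */
    return dirty;
}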

◆ MarkBufferDirty()

void MarkBufferDirty ( Buffer  buffer)

Definition at line 1458 of file bufmgr.c.

References Assert, BM_DIRTY, BM_JUST_DIRTIED, BM_LOCKED, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsPinned, BufferIsValid, elog, ERROR, GetBufferDescriptor, LW_EXCLUSIVE, LWLockHeldByMeInMode(), MarkLocalBufferDirty(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), pgBufferUsage, BufferUsage::shared_blks_dirtied, BufferDesc::state, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, VacuumPageDirty, and WaitBufHdrUnlocked().

Referenced by _bt_clear_incomplete_split(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_getroot(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_newroot(), _bt_restore_meta(), _bt_split(), _bt_unlink_halfdead_page(), _bt_update_meta_cleanup_info(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_freeovflpage(), _hash_init(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), addLeafTuple(), brin_doinsert(), brin_doupdate(), brin_initialize_empty_new_buffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinRevmapDesummarizeRange(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), createPostingTree(), do_setval(), doPickSplit(), fill_seq_with_data(), FreeSpaceMapPrepareTruncateRel(), generic_redo(), GenericXLogFinish(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginDeletePage(), ginHeapTupleFastInsert(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginUpdateStats(), ginVacuumPostingTreeLeaf(), gistbuild(), gistbuildempty(), gistdeletepage(), gistplacetopage(), gistprunepage(), gistRedoClearFollowRight(), gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_squeeze_page(), hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_finish_speculative(), heap_inplace_update(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_page_prune(), heap_update(), heap_xlog_clean(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_freeze_page(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_update(), heap_xlog_visible(), lazy_scan_heap(), lazy_vacuum_page(), log_newpage_range(), moveLeafs(), nextval_internal(), RelationGetBufferForTuple(), revmap_physical_extend(), saveNodeLink(), seq_redo(), shiftList(), spgAddNodeAction(), spgbuild(), SpGistUpdateMetaPage(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), vacuumLeafPage(), vacuumLeafRoot(), vacuumRedirectAndPlaceholder(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), writeListPage(), and XLogReadBufferForRedoExtended().

1459 {
1460  BufferDesc *bufHdr;
1461  uint32 buf_state;
1462  uint32 old_buf_state;
1463 
1464  if (!BufferIsValid(buffer))
1465  elog(ERROR, "bad buffer ID: %d", buffer);
1466 
1467  if (BufferIsLocal(buffer))
1468  {
1469  MarkLocalBufferDirty(buffer);
1470  return;
1471  }
1472 
1473  bufHdr = GetBufferDescriptor(buffer - 1);
1474 
1475  Assert(BufferIsPinned(buffer));
1476  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
1477  LW_EXCLUSIVE));
1478 
1479  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
1480  for (;;)
1481  {
1482  if (old_buf_state & BM_LOCKED)
1483  old_buf_state = WaitBufHdrUnlocked(bufHdr);
1484 
1485  buf_state = old_buf_state;
1486 
1487  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1488  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
1489 
1490  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
1491  buf_state))
1492  break;
1493  }
1494 
1495  /*
1496  * If the buffer was not dirty already, do vacuum accounting.
1497  */
1498  if (!(old_buf_state & BM_DIRTY))
1499  {
1500  VacuumPageDirty++;
1501  pgBufferUsage.shared_blks_dirtied++;
1502  if (VacuumCostActive)
1503  VacuumCostBalance += VacuumCostPageDirty;
1504  }
1505 }
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:420
bool LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
Definition: lwlock.c:1860
int VacuumCostBalance
Definition: globals.c:147
static bool pg_atomic_compare_exchange_u32(volatile pg_atomic_uint32 *ptr, uint32 *expected, uint32 newval)
Definition: atomics.h:311
#define BM_DIRTY
Definition: buf_internals.h:58
int VacuumCostPageDirty
Definition: globals.c:139
#define ERROR
Definition: elog.h:43
#define GetBufferDescriptor(id)
#define BM_JUST_DIRTIED
Definition: buf_internals.h:63
long shared_blks_dirtied
Definition: instrument.h:23
unsigned int uint32
Definition: c.h:359
void MarkLocalBufferDirty(Buffer buffer)
Definition: localbuf.c:280
#define BM_LOCKED
Definition: buf_internals.h:57
int VacuumPageDirty
Definition: globals.c:145
#define BufferDescriptorGetContentLock(bdesc)
#define Assert(condition)
Definition: c.h:739
#define BufferIsLocal(buffer)
Definition: buf.h:37
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
static uint32 WaitBufHdrUnlocked(BufferDesc *buf)
Definition: bufmgr.c:4176
#define elog(elevel,...)
Definition: elog.h:228
pg_atomic_uint32 state
BufferUsage pgBufferUsage
Definition: instrument.c:20
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:48
bool VacuumCostActive
Definition: globals.c:148
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:241
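
A minimal sketch of the canonical modification sequence around MarkBufferDirty(), with a hypothetical modify_block() helper; the page change and the WAL insertion are elided, but both belong inside the same critical section as the dirty-marking.

#include "postgres.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
modify_block(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

    START_CRIT_SECTION();
    /* ... change BufferGetPage(buf) ... */
    MarkBufferDirty(buf);
    /* ... XLogInsert() the change, then PageSetLSN() ... */
    END_CRIT_SECTION();

    UnlockReleaseBuffer(buf);
}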

◆ MarkBufferDirtyHint()

void MarkBufferDirtyHint ( Buffer  buffer,
bool  buffer_std 
)

Definition at line 3435 of file bufmgr.c.

References Assert, BM_DIRTY, BM_JUST_DIRTIED, BM_PERMANENT, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetContentLock, BufferGetPage, BufferIsLocal, BufferIsValid, PGXACT::delayChkpt, elog, ERROR, GetBufferDescriptor, GetPrivateRefCount(), InvalidXLogRecPtr, LockBufHdr(), LWLockHeldByMe(), MarkLocalBufferDirty(), MyPgXact, PageSetLSN, pg_atomic_read_u32(), pgBufferUsage, RecoveryInProgress(), BufferUsage::shared_blks_dirtied, BufferDesc::state, UnlockBufHdr, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, VacuumPageDirty, XLogHintBitIsNeeded, XLogRecPtrIsInvalid, and XLogSaveBufferForHint().

Referenced by _bt_check_unique(), _bt_killitems(), _hash_kill_items(), brin_start_evacuating_page(), btvacuumpage(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), gistkillitems(), heap_page_prune(), read_seq_tuple(), SetHintBits(), and XLogRecordPageWithFreeSpace().

3436 {
3437  BufferDesc *bufHdr;
3438  Page page = BufferGetPage(buffer);
3439 
3440  if (!BufferIsValid(buffer))
3441  elog(ERROR, "bad buffer ID: %d", buffer);
3442 
3443  if (BufferIsLocal(buffer))
3444  {
3445  MarkLocalBufferDirty(buffer);
3446  return;
3447  }
3448 
3449  bufHdr = GetBufferDescriptor(buffer - 1);
3450 
3451  Assert(GetPrivateRefCount(buffer) > 0);
3452  /* here, either share or exclusive lock is OK */
3453  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
3454 
3455  /*
3456  * This routine might get called many times on the same page, if we are
3457  * making the first scan after commit of an xact that added/deleted many
3458  * tuples. So, be as quick as we can if the buffer is already dirty. We
3459  * do this by not acquiring spinlock if it looks like the status bits are
3460  * already set. Since we make this test unlocked, there's a chance we
3461  * might fail to notice that the flags have just been cleared, and fail
3462  * to reset them, due to memory-ordering issues. But since this function
3463  * is only intended to be used in cases where failing to write out the
3464  * data would be harmless anyway, it doesn't really matter.
3465  */
3466  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
3467  (BM_DIRTY | BM_JUST_DIRTIED))
3468  {
3469  XLogRecPtr lsn = InvalidXLogRecPtr;
3470  bool dirtied = false;
3471  bool delayChkpt = false;
3472  uint32 buf_state;
3473 
3474  /*
3475  * If we need to protect hint bit updates from torn writes, WAL-log a
3476  * full page image of the page. This full page image is only necessary
3477  * if the hint bit update is the first change to the page since the
3478  * last checkpoint.
3479  *
3480  * We don't check full_page_writes here because that logic is included
3481  * when we call XLogInsert() since the value changes dynamically.
3482  */
3483  if (XLogHintBitIsNeeded() &&
3484  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
3485  {
3486  /*
3487  * If we're in recovery we cannot dirty a page because of a hint.
3488  * We can set the hint, just not dirty the page as a result so the
3489  * hint is lost when we evict the page or shutdown.
3490  *
3491  * See src/backend/storage/page/README for longer discussion.
3492  */
3493  if (RecoveryInProgress())
3494  return;
3495 
3496  /*
3497  * If the block is already dirty because we either made a change
3498  * or set a hint already, then we don't need to write a full page
3499  * image. Note that aggressive cleaning of blocks dirtied by hint
3500  * bit setting would increase the call rate. Bulk setting of hint
3501  * bits would reduce the call rate...
3502  *
3503  * We must issue the WAL record before we mark the buffer dirty.
3504  * Otherwise we might write the page before we write the WAL. That
3505  * causes a race condition, since a checkpoint might occur between
3506  * writing the WAL record and marking the buffer dirty. We solve
3507  * that with a kluge, but one that is already in use during
3508  * transaction commit to prevent race conditions. Basically, we
3509  * simply prevent the checkpoint WAL record from being written
3510  * until we have marked the buffer dirty. We don't start the
3511  * checkpoint flush until we have marked dirty, so our checkpoint
3512  * must flush the change to disk successfully or the checkpoint
3513  * never gets written, so crash recovery will fix it.
3514  *
3515  * It's possible we may enter here without an xid, so it is
3516  * essential that CreateCheckpoint waits for virtual transactions
3517  * rather than full transactionids.
3518  */
3519  MyPgXact->delayChkpt = delayChkpt = true;
3520  lsn = XLogSaveBufferForHint(buffer, buffer_std);
3521  }
3522 
3523  buf_state = LockBufHdr(bufHdr);
3524 
3525  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3526 
3527  if (!(buf_state & BM_DIRTY))
3528  {
3529  dirtied = true; /* Means "will be dirtied by this action" */
3530 
3531  /*
3532  * Set the page LSN if we wrote a backup block. We aren't supposed
3533  * to set this when only holding a share lock but as long as we
3534  * serialise it somehow we're OK. We choose to set LSN while
3535  * holding the buffer header lock, which causes any reader of an
3536  * LSN who holds only a share lock to also obtain a buffer header
3537  * lock before using PageGetLSN(), which is enforced in
3538  * BufferGetLSNAtomic().
3539  *
3540  * If checksums are enabled, you might think we should reset the
3541  * checksum here. That will happen when the page is written
3542  * sometime later in this checkpoint cycle.
3543  */
3544  if (!XLogRecPtrIsInvalid(lsn))
3545  PageSetLSN(page, lsn);
3546  }
3547 
3548  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
3549  UnlockBufHdr(bufHdr, buf_state);
3550 
3551  if (delayChkpt)
3552  MyPgXact->delayChkpt = false;
3553 
3554  if (dirtied)
3555  {
3556  VacuumPageDirty++;
3557  pgBufferUsage.shared_blks_dirtied++;
3558  if (VacuumCostActive)
3559  VacuumCostBalance += VacuumCostPageDirty;
3560  }
3561  }
3562 }
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
#define BM_PERMANENT
Definition: buf_internals.h:66
int VacuumCostBalance
Definition: globals.c:147
bool LWLockHeldByMe(LWLock *l)
Definition: lwlock.c:1842
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
Definition: xloginsert.c:893
bool RecoveryInProgress(void)
Definition: xlog.c:7935
#define BM_DIRTY
Definition: buf_internals.h:58
int VacuumCostPageDirty
Definition: globals.c:139
PGXACT * MyPgXact
Definition: proc.c:68
#define ERROR
Definition: elog.h:43
#define GetBufferDescriptor(id)
#define BM_JUST_DIRTIED
Definition: buf_internals.h:63
long shared_blks_dirtied
Definition: instrument.h:23
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:359
unsigned int uint32
Definition: c.h:359
#define BufferGetPage(buffer)
Definition: bufmgr.h:159
bool delayChkpt
Definition: proc.h:235
void MarkLocalBufferDirty(Buffer buffer)
Definition: localbuf.c:280
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
int VacuumPageDirty
Definition: globals.c:145
#define BufferDescriptorGetContentLock(bdesc)
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4148
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:739
#define BufferIsLocal(buffer)
Definition: buf.h:37
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
#define UnlockBufHdr(desc, s)
#define elog(elevel,...)
Definition: elog.h:228
pg_atomic_uint32 state
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
#define XLogHintBitIsNeeded()
Definition: xlog.h:192
Pointer Page
Definition: bufpage.h:78
BufferUsage pgBufferUsage
Definition: instrument.c:20
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:48
bool VacuumCostActive
Definition: globals.c:148
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:241
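
A minimal sketch of a hint-style update, modeled loosely on SetHintBits(); set_page_hint() is hypothetical, and the actual hint manipulation is elided. A share lock suffices, and no WAL record of the caller's own is written.

#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
set_page_hint(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);

    LockBuffer(buf, BUFFER_LOCK_SHARE);

    /* ... flip a hint bit on the page in place ... */

    /* buffer_std = true: the page uses the standard page layout */
    MarkBufferDirtyHint(buf, true);

    UnlockReleaseBuffer(buf);
}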

◆ NewPrivateRefCountEntry()

static PrivateRefCountEntry * NewPrivateRefCountEntry ( Buffer  buffer)
static

Definition at line 253 of file bufmgr.c.

References Assert, PrivateRefCountEntry::buffer, PrivateRefCountEntry::refcount, and ReservedRefCountEntry.

Referenced by PinBuffer(), and PinBuffer_Locked().

254 {
255  PrivateRefCountEntry *res;
256 
257  /* only allowed to be called when a reservation has been made */
258  Assert(ReservedRefCountEntry != NULL);
259 
260  /* use up the reserved entry */
261  res = ReservedRefCountEntry;
262  ReservedRefCountEntry = NULL;
263 
264  /* and fill it */
265  res->buffer = buffer;
266  res->refcount = 0;
267 
268  return res;
269 }
static PrivateRefCountEntry * ReservedRefCountEntry
Definition: bufmgr.c:173
#define Assert(condition)
Definition: c.h:739

◆ PinBuffer()

static bool PinBuffer ( BufferDesc *  buf,
BufferAccessStrategy  strategy 
)
static

Definition at line 1579 of file bufmgr.c.

References Assert, BM_LOCKED, BM_MAX_USAGE_COUNT, BM_VALID, BUF_REFCOUNT_ONE, BUF_STATE_GET_USAGECOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ReservePrivateRefCountEntry(), ResourceOwnerRememberBuffer(), BufferDesc::state, and WaitBufHdrUnlocked().

Referenced by BufferAlloc().

1580 {
1581  Buffer b = BufferDescriptorGetBuffer(buf);
1582  bool result;
1583  PrivateRefCountEntry *ref;
1584 
1585  ref = GetPrivateRefCountEntry(b, true);
1586 
1587  if (ref == NULL)
1588  {
1589  uint32 buf_state;
1590  uint32 old_buf_state;
1591 
1592  ReservePrivateRefCountEntry();
1593  ref = NewPrivateRefCountEntry(b);
1594 
1595  old_buf_state = pg_atomic_read_u32(&buf->state);
1596  for (;;)
1597  {
1598  if (old_buf_state & BM_LOCKED)
1599  old_buf_state = WaitBufHdrUnlocked(buf);
1600 
1601  buf_state = old_buf_state;
1602 
1603  /* increase refcount */
1604  buf_state += BUF_REFCOUNT_ONE;
1605 
1606  if (strategy == NULL)
1607  {
1608  /* Default case: increase usagecount unless already max. */
1609  if (BUF_STATE_GET_USAGECOUNT(buf_state) != BM_MAX_USAGE_COUNT)
1610  buf_state += BUF_USAGECOUNT_ONE;
1611  }
1612  else
1613  {
1614  /*
1615  * Ring buffers shouldn't evict others from pool. Thus we
1616  * don't make usagecount more than 1.
1617  */
1618  if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
1619  buf_state += BUF_USAGECOUNT_ONE;
1620  }
1621 
1622  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1623  buf_state))
1624  {
1625  result = (buf_state & BM_VALID) != 0;
1626  break;
1627  }
1628  }
1629  }
1630  else
1631  {
1632  /* If we previously pinned the buffer, it must surely be valid */
1633  result = true;
1634  }
1635 
1636  ref->refcount++;
1637  Assert(ref->refcount > 0);
1638  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1639  return result;
1640 }
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition: bufmgr.c:279
static bool pg_atomic_compare_exchange_u32(volatile pg_atomic_uint32 *ptr, uint32 *expected, uint32 newval)
Definition: atomics.h:311
ResourceOwner CurrentResourceOwner
Definition: resowner.c:142
void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
Definition: resowner.c:906
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition: bufmgr.c:253
#define BUF_USAGECOUNT_ONE
Definition: buf_internals.h:43
unsigned int uint32
Definition: c.h:359
#define BUF_REFCOUNT_ONE
Definition: buf_internals.h:40
#define BM_LOCKED
Definition: buf_internals.h:57
#define BM_VALID
Definition: buf_internals.h:59
int result
Definition: header.h:19
#define Assert(condition)
Definition: c.h:739
#define BufferDescriptorGetBuffer(bdesc)
static uint32 WaitBufHdrUnlocked(BufferDesc *buf)
Definition: bufmgr.c:4176
#define BM_MAX_USAGE_COUNT
Definition: buf_internals.h:76
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:187
pg_atomic_uint32 state
#define BUF_STATE_GET_USAGECOUNT(state)
Definition: buf_internals.h:49
int Buffer
Definition: buf.h:23
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:241
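
The pin loop above is an instance of the generic compare-and-swap retry idiom. The sketch below shows that bare idiom on a pg_atomic_uint32, with a hypothetical set_flag_atomically() helper; it is not buffer-specific.

#include "postgres.h"
#include "port/atomics.h"

static void
set_flag_atomically(pg_atomic_uint32 *state, uint32 flag)
{
    uint32      old_state = pg_atomic_read_u32(state);

    for (;;)
    {
        uint32      new_state = old_state | flag;

        /* On failure, old_state is refreshed with the current value */
        if (pg_atomic_compare_exchange_u32(state, &old_state, new_state))
            break;
    }
}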

◆ PinBuffer_Locked()

static void PinBuffer_Locked ( BufferDesc *  buf)
static

Definition at line 1664 of file bufmgr.c.

References Assert, BM_LOCKED, BUF_REFCOUNT_ONE, BufferDescriptorGetBuffer, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ResourceOwnerRememberBuffer(), BufferDesc::state, and UnlockBufHdr.

Referenced by BufferAlloc(), FlushDatabaseBuffers(), FlushRelationBuffers(), and SyncOneBuffer().

1665 {
1666  Buffer b;
1667  PrivateRefCountEntry *ref;
1668  uint32 buf_state;
1669 
1670  /*
1671  * As explained, we don't expect any preexisting pins. That allows us to
1672  * manipulate the PrivateRefCount after releasing the spinlock.
1673  */
1674  Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
1675 
1676  /*
1677  * Since we hold the buffer spinlock, we can update the buffer state and
1678  * release the lock in one operation.
1679  */
1680  buf_state = pg_atomic_read_u32(&buf->state);
1681  Assert(buf_state & BM_LOCKED);
1682  buf_state += BUF_REFCOUNT_ONE;
1683  UnlockBufHdr(buf, buf_state);
1684 
1685  b = BufferDescriptorGetBuffer(buf);
1686 
1687  ref = NewPrivateRefCountEntry(b);
1688  ref->refcount++;
1689 
1690  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1691 }
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition: bufmgr.c:279
ResourceOwner CurrentResourceOwner
Definition: resowner.c:142
void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
Definition: resowner.c:906
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition: bufmgr.c:253
unsigned int uint32
Definition: c.h:359
#define BUF_REFCOUNT_ONE
Definition: buf_internals.h:40
#define BM_LOCKED
Definition: buf_internals.h:57
#define Assert(condition)
Definition: c.h:739
#define BufferDescriptorGetBuffer(bdesc)
#define UnlockBufHdr(desc, s)
pg_atomic_uint32 state
int Buffer
Definition: buf.h:23
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:241

◆ PrefetchBuffer()

void PrefetchBuffer ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 531 of file bufmgr.c.

References Assert, BlockNumberIsValid, BufMappingPartitionLock, BufTableHashCode(), BufTableLookup(), ereport, errcode(), errmsg(), ERROR, INIT_BUFFERTAG, LocalPrefetchBuffer(), LW_SHARED, LWLockAcquire(), LWLockRelease(), RelFileNodeBackend::node, RelationData::rd_smgr, RELATION_IS_OTHER_TEMP, RelationIsValid, RelationOpenSmgr, RelationUsesLocalBuffers, SMgrRelationData::smgr_rnode, and smgrprefetch().

Referenced by BitmapPrefetch(), count_nondeletable_pages(), HeapTupleHeaderAdvanceLatestRemovedXid(), and pg_prewarm().

532 {
533 #ifdef USE_PREFETCH
534  Assert(RelationIsValid(reln));
535  Assert(BlockNumberIsValid(blockNum));
536 
537  /* Open it at the smgr level if not already done */
538  RelationOpenSmgr(reln);
539 
540  if (RelationUsesLocalBuffers(reln))
541  {
542  /* see comments in ReadBufferExtended */
543  if (RELATION_IS_OTHER_TEMP(reln))
544  ereport(ERROR,
545  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
546  errmsg("cannot access temporary tables of other sessions")));
547 
548  /* pass it off to localbuf.c */
549  LocalPrefetchBuffer(reln->rd_smgr, forkNum, blockNum);
550  }
551  else
552  {
553  BufferTag newTag; /* identity of requested block */
554  uint32 newHash; /* hash value for newTag */
555  LWLock *newPartitionLock; /* buffer partition lock for it */
556  int buf_id;
557 
558  /* create a tag so we can lookup the buffer */
559  INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node,
560  forkNum, blockNum);
561 
562  /* determine its hash code and partition lock ID */
563  newHash = BufTableHashCode(&newTag);
564  newPartitionLock = BufMappingPartitionLock(newHash);
565 
566  /* see if the block is in the buffer pool already */
567  LWLockAcquire(newPartitionLock, LW_SHARED);
568  buf_id = BufTableLookup(&newTag, newHash);
569  LWLockRelease(newPartitionLock);
570 
571  /* If not in buffers, initiate prefetch */
572  if (buf_id < 0)
573  smgrprefetch(reln->rd_smgr, forkNum, blockNum);
574 
575  /*
576  * If the block *is* in buffers, we do nothing. This is not really
577  * ideal: the block might be just about to be evicted, which would be
578  * stupid since we know we are going to need it soon. But the only
579  * easy answer is to bump the usage_count, which does not seem like a
580  * great solution: when the caller does ultimately touch the block,
581  * usage_count would get bumped again, resulting in too much
582  * favoritism for blocks that are involved in a prefetch sequence. A
583  * real fix would involve some additional per-buffer state, and it's
584  * not clear that there's enough of a problem to justify that.
585  */
586  }
587 #endif /* USE_PREFETCH */
588 }
Definition: lwlock.h:32
#define BufMappingPartitionLock(hashcode)
void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition: localbuf.c:64
struct SMgrRelationData * rd_smgr
Definition: rel.h:56
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:79
int errcode(int sqlerrcode)
Definition: elog.c:608
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:91
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1726
#define RelationOpenSmgr(relation)
Definition: rel.h:479
#define ERROR
Definition: elog.h:43
#define RelationIsValid(relation)
Definition: rel.h:395
RelFileNodeBackend smgr_rnode
Definition: smgr.h:42
unsigned int uint32
Definition: c.h:359
#define ereport(elevel, rest)
Definition: elog.h:141
#define BlockNumberIsValid(blockNumber)
Definition: block.h:70
RelFileNode node
Definition: relfilenode.h:74
#define Assert(condition)
Definition: c.h:739
#define RELATION_IS_OTHER_TEMP(relation)
Definition: rel.h:552
#define INIT_BUFFERTAG(a, xx_rnode, xx_forkNum, xx_blockNum)
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1122
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:531
int errmsg(const char *fmt,...)
Definition: elog.c:822
void smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition: smgr.c:494
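
A minimal sketch of prefetching ahead of a sequential pass, similar in spirit to what pg_prewarm and bitmap heap scans do; scan_with_prefetch() and the lookahead distance of 8 blocks are hypothetical.

#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
scan_with_prefetch(Relation rel, BlockNumber nblocks)
{
    BlockNumber blkno;

    for (blkno = 0; blkno < nblocks; blkno++)
    {
        /* Keep the kernel reading a fixed distance ahead of us */
        if (blkno + 8 < nblocks)
            PrefetchBuffer(rel, MAIN_FORKNUM, blkno + 8);

        /* ... ReadBuffer(rel, blkno), process it, release ... */
    }
}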

◆ PrintBufferLeakWarning()

void PrintBufferLeakWarning ( Buffer  buffer)

Definition at line 2539 of file bufmgr.c.

References Assert, buftag::blockNum, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BufferIsLocal, BufferIsValid, elog, buftag::forkNum, GetBufferDescriptor, GetLocalBufferDescriptor, GetPrivateRefCount(), InvalidBackendId, LocalRefCount, MyBackendId, pfree(), pg_atomic_read_u32(), relpathbackend, buftag::rnode, BufferDesc::state, BufferDesc::tag, and WARNING.

Referenced by CheckForBufferLeaks(), CheckForLocalBufferLeaks(), and ResourceOwnerReleaseInternal().

2540 {
2541  BufferDesc *buf;
2542  int32 loccount;
2543  char *path;
2544  BackendId backend;
2545  uint32 buf_state;
2546 
2547  Assert(BufferIsValid(buffer));
2548  if (BufferIsLocal(buffer))
2549  {
2550  buf = GetLocalBufferDescriptor(-buffer - 1);
2551  loccount = LocalRefCount[-buffer - 1];
2552  backend = MyBackendId;
2553  }
2554  else
2555  {
2556  buf = GetBufferDescriptor(buffer - 1);
2557  loccount = GetPrivateRefCount(buffer);
2558  backend = InvalidBackendId;
2559  }
2560 
2561  /* theoretically we should lock the bufhdr here */
2562  path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
2563  buf_state = pg_atomic_read_u32(&buf->state);
2564  elog(WARNING,
2565  "buffer refcount leak: [%03d] "
2566  "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
2567  buffer, path,
2568  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
2569  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
2570  pfree(path);
2571 }
BackendId MyBackendId
Definition: globals.c:81
ForkNumber forkNum
Definition: buf_internals.h:93
#define GetLocalBufferDescriptor(id)
signed int int32
Definition: c.h:347
void pfree(void *pointer)
Definition: mcxt.c:1056
#define BUF_FLAG_MASK
Definition: buf_internals.h:45
static char * buf
Definition: pg_test_fsync.c:67
#define GetBufferDescriptor(id)
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:359
unsigned int uint32
Definition: c.h:359
#define WARNING
Definition: elog.h:40
#define InvalidBackendId
Definition: backendid.h:23
int BackendId
Definition: backendid.h:21
#define Assert(condition)
Definition: c.h:739
#define BufferIsLocal(buffer)
Definition: buf.h:37
BlockNumber blockNum
Definition: buf_internals.h:94
#define BufferIsValid(bufnum)
Definition: bufmgr.h:113
RelFileNode rnode
Definition: buf_internals.h:92
BufferTag tag
#define elog(elevel,...)
Definition: elog.h:228
pg_atomic_uint32 state
#define relpathbackend(rnode, backend, forknum)
Definition: relpath.h:78
int32 * LocalRefCount
Definition: localbuf.c:45
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:48
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:241

◆ ReadBuffer()

Buffer ReadBuffer ( Relation  reln,
BlockNumber  blockNum 
)

Definition at line 596 of file bufmgr.c.

References MAIN_FORKNUM, RBM_NORMAL, and ReadBufferExtended().

Referenced by _bt_doinsert(), _bt_getbuf(), _hash_getbuf(), _hash_getbuf_with_condlock_cleanup(), blbulkdelete(), blinsert(), BloomNewBuffer(), brin_getinsertbuffer(), brinbuild(), brinGetStats(), brinGetTupleForHeapBlock(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), bt_metap(), bt_page_items(), bt_page_stats(), fill_seq_with_data(), ginFindLeafPage(), ginFindParents(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), GinNewBuffer(), ginStepRight(), ginUpdateStats(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfixsplit(), gistGetMaxLevel(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), heap_abort_speculative(), heap_compute_xid_horizon_for_tuples(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_get_latest_tid(), heap_inplace_update(), heap_lock_tuple(), heap_update(), initBloomState(), log_newpage_range(), pg_visibility(), pgstatginindex_internal(), read_seq_tuple(), RelationGetBufferForTuple(), ReleaseAndReadBuffer(), revmap_get_buffer(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), shiftList(), spgdoinsert(), spgGetCache(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), and spgWalk().

597 {
598  return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
599 }
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:642
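
A minimal sketch of the P_NEW idiom for extending a relation, with a hypothetical get_new_block() helper; page initialization is elided.

#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static Buffer
get_new_block(Relation rel)
{
    /* P_NEW makes ReadBuffer allocate and zero-fill the next block */
    Buffer      buf = ReadBuffer(rel, P_NEW);

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    /* ... PageInit(BufferGetPage(buf), BufferGetPageSize(buf), 0) ... */
    return buf;     /* caller unlocks and releases when done */
}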

◆ ReadBuffer_common()

static Buffer ReadBuffer_common ( SMgrRelation  reln,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy,
bool *  hit 
)
static

Definition at line 705 of file bufmgr.c.

References Assert, RelFileNodeBackend::backend, BufferUsage::blk_read_time, BM_VALID, BufferAlloc(), BufferDescriptorGetBuffer, BufferDescriptorGetContentLock, BufHdrGetBlock, CurrentResourceOwner, RelFileNode::dbNode, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errhint(), errmsg(), ERROR, INSTR_TIME_ADD, INSTR_TIME_GET_MICROSEC, INSTR_TIME_SET_CURRENT, INSTR_TIME_SUBTRACT, BufferUsage::local_blks_hit, BufferUsage::local_blks_read, BufferUsage::local_blks_written, LocalBufferAlloc(), LocalBufHdrGetBlock, LockBufferForCleanup(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), MemSet, RelFileNodeBackend::node, P_NEW, PageIsNew, PageIsVerified(), pg_atomic_read_u32(), pg_atomic_unlocked_write_u32(), pgBufferUsage, pgstat_count_buffer_read_time, RBM_NORMAL, RBM_NORMAL_NO_LOG, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, RBM_ZERO_ON_ERROR, RelFileNode::relNode, relpath, ResourceOwnerEnlargeBuffers(), BufferUsage::shared_blks_hit, BufferUsage::shared_blks_read, BufferUsage::shared_blks_written, SMgrRelationData::smgr_rnode, smgrextend(), SmgrIsTemp, smgrnblocks(), smgrread(), RelFileNode::spcNode, StartBufferIO(), BufferDesc::state, TerminateBufferIO(), track_io_timing, UnlockBufHdr, VacuumCostActive, VacuumCostBalance, VacuumCostPageHit, VacuumCostPageMiss, VacuumPageHit, VacuumPageMiss, WARNING, and zero_damaged_pages.

Referenced by ReadBufferExtended(), and ReadBufferWithoutRelcache().

708 {
709  BufferDesc *bufHdr;
710  Block bufBlock;
711  bool found;
712  bool isExtend;
713  bool isLocalBuf = SmgrIsTemp(smgr);
714 
715  *hit = false;
716 
717  /* Make sure we will have room to remember the buffer pin */
718  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
719 
720  isExtend = (blockNum == P_NEW);
721 
722  TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
723  smgr->smgr_rnode.node.spcNode,
724  smgr->smgr_rnode.node.dbNode,
725  smgr->smgr_rnode.node.relNode,
726  smgr->smgr_rnode.backend,
727  isExtend);
728 
729  /* Substitute proper block number if caller asked for P_NEW */
730  if (isExtend)
731  blockNum = smgrnblocks(smgr, forkNum);
732 
733  if (isLocalBuf)
734  {
735  bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
736  if (found)
737  pgBufferUsage.local_blks_hit++;
738  else if (isExtend)
739  pgBufferUsage.local_blks_written++;
740  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
741  mode == RBM_ZERO_ON_ERROR)
742  pgBufferUsage.local_blks_read++;
743  }
744  else
745  {
746  /*
747  * look up the buffer. IO_IN_PROGRESS is set if the requested block is
748  * not currently in memory.
749  */
750  bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
751  strategy, &found);
752  if (found)
753  pgBufferUsage.shared_blks_hit++;
754  else if (isExtend)
755  pgBufferUsage.shared_blks_written++;
756  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
757  mode == RBM_ZERO_ON_ERROR)
758  pgBufferUsage.shared_blks_read++;
759  }
760 
761  /* At this point we do NOT hold any locks. */
762 
763  /* if it was already in the buffer pool, we're done */
764  if (found)
765  {
766  if (!isExtend)
767  {
768  /* Just need to update stats before we exit */
769  *hit = true;
770  VacuumPageHit++;
771 
772  if (VacuumCostActive)
773  VacuumCostBalance += VacuumCostPageHit;
774 
775  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
776  smgr->smgr_rnode.node.spcNode,
777  smgr->smgr_rnode.node.dbNode,
778  smgr->smgr_rnode.node.relNode,
779  smgr->smgr_rnode.backend,
780  isExtend,
781  found);
782 
783  /*
784  * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
785  * locked on return.
786  */
787  if (!isLocalBuf)
788  {
789  if (mode == RBM_ZERO_AND_LOCK)
790  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
791  LW_EXCLUSIVE);
792  else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
793  LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
794  }
795 
796  return BufferDescriptorGetBuffer(bufHdr);
797  }
798 
799  /*
800  * We get here only in the corner case where we are trying to extend
801  * the relation but we found a pre-existing buffer marked BM_VALID.
802  * This can happen because mdread doesn't complain about reads beyond
803  * EOF (when zero_damaged_pages is ON) and so a previous attempt to
804  * read a block beyond EOF could have left a "valid" zero-filled
805  * buffer. Unfortunately, we have also seen this case occurring
806  * because of buggy Linux kernels that sometimes return an
807  * lseek(SEEK_END) result that doesn't account for a recent write. In
808  * that situation, the pre-existing buffer would contain valid data
809  * that we don't want to overwrite. Since the legitimate case should
810  * always have left a zero-filled buffer, complain if not PageIsNew.
811  */
812  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
813  if (!PageIsNew((Page) bufBlock))
814  ereport(ERROR,
815  (errmsg("unexpected data beyond EOF in block %u of relation %s",
816  blockNum, relpath(smgr->smgr_rnode, forkNum)),
817  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
818 
819  /*
820  * We *must* do smgrextend before succeeding, else the page will not
821  * be reserved by the kernel, and the next P_NEW call will decide to
822  * return the same page. Clear the BM_VALID bit, do the StartBufferIO
823  * call that BufferAlloc didn't, and proceed.
824  */
825  if (isLocalBuf)
826  {
827  /* Only need to adjust flags */
828  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
829 
830  Assert(buf_state & BM_VALID);
831  buf_state &= ~BM_VALID;
832  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
833  }
834  else
835  {
836  /*
837  * Loop to handle the very small possibility that someone re-sets
838  * BM_VALID between our clearing it and StartBufferIO inspecting
839  * it.
840  */
841  do
842  {
843  uint32 buf_state = LockBufHdr(bufHdr);
844 
845  Assert(buf_state & BM_VALID);
846  buf_state &= ~BM_VALID;
847  UnlockBufHdr(bufHdr, buf_state);
848  } while (!StartBufferIO(bufHdr, true));
849  }
850  }
851 
852  /*
853  * if we have gotten to this point, we have allocated a buffer for the
854  * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
855  * if it's a shared buffer.
856  *
857  * Note: if smgrextend fails, we will end up with a buffer that is
858  * allocated but not marked BM_VALID. P_NEW will still select the same
859  * block number (because the relation didn't get any longer on disk) and
860  * so future attempts to extend the relation will find the same buffer (if
861  * it's not been recycled) but come right back here to try smgrextend
862  * again.
863  */
864  Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
865 
866  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
867 
868  if (isExtend)
869  {
870  /* new buffers are zero-filled */
871  MemSet((char *) bufBlock, 0, BLCKSZ);
872  /* don't set checksum for all-zero page */
873  smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
874 
875  /*
876  * NB: we're *not* doing a ScheduleBufferTagForWriteback here, even
877  * though we're essentially performing a write. At least on Linux,
878  * doing so defeats the 'delayed allocation' mechanism, leading to
879  * increased file fragmentation.
880  */
881  }
882  else
883  {
884  /*
885  * Read in the page, unless the caller intends to overwrite it and
886  * just wants us to allocate a buffer.
887  */
888  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
889  MemSet((char *) bufBlock, 0, BLCKSZ);
890  else
891  {
892  instr_time io_start,
893  io_time;
894 
895  if (track_io_timing)
896  INSTR_TIME_SET_CURRENT(io_start);
897 
898  smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
899 
900  if (track_io_timing)
901  {
902  INSTR_TIME_SET_CURRENT(io_time);
903  INSTR_TIME_SUBTRACT(io_time, io_start);
904  pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
905  INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
906  }
907 
908  /* check for garbage data */
909  if (!PageIsVerified((Page) bufBlock, blockNum))
910  {
911  if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
912  {
913  ereport(WARNING,
914  (errcode(ERRCODE_DATA_CORRUPTED),
915  errmsg("invalid page in block %u of relation %s; zeroing out page",
916  blockNum,
917  relpath(smgr->smgr_rnode, forkNum))));
918  MemSet((char *) bufBlock, 0, BLCKSZ);
919  }
920  else
921  ereport(ERROR,
922  (errcode(ERRCODE_DATA_CORRUPTED),
923  errmsg("invalid page in block %u of relation %s",
924  blockNum,
925  relpath(smgr->smgr_rnode, forkNum))));
926  }
927  }
928  }
929 
930  /*
931  * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
932  * the page as valid, to make sure that no other backend sees the zeroed
933  * page before the caller has had a chance to initialize it.
934  *
935  * Since no-one else can be looking at the page contents yet, there is no
936  * difference between an exclusive lock and a cleanup-strength lock. (Note
937  * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
938  * they assert that the buffer is already valid.)
939  */
940  if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
941  !isLocalBuf)
942  {
943  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
944  }
945 
946  if (isLocalBuf)
947  {
948  /* Only need to adjust flags */
949  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
950 
951  buf_state |= BM_VALID;
952  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
953  }
954  else
955  {
956  /* Set BM_VALID, terminate IO, and wake up any waiters */
957  TerminateBufferIO(bufHdr, false, BM_VALID);
958  }
959 
960  VacuumPageMiss++;
961  if (VacuumCostActive)
962  VacuumCostBalance += VacuumCostPageMiss;
963 
964  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
965  smgr->smgr_rnode.node.spcNode,
966  smgr->smgr_rnode.node.dbNode,
967  smgr->smgr_rnode.node.relNode,
968  smgr->smgr_rnode.backend,
969  isExtend,
970  found);
971 
972  return BufferDescriptorGetBuffer(bufHdr);
973 }
#define LocalBufHdrGetBlock(bufHdr)
Definition: bufmgr.c:62
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition: localbuf.c:103
static PgChecksumMode mode
Definition: pg_checksums.c:61
long local_blks_hit
Definition: instrument.h:25
void LockBufferForCleanup(Buffer buffer)
Definition: bufmgr.c:3659
int errhint(const char *fmt,...)
Definition: elog.c:1069
long local_blks_read
Definition: instrument.h:26
int VacuumCostBalance
Definition: globals.c:147
bool PageIsVerified(Page page, BlockNumber blkno)
Definition: bufpage.c:82
instr_time blk_read_time
Definition: instrument.h:31
int VacuumPageHit
Definition: globals.c:143
ResourceOwner CurrentResourceOwner
Definition: resowner.c:142
struct timeval instr_time
Definition: instr_time.h:150
long shared_blks_read
Definition: instrument.h:22
int errcode(int sqlerrcode)
Definition: elog.c:608
#define MemSet(start, val, len)
Definition: c.h:962
#define P_NEW
Definition: bufmgr.h:81
void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer)
Definition: smgr.c:508
#define SmgrIsTemp(smgr)
Definition: smgr.h:79
long shared_blks_written
Definition: instrument.h:24
static bool StartBufferIO(BufferDesc *buf, bool forInput)
Definition: bufmgr.c:3931
#define ERROR
Definition: elog.h:43
char relpersistence
Definition: pg_class.h:78
#define INSTR_TIME_SUBTRACT(x, y)
Definition: instr_time.h:170
#define INSTR_TIME_ADD(x, y)
Definition: instr_time.h:158
unsigned int uint32
Definition: c.h:359
#define ereport(elevel, rest)
Definition: elog.h:141
int VacuumCostPageHit
Definition: globals.c:137
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:45
void ResourceOwnerEnlargeBuffers(ResourceOwner owner)
Definition: resowner.c:893
#define WARNING
Definition: elog.h:40
static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
Definition: bufmgr.c:3998
#define BM_VALID
Definition: buf_internals.h:59
#define BufHdrGetBlock(bufHdr)
Definition: bufmgr.c:58
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
Definition: bufmgr.c:995
#define BufferDescriptorGetContentLock(bdesc)
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4148
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:555
#define Assert(condition)
Definition: c.h:739
#define pgstat_count_buffer_read_time(n)
Definition: pgstat.h:1393
#define INSTR_TIME_GET_MICROSEC(t)
Definition: instr_time.h:205
#define BufferDescriptorGetBuffer(bdesc)
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1122
#define INSTR_TIME_SET_CURRENT(t)
Definition: instr_time.h:156
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync)
Definition: smgr.c:483
#define PageIsNew(page)
Definition: bufpage.h:229
int errmsg(const char *fmt,...)
Definition: elog.c:822
long shared_blks_hit
Definition: instrument.h:21
#define UnlockBufHdr(desc, s)
long local_blks_written
Definition: instrument.h:28
#define relpath(rnode, forknum)
Definition: relpath.h:87
static void pg_atomic_unlocked_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:277
pg_atomic_uint32 state
int VacuumPageMiss
Definition: globals.c:144
int VacuumCostPageMiss
Definition: globals.c:138
bool track_io_timing
Definition: bufmgr.c:112
Pointer Page
Definition: bufpage.h:78
BufferUsage pgBufferUsage
Definition: instrument.c:20
void * Block
Definition: bufmgr.h:24
bool VacuumCostActive
Definition: globals.c:148
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:241
bool zero_damaged_pages
Definition: bufmgr.c:109

◆ ReadBufferExtended()

Buffer ReadBufferExtended ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy 
)

Definition at line 642 of file bufmgr.c.

References buf, ereport, errcode(), errmsg(), ERROR, pgstat_count_buffer_hit, pgstat_count_buffer_read, RelationData::rd_rel, RelationData::rd_smgr, ReadBuffer_common(), RELATION_IS_OTHER_TEMP, and RelationOpenSmgr.

Referenced by _hash_getbuf_with_strategy(), _hash_getinitbuf(), _hash_getnewbuf(), autoprewarm_database_main(), blbulkdelete(), blgetbitmap(), blvacuumcleanup(), brin_vacuum_scan(), brinbuildempty(), btvacuumpage(), btvacuumscan(), collect_corrupt_items(), collect_visibility_data(), count_nondeletable_pages(), fsm_readbuf(), get_raw_page_internal(), ginbuildempty(), ginbulkdelete(), ginDeletePage(), ginScanToDelete(), ginvacuumcleanup(), ginVacuumPostingTree(), ginVacuumPostingTreeLeaves(), gistbuildempty(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hashbulkdelete(), heapam_scan_analyze_next_block(), heapgetpage(), lazy_scan_heap(), lazy_vacuum_heap(), palloc_btree_page(), pg_prewarm(), pgstat_btree_page(), pgstat_gist_page(), pgstat_heap(), pgstathashindex(), pgstatindex_impl(), ReadBuffer(), ReadBufferBI(), spgprocesspending(), spgvacuumpage(), statapprox_heap(), and vm_readbuf().

644 {
645  bool hit;
646  Buffer buf;
647 
648  /* Open it at the smgr level if not already done */
649  RelationOpenSmgr(reln);
650 
651  /*
652  * Reject attempts to read non-local temporary relations; we would be
653  * likely to get wrong data since we have no visibility into the owning
654  * session's local buffers.
655  */
656  if (RELATION_IS_OTHER_TEMP(reln))
657  ereport(ERROR,
658  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
659  errmsg("cannot access temporary tables of other sessions")));
660 
661  /*
662  * Read the buffer, and update pgstat counters to reflect a cache hit or
663  * miss.
664  */
665  pgstat_count_buffer_read(reln);
666  buf = ReadBuffer_common(reln->rd_smgr, reln->rd_rel->relpersistence,
667  forkNum, blockNum, mode, strategy, &hit);
668  if (hit)
669  pgstat_count_buffer_hit(reln);
670  return buf;
671 }
static PgChecksumMode mode
Definition: pg_checksums.c:61
struct SMgrRelationData * rd_smgr
Definition: rel.h:56
int errcode(int sqlerrcode)
Definition: elog.c:608
Form_pg_class rd_rel
Definition: rel.h:83
#define RelationOpenSmgr(relation)
Definition: rel.h:479
#define ERROR
Definition: elog.h:43
#define pgstat_count_buffer_read(rel)
Definition: pgstat.h:1383
static char * buf
Definition: pg_test_fsync.c:67
#define ereport(elevel, rest)
Definition: elog.h:141
#define pgstat_count_buffer_hit(rel)
Definition: pgstat.h:1388
static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool *hit)
Definition: bufmgr.c:705
#define RELATION_IS_OTHER_TEMP(relation)
Definition: rel.h:552
int errmsg(const char *fmt,...)
Definition: elog.c:822
int Buffer
Definition: buf.h:23
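
A minimal sketch of a strategy-controlled bulk read, with a hypothetical scan_relation() helper; BAS_BULKREAD gives the scan a small reusable ring of buffers instead of letting it evict the whole buffer pool.

#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
scan_relation(Relation rel, BlockNumber nblocks)
{
    BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
    BlockNumber blkno;

    for (blkno = 0; blkno < nblocks; blkno++)
    {
        Buffer      buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                             RBM_NORMAL, strategy);

        /* ... process the page ... */
        ReleaseBuffer(buf);
    }

    FreeAccessStrategy(strategy);
}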

◆ ReadBufferWithoutRelcache()

Buffer ReadBufferWithoutRelcache ( RelFileNode  rnode,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy 
)

Definition at line 684 of file bufmgr.c.

References Assert, InRecovery, InvalidBackendId, ReadBuffer_common(), and smgropen().

Referenced by XLogReadBufferExtended().

687 {
688  bool hit;
689 
690  SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
691 
692  Assert(InRecovery);
693 
694  return ReadBuffer_common(smgr, RELPERSISTENCE_PERMANENT, forkNum, blockNum,
695  mode, strategy, &hit);
696 }
bool InRecovery
Definition: xlog.c:200
SMgrRelation smgropen(RelFileNode rnode, BackendId backend)
Definition: smgr.c:145
#define InvalidBackendId
Definition: backendid.h:23
static Buffer ReadBuffer_common(SMgrRelation reln, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool *hit)
Definition: bufmgr.c:705
#define Assert(condition)
Definition: c.h:739
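
As the Assert(InRecovery) above indicates, this entry point is legal only during WAL replay, when no relcache entry is available; XLogReadBufferExtended() is its only caller listed above. A hedged sketch of that style of use, assuming a RelFileNode "rnode" and BlockNumber "blkno" taken from a WAL record:

    /* Sketch: fetch a block by RelFileNode alone during redo ("rnode" and
     * "blkno" are assumed inputs; persistence is forced to permanent). */
    Buffer      buf;

    Assert(InRecovery);
    buf = ReadBufferWithoutRelcache(rnode, MAIN_FORKNUM, blkno,
                                    RBM_NORMAL, NULL);
    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); /* redo usually modifies the page */
    /* ... apply the WAL record's changes ... */
    UnlockReleaseBuffer(buf);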

◆ RelationGetNumberOfBlocksInFork()

BlockNumber RelationGetNumberOfBlocksInFork ( Relation  relation,
ForkNumber  forkNum 
)

Definition at line 2800 of file bufmgr.c.

References Assert, RelationData::rd_rel, RelationData::rd_smgr, RelationOpenSmgr, smgrnblocks(), and table_relation_size().

Referenced by _hash_getnewbuf(), _hash_init(), autoprewarm_database_main(), get_raw_page_internal(), and pg_prewarm().

2801 {
2802  switch (relation->rd_rel->relkind)
2803  {
2804  case RELKIND_SEQUENCE:
2805  case RELKIND_INDEX:
2806  case RELKIND_PARTITIONED_INDEX:
2807  /* Open it at the smgr level if not already done */
2808  RelationOpenSmgr(relation);
2809 
2810  return smgrnblocks(relation->rd_smgr, forkNum);
2811 
2812  case RELKIND_RELATION:
2813  case RELKIND_TOASTVALUE:
2814  case RELKIND_MATVIEW:
2815  {
2816  /*
2817  * Not every table AM uses BLCKSZ wide fixed size blocks.
2818  * Therefore tableam returns the size in bytes - but for the
2819  * purpose of this routine, we want the number of blocks.
2820  * Therefore divide, rounding up.
2821  */
2822  uint64 szbytes;
2823 
2824  szbytes = table_relation_size(relation, forkNum);
2825 
2826  return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
2827  }
2828  case RELKIND_VIEW:
2829  case RELKIND_COMPOSITE_TYPE:
2830  case RELKIND_FOREIGN_TABLE:
2831  case RELKIND_PARTITIONED_TABLE:
2832  default:
2833  Assert(false);
2834  break;
2835  }
2836 
2837  return 0; /* keep compiler quiet */
2838 }
struct SMgrRelationData * rd_smgr
Definition: rel.h:56
Form_pg_class rd_rel
Definition: rel.h:83
#define RelationOpenSmgr(relation)
Definition: rel.h:479
static uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
Definition: tableam.h:1592
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:555
#define Assert(condition)
Definition: c.h:739
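
The bytes-to-blocks conversion above rounds up, so a trailing partial block counts as a whole block. A small worked example, assuming the default BLCKSZ of 8192:

    /* 10000 bytes does not fill two 8192-byte blocks, yet occupies two */
    uint64      szbytes = 10000;
    BlockNumber nblocks = (szbytes + (BLCKSZ - 1)) / BLCKSZ;   /* == 2 */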

◆ ReleaseAndReadBuffer()

Buffer ReleaseAndReadBuffer ( Buffer  buffer,
Relation  relation,
BlockNumber  blockNum 
)

Definition at line 1521 of file bufmgr.c.

References Assert, buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferIsValid, CurrentResourceOwner, buftag::forkNum, GetBufferDescriptor, GetLocalBufferDescriptor, LocalRefCount, MAIN_FORKNUM, RelationData::rd_node, ReadBuffer(), RelFileNodeEquals, ResourceOwnerForgetBuffer(), buftag::rnode, BufferDesc::tag, and UnpinBuffer().

Referenced by _bt_relandgetbuf(), ginFindLeafPage(), heapam_index_fetch_tuple(), and heapam_scan_bitmap_next_block().

1524 {
1525  ForkNumber forkNum = MAIN_FORKNUM;
1526  BufferDesc *bufHdr;
1527 
1528  if (BufferIsValid(buffer))
1529  {
1530  Assert(BufferIsPinned(buffer));
1531  if (BufferIsLocal(buffer))
1532  {
1533  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1534  if (bufHdr->tag.blockNum == blockNum &&
1535  RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1536  bufHdr->tag.forkNum == forkNum)
1537  return buffer;
1538  ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
1539  LocalRefCount[-buffer - 1]--;
1540  }
1541  else
1542  {
1543  bufHdr = GetBufferDescriptor(buffer - 1);
1544  /* we have pin, so it's ok to examine tag without spinlock */
1545  if (bufHdr->tag.blockNum == blockNum &&
1546  RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
1547  bufHdr->tag.forkNum == forkNum)
1548  return buffer;
1549  UnpinBuffer(bufHdr, true);
1550  }
1551  }
1552 
1553  return ReadBuffer(relation, blockNum);
1554 }
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:420
ForkNumber forkNum
Definition: buf_internals.h:93
ResourceOwner CurrentResourceOwner
Definition: resowner.c:142
#define GetLocalBufferDescriptor(id)
#define GetBufferDescriptor(id)
ForkNumber
Definition: relpath.h:40
static void UnpinBuffer(BufferDesc *buf, bool fixOwner)
Definition: bufmgr.c:1702
RelFileNode rd_node
Definition: rel.h:54
#define Assert(condition)
Definition: c.h:739
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:596
#define BufferIsLocal(buffer)
Definition: buf.h:37
BlockNumber blockNum
Definition: buf_internals.h:94
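
ReleaseAndReadBuffer() supports hand-over-hand traversals: when the requested block is the one already pinned, the existing pin is reused instead of being dropped and re-acquired. A sketch of such a walk, assuming a caller-maintained "next_blkno" (set to InvalidBlockNumber to stop) and that the buffer handed back in is pinned but not content-locked:

    /* Sketch: walk a chain of blocks, trading one pin for the next
     * ("rel", "start_blkno", and the next_blkno logic are assumed). */
    Buffer      buf = InvalidBuffer;
    BlockNumber next_blkno = start_blkno;

    while (BlockNumberIsValid(next_blkno))
    {
        buf = ReleaseAndReadBuffer(buf, rel, next_blkno);
        LockBuffer(buf, BUFFER_LOCK_SHARE);
        /* ... read the page and compute next_blkno ... */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    }
    if (BufferIsValid(buf))
        ReleaseBuffer(buf);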