PostgreSQL Source Code (git master)
bufmgr.c File Reference
#include "postgres.h"
#include <sys/file.h>
#include <unistd.h>
#include "access/tableam.h"
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "catalog/storage.h"
#include "executor/instrument.h"
#include "lib/binaryheap.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "storage/standby.h"
#include "utils/ps_status.h"
#include "utils/rel.h"
#include "utils/resowner_private.h"
#include "utils/timestamp.h"

Data Structures

struct  PrivateRefCountEntry
 
struct  CkptTsStatus
 
struct  SMgrSortArray
 

Macros

#define BufHdrGetBlock(bufHdr)   ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
 
#define BufferGetLSN(bufHdr)   (PageGetLSN(BufHdrGetBlock(bufHdr)))
 
#define LocalBufHdrGetBlock(bufHdr)   LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
 
#define BUF_WRITTEN   0x01
 
#define BUF_REUSABLE   0x02
 
#define RELS_BSEARCH_THRESHOLD   20
 
#define REFCOUNT_ARRAY_ENTRIES   8
 
#define BufferIsPinned(bufnum)
 

Typedefs

typedef struct PrivateRefCountEntry PrivateRefCountEntry
 
typedef struct CkptTsStatus CkptTsStatus
 
typedef struct SMgrSortArray SMgrSortArray
 

Functions

static void ReservePrivateRefCountEntry (void)
 
static PrivateRefCountEntry * NewPrivateRefCountEntry (Buffer buffer)
 
static PrivateRefCountEntry * GetPrivateRefCountEntry (Buffer buffer, bool do_move)
 
static int32 GetPrivateRefCount (Buffer buffer)
 
static void ForgetPrivateRefCountEntry (PrivateRefCountEntry *ref)
 
static Buffer ReadBuffer_common (SMgrRelation reln, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool *hit)
 
static bool PinBuffer (BufferDesc *buf, BufferAccessStrategy strategy)
 
static void PinBuffer_Locked (BufferDesc *buf)
 
static void UnpinBuffer (BufferDesc *buf, bool fixOwner)
 
static void BufferSync (int flags)
 
static uint32 WaitBufHdrUnlocked (BufferDesc *buf)
 
static int SyncOneBuffer (int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 
static void WaitIO (BufferDesc *buf)
 
static bool StartBufferIO (BufferDesc *buf, bool forInput)
 
static void TerminateBufferIO (BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
 
static void shared_buffer_write_error_callback (void *arg)
 
static void local_buffer_write_error_callback (void *arg)
 
static BufferDesc * BufferAlloc (SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
 
static void FlushBuffer (BufferDesc *buf, SMgrRelation reln)
 
static void AtProcExit_Buffers (int code, Datum arg)
 
static void CheckForBufferLeaks (void)
 
static int rnode_comparator (const void *p1, const void *p2)
 
static int buffertag_comparator (const void *p1, const void *p2)
 
static int ckpt_buforder_comparator (const void *pa, const void *pb)
 
static int ts_ckpt_progress_comparator (Datum a, Datum b, void *arg)
 
void PrefetchBuffer (Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 
Buffer ReadBuffer (Relation reln, BlockNumber blockNum)
 
Buffer ReadBufferExtended (Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
Buffer ReadBufferWithoutRelcache (RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
static void InvalidateBuffer (BufferDesc *buf)
 
void MarkBufferDirty (Buffer buffer)
 
Buffer ReleaseAndReadBuffer (Buffer buffer, Relation relation, BlockNumber blockNum)
 
bool BgBufferSync (WritebackContext *wb_context)
 
void AtEOXact_Buffers (bool isCommit)
 
void InitBufferPoolAccess (void)
 
void InitBufferPoolBackend (void)
 
void PrintBufferLeakWarning (Buffer buffer)
 
void CheckPointBuffers (int flags)
 
void BufmgrCommit (void)
 
BlockNumber BufferGetBlockNumber (Buffer buffer)
 
void BufferGetTag (Buffer buffer, RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blknum)
 
BlockNumber RelationGetNumberOfBlocksInFork (Relation relation, ForkNumber forkNum)
 
bool BufferIsPermanent (Buffer buffer)
 
XLogRecPtr BufferGetLSNAtomic (Buffer buffer)
 
void DropRelFileNodeBuffers (RelFileNodeBackend rnode, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
 
void DropRelFileNodesAllBuffers (RelFileNodeBackend *rnodes, int nnodes)
 
void DropDatabaseBuffers (Oid dbid)
 
void FlushRelationBuffers (Relation rel)
 
void FlushRelationsAllBuffers (SMgrRelation *smgrs, int nrels)
 
void FlushDatabaseBuffers (Oid dbid)
 
void FlushOneBuffer (Buffer buffer)
 
void ReleaseBuffer (Buffer buffer)
 
void UnlockReleaseBuffer (Buffer buffer)
 
void IncrBufferRefCount (Buffer buffer)
 
void MarkBufferDirtyHint (Buffer buffer, bool buffer_std)
 
void UnlockBuffers (void)
 
void LockBuffer (Buffer buffer, int mode)
 
bool ConditionalLockBuffer (Buffer buffer)
 
void LockBufferForCleanup (Buffer buffer)
 
bool HoldingBufferPinThatDelaysRecovery (void)
 
bool ConditionalLockBufferForCleanup (Buffer buffer)
 
bool IsBufferCleanupOK (Buffer buffer)
 
void AbortBufferIO (void)
 
uint32 LockBufHdr (BufferDesc *desc)
 
void WritebackContextInit (WritebackContext *context, int *max_pending)
 
void ScheduleBufferTagForWriteback (WritebackContext *context, BufferTag *tag)
 
void IssuePendingWritebacks (WritebackContext *context)
 
void TestForOldSnapshot_impl (Snapshot snapshot, Relation relation)
 

Variables

bool zero_damaged_pages = false
 
int bgwriter_lru_maxpages = 100
 
double bgwriter_lru_multiplier = 2.0
 
bool track_io_timing = false
 
int effective_io_concurrency = 0
 
int maintenance_io_concurrency = 0
 
int checkpoint_flush_after = 0
 
int bgwriter_flush_after = 0
 
int backend_flush_after = 0
 
static BufferDesc * InProgressBuf = NULL
 
static bool IsForInput
 
static BufferDesc * PinCountWaitBuf = NULL
 
static struct PrivateRefCountEntry PrivateRefCountArray [REFCOUNT_ARRAY_ENTRIES]
 
static HTAB * PrivateRefCountHash = NULL
 
static int32 PrivateRefCountOverflowed = 0
 
static uint32 PrivateRefCountClock = 0
 
static PrivateRefCountEntry * ReservedRefCountEntry = NULL
 

Macro Definition Documentation

◆ BUF_REUSABLE

#define BUF_REUSABLE   0x02

Definition at line 68 of file bufmgr.c.

Referenced by BgBufferSync(), and SyncOneBuffer().

◆ BUF_WRITTEN

#define BUF_WRITTEN   0x01

Definition at line 67 of file bufmgr.c.

Referenced by BgBufferSync(), BufferSync(), and SyncOneBuffer().

◆ BufferGetLSN

#define BufferGetLSN (   bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))

Definition at line 60 of file bufmgr.c.

Referenced by BufferAlloc(), and FlushBuffer().

◆ BufferIsPinned

#define BufferIsPinned (   bufnum)
Value:
( \
!BufferIsValid(bufnum) ? \
false \
: \
BufferIsLocal(bufnum) ? \
(LocalRefCount[-(bufnum) - 1] > 0) \
: \
(GetPrivateRefCount(bufnum) > 0) \
)
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:378
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114
int32 * LocalRefCount
Definition: localbuf.c:45

Definition at line 439 of file bufmgr.c.

Referenced by BufferGetBlockNumber(), BufferGetLSNAtomic(), BufferGetTag(), BufferIsPermanent(), FlushOneBuffer(), IncrBufferRefCount(), MarkBufferDirty(), and ReleaseAndReadBuffer().
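
Read as a function, the macro's three-way branch is easier to follow. Below is a minimal equivalent sketch (the function name is hypothetical; bufmgr.c only provides the macro). Note that it tests only whether the current backend holds a pin, not whether any other backend does:

    /* Hypothetical rewrite of BufferIsPinned as a function, for illustration. */
    static inline bool
    BufferIsPinnedByMe(Buffer bufnum)
    {
        if (!BufferIsValid(bufnum))
            return false;                   /* InvalidBuffer is never pinned */
        if (BufferIsLocal(bufnum))
            /* local buffers are negative Buffer values; -1 maps to index 0 */
            return LocalRefCount[-bufnum - 1] > 0;
        /* shared buffer: consult this backend's private refcount state */
        return GetPrivateRefCount(bufnum) > 0;
    }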

◆ BufHdrGetBlock

#define BufHdrGetBlock (   bufHdr)    ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))

Definition at line 59 of file bufmgr.c.

Referenced by FlushBuffer(), and ReadBuffer_common().

◆ LocalBufHdrGetBlock

#define LocalBufHdrGetBlock (   bufHdr)    LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]

Definition at line 63 of file bufmgr.c.

Referenced by FlushRelationBuffers(), and ReadBuffer_common().

◆ REFCOUNT_ARRAY_ENTRIES

#define REFCOUNT_ARRAY_ENTRIES   8

◆ RELS_BSEARCH_THRESHOLD

#define RELS_BSEARCH_THRESHOLD   20

Definition at line 70 of file bufmgr.c.

Referenced by DropRelFileNodesAllBuffers(), and FlushRelationsAllBuffers().
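
Both referencing functions use the threshold the same way: for a small relation array, each buffer tag is checked with a linear scan, while above the threshold the sorted array is probed with bsearch(), whose larger constant factor only pays off at scale. A standalone sketch of that decision with simplified types (hypothetical helper, not the bufmgr.c code):

    #include <stdbool.h>
    #include <stdlib.h>

    #define RELS_BSEARCH_THRESHOLD 20

    static int
    cmp_uint(const void *a, const void *b)
    {
        unsigned    x = *(const unsigned *) a;
        unsigned    y = *(const unsigned *) b;

        return (x > y) - (x < y);
    }

    /* Membership test against a sorted array, switching strategy at the
     * threshold, mirroring how the referencing functions pick linear scan
     * versus bsearch(). */
    static bool
    contains(const unsigned *sorted, int n, unsigned key)
    {
        if (n <= RELS_BSEARCH_THRESHOLD)
        {
            for (int i = 0; i < n; i++)
                if (sorted[i] == key)
                    return true;
            return false;
        }
        return bsearch(&key, sorted, n, sizeof(unsigned), cmp_uint) != NULL;
    }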

Typedef Documentation

◆ CkptTsStatus

typedef struct CkptTsStatus CkptTsStatus

◆ PrivateRefCountEntry

typedef struct PrivateRefCountEntry PrivateRefCountEntry

◆ SMgrSortArray

typedef struct SMgrSortArray SMgrSortArray

Function Documentation

◆ AbortBufferIO()

void AbortBufferIO ( void  )

Definition at line 4123 of file bufmgr.c.

References Assert, buftag::blockNum, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_VALID, buf, BufferDescriptorGetIOLock, ereport, errcode(), errdetail(), errmsg(), buftag::forkNum, InProgressBuf, IsForInput, LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), pfree(), relpathperm, buftag::rnode, BufferDesc::tag, TerminateBufferIO(), UnlockBufHdr, and WARNING.

Referenced by AbortSubTransaction(), AbortTransaction(), AtProcExit_Buffers(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), and WalWriterMain().

4124 {
4125  BufferDesc *buf = InProgressBuf;
4126 
4127  if (buf)
4128  {
4129  uint32 buf_state;
4130 
4131  /*
4132  * Since LWLockReleaseAll has already been called, we're not holding
4133  * the buffer's io_in_progress_lock. We have to re-acquire it so that
4134  * we can use TerminateBufferIO. Anyone who's executing WaitIO on the
4135  * buffer will be in a busy spin until we succeed in doing this.
4136  */
4137  LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
4138 
4139  buf_state = LockBufHdr(buf);
4140  Assert(buf_state & BM_IO_IN_PROGRESS);
4141  if (IsForInput)
4142  {
4143  Assert(!(buf_state & BM_DIRTY));
4144 
4145  /* We'd better not think buffer is valid yet */
4146  Assert(!(buf_state & BM_VALID));
4147  UnlockBufHdr(buf, buf_state);
4148  }
4149  else
4150  {
4151  Assert(buf_state & BM_DIRTY);
4152  UnlockBufHdr(buf, buf_state);
4153  /* Issue notice if this is not the first failure... */
4154  if (buf_state & BM_IO_ERROR)
4155  {
4156  /* Buffer is pinned, so we can read tag without spinlock */
4157  char *path;
4158 
4159  path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
4160  ereport(WARNING,
4161  (errcode(ERRCODE_IO_ERROR),
4162  errmsg("could not write block %u of %s",
4163  buf->tag.blockNum, path),
4164  errdetail("Multiple failures --- write error might be permanent.")));
4165  pfree(path);
4166  }
4167  }
4168  TerminateBufferIO(buf, false, BM_IO_ERROR);
4169  }
4170 }
#define relpathperm(rnode, forknum)
Definition: relpath.h:83
ForkNumber forkNum
Definition: buf_internals.h:93
int errcode(int sqlerrcode)
Definition: elog.c:610
#define BM_DIRTY
Definition: buf_internals.h:58
#define BufferDescriptorGetIOLock(bdesc)
static BufferDesc * InProgressBuf
Definition: bufmgr.c:152
void pfree(void *pointer)
Definition: mcxt.c:1056
int errdetail(const char *fmt,...)
Definition: elog.c:957
unsigned int uint32
Definition: c.h:367
static bool IsForInput
Definition: bufmgr.c:153
#define WARNING
Definition: elog.h:40
static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
Definition: bufmgr.c:4091
#define BM_VALID
Definition: buf_internals.h:59
#define ereport(elevel,...)
Definition: elog.h:144
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4241
#define Assert(condition)
Definition: c.h:738
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1123
BlockNumber blockNum
Definition: buf_internals.h:94
RelFileNode rnode
Definition: buf_internals.h:92
#define BM_IO_ERROR
Definition: buf_internals.h:62
BufferTag tag
int errmsg(const char *fmt,...)
Definition: elog.c:824
#define UnlockBufHdr(desc, s)
#define BM_IO_IN_PROGRESS
Definition: buf_internals.h:61

◆ AtEOXact_Buffers()

void AtEOXact_Buffers ( bool  isCommit)

Definition at line 2394 of file bufmgr.c.

References Assert, AtEOXact_LocalBuffers(), CheckForBufferLeaks(), and PrivateRefCountOverflowed.

Referenced by AbortTransaction(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), CommitTransaction(), PrepareTransaction(), and WalWriterMain().

2395 {
2396  CheckForBufferLeaks();
2397 
2398  AtEOXact_LocalBuffers(isCommit);
2399 
2400  Assert(PrivateRefCountOverflowed == 0);
2401 }
static int32 PrivateRefCountOverflowed
Definition: bufmgr.c:190
#define Assert(condition)
Definition: c.h:738
static void CheckForBufferLeaks(void)
Definition: bufmgr.c:2469
void AtEOXact_LocalBuffers(bool isCommit)
Definition: localbuf.c:572

◆ AtProcExit_Buffers()

static void AtProcExit_Buffers ( int  code,
Datum  arg 
)
static

Definition at line 2450 of file bufmgr.c.

References AbortBufferIO(), AtProcExit_LocalBuffers(), CheckForBufferLeaks(), and UnlockBuffers().

Referenced by InitBufferPoolBackend().

2451 {
2452  AbortBufferIO();
2453  UnlockBuffers();
2454 
2455  CheckForBufferLeaks();
2456 
2457  /* localbuf.c needs a chance too */
2458  AtProcExit_LocalBuffers();
2459 }
void AtProcExit_LocalBuffers(void)
Definition: localbuf.c:583
void UnlockBuffers(void)
Definition: bufmgr.c:3645
static void CheckForBufferLeaks(void)
Definition: bufmgr.c:2469
void AbortBufferIO(void)
Definition: bufmgr.c:4123

◆ BgBufferSync()

bool BgBufferSync ( WritebackContext *  wb_context)

Definition at line 2024 of file bufmgr.c.

References Assert, bgwriter_lru_maxpages, bgwriter_lru_multiplier, BgWriterDelay, BgWriterStats, BUF_REUSABLE, BUF_WRITTEN, CurrentResourceOwner, DEBUG1, DEBUG2, elog, PgStat_MsgBgWriter::m_buf_alloc, PgStat_MsgBgWriter::m_buf_written_clean, PgStat_MsgBgWriter::m_maxwritten_clean, NBuffers, ResourceOwnerEnlargeBuffers(), StrategySyncStart(), and SyncOneBuffer().

Referenced by BackgroundWriterMain().

2025 {
2026  /* info obtained from freelist.c */
2027  int strategy_buf_id;
2028  uint32 strategy_passes;
2029  uint32 recent_alloc;
2030 
2031  /*
2032  * Information saved between calls so we can determine the strategy
2033  * point's advance rate and avoid scanning already-cleaned buffers.
2034  */
2035  static bool saved_info_valid = false;
2036  static int prev_strategy_buf_id;
2037  static uint32 prev_strategy_passes;
2038  static int next_to_clean;
2039  static uint32 next_passes;
2040 
2041  /* Moving averages of allocation rate and clean-buffer density */
2042  static float smoothed_alloc = 0;
2043  static float smoothed_density = 10.0;
2044 
2045  /* Potentially these could be tunables, but for now, not */
2046  float smoothing_samples = 16;
2047  float scan_whole_pool_milliseconds = 120000.0;
2048 
2049  /* Used to compute how far we scan ahead */
2050  long strategy_delta;
2051  int bufs_to_lap;
2052  int bufs_ahead;
2053  float scans_per_alloc;
2054  int reusable_buffers_est;
2055  int upcoming_alloc_est;
2056  int min_scan_buffers;
2057 
2058  /* Variables for the scanning loop proper */
2059  int num_to_scan;
2060  int num_written;
2061  int reusable_buffers;
2062 
2063  /* Variables for final smoothed_density update */
2064  long new_strategy_delta;
2065  uint32 new_recent_alloc;
2066 
2067  /*
2068  * Find out where the freelist clock sweep currently is, and how many
2069  * buffer allocations have happened since our last call.
2070  */
2071  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
2072 
2073  /* Report buffer alloc counts to pgstat */
2074  BgWriterStats.m_buf_alloc += recent_alloc;
2075 
2076  /*
2077  * If we're not running the LRU scan, just stop after doing the stats
2078  * stuff. We mark the saved state invalid so that we can recover sanely
2079  * if LRU scan is turned back on later.
2080  */
2081  if (bgwriter_lru_maxpages <= 0)
2082  {
2083  saved_info_valid = false;
2084  return true;
2085  }
2086 
2087  /*
2088  * Compute strategy_delta = how many buffers have been scanned by the
2089  * clock sweep since last time. If first time through, assume none. Then
2090  * see if we are still ahead of the clock sweep, and if so, how many
2091  * buffers we could scan before we'd catch up with it and "lap" it. Note:
2092  * weird-looking coding of xxx_passes comparisons are to avoid bogus
2093  * behavior when the passes counts wrap around.
2094  */
2095  if (saved_info_valid)
2096  {
2097  int32 passes_delta = strategy_passes - prev_strategy_passes;
2098 
2099  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2100  strategy_delta += (long) passes_delta * NBuffers;
2101 
2102  Assert(strategy_delta >= 0);
2103 
2104  if ((int32) (next_passes - strategy_passes) > 0)
2105  {
2106  /* we're one pass ahead of the strategy point */
2107  bufs_to_lap = strategy_buf_id - next_to_clean;
2108 #ifdef BGW_DEBUG
2109  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2110  next_passes, next_to_clean,
2111  strategy_passes, strategy_buf_id,
2112  strategy_delta, bufs_to_lap);
2113 #endif
2114  }
2115  else if (next_passes == strategy_passes &&
2116  next_to_clean >= strategy_buf_id)
2117  {
2118  /* on same pass, but ahead or at least not behind */
2119  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
2120 #ifdef BGW_DEBUG
2121  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2122  next_passes, next_to_clean,
2123  strategy_passes, strategy_buf_id,
2124  strategy_delta, bufs_to_lap);
2125 #endif
2126  }
2127  else
2128  {
2129  /*
2130  * We're behind, so skip forward to the strategy point and start
2131  * cleaning from there.
2132  */
2133 #ifdef BGW_DEBUG
2134  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
2135  next_passes, next_to_clean,
2136  strategy_passes, strategy_buf_id,
2137  strategy_delta);
2138 #endif
2139  next_to_clean = strategy_buf_id;
2140  next_passes = strategy_passes;
2141  bufs_to_lap = NBuffers;
2142  }
2143  }
2144  else
2145  {
2146  /*
2147  * Initializing at startup or after LRU scanning had been off. Always
2148  * start at the strategy point.
2149  */
2150 #ifdef BGW_DEBUG
2151  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
2152  strategy_passes, strategy_buf_id);
2153 #endif
2154  strategy_delta = 0;
2155  next_to_clean = strategy_buf_id;
2156  next_passes = strategy_passes;
2157  bufs_to_lap = NBuffers;
2158  }
2159 
2160  /* Update saved info for next time */
2161  prev_strategy_buf_id = strategy_buf_id;
2162  prev_strategy_passes = strategy_passes;
2163  saved_info_valid = true;
2164 
2165  /*
2166  * Compute how many buffers had to be scanned for each new allocation, ie,
2167  * 1/density of reusable buffers, and track a moving average of that.
2168  *
2169  * If the strategy point didn't move, we don't update the density estimate
2170  */
2171  if (strategy_delta > 0 && recent_alloc > 0)
2172  {
2173  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
2174  smoothed_density += (scans_per_alloc - smoothed_density) /
2175  smoothing_samples;
2176  }
2177 
2178  /*
2179  * Estimate how many reusable buffers there are between the current
2180  * strategy point and where we've scanned ahead to, based on the smoothed
2181  * density estimate.
2182  */
2183  bufs_ahead = NBuffers - bufs_to_lap;
2184  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
2185 
2186  /*
2187  * Track a moving average of recent buffer allocations. Here, rather than
2188  * a true average we want a fast-attack, slow-decline behavior: we
2189  * immediately follow any increase.
2190  */
2191  if (smoothed_alloc <= (float) recent_alloc)
2192  smoothed_alloc = recent_alloc;
2193  else
2194  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
2195  smoothing_samples;
2196 
2197  /* Scale the estimate by a GUC to allow more aggressive tuning. */
2198  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
2199 
2200  /*
2201  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
2202  * eventually underflow to zero, and the underflows produce annoying
2203  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
2204  * zero, there's no point in tracking smaller and smaller values of
2205  * smoothed_alloc, so just reset it to exactly zero to avoid this
2206  * syndrome. It will pop back up as soon as recent_alloc increases.
2207  */
2208  if (upcoming_alloc_est == 0)
2209  smoothed_alloc = 0;
2210 
2211  /*
2212  * Even in cases where there's been little or no buffer allocation
2213  * activity, we want to make a small amount of progress through the buffer
2214  * cache so that as many reusable buffers as possible are clean after an
2215  * idle period.
2216  *
2217  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
2218  * the BGW will be called during the scan_whole_pool time; slice the
2219  * buffer pool into that many sections.
2220  */
2221  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
2222 
2223  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
2224  {
2225 #ifdef BGW_DEBUG
2226  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
2227  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
2228 #endif
2229  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
2230  }
2231 
2232  /*
2233  * Now write out dirty reusable buffers, working forward from the
2234  * next_to_clean point, until we have lapped the strategy scan, or cleaned
2235  * enough buffers to match our estimate of the next cycle's allocation
2236  * requirements, or hit the bgwriter_lru_maxpages limit.
2237  */
2238 
2239  /* Make sure we can handle the pin inside SyncOneBuffer */
2240  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2241 
2242  num_to_scan = bufs_to_lap;
2243  num_written = 0;
2244  reusable_buffers = reusable_buffers_est;
2245 
2246  /* Execute the LRU scan */
2247  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
2248  {
2249  int sync_state = SyncOneBuffer(next_to_clean, true,
2250  wb_context);
2251 
2252  if (++next_to_clean >= NBuffers)
2253  {
2254  next_to_clean = 0;
2255  next_passes++;
2256  }
2257  num_to_scan--;
2258 
2259  if (sync_state & BUF_WRITTEN)
2260  {
2261  reusable_buffers++;
2262  if (++num_written >= bgwriter_lru_maxpages)
2263  {
2264  BgWriterStats.m_maxwritten_clean++;
2265  break;
2266  }
2267  }
2268  else if (sync_state & BUF_REUSABLE)
2269  reusable_buffers++;
2270  }
2271 
2272  BgWriterStats.m_buf_written_clean += num_written;
2273 
2274 #ifdef BGW_DEBUG
2275  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
2276  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
2277  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
2278  bufs_to_lap - num_to_scan,
2279  num_written,
2280  reusable_buffers - reusable_buffers_est);
2281 #endif
2282 
2283  /*
2284  * Consider the above scan as being like a new allocation scan.
2285  * Characterize its density and update the smoothed one based on it. This
2286  * effectively halves the moving average period in cases where both the
2287  * strategy and the background writer are doing some useful scanning,
2288  * which is helpful because a long memory isn't as desirable on the
2289  * density estimates.
2290  */
2291  new_strategy_delta = bufs_to_lap - num_to_scan;
2292  new_recent_alloc = reusable_buffers - reusable_buffers_est;
2293  if (new_strategy_delta > 0 && new_recent_alloc > 0)
2294  {
2295  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
2296  smoothed_density += (scans_per_alloc - smoothed_density) /
2297  smoothing_samples;
2298 
2299 #ifdef BGW_DEBUG
2300  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
2301  new_recent_alloc, new_strategy_delta,
2302  scans_per_alloc, smoothed_density);
2303 #endif
2304  }
2305 
2306  /* Return true if OK to hibernate */
2307  return (bufs_to_lap == 0 && recent_alloc == 0);
2308 }
PgStat_Counter m_buf_alloc
Definition: pgstat.h:435
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition: freelist.c:395
#define DEBUG1
Definition: elog.h:25
int BgWriterDelay
Definition: bgwriter.c:64
ResourceOwner CurrentResourceOwner
Definition: resowner.c:142
PgStat_Counter m_maxwritten_clean
Definition: pgstat.h:432
PgStat_Counter m_buf_written_clean
Definition: pgstat.h:431
PgStat_MsgBgWriter BgWriterStats
Definition: pgstat.c:142
double bgwriter_lru_multiplier
Definition: bufmgr.c:125
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition: bufmgr.c:2327
signed int int32
Definition: c.h:355
#define BUF_REUSABLE
Definition: bufmgr.c:68
int bgwriter_lru_maxpages
Definition: bufmgr.c:124
#define DEBUG2
Definition: elog.h:24
unsigned int uint32
Definition: c.h:367
#define BUF_WRITTEN
Definition: bufmgr.c:67
void ResourceOwnerEnlargeBuffers(ResourceOwner owner)
Definition: resowner.c:917
#define Assert(condition)
Definition: c.h:738
#define elog(elevel,...)
Definition: elog.h:214
int NBuffers
Definition: globals.c:131
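
The two moving averages above follow one exponential-smoothing rule, with a fast-attack special case for smoothed_alloc so the background writer reacts immediately to allocation spikes. A standalone sketch of just that arithmetic (hypothetical helper names; smoothing_samples = 16 as in the source):

    #define SMOOTHING_SAMPLES 16.0f     /* matches smoothing_samples above */

    /* Plain exponential moving average, as used for smoothed_density. */
    static float
    smooth(float avg, float sample)
    {
        return avg + (sample - avg) / SMOOTHING_SAMPLES;
    }

    /* Fast-attack, slow-decline variant used for smoothed_alloc: follow any
     * increase at once, decay gradually otherwise. */
    static float
    smooth_alloc(float avg, float recent_alloc)
    {
        if (avg <= recent_alloc)
            return recent_alloc;
        return smooth(avg, recent_alloc);
    }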

◆ BufferAlloc()

static BufferDesc * BufferAlloc ( SMgrRelation  smgr,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool *  foundPtr 
)
static

Definition at line 956 of file bufmgr.c.

References Assert, BackendWritebackContext, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_IO_ERROR, BM_JUST_DIRTIED, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BUF_FLAG_MASK, BufferDesc::buf_id, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BUF_USAGECOUNT_ONE, BufferDescriptorGetContentLock, BufferGetLSN, BufMappingPartitionLock, BufTableDelete(), BufTableHashCode(), BufTableInsert(), BufTableLookup(), RelFileNode::dbNode, FlushBuffer(), GetBufferDescriptor, INIT_BUFFERTAG, INIT_FORKNUM, LockBufHdr(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockConditionalAcquire(), LWLockRelease(), RelFileNodeBackend::node, PinBuffer(), PinBuffer_Locked(), RelFileNode::relNode, ReservePrivateRefCountEntry(), ScheduleBufferTagForWriteback(), SMgrRelationData::smgr_rnode, RelFileNode::spcNode, StartBufferIO(), StrategyGetBuffer(), StrategyRejectBuffer(), BufferDesc::tag, UnlockBufHdr, UnpinBuffer(), and XLogNeedsFlush().

Referenced by ReadBuffer_common().

960 {
961  BufferTag newTag; /* identity of requested block */
962  uint32 newHash; /* hash value for newTag */
963  LWLock *newPartitionLock; /* buffer partition lock for it */
964  BufferTag oldTag; /* previous identity of selected buffer */
965  uint32 oldHash; /* hash value for oldTag */
966  LWLock *oldPartitionLock; /* buffer partition lock for it */
967  uint32 oldFlags;
968  int buf_id;
969  BufferDesc *buf;
970  bool valid;
971  uint32 buf_state;
972 
973  /* create a tag so we can lookup the buffer */
974  INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
975 
976  /* determine its hash code and partition lock ID */
977  newHash = BufTableHashCode(&newTag);
978  newPartitionLock = BufMappingPartitionLock(newHash);
979 
980  /* see if the block is in the buffer pool already */
981  LWLockAcquire(newPartitionLock, LW_SHARED);
982  buf_id = BufTableLookup(&newTag, newHash);
983  if (buf_id >= 0)
984  {
985  /*
986  * Found it. Now, pin the buffer so no one can steal it from the
987  * buffer pool, and check to see if the correct data has been loaded
988  * into the buffer.
989  */
990  buf = GetBufferDescriptor(buf_id);
991 
992  valid = PinBuffer(buf, strategy);
993 
994  /* Can release the mapping lock as soon as we've pinned it */
995  LWLockRelease(newPartitionLock);
996 
997  *foundPtr = true;
998 
999  if (!valid)
1000  {
1001  /*
1002  * We can only get here if (a) someone else is still reading in
1003  * the page, or (b) a previous read attempt failed. We have to
1004  * wait for any active read attempt to finish, and then set up our
1005  * own read attempt if the page is still not BM_VALID.
1006  * StartBufferIO does it all.
1007  */
1008  if (StartBufferIO(buf, true))
1009  {
1010  /*
1011  * If we get here, previous attempts to read the buffer must
1012  * have failed ... but we shall bravely try again.
1013  */
1014  *foundPtr = false;
1015  }
1016  }
1017 
1018  return buf;
1019  }
1020 
1021  /*
1022  * Didn't find it in the buffer pool. We'll have to initialize a new
1023  * buffer. Remember to unlock the mapping lock while doing the work.
1024  */
1025  LWLockRelease(newPartitionLock);
1026 
1027  /* Loop here in case we have to try another victim buffer */
1028  for (;;)
1029  {
1030  /*
1031  * Ensure, while the spinlock's not yet held, that there's a free
1032  * refcount entry.
1033  */
1034  ReservePrivateRefCountEntry();
1035 
1036  /*
1037  * Select a victim buffer. The buffer is returned with its header
1038  * spinlock still held!
1039  */
1040  buf = StrategyGetBuffer(strategy, &buf_state);
1041 
1042  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1043 
1044  /* Must copy buffer flags while we still hold the spinlock */
1045  oldFlags = buf_state & BUF_FLAG_MASK;
1046 
1047  /* Pin the buffer and then release the buffer spinlock */
1048  PinBuffer_Locked(buf);
1049 
1050  /*
1051  * If the buffer was dirty, try to write it out. There is a race
1052  * condition here, in that someone might dirty it after we released it
1053  * above, or even while we are writing it out (since our share-lock
1054  * won't prevent hint-bit updates). We will recheck the dirty bit
1055  * after re-locking the buffer header.
1056  */
1057  if (oldFlags & BM_DIRTY)
1058  {
1059  /*
1060  * We need a share-lock on the buffer contents to write it out
1061  * (else we might write invalid data, eg because someone else is
1062  * compacting the page contents while we write). We must use a
1063  * conditional lock acquisition here to avoid deadlock. Even
1064  * though the buffer was not pinned (and therefore surely not
1065  * locked) when StrategyGetBuffer returned it, someone else could
1066  * have pinned and exclusive-locked it by the time we get here. If
1067  * we try to get the lock unconditionally, we'd block waiting for
1068  * them; if they later block waiting for us, deadlock ensues.
1069  * (This has been observed to happen when two backends are both
1070  * trying to split btree index pages, and the second one just
1071  * happens to be trying to split the page the first one got from
1072  * StrategyGetBuffer.)
1073  */
1074  if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
1075  LW_SHARED))
1076  {
1077  /*
1078  * If using a nondefault strategy, and writing the buffer
1079  * would require a WAL flush, let the strategy decide whether
1080  * to go ahead and write/reuse the buffer or to choose another
1081  * victim. We need lock to inspect the page LSN, so this
1082  * can't be done inside StrategyGetBuffer.
1083  */
1084  if (strategy != NULL)
1085  {
1086  XLogRecPtr lsn;
1087 
1088  /* Read the LSN while holding buffer header lock */
1089  buf_state = LockBufHdr(buf);
1090  lsn = BufferGetLSN(buf);
1091  UnlockBufHdr(buf, buf_state);
1092 
1093  if (XLogNeedsFlush(lsn) &&
1094  StrategyRejectBuffer(strategy, buf))
1095  {
1096  /* Drop lock/pin and loop around for another buffer */
1098  UnpinBuffer(buf, true);
1099  continue;
1100  }
1101  }
1102 
1103  /* OK, do the I/O */
1104  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
1105  smgr->smgr_rnode.node.spcNode,
1106  smgr->smgr_rnode.node.dbNode,
1107  smgr->smgr_rnode.node.relNode);
1108 
1109  FlushBuffer(buf, NULL);
1110  LWLockRelease(BufferDescriptorGetContentLock(buf));
1111 
1112  ScheduleBufferTagForWriteback(&BackendWritebackContext,
1113  &buf->tag);
1114 
1115  TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
1116  smgr->smgr_rnode.node.spcNode,
1117  smgr->smgr_rnode.node.dbNode,
1118  smgr->smgr_rnode.node.relNode);
1119  }
1120  else
1121  {
1122  /*
1123  * Someone else has locked the buffer, so give it up and loop
1124  * back to get another one.
1125  */
1126  UnpinBuffer(buf, true);
1127  continue;
1128  }
1129  }
1130 
1131  /*
1132  * To change the association of a valid buffer, we'll need to have
1133  * exclusive lock on both the old and new mapping partitions.
1134  */
1135  if (oldFlags & BM_TAG_VALID)
1136  {
1137  /*
1138  * Need to compute the old tag's hashcode and partition lock ID.
1139  * XXX is it worth storing the hashcode in BufferDesc so we need
1140  * not recompute it here? Probably not.
1141  */
1142  oldTag = buf->tag;
1143  oldHash = BufTableHashCode(&oldTag);
1144  oldPartitionLock = BufMappingPartitionLock(oldHash);
1145 
1146  /*
1147  * Must lock the lower-numbered partition first to avoid
1148  * deadlocks.
1149  */
1150  if (oldPartitionLock < newPartitionLock)
1151  {
1152  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1153  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1154  }
1155  else if (oldPartitionLock > newPartitionLock)
1156  {
1157  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1158  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1159  }
1160  else
1161  {
1162  /* only one partition, only one lock */
1163  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1164  }
1165  }
1166  else
1167  {
1168  /* if it wasn't valid, we need only the new partition */
1169  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1170  /* remember we have no old-partition lock or tag */
1171  oldPartitionLock = NULL;
1172  /* keep the compiler quiet about uninitialized variables */
1173  oldHash = 0;
1174  }
1175 
1176  /*
1177  * Try to make a hashtable entry for the buffer under its new tag.
1178  * This could fail because while we were writing someone else
1179  * allocated another buffer for the same block we want to read in.
1180  * Note that we have not yet removed the hashtable entry for the old
1181  * tag.
1182  */
1183  buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
1184 
1185  if (buf_id >= 0)
1186  {
1187  /*
1188  * Got a collision. Someone has already done what we were about to
1189  * do. We'll just handle this as if it were found in the buffer
1190  * pool in the first place. First, give up the buffer we were
1191  * planning to use.
1192  */
1193  UnpinBuffer(buf, true);
1194 
1195  /* Can give up that buffer's mapping partition lock now */
1196  if (oldPartitionLock != NULL &&
1197  oldPartitionLock != newPartitionLock)
1198  LWLockRelease(oldPartitionLock);
1199 
1200  /* remaining code should match code at top of routine */
1201 
1202  buf = GetBufferDescriptor(buf_id);
1203 
1204  valid = PinBuffer(buf, strategy);
1205 
1206  /* Can release the mapping lock as soon as we've pinned it */
1207  LWLockRelease(newPartitionLock);
1208 
1209  *foundPtr = true;
1210 
1211  if (!valid)
1212  {
1213  /*
1214  * We can only get here if (a) someone else is still reading
1215  * in the page, or (b) a previous read attempt failed. We
1216  * have to wait for any active read attempt to finish, and
1217  * then set up our own read attempt if the page is still not
1218  * BM_VALID. StartBufferIO does it all.
1219  */
1220  if (StartBufferIO(buf, true))
1221  {
1222  /*
1223  * If we get here, previous attempts to read the buffer
1224  * must have failed ... but we shall bravely try again.
1225  */
1226  *foundPtr = false;
1227  }
1228  }
1229 
1230  return buf;
1231  }
1232 
1233  /*
1234  * Need to lock the buffer header too in order to change its tag.
1235  */
1236  buf_state = LockBufHdr(buf);
1237 
1238  /*
1239  * Somebody could have pinned or re-dirtied the buffer while we were
1240  * doing the I/O and making the new hashtable entry. If so, we can't
1241  * recycle this buffer; we must undo everything we've done and start
1242  * over with a new victim buffer.
1243  */
1244  oldFlags = buf_state & BUF_FLAG_MASK;
1245  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1 && !(oldFlags & BM_DIRTY))
1246  break;
1247 
1248  UnlockBufHdr(buf, buf_state);
1249  BufTableDelete(&newTag, newHash);
1250  if (oldPartitionLock != NULL &&
1251  oldPartitionLock != newPartitionLock)
1252  LWLockRelease(oldPartitionLock);
1253  LWLockRelease(newPartitionLock);
1254  UnpinBuffer(buf, true);
1255  }
1256 
1257  /*
1258  * Okay, it's finally safe to rename the buffer.
1259  *
1260  * Clearing BM_VALID here is necessary, clearing the dirtybits is just
1261  * paranoia. We also reset the usage_count since any recency of use of
1262  * the old content is no longer relevant. (The usage_count starts out at
1263  * 1 so that the buffer can survive one clock-sweep pass.)
1264  *
1265  * Make sure BM_PERMANENT is set for buffers that must be written at every
1266  * checkpoint. Unlogged buffers only need to be written at shutdown
1267  * checkpoints, except for their "init" forks, which need to be treated
1268  * just like permanent relations.
1269  */
1270  buf->tag = newTag;
1271  buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
1272  BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
1273  BUF_USAGECOUNT_MASK);
1274  if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1275  buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
1276  else
1277  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1278 
1279  UnlockBufHdr(buf, buf_state);
1280 
1281  if (oldPartitionLock != NULL)
1282  {
1283  BufTableDelete(&oldTag, oldHash);
1284  if (oldPartitionLock != newPartitionLock)
1285  LWLockRelease(oldPartitionLock);
1286  }
1287 
1288  LWLockRelease(newPartitionLock);
1289 
1290  /*
1291  * Buffer contents are currently invalid. Try to get the io_in_progress
1292  * lock. If StartBufferIO returns false, then someone else managed to
1293  * read it before we did, so there's nothing left for BufferAlloc() to do.
1294  */
1295  if (StartBufferIO(buf, true))
1296  *foundPtr = false;
1297  else
1298  *foundPtr = true;
1299 
1300  return buf;
1301 }
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
Definition: bufmgr.c:1540
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
Definition: freelist.c:201
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:149
Definition: lwlock.h:32
#define BM_PERMANENT
Definition: buf_internals.h:66
#define BufMappingPartitionLock(hashcode)
#define BM_TAG_VALID
Definition: buf_internals.h:60
bool XLogNeedsFlush(XLogRecPtr record)
Definition: xlog.c:3163
#define BM_CHECKPOINT_NEEDED
Definition: buf_internals.h:65
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:79
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:91
#define BM_DIRTY
Definition: buf_internals.h:58
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln)
Definition: bufmgr.c:2644
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1727
static bool StartBufferIO(BufferDesc *buf, bool forInput)
Definition: bufmgr.c:4024
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition: buf_table.c:119
void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
Definition: bufmgr.c:4393
#define BUF_FLAG_MASK
Definition: buf_internals.h:45
RelFileNodeBackend smgr_rnode
Definition: smgr.h:42
WritebackContext BackendWritebackContext
Definition: buf_init.c:23
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1295
#define BUF_USAGECOUNT_ONE
Definition: buf_internals.h:43
#define GetBufferDescriptor(id)
#define BM_JUST_DIRTIED
Definition: buf_internals.h:63
unsigned int uint32
Definition: c.h:367
static void UnpinBuffer(BufferDesc *buf, bool fixOwner)
Definition: bufmgr.c:1663
#define BM_VALID
Definition: buf_internals.h:59
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf)
Definition: freelist.c:686
RelFileNode node
Definition: relfilenode.h:74
#define BufferDescriptorGetContentLock(bdesc)
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4241
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:738
#define INIT_BUFFERTAG(a, xx_rnode, xx_forkNum, xx_blockNum)
static void PinBuffer_Locked(BufferDesc *buf)
Definition: bufmgr.c:1625
#define BUF_USAGECOUNT_MASK
Definition: buf_internals.h:42
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1123
#define BM_IO_ERROR
Definition: buf_internals.h:62
BufferTag tag
#define UnlockBufHdr(desc, s)
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:206
#define BufferGetLSN(bufHdr)
Definition: bufmgr.c:60
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:48

◆ BufferGetBlockNumber()

BlockNumber BufferGetBlockNumber ( Buffer  buffer)

Definition at line 2584 of file bufmgr.c.

References Assert, buftag::blockNum, BufferIsLocal, BufferIsPinned, GetBufferDescriptor, GetLocalBufferDescriptor, and BufferDesc::tag.

Referenced by _bt_check_unique(), _bt_checkpage(), _bt_delitems_vacuum(), _bt_doinsert(), _bt_endpoint(), _bt_finish_split(), _bt_first(), _bt_getroot(), _bt_insert_parent(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_moveright(), _bt_newroot(), _bt_pagedel(), _bt_readnextpage(), _bt_readpage(), _bt_restore_meta(), _bt_search(), _bt_split(), _bt_unlink_halfdead_page(), _bt_walk_left(), _hash_addovflpage(), _hash_checkpage(), _hash_doinsert(), _hash_first(), _hash_freeovflpage(), _hash_getnewbuf(), _hash_readnext(), _hash_readpage(), _hash_splitbucket(), allocNewBuffer(), blinsert(), BloomInitMetapage(), brin_doinsert(), brin_doupdate(), brin_getinsertbuffer(), brin_initialize_empty_new_buffer(), brin_page_cleanup(), brin_xlog_insert_update(), brinbuild(), brinGetTupleForHeapBlock(), createPostingTree(), dataBeginPlaceToPageLeaf(), dataPrepareDownlink(), doPickSplit(), entryPrepareDownlink(), fill_seq_with_data(), ginEntryInsert(), ginFindParents(), ginFinishSplit(), ginPlaceToPage(), ginRedoDeleteListPages(), ginRedoUpdateMetapage(), ginScanToDelete(), gistbufferinginserttuples(), gistbuild(), gistcheckpage(), gistdeletepage(), gistformdownlink(), gistinserttuples(), gistMemorizeAllDownlinks(), gistplacetopage(), gistRelocateBuildBuffersOnSplit(), gistScanPage(), hash_xlog_add_ovfl_page(), heap_delete(), heap_hot_search_buffer(), heap_insert(), heap_multi_insert(), heap_page_is_all_visible(), heap_prune_chain(), heap_update(), heap_xlog_confirm(), heap_xlog_lock(), makeSublist(), moveLeafs(), moveRightIfItNeeded(), pgstathashindex(), ReadBufferBI(), RelationAddExtraBlocks(), RelationGetBufferForTuple(), RelationPutHeapTuple(), revmap_get_buffer(), revmap_physical_extend(), spgAddNodeAction(), spgbuild(), spgdoinsert(), SpGistSetLastUsedPage(), spgSplitNodeAction(), spgWalk(), startScanEntry(), terminate_brin_buildstate(), vacuumLeafPage(), visibilitymap_clear(), visibilitymap_get_status(), visibilitymap_pin(), visibilitymap_pin_ok(), visibilitymap_set(), and XLogReadBufferExtended().

2585 {
2586  BufferDesc *bufHdr;
2587 
2588  Assert(BufferIsPinned(buffer));
2589 
2590  if (BufferIsLocal(buffer))
2591  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2592  else
2593  bufHdr = GetBufferDescriptor(buffer - 1);
2594 
2595  /* pinned, so OK to read tag without spinlock */
2596  return bufHdr->tag.blockNum;
2597 }
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:439
#define GetLocalBufferDescriptor(id)
#define GetBufferDescriptor(id)
#define Assert(condition)
Definition: c.h:738
#define BufferIsLocal(buffer)
Definition: buf.h:37
BlockNumber blockNum
Definition: buf_internals.h:94
BufferTag tag

◆ BufferGetLSNAtomic()

XLogRecPtr BufferGetLSNAtomic ( Buffer  buffer)

Definition at line 2847 of file bufmgr.c.

References Assert, BufferGetPage, BufferIsLocal, BufferIsPinned, BufferIsValid, GetBufferDescriptor, LockBufHdr(), PageGetLSN, UnlockBufHdr, and XLogHintBitIsNeeded.

Referenced by _bt_killitems(), _bt_readpage(), gistdoinsert(), gistFindPath(), gistkillitems(), gistScanPage(), SetHintBits(), and XLogSaveBufferForHint().

2848 {
2849  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
2850  char *page = BufferGetPage(buffer);
2851  XLogRecPtr lsn;
2852  uint32 buf_state;
2853 
2854  /*
2855  * If we don't need locking for correctness, fastpath out.
2856  */
2857  if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
2858  return PageGetLSN(page);
2859 
2860  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2861  Assert(BufferIsValid(buffer));
2862  Assert(BufferIsPinned(buffer));
2863 
2864  buf_state = LockBufHdr(bufHdr);
2865  lsn = PageGetLSN(page);
2866  UnlockBufHdr(bufHdr, buf_state);
2867 
2868  return lsn;
2869 }
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:439
#define GetBufferDescriptor(id)
unsigned int uint32
Definition: c.h:367
#define BufferGetPage(buffer)
Definition: bufmgr.h:160
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4241
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:738
#define BufferIsLocal(buffer)
Definition: buf.h:37
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114
#define PageGetLSN(page)
Definition: bufpage.h:366
#define UnlockBufHdr(desc, s)
#define XLogHintBitIsNeeded()
Definition: xlog.h:193

◆ BufferGetTag()

void BufferGetTag ( Buffer  buffer,
RelFileNode *  rnode,
ForkNumber *  forknum,
BlockNumber *  blknum 
)

Definition at line 2605 of file bufmgr.c.

References Assert, buftag::blockNum, BufferIsLocal, BufferIsPinned, buftag::forkNum, GetBufferDescriptor, GetLocalBufferDescriptor, buftag::rnode, and BufferDesc::tag.

Referenced by fsm_search_avail(), ginRedoInsertEntry(), log_newpage_buffer(), ResolveCminCmaxDuringDecoding(), XLogRegisterBuffer(), and XLogSaveBufferForHint().

2607 {
2608  BufferDesc *bufHdr;
2609 
2610  /* Do the same checks as BufferGetBlockNumber. */
2611  Assert(BufferIsPinned(buffer));
2612 
2613  if (BufferIsLocal(buffer))
2614  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
2615  else
2616  bufHdr = GetBufferDescriptor(buffer - 1);
2617 
2618  /* pinned, so OK to read tag without spinlock */
2619  *rnode = bufHdr->tag.rnode;
2620  *forknum = bufHdr->tag.forkNum;
2621  *blknum = bufHdr->tag.blockNum;
2622 }
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:439
ForkNumber forkNum
Definition: buf_internals.h:93
#define GetLocalBufferDescriptor(id)
#define GetBufferDescriptor(id)
#define Assert(condition)
Definition: c.h:738
#define BufferIsLocal(buffer)
Definition: buf.h:37
BlockNumber blockNum
Definition: buf_internals.h:94
RelFileNode rnode
Definition: buf_internals.h:92
BufferTag tag
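
A hedged caller-side sketch (the function name is an assumption): because the caller must already hold a pin, the tag can be copied out without taking the buffer header spinlock:

    /* Illustrative only: report a pinned buffer's identity. */
    static void
    report_buffer_identity(Buffer buf)
    {
        RelFileNode rnode;
        ForkNumber  forknum;
        BlockNumber blkno;

        BufferGetTag(buf, &rnode, &forknum, &blkno);
        elog(DEBUG1, "buffer %d holds block %u of %u/%u/%u, fork %d",
             buf, blkno, rnode.spcNode, rnode.dbNode, rnode.relNode,
             (int) forknum);
    }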

◆ BufferIsPermanent()

bool BufferIsPermanent ( Buffer  buffer)

Definition at line 2817 of file bufmgr.c.

References Assert, BM_PERMANENT, BufferIsLocal, BufferIsPinned, BufferIsValid, GetBufferDescriptor, pg_atomic_read_u32(), and BufferDesc::state.

Referenced by SetHintBits().

2818 {
2819  BufferDesc *bufHdr;
2820 
2821  /* Local buffers are used only for temp relations. */
2822  if (BufferIsLocal(buffer))
2823  return false;
2824 
2825  /* Make sure we've got a real buffer, and that we hold a pin on it. */
2826  Assert(BufferIsValid(buffer));
2827  Assert(BufferIsPinned(buffer));
2828 
2829  /*
2830  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
2831  * need not bother with the buffer header spinlock. Even if someone else
2832  * changes the buffer header state while we're doing this, the state is
2833  * changed atomically, so we'll read the old value or the new value, but
2834  * not random garbage.
2835  */
2836  bufHdr = GetBufferDescriptor(buffer - 1);
2837  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
2838 }
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:439
#define BM_PERMANENT
Definition: buf_internals.h:66
#define GetBufferDescriptor(id)
#define Assert(condition)
Definition: c.h:738
#define BufferIsLocal(buffer)
Definition: buf.h:37
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114
pg_atomic_uint32 state
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:241
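
Together with BufferGetLSNAtomic(), this function exists chiefly for the hint-bit path in SetHintBits(). A condensed restatement of that decision (a sketch, not the verbatim heapam code):

    /* A hint bit may be set unless all three hold: the page is WAL-protected
     * (permanent), the commit record is not yet flushed, and the page LSN
     * does not already cover it (so nothing forces the flush first). */
    static inline bool
    hint_bit_is_safe(Buffer buffer, XLogRecPtr commitLSN)
    {
        if (!BufferIsPermanent(buffer))
            return true;        /* non-permanent pages: torn hints are harmless */
        if (!XLogNeedsFlush(commitLSN))
            return true;        /* commit record already durable */
        return BufferGetLSNAtomic(buffer) >= commitLSN;
    }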

◆ BufferSync()

static void BufferSync ( int  flags)
static

Definition at line 1747 of file bufmgr.c.

References Assert, BgWriterStats, binaryheap_add_unordered(), binaryheap_allocate(), binaryheap_build(), binaryheap_empty, binaryheap_first(), binaryheap_free(), binaryheap_remove_first(), binaryheap_replace_first(), buftag::blockNum, CkptSortItem::blockNum, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_PERMANENT, CkptSortItem::buf_id, BUF_WRITTEN, CHECKPOINT_END_OF_RECOVERY, checkpoint_flush_after, CHECKPOINT_FLUSH_ALL, CHECKPOINT_IS_SHUTDOWN, CheckpointStats, CheckpointWriteDelay(), ckpt_buforder_comparator(), CheckpointStatsData::ckpt_bufs_written, CkptBufferIds, CurrentResourceOwner, DatumGetPointer, buftag::forkNum, CkptSortItem::forkNum, GetBufferDescriptor, i, CkptTsStatus::index, InvalidOid, IssuePendingWritebacks(), LockBufHdr(), PgStat_MsgBgWriter::m_buf_written_checkpoints, NBuffers, CkptTsStatus::num_scanned, CkptTsStatus::num_to_scan, palloc(), pfree(), pg_atomic_read_u32(), PointerGetDatum, ProcessProcSignalBarrier(), ProcSignalBarrierPending, CkptTsStatus::progress, CkptTsStatus::progress_slice, qsort, RelFileNode::relNode, CkptSortItem::relNode, repalloc(), ResourceOwnerEnlargeBuffers(), buftag::rnode, RelFileNode::spcNode, BufferDesc::state, SyncOneBuffer(), BufferDesc::tag, ts_ckpt_progress_comparator(), CkptTsStatus::tsId, CkptSortItem::tsId, UnlockBufHdr, and WritebackContextInit().

Referenced by CheckPointBuffers().

1748 {
1749  uint32 buf_state;
1750  int buf_id;
1751  int num_to_scan;
1752  int num_spaces;
1753  int num_processed;
1754  int num_written;
1755  CkptTsStatus *per_ts_stat = NULL;
1756  Oid last_tsid;
1757  binaryheap *ts_heap;
1758  int i;
1759  int mask = BM_DIRTY;
1760  WritebackContext wb_context;
1761 
1762  /* Make sure we can handle the pin inside SyncOneBuffer */
1763  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1764 
1765  /*
1766  * Unless this is a shutdown checkpoint or we have been explicitly told,
1767  * we write only permanent, dirty buffers. But at shutdown or end of
1768  * recovery, we write all dirty buffers.
1769  */
1770  if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
1771  CHECKPOINT_FLUSH_ALL))))
1772  mask |= BM_PERMANENT;
1773 
1774  /*
1775  * Loop over all buffers, and mark the ones that need to be written with
1776  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
1777  * can estimate how much work needs to be done.
1778  *
1779  * This allows us to write only those pages that were dirty when the
1780  * checkpoint began, and not those that get dirtied while it proceeds.
1781  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
1782  * later in this function, or by normal backends or the bgwriter cleaning
1783  * scan, the flag is cleared. Any buffer dirtied after this point won't
1784  * have the flag set.
1785  *
1786  * Note that if we fail to write some buffer, we may leave buffers with
1787  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
1788  * certainly need to be written for the next checkpoint attempt, too.
1789  */
1790  num_to_scan = 0;
1791  for (buf_id = 0; buf_id < NBuffers; buf_id++)
1792  {
1793  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
1794 
1795  /*
1796  * Header spinlock is enough to examine BM_DIRTY, see comment in
1797  * SyncOneBuffer.
1798  */
1799  buf_state = LockBufHdr(bufHdr);
1800 
1801  if ((buf_state & mask) == mask)
1802  {
1803  CkptSortItem *item;
1804 
1805  buf_state |= BM_CHECKPOINT_NEEDED;
1806 
1807  item = &CkptBufferIds[num_to_scan++];
1808  item->buf_id = buf_id;
1809  item->tsId = bufHdr->tag.rnode.spcNode;
1810  item->relNode = bufHdr->tag.rnode.relNode;
1811  item->forkNum = bufHdr->tag.forkNum;
1812  item->blockNum = bufHdr->tag.blockNum;
1813  }
1814 
1815  UnlockBufHdr(bufHdr, buf_state);
1816 
1817  /* Check for barrier events in case NBuffers is large. */
1818  if (ProcSignalBarrierPending)
1819  ProcessProcSignalBarrier();
1820  }
1821 
1822  if (num_to_scan == 0)
1823  return; /* nothing to do */
1824 
1825  WritebackContextInit(&wb_context, &checkpoint_flush_after);
1826 
1827  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
1828 
1829  /*
1830  * Sort buffers that need to be written to reduce the likelihood of random
1831  * IO. The sorting is also important for the implementation of balancing
1832  * writes between tablespaces. Without balancing writes we'd potentially
1833  * end up writing to the tablespaces one-by-one; possibly overloading the
1834  * underlying system.
1835  */
1836  qsort(CkptBufferIds, num_to_scan, sizeof(CkptSortItem),
1837  ckpt_buforder_comparator);
1838 
1839  num_spaces = 0;
1840 
1841  /*
1842  * Allocate progress status for each tablespace with buffers that need to
1843  * be flushed. This requires the to-be-flushed array to be sorted.
1844  */
1845  last_tsid = InvalidOid;
1846  for (i = 0; i < num_to_scan; i++)
1847  {
1848  CkptTsStatus *s;
1849  Oid cur_tsid;
1850 
1851  cur_tsid = CkptBufferIds[i].tsId;
1852 
1853  /*
1854  * Grow array of per-tablespace status structs, every time a new
1855  * tablespace is found.
1856  */
1857  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
1858  {
1859  Size sz;
1860 
1861  num_spaces++;
1862 
1863  /*
1864  * Not worth adding grow-by-power-of-2 logic here - even with a
1865  * few hundred tablespaces this should be fine.
1866  */
1867  sz = sizeof(CkptTsStatus) * num_spaces;
1868 
1869  if (per_ts_stat == NULL)
1870  per_ts_stat = (CkptTsStatus *) palloc(sz);
1871  else
1872  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
1873 
1874  s = &per_ts_stat[num_spaces - 1];
1875  memset(s, 0, sizeof(*s));
1876  s->tsId = cur_tsid;
1877 
1878  /*
1879  * The first buffer in this tablespace. As CkptBufferIds is sorted
1880  * by tablespace all (s->num_to_scan) buffers in this tablespace
1881  * will follow afterwards.
1882  */
1883  s->index = i;
1884 
1885  /*
1886  * progress_slice will be determined once we know how many buffers
1887  * are in each tablespace, i.e. after this loop.
1888  */
1889 
1890  last_tsid = cur_tsid;
1891  }
1892  else
1893  {
1894  s = &per_ts_stat[num_spaces - 1];
1895  }
1896 
1897  s->num_to_scan++;
1898 
1899  /* Check for barrier events. */
1900  if (ProcSignalBarrierPending)
1901  ProcessProcSignalBarrier();
1902  }
1903 
1904  Assert(num_spaces > 0);
1905 
1906  /*
1907  * Build a min-heap over the write-progress in the individual tablespaces,
1908  * and compute how large a portion of the total progress a single
1909  * processed buffer is.
1910  */
1911  ts_heap = binaryheap_allocate(num_spaces,
1912  ts_ckpt_progress_comparator,
1913  NULL);
1914 
1915  for (i = 0; i < num_spaces; i++)
1916  {
1917  CkptTsStatus *ts_stat = &per_ts_stat[i];
1918 
1919  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
1920 
1921  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
1922  }
1923 
1924  binaryheap_build(ts_heap);
1925 
1926  /*
1927  * Iterate through to-be-checkpointed buffers and write the ones (still)
1928  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
1929  * tablespaces; otherwise the sorting would lead to only one tablespace
1930  * receiving writes at a time, making inefficient use of the hardware.
1931  */
1932  num_processed = 0;
1933  num_written = 0;
1934  while (!binaryheap_empty(ts_heap))
1935  {
1936  BufferDesc *bufHdr = NULL;
1937  CkptTsStatus *ts_stat = (CkptTsStatus *)
1938  DatumGetPointer(binaryheap_first(ts_heap));
1939 
1940  buf_id = CkptBufferIds[ts_stat->index].buf_id;
1941  Assert(buf_id != -1);
1942 
1943  bufHdr = GetBufferDescriptor(buf_id);
1944 
1945  num_processed++;
1946 
1947  /*
1948  * We don't need to acquire the lock here, because we're only looking
1949  * at a single bit. It's possible that someone else writes the buffer
1950  * and clears the flag right after we check, but that doesn't matter
1951  * since SyncOneBuffer will then do nothing. However, there is a
1952  * further race condition: it's conceivable that between the time we
1953  * examine the bit here and the time SyncOneBuffer acquires the lock,
1954  * someone else not only wrote the buffer but replaced it with another
1955  * page and dirtied it. In that improbable case, SyncOneBuffer will
1956  * write the buffer though we didn't need to. It doesn't seem worth
1957  * guarding against this, though.
1958  */
1959  if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
1960  {
1961  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
1962  {
1963  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
1964  BgWriterStats.m_buf_written_checkpoints++;
1965  num_written++;
1966  }
1967  }
1968 
1969  /*
1970  * Measure progress independent of actually having to flush the buffer
1971  * - otherwise writing become unbalanced.
1972  */
1973  ts_stat->progress += ts_stat->progress_slice;
1974  ts_stat->num_scanned++;
1975  ts_stat->index++;
1976 
1977  /* Have all the buffers from the tablespace been processed? */
1978  if (ts_stat->num_scanned == ts_stat->num_to_scan)
1979  {
1980  binaryheap_remove_first(ts_heap);
1981  }
1982  else
1983  {
1984  /* update heap with the new progress */
1985  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
1986  }
1987 
1988  /*
1989  * Sleep to throttle our I/O rate.
1990  *
1991  * (This will check for barrier events even if it doesn't sleep.)
1992  */
1993  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
1994  }
1995 
1996  /* issue all pending flushes */
1997  IssuePendingWritebacks(&wb_context);
1998 
1999  pfree(per_ts_stat);
2000  per_ts_stat = NULL;
2001  binaryheap_free(ts_heap);
2002 
2003  /*
2004  * Update checkpoint statistics. As noted above, this doesn't include
2005  * buffers written by other backends or bgwriter scan.
2006  */
2007  CheckpointStats.ckpt_bufs_written += num_written;
2008 
2009  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
2010 }
void CheckpointWriteDelay(int flags, double progress)
Definition: checkpointer.c:663
PgStat_Counter m_buf_written_checkpoints
Definition: pgstat.h:430
#define BM_PERMANENT
Definition: buf_internals.h:66
#define CHECKPOINT_FLUSH_ALL
Definition: xlog.h:217
Oid tsId
Definition: bufmgr.c:88
#define binaryheap_empty(h)
Definition: binaryheap.h:52
ForkNumber forkNum
Definition: buf_internals.h:93
#define PointerGetDatum(X)
Definition: postgres.h:556
ResourceOwner CurrentResourceOwner
Definition: resowner.c:142
#define BM_CHECKPOINT_NEEDED
Definition: buf_internals.h:65
void ProcessProcSignalBarrier(void)
Definition: procsignal.c:429
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition: bufmgr.c:4358
PgStat_MsgBgWriter BgWriterStats
Definition: pgstat.c:142
int checkpoint_flush_after
Definition: bufmgr.c:147
void binaryheap_replace_first(binaryheap *heap, Datum d)
Definition: binaryheap.c:204
unsigned int Oid
Definition: postgres_ext.h:31
#define BM_DIRTY
Definition: buf_internals.h:58
void binaryheap_add_unordered(binaryheap *heap, Datum d)
Definition: binaryheap.c:110
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition: bufmgr.c:2327
void IssuePendingWritebacks(WritebackContext *context)
Definition: bufmgr.c:4427
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition: bufmgr.c:4381
void pfree(void *pointer)
Definition: mcxt.c:1056
double float8
Definition: c.h:491
Datum binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:159
int num_to_scan
Definition: bufmgr.c:101
float8 progress_slice
Definition: bufmgr.c:98
int index
Definition: bufmgr.c:106
float8 progress
Definition: bufmgr.c:97
static int ckpt_buforder_comparator(const void *pa, const void *pb)
Definition: bufmgr.c:4324
#define CHECKPOINT_END_OF_RECOVERY
Definition: xlog.h:213
#define GetBufferDescriptor(id)
unsigned int uint32
Definition: c.h:367
#define BUF_WRITTEN
Definition: bufmgr.c:67
void ResourceOwnerEnlargeBuffers(ResourceOwner owner)
Definition: resowner.c:917
int ckpt_bufs_written
Definition: xlog.h:242
BlockNumber blockNum
#define InvalidOid
Definition: postgres_ext.h:36
void binaryheap_build(binaryheap *heap)
Definition: binaryheap.c:126
const symbol * s
Definition: header.h:17
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4241
#define Assert(condition)
Definition: c.h:738
volatile sig_atomic_t ProcSignalBarrierPending
Definition: globals.c:35
CheckpointStatsData CheckpointStats
Definition: xlog.c:184
CkptSortItem * CkptBufferIds
Definition: buf_init.c:24
size_t Size
Definition: c.h:466
void binaryheap_free(binaryheap *heap)
Definition: binaryheap.c:69
BlockNumber blockNum
Definition: buf_internals.h:94
RelFileNode rnode
Definition: buf_internals.h:92
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1069
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition: binaryheap.c:33
#define DatumGetPointer(X)
Definition: postgres.h:549
BufferTag tag
void * palloc(Size size)
Definition: mcxt.c:949
#define UnlockBufHdr(desc, s)
int i
int NBuffers
Definition: globals.c:131
pg_atomic_uint32 state
Datum binaryheap_remove_first(binaryheap *heap)
Definition: binaryheap.c:174
int num_scanned
Definition: bufmgr.c:103
#define qsort(a, b, c, d)
Definition: port.h:479
ForkNumber forkNum
struct CkptTsStatus CkptTsStatus
#define CHECKPOINT_IS_SHUTDOWN
Definition: xlog.h:212
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:241

◆ buffertag_comparator()

static int buffertag_comparator ( const void *  p1,
const void *  p2 
)
static

Definition at line 4293 of file bufmgr.c.

References buftag::blockNum, buftag::forkNum, buftag::rnode, and rnode_comparator().

Referenced by IssuePendingWritebacks().

4294 {
4295  const BufferTag *ba = (const BufferTag *) a;
4296  const BufferTag *bb = (const BufferTag *) b;
4297  int ret;
4298 
4299  ret = rnode_comparator(&ba->rnode, &bb->rnode);
4300 
4301  if (ret != 0)
4302  return ret;
4303 
4304  if (ba->forkNum < bb->forkNum)
4305  return -1;
4306  if (ba->forkNum > bb->forkNum)
4307  return 1;
4308 
4309  if (ba->blockNum < bb->blockNum)
4310  return -1;
4311  if (ba->blockNum > bb->blockNum)
4312  return 1;
4313 
4314  return 0;
4315 }
ForkNumber forkNum
Definition: buf_internals.h:93
static int rnode_comparator(const void *p1, const void *p2)
Definition: bufmgr.c:4214
BlockNumber blockNum
Definition: buf_internals.h:94
RelFileNode rnode
Definition: buf_internals.h:92

◆ BufmgrCommit()

void BufmgrCommit ( void  )

Definition at line 2570 of file bufmgr.c.

Referenced by PrepareTransaction(), and RecordTransactionCommit().

2571 {
2572  /* Nothing to do in bufmgr anymore... */
2573 }

◆ CheckForBufferLeaks()

static void CheckForBufferLeaks ( void  )
static

Definition at line 2469 of file bufmgr.c.

References Assert, PrivateRefCountEntry::buffer, hash_seq_init(), hash_seq_search(), i, InvalidBuffer, PrintBufferLeakWarning(), PrivateRefCountArray, PrivateRefCountOverflowed, and REFCOUNT_ARRAY_ENTRIES.

Referenced by AtEOXact_Buffers(), and AtProcExit_Buffers().

2470 {
2471 #ifdef USE_ASSERT_CHECKING
2472  int RefCountErrors = 0;
2473  PrivateRefCountEntry *res;
2474  int i;
2475 
2476  /* check the array */
2477  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
2478  {
2479  res = &PrivateRefCountArray[i];
2480 
2481  if (res->buffer != InvalidBuffer)
2482  {
2484  RefCountErrors++;
2485  }
2486  }
2487 
2488  /* if necessary search the hash */
2490  {
2491  HASH_SEQ_STATUS hstat;
2492 
2494  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
2495  {
2497  RefCountErrors++;
2498  }
2499 
2500  }
2501 
2502  Assert(RefCountErrors == 0);
2503 #endif
2504 }
void PrintBufferLeakWarning(Buffer buffer)
Definition: bufmgr.c:2510
static int32 PrivateRefCountOverflowed
Definition: bufmgr.c:190
#define InvalidBuffer
Definition: buf.h:25
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition: bufmgr.c:188
#define Assert(condition)
Definition: c.h:738
#define REFCOUNT_ARRAY_ENTRIES
Definition: bufmgr.c:79
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1390
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1380
static HTAB * PrivateRefCountHash
Definition: bufmgr.c:189
int i

◆ CheckPointBuffers()

void CheckPointBuffers ( int  flags)

Definition at line 2553 of file bufmgr.c.

References BufferSync(), CheckpointStats, CheckpointStatsData::ckpt_sync_end_t, CheckpointStatsData::ckpt_sync_t, CheckpointStatsData::ckpt_write_t, GetCurrentTimestamp(), and ProcessSyncRequests().

Referenced by CheckPointGuts().

2554 {
2555  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
2557  BufferSync(flags);
2559  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
2562  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
2563 }
void ProcessSyncRequests(void)
Definition: sync.c:236
TimestampTz ckpt_sync_end_t
Definition: xlog.h:239
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1583
static void BufferSync(int flags)
Definition: bufmgr.c:1747
CheckpointStatsData CheckpointStats
Definition: xlog.c:184
TimestampTz ckpt_write_t
Definition: xlog.h:237
TimestampTz ckpt_sync_t
Definition: xlog.h:238

◆ ckpt_buforder_comparator()

static int ckpt_buforder_comparator ( const void *  pa,
const void *  pb 
)
static

Definition at line 4324 of file bufmgr.c.

References CkptSortItem::blockNum, CkptSortItem::forkNum, CkptSortItem::relNode, and CkptSortItem::tsId.

Referenced by BufferSync().

4325 {
4326  const CkptSortItem *a = (const CkptSortItem *) pa;
4327  const CkptSortItem *b = (const CkptSortItem *) pb;
4328 
4329  /* compare tablespace */
4330  if (a->tsId < b->tsId)
4331  return -1;
4332  else if (a->tsId > b->tsId)
4333  return 1;
4334  /* compare relation */
4335  if (a->relNode < b->relNode)
4336  return -1;
4337  else if (a->relNode > b->relNode)
4338  return 1;
4339  /* compare fork */
4340  else if (a->forkNum < b->forkNum)
4341  return -1;
4342  else if (a->forkNum > b->forkNum)
4343  return 1;
4344  /* compare block number */
4345  else if (a->blockNum < b->blockNum)
4346  return -1;
4347  else if (a->blockNum > b->blockNum)
4348  return 1;
4349  /* equal page IDs are unlikely, but not impossible */
4350  return 0;
4351 }
BlockNumber blockNum
ForkNumber forkNum

◆ ConditionalLockBuffer()

bool ConditionalLockBuffer ( Buffer  buffer)

Definition at line 3699 of file bufmgr.c.

References Assert, buf, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsValid, GetBufferDescriptor, LW_EXCLUSIVE, and LWLockConditionalAcquire().

Referenced by _bt_getbuf(), _bt_search_insert(), BloomNewBuffer(), ConditionalLockBufferForCleanup(), GinNewBuffer(), gistNewBuffer(), RelationGetBufferForTuple(), spgdoinsert(), SpGistGetBuffer(), SpGistNewBuffer(), and SpGistUpdateMetaPage().

3700 {
3701  BufferDesc *buf;
3702 
3703  Assert(BufferIsValid(buffer));
3704  if (BufferIsLocal(buffer))
3705  return true; /* act as though we got it */
3706 
3707  buf = GetBufferDescriptor(buffer - 1);
3708 
3710  LW_EXCLUSIVE);
3711 }
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1295
static char * buf
Definition: pg_test_fsync.c:67
#define GetBufferDescriptor(id)
#define BufferDescriptorGetContentLock(bdesc)
#define Assert(condition)
Definition: c.h:738
#define BufferIsLocal(buffer)
Definition: buf.h:37
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114

◆ ConditionalLockBufferForCleanup()

bool ConditionalLockBufferForCleanup ( Buffer  buffer)

Definition at line 3867 of file bufmgr.c.

References Assert, BUF_STATE_GET_REFCOUNT, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid, ConditionalLockBuffer(), GetBufferDescriptor, GetPrivateRefCount(), LocalRefCount, LockBuffer(), LockBufHdr(), PrivateRefCountEntry::refcount, and UnlockBufHdr.

Referenced by _hash_finish_split(), _hash_getbuf_with_condlock_cleanup(), heap_page_prune_opt(), lazy_scan_heap(), and lazy_vacuum_heap().

3868 {
3869  BufferDesc *bufHdr;
3870  uint32 buf_state,
3871  refcount;
3872 
3873  Assert(BufferIsValid(buffer));
3874 
3875  if (BufferIsLocal(buffer))
3876  {
3877  refcount = LocalRefCount[-buffer - 1];
3878  /* There should be exactly one pin */
3879  Assert(refcount > 0);
3880  if (refcount != 1)
3881  return false;
3882  /* Nobody else to wait for */
3883  return true;
3884  }
3885 
3886  /* There should be exactly one local pin */
3887  refcount = GetPrivateRefCount(buffer);
3888  Assert(refcount);
3889  if (refcount != 1)
3890  return false;
3891 
3892  /* Try to acquire lock */
3893  if (!ConditionalLockBuffer(buffer))
3894  return false;
3895 
3896  bufHdr = GetBufferDescriptor(buffer - 1);
3897  buf_state = LockBufHdr(bufHdr);
3898  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
3899 
3900  Assert(refcount > 0);
3901  if (refcount == 1)
3902  {
3903  /* Successfully acquired exclusive lock with pincount 1 */
3904  UnlockBufHdr(bufHdr, buf_state);
3905  return true;
3906  }
3907 
3908  /* Failed, so release the lock */
3909  UnlockBufHdr(bufHdr, buf_state);
3910  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3911  return false;
3912 }
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:87
#define GetBufferDescriptor(id)
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:378
unsigned int uint32
Definition: c.h:367
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:3699
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3673
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4241
#define Assert(condition)
Definition: c.h:738
#define BufferIsLocal(buffer)
Definition: buf.h:37
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114
#define UnlockBufHdr(desc, s)
int32 * LocalRefCount
Definition: localbuf.c:45
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:48

◆ DropDatabaseBuffers()

void DropDatabaseBuffers ( Oid  dbid)

Definition at line 3070 of file bufmgr.c.

References buftag::blockNum, buf, BufferDescriptorGetBuffer, RelFileNode::dbNode, elog, buftag::forkNum, BufferDesc::freeNext, GetBufferDescriptor, GetPrivateRefCount(), i, InvalidateBuffer(), InvalidBackendId, LockBufHdr(), LOG, NBuffers, relpathbackend, relpathperm, buftag::rnode, BufferDesc::tag, and UnlockBufHdr.

Referenced by dbase_redo(), dropdb(), and movedb().

3071 {
3072  int i;
3073 
3074  /*
3075  * We needn't consider local buffers, since by assumption the target
3076  * database isn't our own.
3077  */
3078 
3079  for (i = 0; i < NBuffers; i++)
3080  {
3081  BufferDesc *bufHdr = GetBufferDescriptor(i);
3082  uint32 buf_state;
3083 
3084  /*
3085  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3086  * and saves some cycles.
3087  */
3088  if (bufHdr->tag.rnode.dbNode != dbid)
3089  continue;
3090 
3091  buf_state = LockBufHdr(bufHdr);
3092  if (bufHdr->tag.rnode.dbNode == dbid)
3093  InvalidateBuffer(bufHdr); /* releases spinlock */
3094  else
3095  UnlockBufHdr(bufHdr, buf_state);
3096  }
3097 }
static void InvalidateBuffer(BufferDesc *buf)
Definition: bufmgr.c:1321
#define GetBufferDescriptor(id)
unsigned int uint32
Definition: c.h:367
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4241
RelFileNode rnode
Definition: buf_internals.h:92
BufferTag tag
#define UnlockBufHdr(desc, s)
int i
int NBuffers
Definition: globals.c:131

◆ DropRelFileNodeBuffers()

void DropRelFileNodeBuffers ( RelFileNodeBackend  rnode,
ForkNumber forkNum,
int  nforks,
BlockNumber firstDelBlock 
)

Definition at line 2898 of file bufmgr.c.

References RelFileNodeBackend::backend, buftag::blockNum, DropRelFileNodeLocalBuffers(), buftag::forkNum, GetBufferDescriptor, i, InvalidateBuffer(), LockBufHdr(), MyBackendId, NBuffers, RelFileNodeBackend::node, RelFileNodeBackendIsTemp, RelFileNodeEquals, buftag::rnode, BufferDesc::tag, and UnlockBufHdr.

Referenced by smgrtruncate().

2900 {
2901  int i;
2902  int j;
2903 
2904  /* If it's a local relation, it's localbuf.c's problem. */
2905  if (RelFileNodeBackendIsTemp(rnode))
2906  {
2907  if (rnode.backend == MyBackendId)
2908  {
2909  for (j = 0; j < nforks; j++)
2910  DropRelFileNodeLocalBuffers(rnode.node, forkNum[j],
2911  firstDelBlock[j]);
2912  }
2913  return;
2914  }
2915 
2916  for (i = 0; i < NBuffers; i++)
2917  {
2918  BufferDesc *bufHdr = GetBufferDescriptor(i);
2919  uint32 buf_state;
2920 
2921  /*
2922  * We can make this a tad faster by prechecking the buffer tag before
2923  * we attempt to lock the buffer; this saves a lot of lock
2924  * acquisitions in typical cases. It should be safe because the
2925  * caller must have AccessExclusiveLock on the relation, or some other
2926  * reason to be certain that no one is loading new pages of the rel
2927  * into the buffer pool. (Otherwise we might well miss such pages
2928  * entirely.) Therefore, while the tag might be changing while we
2929  * look at it, it can't be changing *to* a value we care about, only
2930  * *away* from such a value. So false negatives are impossible, and
2931  * false positives are safe because we'll recheck after getting the
2932  * buffer lock.
2933  *
2934  * We could check forkNum and blockNum as well as the rnode, but the
2935  * incremental win from doing so seems small.
2936  */
2937  if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
2938  continue;
2939 
2940  buf_state = LockBufHdr(bufHdr);
2941 
2942  for (j = 0; j < nforks; j++)
2943  {
2944  if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
2945  bufHdr->tag.forkNum == forkNum[j] &&
2946  bufHdr->tag.blockNum >= firstDelBlock[j])
2947  {
2948  InvalidateBuffer(bufHdr); /* releases spinlock */
2949  break;
2950  }
2951  }
2952  if (j >= nforks)
2953  UnlockBufHdr(bufHdr, buf_state);
2954  }
2955 }
BackendId MyBackendId
Definition: globals.c:81
#define RelFileNodeBackendIsTemp(rnode)
Definition: relfilenode.h:78
ForkNumber forkNum
Definition: buf_internals.h:93
static void InvalidateBuffer(BufferDesc *buf)
Definition: bufmgr.c:1321
void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum, BlockNumber firstDelBlock)
Definition: localbuf.c:320
#define GetBufferDescriptor(id)
unsigned int uint32
Definition: c.h:367
RelFileNode node
Definition: relfilenode.h:74
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4241
BackendId backend
Definition: relfilenode.h:75
BlockNumber blockNum
Definition: buf_internals.h:94
RelFileNode rnode
Definition: buf_internals.h:92
BufferTag tag
#define UnlockBufHdr(desc, s)
int i
int NBuffers
Definition: globals.c:131
#define RelFileNodeEquals(node1, node2)
Definition: relfilenode.h:88

◆ DropRelFileNodesAllBuffers()

void DropRelFileNodesAllBuffers ( RelFileNodeBackend rnodes,
int  nnodes 
)

Definition at line 2967 of file bufmgr.c.

References DropRelFileNodeAllLocalBuffers(), GetBufferDescriptor, i, InvalidateBuffer(), LockBufHdr(), MyBackendId, NBuffers, RelFileNodeBackend::node, palloc(), pfree(), pg_qsort(), RelFileNodeBackendIsTemp, RelFileNodeEquals, RELS_BSEARCH_THRESHOLD, buftag::rnode, rnode_comparator(), BufferDesc::tag, and UnlockBufHdr.

Referenced by smgrdounlink(), and smgrdounlinkall().

2968 {
2969  int i,
2970  n = 0;
2971  RelFileNode *nodes;
2972  bool use_bsearch;
2973 
2974  if (nnodes == 0)
2975  return;
2976 
2977  nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
2978 
2979  /* If it's a local relation, it's localbuf.c's problem. */
2980  for (i = 0; i < nnodes; i++)
2981  {
2982  if (RelFileNodeBackendIsTemp(rnodes[i]))
2983  {
2984  if (rnodes[i].backend == MyBackendId)
2985  DropRelFileNodeAllLocalBuffers(rnodes[i].node);
2986  }
2987  else
2988  nodes[n++] = rnodes[i].node;
2989  }
2990 
2991  /*
2992  * If there are no non-local relations, then we're done. Release the
2993  * memory and return.
2994  */
2995  if (n == 0)
2996  {
2997  pfree(nodes);
2998  return;
2999  }
3000 
3001  /*
3002  * For low number of relations to drop just use a simple walk through, to
3003  * save the bsearch overhead. The threshold to use is rather a guess than
3004  * an exactly determined value, as it depends on many factors (CPU and RAM
3005  * speeds, amount of shared buffers etc.).
3006  */
3007  use_bsearch = n > RELS_BSEARCH_THRESHOLD;
3008 
3009  /* sort the list of rnodes if necessary */
3010  if (use_bsearch)
3011  pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
3012 
3013  for (i = 0; i < NBuffers; i++)
3014  {
3015  RelFileNode *rnode = NULL;
3016  BufferDesc *bufHdr = GetBufferDescriptor(i);
3017  uint32 buf_state;
3018 
3019  /*
3020  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3021  * and saves some cycles.
3022  */
3023 
3024  if (!use_bsearch)
3025  {
3026  int j;
3027 
3028  for (j = 0; j < n; j++)
3029  {
3030  if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
3031  {
3032  rnode = &nodes[j];
3033  break;
3034  }
3035  }
3036  }
3037  else
3038  {
3039  rnode = bsearch((const void *) &(bufHdr->tag.rnode),
3040  nodes, n, sizeof(RelFileNode),
3042  }
3043 
3044  /* buffer doesn't belong to any of the given relfilenodes; skip it */
3045  if (rnode == NULL)
3046  continue;
3047 
3048  buf_state = LockBufHdr(bufHdr);
3049  if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
3050  InvalidateBuffer(bufHdr); /* releases spinlock */
3051  else
3052  UnlockBufHdr(bufHdr, buf_state);
3053  }
3054 
3055  pfree(nodes);
3056 }
BackendId MyBackendId
Definition: globals.c:81
#define RelFileNodeBackendIsTemp(rnode)
Definition: relfilenode.h:78
static void InvalidateBuffer(BufferDesc *buf)
Definition: bufmgr.c:1321
#define RELS_BSEARCH_THRESHOLD
Definition: bufmgr.c:70
void DropRelFileNodeAllLocalBuffers(RelFileNode rnode)
Definition: localbuf.c:367
void pfree(void *pointer)
Definition: mcxt.c:1056
#define GetBufferDescriptor(id)
unsigned int uint32
Definition: c.h:367
static int rnode_comparator(const void *p1, const void *p2)
Definition: bufmgr.c:4214
RelFileNode node
Definition: relfilenode.h:74
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4241
void pg_qsort(void *base, size_t nel, size_t elsize, int(*cmp)(const void *, const void *))
Definition: qsort.c:113
RelFileNode rnode
Definition: buf_internals.h:92
BufferTag tag
void * palloc(Size size)
Definition: mcxt.c:949
#define UnlockBufHdr(desc, s)
int i
int NBuffers
Definition: globals.c:131
#define RelFileNodeEquals(node1, node2)
Definition: relfilenode.h:88

◆ FlushBuffer()

static void FlushBuffer ( BufferDesc buf,
SMgrRelation  reln 
)
static

Definition at line 2644 of file bufmgr.c.

References ErrorContextCallback::arg, BufferUsage::blk_write_time, buftag::blockNum, BM_JUST_DIRTIED, BM_PERMANENT, BufferGetLSN, BufHdrGetBlock, ErrorContextCallback::callback, RelFileNode::dbNode, error_context_stack, buftag::forkNum, INSTR_TIME_ADD, INSTR_TIME_GET_MICROSEC, INSTR_TIME_SET_CURRENT, INSTR_TIME_SUBTRACT, InvalidBackendId, LockBufHdr(), RelFileNodeBackend::node, PageSetChecksumCopy(), pgBufferUsage, pgstat_count_buffer_write_time, ErrorContextCallback::previous, RelFileNode::relNode, buftag::rnode, BufferUsage::shared_blks_written, shared_buffer_write_error_callback(), SMgrRelationData::smgr_rnode, smgropen(), smgrwrite(), RelFileNode::spcNode, StartBufferIO(), BufferDesc::tag, TerminateBufferIO(), track_io_timing, UnlockBufHdr, and XLogFlush().

Referenced by BufferAlloc(), FlushDatabaseBuffers(), FlushOneBuffer(), FlushRelationBuffers(), FlushRelationsAllBuffers(), and SyncOneBuffer().

2645 {
2646  XLogRecPtr recptr;
2647  ErrorContextCallback errcallback;
2648  instr_time io_start,
2649  io_time;
2650  Block bufBlock;
2651  char *bufToWrite;
2652  uint32 buf_state;
2653 
2654  /*
2655  * Acquire the buffer's io_in_progress lock. If StartBufferIO returns
2656  * false, then someone else flushed the buffer before we could, so we need
2657  * not do anything.
2658  */
2659  if (!StartBufferIO(buf, false))
2660  return;
2661 
2662  /* Setup error traceback support for ereport() */
2664  errcallback.arg = (void *) buf;
2665  errcallback.previous = error_context_stack;
2666  error_context_stack = &errcallback;
2667 
2668  /* Find smgr relation for buffer */
2669  if (reln == NULL)
2670  reln = smgropen(buf->tag.rnode, InvalidBackendId);
2671 
2672  TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
2673  buf->tag.blockNum,
2674  reln->smgr_rnode.node.spcNode,
2675  reln->smgr_rnode.node.dbNode,
2676  reln->smgr_rnode.node.relNode);
2677 
2678  buf_state = LockBufHdr(buf);
2679 
2680  /*
2681  * Run PageGetLSN while holding header lock, since we don't have the
2682  * buffer locked exclusively in all cases.
2683  */
2684  recptr = BufferGetLSN(buf);
2685 
2686  /* To check if block content changes while flushing. - vadim 01/17/97 */
2687  buf_state &= ~BM_JUST_DIRTIED;
2688  UnlockBufHdr(buf, buf_state);
2689 
2690  /*
2691  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
2692  * rule that log updates must hit disk before any of the data-file changes
2693  * they describe do.
2694  *
2695  * However, this rule does not apply to unlogged relations, which will be
2696  * lost after a crash anyway. Most unlogged relation pages do not bear
2697  * LSNs since we never emit WAL records for them, and therefore flushing
2698  * up through the buffer LSN would be useless, but harmless. However,
2699  * GiST indexes use LSNs internally to track page-splits, and therefore
2700  * unlogged GiST pages bear "fake" LSNs generated by
2701  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
2702  * LSN counter could advance past the WAL insertion point; and if it did
2703  * happen, attempting to flush WAL through that location would fail, with
2704  * disastrous system-wide consequences. To make sure that can't happen,
2705  * skip the flush if the buffer isn't permanent.
2706  */
2707  if (buf_state & BM_PERMANENT)
2708  XLogFlush(recptr);
2709 
2710  /*
2711  * Now it's safe to write buffer to disk. Note that no one else should
2712  * have been able to write it while we were busy with log flushing because
2713  * we have the io_in_progress lock.
2714  */
2715  bufBlock = BufHdrGetBlock(buf);
2716 
2717  /*
2718  * Update page checksum if desired. Since we have only shared lock on the
2719  * buffer, other processes might be updating hint bits in it, so we must
2720  * copy the page to private storage if we do checksumming.
2721  */
2722  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
2723 
2724  if (track_io_timing)
2725  INSTR_TIME_SET_CURRENT(io_start);
2726 
2727  /*
2728  * bufToWrite is either the shared buffer or a copy, as appropriate.
2729  */
2730  smgrwrite(reln,
2731  buf->tag.forkNum,
2732  buf->tag.blockNum,
2733  bufToWrite,
2734  false);
2735 
2736  if (track_io_timing)
2737  {
2738  INSTR_TIME_SET_CURRENT(io_time);
2739  INSTR_TIME_SUBTRACT(io_time, io_start);
2742  }
2743 
2745 
2746  /*
2747  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
2748  * end the io_in_progress state.
2749  */
2750  TerminateBufferIO(buf, true, 0);
2751 
2752  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
2753  buf->tag.blockNum,
2754  reln->smgr_rnode.node.spcNode,
2755  reln->smgr_rnode.node.dbNode,
2756  reln->smgr_rnode.node.relNode);
2757 
2758  /* Pop the error context stack */
2759  error_context_stack = errcallback.previous;
2760 }
#define BM_PERMANENT
Definition: buf_internals.h:66
ForkNumber forkNum
Definition: buf_internals.h:93
struct timeval instr_time
Definition: instr_time.h:150
char * PageSetChecksumCopy(Page page, BlockNumber blkno)
Definition: bufpage.c:1165
void(* callback)(void *arg)
Definition: elog.h:229
struct ErrorContextCallback * previous
Definition: elog.h:228
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2842
ErrorContextCallback * error_context_stack
Definition: elog.c:92
long shared_blks_written
Definition: instrument.h:24
static bool StartBufferIO(BufferDesc *buf, bool forInput)
Definition: bufmgr.c:4024
#define INSTR_TIME_SUBTRACT(x, y)
Definition: instr_time.h:170
RelFileNodeBackend smgr_rnode
Definition: smgr.h:42
#define INSTR_TIME_ADD(x, y)
Definition: instr_time.h:158
void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync)
Definition: smgr.c:565
#define BM_JUST_DIRTIED
Definition: buf_internals.h:63
unsigned int uint32
Definition: c.h:367
SMgrRelation smgropen(RelFileNode rnode, BackendId backend)
Definition: smgr.c:145
static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
Definition: bufmgr.c:4091
#define InvalidBackendId
Definition: backendid.h:23
#define BufHdrGetBlock(bufHdr)
Definition: bufmgr.c:59
RelFileNode node
Definition: relfilenode.h:74
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4241
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define INSTR_TIME_GET_MICROSEC(t)
Definition: instr_time.h:205
instr_time blk_write_time
Definition: instrument.h:32
#define pgstat_count_buffer_write_time(n)
Definition: pgstat.h:1439
BlockNumber blockNum
Definition: buf_internals.h:94
RelFileNode rnode
Definition: buf_internals.h:92
#define INSTR_TIME_SET_CURRENT(t)
Definition: instr_time.h:156
BufferTag tag
#define UnlockBufHdr(desc, s)
#define BufferGetLSN(bufHdr)
Definition: bufmgr.c:60
static void shared_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:4176
bool track_io_timing
Definition: bufmgr.c:126
Pointer Page
Definition: bufpage.h:78
BufferUsage pgBufferUsage
Definition: instrument.c:20
void * Block
Definition: bufmgr.h:24

◆ FlushDatabaseBuffers()

void FlushDatabaseBuffers ( Oid  dbid)

Definition at line 3371 of file bufmgr.c.

References BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock, CurrentResourceOwner, RelFileNode::dbNode, FlushBuffer(), GetBufferDescriptor, i, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), buftag::rnode, BufferDesc::tag, UnlockBufHdr, and UnpinBuffer().

Referenced by dbase_redo().

3372 {
3373  int i;
3374  BufferDesc *bufHdr;
3375 
3376  /* Make sure we can handle the pin inside the loop */
3378 
3379  for (i = 0; i < NBuffers; i++)
3380  {
3381  uint32 buf_state;
3382 
3383  bufHdr = GetBufferDescriptor(i);
3384 
3385  /*
3386  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3387  * and saves some cycles.
3388  */
3389  if (bufHdr->tag.rnode.dbNode != dbid)
3390  continue;
3391 
3393 
3394  buf_state = LockBufHdr(bufHdr);
3395  if (bufHdr->tag.rnode.dbNode == dbid &&
3396  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3397  {
3398  PinBuffer_Locked(bufHdr);
3400  FlushBuffer(bufHdr, NULL);
3402  UnpinBuffer(bufHdr, true);
3403  }
3404  else
3405  UnlockBufHdr(bufHdr, buf_state);
3406  }
3407 }
ResourceOwner CurrentResourceOwner
Definition: resowner.c:142
#define BM_DIRTY
Definition: buf_internals.h:58
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln)
Definition: bufmgr.c:2644
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1727
#define GetBufferDescriptor(id)
unsigned int uint32
Definition: c.h:367
static void UnpinBuffer(BufferDesc *buf, bool fixOwner)
Definition: bufmgr.c:1663
void ResourceOwnerEnlargeBuffers(ResourceOwner owner)
Definition: resowner.c:917
#define BM_VALID
Definition: buf_internals.h:59
#define BufferDescriptorGetContentLock(bdesc)
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4241
static void PinBuffer_Locked(BufferDesc *buf)
Definition: bufmgr.c:1625
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1123
RelFileNode rnode
Definition: buf_internals.h:92
BufferTag tag
#define UnlockBufHdr(desc, s)
int i
int NBuffers
Definition: globals.c:131
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:206

◆ FlushOneBuffer()

void FlushOneBuffer ( Buffer  buffer)

Definition at line 3414 of file bufmgr.c.

References Assert, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsPinned, FlushBuffer(), GetBufferDescriptor, and LWLockHeldByMe().

Referenced by hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), and XLogReadBufferForRedoExtended().

3415 {
3416  BufferDesc *bufHdr;
3417 
3418  /* currently not needed, but no fundamental reason not to support */
3419  Assert(!BufferIsLocal(buffer));
3420 
3421  Assert(BufferIsPinned(buffer));
3422 
3423  bufHdr = GetBufferDescriptor(buffer - 1);
3424 
3426 
3427  FlushBuffer(bufHdr, NULL);
3428 }
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:439
bool LWLockHeldByMe(LWLock *l)
Definition: lwlock.c:1843
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln)
Definition: bufmgr.c:2644
#define GetBufferDescriptor(id)
#define BufferDescriptorGetContentLock(bdesc)
#define Assert(condition)
Definition: c.h:738
#define BufferIsLocal(buffer)
Definition: buf.h:37

◆ FlushRelationBuffers()

void FlushRelationBuffers ( Relation  rel)

Definition at line 3175 of file bufmgr.c.

References ErrorContextCallback::arg, buftag::blockNum, BM_DIRTY, BM_JUST_DIRTIED, BM_VALID, BufferDescriptorGetContentLock, ErrorContextCallback::callback, CurrentResourceOwner, error_context_stack, FlushBuffer(), buftag::forkNum, GetBufferDescriptor, GetLocalBufferDescriptor, i, local_buffer_write_error_callback(), LocalBufHdrGetBlock, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, NLocBuffer, PageSetChecksumInplace(), pg_atomic_read_u32(), pg_atomic_unlocked_write_u32(), PinBuffer_Locked(), ErrorContextCallback::previous, RelationData::rd_node, RelationData::rd_smgr, RelationOpenSmgr, RelationUsesLocalBuffers, RelFileNodeEquals, ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), buftag::rnode, smgrwrite(), BufferDesc::state, BufferDesc::tag, UnlockBufHdr, and UnpinBuffer().

Referenced by heapam_relation_copy_data(), and index_copy_data().

3176 {
3177  int i;
3178  BufferDesc *bufHdr;
3179 
3180  /* Open rel at the smgr level if not already done */
3181  RelationOpenSmgr(rel);
3182 
3183  if (RelationUsesLocalBuffers(rel))
3184  {
3185  for (i = 0; i < NLocBuffer; i++)
3186  {
3187  uint32 buf_state;
3188 
3189  bufHdr = GetLocalBufferDescriptor(i);
3190  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3191  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
3192  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3193  {
3194  ErrorContextCallback errcallback;
3195  Page localpage;
3196 
3197  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
3198 
3199  /* Setup error traceback support for ereport() */
3201  errcallback.arg = (void *) bufHdr;
3202  errcallback.previous = error_context_stack;
3203  error_context_stack = &errcallback;
3204 
3205  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
3206 
3207  smgrwrite(rel->rd_smgr,
3208  bufHdr->tag.forkNum,
3209  bufHdr->tag.blockNum,
3210  localpage,
3211  false);
3212 
3213  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
3214  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
3215 
3216  /* Pop the error context stack */
3217  error_context_stack = errcallback.previous;
3218  }
3219  }
3220 
3221  return;
3222  }
3223 
3224  /* Make sure we can handle the pin inside the loop */
3226 
3227  for (i = 0; i < NBuffers; i++)
3228  {
3229  uint32 buf_state;
3230 
3231  bufHdr = GetBufferDescriptor(i);
3232 
3233  /*
3234  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3235  * and saves some cycles.
3236  */
3237  if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
3238  continue;
3239 
3241 
3242  buf_state = LockBufHdr(bufHdr);
3243  if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
3244  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3245  {
3246  PinBuffer_Locked(bufHdr);
3248  FlushBuffer(bufHdr, rel->rd_smgr);
3250  UnpinBuffer(bufHdr, true);
3251  }
3252  else
3253  UnlockBufHdr(bufHdr, buf_state);
3254  }
3255 }
#define LocalBufHdrGetBlock(bufHdr)
Definition: bufmgr.c:63
ForkNumber forkNum
Definition: buf_internals.h:93
static void local_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:4195
ResourceOwner CurrentResourceOwner
Definition: resowner.c:142
struct SMgrRelationData * rd_smgr
Definition: rel.h:57
#define GetLocalBufferDescriptor(id)
#define BM_DIRTY
Definition: buf_internals.h:58
void(* callback)(void *arg)
Definition: elog.h:229
struct ErrorContextCallback * previous
Definition: elog.h:228
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln)
Definition: bufmgr.c:2644
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1727
ErrorContextCallback * error_context_stack
Definition: elog.c:92
#define RelationOpenSmgr(relation)
Definition: rel.h:513
int NLocBuffer
Definition: localbuf.c:41
void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync)
Definition: smgr.c:565
#define GetBufferDescriptor(id)
#define BM_JUST_DIRTIED
Definition: buf_internals.h:63
unsigned int uint32
Definition: c.h:367
static void UnpinBuffer(BufferDesc *buf, bool fixOwner)
Definition: bufmgr.c:1663
void ResourceOwnerEnlargeBuffers(ResourceOwner owner)
Definition: resowner.c:917
#define BM_VALID
Definition: buf_internals.h:59
RelFileNode rd_node
Definition: rel.h:55
#define BufferDescriptorGetContentLock(bdesc)
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4241
static void PinBuffer_Locked(BufferDesc *buf)
Definition: bufmgr.c:1625
void PageSetChecksumInplace(Page page, BlockNumber blkno)
Definition: bufpage.c:1194
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1123
BlockNumber blockNum
Definition: buf_internals.h:94
RelFileNode rnode
Definition: buf_internals.h:92
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:572
BufferTag tag
#define UnlockBufHdr(desc, s)
int i
int NBuffers
Definition: globals.c:131
static void pg_atomic_unlocked_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:277
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:206
pg_atomic_uint32 state
Pointer Page
Definition: bufpage.h:78
#define RelFileNodeEquals(node1, node2)
Definition: relfilenode.h:88
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:241

◆ FlushRelationsAllBuffers()

void FlushRelationsAllBuffers ( SMgrRelation smgrs,
int  nrels 
)

Definition at line 3267 of file bufmgr.c.

References Assert, BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock, CurrentResourceOwner, FlushBuffer(), GetBufferDescriptor, i, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, RelFileNodeBackend::node, palloc(), pfree(), pg_qsort(), PinBuffer_Locked(), RelFileNodeBackendIsTemp, RelFileNodeEquals, RELS_BSEARCH_THRESHOLD, ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), buftag::rnode, SMgrSortArray::rnode, rnode_comparator(), SMgrRelationData::smgr_rnode, SMgrSortArray::srel, BufferDesc::tag, UnlockBufHdr, and UnpinBuffer().

Referenced by smgrdosyncall().

3268 {
3269  int i;
3270  SMgrSortArray *srels;
3271  bool use_bsearch;
3272 
3273  if (nrels == 0)
3274  return;
3275 
3276  /* fill-in array for qsort */
3277  srels = palloc(sizeof(SMgrSortArray) * nrels);
3278 
3279  for (i = 0; i < nrels; i++)
3280  {
3281  Assert(!RelFileNodeBackendIsTemp(smgrs[i]->smgr_rnode));
3282 
3283  srels[i].rnode = smgrs[i]->smgr_rnode.node;
3284  srels[i].srel = smgrs[i];
3285  }
3286 
3287  /*
3288  * Save the bsearch overhead for low number of relations to sync. See
3289  * DropRelFileNodesAllBuffers for details.
3290  */
3291  use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
3292 
3293  /* sort the list of SMgrRelations if necessary */
3294  if (use_bsearch)
3295  pg_qsort(srels, nrels, sizeof(SMgrSortArray), rnode_comparator);
3296 
3297  /* Make sure we can handle the pin inside the loop */
3299 
3300  for (i = 0; i < NBuffers; i++)
3301  {
3302  SMgrSortArray *srelent = NULL;
3303  BufferDesc *bufHdr = GetBufferDescriptor(i);
3304  uint32 buf_state;
3305 
3306  /*
3307  * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
3308  * and saves some cycles.
3309  */
3310 
3311  if (!use_bsearch)
3312  {
3313  int j;
3314 
3315  for (j = 0; j < nrels; j++)
3316  {
3317  if (RelFileNodeEquals(bufHdr->tag.rnode, srels[j].rnode))
3318  {
3319  srelent = &srels[j];
3320  break;
3321  }
3322  }
3323 
3324  }
3325  else
3326  {
3327  srelent = bsearch((const void *) &(bufHdr->tag.rnode),
3328  srels, nrels, sizeof(SMgrSortArray),
3330  }
3331 
3332  /* buffer doesn't belong to any of the given relfilenodes; skip it */
3333  if (srelent == NULL)
3334  continue;
3335 
3337 
3338  buf_state = LockBufHdr(bufHdr);
3339  if (RelFileNodeEquals(bufHdr->tag.rnode, srelent->rnode) &&
3340  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
3341  {
3342  PinBuffer_Locked(bufHdr);
3344  FlushBuffer(bufHdr, srelent->srel);
3346  UnpinBuffer(bufHdr, true);
3347  }
3348  else
3349  UnlockBufHdr(bufHdr, buf_state);
3350  }
3351 
3352  pfree(srels);
3353 }
#define RelFileNodeBackendIsTemp(rnode)
Definition: relfilenode.h:78
ResourceOwner CurrentResourceOwner
Definition: resowner.c:142
#define RELS_BSEARCH_THRESHOLD
Definition: bufmgr.c:70
#define BM_DIRTY
Definition: buf_internals.h:58
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln)
Definition: bufmgr.c:2644
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1727
void pfree(void *pointer)
Definition: mcxt.c:1056
RelFileNodeBackend smgr_rnode
Definition: smgr.h:42
SMgrRelation srel
Definition: bufmgr.c:119
#define GetBufferDescriptor(id)
unsigned int uint32
Definition: c.h:367
static void UnpinBuffer(BufferDesc *buf, bool fixOwner)
Definition: bufmgr.c:1663
void ResourceOwnerEnlargeBuffers(ResourceOwner owner)
Definition: resowner.c:917
static int rnode_comparator(const void *p1, const void *p2)
Definition: bufmgr.c:4214
#define BM_VALID
Definition: buf_internals.h:59
RelFileNode node
Definition: relfilenode.h:74
#define BufferDescriptorGetContentLock(bdesc)
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4241
#define Assert(condition)
Definition: c.h:738
static void PinBuffer_Locked(BufferDesc *buf)
Definition: bufmgr.c:1625
void pg_qsort(void *base, size_t nel, size_t elsize, int(*cmp)(const void *, const void *))
Definition: qsort.c:113
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1123
RelFileNode rnode
Definition: buf_internals.h:92
BufferTag tag
void * palloc(Size size)
Definition: mcxt.c:949
#define UnlockBufHdr(desc, s)
int i
int NBuffers
Definition: globals.c:131
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:206
RelFileNode rnode
Definition: bufmgr.c:118
#define RelFileNodeEquals(node1, node2)
Definition: relfilenode.h:88

◆ ForgetPrivateRefCountEntry()

static void ForgetPrivateRefCountEntry ( PrivateRefCountEntry ref)
static

Definition at line 401 of file bufmgr.c.

References Assert, PrivateRefCountEntry::buffer, HASH_REMOVE, hash_search(), InvalidBuffer, PrivateRefCountArray, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, and REFCOUNT_ARRAY_ENTRIES.

Referenced by UnpinBuffer().

402 {
403  Assert(ref->refcount == 0);
404 
405  if (ref >= &PrivateRefCountArray[0] &&
407  {
408  ref->buffer = InvalidBuffer;
409 
410  /*
411  * Mark the just used entry as reserved - in many scenarios that
412  * allows us to avoid ever having to search the array/hash for free
413  * entries.
414  */
415  ReservedRefCountEntry = ref;
416  }
417  else
418  {
419  bool found;
420  Buffer buffer = ref->buffer;
421 
423  (void *) &buffer,
424  HASH_REMOVE,
425  &found);
426  Assert(found);
429  }
430 }
static int32 PrivateRefCountOverflowed
Definition: bufmgr.c:190
#define InvalidBuffer
Definition: buf.h:25
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:907
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition: bufmgr.c:188
static PrivateRefCountEntry * ReservedRefCountEntry
Definition: bufmgr.c:192
#define Assert(condition)
Definition: c.h:738
#define REFCOUNT_ARRAY_ENTRIES
Definition: bufmgr.c:79
static HTAB * PrivateRefCountHash
Definition: bufmgr.c:189
int Buffer
Definition: buf.h:23

◆ GetPrivateRefCount()

static int32 GetPrivateRefCount ( Buffer  buffer)
inlinestatic

Definition at line 378 of file bufmgr.c.

References Assert, BufferIsLocal, BufferIsValid, GetPrivateRefCountEntry(), and PrivateRefCountEntry::refcount.

Referenced by ConditionalLockBufferForCleanup(), DropDatabaseBuffers(), HoldingBufferPinThatDelaysRecovery(), InvalidateBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkBufferDirtyHint(), and PrintBufferLeakWarning().

379 {
381 
382  Assert(BufferIsValid(buffer));
383  Assert(!BufferIsLocal(buffer));
384 
385  /*
386  * Not moving the entry - that's ok for the current users, but we might
387  * want to change this one day.
388  */
389  ref = GetPrivateRefCountEntry(buffer, false);
390 
391  if (ref == NULL)
392  return 0;
393  return ref->refcount;
394 }
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition: bufmgr.c:298
#define Assert(condition)
Definition: c.h:738
#define BufferIsLocal(buffer)
Definition: buf.h:37
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114

◆ GetPrivateRefCountEntry()

static PrivateRefCountEntry * GetPrivateRefCountEntry ( Buffer  buffer,
bool  do_move 
)
static

Definition at line 298 of file bufmgr.c.

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid, free, HASH_FIND, HASH_REMOVE, hash_search(), i, InvalidBuffer, PrivateRefCountArray, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, REFCOUNT_ARRAY_ENTRIES, ReservedRefCountEntry, and ReservePrivateRefCountEntry().

Referenced by GetPrivateRefCount(), IncrBufferRefCount(), PinBuffer(), PinBuffer_Locked(), and UnpinBuffer().

299 {
301  int i;
302 
303  Assert(BufferIsValid(buffer));
304  Assert(!BufferIsLocal(buffer));
305 
306  /*
307  * First search for references in the array, that'll be sufficient in the
308  * majority of cases.
309  */
310  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
311  {
312  res = &PrivateRefCountArray[i];
313 
314  if (res->buffer == buffer)
315  return res;
316  }
317 
318  /*
319  * By here we know that the buffer, if already pinned, isn't residing in
320  * the array.
321  *
322  * Only look up the buffer in the hashtable if we've previously overflowed
323  * into it.
324  */
325  if (PrivateRefCountOverflowed == 0)
326  return NULL;
327 
329  (void *) &buffer,
330  HASH_FIND,
331  NULL);
332 
333  if (res == NULL)
334  return NULL;
335  else if (!do_move)
336  {
337  /* caller doesn't want us to move the hash entry into the array */
338  return res;
339  }
340  else
341  {
342  /* move buffer from hashtable into the free array slot */
343  bool found;
345 
346  /* Ensure there's a free array slot */
348 
349  /* Use up the reserved slot */
350  Assert(ReservedRefCountEntry != NULL);
351  free = ReservedRefCountEntry;
352  ReservedRefCountEntry = NULL;
353  Assert(free->buffer == InvalidBuffer);
354 
355  /* and fill it */
356  free->buffer = buffer;
357  free->refcount = res->refcount;
358 
359  /* delete from hashtable */
361  (void *) &buffer,
362  HASH_REMOVE,
363  &found);
364  Assert(found);
367 
368  return free;
369  }
370 }
static int32 PrivateRefCountOverflowed
Definition: bufmgr.c:190
#define InvalidBuffer
Definition: buf.h:25
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:907
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition: bufmgr.c:188
static PrivateRefCountEntry * ReservedRefCountEntry
Definition: bufmgr.c:192
#define free(a)
Definition: header.h:65
#define Assert(condition)
Definition: c.h:738
#define REFCOUNT_ARRAY_ENTRIES
Definition: bufmgr.c:79
#define BufferIsLocal(buffer)
Definition: buf.h:37
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114
static HTAB * PrivateRefCountHash
Definition: bufmgr.c:189
int i
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:206

◆ HoldingBufferPinThatDelaysRecovery()

bool HoldingBufferPinThatDelaysRecovery ( void  )

Definition at line 3841 of file bufmgr.c.

References GetPrivateRefCount(), and GetStartupBufferPinWaitBufId().

Referenced by CheckRecoveryConflictDeadlock(), and RecoveryConflictInterrupt().

3842 {
3843  int bufid = GetStartupBufferPinWaitBufId();
3844 
3845  /*
3846  * If we get woken slowly then it's possible that the Startup process was
3847  * already woken by other backends before we got here. Also possible that
3848  * we get here by multiple interrupts or interrupts at inappropriate
3849  * times, so make sure we do nothing if the bufid is not set.
3850  */
3851  if (bufid < 0)
3852  return false;
3853 
3854  if (GetPrivateRefCount(bufid + 1) > 0)
3855  return true;
3856 
3857  return false;
3858 }
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:378
int GetStartupBufferPinWaitBufId(void)
Definition: proc.c:657

◆ IncrBufferRefCount()

void IncrBufferRefCount ( Buffer  buffer)

Definition at line 3472 of file bufmgr.c.

References Assert, BufferIsLocal, BufferIsPinned, CurrentResourceOwner, GetPrivateRefCountEntry(), LocalRefCount, PrivateRefCountEntry::refcount, ResourceOwnerEnlargeBuffers(), and ResourceOwnerRememberBuffer().

Referenced by _bt_steppage(), btrestrpos(), entryLoadMoreItems(), ReadBufferBI(), scanPostingTree(), startScanEntry(), and tts_buffer_heap_store_tuple().

3473 {
3474  Assert(BufferIsPinned(buffer));
3476  if (BufferIsLocal(buffer))
3477  LocalRefCount[-buffer - 1]++;
3478  else
3479  {
3480  PrivateRefCountEntry *ref;
3481 
3482  ref = GetPrivateRefCountEntry(buffer, true);
3483  Assert(ref != NULL);
3484  ref->refcount++;
3485  }
3487 }
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition: bufmgr.c:298
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:439
ResourceOwner CurrentResourceOwner
Definition: resowner.c:142
void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
Definition: resowner.c:930
void ResourceOwnerEnlargeBuffers(ResourceOwner owner)
Definition: resowner.c:917
#define Assert(condition)
Definition: c.h:738
#define BufferIsLocal(buffer)
Definition: buf.h:37
int32 * LocalRefCount
Definition: localbuf.c:45

◆ InitBufferPoolAccess()

void InitBufferPoolAccess ( void  )

Definition at line 2416 of file bufmgr.c.

References HASHCTL::entrysize, HASH_BLOBS, hash_create(), HASH_ELEM, HASHCTL::keysize, MemSet, and PrivateRefCountArray.

Referenced by BaseInit().

2417 {
2418  HASHCTL hash_ctl;
2419 
2420  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
2421 
2422  MemSet(&hash_ctl, 0, sizeof(hash_ctl));
2423  hash_ctl.keysize = sizeof(int32);
2424  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
2425 
2426  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
2427  HASH_ELEM | HASH_BLOBS);
2428 }
struct PrivateRefCountEntry PrivateRefCountEntry
#define HASH_ELEM
Definition: hsearch.h:87
Size entrysize
Definition: hsearch.h:73
#define MemSet(start, val, len)
Definition: c.h:971
signed int int32
Definition: c.h:355
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition: bufmgr.c:188
#define HASH_BLOBS
Definition: hsearch.h:88
HTAB * hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
Definition: dynahash.c:317
Size keysize
Definition: hsearch.h:72
static HTAB * PrivateRefCountHash
Definition: bufmgr.c:189

◆ InitBufferPoolBackend()

void InitBufferPoolBackend ( void  )

Definition at line 2440 of file bufmgr.c.

References AtProcExit_Buffers(), and on_shmem_exit().

Referenced by AuxiliaryProcessMain(), and InitPostgres().

2441 {
2443 }
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:361
static void AtProcExit_Buffers(int code, Datum arg)
Definition: bufmgr.c:2450

◆ InvalidateBuffer()

static void InvalidateBuffer ( BufferDesc buf)
static

Definition at line 1321 of file bufmgr.c.

References Assert, BM_LOCKED, BM_TAG_VALID, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer, BUFFERTAGS_EQUAL, BufMappingPartitionLock, BufTableDelete(), BufTableHashCode(), CLEAR_BUFFERTAG, elog, ERROR, GetPrivateRefCount(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u32(), BufferDesc::state, StrategyFreeBuffer(), BufferDesc::tag, UnlockBufHdr, and WaitIO().

Referenced by DropDatabaseBuffers(), DropRelFileNodeBuffers(), and DropRelFileNodesAllBuffers().

1322 {
1323  BufferTag oldTag;
1324  uint32 oldHash; /* hash value for oldTag */
1325  LWLock *oldPartitionLock; /* buffer partition lock for it */
1326  uint32 oldFlags;
1327  uint32 buf_state;
1328 
1329  /* Save the original buffer tag before dropping the spinlock */
1330  oldTag = buf->tag;
1331 
1332  buf_state = pg_atomic_read_u32(&buf->state);
1333  Assert(buf_state & BM_LOCKED);
1334  UnlockBufHdr(buf, buf_state);
1335 
1336  /*
1337  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1338  * worth storing the hashcode in BufferDesc so we need not recompute it
1339  * here? Probably not.
1340  */
1341  oldHash = BufTableHashCode(&oldTag);
1342  oldPartitionLock = BufMappingPartitionLock(oldHash);
1343 
1344 retry:
1345 
1346  /*
1347  * Acquire exclusive mapping lock in preparation for changing the buffer's
1348  * association.
1349  */
1350  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1351 
1352  /* Re-lock the buffer header */
1353  buf_state = LockBufHdr(buf);
1354 
1355  /* If it's changed while we were waiting for lock, do nothing */
1356  if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
1357  {
1358  UnlockBufHdr(buf, buf_state);
1359  LWLockRelease(oldPartitionLock);
1360  return;
1361  }
1362 
1363  /*
1364  * We assume the only reason for it to be pinned is that someone else is
1365  * flushing the page out. Wait for them to finish. (This could be an
1366  * infinite loop if the refcount is messed up... it would be nice to time
1367  * out after awhile, but there seems no way to be sure how many loops may
1368  * be needed. Note that if the other guy has pinned the buffer but not
1369  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1370  * be busy-looping here.)
1371  */
1372  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1373  {
1374  UnlockBufHdr(buf, buf_state);
1375  LWLockRelease(oldPartitionLock);
1376  /* safety check: should definitely not be our *own* pin */
1378  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1379  WaitIO(buf);
1380  goto retry;
1381  }
1382 
1383  /*
1384  * Clear out the buffer's tag and flags. We must do this to ensure that
1385  * linear scans of the buffer array don't think the buffer is valid.
1386  */
1387  oldFlags = buf_state & BUF_FLAG_MASK;
1388  CLEAR_BUFFERTAG(buf->tag);
1389  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1390  UnlockBufHdr(buf, buf_state);
1391 
1392  /*
1393  * Remove the buffer from the lookup hashtable, if it was in there.
1394  */
1395  if (oldFlags & BM_TAG_VALID)
1396  BufTableDelete(&oldTag, oldHash);
1397 
1398  /*
1399  * Done with mapping lock.
1400  */
1401  LWLockRelease(oldPartitionLock);
1402 
1403  /*
1404  * Insert the buffer at the head of the list of free buffers.
1405  */
1406  StrategyFreeBuffer(buf);
1407 }
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:149
Definition: lwlock.h:32
#define BufMappingPartitionLock(hashcode)
#define BM_TAG_VALID
Definition: buf_internals.h:60
static void WaitIO(BufferDesc *buf)
Definition: bufmgr.c:3977
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:79
void StrategyFreeBuffer(BufferDesc *buf)
Definition: freelist.c:364
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1727
#define ERROR
Definition: elog.h:43
#define BUF_FLAG_MASK
Definition: buf_internals.h:45
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:378
unsigned int uint32
Definition: c.h:367
#define BUFFERTAGS_EQUAL(a, b)
#define BM_LOCKED
Definition: buf_internals.h:57
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4241
#define Assert(condition)
Definition: c.h:738
#define CLEAR_BUFFERTAG(a)
Definition: buf_internals.h:97
#define BUF_USAGECOUNT_MASK
Definition: buf_internals.h:42
#define BufferDescriptorGetBuffer(bdesc)
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1123
BufferTag tag
#define UnlockBufHdr(desc, s)
#define elog(elevel,...)
Definition: elog.h:214
pg_atomic_uint32 state
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:48
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:241

◆ IsBufferCleanupOK()

bool IsBufferCleanupOK ( Buffer  buffer)

Definition at line 3923 of file bufmgr.c.

References Assert, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsValid, GetBufferDescriptor, GetPrivateRefCount(), LocalRefCount, LockBufHdr(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), and UnlockBufHdr.

Referenced by _hash_doinsert(), _hash_expandtable(), _hash_splitbucket(), hash_xlog_split_allocate_page(), and hashbucketcleanup().

3924 {
3925  BufferDesc *bufHdr;
3926  uint32 buf_state;
3927 
3928  Assert(BufferIsValid(buffer));
3929 
3930  if (BufferIsLocal(buffer))
3931  {
3932  /* There should be exactly one pin */
3933  if (LocalRefCount[-buffer - 1] != 1)
3934  return false;
3935  /* Nobody else to wait for */
3936  return true;
3937  }
3938 
3939  /* There should be exactly one local pin */
3940  if (GetPrivateRefCount(buffer) != 1)
3941  return false;
3942 
3943  bufHdr = GetBufferDescriptor(buffer - 1);
3944 
3945  /* caller must hold exclusive lock on buffer */
3947  LW_EXCLUSIVE));
3948 
3949  buf_state = LockBufHdr(bufHdr);
3950 
3951  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3952  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3953  {
3954  /* pincount is OK. */
3955  UnlockBufHdr(bufHdr, buf_state);
3956  return true;
3957  }
3958 
3959  UnlockBufHdr(bufHdr, buf_state);
3960  return false;
3961 }
bool LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
Definition: lwlock.c:1861
#define GetBufferDescriptor(id)
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:378
unsigned int uint32
Definition: c.h:367
#define BufferDescriptorGetContentLock(bdesc)
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:4241
#define Assert(condition)
Definition: c.h:738
#define BufferIsLocal(buffer)
Definition: buf.h:37
#define BufferIsValid(bufnum)
Definition: bufmgr.h:114
#define UnlockBufHdr(desc, s)
int32 * LocalRefCount
Definition: localbuf.c:45
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:48

◆ IssuePendingWritebacks()

void IssuePendingWritebacks ( WritebackContext context)

Definition at line 4427 of file bufmgr.c.

References buftag::blockNum, buffertag_comparator(), cur, buftag::forkNum, i, InvalidBackendId, next, WritebackContext::nr_pending, WritebackContext::pending_writebacks, qsort, RelFileNodeEquals, buftag::rnode, smgropen(), smgrwriteback(), and PendingWriteback::tag.

Referenced by BufferSync(), and ScheduleBufferTagForWriteback().

4428 {
4429  int i;
4430 
4431  if (context->nr_pending == 0)
4432  return;
4433 
4434  /*
4435  * Executing the writes in-order can make them a lot faster, and allows to
4436  * merge writeback requests to consecutive blocks into larger writebacks.
4437  */
4438  qsort(&context->pending_writebacks, context->nr_pending,
4439  sizeof(PendingWriteback), buffertag_comparator);
4440 
4441  /*
4442  * Coalesce neighbouring writes, but nothing else. For that we iterate
4443  * through the, now sorted, array of pending flushes, and look forward to
4444  * find all neighbouring (or identical) writes.
4445  */
4446  for (i = 0; i < context->nr_pending; i++)
4447  {
4448  PendingWriteback *cur;
4449  PendingWriteback *next;
4450  SMgrRelation reln;
4451  int ahead;
4452  BufferTag tag;
4453  Size nblocks = 1;
4454 
4455  cur = &context->pending_writebacks[i];
4456  tag = cur->tag;
4457 
4458  /*
4459  * Peek ahead, into following writeback requests, to see if they can
4460  * be combined with the current one.
4461  */
4462  for (ahead = 0; i + ahead + 1 < context->nr_pending; ahead++)
4463  {
4464  next = &context->pending_writebacks[i + ahead + 1];
4465 
4466  /* different file, stop */
4467  if (!RelFileNodeEquals(cur->tag.rnode, next->tag.rnode) ||
4468  cur->tag.forkNum != next->tag.forkNum)
4469  break;
4470 
4471  /* ok, block queued twice, skip */
4472  if (cur->tag.blockNum == next->tag.blockNum)
4473  continue;
4474 
4475  /* only merge consecutive writes */
4476  if (cur->tag.blockNum + 1 != next->tag.blockNum)
4477  break;
4478 
4479  nblocks++;
4480  cur = next;
4481  }
4482 
4483  i += ahead;
4484 
4485  /* and finally tell the kernel to write the data to storage */
4486  reln = smgropen(tag.rnode, InvalidBackendId);
4487  smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks);
4488  }
4489 
4490  context->nr_pending = 0;
4491 }
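
Worked example of the coalescing loop above: if the sorted queue holds blocks 7, 8, 8, 9 and 12 of the same relation fork, smgrwriteback() is called twice, once for blocks 7 through 9 (nblocks = 3, with the duplicate request for block 8 skipped) and once for block 12 alone.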

◆ local_buffer_write_error_callback()

static void local_buffer_write_error_callback ( void *  arg)
static

Definition at line 4195 of file bufmgr.c.

References buftag::blockNum, errcontext, buftag::forkNum, MyBackendId, pfree(), relpathbackend, buftag::rnode, and BufferDesc::tag.

Referenced by FlushRelationBuffers().

4196 {
4197  BufferDesc *bufHdr = (BufferDesc *) arg;
4198 
4199  if (bufHdr != NULL)
4200  {
4201  char *path = relpathbackend(bufHdr->tag.rnode, MyBackendId,
4202  bufHdr->tag.forkNum);
4203 
4204  errcontext("writing block %u of relation %s",
4205  bufHdr->tag.blockNum, path);
4206  pfree(path);
4207  }
4208 }

◆ LockBuffer()

void LockBuffer ( Buffer  buffer,
int  mode 
)

Definition at line 3673 of file bufmgr.c.

References Assert, buf, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsValid, elog, ERROR, GetBufferDescriptor, LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), and LWLockRelease().

Referenced by _bt_drop_lock_and_maybe_pin(), _bt_endpoint(), _bt_first(), _bt_getbuf(), _bt_getroot(), _bt_killitems(), _bt_moveright(), _bt_pagedel(), _bt_readnextpage(), _bt_relandgetbuf(), _bt_search(), _bt_unlink_halfdead_page(), _bt_update_meta_cleanup_info(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_finish_split(), _hash_first(), _hash_freeovflpage(), _hash_getbuf(), _hash_getbuf_with_strategy(), _hash_getcachedmetap(), _hash_getnewbuf(), _hash_init(), _hash_kill_items(), _hash_readnext(), _hash_readpage(), _hash_readprev(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), blbulkdelete(), blgetbitmap(), blinsert(), BloomNewBuffer(), blvacuumcleanup(), brin_doinsert(), brin_doupdate(), brin_evacuate_page(), brin_getinsertbuffer(), brin_page_cleanup(), brinbuild(), brinbuildempty(), bringetbitmap(), brinGetStats(), brinGetTupleForHeapBlock(), brininsert(), brinLockRevmapPageForUpdate(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), brinsummarize(), bt_metap(), bt_page_items(), bt_page_stats(), btvacuumpage(), checkXLogConsistency(), collect_corrupt_items(), collect_visibility_data(), collectMatchBitmap(), ConditionalLockBufferForCleanup(), count_nondeletable_pages(), entryLoadMoreItems(), fill_seq_with_data(), FreeSpaceMapPrepareTruncateRel(), fsm_readbuf(), fsm_search(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), get_raw_page_internal(), GetVisibilityMapPins(), ginbuildempty(), ginbulkdelete(), ginEntryInsert(), ginFindLeafPage(), ginFindParents(), ginFinishSplit(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), ginInsertValue(), GinNewBuffer(), ginScanToDelete(), ginStepRight(), ginTraverseLock(), ginUpdateStats(), ginvacuumcleanup(), ginVacuumPostingTreeLeaves(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistbuildempty(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfinishsplit(), gistfixsplit(), gistformdownlink(), gistGetMaxLevel(), gistinserttuples(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_compute_xid_horizon_for_tuples(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_get_latest_tid(), heap_inplace_update(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_page_prune_opt(), heap_update(), heap_xlog_visible(), heapam_index_build_range_scan(), heapam_index_fetch_tuple(), heapam_index_validate_scan(), heapam_relation_copy_for_cluster(), heapam_scan_analyze_next_block(), heapam_scan_bitmap_next_block(), heapam_scan_sample_next_tuple(), heapam_tuple_satisfies_snapshot(), heapgetpage(), heapgettup(), initBloomState(), lazy_scan_heap(), LockBufferForCleanup(), log_newpage_range(), palloc_btree_page(), pg_visibility(), pgrowlocks(), pgstat_btree_page(), pgstat_gist_page(), pgstat_heap(), pgstatginindex_internal(), pgstathashindex(), pgstatindex_impl(), read_seq_tuple(), RelationGetBufferForTuple(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), shiftList(), spgdoinsert(), spgGetCache(), SpGistNewBuffer(), spgprocesspending(), spgvacuumpage(), spgWalk(), startScanEntry(), statapprox_heap(), summarize_range(), UnlockReleaseBuffer(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), vm_readbuf(), XLogReadBufferExtended(), XLogReadBufferForRedoExtended(), and XLogRecordPageWithFreeSpace().

3674 {
3675  BufferDesc *buf;
3676 
3677  Assert(BufferIsValid(buffer));
3678  if (BufferIsLocal(buffer))
3679  return; /* local buffers need no lock */
3680 
3681  buf = GetBufferDescriptor(buffer - 1);
3682 
3683  if (mode == BUFFER_LOCK_UNLOCK)
3684  LWLockRelease(BufferDescriptorGetContentLock(buf));
3685  else if (mode == BUFFER_LOCK_SHARE)
3686  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
3687  else if (mode == BUFFER_LOCK_EXCLUSIVE)
3688  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
3689  else
3690  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
3691 }
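
A minimal sketch of the canonical shared-read pattern (rel and blkno are assumed variables); the buffer must already be pinned, here by ReadBuffer(), before its content lock is taken:

    Buffer      buf = ReadBuffer(rel, blkno);

    LockBuffer(buf, BUFFER_LOCK_SHARE);
    /* ... examine BufferGetPage(buf) under the shared content lock ... */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    ReleaseBuffer(buf);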

◆ LockBufferForCleanup()

void LockBufferForCleanup ( Buffer  buffer)

Definition at line 3730 of file bufmgr.c.

References Assert, BM_PIN_COUNT_WAITER, BUF_STATE_GET_REFCOUNT, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid, elog, ERROR, get_ps_display(), GetBufferDescriptor, GetPrivateRefCount(), InHotStandby, LocalRefCount, LockBuffer(), LockBufHdr(), MyProcPid, palloc(), pfree(), PG_WAIT_BUFFER_PIN, ProcWaitForSignal(), ResolveRecoveryConflictWithBufferPin(), set_ps_display(), SetStartupBufferPinWaitBufId(), UnlockBufHdr, update_process_title, and BufferDesc::wait_backend_pid.

Referenced by btvacuumpage(), ginVacuumPostingTree(), hashbulkdelete(), lazy_scan_heap(), ReadBuffer_common(), and XLogReadBufferForRedoExtended().

3731 {
3732  BufferDesc *bufHdr;
3733  char *new_status = NULL;
3734 
3735  Assert(BufferIsValid(buffer));
3736  Assert(PinCountWaitBuf == NULL);
3737 
3738  if (BufferIsLocal(buffer))
3739  {
3740  /* There should be exactly one pin */
3741  if (LocalRefCount[-buffer - 1] != 1)
3742  elog(ERROR, "incorrect local pin count: %d",
3743  LocalRefCount[-buffer - 1]);
3744  /* Nobody else to wait for */
3745  return;
3746  }
3747 
3748  /* There should be exactly one local pin */
3749  if (GetPrivateRefCount(buffer) != 1)
3750  elog(ERROR, "incorrect local pin count: %d",
3751  GetPrivateRefCount(buffer));
3752 
3753  bufHdr = GetBufferDescriptor(buffer - 1);
3754 
3755  for (;;)
3756  {
3757  uint32 buf_state;
3758 
3759  /* Try to acquire lock */
3760  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3761  buf_state = LockBufHdr(bufHdr);
3762 
3763  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3764  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3765  {
3766  /* Successfully acquired exclusive lock with pincount 1 */
3767  UnlockBufHdr(bufHdr, buf_state);
3768 
3769  /* Report change to non-waiting status */
3770  if (new_status)
3771  {
3772  set_ps_display(new_status);
3773  pfree(new_status);
3774  }
3775  return;
3776  }
3777  /* Failed, so mark myself as waiting for pincount 1 */
3778  if (buf_state & BM_PIN_COUNT_WAITER)
3779  {
3780  UnlockBufHdr(bufHdr, buf_state);
3781  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3782  elog(ERROR, "multiple backends attempting to wait for pincount 1");
3783  }
3784  bufHdr->wait_backend_pid = MyProcPid;
3785  PinCountWaitBuf = bufHdr;
3786  buf_state |= BM_PIN_COUNT_WAITER;
3787  UnlockBufHdr(bufHdr, buf_state);
3788  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3789 
3790  /* Wait to be signaled by UnpinBuffer() */
3791  if (InHotStandby)
3792  {
3793  /* Report change to waiting status */
3794  if (update_process_title && new_status == NULL)
3795  {
3796  const char *old_status;
3797  int len;
3798 
3799  old_status = get_ps_display(&len);
3800  new_status = (char *) palloc(len + 8 + 1);
3801  memcpy(new_status, old_status, len);
3802  strcpy(new_status + len, " waiting");
3803  set_ps_display(new_status);
3804  new_status[len] = '\0'; /* truncate off " waiting" */
3805  }
3806 
3807  /* Publish the bufid that Startup process waits on */
3808  SetStartupBufferPinWaitBufId(buffer - 1);
3809  /* Set alarm and then wait to be signaled by UnpinBuffer() */
3810  ResolveRecoveryConflictWithBufferPin();
3811  /* Reset the published bufid */
3812  SetStartupBufferPinWaitBufId(-1);
3813  }
3814  else
3815  ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
3816 
3817  /*
3818  * Remove flag marking us as waiter. Normally this will not be set
3819  * anymore, but ProcWaitForSignal() can return for other signals as
3820  * well. We take care to only reset the flag if we're the waiter, as
3821  * theoretically another backend could have started waiting. That's
3822  * impossible with the current usages due to table level locking, but
3823  * better be safe.
3824  */
3825  buf_state = LockBufHdr(bufHdr);
3826  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
3827  bufHdr->wait_backend_pid == MyProcPid)
3828  buf_state &= ~BM_PIN_COUNT_WAITER;
3829  UnlockBufHdr(bufHdr, buf_state);
3830 
3831  PinCountWaitBuf = NULL;
3832  /* Loop back and try again */
3833  }
3834 }
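
A hedged sketch of a vacuum-style caller (compare lazy_scan_heap(), listed above): the buffer is pinned first, then the backend waits until its pin is the only one before rearranging the page in ways that would invalidate other backends' page pointers. rel, blkno and vac_strategy are assumed variables.

    Buffer      buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                         RBM_NORMAL, vac_strategy);

    LockBufferForCleanup(buf);  /* returns with exclusive lock and pin count 1 */
    /* ... prune and defragment the page ... */
    UnlockReleaseBuffer(buf);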

◆ LockBufHdr()

uint32 LockBufHdr ( BufferDesc desc)

Definition at line 4241 of file bufmgr.c.

References BM_LOCKED, finish_spin_delay(), init_local_spin_delay, perform_spin_delay(), pg_atomic_fetch_or_u32(), and BufferDesc::state.

Referenced by AbortBufferIO(), apw_dump_now(), BufferAlloc(), BufferGetLSNAtomic(), BufferSync(), ConditionalLockBufferForCleanup(), DropDatabaseBuffers(), DropRelFileNodeBuffers(), DropRelFileNodesAllBuffers(), FlushBuffer(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetBufferFromRing(), InvalidateBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkBufferDirtyHint(), pg_buffercache_pages(), ReadBuffer_common(), StartBufferIO(), StrategyGetBuffer(), SyncOneBuffer(), TerminateBufferIO(), UnlockBuffers(), UnpinBuffer(), and WaitIO().

4242 {
4243  SpinDelayStatus delayStatus;
4244  uint32 old_buf_state;
4245 
4246  init_local_spin_delay(&delayStatus);
4247 
4248  while (true)
4249  {
4250  /* set BM_LOCKED flag */
4251  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
4252  /* if it wasn't set before we're OK */
4253  if (!(old_buf_state & BM_LOCKED))
4254  break;
4255  perform_spin_delay(&delayStatus);
4256  }
4257  finish_spin_delay(&delayStatus);
4258  return old_buf_state | BM_LOCKED;
4259 }
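
The return value is the buffer's state word with BM_LOCKED set; callers typically adjust it and pass the new value to UnlockBufHdr, which stores it and clears BM_LOCKED in a single atomic write. A minimal sketch of that idiom as it appears throughout this file:

    uint32      buf_state = LockBufHdr(bufHdr);

    buf_state |= BM_DIRTY | BM_JUST_DIRTIED;    /* example state change */
    UnlockBufHdr(bufHdr, buf_state);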

◆ MarkBufferDirty()

void MarkBufferDirty ( Buffer  buffer)

Definition at line 1419 of file bufmgr.c.

References Assert, BM_DIRTY, BM_JUST_DIRTIED, BM_LOCKED, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetContentLock, BufferIsLocal, BufferIsPinned, BufferIsValid, elog, ERROR, GetBufferDescriptor, LW_EXCLUSIVE, LWLockHeldByMeInMode(), MarkLocalBufferDirty(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), pgBufferUsage, BufferUsage::shared_blks_dirtied, BufferDesc::state, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, VacuumPageDirty, and WaitBufHdrUnlocked().

Referenced by _bt_clear_incomplete_split(), _bt_dedup_one_page(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_getroot(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_newroot(), _bt_restore_meta(), _bt_split(), _bt_unlink_halfdead_page(), _bt_update_meta_cleanup_info(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_freeovflpage(), _hash_init(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), addLeafTuple(), brin_doinsert(), brin_doupdate(), brin_initialize_empty_new_buffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinRevmapDesummarizeRange(), btree_xlog_dedup(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), createPostingTree(), do_setval(), doPickSplit(), fill_seq_with_data(), FreeSpaceMapPrepareTruncateRel(), generic_redo(), GenericXLogFinish(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginDeletePage(), ginHeapTupleFastInsert(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginUpdateStats(), ginVacuumPostingTreeLeaf(), gistbuild(), gistbuildempty(), gistdeletepage(), gistplacetopage(), gistprunepage(), gistRedoClearFollowRight(), gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_squeeze_page(), hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_finish_speculative(), heap_inplace_update(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_page_prune(), heap_update(), heap_xlog_clean(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_freeze_page(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_update(), heap_xlog_visible(), lazy_scan_heap(), lazy_vacuum_page(), log_newpage_range(), moveLeafs(), nextval_internal(), RelationGetBufferForTuple(), revmap_physical_extend(), saveNodeLink(), seq_redo(), shiftList(), spgAddNodeAction(), spgbuild(), SpGistUpdateMetaPage(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), vacuumLeafPage(), vacuumLeafRoot(), vacuumRedirectAndPlaceholder(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), writeListPage(), and XLogReadBufferForRedoExtended().

1420 {
1421  BufferDesc *bufHdr;
1422  uint32 buf_state;
1423  uint32 old_buf_state;
1424 
1425  if (!BufferIsValid(buffer))
1426  elog(ERROR, "bad buffer ID: %d", buffer);
1427 
1428  if (BufferIsLocal(buffer))
1429  {
1430  MarkLocalBufferDirty(buffer);
1431  return;
1432  }
1433 
1434  bufHdr = GetBufferDescriptor(buffer - 1);
1435 
1436  Assert(BufferIsPinned(buffer));
1437  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
1438  LW_EXCLUSIVE));
1439 
1440  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
1441  for (;;)
1442  {
1443  if (old_buf_state & BM_LOCKED)
1444  old_buf_state = WaitBufHdrUnlocked(bufHdr);
1445 
1446  buf_state = old_buf_state;
1447 
1448  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1449  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
1450 
1451  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
1452  buf_state))
1453  break;
1454  }
1455 
1456  /*
1457  * If the buffer was not dirty already, do vacuum accounting.
1458  */
1459  if (!(old_buf_state & BM_DIRTY))
1460  {
1461  VacuumPageDirty++;
1462  pgBufferUsage.shared_blks_dirtied++;
1463  if (VacuumCostActive)
1464  VacuumCostBalance += VacuumCostPageDirty;
1465  }
1466 }
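
A sketch of the standard modification protocol (see src/backend/access/transam/README): take the exclusive content lock, then make the change, mark the buffer dirty, and write WAL inside one critical section. log_change() is a hypothetical stand-in for the caller's XLogBeginInsert()/XLogInsert() sequence.

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

    START_CRIT_SECTION();
    /* ... apply the change to BufferGetPage(buf) ... */
    MarkBufferDirty(buf);
    log_change(buf);            /* hypothetical WAL-logging helper */
    END_CRIT_SECTION();

    UnlockReleaseBuffer(buf);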

◆ MarkBufferDirtyHint()

void MarkBufferDirtyHint ( Buffer  buffer,
bool  buffer_std 
)

Definition at line 3504 of file bufmgr.c.

References Assert, BM_DIRTY, BM_JUST_DIRTIED, BM_PERMANENT, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetContentLock, BufferGetPage, BufferIsLocal, BufferIsValid, PGXACT::delayChkpt, elog, ERROR, GetBufferDescriptor, GetPrivateRefCount(), InvalidXLogRecPtr, LockBufHdr(), LWLockHeldByMe(), MarkLocalBufferDirty(), MyPgXact, PageSetLSN, pg_atomic_read_u32(), pgBufferUsage, RecoveryInProgress(), RelFileNodeSkippingWAL(), buftag::rnode, BufferUsage::shared_blks_dirtied, BufferDesc::state, BufferDesc::tag, UnlockBufHdr, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, VacuumPageDirty, XLogHintBitIsNeeded, XLogRecPtrIsInvalid, and XLogSaveBufferForHint().

Referenced by _bt_check_unique(), _bt_killitems(), _hash_kill_items(), brin_start_evacuating_page(), btvacuumpage(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), gistkillitems(), heap_page_prune(), read_seq_tuple(), SetHintBits(), and XLogRecordPageWithFreeSpace().

3505 {
3506  BufferDesc *bufHdr;
3507  Page page = BufferGetPage(buffer);
3508 
3509  if (!BufferIsValid(buffer))
3510  elog(ERROR, "bad buffer ID: %d", buffer);
3511 
3512  if (BufferIsLocal(buffer))
3513  {
3514  MarkLocalBufferDirty(buffer);
3515  return;
3516  }
3517 
3518  bufHdr = GetBufferDescriptor(buffer - 1);
3519 
3520  Assert(GetPrivateRefCount(buffer) > 0);
3521  /* here, either share or exclusive lock is OK */
3522  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
3523 
3524  /*
3525  * This routine might get called many times on the same page, if we are
3526  * making the first scan after commit of an xact that added/deleted many
3527  * tuples. So, be as quick as we can if the buffer is already dirty. We
3528  * do this by not acquiring spinlock if it looks like the status bits are
3529  * already set. Since we make this test unlocked, there's a chance we
3530  * might fail to notice that the flags have just been cleared, and failed
3531  * to reset them, due to memory-ordering issues. But since this function
3532  * is only intended to be used in cases where failing to write out the
3533  * data would be harmless anyway, it doesn't really matter.
3534  */
3535  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
3536  (BM_DIRTY | BM_JUST_DIRTIED))
3537  {
3538  XLogRecPtr lsn = InvalidXLogRecPtr;
3539  bool dirtied = false;
3540  bool delayChkpt = false;
3541  uint32 buf_state;
3542 
3543  /*
3544  * If we need to protect hint bit updates from torn writes, WAL-log a
3545  * full page image of the page. This full page image is only necessary
3546  * if the hint bit update is the first change to the page since the
3547  * last checkpoint.
3548  *
3549  * We don't check full_page_writes here because that logic is included
3550  * when we call XLogInsert() since the value changes dynamically.
3551  */
3552  if (XLogHintBitIsNeeded() &&
3553  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
3554  {
3555  /*
3556  * If we must not write WAL, due to a relfilenode-specific
3557  * condition or being in recovery, don't dirty the page. We can
3558  * set the hint, just not dirty the page as a result so the hint
3559  * is lost when we evict the page or shutdown.
3560  *
3561  * See src/backend/storage/page/README for longer discussion.
3562  */
3563  if (RecoveryInProgress() ||
3564  RelFileNodeSkippingWAL(bufHdr->tag.rnode))
3565  return;
3566 
3567  /*
3568  * If the block is already dirty because we either made a change
3569  * or set a hint already, then we don't need to write a full page
3570  * image. Note that aggressive cleaning of blocks dirtied by hint
3571  * bit setting would increase the call rate. Bulk setting of hint
3572  * bits would reduce the call rate...
3573  *
3574  * We must issue the WAL record before we mark the buffer dirty.
3575  * Otherwise we might write the page before we write the WAL. That
3576  * causes a race condition, since a checkpoint might occur between
3577  * writing the WAL record and marking the buffer dirty. We solve
3578  * that with a kluge, but one that is already in use during
3579  * transaction commit to prevent race conditions. Basically, we
3580  * simply prevent the checkpoint WAL record from being written
3581  * until we have marked the buffer dirty. We don't start the
3582  * checkpoint flush until we have marked dirty, so our checkpoint
3583  * must flush the change to disk successfully or the checkpoint
3584  * never gets written, so crash recovery will fix.
3585  *
3586  * It's possible we may enter here without an xid, so it is
3587  * essential that CreateCheckpoint waits for virtual transactions
3588  * rather than full transactionids.
3589  */
3590  MyPgXact->delayChkpt = delayChkpt = true;
3591  lsn = XLogSaveBufferForHint(buffer, buffer_std);
3592  }
3593 
3594  buf_state = LockBufHdr(bufHdr);
3595 
3596  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
3597 
3598  if (!(buf_state & BM_DIRTY))
3599  {
3600  dirtied = true; /* Means "will be dirtied by this action" */
3601 
3602  /*
3603  * Set the page LSN if we wrote a backup block. We aren't supposed
3604  * to set this when only holding a share lock but as long as we
3605  * serialise it somehow we're OK. We choose to set LSN while
3606  * holding the buffer header lock, which causes any reader of an
3607  * LSN who holds only a share lock to also obtain a buffer header
3608  * lock before using PageGetLSN(), which is enforced in
3609  * BufferGetLSNAtomic().
3610  *
3611  * If checksums are enabled, you might think we should reset the
3612  * checksum here. That will happen when the page is written
3613  * sometime later in this checkpoint cycle.
3614  */
3615  if (!XLogRecPtrIsInvalid(lsn))
3616  PageSetLSN(page, lsn);
3617  }
3618 
3619  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
3620  UnlockBufHdr(bufHdr, buf_state);
3621 
3622  if (delayChkpt)
3623  MyPgXact->delayChkpt = false;
3624 
3625  if (dirtied)
3626  {
3627  VacuumPageDirty++;
3628  pgBufferUsage.shared_blks_dirtied++;
3629  if (VacuumCostActive)
3630  VacuumCostBalance += VacuumCostPageDirty;
3631  }
3632  }
3633 }
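
A two-line sketch modeled on SetHintBits() in heapam.c, one of the callers listed above: hint bits are set under only a share lock and reported with MarkBufferDirtyHint() rather than MarkBufferDirty(), because losing the update is harmless. tuple and buffer are assumed variables.

    tuple->t_infomask |= HEAP_XMIN_COMMITTED;   /* recoverable hint */
    MarkBufferDirtyHint(buffer, true);          /* true: standard page layout */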

◆ NewPrivateRefCountEntry()

static PrivateRefCountEntry * NewPrivateRefCountEntry ( Buffer  buffer)
static

Definition at line 272 of file bufmgr.c.

References Assert, PrivateRefCountEntry::buffer, PrivateRefCountEntry::refcount, and ReservedRefCountEntry.

Referenced by PinBuffer(), and PinBuffer_Locked().

273 {
274  PrivateRefCountEntry *res;
275 
276  /* only allowed to be called when a reservation has been made */
277  Assert(ReservedRefCountEntry != NULL);
278 
279  /* use up the reserved entry */
280  res = ReservedRefCountEntry;
281  ReservedRefCountEntry = NULL;
282 
283  /* and fill it */
284  res->buffer = buffer;
285  res->refcount = 0;
286 
287  return res;
288 }

◆ PinBuffer()

static bool PinBuffer ( BufferDesc buf,
BufferAccessStrategy  strategy 
)
static

Definition at line 1540 of file bufmgr.c.

References Assert, BM_LOCKED, BM_MAX_USAGE_COUNT, BM_VALID, BUF_REFCOUNT_ONE, BUF_STATE_GET_USAGECOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ReservePrivateRefCountEntry(), ResourceOwnerRememberBuffer(), BufferDesc::state, and WaitBufHdrUnlocked().

Referenced by BufferAlloc().

1541 {
1542  Buffer b = BufferDescriptorGetBuffer(buf);
1543  bool result;
1544  PrivateRefCountEntry *ref;
1545 
1546  ref = GetPrivateRefCountEntry(b, true);
1547 
1548  if (ref == NULL)
1549  {
1550  uint32 buf_state;
1551  uint32 old_buf_state;
1552 
1553  ReservePrivateRefCountEntry();
1554  ref = NewPrivateRefCountEntry(b);
1555 
1556  old_buf_state = pg_atomic_read_u32(&buf->state);
1557  for (;;)
1558  {
1559  if (old_buf_state & BM_LOCKED)
1560  old_buf_state = WaitBufHdrUnlocked(buf);
1561 
1562  buf_state = old_buf_state;
1563 
1564  /* increase refcount */
1565  buf_state += BUF_REFCOUNT_ONE;
1566 
1567  if (strategy == NULL)
1568  {
1569  /* Default case: increase usagecount unless already max. */
1570  if (BUF_STATE_GET_USAGECOUNT(buf_state) != BM_MAX_USAGE_COUNT)
1571  buf_state += BUF_USAGECOUNT_ONE;
1572  }
1573  else
1574  {
1575  /*
1576  * Ring buffers shouldn't evict others from pool. Thus we
1577  * don't make usagecount more than 1.
1578  */
1579  if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
1580  buf_state += BUF_USAGECOUNT_ONE;
1581  }
1582 
1583  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
1584  buf_state))
1585  {
1586  result = (buf_state & BM_VALID) != 0;
1587  break;
1588  }
1589  }
1590  }
1591  else
1592  {
1593  /* If we previously pinned the buffer, it must surely be valid */
1594  result = true;
1595  }
1596 
1597  ref->refcount++;
1598  Assert(ref->refcount > 0);
1599  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1600  return result;
1601 }
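
The strategy branch above is what keeps ring-buffer I/O (bulk reads, vacuums) from monopolizing shared buffers: pages pinned through a strategy never climb past a usage count of 1, so the clock sweep can reclaim them soon after the scan moves on.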

◆ PinBuffer_Locked()

static void PinBuffer_Locked ( BufferDesc buf)
static

Definition at line 1625 of file bufmgr.c.

References Assert, BM_LOCKED, BUF_REFCOUNT_ONE, BufferDescriptorGetBuffer, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ResourceOwnerRememberBuffer(), BufferDesc::state, and UnlockBufHdr.

Referenced by BufferAlloc(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), and SyncOneBuffer().

1626 {
1627  Buffer b;
1628  PrivateRefCountEntry *ref;
1629  uint32 buf_state;
1630 
1631  /*
1632  * As explained, We don't expect any preexisting pins. That allows us to
1633  * manipulate the PrivateRefCount after releasing the spinlock
1634  */
1635  Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
1636 
1637  /*
1638  * Since we hold the buffer spinlock, we can update the buffer state and
1639  * release the lock in one operation.
1640  */
1641  buf_state = pg_atomic_read_u32(&buf->state);
1642  Assert(buf_state & BM_LOCKED);
1643  buf_state += BUF_REFCOUNT_ONE;
1644  UnlockBufHdr(buf, buf_state);
1645 
1646  b = BufferDescriptorGetBuffer(buf);
1647 
1648  ref = NewPrivateRefCountEntry(b);
1649  ref->refcount++;
1650 
1651  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
1652 }

◆ PrefetchBuffer()

void PrefetchBuffer ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 492 of file bufmgr.c.

References Assert, BlockNumberIsValid, BufMappingPartitionLock, BufTableHashCode(), BufTableLookup(), ereport, errcode(), errmsg(), ERROR, INIT_BUFFERTAG, LocalPrefetchBuffer(), LW_SHARED, LWLockAcquire(), LWLockRelease(), RelFileNodeBackend::node, RelationData::rd_smgr, RELATION_IS_OTHER_TEMP, RelationIsValid, RelationOpenSmgr, RelationUsesLocalBuffers, SMgrRelationData::smgr_rnode, and smgrprefetch().

Referenced by BitmapPrefetch(), count_nondeletable_pages(), HeapTupleHeaderAdvanceLatestRemovedXid(), and pg_prewarm().

493 {
494 #ifdef USE_PREFETCH
495  Assert(RelationIsValid(reln));
496  Assert(BlockNumberIsValid(blockNum));
497 
498  /* Open it at the smgr level if not already done */
499  RelationOpenSmgr(reln);
500 
501  if (RelationUsesLocalBuffers(reln))
502  {
503  /* see comments in ReadBufferExtended */
504  if (RELATION_IS_OTHER_TEMP(reln))
505  ereport(ERROR,
506  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
507  errmsg("cannot access temporary tables of other sessions")));
508 
509  /* pass it off to localbuf.c */
510  LocalPrefetchBuffer(reln->rd_smgr, forkNum, blockNum);
511  }
512  else
513  {
514  BufferTag newTag; /* identity of requested block */
515  uint32 newHash; /* hash value for newTag */
516  LWLock *newPartitionLock; /* buffer partition lock for it */
517  int buf_id;
518 
519  /* create a tag so we can lookup the buffer */
520  INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node,
521  forkNum, blockNum);
522 
523  /* determine its hash code and partition lock ID */
524  newHash = BufTableHashCode(&newTag);
525  newPartitionLock = BufMappingPartitionLock(newHash);
526 
527  /* see if the block is in the buffer pool already */
528  LWLockAcquire(newPartitionLock, LW_SHARED);
529  buf_id = BufTableLookup(&newTag, newHash);
530  LWLockRelease(newPartitionLock);
531 
532  /* If not in buffers, initiate prefetch */
533  if (buf_id < 0)
534  smgrprefetch(reln->rd_smgr, forkNum, blockNum);
535 
536  /*
537  * If the block *is* in buffers, we do nothing. This is not really
538  * ideal: the block might be just about to be evicted, which would be
539  * stupid since we know we are going to need it soon. But the only
540  * easy answer is to bump the usage_count, which does not seem like a
541  * great solution: when the caller does ultimately touch the block,
542  * usage_count would get bumped again, resulting in too much
543  * favoritism for blocks that are involved in a prefetch sequence. A
544  * real fix would involve some additional per-buffer state, and it's
545  * not clear that there's enough of a problem to justify that.
546  */
547  }
548 #endif /* USE_PREFETCH */
549 }
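
A minimal sketch in the style of pg_prewarm's prefetch mode, one of the callers listed above (rel, first and last are assumed): hinting a run of blocks lets the kernel start reading ahead of the ReadBuffer() calls that follow.

    BlockNumber blkno;

    for (blkno = first; blkno <= last; blkno++)
        PrefetchBuffer(rel, MAIN_FORKNUM, blkno);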

◆ PrintBufferLeakWarning()

void PrintBufferLeakWarning ( Buffer  buffer)

Definition at line 2510 of file bufmgr.c.

References Assert, buftag::blockNum, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BufferIsLocal, BufferIsValid, elog, buftag::forkNum, GetBufferDescriptor, GetLocalBufferDescriptor, GetPrivateRefCount(), InvalidBackendId, LocalRefCount, MyBackendId, pfree(), pg_atomic_read_u32(), relpathbackend, buftag::rnode, BufferDesc::state, BufferDesc::tag, and WARNING.

Referenced by CheckForBufferLeaks(), CheckForLocalBufferLeaks(), and ResourceOwnerReleaseInternal().

2511 {
2512  BufferDesc *buf;
2513  int32 loccount;
2514  char *path;
2515  BackendId backend;
2516  uint32 buf_state;
2517 
2518  Assert(BufferIsValid(buffer));
2519  if (BufferIsLocal(buffer))
2520  {
2521  buf = GetLocalBufferDescriptor(-buffer - 1);
2522  loccount = LocalRefCount[-buffer - 1];
2523  backend = MyBackendId;
2524  }
2525  else
2526  {
2527  buf = GetBufferDescriptor(buffer - 1);
2528  loccount = GetPrivateRefCount(buffer);
2529  backend = InvalidBackendId;
2530  }
2531 
2532  /* theoretically we should lock the bufhdr here */
2533  path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
2534  buf_state = pg_atomic_read_u32(&buf->state);
2535  elog(WARNING,
2536  "buffer refcount leak: [%03d] "
2537  "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
2538  buffer, path,
2539  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
2540  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
2541  pfree(path);
2542 }

◆ ReadBuffer()

Buffer ReadBuffer ( Relation  reln,
BlockNumber  blockNum 
)

Definition at line 557 of file bufmgr.c.

References MAIN_FORKNUM, RBM_NORMAL, and ReadBufferExtended().

Referenced by _bt_getbuf(), _bt_search_insert(), _hash_getbuf(), _hash_getbuf_with_condlock_cleanup(), blbulkdelete(), blinsert(), BloomNewBuffer(), brin_getinsertbuffer(), brinbuild(), brinGetStats(), brinGetTupleForHeapBlock(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), bt_metap(), bt_page_items(), bt_page_stats(), fill_seq_with_data(), ginFindLeafPage(), ginFindParents(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), GinNewBuffer(), ginStepRight(), ginUpdateStats(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfixsplit(), gistGetMaxLevel(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), heap_abort_speculative(), heap_compute_xid_horizon_for_tuples(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_get_latest_tid(), heap_inplace_update(), heap_lock_tuple(), heap_update(), initBloomState(), pg_visibility(), pgstatginindex_internal(), read_seq_tuple(), RelationGetBufferForTuple(), ReleaseAndReadBuffer(), revmap_get_buffer(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), shiftList(), spgdoinsert(), spgGetCache(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), and spgWalk().

558 {
559  return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
560 }

◆ ReadBuffer_common()

static Buffer ReadBuffer_common ( SMgrRelation  reln,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy,
bool hit 
)
static

Definition at line 666 of file bufmgr.c.

References Assert, RelFileNodeBackend::backend, BufferUsage::blk_read_time, BM_VALID, BufferAlloc(), BufferDescriptorGetBuffer, BufferDescriptorGetContentLock, BufHdrGetBlock, CurrentResourceOwner, RelFileNode::dbNode, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errhint(), errmsg(), ERROR, INSTR_TIME_ADD, INSTR_TIME_GET_MICROSEC, INSTR_TIME_SET_CURRENT, INSTR_TIME_SUBTRACT, BufferUsage::local_blks_hit, BufferUsage::local_blks_read, BufferUsage::local_blks_written, LocalBufferAlloc(), LocalBufHdrGetBlock, LockBufferForCleanup(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), MemSet, RelFileNodeBackend::node, P_NEW, PageIsNew, PageIsVerified(), pg_atomic_read_u32(), pg_atomic_unlocked_write_u32(), pgBufferUsage, pgstat_count_buffer_read_time, RBM_NORMAL, RBM_NORMAL_NO_LOG, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, RBM_ZERO_ON_ERROR, RelFileNode::relNode, relpath, ResourceOwnerEnlargeBuffers(), BufferUsage::shared_blks_hit, BufferUsage::shared_blks_read, BufferUsage::shared_blks_written, SMgrRelationData::smgr_rnode, smgrextend(), SmgrIsTemp, smgrnblocks(), smgrread(), RelFileNode::spcNode, StartBufferIO(), BufferDesc::state, TerminateBufferIO(), track_io_timing, UnlockBufHdr, VacuumCostActive, VacuumCostBalance, VacuumCostPageHit, VacuumCostPageMiss, VacuumPageHit, VacuumPageMiss, WARNING, and zero_damaged_pages.

Referenced by ReadBufferExtended(), and ReadBufferWithoutRelcache().

669 {
670  BufferDesc *bufHdr;
671  Block bufBlock;
672  bool found;
673  bool isExtend;
674  bool isLocalBuf = SmgrIsTemp(smgr);
675 
676  *hit = false;
677 
678  /* Make sure we will have room to remember the buffer pin */
679  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
680 
681  isExtend = (blockNum == P_NEW);
682 
683  TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
684  smgr->smgr_rnode.node.spcNode,
685  smgr->smgr_rnode.node.dbNode,
686  smgr->smgr_rnode.node.relNode,
687  smgr->smgr_rnode.backend,
688  isExtend);
689 
690  /* Substitute proper block number if caller asked for P_NEW */
691  if (isExtend)
692  blockNum = smgrnblocks(smgr, forkNum);
693 
694  if (isLocalBuf)
695  {
696  bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
697  if (found)
698  pgBufferUsage.local_blks_hit++;
699  else if (isExtend)
700  pgBufferUsage.local_blks_written++;
701  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
702  mode == RBM_ZERO_ON_ERROR)
703  pgBufferUsage.local_blks_read++;
704  }
705  else
706  {
707  /*
708  * lookup the buffer. IO_IN_PROGRESS is set if the requested block is
709  * not currently in memory.
710  */
711  bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
712  strategy, &found);
713  if (found)
714  pgBufferUsage.shared_blks_hit++;
715  else if (isExtend)
716  pgBufferUsage.shared_blks_written++;
717  else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
718  mode == RBM_ZERO_ON_ERROR)
719  pgBufferUsage.shared_blks_read++;
720  }
721 
722  /* At this point we do NOT hold any locks. */
723 
724  /* if it was already in the buffer pool, we're done */
725  if (found)
726  {
727  if (!isExtend)
728  {
729  /* Just need to update stats before we exit */
730  *hit = true;
731  VacuumPageHit++;
732 
733  if (VacuumCostActive)
734  VacuumCostBalance += VacuumCostPageHit;
735 
736  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
737  smgr->smgr_rnode.node.spcNode,
738  smgr->smgr_rnode.node.dbNode,
739  smgr->smgr_rnode.node.relNode,
740  smgr->smgr_rnode.backend,
741  isExtend,
742  found);
743 
744  /*
745  * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
746  * locked on return.
747  */
748  if (!isLocalBuf)
749  {
750  if (mode == RBM_ZERO_AND_LOCK)
751  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
752  LW_EXCLUSIVE);
753  else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
754  LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
755  }
756 
757  return BufferDescriptorGetBuffer(bufHdr);
758  }
759 
760  /*
761  * We get here only in the corner case where we are trying to extend
762  * the relation but we found a pre-existing buffer marked BM_VALID.
763  * This can happen because mdread doesn't complain about reads beyond
764  * EOF (when zero_damaged_pages is ON) and so a previous attempt to
765  * read a block beyond EOF could have left a "valid" zero-filled
766  * buffer. Unfortunately, we have also seen this case occurring
767  * because of buggy Linux kernels that sometimes return an
768  * lseek(SEEK_END) result that doesn't account for a recent write. In
769  * that situation, the pre-existing buffer would contain valid data
770  * that we don't want to overwrite. Since the legitimate case should
771  * always have left a zero-filled buffer, complain if not PageIsNew.
772  */
773  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
774  if (!PageIsNew((Page) bufBlock))
775  ereport(ERROR,
776  (errmsg("unexpected data beyond EOF in block %u of relation %s",
777  blockNum, relpath(smgr->smgr_rnode, forkNum)),
778  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
779 
780  /*
781  * We *must* do smgrextend before succeeding, else the page will not
782  * be reserved by the kernel, and the next P_NEW call will decide to
783  * return the same page. Clear the BM_VALID bit, do the StartBufferIO
784  * call that BufferAlloc didn't, and proceed.
785  */
786  if (isLocalBuf)
787  {
788  /* Only need to adjust flags */
789  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
790 
791  Assert(buf_state & BM_VALID);
792  buf_state &= ~BM_VALID;
793  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
794  }
795  else
796  {
797  /*
798  * Loop to handle the very small possibility that someone re-sets
799  * BM_VALID between our clearing it and StartBufferIO inspecting
800  * it.
801  */
802  do
803  {
804  uint32 buf_state = LockBufHdr(bufHdr);
805 
806  Assert(buf_state & BM_VALID);
807  buf_state &= ~BM_VALID;
808  UnlockBufHdr(bufHdr, buf_state);
809  } while (!StartBufferIO(bufHdr, true));
810  }
811  }
812 
813  /*
814  * if we have gotten to this point, we have allocated a buffer for the
815  * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
816  * if it's a shared buffer.
817  *
818  * Note: if smgrextend fails, we will end up with a buffer that is
819  * allocated but not marked BM_VALID. P_NEW will still select the same
820  * block number (because the relation didn't get any longer on disk) and
821  * so future attempts to extend the relation will find the same buffer (if
822  * it's not been recycled) but come right back here to try smgrextend
823  * again.
824  */
825  Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
826 
827  bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
828 
829  if (isExtend)
830  {
831  /* new buffers are zero-filled */
832  MemSet((char *) bufBlock, 0, BLCKSZ);
833  /* don't set checksum for all-zero page */
834  smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
835 
836  /*
837  * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
838  * although we're essentially performing a write. At least on linux
839  * doing so defeats the 'delayed allocation' mechanism, leading to
840  * increased file fragmentation.
841  */
842  }
843  else
844  {
845  /*
846  * Read in the page, unless the caller intends to overwrite it and
847  * just wants us to allocate a buffer.
848  */
849  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
850  MemSet((char *) bufBlock, 0, BLCKSZ);
851  else
852  {
853  instr_time io_start,
854  io_time;
855 
856  if (track_io_timing)
857  INSTR_TIME_SET_CURRENT(io_start);
858 
859  smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
860 
861  if (track_io_timing)
862  {
863  INSTR_TIME_SET_CURRENT(io_time);
864  INSTR_TIME_SUBTRACT(io_time, io_start);
865  pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
866  INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
867  }
868 
869  /* check for garbage data */
870  if (!PageIsVerified((Page) bufBlock, blockNum))
871  {
872  if (zero_damaged_pages)
873  {
874  ereport(WARNING,
875  (errcode(ERRCODE_DATA_CORRUPTED),
876  errmsg("invalid page in block %u of relation %s; zeroing out page",
877  blockNum,
878  relpath(smgr->smgr_rnode, forkNum))));
879  MemSet((char *) bufBlock, 0, BLCKSZ);
880  }
881  else
882  ereport(ERROR,
883  (errcode(ERRCODE_DATA_CORRUPTED),
884  errmsg("invalid page in block %u of relation %s",
885  blockNum,
886  relpath(smgr->smgr_rnode, forkNum))));
887  }
888  }
889  }
890 
891  /*
892  * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
893  * the page as valid, to make sure that no other backend sees the zeroed
894  * page before the caller has had a chance to initialize it.
895  *
896  * Since no-one else can be looking at the page contents yet, there is no
897  * difference between an exclusive lock and a cleanup-strength lock. (Note
898  * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
899  * they assert that the buffer is already valid.)
900  */
901  if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
902  !isLocalBuf)
903  {
904  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
905  }
906 
907  if (isLocalBuf)
908  {
909  /* Only need to adjust flags */
910  uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
911 
912  buf_state |= BM_VALID;
913  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
914  }
915  else
916  {
917  /* Set BM_VALID, terminate IO, and wake up any waiters */
918  TerminateBufferIO(bufHdr, false, BM_VALID);
919  }
920 
921  VacuumPageMiss++;
922  if (VacuumCostActive)
923  VacuumCostBalance += VacuumCostPageMiss;
924 
925  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
926  smgr->smgr_rnode.node.spcNode,
927  smgr->smgr_rnode.node.dbNode,
928  smgr->smgr_rnode.node.relNode,
929  smgr->smgr_rnode.backend,
930  isExtend,
931  found);
932 
933  return BufferDescriptorGetBuffer(bufHdr);
934 }
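
As the mode handling above implies, a caller that will overwrite the page in full can skip the physical read entirely. A minimal sketch under that assumption (compare XLogReadBufferExtended(); rnode and blkno are assumed):

    Buffer      buf = ReadBufferWithoutRelcache(rnode, MAIN_FORKNUM, blkno,
                                                RBM_ZERO_AND_LOCK, NULL);

    /* the page comes back zero-filled and exclusively locked; initialize it */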

◆ ReadBufferExtended()

Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)

Definition at line 603 of file bufmgr.c.

References buf, ereport, errcode(), errmsg(), ERROR, pgstat_count_buffer_hit, pgstat_count_buffer_read, RelationData::rd_rel, RelationData::rd_smgr, ReadBuffer_common(), RELATION_IS_OTHER_TEMP, and RelationOpenSmgr.

Referenced by _hash_getbuf_with_strategy(), _hash_getinitbuf(), _hash_getnewbuf(), autoprewarm_database_main(), blbulkdelete(), blgetbitmap(), blvacuumcleanup(), brin_vacuum_scan(), brinbuildempty(), btvacuumpage(), collect_corrupt_items(), collect_visibility_data(), count_nondeletable_pages(), fsm_readbuf(), get_raw_page_internal(), ginbuildempty(), ginbulkdelete(), ginDeletePage(), ginScanToDelete(), ginvacuumcleanup(), ginVacuumPostingTree(), ginVacuumPostingTreeLeaves(), gistbuildempty(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hashbulkdelete(), heapam_scan_analyze_next_block(), heapgetpage(), lazy_scan_heap(), lazy_vacuum_heap(), log_newpage_range(), palloc_btree_page(), pg_prewarm(), pgstat_btree_page(), pgstat_gist_page(), pgstat_heap(), pgstathashindex(), pgstatindex_impl(), ReadBuffer(), ReadBufferBI(), spgprocesspending(), spgvacuumpage(), statapprox_heap(), and vm_readbuf().
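
As a usage illustration (a hypothetical caller, not code from bufmgr.c; rel and blkno are placeholder variables), a typical bulk-read path pins the page through ReadBufferExtended() with a ring-buffer strategy, takes a shared content lock, and releases both when done:

	Buffer		buf;
	BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);

	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, strategy);
	LockBuffer(buf, BUFFER_LOCK_SHARE);
	/* ... inspect the page via BufferGetPage(buf) ... */
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	ReleaseBuffer(buf);
	FreeAccessStrategy(strategy);

Passing NULL for strategy gives normal clock-sweep buffer replacement; a BAS_BULKREAD strategy confines a large sequential scan to a small reuse ring so it does not evict the whole shared buffer pool.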

605 {
606  bool hit;
607  Buffer buf;
608 
609  /* Open it at the smgr level if not already done */
610  RelationOpenSmgr(reln);
611 
612  /*
613  * Reject attempts to read non-local temporary relations; we would be
614  * likely to get wrong data since we have no visibility into the owning
615  * session's local buffers.
616  */
617  if (RELATION_IS_OTHER_TEMP(reln))
618  ereport(ERROR,
619  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
620  errmsg("cannot access temporary tables of other sessions")));
621 
622  /*
623  * Read the buffer, and update pgstat counters to reflect a cache hit or
624  * miss.
625  */
626  pgstat_count_buffer_read(reln);
627  buf = ReadBuffer_common(reln->rd_smgr, reln->rd_rel->relpersistence,
628  forkNum, blockNum, mode, strategy, &hit);
629  if (hit)
630  pgstat_count_buffer_hit(reln);
631  return buf;
632 }
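
For comparison, ReadBuffer() (listed in the "Referenced by" set above) is a thin convenience wrapper that supplies the common defaults, reading the main fork with no strategy:

Buffer
ReadBuffer(Relation reln, BlockNumber blockNum)
{
	return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
}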