PostgreSQL Source Code  git master
bufmgr.c File Reference
#include "postgres.h"
#include <sys/file.h>
#include <unistd.h>
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
#include "catalog/catalog.h"
#include "catalog/storage.h"
#include "catalog/storage_xlog.h"
#include "executor/instrument.h"
#include "lib/binaryheap.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "storage/standby.h"
#include "utils/memdebug.h"
#include "utils/ps_status.h"
#include "utils/rel.h"
#include "utils/resowner.h"
#include "utils/timestamp.h"
#include <lib/sort_template.h>
Include dependency graph for bufmgr.c:

Go to the source code of this file.

Data Structures

struct  PrivateRefCountEntry
 
struct  CkptTsStatus
 
struct  SMgrSortArray
 

Macros

#define BufHdrGetBlock(bufHdr)   ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
 
#define BufferGetLSN(bufHdr)   (PageGetLSN(BufHdrGetBlock(bufHdr)))
 
#define LocalBufHdrGetBlock(bufHdr)    LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
 
#define BUF_WRITTEN   0x01
 
#define BUF_REUSABLE   0x02
 
#define RELS_BSEARCH_THRESHOLD   20
 
#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)
 
#define REFCOUNT_ARRAY_ENTRIES   8
 
#define BufferIsPinned(bufnum)
 
#define ST_SORT   sort_checkpoint_bufferids
 
#define ST_ELEMENT_TYPE   CkptSortItem
 
#define ST_COMPARE(a, b)   ckpt_buforder_comparator(a, b)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 
#define ST_SORT   sort_pending_writebacks
 
#define ST_ELEMENT_TYPE   PendingWriteback
 
#define ST_COMPARE(a, b)   buffertag_comparator(&a->tag, &b->tag)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 

Typedefs

typedef struct PrivateRefCountEntry PrivateRefCountEntry
 
typedef struct CkptTsStatus CkptTsStatus
 
typedef struct SMgrSortArray SMgrSortArray
 

Functions

static void ReservePrivateRefCountEntry (void)
 
static PrivateRefCountEntry * NewPrivateRefCountEntry (Buffer buffer)
 
static PrivateRefCountEntry * GetPrivateRefCountEntry (Buffer buffer, bool do_move)
 
static int32 GetPrivateRefCount (Buffer buffer)
 
static void ForgetPrivateRefCountEntry (PrivateRefCountEntry *ref)
 
static void ResOwnerReleaseBufferIO (Datum res)
 
static char * ResOwnerPrintBufferIO (Datum res)
 
static void ResOwnerReleaseBufferPin (Datum res)
 
static char * ResOwnerPrintBufferPin (Datum res)
 
static Buffer ReadBuffer_common (SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool *hit)
 
static BlockNumber ExtendBufferedRelCommon (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
 
static BlockNumber ExtendBufferedRelShared (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
 
static bool PinBuffer (BufferDesc *buf, BufferAccessStrategy strategy)
 
static void PinBuffer_Locked (BufferDesc *buf)
 
static void UnpinBuffer (BufferDesc *buf)
 
static void UnpinBufferNoOwner (BufferDesc *buf)
 
static void BufferSync (int flags)
 
static uint32 WaitBufHdrUnlocked (BufferDesc *buf)
 
static int SyncOneBuffer (int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 
static void WaitIO (BufferDesc *buf)
 
static bool StartBufferIO (BufferDesc *buf, bool forInput)
 
static void TerminateBufferIO (BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner)
 
static void AbortBufferIO (Buffer buffer)
 
static void shared_buffer_write_error_callback (void *arg)
 
static void local_buffer_write_error_callback (void *arg)
 
static BufferDesc * BufferAlloc (SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
 
static Buffer GetVictimBuffer (BufferAccessStrategy strategy, IOContext io_context)
 
static void FlushBuffer (BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
 
static void FindAndDropRelationBuffers (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
 
static void RelationCopyStorageUsingBuffer (RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
 
static void AtProcExit_Buffers (int code, Datum arg)
 
static void CheckForBufferLeaks (void)
 
static int rlocator_comparator (const void *p1, const void *p2)
 
static int buffertag_comparator (const BufferTag *ba, const BufferTag *bb)
 
static int ckpt_buforder_comparator (const CkptSortItem *a, const CkptSortItem *b)
 
static int ts_ckpt_progress_comparator (Datum a, Datum b, void *arg)
 
PrefetchBufferResult PrefetchSharedBuffer (SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
 
PrefetchBufferResult PrefetchBuffer (Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 
bool ReadRecentBuffer (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
 
Buffer ReadBuffer (Relation reln, BlockNumber blockNum)
 
Buffer ReadBufferExtended (Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
Buffer ReadBufferWithoutRelcache (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
 
Buffer ExtendBufferedRel (BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
 
BlockNumber ExtendBufferedRelBy (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
 
Buffer ExtendBufferedRelTo (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
 
static void InvalidateBuffer (BufferDesc *buf)
 
static bool InvalidateVictimBuffer (BufferDesc *buf_hdr)
 
static void LimitAdditionalPins (uint32 *additional_pins)
 
bool BufferIsExclusiveLocked (Buffer buffer)
 
bool BufferIsDirty (Buffer buffer)
 
void MarkBufferDirty (Buffer buffer)
 
Buffer ReleaseAndReadBuffer (Buffer buffer, Relation relation, BlockNumber blockNum)
 
bool BgBufferSync (WritebackContext *wb_context)
 
void AtEOXact_Buffers (bool isCommit)
 
void InitBufferPoolAccess (void)
 
char * DebugPrintBufferRefcount (Buffer buffer)
 
void CheckPointBuffers (int flags)
 
BlockNumber BufferGetBlockNumber (Buffer buffer)
 
void BufferGetTag (Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
 
BlockNumber RelationGetNumberOfBlocksInFork (Relation relation, ForkNumber forkNum)
 
bool BufferIsPermanent (Buffer buffer)
 
XLogRecPtr BufferGetLSNAtomic (Buffer buffer)
 
void DropRelationBuffers (SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
 
void DropRelationsAllBuffers (SMgrRelation *smgr_reln, int nlocators)
 
void DropDatabaseBuffers (Oid dbid)
 
void FlushRelationBuffers (Relation rel)
 
void FlushRelationsAllBuffers (SMgrRelation *smgrs, int nrels)
 
void CreateAndCopyRelationData (RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
 
void FlushDatabaseBuffers (Oid dbid)
 
void FlushOneBuffer (Buffer buffer)
 
void ReleaseBuffer (Buffer buffer)
 
void UnlockReleaseBuffer (Buffer buffer)
 
void IncrBufferRefCount (Buffer buffer)
 
void MarkBufferDirtyHint (Buffer buffer, bool buffer_std)
 
void UnlockBuffers (void)
 
void LockBuffer (Buffer buffer, int mode)
 
bool ConditionalLockBuffer (Buffer buffer)
 
void CheckBufferIsPinnedOnce (Buffer buffer)
 
void LockBufferForCleanup (Buffer buffer)
 
bool HoldingBufferPinThatDelaysRecovery (void)
 
bool ConditionalLockBufferForCleanup (Buffer buffer)
 
bool IsBufferCleanupOK (Buffer buffer)
 
uint32 LockBufHdr (BufferDesc *desc)
 
void WritebackContextInit (WritebackContext *context, int *max_pending)
 
void ScheduleBufferTagForWriteback (WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
 
void IssuePendingWritebacks (WritebackContext *wb_context, IOContext io_context)
 

Variables

bool zero_damaged_pages = false
 
int bgwriter_lru_maxpages = 100
 
double bgwriter_lru_multiplier = 2.0
 
bool track_io_timing = false
 
int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY
 
int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY
 
int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER
 
int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER
 
int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER
 
static BufferDesc * PinCountWaitBuf = NULL
 
static struct PrivateRefCountEntry PrivateRefCountArray [REFCOUNT_ARRAY_ENTRIES]
 
static HTAB * PrivateRefCountHash = NULL
 
static int32 PrivateRefCountOverflowed = 0
 
static uint32 PrivateRefCountClock = 0
 
static PrivateRefCountEntry * ReservedRefCountEntry = NULL
 
const ResourceOwnerDesc buffer_io_resowner_desc
 
const ResourceOwnerDesc buffer_pin_resowner_desc
 

Macro Definition Documentation

◆ BUF_DROP_FULL_SCAN_THRESHOLD

#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)

Definition at line 83 of file bufmgr.c.

◆ BUF_REUSABLE

#define BUF_REUSABLE   0x02

Definition at line 73 of file bufmgr.c.

◆ BUF_WRITTEN

#define BUF_WRITTEN   0x01

Definition at line 72 of file bufmgr.c.

◆ BufferGetLSN

#define BufferGetLSN (   bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))

Definition at line 65 of file bufmgr.c.

◆ BufferIsPinned

#define BufferIsPinned (   bufnum)
Value:
( \
!BufferIsValid(bufnum) ? \
false \
: \
BufferIsLocal(bufnum) ? \
(LocalRefCount[-(bufnum) - 1] > 0) \
: \
(GetPrivateRefCount(bufnum) > 0) \
)
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:405
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:301
int32 * LocalRefCount
Definition: localbuf.c:47

Definition at line 463 of file bufmgr.c.
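
A short illustrative sketch (hypothetical helper; compilable only inside bufmgr.c, since BufferIsPinned() is file-local): the macro is normally used to assert that the caller already holds a pin before the buffer's tag is consulted.

/* Hypothetical bufmgr.c-internal helper; assumes the caller pinned buf. */
static BlockNumber
sketch_pinned_block_number(Buffer buf)
{
    Assert(BufferIsPinned(buf));        /* false for InvalidBuffer; handles local and shared buffers */
    return BufferGetBlockNumber(buf);   /* tag is stable while the pin is held */
}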

◆ BufHdrGetBlock

#define BufHdrGetBlock (   bufHdr)    ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))

Definition at line 64 of file bufmgr.c.

◆ LocalBufHdrGetBlock

#define LocalBufHdrGetBlock (   bufHdr)     LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]

Definition at line 68 of file bufmgr.c.

◆ REFCOUNT_ARRAY_ENTRIES

#define REFCOUNT_ARRAY_ENTRIES   8

Definition at line 92 of file bufmgr.c.

◆ RELS_BSEARCH_THRESHOLD

#define RELS_BSEARCH_THRESHOLD   20

Definition at line 75 of file bufmgr.c.

◆ ST_COMPARE [1/2]

#define ST_COMPARE (   a,
  b 
)    ckpt_buforder_comparator(a, b)

Definition at line 5577 of file bufmgr.c.

◆ ST_COMPARE [2/2]

#define ST_COMPARE (   a,
  b 
)    buffertag_comparator(&a->tag, &b->tag)

Definition at line 5577 of file bufmgr.c.

◆ ST_DEFINE [1/2]

#define ST_DEFINE

Definition at line 5579 of file bufmgr.c.

◆ ST_DEFINE [2/2]

#define ST_DEFINE

Definition at line 5579 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [1/2]

#define ST_ELEMENT_TYPE   CkptSortItem

Definition at line 5576 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [2/2]

#define ST_ELEMENT_TYPE   PendingWriteback

Definition at line 5576 of file bufmgr.c.

◆ ST_SCOPE [1/2]

#define ST_SCOPE   static

Definition at line 5578 of file bufmgr.c.

◆ ST_SCOPE [2/2]

#define ST_SCOPE   static

Definition at line 5578 of file bufmgr.c.

◆ ST_SORT [1/2]

#define ST_SORT   sort_checkpoint_bufferids

Definition at line 5575 of file bufmgr.c.

◆ ST_SORT [2/2]

#define ST_SORT   sort_pending_writebacks

Definition at line 5575 of file bufmgr.c.

Typedef Documentation

◆ CkptTsStatus

typedef struct CkptTsStatus CkptTsStatus

◆ PrivateRefCountEntry

typedef struct PrivateRefCountEntry PrivateRefCountEntry

◆ SMgrSortArray

typedef struct SMgrSortArray SMgrSortArray

Function Documentation

◆ AbortBufferIO()

static void AbortBufferIO ( Buffer  buffer)
static

Definition at line 5282 of file bufmgr.c.

5283 {
5284  BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
5285  uint32 buf_state;
5286 
5287  buf_state = LockBufHdr(buf_hdr);
5288  Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
5289 
5290  if (!(buf_state & BM_VALID))
5291  {
5292  Assert(!(buf_state & BM_DIRTY));
5293  UnlockBufHdr(buf_hdr, buf_state);
5294  }
5295  else
5296  {
5297  Assert(buf_state & BM_DIRTY);
5298  UnlockBufHdr(buf_hdr, buf_state);
5299 
5300  /* Issue notice if this is not the first failure... */
5301  if (buf_state & BM_IO_ERROR)
5302  {
5303  /* Buffer is pinned, so we can read tag without spinlock */
5304  char *path;
5305 
5306  path = relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
5307  BufTagGetForkNum(&buf_hdr->tag));
5308  ereport(WARNING,
5309  (errcode(ERRCODE_IO_ERROR),
5310  errmsg("could not write block %u of %s",
5311  buf_hdr->tag.blockNum, path),
5312  errdetail("Multiple failures --- write error might be permanent.")));
5313  pfree(path);
5314  }
5315  }
5316 
5317  TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false);
5318 }
#define BM_TAG_VALID
Definition: buf_internals.h:63
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
static BufferDesc * GetBufferDescriptor(uint32 id)
static void UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
#define BM_DIRTY
Definition: buf_internals.h:61
#define BM_IO_IN_PROGRESS
Definition: buf_internals.h:64
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
Definition: buf_internals.h:62
#define BM_IO_ERROR
Definition: buf_internals.h:65
static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner)
Definition: bufmgr.c:5245
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:5391
unsigned int uint32
Definition: c.h:493
int errdetail(const char *fmt,...)
Definition: elog.c:1206
int errcode(int sqlerrcode)
Definition: elog.c:860
int errmsg(const char *fmt,...)
Definition: elog.c:1073
#define WARNING
Definition: elog.h:36
#define ereport(elevel,...)
Definition: elog.h:149
Assert(fmt[strlen(fmt) - 1] !='\n')
void pfree(void *pointer)
Definition: mcxt.c:1405
#define relpathperm(rlocator, forknum)
Definition: relpath.h:90
BufferTag tag
BlockNumber blockNum
Definition: buf_internals.h:98

References Assert(), buftag::blockNum, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, BufTagGetForkNum(), BufTagGetRelFileLocator(), ereport, errcode(), errdetail(), errmsg(), GetBufferDescriptor(), LockBufHdr(), pfree(), relpathperm, BufferDesc::tag, TerminateBufferIO(), UnlockBufHdr(), and WARNING.

Referenced by ResOwnerReleaseBufferIO().

◆ AtEOXact_Buffers()

void AtEOXact_Buffers ( bool  isCommit)

Definition at line 3213 of file bufmgr.c.

3214 {
3215  CheckForBufferLeaks();
3216 
3217  AtEOXact_LocalBuffers(isCommit);
3218 
3219  Assert(PrivateRefCountOverflowed == 0);
3220 }
static void CheckForBufferLeaks(void)
Definition: bufmgr.c:3273
static int32 PrivateRefCountOverflowed
Definition: bufmgr.c:199
void AtEOXact_LocalBuffers(bool isCommit)
Definition: localbuf.c:820

References Assert(), AtEOXact_LocalBuffers(), CheckForBufferLeaks(), and PrivateRefCountOverflowed.

Referenced by AbortTransaction(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), CommitTransaction(), PrepareTransaction(), and WalWriterMain().

◆ AtProcExit_Buffers()

static void AtProcExit_Buffers ( int  code,
Datum  arg 
)
static

Definition at line 3255 of file bufmgr.c.

3256 {
3257  UnlockBuffers();
3258 
3259  CheckForBufferLeaks();
3260 
3261  /* localbuf.c needs a chance too */
3262  AtProcExit_LocalBuffers();
3263 }
void UnlockBuffers(void)
Definition: bufmgr.c:4768
void AtProcExit_LocalBuffers(void)
Definition: localbuf.c:831

References AtProcExit_LocalBuffers(), CheckForBufferLeaks(), and UnlockBuffers().

Referenced by InitBufferPoolAccess().

◆ BgBufferSync()

bool BgBufferSync ( WritebackContext *  wb_context)

Definition at line 2842 of file bufmgr.c.

2843 {
2844  /* info obtained from freelist.c */
2845  int strategy_buf_id;
2846  uint32 strategy_passes;
2847  uint32 recent_alloc;
2848 
2849  /*
2850  * Information saved between calls so we can determine the strategy
2851  * point's advance rate and avoid scanning already-cleaned buffers.
2852  */
2853  static bool saved_info_valid = false;
2854  static int prev_strategy_buf_id;
2855  static uint32 prev_strategy_passes;
2856  static int next_to_clean;
2857  static uint32 next_passes;
2858 
2859  /* Moving averages of allocation rate and clean-buffer density */
2860  static float smoothed_alloc = 0;
2861  static float smoothed_density = 10.0;
2862 
2863  /* Potentially these could be tunables, but for now, not */
2864  float smoothing_samples = 16;
2865  float scan_whole_pool_milliseconds = 120000.0;
2866 
2867  /* Used to compute how far we scan ahead */
2868  long strategy_delta;
2869  int bufs_to_lap;
2870  int bufs_ahead;
2871  float scans_per_alloc;
2872  int reusable_buffers_est;
2873  int upcoming_alloc_est;
2874  int min_scan_buffers;
2875 
2876  /* Variables for the scanning loop proper */
2877  int num_to_scan;
2878  int num_written;
2879  int reusable_buffers;
2880 
2881  /* Variables for final smoothed_density update */
2882  long new_strategy_delta;
2883  uint32 new_recent_alloc;
2884 
2885  /*
2886  * Find out where the freelist clock sweep currently is, and how many
2887  * buffer allocations have happened since our last call.
2888  */
2889  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
2890 
2891  /* Report buffer alloc counts to pgstat */
2892  PendingBgWriterStats.buf_alloc += recent_alloc;
2893 
2894  /*
2895  * If we're not running the LRU scan, just stop after doing the stats
2896  * stuff. We mark the saved state invalid so that we can recover sanely
2897  * if LRU scan is turned back on later.
2898  */
2899  if (bgwriter_lru_maxpages <= 0)
2900  {
2901  saved_info_valid = false;
2902  return true;
2903  }
2904 
2905  /*
2906  * Compute strategy_delta = how many buffers have been scanned by the
2907  * clock sweep since last time. If first time through, assume none. Then
2908  * see if we are still ahead of the clock sweep, and if so, how many
2909  * buffers we could scan before we'd catch up with it and "lap" it. Note:
2910  * weird-looking coding of xxx_passes comparisons are to avoid bogus
2911  * behavior when the passes counts wrap around.
2912  */
2913  if (saved_info_valid)
2914  {
2915  int32 passes_delta = strategy_passes - prev_strategy_passes;
2916 
2917  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2918  strategy_delta += (long) passes_delta * NBuffers;
2919 
2920  Assert(strategy_delta >= 0);
2921 
2922  if ((int32) (next_passes - strategy_passes) > 0)
2923  {
2924  /* we're one pass ahead of the strategy point */
2925  bufs_to_lap = strategy_buf_id - next_to_clean;
2926 #ifdef BGW_DEBUG
2927  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2928  next_passes, next_to_clean,
2929  strategy_passes, strategy_buf_id,
2930  strategy_delta, bufs_to_lap);
2931 #endif
2932  }
2933  else if (next_passes == strategy_passes &&
2934  next_to_clean >= strategy_buf_id)
2935  {
2936  /* on same pass, but ahead or at least not behind */
2937  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
2938 #ifdef BGW_DEBUG
2939  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2940  next_passes, next_to_clean,
2941  strategy_passes, strategy_buf_id,
2942  strategy_delta, bufs_to_lap);
2943 #endif
2944  }
2945  else
2946  {
2947  /*
2948  * We're behind, so skip forward to the strategy point and start
2949  * cleaning from there.
2950  */
2951 #ifdef BGW_DEBUG
2952  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
2953  next_passes, next_to_clean,
2954  strategy_passes, strategy_buf_id,
2955  strategy_delta);
2956 #endif
2957  next_to_clean = strategy_buf_id;
2958  next_passes = strategy_passes;
2959  bufs_to_lap = NBuffers;
2960  }
2961  }
2962  else
2963  {
2964  /*
2965  * Initializing at startup or after LRU scanning had been off. Always
2966  * start at the strategy point.
2967  */
2968 #ifdef BGW_DEBUG
2969  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
2970  strategy_passes, strategy_buf_id);
2971 #endif
2972  strategy_delta = 0;
2973  next_to_clean = strategy_buf_id;
2974  next_passes = strategy_passes;
2975  bufs_to_lap = NBuffers;
2976  }
2977 
2978  /* Update saved info for next time */
2979  prev_strategy_buf_id = strategy_buf_id;
2980  prev_strategy_passes = strategy_passes;
2981  saved_info_valid = true;
2982 
2983  /*
2984  * Compute how many buffers had to be scanned for each new allocation, ie,
2985  * 1/density of reusable buffers, and track a moving average of that.
2986  *
2987  * If the strategy point didn't move, we don't update the density estimate
2988  */
2989  if (strategy_delta > 0 && recent_alloc > 0)
2990  {
2991  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
2992  smoothed_density += (scans_per_alloc - smoothed_density) /
2993  smoothing_samples;
2994  }
2995 
2996  /*
2997  * Estimate how many reusable buffers there are between the current
2998  * strategy point and where we've scanned ahead to, based on the smoothed
2999  * density estimate.
3000  */
3001  bufs_ahead = NBuffers - bufs_to_lap;
3002  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3003 
3004  /*
3005  * Track a moving average of recent buffer allocations. Here, rather than
3006  * a true average we want a fast-attack, slow-decline behavior: we
3007  * immediately follow any increase.
3008  */
3009  if (smoothed_alloc <= (float) recent_alloc)
3010  smoothed_alloc = recent_alloc;
3011  else
3012  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3013  smoothing_samples;
3014 
3015  /* Scale the estimate by a GUC to allow more aggressive tuning. */
3016  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3017 
3018  /*
3019  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3020  * eventually underflow to zero, and the underflows produce annoying
3021  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3022  * zero, there's no point in tracking smaller and smaller values of
3023  * smoothed_alloc, so just reset it to exactly zero to avoid this
3024  * syndrome. It will pop back up as soon as recent_alloc increases.
3025  */
3026  if (upcoming_alloc_est == 0)
3027  smoothed_alloc = 0;
3028 
3029  /*
3030  * Even in cases where there's been little or no buffer allocation
3031  * activity, we want to make a small amount of progress through the buffer
3032  * cache so that as many reusable buffers as possible are clean after an
3033  * idle period.
3034  *
3035  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3036  * the BGW will be called during the scan_whole_pool time; slice the
3037  * buffer pool into that many sections.
3038  */
3039  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
3040 
3041  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3042  {
3043 #ifdef BGW_DEBUG
3044  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3045  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3046 #endif
3047  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3048  }
3049 
3050  /*
3051  * Now write out dirty reusable buffers, working forward from the
3052  * next_to_clean point, until we have lapped the strategy scan, or cleaned
3053  * enough buffers to match our estimate of the next cycle's allocation
3054  * requirements, or hit the bgwriter_lru_maxpages limit.
3055  */
3056 
3057  num_to_scan = bufs_to_lap;
3058  num_written = 0;
3059  reusable_buffers = reusable_buffers_est;
3060 
3061  /* Execute the LRU scan */
3062  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3063  {
3064  int sync_state = SyncOneBuffer(next_to_clean, true,
3065  wb_context);
3066 
3067  if (++next_to_clean >= NBuffers)
3068  {
3069  next_to_clean = 0;
3070  next_passes++;
3071  }
3072  num_to_scan--;
3073 
3074  if (sync_state & BUF_WRITTEN)
3075  {
3076  reusable_buffers++;
3077  if (++num_written >= bgwriter_lru_maxpages)
3078  {
3079  PendingBgWriterStats.maxwritten_clean++;
3080  break;
3081  }
3082  }
3083  else if (sync_state & BUF_REUSABLE)
3084  reusable_buffers++;
3085  }
3086 
3087  PendingBgWriterStats.buf_written_clean += num_written;
3088 
3089 #ifdef BGW_DEBUG
3090  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3091  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3092  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3093  bufs_to_lap - num_to_scan,
3094  num_written,
3095  reusable_buffers - reusable_buffers_est);
3096 #endif
3097 
3098  /*
3099  * Consider the above scan as being like a new allocation scan.
3100  * Characterize its density and update the smoothed one based on it. This
3101  * effectively halves the moving average period in cases where both the
3102  * strategy and the background writer are doing some useful scanning,
3103  * which is helpful because a long memory isn't as desirable on the
3104  * density estimates.
3105  */
3106  new_strategy_delta = bufs_to_lap - num_to_scan;
3107  new_recent_alloc = reusable_buffers - reusable_buffers_est;
3108  if (new_strategy_delta > 0 && new_recent_alloc > 0)
3109  {
3110  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
3111  smoothed_density += (scans_per_alloc - smoothed_density) /
3112  smoothing_samples;
3113 
3114 #ifdef BGW_DEBUG
3115  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3116  new_recent_alloc, new_strategy_delta,
3117  scans_per_alloc, smoothed_density);
3118 #endif
3119  }
3120 
3121  /* Return true if OK to hibernate */
3122  return (bufs_to_lap == 0 && recent_alloc == 0);
3123 }
int BgWriterDelay
Definition: bgwriter.c:61
#define BUF_REUSABLE
Definition: bufmgr.c:73
double bgwriter_lru_multiplier
Definition: bufmgr.c:138
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition: bufmgr.c:3140
int bgwriter_lru_maxpages
Definition: bufmgr.c:137
#define BUF_WRITTEN
Definition: bufmgr.c:72
signed int int32
Definition: c.h:481
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
#define elog(elevel,...)
Definition: elog.h:224
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition: freelist.c:394
int NBuffers
Definition: globals.c:139
PgStat_BgWriterStats PendingBgWriterStats
PgStat_Counter buf_written_clean
Definition: pgstat.h:255
PgStat_Counter maxwritten_clean
Definition: pgstat.h:256
PgStat_Counter buf_alloc
Definition: pgstat.h:257

References Assert(), bgwriter_lru_maxpages, bgwriter_lru_multiplier, BgWriterDelay, PgStat_BgWriterStats::buf_alloc, BUF_REUSABLE, BUF_WRITTEN, PgStat_BgWriterStats::buf_written_clean, DEBUG1, DEBUG2, elog, PgStat_BgWriterStats::maxwritten_clean, NBuffers, PendingBgWriterStats, StrategySyncStart(), and SyncOneBuffer().

Referenced by BackgroundWriterMain().
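
BgBufferSync() is meant to be called once per background-writer cycle; the boolean result tells the caller it may hibernate because neither the strategy point nor recent allocations have moved. A simplified, hypothetical driver sketch follows (the real BackgroundWriterMain() additionally handles signals, configuration reload, and a more elaborate hibernation scheme; the factor of 50 below is only illustrative):

/* Hypothetical simplified driver, loosely modeled on BackgroundWriterMain(). */
static void
sketch_bgwriter_loop(void)
{
    WritebackContext wb_context;

    WritebackContextInit(&wb_context, &bgwriter_flush_after);

    for (;;)
    {
        bool        can_hibernate = BgBufferSync(&wb_context);

        /* nap for bgwriter_delay ms; nap longer when the pool looks idle */
        (void) WaitLatch(MyLatch,
                         WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
                         can_hibernate ? 50 * BgWriterDelay : BgWriterDelay,
                         WAIT_EVENT_BGWRITER_MAIN);
        ResetLatch(MyLatch);
    }
}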

◆ BufferAlloc()

static BufferDesc * BufferAlloc ( SMgrRelation  smgr,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool *  foundPtr,
IOContext  io_context 
)
static

Definition at line 1240 of file bufmgr.c.

1244 {
1245  BufferTag newTag; /* identity of requested block */
1246  uint32 newHash; /* hash value for newTag */
1247  LWLock *newPartitionLock; /* buffer partition lock for it */
1248  int existing_buf_id;
1249  Buffer victim_buffer;
1250  BufferDesc *victim_buf_hdr;
1251  uint32 victim_buf_state;
1252 
1253  /* Make sure we will have room to remember the buffer pin */
1254  ResourceOwnerEnlarge(CurrentResourceOwner);
1255  ReservePrivateRefCountEntry();
1256 
1257  /* create a tag so we can lookup the buffer */
1258  InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
1259 
1260  /* determine its hash code and partition lock ID */
1261  newHash = BufTableHashCode(&newTag);
1262  newPartitionLock = BufMappingPartitionLock(newHash);
1263 
1264  /* see if the block is in the buffer pool already */
1265  LWLockAcquire(newPartitionLock, LW_SHARED);
1266  existing_buf_id = BufTableLookup(&newTag, newHash);
1267  if (existing_buf_id >= 0)
1268  {
1269  BufferDesc *buf;
1270  bool valid;
1271 
1272  /*
1273  * Found it. Now, pin the buffer so no one can steal it from the
1274  * buffer pool, and check to see if the correct data has been loaded
1275  * into the buffer.
1276  */
1277  buf = GetBufferDescriptor(existing_buf_id);
1278 
1279  valid = PinBuffer(buf, strategy);
1280 
1281  /* Can release the mapping lock as soon as we've pinned it */
1282  LWLockRelease(newPartitionLock);
1283 
1284  *foundPtr = true;
1285 
1286  if (!valid)
1287  {
1288  /*
1289  * We can only get here if (a) someone else is still reading in
1290  * the page, or (b) a previous read attempt failed. We have to
1291  * wait for any active read attempt to finish, and then set up our
1292  * own read attempt if the page is still not BM_VALID.
1293  * StartBufferIO does it all.
1294  */
1295  if (StartBufferIO(buf, true))
1296  {
1297  /*
1298  * If we get here, previous attempts to read the buffer must
1299  * have failed ... but we shall bravely try again.
1300  */
1301  *foundPtr = false;
1302  }
1303  }
1304 
1305  return buf;
1306  }
1307 
1308  /*
1309  * Didn't find it in the buffer pool. We'll have to initialize a new
1310  * buffer. Remember to unlock the mapping lock while doing the work.
1311  */
1312  LWLockRelease(newPartitionLock);
1313 
1314  /*
1315  * Acquire a victim buffer. Somebody else might try to do the same, we
1316  * don't hold any conflicting locks. If so we'll have to undo our work
1317  * later.
1318  */
1319  victim_buffer = GetVictimBuffer(strategy, io_context);
1320  victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
1321 
1322  /*
1323  * Try to make a hashtable entry for the buffer under its new tag. If
1324  * somebody else inserted another buffer for the tag, we'll release the
1325  * victim buffer we acquired and use the already inserted one.
1326  */
1327  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1328  existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
1329  if (existing_buf_id >= 0)
1330  {
1331  BufferDesc *existing_buf_hdr;
1332  bool valid;
1333 
1334  /*
1335  * Got a collision. Someone has already done what we were about to do.
1336  * We'll just handle this as if it were found in the buffer pool in
1337  * the first place. First, give up the buffer we were planning to
1338  * use.
1339  *
1340  * We could do this after releasing the partition lock, but then we'd
1341  * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
1342  * before acquiring the lock, for the rare case of such a collision.
1343  */
1344  UnpinBuffer(victim_buf_hdr);
1345 
1346  /*
1347  * The victim buffer we acquired previously is clean and unused, let
1348  * it be found again quickly
1349  */
1350  StrategyFreeBuffer(victim_buf_hdr);
1351 
1352  /* remaining code should match code at top of routine */
1353 
1354  existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
1355 
1356  valid = PinBuffer(existing_buf_hdr, strategy);
1357 
1358  /* Can release the mapping lock as soon as we've pinned it */
1359  LWLockRelease(newPartitionLock);
1360 
1361  *foundPtr = true;
1362 
1363  if (!valid)
1364  {
1365  /*
1366  * We can only get here if (a) someone else is still reading in
1367  * the page, or (b) a previous read attempt failed. We have to
1368  * wait for any active read attempt to finish, and then set up our
1369  * own read attempt if the page is still not BM_VALID.
1370  * StartBufferIO does it all.
1371  */
1372  if (StartBufferIO(existing_buf_hdr, true))
1373  {
1374  /*
1375  * If we get here, previous attempts to read the buffer must
1376  * have failed ... but we shall bravely try again.
1377  */
1378  *foundPtr = false;
1379  }
1380  }
1381 
1382  return existing_buf_hdr;
1383  }
1384 
1385  /*
1386  * Need to lock the buffer header too in order to change its tag.
1387  */
1388  victim_buf_state = LockBufHdr(victim_buf_hdr);
1389 
1390  /* some sanity checks while we hold the buffer header lock */
1391  Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
1392  Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
1393 
1394  victim_buf_hdr->tag = newTag;
1395 
1396  /*
1397  * Make sure BM_PERMANENT is set for buffers that must be written at every
1398  * checkpoint. Unlogged buffers only need to be written at shutdown
1399  * checkpoints, except for their "init" forks, which need to be treated
1400  * just like permanent relations.
1401  */
1402  victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1403  if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1404  victim_buf_state |= BM_PERMANENT;
1405 
1406  UnlockBufHdr(victim_buf_hdr, victim_buf_state);
1407 
1408  LWLockRelease(newPartitionLock);
1409 
1410  /*
1411  * Buffer contents are currently invalid. Try to obtain the right to
1412  * start I/O. If StartBufferIO returns false, then someone else managed
1413  * to read it before we did, so there's nothing left for BufferAlloc() to
1414  * do.
1415  */
1416  if (StartBufferIO(victim_buf_hdr, true))
1417  *foundPtr = false;
1418  else
1419  *foundPtr = true;
1420 
1421  return victim_buf_hdr;
1422 }
int Buffer
Definition: buf.h:23
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_PERMANENT
Definition: buf_internals.h:69
static LWLock * BufMappingPartitionLock(uint32 hashcode)
#define BUF_USAGECOUNT_ONE
Definition: buf_internals.h:46
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:51
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:91
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:79
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition: buf_table.c:119
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
Definition: bufmgr.c:2311
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition: bufmgr.c:1608
static bool StartBufferIO(BufferDesc *buf, bool forInput)
Definition: bufmgr.c:5190
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:239
static void UnpinBuffer(BufferDesc *buf)
Definition: bufmgr.c:2460
void StrategyFreeBuffer(BufferDesc *buf)
Definition: freelist.c:363
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1175
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1788
@ LW_SHARED
Definition: lwlock.h:117
@ LW_EXCLUSIVE
Definition: lwlock.h:116
static char * buf
Definition: pg_test_fsync.c:73
@ INIT_FORKNUM
Definition: relpath.h:53
ResourceOwner CurrentResourceOwner
Definition: resowner.c:165
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition: resowner.c:442
Definition: lwlock.h:41
RelFileLocator locator
RelFileLocatorBackend smgr_rlocator
Definition: smgr.h:37

References Assert(), BM_DIRTY, BM_IO_IN_PROGRESS, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BufferDesc::buf_id, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_ONE, BufMappingPartitionLock(), BufTableHashCode(), BufTableInsert(), BufTableLookup(), CurrentResourceOwner, GetBufferDescriptor(), GetVictimBuffer(), INIT_FORKNUM, InitBufferTag(), RelFileLocatorBackend::locator, LockBufHdr(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), PinBuffer(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), SMgrRelationData::smgr_rlocator, StartBufferIO(), StrategyFreeBuffer(), BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by ReadBuffer_common().

◆ BufferGetBlockNumber()

BlockNumber BufferGetBlockNumber ( Buffer  buffer)

Definition at line 3378 of file bufmgr.c.

3379 {
3380  BufferDesc *bufHdr;
3381 
3382  Assert(BufferIsPinned(buffer));
3383 
3384  if (BufferIsLocal(buffer))
3385  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3386  else
3387  bufHdr = GetBufferDescriptor(buffer - 1);
3388 
3389  /* pinned, so OK to read tag without spinlock */
3390  return bufHdr->tag.blockNum;
3391 }
#define BufferIsLocal(buffer)
Definition: buf.h:37
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:463

References Assert(), buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), GetLocalBufferDescriptor(), and BufferDesc::tag.

Referenced by _bt_binsrch_insert(), _bt_bottomupdel_pass(), _bt_check_unique(), _bt_checkpage(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_doinsert(), _bt_endpoint(), _bt_finish_split(), _bt_first(), _bt_getroot(), _bt_insert_parent(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_moveright(), _bt_newlevel(), _bt_pagedel(), _bt_readnextpage(), _bt_readpage(), _bt_restore_meta(), _bt_search(), _bt_simpledel_pass(), _bt_split(), _bt_unlink_halfdead_page(), _bt_walk_left(), _hash_addovflpage(), _hash_checkpage(), _hash_doinsert(), _hash_first(), _hash_freeovflpage(), _hash_getnewbuf(), _hash_readnext(), _hash_readpage(), _hash_splitbucket(), allocNewBuffer(), blinsert(), BloomInitMetapage(), brin_doinsert(), brin_doupdate(), brin_getinsertbuffer(), brin_initialize_empty_new_buffer(), brin_page_cleanup(), brin_xlog_insert_update(), brinbuild(), brinGetTupleForHeapBlock(), collectMatchBitmap(), createPostingTree(), dataBeginPlaceToPageLeaf(), dataPrepareDownlink(), doPickSplit(), entryPrepareDownlink(), fill_seq_fork_with_data(), ginEntryInsert(), ginFindParents(), ginFinishSplit(), ginPlaceToPage(), ginRedoDeleteListPages(), ginRedoUpdateMetapage(), ginScanToDelete(), gistbufferinginserttuples(), gistbuild(), gistcheckpage(), gistdeletepage(), gistformdownlink(), gistinserttuples(), gistMemorizeAllDownlinks(), gistplacetopage(), gistRelocateBuildBuffersOnSplit(), gistScanPage(), hash_xlog_add_ovfl_page(), heap_delete(), heap_hot_search_buffer(), heap_insert(), heap_multi_insert(), heap_page_is_all_visible(), heap_page_prune(), heap_prune_chain(), heap_update(), heap_xlog_confirm(), heap_xlog_lock(), index_compute_xid_horizon_for_tuples(), lazy_scan_noprune(), lazy_scan_prune(), makeSublist(), moveLeafs(), moveRightIfItNeeded(), pgstathashindex(), ReadBufferBI(), RelationAddBlocks(), RelationGetBufferForTuple(), RelationPutHeapTuple(), revmap_get_buffer(), revmap_physical_extend(), ScanSourceDatabasePgClassPage(), spgAddNodeAction(), spgbuild(), spgdoinsert(), SpGistSetLastUsedPage(), spgSplitNodeAction(), spgWalk(), startScanEntry(), terminate_brin_buildstate(), vacuumLeafPage(), visibilitymap_clear(), visibilitymap_get_status(), visibilitymap_pin(), visibilitymap_pin_ok(), and visibilitymap_set().
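
A minimal usage sketch (rel and blkno are assumed to be supplied by the caller): the only requirement is that the buffer is pinned, which ReadBuffer() guarantees until ReleaseBuffer().

/* Sketch: pin a block and read back the block number it maps. */
Buffer      buf = ReadBuffer(rel, blkno);

Assert(BufferGetBlockNumber(buf) == blkno);  /* a pin alone suffices; no content lock needed */
ReleaseBuffer(buf);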

◆ BufferGetLSNAtomic()

XLogRecPtr BufferGetLSNAtomic ( Buffer  buffer)

Definition at line 3639 of file bufmgr.c.

3640 {
3641  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
3642  char *page = BufferGetPage(buffer);
3643  XLogRecPtr lsn;
3644  uint32 buf_state;
3645 
3646  /*
3647  * If we don't need locking for correctness, fastpath out.
3648  */
3649  if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
3650  return PageGetLSN(page);
3651 
3652  /* Make sure we've got a real buffer, and that we hold a pin on it. */
3653  Assert(BufferIsValid(buffer));
3654  Assert(BufferIsPinned(buffer));
3655 
3656  buf_state = LockBufHdr(bufHdr);
3657  lsn = PageGetLSN(page);
3658  UnlockBufHdr(bufHdr, buf_state);
3659 
3660  return lsn;
3661 }
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:350
static XLogRecPtr PageGetLSN(Page page)
Definition: bufpage.h:383
#define XLogHintBitIsNeeded()
Definition: xlog.h:118
uint64 XLogRecPtr
Definition: xlogdefs.h:21

References Assert(), PrivateRefCountEntry::buffer, BufferGetPage(), BufferIsLocal, BufferIsPinned, BufferIsValid(), GetBufferDescriptor(), LockBufHdr(), PageGetLSN(), UnlockBufHdr(), and XLogHintBitIsNeeded.

Referenced by _bt_killitems(), _bt_readpage(), gistdoinsert(), gistFindPath(), gistkillitems(), gistScanPage(), SetHintBits(), and XLogSaveBufferForHint().
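
Unlike a bare PageGetLSN(), this helper is safe while holding only a pin: when hint-bit WAL-logging makes a torn LSN read a concern, it takes the buffer header spinlock. A hedged sketch (buf is assumed to be a pinned shared buffer):

/* Sketch: read the page LSN under a pin, without a content lock. */
XLogRecPtr  page_lsn = BufferGetLSNAtomic(buf);

elog(DEBUG2, "page LSN is %X/%X", LSN_FORMAT_ARGS(page_lsn));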

◆ BufferGetTag()

void BufferGetTag ( Buffer  buffer,
RelFileLocator *  rlocator,
ForkNumber *  forknum,
BlockNumber *  blknum 
)

Definition at line 3399 of file bufmgr.c.

3401 {
3402  BufferDesc *bufHdr;
3403 
3404  /* Do the same checks as BufferGetBlockNumber. */
3405  Assert(BufferIsPinned(buffer));
3406 
3407  if (BufferIsLocal(buffer))
3408  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3409  else
3410  bufHdr = GetBufferDescriptor(buffer - 1);
3411 
3412  /* pinned, so OK to read tag without spinlock */
3413  *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
3414  *forknum = BufTagGetForkNum(&bufHdr->tag);
3415  *blknum = bufHdr->tag.blockNum;
3416 }

References Assert(), buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufTagGetForkNum(), BufTagGetRelFileLocator(), GetBufferDescriptor(), GetLocalBufferDescriptor(), and BufferDesc::tag.

Referenced by fsm_search_avail(), ginRedoInsertEntry(), log_newpage_buffer(), ResolveCminCmaxDuringDecoding(), XLogRegisterBuffer(), and XLogSaveBufferForHint().
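
The tag can be decomposed without the header spinlock because the pin prevents the buffer from being re-tagged. A small sketch (buf is assumed pinned):

/* Sketch: report which relation fork/block a pinned buffer currently holds. */
RelFileLocator rlocator;
ForkNumber  forknum;
BlockNumber blknum;

BufferGetTag(buf, &rlocator, &forknum, &blknum);
elog(DEBUG2, "buffer %d holds block %u of fork %d (relfilenumber %u)",
     buf, blknum, (int) forknum, rlocator.relNumber);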

◆ BufferIsDirty()

bool BufferIsDirty ( Buffer  buffer)

Definition at line 2158 of file bufmgr.c.

2159 {
2160  BufferDesc *bufHdr;
2161 
2162  if (BufferIsLocal(buffer))
2163  {
2164  int bufid = -buffer - 1;
2165 
2166  bufHdr = GetLocalBufferDescriptor(bufid);
2167  }
2168  else
2169  {
2170  bufHdr = GetBufferDescriptor(buffer - 1);
2171  }
2172 
2173  Assert(BufferIsPinned(buffer));
2174  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2175  LW_EXCLUSIVE));
2176 
2177  return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
2178 }
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:234
static LWLock * BufferDescriptorGetContentLock(const BufferDesc *bdesc)
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1944
pg_atomic_uint32 state

References Assert(), BM_DIRTY, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), GetLocalBufferDescriptor(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), pg_atomic_read_u32(), and BufferDesc::state.

Referenced by XLogRegisterBuffer().
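
The check demands both a pin and an exclusive content lock, so it works mainly as an assertion helper: after a page modification, MarkBufferDirty() must already have been called before the buffer is registered for WAL. A hedged sketch (buf is assumed to be a pinned shared buffer):

/* Sketch: modify a page and verify the dirty flag before WAL registration. */
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
/* ... change page contents here ... */
MarkBufferDirty(buf);
Assert(BufferIsDirty(buf));          /* requires pin + exclusive content lock */
LockBuffer(buf, BUFFER_LOCK_UNLOCK);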

◆ BufferIsExclusiveLocked()

bool BufferIsExclusiveLocked ( Buffer  buffer)

Definition at line 2129 of file bufmgr.c.

2130 {
2131  BufferDesc *bufHdr;
2132 
2133  if (BufferIsLocal(buffer))
2134  {
2135  int bufid = -buffer - 1;
2136 
2137  bufHdr = GetLocalBufferDescriptor(bufid);
2138  }
2139  else
2140  {
2141  bufHdr = GetBufferDescriptor(buffer - 1);
2142  }
2143 
2144  Assert(BufferIsPinned(buffer));
2145  return LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2146  LW_EXCLUSIVE);
2147 }

References Assert(), PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), GetLocalBufferDescriptor(), LW_EXCLUSIVE, and LWLockHeldByMeInMode().

Referenced by XLogRegisterBuffer().
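
As the single caller suggests, this exists so that XLogRegisterBuffer() can assert that the registering backend really holds the content lock exclusively. A hedged sketch of the same style of check in hypothetical caller code (buf assumed pinned, locked, and already dirtied):

/* Sketch: sanity-check lock level and dirtiness before building a WAL record. */
XLogBeginInsert();
Assert(BufferIsExclusiveLocked(buf) && BufferIsDirty(buf));
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
/* ... XLogRegisterData() and XLogInsert() would follow in real code ... */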

◆ BufferIsPermanent()

bool BufferIsPermanent ( Buffer  buffer)

Definition at line 3609 of file bufmgr.c.

3610 {
3611  BufferDesc *bufHdr;
3612 
3613  /* Local buffers are used only for temp relations. */
3614  if (BufferIsLocal(buffer))
3615  return false;
3616 
3617  /* Make sure we've got a real buffer, and that we hold a pin on it. */
3618  Assert(BufferIsValid(buffer));
3619  Assert(BufferIsPinned(buffer));
3620 
3621  /*
3622  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
3623  * need not bother with the buffer header spinlock. Even if someone else
3624  * changes the buffer header state while we're doing this, the state is
3625  * changed atomically, so we'll read the old value or the new value, but
3626  * not random garbage.
3627  */
3628  bufHdr = GetBufferDescriptor(buffer - 1);
3629  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
3630 }

References Assert(), BM_PERMANENT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferIsValid(), GetBufferDescriptor(), pg_atomic_read_u32(), and BufferDesc::state.

Referenced by SetHintBits().
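
Because BM_PERMANENT cannot change while a pin is held, the flag can be read without the spinlock; SetHintBits() uses it to decide whether the page-LSN interlock for hint-bit updates applies. A hedged sketch (buf assumed pinned):

/* Sketch: apply the LSN-based interlock only for WAL-logged (permanent) buffers. */
if (BufferIsPermanent(buf))
{
    XLogRecPtr  page_lsn = BufferGetLSNAtomic(buf);

    elog(DEBUG2, "permanent buffer, page LSN %X/%X", LSN_FORMAT_ARGS(page_lsn));
}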

◆ BufferSync()

static void BufferSync ( int  flags)
static

Definition at line 2566 of file bufmgr.c.

2567 {
2568  uint32 buf_state;
2569  int buf_id;
2570  int num_to_scan;
2571  int num_spaces;
2572  int num_processed;
2573  int num_written;
2574  CkptTsStatus *per_ts_stat = NULL;
2575  Oid last_tsid;
2576  binaryheap *ts_heap;
2577  int i;
2578  int mask = BM_DIRTY;
2579  WritebackContext wb_context;
2580 
2581  /*
2582  * Unless this is a shutdown checkpoint or we have been explicitly told,
2583  * we write only permanent, dirty buffers. But at shutdown or end of
2584  * recovery, we write all dirty buffers.
2585  */
2586  if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
2587  CHECKPOINT_FLUSH_ALL))))
2588  mask |= BM_PERMANENT;
2589 
2590  /*
2591  * Loop over all buffers, and mark the ones that need to be written with
2592  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
2593  * can estimate how much work needs to be done.
2594  *
2595  * This allows us to write only those pages that were dirty when the
2596  * checkpoint began, and not those that get dirtied while it proceeds.
2597  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
2598  * later in this function, or by normal backends or the bgwriter cleaning
2599  * scan, the flag is cleared. Any buffer dirtied after this point won't
2600  * have the flag set.
2601  *
2602  * Note that if we fail to write some buffer, we may leave buffers with
2603  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
2604  * certainly need to be written for the next checkpoint attempt, too.
2605  */
2606  num_to_scan = 0;
2607  for (buf_id = 0; buf_id < NBuffers; buf_id++)
2608  {
2609  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2610 
2611  /*
2612  * Header spinlock is enough to examine BM_DIRTY, see comment in
2613  * SyncOneBuffer.
2614  */
2615  buf_state = LockBufHdr(bufHdr);
2616 
2617  if ((buf_state & mask) == mask)
2618  {
2619  CkptSortItem *item;
2620 
2621  buf_state |= BM_CHECKPOINT_NEEDED;
2622 
2623  item = &CkptBufferIds[num_to_scan++];
2624  item->buf_id = buf_id;
2625  item->tsId = bufHdr->tag.spcOid;
2626  item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
2627  item->forkNum = BufTagGetForkNum(&bufHdr->tag);
2628  item->blockNum = bufHdr->tag.blockNum;
2629  }
2630 
2631  UnlockBufHdr(bufHdr, buf_state);
2632 
2633  /* Check for barrier events in case NBuffers is large. */
2634  if (ProcSignalBarrierPending)
2635  ProcessProcSignalBarrier();
2636  }
2637 
2638  if (num_to_scan == 0)
2639  return; /* nothing to do */
2640 
2642  WritebackContextInit(&wb_context, &checkpoint_flush_after);
2643  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
2644 
2645  /*
2646  * Sort buffers that need to be written to reduce the likelihood of random
2647  * IO. The sorting is also important for the implementation of balancing
2648  * writes between tablespaces. Without balancing writes we'd potentially
2649  * end up writing to the tablespaces one-by-one; possibly overloading the
2650  * underlying system.
2651  */
2652  sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
2653 
2654  num_spaces = 0;
2655 
2656  /*
2657  * Allocate progress status for each tablespace with buffers that need to
2658  * be flushed. This requires the to-be-flushed array to be sorted.
2659  */
2660  last_tsid = InvalidOid;
2661  for (i = 0; i < num_to_scan; i++)
2662  {
2663  CkptTsStatus *s;
2664  Oid cur_tsid;
2665 
2666  cur_tsid = CkptBufferIds[i].tsId;
2667 
2668  /*
2669  * Grow array of per-tablespace status structs, every time a new
2670  * tablespace is found.
2671  */
2672  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
2673  {
2674  Size sz;
2675 
2676  num_spaces++;
2677 
2678  /*
2679  * Not worth adding grow-by-power-of-2 logic here - even with a
2680  * few hundred tablespaces this should be fine.
2681  */
2682  sz = sizeof(CkptTsStatus) * num_spaces;
2683 
2684  if (per_ts_stat == NULL)
2685  per_ts_stat = (CkptTsStatus *) palloc(sz);
2686  else
2687  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
2688 
2689  s = &per_ts_stat[num_spaces - 1];
2690  memset(s, 0, sizeof(*s));
2691  s->tsId = cur_tsid;
2692 
2693  /*
2694  * The first buffer in this tablespace. As CkptBufferIds is sorted
2695  * by tablespace all (s->num_to_scan) buffers in this tablespace
2696  * will follow afterwards.
2697  */
2698  s->index = i;
2699 
2700  /*
2701  * progress_slice will be determined once we know how many buffers
2702  * are in each tablespace, i.e. after this loop.
2703  */
2704 
2705  last_tsid = cur_tsid;
2706  }
2707  else
2708  {
2709  s = &per_ts_stat[num_spaces - 1];
2710  }
2711 
2712  s->num_to_scan++;
2713 
2714  /* Check for barrier events. */
2715  if (ProcSignalBarrierPending)
2716  ProcessProcSignalBarrier();
2717  }
2718 
2719  Assert(num_spaces > 0);
2720 
2721  /*
2722  * Build a min-heap over the write-progress in the individual tablespaces,
2723  * and compute how large a portion of the total progress a single
2724  * processed buffer is.
2725  */
2726  ts_heap = binaryheap_allocate(num_spaces,
2728  NULL);
2729 
2730  for (i = 0; i < num_spaces; i++)
2731  {
2732  CkptTsStatus *ts_stat = &per_ts_stat[i];
2733 
2734  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
2735 
2736  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
2737  }
2738 
2739  binaryheap_build(ts_heap);
2740 
2741  /*
2742  * Iterate through to-be-checkpointed buffers and write the ones (still)
2743  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
2744  * tablespaces; otherwise the sorting would lead to only one tablespace
2745  * receiving writes at a time, making inefficient use of the hardware.
2746  */
2747  num_processed = 0;
2748  num_written = 0;
2749  while (!binaryheap_empty(ts_heap))
2750  {
2751  BufferDesc *bufHdr = NULL;
2752  CkptTsStatus *ts_stat = (CkptTsStatus *)
2753  DatumGetPointer(binaryheap_first(ts_heap));
2754 
2755  buf_id = CkptBufferIds[ts_stat->index].buf_id;
2756  Assert(buf_id != -1);
2757 
2758  bufHdr = GetBufferDescriptor(buf_id);
2759 
2760  num_processed++;
2761 
2762  /*
2763  * We don't need to acquire the lock here, because we're only looking
2764  * at a single bit. It's possible that someone else writes the buffer
2765  * and clears the flag right after we check, but that doesn't matter
2766  * since SyncOneBuffer will then do nothing. However, there is a
2767  * further race condition: it's conceivable that between the time we
2768  * examine the bit here and the time SyncOneBuffer acquires the lock,
2769  * someone else not only wrote the buffer but replaced it with another
2770  * page and dirtied it. In that improbable case, SyncOneBuffer will
2771  * write the buffer though we didn't need to. It doesn't seem worth
2772  * guarding against this, though.
2773  */
2774  if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
2775  {
2776  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
2777  {
2778  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
2779  PendingCheckpointerStats.buffers_written++;
2780  num_written++;
2781  }
2782  }
2783 
2784  /*
2785  * Measure progress independent of actually having to flush the buffer
2786  * - otherwise writing become unbalanced.
2787  */
2788  ts_stat->progress += ts_stat->progress_slice;
2789  ts_stat->num_scanned++;
2790  ts_stat->index++;
2791 
2792  /* Have all the buffers from the tablespace been processed? */
2793  if (ts_stat->num_scanned == ts_stat->num_to_scan)
2794  {
2795  binaryheap_remove_first(ts_heap);
2796  }
2797  else
2798  {
2799  /* update heap with the new progress */
2800  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
2801  }
2802 
2803  /*
2804  * Sleep to throttle our I/O rate.
2805  *
2806  * (This will check for barrier events even if it doesn't sleep.)
2807  */
2808  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
2809  }
2810 
2811  /*
2812  * Issue all pending flushes. Only checkpointer calls BufferSync(), so
2813  * IOContext will always be IOCONTEXT_NORMAL.
2814  */
2815  IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
2816 
2817  pfree(per_ts_stat);
2818  per_ts_stat = NULL;
2819  binaryheap_free(ts_heap);
2820 
2821  /*
2822  * Update checkpoint statistics. As noted above, this doesn't include
2823  * buffers written by other backends or bgwriter scan.
2824  */
2825  CheckpointStats.ckpt_bufs_written += num_written;
2826 
2827  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
2828 }
void binaryheap_build(binaryheap *heap)
Definition: binaryheap.c:138
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:255
bh_node_type binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:177
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition: binaryheap.c:192
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition: binaryheap.c:39
void binaryheap_free(binaryheap *heap)
Definition: binaryheap.c:75
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:116
#define binaryheap_empty(h)
Definition: binaryheap.h:65
CkptSortItem * CkptBufferIds
Definition: buf_init.c:25
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
#define BM_CHECKPOINT_NEEDED
Definition: buf_internals.h:68
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition: bufmgr.c:5510
int checkpoint_flush_after
Definition: bufmgr.c:160
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition: bufmgr.c:5533
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition: bufmgr.c:5590
struct CkptTsStatus CkptTsStatus
double float8
Definition: c.h:617
size_t Size
Definition: c.h:592
void CheckpointWriteDelay(int flags, double progress)
Definition: checkpointer.c:705
volatile sig_atomic_t ProcSignalBarrierPending
Definition: globals.c:38
int i
Definition: isn.c:73
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1425
void * palloc(Size size)
Definition: mcxt.c:1201
@ IOCONTEXT_NORMAL
Definition: pgstat.h:290
PgStat_CheckpointerStats PendingCheckpointerStats
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:322
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
void ProcessProcSignalBarrier(void)
Definition: procsignal.c:465
int ckpt_bufs_written
Definition: xlog.h:165
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition: bufmgr.c:111
int index
Definition: bufmgr.c:119
int num_scanned
Definition: bufmgr.c:116
float8 progress
Definition: bufmgr.c:110
int num_to_scan
Definition: bufmgr.c:114
Oid tsId
Definition: bufmgr.c:101
PgStat_Counter buffers_written
Definition: pgstat.h:270
Oid spcOid
Definition: buf_internals.h:94
CheckpointStatsData CheckpointStats
Definition: xlog.c:214
#define CHECKPOINT_END_OF_RECOVERY
Definition: xlog.h:138
#define CHECKPOINT_FLUSH_ALL
Definition: xlog.h:141
#define CHECKPOINT_IS_SHUTDOWN
Definition: xlog.h:137

References Assert(), binaryheap_add_unordered(), binaryheap_allocate(), binaryheap_build(), binaryheap_empty, binaryheap_first(), binaryheap_free(), binaryheap_remove_first(), binaryheap_replace_first(), buftag::blockNum, CkptSortItem::blockNum, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_PERMANENT, CkptSortItem::buf_id, BUF_WRITTEN, PgStat_CheckpointerStats::buffers_written, BufTagGetForkNum(), BufTagGetRelNumber(), CHECKPOINT_END_OF_RECOVERY, checkpoint_flush_after, CHECKPOINT_FLUSH_ALL, CHECKPOINT_IS_SHUTDOWN, CheckpointStats, CheckpointWriteDelay(), CheckpointStatsData::ckpt_bufs_written, CkptBufferIds, DatumGetPointer(), CkptSortItem::forkNum, GetBufferDescriptor(), i, CkptTsStatus::index, InvalidOid, IOCONTEXT_NORMAL, IssuePendingWritebacks(), LockBufHdr(), NBuffers, CkptTsStatus::num_scanned, CkptTsStatus::num_to_scan, palloc(), PendingCheckpointerStats, pfree(), pg_atomic_read_u32(), PointerGetDatum(), ProcessProcSignalBarrier(), ProcSignalBarrierPending, CkptTsStatus::progress, CkptTsStatus::progress_slice, CkptSortItem::relNumber, repalloc(), buftag::spcOid, BufferDesc::state, SyncOneBuffer(), BufferDesc::tag, ts_ckpt_progress_comparator(), CkptTsStatus::tsId, CkptSortItem::tsId, UnlockBufHdr(), and WritebackContextInit().

Referenced by CheckPointBuffers().

◆ buffertag_comparator()

static int buffertag_comparator ( const BufferTag *  ba,
const BufferTag *  bb 
)
inlinestatic

Definition at line 5445 of file bufmgr.c.

5446 {
5447  int ret;
5448  RelFileLocator rlocatora;
5449  RelFileLocator rlocatorb;
5450 
5451  rlocatora = BufTagGetRelFileLocator(ba);
5452  rlocatorb = BufTagGetRelFileLocator(bb);
5453 
5454  ret = rlocator_comparator(&rlocatora, &rlocatorb);
5455 
5456  if (ret != 0)
5457  return ret;
5458 
5459  if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
5460  return -1;
5461  if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
5462  return 1;
5463 
5464  if (ba->blockNum < bb->blockNum)
5465  return -1;
5466  if (ba->blockNum > bb->blockNum)
5467  return 1;
5468 
5469  return 0;
5470 }
static int rlocator_comparator(const void *p1, const void *p2)
Definition: bufmgr.c:5364

References buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), and rlocator_comparator().

◆ CheckBufferIsPinnedOnce()

void CheckBufferIsPinnedOnce ( Buffer  buffer)

Definition at line 4843 of file bufmgr.c.

4844 {
4845  if (BufferIsLocal(buffer))
4846  {
4847  if (LocalRefCount[-buffer - 1] != 1)
4848  elog(ERROR, "incorrect local pin count: %d",
4849  LocalRefCount[-buffer - 1]);
4850  }
4851  else
4852  {
4853  if (GetPrivateRefCount(buffer) != 1)
4854  elog(ERROR, "incorrect local pin count: %d",
4855  GetPrivateRefCount(buffer));
4856  }
4857 }
#define ERROR
Definition: elog.h:39

References PrivateRefCountEntry::buffer, BufferIsLocal, elog, ERROR, GetPrivateRefCount(), and LocalRefCount.

Referenced by GetVictimBuffer(), and LockBufferForCleanup().
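
A caller-side sketch (not part of bufmgr.c; the helper name and relation are hypothetical) of the precondition this check enforces: a backend that intends to take a cleanup lock should hold exactly one pin on the target page.

    #include "postgres.h"
    #include "storage/bufmgr.h"
    #include "utils/rel.h"

    /* Hypothetical helper: cleanup-lock one block of "rel". */
    static void
    cleanup_lock_block(Relation rel, BlockNumber blkno)
    {
        Buffer      buf = ReadBuffer(rel, blkno);   /* our only pin */

        CheckBufferIsPinnedOnce(buf);   /* elog(ERROR) if the local pin count is not 1 */
        LockBufferForCleanup(buf);      /* wait until we are also the only pinner overall */

        /* ... prune or defragment the page here ... */

        UnlockReleaseBuffer(buf);
    }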

◆ CheckForBufferLeaks()

static void CheckForBufferLeaks ( void  )
static

Definition at line 3273 of file bufmgr.c.

3274 {
3275 #ifdef USE_ASSERT_CHECKING
3276  int RefCountErrors = 0;
3277  PrivateRefCountEntry *res;
3278  int i;
3279  char *s;
3280 
3281  /* check the array */
3282  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
3283  {
3284  res = &PrivateRefCountArray[i];
3285 
3286  if (res->buffer != InvalidBuffer)
3287  {
3288  s = DebugPrintBufferRefcount(res->buffer);
3289  elog(WARNING, "buffer refcount leak: %s", s);
3290  pfree(s);
3291 
3292  RefCountErrors++;
3293  }
3294  }
3295 
3296  /* if necessary search the hash */
3297  if (PrivateRefCountOverflowed)
3298  {
3299  HASH_SEQ_STATUS hstat;
3300 
3301  hash_seq_init(&hstat, PrivateRefCountHash);
3302  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
3303  {
3304  s = DebugPrintBufferRefcount(res->buffer);
3305  elog(WARNING, "buffer refcount leak: %s", s);
3306  pfree(s);
3307  RefCountErrors++;
3308  }
3309  }
3310 
3311  Assert(RefCountErrors == 0);
3312 #endif
3313 }
#define InvalidBuffer
Definition: buf.h:25
char * DebugPrintBufferRefcount(Buffer buffer)
Definition: bufmgr.c:3319
#define REFCOUNT_ARRAY_ENTRIES
Definition: bufmgr.c:92
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition: bufmgr.c:197
static HTAB * PrivateRefCountHash
Definition: bufmgr.c:198
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1431
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1421

References Assert(), DebugPrintBufferRefcount(), elog, hash_seq_init(), hash_seq_search(), i, InvalidBuffer, pfree(), PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, res, and WARNING.

Referenced by AtEOXact_Buffers(), and AtProcExit_Buffers().

◆ CheckPointBuffers()

void CheckPointBuffers ( int  flags)

Definition at line 3364 of file bufmgr.c.

3365 {
3366  BufferSync(flags);
3367 }
static void BufferSync(int flags)
Definition: bufmgr.c:2566

References BufferSync().

Referenced by CheckPointGuts().

◆ ckpt_buforder_comparator()

static int ckpt_buforder_comparator ( const CkptSortItem a,
const CkptSortItem b 
)
inlinestatic

Definition at line 5479 of file bufmgr.c.

5480 {
5481  /* compare tablespace */
5482  if (a->tsId < b->tsId)
5483  return -1;
5484  else if (a->tsId > b->tsId)
5485  return 1;
5486  /* compare relation */
5487  if (a->relNumber < b->relNumber)
5488  return -1;
5489  else if (a->relNumber > b->relNumber)
5490  return 1;
5491  /* compare fork */
5492  else if (a->forkNum < b->forkNum)
5493  return -1;
5494  else if (a->forkNum > b->forkNum)
5495  return 1;
5496  /* compare block number */
5497  else if (a->blockNum < b->blockNum)
5498  return -1;
5499  else if (a->blockNum > b->blockNum)
5500  return 1;
5501  /* equal page IDs are unlikely, but not impossible */
5502  return 0;
5503 }
int b
Definition: isn.c:70
int a
Definition: isn.c:69

References a, and b.

◆ ConditionalLockBuffer()

bool ConditionalLockBuffer ( Buffer  buffer)

Definition at line 4822 of file bufmgr.c.

4823 {
4824  BufferDesc *buf;
4825 
4826  Assert(BufferIsPinned(buffer));
4827  if (BufferIsLocal(buffer))
4828  return true; /* act as though we got it */
4829 
4830  buf = GetBufferDescriptor(buffer - 1);
4831 
4832  return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
4833  LW_EXCLUSIVE);
4834 }
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1346

References Assert(), buf, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), LW_EXCLUSIVE, and LWLockConditionalAcquire().

Referenced by _bt_conditionallockbuf(), BloomNewBuffer(), ConditionalLockBufferForCleanup(), GinNewBuffer(), gistNewBuffer(), RelationGetBufferForTuple(), spgdoinsert(), SpGistGetBuffer(), SpGistNewBuffer(), and SpGistUpdateMetaPage().
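
A usage sketch (not from bufmgr.c): opportunistic work on an already-pinned buffer that simply gives up when the exclusive content lock is contended. The helper name is hypothetical, and the page modification is left as a placeholder.

    #include "postgres.h"
    #include "storage/bufmgr.h"

    /* Returns true if the page could be locked and updated without waiting. */
    static bool
    try_update_page(Buffer buf)
    {
        /* "buf" must already be pinned by this backend */
        if (!ConditionalLockBuffer(buf))
            return false;               /* someone else holds the lock; don't block */

        /* ... modify the page, MarkBufferDirty(buf), WAL-log the change ... */

        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
        return true;
    }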

◆ ConditionalLockBufferForCleanup()

bool ConditionalLockBufferForCleanup ( Buffer  buffer)

Definition at line 5037 of file bufmgr.c.

5038 {
5039  BufferDesc *bufHdr;
5040  uint32 buf_state,
5041  refcount;
5042 
5043  Assert(BufferIsValid(buffer));
5044 
5045  if (BufferIsLocal(buffer))
5046  {
5047  refcount = LocalRefCount[-buffer - 1];
5048  /* There should be exactly one pin */
5049  Assert(refcount > 0);
5050  if (refcount != 1)
5051  return false;
5052  /* Nobody else to wait for */
5053  return true;
5054  }
5055 
5056  /* There should be exactly one local pin */
5057  refcount = GetPrivateRefCount(buffer);
5058  Assert(refcount);
5059  if (refcount != 1)
5060  return false;
5061 
5062  /* Try to acquire lock */
5063  if (!ConditionalLockBuffer(buffer))
5064  return false;
5065 
5066  bufHdr = GetBufferDescriptor(buffer - 1);
5067  buf_state = LockBufHdr(bufHdr);
5068  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
5069 
5070  Assert(refcount > 0);
5071  if (refcount == 1)
5072  {
5073  /* Successfully acquired exclusive lock with pincount 1 */
5074  UnlockBufHdr(bufHdr, buf_state);
5075  return true;
5076  }
5077 
5078  /* Failed, so release the lock */
5079  UnlockBufHdr(bufHdr, buf_state);
5080  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5081  return false;
5082 }
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:4822
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:4796
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:157

References Assert(), BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid(), ConditionalLockBuffer(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBuffer(), LockBufHdr(), PrivateRefCountEntry::refcount, and UnlockBufHdr().

Referenced by _hash_finish_split(), _hash_getbuf_with_condlock_cleanup(), heap_page_prune_opt(), and lazy_scan_heap().
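
A sketch of the opportunistic-cleanup pattern used by callers such as heap_page_prune_opt(), simplified here; the actual pruning work is a placeholder.

    #include "postgres.h"
    #include "storage/bufmgr.h"

    /* Prune a pinned page only if a cleanup lock is available right now. */
    static void
    maybe_prune_page(Buffer buf)
    {
        if (!ConditionalLockBufferForCleanup(buf))
            return;                     /* contended: skip, rather than wait */

        /* exclusive lock held and we are the sole pinner: safe to move tuples */

        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    }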

◆ CreateAndCopyRelationData()

void CreateAndCopyRelationData ( RelFileLocator  src_rlocator,
RelFileLocator  dst_rlocator,
bool  permanent 
)

Definition at line 4435 of file bufmgr.c.

4437 {
4438  char relpersistence;
4439  SMgrRelation src_rel;
4440  SMgrRelation dst_rel;
4441 
4442  /* Set the relpersistence. */
4443  relpersistence = permanent ?
4444  RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
4445 
4446  src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
4447  dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
4448 
4449  /*
4450  * Create and copy all forks of the relation. During create database we
4451  * have a separate cleanup mechanism which deletes complete database
4452  * directory. Therefore, each individual relation doesn't need to be
4453  * registered for cleanup.
4454  */
4455  RelationCreateStorage(dst_rlocator, relpersistence, false);
4456 
4457  /* copy main fork. */
4458  RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
4459  permanent);
4460 
4461  /* copy those extra forks that exist */
4462  for (ForkNumber forkNum = MAIN_FORKNUM + 1;
4463  forkNum <= MAX_FORKNUM; forkNum++)
4464  {
4465  if (smgrexists(src_rel, forkNum))
4466  {
4467  smgrcreate(dst_rel, forkNum, false);
4468 
4469  /*
4470  * WAL log creation if the relation is persistent, or this is the
4471  * init fork of an unlogged relation.
4472  */
4473  if (permanent || forkNum == INIT_FORKNUM)
4474  log_smgrcreate(&dst_rlocator, forkNum);
4475 
4476  /* Copy a fork's data, block by block. */
4477  RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
4478  permanent);
4479  }
4480  }
4481 }
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition: bufmgr.c:4344
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
ForkNumber
Definition: relpath.h:48
@ MAIN_FORKNUM
Definition: relpath.h:50
#define MAX_FORKNUM
Definition: relpath.h:62
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition: smgr.c:199
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition: smgr.c:414
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:401
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition: storage.c:121
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition: storage.c:186

References INIT_FORKNUM, INVALID_PROC_NUMBER, log_smgrcreate(), MAIN_FORKNUM, MAX_FORKNUM, RelationCopyStorageUsingBuffer(), RelationCreateStorage(), smgrcreate(), smgrexists(), and smgropen().

Referenced by CreateDatabaseUsingWalLog().

◆ DebugPrintBufferRefcount()

char* DebugPrintBufferRefcount ( Buffer  buffer)

Definition at line 3319 of file bufmgr.c.

3320 {
3321  BufferDesc *buf;
3322  int32 loccount;
3323  char *path;
3324  char *result;
3325  ProcNumber backend;
3326  uint32 buf_state;
3327 
3328  Assert(BufferIsValid(buffer));
3329  if (BufferIsLocal(buffer))
3330  {
3331  buf = GetLocalBufferDescriptor(-buffer - 1);
3332  loccount = LocalRefCount[-buffer - 1];
3333  backend = MyProcNumber;
3334  }
3335  else
3336  {
3337  buf = GetBufferDescriptor(buffer - 1);
3338  loccount = GetPrivateRefCount(buffer);
3339  backend = INVALID_PROC_NUMBER;
3340  }
3341 
3342  /* theoretically we should lock the bufhdr here */
3343  path = relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
3344  BufTagGetForkNum(&buf->tag));
3345  buf_state = pg_atomic_read_u32(&buf->state);
3346 
3347  result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
3348  buffer, path,
3349  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
3350  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
3351  pfree(path);
3352  return result;
3353 }
#define BUF_FLAG_MASK
Definition: buf_internals.h:48
ProcNumber MyProcNumber
Definition: globals.c:86
int ProcNumber
Definition: procnumber.h:24
char * psprintf(const char *fmt,...)
Definition: psprintf.c:46
#define relpathbackend(rlocator, backend, forknum)
Definition: relpath.h:85

References Assert(), buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), BufTagGetForkNum(), BufTagGetRelFileLocator(), GetBufferDescriptor(), GetLocalBufferDescriptor(), GetPrivateRefCount(), INVALID_PROC_NUMBER, LocalRefCount, MyProcNumber, pfree(), pg_atomic_read_u32(), psprintf(), and relpathbackend.

Referenced by CheckForBufferLeaks(), CheckForLocalBufferLeaks(), and ResOwnerPrintBufferPin().
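
The result is a palloc'd string that the caller must free, as in this sketch (the WARNING wording is illustrative, not a message emitted by bufmgr.c):

    #include "postgres.h"
    #include "storage/bufmgr.h"

    /* Sketch: dump the pin state of a buffer while chasing a refcount leak. */
    static void
    report_buffer_state(Buffer buf)
    {
        char   *s = DebugPrintBufferRefcount(buf);

        elog(WARNING, "buffer state: %s", s);
        pfree(s);                       /* caller owns the returned string */
    }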

◆ DropDatabaseBuffers()

void DropDatabaseBuffers ( Oid  dbid)

Definition at line 4040 of file bufmgr.c.

4041 {
4042  int i;
4043 
4044  /*
4045  * We needn't consider local buffers, since by assumption the target
4046  * database isn't our own.
4047  */
4048 
4049  for (i = 0; i < NBuffers; i++)
4050  {
4051  BufferDesc *bufHdr = GetBufferDescriptor(i);
4052  uint32 buf_state;
4053 
4054  /*
4055  * As in DropRelationBuffers, an unlocked precheck should be safe and
4056  * saves some cycles.
4057  */
4058  if (bufHdr->tag.dbOid != dbid)
4059  continue;
4060 
4061  buf_state = LockBufHdr(bufHdr);
4062  if (bufHdr->tag.dbOid == dbid)
4063  InvalidateBuffer(bufHdr); /* releases spinlock */
4064  else
4065  UnlockBufHdr(bufHdr, buf_state);
4066  }
4067 }
static void InvalidateBuffer(BufferDesc *buf)
Definition: bufmgr.c:1442
Oid dbOid
Definition: buf_internals.h:95

References buftag::dbOid, GetBufferDescriptor(), i, InvalidateBuffer(), LockBufHdr(), NBuffers, BufferDesc::tag, and UnlockBufHdr().

Referenced by createdb_failure_callback(), dbase_redo(), dropdb(), and movedb().

◆ DropRelationBuffers()

void DropRelationBuffers ( SMgrRelation  smgr_reln,
ForkNumber forkNum,
int  nforks,
BlockNumber firstDelBlock 
)

Definition at line 3685 of file bufmgr.c.

3687 {
3688  int i;
3689  int j;
3690  RelFileLocatorBackend rlocator;
3691  BlockNumber nForkBlock[MAX_FORKNUM];
3692  uint64 nBlocksToInvalidate = 0;
3693 
3694  rlocator = smgr_reln->smgr_rlocator;
3695 
3696  /* If it's a local relation, it's localbuf.c's problem. */
3697  if (RelFileLocatorBackendIsTemp(rlocator))
3698  {
3699  if (rlocator.backend == MyProcNumber)
3700  {
3701  for (j = 0; j < nforks; j++)
3702  DropRelationLocalBuffers(rlocator.locator, forkNum[j],
3703  firstDelBlock[j]);
3704  }
3705  return;
3706  }
3707 
3708  /*
3709  * To remove all the pages of the specified relation forks from the buffer
3710  * pool, we need to scan the entire buffer pool but we can optimize it by
3711  * finding the buffers from BufMapping table provided we know the exact
3712  * size of each fork of the relation. The exact size is required to ensure
3713  * that we don't leave any buffer for the relation being dropped as
3714  * otherwise the background writer or checkpointer can lead to a PANIC
3715  * error while flushing buffers corresponding to files that don't exist.
3716  *
3717  * To know the exact size, we rely on the size cached for each fork by us
3718  * during recovery which limits the optimization to recovery and on
3719  * standbys but we can easily extend it once we have shared cache for
3720  * relation size.
3721  *
3722  * In recovery, we cache the value returned by the first lseek(SEEK_END)
3723  * and the future writes keeps the cached value up-to-date. See
3724  * smgrextend. It is possible that the value of the first lseek is smaller
3725  * than the actual number of existing blocks in the file due to buggy
3726  * Linux kernels that might not have accounted for the recent write. But
3727  * that should be fine because there must not be any buffers after that
3728  * file size.
3729  */
3730  for (i = 0; i < nforks; i++)
3731  {
3732  /* Get the number of blocks for a relation's fork */
3733  nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
3734 
3735  if (nForkBlock[i] == InvalidBlockNumber)
3736  {
3737  nBlocksToInvalidate = InvalidBlockNumber;
3738  break;
3739  }
3740 
3741  /* calculate the number of blocks to be invalidated */
3742  nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
3743  }
3744 
3745  /*
3746  * We apply the optimization iff the total number of blocks to invalidate
3747  * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
3748  */
3749  if (BlockNumberIsValid(nBlocksToInvalidate) &&
3750  nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
3751  {
3752  for (j = 0; j < nforks; j++)
3753  FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
3754  nForkBlock[j], firstDelBlock[j]);
3755  return;
3756  }
3757 
3758  for (i = 0; i < NBuffers; i++)
3759  {
3760  BufferDesc *bufHdr = GetBufferDescriptor(i);
3761  uint32 buf_state;
3762 
3763  /*
3764  * We can make this a tad faster by prechecking the buffer tag before
3765  * we attempt to lock the buffer; this saves a lot of lock
3766  * acquisitions in typical cases. It should be safe because the
3767  * caller must have AccessExclusiveLock on the relation, or some other
3768  * reason to be certain that no one is loading new pages of the rel
3769  * into the buffer pool. (Otherwise we might well miss such pages
3770  * entirely.) Therefore, while the tag might be changing while we
3771  * look at it, it can't be changing *to* a value we care about, only
3772  * *away* from such a value. So false negatives are impossible, and
3773  * false positives are safe because we'll recheck after getting the
3774  * buffer lock.
3775  *
3776  * We could check forkNum and blockNum as well as the rlocator, but
3777  * the incremental win from doing so seems small.
3778  */
3779  if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
3780  continue;
3781 
3782  buf_state = LockBufHdr(bufHdr);
3783 
3784  for (j = 0; j < nforks; j++)
3785  {
3786  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
3787  BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
3788  bufHdr->tag.blockNum >= firstDelBlock[j])
3789  {
3790  InvalidateBuffer(bufHdr); /* releases spinlock */
3791  break;
3792  }
3793  }
3794  if (j >= nforks)
3795  UnlockBufHdr(bufHdr, buf_state);
3796  }
3797 }
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition: bufmgr.c:83
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition: bufmgr.c:3979
int j
Definition: isn.c:74
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber firstDelBlock)
Definition: localbuf.c:490
#define RelFileLocatorBackendIsTemp(rlocator)
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:682

References RelFileLocatorBackend::backend, buftag::blockNum, BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetForkNum(), BufTagMatchesRelFileLocator(), DropRelationLocalBuffers(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, InvalidateBuffer(), InvalidBlockNumber, j, RelFileLocatorBackend::locator, LockBufHdr(), MAX_FORKNUM, MyProcNumber, NBuffers, RelFileLocatorBackendIsTemp, SMgrRelationData::smgr_rlocator, smgrnblocks_cached(), BufferDesc::tag, and UnlockBufHdr().

Referenced by smgrtruncate().

◆ DropRelationsAllBuffers()

void DropRelationsAllBuffers ( SMgrRelation smgr_reln,
int  nlocators 
)

Definition at line 3808 of file bufmgr.c.

3809 {
3810  int i;
3811  int n = 0;
3812  SMgrRelation *rels;
3813  BlockNumber (*block)[MAX_FORKNUM + 1];
3814  uint64 nBlocksToInvalidate = 0;
3815  RelFileLocator *locators;
3816  bool cached = true;
3817  bool use_bsearch;
3818 
3819  if (nlocators == 0)
3820  return;
3821 
3822  rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
3823 
3824  /* If it's a local relation, it's localbuf.c's problem. */
3825  for (i = 0; i < nlocators; i++)
3826  {
3827  if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
3828  {
3829  if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
3830  DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
3831  }
3832  else
3833  rels[n++] = smgr_reln[i];
3834  }
3835 
3836  /*
3837  * If there are no non-local relations, then we're done. Release the
3838  * memory and return.
3839  */
3840  if (n == 0)
3841  {
3842  pfree(rels);
3843  return;
3844  }
3845 
3846  /*
3847  * This is used to remember the number of blocks for all the relations
3848  * forks.
3849  */
3850  block = (BlockNumber (*)[MAX_FORKNUM + 1])
3851  palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
3852 
3853  /*
3854  * We can avoid scanning the entire buffer pool if we know the exact size
3855  * of each of the given relation forks. See DropRelationBuffers.
3856  */
3857  for (i = 0; i < n && cached; i++)
3858  {
3859  for (int j = 0; j <= MAX_FORKNUM; j++)
3860  {
3861  /* Get the number of blocks for a relation's fork. */
3862  block[i][j] = smgrnblocks_cached(rels[i], j);
3863 
3864  /* We only need to consider the relation forks that exist. */
3865  if (block[i][j] == InvalidBlockNumber)
3866  {
3867  if (!smgrexists(rels[i], j))
3868  continue;
3869  cached = false;
3870  break;
3871  }
3872 
3873  /* calculate the total number of blocks to be invalidated */
3874  nBlocksToInvalidate += block[i][j];
3875  }
3876  }
3877 
3878  /*
3879  * We apply the optimization iff the total number of blocks to invalidate
3880  * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
3881  */
3882  if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
3883  {
3884  for (i = 0; i < n; i++)
3885  {
3886  for (int j = 0; j <= MAX_FORKNUM; j++)
3887  {
3888  /* ignore relation forks that don't exist */
3889  if (!BlockNumberIsValid(block[i][j]))
3890  continue;
3891 
3892  /* drop all the buffers for a particular relation fork */
3893  FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
3894  j, block[i][j], 0);
3895  }
3896  }
3897 
3898  pfree(block);
3899  pfree(rels);
3900  return;
3901  }
3902 
3903  pfree(block);
3904  locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
3905  for (i = 0; i < n; i++)
3906  locators[i] = rels[i]->smgr_rlocator.locator;
3907 
3908  /*
3909  * For low number of relations to drop just use a simple walk through, to
3910  * save the bsearch overhead. The threshold to use is rather a guess than
3911  * an exactly determined value, as it depends on many factors (CPU and RAM
3912  * speeds, amount of shared buffers etc.).
3913  */
3914  use_bsearch = n > RELS_BSEARCH_THRESHOLD;
3915 
3916  /* sort the list of rlocators if necessary */
3917  if (use_bsearch)
3918  qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
3919 
3920  for (i = 0; i < NBuffers; i++)
3921  {
3922  RelFileLocator *rlocator = NULL;
3923  BufferDesc *bufHdr = GetBufferDescriptor(i);
3924  uint32 buf_state;
3925 
3926  /*
3927  * As in DropRelationBuffers, an unlocked precheck should be safe and
3928  * saves some cycles.
3929  */
3930 
3931  if (!use_bsearch)
3932  {
3933  int j;
3934 
3935  for (j = 0; j < n; j++)
3936  {
3937  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
3938  {
3939  rlocator = &locators[j];
3940  break;
3941  }
3942  }
3943  }
3944  else
3945  {
3946  RelFileLocator locator;
3947 
3948  locator = BufTagGetRelFileLocator(&bufHdr->tag);
3949  rlocator = bsearch((const void *) &(locator),
3950  locators, n, sizeof(RelFileLocator),
3952  }
3953 
3954  /* buffer doesn't belong to any of the given relfilelocators; skip it */
3955  if (rlocator == NULL)
3956  continue;
3957 
3958  buf_state = LockBufHdr(bufHdr);
3959  if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
3960  InvalidateBuffer(bufHdr); /* releases spinlock */
3961  else
3962  UnlockBufHdr(bufHdr, buf_state);
3963  }
3964 
3965  pfree(locators);
3966  pfree(rels);
3967 }
#define RELS_BSEARCH_THRESHOLD
Definition: bufmgr.c:75
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition: localbuf.c:538
#define qsort(a, b, c, d)
Definition: port.h:449

References BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), DropRelationAllLocalBuffers(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, if(), InvalidateBuffer(), InvalidBlockNumber, j, LockBufHdr(), MAX_FORKNUM, MyProcNumber, NBuffers, palloc(), pfree(), qsort, RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, rlocator_comparator(), smgrexists(), smgrnblocks_cached(), BufferDesc::tag, and UnlockBufHdr().

Referenced by smgrdounlinkall().

◆ ExtendBufferedRel()

Buffer ExtendBufferedRel ( BufferManagerRelation  bmr,
ForkNumber  forkNum,
BufferAccessStrategy  strategy,
uint32  flags 
)

Definition at line 839 of file bufmgr.c.

843 {
844  Buffer buf;
845  uint32 extend_by = 1;
846 
847  ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
848  &buf, &extend_by);
849 
850  return buf;
851 }
BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:871

References buf, and ExtendBufferedRelBy().

Referenced by _bt_allocbuf(), _hash_getnewbuf(), BloomNewBuffer(), brinbuild(), brinbuildempty(), fill_seq_fork_with_data(), ginbuildempty(), GinNewBuffer(), gistbuildempty(), gistNewBuffer(), ReadBuffer_common(), revmap_physical_extend(), and SpGistNewBuffer().
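
A sketch of the common single-block extension pattern seen in the callers above (assumptions: BMR_REL() wraps a Relation into a BufferManagerRelation, and EB_LOCK_FIRST returns the new buffer already exclusively locked; WAL logging is omitted for brevity):

    #include "postgres.h"
    #include "storage/bufmgr.h"
    #include "storage/bufpage.h"
    #include "utils/rel.h"

    /* Sketch: append one zero-filled block to the main fork and initialize it. */
    static Buffer
    append_new_block(Relation rel)
    {
        Buffer      buf;

        buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL,
                                EB_LOCK_FIRST);

        /* pinned and exclusively locked; the new page is all zeroes */
        PageInit(BufferGetPage(buf), BufferGetPageSize(buf), 0);
        MarkBufferDirty(buf);

        return buf;                     /* caller unlocks and releases it */
    }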

◆ ExtendBufferedRelBy()

BlockNumber ExtendBufferedRelBy ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
Buffer buffers,
uint32 extended_by 
)

Definition at line 871 of file bufmgr.c.

878 {
879  Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
880  Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
881  Assert(extend_by > 0);
882 
883  if (bmr.smgr == NULL)
884  {
885  bmr.smgr = RelationGetSmgr(bmr.rel);
886  bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
887  }
888 
889  return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
890  extend_by, InvalidBlockNumber,
891  buffers, extended_by);
892 }
static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:1805
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:566
struct SMgrRelationData * smgr
Definition: bufmgr.h:102
Form_pg_class rd_rel
Definition: rel.h:111

References Assert(), ExtendBufferedRelCommon(), InvalidBlockNumber, RelationData::rd_rel, BufferManagerRelation::rel, RelationGetSmgr(), BufferManagerRelation::relpersistence, and BufferManagerRelation::smgr.

Referenced by ExtendBufferedRel(), and RelationAddBlocks().
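
A sketch of bulk extension, loosely modeled on RelationAddBlocks(); extended_by reports how many blocks were actually added, which can be fewer than requested:

    #include "postgres.h"
    #include "storage/bufmgr.h"
    #include "utils/rel.h"

    /* Sketch: grow the main fork by up to 8 blocks in a single call. */
    static BlockNumber
    bulk_extend(Relation rel)
    {
        Buffer      buffers[8];
        uint32      extended_by = 0;
        BlockNumber first_block;

        first_block = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM,
                                          NULL, 0,  /* no strategy, no flags */
                                          lengthof(buffers),
                                          buffers, &extended_by);

        /* the returned buffers are pinned but not locked */
        for (uint32 i = 0; i < extended_by; i++)
            ReleaseBuffer(buffers[i]);

        return first_block;             /* block number of the first new page */
    }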

◆ ExtendBufferedRelCommon()

static BlockNumber ExtendBufferedRelCommon ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
BlockNumber  extend_upto,
Buffer buffers,
uint32 extended_by 
)
static

Definition at line 1805 of file bufmgr.c.

1813 {
1814  BlockNumber first_block;
1815 
1816  TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
1817  bmr.smgr->smgr_rlocator.locator.spcOid,
1818  bmr.smgr->smgr_rlocator.locator.dbOid,
1819  bmr.smgr->smgr_rlocator.locator.relNumber,
1820  bmr.smgr->smgr_rlocator.backend,
1821  extend_by);
1822 
1823  if (bmr.relpersistence == RELPERSISTENCE_TEMP)
1824  first_block = ExtendBufferedRelLocal(bmr, fork, flags,
1825  extend_by, extend_upto,
1826  buffers, &extend_by);
1827  else
1828  first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
1829  extend_by, extend_upto,
1830  buffers, &extend_by);
1831  *extended_by = extend_by;
1832 
1833  TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
1834  bmr.smgr->smgr_rlocator.locator.spcOid,
1835  bmr.smgr->smgr_rlocator.locator.dbOid,
1836  bmr.smgr->smgr_rlocator.locator.relNumber,
1837  bmr.smgr->smgr_rlocator.backend,
1838  *extended_by,
1839  first_block);
1840 
1841  return first_block;
1842 }
static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:1849
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: localbuf.c:314
RelFileNumber relNumber

References RelFileLocatorBackend::backend, RelFileLocator::dbOid, ExtendBufferedRelLocal(), ExtendBufferedRelShared(), RelFileLocatorBackend::locator, RelFileLocator::relNumber, BufferManagerRelation::relpersistence, BufferManagerRelation::smgr, SMgrRelationData::smgr_rlocator, and RelFileLocator::spcOid.

Referenced by ExtendBufferedRelBy(), and ExtendBufferedRelTo().

◆ ExtendBufferedRelShared()

static BlockNumber ExtendBufferedRelShared ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
BlockNumber  extend_upto,
Buffer buffers,
uint32 extended_by 
)
static

Definition at line 1849 of file bufmgr.c.

1857 {
1858  BlockNumber first_block;
1859  IOContext io_context = IOContextForStrategy(strategy);
1860  instr_time io_start;
1861 
1862  LimitAdditionalPins(&extend_by);
1863 
1864  /*
1865  * Acquire victim buffers for extension without holding extension lock.
1866  * Writing out victim buffers is the most expensive part of extending the
1867  * relation, particularly when doing so requires WAL flushes. Zeroing out
1868  * the buffers is also quite expensive, so do that before holding the
1869  * extension lock as well.
1870  *
1871  * These pages are pinned by us and not valid. While we hold the pin they
1872  * can't be acquired as victim buffers by another backend.
1873  */
1874  for (uint32 i = 0; i < extend_by; i++)
1875  {
1876  Block buf_block;
1877 
1878  buffers[i] = GetVictimBuffer(strategy, io_context);
1879  buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
1880 
1881  /* new buffers are zero-filled */
1882  MemSet((char *) buf_block, 0, BLCKSZ);
1883  }
1884 
1885  /*
1886  * Lock relation against concurrent extensions, unless requested not to.
1887  *
1888  * We use the same extension lock for all forks. That's unnecessarily
1889  * restrictive, but currently extensions for forks don't happen often
1890  * enough to make it worth locking more granularly.
1891  *
1892  * Note that another backend might have extended the relation by the time
1893  * we get the lock.
1894  */
1895  if (!(flags & EB_SKIP_EXTENSION_LOCK))
1896  LockRelationForExtension(bmr.rel, ExclusiveLock);
1897 
1898  /*
1899  * If requested, invalidate size cache, so that smgrnblocks asks the
1900  * kernel.
1901  */
1902  if (flags & EB_CLEAR_SIZE_CACHE)
1903  bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
1904 
1905  first_block = smgrnblocks(bmr.smgr, fork);
1906 
1907  /*
1908  * Now that we have the accurate relation size, check if the caller wants
1909  * us to extend to only up to a specific size. If there were concurrent
1910  * extensions, we might have acquired too many buffers and need to release
1911  * them.
1912  */
1913  if (extend_upto != InvalidBlockNumber)
1914  {
1915  uint32 orig_extend_by = extend_by;
1916 
1917  if (first_block > extend_upto)
1918  extend_by = 0;
1919  else if ((uint64) first_block + extend_by > extend_upto)
1920  extend_by = extend_upto - first_block;
1921 
1922  for (uint32 i = extend_by; i < orig_extend_by; i++)
1923  {
1924  BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
1925 
1926  /*
1927  * The victim buffer we acquired previously is clean and unused,
1928  * let it be found again quickly
1929  */
1930  StrategyFreeBuffer(buf_hdr);
1931  UnpinBuffer(buf_hdr);
1932  }
1933 
1934  if (extend_by == 0)
1935  {
1936  if (!(flags & EB_SKIP_EXTENSION_LOCK))
1937  UnlockRelationForExtension(bmr.rel, ExclusiveLock);
1938  *extended_by = extend_by;
1939  return first_block;
1940  }
1941  }
1942 
1943  /* Fail if relation is already at maximum possible length */
1944  if ((uint64) first_block + extend_by >= MaxBlockNumber)
1945  ereport(ERROR,
1946  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1947  errmsg("cannot extend relation %s beyond %u blocks",
1948  relpath(bmr.smgr->smgr_rlocator, fork),
1949  MaxBlockNumber)));
1950 
1951  /*
1952  * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
1953  *
1954  * This needs to happen before we extend the relation, because as soon as
1955  * we do, other backends can start to read in those pages.
1956  */
1957  for (uint32 i = 0; i < extend_by; i++)
1958  {
1959  Buffer victim_buf = buffers[i];
1960  BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
1961  BufferTag tag;
1962  uint32 hash;
1963  LWLock *partition_lock;
1964  int existing_id;
1965 
1966  /* in case we need to pin an existing buffer below */
1967  ResourceOwnerEnlarge(CurrentResourceOwner);
1968  ReservePrivateRefCountEntry();
1969 
1970  InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
1971  hash = BufTableHashCode(&tag);
1972  partition_lock = BufMappingPartitionLock(hash);
1973 
1974  LWLockAcquire(partition_lock, LW_EXCLUSIVE);
1975 
1976  existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
1977 
1978  /*
1979  * We get here only in the corner case where we are trying to extend
1980  * the relation but we found a pre-existing buffer. This can happen
1981  * because a prior attempt at extending the relation failed, and
1982  * because mdread doesn't complain about reads beyond EOF (when
1983  * zero_damaged_pages is ON) and so a previous attempt to read a block
1984  * beyond EOF could have left a "valid" zero-filled buffer.
1985  * Unfortunately, we have also seen this case occurring because of
1986  * buggy Linux kernels that sometimes return an lseek(SEEK_END) result
1987  * that doesn't account for a recent write. In that situation, the
1988  * pre-existing buffer would contain valid data that we don't want to
1989  * overwrite. Since the legitimate cases should always have left a
1990  * zero-filled buffer, complain if not PageIsNew.
1991  */
1992  if (existing_id >= 0)
1993  {
1994  BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
1995  Block buf_block;
1996  bool valid;
1997 
1998  /*
1999  * Pin the existing buffer before releasing the partition lock,
2000  * preventing it from being evicted.
2001  */
2002  valid = PinBuffer(existing_hdr, strategy);
2003 
2004  LWLockRelease(partition_lock);
2005 
2006  /*
2007  * The victim buffer we acquired previously is clean and unused,
2008  * let it be found again quickly
2009  */
2010  StrategyFreeBuffer(victim_buf_hdr);
2011  UnpinBuffer(victim_buf_hdr);
2012 
2013  buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2014  buf_block = BufHdrGetBlock(existing_hdr);
2015 
2016  if (valid && !PageIsNew((Page) buf_block))
2017  ereport(ERROR,
2018  (errmsg("unexpected data beyond EOF in block %u of relation %s",
2019  existing_hdr->tag.blockNum, relpath(bmr.smgr->smgr_rlocator, fork)),
2020  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
2021 
2022  /*
2023  * We *must* do smgr[zero]extend before succeeding, else the page
2024  * will not be reserved by the kernel, and the next P_NEW call
2025  * will decide to return the same page. Clear the BM_VALID bit,
2026  * do StartBufferIO() and proceed.
2027  *
2028  * Loop to handle the very small possibility that someone re-sets
2029  * BM_VALID between our clearing it and StartBufferIO inspecting
2030  * it.
2031  */
2032  do
2033  {
2034  uint32 buf_state = LockBufHdr(existing_hdr);
2035 
2036  buf_state &= ~BM_VALID;
2037  UnlockBufHdr(existing_hdr, buf_state);
2038  } while (!StartBufferIO(existing_hdr, true));
2039  }
2040  else
2041  {
2042  uint32 buf_state;
2043 
2044  buf_state = LockBufHdr(victim_buf_hdr);
2045 
2046  /* some sanity checks while we hold the buffer header lock */
2047  Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2048  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2049 
2050  victim_buf_hdr->tag = tag;
2051 
2052  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2053  if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2054  buf_state |= BM_PERMANENT;
2055 
2056  UnlockBufHdr(victim_buf_hdr, buf_state);
2057 
2058  LWLockRelease(partition_lock);
2059 
2060  /* XXX: could combine the locked operations in it with the above */
2061  StartBufferIO(victim_buf_hdr, true);
2062  }
2063  }
2064 
2065  io_start = pgstat_prepare_io_time(track_io_timing);
2066 
2067  /*
2068  * Note: if smgrzeroextend fails, we will end up with buffers that are
2069  * allocated but not marked BM_VALID. The next relation extension will
2070  * still select the same block number (because the relation didn't get any
2071  * longer on disk) and so future attempts to extend the relation will find
2072  * the same buffers (if they have not been recycled) but come right back
2073  * here to try smgrzeroextend again.
2074  *
2075  * We don't need to set checksum for all-zero pages.
2076  */
2077  smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
2078 
2079  /*
2080  * Release the file-extension lock; it's now OK for someone else to extend
2081  * the relation some more.
2082  *
2083  * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2084  * take noticeable time.
2085  */
2086  if (!(flags & EB_SKIP_EXTENSION_LOCK))
2087  UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2088 
2089  pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
2090  io_start, extend_by);
2091 
2092  /* Set BM_VALID, terminate IO, and wake up any waiters */
2093  for (uint32 i = 0; i < extend_by; i++)
2094  {
2095  Buffer buf = buffers[i];
2096  BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2097  bool lock = false;
2098 
2099  if (flags & EB_LOCK_FIRST && i == 0)
2100  lock = true;
2101  else if (flags & EB_LOCK_TARGET)
2102  {
2103  Assert(extend_upto != InvalidBlockNumber);
2104  if (first_block + i + 1 == extend_upto)
2105  lock = true;
2106  }
2107 
2108  if (lock)
2109  LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
2110 
2111  TerminateBufferIO(buf_hdr, false, BM_VALID, true);
2112  }
2113 
2114  pgBufferUsage.shared_blks_written += extend_by;
2115 
2116  *extended_by = extend_by;
2117 
2118  return first_block;
2119 }
#define MaxBlockNumber
Definition: block.h:35
#define BM_JUST_DIRTIED
Definition: buf_internals.h:66
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
bool track_io_timing
Definition: bufmgr.c:139
static void LimitAdditionalPins(uint32 *additional_pins)
Definition: bufmgr.c:1774
#define BufHdrGetBlock(bufHdr)
Definition: bufmgr.c:64
void * Block
Definition: bufmgr.h:24
@ EB_LOCK_TARGET
Definition: bufmgr.h:91
@ EB_CLEAR_SIZE_CACHE
Definition: bufmgr.h:88
@ EB_SKIP_EXTENSION_LOCK
Definition: bufmgr.h:73
@ EB_LOCK_FIRST
Definition: bufmgr.h:85
Pointer Page
Definition: bufpage.h:78
static bool PageIsNew(Page page)
Definition: bufpage.h:230
#define MemSet(start, val, len)
Definition: c.h:1007
int errhint(const char *fmt,...)
Definition: elog.c:1320
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:716
BufferUsage pgBufferUsage
Definition: instrument.c:20
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:431
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:481
#define ExclusiveLock
Definition: lockdefs.h:42
@ IOOBJECT_RELATION
Definition: pgstat.h:280
IOContext
Definition: pgstat.h:287
@ IOOP_EXTEND
Definition: pgstat.h:299
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition: pgstat_io.c:100
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt)
Definition: pgstat_io.c:122
static unsigned hash(unsigned *uv, int n)
Definition: rege_dfa.c:715
#define relpath(rlocator, forknum)
Definition: relpath.h:94
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:658
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition: smgr.c:563
int64 shared_blks_written
Definition: instrument.h:29
BlockNumber smgr_cached_nblocks[MAX_FORKNUM+1]
Definition: smgr.h:46

References Assert(), buftag::blockNum, BM_DIRTY, BM_JUST_DIRTIED, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BufferDesc::buf_id, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer(), BufferDescriptorGetContentLock(), BufHdrGetBlock, BufMappingPartitionLock(), BufTableHashCode(), BufTableInsert(), CurrentResourceOwner, EB_CLEAR_SIZE_CACHE, EB_LOCK_FIRST, EB_LOCK_TARGET, EB_SKIP_EXTENSION_LOCK, ereport, errcode(), errhint(), errmsg(), ERROR, ExclusiveLock, GetBufferDescriptor(), GetVictimBuffer(), hash(), i, INIT_FORKNUM, InitBufferTag(), InvalidBlockNumber, IOContextForStrategy(), IOOBJECT_RELATION, IOOP_EXTEND, LimitAdditionalPins(), RelFileLocatorBackend::locator, LockBufHdr(), LockRelationForExtension(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), MaxBlockNumber, MemSet, PageIsNew(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), PinBuffer(), BufferManagerRelation::rel, relpath, BufferManagerRelation::relpersistence, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferUsage::shared_blks_written, BufferManagerRelation::smgr, SMgrRelationData::smgr_cached_nblocks, SMgrRelationData::smgr_rlocator, smgrnblocks(), smgrzeroextend(), StartBufferIO(), StrategyFreeBuffer(), BufferDesc::tag, TerminateBufferIO(), track_io_timing, UnlockBufHdr(), UnlockRelationForExtension(), and UnpinBuffer().

Referenced by ExtendBufferedRelCommon().

◆ ExtendBufferedRelTo()

Buffer ExtendBufferedRelTo ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
BlockNumber  extend_to,
ReadBufferMode  mode 
)

Definition at line 903 of file bufmgr.c.

909 {
910  BlockNumber current_size;
911  uint32 extended_by = 0;
912  Buffer buffer = InvalidBuffer;
913  Buffer buffers[64];
914 
915  Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
916  Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
917  Assert(extend_to != InvalidBlockNumber && extend_to > 0);
918 
919  if (bmr.smgr == NULL)
920  {
921  bmr.smgr = RelationGetSmgr(bmr.rel);
922  bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
923  }
924 
925  /*
926  * If desired, create the file if it doesn't exist. If
927  * smgr_cached_nblocks[fork] is positive then it must exist, no need for
928  * an smgrexists call.
929  */
930  if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
931  (bmr.smgr->smgr_cached_nblocks[fork] == 0 ||
932  bmr.smgr->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
933  !smgrexists(bmr.smgr, fork))
934  {
935  LockRelationForExtension(bmr.rel, ExclusiveLock);
936 
937  /* recheck, fork might have been created concurrently */
938  if (!smgrexists(bmr.smgr, fork))
939  smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY);
940 
942  }
943 
944  /*
945  * If requested, invalidate size cache, so that smgrnblocks asks the
946  * kernel.
947  */
948  if (flags & EB_CLEAR_SIZE_CACHE)
949  bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
950 
951  /*
952  * Estimate how many pages we'll need to extend by. This avoids acquiring
953  * unnecessarily many victim buffers.
954  */
955  current_size = smgrnblocks(bmr.smgr, fork);
956 
957  /*
958  * Since no-one else can be looking at the page contents yet, there is no
959  * difference between an exclusive lock and a cleanup-strength lock. Note
960  * that we pass the original mode to ReadBuffer_common() below, when
961  * falling back to reading the buffer to a concurrent relation extension.
962  */
963  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
964  flags |= EB_LOCK_TARGET;
965 
966  while (current_size < extend_to)
967  {
968  uint32 num_pages = lengthof(buffers);
969  BlockNumber first_block;
970 
971  if ((uint64) current_size + num_pages > extend_to)
972  num_pages = extend_to - current_size;
973 
974  first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
975  num_pages, extend_to,
976  buffers, &extended_by);
977 
978  current_size = first_block + extended_by;
979  Assert(num_pages != 0 || current_size >= extend_to);
980 
981  for (uint32 i = 0; i < extended_by; i++)
982  {
983  if (first_block + i != extend_to - 1)
984  ReleaseBuffer(buffers[i]);
985  else
986  buffer = buffers[i];
987  }
988  }
989 
990  /*
991  * It's possible that another backend concurrently extended the relation.
992  * In that case read the buffer.
993  *
994  * XXX: Should we control this via a flag?
995  */
996  if (buffer == InvalidBuffer)
997  {
998  bool hit;
999 
1000  Assert(extended_by == 0);
1001  buffer = ReadBuffer_common(bmr.smgr, bmr.relpersistence,
1002  fork, extend_to - 1, mode, strategy,
1003  &hit);
1004  }
1005 
1006  return buffer;
1007 }
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4561
static Buffer ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool *hit)
Definition: bufmgr.c:1015
@ EB_PERFORMING_RECOVERY
Definition: bufmgr.h:76
@ EB_CREATE_FORK_IF_NEEDED
Definition: bufmgr.h:82
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition: bufmgr.h:47
@ RBM_ZERO_AND_LOCK
Definition: bufmgr.h:45
#define lengthof(array)
Definition: c.h:775
static PgChecksumMode mode
Definition: pg_checksums.c:56
int64 current_size
Definition: pg_checksums.c:64

References Assert(), PrivateRefCountEntry::buffer, current_size, EB_CLEAR_SIZE_CACHE, EB_CREATE_FORK_IF_NEEDED, EB_LOCK_TARGET, EB_PERFORMING_RECOVERY, ExclusiveLock, ExtendBufferedRelCommon(), i, InvalidBlockNumber, InvalidBuffer, lengthof, LockRelationForExtension(), mode, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, RelationData::rd_rel, ReadBuffer_common(), BufferManagerRelation::rel, RelationGetSmgr(), ReleaseBuffer(), BufferManagerRelation::relpersistence, BufferManagerRelation::smgr, SMgrRelationData::smgr_cached_nblocks, smgrcreate(), smgrexists(), smgrnblocks(), and UnlockRelationForExtension().

Referenced by fsm_extend(), vm_extend(), and XLogReadBufferExtended().
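
A sketch modeled on the fsm_extend() and vm_extend() callers: grow a fork until a target block exists, creating the fork if necessary, and receive that block zeroed and exclusively locked (the helper name is hypothetical):

    #include "postgres.h"
    #include "storage/bufmgr.h"
    #include "utils/rel.h"

    /* Sketch: make sure block "target" of the FSM fork exists; return it locked. */
    static Buffer
    ensure_fsm_block(Relation rel, BlockNumber target)
    {
        return ExtendBufferedRelTo(BMR_REL(rel), FSM_FORKNUM, NULL,
                                   EB_CREATE_FORK_IF_NEEDED | EB_CLEAR_SIZE_CACHE,
                                   target + 1,  /* extend_to: desired fork length in blocks */
                                   RBM_ZERO_AND_LOCK);
    }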

◆ FindAndDropRelationBuffers()

static void FindAndDropRelationBuffers ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  nForkBlock,
BlockNumber  firstDelBlock 
)
static

Definition at line 3979 of file bufmgr.c.

3982 {
3983  BlockNumber curBlock;
3984 
3985  for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
3986  {
3987  uint32 bufHash; /* hash value for tag */
3988  BufferTag bufTag; /* identity of requested block */
3989  LWLock *bufPartitionLock; /* buffer partition lock for it */
3990  int buf_id;
3991  BufferDesc *bufHdr;
3992  uint32 buf_state;
3993 
3994  /* create a tag so we can lookup the buffer */
3995  InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
3996 
3997  /* determine its hash code and partition lock ID */
3998  bufHash = BufTableHashCode(&bufTag);
3999  bufPartitionLock = BufMappingPartitionLock(bufHash);
4000 
4001  /* Check that it is in the buffer pool. If not, do nothing. */
4002  LWLockAcquire(bufPartitionLock, LW_SHARED);
4003  buf_id = BufTableLookup(&bufTag, bufHash);
4004  LWLockRelease(bufPartitionLock);
4005 
4006  if (buf_id < 0)
4007  continue;
4008 
4009  bufHdr = GetBufferDescriptor(buf_id);
4010 
4011  /*
4012  * We need to lock the buffer header and recheck if the buffer is
4013  * still associated with the same block because the buffer could be
4014  * evicted by some other backend loading blocks for a different
4015  * relation after we release lock on the BufMapping table.
4016  */
4017  buf_state = LockBufHdr(bufHdr);
4018 
4019  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
4020  BufTagGetForkNum(&bufHdr->tag) == forkNum &&
4021  bufHdr->tag.blockNum >= firstDelBlock)
4022  InvalidateBuffer(bufHdr); /* releases spinlock */
4023  else
4024  UnlockBufHdr(bufHdr, buf_state);
4025  }
4026 }

References buftag::blockNum, BufMappingPartitionLock(), BufTableHashCode(), BufTableLookup(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), GetBufferDescriptor(), InitBufferTag(), InvalidateBuffer(), LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), BufferDesc::tag, and UnlockBufHdr().

Referenced by DropRelationBuffers(), and DropRelationsAllBuffers().

◆ FlushBuffer()

static void FlushBuffer ( BufferDesc buf,
SMgrRelation  reln,
IOObject  io_object,
IOContext  io_context 
)
static

Definition at line 3438 of file bufmgr.c.

3440 {
3441  XLogRecPtr recptr;
3442  ErrorContextCallback errcallback;
3443  instr_time io_start;
3444  Block bufBlock;
3445  char *bufToWrite;
3446  uint32 buf_state;
3447 
3448  /*
3449  * Try to start an I/O operation. If StartBufferIO returns false, then
3450  * someone else flushed the buffer before we could, so we need not do
3451  * anything.
3452  */
3453  if (!StartBufferIO(buf, false))
3454  return;
3455 
3456  /* Setup error traceback support for ereport() */
3458  errcallback.arg = (void *) buf;
3459  errcallback.previous = error_context_stack;
3460  error_context_stack = &errcallback;
3461 
3462  /* Find smgr relation for buffer */
3463  if (reln == NULL)
3464  reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
3465 
3466  TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
3467  buf->tag.blockNum,
3468  reln->smgr_rlocator.locator.spcOid,
3469  reln->smgr_rlocator.locator.dbOid,
3470  reln->smgr_rlocator.locator.relNumber);
3471 
3472  buf_state = LockBufHdr(buf);
3473 
3474  /*
3475  * Run PageGetLSN while holding header lock, since we don't have the
3476  * buffer locked exclusively in all cases.
3477  */
3478  recptr = BufferGetLSN(buf);
3479 
3480  /* To check if block content changes while flushing. - vadim 01/17/97 */
3481  buf_state &= ~BM_JUST_DIRTIED;
3482  UnlockBufHdr(buf, buf_state);
3483 
3484  /*
3485  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
3486  * rule that log updates must hit disk before any of the data-file changes
3487  * they describe do.
3488  *
3489  * However, this rule does not apply to unlogged relations, which will be
3490  * lost after a crash anyway. Most unlogged relation pages do not bear
3491  * LSNs since we never emit WAL records for them, and therefore flushing
3492  * up through the buffer LSN would be useless, but harmless. However,
3493  * GiST indexes use LSNs internally to track page-splits, and therefore
3494  * unlogged GiST pages bear "fake" LSNs generated by
3495  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
3496  * LSN counter could advance past the WAL insertion point; and if it did
3497  * happen, attempting to flush WAL through that location would fail, with
3498  * disastrous system-wide consequences. To make sure that can't happen,
3499  * skip the flush if the buffer isn't permanent.
3500  */
3501  if (buf_state & BM_PERMANENT)
3502  XLogFlush(recptr);
3503 
3504  /*
3505  * Now it's safe to write buffer to disk. Note that no one else should
3506  * have been able to write it while we were busy with log flushing because
3507  * only one process at a time can set the BM_IO_IN_PROGRESS bit.
3508  */
3509  bufBlock = BufHdrGetBlock(buf);
3510 
3511  /*
3512  * Update page checksum if desired. Since we have only shared lock on the
3513  * buffer, other processes might be updating hint bits in it, so we must
3514  * copy the page to private storage if we do checksumming.
3515  */
3516  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
3517 
3518  io_start = pgstat_prepare_io_time(track_io_timing);
3519 
3520  /*
3521  * bufToWrite is either the shared buffer or a copy, as appropriate.
3522  */
3523  smgrwrite(reln,
3524  BufTagGetForkNum(&buf->tag),
3525  buf->tag.blockNum,
3526  bufToWrite,
3527  false);
3528 
3529  /*
3530  * When a strategy is in use, only flushes of dirty buffers already in the
3531  * strategy ring are counted as strategy writes (IOCONTEXT
3532  * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
3533  * statistics tracking.
3534  *
3535  * If a shared buffer initially added to the ring must be flushed before
3536  * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
3537  *
3538  * If a shared buffer which was added to the ring later because the
3539  * current strategy buffer is pinned or in use or because all strategy
3540  * buffers were dirty and rejected (for BAS_BULKREAD operations only)
3541  * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
3542  * (from_ring will be false).
3543  *
3544  * When a strategy is not in use, the write can only be a "regular" write
3545  * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
3546  */
3547  pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
3548  IOOP_WRITE, io_start, 1);
3549 
3550  pgBufferUsage.shared_blks_written++;
3551 
3552  /*
3553  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
3554  * end the BM_IO_IN_PROGRESS state.
3555  */
3556  TerminateBufferIO(buf, true, 0, true);
3557 
3558  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
3559  buf->tag.blockNum,
3560  reln->smgr_rlocator.locator.spcOid,
3561  reln->smgr_rlocator.locator.dbOid,
3562  reln->smgr_rlocator.locator.relNumber);
3563 
3564  /* Pop the error context stack */
3565  error_context_stack = errcallback.previous;
3566 }
#define BufferGetLSN(bufHdr)
Definition: bufmgr.c:65
static void shared_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:5324
char * PageSetChecksumCopy(Page page, BlockNumber blkno)
Definition: bufpage.c:1510
ErrorContextCallback * error_context_stack
Definition: elog.c:95
@ IOOP_WRITE
Definition: pgstat.h:304
static void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.h:121
struct ErrorContextCallback * previous
Definition: elog.h:295
void(* callback)(void *arg)
Definition: elog.h:296
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2733

References ErrorContextCallback::arg, BM_JUST_DIRTIED, BM_PERMANENT, buf, BufferGetLSN, BufHdrGetBlock, BufTagGetForkNum(), BufTagGetRelFileLocator(), ErrorContextCallback::callback, RelFileLocator::dbOid, error_context_stack, INVALID_PROC_NUMBER, IOOBJECT_RELATION, IOOP_WRITE, RelFileLocatorBackend::locator, LockBufHdr(), PageSetChecksumCopy(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), ErrorContextCallback::previous, RelFileLocator::relNumber, BufferUsage::shared_blks_written, shared_buffer_write_error_callback(), SMgrRelationData::smgr_rlocator, smgropen(), smgrwrite(), RelFileLocator::spcOid, StartBufferIO(), TerminateBufferIO(), track_io_timing, UnlockBufHdr(), and XLogFlush().

Referenced by FlushDatabaseBuffers(), FlushOneBuffer(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetVictimBuffer(), and SyncOneBuffer().

◆ FlushDatabaseBuffers()

void FlushDatabaseBuffers ( Oid  dbid)

Definition at line 4499 of file bufmgr.c.

4500 {
4501  int i;
4502  BufferDesc *bufHdr;
4503 
4504  for (i = 0; i < NBuffers; i++)
4505  {
4506  uint32 buf_state;
4507 
4508  bufHdr = GetBufferDescriptor(i);
4509 
4510  /*
4511  * As in DropRelationBuffers, an unlocked precheck should be safe and
4512  * saves some cycles.
4513  */
4514  if (bufHdr->tag.dbOid != dbid)
4515  continue;
4516 
4517  /* Make sure we can handle the pin */
4518  ReservePrivateRefCountEntry();
4519  ResourceOwnerEnlarge(CurrentResourceOwner);
4520 
4521  buf_state = LockBufHdr(bufHdr);
4522  if (bufHdr->tag.dbOid == dbid &&
4523  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4524  {
4525  PinBuffer_Locked(bufHdr);
4526  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4527  FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4528  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4529  UnpinBuffer(bufHdr);
4530  }
4531  else
4532  UnlockBufHdr(bufHdr, buf_state);
4533  }
4534 }
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition: bufmgr.c:3438
static void PinBuffer_Locked(BufferDesc *buf)
Definition: bufmgr.c:2417

References BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock(), CurrentResourceOwner, buftag::dbOid, FlushBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by dbase_redo().

◆ FlushOneBuffer()

void FlushOneBuffer ( Buffer  buffer)

Definition at line 4541 of file bufmgr.c.

4542 {
4543  BufferDesc *bufHdr;
4544 
4545  /* currently not needed, but no fundamental reason not to support */
4546  Assert(!BufferIsLocal(buffer));
4547 
4548  Assert(BufferIsPinned(buffer));
4549 
4550  bufHdr = GetBufferDescriptor(buffer - 1);
4551 
4552  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
4553 
4554  FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4555 }
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1900

References Assert(), PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, FlushBuffer(), GetBufferDescriptor(), IOCONTEXT_NORMAL, IOOBJECT_RELATION, and LWLockHeldByMe().

Referenced by hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), and XLogReadBufferForRedoExtended().

◆ FlushRelationBuffers()

void FlushRelationBuffers ( Relation  rel)

Definition at line 4146 of file bufmgr.c.

4147 {
4148  int i;
4149  BufferDesc *bufHdr;
4150  SMgrRelation srel = RelationGetSmgr(rel);
4151 
4152  if (RelationUsesLocalBuffers(rel))
4153  {
4154  for (i = 0; i < NLocBuffer; i++)
4155  {
4156  uint32 buf_state;
4157  instr_time io_start;
4158 
4159  bufHdr = GetLocalBufferDescriptor(i);
4160  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4161  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
4162  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4163  {
4164  ErrorContextCallback errcallback;
4165  Page localpage;
4166 
4167  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
4168 
4169  /* Setup error traceback support for ereport() */
4171  errcallback.arg = (void *) bufHdr;
4172  errcallback.previous = error_context_stack;
4173  error_context_stack = &errcallback;
4174 
4175  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
4176 
4177  io_start = pgstat_prepare_io_time(track_io_timing);
4178 
4179  smgrwrite(srel,
4180  BufTagGetForkNum(&bufHdr->tag),
4181  bufHdr->tag.blockNum,
4182  localpage,
4183  false);
4184 
4185  pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION,
4186  IOCONTEXT_NORMAL, IOOP_WRITE,
4187  io_start, 1);
4188 
4189  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
4190  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
4191 
4192  pgBufferUsage.local_blks_written++;
4193 
4194  /* Pop the error context stack */
4195  error_context_stack = errcallback.previous;
4196  }
4197  }
4198 
4199  return;
4200  }
4201 
4202  for (i = 0; i < NBuffers; i++)
4203  {
4204  uint32 buf_state;
4205 
4206  bufHdr = GetBufferDescriptor(i);
4207 
4208  /*
4209  * As in DropRelationBuffers, an unlocked precheck should be safe and
4210  * saves some cycles.
4211  */
4212  if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
4213  continue;
4214 
4215  /* Make sure we can handle the pin */
4216  ReservePrivateRefCountEntry();
4217  ResourceOwnerEnlarge(CurrentResourceOwner);
4218 
4219  buf_state = LockBufHdr(bufHdr);
4220  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4221  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4222  {
4223  PinBuffer_Locked(bufHdr);
4224  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4225  FlushBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4226  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4227  UnpinBuffer(bufHdr);
4228  }
4229  else
4230  UnlockBufHdr(bufHdr, buf_state);
4231  }
4232 }
static void pg_atomic_unlocked_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:290
#define LocalBufHdrGetBlock(bufHdr)
Definition: bufmgr.c:68
static void local_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:5344
void PageSetChecksumInplace(Page page, BlockNumber blkno)
Definition: bufpage.c:1542
int NLocBuffer
Definition: localbuf.c:43
@ IOOBJECT_TEMP_RELATION
Definition: pgstat.h:281
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:636
int64 local_blks_written
Definition: instrument.h:33
RelFileLocator rd_locator
Definition: rel.h:57

References ErrorContextCallback::arg, buftag::blockNum, BM_DIRTY, BM_JUST_DIRTIED, BM_VALID, BufferDescriptorGetContentLock(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), ErrorContextCallback::callback, CurrentResourceOwner, error_context_stack, FlushBuffer(), GetBufferDescriptor(), GetLocalBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_WRITE, BufferUsage::local_blks_written, local_buffer_write_error_callback(), LocalBufHdrGetBlock, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, NLocBuffer, PageSetChecksumInplace(), pg_atomic_read_u32(), pg_atomic_unlocked_write_u32(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), PinBuffer_Locked(), ErrorContextCallback::previous, RelationData::rd_locator, RelationGetSmgr(), RelationUsesLocalBuffers, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), smgrwrite(), BufferDesc::state, BufferDesc::tag, track_io_timing, UnlockBufHdr(), and UnpinBuffer().

Referenced by fill_seq_with_data(), heapam_relation_copy_data(), and index_copy_data().
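A hedged caller-side sketch (modeled on index_copy_data(); the destination smgr handle is hypothetical): flush the relation's dirty buffers so a block-level file copy sees current page contents.

    FlushRelationBuffers(rel);
    /* dst_srel is a hypothetical SMgrRelation for the new tablespace copy */
    RelationCopyStorage(RelationGetSmgr(rel), dst_srel,
                        MAIN_FORKNUM, rel->rd_rel->relpersistence);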

◆ FlushRelationsAllBuffers()

void FlushRelationsAllBuffers ( SMgrRelation *  smgrs,
int  nrels 
)

Definition at line 4244 of file bufmgr.c.

4245 {
4246  int i;
4247  SMgrSortArray *srels;
4248  bool use_bsearch;
4249 
4250  if (nrels == 0)
4251  return;
4252 
4253  /* fill-in array for qsort */
4254  srels = palloc(sizeof(SMgrSortArray) * nrels);
4255 
4256  for (i = 0; i < nrels; i++)
4257  {
4258  Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
4259 
4260  srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
4261  srels[i].srel = smgrs[i];
4262  }
4263 
4264  /*
4265  * Save the bsearch overhead for low number of relations to sync. See
4266  * DropRelationsAllBuffers for details.
4267  */
4268  use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
4269 
4270  /* sort the list of SMgrRelations if necessary */
4271  if (use_bsearch)
4272  qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
4273 
4274  for (i = 0; i < NBuffers; i++)
4275  {
4276  SMgrSortArray *srelent = NULL;
4277  BufferDesc *bufHdr = GetBufferDescriptor(i);
4278  uint32 buf_state;
4279 
4280  /*
4281  * As in DropRelationBuffers, an unlocked precheck should be safe and
4282  * saves some cycles.
4283  */
4284 
4285  if (!use_bsearch)
4286  {
4287  int j;
4288 
4289  for (j = 0; j < nrels; j++)
4290  {
4291  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
4292  {
4293  srelent = &srels[j];
4294  break;
4295  }
4296  }
4297  }
4298  else
4299  {
4300  RelFileLocator rlocator;
4301 
4302  rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4303  srelent = bsearch((const void *) &(rlocator),
4304  srels, nrels, sizeof(SMgrSortArray),
4305  rlocator_comparator);
4306  }
4307 
4308  /* buffer doesn't belong to any of the given relfilelocators; skip it */
4309  if (srelent == NULL)
4310  continue;
4311 
4312  /* Make sure we can handle the pin */
4313  ReservePrivateRefCountEntry();
4314  ResourceOwnerEnlarge(CurrentResourceOwner);
4315 
4316  buf_state = LockBufHdr(bufHdr);
4317  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
4318  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4319  {
4320  PinBuffer_Locked(bufHdr);
4321  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4322  FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4323  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4324  UnpinBuffer(bufHdr);
4325  }
4326  else
4327  UnlockBufHdr(bufHdr, buf_state);
4328  }
4329 
4330  pfree(srels);
4331 }
SMgrRelation srel
Definition: bufmgr.c:132
RelFileLocator rlocator
Definition: bufmgr.c:131

References Assert(), BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock(), BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), CurrentResourceOwner, FlushBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, j, RelFileLocatorBackend::locator, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, palloc(), pfree(), PinBuffer_Locked(), qsort, RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), SMgrSortArray::rlocator, rlocator_comparator(), SMgrRelationData::smgr_rlocator, SMgrSortArray::srel, BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by smgrdosyncall().
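A hedged sketch of the smgrdosyncall()-style calling pattern: write out every dirty buffer belonging to the listed relations, then force each existing fork to stable storage.

    FlushRelationsAllBuffers(srels, nrels);
    for (int i = 0; i < nrels; i++)
        for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
            if (smgrexists(srels[i], forknum))
                smgrimmedsync(srels[i], forknum);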

◆ ForgetPrivateRefCountEntry()

static void ForgetPrivateRefCountEntry ( PrivateRefCountEntry *  ref)
static

Definition at line 428 of file bufmgr.c.

429 {
430  Assert(ref->refcount == 0);
431 
432  if (ref >= &PrivateRefCountArray[0] &&
433  ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
434  {
435  ref->buffer = InvalidBuffer;
436 
437  /*
438  * Mark the just used entry as reserved - in many scenarios that
439  * allows us to avoid ever having to search the array/hash for free
440  * entries.
441  */
442  ReservedRefCountEntry = ref;
443  }
444  else
445  {
446  bool found;
447  Buffer buffer = ref->buffer;
448 
449  hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
450  Assert(found);
451  Assert(PrivateRefCountOverflowed > 0);
452  PrivateRefCountOverflowed--;
453  }
454 }
static PrivateRefCountEntry * ReservedRefCountEntry
Definition: bufmgr.c:201
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:953
@ HASH_REMOVE
Definition: hsearch.h:115

References Assert(), PrivateRefCountEntry::buffer, HASH_REMOVE, hash_search(), InvalidBuffer, PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, REFCOUNT_ARRAY_ENTRIES, and ReservedRefCountEntry.

Referenced by UnpinBufferNoOwner().

◆ GetPrivateRefCount()

static int32 GetPrivateRefCount ( Buffer  buffer)
inlinestatic

Definition at line 405 of file bufmgr.c.

406 {
407  PrivateRefCountEntry *ref;
408 
409  Assert(BufferIsValid(buffer));
410  Assert(!BufferIsLocal(buffer));
411 
412  /*
413  * Not moving the entry - that's ok for the current users, but we might
414  * want to change this one day.
415  */
416  ref = GetPrivateRefCountEntry(buffer, false);
417 
418  if (ref == NULL)
419  return 0;
420  return ref->refcount;
421 }
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition: bufmgr.c:331

References Assert(), PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), GetPrivateRefCountEntry(), and PrivateRefCountEntry::refcount.

Referenced by CheckBufferIsPinnedOnce(), ConditionalLockBufferForCleanup(), DebugPrintBufferRefcount(), HoldingBufferPinThatDelaysRecovery(), InvalidateBuffer(), InvalidateVictimBuffer(), IsBufferCleanupOK(), MarkBufferDirtyHint(), and ReadRecentBuffer().
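A short hedged sketch of the typical assertion-style use (compare CheckBufferIsPinnedOnce()): verify that this backend holds exactly one pin before an operation that requires it.

    if (GetPrivateRefCount(buffer) != 1)
        elog(ERROR, "incorrect local pin count: %d",
             GetPrivateRefCount(buffer));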

◆ GetPrivateRefCountEntry()

static PrivateRefCountEntry * GetPrivateRefCountEntry ( Buffer  buffer,
bool  do_move 
)
static

Definition at line 331 of file bufmgr.c.

332 {
333  PrivateRefCountEntry *res;
334  int i;
335 
336  Assert(BufferIsValid(buffer));
337  Assert(!BufferIsLocal(buffer));
338 
339  /*
340  * First search for references in the array, that'll be sufficient in the
341  * majority of cases.
342  */
343  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
344  {
345  res = &PrivateRefCountArray[i];
346 
347  if (res->buffer == buffer)
348  return res;
349  }
350 
351  /*
352  * By here we know that the buffer, if already pinned, isn't residing in
353  * the array.
354  *
355  * Only look up the buffer in the hashtable if we've previously overflowed
356  * into it.
357  */
358  if (PrivateRefCountOverflowed == 0)
359  return NULL;
360 
361  res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
362 
363  if (res == NULL)
364  return NULL;
365  else if (!do_move)
366  {
367  /* caller doesn't want us to move the hash entry into the array */
368  return res;
369  }
370  else
371  {
372  /* move buffer from hashtable into the free array slot */
373  bool found;
374  PrivateRefCountEntry *free;
375 
376  /* Ensure there's a free array slot */
377  ReservePrivateRefCountEntry();
378 
379  /* Use up the reserved slot */
380  Assert(ReservedRefCountEntry != NULL);
381  free = ReservedRefCountEntry;
382  ReservedRefCountEntry = NULL;
383  Assert(free->buffer == InvalidBuffer);
384 
385  /* and fill it */
386  free->buffer = buffer;
387  free->refcount = res->refcount;
388 
389  /* delete from hashtable */
390  hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
391  Assert(found);
392  Assert(PrivateRefCountOverflowed > 0);
393  PrivateRefCountOverflowed--;
394 
395  return free;
396  }
397 }
#define free(a)
Definition: header.h:65
@ HASH_FIND
Definition: hsearch.h:113

References Assert(), PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), free, HASH_FIND, HASH_REMOVE, hash_search(), i, InvalidBuffer, PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, res, ReservedRefCountEntry, and ReservePrivateRefCountEntry().

Referenced by GetPrivateRefCount(), IncrBufferRefCount(), PinBuffer(), PinBuffer_Locked(), and UnpinBufferNoOwner().

◆ GetVictimBuffer()

static Buffer GetVictimBuffer ( BufferAccessStrategy  strategy,
IOContext  io_context 
)
static

Definition at line 1608 of file bufmgr.c.

1609 {
1610  BufferDesc *buf_hdr;
1611  Buffer buf;
1612  uint32 buf_state;
1613  bool from_ring;
1614 
1615  /*
1616  * Ensure, while the spinlock's not yet held, that there's a free refcount
1617  * entry, and a resource owner slot for the pin.
1618  */
1619  ReservePrivateRefCountEntry();
1620  ResourceOwnerEnlarge(CurrentResourceOwner);
1621 
1622  /* we return here if a prospective victim buffer gets used concurrently */
1623 again:
1624 
1625  /*
1626  * Select a victim buffer. The buffer is returned with its header
1627  * spinlock still held!
1628  */
1629  buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
1630  buf = BufferDescriptorGetBuffer(buf_hdr);
1631 
1632  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1633 
1634  /* Pin the buffer and then release the buffer spinlock */
1635  PinBuffer_Locked(buf_hdr);
1636 
1637  /*
1638  * We shouldn't have any other pins for this buffer.
1639  */
1640  CheckBufferIsPinnedOnce(buf);
1641 
1642  /*
1643  * If the buffer was dirty, try to write it out. There is a race
1644  * condition here, in that someone might dirty it after we released the
1645  * buffer header lock above, or even while we are writing it out (since
1646  * our share-lock won't prevent hint-bit updates). We will recheck the
1647  * dirty bit after re-locking the buffer header.
1648  */
1649  if (buf_state & BM_DIRTY)
1650  {
1651  LWLock *content_lock;
1652 
1653  Assert(buf_state & BM_TAG_VALID);
1654  Assert(buf_state & BM_VALID);
1655 
1656  /*
1657  * We need a share-lock on the buffer contents to write it out (else
1658  * we might write invalid data, eg because someone else is compacting
1659  * the page contents while we write). We must use a conditional lock
1660  * acquisition here to avoid deadlock. Even though the buffer was not
1661  * pinned (and therefore surely not locked) when StrategyGetBuffer
1662  * returned it, someone else could have pinned and exclusive-locked it
1663  * by the time we get here. If we try to get the lock unconditionally,
1664  * we'd block waiting for them; if they later block waiting for us,
1665  * deadlock ensues. (This has been observed to happen when two
1666  * backends are both trying to split btree index pages, and the second
1667  * one just happens to be trying to split the page the first one got
1668  * from StrategyGetBuffer.)
1669  */
1670  content_lock = BufferDescriptorGetContentLock(buf_hdr);
1671  if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
1672  {
1673  /*
1674  * Someone else has locked the buffer, so give it up and loop back
1675  * to get another one.
1676  */
1677  UnpinBuffer(buf_hdr);
1678  goto again;
1679  }
1680 
1681  /*
1682  * If using a nondefault strategy, and writing the buffer would
1683  * require a WAL flush, let the strategy decide whether to go ahead
1684  * and write/reuse the buffer or to choose another victim. We need a
1685  * lock to inspect the page LSN, so this can't be done inside
1686  * StrategyGetBuffer.
1687  */
1688  if (strategy != NULL)
1689  {
1690  XLogRecPtr lsn;
1691 
1692  /* Read the LSN while holding buffer header lock */
1693  buf_state = LockBufHdr(buf_hdr);
1694  lsn = BufferGetLSN(buf_hdr);
1695  UnlockBufHdr(buf_hdr, buf_state);
1696 
1697  if (XLogNeedsFlush(lsn)
1698  && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
1699  {
1700  LWLockRelease(content_lock);
1701  UnpinBuffer(buf_hdr);
1702  goto again;
1703  }
1704  }
1705 
1706  /* OK, do the I/O */
1707  FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
1708  LWLockRelease(content_lock);
1709 
1710  ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
1711  &buf_hdr->tag);
1712  }
1713 
1714 
1715  if (buf_state & BM_VALID)
1716  {
1717  /*
1718  * When a BufferAccessStrategy is in use, blocks evicted from shared
1719  * buffers are counted as IOOP_EVICT in the corresponding context
1720  * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
1721  * strategy in two cases: 1) while initially claiming buffers for the
1722  * strategy ring 2) to replace an existing strategy ring buffer
1723  * because it is pinned or in use and cannot be reused.
1724  *
1725  * Blocks evicted from buffers already in the strategy ring are
1726  * counted as IOOP_REUSE in the corresponding strategy context.
1727  *
1728  * At this point, we can accurately count evictions and reuses,
1729  * because we have successfully claimed the valid buffer. Previously,
1730  * we may have been forced to release the buffer due to concurrent
1731  * pinners or erroring out.
1732  */
1733  pgstat_count_io_op(IOOBJECT_RELATION, io_context,
1734  from_ring ? IOOP_REUSE : IOOP_EVICT);
1735  }
1736 
1737  /*
1738  * If the buffer has an entry in the buffer mapping table, delete it. This
1739  * can fail because another backend could have pinned or dirtied the
1740  * buffer.
1741  */
1742  if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
1743  {
1744  UnpinBuffer(buf_hdr);
1745  goto again;
1746  }
1747 
1748  /* a final set of sanity checks */
1749 #ifdef USE_ASSERT_CHECKING
1750  buf_state = pg_atomic_read_u32(&buf_hdr->state);
1751 
1752  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
1753  Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
1754 
1755  CheckBufferIsPinnedOnce(buf);
1756 
1757 
1758  return buf;
1759 }
WritebackContext BackendWritebackContext
Definition: buf_init.c:24
void CheckBufferIsPinnedOnce(Buffer buffer)
Definition: bufmgr.c:4843
static bool InvalidateVictimBuffer(BufferDesc *buf_hdr)
Definition: bufmgr.c:1540
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition: bufmgr.c:5545
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
Definition: freelist.c:196
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition: freelist.c:756
@ IOOP_EVICT
Definition: pgstat.h:298
@ IOOP_REUSE
Definition: pgstat.h:303
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op)
Definition: pgstat_io.c:77
bool XLogNeedsFlush(XLogRecPtr record)
Definition: xlog.c:3066

References Assert(), BackendWritebackContext, BM_DIRTY, BM_TAG_VALID, BM_VALID, buf, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetBuffer(), BufferDescriptorGetContentLock(), BufferGetLSN, CheckBufferIsPinnedOnce(), CurrentResourceOwner, FlushBuffer(), InvalidateVictimBuffer(), IOOBJECT_RELATION, IOOP_EVICT, IOOP_REUSE, LockBufHdr(), LW_SHARED, LWLockConditionalAcquire(), LWLockRelease(), pg_atomic_read_u32(), pgstat_count_io_op(), PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), ScheduleBufferTagForWriteback(), BufferDesc::state, StrategyGetBuffer(), StrategyRejectBuffer(), BufferDesc::tag, UnlockBufHdr(), UnpinBuffer(), and XLogNeedsFlush().

Referenced by BufferAlloc(), and ExtendBufferedRelShared().

◆ HoldingBufferPinThatDelaysRecovery()

bool HoldingBufferPinThatDelaysRecovery ( void  )

Definition at line 5011 of file bufmgr.c.

5012 {
5013  int bufid = GetStartupBufferPinWaitBufId();
5014 
5015  /*
5016  * If we get woken slowly then it's possible that the Startup process was
5017  * already woken by other backends before we got here. Also possible that
5018  * we get here by multiple interrupts or interrupts at inappropriate
5019  * times, so make sure we do nothing if the bufid is not set.
5020  */
5021  if (bufid < 0)
5022  return false;
5023 
5024  if (GetPrivateRefCount(bufid + 1) > 0)
5025  return true;
5026 
5027  return false;
5028 }
int GetStartupBufferPinWaitBufId(void)
Definition: proc.c:673

References GetPrivateRefCount(), and GetStartupBufferPinWaitBufId().

Referenced by CheckRecoveryConflictDeadlock(), and ProcessRecoveryConflictInterrupt().

◆ IncrBufferRefCount()

void IncrBufferRefCount ( Buffer  buffer)

Definition at line 4593 of file bufmgr.c.

4594 {
4595  Assert(BufferIsPinned(buffer));
4596  ResourceOwnerEnlarge(CurrentResourceOwner);
4597  if (BufferIsLocal(buffer))
4598  LocalRefCount[-buffer - 1]++;
4599  else
4600  {
4601  PrivateRefCountEntry *ref;
4602 
4603  ref = GetPrivateRefCountEntry(buffer, true);
4604  Assert(ref != NULL);
4605  ref->refcount++;
4606  }
4607  ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
4608 }
void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)

References Assert(), PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, CurrentResourceOwner, GetPrivateRefCountEntry(), LocalRefCount, PrivateRefCountEntry::refcount, ResourceOwnerEnlarge(), and ResourceOwnerRememberBuffer().

Referenced by _bt_steppage(), btrestrpos(), entryLoadMoreItems(), ReadBufferBI(), RelationAddBlocks(), scanPostingTree(), startScanEntry(), and tts_buffer_heap_store_tuple().
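A hedged sketch of the extra-pin pattern (modeled on tts_buffer_heap_store_tuple(); bslot is a hypothetical BufferHeapTupleTableSlot): the slot keeps its own pin so the page stays valid after the caller drops theirs.

    bslot->buffer = buffer;
    IncrBufferRefCount(buffer);     /* paired with a later ReleaseBuffer() */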

◆ InitBufferPoolAccess()

void InitBufferPoolAccess ( void  )

Definition at line 3230 of file bufmgr.c.

3231 {
3232  HASHCTL hash_ctl;
3233 
3234  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
3235 
3236  hash_ctl.keysize = sizeof(int32);
3237  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
3238 
3239  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
3240  HASH_ELEM | HASH_BLOBS);
3241 
3242  /*
3243  * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
3244  * the corresponding phase of backend shutdown.
3245  */
3246  Assert(MyProc != NULL);
3247  on_shmem_exit(AtProcExit_Buffers, 0);
3248 }
static void AtProcExit_Buffers(int code, Datum arg)
Definition: bufmgr.c:3255
struct PrivateRefCountEntry PrivateRefCountEntry
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:350
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:365
PGPROC * MyProc
Definition: proc.c:68
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76

References Assert(), AtProcExit_Buffers(), HASHCTL::entrysize, HASH_BLOBS, hash_create(), HASH_ELEM, HASHCTL::keysize, MyProc, on_shmem_exit(), PrivateRefCountArray, and PrivateRefCountHash.

Referenced by BaseInit().

◆ InvalidateBuffer()

static void InvalidateBuffer ( BufferDesc *  buf)
static

Definition at line 1442 of file bufmgr.c.

1443 {
1444  BufferTag oldTag;
1445  uint32 oldHash; /* hash value for oldTag */
1446  LWLock *oldPartitionLock; /* buffer partition lock for it */
1447  uint32 oldFlags;
1448  uint32 buf_state;
1449 
1450  /* Save the original buffer tag before dropping the spinlock */
1451  oldTag = buf->tag;
1452 
1453  buf_state = pg_atomic_read_u32(&buf->state);
1454  Assert(buf_state & BM_LOCKED);
1455  UnlockBufHdr(buf, buf_state);
1456 
1457  /*
1458  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1459  * worth storing the hashcode in BufferDesc so we need not recompute it
1460  * here? Probably not.
1461  */
1462  oldHash = BufTableHashCode(&oldTag);
1463  oldPartitionLock = BufMappingPartitionLock(oldHash);
1464 
1465 retry:
1466 
1467  /*
1468  * Acquire exclusive mapping lock in preparation for changing the buffer's
1469  * association.
1470  */
1471  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1472 
1473  /* Re-lock the buffer header */
1474  buf_state = LockBufHdr(buf);
1475 
1476  /* If it's changed while we were waiting for lock, do nothing */
1477  if (!BufferTagsEqual(&buf->tag, &oldTag))
1478  {
1479  UnlockBufHdr(buf, buf_state);
1480  LWLockRelease(oldPartitionLock);
1481  return;
1482  }
1483 
1484  /*
1485  * We assume the only reason for it to be pinned is that someone else is
1486  * flushing the page out. Wait for them to finish. (This could be an
1487  * infinite loop if the refcount is messed up... it would be nice to time
1488  * out after awhile, but there seems no way to be sure how many loops may
1489  * be needed. Note that if the other guy has pinned the buffer but not
1490  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1491  * be busy-looping here.)
1492  */
1493  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1494  {
1495  UnlockBufHdr(buf, buf_state);
1496  LWLockRelease(oldPartitionLock);
1497  /* safety check: should definitely not be our *own* pin */
1498  if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
1499  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1500  WaitIO(buf);
1501  goto retry;
1502  }
1503 
1504  /*
1505  * Clear out the buffer's tag and flags. We must do this to ensure that
1506  * linear scans of the buffer array don't think the buffer is valid.
1507  */
1508  oldFlags = buf_state & BUF_FLAG_MASK;
1509  ClearBufferTag(&buf->tag);
1510  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1511  UnlockBufHdr(buf, buf_state);
1512 
1513  /*
1514  * Remove the buffer from the lookup hashtable, if it was in there.
1515  */
1516  if (oldFlags & BM_TAG_VALID)
1517  BufTableDelete(&oldTag, oldHash);
1518 
1519  /*
1520  * Done with mapping lock.
1521  */
1522  LWLockRelease(oldPartitionLock);
1523 
1524  /*
1525  * Insert the buffer at the head of the list of free buffers.
1526  */
1527  StrategyFreeBuffer(buf);
1528 }
#define BUF_USAGECOUNT_MASK
Definition: buf_internals.h:45
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
#define BM_LOCKED
Definition: buf_internals.h:60
static void ClearBufferTag(BufferTag *tag)
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:149
static void WaitIO(BufferDesc *buf)
Definition: bufmgr.c:5147

References Assert(), BM_LOCKED, BM_TAG_VALID, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer(), BufferTagsEqual(), BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), ClearBufferTag(), elog, ERROR, GetPrivateRefCount(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u32(), StrategyFreeBuffer(), UnlockBufHdr(), and WaitIO().

Referenced by DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), and FindAndDropRelationBuffers().

◆ InvalidateVictimBuffer()

static bool InvalidateVictimBuffer ( BufferDesc *  buf_hdr)
static

Definition at line 1540 of file bufmgr.c.

1541 {
1542  uint32 buf_state;
1543  uint32 hash;
1544  LWLock *partition_lock;
1545  BufferTag tag;
1546 
1547  Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) > 0);
1548 
1549  /* have buffer pinned, so it's safe to read tag without lock */
1550  tag = buf_hdr->tag;
1551 
1552  hash = BufTableHashCode(&tag);
1553  partition_lock = BufMappingPartitionLock(hash);
1554 
1555  LWLockAcquire(partition_lock, LW_EXCLUSIVE);
1556 
1557  /* lock the buffer header */
1558  buf_state = LockBufHdr(buf_hdr);
1559 
1560  /*
1561  * We have the buffer pinned nobody else should have been able to unset
1562  * this concurrently.
1563  */
1564  Assert(buf_state & BM_TAG_VALID);
1565  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1566  Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
1567 
1568  /*
1569  * If somebody else pinned the buffer since, or even worse, dirtied it,
1570  * give up on this buffer: It's clearly in use.
1571  */
1572  if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
1573  {
1574  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1575 
1576  UnlockBufHdr(buf_hdr, buf_state);
1577  LWLockRelease(partition_lock);
1578 
1579  return false;
1580  }
1581 
1582  /*
1583  * Clear out the buffer's tag and flags and usagecount. This is not
1584  * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
1585  * doing anything with the buffer. But currently it's beneficial, as the
1586  * cheaper pre-check for several linear scans of shared buffers use the
1587  * tag (see e.g. FlushDatabaseBuffers()).
1588  */
1589  ClearBufferTag(&buf_hdr->tag);
1590  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1591  UnlockBufHdr(buf_hdr, buf_state);
1592 
1593  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1594 
1595  /* finally delete buffer from the buffer mapping table */
1596  BufTableDelete(&tag, hash);
1597 
1598  LWLockRelease(partition_lock);
1599 
1600  Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
1601  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1602  Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u32(&buf_hdr->state)) > 0);
1603 
1604  return true;
1605 }

References Assert(), BM_DIRTY, BM_TAG_VALID, BM_VALID, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer(), BufferTagsEqual(), BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), ClearBufferTag(), GetPrivateRefCount(), hash(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u32(), BufferDesc::state, BufferDesc::tag, and UnlockBufHdr().

Referenced by GetVictimBuffer().

◆ IsBufferCleanupOK()

bool IsBufferCleanupOK ( Buffer  buffer)

Definition at line 5093 of file bufmgr.c.

5094 {
5095  BufferDesc *bufHdr;
5096  uint32 buf_state;
5097 
5098  Assert(BufferIsValid(buffer));
5099 
5100  if (BufferIsLocal(buffer))
5101  {
5102  /* There should be exactly one pin */
5103  if (LocalRefCount[-buffer - 1] != 1)
5104  return false;
5105  /* Nobody else to wait for */
5106  return true;
5107  }
5108 
5109  /* There should be exactly one local pin */
5110  if (GetPrivateRefCount(buffer) != 1)
5111  return false;
5112 
5113  bufHdr = GetBufferDescriptor(buffer - 1);
5114 
5115  /* caller must hold exclusive lock on buffer */
5116  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
5117  LW_EXCLUSIVE));
5118 
5119  buf_state = LockBufHdr(bufHdr);
5120 
5121  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5122  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5123  {
5124  /* pincount is OK. */
5125  UnlockBufHdr(bufHdr, buf_state);
5126  return true;
5127  }
5128 
5129  UnlockBufHdr(bufHdr, buf_state);
5130  return false;
5131 }

References Assert(), BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsValid(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBufHdr(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), and UnlockBufHdr().

Referenced by _hash_doinsert(), _hash_expandtable(), _hash_splitbucket(), and hashbucketcleanup().
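A hedged sketch of the hash-index style check: with the exclusive content lock already held, attempt opportunistic single-page cleanup only when a cleanup lock is effectively available (exactly one pin).

    if (IsBufferCleanupOK(buf))
    {
        /* ... prune dead tuples from this page ... */
    }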

◆ IssuePendingWritebacks()

void IssuePendingWritebacks ( WritebackContext *  wb_context,
IOContext  io_context 
)

Definition at line 5590 of file bufmgr.c.

5591 {
5592  instr_time io_start;
5593  int i;
5594 
5595  if (wb_context->nr_pending == 0)
5596  return;
5597 
5598  /*
5599  * Executing the writes in-order can make them a lot faster, and allows to
5600  * merge writeback requests to consecutive blocks into larger writebacks.
5601  */
5602  sort_pending_writebacks(wb_context->pending_writebacks,
5603  wb_context->nr_pending);
5604 
5605  io_start = pgstat_prepare_io_time(track_io_timing);
5606 
5607  /*
5608  * Coalesce neighbouring writes, but nothing else. For that we iterate
5609  * through the, now sorted, array of pending flushes, and look forward to
5610  * find all neighbouring (or identical) writes.
5611  */
5612  for (i = 0; i < wb_context->nr_pending; i++)
5613  {
5614  PendingWriteback *cur;
5615  PendingWriteback *next;
5616  SMgrRelation reln;
5617  int ahead;
5618  BufferTag tag;
5619  RelFileLocator currlocator;
5620  Size nblocks = 1;
5621 
5622  cur = &wb_context->pending_writebacks[i];
5623  tag = cur->tag;
5624  currlocator = BufTagGetRelFileLocator(&tag);
5625 
5626  /*
5627  * Peek ahead, into following writeback requests, to see if they can
5628  * be combined with the current one.
5629  */
5630  for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
5631  {
5632 
5633  next = &wb_context->pending_writebacks[i + ahead + 1];
5634 
5635  /* different file, stop */
5636  if (!RelFileLocatorEquals(currlocator,
5637  BufTagGetRelFileLocator(&next->tag)) ||
5638  BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
5639  break;
5640 
5641  /* ok, block queued twice, skip */
5642  if (cur->tag.blockNum == next->tag.blockNum)
5643  continue;
5644 
5645  /* only merge consecutive writes */
5646  if (cur->tag.blockNum + 1 != next->tag.blockNum)
5647  break;
5648 
5649  nblocks++;
5650  cur = next;
5651  }
5652 
5653  i += ahead;
5654 
5655  /* and finally tell the kernel to write the data to storage */
5656  reln = smgropen(currlocator, INVALID_PROC_NUMBER);
5657  smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
5658  }
5659 
5660  /*
5661  * Assume that writeback requests are only issued for buffers containing
5662  * blocks of permanent relations.
5663  */
5664  pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
5665  IOOP_WRITEBACK, io_start, wb_context->nr_pending);
5666 
5667  wb_context->nr_pending = 0;
5668 }
static int32 next
Definition: blutils.c:221
struct cursor * cur
Definition: ecpg.c:28
@ IOOP_WRITEBACK
Definition: pgstat.h:305
#define RelFileLocatorEquals(locator1, locator2)
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition: smgr.c:646
PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]

References buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), cur, i, INVALID_PROC_NUMBER, IOOBJECT_RELATION, IOOP_WRITEBACK, next, WritebackContext::nr_pending, WritebackContext::pending_writebacks, pgstat_count_io_op_time(), pgstat_prepare_io_time(), RelFileLocatorEquals, smgropen(), smgrwriteback(), and track_io_timing.

Referenced by BufferSync(), and ScheduleBufferTagForWriteback().
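A hedged sketch of the writeback pipeline around this function: callers queue buffer tags with ScheduleBufferTagForWriteback(), and once enough have accumulated the sorted, coalesced smgrwriteback() requests are issued in one pass.

    ScheduleBufferTagForWriteback(&wb_context, IOCONTEXT_NORMAL, &bufHdr->tag);
    /* ... more tags queued as buffers are written ... */
    IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);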

◆ LimitAdditionalPins()

static void LimitAdditionalPins ( uint32 *  additional_pins)
static

Definition at line 1774 of file bufmgr.c.

1775 {
1776  uint32 max_backends;
1777  int max_proportional_pins;
1778 
1779  if (*additional_pins <= 1)
1780  return;
1781 
1782  max_backends = MaxBackends + NUM_AUXILIARY_PROCS;
1783  max_proportional_pins = NBuffers / max_backends;
1784 
1785  /*
1786  * Subtract the approximate number of buffers already pinned by this
1787  * backend. We get the number of "overflowed" pins for free, but don't
1788  * know the number of pins in PrivateRefCountArray. The cost of
1789  * calculating that exactly doesn't seem worth it, so just assume the max.
1790  */
1791  max_proportional_pins -= PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
1792 
1793  if (max_proportional_pins <= 0)
1794  max_proportional_pins = 1;
1795 
1796  if (*additional_pins > max_proportional_pins)
1797  *additional_pins = max_proportional_pins;
1798 }
int MaxBackends
Definition: globals.c:143
#define NUM_AUXILIARY_PROCS
Definition: proc.h:440

References MaxBackends, NBuffers, NUM_AUXILIARY_PROCS, PrivateRefCountOverflowed, and REFCOUNT_ARRAY_ENTRIES.

Referenced by ExtendBufferedRelShared().

◆ local_buffer_write_error_callback()

static void local_buffer_write_error_callback ( void *  arg)
static

Definition at line 5344 of file bufmgr.c.

5345 {
5346  BufferDesc *bufHdr = (BufferDesc *) arg;
5347 
5348  if (bufHdr != NULL)
5349  {
5350  char *path = relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
5351  MyProcNumber,
5352  BufTagGetForkNum(&bufHdr->tag));
5353 
5354  errcontext("writing block %u of relation %s",
5355  bufHdr->tag.blockNum, path);
5356  pfree(path);
5357  }
5358 }
#define errcontext
Definition: elog.h:196
void * arg

References arg, buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), errcontext, MyProcNumber, pfree(), relpathbackend, and BufferDesc::tag.

Referenced by FlushRelationBuffers().

◆ LockBuffer()

void LockBuffer ( Buffer  buffer,
int  mode 
)

Definition at line 4796 of file bufmgr.c.

4797 {
4798  BufferDesc *buf;
4799 
4800  Assert(BufferIsPinned(buffer));
4801  if (BufferIsLocal(buffer))
4802  return; /* local buffers need no lock */
4803 
4804  buf = GetBufferDescriptor(buffer - 1);
4805 
4806  if (mode == BUFFER_LOCK_UNLOCK)
4807  LWLockRelease(BufferDescriptorGetContentLock(buf));
4808  else if (mode == BUFFER_LOCK_SHARE)
4809  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
4810  else if (mode == BUFFER_LOCK_EXCLUSIVE)
4811  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
4812  else
4813  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
4814 }
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:158
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:159

References Assert(), buf, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, elog, ERROR, GetBufferDescriptor(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), and mode.

Referenced by _bt_lockbuf(), _bt_unlockbuf(), _bt_upgradelockbufcleanup(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_finish_split(), _hash_first(), _hash_freeovflpage(), _hash_getbuf(), _hash_getbuf_with_strategy(), _hash_getcachedmetap(), _hash_init(), _hash_kill_items(), _hash_readnext(), _hash_readpage(), _hash_readprev(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), blbulkdelete(), blgetbitmap(), blinsert(), BloomInitMetapage(), BloomNewBuffer(), blvacuumcleanup(), brin_doinsert(), brin_doupdate(), brin_evacuate_page(), brin_getinsertbuffer(), brin_page_cleanup(), bringetbitmap(), brinGetStats(), brinGetTupleForHeapBlock(), brininsert(), brinLockRevmapPageForUpdate(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), brinsummarize(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), bt_recheck_sibling_links(), collect_corrupt_items(), collect_visibility_data(), collectMatchBitmap(), ConditionalLockBufferForCleanup(), count_nondeletable_pages(), entryLoadMoreItems(), FreeSpaceMapPrepareTruncateRel(), fsm_readbuf(), fsm_search(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), get_raw_page_internal(), GetVisibilityMapPins(), ginbulkdelete(), ginEntryInsert(), ginFindLeafPage(), ginFindParents(), ginFinishOldSplit(), ginFinishSplit(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), ginInsertValue(), GinNewBuffer(), ginScanToDelete(), ginStepRight(), ginTraverseLock(), ginUpdateStats(), ginvacuumcleanup(), ginVacuumPostingTreeLeaves(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfinishsplit(), gistfixsplit(), gistformdownlink(), gistGetMaxLevel(), gistinserttuples(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_inplace_update(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_page_prune_opt(), heap_update(), heap_xlog_visible(), heapam_index_build_range_scan(), heapam_index_fetch_tuple(), heapam_index_validate_scan(), heapam_relation_copy_for_cluster(), heapam_scan_analyze_next_block(), heapam_scan_bitmap_next_block(), heapam_scan_sample_next_tuple(), heapam_tuple_satisfies_snapshot(), heapgetpage(), heapgettup(), initBloomState(), lazy_scan_heap(), lazy_scan_new_or_empty(), lazy_vacuum_heap_rel(), LockBufferForCleanup(), log_newpage_range(), palloc_btree_page(), pg_visibility(), pgrowlocks(), pgstat_btree_page(), pgstat_gist_page(), pgstat_heap(), pgstatginindex_internal(), pgstathashindex(), pgstatindex_impl(), read_seq_tuple(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), ScanSourceDatabasePgClass(), shiftList(), spgdoinsert(), spgGetCache(), SpGistNewBuffer(), spgprocesspending(), spgvacuumpage(), spgWalk(), startScanEntry(), statapprox_heap(), summarize_range(), UnlockReleaseBuffer(), verify_heapam(), verifyBackupPageConsistency(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), vm_readbuf(), XLogReadBufferForRedoExtended(), and XLogRecordPageWithFreeSpace().
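A minimal sketch of the canonical pin/lock/modify cycle that most of the callers above follow; the page pointer comes from BufferGetPage().

    Buffer      buf = ReadBuffer(rel, blkno);

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    /* ... read or modify the page via BufferGetPage(buf) ... */
    MarkBufferDirty(buf);
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    ReleaseBuffer(buf);             /* or UnlockReleaseBuffer() for both steps */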

◆ LockBufferForCleanup()

void LockBufferForCleanup ( Buffer  buffer)

Definition at line 4876 of file bufmgr.c.

4877 {
4878  BufferDesc *bufHdr;
4879  TimestampTz waitStart = 0;
4880  bool waiting = false;
4881  bool logged_recovery_conflict = false;
4882 
4883  Assert(BufferIsPinned(buffer));
4884  Assert(PinCountWaitBuf == NULL);
4885 
4886  CheckBufferIsPinnedOnce(buffer);
4887 
4888  /* Nobody else to wait for */
4889  if (BufferIsLocal(buffer))
4890  return;
4891 
4892  bufHdr = GetBufferDescriptor(buffer - 1);
4893 
4894  for (;;)
4895  {
4896  uint32 buf_state;
4897 
4898  /* Try to acquire lock */
4899  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4900  buf_state = LockBufHdr(bufHdr);
4901 
4902  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4903  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
4904  {
4905  /* Successfully acquired exclusive lock with pincount 1 */
4906  UnlockBufHdr(bufHdr, buf_state);
4907 
4908  /*
4909  * Emit the log message if recovery conflict on buffer pin was
4910  * resolved but the startup process waited longer than
4911  * deadlock_timeout for it.
4912  */
4913  if (logged_recovery_conflict)
4914  LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
4915  waitStart, GetCurrentTimestamp(),
4916  NULL, false);
4917 
4918  if (waiting)
4919  {
4920  /* reset ps display to remove the suffix if we added one */
4921  set_ps_display_remove_suffix();
4922  waiting = false;
4923  }
4924  return;
4925  }
4926  /* Failed, so mark myself as waiting for pincount 1 */
4927  if (buf_state & BM_PIN_COUNT_WAITER)
4928  {
4929  UnlockBufHdr(bufHdr, buf_state);
4930  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4931  elog(ERROR, "multiple backends attempting to wait for pincount 1");
4932  }
4933  bufHdr->wait_backend_pgprocno = MyProcNumber;
4934  PinCountWaitBuf = bufHdr;
4935  buf_state |= BM_PIN_COUNT_WAITER;
4936  UnlockBufHdr(bufHdr, buf_state);
4937  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4938 
4939  /* Wait to be signaled by UnpinBuffer() */
4940  if (InHotStandby)
4941  {
4942  if (!waiting)
4943  {
4944  /* adjust the process title to indicate that it's waiting */
4945  set_ps_display_suffix("waiting");
4946  waiting = true;
4947  }
4948 
4949  /*
4950  * Emit the log message if the startup process is waiting longer
4951  * than deadlock_timeout for recovery conflict on buffer pin.
4952  *
4953  * Skip this if first time through because the startup process has
4954  * not started waiting yet in this case. So, the wait start
4955  * timestamp is set after this logic.
4956  */
4957  if (waitStart != 0 && !logged_recovery_conflict)
4958  {
4959  TimestampTz now = GetCurrentTimestamp();
4960 
4961  if (TimestampDifferenceExceeds(waitStart, now,
4962  DeadlockTimeout))
4963  {
4964  LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
4965  waitStart, now, NULL, true);
4966  logged_recovery_conflict = true;
4967  }
4968  }
4969 
4970  /*
4971  * Set the wait start timestamp if logging is enabled and first
4972  * time through.
4973  */
4974  if (log_recovery_conflict_waits && waitStart == 0)
4975  waitStart = GetCurrentTimestamp();
4976 
4977  /* Publish the bufid that Startup process waits on */
4978  SetStartupBufferPinWaitBufId(buffer - 1);
4979  /* Set alarm and then wait to be signaled by UnpinBuffer() */
4980  ResolveRecoveryConflictWithBufferPin();
4981  /* Reset the published bufid */
4982  SetStartupBufferPinWaitBufId(-1);
4983  }
4984  else
4985  ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
4986 
4987  /*
4988  * Remove flag marking us as waiter. Normally this will not be set
4989  * anymore, but ProcWaitForSignal() can return for other signals as
4990  * well. We take care to only reset the flag if we're the waiter, as
4991  * theoretically another backend could have started waiting. That's
4992  * impossible with the current usages due to table level locking, but
4993  * better be safe.
4994  */
4995  buf_state = LockBufHdr(bufHdr);
4996  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
4997  bufHdr->wait_backend_pgprocno == MyProcNumber)
4998  buf_state &= ~BM_PIN_COUNT_WAITER;
4999  UnlockBufHdr(bufHdr, buf_state);
5000 
5001  PinCountWaitBuf = NULL;
5002  /* Loop back and try again */
5003  }
5004 }
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1791
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1655
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1619
#define BM_PIN_COUNT_WAITER
Definition: buf_internals.h:67
static BufferDesc * PinCountWaitBuf
Definition: bufmgr.c:165
int64 TimestampTz
Definition: timestamp.h:39
static volatile sig_atomic_t waiting
Definition: latch.c:163
@ PROCSIG_RECOVERY_CONFLICT_BUFFERPIN
Definition: procsignal.h:47
void set_ps_display_remove_suffix(void)
Definition: ps_status.c:396
void set_ps_display_suffix(const char *suffix)
Definition: ps_status.c:344
int DeadlockTimeout
Definition: proc.c:59
void SetStartupBufferPinWaitBufId(int bufid)
Definition: proc.c:661
void ProcWaitForSignal(uint32 wait_event_info)
Definition: proc.c:1852
void ResolveRecoveryConflictWithBufferPin(void)
Definition: standby.c:794
bool log_recovery_conflict_waits
Definition: standby.c:43
void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition: standby.c:275
int wait_backend_pgprocno
#define InHotStandby
Definition: xlogutils.h:57

References Assert(), BM_PIN_COUNT_WAITER, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsPinned, CheckBufferIsPinnedOnce(), DeadlockTimeout, elog, ERROR, GetBufferDescriptor(), GetCurrentTimestamp(), InHotStandby, LockBuffer(), LockBufHdr(), log_recovery_conflict_waits, LogRecoveryConflict(), MyProcNumber, now(), PinCountWaitBuf, PROCSIG_RECOVERY_CONFLICT_BUFFERPIN, ProcWaitForSignal(), ResolveRecoveryConflictWithBufferPin(), set_ps_display_remove_suffix(), set_ps_display_suffix(), SetStartupBufferPinWaitBufId(), TimestampDifferenceExceeds(), UnlockBufHdr(), BufferDesc::wait_backend_pgprocno, and waiting.

Referenced by _bt_upgradelockbufcleanup(), ginVacuumPostingTree(), hashbulkdelete(), heap_force_common(), lazy_scan_heap(), ReadBuffer_common(), and XLogReadBufferForRedoExtended().
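A hedged VACUUM-style sketch: the buffer must already be pinned (here via ReadBufferExtended() with a hypothetical strategy), and on return this backend holds the exclusive lock along with the only pin.

    Buffer      buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                         RBM_NORMAL, bstrategy);

    LockBufferForCleanup(buf);
    /* ... safe to defragment the page: no other backend holds a pin ... */
    UnlockReleaseBuffer(buf);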

◆ LockBufHdr()

uint32 LockBufHdr ( BufferDesc *  desc)

Definition at line 5391 of file bufmgr.c.

5392 {
5393  SpinDelayStatus delayStatus;
5394  uint32 old_buf_state;
5395 
5396  Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
5397 
5398  init_local_spin_delay(&delayStatus);
5399 
5400  while (true)
5401  {
5402  /* set BM_LOCKED flag */
5403  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
5404  /* if it wasn't set before we're OK */
5405  if (!(old_buf_state & BM_LOCKED))
5406  break;
5407  perform_spin_delay(&delayStatus);
5408  }
5409  finish_spin_delay(&delayStatus);
5410  return old_buf_state | BM_LOCKED;
5411 }
static uint32 pg_atomic_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_)
Definition: atomics.h:405
void perform_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:132
void finish_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:192
#define init_local_spin_delay(status)
Definition: s_lock.h:843

References Assert(), BM_LOCKED, BufferDescriptorGetBuffer(), BufferIsLocal, finish_spin_delay(), init_local_spin_delay, perform_spin_delay(), pg_atomic_fetch_or_u32(), and BufferDesc::state.

Referenced by AbortBufferIO(), apw_dump_now(), BufferAlloc(), BufferGetLSNAtomic(), BufferSync(), ConditionalLockBufferForCleanup(), DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), ExtendBufferedRelShared(), FindAndDropRelationBuffers(), FlushBuffer(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetBufferFromRing(), GetVictimBuffer(), InvalidateBuffer(), InvalidateVictimBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkBufferDirtyHint(), pg_buffercache_pages(), ReadRecentBuffer(), StartBufferIO(), StrategyGetBuffer(), SyncOneBuffer(), TerminateBufferIO(), UnlockBuffers(), UnpinBufferNoOwner(), and WaitIO().
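A minimal sketch of the header-spinlock protocol: the returned state word has BM_LOCKED set, and the (possibly adjusted) value is handed back to UnlockBufHdr(), which clears BM_LOCKED in the same atomic write.

    uint32      buf_state = LockBufHdr(bufHdr);

    if (buf_state & BM_DIRTY)
    {
        /* ... examine other header fields while the spinlock is held ... */
    }
    UnlockBufHdr(bufHdr, buf_state);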

◆ MarkBufferDirty()

void MarkBufferDirty ( Buffer  buffer)

Definition at line 2190 of file bufmgr.c.

2191 {
2192  BufferDesc *bufHdr;
2193  uint32 buf_state;
2194  uint32 old_buf_state;
2195 
2196  if (!BufferIsValid(buffer))
2197  elog(ERROR, "bad buffer ID: %d", buffer);
2198 
2199  if (BufferIsLocal(buffer))
2200  {
2201  MarkLocalBufferDirty(buffer);
2202  return;
2203  }
2204 
2205  bufHdr = GetBufferDescriptor(buffer - 1);
2206 
2207  Assert(BufferIsPinned(buffer));
2208  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2209  LW_EXCLUSIVE));
2210 
2211  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
2212  for (;;)
2213  {
2214  if (old_buf_state & BM_LOCKED)
2215  old_buf_state = WaitBufHdrUnlocked(bufHdr);
2216 
2217  buf_state = old_buf_state;
2218 
2219  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2220  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
2221 
2222  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
2223  buf_state))
2224  break;
2225  }
2226 
2227  /*
2228  * If the buffer was not dirty already, do vacuum accounting.
2229  */
2230  if (!(old_buf_state & BM_DIRTY))
2231  {
2232  VacuumPageDirty++;
2233  pgBufferUsage.shared_blks_dirtied++;
2234  if (VacuumCostActive)
2235  VacuumCostBalance += VacuumCostPageDirty;
2236  }
2237 }
static bool pg_atomic_compare_exchange_u32(volatile pg_atomic_uint32 *ptr, uint32 *expected, uint32 newval)
Definition: atomics.h:344
static uint32 WaitBufHdrUnlocked(BufferDesc *buf)
Definition: bufmgr.c:5421
bool VacuumCostActive
Definition: globals.c:159
int64 VacuumPageDirty
Definition: globals.c:156
int VacuumCostBalance
Definition: globals.c:158
int VacuumCostPageDirty
Definition: globals.c:150
void MarkLocalBufferDirty(Buffer buffer)
Definition: localbuf.c:450
int64 shared_blks_dirtied
Definition: instrument.h:28

References Assert(), BM_DIRTY, BM_JUST_DIRTIED, BM_LOCKED, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, BufferIsValid(), elog, ERROR, GetBufferDescriptor(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), MarkLocalBufferDirty(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), pgBufferUsage, BufferUsage::shared_blks_dirtied, BufferDesc::state, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, VacuumPageDirty, and WaitBufHdrUnlocked().

Referenced by _bt_clear_incomplete_split(), _bt_dedup_pass(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_getroot(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_newlevel(), _bt_restore_meta(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_freeovflpage(), _hash_init(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), addLeafTuple(), brin_doinsert(), brin_doupdate(), brin_initialize_empty_new_buffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinRevmapDesummarizeRange(), btree_xlog_dedup(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), createPostingTree(), dataExecPlaceToPageInternal(), dataExecPlaceToPageLeaf(), do_setval(), doPickSplit(), entryExecPlaceToPage(), fill_seq_fork_with_data(), FreeSpaceMapPrepareTruncateRel(), generic_redo(), GenericXLogFinish(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginDeletePage(), ginHeapTupleFastInsert(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginUpdateStats(), ginVacuumPostingTreeLeaf(), gistbuild(), gistbuildempty(), gistdeletepage(), gistplacetopage(), gistprunepage(), gistRedoClearFollowRight(), gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_squeeze_page(), hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_finish_speculative(), heap_force_common(), heap_freeze_execute_prepared(), heap_inplace_update(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_page_prune(), heap_update(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_freeze_page(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_prune(), heap_xlog_update(), heap_xlog_vacuum(), heap_xlog_visible(), lazy_scan_new_or_empty(), lazy_scan_prune(), lazy_vacuum_heap_page(), log_newpage_range(), moveLeafs(), nextval_internal(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), revmap_physical_extend(), saveNodeLink(), seq_redo(), shiftList(), spgAddNodeAction(), spgbuild(), SpGistUpdateMetaPage(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), vacuumLeafPage(), vacuumLeafRoot(), vacuumRedirectAndPlaceholder(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), writeListPage(), and XLogReadBufferForRedoExtended().
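A hedged sketch of the WAL-logged update pattern: the buffer is marked dirty inside the critical section, before the WAL record is inserted and its LSN stamped on the page (the XLogInsert() registration details are elided).

    START_CRIT_SECTION();
    /* ... apply the change to BufferGetPage(buf) ... */
    MarkBufferDirty(buf);
    /* recptr = XLogInsert(...); PageSetLSN(BufferGetPage(buf), recptr); */
    END_CRIT_SECTION();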

◆ MarkBufferDirtyHint()

void MarkBufferDirtyHint ( Buffer  buffer,
bool  buffer_std 
)

Definition at line 4625 of file bufmgr.c.

4626 {
4627  BufferDesc *bufHdr;
4628  Page page = BufferGetPage(buffer);
4629 
4630  if (!BufferIsValid(buffer))
4631  elog(ERROR, "bad buffer ID: %d", buffer);
4632 
4633  if (BufferIsLocal(buffer))
4634  {
4635  MarkLocalBufferDirty(buffer);
4636  return;
4637  }
4638 
4639  bufHdr = GetBufferDescriptor(buffer - 1);
4640 
4641  Assert(GetPrivateRefCount(buffer) > 0);
4642  /* here, either share or exclusive lock is OK */
4643  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
4644 
4645  /*
4646  * This routine might get called many times on the same page, if we are
4647  * making the first scan after commit of an xact that added/deleted many
4648  * tuples. So, be as quick as we can if the buffer is already dirty. We
4649  * do this by not acquiring spinlock if it looks like the status bits are
4650  * already set. Since we make this test unlocked, there's a chance we
4651  * might fail to notice that the flags have just been cleared, and failed
4652  * to reset them, due to memory-ordering issues. But since this function
4653  * is only intended to be used in cases where failing to write out the
4654  * data would be harmless anyway, it doesn't really matter.
4655  */
4656  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
4657  (BM_DIRTY | BM_JUST_DIRTIED))
4658  {
4659  XLogRecPtr lsn = InvalidXLogRecPtr;
4660  bool dirtied = false;
4661  bool delayChkptFlags = false;
4662  uint32 buf_state;
4663 
4664  /*
4665  * If we need to protect hint bit updates from torn writes, WAL-log a
4666  * full page image of the page. This full page image is only necessary
4667  * if the hint bit update is the first change to the page since the
4668  * last checkpoint.
4669  *
4670  * We don't check full_page_writes here because that logic is included
4671  * when we call XLogInsert() since the value changes dynamically.
4672  */
4673  if (XLogHintBitIsNeeded() &&
4674  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
4675  {
4676  /*
4677  * If we must not write WAL, due to a relfilelocator-specific
4678  * condition or being in recovery, don't dirty the page. We can
4679  * set the hint, just not dirty the page as a result so the hint
4680  * is lost when we evict the page or shutdown.
4681  *
4682  * See src/backend/storage/page/README for longer discussion.
4683  */
4684  if (RecoveryInProgress() ||
4685  RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
4686  return;
4687 
4688  /*
4689  * If the block is already dirty because we either made a change
4690  * or set a hint already, then we don't need to write a full page
4691  * image. Note that aggressive cleaning of blocks dirtied by hint
4692  * bit setting would increase the call rate. Bulk setting of hint
4693  * bits would reduce the call rate...
4694  *
4695  * We must issue the WAL record before we mark the buffer dirty.
4696  * Otherwise we might write the page before we write the WAL. That
4697  * causes a race condition, since a checkpoint might occur between
4698  * writing the WAL record and marking the buffer dirty. We solve
4699  * that with a kluge, but one that is already in use during
4700  * transaction commit to prevent race conditions. Basically, we
4701  * simply prevent the checkpoint WAL record from being written
4702  * until we have marked the buffer dirty. We don't start the
4703  * checkpoint flush until we have marked dirty, so our checkpoint
4704  * must flush the change to disk successfully or the checkpoint
4705  * never gets written, so crash recovery will fix.
4706  *
4707  * It's possible we may enter here without an xid, so it is
4708  * essential that CreateCheckPoint waits for virtual transactions
4709  * rather than full transactionids.
4710  */
4711  Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
4712  MyProc->delayChkptFlags |= DELAY_CHKPT_START;
4713  delayChkptFlags = true;
4714  lsn = XLogSaveBufferForHint(buffer, buffer_std);
4715  }
4716 
4717  buf_state = LockBufHdr(bufHdr);
4718 
4719  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4720 
4721  if (!(buf_state & BM_DIRTY))
4722  {
4723  dirtied = true; /* Means "will be dirtied by this action" */
4724 
4725  /*
4726  * Set the page LSN if we wrote a backup block. We aren't supposed
4727  * to set this when only holding a share lock but as long as we
4728  * serialise it somehow we're OK. We choose to set LSN while
4729  * holding the buffer header lock, which causes any reader of an
4730  * LSN who holds only a share lock to also obtain a buffer header
4731  * lock before using PageGetLSN(), which is enforced in
4732  * BufferGetLSNAtomic().
4733  *
4734  * If checksums are enabled, you might think we should reset the
4735  * checksum here. That will happen when the page is written
4736  * sometime later in this checkpoint cycle.
4737  */
4738  if (!XLogRecPtrIsInvalid(lsn))
4739  PageSetLSN(page, lsn);
4740  }
4741 
4742  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
4743  UnlockBufHdr(bufHdr, buf_state);
4744 
4745  if (delayChkptFlags)
4746  MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
4747 
4748  if (dirtied)
4749  {
4750  VacuumPageDirty++;
4751  pgBufferUsage.shared_blks_dirtied++;
4752  if (VacuumCostActive)
4753  VacuumCostBalance += VacuumCostPageDirty;
4754  }
4755  }
4756 }
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:388
#define DELAY_CHKPT_START
Definition: proc.h:114
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition: storage.c:532
int delayChkptFlags
Definition: proc.h:236
bool RecoveryInProgress(void)
Definition: xlog.c:6206
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
Definition: xloginsert.c:1066

References Assert(), BM_DIRTY, BM_JUST_DIRTIED, BM_PERMANENT, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferGetPage(), BufferIsLocal, BufferIsValid(), BufTagGetRelFileLocator(), DELAY_CHKPT_START, PGPROC::delayChkptFlags, elog, ERROR, GetBufferDescriptor(), GetPrivateRefCount(), InvalidXLogRecPtr, LockBufHdr(), LWLockHeldByMe(), MarkLocalBufferDirty(), MyProc, PageSetLSN(), pg_atomic_read_u32(), pgBufferUsage, RecoveryInProgress(), RelFileLocatorSkippingWAL(), BufferUsage::shared_blks_dirtied, BufferDesc::state, BufferDesc::tag, UnlockBufHdr(), VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, VacuumPageDirty, XLogHintBitIsNeeded, XLogRecPtrIsInvalid, and XLogSaveBufferForHint().

Referenced by _bt_check_unique(), _bt_killitems(), _hash_kill_items(), brin_start_evacuating_page(), btvacuumpage(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), gistkillitems(), heap_page_prune(), read_seq_tuple(), SetHintBits(), and XLogRecordPageWithFreeSpace().
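A hedged sketch modeled on SetHintBits(): a hint-bit change carries no WAL record of its own, so the page is marked dirty with MarkBufferDirtyHint() rather than MarkBufferDirty(); buffer_std = true says the page follows the standard layout.

    tuple->t_infomask |= infomask;      /* tuple and infomask as in SetHintBits() */
    MarkBufferDirtyHint(buffer, true);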

◆ NewPrivateRefCountEntry()

static PrivateRefCountEntry * NewPrivateRefCountEntry ( Buffer  buffer)
static

Definition at line 305 of file bufmgr.c.

306 {
307  PrivateRefCountEntry *res;
308 
309  /* only allowed to be called when a reservation has been made */
310  Assert(ReservedRefCountEntry != NULL);
311 
312  /* use up the reserved entry */
313  res = ReservedRefCountEntry;
314  ReservedRefCountEntry = NULL;
315 
316  /* and fill it */
317  res->buffer = buffer;
318  res->refcount = 0;
319 
320  return res;
321 }

References Assert(), PrivateRefCountEntry::buffer, res, and ReservedRefCountEntry.

Referenced by PinBuffer(), and PinBuffer_Locked().

◆ PinBuffer()

static bool PinBuffer ( BufferDesc *  buf,
BufferAccessStrategy  strategy 
)
static

Definition at line 2311 of file bufmgr.c.

2312 {
2313  Buffer b = BufferDescriptorGetBuffer(buf);
2314  bool result;
2315  PrivateRefCountEntry *ref;
2316 
2317  Assert(!BufferIsLocal(b));
2318  Assert(ReservedRefCountEntry != NULL);
2319 
2320  ref = GetPrivateRefCountEntry(b, true);
2321 
2322  if (ref == NULL)
2323  {
2324  uint32 buf_state;
2325  uint32 old_buf_state;
2326 
2327  ref = NewPrivateRefCountEntry(b);
2328 
2329  old_buf_state = pg_atomic_read_u32(&buf->state);
2330  for (;;)
2331  {
2332  if (old_buf_state & BM_LOCKED)
2333  old_buf_state = WaitBufHdrUnlocked(buf);
2334 
2335  buf_state = old_buf_state;
2336 
2337  /* increase refcount */
2338  buf_state += BUF_REFCOUNT_ONE;
2339 
2340  if (strategy == NULL)
2341  {
2342  /* Default case: increase usagecount unless already max. */
2343  if (BUF_STATE_GET_USAGECOUNT(buf_state) != BM_MAX_USAGE_COUNT)
2344  buf_state += BUF_USAGECOUNT_ONE;
2345  }
2346  else
2347  {
2348  /*
2349  * Ring buffers shouldn't evict others from pool. Thus we
2350  * don't make usagecount more than 1.
2351  */
2352  if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2353  buf_state += BUF_USAGECOUNT_ONE;
2354  }
2355 
2356  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
2357  buf_state))
2358  {
2359  result = (buf_state & BM_VALID) != 0;
2360 
2361  /*
2362  * Assume that we acquired a buffer pin for the purposes of
2363  * Valgrind buffer client checks (even in !result case) to
2364  * keep things simple. Buffers that are unsafe to access are
2365  * not generally guaranteed to be marked undefined or
2366  * non-accessible in any case.
2367  */
2368  VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
2369  break;
2370  }
2371  }
2372  }
2373  else
2374  {
2375  /*
2376  * If we previously pinned the buffer, it must surely be valid.
2377  *
2378  * Note: We deliberately avoid a Valgrind client request here.
2379  * Individual access methods can optionally superimpose buffer page
2380  * client requests on top of our client requests to enforce that
2381  * buffers are only accessed while locked (and pinned). It's possible
2382  * that the buffer page is legitimately non-accessible here. We
2383  * cannot meddle with that.
2384  */
2385  result = true;
2386  }
2387 
2388  ref->refcount++;
2389  Assert(ref->refcount > 0);
2390  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
2391  return result;
2392 }
#define BM_MAX_USAGE_COUNT
Definition: buf_internals.h:78
#define BUF_REFCOUNT_ONE
Definition: buf_internals.h:43
#define BUF_STATE_GET_USAGECOUNT(state)
Definition: buf_internals.h:52
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition: bufmgr.c:305
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26

References Assert(), b, BM_LOCKED, BM_MAX_USAGE_COUNT, BM_VALID, buf, BUF_REFCOUNT_ONE, BUF_STATE_GET_USAGECOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer(), BufferIsLocal, BufHdrGetBlock, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ReservedRefCountEntry, ResourceOwnerRememberBuffer(), VALGRIND_MAKE_MEM_DEFINED, and WaitBufHdrUnlocked().

Referenced by BufferAlloc(), ExtendBufferedRelShared(), and ReadRecentBuffer().
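
The core of PinBuffer() is a lock-free update of the buffer's state word. The sketch below isolates that compare-and-exchange loop in the style it would take inside bufmgr.c (the helper name is hypothetical, and the usage-count handling shown in the real function is omitted):

  static void
  bump_refcount_sketch(BufferDesc *buf)
  {
      uint32      old_buf_state = pg_atomic_read_u32(&buf->state);
      uint32      buf_state;

      for (;;)
      {
          /* if the header spinlock is held, wait for it to be released first */
          if (old_buf_state & BM_LOCKED)
              old_buf_state = WaitBufHdrUnlocked(buf);

          buf_state = old_buf_state + BUF_REFCOUNT_ONE;

          /* on failure, old_buf_state is refreshed with the current value */
          if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
                                             buf_state))
              break;
      }
  }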

◆ PinBuffer_Locked()

static void PinBuffer_Locked ( BufferDesc buf)
static

Definition at line 2417 of file bufmgr.c.

2418 {
2419  Buffer b;
2420  PrivateRefCountEntry *ref;
2421  uint32 buf_state;
2422 
2423  /*
2424  * As explained, We don't expect any preexisting pins. That allows us to
2425  * manipulate the PrivateRefCount after releasing the spinlock
2426  */
2427  Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
2428 
2429  /*
2430  * Buffer can't have a preexisting pin, so mark its page as defined to
2431  * Valgrind (this is similar to the PinBuffer() case where the backend
2432  * doesn't already have a buffer pin)
2433  */
2434  VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
2435 
2436  /*
2437  * Since we hold the buffer spinlock, we can update the buffer state and
2438  * release the lock in one operation.
2439  */
2440  buf_state = pg_atomic_read_u32(&buf->state);
2441  Assert(buf_state & BM_LOCKED);
2442  buf_state += BUF_REFCOUNT_ONE;
2443  UnlockBufHdr(buf, buf_state);
2444 
2445  b = BufferDescriptorGetBuffer(buf);
2446 
2447  ref = NewPrivateRefCountEntry(b);
2448  ref->refcount++;
2449 
2450  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
2451 }

References Assert(), b, BM_LOCKED, buf, BUF_REFCOUNT_ONE, BufferDescriptorGetBuffer(), BufHdrGetBlock, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ResourceOwnerRememberBuffer(), UnlockBufHdr(), and VALGRIND_MAKE_MEM_DEFINED.

Referenced by FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetVictimBuffer(), ReadRecentBuffer(), and SyncOneBuffer().
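
Callers of PinBuffer_Locked() typically examine the buffer header under its spinlock first and then hand the still-locked header over; PinBuffer_Locked() takes the pin and drops the spinlock. A hypothetical sketch of that shape, loosely modelled on SyncOneBuffer()-style code (the function name is illustrative and the actual FlushBuffer() call is elided):

  static void
  write_one_buffer_sketch(BufferDesc *bufHdr)
  {
      uint32      buf_state;

      /* PinBuffer_Locked() will need a reserved private refcount entry */
      ReservePrivateRefCountEntry();

      buf_state = LockBufHdr(bufHdr);
      if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
      {
          /* nothing to write; just drop the spinlock */
          UnlockBufHdr(bufHdr, buf_state);
          return;
      }

      /* pins the buffer and releases the header spinlock */
      PinBuffer_Locked(bufHdr);

      LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);

      /* ... FlushBuffer(bufHdr, ...) would be called here ... */

      LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
      UnpinBuffer(bufHdr);
  }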

◆ PrefetchBuffer()

PrefetchBufferResult PrefetchBuffer ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 628 of file bufmgr.c.

629 {
630  Assert(RelationIsValid(reln));
631  Assert(BlockNumberIsValid(blockNum));
632 
633  if (RelationUsesLocalBuffers(reln))
634  {
635  /* see comments in ReadBufferExtended */
636  if (RELATION_IS_OTHER_TEMP(reln))
637  ereport(ERROR,
638  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
639  errmsg("cannot access temporary tables of other sessions")));
640 
641  /* pass it off to localbuf.c */
642  return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
643  }
644  else
645  {
646  /* pass it to the shared buffer version */
647  return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
648  }
649 }
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:538
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition: localbuf.c:70
#define RELATION_IS_OTHER_TEMP(relation)
Definition: rel.h:657
#define RelationIsValid(relation)
Definition: rel.h:477

References Assert(), BlockNumberIsValid(), ereport, errcode(), errmsg(), ERROR, PrefetchLocalBuffer(), PrefetchSharedBuffer(), RELATION_IS_OTHER_TEMP, RelationGetSmgr(), RelationIsValid, and RelationUsesLocalBuffers.

Referenced by acquire_sample_rows(), BitmapPrefetch(), count_nondeletable_pages(), and pg_prewarm().
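
A minimal usage sketch, in the spirit of pg_prewarm's prefetch mode (the helper and block range are hypothetical; the relation is assumed to be open, non-temporary, and suitably locked by the caller):

  #include "postgres.h"
  #include "storage/bufmgr.h"
  #include "utils/rel.h"

  /* Hypothetical helper: ask the kernel to start reading a range of main-fork blocks. */
  static void
  prefetch_block_range_sketch(Relation rel, BlockNumber first, BlockNumber last)
  {
      BlockNumber blkno;

      for (blkno = first; blkno <= last; blkno++)
      {
          PrefetchBufferResult res = PrefetchBuffer(rel, MAIN_FORKNUM, blkno);

          /* the result is advisory only; nothing is pinned by a prefetch */
          (void) res;
      }
  }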

◆ PrefetchSharedBuffer()

PrefetchBufferResult PrefetchSharedBuffer ( SMgrRelation  smgr_reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 538 of file bufmgr.c.

541 {
542  PrefetchBufferResult result = {InvalidBuffer, false};
543  BufferTag newTag; /* identity of requested block */
544  uint32 newHash; /* hash value for newTag */
545  LWLock *newPartitionLock; /* buffer partition lock for it */
546  int buf_id;
547 
548  Assert(BlockNumberIsValid(blockNum));
549 
550  /* create a tag so we can lookup the buffer */
551  InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
552  forkNum, blockNum);
553 
554  /* determine its hash code and partition lock ID */
555  newHash = BufTableHashCode(&newTag);
556  newPartitionLock = BufMappingPartitionLock(newHash);
557 
558  /* see if the block is in the buffer pool already */
559  LWLockAcquire(newPartitionLock, LW_SHARED);
560  buf_id = BufTableLookup(&newTag, newHash);
561  LWLockRelease(newPartitionLock);
562 
563  /* If not in buffers, initiate prefetch */
564  if (buf_id < 0)
565  {
566 #ifdef USE_PREFETCH
567  /*
568  * Try to initiate an asynchronous read. This returns false in
569  * recovery if the relation file doesn't exist.
570  */
571  if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
572  smgrprefetch(smgr_reln, forkNum, blockNum, 1))
573  {
574  result.initiated_io = true;
575  }
576 #endif /* USE_PREFETCH */
577  }
578  else
579  {
580  /*
581  * Report the buffer it was in at that time. The caller may be able
582  * to avoid a buffer table lookup, but it's not pinned and it must be
583  * rechecked!
584  */
585  result.recent_buffer = buf_id + 1;
586  }
587 
588  /*
589  * If the block *is* in buffers, we do nothing. This is not really ideal:
590  * the block might be just about to be evicted, which would be stupid
591  * since we know we are going to need it soon. But the only easy answer
592  * is to bump the usage_count, which does not seem like a great solution:
593  * when the caller does ultimately touch the block, usage_count would get
594  * bumped again, resulting in too much favoritism for blocks that are
595  * involved in a prefetch sequence. A real fix would involve some
596  * additional per-buffer state, and it's not clear that there's enough of
597  * a problem to justify that.
598  */
599 
600  return result;
601 }
int io_direct_flags
Definition: fd.c:168
#define IO_DIRECT_DATA
Definition: fd.h:54
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition: smgr.c:588
Buffer recent_buffer
Definition: bufmgr.h:59

References Assert(), BlockNumberIsValid(), BufMappingPartitionLock(), BufTableHashCode(), BufTableLookup(), InitBufferTag(), PrefetchBufferResult::initiated_io, InvalidBuffer, IO_DIRECT_DATA, io_direct_flags, RelFileLocatorBackend::locator, LW_SHARED, LWLockAcquire(), LWLockRelease(), PrefetchBufferResult::recent_buffer, SMgrRelationData::smgr_rlocator, and smgrprefetch().

Referenced by PrefetchBuffer(), and XLogPrefetcherNextBlock().
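
The recent_buffer hint in the result can be revalidated later with ReadRecentBuffer(), which is how the WAL prefetcher uses it. A hypothetical sketch of that flow for a shared-buffered relation (the helper name and fallback path are illustrative; in real code the prefetch would be issued well before the read):

  #include "postgres.h"
  #include "storage/bufmgr.h"
  #include "utils/rel.h"

  static Buffer
  read_with_prefetch_hint_sketch(Relation rel, ForkNumber forknum,
                                 BlockNumber blkno)
  {
      PrefetchBufferResult pf;

      pf = PrefetchSharedBuffer(RelationGetSmgr(rel), forknum, blkno);

      /* ... other work happens here while the read is (maybe) in flight ... */

      if (BufferIsValid(pf.recent_buffer) &&
          ReadRecentBuffer(rel->rd_locator, forknum, blkno, pf.recent_buffer))
          return pf.recent_buffer;    /* hint was still valid; buffer is now pinned */

      /* hint absent or stale: fall back to a normal buffer lookup/read */
      return ReadBufferExtended(rel, forknum, blkno, RBM_NORMAL, NULL);
  }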

◆ ReadBuffer()