PostgreSQL Source Code  git master
bufmgr.c File Reference
#include "postgres.h"
#include <sys/file.h>
#include <unistd.h>
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
#include "catalog/catalog.h"
#include "catalog/storage.h"
#include "catalog/storage_xlog.h"
#include "executor/instrument.h"
#include "lib/binaryheap.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "storage/standby.h"
#include "utils/memdebug.h"
#include "utils/ps_status.h"
#include "utils/rel.h"
#include "utils/resowner_private.h"
#include "utils/timestamp.h"
#include <lib/sort_template.h>

Data Structures

struct  PrivateRefCountEntry
 
struct  CkptTsStatus
 
struct  SMgrSortArray
 

Macros

#define BufHdrGetBlock(bufHdr)   ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
 
#define BufferGetLSN(bufHdr)   (PageGetLSN(BufHdrGetBlock(bufHdr)))
 
#define LocalBufHdrGetBlock(bufHdr)    LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
 
#define BUF_WRITTEN   0x01
 
#define BUF_REUSABLE   0x02
 
#define RELS_BSEARCH_THRESHOLD   20
 
#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)
 
#define REFCOUNT_ARRAY_ENTRIES   8
 
#define BufferIsPinned(bufnum)
 
#define ST_SORT   sort_checkpoint_bufferids
 
#define ST_ELEMENT_TYPE   CkptSortItem
 
#define ST_COMPARE(a, b)   ckpt_buforder_comparator(a, b)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 
#define ST_SORT   sort_pending_writebacks
 
#define ST_ELEMENT_TYPE   PendingWriteback
 
#define ST_COMPARE(a, b)   buffertag_comparator(&a->tag, &b->tag)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 

Typedefs

typedef struct PrivateRefCountEntry PrivateRefCountEntry
 
typedef struct CkptTsStatus CkptTsStatus
 
typedef struct SMgrSortArray SMgrSortArray
 

Functions

static void ReservePrivateRefCountEntry (void)
 
static PrivateRefCountEntry * NewPrivateRefCountEntry (Buffer buffer)
 
static PrivateRefCountEntry * GetPrivateRefCountEntry (Buffer buffer, bool do_move)
 
static int32 GetPrivateRefCount (Buffer buffer)
 
static void ForgetPrivateRefCountEntry (PrivateRefCountEntry *ref)
 
static Buffer ReadBuffer_common (SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool *hit)
 
static BlockNumber ExtendBufferedRelCommon (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
 
static BlockNumber ExtendBufferedRelShared (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
 
static bool PinBuffer (BufferDesc *buf, BufferAccessStrategy strategy)
 
static void PinBuffer_Locked (BufferDesc *buf)
 
static void UnpinBuffer (BufferDesc *buf)
 
static void BufferSync (int flags)
 
static uint32 WaitBufHdrUnlocked (BufferDesc *buf)
 
static int SyncOneBuffer (int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 
static void WaitIO (BufferDesc *buf)
 
static bool StartBufferIO (BufferDesc *buf, bool forInput)
 
static void TerminateBufferIO (BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
 
static void shared_buffer_write_error_callback (void *arg)
 
static void local_buffer_write_error_callback (void *arg)
 
static BufferDesc * BufferAlloc (SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
 
static Buffer GetVictimBuffer (BufferAccessStrategy strategy, IOContext io_context)
 
static void FlushBuffer (BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
 
static void FindAndDropRelationBuffers (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
 
static void RelationCopyStorageUsingBuffer (RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
 
static void AtProcExit_Buffers (int code, Datum arg)
 
static void CheckForBufferLeaks (void)
 
static int rlocator_comparator (const void *p1, const void *p2)
 
static int buffertag_comparator (const BufferTag *ba, const BufferTag *bb)
 
static int ckpt_buforder_comparator (const CkptSortItem *a, const CkptSortItem *b)
 
static int ts_ckpt_progress_comparator (Datum a, Datum b, void *arg)
 
PrefetchBufferResult PrefetchSharedBuffer (SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
 
PrefetchBufferResult PrefetchBuffer (Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 
bool ReadRecentBuffer (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
 
Buffer ReadBuffer (Relation reln, BlockNumber blockNum)
 
Buffer ReadBufferExtended (Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
Buffer ReadBufferWithoutRelcache (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
 
Buffer ExtendBufferedRel (BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
 
BlockNumber ExtendBufferedRelBy (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
 
Buffer ExtendBufferedRelTo (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
 
static void InvalidateBuffer (BufferDesc *buf)
 
static bool InvalidateVictimBuffer (BufferDesc *buf_hdr)
 
static void LimitAdditionalPins (uint32 *additional_pins)
 
void MarkBufferDirty (Buffer buffer)
 
Buffer ReleaseAndReadBuffer (Buffer buffer, Relation relation, BlockNumber blockNum)
 
bool BgBufferSync (WritebackContext *wb_context)
 
void AtEOXact_Buffers (bool isCommit)
 
void InitBufferPoolAccess (void)
 
void PrintBufferLeakWarning (Buffer buffer)
 
void CheckPointBuffers (int flags)
 
BlockNumber BufferGetBlockNumber (Buffer buffer)
 
void BufferGetTag (Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
 
BlockNumber RelationGetNumberOfBlocksInFork (Relation relation, ForkNumber forkNum)
 
bool BufferIsPermanent (Buffer buffer)
 
XLogRecPtr BufferGetLSNAtomic (Buffer buffer)
 
void DropRelationBuffers (SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
 
void DropRelationsAllBuffers (SMgrRelation *smgr_reln, int nlocators)
 
void DropDatabaseBuffers (Oid dbid)
 
void FlushRelationBuffers (Relation rel)
 
void FlushRelationsAllBuffers (SMgrRelation *smgrs, int nrels)
 
void CreateAndCopyRelationData (RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
 
void FlushDatabaseBuffers (Oid dbid)
 
void FlushOneBuffer (Buffer buffer)
 
void ReleaseBuffer (Buffer buffer)
 
void UnlockReleaseBuffer (Buffer buffer)
 
void IncrBufferRefCount (Buffer buffer)
 
void MarkBufferDirtyHint (Buffer buffer, bool buffer_std)
 
void UnlockBuffers (void)
 
void LockBuffer (Buffer buffer, int mode)
 
bool ConditionalLockBuffer (Buffer buffer)
 
void CheckBufferIsPinnedOnce (Buffer buffer)
 
void LockBufferForCleanup (Buffer buffer)
 
bool HoldingBufferPinThatDelaysRecovery (void)
 
bool ConditionalLockBufferForCleanup (Buffer buffer)
 
bool IsBufferCleanupOK (Buffer buffer)
 
void AbortBufferIO (Buffer buffer)
 
uint32 LockBufHdr (BufferDesc *desc)
 
void WritebackContextInit (WritebackContext *context, int *max_pending)
 
void ScheduleBufferTagForWriteback (WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
 
void IssuePendingWritebacks (WritebackContext *wb_context, IOContext io_context)
 

Variables

bool zero_damaged_pages = false
 
int bgwriter_lru_maxpages = 100
 
double bgwriter_lru_multiplier = 2.0
 
bool track_io_timing = false
 
int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY
 
int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY
 
int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER
 
int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER
 
int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER
 
static BufferDesc * PinCountWaitBuf = NULL
 
static struct PrivateRefCountEntry PrivateRefCountArray [REFCOUNT_ARRAY_ENTRIES]
 
static HTAB * PrivateRefCountHash = NULL
 
static int32 PrivateRefCountOverflowed = 0
 
static uint32 PrivateRefCountClock = 0
 
static PrivateRefCountEntry * ReservedRefCountEntry = NULL
 

Macro Definition Documentation

◆ BUF_DROP_FULL_SCAN_THRESHOLD

#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)

Definition at line 82 of file bufmgr.c.

◆ BUF_REUSABLE

#define BUF_REUSABLE   0x02

Definition at line 72 of file bufmgr.c.

◆ BUF_WRITTEN

#define BUF_WRITTEN   0x01

Definition at line 71 of file bufmgr.c.

◆ BufferGetLSN

#define BufferGetLSN (   bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))

Definition at line 64 of file bufmgr.c.

◆ BufferIsPinned

#define BufferIsPinned (   bufnum)
Value:
( \
!BufferIsValid(bufnum) ? \
false \
: \
BufferIsLocal(bufnum) ? \
(LocalRefCount[-(bufnum) - 1] > 0) \
: \
(GetPrivateRefCount(bufnum) > 0) \
)
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:380
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:301
int32 * LocalRefCount
Definition: localbuf.c:46

Definition at line 438 of file bufmgr.c.
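
BufferIsPinned() is private to bufmgr.c and serves as a sanity check: the public entry points assert that the caller already holds a pin before dereferencing the buffer. A minimal sketch of that idiom follows (the helper name is hypothetical, and it assumes the code lives inside bufmgr.c where the macro, BufferDesc and GetBufferDescriptor() are visible):

/* Hypothetical helper, assuming bufmgr.c's includes and statics are in scope. */
static BlockNumber
sketch_read_tag_blockno(Buffer buffer)
{
	BufferDesc *bufHdr;

	/*
	 * False for InvalidBuffer; checks LocalRefCount[] for local buffers and
	 * the backend-private refcount for shared buffers.
	 */
	Assert(BufferIsPinned(buffer));
	Assert(!BufferIsLocal(buffer));	/* keep the sketch to shared buffers */

	bufHdr = GetBufferDescriptor(buffer - 1);
	return bufHdr->tag.blockNum;	/* pinned, so the tag is stable */
}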

◆ BufHdrGetBlock

#define BufHdrGetBlock (   bufHdr)    ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))

Definition at line 63 of file bufmgr.c.

◆ LocalBufHdrGetBlock

#define LocalBufHdrGetBlock (   bufHdr)     LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]

Definition at line 67 of file bufmgr.c.

◆ REFCOUNT_ARRAY_ENTRIES

#define REFCOUNT_ARRAY_ENTRIES   8

Definition at line 91 of file bufmgr.c.

◆ RELS_BSEARCH_THRESHOLD

#define RELS_BSEARCH_THRESHOLD   20

Definition at line 74 of file bufmgr.c.

◆ ST_COMPARE [1/2]

#define ST_COMPARE (   a,
  b 
)    ckpt_buforder_comparator(a, b)

Definition at line 5486 of file bufmgr.c.

◆ ST_COMPARE [2/2]

#define ST_COMPARE (   a,
  b 
)    buffertag_comparator(&a->tag, &b->tag)

Definition at line 5486 of file bufmgr.c.

◆ ST_DEFINE [1/2]

#define ST_DEFINE

Definition at line 5488 of file bufmgr.c.

◆ ST_DEFINE [2/2]

#define ST_DEFINE

Definition at line 5488 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [1/2]

#define ST_ELEMENT_TYPE   CkptSortItem

Definition at line 5485 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [2/2]

#define ST_ELEMENT_TYPE   PendingWriteback

Definition at line 5485 of file bufmgr.c.

◆ ST_SCOPE [1/2]

#define ST_SCOPE   static

Definition at line 5487 of file bufmgr.c.

◆ ST_SCOPE [2/2]

#define ST_SCOPE   static

Definition at line 5487 of file bufmgr.c.

◆ ST_SORT [1/2]

#define ST_SORT   sort_checkpoint_bufferids

Definition at line 5484 of file bufmgr.c.

◆ ST_SORT [2/2]

#define ST_SORT   sort_pending_writebacks

Definition at line 5484 of file bufmgr.c.

Typedef Documentation

◆ CkptTsStatus

typedef struct CkptTsStatus CkptTsStatus

◆ PrivateRefCountEntry

typedef struct PrivateRefCountEntry PrivateRefCountEntry

◆ SMgrSortArray

typedef struct SMgrSortArray SMgrSortArray

Function Documentation

◆ AbortBufferIO()

void AbortBufferIO ( Buffer  buffer)

Definition at line 5191 of file bufmgr.c.

5192 {
5193  BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
5194  uint32 buf_state;
5195 
5196  buf_state = LockBufHdr(buf_hdr);
5197  Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
5198 
5199  if (!(buf_state & BM_VALID))
5200  {
5201  Assert(!(buf_state & BM_DIRTY));
5202  UnlockBufHdr(buf_hdr, buf_state);
5203  }
5204  else
5205  {
5206  Assert(buf_state & BM_DIRTY);
5207  UnlockBufHdr(buf_hdr, buf_state);
5208 
5209  /* Issue notice if this is not the first failure... */
5210  if (buf_state & BM_IO_ERROR)
5211  {
5212  /* Buffer is pinned, so we can read tag without spinlock */
5213  char *path;
5214 
5215  path = relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
5216  BufTagGetForkNum(&buf_hdr->tag));
5217  ereport(WARNING,
5218  (errcode(ERRCODE_IO_ERROR),
5219  errmsg("could not write block %u of %s",
5220  buf_hdr->tag.blockNum, path),
5221  errdetail("Multiple failures --- write error might be permanent.")));
5222  pfree(path);
5223  }
5224  }
5225 
5226  TerminateBufferIO(buf_hdr, false, BM_IO_ERROR);
5227 }
#define BM_TAG_VALID
Definition: buf_internals.h:62
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
static BufferDesc * GetBufferDescriptor(uint32 id)
static void UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
#define BM_DIRTY
Definition: buf_internals.h:60
#define BM_IO_IN_PROGRESS
Definition: buf_internals.h:63
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
Definition: buf_internals.h:61
#define BM_IO_ERROR
Definition: buf_internals.h:64
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:5300
static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
Definition: bufmgr.c:5160
unsigned int uint32
Definition: c.h:495
int errdetail(const char *fmt,...)
Definition: elog.c:1202
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define WARNING
Definition: elog.h:36
#define ereport(elevel,...)
Definition: elog.h:149
Assert(fmt[strlen(fmt) - 1] !='\n')
void pfree(void *pointer)
Definition: mcxt.c:1456
#define relpathperm(rlocator, forknum)
Definition: relpath.h:90
BufferTag tag
BlockNumber blockNum
Definition: buf_internals.h:97

References Assert(), buftag::blockNum, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, BufTagGetForkNum(), BufTagGetRelFileLocator(), ereport, errcode(), errdetail(), errmsg(), GetBufferDescriptor(), LockBufHdr(), pfree(), relpathperm, BufferDesc::tag, TerminateBufferIO(), UnlockBufHdr(), and WARNING.

Referenced by ResourceOwnerReleaseInternal().

◆ AtEOXact_Buffers()

void AtEOXact_Buffers ( bool  isCommit)

Definition at line 3132 of file bufmgr.c.

3133 {
3134  CheckForBufferLeaks();
3135 
3136  AtEOXact_LocalBuffers(isCommit);
3137 
3138  Assert(PrivateRefCountOverflowed == 0);
3139 }
static void CheckForBufferLeaks(void)
Definition: bufmgr.c:3192
static int32 PrivateRefCountOverflowed
Definition: bufmgr.c:198
void AtEOXact_LocalBuffers(bool isCommit)
Definition: localbuf.c:802

References Assert(), AtEOXact_LocalBuffers(), CheckForBufferLeaks(), and PrivateRefCountOverflowed.

Referenced by AbortTransaction(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), CommitTransaction(), PrepareTransaction(), and WalWriterMain().

◆ AtProcExit_Buffers()

static void AtProcExit_Buffers ( int  code,
Datum  arg 
)
static

Definition at line 3174 of file bufmgr.c.

3175 {
3176  UnlockBuffers();
3177 
3178  CheckForBufferLeaks();
3179 
3180  /* localbuf.c needs a chance too */
3181  AtProcExit_LocalBuffers();
3182 }
void UnlockBuffers(void)
Definition: bufmgr.c:4687
void AtProcExit_LocalBuffers(void)
Definition: localbuf.c:813

References AtProcExit_LocalBuffers(), CheckForBufferLeaks(), and UnlockBuffers().

Referenced by InitBufferPoolAccess().

◆ BgBufferSync()

bool BgBufferSync ( WritebackContext *  wb_context)

Definition at line 2758 of file bufmgr.c.

2759 {
2760  /* info obtained from freelist.c */
2761  int strategy_buf_id;
2762  uint32 strategy_passes;
2763  uint32 recent_alloc;
2764 
2765  /*
2766  * Information saved between calls so we can determine the strategy
2767  * point's advance rate and avoid scanning already-cleaned buffers.
2768  */
2769  static bool saved_info_valid = false;
2770  static int prev_strategy_buf_id;
2771  static uint32 prev_strategy_passes;
2772  static int next_to_clean;
2773  static uint32 next_passes;
2774 
2775  /* Moving averages of allocation rate and clean-buffer density */
2776  static float smoothed_alloc = 0;
2777  static float smoothed_density = 10.0;
2778 
2779  /* Potentially these could be tunables, but for now, not */
2780  float smoothing_samples = 16;
2781  float scan_whole_pool_milliseconds = 120000.0;
2782 
2783  /* Used to compute how far we scan ahead */
2784  long strategy_delta;
2785  int bufs_to_lap;
2786  int bufs_ahead;
2787  float scans_per_alloc;
2788  int reusable_buffers_est;
2789  int upcoming_alloc_est;
2790  int min_scan_buffers;
2791 
2792  /* Variables for the scanning loop proper */
2793  int num_to_scan;
2794  int num_written;
2795  int reusable_buffers;
2796 
2797  /* Variables for final smoothed_density update */
2798  long new_strategy_delta;
2799  uint32 new_recent_alloc;
2800 
2801  /*
2802  * Find out where the freelist clock sweep currently is, and how many
2803  * buffer allocations have happened since our last call.
2804  */
2805  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
2806 
2807  /* Report buffer alloc counts to pgstat */
2808  PendingBgWriterStats.buf_alloc += recent_alloc;
2809 
2810  /*
2811  * If we're not running the LRU scan, just stop after doing the stats
2812  * stuff. We mark the saved state invalid so that we can recover sanely
2813  * if LRU scan is turned back on later.
2814  */
2815  if (bgwriter_lru_maxpages <= 0)
2816  {
2817  saved_info_valid = false;
2818  return true;
2819  }
2820 
2821  /*
2822  * Compute strategy_delta = how many buffers have been scanned by the
2823  * clock sweep since last time. If first time through, assume none. Then
2824  * see if we are still ahead of the clock sweep, and if so, how many
2825  * buffers we could scan before we'd catch up with it and "lap" it. Note:
2826  * weird-looking coding of xxx_passes comparisons are to avoid bogus
2827  * behavior when the passes counts wrap around.
2828  */
2829  if (saved_info_valid)
2830  {
2831  int32 passes_delta = strategy_passes - prev_strategy_passes;
2832 
2833  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2834  strategy_delta += (long) passes_delta * NBuffers;
2835 
2836  Assert(strategy_delta >= 0);
2837 
2838  if ((int32) (next_passes - strategy_passes) > 0)
2839  {
2840  /* we're one pass ahead of the strategy point */
2841  bufs_to_lap = strategy_buf_id - next_to_clean;
2842 #ifdef BGW_DEBUG
2843  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2844  next_passes, next_to_clean,
2845  strategy_passes, strategy_buf_id,
2846  strategy_delta, bufs_to_lap);
2847 #endif
2848  }
2849  else if (next_passes == strategy_passes &&
2850  next_to_clean >= strategy_buf_id)
2851  {
2852  /* on same pass, but ahead or at least not behind */
2853  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
2854 #ifdef BGW_DEBUG
2855  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
2856  next_passes, next_to_clean,
2857  strategy_passes, strategy_buf_id,
2858  strategy_delta, bufs_to_lap);
2859 #endif
2860  }
2861  else
2862  {
2863  /*
2864  * We're behind, so skip forward to the strategy point and start
2865  * cleaning from there.
2866  */
2867 #ifdef BGW_DEBUG
2868  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
2869  next_passes, next_to_clean,
2870  strategy_passes, strategy_buf_id,
2871  strategy_delta);
2872 #endif
2873  next_to_clean = strategy_buf_id;
2874  next_passes = strategy_passes;
2875  bufs_to_lap = NBuffers;
2876  }
2877  }
2878  else
2879  {
2880  /*
2881  * Initializing at startup or after LRU scanning had been off. Always
2882  * start at the strategy point.
2883  */
2884 #ifdef BGW_DEBUG
2885  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
2886  strategy_passes, strategy_buf_id);
2887 #endif
2888  strategy_delta = 0;
2889  next_to_clean = strategy_buf_id;
2890  next_passes = strategy_passes;
2891  bufs_to_lap = NBuffers;
2892  }
2893 
2894  /* Update saved info for next time */
2895  prev_strategy_buf_id = strategy_buf_id;
2896  prev_strategy_passes = strategy_passes;
2897  saved_info_valid = true;
2898 
2899  /*
2900  * Compute how many buffers had to be scanned for each new allocation, ie,
2901  * 1/density of reusable buffers, and track a moving average of that.
2902  *
2903  * If the strategy point didn't move, we don't update the density estimate
2904  */
2905  if (strategy_delta > 0 && recent_alloc > 0)
2906  {
2907  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
2908  smoothed_density += (scans_per_alloc - smoothed_density) /
2909  smoothing_samples;
2910  }
2911 
2912  /*
2913  * Estimate how many reusable buffers there are between the current
2914  * strategy point and where we've scanned ahead to, based on the smoothed
2915  * density estimate.
2916  */
2917  bufs_ahead = NBuffers - bufs_to_lap;
2918  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
2919 
2920  /*
2921  * Track a moving average of recent buffer allocations. Here, rather than
2922  * a true average we want a fast-attack, slow-decline behavior: we
2923  * immediately follow any increase.
2924  */
2925  if (smoothed_alloc <= (float) recent_alloc)
2926  smoothed_alloc = recent_alloc;
2927  else
2928  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
2929  smoothing_samples;
2930 
2931  /* Scale the estimate by a GUC to allow more aggressive tuning. */
2932  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
2933 
2934  /*
2935  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
2936  * eventually underflow to zero, and the underflows produce annoying
2937  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
2938  * zero, there's no point in tracking smaller and smaller values of
2939  * smoothed_alloc, so just reset it to exactly zero to avoid this
2940  * syndrome. It will pop back up as soon as recent_alloc increases.
2941  */
2942  if (upcoming_alloc_est == 0)
2943  smoothed_alloc = 0;
2944 
2945  /*
2946  * Even in cases where there's been little or no buffer allocation
2947  * activity, we want to make a small amount of progress through the buffer
2948  * cache so that as many reusable buffers as possible are clean after an
2949  * idle period.
2950  *
2951  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
2952  * the BGW will be called during the scan_whole_pool time; slice the
2953  * buffer pool into that many sections.
2954  */
2955  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
2956 
2957  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
2958  {
2959 #ifdef BGW_DEBUG
2960  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
2961  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
2962 #endif
2963  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
2964  }
2965 
2966  /*
2967  * Now write out dirty reusable buffers, working forward from the
2968  * next_to_clean point, until we have lapped the strategy scan, or cleaned
2969  * enough buffers to match our estimate of the next cycle's allocation
2970  * requirements, or hit the bgwriter_lru_maxpages limit.
2971  */
2972 
2973  /* Make sure we can handle the pin inside SyncOneBuffer */
2974  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2975 
2976  num_to_scan = bufs_to_lap;
2977  num_written = 0;
2978  reusable_buffers = reusable_buffers_est;
2979 
2980  /* Execute the LRU scan */
2981  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
2982  {
2983  int sync_state = SyncOneBuffer(next_to_clean, true,
2984  wb_context);
2985 
2986  if (++next_to_clean >= NBuffers)
2987  {
2988  next_to_clean = 0;
2989  next_passes++;
2990  }
2991  num_to_scan--;
2992 
2993  if (sync_state & BUF_WRITTEN)
2994  {
2995  reusable_buffers++;
2996  if (++num_written >= bgwriter_lru_maxpages)
2997  {
2998  PendingBgWriterStats.maxwritten_clean++;
2999  break;
3000  }
3001  }
3002  else if (sync_state & BUF_REUSABLE)
3003  reusable_buffers++;
3004  }
3005 
3006  PendingBgWriterStats.buf_written_clean += num_written;
3007 
3008 #ifdef BGW_DEBUG
3009  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3010  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3011  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3012  bufs_to_lap - num_to_scan,
3013  num_written,
3014  reusable_buffers - reusable_buffers_est);
3015 #endif
3016 
3017  /*
3018  * Consider the above scan as being like a new allocation scan.
3019  * Characterize its density and update the smoothed one based on it. This
3020  * effectively halves the moving average period in cases where both the
3021  * strategy and the background writer are doing some useful scanning,
3022  * which is helpful because a long memory isn't as desirable on the
3023  * density estimates.
3024  */
3025  new_strategy_delta = bufs_to_lap - num_to_scan;
3026  new_recent_alloc = reusable_buffers - reusable_buffers_est;
3027  if (new_strategy_delta > 0 && new_recent_alloc > 0)
3028  {
3029  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
3030  smoothed_density += (scans_per_alloc - smoothed_density) /
3031  smoothing_samples;
3032 
3033 #ifdef BGW_DEBUG
3034  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3035  new_recent_alloc, new_strategy_delta,
3036  scans_per_alloc, smoothed_density);
3037 #endif
3038  }
3039 
3040  /* Return true if OK to hibernate */
3041  return (bufs_to_lap == 0 && recent_alloc == 0);
3042 }
int BgWriterDelay
Definition: bgwriter.c:61
#define BUF_REUSABLE
Definition: bufmgr.c:72
double bgwriter_lru_multiplier
Definition: bufmgr.c:137
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition: bufmgr.c:3061
int bgwriter_lru_maxpages
Definition: bufmgr.c:136
#define BUF_WRITTEN
Definition: bufmgr.c:71
signed int int32
Definition: c.h:483
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition: freelist.c:394
int NBuffers
Definition: globals.c:136
PgStat_BgWriterStats PendingBgWriterStats
ResourceOwner CurrentResourceOwner
Definition: resowner.c:147
void ResourceOwnerEnlargeBuffers(ResourceOwner owner)
Definition: resowner.c:972
PgStat_Counter buf_written_clean
Definition: pgstat.h:255
PgStat_Counter maxwritten_clean
Definition: pgstat.h:256
PgStat_Counter buf_alloc
Definition: pgstat.h:257

References Assert(), bgwriter_lru_maxpages, bgwriter_lru_multiplier, BgWriterDelay, PgStat_BgWriterStats::buf_alloc, BUF_REUSABLE, BUF_WRITTEN, PgStat_BgWriterStats::buf_written_clean, CurrentResourceOwner, DEBUG1, DEBUG2, elog(), PgStat_BgWriterStats::maxwritten_clean, NBuffers, PendingBgWriterStats, ResourceOwnerEnlargeBuffers(), StrategySyncStart(), and SyncOneBuffer().

Referenced by BackgroundWriterMain().
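
A minimal sketch of the calling pattern, loosely modeled on BackgroundWriterMain(); the real caller adds latch waits, signal handling and shutdown logic, all omitted here. The helper name is hypothetical; bgwriter_flush_after is the real GUC variable declared in bufmgr.h, and the return value tells the caller when it may hibernate.

#include "postgres.h"

#include "storage/buf_internals.h"
#include "storage/bufmgr.h"

/* Hypothetical, simplified background-writer loop. */
static void
bgwriter_loop_sketch(void)
{
	WritebackContext wb_context;

	/* group writeback requests, bounded by the bgwriter_flush_after GUC */
	WritebackContextInit(&wb_context, &bgwriter_flush_after);

	for (;;)
	{
		bool		can_hibernate = BgBufferSync(&wb_context);

		if (can_hibernate)
		{
			/* strategy point idle and nothing left to clean: sleep longer */
		}

		/* the real loop waits roughly BgWriterDelay ms on its latch here */
	}
}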

◆ BufferAlloc()

static BufferDesc * BufferAlloc ( SMgrRelation  smgr,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool *  foundPtr,
IOContext  io_context 
)
static

Definition at line 1220 of file bufmgr.c.

1224 {
1225  BufferTag newTag; /* identity of requested block */
1226  uint32 newHash; /* hash value for newTag */
1227  LWLock *newPartitionLock; /* buffer partition lock for it */
1228  int existing_buf_id;
1229  Buffer victim_buffer;
1230  BufferDesc *victim_buf_hdr;
1231  uint32 victim_buf_state;
1232 
1233  /* create a tag so we can lookup the buffer */
1234  InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
1235 
1236  /* determine its hash code and partition lock ID */
1237  newHash = BufTableHashCode(&newTag);
1238  newPartitionLock = BufMappingPartitionLock(newHash);
1239 
1240  /* see if the block is in the buffer pool already */
1241  LWLockAcquire(newPartitionLock, LW_SHARED);
1242  existing_buf_id = BufTableLookup(&newTag, newHash);
1243  if (existing_buf_id >= 0)
1244  {
1245  BufferDesc *buf;
1246  bool valid;
1247 
1248  /*
1249  * Found it. Now, pin the buffer so no one can steal it from the
1250  * buffer pool, and check to see if the correct data has been loaded
1251  * into the buffer.
1252  */
1253  buf = GetBufferDescriptor(existing_buf_id);
1254 
1255  valid = PinBuffer(buf, strategy);
1256 
1257  /* Can release the mapping lock as soon as we've pinned it */
1258  LWLockRelease(newPartitionLock);
1259 
1260  *foundPtr = true;
1261 
1262  if (!valid)
1263  {
1264  /*
1265  * We can only get here if (a) someone else is still reading in
1266  * the page, or (b) a previous read attempt failed. We have to
1267  * wait for any active read attempt to finish, and then set up our
1268  * own read attempt if the page is still not BM_VALID.
1269  * StartBufferIO does it all.
1270  */
1271  if (StartBufferIO(buf, true))
1272  {
1273  /*
1274  * If we get here, previous attempts to read the buffer must
1275  * have failed ... but we shall bravely try again.
1276  */
1277  *foundPtr = false;
1278  }
1279  }
1280 
1281  return buf;
1282  }
1283 
1284  /*
1285  * Didn't find it in the buffer pool. We'll have to initialize a new
1286  * buffer. Remember to unlock the mapping lock while doing the work.
1287  */
1288  LWLockRelease(newPartitionLock);
1289 
1290  /*
1291  * Acquire a victim buffer. Somebody else might try to do the same, we
1292  * don't hold any conflicting locks. If so we'll have to undo our work
1293  * later.
1294  */
1295  victim_buffer = GetVictimBuffer(strategy, io_context);
1296  victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
1297 
1298  /*
1299  * Try to make a hashtable entry for the buffer under its new tag. If
1300  * somebody else inserted another buffer for the tag, we'll release the
1301  * victim buffer we acquired and use the already inserted one.
1302  */
1303  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1304  existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
1305  if (existing_buf_id >= 0)
1306  {
1307  BufferDesc *existing_buf_hdr;
1308  bool valid;
1309 
1310  /*
1311  * Got a collision. Someone has already done what we were about to do.
1312  * We'll just handle this as if it were found in the buffer pool in
1313  * the first place. First, give up the buffer we were planning to
1314  * use.
1315  *
1316  * We could do this after releasing the partition lock, but then we'd
1317  * have to call ResourceOwnerEnlargeBuffers() &
1318  * ReservePrivateRefCountEntry() before acquiring the lock, for the
1319  * rare case of such a collision.
1320  */
1321  UnpinBuffer(victim_buf_hdr);
1322 
1323  /*
1324  * The victim buffer we acquired previously is clean and unused, let it
1325  * be found again quickly
1326  */
1327  StrategyFreeBuffer(victim_buf_hdr);
1328 
1329  /* remaining code should match code at top of routine */
1330 
1331  existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
1332 
1333  valid = PinBuffer(existing_buf_hdr, strategy);
1334 
1335  /* Can release the mapping lock as soon as we've pinned it */
1336  LWLockRelease(newPartitionLock);
1337 
1338  *foundPtr = true;
1339 
1340  if (!valid)
1341  {
1342  /*
1343  * We can only get here if (a) someone else is still reading in
1344  * the page, or (b) a previous read attempt failed. We have to
1345  * wait for any active read attempt to finish, and then set up our
1346  * own read attempt if the page is still not BM_VALID.
1347  * StartBufferIO does it all.
1348  */
1349  if (StartBufferIO(existing_buf_hdr, true))
1350  {
1351  /*
1352  * If we get here, previous attempts to read the buffer must
1353  * have failed ... but we shall bravely try again.
1354  */
1355  *foundPtr = false;
1356  }
1357  }
1358 
1359  return existing_buf_hdr;
1360  }
1361 
1362  /*
1363  * Need to lock the buffer header too in order to change its tag.
1364  */
1365  victim_buf_state = LockBufHdr(victim_buf_hdr);
1366 
1367  /* some sanity checks while we hold the buffer header lock */
1368  Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
1369  Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
1370 
1371  victim_buf_hdr->tag = newTag;
1372 
1373  /*
1374  * Make sure BM_PERMANENT is set for buffers that must be written at every
1375  * checkpoint. Unlogged buffers only need to be written at shutdown
1376  * checkpoints, except for their "init" forks, which need to be treated
1377  * just like permanent relations.
1378  */
1379  victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1380  if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1381  victim_buf_state |= BM_PERMANENT;
1382 
1383  UnlockBufHdr(victim_buf_hdr, victim_buf_state);
1384 
1385  LWLockRelease(newPartitionLock);
1386 
1387  /*
1388  * Buffer contents are currently invalid. Try to obtain the right to
1389  * start I/O. If StartBufferIO returns false, then someone else managed
1390  * to read it before we did, so there's nothing left for BufferAlloc() to
1391  * do.
1392  */
1393  if (StartBufferIO(victim_buf_hdr, true))
1394  *foundPtr = false;
1395  else
1396  *foundPtr = true;
1397 
1398  return victim_buf_hdr;
1399 }
int Buffer
Definition: buf.h:23
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_PERMANENT
Definition: buf_internals.h:68
static LWLock * BufMappingPartitionLock(uint32 hashcode)
#define BUF_USAGECOUNT_ONE
Definition: buf_internals.h:45
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:50
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:91
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:79
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition: buf_table.c:119
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
Definition: bufmgr.c:2231
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition: bufmgr.c:1585
static bool StartBufferIO(BufferDesc *buf, bool forInput)
Definition: bufmgr.c:5109
static void UnpinBuffer(BufferDesc *buf)
Definition: bufmgr.c:2379
void StrategyFreeBuffer(BufferDesc *buf)
Definition: freelist.c:363
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1195
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1808
@ LW_SHARED
Definition: lwlock.h:117
@ LW_EXCLUSIVE
Definition: lwlock.h:116
static char * buf
Definition: pg_test_fsync.c:67
@ INIT_FORKNUM
Definition: relpath.h:53
Definition: lwlock.h:41
RelFileLocator locator
RelFileLocatorBackend smgr_rlocator
Definition: smgr.h:42

References Assert(), BM_DIRTY, BM_IO_IN_PROGRESS, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BufferDesc::buf_id, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_ONE, BufMappingPartitionLock(), BufTableHashCode(), BufTableInsert(), BufTableLookup(), GetBufferDescriptor(), GetVictimBuffer(), INIT_FORKNUM, InitBufferTag(), RelFileLocatorBackend::locator, LockBufHdr(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), PinBuffer(), SMgrRelationData::smgr_rlocator, StartBufferIO(), StrategyFreeBuffer(), BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by ReadBuffer_common().

◆ BufferGetBlockNumber()

BlockNumber BufferGetBlockNumber ( Buffer  buffer)

Definition at line 3290 of file bufmgr.c.

3291 {
3292  BufferDesc *bufHdr;
3293 
3294  Assert(BufferIsPinned(buffer));
3295 
3296  if (BufferIsLocal(buffer))
3297  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3298  else
3299  bufHdr = GetBufferDescriptor(buffer - 1);
3300 
3301  /* pinned, so OK to read tag without spinlock */
3302  return bufHdr->tag.blockNum;
3303 }
#define BufferIsLocal(buffer)
Definition: buf.h:37
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:438

References Assert(), buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), GetLocalBufferDescriptor(), and BufferDesc::tag.

Referenced by _bt_binsrch_insert(), _bt_bottomupdel_pass(), _bt_check_unique(), _bt_checkpage(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_doinsert(), _bt_endpoint(), _bt_finish_split(), _bt_first(), _bt_getroot(), _bt_insert_parent(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_moveright(), _bt_newlevel(), _bt_pagedel(), _bt_readnextpage(), _bt_readpage(), _bt_restore_meta(), _bt_search(), _bt_simpledel_pass(), _bt_split(), _bt_unlink_halfdead_page(), _bt_walk_left(), _hash_addovflpage(), _hash_checkpage(), _hash_doinsert(), _hash_first(), _hash_freeovflpage(), _hash_getnewbuf(), _hash_readnext(), _hash_readpage(), _hash_splitbucket(), allocNewBuffer(), blinsert(), BloomInitMetapage(), brin_doinsert(), brin_doupdate(), brin_getinsertbuffer(), brin_initialize_empty_new_buffer(), brin_page_cleanup(), brin_xlog_insert_update(), brinbuild(), brinGetTupleForHeapBlock(), btbuildempty(), collectMatchBitmap(), createPostingTree(), dataBeginPlaceToPageLeaf(), dataPrepareDownlink(), doPickSplit(), entryPrepareDownlink(), fill_seq_fork_with_data(), ginEntryInsert(), ginFindParents(), ginFinishSplit(), ginPlaceToPage(), ginRedoDeleteListPages(), ginRedoUpdateMetapage(), ginScanToDelete(), gistbufferinginserttuples(), gistbuild(), gistcheckpage(), gistdeletepage(), gistformdownlink(), gistinserttuples(), gistMemorizeAllDownlinks(), gistplacetopage(), gistRelocateBuildBuffersOnSplit(), gistScanPage(), hash_xlog_add_ovfl_page(), heap_delete(), heap_hot_search_buffer(), heap_insert(), heap_multi_insert(), heap_page_is_all_visible(), heap_page_prune(), heap_prune_chain(), heap_update(), heap_xlog_confirm(), heap_xlog_lock(), index_compute_xid_horizon_for_tuples(), lazy_scan_noprune(), lazy_scan_prune(), makeSublist(), moveLeafs(), moveRightIfItNeeded(), pgstathashindex(), ReadBufferBI(), RelationAddBlocks(), RelationGetBufferForTuple(), RelationPutHeapTuple(), revmap_get_buffer(), revmap_physical_extend(), ScanSourceDatabasePgClassPage(), spgAddNodeAction(), spgbuild(), spgbuildempty(), spgdoinsert(), SpGistSetLastUsedPage(), spgSplitNodeAction(), spgWalk(), startScanEntry(), terminate_brin_buildstate(), vacuumLeafPage(), visibilitymap_clear(), visibilitymap_get_status(), visibilitymap_pin(), visibilitymap_pin_ok(), and visibilitymap_set().
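
A minimal usage sketch (hypothetical helper; assumes a valid heap Relation and an existing block number). ReadBuffer() returns the buffer already pinned, which is what makes the unlocked tag read inside BufferGetBlockNumber() safe:

#include "postgres.h"

#include "storage/bufmgr.h"

static void
pin_and_check_block(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);	/* returned pinned */

	Assert(BufferGetBlockNumber(buf) == blkno);

	ReleaseBuffer(buf);			/* drop the pin when done */
}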

◆ BufferGetLSNAtomic()

XLogRecPtr BufferGetLSNAtomic ( Buffer  buffer)

Definition at line 3551 of file bufmgr.c.

3552 {
3553  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
3554  char *page = BufferGetPage(buffer);
3555  XLogRecPtr lsn;
3556  uint32 buf_state;
3557 
3558  /*
3559  * If we don't need locking for correctness, fastpath out.
3560  */
3561  if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
3562  return PageGetLSN(page);
3563 
3564  /* Make sure we've got a real buffer, and that we hold a pin on it. */
3565  Assert(BufferIsValid(buffer));
3566  Assert(BufferIsPinned(buffer));
3567 
3568  buf_state = LockBufHdr(bufHdr);
3569  lsn = PageGetLSN(page);
3570  UnlockBufHdr(bufHdr, buf_state);
3571 
3572  return lsn;
3573 }
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:350
static XLogRecPtr PageGetLSN(Page page)
Definition: bufpage.h:383
#define XLogHintBitIsNeeded()
Definition: xlog.h:115
uint64 XLogRecPtr
Definition: xlogdefs.h:21

References Assert(), PrivateRefCountEntry::buffer, BufferGetPage(), BufferIsLocal, BufferIsPinned, BufferIsValid(), GetBufferDescriptor(), LockBufHdr(), PageGetLSN(), UnlockBufHdr(), and XLogHintBitIsNeeded.

Referenced by _bt_killitems(), _bt_readpage(), gistdoinsert(), gistFindPath(), gistkillitems(), gistScanPage(), SetHintBits(), and XLogSaveBufferForHint().
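
A hedged sketch of the usual pattern, as in the nbtree callers listed above: snapshot the page LSN while the page is pinned (and here share-locked) so that a later visit can detect intervening changes. The helper name is hypothetical:

#include "postgres.h"

#include "storage/bufmgr.h"

static XLogRecPtr
snapshot_page_lsn(Buffer buf)
{
	XLogRecPtr	lsn;

	LockBuffer(buf, BUFFER_LOCK_SHARE);
	/* takes the header spinlock only when wal_log_hints/checksums demand it */
	lsn = BufferGetLSNAtomic(buf);
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);

	return lsn;
}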

◆ BufferGetTag()

void BufferGetTag ( Buffer  buffer,
RelFileLocator *  rlocator,
ForkNumber *  forknum,
BlockNumber *  blknum 
)

Definition at line 3311 of file bufmgr.c.

3313 {
3314  BufferDesc *bufHdr;
3315 
3316  /* Do the same checks as BufferGetBlockNumber. */
3317  Assert(BufferIsPinned(buffer));
3318 
3319  if (BufferIsLocal(buffer))
3320  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3321  else
3322  bufHdr = GetBufferDescriptor(buffer - 1);
3323 
3324  /* pinned, so OK to read tag without spinlock */
3325  *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
3326  *forknum = BufTagGetForkNum(&bufHdr->tag);
3327  *blknum = bufHdr->tag.blockNum;
3328 }

References Assert(), buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufTagGetForkNum(), BufTagGetRelFileLocator(), GetBufferDescriptor(), GetLocalBufferDescriptor(), and BufferDesc::tag.

Referenced by fsm_search_avail(), ginRedoInsertEntry(), log_newpage_buffer(), ResolveCminCmaxDuringDecoding(), XLogRegisterBuffer(), and XLogSaveBufferForHint().
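
A minimal sketch (hypothetical helper) of the typical reason to call it: recovering the relation, fork and block identity of a pinned buffer for reporting or WAL bookkeeping:

#include "postgres.h"

#include "storage/bufmgr.h"

static void
report_buffer_identity(Buffer buf)
{
	RelFileLocator rlocator;
	ForkNumber	forknum;
	BlockNumber blkno;

	/* requires only a pin; the tag cannot change while we hold it */
	BufferGetTag(buf, &rlocator, &forknum, &blkno);

	elog(DEBUG1, "buffer %d holds block %u of fork %d of relation %u",
		 buf, blkno, (int) forknum, rlocator.relNumber);
}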

◆ BufferIsPermanent()

bool BufferIsPermanent ( Buffer  buffer)

Definition at line 3521 of file bufmgr.c.

3522 {
3523  BufferDesc *bufHdr;
3524 
3525  /* Local buffers are used only for temp relations. */
3526  if (BufferIsLocal(buffer))
3527  return false;
3528 
3529  /* Make sure we've got a real buffer, and that we hold a pin on it. */
3530  Assert(BufferIsValid(buffer));
3531  Assert(BufferIsPinned(buffer));
3532 
3533  /*
3534  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
3535  * need not bother with the buffer header spinlock. Even if someone else
3536  * changes the buffer header state while we're doing this, the state is
3537  * changed atomically, so we'll read the old value or the new value, but
3538  * not random garbage.
3539  */
3540  bufHdr = GetBufferDescriptor(buffer - 1);
3541  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
3542 }
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:236
pg_atomic_uint32 state

References Assert(), BM_PERMANENT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferIsValid(), GetBufferDescriptor(), pg_atomic_read_u32(), and BufferDesc::state.

Referenced by SetHintBits().
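
Its in-tree caller is SetHintBits(), where the point is that only permanent (WAL-logged) relations need the LSN interlock when setting hint bits. A reduced, hypothetical sketch of that decision, assuming access/xlog.h for XLogHintBitIsNeeded():

#include "postgres.h"

#include "access/xlog.h"
#include "storage/bufmgr.h"

static bool
hint_bit_needs_lsn_check(Buffer buf)
{
	/* temp and unlogged buffers have no WAL ordering to respect */
	if (!BufferIsPermanent(buf))
		return false;

	/* only matters when wal_log_hints or data checksums are enabled */
	return XLogHintBitIsNeeded();
}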

◆ BufferSync()

static void BufferSync ( int  flags)
static

Definition at line 2479 of file bufmgr.c.

2480 {
2481  uint32 buf_state;
2482  int buf_id;
2483  int num_to_scan;
2484  int num_spaces;
2485  int num_processed;
2486  int num_written;
2487  CkptTsStatus *per_ts_stat = NULL;
2488  Oid last_tsid;
2489  binaryheap *ts_heap;
2490  int i;
2491  int mask = BM_DIRTY;
2492  WritebackContext wb_context;
2493 
2494  /* Make sure we can handle the pin inside SyncOneBuffer */
2495  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
2496 
2497  /*
2498  * Unless this is a shutdown checkpoint or we have been explicitly told,
2499  * we write only permanent, dirty buffers. But at shutdown or end of
2500  * recovery, we write all dirty buffers.
2501  */
2502  if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
2503  CHECKPOINT_FLUSH_ALL))))
2504  mask |= BM_PERMANENT;
2505 
2506  /*
2507  * Loop over all buffers, and mark the ones that need to be written with
2508  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
2509  * can estimate how much work needs to be done.
2510  *
2511  * This allows us to write only those pages that were dirty when the
2512  * checkpoint began, and not those that get dirtied while it proceeds.
2513  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
2514  * later in this function, or by normal backends or the bgwriter cleaning
2515  * scan, the flag is cleared. Any buffer dirtied after this point won't
2516  * have the flag set.
2517  *
2518  * Note that if we fail to write some buffer, we may leave buffers with
2519  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
2520  * certainly need to be written for the next checkpoint attempt, too.
2521  */
2522  num_to_scan = 0;
2523  for (buf_id = 0; buf_id < NBuffers; buf_id++)
2524  {
2525  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2526 
2527  /*
2528  * Header spinlock is enough to examine BM_DIRTY, see comment in
2529  * SyncOneBuffer.
2530  */
2531  buf_state = LockBufHdr(bufHdr);
2532 
2533  if ((buf_state & mask) == mask)
2534  {
2535  CkptSortItem *item;
2536 
2537  buf_state |= BM_CHECKPOINT_NEEDED;
2538 
2539  item = &CkptBufferIds[num_to_scan++];
2540  item->buf_id = buf_id;
2541  item->tsId = bufHdr->tag.spcOid;
2542  item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
2543  item->forkNum = BufTagGetForkNum(&bufHdr->tag);
2544  item->blockNum = bufHdr->tag.blockNum;
2545  }
2546 
2547  UnlockBufHdr(bufHdr, buf_state);
2548 
2549  /* Check for barrier events in case NBuffers is large. */
2550  if (ProcSignalBarrierPending)
2551  ProcessProcSignalBarrier();
2552  }
2553 
2554  if (num_to_scan == 0)
2555  return; /* nothing to do */
2556 
2557  WritebackContextInit(&wb_context, &checkpoint_flush_after);
2558 
2559  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
2560 
2561  /*
2562  * Sort buffers that need to be written to reduce the likelihood of random
2563  * IO. The sorting is also important for the implementation of balancing
2564  * writes between tablespaces. Without balancing writes we'd potentially
2565  * end up writing to the tablespaces one-by-one; possibly overloading the
2566  * underlying system.
2567  */
2568  sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
2569 
2570  num_spaces = 0;
2571 
2572  /*
2573  * Allocate progress status for each tablespace with buffers that need to
2574  * be flushed. This requires the to-be-flushed array to be sorted.
2575  */
2576  last_tsid = InvalidOid;
2577  for (i = 0; i < num_to_scan; i++)
2578  {
2579  CkptTsStatus *s;
2580  Oid cur_tsid;
2581 
2582  cur_tsid = CkptBufferIds[i].tsId;
2583 
2584  /*
2585  * Grow array of per-tablespace status structs, every time a new
2586  * tablespace is found.
2587  */
2588  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
2589  {
2590  Size sz;
2591 
2592  num_spaces++;
2593 
2594  /*
2595  * Not worth adding grow-by-power-of-2 logic here - even with a
2596  * few hundred tablespaces this should be fine.
2597  */
2598  sz = sizeof(CkptTsStatus) * num_spaces;
2599 
2600  if (per_ts_stat == NULL)
2601  per_ts_stat = (CkptTsStatus *) palloc(sz);
2602  else
2603  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
2604 
2605  s = &per_ts_stat[num_spaces - 1];
2606  memset(s, 0, sizeof(*s));
2607  s->tsId = cur_tsid;
2608 
2609  /*
2610  * The first buffer in this tablespace. As CkptBufferIds is sorted
2611  * by tablespace all (s->num_to_scan) buffers in this tablespace
2612  * will follow afterwards.
2613  */
2614  s->index = i;
2615 
2616  /*
2617  * progress_slice will be determined once we know how many buffers
2618  * are in each tablespace, i.e. after this loop.
2619  */
2620 
2621  last_tsid = cur_tsid;
2622  }
2623  else
2624  {
2625  s = &per_ts_stat[num_spaces - 1];
2626  }
2627 
2628  s->num_to_scan++;
2629 
2630  /* Check for barrier events. */
2631  if (ProcSignalBarrierPending)
2632  ProcessProcSignalBarrier();
2633  }
2634 
2635  Assert(num_spaces > 0);
2636 
2637  /*
2638  * Build a min-heap over the write-progress in the individual tablespaces,
2639  * and compute how large a portion of the total progress a single
2640  * processed buffer is.
2641  */
2642  ts_heap = binaryheap_allocate(num_spaces,
2643  ts_ckpt_progress_comparator,
2644  NULL);
2645 
2646  for (i = 0; i < num_spaces; i++)
2647  {
2648  CkptTsStatus *ts_stat = &per_ts_stat[i];
2649 
2650  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
2651 
2652  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
2653  }
2654 
2655  binaryheap_build(ts_heap);
2656 
2657  /*
2658  * Iterate through to-be-checkpointed buffers and write the ones (still)
2659  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
2660  * tablespaces; otherwise the sorting would lead to only one tablespace
2661  * receiving writes at a time, making inefficient use of the hardware.
2662  */
2663  num_processed = 0;
2664  num_written = 0;
2665  while (!binaryheap_empty(ts_heap))
2666  {
2667  BufferDesc *bufHdr = NULL;
2668  CkptTsStatus *ts_stat = (CkptTsStatus *)
2669  DatumGetPointer(binaryheap_first(ts_heap));
2670 
2671  buf_id = CkptBufferIds[ts_stat->index].buf_id;
2672  Assert(buf_id != -1);
2673 
2674  bufHdr = GetBufferDescriptor(buf_id);
2675 
2676  num_processed++;
2677 
2678  /*
2679  * We don't need to acquire the lock here, because we're only looking
2680  * at a single bit. It's possible that someone else writes the buffer
2681  * and clears the flag right after we check, but that doesn't matter
2682  * since SyncOneBuffer will then do nothing. However, there is a
2683  * further race condition: it's conceivable that between the time we
2684  * examine the bit here and the time SyncOneBuffer acquires the lock,
2685  * someone else not only wrote the buffer but replaced it with another
2686  * page and dirtied it. In that improbable case, SyncOneBuffer will
2687  * write the buffer though we didn't need to. It doesn't seem worth
2688  * guarding against this, though.
2689  */
2690  if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
2691  {
2692  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
2693  {
2694  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
2695  PendingCheckpointerStats.buf_written_checkpoints++;
2696  num_written++;
2697  }
2698  }
2699 
2700  /*
2701  * Measure progress independent of actually having to flush the buffer
2702  * - otherwise writing becomes unbalanced.
2703  */
2704  ts_stat->progress += ts_stat->progress_slice;
2705  ts_stat->num_scanned++;
2706  ts_stat->index++;
2707 
2708  /* Have all the buffers from the tablespace been processed? */
2709  if (ts_stat->num_scanned == ts_stat->num_to_scan)
2710  {
2711  binaryheap_remove_first(ts_heap);
2712  }
2713  else
2714  {
2715  /* update heap with the new progress */
2716  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
2717  }
2718 
2719  /*
2720  * Sleep to throttle our I/O rate.
2721  *
2722  * (This will check for barrier events even if it doesn't sleep.)
2723  */
2724  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
2725  }
2726 
2727  /*
2728  * Issue all pending flushes. Only checkpointer calls BufferSync(), so
2729  * IOContext will always be IOCONTEXT_NORMAL.
2730  */
2732  IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
2733  pfree(per_ts_stat);
2734  per_ts_stat = NULL;
2735  binaryheap_free(ts_heap);
2736 
2737  /*
2738  * Update checkpoint statistics. As noted above, this doesn't include
2739  * buffers written by other backends or bgwriter scan.
2740  */
2741  CheckpointStats.ckpt_bufs_written += num_written;
2742 
2743  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
2744 }
void binaryheap_build(binaryheap *heap)
Definition: binaryheap.c:138
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:255
bh_node_type binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:177
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition: binaryheap.c:192
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition: binaryheap.c:39
void binaryheap_free(binaryheap *heap)
Definition: binaryheap.c:75
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:116
#define binaryheap_empty(h)
Definition: binaryheap.h:65
CkptSortItem * CkptBufferIds
Definition: buf_init.c:25
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
#define BM_CHECKPOINT_NEEDED
Definition: buf_internals.h:67
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition: bufmgr.c:5419
int checkpoint_flush_after
Definition: bufmgr.c:159
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition: bufmgr.c:5442
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition: bufmgr.c:5499
struct CkptTsStatus CkptTsStatus
double float8
Definition: c.h:619
size_t Size
Definition: c.h:594
void CheckpointWriteDelay(int flags, double progress)
Definition: checkpointer.c:696
volatile sig_atomic_t ProcSignalBarrierPending
Definition: globals.c:37
int i
Definition: isn.c:73
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1476
void * palloc(Size size)
Definition: mcxt.c:1226
@ IOCONTEXT_NORMAL
Definition: pgstat.h:288
PgStat_CheckpointerStats PendingCheckpointerStats
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:322
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
void ProcessProcSignalBarrier(void)
Definition: procsignal.c:468
int ckpt_bufs_written
Definition: xlog.h:162
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition: bufmgr.c:110
int index
Definition: bufmgr.c:118
int num_scanned
Definition: bufmgr.c:115
float8 progress
Definition: bufmgr.c:109
int num_to_scan
Definition: bufmgr.c:113
Oid tsId
Definition: bufmgr.c:100
PgStat_Counter buf_written_checkpoints
Definition: pgstat.h:267
Oid spcOid
Definition: buf_internals.h:93
CheckpointStatsData CheckpointStats
Definition: xlog.c:212
#define CHECKPOINT_END_OF_RECOVERY
Definition: xlog.h:135
#define CHECKPOINT_FLUSH_ALL
Definition: xlog.h:138
#define CHECKPOINT_IS_SHUTDOWN
Definition: xlog.h:134

References Assert(), binaryheap_add_unordered(), binaryheap_allocate(), binaryheap_build(), binaryheap_empty, binaryheap_first(), binaryheap_free(), binaryheap_remove_first(), binaryheap_replace_first(), buftag::blockNum, CkptSortItem::blockNum, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_PERMANENT, CkptSortItem::buf_id, BUF_WRITTEN, PgStat_CheckpointerStats::buf_written_checkpoints, BufTagGetForkNum(), BufTagGetRelNumber(), CHECKPOINT_END_OF_RECOVERY, checkpoint_flush_after, CHECKPOINT_FLUSH_ALL, CHECKPOINT_IS_SHUTDOWN, CheckpointStats, CheckpointWriteDelay(), CheckpointStatsData::ckpt_bufs_written, CkptBufferIds, CurrentResourceOwner, DatumGetPointer(), CkptSortItem::forkNum, GetBufferDescriptor(), i, CkptTsStatus::index, InvalidOid, IOCONTEXT_NORMAL, IssuePendingWritebacks(), LockBufHdr(), NBuffers, CkptTsStatus::num_scanned, CkptTsStatus::num_to_scan, palloc(), PendingCheckpointerStats, pfree(), pg_atomic_read_u32(), PointerGetDatum(), ProcessProcSignalBarrier(), ProcSignalBarrierPending, CkptTsStatus::progress, CkptTsStatus::progress_slice, CkptSortItem::relNumber, repalloc(), ResourceOwnerEnlargeBuffers(), buftag::spcOid, BufferDesc::state, SyncOneBuffer(), BufferDesc::tag, ts_ckpt_progress_comparator(), CkptTsStatus::tsId, CkptSortItem::tsId, UnlockBufHdr(), and WritebackContextInit().

Referenced by CheckPointBuffers().

◆ buffertag_comparator()

static int buffertag_comparator ( const BufferTag *  ba,
const BufferTag *  bb 
)
inlinestatic

Definition at line 5354 of file bufmgr.c.

5355 {
5356  int ret;
5357  RelFileLocator rlocatora;
5358  RelFileLocator rlocatorb;
5359 
5360  rlocatora = BufTagGetRelFileLocator(ba);
5361  rlocatorb = BufTagGetRelFileLocator(bb);
5362 
5363  ret = rlocator_comparator(&rlocatora, &rlocatorb);
5364 
5365  if (ret != 0)
5366  return ret;
5367 
5368  if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
5369  return -1;
5370  if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
5371  return 1;
5372 
5373  if (ba->blockNum < bb->blockNum)
5374  return -1;
5375  if (ba->blockNum > bb->blockNum)
5376  return 1;
5377 
5378  return 0;
5379 }
static int rlocator_comparator(const void *p1, const void *p2)
Definition: bufmgr.c:5273

References buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), and rlocator_comparator().

◆ CheckBufferIsPinnedOnce()

void CheckBufferIsPinnedOnce ( Buffer  buffer)

Definition at line 4762 of file bufmgr.c.

4763 {
4764  if (BufferIsLocal(buffer))
4765  {
4766  if (LocalRefCount[-buffer - 1] != 1)
4767  elog(ERROR, "incorrect local pin count: %d",
4768  LocalRefCount[-buffer - 1]);
4769  }
4770  else
4771  {
4772  if (GetPrivateRefCount(buffer) != 1)
4773  elog(ERROR, "incorrect local pin count: %d",
4774  GetPrivateRefCount(buffer));
4775  }
4776 }
#define ERROR
Definition: elog.h:39

References PrivateRefCountEntry::buffer, BufferIsLocal, elog(), ERROR, GetPrivateRefCount(), and LocalRefCount.

Referenced by GetVictimBuffer(), and LockBufferForCleanup().
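
A minimal sketch of the calling convention (hypothetical helper): the function raises ERROR unless this backend holds exactly one pin on the buffer, so it is typically called just before an operation that assumes sole ownership of the local pin:

#include "postgres.h"

#include "storage/bufmgr.h"

static void
rewrite_page_sketch(Buffer buf)
{
	/* errors out unless this backend holds exactly one pin on the buffer */
	CheckBufferIsPinnedOnce(buf);

	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	/* ... modify the page, MarkBufferDirty(buf), etc. ... */
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}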

◆ CheckForBufferLeaks()

static void CheckForBufferLeaks ( void  )
static

Definition at line 3192 of file bufmgr.c.

3193 {
3194 #ifdef USE_ASSERT_CHECKING
3195  int RefCountErrors = 0;
3196  PrivateRefCountEntry *res;
3197  int i;
3198 
3199  /* check the array */
3200  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
3201  {
3202  res = &PrivateRefCountArray[i];
3203 
3204  if (res->buffer != InvalidBuffer)
3205  {
3206  PrintBufferLeakWarning(res->buffer);
3207  RefCountErrors++;
3208  }
3209  }
3210 
3211  /* if necessary search the hash */
3212  if (PrivateRefCountOverflowed)
3213  {
3214  HASH_SEQ_STATUS hstat;
3215 
3216  hash_seq_init(&hstat, PrivateRefCountHash);
3217  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
3218  {
3219  PrintBufferLeakWarning(res->buffer);
3220  RefCountErrors++;
3221  }
3222  }
3223 
3224  Assert(RefCountErrors == 0);
3225 #endif
3226 }
#define InvalidBuffer
Definition: buf.h:25
#define REFCOUNT_ARRAY_ENTRIES
Definition: bufmgr.c:91
void PrintBufferLeakWarning(Buffer buffer)
Definition: bufmgr.c:3232
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition: bufmgr.c:196
static HTAB * PrivateRefCountHash
Definition: bufmgr.c:197
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1431
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1421

References Assert(), hash_seq_init(), hash_seq_search(), i, InvalidBuffer, PrintBufferLeakWarning(), PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, and res.

Referenced by AtEOXact_Buffers(), and AtProcExit_Buffers().

◆ CheckPointBuffers()

void CheckPointBuffers ( int  flags)

Definition at line 3276 of file bufmgr.c.

3277 {
3278  BufferSync(flags);
3279 }
static void BufferSync(int flags)
Definition: bufmgr.c:2479

References BufferSync().

Referenced by CheckPointGuts().

◆ ckpt_buforder_comparator()

static int ckpt_buforder_comparator ( const CkptSortItem a,
const CkptSortItem b 
)
inlinestatic

Definition at line 5388 of file bufmgr.c.

5389 {
5390  /* compare tablespace */
5391  if (a->tsId < b->tsId)
5392  return -1;
5393  else if (a->tsId > b->tsId)
5394  return 1;
5395  /* compare relation */
5396  if (a->relNumber < b->relNumber)
5397  return -1;
5398  else if (a->relNumber > b->relNumber)
5399  return 1;
5400  /* compare fork */
5401  else if (a->forkNum < b->forkNum)
5402  return -1;
5403  else if (a->forkNum > b->forkNum)
5404  return 1;
5405  /* compare block number */
5406  else if (a->blockNum < b->blockNum)
5407  return -1;
5408  else if (a->blockNum > b->blockNum)
5409  return 1;
5410  /* equal page IDs are unlikely, but not impossible */
5411  return 0;
5412 }
int b
Definition: isn.c:70
int a
Definition: isn.c:69

References a, and b.

◆ ConditionalLockBuffer()

bool ConditionalLockBuffer ( Buffer  buffer)

Definition at line 4741 of file bufmgr.c.

4742 {
4743  BufferDesc *buf;
4744 
4745  Assert(BufferIsPinned(buffer));
4746  if (BufferIsLocal(buffer))
4747  return true; /* act as though we got it */
4748 
4749  buf = GetBufferDescriptor(buffer - 1);
4750 
4751  return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
4752  LW_EXCLUSIVE);
4753 }
static LWLock * BufferDescriptorGetContentLock(const BufferDesc *bdesc)
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1366

References Assert(), buf, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), LW_EXCLUSIVE, and LWLockConditionalAcquire().

Referenced by _bt_conditionallockbuf(), BloomNewBuffer(), ConditionalLockBufferForCleanup(), GinNewBuffer(), gistNewBuffer(), RelationGetBufferForTuple(), spgdoinsert(), SpGistGetBuffer(), SpGistNewBuffer(), and SpGistUpdateMetaPage().
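A hedged sketch of the usual try-lock pattern (buf is an assumed, already-pinned buffer; the callers listed above all follow some variant of this):

 if (ConditionalLockBuffer(buf))
 {
     /* got the exclusive content lock without blocking; modify the page */
     LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 }
 else
 {
     /* lock is contended; caller falls back, e.g. picks another target page */
 }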

◆ ConditionalLockBufferForCleanup()

bool ConditionalLockBufferForCleanup ( Buffer  buffer)

Definition at line 4956 of file bufmgr.c.

4957 {
4958  BufferDesc *bufHdr;
4959  uint32 buf_state,
4960  refcount;
4961 
4962  Assert(BufferIsValid(buffer));
4963 
4964  if (BufferIsLocal(buffer))
4965  {
4966  refcount = LocalRefCount[-buffer - 1];
4967  /* There should be exactly one pin */
4968  Assert(refcount > 0);
4969  if (refcount != 1)
4970  return false;
4971  /* Nobody else to wait for */
4972  return true;
4973  }
4974 
4975  /* There should be exactly one local pin */
4976  refcount = GetPrivateRefCount(buffer);
4977  Assert(refcount);
4978  if (refcount != 1)
4979  return false;
4980 
4981  /* Try to acquire lock */
4982  if (!ConditionalLockBuffer(buffer))
4983  return false;
4984 
4985  bufHdr = GetBufferDescriptor(buffer - 1);
4986  buf_state = LockBufHdr(bufHdr);
4987  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
4988 
4989  Assert(refcount > 0);
4990  if (refcount == 1)
4991  {
4992  /* Successfully acquired exclusive lock with pincount 1 */
4993  UnlockBufHdr(bufHdr, buf_state);
4994  return true;
4995  }
4996 
4997  /* Failed, so release the lock */
4998  UnlockBufHdr(bufHdr, buf_state);
4999  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5000  return false;
5001 }
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:4741
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:4715
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:157

References Assert(), BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid(), ConditionalLockBuffer(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBuffer(), LockBufHdr(), PrivateRefCountEntry::refcount, and UnlockBufHdr().

Referenced by _hash_finish_split(), _hash_getbuf_with_condlock_cleanup(), heap_page_prune_opt(), and lazy_scan_heap().
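A hedged sketch of the opportunistic-cleanup pattern used by callers such as heap_page_prune_opt(): proceed only when this backend's pin is the sole pin and the exclusive lock can be taken without waiting.

 if (ConditionalLockBufferForCleanup(buffer))
 {
     /* sole pin + exclusive lock: safe to do cleanup-strength work */
     /* ... prune / defragment the page here ... */
     LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 }
 /* else: skip the cleanup this time rather than block */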

◆ CreateAndCopyRelationData()

void CreateAndCopyRelationData ( RelFileLocator  src_rlocator,
RelFileLocator  dst_rlocator,
bool  permanent 
)

Definition at line 4348 of file bufmgr.c.

4350 {
4351  RelFileLocatorBackend rlocator;
4352  char relpersistence;
4353 
4354  /* Set the relpersistence. */
4355  relpersistence = permanent ?
4356  RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
4357 
4358  /*
4359  * Create and copy all forks of the relation. During create database we
4360  * have a separate cleanup mechanism which deletes complete database
4361  * directory. Therefore, each individual relation doesn't need to be
4362  * registered for cleanup.
4363  */
4364  RelationCreateStorage(dst_rlocator, relpersistence, false);
4365 
4366  /* copy main fork. */
4367  RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
4368  permanent);
4369 
4370  /* copy those extra forks that exist */
4371  for (ForkNumber forkNum = MAIN_FORKNUM + 1;
4372  forkNum <= MAX_FORKNUM; forkNum++)
4373  {
4374  if (smgrexists(smgropen(src_rlocator, InvalidBackendId), forkNum))
4375  {
4376  smgrcreate(smgropen(dst_rlocator, InvalidBackendId), forkNum, false);
4377 
4378  /*
4379  * WAL log creation if the relation is persistent, or this is the
4380  * init fork of an unlogged relation.
4381  */
4382  if (permanent || forkNum == INIT_FORKNUM)
4383  log_smgrcreate(&dst_rlocator, forkNum);
4384 
4385  /* Copy a fork's data, block by block. */
4386  RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
4387  permanent);
4388  }
4389  }
4390 
4391  /* close source and destination smgr if exists. */
4392  rlocator.backend = InvalidBackendId;
4393 
4394  rlocator.locator = src_rlocator;
4395  smgrcloserellocator(rlocator);
4396 
4397  rlocator.locator = dst_rlocator;
4398  smgrcloserellocator(rlocator);
4399 }
#define InvalidBackendId
Definition: backendid.h:23
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition: bufmgr.c:4257
ForkNumber
Definition: relpath.h:48
@ MAIN_FORKNUM
Definition: relpath.h:50
#define MAX_FORKNUM
Definition: relpath.h:62
void smgrcloserellocator(RelFileLocatorBackend rlocator)
Definition: smgr.c:351
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition: smgr.c:374
SMgrRelation smgropen(RelFileLocator rlocator, BackendId backend)
Definition: smgr.c:150
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:251
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition: storage.c:120
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition: storage.c:185

References RelFileLocatorBackend::backend, INIT_FORKNUM, InvalidBackendId, RelFileLocatorBackend::locator, log_smgrcreate(), MAIN_FORKNUM, MAX_FORKNUM, RelationCopyStorageUsingBuffer(), RelationCreateStorage(), smgrcloserellocator(), smgrcreate(), smgrexists(), and smgropen().

Referenced by CreateDatabaseUsingWalLog().

◆ DropDatabaseBuffers()

void DropDatabaseBuffers ( Oid  dbid)

Definition at line 3952 of file bufmgr.c.

3953 {
3954  int i;
3955 
3956  /*
3957  * We needn't consider local buffers, since by assumption the target
3958  * database isn't our own.
3959  */
3960 
3961  for (i = 0; i < NBuffers; i++)
3962  {
3963  BufferDesc *bufHdr = GetBufferDescriptor(i);
3964  uint32 buf_state;
3965 
3966  /*
3967  * As in DropRelationBuffers, an unlocked precheck should be safe and
3968  * saves some cycles.
3969  */
3970  if (bufHdr->tag.dbOid != dbid)
3971  continue;
3972 
3973  buf_state = LockBufHdr(bufHdr);
3974  if (bufHdr->tag.dbOid == dbid)
3975  InvalidateBuffer(bufHdr); /* releases spinlock */
3976  else
3977  UnlockBufHdr(bufHdr, buf_state);
3978  }
3979 }
static void InvalidateBuffer(BufferDesc *buf)
Definition: bufmgr.c:1419
Oid dbOid
Definition: buf_internals.h:94

References buftag::dbOid, GetBufferDescriptor(), i, InvalidateBuffer(), LockBufHdr(), NBuffers, BufferDesc::tag, and UnlockBufHdr().

Referenced by createdb_failure_callback(), dbase_redo(), dropdb(), and movedb().

◆ DropRelationBuffers()

void DropRelationBuffers ( SMgrRelation  smgr_reln,
ForkNumber forkNum,
int  nforks,
BlockNumber firstDelBlock 
)

Definition at line 3597 of file bufmgr.c.

3599 {
3600  int i;
3601  int j;
3602  RelFileLocatorBackend rlocator;
3603  BlockNumber nForkBlock[MAX_FORKNUM];
3604  uint64 nBlocksToInvalidate = 0;
3605 
3606  rlocator = smgr_reln->smgr_rlocator;
3607 
3608  /* If it's a local relation, it's localbuf.c's problem. */
3609  if (RelFileLocatorBackendIsTemp(rlocator))
3610  {
3611  if (rlocator.backend == MyBackendId)
3612  {
3613  for (j = 0; j < nforks; j++)
3614  DropRelationLocalBuffers(rlocator.locator, forkNum[j],
3615  firstDelBlock[j]);
3616  }
3617  return;
3618  }
3619 
3620  /*
3621  * To remove all the pages of the specified relation forks from the buffer
3622  * pool, we need to scan the entire buffer pool but we can optimize it by
3623  * finding the buffers from BufMapping table provided we know the exact
3624  * size of each fork of the relation. The exact size is required to ensure
3625  * that we don't leave any buffer for the relation being dropped as
3626  * otherwise the background writer or checkpointer can lead to a PANIC
3627  * error while flushing buffers corresponding to files that don't exist.
3628  *
3629  * To know the exact size, we rely on the size cached for each fork by us
3630  * during recovery which limits the optimization to recovery and on
3631  * standbys but we can easily extend it once we have shared cache for
3632  * relation size.
3633  *
3634  * In recovery, we cache the value returned by the first lseek(SEEK_END)
3635  * and the future writes keeps the cached value up-to-date. See
3636  * smgrextend. It is possible that the value of the first lseek is smaller
3637  * than the actual number of existing blocks in the file due to buggy
3638  * Linux kernels that might not have accounted for the recent write. But
3639  * that should be fine because there must not be any buffers after that
3640  * file size.
3641  */
3642  for (i = 0; i < nforks; i++)
3643  {
3644  /* Get the number of blocks for a relation's fork */
3645  nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
3646 
3647  if (nForkBlock[i] == InvalidBlockNumber)
3648  {
3649  nBlocksToInvalidate = InvalidBlockNumber;
3650  break;
3651  }
3652 
3653  /* calculate the number of blocks to be invalidated */
3654  nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
3655  }
3656 
3657  /*
3658  * We apply the optimization iff the total number of blocks to invalidate
3659  * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
3660  */
3661  if (BlockNumberIsValid(nBlocksToInvalidate) &&
3662  nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
3663  {
3664  for (j = 0; j < nforks; j++)
3665  FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
3666  nForkBlock[j], firstDelBlock[j]);
3667  return;
3668  }
3669 
3670  for (i = 0; i < NBuffers; i++)
3671  {
3672  BufferDesc *bufHdr = GetBufferDescriptor(i);
3673  uint32 buf_state;
3674 
3675  /*
3676  * We can make this a tad faster by prechecking the buffer tag before
3677  * we attempt to lock the buffer; this saves a lot of lock
3678  * acquisitions in typical cases. It should be safe because the
3679  * caller must have AccessExclusiveLock on the relation, or some other
3680  * reason to be certain that no one is loading new pages of the rel
3681  * into the buffer pool. (Otherwise we might well miss such pages
3682  * entirely.) Therefore, while the tag might be changing while we
3683  * look at it, it can't be changing *to* a value we care about, only
3684  * *away* from such a value. So false negatives are impossible, and
3685  * false positives are safe because we'll recheck after getting the
3686  * buffer lock.
3687  *
3688  * We could check forkNum and blockNum as well as the rlocator, but
3689  * the incremental win from doing so seems small.
3690  */
3691  if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
3692  continue;
3693 
3694  buf_state = LockBufHdr(bufHdr);
3695 
3696  for (j = 0; j < nforks; j++)
3697  {
3698  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
3699  BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
3700  bufHdr->tag.blockNum >= firstDelBlock[j])
3701  {
3702  InvalidateBuffer(bufHdr); /* releases spinlock */
3703  break;
3704  }
3705  }
3706  if (j >= nforks)
3707  UnlockBufHdr(bufHdr, buf_state);
3708  }
3709 }
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition: bufmgr.c:82
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition: bufmgr.c:3891
BackendId MyBackendId
Definition: globals.c:85
int j
Definition: isn.c:74
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber firstDelBlock)
Definition: localbuf.c:484
#define RelFileLocatorBackendIsTemp(rlocator)
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:633

References RelFileLocatorBackend::backend, buftag::blockNum, BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetForkNum(), BufTagMatchesRelFileLocator(), DropRelationLocalBuffers(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, InvalidateBuffer(), InvalidBlockNumber, j, RelFileLocatorBackend::locator, LockBufHdr(), MAX_FORKNUM, MyBackendId, NBuffers, RelFileLocatorBackendIsTemp, SMgrRelationData::smgr_rlocator, smgrnblocks_cached(), BufferDesc::tag, and UnlockBufHdr().

Referenced by smgrtruncate().
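To make the threshold concrete (illustrative numbers, not from the source): with shared_buffers = 16384 pages (128 MB), BUF_DROP_FULL_SCAN_THRESHOLD is 16384 / 32 = 512, so a truncation that invalidates fewer than 512 blocks, and for which all cached fork sizes are known, takes the targeted FindAndDropRelationBuffers() path instead of scanning all 16384 buffer headers.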

◆ DropRelationsAllBuffers()

void DropRelationsAllBuffers ( SMgrRelation smgr_reln,
int  nlocators 
)

Definition at line 3720 of file bufmgr.c.

3721 {
3722  int i;
3723  int n = 0;
3724  SMgrRelation *rels;
3725  BlockNumber (*block)[MAX_FORKNUM + 1];
3726  uint64 nBlocksToInvalidate = 0;
3727  RelFileLocator *locators;
3728  bool cached = true;
3729  bool use_bsearch;
3730 
3731  if (nlocators == 0)
3732  return;
3733 
3734  rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
3735 
3736  /* If it's a local relation, it's localbuf.c's problem. */
3737  for (i = 0; i < nlocators; i++)
3738  {
3739  if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
3740  {
3741  if (smgr_reln[i]->smgr_rlocator.backend == MyBackendId)
3742  DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
3743  }
3744  else
3745  rels[n++] = smgr_reln[i];
3746  }
3747 
3748  /*
3749  * If there are no non-local relations, then we're done. Release the
3750  * memory and return.
3751  */
3752  if (n == 0)
3753  {
3754  pfree(rels);
3755  return;
3756  }
3757 
3758  /*
3759  * This is used to remember the number of blocks for all the relations
3760  * forks.
3761  */
3762  block = (BlockNumber (*)[MAX_FORKNUM + 1])
3763  palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
3764 
3765  /*
3766  * We can avoid scanning the entire buffer pool if we know the exact size
3767  * of each of the given relation forks. See DropRelationBuffers.
3768  */
3769  for (i = 0; i < n && cached; i++)
3770  {
3771  for (int j = 0; j <= MAX_FORKNUM; j++)
3772  {
3773  /* Get the number of blocks for a relation's fork. */
3774  block[i][j] = smgrnblocks_cached(rels[i], j);
3775 
3776  /* We need to only consider the relation forks that exists. */
3777  if (block[i][j] == InvalidBlockNumber)
3778  {
3779  if (!smgrexists(rels[i], j))
3780  continue;
3781  cached = false;
3782  break;
3783  }
3784 
3785  /* calculate the total number of blocks to be invalidated */
3786  nBlocksToInvalidate += block[i][j];
3787  }
3788  }
3789 
3790  /*
3791  * We apply the optimization iff the total number of blocks to invalidate
3792  * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
3793  */
3794  if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
3795  {
3796  for (i = 0; i < n; i++)
3797  {
3798  for (int j = 0; j <= MAX_FORKNUM; j++)
3799  {
3800  /* ignore relation forks that doesn't exist */
3801  if (!BlockNumberIsValid(block[i][j]))
3802  continue;
3803 
3804  /* drop all the buffers for a particular relation fork */
3805  FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
3806  j, block[i][j], 0);
3807  }
3808  }
3809 
3810  pfree(block);
3811  pfree(rels);
3812  return;
3813  }
3814 
3815  pfree(block);
3816  locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
3817  for (i = 0; i < n; i++)
3818  locators[i] = rels[i]->smgr_rlocator.locator;
3819 
3820  /*
3821  * For low number of relations to drop just use a simple walk through, to
3822  * save the bsearch overhead. The threshold to use is rather a guess than
3823  * an exactly determined value, as it depends on many factors (CPU and RAM
3824  * speeds, amount of shared buffers etc.).
3825  */
3826  use_bsearch = n > RELS_BSEARCH_THRESHOLD;
3827 
3828  /* sort the list of rlocators if necessary */
3829  if (use_bsearch)
3830  pg_qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
3831 
3832  for (i = 0; i < NBuffers; i++)
3833  {
3834  RelFileLocator *rlocator = NULL;
3835  BufferDesc *bufHdr = GetBufferDescriptor(i);
3836  uint32 buf_state;
3837 
3838  /*
3839  * As in DropRelationBuffers, an unlocked precheck should be safe and
3840  * saves some cycles.
3841  */
3842 
3843  if (!use_bsearch)
3844  {
3845  int j;
3846 
3847  for (j = 0; j < n; j++)
3848  {
3849  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
3850  {
3851  rlocator = &locators[j];
3852  break;
3853  }
3854  }
3855  }
3856  else
3857  {
3858  RelFileLocator locator;
3859 
3860  locator = BufTagGetRelFileLocator(&bufHdr->tag);
3861  rlocator = bsearch((const void *) &(locator),
3862  locators, n, sizeof(RelFileLocator),
3863  rlocator_comparator);
3864 
3865 
3866  /* buffer doesn't belong to any of the given relfilelocators; skip it */
3867  if (rlocator == NULL)
3868  continue;
3869 
3870  buf_state = LockBufHdr(bufHdr);
3871  if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
3872  InvalidateBuffer(bufHdr); /* releases spinlock */
3873  else
3874  UnlockBufHdr(bufHdr, buf_state);
3875  }
3876 
3877  pfree(locators);
3878  pfree(rels);
3879 }
#define RELS_BSEARCH_THRESHOLD
Definition: bufmgr.c:74
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition: localbuf.c:532
void pg_qsort(void *base, size_t nel, size_t elsize, int(*cmp)(const void *, const void *))

References BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), DropRelationAllLocalBuffers(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, if(), InvalidateBuffer(), InvalidBlockNumber, j, LockBufHdr(), MAX_FORKNUM, MyBackendId, NBuffers, palloc(), pfree(), pg_qsort(), RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, rlocator_comparator(), smgrexists(), smgrnblocks_cached(), BufferDesc::tag, and UnlockBufHdr().

Referenced by smgrdounlinkall().

◆ ExtendBufferedRel()

Buffer ExtendBufferedRel ( BufferManagerRelation  bmr,
ForkNumber  forkNum,
BufferAccessStrategy  strategy,
uint32  flags 
)

Definition at line 812 of file bufmgr.c.

816 {
817  Buffer buf;
818  uint32 extend_by = 1;
819 
820  ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
821  &buf, &extend_by);
822 
823  return buf;
824 }
BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:844

References buf, and ExtendBufferedRelBy().

Referenced by _bt_allocbuf(), _hash_getnewbuf(), BloomNewBuffer(), brinbuild(), brinbuildempty(), fill_seq_fork_with_data(), ginbuildempty(), GinNewBuffer(), gistbuildempty(), gistNewBuffer(), ReadBuffer_common(), revmap_physical_extend(), and SpGistNewBuffer().
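A hedged usage sketch (loosely mirroring callers like _bt_allocbuf(); rel is an assumed open Relation): extend the main fork by one block and get the new page back already exclusively locked.

 Buffer      buf;
 BlockNumber blkno;

 buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
 blkno = BufferGetBlockNumber(buf);
 /* ... initialize the new page ... */
 UnlockReleaseBuffer(buf);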

◆ ExtendBufferedRelBy()

BlockNumber ExtendBufferedRelBy ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
Buffer buffers,
uint32 extended_by 
)

Definition at line 844 of file bufmgr.c.

851 {
852  Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
853  Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
854  Assert(extend_by > 0);
855 
856  if (bmr.smgr == NULL)
857  {
858  bmr.smgr = RelationGetSmgr(bmr.rel);
859  bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
860  }
861 
862  return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
863  extend_by, InvalidBlockNumber,
864  buffers, extended_by);
865 }
static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:1782
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:572
struct SMgrRelationData * smgr
Definition: bufmgr.h:102
Form_pg_class rd_rel
Definition: rel.h:111

References Assert(), ExtendBufferedRelCommon(), InvalidBlockNumber, RelationData::rd_rel, BufferManagerRelation::rel, RelationGetSmgr(), BufferManagerRelation::relpersistence, and BufferManagerRelation::smgr.

Referenced by ExtendBufferedRel(), and RelationAddBlocks().
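A hedged multi-block sketch (loosely modeled on RelationAddBlocks(); rel, strategy and want_pages are assumed, with want_pages <= 64): the function may extend by fewer pages than requested, so callers inspect *extended_by and release the buffers they do not need yet.

 Buffer      buffers[64];
 uint32      extended_by = 0;
 BlockNumber first_block;

 first_block = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM, strategy,
                                   0 /* flags */, want_pages,
                                   buffers, &extended_by);

 /* use buffers[0] now; hand the rest back until they are needed */
 for (uint32 i = 1; i < extended_by; i++)
     ReleaseBuffer(buffers[i]);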

◆ ExtendBufferedRelCommon()

static BlockNumber ExtendBufferedRelCommon ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
BlockNumber  extend_upto,
Buffer buffers,
uint32 extended_by 
)
static

Definition at line 1782 of file bufmgr.c.

1790 {
1791  BlockNumber first_block;
1792 
1793  TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
1794  bmr.smgr->smgr_rlocator.locator.spcOid,
1795  bmr.smgr->smgr_rlocator.locator.dbOid,
1796  bmr.smgr->smgr_rlocator.locator.relNumber,
1797  bmr.smgr->smgr_rlocator.backend,
1798  extend_by);
1799 
1800  if (bmr.relpersistence == RELPERSISTENCE_TEMP)
1801  first_block = ExtendBufferedRelLocal(bmr, fork, flags,
1802  extend_by, extend_upto,
1803  buffers, &extend_by);
1804  else
1805  first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
1806  extend_by, extend_upto,
1807  buffers, &extend_by);
1808  *extended_by = extend_by;
1809 
1810  TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
1811  bmr.smgr->smgr_rlocator.locator.spcOid,
1812  bmr.smgr->smgr_rlocator.locator.dbOid,
1813  bmr.smgr->smgr_rlocator.locator.relNumber,
1814  bmr.smgr->smgr_rlocator.backend,
1815  *extended_by,
1816  first_block);
1817 
1818  return first_block;
1819 }
static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:1826
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: localbuf.c:311
RelFileNumber relNumber

References RelFileLocatorBackend::backend, RelFileLocator::dbOid, ExtendBufferedRelLocal(), ExtendBufferedRelShared(), RelFileLocatorBackend::locator, RelFileLocator::relNumber, BufferManagerRelation::relpersistence, BufferManagerRelation::smgr, SMgrRelationData::smgr_rlocator, and RelFileLocator::spcOid.

Referenced by ExtendBufferedRelBy(), and ExtendBufferedRelTo().

◆ ExtendBufferedRelShared()

static BlockNumber ExtendBufferedRelShared ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
BlockNumber  extend_upto,
Buffer buffers,
uint32 extended_by 
)
static

Definition at line 1826 of file bufmgr.c.

1834 {
1835  BlockNumber first_block;
1836  IOContext io_context = IOContextForStrategy(strategy);
1837  instr_time io_start;
1838 
1839  LimitAdditionalPins(&extend_by);
1840 
1841  /*
1842  * Acquire victim buffers for extension without holding extension lock.
1843  * Writing out victim buffers is the most expensive part of extending the
1844  * relation, particularly when doing so requires WAL flushes. Zeroing out
1845  * the buffers is also quite expensive, so do that before holding the
1846  * extension lock as well.
1847  *
1848  * These pages are pinned by us and not valid. While we hold the pin they
1849  * can't be acquired as victim buffers by another backend.
1850  */
1851  for (uint32 i = 0; i < extend_by; i++)
1852  {
1853  Block buf_block;
1854 
1855  buffers[i] = GetVictimBuffer(strategy, io_context);
1856  buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
1857 
1858  /* new buffers are zero-filled */
1859  MemSet((char *) buf_block, 0, BLCKSZ);
1860  }
1861 
1862  /* in case we need to pin an existing buffer below */
1863  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1864 
1865  /*
1866  * Lock relation against concurrent extensions, unless requested not to.
1867  *
1868  * We use the same extension lock for all forks. That's unnecessarily
1869  * restrictive, but currently extensions for forks don't happen often
1870  * enough to make it worth locking more granularly.
1871  *
1872  * Note that another backend might have extended the relation by the time
1873  * we get the lock.
1874  */
1875  if (!(flags & EB_SKIP_EXTENSION_LOCK))
1876  {
1877  LockRelationForExtension(bmr.rel, ExclusiveLock);
1878  if (bmr.rel)
1879  bmr.smgr = RelationGetSmgr(bmr.rel);
1880  }
1881 
1882  /*
1883  * If requested, invalidate size cache, so that smgrnblocks asks the
1884  * kernel.
1885  */
1886  if (flags & EB_CLEAR_SIZE_CACHE)
1887  bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
1888 
1889  first_block = smgrnblocks(bmr.smgr, fork);
1890 
1891  /*
1892  * Now that we have the accurate relation size, check if the caller wants
1893  * us to extend to only up to a specific size. If there were concurrent
1894  * extensions, we might have acquired too many buffers and need to release
1895  * them.
1896  */
1897  if (extend_upto != InvalidBlockNumber)
1898  {
1899  uint32 orig_extend_by = extend_by;
1900 
1901  if (first_block > extend_upto)
1902  extend_by = 0;
1903  else if ((uint64) first_block + extend_by > extend_upto)
1904  extend_by = extend_upto - first_block;
1905 
1906  for (uint32 i = extend_by; i < orig_extend_by; i++)
1907  {
1908  BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
1909 
1910  /*
1911  * The victim buffer we acquired previously is clean and unused,
1912  * let it be found again quickly
1913  */
1914  StrategyFreeBuffer(buf_hdr);
1915  UnpinBuffer(buf_hdr);
1916  }
1917 
1918  if (extend_by == 0)
1919  {
1920  if (!(flags & EB_SKIP_EXTENSION_LOCK))
1921  UnlockRelationForExtension(bmr.rel, ExclusiveLock);
1922  *extended_by = extend_by;
1923  return first_block;
1924  }
1925  }
1926 
1927  /* Fail if relation is already at maximum possible length */
1928  if ((uint64) first_block + extend_by >= MaxBlockNumber)
1929  ereport(ERROR,
1930  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1931  errmsg("cannot extend relation %s beyond %u blocks",
1932  relpath(bmr.smgr->smgr_rlocator, fork),
1933  MaxBlockNumber)));
1934 
1935  /*
1936  * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
1937  *
1938  * This needs to happen before we extend the relation, because as soon as
1939  * we do, other backends can start to read in those pages.
1940  */
1941  for (uint32 i = 0; i < extend_by; i++)
1942  {
1943  Buffer victim_buf = buffers[i];
1944  BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
1945  BufferTag tag;
1946  uint32 hash;
1947  LWLock *partition_lock;
1948  int existing_id;
1949 
1950  InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
1951  hash = BufTableHashCode(&tag);
1952  partition_lock = BufMappingPartitionLock(hash);
1953 
1954  LWLockAcquire(partition_lock, LW_EXCLUSIVE);
1955 
1956  existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
1957 
1958  /*
1959  * We get here only in the corner case where we are trying to extend
1960  * the relation but we found a pre-existing buffer. This can happen
1961  * because a prior attempt at extending the relation failed, and
1962  * because mdread doesn't complain about reads beyond EOF (when
1963  * zero_damaged_pages is ON) and so a previous attempt to read a block
1964  * beyond EOF could have left a "valid" zero-filled buffer.
1965  * Unfortunately, we have also seen this case occurring because of
1966  * buggy Linux kernels that sometimes return an lseek(SEEK_END) result
1967  * that doesn't account for a recent write. In that situation, the
1968  * pre-existing buffer would contain valid data that we don't want to
1969  * overwrite. Since the legitimate cases should always have left a
1970  * zero-filled buffer, complain if not PageIsNew.
1971  */
1972  if (existing_id >= 0)
1973  {
1974  BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
1975  Block buf_block;
1976  bool valid;
1977 
1978  /*
1979  * Pin the existing buffer before releasing the partition lock,
1980  * preventing it from being evicted.
1981  */
1982  valid = PinBuffer(existing_hdr, strategy);
1983 
1984  LWLockRelease(partition_lock);
1985 
1986  /*
1987  * The victim buffer we acquired previously is clean and unused,
1988  * let it be found again quickly
1989  */
1990  StrategyFreeBuffer(victim_buf_hdr);
1991  UnpinBuffer(victim_buf_hdr);
1992 
1993  buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
1994  buf_block = BufHdrGetBlock(existing_hdr);
1995 
1996  if (valid && !PageIsNew((Page) buf_block))
1997  ereport(ERROR,
1998  (errmsg("unexpected data beyond EOF in block %u of relation %s",
1999  existing_hdr->tag.blockNum, relpath(bmr.smgr->smgr_rlocator, fork)),
2000  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
2001 
2002  /*
2003  * We *must* do smgr[zero]extend before succeeding, else the page
2004  * will not be reserved by the kernel, and the next P_NEW call
2005  * will decide to return the same page. Clear the BM_VALID bit,
2006  * do StartBufferIO() and proceed.
2007  *
2008  * Loop to handle the very small possibility that someone re-sets
2009  * BM_VALID between our clearing it and StartBufferIO inspecting
2010  * it.
2011  */
2012  do
2013  {
2014  uint32 buf_state = LockBufHdr(existing_hdr);
2015 
2016  buf_state &= ~BM_VALID;
2017  UnlockBufHdr(existing_hdr, buf_state);
2018  } while (!StartBufferIO(existing_hdr, true));
2019  }
2020  else
2021  {
2022  uint32 buf_state;
2023 
2024  buf_state = LockBufHdr(victim_buf_hdr);
2025 
2026  /* some sanity checks while we hold the buffer header lock */
2027  Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2028  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2029 
2030  victim_buf_hdr->tag = tag;
2031 
2032  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2033  if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2034  buf_state |= BM_PERMANENT;
2035 
2036  UnlockBufHdr(victim_buf_hdr, buf_state);
2037 
2038  LWLockRelease(partition_lock);
2039 
2040  /* XXX: could combine the locked operations in it with the above */
2041  StartBufferIO(victim_buf_hdr, true);
2042  }
2043  }
2044 
2045  io_start = pgstat_prepare_io_time();
2046 
2047  /*
2048  * Note: if smgrzeroextend fails, we will end up with buffers that are
2049  * allocated but not marked BM_VALID. The next relation extension will
2050  * still select the same block number (because the relation didn't get any
2051  * longer on disk) and so future attempts to extend the relation will find
2052  * the same buffers (if they have not been recycled) but come right back
2053  * here to try smgrzeroextend again.
2054  *
2055  * We don't need to set checksum for all-zero pages.
2056  */
2057  smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
2058 
2059  /*
2060  * Release the file-extension lock; it's now OK for someone else to extend
2061  * the relation some more.
2062  *
2063  * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2064  * take noticeable time.
2065  */
2066  if (!(flags & EB_SKIP_EXTENSION_LOCK))
2067  UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2068 
2069  pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
2070  io_start, extend_by);
2071 
2072  /* Set BM_VALID, terminate IO, and wake up any waiters */
2073  for (uint32 i = 0; i < extend_by; i++)
2074  {
2075  Buffer buf = buffers[i];
2076  BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2077  bool lock = false;
2078 
2079  if (flags & EB_LOCK_FIRST && i == 0)
2080  lock = true;
2081  else if (flags & EB_LOCK_TARGET)
2082  {
2083  Assert(extend_upto != InvalidBlockNumber);
2084  if (first_block + i + 1 == extend_upto)
2085  lock = true;
2086  }
2087 
2088  if (lock)
2089  LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
2090 
2091  TerminateBufferIO(buf_hdr, false, BM_VALID);
2092  }
2093 
2094  pgBufferUsage.shared_blks_written += extend_by;
2095 
2096  *extended_by = extend_by;
2097 
2098  return first_block;
2099 }
#define MaxBlockNumber
Definition: block.h:35
#define BM_JUST_DIRTIED
Definition: buf_internals.h:65
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
static void LimitAdditionalPins(uint32 *additional_pins)
Definition: bufmgr.c:1751
#define BufHdrGetBlock(bufHdr)
Definition: bufmgr.c:63
void * Block
Definition: bufmgr.h:24
@ EB_LOCK_TARGET
Definition: bufmgr.h:91
@ EB_CLEAR_SIZE_CACHE
Definition: bufmgr.h:88
@ EB_SKIP_EXTENSION_LOCK
Definition: bufmgr.h:73
@ EB_LOCK_FIRST
Definition: bufmgr.h:85
Pointer Page
Definition: bufpage.h:78
static bool PageIsNew(Page page)
Definition: bufpage.h:230
#define MemSet(start, val, len)
Definition: c.h:1009
int errhint(const char *fmt,...)
Definition: elog.c:1316
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:716
BufferUsage pgBufferUsage
Definition: instrument.c:20
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:431
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:481
#define ExclusiveLock
Definition: lockdefs.h:42
@ IOOBJECT_RELATION
Definition: pgstat.h:278
IOContext
Definition: pgstat.h:285
@ IOOP_EXTEND
Definition: pgstat.h:297
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt)
Definition: pgstat_io.c:112
instr_time pgstat_prepare_io_time(void)
Definition: pgstat_io.c:96
static unsigned hash(unsigned *uv, int n)
Definition: rege_dfa.c:715
#define relpath(rlocator, forknum)
Definition: relpath.h:94
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:609
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition: smgr.c:523
int64 shared_blks_written
Definition: instrument.h:29
BlockNumber smgr_cached_nblocks[MAX_FORKNUM+1]
Definition: smgr.h:54

References Assert(), buftag::blockNum, BM_DIRTY, BM_JUST_DIRTIED, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BufferDesc::buf_id, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer(), BufferDescriptorGetContentLock(), BufHdrGetBlock, BufMappingPartitionLock(), BufTableHashCode(), BufTableInsert(), CurrentResourceOwner, EB_CLEAR_SIZE_CACHE, EB_LOCK_FIRST, EB_LOCK_TARGET, EB_SKIP_EXTENSION_LOCK, ereport, errcode(), errhint(), errmsg(), ERROR, ExclusiveLock, GetBufferDescriptor(), GetVictimBuffer(), hash(), i, INIT_FORKNUM, InitBufferTag(), InvalidBlockNumber, IOContextForStrategy(), IOOBJECT_RELATION, IOOP_EXTEND, LimitAdditionalPins(), RelFileLocatorBackend::locator, LockBufHdr(), LockRelationForExtension(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), MaxBlockNumber, MemSet, PageIsNew(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), PinBuffer(), BufferManagerRelation::rel, RelationGetSmgr(), relpath, BufferManagerRelation::relpersistence, ResourceOwnerEnlargeBuffers(), BufferUsage::shared_blks_written, BufferManagerRelation::smgr, SMgrRelationData::smgr_cached_nblocks, SMgrRelationData::smgr_rlocator, smgrnblocks(), smgrzeroextend(), StartBufferIO(), StrategyFreeBuffer(), BufferDesc::tag, TerminateBufferIO(), UnlockBufHdr(), UnlockRelationForExtension(), and UnpinBuffer().

Referenced by ExtendBufferedRelCommon().

◆ ExtendBufferedRelTo()

Buffer ExtendBufferedRelTo ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
BlockNumber  extend_to,
ReadBufferMode  mode 
)

Definition at line 876 of file bufmgr.c.

882 {
883  BlockNumber current_size;
884  uint32 extended_by = 0;
885  Buffer buffer = InvalidBuffer;
886  Buffer buffers[64];
887 
888  Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
889  Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
890  Assert(extend_to != InvalidBlockNumber && extend_to > 0);
891 
892  if (bmr.smgr == NULL)
893  {
894  bmr.smgr = RelationGetSmgr(bmr.rel);
895  bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
896  }
897 
898  /*
899  * If desired, create the file if it doesn't exist. If
900  * smgr_cached_nblocks[fork] is positive then it must exist, no need for
901  * an smgrexists call.
902  */
903  if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
904  (bmr.smgr->smgr_cached_nblocks[fork] == 0 ||
905  bmr.smgr->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
906  !smgrexists(bmr.smgr, fork))
907  {
908  LockRelationForExtension(bmr.rel, ExclusiveLock);
909 
910  /* could have been closed while waiting for lock */
911  if (bmr.rel)
912  bmr.smgr = RelationGetSmgr(bmr.rel);
913 
914  /* recheck, fork might have been created concurrently */
915  if (!smgrexists(bmr.smgr, fork))
916  smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY);
917 
918  UnlockRelationForExtension(bmr.rel, ExclusiveLock);
919  }
920 
921  /*
922  * If requested, invalidate size cache, so that smgrnblocks asks the
923  * kernel.
924  */
925  if (flags & EB_CLEAR_SIZE_CACHE)
926  bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
927 
928  /*
929  * Estimate how many pages we'll need to extend by. This avoids acquiring
930  * unnecessarily many victim buffers.
931  */
932  current_size = smgrnblocks(bmr.smgr, fork);
933 
934  /*
935  * Since no-one else can be looking at the page contents yet, there is no
936  * difference between an exclusive lock and a cleanup-strength lock. Note
937  * that we pass the original mode to ReadBuffer_common() below, when
938  * falling back to reading the buffer to a concurrent relation extension.
939  */
940  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
941  flags |= EB_LOCK_TARGET;
942 
943  while (current_size < extend_to)
944  {
945  uint32 num_pages = lengthof(buffers);
946  BlockNumber first_block;
947 
948  if ((uint64) current_size + num_pages > extend_to)
949  num_pages = extend_to - current_size;
950 
951  first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
952  num_pages, extend_to,
953  buffers, &extended_by);
954 
955  current_size = first_block + extended_by;
956  Assert(num_pages != 0 || current_size >= extend_to);
957 
958  for (uint32 i = 0; i < extended_by; i++)
959  {
960  if (first_block + i != extend_to - 1)
961  ReleaseBuffer(buffers[i]);
962  else
963  buffer = buffers[i];
964  }
965  }
966 
967  /*
968  * It's possible that another backend concurrently extended the relation.
969  * In that case read the buffer.
970  *
971  * XXX: Should we control this via a flag?
972  */
973  if (buffer == InvalidBuffer)
974  {
975  bool hit;
976 
977  Assert(extended_by == 0);
978  buffer = ReadBuffer_common(bmr.smgr, bmr.relpersistence,
979  fork, extend_to - 1, mode, strategy,
980  &hit);
981  }
982 
983  return buffer;
984 }
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4480
static Buffer ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool *hit)
Definition: bufmgr.c:992
@ EB_PERFORMING_RECOVERY
Definition: bufmgr.h:76
@ EB_CREATE_FORK_IF_NEEDED
Definition: bufmgr.h:82
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition: bufmgr.h:47
@ RBM_ZERO_AND_LOCK
Definition: bufmgr.h:45
#define lengthof(array)
Definition: c.h:777
static PgChecksumMode mode
Definition: pg_checksums.c:56
int64 current_size
Definition: pg_checksums.c:64

References Assert(), PrivateRefCountEntry::buffer, current_size, EB_CLEAR_SIZE_CACHE, EB_CREATE_FORK_IF_NEEDED, EB_LOCK_TARGET, EB_PERFORMING_RECOVERY, ExclusiveLock, ExtendBufferedRelCommon(), i, InvalidBlockNumber, InvalidBuffer, lengthof, LockRelationForExtension(), mode, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, RelationData::rd_rel, ReadBuffer_common(), BufferManagerRelation::rel, RelationGetSmgr(), ReleaseBuffer(), BufferManagerRelation::relpersistence, BufferManagerRelation::smgr, SMgrRelationData::smgr_cached_nblocks, smgrcreate(), smgrexists(), smgrnblocks(), and UnlockRelationForExtension().

Referenced by fsm_extend(), vm_extend(), and XLogReadBufferExtended().
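A hedged sketch in the style of the fsm/vm callers (smgr and vm_nblocks are assumed): ensure the fork exists and reaches at least vm_nblocks blocks, creating the fork and bypassing a stale size cache if necessary.

 Buffer buf;

 buf = ExtendBufferedRelTo(BMR_SMGR(smgr, RELPERSISTENCE_PERMANENT),
                           VISIBILITYMAP_FORKNUM, NULL,
                           EB_CREATE_FORK_IF_NEEDED | EB_CLEAR_SIZE_CACHE,
                           vm_nblocks, RBM_ZERO_ON_ERROR);
 ReleaseBuffer(buf);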

◆ FindAndDropRelationBuffers()

static void FindAndDropRelationBuffers ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  nForkBlock,
BlockNumber  firstDelBlock 
)
static

Definition at line 3891 of file bufmgr.c.

3894 {
3895  BlockNumber curBlock;
3896 
3897  for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
3898  {
3899  uint32 bufHash; /* hash value for tag */
3900  BufferTag bufTag; /* identity of requested block */
3901  LWLock *bufPartitionLock; /* buffer partition lock for it */
3902  int buf_id;
3903  BufferDesc *bufHdr;
3904  uint32 buf_state;
3905 
3906  /* create a tag so we can lookup the buffer */
3907  InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
3908 
3909  /* determine its hash code and partition lock ID */
3910  bufHash = BufTableHashCode(&bufTag);
3911  bufPartitionLock = BufMappingPartitionLock(bufHash);
3912 
3913  /* Check that it is in the buffer pool. If not, do nothing. */
3914  LWLockAcquire(bufPartitionLock, LW_SHARED);
3915  buf_id = BufTableLookup(&bufTag, bufHash);
3916  LWLockRelease(bufPartitionLock);
3917 
3918  if (buf_id < 0)
3919  continue;
3920 
3921  bufHdr = GetBufferDescriptor(buf_id);
3922 
3923  /*
3924  * We need to lock the buffer header and recheck if the buffer is
3925  * still associated with the same block because the buffer could be
3926  * evicted by some other backend loading blocks for a different
3927  * relation after we release lock on the BufMapping table.
3928  */
3929  buf_state = LockBufHdr(bufHdr);
3930 
3931  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
3932  BufTagGetForkNum(&bufHdr->tag) == forkNum &&
3933  bufHdr->tag.blockNum >= firstDelBlock)
3934  InvalidateBuffer(bufHdr); /* releases spinlock */
3935  else
3936  UnlockBufHdr(bufHdr, buf_state);
3937  }
3938 }

References buftag::blockNum, BufMappingPartitionLock(), BufTableHashCode(), BufTableLookup(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), GetBufferDescriptor(), InitBufferTag(), InvalidateBuffer(), LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), BufferDesc::tag, and UnlockBufHdr().

Referenced by DropRelationBuffers(), and DropRelationsAllBuffers().

◆ FlushBuffer()

static void FlushBuffer ( BufferDesc buf,
SMgrRelation  reln,
IOObject  io_object,
IOContext  io_context 
)
static

Definition at line 3350 of file bufmgr.c.

3352 {
3353  XLogRecPtr recptr;
3354  ErrorContextCallback errcallback;
3355  instr_time io_start;
3356  Block bufBlock;
3357  char *bufToWrite;
3358  uint32 buf_state;
3359 
3360  /*
3361  * Try to start an I/O operation. If StartBufferIO returns false, then
3362  * someone else flushed the buffer before we could, so we need not do
3363  * anything.
3364  */
3365  if (!StartBufferIO(buf, false))
3366  return;
3367 
3368  /* Setup error traceback support for ereport() */
3369  errcallback.callback = shared_buffer_write_error_callback;
3370  errcallback.arg = (void *) buf;
3371  errcallback.previous = error_context_stack;
3372  error_context_stack = &errcallback;
3373 
3374  /* Find smgr relation for buffer */
3375  if (reln == NULL)
3376  reln = smgropen(BufTagGetRelFileLocator(&buf->tag), InvalidBackendId);
3377 
3378  TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
3379  buf->tag.blockNum,
3380  reln->smgr_rlocator.locator.spcOid,
3381  reln->smgr_rlocator.locator.dbOid,
3382  reln->smgr_rlocator.locator.relNumber);
3383 
3384  buf_state = LockBufHdr(buf);
3385 
3386  /*
3387  * Run PageGetLSN while holding header lock, since we don't have the
3388  * buffer locked exclusively in all cases.
3389  */
3390  recptr = BufferGetLSN(buf);
3391 
3392  /* To check if block content changes while flushing. - vadim 01/17/97 */
3393  buf_state &= ~BM_JUST_DIRTIED;
3394  UnlockBufHdr(buf, buf_state);
3395 
3396  /*
3397  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
3398  * rule that log updates must hit disk before any of the data-file changes
3399  * they describe do.
3400  *
3401  * However, this rule does not apply to unlogged relations, which will be
3402  * lost after a crash anyway. Most unlogged relation pages do not bear
3403  * LSNs since we never emit WAL records for them, and therefore flushing
3404  * up through the buffer LSN would be useless, but harmless. However,
3405  * GiST indexes use LSNs internally to track page-splits, and therefore
3406  * unlogged GiST pages bear "fake" LSNs generated by
3407  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
3408  * LSN counter could advance past the WAL insertion point; and if it did
3409  * happen, attempting to flush WAL through that location would fail, with
3410  * disastrous system-wide consequences. To make sure that can't happen,
3411  * skip the flush if the buffer isn't permanent.
3412  */
3413  if (buf_state & BM_PERMANENT)
3414  XLogFlush(recptr);
3415 
3416  /*
3417  * Now it's safe to write buffer to disk. Note that no one else should
3418  * have been able to write it while we were busy with log flushing because
3419  * only one process at a time can set the BM_IO_IN_PROGRESS bit.
3420  */
3421  bufBlock = BufHdrGetBlock(buf);
3422 
3423  /*
3424  * Update page checksum if desired. Since we have only shared lock on the
3425  * buffer, other processes might be updating hint bits in it, so we must
3426  * copy the page to private storage if we do checksumming.
3427  */
3428  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
3429 
3430  io_start = pgstat_prepare_io_time();
3431 
3432  /*
3433  * bufToWrite is either the shared buffer or a copy, as appropriate.
3434  */
3435  smgrwrite(reln,
3436  BufTagGetForkNum(&buf->tag),
3437  buf->tag.blockNum,
3438  bufToWrite,
3439  false);
3440 
3441  /*
3442  * When a strategy is in use, only flushes of dirty buffers already in the
3443  * strategy ring are counted as strategy writes (IOCONTEXT
3444  * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
3445  * statistics tracking.
3446  *
3447  * If a shared buffer initially added to the ring must be flushed before
3448  * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
3449  *
3450  * If a shared buffer which was added to the ring later because the
3451  * current strategy buffer is pinned or in use or because all strategy
3452  * buffers were dirty and rejected (for BAS_BULKREAD operations only)
3453  * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
3454  * (from_ring will be false).
3455  *
3456  * When a strategy is not in use, the write can only be a "regular" write
3457  * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
3458  */
3459  pgstat_count_io_op_time(io_object, io_context,
3460  IOOP_WRITE, io_start, 1);
3461 
3462  pgBufferUsage.shared_blks_written++;
3463 
3464  /*
3465  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
3466  * end the BM_IO_IN_PROGRESS state.
3467  */
3468  TerminateBufferIO(buf, true, 0);
3469 
3470  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
3471  buf->tag.blockNum,
3472  reln->smgr_rlocator.locator.spcOid,
3473  reln->smgr_rlocator.locator.dbOid,
3474  reln->smgr_rlocator.locator.relNumber);
3475 
3476  /* Pop the error context stack */
3477  error_context_stack = errcallback.previous;
3478 }
#define BufferGetLSN(bufHdr)
Definition: bufmgr.c:64
static void shared_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:5233
char * PageSetChecksumCopy(Page page, BlockNumber blkno)
Definition: bufpage.c:1510
ErrorContextCallback * error_context_stack
Definition: elog.c:95
@ IOOP_WRITE
Definition: pgstat.h:302
void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.c:584
struct ErrorContextCallback * previous
Definition: elog.h:295
void(* callback)(void *arg)
Definition: elog.h:296
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2535

References ErrorContextCallback::arg, BM_JUST_DIRTIED, BM_PERMANENT, buf, BufferGetLSN, BufHdrGetBlock, BufTagGetForkNum(), BufTagGetRelFileLocator(), ErrorContextCallback::callback, RelFileLocator::dbOid, error_context_stack, InvalidBackendId, IOOBJECT_RELATION, IOOP_WRITE, RelFileLocatorBackend::locator, LockBufHdr(), PageSetChecksumCopy(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), ErrorContextCallback::previous, RelFileLocator::relNumber, BufferUsage::shared_blks_written, shared_buffer_write_error_callback(), SMgrRelationData::smgr_rlocator, smgropen(), smgrwrite(), RelFileLocator::spcOid, StartBufferIO(), TerminateBufferIO(), UnlockBufHdr(), and XLogFlush().

Referenced by FlushDatabaseBuffers(), FlushOneBuffer(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetVictimBuffer(), and SyncOneBuffer().
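The caller protocol, as the Flush*Buffers() routines below illustrate, is to hold a pin and at least a shared content lock before calling. A condensed sketch of that pattern (bufHdr is an assumed BufferDesc pointer whose header spinlock is currently held):

 PinBuffer_Locked(bufHdr);       /* converts the held header spinlock into a pin */
 LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
 FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
 LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
 UnpinBuffer(bufHdr);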

◆ FlushDatabaseBuffers()

void FlushDatabaseBuffers ( Oid  dbid)

Definition at line 4417 of file bufmgr.c.

4418 {
4419  int i;
4420  BufferDesc *bufHdr;
4421 
4422  /* Make sure we can handle the pin inside the loop */
4423  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
4424 
4425  for (i = 0; i < NBuffers; i++)
4426  {
4427  uint32 buf_state;
4428 
4429  bufHdr = GetBufferDescriptor(i);
4430 
4431  /*
4432  * As in DropRelationBuffers, an unlocked precheck should be safe and
4433  * saves some cycles.
4434  */
4435  if (bufHdr->tag.dbOid != dbid)
4436  continue;
4437 
4438  ReservePrivateRefCountEntry();
4439 
4440  buf_state = LockBufHdr(bufHdr);
4441  if (bufHdr->tag.dbOid == dbid &&
4442  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4443  {
4444  PinBuffer_Locked(bufHdr);
4445  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4446  FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4447  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4448  UnpinBuffer(bufHdr);
4449  }
4450  else
4451  UnlockBufHdr(bufHdr, buf_state);
4452  }
4453 }
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition: bufmgr.c:3350
static void PinBuffer_Locked(BufferDesc *buf)
Definition: bufmgr.c:2336
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:214

References BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock(), CurrentResourceOwner, buftag::dbOid, FlushBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by dbase_redo().

◆ FlushOneBuffer()

void FlushOneBuffer ( Buffer  buffer)

Definition at line 4460 of file bufmgr.c.

4461 {
4462  BufferDesc *bufHdr;
4463 
4464  /* currently not needed, but no fundamental reason not to support */
4465  Assert(!BufferIsLocal(buffer));
4466 
4467  Assert(BufferIsPinned(buffer));
4468 
4469  bufHdr = GetBufferDescriptor(buffer - 1);
4470 
4471  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
4472 
4473  FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4474 }
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1920

References Assert(), PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, FlushBuffer(), GetBufferDescriptor(), IOCONTEXT_NORMAL, IOOBJECT_RELATION, and LWLockHeldByMe().

Referenced by hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), and XLogReadBufferForRedoExtended().
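A hedged redo-side sketch (in the spirit of the hash_xlog_* callers; buffer is an assumed, pinned buffer): with the content lock held, write a just-(re)initialized page out immediately.

 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 /* ... (re)initialize the page contents ... */
 MarkBufferDirty(buffer);
 FlushOneBuffer(buffer);        /* the content lock must still be held here */
 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);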

◆ FlushRelationBuffers()

void FlushRelationBuffers ( Relation  rel)

Definition at line 4058 of file bufmgr.c.

4059 {
4060  int i;
4061  BufferDesc *bufHdr;
4062 
4063  if (RelationUsesLocalBuffers(rel))
4064  {
4065  for (i = 0; i < NLocBuffer; i++)
4066  {
4067  uint32 buf_state;
4068  instr_time io_start;
4069 
4070  bufHdr = GetLocalBufferDescriptor(i);
4071  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4072  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
4073  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4074  {
4075  ErrorContextCallback errcallback;
4076  Page localpage;
4077 
4078  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
4079 
4080  /* Setup error traceback support for ereport() */
4081  errcallback.callback = local_buffer_write_error_callback;
4082  errcallback.arg = (void *) bufHdr;
4083  errcallback.previous = error_context_stack;
4084  error_context_stack = &errcallback;
4085 
4086  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
4087 
4088  io_start = pgstat_prepare_io_time();
4089 
4090  smgrwrite(RelationGetSmgr(rel),
4091  BufTagGetForkNum(&bufHdr->tag),
4092  bufHdr->tag.blockNum,
4093  localpage,
4094  false);
4095 
4096  pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION,
4097  IOCONTEXT_NORMAL, IOOP_WRITE,
4098  io_start, 1);
4099 
4100  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
4101  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
4102 
4103  pgBufferUsage.local_blks_written++;
4104 
4105  /* Pop the error context stack */
4106  error_context_stack = errcallback.previous;
4107  }
4108  }
4109 
4110  return;
4111  }
4112 
4113  /* Make sure we can handle the pin inside the loop */
4114  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
4115 
4116  for (i = 0; i < NBuffers; i++)
4117  {
4118  uint32 buf_state;
4119 
4120  bufHdr = GetBufferDescriptor(i);
4121 
4122  /*
4123  * As in DropRelationBuffers, an unlocked precheck should be safe and
4124  * saves some cycles.
4125  */
4126  if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
4127  continue;
4128 
4129  ReservePrivateRefCountEntry();
4130 
4131  buf_state = LockBufHdr(bufHdr);
4132  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4133  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4134  {
4135  PinBuffer_Locked(bufHdr);
4136  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4137  FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4138  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4139  UnpinBuffer(bufHdr);
4140  }
4141  else
4142  UnlockBufHdr(bufHdr, buf_state);
4143  }
4144 }
static void pg_atomic_unlocked_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:272
#define LocalBufHdrGetBlock(bufHdr)
Definition: bufmgr.c:67
static void local_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:5253
void PageSetChecksumInplace(Page page, BlockNumber blkno)
Definition: bufpage.c:1542
int NLocBuffer
Definition: localbuf.c:42
@ IOOBJECT_TEMP_RELATION
Definition: pgstat.h:279
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:638
int64 local_blks_written
Definition: instrument.h:33
RelFileLocator rd_locator
Definition: rel.h:57

References ErrorContextCallback::arg, buftag::blockNum, BM_DIRTY, BM_JUST_DIRTIED, BM_VALID, BufferDescriptorGetContentLock(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), ErrorContextCallback::callback, CurrentResourceOwner, error_context_stack, FlushBuffer(), GetBufferDescriptor(), GetLocalBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_WRITE, BufferUsage::local_blks_written, local_buffer_write_error_callback(), LocalBufHdrGetBlock, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, NLocBuffer, PageSetChecksumInplace(), pg_atomic_read_u32(), pg_atomic_unlocked_write_u32(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), PinBuffer_Locked(), ErrorContextCallback::previous, RelationData::rd_locator, RelationGetSmgr(), RelationUsesLocalBuffers, ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), smgrwrite(), BufferDesc::state, BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by fill_seq_with_data(), heapam_relation_copy_data(), and index_copy_data().

◆ FlushRelationsAllBuffers()

void FlushRelationsAllBuffers ( SMgrRelation smgrs,
int  nrels 
)

Definition at line 4156 of file bufmgr.c.

4157 {
4158  int i;
4159  SMgrSortArray *srels;
4160  bool use_bsearch;
4161 
4162  if (nrels == 0)
4163  return;
4164 
4165  /* fill-in array for qsort */
4166  srels = palloc(sizeof(SMgrSortArray) * nrels);
4167 
4168  for (i = 0; i < nrels; i++)
4169  {
4170  Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
4171 
4172  srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
4173  srels[i].srel = smgrs[i];
4174  }
4175 
4176  /*
4177  * Save the bsearch overhead for low number of relations to sync. See
4178  * DropRelationsAllBuffers for details.
4179  */
4180  use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
4181 
4182  /* sort the list of SMgrRelations if necessary */
4183  if (use_bsearch)
4184  pg_qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
4185 
4186  /* Make sure we can handle the pin inside the loop */
4187  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
4188 
4189  for (i = 0; i < NBuffers; i++)
4190  {
4191  SMgrSortArray *srelent = NULL;
4192  BufferDesc *bufHdr = GetBufferDescriptor(i);
4193  uint32 buf_state;
4194 
4195  /*
4196  * As in DropRelationBuffers, an unlocked precheck should be safe and
4197  * saves some cycles.
4198  */
4199 
4200  if (!use_bsearch)
4201  {
4202  int j;
4203 
4204  for (j = 0; j < nrels; j++)
4205  {
4206  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
4207  {
4208  srelent = &srels[j];
4209  break;
4210  }
4211  }
4212  }
4213  else
4214  {
4215  RelFileLocator rlocator;
4216 
4217  rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4218  srelent = bsearch((const void *) &(rlocator),
4219  srels, nrels, sizeof(SMgrSortArray),
4220  rlocator_comparator);
4221  }
4222 
4223  /* buffer doesn't belong to any of the given relfilelocators; skip it */
4224  if (srelent == NULL)
4225  continue;
4226 
4227  ReservePrivateRefCountEntry();
4228 
4229  buf_state = LockBufHdr(bufHdr);
4230  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
4231  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4232  {
4233  PinBuffer_Locked(bufHdr);
4234  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4235  FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4236  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4237  UnpinBuffer(bufHdr);
4238  }
4239  else
4240  UnlockBufHdr(bufHdr, buf_state);
4241  }
4242 
4243  pfree(srels);
4244 }
SMgrRelation srel
Definition: bufmgr.c:131
RelFileLocator rlocator
Definition: bufmgr.c:130

References Assert(), BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock(), BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), CurrentResourceOwner, FlushBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, j, RelFileLocatorBackend::locator, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, palloc(), pfree(), pg_qsort(), PinBuffer_Locked(), RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), SMgrSortArray::rlocator, rlocator_comparator(), SMgrRelationData::smgr_rlocator, SMgrSortArray::srel, BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by smgrdosyncall().
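
A minimal caller-side sketch, in the spirit of smgrdosyncall() (the function name, array, count, and fork choice are illustrative; error handling is omitted):

    void
    flush_and_sync_rels(SMgrRelation *srels, int nrels)
    {
        /* write back every dirty shared buffer belonging to these relations */
        FlushRelationsAllBuffers(srels, nrels);

        /* then force the written data to stable storage, one relation at a time */
        for (int i = 0; i < nrels; i++)
            smgrimmedsync(srels[i], MAIN_FORKNUM);
    }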

◆ ForgetPrivateRefCountEntry()

static void ForgetPrivateRefCountEntry ( PrivateRefCountEntry ref)
static

Definition at line 403 of file bufmgr.c.

404 {
405  Assert(ref->refcount == 0);
406 
407  if (ref >= &PrivateRefCountArray[0] &&
408  ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
409  {
410  ref->buffer = InvalidBuffer;
411 
412  /*
413  * Mark the just used entry as reserved - in many scenarios that
414  * allows us to avoid ever having to search the array/hash for free
415  * entries.
416  */
417  ReservedRefCountEntry = ref;
418  }
419  else
420  {
421  bool found;
422  Buffer buffer = ref->buffer;
423 
424  hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
425  Assert(found);
426 
427  PrivateRefCountOverflowed--;
428  }
429 }
static PrivateRefCountEntry * ReservedRefCountEntry
Definition: bufmgr.c:200
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:953
@ HASH_REMOVE
Definition: hsearch.h:115

References Assert(), PrivateRefCountEntry::buffer, HASH_REMOVE, hash_search(), InvalidBuffer, PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, REFCOUNT_ARRAY_ENTRIES, and ReservedRefCountEntry.

Referenced by UnpinBuffer().

◆ GetPrivateRefCount()

static int32 GetPrivateRefCount ( Buffer  buffer)
inlinestatic

Definition at line 380 of file bufmgr.c.

381 {
382  PrivateRefCountEntry *ref;
383 
384  Assert(BufferIsValid(buffer));
385  Assert(!BufferIsLocal(buffer));
386 
387  /*
388  * Not moving the entry - that's ok for the current users, but we might
389  * want to change this one day.
390  */
391  ref = GetPrivateRefCountEntry(buffer, false);
392 
393  if (ref == NULL)
394  return 0;
395  return ref->refcount;
396 }
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition: bufmgr.c:306

References Assert(), PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), GetPrivateRefCountEntry(), and PrivateRefCountEntry::refcount.

Referenced by CheckBufferIsPinnedOnce(), ConditionalLockBufferForCleanup(), HoldingBufferPinThatDelaysRecovery(), InvalidateBuffer(), InvalidateVictimBuffer(), IsBufferCleanupOK(), MarkBufferDirtyHint(), PrintBufferLeakWarning(), and ReadRecentBuffer().

◆ GetPrivateRefCountEntry()

static PrivateRefCountEntry * GetPrivateRefCountEntry ( Buffer  buffer,
bool  do_move 
)
static

Definition at line 306 of file bufmgr.c.

307 {
308  PrivateRefCountEntry *res;
309  int i;
310 
311  Assert(BufferIsValid(buffer));
312  Assert(!BufferIsLocal(buffer));
313 
314  /*
315  * First search for references in the array, that'll be sufficient in the
316  * majority of cases.
317  */
318  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
319  {
320  res = &PrivateRefCountArray[i];
321 
322  if (res->buffer == buffer)
323  return res;
324  }
325 
326  /*
327  * By here we know that the buffer, if already pinned, isn't residing in
328  * the array.
329  *
330  * Only look up the buffer in the hashtable if we've previously overflowed
331  * into it.
332  */
333  if (PrivateRefCountOverflowed == 0)
334  return NULL;
335 
336  res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
337 
338  if (res == NULL)
339  return NULL;
340  else if (!do_move)
341  {
342  /* caller doesn't want us to move the hash entry into the array */
343  return res;
344  }
345  else
346  {
347  /* move buffer from hashtable into the free array slot */
348  bool found;
349  PrivateRefCountEntry *free;
350 
351  /* Ensure there's a free array slot */
352  ReservePrivateRefCountEntry();
353 
354  /* Use up the reserved slot */
355  Assert(ReservedRefCountEntry != NULL);
356  free = ReservedRefCountEntry;
357  ReservedRefCountEntry = NULL;
358  Assert(free->buffer == InvalidBuffer);
359 
360  /* and fill it */
361  free->buffer = buffer;
362  free->refcount = res->refcount;
363 
364  /* delete from hashtable */
365  hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
366  Assert(found);
367  Assert(PrivateRefCountOverflowed > 0);
368  PrivateRefCountOverflowed--;
369 
370  return free;
371  }
372 }
#define free(a)
Definition: header.h:65
@ HASH_FIND
Definition: hsearch.h:113

References Assert(), PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), free, HASH_FIND, HASH_REMOVE, hash_search(), i, InvalidBuffer, PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, res, ReservedRefCountEntry, and ReservePrivateRefCountEntry().

Referenced by GetPrivateRefCount(), IncrBufferRefCount(), PinBuffer(), PinBuffer_Locked(), and UnpinBuffer().

◆ GetVictimBuffer()

static Buffer GetVictimBuffer ( BufferAccessStrategy  strategy,
IOContext  io_context 
)
static

Definition at line 1585 of file bufmgr.c.

1586 {
1587  BufferDesc *buf_hdr;
1588  Buffer buf;
1589  uint32 buf_state;
1590  bool from_ring;
1591 
1592  /*
1593  * Ensure, while the spinlock's not yet held, that there's a free refcount
1594  * entry.
1595  */
1596  ReservePrivateRefCountEntry();
1597  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
1598 
1599  /* we return here if a prospective victim buffer gets used concurrently */
1600 again:
1601 
1602  /*
1603  * Select a victim buffer. The buffer is returned with its header
1604  * spinlock still held!
1605  */
1606  buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
1607  buf = BufferDescriptorGetBuffer(buf_hdr);
1608 
1609  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1610 
1611  /* Pin the buffer and then release the buffer spinlock */
1612  PinBuffer_Locked(buf_hdr);
1613 
1614  /*
1615  * We shouldn't have any other pins for this buffer.
1616  */
1617  CheckBufferIsPinnedOnce(buf);
1618 
1619  /*
1620  * If the buffer was dirty, try to write it out. There is a race
1621  * condition here, in that someone might dirty it after we released the
1622  * buffer header lock above, or even while we are writing it out (since
1623  * our share-lock won't prevent hint-bit updates). We will recheck the
1624  * dirty bit after re-locking the buffer header.
1625  */
1626  if (buf_state & BM_DIRTY)
1627  {
1628  LWLock *content_lock;
1629 
1630  Assert(buf_state & BM_TAG_VALID);
1631  Assert(buf_state & BM_VALID);
1632 
1633  /*
1634  * We need a share-lock on the buffer contents to write it out (else
1635  * we might write invalid data, eg because someone else is compacting
1636  * the page contents while we write). We must use a conditional lock
1637  * acquisition here to avoid deadlock. Even though the buffer was not
1638  * pinned (and therefore surely not locked) when StrategyGetBuffer
1639  * returned it, someone else could have pinned and exclusive-locked it
1640  * by the time we get here. If we try to get the lock unconditionally,
1641  * we'd block waiting for them; if they later block waiting for us,
1642  * deadlock ensues. (This has been observed to happen when two
1643  * backends are both trying to split btree index pages, and the second
1644  * one just happens to be trying to split the page the first one got
1645  * from StrategyGetBuffer.)
1646  */
1647  content_lock = BufferDescriptorGetContentLock(buf_hdr);
1648  if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
1649  {
1650  /*
1651  * Someone else has locked the buffer, so give it up and loop back
1652  * to get another one.
1653  */
1654  UnpinBuffer(buf_hdr);
1655  goto again;
1656  }
1657 
1658  /*
1659  * If using a nondefault strategy, and writing the buffer would
1660  * require a WAL flush, let the strategy decide whether to go ahead
1661  * and write/reuse the buffer or to choose another victim. We need a
1662  * lock to inspect the page LSN, so this can't be done inside
1663  * StrategyGetBuffer.
1664  */
1665  if (strategy != NULL)
1666  {
1667  XLogRecPtr lsn;
1668 
1669  /* Read the LSN while holding buffer header lock */
1670  buf_state = LockBufHdr(buf_hdr);
1671  lsn = BufferGetLSN(buf_hdr);
1672  UnlockBufHdr(buf_hdr, buf_state);
1673 
1674  if (XLogNeedsFlush(lsn)
1675  && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
1676  {
1677  LWLockRelease(content_lock);
1678  UnpinBuffer(buf_hdr);
1679  goto again;
1680  }
1681  }
1682 
1683  /* OK, do the I/O */
1684  FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
1685  LWLockRelease(content_lock);
1686 
1687  ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
1688  &buf_hdr->tag);
1689  }
1690 
1691 
1692  if (buf_state & BM_VALID)
1693  {
1694  /*
1695  * When a BufferAccessStrategy is in use, blocks evicted from shared
1696  * buffers are counted as IOOP_EVICT in the corresponding context
1697  * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
1698  * strategy in two cases: 1) while initially claiming buffers for the
1699  * strategy ring 2) to replace an existing strategy ring buffer
1700  * because it is pinned or in use and cannot be reused.
1701  *
1702  * Blocks evicted from buffers already in the strategy ring are
1703  * counted as IOOP_REUSE in the corresponding strategy context.
1704  *
1705  * At this point, we can accurately count evictions and reuses,
1706  * because we have successfully claimed the valid buffer. Previously,
1707  * we may have been forced to release the buffer due to concurrent
1708  * pinners or erroring out.
1709  */
1710  pgstat_count_io_op(IOOBJECT_RELATION, io_context,
1711  from_ring ? IOOP_REUSE : IOOP_EVICT);
1712  }
1713 
1714  /*
1715  * If the buffer has an entry in the buffer mapping table, delete it. This
1716  * can fail because another backend could have pinned or dirtied the
1717  * buffer.
1718  */
1719  if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
1720  {
1721  UnpinBuffer(buf_hdr);
1722  goto again;
1723  }
1724 
1725  /* a final set of sanity checks */
1726 #ifdef USE_ASSERT_CHECKING
1727  buf_state = pg_atomic_read_u32(&buf_hdr->state);
1728 
1729  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
1730  Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
1731 
1732  CheckBufferIsPinnedOnce(buf);
1733 #endif
1734 
1735  return buf;
1736 }
WritebackContext BackendWritebackContext
Definition: buf_init.c:24
void CheckBufferIsPinnedOnce(Buffer buffer)
Definition: bufmgr.c:4762
static bool InvalidateVictimBuffer(BufferDesc *buf_hdr)
Definition: bufmgr.c:1517
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition: bufmgr.c:5454
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
Definition: freelist.c:196
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition: freelist.c:756
@ IOOP_EVICT
Definition: pgstat.h:296
@ IOOP_REUSE
Definition: pgstat.h:301
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op)
Definition: pgstat_io.c:77
bool XLogNeedsFlush(XLogRecPtr record)
Definition: xlog.c:2864

References Assert(), BackendWritebackContext, BM_DIRTY, BM_TAG_VALID, BM_VALID, buf, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetBuffer(), BufferDescriptorGetContentLock(), BufferGetLSN, CheckBufferIsPinnedOnce(), CurrentResourceOwner, FlushBuffer(), InvalidateVictimBuffer(), IOOBJECT_RELATION, IOOP_EVICT, IOOP_REUSE, LockBufHdr(), LW_SHARED, LWLockConditionalAcquire(), LWLockRelease(), pg_atomic_read_u32(), pgstat_count_io_op(), PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlargeBuffers(), ScheduleBufferTagForWriteback(), BufferDesc::state, StrategyGetBuffer(), StrategyRejectBuffer(), BufferDesc::tag, UnlockBufHdr(), UnpinBuffer(), and XLogNeedsFlush().

Referenced by BufferAlloc(), and ExtendBufferedRelShared().
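
Caller-side sketch (illustrative only; the real logic lives in BufferAlloc() and ExtendBufferedRelShared()): the victim comes back pinned, with no tag and no valid contents, ready to be re-tagged for the block the caller wants.

    Buffer      victim_buf = GetVictimBuffer(strategy, io_context);
    BufferDesc *victim_hdr = GetBufferDescriptor(victim_buf - 1);

    /* ... insert the desired BufferTag into the buffer mapping table,     */
    /* ... set BM_TAG_VALID, and start the read of the requested block ... */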

◆ HoldingBufferPinThatDelaysRecovery()

bool HoldingBufferPinThatDelaysRecovery ( void  )

Definition at line 4930 of file bufmgr.c.

4931 {
4932  int bufid = GetStartupBufferPinWaitBufId();
4933 
4934  /*
4935  * If we get woken slowly then it's possible that the Startup process was
4936  * already woken by other backends before we got here. Also possible that
4937  * we get here by multiple interrupts or interrupts at inappropriate
4938  * times, so make sure we do nothing if the bufid is not set.
4939  */
4940  if (bufid < 0)
4941  return false;
4942 
4943  if (GetPrivateRefCount(bufid + 1) > 0)
4944  return true;
4945 
4946  return false;
4947 }
int GetStartupBufferPinWaitBufId(void)
Definition: proc.c:639

References GetPrivateRefCount(), and GetStartupBufferPinWaitBufId().

Referenced by CheckRecoveryConflictDeadlock(), and ProcessRecoveryConflictInterrupt().

◆ IncrBufferRefCount()

void IncrBufferRefCount ( Buffer  buffer)

Definition at line 4512 of file bufmgr.c.

4513 {
4514  Assert(BufferIsPinned(buffer));
4515  ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
4516  if (BufferIsLocal(buffer))
4517  LocalRefCount[-buffer - 1]++;
4518  else
4519  {
4520  PrivateRefCountEntry *ref;
4521 
4522  ref = GetPrivateRefCountEntry(buffer, true);
4523  Assert(ref != NULL);
4524  ref->refcount++;
4525  }
4526  ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
4527 }
void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
Definition: resowner.c:985

References Assert(), PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, CurrentResourceOwner, GetPrivateRefCountEntry(), LocalRefCount, PrivateRefCountEntry::refcount, ResourceOwnerEnlargeBuffers(), and ResourceOwnerRememberBuffer().

Referenced by _bt_steppage(), btrestrpos(), entryLoadMoreItems(), ReadBufferBI(), RelationAddBlocks(), scanPostingTree(), startScanEntry(), and tts_buffer_heap_store_tuple().
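
Usage sketch (illustrative): taking a second, independent pin on an already-pinned buffer, as a caller like tts_buffer_heap_store_tuple() does so that two owners can release their pins separately.

    IncrBufferRefCount(buffer);     /* buffer must already be pinned by this backend */
    /* ... hand the buffer to another data structure ... */
    ReleaseBuffer(buffer);          /* each pin is dropped independently later */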

◆ InitBufferPoolAccess()

void InitBufferPoolAccess ( void  )

Definition at line 3149 of file bufmgr.c.

3150 {
3151  HASHCTL hash_ctl;
3152 
3153  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
3154 
3155  hash_ctl.keysize = sizeof(int32);
3156  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
3157 
3158  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
3159  HASH_ELEM | HASH_BLOBS);
3160 
3161  /*
3162  * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
3163  * the corresponding phase of backend shutdown.
3164  */
3165  Assert(MyProc != NULL);
3166  on_shmem_exit(AtProcExit_Buffers, 0);
3167 }
static void AtProcExit_Buffers(int code, Datum arg)
Definition: bufmgr.c:3174
struct PrivateRefCountEntry PrivateRefCountEntry
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:350
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:361
PGPROC * MyProc
Definition: proc.c:66
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76

References Assert(), AtProcExit_Buffers(), HASHCTL::entrysize, HASH_BLOBS, hash_create(), HASH_ELEM, HASHCTL::keysize, MyProc, on_shmem_exit(), PrivateRefCountArray, and PrivateRefCountHash.

Referenced by BaseInit().

◆ InvalidateBuffer()

static void InvalidateBuffer ( BufferDesc buf)
static

Definition at line 1419 of file bufmgr.c.

1420 {
1421  BufferTag oldTag;
1422  uint32 oldHash; /* hash value for oldTag */
1423  LWLock *oldPartitionLock; /* buffer partition lock for it */
1424  uint32 oldFlags;
1425  uint32 buf_state;
1426 
1427  /* Save the original buffer tag before dropping the spinlock */
1428  oldTag = buf->tag;
1429 
1430  buf_state = pg_atomic_read_u32(&buf->state);
1431  Assert(buf_state & BM_LOCKED);
1432  UnlockBufHdr(buf, buf_state);
1433 
1434  /*
1435  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1436  * worth storing the hashcode in BufferDesc so we need not recompute it
1437  * here? Probably not.
1438  */
1439  oldHash = BufTableHashCode(&oldTag);
1440  oldPartitionLock = BufMappingPartitionLock(oldHash);
1441 
1442 retry:
1443 
1444  /*
1445  * Acquire exclusive mapping lock in preparation for changing the buffer's
1446  * association.
1447  */
1448  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1449 
1450  /* Re-lock the buffer header */
1451  buf_state = LockBufHdr(buf);
1452 
1453  /* If it's changed while we were waiting for lock, do nothing */
1454  if (!BufferTagsEqual(&buf->tag, &oldTag))
1455  {
1456  UnlockBufHdr(buf, buf_state);
1457  LWLockRelease(oldPartitionLock);
1458  return;
1459  }
1460 
1461  /*
1462  * We assume the only reason for it to be pinned is that someone else is
1463  * flushing the page out. Wait for them to finish. (This could be an
1464  * infinite loop if the refcount is messed up... it would be nice to time
1465  * out after awhile, but there seems no way to be sure how many loops may
1466  * be needed. Note that if the other guy has pinned the buffer but not
1467  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1468  * be busy-looping here.)
1469  */
1470  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1471  {
1472  UnlockBufHdr(buf, buf_state);
1473  LWLockRelease(oldPartitionLock);
1474  /* safety check: should definitely not be our *own* pin */
1475  if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
1476  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1477  WaitIO(buf);
1478  goto retry;
1479  }
1480 
1481  /*
1482  * Clear out the buffer's tag and flags. We must do this to ensure that
1483  * linear scans of the buffer array don't think the buffer is valid.
1484  */
1485  oldFlags = buf_state & BUF_FLAG_MASK;
1486  ClearBufferTag(&buf->tag);
1487  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1488  UnlockBufHdr(buf, buf_state);
1489 
1490  /*
1491  * Remove the buffer from the lookup hashtable, if it was in there.
1492  */
1493  if (oldFlags & BM_TAG_VALID)
1494  BufTableDelete(&oldTag, oldHash);
1495 
1496  /*
1497  * Done with mapping lock.
1498  */
1499  LWLockRelease(oldPartitionLock);
1500 
1501  /*
1502  * Insert the buffer at the head of the list of free buffers.
1503  */
1504  StrategyFreeBuffer(buf);
1505 }
#define BUF_USAGECOUNT_MASK
Definition: buf_internals.h:44
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
#define BUF_FLAG_MASK
Definition: buf_internals.h:47
#define BM_LOCKED
Definition: buf_internals.h:59
static void ClearBufferTag(BufferTag *tag)
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:149
static void WaitIO(BufferDesc *buf)
Definition: bufmgr.c:5066

References Assert(), BM_LOCKED, BM_TAG_VALID, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer(), BufferTagsEqual(), BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), ClearBufferTag(), elog(), ERROR, GetPrivateRefCount(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u32(), StrategyFreeBuffer(), UnlockBufHdr(), and WaitIO().

Referenced by DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), and FindAndDropRelationBuffers().

◆ InvalidateVictimBuffer()

static bool InvalidateVictimBuffer ( BufferDesc buf_hdr)
static

Definition at line 1517 of file bufmgr.c.

1518 {
1519  uint32 buf_state;
1520  uint32 hash;
1521  LWLock *partition_lock;
1522  BufferTag tag;
1523 
1524  Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1);
1525 
1526  /* have buffer pinned, so it's safe to read tag without lock */
1527  tag = buf_hdr->tag;
1528 
1529  hash = BufTableHashCode(&tag);
1530  partition_lock = BufMappingPartitionLock(hash);
1531 
1532  LWLockAcquire(partition_lock, LW_EXCLUSIVE);
1533 
1534  /* lock the buffer header */
1535  buf_state = LockBufHdr(buf_hdr);
1536 
1537  /*
1538  * We have the buffer pinned nobody else should have been able to unset
1539  * this concurrently.
1540  */
1541  Assert(buf_state & BM_TAG_VALID);
1542  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1543  Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
1544 
1545  /*
1546  * If somebody else pinned the buffer since, or even worse, dirtied it,
1547  * give up on this buffer: It's clearly in use.
1548  */
1549  if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
1550  {
1551  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1552 
1553  UnlockBufHdr(buf_hdr, buf_state);
1554  LWLockRelease(partition_lock);
1555 
1556  return false;
1557  }
1558 
1559  /*
1560  * Clear out the buffer's tag and flags and usagecount. This is not
1561  * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
1562  * doing anything with the buffer. But currently it's beneficial, as the
1563  * cheaper pre-check for several linear scans of shared buffers use the
1564  * tag (see e.g. FlushDatabaseBuffers()).
1565  */
1566  ClearBufferTag(&buf_hdr->tag);
1567  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1568  UnlockBufHdr(buf_hdr, buf_state);
1569 
1570  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1571 
1572  /* finally delete buffer from the buffer mapping table */
1573  BufTableDelete(&tag, hash);
1574 
1575  LWLockRelease(partition_lock);
1576 
1577  Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
1578  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1579  Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u32(&buf_hdr->state)) > 0);
1580 
1581  return true;
1582 }

References Assert(), BM_DIRTY, BM_TAG_VALID, BM_VALID, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer(), BufferTagsEqual(), BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), ClearBufferTag(), GetPrivateRefCount(), hash(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u32(), BufferDesc::state, BufferDesc::tag, and UnlockBufHdr().

Referenced by GetVictimBuffer().

◆ IsBufferCleanupOK()

bool IsBufferCleanupOK ( Buffer  buffer)

Definition at line 5012 of file bufmgr.c.

5013 {
5014  BufferDesc *bufHdr;
5015  uint32 buf_state;
5016 
5017  Assert(BufferIsValid(buffer));
5018 
5019  if (BufferIsLocal(buffer))
5020  {
5021  /* There should be exactly one pin */
5022  if (LocalRefCount[-buffer - 1] != 1)
5023  return false;
5024  /* Nobody else to wait for */
5025  return true;
5026  }
5027 
5028  /* There should be exactly one local pin */
5029  if (GetPrivateRefCount(buffer) != 1)
5030  return false;
5031 
5032  bufHdr = GetBufferDescriptor(buffer - 1);
5033 
5034  /* caller must hold exclusive lock on buffer */
5035  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
5036  LW_EXCLUSIVE));
5037 
5038  buf_state = LockBufHdr(bufHdr);
5039 
5040  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5041  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5042  {
5043  /* pincount is OK. */
5044  UnlockBufHdr(bufHdr, buf_state);
5045  return true;
5046  }
5047 
5048  UnlockBufHdr(bufHdr, buf_state);
5049  return false;
5050 }
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1964

References Assert(), BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsValid(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBufHdr(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), and UnlockBufHdr().

Referenced by _hash_doinsert(), _hash_expandtable(), _hash_splitbucket(), and hashbucketcleanup().
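
Usage sketch (illustrative; "buf" is a buffer this backend has pinned and exclusively locked), following the hash index callers listed above:

    if (IsBufferCleanupOK(buf))
    {
        /* no other backend holds a pin: page-level cleanup is safe */
    }
    else
    {
        /* skip or defer the cleanup; someone else still has the page pinned */
    }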

◆ IssuePendingWritebacks()

void IssuePendingWritebacks ( WritebackContext wb_context,
IOContext  io_context 
)

Definition at line 5499 of file bufmgr.c.

5500 {
5501  instr_time io_start;
5502  int i;
5503 
5504  if (wb_context->nr_pending == 0)
5505  return;
5506 
5507  /*
5508  * Executing the writes in-order can make them a lot faster, and allows to
5509  * merge writeback requests to consecutive blocks into larger writebacks.
5510  */
5511  sort_pending_writebacks(wb_context->pending_writebacks,
5512  wb_context->nr_pending);
5513 
5514  io_start = pgstat_prepare_io_time();
5515 
5516  /*
5517  * Coalesce neighbouring writes, but nothing else. For that we iterate
5518  * through the, now sorted, array of pending flushes, and look forward to
5519  * find all neighbouring (or identical) writes.
5520  */
5521  for (i = 0; i < wb_context->nr_pending; i++)
5522  {
5523  PendingWriteback *cur;
5524  PendingWriteback *next;
5525  SMgrRelation reln;
5526  int ahead;
5527  BufferTag tag;
5528  RelFileLocator currlocator;
5529  Size nblocks = 1;
5530 
5531  cur = &wb_context->pending_writebacks[i];
5532  tag = cur->tag;
5533  currlocator = BufTagGetRelFileLocator(&tag);
5534 
5535  /*
5536  * Peek ahead, into following writeback requests, to see if they can
5537  * be combined with the current one.
5538  */
5539  for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
5540  {
5541 
5542  next = &wb_context->pending_writebacks[i + ahead + 1];
5543 
5544  /* different file, stop */
5545  if (!RelFileLocatorEquals(currlocator,
5546  BufTagGetRelFileLocator(&next->tag)) ||
5547  BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
5548  break;
5549 
5550  /* ok, block queued twice, skip */
5551  if (cur->tag.blockNum == next->tag.blockNum)
5552  continue;
5553 
5554  /* only merge consecutive writes */
5555  if (cur->tag.blockNum + 1 != next->tag.blockNum)
5556  break;
5557 
5558  nblocks++;
5559  cur = next;
5560  }
5561 
5562  i += ahead;
5563 
5564  /* and finally tell the kernel to write the data to storage */
5565  reln = smgropen(currlocator, InvalidBackendId);
5566  smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
5567  }
5568 
5569  /*
5570  * Assume that writeback requests are only issued for buffers containing
5571  * blocks of permanent relations.
5572  */
5573  pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
5574  IOOP_WRITEBACK, io_start, wb_context->nr_pending);
5575 
5576  wb_context->nr_pending = 0;
5577 }
static int32 next
Definition: blutils.c:219
struct cursor * cur
Definition: ecpg.c:28
@ IOOP_WRITEBACK
Definition: pgstat.h:303
#define RelFileLocatorEquals(locator1, locator2)
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition: smgr.c:597
PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]

References buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), cur, i, InvalidBackendId, IOOBJECT_RELATION, IOOP_WRITEBACK, next, WritebackContext::nr_pending, WritebackContext::pending_writebacks, pgstat_count_io_op_time(), pgstat_prepare_io_time(), RelFileLocatorEquals, smgropen(), and smgrwriteback().

Referenced by BufferSync(), and ScheduleBufferTagForWriteback().
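
Usage sketch (illustrative; assumes wb_context was set up with WritebackContextInit(), as BufferSync() does): writeback requests are queued per buffer and then issued, sorted and coalesced, in one batch.

    /* after writing each buffer, remember that its block should be flushed */
    ScheduleBufferTagForWriteback(&wb_context, IOCONTEXT_NORMAL, &bufHdr->tag);

    /* ... later, once a batch has accumulated ... */
    IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);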

◆ LimitAdditionalPins()

static void LimitAdditionalPins ( uint32 additional_pins)
static

Definition at line 1751 of file bufmgr.c.

1752 {
1753  uint32 max_backends;
1754  int max_proportional_pins;
1755 
1756  if (*additional_pins <= 1)
1757  return;
1758 
1759  max_backends = MaxBackends + NUM_AUXILIARY_PROCS;
1760  max_proportional_pins = NBuffers / max_backends;
1761 
1762  /*
1763  * Subtract the approximate number of buffers already pinned by this
1764  * backend. We get the number of "overflowed" pins for free, but don't
1765  * know the number of pins in PrivateRefCountArray. The cost of
1766  * calculating that exactly doesn't seem worth it, so just assume the max.
1767  */
1768  max_proportional_pins -= PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
1769 
1770  if (max_proportional_pins <= 0)
1771  max_proportional_pins = 1;
1772 
1773  if (*additional_pins > max_proportional_pins)
1774  *additional_pins = max_proportional_pins;
1775 }
int MaxBackends
Definition: globals.c:140
#define NUM_AUXILIARY_PROCS
Definition: proc.h:418

References MaxBackends, NBuffers, NUM_AUXILIARY_PROCS, PrivateRefCountOverflowed, and REFCOUNT_ARRAY_ENTRIES.

Referenced by ExtendBufferedRelShared().
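
Simplified model of the clamp (not the actual function, which is static to this file). Assumed example settings: NBuffers = 16384 and MaxBackends + NUM_AUXILIARY_PROCS = 100, so each backend's proportional share is 16384 / 100 = 163 pins, reduced by the pins it may already hold.

    uint32  want = 512;                 /* how many extra pins the caller asked for */
    int     share = NBuffers / (MaxBackends + NUM_AUXILIARY_PROCS);

    share -= PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
    if (share < 1)
        share = 1;
    if (want > share)
        want = share;                   /* never extend by more than the fair share */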

◆ local_buffer_write_error_callback()

static void local_buffer_write_error_callback ( void *  arg)
static

Definition at line 5253 of file bufmgr.c.

5254 {
5255  BufferDesc *bufHdr = (BufferDesc *) arg;
5256 
5257  if (bufHdr != NULL)
5258  {
5259  char *path = relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
5260  MyBackendId,
5261  BufTagGetForkNum(&bufHdr->tag));
5262 
5263  errcontext("writing block %u of relation %s",
5264  bufHdr->tag.blockNum, path);
5265  pfree(path);
5266  }
5267 }
#define errcontext
Definition: elog.h:196
void * arg
#define relpathbackend(rlocator, backend, forknum)
Definition: relpath.h:85

References arg, buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), errcontext, MyBackendId, pfree(), relpathbackend, and BufferDesc::tag.

Referenced by FlushRelationBuffers().

◆ LockBuffer()

void LockBuffer ( Buffer  buffer,
int  mode 
)

Definition at line 4715 of file bufmgr.c.

4716 {
4717  BufferDesc *buf;
4718 
4719  Assert(BufferIsPinned(buffer));
4720  if (BufferIsLocal(buffer))
4721  return; /* local buffers need no lock */
4722 
4723  buf = GetBufferDescriptor(buffer - 1);
4724 
4725  if (mode == BUFFER_LOCK_UNLOCK)
4726  LWLockRelease(BufferDescriptorGetContentLock(buf));
4727  else if (mode == BUFFER_LOCK_SHARE)
4728  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
4729  else if (mode == BUFFER_LOCK_EXCLUSIVE)
4730  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
4731  else
4732  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
4733 }
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:158
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:159

References Assert(), buf, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, elog(), ERROR, GetBufferDescriptor(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), and mode.

Referenced by _bt_lockbuf(), _bt_unlockbuf(), _bt_upgradelockbufcleanup(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_finish_split(), _hash_first(), _hash_freeovflpage(), _hash_getbuf(), _hash_getbuf_with_strategy(), _hash_getcachedmetap(), _hash_init(), _hash_kill_items(), _hash_readnext(), _hash_readpage(), _hash_readprev(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), blbulkdelete(), blgetbitmap(), blinsert(), BloomInitMetapage(), BloomNewBuffer(), blvacuumcleanup(), brin_doinsert(), brin_doupdate(), brin_evacuate_page(), brin_getinsertbuffer(), brin_page_cleanup(), bringetbitmap(), brinGetStats(), brinGetTupleForHeapBlock(), brininsert(), brinLockRevmapPageForUpdate(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), brinsummarize(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), bt_recheck_sibling_links(), collect_corrupt_items(), collect_visibility_data(), collectMatchBitmap(), ConditionalLockBufferForCleanup(), count_nondeletable_pages(), entryLoadMoreItems(), FreeSpaceMapPrepareTruncateRel(), fsm_readbuf(), fsm_search(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), get_raw_page_internal(), GetVisibilityMapPins(), ginbulkdelete(), ginEntryInsert(), ginFindLeafPage(), ginFindParents(), ginFinishSplit(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), ginInsertValue(), GinNewBuffer(), ginScanToDelete(), ginStepRight(), ginTraverseLock(), ginUpdateStats(), ginvacuumcleanup(), ginVacuumPostingTreeLeaves(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfinishsplit(), gistfixsplit(), gistformdownlink(), gistGetMaxLevel(), gistinserttuples(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_inplace_update(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_page_prune_opt(), heap_update(), heap_xlog_visible(), heapam_index_build_range_scan(), heapam_index_fetch_tuple(), heapam_index_validate_scan(), heapam_relation_copy_for_cluster(), heapam_scan_analyze_next_block(), heapam_scan_bitmap_next_block(), heapam_scan_sample_next_tuple(), heapam_tuple_satisfies_snapshot(), heapgetpage(), heapgettup(), initBloomState(), lazy_scan_heap(), lazy_scan_new_or_empty(), lazy_vacuum_heap_rel(), LockBufferForCleanup(), log_newpage_range(), palloc_btree_page(), pg_visibility(), pgrowlocks(), pgstat_btree_page(), pgstat_gist_page(), pgstat_heap(), pgstatginindex_internal(), pgstathashindex(), pgstatindex_impl(), read_seq_tuple(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), ScanSourceDatabasePgClass(), shiftList(), spgbuildempty(), spgdoinsert(), spgGetCache(), SpGistNewBuffer(), spgprocesspending(), spgvacuumpage(), spgWalk(), startScanEntry(), statapprox_heap(), summarize_range(), UnlockReleaseBuffer(), verify_heapam(), verifyBackupPageConsistency(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), vm_readbuf(), XLogReadBufferForRedoExtended(), and XLogRecordPageWithFreeSpace().
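
Typical usage sketch (illustrative; "rel" and "blkno" are assumptions): pin the buffer, lock its contents for the duration of the access, then unlock and unpin.

    Buffer  buf = ReadBuffer(rel, blkno);

    LockBuffer(buf, BUFFER_LOCK_SHARE);
    /* ... examine the page via BufferGetPage(buf) ... */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);

    ReleaseBuffer(buf);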

◆ LockBufferForCleanup()

void LockBufferForCleanup ( Buffer  buffer)

Definition at line 4795 of file bufmgr.c.

4796 {
4797  BufferDesc *bufHdr;
4798  TimestampTz waitStart = 0;
4799  bool waiting = false;
4800  bool logged_recovery_conflict = false;
4801 
4802  Assert(BufferIsPinned(buffer));
4803  Assert(PinCountWaitBuf == NULL);
4804 
4805  CheckBufferIsPinnedOnce(buffer);
4806 
4807  /* Nobody else to wait for */
4808  if (BufferIsLocal(buffer))
4809  return;
4810 
4811  bufHdr = GetBufferDescriptor(buffer - 1);
4812 
4813  for (;;)
4814  {
4815  uint32 buf_state;
4816 
4817  /* Try to acquire lock */
4818  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4819  buf_state = LockBufHdr(bufHdr);
4820 
4821  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4822  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
4823  {
4824  /* Successfully acquired exclusive lock with pincount 1 */
4825  UnlockBufHdr(bufHdr, buf_state);
4826 
4827  /*
4828  * Emit the log message if recovery conflict on buffer pin was
4829  * resolved but the startup process waited longer than
4830  * deadlock_timeout for it.
4831  */
4832  if (logged_recovery_conflict)
4833  LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
4834  waitStart, GetCurrentTimestamp(),
4835  NULL, false);
4836 
4837  if (waiting)
4838  {
4839  /* reset ps display to remove the suffix if we added one */
4840  set_ps_display_remove_suffix();
4841  waiting = false;
4842  }
4843  return;
4844  }
4845  /* Failed, so mark myself as waiting for pincount 1 */
4846  if (buf_state & BM_PIN_COUNT_WAITER)
4847  {
4848  UnlockBufHdr(bufHdr, buf_state);
4849  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4850  elog(ERROR, "multiple backends attempting to wait for pincount 1");
4851  }
4852  bufHdr->wait_backend_pgprocno = MyProc->pgprocno;
4853  PinCountWaitBuf = bufHdr;
4854  buf_state |= BM_PIN_COUNT_WAITER;
4855  UnlockBufHdr(bufHdr, buf_state);
4856  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4857 
4858  /* Wait to be signaled by UnpinBuffer() */
4859  if (InHotStandby)
4860  {
4861  if (!waiting)
4862  {
4863  /* adjust the process title to indicate that it's waiting */
4864  set_ps_display_suffix("waiting");
4865  waiting = true;
4866  }
4867 
4868  /*
4869  * Emit the log message if the startup process is waiting longer
4870  * than deadlock_timeout for recovery conflict on buffer pin.
4871  *
4872  * Skip this if first time through because the startup process has
4873  * not started waiting yet in this case. So, the wait start
4874  * timestamp is set after this logic.
4875  */
4876  if (waitStart != 0 && !logged_recovery_conflict)
4877  {
4878  TimestampTz now = GetCurrentTimestamp();
4879 
4880  if (TimestampDifferenceExceeds(waitStart, now,
4881  DeadlockTimeout))
4882  {
4883  LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
4884  waitStart, now, NULL, true);
4885  logged_recovery_conflict = true;
4886  }
4887  }
4888 
4889  /*
4890  * Set the wait start timestamp if logging is enabled and first
4891  * time through.
4892  */
4893  if (log_recovery_conflict_waits && waitStart == 0)
4894  waitStart = GetCurrentTimestamp();
4895 
4896  /* Publish the bufid that Startup process waits on */
4897  SetStartupBufferPinWaitBufId(buffer - 1);
4898  /* Set alarm and then wait to be signaled by UnpinBuffer() */
4899  ResolveRecoveryConflictWithBufferPin();
4900  /* Reset the published bufid */
4901  SetStartupBufferPinWaitBufId(-1);
4902  }
4903  else
4904  ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
4905 
4906  /*
4907  * Remove flag marking us as waiter. Normally this will not be set
4908  * anymore, but ProcWaitForSignal() can return for other signals as
4909  * well. We take care to only reset the flag if we're the waiter, as
4910  * theoretically another backend could have started waiting. That's
4911  * impossible with the current usages due to table level locking, but
4912  * better be safe.
4913  */
4914  buf_state = LockBufHdr(bufHdr);
4915  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
4916  bufHdr->wait_backend_pgprocno == MyProc->pgprocno)
4917  buf_state &= ~BM_PIN_COUNT_WAITER;
4918  UnlockBufHdr(bufHdr, buf_state);
4919 
4920  PinCountWaitBuf = NULL;
4921  /* Loop back and try again */
4922  }
4923 }
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1719
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1583
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1547
#define BM_PIN_COUNT_WAITER
Definition: buf_internals.h:66
static BufferDesc * PinCountWaitBuf
Definition: bufmgr.c:164
int64 TimestampTz
Definition: timestamp.h:39
static volatile sig_atomic_t waiting
Definition: latch.c:160
@ PROCSIG_RECOVERY_CONFLICT_BUFFERPIN
Definition: procsignal.h:47
void set_ps_display_remove_suffix(void)
Definition: ps_status.c:396
void set_ps_display_suffix(const char *suffix)
Definition: ps_status.c:344
int DeadlockTimeout
Definition: proc.c:58
void SetStartupBufferPinWaitBufId(int bufid)
Definition: proc.c:627
void ProcWaitForSignal(uint32 wait_event_info)
Definition: proc.c:1797
void ResolveRecoveryConflictWithBufferPin(void)
Definition: standby.c:793
bool log_recovery_conflict_waits
Definition: standby.c:43
void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition: standby.c:274
int wait_backend_pgprocno
int pgprocno
Definition: proc.h:191
#define InHotStandby
Definition: xlogutils.h:57

References Assert(), BM_PIN_COUNT_WAITER, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsPinned, CheckBufferIsPinnedOnce(), DeadlockTimeout, elog(), ERROR, GetBufferDescriptor(), GetCurrentTimestamp(), InHotStandby, LockBuffer(), LockBufHdr(), log_recovery_conflict_waits, LogRecoveryConflict(), MyProc, now(), PGPROC::pgprocno, PinCountWaitBuf, PROCSIG_RECOVERY_CONFLICT_BUFFERPIN, ProcWaitForSignal(), ResolveRecoveryConflictWithBufferPin(), set_ps_display_remove_suffix(), set_ps_display_suffix(), SetStartupBufferPinWaitBufId(), TimestampDifferenceExceeds(), UnlockBufHdr(), BufferDesc::wait_backend_pgprocno, and waiting.

Referenced by _bt_upgradelockbufcleanup(), ginVacuumPostingTree(), hashbulkdelete(), heap_force_common(), lazy_scan_heap(), ReadBuffer_common(), and XLogReadBufferForRedoExtended().
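
Usage sketch (illustrative), in the style of the VACUUM callers above; the caller must already hold the only local pin on the buffer:

    LockBufferForCleanup(buf);      /* blocks until we are the sole pinner */
    /* ... prune or defragment the page ... */
    UnlockReleaseBuffer(buf);       /* drop the exclusive lock and the pin */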

◆ LockBufHdr()

uint32 LockBufHdr ( BufferDesc desc)

Definition at line 5300 of file bufmgr.c.

5301 {
5302  SpinDelayStatus delayStatus;
5303  uint32 old_buf_state;
5304 
5305  Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
5306 
5307  init_local_spin_delay(&delayStatus);
5308 
5309  while (true)
5310  {
5311  /* set BM_LOCKED flag */
5312  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
5313  /* if it wasn't set before we're OK */
5314  if (!(old_buf_state & BM_LOCKED))
5315  break;
5316  perform_spin_delay(&delayStatus);
5317  }
5318  finish_spin_delay(&delayStatus);
5319  return old_buf_state | BM_LOCKED;
5320 }
static uint32 pg_atomic_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_)
Definition: atomics.h:367
void perform_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:126
void finish_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:186
#define init_local_spin_delay(status)
Definition: s_lock.h:863

References Assert(), BM_LOCKED, BufferDescriptorGetBuffer(), BufferIsLocal, finish_spin_delay(), init_local_spin_delay, perform_spin_delay(), pg_atomic_fetch_or_u32(), and BufferDesc::state.

Referenced by AbortBufferIO(), apw_dump_now(), BufferAlloc(), BufferGetLSNAtomic(), BufferSync(), ConditionalLockBufferForCleanup(), DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), ExtendBufferedRelShared(), FindAndDropRelationBuffers(), FlushBuffer(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetBufferFromRing(), GetVictimBuffer(), InvalidateBuffer(), InvalidateVictimBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkBufferDirtyHint(), pg_buffercache_pages(), ReadRecentBuffer(), StartBufferIO(), StrategyGetBuffer(), SyncOneBuffer(), TerminateBufferIO(), UnlockBuffers(), UnpinBuffer(), and WaitIO().
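
Sketch of the header-spinlock pattern used throughout this file (illustrative): the returned state word already has BM_LOCKED set; the caller inspects or adjusts it and hands the final value back to UnlockBufHdr().

    uint32  buf_state = LockBufHdr(bufHdr);

    if (buf_state & BM_DIRTY)
    {
        /* ... act on a dirty buffer while the header is locked ... */
    }
    UnlockBufHdr(bufHdr, buf_state);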

◆ MarkBufferDirty()

void MarkBufferDirty ( Buffer  buffer)

Definition at line 2111 of file bufmgr.c.

2112 {
2113  BufferDesc *bufHdr;
2114  uint32 buf_state;
2115  uint32 old_buf_state;
2116 
2117  if (!BufferIsValid(buffer))
2118  elog(ERROR, "bad buffer ID: %d", buffer);
2119 
2120  if (BufferIsLocal(buffer))
2121  {
2122  MarkLocalBufferDirty(buffer);
2123  return;
2124  }
2125 
2126  bufHdr = GetBufferDescriptor(buffer - 1);
2127 
2128  Assert(BufferIsPinned(buffer));
2129  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2130  LW_EXCLUSIVE));
2131 
2132  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
2133  for (;;)
2134  {
2135  if (old_buf_state & BM_LOCKED)
2136  old_buf_state = WaitBufHdrUnlocked(bufHdr);
2137 
2138  buf_state = old_buf_state;
2139 
2140  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2141  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
2142 
2143  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
2144  buf_state))
2145  break;
2146  }
2147 
2148  /*
2149  * If the buffer was not dirty already, do vacuum accounting.
2150  */
2151  if (!(old_buf_state & BM_DIRTY))
2152  {
2153  VacuumPageDirty++;
2154  pgBufferUsage.shared_blks_dirtied++;
2155  if (VacuumCostActive)
2156  VacuumCostBalance += VacuumCostPageDirty;
2157  }
2158 }
static bool pg_atomic_compare_exchange_u32(volatile pg_atomic_uint32 *ptr, uint32 *expected, uint32 newval)
Definition: atomics.h:306
static uint32 WaitBufHdrUnlocked(BufferDesc *buf)
Definition: bufmgr.c:5330
bool VacuumCostActive
Definition: globals.c:156
int64 VacuumPageDirty
Definition: globals.c:153
int VacuumCostBalance
Definition: globals.c:155
int VacuumCostPageDirty
Definition: globals.c:147
void MarkLocalBufferDirty(Buffer buffer)
Definition: localbuf.c:444
int64 shared_blks_dirtied
Definition: instrument.h:28

References Assert(), BM_DIRTY, BM_JUST_DIRTIED, BM_LOCKED, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, BufferIsValid(), elog(), ERROR, GetBufferDescriptor(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), MarkLocalBufferDirty(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), pgBufferUsage, BufferUsage::shared_blks_dirtied, BufferDesc::state, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, VacuumPageDirty, and WaitBufHdrUnlocked().

Referenced by _bt_clear_incomplete_split(), _bt_dedup_pass(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_getroot(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_newlevel(), _bt_restore_meta(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_freeovflpage(), _hash_init(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), addLeafTuple(), brin_doinsert(), brin_doupdate(), brin_initialize_empty_new_buffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinRevmapDesummarizeRange(), btbuildempty(), btree_xlog_dedup(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), createPostingTree(), do_setval(), doPickSplit(), fill_seq_fork_with_data(), FreeSpaceMapPrepareTruncateRel(), generic_redo(), GenericXLogFinish(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginDeletePage(), ginHeapTupleFastInsert(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginUpdateStats(), ginVacuumPostingTreeLeaf(), gistbuild(), gistbuildempty(), gistdeletepage(), gistplacetopage(), gistprunepage(), gistRedoClearFollowRight(), gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_squeeze_page(), hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_finish_speculative(), heap_force_common(), heap_freeze_execute_prepared(), heap_inplace_update(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_page_prune(), heap_update(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_freeze_page(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_prune(), heap_xlog_update(), heap_xlog_vacuum(), heap_xlog_visible(), lazy_scan_heap(), lazy_scan_new_or_empty(), lazy_vacuum_heap_page(), log_newpage_range(), moveLeafs(), nextval_internal(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), revmap_physical_extend(), saveNodeLink(), seq_redo(), shiftList(), spgAddNodeAction(), spgbuild(), spgbuildempty(), SpGistUpdateMetaPage(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), vacuumLeafPage(), vacuumLeafRoot(), vacuumRedirectAndPlaceholder(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), writeListPage(), and XLogReadBufferForRedoExtended().
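
Usage sketch of the usual modify-and-dirty sequence (illustrative; WAL details elided). The buffer must be pinned and exclusively locked, and the change plus MarkBufferDirty() normally happen inside a critical section together with the WAL insertion:

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

    START_CRIT_SECTION();
    /* ... modify the page contents ... */
    MarkBufferDirty(buf);
    /* ... XLogInsert() and PageSetLSN() would follow here ... */
    END_CRIT_SECTION();

    LockBuffer(buf, BUFFER_LOCK_UNLOCK);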

◆ MarkBufferDirtyHint()

void MarkBufferDirtyHint ( Buffer  buffer,
bool  buffer_std 
)

Definition at line 4544 of file bufmgr.c.

4545 {
4546  BufferDesc *bufHdr;
4547  Page page = BufferGetPage(buffer);
4548 
4549  if (!BufferIsValid(buffer))
4550  elog(ERROR, "bad buffer ID: %d", buffer);
4551 
4552  if (BufferIsLocal(buffer))
4553  {
4554  MarkLocalBufferDirty(buffer);
4555  return;
4556  }
4557 
4558  bufHdr = GetBufferDescriptor(buffer - 1);
4559 
4560  Assert(GetPrivateRefCount(buffer) > 0);
4561  /* here, either share or exclusive lock is OK */
4562  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
4563 
4564  /*
4565  * This routine might get called many times on the same page, if we are
4566  * making the first scan after commit of an xact that added/deleted many
4567  * tuples. So, be as quick as we can if the buffer is already dirty. We
4568  * do this by not acquiring spinlock if it looks like the status bits are
4569  * already set. Since we make this test unlocked, there's a chance we
4570  * might fail to notice that the flags have just been cleared, and failed
4571  * to reset them, due to memory-ordering issues. But since this function
4572  * is only intended to be used in cases where failing to write out the
4573  * data would be harmless anyway, it doesn't really matter.
4574  */
4575  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
4576  (BM_DIRTY | BM_JUST_DIRTIED))
4577  {
4578  XLogRecPtr lsn = InvalidXLogRecPtr;
4579  bool dirtied = false;
4580  bool delayChkptFlags = false;
4581  uint32 buf_state;
4582 
4583  /*
4584  * If we need to protect hint bit updates from torn writes, WAL-log a
4585  * full page image of the page. This full page image is only necessary
4586  * if the hint bit update is the first change to the page since the
4587  * last checkpoint.
4588  *
4589  * We don't check full_page_writes here because that logic is included
4590  * when we call XLogInsert() since the value changes dynamically.
4591  */
4592  if (XLogHintBitIsNeeded() &&
4593  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
4594  {
4595  /*
4596  * If we must not write WAL, due to a relfilelocator-specific
4597  * condition or being in recovery, don't dirty the page. We can
4598  * set the hint, just not dirty the page as a result so the hint
4599  * is lost when we evict the page or shutdown.
4600  *
4601  * See src/backend/storage/page/README for longer discussion.
4602  */
4603  if (RecoveryInProgress() ||
4604  RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
4605  return;
4606 
4607  /*
4608  * If the block is already dirty because we either made a change
4609  * or set a hint already, then we don't need to write a full page
4610  * image. Note that aggressive cleaning of blocks dirtied by hint
4611  * bit setting would increase the call rate. Bulk setting of hint
4612  * bits would reduce the call rate...
4613  *
4614  * We must issue the WAL record before we mark the buffer dirty.
4615  * Otherwise we might write the page before we write the WAL. That
4616  * causes a race condition, since a checkpoint might occur between
4617  * writing the WAL record and marking the buffer dirty. We solve
4618  * that with a kluge, but one that is already in use during
4619  * transaction commit to prevent race conditions. Basically, we
4620  * simply prevent the checkpoint WAL record from being written
4621  * until we have marked the buffer dirty. We don't start the
4622  * checkpoint flush until we have marked dirty, so our checkpoint
4623  * must flush the change to disk successfully or the checkpoint
4624  * never gets written, so crash recovery will fix.
4625  *
4626  * It's possible we may enter here without an xid, so it is
4627  * essential that CreateCheckPoint waits for virtual transactions
4628  * rather than full transactionids.
4629  */
4630  Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
4631  MyProc->delayChkptFlags |= DELAY_CHKPT_START;
4632  delayChkptFlags = true;
4633  lsn = XLogSaveBufferForHint(buffer, buffer_std);
4634  }
4635 
4636  buf_state = LockBufHdr(bufHdr);
4637 
4638  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
4639 
4640  if (!(buf_state & BM_DIRTY))
4641  {
4642  dirtied = true; /* Means "will be dirtied by this action" */
4643 
4644  /*
4645  * Set the page LSN if we wrote a backup block. We aren't supposed
4646  * to set this when only holding a share lock but as long as we
4647  * serialise it somehow we're OK. We choose to set LSN while
4648  * holding the buffer header lock, which causes any reader of an
4649  * LSN who holds only a share lock to also obtain a buffer header
4650  * lock before using PageGetLSN(), which is enforced in
4651  * BufferGetLSNAtomic().
4652  *
4653  * If checksums are enabled, you might think we should reset the
4654  * checksum here. That will happen when the page is written
4655  * sometime later in this checkpoint cycle.
4656  */
4657  if (!XLogRecPtrIsInvalid(lsn))
4658  PageSetLSN(page, lsn);
4659  }
4660 
4661  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
4662  UnlockBufHdr(bufHdr, buf_state);
4663 
4664  if (delayChkptFlags)
4665  MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
4666 
4667  if (dirtied)
4668  {
4669  VacuumPageDirty++;
4670  pgBufferUsage.shared_blks_dirtied++;
4671  if (VacuumCostActive)
4672  VacuumCostBalance += VacuumCostPageDirty;
4673  }
4674  }
4675 }
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:388
#define DELAY_CHKPT_START
Definition: proc.h:119
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition: storage.c:550
int delayChkptFlags
Definition: proc.h:231
bool RecoveryInProgress(void)
Definition: xlog.c:5948
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
Definition: xloginsert.c:1053

References Assert(), BM_DIRTY, BM_JUST_DIRTIED, BM_PERMANENT, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferGetPage(), BufferIsLocal, BufferIsValid(), BufTagGetRelFileLocator(), DELAY_CHKPT_START, PGPROC::delayChkptFlags, elog(), ERROR, GetBufferDescriptor(), GetPrivateRefCount(), InvalidXLogRecPtr, LockBufHdr(), LWLockHeldByMe(), MarkLocalBufferDirty(), MyProc, PageSetLSN(), pg_atomic_read_u32(), pgBufferUsage, RecoveryInProgress(), RelFileLocatorSkippingWAL(), BufferUsage::shared_blks_dirtied, BufferDesc::state, BufferDesc::tag, UnlockBufHdr(), VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, VacuumPageDirty, XLogHintBitIsNeeded, XLogRecPtrIsInvalid, and XLogSaveBufferForHint().

Referenced by _bt_check_unique(), _bt_killitems(), _hash_kill_items(), brin_start_evacuating_page(), btvacuumpage(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), gistkillitems(), heap_page_prune(), read_seq_tuple(), SetHintBits(), and XLogRecordPageWithFreeSpace().
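
Usage sketch (illustrative), loosely following SetHintBits(): a hint-bit change may be made under only a share lock and is reported with MarkBufferDirtyHint() rather than MarkBufferDirty(), because losing it is harmless.

    tuple->t_infomask |= HEAP_XMIN_COMMITTED;   /* set the hint bit */
    MarkBufferDirtyHint(buffer, true);          /* buffer_std = true for standard pages */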

◆ NewPrivateRefCountEntry()

static PrivateRefCountEntry * NewPrivateRefCountEntry ( Buffer  buffer)
static

Definition at line 280 of file bufmgr.c.

281 {
282  PrivateRefCountEntry *res;
283 
284  /* only allowed to be called when a reservation has been made */
285  Assert(ReservedRefCountEntry != NULL);
286 
287  /* use up the reserved entry */
288  res = ReservedRefCountEntry;
289  ReservedRefCountEntry = NULL;
290 
291  /* and fill it */
292  res->buffer = buffer;
293  res->refcount = 0;
294 
295  return res;
296 }

References Assert(), PrivateRefCountEntry::buffer, res, and ReservedRefCountEntry.

Referenced by PinBuffer(), and PinBuffer_Locked().

◆ PinBuffer()

static bool PinBuffer ( BufferDesc buf,
BufferAccessStrategy  strategy 
)
static

Definition at line 2231 of file bufmgr.c.

2232 {
2233  Buffer b = BufferDescriptorGetBuffer(buf);
2234  bool result;
2235  PrivateRefCountEntry *ref;
2236 
2237  Assert(!BufferIsLocal(b));
2238 
2239  ref = GetPrivateRefCountEntry(b, true);
2240 
2241  if (ref == NULL)
2242  {
2243  uint32 buf_state;
2244  uint32 old_buf_state;
2245 
2246  ReservePrivateRefCountEntry();
2247  ref = NewPrivateRefCountEntry(b);
2248 
2249  old_buf_state = pg_atomic_read_u32(&buf->state);
2250  for (;;)
2251  {
2252  if (old_buf_state & BM_LOCKED)
2253  old_buf_state = WaitBufHdrUnlocked(buf);
2254 
2255  buf_state = old_buf_state;
2256 
2257  /* increase refcount */
2258  buf_state += BUF_REFCOUNT_ONE;
2259 
2260  if (strategy == NULL)
2261  {
2262  /* Default case: increase usagecount unless already max. */
2263  if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
2264  buf_state += BUF_USAGECOUNT_ONE;
2265  }
2266  else
2267  {
2268  /*
2269  * Ring buffers shouldn't evict others from pool. Thus we
2270  * don't make usagecount more than 1.
2271  */
2272  if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2273  buf_state += BUF_USAGECOUNT_ONE;
2274  }
2275 
2276  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
2277  buf_state))
2278  {
2279  result = (buf_state & BM_VALID) != 0;
2280 
2281  /*
2282  * Assume that we acquired a buffer pin for the purposes of
2283  * Valgrind buffer client checks (even in !result case) to
2284  * keep things simple. Buffers that are unsafe to access are
2285  * not generally guaranteed to be marked undefined or
2286  * non-accessible in any case.
2287  */
2288  VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
2289  break;
2290  }
2291  }
2292  }
2293  else
2294  {
2295  /*
2296  * If we previously pinned the buffer, it must surely be valid.
2297  *
2298  * Note: We deliberately avoid a Valgrind client request here.
2299  * Individual access methods can optionally superimpose buffer page
2300  * client requests on top of our client requests to enforce that
2301  * buffers are only accessed while locked (and pinned). It's possible
2302  * that the buffer page is legitimately non-accessible here. We
2303  * cannot meddle with that.
2304  */
2305  result = true;
2306  }
2307 
2308  ref->refcount++;
2309  Assert(ref->refcount > 0);
2310  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
2311  return result;
2312 }
#define BM_MAX_USAGE_COUNT
Definition: buf_internals.h:77
#define BUF_REFCOUNT_ONE
Definition: buf_internals.h:42
#define BUF_STATE_GET_USAGECOUNT(state)
Definition: buf_internals.h:51
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition: bufmgr.c:280
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26

References Assert(), b, BM_LOCKED, BM_MAX_USAGE_COUNT, BM_VALID, buf, BUF_REFCOUNT_ONE, BUF_STATE_GET_USAGECOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer(), BufferIsLocal, BufHdrGetBlock, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ReservePrivateRefCountEntry(), ResourceOwnerRememberBuffer(), VALGRIND_MAKE_MEM_DEFINED, and WaitBufHdrUnlocked().

Referenced by BufferAlloc(), ExtendBufferedRelShared(), and ReadRecentBuffer().

◆ PinBuffer_Locked()

static void PinBuffer_Locked ( BufferDesc buf)
static

Definition at line 2336 of file bufmgr.c.

2337 {
2338  Buffer b;
2339  PrivateRefCountEntry *ref;
2340  uint32 buf_state;
2341 
2342  /*
2343  * As explained, We don't expect any preexisting pins. That allows us to
2344  * manipulate the PrivateRefCount after releasing the spinlock
2345  */
2346  Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
2347 
2348  /*
2349  * Buffer can't have a preexisting pin, so mark its page as defined to
2350  * Valgrind (this is similar to the PinBuffer() case where the backend
2351  * doesn't already have a buffer pin)
2352  */
2353  VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
2354 
2355  /*
2356  * Since we hold the buffer spinlock, we can update the buffer state and
2357  * release the lock in one operation.
2358  */
2359  buf_state = pg_atomic_read_u32(&buf->state);
2360  Assert(buf_state & BM_LOCKED);
2361  buf_state += BUF_REFCOUNT_ONE;
2362  UnlockBufHdr(buf, buf_state);
2363 
2364  b = BufferDescriptorGetBuffer(buf);
2365 
2366  ref = NewPrivateRefCountEntry(b);
2367  ref->refcount++;
2368 
2369  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
2370 }

References Assert(), b, BM_LOCKED, buf, BUF_REFCOUNT_ONE, BufferDescriptorGetBuffer(), BufHdrGetBlock, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ResourceOwnerRememberBuffer(), UnlockBufHdr(), and VALGRIND_MAKE_MEM_DEFINED.

Referenced by FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetVictimBuffer(), ReadRecentBuffer(), and SyncOneBuffer().
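
The contract is that the caller already holds the buffer header spinlock and owns no prior pin on the buffer; PinBuffer_Locked() then takes the pin and releases the spinlock itself. A hedged sketch of that calling pattern, assuming the bufmgr.c context; pin_if_dirty() is illustrative and the dirty-flag test merely stands in for whatever the real callers (such as SyncOneBuffer()) check.

static void
pin_if_dirty(BufferDesc *bufHdr)
{
    uint32      buf_state = LockBufHdr(bufHdr);

    if (buf_state & BM_DIRTY)
        PinBuffer_Locked(bufHdr);       /* bumps refcount and drops the spinlock */
    else
        UnlockBufHdr(bufHdr, buf_state);
}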

◆ PrefetchBuffer()

PrefetchBufferResult PrefetchBuffer ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 601 of file bufmgr.c.

602 {
603  Assert(RelationIsValid(reln));
604  Assert(BlockNumberIsValid(blockNum));
605 
606  if (RelationUsesLocalBuffers(reln))
607  {
608  /* see comments in ReadBufferExtended */
609  if (RELATION_IS_OTHER_TEMP(reln))
610  ereport(ERROR,
611  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
612  errmsg("cannot access temporary tables of other sessions")));
613 
614  /* pass it off to localbuf.c */
615  return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
616  }
617  else
618  {
619  /* pass it to the shared buffer version */
620  return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
621  }
622 }
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:511
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition: localbuf.c:69
#define RELATION_IS_OTHER_TEMP(relation)
Definition: rel.h:659
#define RelationIsValid(relation)
Definition: rel.h:477

References Assert(), BlockNumberIsValid(), ereport, errcode(), errmsg(), ERROR, PrefetchLocalBuffer(), PrefetchSharedBuffer(), RELATION_IS_OTHER_TEMP, RelationGetSmgr(), RelationIsValid, and RelationUsesLocalBuffers.

Referenced by acquire_sample_rows(), BitmapPrefetch(), count_nondeletable_pages(), and pg_prewarm().
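
A minimal usage sketch, loosely modelled on pg_prewarm(): issue the hint, let unrelated work overlap with the I/O, then read the block for real. It assumes a normal backend context with rel already opened and locked by the caller; prewarm_block() is illustrative, not an existing function.

static void
prewarm_block(Relation rel, BlockNumber blkno)
{
    Buffer      buf;

    (void) PrefetchBuffer(rel, MAIN_FORKNUM, blkno);    /* may start an async read */

    /* ... other work that can overlap with the I/O ... */

    buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, NULL);
    /* inspect BufferGetPage(buf) under a content lock as needed ... */
    ReleaseBuffer(buf);
}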

◆ PrefetchSharedBuffer()

PrefetchBufferResult PrefetchSharedBuffer ( SMgrRelation  smgr_reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 511 of file bufmgr.c.

514 {
515  PrefetchBufferResult result = {InvalidBuffer, false};
516  BufferTag newTag; /* identity of requested block */
517  uint32 newHash; /* hash value for newTag */
518  LWLock *newPartitionLock; /* buffer partition lock for it */
519  int buf_id;
520 
521  Assert(BlockNumberIsValid(blockNum));
522 
523  /* create a tag so we can lookup the buffer */
524  InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
525  forkNum, blockNum);
526 
527  /* determine its hash code and partition lock ID */
528  newHash = BufTableHashCode(&newTag);
529  newPartitionLock = BufMappingPartitionLock(newHash);
530 
531  /* see if the block is in the buffer pool already */
532  LWLockAcquire(newPartitionLock, LW_SHARED);
533  buf_id = BufTableLookup(&newTag, newHash);
534  LWLockRelease(newPartitionLock);
535 
536  /* If not in buffers, initiate prefetch */
537  if (buf_id < 0)
538  {
539 #ifdef USE_PREFETCH
540  /*
541  * Try to initiate an asynchronous read. This returns false in
542  * recovery if the relation file doesn't exist.
543  */
544  if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
545  smgrprefetch(smgr_reln, forkNum, blockNum))
546  {
547  result.initiated_io = true;
548  }
549 #endif /* USE_PREFETCH */
550  }
551  else
552  {
553  /*
554  * Report the buffer it was in at that time. The caller may be able
555  * to avoid a buffer table lookup, but it's not pinned and it must be
556  * rechecked!
557  */
558  result.recent_buffer = buf_id + 1;
559  }
560 
561  /*
562  * If the block *is* in buffers, we do nothing. This is not really ideal:
563  * the block might be just about to be evicted, which would be stupid
564  * since we know we are going to need it soon. But the only easy answer
565  * is to bump the usage_count, which does not seem like a great solution:
566  * when the caller does ultimately touch the block, usage_count would get
567  * bumped again, resulting in too much favoritism for blocks that are
568  * involved in a prefetch sequence. A real fix would involve some
569  * additional per-buffer state, and it's not clear that there's enough of
570  * a problem to justify that.
571  */
572 
573  return result;
574 }
int io_direct_flags
Definition: fd.c:168
#define IO_DIRECT_DATA
Definition: fd.h:52
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition: smgr.c:548
Buffer recent_buffer
Definition: bufmgr.h:59

References Assert(), BlockNumberIsValid(), BufMappingPartitionLock(), BufTableHashCode(), BufTableLookup(), InitBufferTag(), PrefetchBufferResult::initiated_io, InvalidBuffer, IO_DIRECT_DATA, io_direct_flags, RelFileLocatorBackend::locator, LW_SHARED, LWLockAcquire(), LWLockRelease(), PrefetchBufferResult::recent_buffer, SMgrRelationData::smgr_rlocator, and smgrprefetch().

Referenced by PrefetchBuffer(), and XLogPrefetcherNextBlock().
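
The result carries two hints that callers such as XLogPrefetcherNextBlock() act on: recent_buffer (an unpinned buffer number that must be re-verified) and initiated_io. A rough sketch of consuming both, assuming a shared (non-temporary) relation; prefetch_and_maybe_pin() is illustrative, not an existing function.

static void
prefetch_and_maybe_pin(SMgrRelation smgr, BlockNumber blkno)
{
    PrefetchBufferResult pf = PrefetchSharedBuffer(smgr, MAIN_FORKNUM, blkno);

    if (BufferIsValid(pf.recent_buffer))
    {
        /*
         * The block was cached at lookup time.  ReadRecentBuffer() re-checks
         * the buffer tag, since no pin was held and the buffer may have been
         * recycled in the meantime.
         */
        if (ReadRecentBuffer(smgr->smgr_rlocator.locator, MAIN_FORKNUM, blkno,
                             pf.recent_buffer))
            ReleaseBuffer(pf.recent_buffer);    /* got a pin; drop it again */
    }
    else if (pf.initiated_io)
    {
        /* an async read was started; a later ReadBuffer() will likely hit */
    }
}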

◆ PrintBufferLeakWarning()

void PrintBufferLeakWarning ( Buffer  buffer)

Definition at line 3232 of file bufmgr.c.

3233 {
3234  BufferDesc *buf;
3235  int32 loccount;
3236  char *path;
3237  BackendId backend;
3238  uint32 buf_state;
3239 
3240  Assert(BufferIsValid(buffer));
3241  if (BufferIsLocal(buffer))
3242  {
3243  buf = GetLocalBufferDescriptor(-buffer - 1);
3244  loccount = LocalRefCount[-buffer - 1];
3245  backend = MyBackendId;
3246  }
3247  else
3248  {
3249  buf = GetBufferDescriptor(buffer - 1);
3250  loccount = GetPrivateRefCount(buffer);
3251  backend = InvalidBackendId;
3252  }
3253 
3254  /* theoretically we should lock the bufhdr here */
3255  path = relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
3256  BufTagGetForkNum(&buf->tag));
3257  buf_state = pg_atomic_read_u32(&buf->state);
3258  elog(WARNING,
3259  "buffer refcount leak: [%03d] "
3260  "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
3261  buffer, path,
3262  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
3263  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
3264  pfree(path);
3265 }
int BackendId
Definition: backendid.h:21

References Assert(), buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), BufTagGetForkNum(), BufTagGetRelFileLocator(), elog(), GetBufferDescriptor(), GetLocalBufferDescriptor(), GetPrivateRefCount(), InvalidBackendId, LocalRefCount, MyBackendId, pfree(), pg_atomic_read_u32(), relpathbackend, and WARNING.

Referenced by CheckForBufferLeaks(), CheckForLocalBufferLeaks(), and ResourceOwnerReleaseInternal().

◆ ReadBuffer()

Buffer ReadBuffer ( Relation  reln,
BlockNumber  blockNum 
)

Definition at line 708 of file bufmgr.c.

709 {
710  return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
711 }
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:755
@ RBM_NORMAL
Definition: bufmgr.h:44

References MAIN_FORKNUM, RBM_NORMAL, and ReadBufferExtended().

Referenced by _bt_allocbuf(), _bt_getbuf(), _bt_search_insert(), _hash_getbuf(), _hash_getbuf_with_condlock_cleanup(), blbulkdelete(), blinsert(), BloomNewBuffer(), brin_getinsertbuffer(), brinGetStats(), brinGetTupleForHeapBlock(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), ginFindLeafPage(), ginFindParents(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), GinNewBuffer(), ginStepRight(), ginUpdateStats(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfixsplit(), gistGetMaxLevel(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), heap_abort_speculative(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_force_common(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_inplace_update(), heap_lock_tuple(), heap_update(), initBloomState(), pg_visibility(), pgstatginindex_internal(), read_seq_tuple(), RelationGetBufferForTuple(), ReleaseAndReadBuffer(), revmap_get_buffer(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), shiftList(), spgdoinsert(), spgGetCache(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), and spgWalk().
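
The usual pattern around ReadBuffer(), visible in most of the callers above: pin the block, take a content lock before touching the page, then drop both. A minimal read-only sketch, assuming the usual backend includes; examine_block() is illustrative, not an existing function.

static void
examine_block(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);
    Page        page;

    LockBuffer(buf, BUFFER_LOCK_SHARE);     /* content lock; the pin is already held */
    page = BufferGetPage(buf);

    elog(DEBUG1, "block %u has %u line pointers",
         blkno, (unsigned) PageGetMaxOffsetNumber(page));

    UnlockReleaseBuffer(buf);               /* releases the content lock and the pin */
}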

◆ ReadBuffer_common()

static Buffer ReadBuffer_common ( SMgrRelation  smgr,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy,
bool *  hit 
)
static

Definition at line 992 of file bufmgr.c.

995 {
996  BufferDesc *bufHdr;
997  Block bufBlock;
998  bool found;
999  IOContext io_context;
1000  IOObject io_object;
1001  bool isLocalBuf = SmgrIsTemp(smgr);
1002 
1003  *hit = false;
1004 
1005  /*
1006  * Backward compatibility path, most code should use ExtendBufferedRel()
1007  * instead, as acquiring the extension lock inside ExtendBufferedRel()
1008  * scales a lot better.
1009  */
1010  if (unlikely(blockNum == P_NEW))
1011  {
1012  uint32 flags = EB_SKIP_EXTENSION_LOCK;
1013 
1014  /*
1015  * Since no-one else can be looking at the page contents yet, there is
1016  * no difference between an exclusive lock and a cleanup-strength
1017  * lock.
1018  */
1019  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
1020  flags |= EB_LOCK_FIRST;
1021 
1022  return ExtendBufferedRel(BMR_SMGR(smgr, relpersistence),
1023  forkNum, strategy, flags);
1024  }
1025 
1026  /* Make sure we will have room to remember the buffer pin */
1027  ReservePrivateRefCountEntry();
1028 
1029  TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1030  smgr->smgr_rlocator.locator.spcOid,
1031  smgr->smgr_rlocator.locator.dbOid,
1032  smgr->smgr_rlocator.locator.relNumber,
1033  smgr->smgr_rlocator.backend);
1034 
1035  if (isLocalBuf)
1036  {
1037  /*
1038  * We do not use a BufferAccessStrategy for I/O of temporary tables.
1039  * However, in some cases, the "strategy" may not be NULL, so we can't
1040  * rely on IOContextForStrategy() to set the right IOContext for us.
1041  * This may happen in cases lik