PostgreSQL Source Code  git master
buf_internals.h
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * buf_internals.h
4  * Internal definitions for buffer manager and the buffer replacement
5  * strategy.
6  *
7  *
8  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
9  * Portions Copyright (c) 1994, Regents of the University of California
10  *
11  * src/include/storage/buf_internals.h
12  *
13  *-------------------------------------------------------------------------
14  */
15 #ifndef BUFMGR_INTERNALS_H
16 #define BUFMGR_INTERNALS_H
17 
18 #include "pgstat.h"
19 #include "port/atomics.h"
20 #include "storage/buf.h"
21 #include "storage/bufmgr.h"
23 #include "storage/lwlock.h"
24 #include "storage/shmem.h"
25 #include "storage/smgr.h"
26 #include "storage/spin.h"
27 #include "utils/relcache.h"
28 #include "utils/resowner.h"
29 
30 /*
31  * Buffer state is a single 32-bit variable where the following data is combined.
32  *
33  * - 18 bits refcount
34  * - 4 bits usage count
35  * - 10 bits of flags
36  *
37  * Combining these values allows us to perform some operations without locking
38  * the buffer header, by modifying them together with a CAS loop.
39  *
40  * The definition of buffer state components is below.
41  */
/* One unit of refcount; the refcount occupies the lowest bits of the state. */
42 #define BUF_REFCOUNT_ONE 1
/* Bits 0-17 (18 bits): refcount. */
43 #define BUF_REFCOUNT_MASK ((1U << 18) - 1)
/* Bits 18-21 (4 bits): usage count. */
44 #define BUF_USAGECOUNT_MASK 0x003C0000U
/* One unit of usage count, i.e. the lowest bit of the usage-count field. */
45 #define BUF_USAGECOUNT_ONE (1U << 18)
/* Right shift that moves the usage-count field down to bit 0. */
46 #define BUF_USAGECOUNT_SHIFT 18
/* Bits 22-31 (10 bits): the BM_* flag bits. */
47 #define BUF_FLAG_MASK 0xFFC00000U
48 
49 /* Get refcount and usagecount from buffer state */
50 #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
51 #define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
52 
53 /*
54  * Flags for buffer descriptors
55  *
56  * Note: BM_TAG_VALID essentially means that there is a buffer hashtable
57  * entry associated with the buffer's tag.
58  */
59 #define BM_LOCKED (1U << 22) /* buffer header is locked */
60 #define BM_DIRTY (1U << 23) /* data needs writing */
61 #define BM_VALID (1U << 24) /* data is valid */
62 #define BM_TAG_VALID (1U << 25) /* tag is assigned */
63 #define BM_IO_IN_PROGRESS (1U << 26) /* read or write in progress */
64 #define BM_IO_ERROR (1U << 27) /* previous I/O failed */
65 #define BM_JUST_DIRTIED (1U << 28) /* dirtied since write started */
66 #define BM_PIN_COUNT_WAITER (1U << 29) /* have waiter for sole pin */
67 #define BM_CHECKPOINT_NEEDED (1U << 30) /* must write for checkpoint */
68 #define BM_PERMANENT (1U << 31) /* permanent buffer (not unlogged,
69  * or init fork) */
70 /*
71  * The maximum allowed value of usage_count represents a tradeoff between
72  * accuracy and speed of the clock-sweep buffer management algorithm. A
73  * large value (comparable to NBuffers) would approximate LRU semantics.
74  * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
75  * clock sweeps to find a free buffer, so in practice we don't want the
76  * value to be very large.
77  */
78 #define BM_MAX_USAGE_COUNT 5
79 
80 /*
81  * Buffer tag identifies which disk block the buffer contains.
82  *
83  * Note: the BufferTag data must be sufficient to determine where to write the
84  * block, without reference to pg_class or pg_tablespace entries. It's
85  * possible that the backend flushing the buffer doesn't even believe the
86  * relation is visible yet (its xact may have started before the xact that
87  * created the rel). The storage manager must be able to cope anyway.
88  *
89  * Note: if there are any pad bytes in the struct, InitBufferTag will have
90  * to be fixed to zero them, since this struct is used as a hash key.
91  */
92 typedef struct buftag
93 {
94  Oid spcOid; /* tablespace oid */
95  Oid dbOid; /* database oid */
96  RelFileNumber relNumber; /* relation file number */
97  ForkNumber forkNum; /* fork number */
98  BlockNumber blockNum; /* blknum relative to begin of reln */
99 } BufferTag;
100 
/*
 * BufTagGetRelNumber
 *		Return the relation file number (relNumber member) stored in *tag.
 *		The tag is only read, never modified.
 */
101 static inline RelFileNumber
102 BufTagGetRelNumber(const BufferTag *tag)
103 {
104  return tag->relNumber;
105 }
106 
/*
 * BufTagGetForkNum
 *		Return the fork number (forkNum member) stored in *tag.
 *		The tag is only read, never modified.
 */
107 static inline ForkNumber
108 BufTagGetForkNum(const BufferTag *tag)
109 {
110  return tag->forkNum;
111 }
112 
113 static inline void
115  ForkNumber forknum)
116 {
117  tag->relNumber = relnumber;
118  tag->forkNum = forknum;
119 }
120 
121 static inline RelFileLocator
123 {
124  RelFileLocator rlocator;
125 
126  rlocator.spcOid = tag->spcOid;
127  rlocator.dbOid = tag->dbOid;
128  rlocator.relNumber = BufTagGetRelNumber(tag);
129 
130  return rlocator;
131 }
132 
133 static inline void
135 {
136  tag->spcOid = InvalidOid;
137  tag->dbOid = InvalidOid;
140 }
141 
/*
 * InitBufferTag
 *		Fill in *tag so that it identifies block blockNum of fork forkNum of
 *		the relation described by *rlocator.
 *
 * spcOid, dbOid and blockNum are assigned directly; the relation number and
 * fork number are stored via BufTagSetRelForkDetails().
 *
 * Only the named members are written; nothing here zeroes the struct as a
 * whole.  As the BufferTag comment notes, that is only correct while the
 * struct has no pad bytes, because the tag is used as a hash key.
 */
142 static inline void
143 InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator,
144  ForkNumber forkNum, BlockNumber blockNum)
145 {
146  tag->spcOid = rlocator->spcOid;
147  tag->dbOid = rlocator->dbOid;
148  BufTagSetRelForkDetails(tag, rlocator->relNumber, forkNum);
149  tag->blockNum = blockNum;
150 }
151 
/*
 * BufferTagsEqual
 *		Return true if *tag1 and *tag2 agree on every member (spcOid, dbOid,
 *		relNumber, blockNum and forkNum), false otherwise.
 *
 * The comparison is done member by member; && short-circuits on the first
 * mismatch.
 */
152 static inline bool
153 BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
154 {
155  return (tag1->spcOid == tag2->spcOid) &&
156  (tag1->dbOid == tag2->dbOid) &&
157  (tag1->relNumber == tag2->relNumber) &&
158  (tag1->blockNum == tag2->blockNum) &&
159  (tag1->forkNum == tag2->forkNum);
160 }
161 
162 static inline bool
164  const RelFileLocator *rlocator)
165 {
166  return (tag->spcOid == rlocator->spcOid) &&
167  (tag->dbOid == rlocator->dbOid) &&
168  (BufTagGetRelNumber(tag) == rlocator->relNumber);
169 }
170 
171 
172 /*
173  * The shared buffer mapping table is partitioned to reduce contention.
174  * To determine which partition lock a given tag requires, compute the tag's
175  * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
176  * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
177  */
178 static inline uint32
180 {
181  return hashcode % NUM_BUFFER_PARTITIONS;
182 }
183 
184 static inline LWLock *
186 {
188  BufTableHashPartition(hashcode)].lock;
189 }
190 
191 static inline LWLock *
193 {
195 }
196 
197 /*
198  * BufferDesc -- shared descriptor/state data for a single shared buffer.
199  *
200  * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
201  * tag, state or wait_backend_pgprocno fields. In general, buffer header lock
202  * is a spinlock which is combined with flags, refcount and usagecount into
203  * single atomic variable. This layout allows us to do some operations in a
204  * single atomic operation, without actually acquiring and releasing spinlock;
205  * for instance, increase or decrease refcount. buf_id field never changes
206  * after initialization, so does not need locking. freeNext is protected by
207  * the buffer_strategy_lock not buffer header lock. The LWLock can take care
208  * of itself. The buffer header lock is *not* used to control access to the
209  * data in the buffer!
210  *
211  * It's assumed that nobody changes the state field while buffer header lock
212  * is held. Thus buffer header lock holder can do complex updates of the
213  * state variable in a single write, simultaneously with lock release (clearing
214  * the BM_LOCKED flag). On the other hand, updating of state without holding
215  * buffer header lock is restricted to CAS, which ensures that BM_LOCKED flag
216  * is not set. Atomic increment/decrement, OR/AND etc. are not allowed.
217  *
218  * An exception is that if we have the buffer pinned, its tag can't change
219  * underneath us, so we can examine the tag without locking the buffer header.
220  * Also, in places we do one-time reads of the flags without bothering to
221  * lock the buffer header; this is generally for situations where we don't
222  * expect the flag bit being tested to be changing.
223  *
224  * We can't physically remove items from a disk page if another backend has
225  * the buffer pinned. Hence, a backend may need to wait for all other pins
226  * to go away. This is signaled by storing its own pgprocno into
227  * wait_backend_pgprocno and setting flag bit BM_PIN_COUNT_WAITER. At present,
228  * there can be only one such waiter per buffer.
229  *
230  * We use this same struct for local buffer headers, but the locks are not
231  * used and not all of the flag bits are useful either. To avoid unnecessary
232  * overhead, manipulations of the state field should be done without actual
233  * atomic operations (i.e. only pg_atomic_read_u32() and
234  * pg_atomic_unlocked_write_u32()).
235  *
236  * Be careful to avoid increasing the size of the struct when adding or
237  * reordering members. Keeping it below 64 bytes (the most common CPU
238  * cache line size) is fairly important for performance.
239  *
240  * Per-buffer I/O condition variables are currently kept outside this struct in
241  * a separate array. They could be moved in here and still fit within that
242  * limit on common systems, but for now that is not done.
243  */
244 typedef struct BufferDesc
245 {
246  BufferTag tag; /* ID of page contained in buffer */
247  int buf_id; /* buffer's index number (from 0) */
248 
249  /* state of the tag, containing flags, refcount and usagecount */
251 
252  int wait_backend_pgprocno; /* backend of pin-count waiter */
253  int freeNext; /* link in freelist chain */
254  LWLock content_lock; /* to lock access to buffer contents */
255 } BufferDesc;
256 
257 /*
258  * Concurrent access to buffer headers has proven to be more efficient if
259  * they're cache line aligned. So we force the start of the BufferDescriptors
260  * array to be on a cache line boundary and force the elements to be cache
261  * line sized.
262  *
263  * XXX: As this primarily matters in highly concurrent workloads which
264  * probably all are 64bit these days, and the space wastage would be a bit
265  * more noticeable on 32bit systems, we don't force the stride to be cache
266  * line sized on those. If somebody does actual performance testing, we can
267  * reevaluate.
268  *
269  * Note that local buffer descriptors aren't forced to be aligned - as there's
270  * no concurrent access to those it's unlikely to be beneficial.
271  *
272  * We use a 64-byte cache line size here, because that's the most common
273  * size. Making it bigger would be a waste of memory. Even if running on a
274  * platform with either 32 or 128 byte line sizes, it's good to align to
275  * boundaries and avoid false sharing.
276  */
277 #define BUFFERDESC_PAD_TO_SIZE (SIZEOF_VOID_P == 8 ? 64 : 1)
278 
279 typedef union BufferDescPadded
280 {
284 
285 /*
286  * The PendingWriteback & WritebackContext structures are used to keep
287  * information about pending flush requests to be issued to the OS.
288  */
289 typedef struct PendingWriteback
290 {
291  /* could store different types of pending flushes here */
294 
295 /* struct forward declared in bufmgr.h */
296 typedef struct WritebackContext
297 {
298  /* pointer to the max number of writeback requests to coalesce */
299  int *max_pending;
300 
301  /* current number of pending writeback requests */
302  int nr_pending;
303 
304  /* pending requests */
307 
308 /* in buf_init.c */
312 
313 /* in localbuf.c */
315 
316 
317 static inline BufferDesc *
319 {
320  return &(BufferDescriptors[id]).bufferdesc;
321 }
322 
323 static inline BufferDesc *
325 {
326  return &LocalBufferDescriptors[id];
327 }
328 
329 static inline Buffer
331 {
332  return (Buffer) (bdesc->buf_id + 1);
333 }
334 
335 static inline ConditionVariable *
337 {
338  return &(BufferIOCVArray[bdesc->buf_id]).cv;
339 }
340 
341 static inline LWLock *
343 {
344  return (LWLock *) (&bdesc->content_lock);
345 }
346 
347 /*
348  * The freeNext field is either the index of the next freelist entry,
349  * or one of these special values:
350  */
351 #define FREENEXT_END_OF_LIST (-1)
352 #define FREENEXT_NOT_IN_LIST (-2)
353 
354 /*
355  * Functions for acquiring/releasing a shared buffer header's spinlock. Do
356  * not apply these to local buffers!
357  */
358 extern uint32 LockBufHdr(BufferDesc *desc);
359 
360 static inline void
361 UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
362 {
364  pg_atomic_write_u32(&desc->state, buf_state & (~BM_LOCKED));
365 }
366 
367 /* in bufmgr.c */
368 
369 /*
370  * Structure to sort buffers per file on checkpoints.
371  *
372  * This structure is allocated per buffer in shared memory, so it should be
373  * kept as small as possible.
374  */
375 typedef struct CkptSortItem
376 {
381  int buf_id;
382 } CkptSortItem;
383 
385 
386 /* ResourceOwner callbacks to hold buffer I/Os and pins */
389 
390 /* Convenience wrappers over ResourceOwnerRemember/Forget */
391 static inline void
393 {
395 }
396 static inline void
398 {
400 }
401 static inline void
403 {
405 }
406 static inline void
408 {
410 }
411 
412 /*
413  * Internal buffer management routines
414  */
415 /* bufmgr.c */
416 extern void WritebackContextInit(WritebackContext *context, int *max_pending);
417 extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context);
418 extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context,
419  IOContext io_context, BufferTag *tag);
420 
421 /* freelist.c */
424  uint32 *buf_state, bool *from_ring);
425 extern void StrategyFreeBuffer(BufferDesc *buf);
426 extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
427  BufferDesc *buf, bool from_ring);
428 
429 extern int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
430 extern void StrategyNotifyBgWriter(int bgwprocno);
431 
432 extern Size StrategyShmemSize(void);
433 extern void StrategyInitialize(bool init);
434 extern bool have_free_buffer(void);
435 
436 /* buf_table.c */
437 extern Size BufTableShmemSize(int size);
438 extern void InitBufTable(int size);
439 extern uint32 BufTableHashCode(BufferTag *tagPtr);
440 extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
441 extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
442 extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
443 
444 /* localbuf.c */
445 extern bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount);
446 extern void UnpinLocalBuffer(Buffer buffer);
447 extern void UnpinLocalBufferNoOwner(Buffer buffer);
449  ForkNumber forkNum,
450  BlockNumber blockNum);
451 extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
452  BlockNumber blockNum, bool *foundPtr);
454  ForkNumber fork,
455  uint32 flags,
456  uint32 extend_by,
457  BlockNumber extend_upto,
458  Buffer *buffers,
459  uint32 *extended_by);
460 extern void MarkLocalBufferDirty(Buffer buffer);
461 extern void DropRelationLocalBuffers(RelFileLocator rlocator,
462  ForkNumber forkNum,
463  BlockNumber firstDelBlock);
464 extern void DropRelationAllLocalBuffers(RelFileLocator rlocator);
465 extern void AtEOXact_LocalBuffers(bool isCommit);
466 
467 #endif /* BUFMGR_INTERNALS_H */
#define pg_write_barrier()
Definition: atomics.h:157
static void pg_atomic_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:276
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
int Buffer
Definition: buf.h:23
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
struct CkptSortItem CkptSortItem
static uint32 BufTableHashPartition(uint32 hashcode)
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:148
union BufferDescPadded BufferDescPadded
void UnpinLocalBuffer(Buffer buffer)
Definition: localbuf.c:681
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition: freelist.c:394
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition: localbuf.c:116
struct BufferDesc BufferDesc
void AtEOXact_LocalBuffers(bool isCommit)
Definition: localbuf.c:819
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
static BufferDesc * GetBufferDescriptor(uint32 id)
static void UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
bool have_free_buffer(void)
Definition: freelist.c:175
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber firstDelBlock)
Definition: localbuf.c:489
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:90
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
struct WritebackContext WritebackContext
bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
Definition: localbuf.c:655
struct PendingWriteback PendingWriteback
static void BufTagSetRelForkDetails(BufferTag *tag, RelFileNumber relnumber, ForkNumber forknum)
static LWLock * BufMappingPartitionLockByIndex(uint32 index)
void InitBufTable(int size)
Definition: buf_table.c:51
static LWLock * BufMappingPartitionLock(uint32 hashcode)
PGDLLIMPORT const ResourceOwnerDesc buffer_io_resowner_desc
Definition: bufmgr.c:226
void StrategyInitialize(bool init)
Definition: freelist.c:474
static void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_LOCKED
Definition: buf_internals.h:59
void MarkLocalBufferDirty(Buffer buffer)
Definition: localbuf.c:449
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
Definition: freelist.c:196
#define BUFFERDESC_PAD_TO_SIZE
PGDLLIMPORT WritebackContext BackendWritebackContext
Definition: buf_init.c:24
static void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
Size BufTableShmemSize(int size)
Definition: buf_table.c:41
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:78
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition: localbuf.c:537
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition: bufmgr.c:5915
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition: buf_table.c:118
static void ClearBufferTag(BufferTag *tag)
static void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition: bufmgr.c:5903
void StrategyNotifyBgWriter(int bgwprocno)
Definition: freelist.c:431
struct buftag BufferTag
static void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
static ConditionVariable * BufferDescriptorGetIOCV(const BufferDesc *bdesc)
PGDLLIMPORT BufferDescPadded * BufferDescriptors
Definition: buf_init.c:21
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition: localbuf.c:69
PGDLLIMPORT ConditionVariableMinimallyPadded * BufferIOCVArray
Definition: buf_init.c:23
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: localbuf.c:313
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition: bufmgr.c:5965
PGDLLIMPORT CkptSortItem * CkptBufferIds
Definition: buf_init.c:25
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:758
void UnpinLocalBufferNoOwner(Buffer buffer)
Definition: localbuf.c:688
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
Size StrategyShmemSize(void)
Definition: freelist.c:453
PGDLLIMPORT BufferDesc * LocalBufferDescriptors
Definition: localbuf.c:44
PGDLLIMPORT const ResourceOwnerDesc buffer_pin_resowner_desc
Definition: bufmgr.c:235
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:5761
void StrategyFreeBuffer(BufferDesc *buf)
Definition: freelist.c:363
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition: freelist.c:798
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
static LWLock * BufferDescriptorGetContentLock(const BufferDesc *bdesc)
unsigned int uint32
Definition: c.h:506
#define PGDLLIMPORT
Definition: c.h:1307
size_t Size
Definition: c.h:596
int init
Definition: isn.c:75
LWLockPadded * MainLWLockArray
Definition: lwlock.c:188
#define BUFFER_MAPPING_LWLOCK_OFFSET
Definition: lwlock.h:104
#define NUM_BUFFER_PARTITIONS
Definition: lwlock.h:93
#define WRITEBACK_MAX_PENDING_FLUSHES
static char * buf
Definition: pg_test_fsync.c:73
IOContext
Definition: pgstat.h:321
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:212
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
tree context
Definition: radixtree.h:1835
Oid RelFileNumber
Definition: relpath.h:25
ForkNumber
Definition: relpath.h:56
@ InvalidForkNumber
Definition: relpath.h:57
#define InvalidRelFileNumber
Definition: relpath.h:26
void ResourceOwnerForget(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition: resowner.c:554
void ResourceOwnerRemember(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition: resowner.c:514
static pg_noinline void Size size
Definition: slab.c:607
LWLock content_lock
int wait_backend_pgprocno
BufferTag tag
pg_atomic_uint32 state
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
Definition: lwlock.h:42
RelFileNumber relNumber
PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]
BlockNumber blockNum
Definition: buf_internals.h:97
RelFileNumber relNumber
Definition: buf_internals.h:95
ForkNumber forkNum
Definition: buf_internals.h:96
Oid spcOid
Definition: buf_internals.h:93
Oid dbOid
Definition: buf_internals.h:94
Definition: type.h:95
char pad[BUFFERDESC_PAD_TO_SIZE]
BufferDesc bufferdesc
LWLock lock
Definition: lwlock.h:70