PostgreSQL Source Code  git master
buf_internals.h
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * buf_internals.h
4  * Internal definitions for buffer manager and the buffer replacement
5  * strategy.
6  *
7  *
8  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
9  * Portions Copyright (c) 1994, Regents of the University of California
10  *
11  * src/include/storage/buf_internals.h
12  *
13  *-------------------------------------------------------------------------
14  */
15 #ifndef BUFMGR_INTERNALS_H
16 #define BUFMGR_INTERNALS_H
17 
#include "pgstat.h"
#include "port/atomics.h"
#include "storage/buf.h"
#include "storage/bufmgr.h"
#include "storage/condition_variable.h"
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "storage/smgr.h"
#include "storage/spin.h"
#include "utils/relcache.h"
#include "utils/resowner.h"
30 
/*
 * Buffer state is a single 32-bit variable in which the following data is
 * combined:
 *
 * - 18 bits refcount
 * - 4 bits usage count
 * - 10 bits of flags
 *
 * Combining these values allows some operations to be performed without
 * locking the buffer header, by modifying them together with a CAS loop.
 *
 * The definition of buffer state components is below.
 */
#define BUF_REFCOUNT_ONE 1
#define BUF_REFCOUNT_MASK ((1U << 18) - 1)
#define BUF_USAGECOUNT_MASK 0x003C0000U
#define BUF_USAGECOUNT_ONE (1U << 18)
#define BUF_USAGECOUNT_SHIFT 18
#define BUF_FLAG_MASK 0xFFC00000U

/* Get refcount and usagecount from buffer state */
#define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
#define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
53 
/*
 * Flags for buffer descriptors
 *
 * Each flag occupies one of the ten bits covered by BUF_FLAG_MASK
 * (bits 22..31 of the buffer state word).
 *
 * Note: BM_TAG_VALID essentially means that there is a buffer hashtable
 * entry associated with the buffer's tag.
 */
#define BM_LOCKED (1U << 22)			/* buffer header is locked */
#define BM_DIRTY (1U << 23)				/* data needs writing */
#define BM_VALID (1U << 24)				/* data is valid */
#define BM_TAG_VALID (1U << 25)			/* tag is assigned */
#define BM_IO_IN_PROGRESS (1U << 26)	/* read or write in progress */
#define BM_IO_ERROR (1U << 27)			/* previous I/O failed */
#define BM_JUST_DIRTIED (1U << 28)		/* dirtied since write started */
#define BM_PIN_COUNT_WAITER (1U << 29)	/* have waiter for sole pin */
#define BM_CHECKPOINT_NEEDED (1U << 30) /* must write for checkpoint */
#define BM_PERMANENT (1U << 31)			/* permanent buffer (not unlogged,
										 * or init fork) */
/*
 * The maximum allowed value of usage_count represents a tradeoff between
 * accuracy and speed of the clock-sweep buffer management algorithm.  A
 * large value (comparable to NBuffers) would approximate LRU semantics.
 * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
 * clock sweeps to find a free buffer, so in practice we don't want the
 * value to be very large.
 */
#define BM_MAX_USAGE_COUNT 5
80 
81 /*
82  * Buffer tag identifies which disk block the buffer contains.
83  *
84  * Note: the BufferTag data must be sufficient to determine where to write the
85  * block, without reference to pg_class or pg_tablespace entries. It's
86  * possible that the backend flushing the buffer doesn't even believe the
87  * relation is visible yet (its xact may have started before the xact that
88  * created the rel). The storage manager must be able to cope anyway.
89  *
90  * Note: if there's any pad bytes in the struct, InitBufferTag will have
91  * to be fixed to zero them, since this struct is used as a hash key.
92  */
93 typedef struct buftag
94 {
95  Oid spcOid; /* tablespace oid */
96  Oid dbOid; /* database oid */
97  RelFileNumber relNumber; /* relation file number */
98  ForkNumber forkNum; /* fork number */
99  BlockNumber blockNum; /* blknum relative to begin of reln */
100 } BufferTag;
101 
102 static inline RelFileNumber
103 BufTagGetRelNumber(const BufferTag *tag)
104 {
105  return tag->relNumber;
106 }
107 
108 static inline ForkNumber
109 BufTagGetForkNum(const BufferTag *tag)
110 {
111  return tag->forkNum;
112 }
113 
114 static inline void
116  ForkNumber forknum)
117 {
118  tag->relNumber = relnumber;
119  tag->forkNum = forknum;
120 }
121 
122 static inline RelFileLocator
124 {
125  RelFileLocator rlocator;
126 
127  rlocator.spcOid = tag->spcOid;
128  rlocator.dbOid = tag->dbOid;
129  rlocator.relNumber = BufTagGetRelNumber(tag);
130 
131  return rlocator;
132 }
133 
134 static inline void
136 {
137  tag->spcOid = InvalidOid;
138  tag->dbOid = InvalidOid;
141 }
142 
143 static inline void
144 InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator,
145  ForkNumber forkNum, BlockNumber blockNum)
146 {
147  tag->spcOid = rlocator->spcOid;
148  tag->dbOid = rlocator->dbOid;
149  BufTagSetRelForkDetails(tag, rlocator->relNumber, forkNum);
150  tag->blockNum = blockNum;
151 }
152 
153 static inline bool
154 BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
155 {
156  return (tag1->spcOid == tag2->spcOid) &&
157  (tag1->dbOid == tag2->dbOid) &&
158  (tag1->relNumber == tag2->relNumber) &&
159  (tag1->blockNum == tag2->blockNum) &&
160  (tag1->forkNum == tag2->forkNum);
161 }
162 
163 static inline bool
165  const RelFileLocator *rlocator)
166 {
167  return (tag->spcOid == rlocator->spcOid) &&
168  (tag->dbOid == rlocator->dbOid) &&
169  (BufTagGetRelNumber(tag) == rlocator->relNumber);
170 }
171 
172 
173 /*
174  * The shared buffer mapping table is partitioned to reduce contention.
175  * To determine which partition lock a given tag requires, compute the tag's
176  * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
177  * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
178  */
179 static inline uint32
181 {
182  return hashcode % NUM_BUFFER_PARTITIONS;
183 }
184 
185 static inline LWLock *
187 {
189  BufTableHashPartition(hashcode)].lock;
190 }
191 
192 static inline LWLock *
194 {
196 }
197 
198 /*
199  * BufferDesc -- shared descriptor/state data for a single shared buffer.
200  *
201  * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
202  * tag, state or wait_backend_pgprocno fields. In general, buffer header lock
203  * is a spinlock which is combined with flags, refcount and usagecount into
204  * single atomic variable. This layout allow us to do some operations in a
205  * single atomic operation, without actually acquiring and releasing spinlock;
206  * for instance, increase or decrease refcount. buf_id field never changes
207  * after initialization, so does not need locking. freeNext is protected by
208  * the buffer_strategy_lock not buffer header lock. The LWLock can take care
209  * of itself. The buffer header lock is *not* used to control access to the
210  * data in the buffer!
211  *
212  * It's assumed that nobody changes the state field while buffer header lock
213  * is held. Thus buffer header lock holder can do complex updates of the
214  * state variable in single write, simultaneously with lock release (cleaning
215  * BM_LOCKED flag). On the other hand, updating of state without holding
216  * buffer header lock is restricted to CAS, which ensures that BM_LOCKED flag
217  * is not set. Atomic increment/decrement, OR/AND etc. are not allowed.
218  *
219  * An exception is that if we have the buffer pinned, its tag can't change
220  * underneath us, so we can examine the tag without locking the buffer header.
221  * Also, in places we do one-time reads of the flags without bothering to
222  * lock the buffer header; this is generally for situations where we don't
223  * expect the flag bit being tested to be changing.
224  *
225  * We can't physically remove items from a disk page if another backend has
226  * the buffer pinned. Hence, a backend may need to wait for all other pins
227  * to go away. This is signaled by storing its own pgprocno into
228  * wait_backend_pgprocno and setting flag bit BM_PIN_COUNT_WAITER. At present,
229  * there can be only one such waiter per buffer.
230  *
231  * We use this same struct for local buffer headers, but the locks are not
232  * used and not all of the flag bits are useful either. To avoid unnecessary
233  * overhead, manipulations of the state field should be done without actual
234  * atomic operations (i.e. only pg_atomic_read_u32() and
235  * pg_atomic_unlocked_write_u32()).
236  *
237  * Be careful to avoid increasing the size of the struct when adding or
238  * reordering members. Keeping it below 64 bytes (the most common CPU
239  * cache line size) is fairly important for performance.
240  *
241  * Per-buffer I/O condition variables are currently kept outside this struct in
242  * a separate array. They could be moved in here and still fit within that
243  * limit on common systems, but for now that is not done.
244  */
245 typedef struct BufferDesc
246 {
247  BufferTag tag; /* ID of page contained in buffer */
248  int buf_id; /* buffer's index number (from 0) */
249 
250  /* state of the tag, containing flags, refcount and usagecount */
252 
253  int wait_backend_pgprocno; /* backend of pin-count waiter */
254  int freeNext; /* link in freelist chain */
255  LWLock content_lock; /* to lock access to buffer contents */
256 } BufferDesc;
257 
258 /*
259  * Concurrent access to buffer headers has proven to be more efficient if
260  * they're cache line aligned. So we force the start of the BufferDescriptors
261  * array to be on a cache line boundary and force the elements to be cache
262  * line sized.
263  *
264  * XXX: As this is primarily matters in highly concurrent workloads which
265  * probably all are 64bit these days, and the space wastage would be a bit
266  * more noticeable on 32bit systems, we don't force the stride to be cache
267  * line sized on those. If somebody does actual performance testing, we can
268  * reevaluate.
269  *
270  * Note that local buffer descriptors aren't forced to be aligned - as there's
271  * no concurrent access to those it's unlikely to be beneficial.
272  *
273  * We use a 64-byte cache line size here, because that's the most common
274  * size. Making it bigger would be a waste of memory. Even if running on a
275  * platform with either 32 or 128 byte line sizes, it's good to align to
276  * boundaries and avoid false sharing.
277  */
278 #define BUFFERDESC_PAD_TO_SIZE (SIZEOF_VOID_P == 8 ? 64 : 1)
279 
280 typedef union BufferDescPadded
281 {
285 
286 /*
287  * The PendingWriteback & WritebackContext structure are used to keep
288  * information about pending flush requests to be issued to the OS.
289  */
290 typedef struct PendingWriteback
291 {
292  /* could store different types of pending flushes here */
295 
296 /* struct forward declared in bufmgr.h */
297 typedef struct WritebackContext
298 {
299  /* pointer to the max number of writeback requests to coalesce */
300  int *max_pending;
301 
302  /* current number of pending writeback requests */
303  int nr_pending;
304 
305  /* pending requests */
308 
309 /* in buf_init.c */
313 
314 /* in localbuf.c */
316 
317 
318 static inline BufferDesc *
320 {
321  return &(BufferDescriptors[id]).bufferdesc;
322 }
323 
324 static inline BufferDesc *
326 {
327  return &LocalBufferDescriptors[id];
328 }
329 
330 static inline Buffer
332 {
333  return (Buffer) (bdesc->buf_id + 1);
334 }
335 
336 static inline ConditionVariable *
338 {
339  return &(BufferIOCVArray[bdesc->buf_id]).cv;
340 }
341 
342 static inline LWLock *
344 {
345  return (LWLock *) (&bdesc->content_lock);
346 }
347 
/*
 * The freeNext field is either the index of the next freelist entry,
 * or one of these special values:
 */
#define FREENEXT_END_OF_LIST	(-1)
#define FREENEXT_NOT_IN_LIST	(-2)
354 
355 /*
356  * Functions for acquiring/releasing a shared buffer header's spinlock. Do
357  * not apply these to local buffers!
358  */
359 extern uint32 LockBufHdr(BufferDesc *desc);
360 
361 static inline void
362 UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
363 {
365  pg_atomic_write_u32(&desc->state, buf_state & (~BM_LOCKED));
366 }
367 
368 /* in bufmgr.c */
369 
370 /*
371  * Structure to sort buffers per file on checkpoints.
372  *
373  * This structure is allocated per buffer in shared memory, so it should be
374  * kept as small as possible.
375  */
376 typedef struct CkptSortItem
377 {
382  int buf_id;
383 } CkptSortItem;
384 
386 
387 /* ResourceOwner callbacks to hold buffer I/Os and pins */
390 
391 /* Convenience wrappers over ResourceOwnerRemember/Forget */
392 static inline void
394 {
396 }
397 static inline void
399 {
401 }
402 static inline void
404 {
406 }
407 static inline void
409 {
411 }
412 
413 /*
414  * Internal buffer management routines
415  */
416 /* bufmgr.c */
417 extern void WritebackContextInit(WritebackContext *context, int *max_pending);
418 extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context);
419 extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context,
420  IOContext io_context, BufferTag *tag);
421 
422 /* freelist.c */
425  uint32 *buf_state, bool *from_ring);
426 extern void StrategyFreeBuffer(BufferDesc *buf);
427 extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
428  BufferDesc *buf, bool from_ring);
429 
430 extern int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
431 extern void StrategyNotifyBgWriter(int bgwprocno);
432 
433 extern Size StrategyShmemSize(void);
434 extern void StrategyInitialize(bool init);
435 extern bool have_free_buffer(void);
436 
437 /* buf_table.c */
438 extern Size BufTableShmemSize(int size);
439 extern void InitBufTable(int size);
440 extern uint32 BufTableHashCode(BufferTag *tagPtr);
441 extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
442 extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
443 extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
444 
445 /* localbuf.c */
446 extern bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount);
447 extern void UnpinLocalBuffer(Buffer buffer);
448 extern void UnpinLocalBufferNoOwner(Buffer buffer);
450  ForkNumber forkNum,
451  BlockNumber blockNum);
452 extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
453  BlockNumber blockNum, bool *foundPtr);
455  ForkNumber fork,
456  uint32 flags,
457  uint32 extend_by,
458  BlockNumber extend_upto,
459  Buffer *buffers,
460  uint32 *extended_by);
461 extern void MarkLocalBufferDirty(Buffer buffer);
462 extern void DropRelationLocalBuffers(RelFileLocator rlocator,
463  ForkNumber forkNum,
464  BlockNumber firstDelBlock);
465 extern void DropRelationAllLocalBuffers(RelFileLocator rlocator);
466 extern void AtEOXact_LocalBuffers(bool isCommit);
467 
468 #endif /* BUFMGR_INTERNALS_H */
#define pg_write_barrier()
Definition: atomics.h:152
static void pg_atomic_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:271
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
int Buffer
Definition: buf.h:23
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
struct CkptSortItem CkptSortItem
static uint32 BufTableHashPartition(uint32 hashcode)
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:148
union BufferDescPadded BufferDescPadded
void UnpinLocalBuffer(Buffer buffer)
Definition: localbuf.c:681
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition: freelist.c:394
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition: localbuf.c:117
struct BufferDesc BufferDesc
void AtEOXact_LocalBuffers(bool isCommit)
Definition: localbuf.c:819
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
static BufferDesc * GetBufferDescriptor(uint32 id)
static void UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
bool have_free_buffer(void)
Definition: freelist.c:175
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber firstDelBlock)
Definition: localbuf.c:489
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:90
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
struct WritebackContext WritebackContext
bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
Definition: localbuf.c:655
struct PendingWriteback PendingWriteback
static void BufTagSetRelForkDetails(BufferTag *tag, RelFileNumber relnumber, ForkNumber forknum)
static LWLock * BufMappingPartitionLockByIndex(uint32 index)
void InitBufTable(int size)
Definition: buf_table.c:51
static LWLock * BufMappingPartitionLock(uint32 hashcode)
PGDLLIMPORT const ResourceOwnerDesc buffer_io_resowner_desc
Definition: bufmgr.c:214
void StrategyInitialize(bool init)
Definition: freelist.c:474
static void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
#define BM_LOCKED
Definition: buf_internals.h:60
void MarkLocalBufferDirty(Buffer buffer)
Definition: localbuf.c:449
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
Definition: freelist.c:196
#define BUFFERDESC_PAD_TO_SIZE
PGDLLIMPORT WritebackContext BackendWritebackContext
Definition: buf_init.c:24
static void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
Size BufTableShmemSize(int size)
Definition: buf_table.c:41
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:78
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition: localbuf.c:537
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition: bufmgr.c:5544
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition: buf_table.c:118
static void ClearBufferTag(BufferTag *tag)
static void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition: bufmgr.c:5532
void StrategyNotifyBgWriter(int bgwprocno)
Definition: freelist.c:431
struct buftag BufferTag
static void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
static ConditionVariable * BufferDescriptorGetIOCV(const BufferDesc *bdesc)
PGDLLIMPORT BufferDescPadded * BufferDescriptors
Definition: buf_init.c:21
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition: localbuf.c:69
PGDLLIMPORT ConditionVariableMinimallyPadded * BufferIOCVArray
Definition: buf_init.c:23
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: localbuf.c:313
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition: bufmgr.c:5589
PGDLLIMPORT CkptSortItem * CkptBufferIds
Definition: buf_init.c:25
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:716
void UnpinLocalBufferNoOwner(Buffer buffer)
Definition: localbuf.c:688
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
Size StrategyShmemSize(void)
Definition: freelist.c:453
PGDLLIMPORT BufferDesc * LocalBufferDescriptors
Definition: localbuf.c:44
PGDLLIMPORT const ResourceOwnerDesc buffer_pin_resowner_desc
Definition: bufmgr.c:223
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:5390
void StrategyFreeBuffer(BufferDesc *buf)
Definition: freelist.c:363
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition: freelist.c:756
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
static LWLock * BufferDescriptorGetContentLock(const BufferDesc *bdesc)
unsigned int uint32
Definition: c.h:493
#define PGDLLIMPORT
Definition: c.h:1303
size_t Size
Definition: c.h:592
int init
Definition: isn.c:75
LWLockPadded * MainLWLockArray
Definition: lwlock.c:190
#define BUFFER_MAPPING_LWLOCK_OFFSET
Definition: lwlock.h:104
#define NUM_BUFFER_PARTITIONS
Definition: lwlock.h:93
#define WRITEBACK_MAX_PENDING_FLUSHES
static char * buf
Definition: pg_test_fsync.c:73
IOContext
Definition: pgstat.h:287
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:212
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
tree context
Definition: radixtree.h:1797
Oid RelFileNumber
Definition: relpath.h:25
ForkNumber
Definition: relpath.h:48
@ InvalidForkNumber
Definition: relpath.h:49
#define InvalidRelFileNumber
Definition: relpath.h:26
void ResourceOwnerForget(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition: resowner.c:554
void ResourceOwnerRemember(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition: resowner.c:514
static pg_noinline void Size size
Definition: slab.c:607
LWLock content_lock
int wait_backend_pgprocno
BufferTag tag
pg_atomic_uint32 state
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
Definition: lwlock.h:42
RelFileNumber relNumber
PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]
BlockNumber blockNum
Definition: buf_internals.h:98
RelFileNumber relNumber
Definition: buf_internals.h:96
ForkNumber forkNum
Definition: buf_internals.h:97
Oid spcOid
Definition: buf_internals.h:94
Oid dbOid
Definition: buf_internals.h:95
Definition: type.h:95
char pad[BUFFERDESC_PAD_TO_SIZE]
BufferDesc bufferdesc
LWLock lock
Definition: lwlock.h:70