/*-------------------------------------------------------------------------
 *
 * buf_internals.h
 *	  Internal definitions for buffer manager and the buffer replacement
 *	  strategy.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/storage/buf_internals.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef BUFMGR_INTERNALS_H
#define BUFMGR_INTERNALS_H

#include "port/atomics.h"
#include "storage/buf.h"
#include "storage/bufmgr.h"
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "storage/smgr.h"
#include "storage/spin.h"
#include "utils/relcache.h"

/*
 * Buffer state is a single 32-bit variable where the following data is
 * combined.
 *
 * - 18 bits refcount
 * - 4 bits usage count
 * - 10 bits of flags
 *
 * Combining these values allows us to perform some operations without
 * locking the buffer header, by modifying them together with a CAS loop.
 *
 * The definition of buffer state components is below.
 */
#define BUF_REFCOUNT_ONE 1
#define BUF_REFCOUNT_MASK ((1U << 18) - 1)
#define BUF_USAGECOUNT_MASK 0x003C0000U
#define BUF_USAGECOUNT_ONE (1U << 18)
#define BUF_USAGECOUNT_SHIFT 18
#define BUF_FLAG_MASK 0xFFC00000U

/* Get refcount and usagecount from buffer state */
#define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
#define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)

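/*
 * For example, a caller can decode a snapshot of the state word like this
 * (a sketch only; "bufHdr" stands for any valid BufferDesc pointer, and the
 * three values are consistent with each other only as of the single read):
 *
 *		uint32		buf_state = pg_atomic_read_u32(&bufHdr->state);
 *		uint32		refcount = BUF_STATE_GET_REFCOUNT(buf_state);
 *		uint32		usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
 *		uint32		flags = buf_state & BUF_FLAG_MASK;
 */
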
/*
 * Flags for buffer descriptors
 *
 * Note: BM_TAG_VALID essentially means that there is a buffer hashtable
 * entry associated with the buffer's tag.
 */
#define BM_LOCKED				(1U << 22)	/* buffer header is locked */
#define BM_DIRTY				(1U << 23)	/* data needs writing */
#define BM_VALID				(1U << 24)	/* data is valid */
#define BM_TAG_VALID			(1U << 25)	/* tag is assigned */
#define BM_IO_IN_PROGRESS		(1U << 26)	/* read or write in progress */
#define BM_IO_ERROR				(1U << 27)	/* previous I/O failed */
#define BM_JUST_DIRTIED			(1U << 28)	/* dirtied since write started */
#define BM_PIN_COUNT_WAITER		(1U << 29)	/* have waiter for sole pin */
#define BM_CHECKPOINT_NEEDED	(1U << 30)	/* must write for checkpoint */
#define BM_PERMANENT			(1U << 31)	/* permanent buffer (not unlogged,
											 * or init fork) */

/*
 * The maximum allowed value of usage_count represents a tradeoff between
 * accuracy and speed of the clock-sweep buffer management algorithm. A
 * large value (comparable to NBuffers) would approximate LRU semantics.
 * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
 * clock sweeps to find a free buffer, so in practice we don't want the
 * value to be very large.
 */
#define BM_MAX_USAGE_COUNT	5

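/*
 * Sketch of how the clock sweep applies this: each time the sweep hand
 * passes an unpinned buffer whose usage_count is above zero, the count is
 * decremented and the sweep moves on; an unpinned buffer found at zero is
 * a replacement candidate. (A simplified rendition of the loop in
 * StrategyGetBuffer() in freelist.c, using the header-lock functions
 * declared later in this file.)
 *
 *		uint32		buf_state = LockBufHdr(buf);
 *
 *		if (BUF_STATE_GET_REFCOUNT(buf_state) == 0)
 *		{
 *			if (BUF_STATE_GET_USAGECOUNT(buf_state) != 0)
 *			{
 *				buf_state -= BUF_USAGECOUNT_ONE;
 *				UnlockBufHdr(buf, buf_state);
 *			}
 *			else
 *				return buf;		(victim found; header lock still held)
 *		}
 *		else
 *			UnlockBufHdr(buf, buf_state);	(pinned, skip it)
 */
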
/*
 * Buffer tag identifies which disk block the buffer contains.
 *
 * Note: the BufferTag data must be sufficient to determine where to write the
 * block, without reference to pg_class or pg_tablespace entries. It's
 * possible that the backend flushing the buffer doesn't even believe the
 * relation is visible yet (its xact may have started before the xact that
 * created the rel). The storage manager must be able to cope anyway.
 *
 * Note: if there are any pad bytes in the struct, INIT_BUFFERTAG will have
 * to be fixed to zero them, since this struct is used as a hash key.
 */
typedef struct buftag
{
	RelFileNode rnode;			/* physical relation identifier */
	ForkNumber	forkNum;
	BlockNumber blockNum;		/* blknum relative to begin of reln */
} BufferTag;

#define CLEAR_BUFFERTAG(a) \
( \
	(a).rnode.spcNode = InvalidOid, \
	(a).rnode.dbNode = InvalidOid, \
	(a).rnode.relNode = InvalidOid, \
	(a).forkNum = InvalidForkNumber, \
	(a).blockNum = InvalidBlockNumber \
)

#define INIT_BUFFERTAG(a,xx_rnode,xx_forkNum,xx_blockNum) \
( \
	(a).rnode = (xx_rnode), \
	(a).forkNum = (xx_forkNum), \
	(a).blockNum = (xx_blockNum) \
)

#define BUFFERTAGS_EQUAL(a,b) \
( \
	RelFileNodeEquals((a).rnode, (b).rnode) && \
	(a).blockNum == (b).blockNum && \
	(a).forkNum == (b).forkNum \
)

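/*
 * Example (sketch): filling a tag for a given block and testing it against
 * another tag. Here "rnode", "forkNum", "blockNum" and "oldTag" stand for
 * values the caller already has:
 *
 *		BufferTag	newTag;
 *
 *		INIT_BUFFERTAG(newTag, rnode, forkNum, blockNum);
 *		if (BUFFERTAGS_EQUAL(newTag, oldTag))
 *			... the two tags name the same disk block ...
 */
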
/*
 * The shared buffer mapping table is partitioned to reduce contention.
 * To determine which partition lock a given tag requires, compute the tag's
 * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
 * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
 */
#define BufTableHashPartition(hashcode) \
	((hashcode) % NUM_BUFFER_PARTITIONS)
#define BufMappingPartitionLock(hashcode) \
	(&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + \
		BufTableHashPartition(hashcode)].lock)
#define BufMappingPartitionLockByIndex(i) \
	(&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + (i)].lock)

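/*
 * Example (sketch): the canonical mapping-table lookup sequence, as used by
 * bufmgr.c. Only the one partition covering the tag's hash code is locked:
 *
 *		BufferTag	newTag;
 *		uint32		newHash;
 *		LWLock	   *newPartitionLock;
 *		int			buf_id;
 *
 *		INIT_BUFFERTAG(newTag, rnode, forkNum, blockNum);
 *		newHash = BufTableHashCode(&newTag);
 *		newPartitionLock = BufMappingPartitionLock(newHash);
 *
 *		LWLockAcquire(newPartitionLock, LW_SHARED);
 *		buf_id = BufTableLookup(&newTag, newHash);
 *		... buf_id >= 0 means the block is already in a buffer ...
 *		LWLockRelease(newPartitionLock);
 */
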
/*
 * BufferDesc -- shared descriptor/state data for a single shared buffer.
 *
 * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
 * the tag, state or wait_backend_pid fields. In general, the buffer header
 * lock is a spinlock which is combined with flags, refcount and usagecount
 * into a single atomic variable. This layout allows us to do some operations
 * in a single atomic operation, without actually acquiring and releasing the
 * spinlock; for instance, increasing or decreasing the refcount. The buf_id
 * field never changes after initialization, so it does not need locking.
 * freeNext is protected by the buffer_strategy_lock, not the buffer header
 * lock. The LWLock can take care of itself. The buffer header lock is *not*
 * used to control access to the data in the buffer!
 *
 * It's assumed that nobody changes the state field while buffer header lock
 * is held. Thus a buffer header lock holder can do complex updates of the
 * state variable in a single write, simultaneously with lock release
 * (clearing the BM_LOCKED flag). On the other hand, updating the state
 * without holding the buffer header lock is restricted to CAS, which ensures
 * that the BM_LOCKED flag is not set. Atomic increment/decrement, OR/AND
 * etc. are not allowed.
 *
 * An exception is that if we have the buffer pinned, its tag can't change
 * underneath us, so we can examine the tag without locking the buffer header.
 * Also, in places we do one-time reads of the flags without bothering to
 * lock the buffer header; this is generally for situations where we don't
 * expect the flag bit being tested to change.
 *
 * We can't physically remove items from a disk page if another backend has
 * the buffer pinned. Hence, a backend may need to wait for all other pins
 * to go away. This is signaled by storing its own PID into
 * wait_backend_pid and setting flag bit BM_PIN_COUNT_WAITER. At present,
 * there can be only one such waiter per buffer.
 *
 * We use this same struct for local buffer headers, but the locks are not
 * used and not all of the flag bits are useful either. To avoid unnecessary
 * overhead, manipulations of the state field should be done without actual
 * atomic operations (i.e. only pg_atomic_read_u32() and
 * pg_atomic_unlocked_write_u32()).
 *
 * Be careful to avoid increasing the size of the struct when adding or
 * reordering members. Keeping it below 64 bytes (the most common CPU
 * cache line size) is fairly important for performance.
 */
typedef struct BufferDesc
{
	BufferTag	tag;			/* ID of page contained in buffer */
	int			buf_id;			/* buffer's index number (from 0) */

	/* state of the tag, containing flags, refcount and usagecount */
	pg_atomic_uint32 state;

	int			wait_backend_pid;	/* backend PID of pin-count waiter */
	int			freeNext;		/* link in freelist chain */

	LWLock		content_lock;	/* to lock access to buffer contents */
} BufferDesc;

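/*
 * Example (sketch): the lockless pin fastpath, in the style of PinBuffer()
 * in bufmgr.c. The CAS retries until it sees a value with BM_LOCKED clear,
 * which is exactly the rule described above. (WaitBufHdrUnlocked() is a
 * helper private to bufmgr.c that spins until the header lock is released.)
 *
 *		uint32		old_buf_state = pg_atomic_read_u32(&buf->state);
 *		uint32		buf_state;
 *
 *		for (;;)
 *		{
 *			if (old_buf_state & BM_LOCKED)
 *				old_buf_state = WaitBufHdrUnlocked(buf);
 *			buf_state = old_buf_state + BUF_REFCOUNT_ONE;
 *			if (pg_atomic_compare_exchange_u32(&buf->state,
 *											   &old_buf_state, buf_state))
 *				break;			(pinned; refcount incremented)
 *		}
 */
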
/*
 * Concurrent access to buffer headers has proven to be more efficient if
 * they're cache line aligned. So we force the start of the BufferDescriptors
 * array to be on a cache line boundary and force the elements to be cache
 * line sized.
 *
 * XXX: As this primarily matters in highly concurrent workloads which
 * probably all are 64bit these days, and the space wastage would be a bit
 * more noticeable on 32bit systems, we don't force the stride to be cache
 * line sized on those. If somebody does actual performance testing, we can
 * reevaluate.
 *
 * Note that local buffer descriptors aren't forced to be aligned - as there's
 * no concurrent access to those it's unlikely to be beneficial.
 *
 * We use 64bit as the cache line size here, because that's the most common
 * size. Making it bigger would be a waste of memory. Even if running on a
 * platform with either 32 or 128 byte line sizes, it's good to align to
 * boundaries and avoid false sharing.
 */
#define BUFFERDESC_PAD_TO_SIZE	(SIZEOF_VOID_P == 8 ? 64 : 1)

typedef union BufferDescPadded
{
	BufferDesc	bufferdesc;
	char		pad[BUFFERDESC_PAD_TO_SIZE];
} BufferDescPadded;

#define GetBufferDescriptor(id) (&BufferDescriptors[(id)].bufferdesc)
#define GetLocalBufferDescriptor(id) (&LocalBufferDescriptors[(id)])

#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)

#define BufferDescriptorGetIOLock(bdesc) \
	(&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
#define BufferDescriptorGetContentLock(bdesc) \
	((LWLock*) (&(bdesc)->content_lock))

extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;

/*
 * The freeNext field is either the index of the next freelist entry,
 * or one of these special values:
 */
#define FREENEXT_END_OF_LIST	(-1)
#define FREENEXT_NOT_IN_LIST	(-2)

/*
 * Functions for acquiring/releasing a shared buffer header's spinlock. Do
 * not apply these to local buffers!
 */
extern uint32 LockBufHdr(BufferDesc *desc);
#define UnlockBufHdr(desc, s) \
	do { \
		pg_write_barrier(); \
		pg_atomic_write_u32(&(desc)->state, (s) & (~BM_LOCKED)); \
	} while (0)

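/*
 * Example (sketch): the usual locked-update pattern, in the style of
 * BufferSync() marking buffers that need a checkpoint write. The final
 * state is handed back to UnlockBufHdr(), which clears BM_LOCKED and
 * publishes the updated flags in the same atomic write:
 *
 *		uint32		buf_state = LockBufHdr(desc);
 *
 *		if (buf_state & BM_DIRTY)
 *			buf_state |= BM_CHECKPOINT_NEEDED;
 *		UnlockBufHdr(desc, buf_state);
 */
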

/*
 * The PendingWriteback & WritebackContext structures are used to keep
 * information about pending flush requests to be issued to the OS.
 */
typedef struct PendingWriteback
{
	/* could store different types of pending flushes here */
	BufferTag	tag;
} PendingWriteback;

/* struct forward declared in bufmgr.h */
typedef struct WritebackContext
{
	/* pointer to the max number of writeback requests to coalesce */
	int		   *max_pending;

	/* current number of pending writeback requests */
	int			nr_pending;

	/* pending requests */
	PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES];
} WritebackContext;

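/*
 * Example (sketch): how bufmgr.c drives a writeback context. The context is
 * initialized once with a pointer to the flush-after limit (here the
 * checkpoint_flush_after GUC), then fed one tag per written-out buffer;
 * the declarations for both functions appear later in this file:
 *
 *		WritebackContext wb_context;
 *
 *		WritebackContextInit(&wb_context, &checkpoint_flush_after);
 *		...
 *		ScheduleBufferTagForWriteback(&wb_context, &bufHdr->tag);
 */
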
/* in buf_init.c */
extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
extern PGDLLIMPORT WritebackContext BackendWritebackContext;

/* in localbuf.c */
extern BufferDesc *LocalBufferDescriptors;

/* in bufmgr.c */

/*
 * Structure to sort buffers per file on checkpoints.
 *
 * This structure is allocated per buffer in shared memory, so it should be
 * kept as small as possible.
 */
typedef struct CkptSortItem
{
	Oid			tsId;
	Oid			relNode;
	ForkNumber	forkNum;
	BlockNumber blockNum;
	int			buf_id;
} CkptSortItem;

extern CkptSortItem *CkptBufferIds;

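/*
 * Sketch of the ordering this enables: sorting CkptSortItems on
 * (tsId, relNode, forkNum, blockNum) groups each relation fork's blocks
 * together in block order, so checkpoint writes reach the storage layer
 * mostly sequentially. A comparator in the style of the one bufmgr.c
 * passes to qsort():
 *
 *		static int
 *		ckpt_buforder_comparator(const void *pa, const void *pb)
 *		{
 *			const CkptSortItem *a = (const CkptSortItem *) pa;
 *			const CkptSortItem *b = (const CkptSortItem *) pb;
 *
 *			if (a->tsId != b->tsId)
 *				return (a->tsId < b->tsId) ? -1 : 1;
 *			if (a->relNode != b->relNode)
 *				return (a->relNode < b->relNode) ? -1 : 1;
 *			if (a->forkNum != b->forkNum)
 *				return (a->forkNum < b->forkNum) ? -1 : 1;
 *			if (a->blockNum != b->blockNum)
 *				return (a->blockNum < b->blockNum) ? -1 : 1;
 *			return 0;
 *		}
 */
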
/*
 * Internal buffer management routines
 */
/* bufmgr.c */
extern void WritebackContextInit(WritebackContext *context, int *max_pending);
extern void IssuePendingWritebacks(WritebackContext *context);
extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);

/* freelist.c */
extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
									 uint32 *buf_state);
extern void StrategyFreeBuffer(BufferDesc *buf);
extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
								 BufferDesc *buf);

extern int	StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
extern void StrategyNotifyBgWriter(int bgwprocno);

extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
extern bool have_free_buffer(void);

/* buf_table.c */
extern Size BufTableShmemSize(int size);
extern void InitBufTable(int size);
extern uint32 BufTableHashCode(BufferTag *tagPtr);
extern int	BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
extern int	BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);

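/*
 * Example (sketch): the insert-or-find step when mapping a new block, in
 * the style of BufferAlloc(). The caller must hold the tag's mapping
 * partition lock exclusively; BufTableInsert() returns -1 on success, or
 * the existing buffer's id if some other backend mapped the tag first:
 *
 *		uint32		newHash = BufTableHashCode(&newTag);
 *		LWLock	   *partLock = BufMappingPartitionLock(newHash);
 *		int			existing_id;
 *
 *		LWLockAcquire(partLock, LW_EXCLUSIVE);
 *		existing_id = BufTableInsert(&newTag, newHash, buf->buf_id);
 *		if (existing_id >= 0)
 *			... lost the race; use the existing buffer instead ...
 *		LWLockRelease(partLock);
 */
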
/* localbuf.c */
extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
								BlockNumber blockNum);
extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
									BlockNumber blockNum, bool *foundPtr);
extern void MarkLocalBufferDirty(Buffer buffer);
extern void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
										BlockNumber firstDelBlock);
extern void DropRelFileNodeAllLocalBuffers(RelFileNode rnode);
extern void AtEOXact_LocalBuffers(bool isCommit);

#endif							/* BUFMGR_INTERNALS_H */