PostgreSQL Source Code  git master
slru.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * slru.c
4  * Simple LRU buffering for wrap-around-able permanent metadata
5  *
6  * This module is used to maintain various pieces of transaction status
7  * indexed by TransactionId (such as commit status, parent transaction ID,
8  * commit timestamp), as well as storage for multixacts, serializable
9  * isolation locks and NOTIFY traffic. Extensions can define their own
10  * SLRUs, too.
11  *
12  * Under ordinary circumstances we expect that write traffic will occur
13  * mostly to the latest page (and to the just-prior page, soon after a
14  * page transition). Read traffic will probably touch a larger span of
15  * pages, but a relatively small number of buffers should be sufficient.
16  *
17  * We use a simple least-recently-used scheme to manage a pool of shared
18  * page buffers, split in banks by the lowest bits of the page number, and
19  * the management algorithm only processes the bank to which the desired
20  * page belongs, so a linear search is sufficient; there's no need for a
21  * hashtable or anything fancy. The algorithm is straight LRU except that
22  * we will never swap out the latest page (since we know it's going to be
23  * hit again eventually).
24  *
25  * We use per-bank control LWLocks to protect the shared data structures,
26  * plus per-buffer LWLocks that synchronize I/O for each buffer. The
27  * bank's control lock must be held to examine or modify any of the bank's
28  * shared state. A process that is reading in or writing out a page
29  * buffer does not hold the control lock, only the per-buffer lock for the
30  * buffer it is working on. One exception is latest_page_number, which is
31  * read and written using atomic ops.
32  *
33  * "Holding the bank control lock" means exclusive lock in all cases
34  * except for SimpleLruReadPage_ReadOnly(); see comments for
35  * SlruRecentlyUsed() for the implications of that.
36  *
37  * When initiating I/O on a buffer, we acquire the per-buffer lock exclusively
38  * before releasing the control lock. The per-buffer lock is released after
39  * completing the I/O, re-acquiring the control lock, and updating the shared
40  * state. (Deadlock is not possible here, because we never try to initiate
41  * I/O when someone else is already doing I/O on the same buffer.)
42  * To wait for I/O to complete, release the control lock, acquire the
43  * per-buffer lock in shared mode, immediately release the per-buffer lock,
44  * reacquire the control lock, and then recheck state (since arbitrary things
45  * could have happened while we didn't have the lock).
46  *
47  * As with the regular buffer manager, it is possible for another process
48  * to re-dirty a page that is currently being written out. This is handled
49  * by re-setting the page's page_dirty flag.
50  *
51  *
52  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
53  * Portions Copyright (c) 1994, Regents of the University of California
54  *
55  * src/backend/access/transam/slru.c
56  *
57  *-------------------------------------------------------------------------
58  */
59 #include "postgres.h"
60 
61 #include <fcntl.h>
62 #include <sys/stat.h>
63 #include <unistd.h>
64 
65 #include "access/slru.h"
66 #include "access/transam.h"
67 #include "access/xlog.h"
68 #include "access/xlogutils.h"
69 #include "miscadmin.h"
70 #include "pgstat.h"
71 #include "storage/fd.h"
72 #include "storage/shmem.h"
73 #include "utils/guc_hooks.h"
74 
75 /*
76  * Converts segment number to the filename of the segment.
77  *
78  * "path" should point to a buffer at least MAXPGPATH characters long.
79  *
80  * If ctl->long_segment_names is true, segno can be in the range [0, 2^60-1].
81  * The resulting file name is made of 15 characters, e.g. dir/123456789ABCDEF.
82  *
83  * If ctl->long_segment_names is false, segno can be in the range [0, 2^24-1].
84  * The resulting file name is made of 4 to 6 characters, as of:
85  *
86  * dir/1234 for [0, 2^16-1]
87  * dir/12345 for [2^16, 2^20-1]
88  * dir/123456 for [2^20, 2^24-1]
89  */
90 static inline int
91 SlruFileName(SlruCtl ctl, char *path, int64 segno)
92 {
93  if (ctl->long_segment_names)
94  {
95  /*
96  * We could use 16 characters here but the disadvantage would be that
97  * the SLRU segments will be hard to distinguish from WAL segments.
98  *
99  * For this reason we use 15 characters. It is enough but also means
100  * that in the future we can't decrease SLRU_PAGES_PER_SEGMENT easily.
101  */
102  Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFFFFFFFFFFF));
103  return snprintf(path, MAXPGPATH, "%s/%015llX", ctl->Dir,
104  (long long) segno);
105  }
106  else
107  {
108  /*
109  * Despite the fact that %04X format string is used up to 24 bit
110  * integers are allowed. See SlruCorrectSegmentFilenameLength()
111  */
112  Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFF));
113  return snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir,
114  (unsigned int) segno);
115  }
116 }
117 
118 /*
119  * During SimpleLruWriteAll(), we will usually not need to write more than one
120  * or two physical files, but we may need to write several pages per file. We
121  * can consolidate the I/O requests by leaving files open until control returns
122  * to SimpleLruWriteAll(). This data structure remembers which files are open.
123  */
124 #define MAX_WRITEALL_BUFFERS 16
125 
126 typedef struct SlruWriteAllData
127 {
128  int num_files; /* # files actually open */
129  int fd[MAX_WRITEALL_BUFFERS]; /* their FD's */
130  int64 segno[MAX_WRITEALL_BUFFERS]; /* their log seg#s */
132 
134 
135 
136 /*
137  * Bank size for the slot array. Pages are assigned a bank according to their
138  * page number, with each bank being this size. We want a power of 2 so that
139  * we can determine the bank number for a page with just bit shifting; we also
140  * want to keep the bank size small so that LRU victim search is fast. 16
141  * buffers per bank seems a good number.
142  */
143 #define SLRU_BANK_BITSHIFT 4
144 #define SLRU_BANK_SIZE (1 << SLRU_BANK_BITSHIFT)
145 
146 /*
147  * Macro to get the bank number to which the slot belongs.
148  */
149 #define SlotGetBankNumber(slotno) ((slotno) >> SLRU_BANK_BITSHIFT)
150 
151 
152 /*
153  * Populate a file tag describing a segment file. We only use the segment
154  * number, since we can derive everything else we need by having separate
155  * sync handler functions for clog, multixact etc.
156  */
157 #define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \
158 ( \
159  memset(&(a), 0, sizeof(FileTag)), \
160  (a).handler = (xx_handler), \
161  (a).segno = (xx_segno) \
162 )
163 
164 /* Saved info for SlruReportIOError */
165 typedef enum
166 {
174 
176 static int slru_errno;
177 
178 
179 static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
180 static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
181 static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata);
182 static bool SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno);
183 static bool SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno,
184  SlruWriteAll fdata);
185 static void SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid);
186 static int SlruSelectLRUPage(SlruCtl ctl, int64 pageno);
187 
188 static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename,
189  int64 segpage, void *data);
190 static void SlruInternalDeleteSegment(SlruCtl ctl, int64 segno);
191 static inline void SlruRecentlyUsed(SlruShared shared, int slotno);
192 
193 
194 /*
195  * Initialization of shared memory
196  */
197 
198 Size
199 SimpleLruShmemSize(int nslots, int nlsns)
200 {
201  int nbanks = nslots / SLRU_BANK_SIZE;
202  Size sz;
203 
204  Assert(nslots <= SLRU_MAX_ALLOWED_BUFFERS);
205  Assert(nslots % SLRU_BANK_SIZE == 0);
206 
207  /* we assume nslots isn't so large as to risk overflow */
208  sz = MAXALIGN(sizeof(SlruSharedData));
209  sz += MAXALIGN(nslots * sizeof(char *)); /* page_buffer[] */
210  sz += MAXALIGN(nslots * sizeof(SlruPageStatus)); /* page_status[] */
211  sz += MAXALIGN(nslots * sizeof(bool)); /* page_dirty[] */
212  sz += MAXALIGN(nslots * sizeof(int64)); /* page_number[] */
213  sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */
214  sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */
215  sz += MAXALIGN(nbanks * sizeof(LWLockPadded)); /* bank_locks[] */
216  sz += MAXALIGN(nbanks * sizeof(int)); /* bank_cur_lru_count[] */
217 
218  if (nlsns > 0)
219  sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */
220 
221  return BUFFERALIGN(sz) + BLCKSZ * nslots;
222 }
223 
224 /*
225  * Determine a number of SLRU buffers to use.
226  *
227  * We simply divide shared_buffers by the divisor given and cap
228  * that at the maximum given; but always at least SLRU_BANK_SIZE.
229  * Round down to the nearest multiple of SLRU_BANK_SIZE.
230  */
231 int
232 SimpleLruAutotuneBuffers(int divisor, int max)
233 {
234  return Min(max - (max % SLRU_BANK_SIZE),
236  NBuffers / divisor - (NBuffers / divisor) % SLRU_BANK_SIZE));
237 }
238 
239 /*
240  * Initialize, or attach to, a simple LRU cache in shared memory.
241  *
242  * ctl: address of local (unshared) control structure.
243  * name: name of SLRU. (This is user-visible, pick with care!)
244  * nslots: number of page slots to use.
245  * nlsns: number of LSN groups per page (set to zero if not relevant).
246  * subdir: PGDATA-relative subdirectory that will contain the files.
247  * buffer_tranche_id: tranche ID to use for the SLRU's per-buffer LWLocks.
248  * bank_tranche_id: tranche ID to use for the bank LWLocks.
249  * sync_handler: which set of functions to use to handle sync requests
250  */
251 void
252 SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
253  const char *subdir, int buffer_tranche_id, int bank_tranche_id,
254  SyncRequestHandler sync_handler, bool long_segment_names)
255 {
256  SlruShared shared;
257  bool found;
258  int nbanks = nslots / SLRU_BANK_SIZE;
259 
260  Assert(nslots <= SLRU_MAX_ALLOWED_BUFFERS);
261 
262  shared = (SlruShared) ShmemInitStruct(name,
263  SimpleLruShmemSize(nslots, nlsns),
264  &found);
265 
266  if (!IsUnderPostmaster)
267  {
268  /* Initialize locks and shared memory area */
269  char *ptr;
270  Size offset;
271 
272  Assert(!found);
273 
274  memset(shared, 0, sizeof(SlruSharedData));
275 
276  shared->num_slots = nslots;
277  shared->lsn_groups_per_page = nlsns;
278 
280 
282 
283  ptr = (char *) shared;
284  offset = MAXALIGN(sizeof(SlruSharedData));
285  shared->page_buffer = (char **) (ptr + offset);
286  offset += MAXALIGN(nslots * sizeof(char *));
287  shared->page_status = (SlruPageStatus *) (ptr + offset);
288  offset += MAXALIGN(nslots * sizeof(SlruPageStatus));
289  shared->page_dirty = (bool *) (ptr + offset);
290  offset += MAXALIGN(nslots * sizeof(bool));
291  shared->page_number = (int64 *) (ptr + offset);
292  offset += MAXALIGN(nslots * sizeof(int64));
293  shared->page_lru_count = (int *) (ptr + offset);
294  offset += MAXALIGN(nslots * sizeof(int));
295 
296  /* Initialize LWLocks */
297  shared->buffer_locks = (LWLockPadded *) (ptr + offset);
298  offset += MAXALIGN(nslots * sizeof(LWLockPadded));
299  shared->bank_locks = (LWLockPadded *) (ptr + offset);
300  offset += MAXALIGN(nbanks * sizeof(LWLockPadded));
301  shared->bank_cur_lru_count = (int *) (ptr + offset);
302  offset += MAXALIGN(nbanks * sizeof(int));
303 
304  if (nlsns > 0)
305  {
306  shared->group_lsn = (XLogRecPtr *) (ptr + offset);
307  offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
308  }
309 
310  ptr += BUFFERALIGN(offset);
311  for (int slotno = 0; slotno < nslots; slotno++)
312  {
313  LWLockInitialize(&shared->buffer_locks[slotno].lock,
314  buffer_tranche_id);
315 
316  shared->page_buffer[slotno] = ptr;
317  shared->page_status[slotno] = SLRU_PAGE_EMPTY;
318  shared->page_dirty[slotno] = false;
319  shared->page_lru_count[slotno] = 0;
320  ptr += BLCKSZ;
321  }
322 
323  /* Initialize the slot banks. */
324  for (int bankno = 0; bankno < nbanks; bankno++)
325  {
326  LWLockInitialize(&shared->bank_locks[bankno].lock, bank_tranche_id);
327  shared->bank_cur_lru_count[bankno] = 0;
328  }
329 
330  /* Should fit to estimated shmem size */
331  Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
332  }
333  else
334  {
335  Assert(found);
336  Assert(shared->num_slots == nslots);
337  }
338 
339  /*
340  * Initialize the unshared control struct, including directory path. We
341  * assume caller set PagePrecedes.
342  */
343  ctl->shared = shared;
344  ctl->sync_handler = sync_handler;
345  ctl->long_segment_names = long_segment_names;
346  ctl->bank_mask = (nslots / SLRU_BANK_SIZE) - 1;
347  strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir));
348 }
349 
350 /*
351  * Helper function for GUC check_hook to check whether slru buffers are in
352  * multiples of SLRU_BANK_SIZE.
353  */
354 bool
355 check_slru_buffers(const char *name, int *newval)
356 {
357  /* Valid values are multiples of SLRU_BANK_SIZE */
358  if (*newval % SLRU_BANK_SIZE == 0)
359  return true;
360 
361  GUC_check_errdetail("\"%s\" must be a multiple of %d", name,
363  return false;
364 }
365 
366 /*
367  * Initialize (or reinitialize) a page to zeroes.
368  *
369  * The page is not actually written, just set up in shared memory.
370  * The slot number of the new page is returned.
371  *
372  * Bank lock must be held at entry, and will be held at exit.
373  */
374 int
376 {
377  SlruShared shared = ctl->shared;
378  int slotno;
379 
381 
382  /* Find a suitable buffer slot for the page */
383  slotno = SlruSelectLRUPage(ctl, pageno);
384  Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
385  (shared->page_status[slotno] == SLRU_PAGE_VALID &&
386  !shared->page_dirty[slotno]) ||
387  shared->page_number[slotno] == pageno);
388 
389  /* Mark the slot as containing this page */
390  shared->page_number[slotno] = pageno;
391  shared->page_status[slotno] = SLRU_PAGE_VALID;
392  shared->page_dirty[slotno] = true;
393  SlruRecentlyUsed(shared, slotno);
394 
395  /* Set the buffer to zeroes */
396  MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
397 
398  /* Set the LSNs for this new page to zero */
399  SimpleLruZeroLSNs(ctl, slotno);
400 
401  /*
402  * Assume this page is now the latest active page.
403  *
404  * Note that because both this routine and SlruSelectLRUPage run with
405  * ControlLock held, it is not possible for this to be zeroing a page that
406  * SlruSelectLRUPage is going to evict simultaneously. Therefore, there's
407  * no memory barrier here.
408  */
409  pg_atomic_write_u64(&shared->latest_page_number, pageno);
410 
411  /* update the stats counter of zeroed pages */
413 
414  return slotno;
415 }
416 
417 /*
418  * Zero all the LSNs we store for this slru page.
419  *
420  * This should be called each time we create a new page, and each time we read
421  * in a page from disk into an existing buffer. (Such an old page cannot
422  * have any interesting LSNs, since we'd have flushed them before writing
423  * the page in the first place.)
424  *
425  * This assumes that InvalidXLogRecPtr is bitwise-all-0.
426  */
427 static void
429 {
430  SlruShared shared = ctl->shared;
431 
432  if (shared->lsn_groups_per_page > 0)
433  MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
434  shared->lsn_groups_per_page * sizeof(XLogRecPtr));
435 }
436 
437 /*
438  * Wait for any active I/O on a page slot to finish. (This does not
439  * guarantee that new I/O hasn't been started before we return, though.
440  * In fact the slot might not even contain the same page anymore.)
441  *
442  * Bank lock must be held at entry, and will be held at exit.
443  */
444 static void
446 {
447  SlruShared shared = ctl->shared;
448  int bankno = SlotGetBankNumber(slotno);
449 
450  Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
451 
452  /* See notes at top of file */
453  LWLockRelease(&shared->bank_locks[bankno].lock);
454  LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED);
455  LWLockRelease(&shared->buffer_locks[slotno].lock);
456  LWLockAcquire(&shared->bank_locks[bankno].lock, LW_EXCLUSIVE);
457 
458  /*
459  * If the slot is still in an io-in-progress state, then either someone
460  * already started a new I/O on the slot, or a previous I/O failed and
461  * neglected to reset the page state. That shouldn't happen, really, but
462  * it seems worth a few extra cycles to check and recover from it. We can
463  * cheaply test for failure by seeing if the buffer lock is still held (we
464  * assume that transaction abort would release the lock).
465  */
466  if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
467  shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
468  {
469  if (LWLockConditionalAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED))
470  {
471  /* indeed, the I/O must have failed */
472  if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
473  shared->page_status[slotno] = SLRU_PAGE_EMPTY;
474  else /* write_in_progress */
475  {
476  shared->page_status[slotno] = SLRU_PAGE_VALID;
477  shared->page_dirty[slotno] = true;
478  }
479  LWLockRelease(&shared->buffer_locks[slotno].lock);
480  }
481  }
482 }
483 
484 /*
485  * Find a page in a shared buffer, reading it in if necessary.
486  * The page number must correspond to an already-initialized page.
487  *
488  * If write_ok is true then it is OK to return a page that is in
489  * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure
490  * that modification of the page is safe. If write_ok is false then we
491  * will not return the page until it is not undergoing active I/O.
492  *
493  * The passed-in xid is used only for error reporting, and may be
494  * InvalidTransactionId if no specific xid is associated with the action.
495  *
496  * Return value is the shared-buffer slot number now holding the page.
497  * The buffer's LRU access info is updated.
498  *
499  * The correct bank lock must be held at entry, and will be held at exit.
500  */
501 int
502 SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok,
503  TransactionId xid)
504 {
505  SlruShared shared = ctl->shared;
506  LWLock *banklock = SimpleLruGetBankLock(ctl, pageno);
507 
509 
510  /* Outer loop handles restart if we must wait for someone else's I/O */
511  for (;;)
512  {
513  int slotno;
514  bool ok;
515 
516  /* See if page already is in memory; if not, pick victim slot */
517  slotno = SlruSelectLRUPage(ctl, pageno);
518 
519  /* Did we find the page in memory? */
520  if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
521  shared->page_number[slotno] == pageno)
522  {
523  /*
524  * If page is still being read in, we must wait for I/O. Likewise
525  * if the page is being written and the caller said that's not OK.
526  */
527  if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
528  (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
529  !write_ok))
530  {
531  SimpleLruWaitIO(ctl, slotno);
532  /* Now we must recheck state from the top */
533  continue;
534  }
535  /* Otherwise, it's ready to use */
536  SlruRecentlyUsed(shared, slotno);
537 
538  /* update the stats counter of pages found in the SLRU */
540 
541  return slotno;
542  }
543 
544  /* We found no match; assert we selected a freeable slot */
545  Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
546  (shared->page_status[slotno] == SLRU_PAGE_VALID &&
547  !shared->page_dirty[slotno]));
548 
549  /* Mark the slot read-busy */
550  shared->page_number[slotno] = pageno;
551  shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS;
552  shared->page_dirty[slotno] = false;
553 
554  /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
555  LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
556 
557  /* Release bank lock while doing I/O */
558  LWLockRelease(banklock);
559 
560  /* Do the read */
561  ok = SlruPhysicalReadPage(ctl, pageno, slotno);
562 
563  /* Set the LSNs for this newly read-in page to zero */
564  SimpleLruZeroLSNs(ctl, slotno);
565 
566  /* Re-acquire bank control lock and update page state */
567  LWLockAcquire(banklock, LW_EXCLUSIVE);
568 
569  Assert(shared->page_number[slotno] == pageno &&
570  shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS &&
571  !shared->page_dirty[slotno]);
572 
573  shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY;
574 
575  LWLockRelease(&shared->buffer_locks[slotno].lock);
576 
577  /* Now it's okay to ereport if we failed */
578  if (!ok)
579  SlruReportIOError(ctl, pageno, xid);
580 
581  SlruRecentlyUsed(shared, slotno);
582 
583  /* update the stats counter of pages not found in SLRU */
585 
586  return slotno;
587  }
588 }
589 
590 /*
591  * Find a page in a shared buffer, reading it in if necessary.
592  * The page number must correspond to an already-initialized page.
593  * The caller must intend only read-only access to the page.
594  *
595  * The passed-in xid is used only for error reporting, and may be
596  * InvalidTransactionId if no specific xid is associated with the action.
597  *
598  * Return value is the shared-buffer slot number now holding the page.
599  * The buffer's LRU access info is updated.
600  *
601  * Bank control lock must NOT be held at entry, but will be held at exit.
602  * It is unspecified whether the lock will be shared or exclusive.
603  */
604 int
606 {
607  SlruShared shared = ctl->shared;
608  LWLock *banklock = SimpleLruGetBankLock(ctl, pageno);
609  int bankno = pageno & ctl->bank_mask;
610  int bankstart = bankno * SLRU_BANK_SIZE;
611  int bankend = bankstart + SLRU_BANK_SIZE;
612 
613  /* Try to find the page while holding only shared lock */
614  LWLockAcquire(banklock, LW_SHARED);
615 
616  /* See if page is already in a buffer */
617  for (int slotno = bankstart; slotno < bankend; slotno++)
618  {
619  if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
620  shared->page_number[slotno] == pageno &&
621  shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
622  {
623  /* See comments for SlruRecentlyUsed macro */
624  SlruRecentlyUsed(shared, slotno);
625 
626  /* update the stats counter of pages found in the SLRU */
628 
629  return slotno;
630  }
631  }
632 
633  /* No luck, so switch to normal exclusive lock and do regular read */
634  LWLockRelease(banklock);
635  LWLockAcquire(banklock, LW_EXCLUSIVE);
636 
637  return SimpleLruReadPage(ctl, pageno, true, xid);
638 }
639 
640 /*
641  * Write a page from a shared buffer, if necessary.
642  * Does nothing if the specified slot is not dirty.
643  *
644  * NOTE: only one write attempt is made here. Hence, it is possible that
645  * the page is still dirty at exit (if someone else re-dirtied it during
646  * the write). However, we *do* attempt a fresh write even if the page
647  * is already being written; this is for checkpoints.
648  *
649  * Bank lock must be held at entry, and will be held at exit.
650  */
651 static void
653 {
654  SlruShared shared = ctl->shared;
655  int64 pageno = shared->page_number[slotno];
656  int bankno = SlotGetBankNumber(slotno);
657  bool ok;
658 
659  Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
661 
662  /* If a write is in progress, wait for it to finish */
663  while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
664  shared->page_number[slotno] == pageno)
665  {
666  SimpleLruWaitIO(ctl, slotno);
667  }
668 
669  /*
670  * Do nothing if page is not dirty, or if buffer no longer contains the
671  * same page we were called for.
672  */
673  if (!shared->page_dirty[slotno] ||
674  shared->page_status[slotno] != SLRU_PAGE_VALID ||
675  shared->page_number[slotno] != pageno)
676  return;
677 
678  /*
679  * Mark the slot write-busy, and clear the dirtybit. After this point, a
680  * transaction status update on this page will mark it dirty again.
681  */
682  shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
683  shared->page_dirty[slotno] = false;
684 
685  /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
686  LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
687 
688  /* Release bank lock while doing I/O */
689  LWLockRelease(&shared->bank_locks[bankno].lock);
690 
691  /* Do the write */
692  ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);
693 
694  /* If we failed, and we're in a flush, better close the files */
695  if (!ok && fdata)
696  {
697  for (int i = 0; i < fdata->num_files; i++)
698  CloseTransientFile(fdata->fd[i]);
699  }
700 
701  /* Re-acquire bank lock and update page state */
702  LWLockAcquire(&shared->bank_locks[bankno].lock, LW_EXCLUSIVE);
703 
704  Assert(shared->page_number[slotno] == pageno &&
705  shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS);
706 
707  /* If we failed to write, mark the page dirty again */
708  if (!ok)
709  shared->page_dirty[slotno] = true;
710 
711  shared->page_status[slotno] = SLRU_PAGE_VALID;
712 
713  LWLockRelease(&shared->buffer_locks[slotno].lock);
714 
715  /* Now it's okay to ereport if we failed */
716  if (!ok)
718 
719  /* If part of a checkpoint, count this as a buffer written. */
720  if (fdata)
722 }
723 
724 /*
725  * Wrapper of SlruInternalWritePage, for external callers.
726  * fdata is always passed a NULL here.
727  */
728 void
730 {
731  Assert(ctl->shared->page_status[slotno] != SLRU_PAGE_EMPTY);
732 
733  SlruInternalWritePage(ctl, slotno, NULL);
734 }
735 
736 /*
737  * Return whether the given page exists on disk.
738  *
739  * A false return means that either the file does not exist, or that it's not
740  * large enough to contain the given page.
741  */
742 bool
744 {
745  int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
746  int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
747  int offset = rpageno * BLCKSZ;
748  char path[MAXPGPATH];
749  int fd;
750  bool result;
751  off_t endpos;
752 
753  /* update the stats counter of checked pages */
754  pgstat_count_slru_page_exists(ctl->shared->slru_stats_idx);
755 
756  SlruFileName(ctl, path, segno);
757 
758  fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
759  if (fd < 0)
760  {
761  /* expected: file doesn't exist */
762  if (errno == ENOENT)
763  return false;
764 
765  /* report error normally */
767  slru_errno = errno;
768  SlruReportIOError(ctl, pageno, 0);
769  }
770 
771  if ((endpos = lseek(fd, 0, SEEK_END)) < 0)
772  {
774  slru_errno = errno;
775  SlruReportIOError(ctl, pageno, 0);
776  }
777 
778  result = endpos >= (off_t) (offset + BLCKSZ);
779 
780  if (CloseTransientFile(fd) != 0)
781  {
783  slru_errno = errno;
784  return false;
785  }
786 
787  return result;
788 }
789 
790 /*
791  * Physical read of a (previously existing) page into a buffer slot
792  *
793  * On failure, we cannot just ereport(ERROR) since caller has put state in
794  * shared memory that must be undone. So, we return false and save enough
795  * info in static variables to let SlruReportIOError make the report.
796  *
797  * For now, assume it's not worth keeping a file pointer open across
798  * read/write operations. We could cache one virtual file pointer ...
799  */
800 static bool
801 SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
802 {
803  SlruShared shared = ctl->shared;
804  int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
805  int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
806  off_t offset = rpageno * BLCKSZ;
807  char path[MAXPGPATH];
808  int fd;
809 
810  SlruFileName(ctl, path, segno);
811 
812  /*
813  * In a crash-and-restart situation, it's possible for us to receive
814  * commands to set the commit status of transactions whose bits are in
815  * already-truncated segments of the commit log (see notes in
816  * SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case
817  * where the file doesn't exist, and return zeroes instead.
818  */
819  fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
820  if (fd < 0)
821  {
822  if (errno != ENOENT || !InRecovery)
823  {
825  slru_errno = errno;
826  return false;
827  }
828 
829  ereport(LOG,
830  (errmsg("file \"%s\" doesn't exist, reading as zeroes",
831  path)));
832  MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
833  return true;
834  }
835 
836  errno = 0;
837  pgstat_report_wait_start(WAIT_EVENT_SLRU_READ);
838  if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
839  {
842  slru_errno = errno;
844  return false;
845  }
847 
848  if (CloseTransientFile(fd) != 0)
849  {
851  slru_errno = errno;
852  return false;
853  }
854 
855  return true;
856 }
857 
858 /*
859  * Physical write of a page from a buffer slot
860  *
861  * On failure, we cannot just ereport(ERROR) since caller has put state in
862  * shared memory that must be undone. So, we return false and save enough
863  * info in static variables to let SlruReportIOError make the report.
864  *
865  * For now, assume it's not worth keeping a file pointer open across
866  * independent read/write operations. We do batch operations during
867  * SimpleLruWriteAll, though.
868  *
869  * fdata is NULL for a standalone write, pointer to open-file info during
870  * SimpleLruWriteAll.
871  */
872 static bool
873 SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
874 {
875  SlruShared shared = ctl->shared;
876  int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
877  int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
878  off_t offset = rpageno * BLCKSZ;
879  char path[MAXPGPATH];
880  int fd = -1;
881 
882  /* update the stats counter of written pages */
884 
885  /*
886  * Honor the write-WAL-before-data rule, if appropriate, so that we do not
887  * write out data before associated WAL records. This is the same action
888  * performed during FlushBuffer() in the main buffer manager.
889  */
890  if (shared->group_lsn != NULL)
891  {
892  /*
893  * We must determine the largest async-commit LSN for the page. This
894  * is a bit tedious, but since this entire function is a slow path
895  * anyway, it seems better to do this here than to maintain a per-page
896  * LSN variable (which'd need an extra comparison in the
897  * transaction-commit path).
898  */
899  XLogRecPtr max_lsn;
900  int lsnindex;
901 
902  lsnindex = slotno * shared->lsn_groups_per_page;
903  max_lsn = shared->group_lsn[lsnindex++];
904  for (int lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
905  {
906  XLogRecPtr this_lsn = shared->group_lsn[lsnindex++];
907 
908  if (max_lsn < this_lsn)
909  max_lsn = this_lsn;
910  }
911 
912  if (!XLogRecPtrIsInvalid(max_lsn))
913  {
914  /*
915  * As noted above, elog(ERROR) is not acceptable here, so if
916  * XLogFlush were to fail, we must PANIC. This isn't much of a
917  * restriction because XLogFlush is just about all critical
918  * section anyway, but let's make sure.
919  */
921  XLogFlush(max_lsn);
923  }
924  }
925 
926  /*
927  * During a SimpleLruWriteAll, we may already have the desired file open.
928  */
929  if (fdata)
930  {
931  for (int i = 0; i < fdata->num_files; i++)
932  {
933  if (fdata->segno[i] == segno)
934  {
935  fd = fdata->fd[i];
936  break;
937  }
938  }
939  }
940 
941  if (fd < 0)
942  {
943  /*
944  * If the file doesn't already exist, we should create it. It is
945  * possible for this to need to happen when writing a page that's not
946  * first in its segment; we assume the OS can cope with that. (Note:
947  * it might seem that it'd be okay to create files only when
948  * SimpleLruZeroPage is called for the first page of a segment.
949  * However, if after a crash and restart the REDO logic elects to
950  * replay the log from a checkpoint before the latest one, then it's
951  * possible that we will get commands to set transaction status of
952  * transactions that have already been truncated from the commit log.
953  * Easiest way to deal with that is to accept references to
954  * nonexistent files here and in SlruPhysicalReadPage.)
955  *
956  * Note: it is possible for more than one backend to be executing this
957  * code simultaneously for different pages of the same file. Hence,
958  * don't use O_EXCL or O_TRUNC or anything like that.
959  */
960  SlruFileName(ctl, path, segno);
961  fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY);
962  if (fd < 0)
963  {
965  slru_errno = errno;
966  return false;
967  }
968 
969  if (fdata)
970  {
971  if (fdata->num_files < MAX_WRITEALL_BUFFERS)
972  {
973  fdata->fd[fdata->num_files] = fd;
974  fdata->segno[fdata->num_files] = segno;
975  fdata->num_files++;
976  }
977  else
978  {
979  /*
980  * In the unlikely event that we exceed MAX_WRITEALL_BUFFERS,
981  * fall back to treating it as a standalone write.
982  */
983  fdata = NULL;
984  }
985  }
986  }
987 
988  errno = 0;
989  pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
990  if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
991  {
993  /* if write didn't set errno, assume problem is no disk space */
994  if (errno == 0)
995  errno = ENOSPC;
997  slru_errno = errno;
998  if (!fdata)
1000  return false;
1001  }
1003 
1004  /* Queue up a sync request for the checkpointer. */
1005  if (ctl->sync_handler != SYNC_HANDLER_NONE)
1006  {
1007  FileTag tag;
1008 
1009  INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
1010  if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
1011  {
1012  /* No space to enqueue sync request. Do it synchronously. */
1013  pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
1014  if (pg_fsync(fd) != 0)
1015  {
1018  slru_errno = errno;
1020  return false;
1021  }
1023  }
1024  }
1025 
1026  /* Close file, unless part of flush request. */
1027  if (!fdata)
1028  {
1029  if (CloseTransientFile(fd) != 0)
1030  {
1032  slru_errno = errno;
1033  return false;
1034  }
1035  }
1036 
1037  return true;
1038 }
1039 
1040 /*
1041  * Issue the error message after failure of SlruPhysicalReadPage or
1042  * SlruPhysicalWritePage. Call this after cleaning up shared-memory state.
1043  */
1044 static void
1046 {
1047  int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
1048  int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
1049  int offset = rpageno * BLCKSZ;
1050  char path[MAXPGPATH];
1051 
1052  SlruFileName(ctl, path, segno);
1053  errno = slru_errno;
1054  switch (slru_errcause)
1055  {
1056  case SLRU_OPEN_FAILED:
1057  ereport(ERROR,
1059  errmsg("could not access status of transaction %u", xid),
1060  errdetail("Could not open file \"%s\": %m.", path)));
1061  break;
1062  case SLRU_SEEK_FAILED:
1063  ereport(ERROR,
1065  errmsg("could not access status of transaction %u", xid),
1066  errdetail("Could not seek in file \"%s\" to offset %d: %m.",
1067  path, offset)));
1068  break;
1069  case SLRU_READ_FAILED:
1070  if (errno)
1071  ereport(ERROR,
1073  errmsg("could not access status of transaction %u", xid),
1074  errdetail("Could not read from file \"%s\" at offset %d: %m.",
1075  path, offset)));
1076  else
1077  ereport(ERROR,
1078  (errmsg("could not access status of transaction %u", xid),
1079  errdetail("Could not read from file \"%s\" at offset %d: read too few bytes.", path, offset)));
1080  break;
1081  case SLRU_WRITE_FAILED:
1082  if (errno)
1083  ereport(ERROR,
1085  errmsg("could not access status of transaction %u", xid),
1086  errdetail("Could not write to file \"%s\" at offset %d: %m.",
1087  path, offset)));
1088  else
1089  ereport(ERROR,
1090  (errmsg("could not access status of transaction %u", xid),
1091  errdetail("Could not write to file \"%s\" at offset %d: wrote too few bytes.",
1092  path, offset)));
1093  break;
1094  case SLRU_FSYNC_FAILED:
1097  errmsg("could not access status of transaction %u", xid),
1098  errdetail("Could not fsync file \"%s\": %m.",
1099  path)));
1100  break;
1101  case SLRU_CLOSE_FAILED:
1102  ereport(ERROR,
1104  errmsg("could not access status of transaction %u", xid),
1105  errdetail("Could not close file \"%s\": %m.",
1106  path)));
1107  break;
1108  default:
1109  /* can't get here, we trust */
1110  elog(ERROR, "unrecognized SimpleLru error cause: %d",
1111  (int) slru_errcause);
1112  break;
1113  }
1114 }
1115 
1116 /*
1117  * Mark a buffer slot "most recently used".
1118  */
1119 static inline void
1120 SlruRecentlyUsed(SlruShared shared, int slotno)
1121 {
1122  int bankno = SlotGetBankNumber(slotno);
1123  int new_lru_count = shared->bank_cur_lru_count[bankno];
1124 
1125  Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
1126 
1127  /*
1128  * The reason for the if-test is that there are often many consecutive
1129  * accesses to the same page (particularly the latest page). By
1130  * suppressing useless increments of bank_cur_lru_count, we reduce the
1131  * probability that old pages' counts will "wrap around" and make them
1132  * appear recently used.
1133  *
1134  * We allow this code to be executed concurrently by multiple processes
1135  * within SimpleLruReadPage_ReadOnly(). As long as int reads and writes
1136  * are atomic, this should not cause any completely-bogus values to enter
1137  * the computation. However, it is possible for either bank_cur_lru_count
1138  * or individual page_lru_count entries to be "reset" to lower values than
1139  * they should have, in case a process is delayed while it executes this
1140  * function. With care in SlruSelectLRUPage(), this does little harm, and
1141  * in any case the absolute worst possible consequence is a nonoptimal
1142  * choice of page to evict. The gain from allowing concurrent reads of
1143  * SLRU pages seems worth it.
1144  */
1145  if (new_lru_count != shared->page_lru_count[slotno])
1146  {
1147  shared->bank_cur_lru_count[bankno] = ++new_lru_count;
1148  shared->page_lru_count[slotno] = new_lru_count;
1149  }
1150 }
1151 
1152 /*
1153  * Select the slot to re-use when we need a free slot for the given page.
1154  *
1155  * The target page number is passed not only because we need to know the
1156  * correct bank to use, but also because we need to consider the possibility
1157  * that some other process reads in the target page while we are doing I/O to
1158  * free a slot. Hence, check or recheck to see if any slot already holds the
1159  * target page, and return that slot if so. Thus, the returned slot is
1160  * *either* a slot already holding the pageno (could be any state except
1161  * EMPTY), *or* a freeable slot (state EMPTY or CLEAN).
1162  *
1163  * The correct bank lock must be held at entry, and will be held at exit.
1164  */
1165 static int
1167 {
1168  SlruShared shared = ctl->shared;
1169 
1170  /* Outer loop handles restart after I/O */
1171  for (;;)
1172  {
1173  int cur_count;
1174  int bestvalidslot = 0; /* keep compiler quiet */
1175  int best_valid_delta = -1;
1176  int64 best_valid_page_number = 0; /* keep compiler quiet */
1177  int bestinvalidslot = 0; /* keep compiler quiet */
1178  int best_invalid_delta = -1;
1179  int64 best_invalid_page_number = 0; /* keep compiler quiet */
1180  int bankno = pageno & ctl->bank_mask;
1181  int bankstart = bankno * SLRU_BANK_SIZE;
1182  int bankend = bankstart + SLRU_BANK_SIZE;
1183 
1185 
1186  /* See if page already has a buffer assigned */
1187  for (int slotno = bankstart; slotno < bankend; slotno++)
1188  {
1189  if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
1190  shared->page_number[slotno] == pageno)
1191  return slotno;
1192  }
1193 
1194  /*
1195  * If we find any EMPTY slot, just select that one. Else choose a
1196  * victim page to replace. We normally take the least recently used
1197  * valid page, but we will never take the slot containing
1198  * latest_page_number, even if it appears least recently used. We
1199  * will select a slot that is already I/O busy only if there is no
1200  * other choice: a read-busy slot will not be least recently used once
1201  * the read finishes, and waiting for an I/O on a write-busy slot is
1202  * inferior to just picking some other slot. Testing shows the slot
1203  * we pick instead will often be clean, allowing us to begin a read at
1204  * once.
1205  *
1206  * Normally the page_lru_count values will all be different and so
1207  * there will be a well-defined LRU page. But since we allow
1208  * concurrent execution of SlruRecentlyUsed() within
1209  * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages
1210  * acquire the same lru_count values. In that case we break ties by
1211  * choosing the furthest-back page.
1212  *
1213  * Notice that this next line forcibly advances cur_lru_count to a
1214  * value that is certainly beyond any value that will be in the
1215  * page_lru_count array after the loop finishes. This ensures that
1216  * the next execution of SlruRecentlyUsed will mark the page newly
1217  * used, even if it's for a page that has the current counter value.
1218  * That gets us back on the path to having good data when there are
1219  * multiple pages with the same lru_count.
1220  */
1221  cur_count = (shared->bank_cur_lru_count[bankno])++;
1222  for (int slotno = bankstart; slotno < bankend; slotno++)
1223  {
1224  int this_delta;
1225  int64 this_page_number;
1226 
1227  if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1228  return slotno;
1229 
1230  this_delta = cur_count - shared->page_lru_count[slotno];
1231  if (this_delta < 0)
1232  {
1233  /*
1234  * Clean up in case shared updates have caused cur_count
1235  * increments to get "lost". We back off the page counts,
1236  * rather than trying to increase cur_count, to avoid any
1237  * question of infinite loops or failure in the presence of
1238  * wrapped-around counts.
1239  */
1240  shared->page_lru_count[slotno] = cur_count;
1241  this_delta = 0;
1242  }
1243 
1244  /*
1245  * If this page is the one most recently zeroed, don't consider it
1246  * an eviction candidate. See comments in SimpleLruZeroPage for an
1247  * explanation about the lack of a memory barrier here.
1248  */
1249  this_page_number = shared->page_number[slotno];
1250  if (this_page_number ==
1252  continue;
1253 
1254  if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1255  {
1256  if (this_delta > best_valid_delta ||
1257  (this_delta == best_valid_delta &&
1258  ctl->PagePrecedes(this_page_number,
1259  best_valid_page_number)))
1260  {
1261  bestvalidslot = slotno;
1262  best_valid_delta = this_delta;
1263  best_valid_page_number = this_page_number;
1264  }
1265  }
1266  else
1267  {
1268  if (this_delta > best_invalid_delta ||
1269  (this_delta == best_invalid_delta &&
1270  ctl->PagePrecedes(this_page_number,
1271  best_invalid_page_number)))
1272  {
1273  bestinvalidslot = slotno;
1274  best_invalid_delta = this_delta;
1275  best_invalid_page_number = this_page_number;
1276  }
1277  }
1278  }
1279 
1280  /*
1281  * If all pages (except possibly the latest one) are I/O busy, we'll
1282  * have to wait for an I/O to complete and then retry. In that
1283  * unhappy case, we choose to wait for the I/O on the least recently
1284  * used slot, on the assumption that it was likely initiated first of
1285  * all the I/Os in progress and may therefore finish first.
1286  */
1287  if (best_valid_delta < 0)
1288  {
1289  SimpleLruWaitIO(ctl, bestinvalidslot);
1290  continue;
1291  }
1292 
1293  /*
1294  * If the selected page is clean, we're set.
1295  */
1296  if (!shared->page_dirty[bestvalidslot])
1297  return bestvalidslot;
1298 
1299  /*
1300  * Write the page.
1301  */
1302  SlruInternalWritePage(ctl, bestvalidslot, NULL);
1303 
1304  /*
1305  * Now loop back and try again. This is the easiest way of dealing
1306  * with corner cases such as the victim page being re-dirtied while we
1307  * wrote it.
1308  */
1309  }
1310 }
1311 
1312 /*
1313  * Write dirty pages to disk during checkpoint or database shutdown. Flushing
1314  * is deferred until the next call to ProcessSyncRequests(), though we do fsync
1315  * the containing directory here to make sure that newly created directory
1316  * entries are on disk.
1317  */
1318 void
1319 SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied)
1320 {
1321  SlruShared shared = ctl->shared;
1322  SlruWriteAllData fdata;
1323  int64 pageno = 0;
1324  int prevbank = SlotGetBankNumber(0);
1325  bool ok;
1326 
1327  /* update the stats counter of flushes */
1329 
1330  /*
1331  * Find and write dirty pages
1332  */
1333  fdata.num_files = 0;
1334 
1335  LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
1336 
1337  for (int slotno = 0; slotno < shared->num_slots; slotno++)
1338  {
1339  int curbank = SlotGetBankNumber(slotno);
1340 
1341  /*
1342  * If the current bank lock is not same as the previous bank lock then
1343  * release the previous lock and acquire the new lock.
1344  */
1345  if (curbank != prevbank)
1346  {
1347  LWLockRelease(&shared->bank_locks[prevbank].lock);
1348  LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
1349  prevbank = curbank;
1350  }
1351 
1352  /* Do nothing if slot is unused */
1353  if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1354  continue;
1355 
1356  SlruInternalWritePage(ctl, slotno, &fdata);
1357 
1358  /*
1359  * In some places (e.g. checkpoints), we cannot assert that the slot
1360  * is clean now, since another process might have re-dirtied it
1361  * already. That's okay.
1362  */
1363  Assert(allow_redirtied ||
1364  shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
1365  (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1366  !shared->page_dirty[slotno]));
1367  }
1368 
1369  LWLockRelease(&shared->bank_locks[prevbank].lock);
1370 
1371  /*
1372  * Now close any files that were open
1373  */
1374  ok = true;
1375  for (int i = 0; i < fdata.num_files; i++)
1376  {
1377  if (CloseTransientFile(fdata.fd[i]) != 0)
1378  {
1380  slru_errno = errno;
1381  pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
1382  ok = false;
1383  }
1384  }
1385  if (!ok)
1387 
1388  /* Ensure that directory entries for new files are on disk. */
1389  if (ctl->sync_handler != SYNC_HANDLER_NONE)
1390  fsync_fname(ctl->Dir, true);
1391 }
1392 
1393 /*
1394  * Remove all segments before the one holding the passed page number
1395  *
1396  * All SLRUs prevent concurrent calls to this function, either with an LWLock
1397  * or by calling it only as part of a checkpoint. Mutual exclusion must begin
1398  * before computing cutoffPage. Mutual exclusion must end after any limit
1399  * update that would permit other backends to write fresh data into the
1400  * segment immediately preceding the one containing cutoffPage. Otherwise,
1401  * when the SLRU is quite full, SimpleLruTruncate() might delete that segment
1402  * after it has accrued freshly-written data.
1403  */
1404 void
1405 SimpleLruTruncate(SlruCtl ctl, int64 cutoffPage)
1406 {
1407  SlruShared shared = ctl->shared;
1408  int prevbank;
1409 
1410  /* update the stats counter of truncates */
1412 
1413  /*
1414  * Scan shared memory and remove any pages preceding the cutoff page, to
1415  * ensure we won't rewrite them later. (Since this is normally called in
1416  * or just after a checkpoint, any dirty pages should have been flushed
1417  * already ... we're just being extra careful here.)
1418  */
1419 restart:
1420 
1421  /*
1422  * An important safety check: the current endpoint page must not be
1423  * eligible for removal. This check is just a backstop against wraparound
1424  * bugs elsewhere in SLRU handling, so we don't care if we read a slightly
1425  * outdated value; therefore we don't add a memory barrier.
1426  */
1427  if (ctl->PagePrecedes(pg_atomic_read_u64(&shared->latest_page_number),
1428  cutoffPage))
1429  {
1430  ereport(LOG,
1431  (errmsg("could not truncate directory \"%s\": apparent wraparound",
1432  ctl->Dir)));
1433  return;
1434  }
1435 
1436  prevbank = SlotGetBankNumber(0);
1437  LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
1438  for (int slotno = 0; slotno < shared->num_slots; slotno++)
1439  {
1440  int curbank = SlotGetBankNumber(slotno);
1441 
1442  /*
1443  * If the current bank lock is not same as the previous bank lock then
1444  * release the previous lock and acquire the new lock.
1445  */
1446  if (curbank != prevbank)
1447  {
1448  LWLockRelease(&shared->bank_locks[prevbank].lock);
1449  LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
1450  prevbank = curbank;
1451  }
1452 
1453  if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1454  continue;
1455  if (!ctl->PagePrecedes(shared->page_number[slotno], cutoffPage))
1456  continue;
1457 
1458  /*
1459  * If page is clean, just change state to EMPTY (expected case).
1460  */
1461  if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1462  !shared->page_dirty[slotno])
1463  {
1464  shared->page_status[slotno] = SLRU_PAGE_EMPTY;
1465  continue;
1466  }
1467 
1468  /*
1469  * Hmm, we have (or may have) I/O operations acting on the page, so
1470  * we've got to wait for them to finish and then start again. This is
1471  * the same logic as in SlruSelectLRUPage. (XXX if page is dirty,
1472  * wouldn't it be OK to just discard it without writing it?
1473  * SlruMayDeleteSegment() uses a stricter qualification, so we might
1474  * not delete this page in the end; even if we don't delete it, we
1475  * won't have cause to read its data again. For now, keep the logic
1476  * the same as it was.)
1477  */
1478  if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1479  SlruInternalWritePage(ctl, slotno, NULL);
1480  else
1481  SimpleLruWaitIO(ctl, slotno);
1482 
1483  LWLockRelease(&shared->bank_locks[prevbank].lock);
1484  goto restart;
1485  }
1486 
1487  LWLockRelease(&shared->bank_locks[prevbank].lock);
1488 
1489  /* Now we can remove the old segment(s) */
1490  (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage);
1491 }
1492 
1493 /*
1494  * Delete an individual SLRU segment.
1495  *
1496  * NB: This does not touch the SLRU buffers themselves, callers have to ensure
1497  * they either can't yet contain anything, or have already been cleaned out.
1498  */
1499 static void
1501 {
1502  char path[MAXPGPATH];
1503 
1504  /* Forget any fsync requests queued for this segment. */
1505  if (ctl->sync_handler != SYNC_HANDLER_NONE)
1506  {
1507  FileTag tag;
1508 
1509  INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
1511  }
1512 
1513  /* Unlink the file. */
1514  SlruFileName(ctl, path, segno);
1515  ereport(DEBUG2, (errmsg_internal("removing file \"%s\"", path)));
1516  unlink(path);
1517 }
1518 
1519 /*
1520  * Delete an individual SLRU segment, identified by the segment number.
1521  */
1522 void
1524 {
1525  SlruShared shared = ctl->shared;
1526  int prevbank = SlotGetBankNumber(0);
1527  bool did_write;
1528 
1529  /* Clean out any possibly existing references to the segment. */
1530  LWLockAcquire(&shared->bank_locks[prevbank].lock, LW_EXCLUSIVE);
1531 restart:
1532  did_write = false;
1533  for (int slotno = 0; slotno < shared->num_slots; slotno++)
1534  {
1535  int64 pagesegno;
1536  int curbank = SlotGetBankNumber(slotno);
1537 
1538  /*
1539  * If the current bank lock is not same as the previous bank lock then
1540  * release the previous lock and acquire the new lock.
1541  */
1542  if (curbank != prevbank)
1543  {
1544  LWLockRelease(&shared->bank_locks[prevbank].lock);
1545  LWLockAcquire(&shared->bank_locks[curbank].lock, LW_EXCLUSIVE);
1546  prevbank = curbank;
1547  }
1548 
1549  if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1550  continue;
1551 
1552  pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT;
1553  /* not the segment we're looking for */
1554  if (pagesegno != segno)
1555  continue;
1556 
1557  /* If page is clean, just change state to EMPTY (expected case). */
1558  if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1559  !shared->page_dirty[slotno])
1560  {
1561  shared->page_status[slotno] = SLRU_PAGE_EMPTY;
1562  continue;
1563  }
1564 
1565  /* Same logic as SimpleLruTruncate() */
1566  if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1567  SlruInternalWritePage(ctl, slotno, NULL);
1568  else
1569  SimpleLruWaitIO(ctl, slotno);
1570 
1571  did_write = true;
1572  }
1573 
1574  /*
1575  * Be extra careful and re-check. The IO functions release the control
1576  * lock, so new pages could have been read in.
1577  */
1578  if (did_write)
1579  goto restart;
1580 
1582 
1583  LWLockRelease(&shared->bank_locks[prevbank].lock);
1584 }
1585 
1586 /*
1587  * Determine whether a segment is okay to delete.
1588  *
1589  * segpage is the first page of the segment, and cutoffPage is the oldest (in
1590  * PagePrecedes order) page in the SLRU containing still-useful data. Since
1591  * every core PagePrecedes callback implements "wrap around", check the
1592  * segment's first and last pages:
1593  *
1594  * first<cutoff && last<cutoff: yes
1595  * first<cutoff && last>=cutoff: no; cutoff falls inside this segment
1596  * first>=cutoff && last<cutoff: no; wrap point falls inside this segment
1597  * first>=cutoff && last>=cutoff: no; every page of this segment is too young
1598  */
1599 static bool
1600 SlruMayDeleteSegment(SlruCtl ctl, int64 segpage, int64 cutoffPage)
1601 {
1602  int64 seg_last_page = segpage + SLRU_PAGES_PER_SEGMENT - 1;
1603 
1604  Assert(segpage % SLRU_PAGES_PER_SEGMENT == 0);
1605 
1606  return (ctl->PagePrecedes(segpage, cutoffPage) &&
1607  ctl->PagePrecedes(seg_last_page, cutoffPage));
1608 }
1609 
1610 #ifdef USE_ASSERT_CHECKING
1611 static void
1612 SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset)
1613 {
1614  TransactionId lhs,
1615  rhs;
1616  int64 newestPage,
1617  oldestPage;
1618  TransactionId newestXact,
1619  oldestXact;
1620 
1621  /*
1622  * Compare an XID pair having undefined order (see RFC 1982), a pair at
1623  * "opposite ends" of the XID space. TransactionIdPrecedes() treats each
1624  * as preceding the other. If RHS is oldestXact, LHS is the first XID we
1625  * must not assign.
1626  */
1627  lhs = per_page + offset; /* skip first page to avoid non-normal XIDs */
1628  rhs = lhs + (1U << 31);
1629  Assert(TransactionIdPrecedes(lhs, rhs));
1630  Assert(TransactionIdPrecedes(rhs, lhs));
1631  Assert(!TransactionIdPrecedes(lhs - 1, rhs));
1632  Assert(TransactionIdPrecedes(rhs, lhs - 1));
1633  Assert(TransactionIdPrecedes(lhs + 1, rhs));
1634  Assert(!TransactionIdPrecedes(rhs, lhs + 1));
1637  Assert(!ctl->PagePrecedes(lhs / per_page, lhs / per_page));
1638  Assert(!ctl->PagePrecedes(lhs / per_page, rhs / per_page));
1639  Assert(!ctl->PagePrecedes(rhs / per_page, lhs / per_page));
1640  Assert(!ctl->PagePrecedes((lhs - per_page) / per_page, rhs / per_page));
1641  Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 3 * per_page) / per_page));
1642  Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 2 * per_page) / per_page));
1643  Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 1 * per_page) / per_page)
1644  || (1U << 31) % per_page != 0); /* See CommitTsPagePrecedes() */
1645  Assert(ctl->PagePrecedes((lhs + 1 * per_page) / per_page, rhs / per_page)
1646  || (1U << 31) % per_page != 0);
1647  Assert(ctl->PagePrecedes((lhs + 2 * per_page) / per_page, rhs / per_page));
1648  Assert(ctl->PagePrecedes((lhs + 3 * per_page) / per_page, rhs / per_page));
1649  Assert(!ctl->PagePrecedes(rhs / per_page, (lhs + per_page) / per_page));
1650 
1651  /*
1652  * GetNewTransactionId() has assigned the last XID it can safely use, and
1653  * that XID is in the *LAST* page of the second segment. We must not
1654  * delete that segment.
1655  */
1656  newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1;
1657  newestXact = newestPage * per_page + offset;
1658  Assert(newestXact / per_page == newestPage);
1659  oldestXact = newestXact + 1;
1660  oldestXact -= 1U << 31;
1661  oldestPage = oldestXact / per_page;
1663  (newestPage -
1664  newestPage % SLRU_PAGES_PER_SEGMENT),
1665  oldestPage));
1666 
1667  /*
1668  * GetNewTransactionId() has assigned the last XID it can safely use, and
1669  * that XID is in the *FIRST* page of the second segment. We must not
1670  * delete that segment.
1671  */
1672  newestPage = SLRU_PAGES_PER_SEGMENT;
1673  newestXact = newestPage * per_page + offset;
1674  Assert(newestXact / per_page == newestPage);
1675  oldestXact = newestXact + 1;
1676  oldestXact -= 1U << 31;
1677  oldestPage = oldestXact / per_page;
1679  (newestPage -
1680  newestPage % SLRU_PAGES_PER_SEGMENT),
1681  oldestPage));
1682 }
1683 
1684 /*
1685  * Unit-test a PagePrecedes function.
1686  *
1687  * This assumes every uint32 >= FirstNormalTransactionId is a valid key. It
1688  * assumes each value occupies a contiguous, fixed-size region of SLRU bytes.
1689  * (MultiXactMemberCtl separates flags from XIDs. NotifyCtl has
1690  * variable-length entries, no keys, and no random access. These unit tests
1691  * do not apply to them.)
1692  */
1693 void
1694 SlruPagePrecedesUnitTests(SlruCtl ctl, int per_page)
1695 {
1696  /* Test first, middle and last entries of a page. */
1697  SlruPagePrecedesTestOffset(ctl, per_page, 0);
1698  SlruPagePrecedesTestOffset(ctl, per_page, per_page / 2);
1699  SlruPagePrecedesTestOffset(ctl, per_page, per_page - 1);
1700 }
1701 #endif
1702 
1703 /*
1704  * SlruScanDirectory callback
1705  * This callback reports true if there's any segment wholly prior to the
1706  * one containing the page passed as "data".
1707  */
1708 bool
1710  void *data)
1711 {
1712  int64 cutoffPage = *(int64 *) data;
1713 
1714  if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
1715  return true; /* found one; don't iterate any more */
1716 
1717  return false; /* keep going */
1718 }
1719 
1720 /*
1721  * SlruScanDirectory callback.
1722  * This callback deletes segments prior to the one passed in as "data".
1723  */
1724 static bool
1726  void *data)
1727 {
1728  int64 cutoffPage = *(int64 *) data;
1729 
1730  if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
1732 
1733  return false; /* keep going */
1734 }
1735 
1736 /*
1737  * SlruScanDirectory callback.
1738  * This callback deletes all segments.
1739  */
1740 bool
1741 SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int64 segpage, void *data)
1742 {
1744 
1745  return false; /* keep going */
1746 }
1747 
1748 /*
1749  * An internal function used by SlruScanDirectory().
1750  *
1751  * Returns true if a file with a name of a given length may be a correct
1752  * SLRU segment.
1753  */
1754 static inline bool
1756 {
1757  if (ctl->long_segment_names)
1758  return (len == 15); /* see SlruFileName() */
1759  else
1760 
1761  /*
1762  * Commit 638cf09e76d allowed 5-character lengths. Later commit
1763  * 73c986adde5 allowed 6-character length.
1764  *
1765  * Note: There is an ongoing plan to migrate all SLRUs to 64-bit page
1766  * numbers, and the corresponding 15-character file names, which may
1767  * eventually deprecate the support for 4, 5, and 6-character names.
1768  */
1769  return (len == 4 || len == 5 || len == 6);
1770 }
1771 
1772 /*
1773  * Scan the SimpleLru directory and apply a callback to each file found in it.
1774  *
1775  * If the callback returns true, the scan is stopped. The last return value
1776  * from the callback is returned.
1777  *
1778  * The callback receives the following arguments: 1. the SlruCtl struct for the
1779  * slru being truncated; 2. the filename being considered; 3. the page number
1780  * for the first page of that file; 4. a pointer to the opaque data given to us
1781  * by the caller.
1782  *
1783  * Note that the ordering in which the directory is scanned is not guaranteed.
1784  *
1785  * Note that no locking is applied.
1786  */
1787 bool
1789 {
1790  bool retval = false;
1791  DIR *cldir;
1792  struct dirent *clde;
1793  int64 segno;
1794  int64 segpage;
1795 
1796  cldir = AllocateDir(ctl->Dir);
1797  while ((clde = ReadDir(cldir, ctl->Dir)) != NULL)
1798  {
1799  size_t len;
1800 
1801  len = strlen(clde->d_name);
1802 
1804  strspn(clde->d_name, "0123456789ABCDEF") == len)
1805  {
1806  segno = strtoi64(clde->d_name, NULL, 16);
1807  segpage = segno * SLRU_PAGES_PER_SEGMENT;
1808 
1809  elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s",
1810  ctl->Dir, clde->d_name);
1811  retval = callback(ctl, clde->d_name, segpage, data);
1812  if (retval)
1813  break;
1814  }
1815  }
1816  FreeDir(cldir);
1817 
1818  return retval;
1819 }
1820 
1821 /*
1822  * Individual SLRUs (clog, ...) have to provide a sync.c handler function so
1823  * that they can provide the correct "SlruCtl" (otherwise we don't know how to
1824  * build the path), but they just forward to this common implementation that
1825  * performs the fsync.
1826  */
1827 int
1828 SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path)
1829 {
1830  int fd;
1831  int save_errno;
1832  int result;
1833 
1834  SlruFileName(ctl, path, ftag->segno);
1835 
1836  fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
1837  if (fd < 0)
1838  return -1;
1839 
1840  pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC);
1841  result = pg_fsync(fd);
1843  save_errno = errno;
1844 
1846 
1847  errno = save_errno;
1848  return result;
1849 }
static void pg_atomic_write_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition: atomics.h:485
static void pg_atomic_init_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition: atomics.h:453
static uint64 pg_atomic_read_u64(volatile pg_atomic_uint64 *ptr)
Definition: atomics.h:467
unsigned int uint32
Definition: c.h:506
#define Min(x, y)
Definition: c.h:1004
#define MAXALIGN(LEN)
Definition: c.h:811
#define Max(x, y)
Definition: c.h:998
#define strtoi64(str, endptr, base)
Definition: c.h:1297
#define BUFFERALIGN(LEN)
Definition: c.h:813
#define Assert(condition)
Definition: c.h:858
#define PG_BINARY
Definition: c.h:1273
#define MemSet(start, val, len)
Definition: c.h:1020
uint32 TransactionId
Definition: c.h:652
size_t Size
Definition: c.h:605
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1157
int errcode_for_file_access(void)
Definition: elog.c:876
int errdetail(const char *fmt,...)
Definition: elog.c:1203
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define LOG
Definition: elog.h:31
#define DEBUG2
Definition: elog.h:29
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2932
int FreeDir(DIR *dir)
Definition: fd.c:2984
int CloseTransientFile(int fd)
Definition: fd.c:2832
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:756
int data_sync_elevel(int elevel)
Definition: fd.c:3960
int pg_fsync(int fd)
Definition: fd.c:386
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2656
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2866
int NBuffers
Definition: globals.c:141
bool IsUnderPostmaster
Definition: globals.c:119
#define newval
#define GUC_check_errdetail
Definition: guc.h:476
int i
Definition: isn.c:73
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1893
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1168
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1937
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1781
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition: lwlock.c:707
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1339
@ LW_SHARED
Definition: lwlock.h:115
@ LW_EXCLUSIVE
Definition: lwlock.h:114
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
#define MAXPGPATH
const void size_t len
const void * data
static char * filename
Definition: pg_dumpall.c:119
static XLogRecPtr endpos
Definition: pg_receivewal.c:56
void pgstat_count_slru_page_exists(int slru_idx)
Definition: pgstat_slru.c:71
void pgstat_count_slru_page_read(int slru_idx)
Definition: pgstat_slru.c:77
int pgstat_get_slru_index(const char *name)
Definition: pgstat_slru.c:132
void pgstat_count_slru_page_hit(int slru_idx)
Definition: pgstat_slru.c:65
void pgstat_count_slru_page_zeroed(int slru_idx)
Definition: pgstat_slru.c:59
void pgstat_count_slru_truncate(int slru_idx)
Definition: pgstat_slru.c:95
void pgstat_count_slru_page_written(int slru_idx)
Definition: pgstat_slru.c:83
void pgstat_count_slru_flush(int slru_idx)
Definition: pgstat_slru.c:89
#define pg_pwrite
Definition: port.h:226
#define pg_pread
Definition: port.h:225
#define snprintf
Definition: port.h:238
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static int fd(const char *x, int i)
Definition: preproc-init.c:105
tree ctl
Definition: radixtree.h:1853
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:387
void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, const char *subdir, int buffer_tranche_id, int bank_tranche_id, SyncRequestHandler sync_handler, bool long_segment_names)
Definition: slru.c:252
static int SlruFileName(SlruCtl ctl, char *path, int64 segno)
Definition: slru.c:91
static bool SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
Definition: slru.c:801
int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int64 pageno, TransactionId xid)
Definition: slru.c:605
#define INIT_SLRUFILETAG(a, xx_handler, xx_segno)
Definition: slru.c:157
void SimpleLruWritePage(SlruCtl ctl, int slotno)
Definition: slru.c:729
void SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied)
Definition: slru.c:1319
static bool SlruMayDeleteSegment(SlruCtl ctl, int64 segpage, int64 cutoffPage)
Definition: slru.c:1600
static void SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid)
Definition: slru.c:1045
struct SlruWriteAllData SlruWriteAllData
static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno)
Definition: slru.c:428
#define SLRU_BANK_SIZE
Definition: slru.c:144
int SimpleLruAutotuneBuffers(int divisor, int max)
Definition: slru.c:232
static bool SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
Definition: slru.c:873
static bool SlruCorrectSegmentFilenameLength(SlruCtl ctl, size_t len)
Definition: slru.c:1755
static SlruErrorCause slru_errcause
Definition: slru.c:175
#define MAX_WRITEALL_BUFFERS
Definition: slru.c:124
static void SimpleLruWaitIO(SlruCtl ctl, int slotno)
Definition: slru.c:445
static int slru_errno
Definition: slru.c:176
bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int64 pageno)
Definition: slru.c:743
void SlruDeleteSegment(SlruCtl ctl, int64 segno)
Definition: slru.c:1523
static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata)
Definition: slru.c:652
bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
Definition: slru.c:1788
bool SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int64 segpage, void *data)
Definition: slru.c:1741
int SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok, TransactionId xid)
Definition: slru.c:502
int SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path)
Definition: slru.c:1828
static int SlruSelectLRUPage(SlruCtl ctl, int64 pageno)
Definition: slru.c:1166
#define SlotGetBankNumber(slotno)
Definition: slru.c:149
int SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
Definition: slru.c:375
void SimpleLruTruncate(SlruCtl ctl, int64 cutoffPage)
Definition: slru.c:1405
static void SlruInternalDeleteSegment(SlruCtl ctl, int64 segno)
Definition: slru.c:1500
struct SlruWriteAllData * SlruWriteAll
Definition: slru.c:133
SlruErrorCause
Definition: slru.c:166
@ SLRU_WRITE_FAILED
Definition: slru.c:170
@ SLRU_FSYNC_FAILED
Definition: slru.c:171
@ SLRU_SEEK_FAILED
Definition: slru.c:168
@ SLRU_OPEN_FAILED
Definition: slru.c:167
@ SLRU_CLOSE_FAILED
Definition: slru.c:172
@ SLRU_READ_FAILED
Definition: slru.c:169
Size SimpleLruShmemSize(int nslots, int nlsns)
Definition: slru.c:199
bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int64 segpage, void *data)
Definition: slru.c:1709
static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int64 segpage, void *data)
Definition: slru.c:1725
static void SlruRecentlyUsed(SlruShared shared, int slotno)
Definition: slru.c:1120
bool check_slru_buffers(const char *name, int *newval)
Definition: slru.c:355
static LWLock * SimpleLruGetBankLock(SlruCtl ctl, int64 pageno)
Definition: slru.h:178
SlruSharedData * SlruShared
Definition: slru.h:121
#define SlruPagePrecedesUnitTests(ctl, per_page)
Definition: slru.h:202
bool(* SlruScanCallback)(SlruCtl ctl, char *filename, int64 segpage, void *data)
Definition: slru.h:207
#define SLRU_PAGES_PER_SEGMENT
Definition: slru.h:39
#define SLRU_MAX_ALLOWED_BUFFERS
Definition: slru.h:24
SlruPageStatus
Definition: slru.h:48
@ SLRU_PAGE_VALID
Definition: slru.h:51
@ SLRU_PAGE_WRITE_IN_PROGRESS
Definition: slru.h:52
@ SLRU_PAGE_READ_IN_PROGRESS
Definition: slru.h:50
@ SLRU_PAGE_EMPTY
Definition: slru.h:49
int ckpt_bufs_written
Definition: xlog.h:167
Definition: dirent.c:26
Definition: sync.h:51
uint64 segno
Definition: sync.h:55
Definition: lwlock.h:42
int slru_stats_idx
Definition: slru.h:118
int64 * page_number
Definition: slru.h:73
int num_slots
Definition: slru.h:64
LWLockPadded * bank_locks
Definition: slru.h:80
int * page_lru_count
Definition: slru.h:74
pg_atomic_uint64 latest_page_number
Definition: slru.h:115
XLogRecPtr * group_lsn
Definition: slru.h:107
int * bank_cur_lru_count
Definition: slru.h:97
int lsn_groups_per_page
Definition: slru.h:108
SlruPageStatus * page_status
Definition: slru.h:71
bool * page_dirty
Definition: slru.h:72
LWLockPadded * buffer_locks
Definition: slru.h:77
char ** page_buffer
Definition: slru.h:70
int num_files
Definition: slru.c:128
int fd[MAX_WRITEALL_BUFFERS]
Definition: slru.c:129
int64 segno[MAX_WRITEALL_BUFFERS]
Definition: slru.c:130
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, bool retryOnError)
Definition: sync.c:580
SyncRequestHandler
Definition: sync.h:36
@ SYNC_HANDLER_NONE
Definition: sync.h:42
@ SYNC_FORGET_REQUEST
Definition: sync.h:27
@ SYNC_REQUEST
Definition: sync.h:25
static void callback(struct sockaddr *addr, struct sockaddr *mask, void *unused)
Definition: test_ifaddrs.c:46
bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.c:280
bool TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2)
Definition: transam.c:329
#define InvalidTransactionId
Definition: transam.h:31
LWLock lock
Definition: lwlock.h:70
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:85
static void pgstat_report_wait_end(void)
Definition: wait_event.h:101
const char * name
CheckpointStatsData CheckpointStats
Definition: xlog.c:208
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2795
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
uint64 XLogRecPtr
Definition: xlogdefs.h:21
bool InRecovery
Definition: xlogutils.c:50