/*
 * PostgreSQL source code (git master) -- slru.c
 * (extracted from the generated source-browser page for this file)
 */
1 /*-------------------------------------------------------------------------
2  *
3  * slru.c
4  * Simple LRU buffering for transaction status logfiles
5  *
6  * We use a simple least-recently-used scheme to manage a pool of page
7  * buffers. Under ordinary circumstances we expect that write
8  * traffic will occur mostly to the latest page (and to the just-prior
9  * page, soon after a page transition). Read traffic will probably touch
10  * a larger span of pages, but in any case a fairly small number of page
11  * buffers should be sufficient. So, we just search the buffers using plain
12  * linear search; there's no need for a hashtable or anything fancy.
13  * The management algorithm is straight LRU except that we will never swap
14  * out the latest page (since we know it's going to be hit again eventually).
15  *
16  * We use a control LWLock to protect the shared data structures, plus
17  * per-buffer LWLocks that synchronize I/O for each buffer. The control lock
18  * must be held to examine or modify any shared state. A process that is
19  * reading in or writing out a page buffer does not hold the control lock,
20  * only the per-buffer lock for the buffer it is working on. One exception
21  * is latest_page_number, which is read and written using atomic ops.
22  *
23  * "Holding the control lock" means exclusive lock in all cases except for
24  * SimpleLruReadPage_ReadOnly(); see comments for SlruRecentlyUsed() for
25  * the implications of that.
26  *
27  * When initiating I/O on a buffer, we acquire the per-buffer lock exclusively
28  * before releasing the control lock. The per-buffer lock is released after
29  * completing the I/O, re-acquiring the control lock, and updating the shared
30  * state. (Deadlock is not possible here, because we never try to initiate
31  * I/O when someone else is already doing I/O on the same buffer.)
32  * To wait for I/O to complete, release the control lock, acquire the
33  * per-buffer lock in shared mode, immediately release the per-buffer lock,
34  * reacquire the control lock, and then recheck state (since arbitrary things
35  * could have happened while we didn't have the lock).
36  *
37  * As with the regular buffer manager, it is possible for another process
38  * to re-dirty a page that is currently being written out. This is handled
39  * by re-setting the page's page_dirty flag.
40  *
41  *
42  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
43  * Portions Copyright (c) 1994, Regents of the University of California
44  *
45  * src/backend/access/transam/slru.c
46  *
47  *-------------------------------------------------------------------------
48  */
49 #include "postgres.h"
50 
51 #include <fcntl.h>
52 #include <sys/stat.h>
53 #include <unistd.h>
54 
55 #include "access/slru.h"
56 #include "access/transam.h"
57 #include "access/xlog.h"
58 #include "access/xlogutils.h"
59 #include "miscadmin.h"
60 #include "pgstat.h"
61 #include "storage/fd.h"
62 #include "storage/shmem.h"
63 
64 static inline int
65 SlruFileName(SlruCtl ctl, char *path, int64 segno)
66 {
67  if (ctl->long_segment_names)
68  {
69  /*
70  * We could use 16 characters here but the disadvantage would be that
71  * the SLRU segments will be hard to distinguish from WAL segments.
72  *
73  * For this reason we use 15 characters. It is enough but also means
74  * that in the future we can't decrease SLRU_PAGES_PER_SEGMENT easily.
75  */
76  Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFFFFFFFFFFF));
77  return snprintf(path, MAXPGPATH, "%s/%015llX", ctl->Dir,
78  (long long) segno);
79  }
80  else
81  {
82  /*
83  * Despite the fact that %04X format string is used up to 24 bit
84  * integers are allowed. See SlruCorrectSegmentFilenameLength()
85  */
86  Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFF));
87  return snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir,
88  (unsigned int) segno);
89  }
90 }
91 
92 /*
93  * During SimpleLruWriteAll(), we will usually not need to write more than one
94  * or two physical files, but we may need to write several pages per file. We
95  * can consolidate the I/O requests by leaving files open until control returns
96  * to SimpleLruWriteAll(). This data structure remembers which files are open.
97  */
98 #define MAX_WRITEALL_BUFFERS 16
99 
100 typedef struct SlruWriteAllData
101 {
102  int num_files; /* # files actually open */
103  int fd[MAX_WRITEALL_BUFFERS]; /* their FD's */
104  int64 segno[MAX_WRITEALL_BUFFERS]; /* their log seg#s */
106 
108 
109 /*
110  * Populate a file tag describing a segment file. We only use the segment
111  * number, since we can derive everything else we need by having separate
112  * sync handler functions for clog, multixact etc.
113  */
/*
 * Note: "a" must be a plain FileTag lvalue; it is expanded (and therefore
 * evaluated) more than once, so avoid side effects in the arguments.
 */
#define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \
( \
    memset(&(a), 0, sizeof(FileTag)), \
    (a).handler = (xx_handler), \
    (a).segno = (xx_segno) \
)
120 
121 /*
122  * Macro to mark a buffer slot "most recently used". Note multiple evaluation
123  * of arguments!
124  *
125  * The reason for the if-test is that there are often many consecutive
126  * accesses to the same page (particularly the latest page). By suppressing
127  * useless increments of cur_lru_count, we reduce the probability that old
128  * pages' counts will "wrap around" and make them appear recently used.
129  *
130  * We allow this code to be executed concurrently by multiple processes within
131  * SimpleLruReadPage_ReadOnly(). As long as int reads and writes are atomic,
132  * this should not cause any completely-bogus values to enter the computation.
133  * However, it is possible for either cur_lru_count or individual
134  * page_lru_count entries to be "reset" to lower values than they should have,
135  * in case a process is delayed while it executes this macro. With care in
136  * SlruSelectLRUPage(), this does little harm, and in any case the absolute
137  * worst possible consequence is a nonoptimal choice of page to evict. The
138  * gain from allowing concurrent reads of SLRU pages seems worth it.
139  */
#define SlruRecentlyUsed(shared, slotno) \
    do { \
        /* snapshot the global counter with a plain int read (see above) */ \
        int     new_lru_count = (shared)->cur_lru_count; \
        /* skip the update when this slot is already the most recent user */ \
        if (new_lru_count != (shared)->page_lru_count[slotno]) { \
            (shared)->cur_lru_count = ++new_lru_count; \
            (shared)->page_lru_count[slotno] = new_lru_count; \
        } \
    } while (0)
148 
149 /* Saved info for SlruReportIOError */
/*
 * Possible I/O failure causes; each member corresponds to a case in
 * SlruReportIOError()'s switch below.
 */
typedef enum
{
    SLRU_OPEN_FAILED,
    SLRU_SEEK_FAILED,
    SLRU_READ_FAILED,
    SLRU_WRITE_FAILED,
    SLRU_FSYNC_FAILED,
    SLRU_CLOSE_FAILED,
} SlruErrorCause;

/* cause of the most recent physical-I/O failure, for SlruReportIOError */
static SlruErrorCause slru_errcause;
/* errno captured at the point of that failure */
static int  slru_errno;
163 
/* Internal helpers; see each function's header comment for its contract */
static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata);
static bool SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno);
static bool SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno,
                                  SlruWriteAll fdata);
static void SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid);
static int  SlruSelectLRUPage(SlruCtl ctl, int64 pageno);

/* directory-scan callback and segment-deletion helpers */
static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename,
                                      int64 segpage, void *data);
static void SlruInternalDeleteSegment(SlruCtl ctl, int64 segno);
176 
177 
178 /*
179  * Initialization of shared memory
180  */
181 
182 Size
183 SimpleLruShmemSize(int nslots, int nlsns)
184 {
185  Size sz;
186 
187  /* we assume nslots isn't so large as to risk overflow */
188  sz = MAXALIGN(sizeof(SlruSharedData));
189  sz += MAXALIGN(nslots * sizeof(char *)); /* page_buffer[] */
190  sz += MAXALIGN(nslots * sizeof(SlruPageStatus)); /* page_status[] */
191  sz += MAXALIGN(nslots * sizeof(bool)); /* page_dirty[] */
192  sz += MAXALIGN(nslots * sizeof(int64)); /* page_number[] */
193  sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */
194  sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */
195 
196  if (nlsns > 0)
197  sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */
198 
199  return BUFFERALIGN(sz) + BLCKSZ * nslots;
200 }
201 
202 /*
203  * Initialize, or attach to, a simple LRU cache in shared memory.
204  *
205  * ctl: address of local (unshared) control structure.
206  * name: name of SLRU. (This is user-visible, pick with care!)
207  * nslots: number of page slots to use.
208  * nlsns: number of LSN groups per page (set to zero if not relevant).
209  * ctllock: LWLock to use to control access to the shared control structure.
210  * subdir: PGDATA-relative subdirectory that will contain the files.
211  * tranche_id: LWLock tranche ID to use for the SLRU's per-buffer LWLocks.
212  * sync_handler: which set of functions to use to handle sync requests
213  */
214 void
215 SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
216  LWLock *ctllock, const char *subdir, int tranche_id,
217  SyncRequestHandler sync_handler, bool long_segment_names)
218 {
219  SlruShared shared;
220  bool found;
221 
222  shared = (SlruShared) ShmemInitStruct(name,
223  SimpleLruShmemSize(nslots, nlsns),
224  &found);
225 
226  if (!IsUnderPostmaster)
227  {
228  /* Initialize locks and shared memory area */
229  char *ptr;
230  Size offset;
231  int slotno;
232 
233  Assert(!found);
234 
235  memset(shared, 0, sizeof(SlruSharedData));
236 
237  shared->ControlLock = ctllock;
238 
239  shared->num_slots = nslots;
240  shared->lsn_groups_per_page = nlsns;
241 
242  shared->cur_lru_count = 0;
244 
246 
247  ptr = (char *) shared;
248  offset = MAXALIGN(sizeof(SlruSharedData));
249  shared->page_buffer = (char **) (ptr + offset);
250  offset += MAXALIGN(nslots * sizeof(char *));
251  shared->page_status = (SlruPageStatus *) (ptr + offset);
252  offset += MAXALIGN(nslots * sizeof(SlruPageStatus));
253  shared->page_dirty = (bool *) (ptr + offset);
254  offset += MAXALIGN(nslots * sizeof(bool));
255  shared->page_number = (int64 *) (ptr + offset);
256  offset += MAXALIGN(nslots * sizeof(int64));
257  shared->page_lru_count = (int *) (ptr + offset);
258  offset += MAXALIGN(nslots * sizeof(int));
259 
260  /* Initialize LWLocks */
261  shared->buffer_locks = (LWLockPadded *) (ptr + offset);
262  offset += MAXALIGN(nslots * sizeof(LWLockPadded));
263 
264  if (nlsns > 0)
265  {
266  shared->group_lsn = (XLogRecPtr *) (ptr + offset);
267  offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
268  }
269 
270  ptr += BUFFERALIGN(offset);
271  for (slotno = 0; slotno < nslots; slotno++)
272  {
273  LWLockInitialize(&shared->buffer_locks[slotno].lock,
274  tranche_id);
275 
276  shared->page_buffer[slotno] = ptr;
277  shared->page_status[slotno] = SLRU_PAGE_EMPTY;
278  shared->page_dirty[slotno] = false;
279  shared->page_lru_count[slotno] = 0;
280  ptr += BLCKSZ;
281  }
282 
283  /* Should fit to estimated shmem size */
284  Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
285  }
286  else
287  Assert(found);
288 
289  /*
290  * Initialize the unshared control struct, including directory path. We
291  * assume caller set PagePrecedes.
292  */
293  ctl->shared = shared;
294  ctl->sync_handler = sync_handler;
295  ctl->long_segment_names = long_segment_names;
296  strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir));
297 }
298 
299 /*
300  * Initialize (or reinitialize) a page to zeroes.
301  *
302  * The page is not actually written, just set up in shared memory.
303  * The slot number of the new page is returned.
304  *
305  * Control lock must be held at entry, and will be held at exit.
306  */
307 int
308 SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
309 {
310  SlruShared shared = ctl->shared;
311  int slotno;
312 
313  /* Find a suitable buffer slot for the page */
314  slotno = SlruSelectLRUPage(ctl, pageno);
315  Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
316  (shared->page_status[slotno] == SLRU_PAGE_VALID &&
317  !shared->page_dirty[slotno]) ||
318  shared->page_number[slotno] == pageno);
319 
320  /* Mark the slot as containing this page */
321  shared->page_number[slotno] = pageno;
322  shared->page_status[slotno] = SLRU_PAGE_VALID;
323  shared->page_dirty[slotno] = true;
324  SlruRecentlyUsed(shared, slotno);
325 
326  /* Set the buffer to zeroes */
327  MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
328 
329  /* Set the LSNs for this new page to zero */
330  SimpleLruZeroLSNs(ctl, slotno);
331 
332  /*
333  * Assume this page is now the latest active page.
334  *
335  * Note that because both this routine and SlruSelectLRUPage run with
336  * ControlLock held, it is not possible for this to be zeroing a page that
337  * SlruSelectLRUPage is going to evict simultaneously. Therefore, there's
338  * no memory barrier here.
339  */
340  pg_atomic_write_u64(&shared->latest_page_number, pageno);
341 
342  /* update the stats counter of zeroed pages */
344 
345  return slotno;
346 }
347 
348 /*
349  * Zero all the LSNs we store for this slru page.
350  *
351  * This should be called each time we create a new page, and each time we read
352  * in a page from disk into an existing buffer. (Such an old page cannot
353  * have any interesting LSNs, since we'd have flushed them before writing
354  * the page in the first place.)
355  *
356  * This assumes that InvalidXLogRecPtr is bitwise-all-0.
357  */
358 static void
359 SimpleLruZeroLSNs(SlruCtl ctl, int slotno)
360 {
361  SlruShared shared = ctl->shared;
362 
363  if (shared->lsn_groups_per_page > 0)
364  MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
365  shared->lsn_groups_per_page * sizeof(XLogRecPtr));
366 }
367 
368 /*
369  * Wait for any active I/O on a page slot to finish. (This does not
370  * guarantee that new I/O hasn't been started before we return, though.
371  * In fact the slot might not even contain the same page anymore.)
372  *
373  * Control lock must be held at entry, and will be held at exit.
374  */
375 static void
376 SimpleLruWaitIO(SlruCtl ctl, int slotno)
377 {
378  SlruShared shared = ctl->shared;
379 
380  /* See notes at top of file */
381  LWLockRelease(shared->ControlLock);
382  LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED);
383  LWLockRelease(&shared->buffer_locks[slotno].lock);
385 
386  /*
387  * If the slot is still in an io-in-progress state, then either someone
388  * already started a new I/O on the slot, or a previous I/O failed and
389  * neglected to reset the page state. That shouldn't happen, really, but
390  * it seems worth a few extra cycles to check and recover from it. We can
391  * cheaply test for failure by seeing if the buffer lock is still held (we
392  * assume that transaction abort would release the lock).
393  */
394  if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
395  shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
396  {
397  if (LWLockConditionalAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED))
398  {
399  /* indeed, the I/O must have failed */
400  if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
401  shared->page_status[slotno] = SLRU_PAGE_EMPTY;
402  else /* write_in_progress */
403  {
404  shared->page_status[slotno] = SLRU_PAGE_VALID;
405  shared->page_dirty[slotno] = true;
406  }
407  LWLockRelease(&shared->buffer_locks[slotno].lock);
408  }
409  }
410 }
411 
412 /*
413  * Find a page in a shared buffer, reading it in if necessary.
414  * The page number must correspond to an already-initialized page.
415  *
416  * If write_ok is true then it is OK to return a page that is in
417  * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure
418  * that modification of the page is safe. If write_ok is false then we
419  * will not return the page until it is not undergoing active I/O.
420  *
421  * The passed-in xid is used only for error reporting, and may be
422  * InvalidTransactionId if no specific xid is associated with the action.
423  *
424  * Return value is the shared-buffer slot number now holding the page.
425  * The buffer's LRU access info is updated.
426  *
427  * Control lock must be held at entry, and will be held at exit.
428  */
429 int
430 SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok,
431  TransactionId xid)
432 {
433  SlruShared shared = ctl->shared;
434 
435  /* Outer loop handles restart if we must wait for someone else's I/O */
436  for (;;)
437  {
438  int slotno;
439  bool ok;
440 
441  /* See if page already is in memory; if not, pick victim slot */
442  slotno = SlruSelectLRUPage(ctl, pageno);
443 
444  /* Did we find the page in memory? */
445  if (shared->page_number[slotno] == pageno &&
446  shared->page_status[slotno] != SLRU_PAGE_EMPTY)
447  {
448  /*
449  * If page is still being read in, we must wait for I/O. Likewise
450  * if the page is being written and the caller said that's not OK.
451  */
452  if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
453  (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
454  !write_ok))
455  {
456  SimpleLruWaitIO(ctl, slotno);
457  /* Now we must recheck state from the top */
458  continue;
459  }
460  /* Otherwise, it's ready to use */
461  SlruRecentlyUsed(shared, slotno);
462 
463  /* update the stats counter of pages found in the SLRU */
465 
466  return slotno;
467  }
468 
469  /* We found no match; assert we selected a freeable slot */
470  Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
471  (shared->page_status[slotno] == SLRU_PAGE_VALID &&
472  !shared->page_dirty[slotno]));
473 
474  /* Mark the slot read-busy */
475  shared->page_number[slotno] = pageno;
476  shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS;
477  shared->page_dirty[slotno] = false;
478 
479  /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
480  LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
481 
482  /* Release control lock while doing I/O */
483  LWLockRelease(shared->ControlLock);
484 
485  /* Do the read */
486  ok = SlruPhysicalReadPage(ctl, pageno, slotno);
487 
488  /* Set the LSNs for this newly read-in page to zero */
489  SimpleLruZeroLSNs(ctl, slotno);
490 
491  /* Re-acquire control lock and update page state */
493 
494  Assert(shared->page_number[slotno] == pageno &&
495  shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS &&
496  !shared->page_dirty[slotno]);
497 
498  shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY;
499 
500  LWLockRelease(&shared->buffer_locks[slotno].lock);
501 
502  /* Now it's okay to ereport if we failed */
503  if (!ok)
504  SlruReportIOError(ctl, pageno, xid);
505 
506  SlruRecentlyUsed(shared, slotno);
507 
508  /* update the stats counter of pages not found in SLRU */
510 
511  return slotno;
512  }
513 }
514 
515 /*
516  * Find a page in a shared buffer, reading it in if necessary.
517  * The page number must correspond to an already-initialized page.
518  * The caller must intend only read-only access to the page.
519  *
520  * The passed-in xid is used only for error reporting, and may be
521  * InvalidTransactionId if no specific xid is associated with the action.
522  *
523  * Return value is the shared-buffer slot number now holding the page.
524  * The buffer's LRU access info is updated.
525  *
526  * Control lock must NOT be held at entry, but will be held at exit.
527  * It is unspecified whether the lock will be shared or exclusive.
528  */
529 int
531 {
532  SlruShared shared = ctl->shared;
533  int slotno;
534 
535  /* Try to find the page while holding only shared lock */
537 
538  /* See if page is already in a buffer */
539  for (slotno = 0; slotno < shared->num_slots; slotno++)
540  {
541  if (shared->page_number[slotno] == pageno &&
542  shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
543  shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
544  {
545  /* See comments for SlruRecentlyUsed macro */
546  SlruRecentlyUsed(shared, slotno);
547 
548  /* update the stats counter of pages found in the SLRU */
550 
551  return slotno;
552  }
553  }
554 
555  /* No luck, so switch to normal exclusive lock and do regular read */
556  LWLockRelease(shared->ControlLock);
558 
559  return SimpleLruReadPage(ctl, pageno, true, xid);
560 }
561 
562 /*
563  * Write a page from a shared buffer, if necessary.
564  * Does nothing if the specified slot is not dirty.
565  *
566  * NOTE: only one write attempt is made here. Hence, it is possible that
567  * the page is still dirty at exit (if someone else re-dirtied it during
568  * the write). However, we *do* attempt a fresh write even if the page
569  * is already being written; this is for checkpoints.
570  *
571  * Control lock must be held at entry, and will be held at exit.
572  */
573 static void
575 {
576  SlruShared shared = ctl->shared;
577  int64 pageno = shared->page_number[slotno];
578  bool ok;
579 
580  /* If a write is in progress, wait for it to finish */
581  while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
582  shared->page_number[slotno] == pageno)
583  {
584  SimpleLruWaitIO(ctl, slotno);
585  }
586 
587  /*
588  * Do nothing if page is not dirty, or if buffer no longer contains the
589  * same page we were called for.
590  */
591  if (!shared->page_dirty[slotno] ||
592  shared->page_status[slotno] != SLRU_PAGE_VALID ||
593  shared->page_number[slotno] != pageno)
594  return;
595 
596  /*
597  * Mark the slot write-busy, and clear the dirtybit. After this point, a
598  * transaction status update on this page will mark it dirty again.
599  */
600  shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
601  shared->page_dirty[slotno] = false;
602 
603  /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
604  LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
605 
606  /* Release control lock while doing I/O */
607  LWLockRelease(shared->ControlLock);
608 
609  /* Do the write */
610  ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);
611 
612  /* If we failed, and we're in a flush, better close the files */
613  if (!ok && fdata)
614  {
615  int i;
616 
617  for (i = 0; i < fdata->num_files; i++)
618  CloseTransientFile(fdata->fd[i]);
619  }
620 
621  /* Re-acquire control lock and update page state */
623 
624  Assert(shared->page_number[slotno] == pageno &&
625  shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS);
626 
627  /* If we failed to write, mark the page dirty again */
628  if (!ok)
629  shared->page_dirty[slotno] = true;
630 
631  shared->page_status[slotno] = SLRU_PAGE_VALID;
632 
633  LWLockRelease(&shared->buffer_locks[slotno].lock);
634 
635  /* Now it's okay to ereport if we failed */
636  if (!ok)
638 
639  /* If part of a checkpoint, count this as a buffer written. */
640  if (fdata)
642 }
643 
644 /*
645  * Wrapper of SlruInternalWritePage, for external callers.
646  * fdata is always passed a NULL here.
647  */
void
SimpleLruWritePage(SlruCtl ctl, int slotno)
{
    /* standalone write: no batched open-file state to carry */
    SlruInternalWritePage(ctl, slotno, NULL);
}
653 
654 /*
655  * Return whether the given page exists on disk.
656  *
657  * A false return means that either the file does not exist, or that it's not
658  * large enough to contain the given page.
659  */
660 bool
662 {
663  int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
664  int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
665  int offset = rpageno * BLCKSZ;
666  char path[MAXPGPATH];
667  int fd;
668  bool result;
669  off_t endpos;
670 
671  /* update the stats counter of checked pages */
673 
674  SlruFileName(ctl, path, segno);
675 
676  fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
677  if (fd < 0)
678  {
679  /* expected: file doesn't exist */
680  if (errno == ENOENT)
681  return false;
682 
683  /* report error normally */
685  slru_errno = errno;
686  SlruReportIOError(ctl, pageno, 0);
687  }
688 
689  if ((endpos = lseek(fd, 0, SEEK_END)) < 0)
690  {
692  slru_errno = errno;
693  SlruReportIOError(ctl, pageno, 0);
694  }
695 
696  result = endpos >= (off_t) (offset + BLCKSZ);
697 
698  if (CloseTransientFile(fd) != 0)
699  {
701  slru_errno = errno;
702  return false;
703  }
704 
705  return result;
706 }
707 
708 /*
709  * Physical read of a (previously existing) page into a buffer slot
710  *
711  * On failure, we cannot just ereport(ERROR) since caller has put state in
712  * shared memory that must be undone. So, we return false and save enough
713  * info in static variables to let SlruReportIOError make the report.
714  *
715  * For now, assume it's not worth keeping a file pointer open across
716  * read/write operations. We could cache one virtual file pointer ...
717  */
718 static bool
719 SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
720 {
721  SlruShared shared = ctl->shared;
722  int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
723  int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
724  off_t offset = rpageno * BLCKSZ;
725  char path[MAXPGPATH];
726  int fd;
727 
728  SlruFileName(ctl, path, segno);
729 
730  /*
731  * In a crash-and-restart situation, it's possible for us to receive
732  * commands to set the commit status of transactions whose bits are in
733  * already-truncated segments of the commit log (see notes in
734  * SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case
735  * where the file doesn't exist, and return zeroes instead.
736  */
737  fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
738  if (fd < 0)
739  {
740  if (errno != ENOENT || !InRecovery)
741  {
743  slru_errno = errno;
744  return false;
745  }
746 
747  ereport(LOG,
748  (errmsg("file \"%s\" doesn't exist, reading as zeroes",
749  path)));
750  MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
751  return true;
752  }
753 
754  errno = 0;
755  pgstat_report_wait_start(WAIT_EVENT_SLRU_READ);
756  if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
757  {
760  slru_errno = errno;
762  return false;
763  }
765 
766  if (CloseTransientFile(fd) != 0)
767  {
769  slru_errno = errno;
770  return false;
771  }
772 
773  return true;
774 }
775 
776 /*
777  * Physical write of a page from a buffer slot
778  *
779  * On failure, we cannot just ereport(ERROR) since caller has put state in
780  * shared memory that must be undone. So, we return false and save enough
781  * info in static variables to let SlruReportIOError make the report.
782  *
783  * For now, assume it's not worth keeping a file pointer open across
784  * independent read/write operations. We do batch operations during
785  * SimpleLruWriteAll, though.
786  *
787  * fdata is NULL for a standalone write, pointer to open-file info during
788  * SimpleLruWriteAll.
789  */
790 static bool
791 SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
792 {
793  SlruShared shared = ctl->shared;
794  int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
795  int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
796  off_t offset = rpageno * BLCKSZ;
797  char path[MAXPGPATH];
798  int fd = -1;
799 
800  /* update the stats counter of written pages */
802 
803  /*
804  * Honor the write-WAL-before-data rule, if appropriate, so that we do not
805  * write out data before associated WAL records. This is the same action
806  * performed during FlushBuffer() in the main buffer manager.
807  */
808  if (shared->group_lsn != NULL)
809  {
810  /*
811  * We must determine the largest async-commit LSN for the page. This
812  * is a bit tedious, but since this entire function is a slow path
813  * anyway, it seems better to do this here than to maintain a per-page
814  * LSN variable (which'd need an extra comparison in the
815  * transaction-commit path).
816  */
817  XLogRecPtr max_lsn;
818  int lsnindex,
819  lsnoff;
820 
821  lsnindex = slotno * shared->lsn_groups_per_page;
822  max_lsn = shared->group_lsn[lsnindex++];
823  for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
824  {
825  XLogRecPtr this_lsn = shared->group_lsn[lsnindex++];
826 
827  if (max_lsn < this_lsn)
828  max_lsn = this_lsn;
829  }
830 
831  if (!XLogRecPtrIsInvalid(max_lsn))
832  {
833  /*
834  * As noted above, elog(ERROR) is not acceptable here, so if
835  * XLogFlush were to fail, we must PANIC. This isn't much of a
836  * restriction because XLogFlush is just about all critical
837  * section anyway, but let's make sure.
838  */
840  XLogFlush(max_lsn);
842  }
843  }
844 
845  /*
846  * During a SimpleLruWriteAll, we may already have the desired file open.
847  */
848  if (fdata)
849  {
850  int i;
851 
852  for (i = 0; i < fdata->num_files; i++)
853  {
854  if (fdata->segno[i] == segno)
855  {
856  fd = fdata->fd[i];
857  break;
858  }
859  }
860  }
861 
862  if (fd < 0)
863  {
864  /*
865  * If the file doesn't already exist, we should create it. It is
866  * possible for this to need to happen when writing a page that's not
867  * first in its segment; we assume the OS can cope with that. (Note:
868  * it might seem that it'd be okay to create files only when
869  * SimpleLruZeroPage is called for the first page of a segment.
870  * However, if after a crash and restart the REDO logic elects to
871  * replay the log from a checkpoint before the latest one, then it's
872  * possible that we will get commands to set transaction status of
873  * transactions that have already been truncated from the commit log.
874  * Easiest way to deal with that is to accept references to
875  * nonexistent files here and in SlruPhysicalReadPage.)
876  *
877  * Note: it is possible for more than one backend to be executing this
878  * code simultaneously for different pages of the same file. Hence,
879  * don't use O_EXCL or O_TRUNC or anything like that.
880  */
881  SlruFileName(ctl, path, segno);
882  fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY);
883  if (fd < 0)
884  {
886  slru_errno = errno;
887  return false;
888  }
889 
890  if (fdata)
891  {
892  if (fdata->num_files < MAX_WRITEALL_BUFFERS)
893  {
894  fdata->fd[fdata->num_files] = fd;
895  fdata->segno[fdata->num_files] = segno;
896  fdata->num_files++;
897  }
898  else
899  {
900  /*
901  * In the unlikely event that we exceed MAX_WRITEALL_BUFFERS,
902  * fall back to treating it as a standalone write.
903  */
904  fdata = NULL;
905  }
906  }
907  }
908 
909  errno = 0;
910  pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
911  if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
912  {
914  /* if write didn't set errno, assume problem is no disk space */
915  if (errno == 0)
916  errno = ENOSPC;
918  slru_errno = errno;
919  if (!fdata)
921  return false;
922  }
924 
925  /* Queue up a sync request for the checkpointer. */
926  if (ctl->sync_handler != SYNC_HANDLER_NONE)
927  {
928  FileTag tag;
929 
930  INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
931  if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
932  {
933  /* No space to enqueue sync request. Do it synchronously. */
934  pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
935  if (pg_fsync(fd) != 0)
936  {
939  slru_errno = errno;
941  return false;
942  }
944  }
945  }
946 
947  /* Close file, unless part of flush request. */
948  if (!fdata)
949  {
950  if (CloseTransientFile(fd) != 0)
951  {
953  slru_errno = errno;
954  return false;
955  }
956  }
957 
958  return true;
959 }
960 
961 /*
962  * Issue the error message after failure of SlruPhysicalReadPage or
963  * SlruPhysicalWritePage. Call this after cleaning up shared-memory state.
964  */
965 static void
966 SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid)
967 {
968  int64 segno = pageno / SLRU_PAGES_PER_SEGMENT;
969  int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
970  int offset = rpageno * BLCKSZ;
971  char path[MAXPGPATH];
972 
973  SlruFileName(ctl, path, segno);
974  errno = slru_errno;
975  switch (slru_errcause)
976  {
977  case SLRU_OPEN_FAILED:
978  ereport(ERROR,
980  errmsg("could not access status of transaction %u", xid),
981  errdetail("Could not open file \"%s\": %m.", path)));
982  break;
983  case SLRU_SEEK_FAILED:
984  ereport(ERROR,
986  errmsg("could not access status of transaction %u", xid),
987  errdetail("Could not seek in file \"%s\" to offset %d: %m.",
988  path, offset)));
989  break;
990  case SLRU_READ_FAILED:
991  if (errno)
992  ereport(ERROR,
994  errmsg("could not access status of transaction %u", xid),
995  errdetail("Could not read from file \"%s\" at offset %d: %m.",
996  path, offset)));
997  else
998  ereport(ERROR,
999  (errmsg("could not access status of transaction %u", xid),
1000  errdetail("Could not read from file \"%s\" at offset %d: read too few bytes.", path, offset)));
1001  break;
1002  case SLRU_WRITE_FAILED:
1003  if (errno)
1004  ereport(ERROR,
1006  errmsg("could not access status of transaction %u", xid),
1007  errdetail("Could not write to file \"%s\" at offset %d: %m.",
1008  path, offset)));
1009  else
1010  ereport(ERROR,
1011  (errmsg("could not access status of transaction %u", xid),
1012  errdetail("Could not write to file \"%s\" at offset %d: wrote too few bytes.",
1013  path, offset)));
1014  break;
1015  case SLRU_FSYNC_FAILED:
1018  errmsg("could not access status of transaction %u", xid),
1019  errdetail("Could not fsync file \"%s\": %m.",
1020  path)));
1021  break;
1022  case SLRU_CLOSE_FAILED:
1023  ereport(ERROR,
1025  errmsg("could not access status of transaction %u", xid),
1026  errdetail("Could not close file \"%s\": %m.",
1027  path)));
1028  break;
1029  default:
1030  /* can't get here, we trust */
1031  elog(ERROR, "unrecognized SimpleLru error cause: %d",
1032  (int) slru_errcause);
1033  break;
1034  }
1035 }
1036 
1037 /*
1038  * Select the slot to re-use when we need a free slot.
1039  *
1040  * The target page number is passed because we need to consider the
1041  * possibility that some other process reads in the target page while
1042  * we are doing I/O to free a slot. Hence, check or recheck to see if
1043  * any slot already holds the target page, and return that slot if so.
1044  * Thus, the returned slot is *either* a slot already holding the pageno
1045  * (could be any state except EMPTY), *or* a freeable slot (state EMPTY
1046  * or CLEAN).
1047  *
1048  * Control lock must be held at entry, and will be held at exit.
1049  */
1050 static int
1051 SlruSelectLRUPage(SlruCtl ctl, int64 pageno)
1052 {
1053  SlruShared shared = ctl->shared;
1054 
1055  /* Outer loop handles restart after I/O */
1056  for (;;)
1057  {
1058  int slotno;
1059  int cur_count;
1060  int bestvalidslot = 0; /* keep compiler quiet */
1061  int best_valid_delta = -1;
1062  int64 best_valid_page_number = 0; /* keep compiler quiet */
1063  int bestinvalidslot = 0; /* keep compiler quiet */
1064  int best_invalid_delta = -1;
1065  int64 best_invalid_page_number = 0; /* keep compiler quiet */
1066 
1067  /* See if page already has a buffer assigned */
1068  for (slotno = 0; slotno < shared->num_slots; slotno++)
1069  {
1070  if (shared->page_number[slotno] == pageno &&
1071  shared->page_status[slotno] != SLRU_PAGE_EMPTY)
1072  return slotno;
1073  }
1074 
1075  /*
1076  * If we find any EMPTY slot, just select that one. Else choose a
1077  * victim page to replace. We normally take the least recently used
1078  * valid page, but we will never take the slot containing
1079  * latest_page_number, even if it appears least recently used. We
1080  * will select a slot that is already I/O busy only if there is no
1081  * other choice: a read-busy slot will not be least recently used once
1082  * the read finishes, and waiting for an I/O on a write-busy slot is
1083  * inferior to just picking some other slot. Testing shows the slot
1084  * we pick instead will often be clean, allowing us to begin a read at
1085  * once.
1086  *
1087  * Normally the page_lru_count values will all be different and so
1088  * there will be a well-defined LRU page. But since we allow
1089  * concurrent execution of SlruRecentlyUsed() within
1090  * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages
1091  * acquire the same lru_count values. In that case we break ties by
1092  * choosing the furthest-back page.
1093  *
1094  * Notice that this next line forcibly advances cur_lru_count to a
1095  * value that is certainly beyond any value that will be in the
1096  * page_lru_count array after the loop finishes. This ensures that
1097  * the next execution of SlruRecentlyUsed will mark the page newly
1098  * used, even if it's for a page that has the current counter value.
1099  * That gets us back on the path to having good data when there are
1100  * multiple pages with the same lru_count.
1101  */
1102  cur_count = (shared->cur_lru_count)++;
1103  for (slotno = 0; slotno < shared->num_slots; slotno++)
1104  {
1105  int this_delta;
1106  int64 this_page_number;
1107 
1108  if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1109  return slotno;
1110  this_delta = cur_count - shared->page_lru_count[slotno];
1111  if (this_delta < 0)
1112  {
1113  /*
1114  * Clean up in case shared updates have caused cur_count
1115  * increments to get "lost". We back off the page counts,
1116  * rather than trying to increase cur_count, to avoid any
1117  * question of infinite loops or failure in the presence of
1118  * wrapped-around counts.
1119  */
1120  shared->page_lru_count[slotno] = cur_count;
1121  this_delta = 0;
1122  }
1123 
1124  /*
1125  * If this page is the one most recently zeroed, don't consider it
1126  * an eviction candidate. See comments in SimpleLruZeroPage for an
1127  * explanation about the lack of a memory barrier here.
1128  */
1129  this_page_number = shared->page_number[slotno];
1130  if (this_page_number ==
1132  continue;
1133 
1134  if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1135  {
1136  if (this_delta > best_valid_delta ||
1137  (this_delta == best_valid_delta &&
1138  ctl->PagePrecedes(this_page_number,
1139  best_valid_page_number)))
1140  {
1141  bestvalidslot = slotno;
1142  best_valid_delta = this_delta;
1143  best_valid_page_number = this_page_number;
1144  }
1145  }
1146  else
1147  {
1148  if (this_delta > best_invalid_delta ||
1149  (this_delta == best_invalid_delta &&
1150  ctl->PagePrecedes(this_page_number,
1151  best_invalid_page_number)))
1152  {
1153  bestinvalidslot = slotno;
1154  best_invalid_delta = this_delta;
1155  best_invalid_page_number = this_page_number;
1156  }
1157  }
1158  }
1159 
1160  /*
1161  * If all pages (except possibly the latest one) are I/O busy, we'll
1162  * have to wait for an I/O to complete and then retry. In that
1163  * unhappy case, we choose to wait for the I/O on the least recently
1164  * used slot, on the assumption that it was likely initiated first of
1165  * all the I/Os in progress and may therefore finish first.
1166  */
1167  if (best_valid_delta < 0)
1168  {
1169  SimpleLruWaitIO(ctl, bestinvalidslot);
1170  continue;
1171  }
1172 
1173  /*
1174  * If the selected page is clean, we're set.
1175  */
1176  if (!shared->page_dirty[bestvalidslot])
1177  return bestvalidslot;
1178 
1179  /*
1180  * Write the page.
1181  */
1182  SlruInternalWritePage(ctl, bestvalidslot, NULL);
1183 
1184  /*
1185  * Now loop back and try again. This is the easiest way of dealing
1186  * with corner cases such as the victim page being re-dirtied while we
1187  * wrote it.
1188  */
1189  }
1190 }
1191 
1192 /*
1193  * Write dirty pages to disk during checkpoint or database shutdown. Flushing
1194  * is deferred until the next call to ProcessSyncRequests(), though we do fsync
1195  * the containing directory here to make sure that newly created directory
1196  * entries are on disk.
1197  */
1198 void
1199 SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied)
1200 {
1201  SlruShared shared = ctl->shared;
1202  SlruWriteAllData fdata;
1203  int slotno;
1204  int64 pageno = 0;
1205  int i;
1206  bool ok;
1207 
1208  /* update the stats counter of flushes */
1210 
1211  /*
1212  * Find and write dirty pages
1213  */
1214  fdata.num_files = 0;
1215 
1217 
1218  for (slotno = 0; slotno < shared->num_slots; slotno++)
1219  {
1220  SlruInternalWritePage(ctl, slotno, &fdata);
1221 
1222  /*
1223  * In some places (e.g. checkpoints), we cannot assert that the slot
1224  * is clean now, since another process might have re-dirtied it
1225  * already. That's okay.
1226  */
1227  Assert(allow_redirtied ||
1228  shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
1229  (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1230  !shared->page_dirty[slotno]));
1231  }
1232 
1233  LWLockRelease(shared->ControlLock);
1234 
1235  /*
1236  * Now close any files that were open
1237  */
1238  ok = true;
1239  for (i = 0; i < fdata.num_files; i++)
1240  {
1241  if (CloseTransientFile(fdata.fd[i]) != 0)
1242  {
1244  slru_errno = errno;
1245  pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
1246  ok = false;
1247  }
1248  }
1249  if (!ok)
1251 
1252  /* Ensure that directory entries for new files are on disk. */
1253  if (ctl->sync_handler != SYNC_HANDLER_NONE)
1254  fsync_fname(ctl->Dir, true);
1255 }
1256 
1257 /*
1258  * Remove all segments before the one holding the passed page number
1259  *
1260  * All SLRUs prevent concurrent calls to this function, either with an LWLock
1261  * or by calling it only as part of a checkpoint. Mutual exclusion must begin
1262  * before computing cutoffPage. Mutual exclusion must end after any limit
1263  * update that would permit other backends to write fresh data into the
1264  * segment immediately preceding the one containing cutoffPage. Otherwise,
1265  * when the SLRU is quite full, SimpleLruTruncate() might delete that segment
1266  * after it has accrued freshly-written data.
1267  */
1268 void
1269 SimpleLruTruncate(SlruCtl ctl, int64 cutoffPage)
1270 {
1271  SlruShared shared = ctl->shared;
1272 
1273  /* update the stats counter of truncates */
1275 
1276  /*
1277  * Scan shared memory and remove any pages preceding the cutoff page, to
1278  * ensure we won't rewrite them later. (Since this is normally called in
1279  * or just after a checkpoint, any dirty pages should have been flushed
1280  * already ... we're just being extra careful here.)
1281  */
1283 
1284 restart:
1285 
1286  /*
1287  * An important safety check: the current endpoint page must not be
1288  * eligible for removal. This check is just a backstop against wraparound
1289  * bugs elsewhere in SLRU handling, so we don't care if we read a slightly
1290  * outdated value; therefore we don't add a memory barrier.
1291  */
1293  cutoffPage))
1294  {
1295  LWLockRelease(shared->ControlLock);
1296  ereport(LOG,
1297  (errmsg("could not truncate directory \"%s\": apparent wraparound",
1298  ctl->Dir)));
1299  return;
1300  }
1301 
1302  for (int slotno = 0; slotno < shared->num_slots; slotno++)
1303  {
1304  if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1305  continue;
1306  if (!ctl->PagePrecedes(shared->page_number[slotno], cutoffPage))
1307  continue;
1308 
1309  /*
1310  * If page is clean, just change state to EMPTY (expected case).
1311  */
1312  if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1313  !shared->page_dirty[slotno])
1314  {
1315  shared->page_status[slotno] = SLRU_PAGE_EMPTY;
1316  continue;
1317  }
1318 
1319  /*
1320  * Hmm, we have (or may have) I/O operations acting on the page, so
1321  * we've got to wait for them to finish and then start again. This is
1322  * the same logic as in SlruSelectLRUPage. (XXX if page is dirty,
1323  * wouldn't it be OK to just discard it without writing it?
1324  * SlruMayDeleteSegment() uses a stricter qualification, so we might
1325  * not delete this page in the end; even if we don't delete it, we
1326  * won't have cause to read its data again. For now, keep the logic
1327  * the same as it was.)
1328  */
1329  if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1330  SlruInternalWritePage(ctl, slotno, NULL);
1331  else
1332  SimpleLruWaitIO(ctl, slotno);
1333  goto restart;
1334  }
1335 
1336  LWLockRelease(shared->ControlLock);
1337 
1338  /* Now we can remove the old segment(s) */
1339  (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage);
1340 }
1341 
1342 /*
1343  * Delete an individual SLRU segment.
1344  *
1345  * NB: This does not touch the SLRU buffers themselves, callers have to ensure
1346  * they either can't yet contain anything, or have already been cleaned out.
1347  */
1348 static void
1350 {
1351  char path[MAXPGPATH];
1352 
1353  /* Forget any fsync requests queued for this segment. */
1354  if (ctl->sync_handler != SYNC_HANDLER_NONE)
1355  {
1356  FileTag tag;
1357 
1358  INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
1360  }
1361 
1362  /* Unlink the file. */
1363  SlruFileName(ctl, path, segno);
1364  ereport(DEBUG2, (errmsg_internal("removing file \"%s\"", path)));
1365  unlink(path);
1366 }
1367 
1368 /*
1369  * Delete an individual SLRU segment, identified by the segment number.
1370  */
1371 void
1373 {
1374  SlruShared shared = ctl->shared;
1375  int slotno;
1376  bool did_write;
1377 
1378  /* Clean out any possibly existing references to the segment. */
1380 restart:
1381  did_write = false;
1382  for (slotno = 0; slotno < shared->num_slots; slotno++)
1383  {
1384  int pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT;
1385 
1386  if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1387  continue;
1388 
1389  /* not the segment we're looking for */
1390  if (pagesegno != segno)
1391  continue;
1392 
1393  /* If page is clean, just change state to EMPTY (expected case). */
1394  if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1395  !shared->page_dirty[slotno])
1396  {
1397  shared->page_status[slotno] = SLRU_PAGE_EMPTY;
1398  continue;
1399  }
1400 
1401  /* Same logic as SimpleLruTruncate() */
1402  if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1403  SlruInternalWritePage(ctl, slotno, NULL);
1404  else
1405  SimpleLruWaitIO(ctl, slotno);
1406 
1407  did_write = true;
1408  }
1409 
1410  /*
1411  * Be extra careful and re-check. The IO functions release the control
1412  * lock, so new pages could have been read in.
1413  */
1414  if (did_write)
1415  goto restart;
1416 
1418 
1419  LWLockRelease(shared->ControlLock);
1420 }
1421 
1422 /*
1423  * Determine whether a segment is okay to delete.
1424  *
1425  * segpage is the first page of the segment, and cutoffPage is the oldest (in
1426  * PagePrecedes order) page in the SLRU containing still-useful data. Since
1427  * every core PagePrecedes callback implements "wrap around", check the
1428  * segment's first and last pages:
1429  *
1430  * first<cutoff && last<cutoff: yes
1431  * first<cutoff && last>=cutoff: no; cutoff falls inside this segment
1432  * first>=cutoff && last<cutoff: no; wrap point falls inside this segment
1433  * first>=cutoff && last>=cutoff: no; every page of this segment is too young
1434  */
1435 static bool
1436 SlruMayDeleteSegment(SlruCtl ctl, int64 segpage, int64 cutoffPage)
1437 {
1438  int64 seg_last_page = segpage + SLRU_PAGES_PER_SEGMENT - 1;
1439 
1440  Assert(segpage % SLRU_PAGES_PER_SEGMENT == 0);
1441 
1442  return (ctl->PagePrecedes(segpage, cutoffPage) &&
1443  ctl->PagePrecedes(seg_last_page, cutoffPage));
1444 }
1445 
1446 #ifdef USE_ASSERT_CHECKING
1447 static void
1448 SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset)
1449 {
1450  TransactionId lhs,
1451  rhs;
1452  int64 newestPage,
1453  oldestPage;
1454  TransactionId newestXact,
1455  oldestXact;
1456 
1457  /*
1458  * Compare an XID pair having undefined order (see RFC 1982), a pair at
1459  * "opposite ends" of the XID space. TransactionIdPrecedes() treats each
1460  * as preceding the other. If RHS is oldestXact, LHS is the first XID we
1461  * must not assign.
1462  */
1463  lhs = per_page + offset; /* skip first page to avoid non-normal XIDs */
1464  rhs = lhs + (1U << 31);
1465  Assert(TransactionIdPrecedes(lhs, rhs));
1466  Assert(TransactionIdPrecedes(rhs, lhs));
1467  Assert(!TransactionIdPrecedes(lhs - 1, rhs));
1468  Assert(TransactionIdPrecedes(rhs, lhs - 1));
1469  Assert(TransactionIdPrecedes(lhs + 1, rhs));
1470  Assert(!TransactionIdPrecedes(rhs, lhs + 1));
1473  Assert(!ctl->PagePrecedes(lhs / per_page, lhs / per_page));
1474  Assert(!ctl->PagePrecedes(lhs / per_page, rhs / per_page));
1475  Assert(!ctl->PagePrecedes(rhs / per_page, lhs / per_page));
1476  Assert(!ctl->PagePrecedes((lhs - per_page) / per_page, rhs / per_page));
1477  Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 3 * per_page) / per_page));
1478  Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 2 * per_page) / per_page));
1479  Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 1 * per_page) / per_page)
1480  || (1U << 31) % per_page != 0); /* See CommitTsPagePrecedes() */
1481  Assert(ctl->PagePrecedes((lhs + 1 * per_page) / per_page, rhs / per_page)
1482  || (1U << 31) % per_page != 0);
1483  Assert(ctl->PagePrecedes((lhs + 2 * per_page) / per_page, rhs / per_page));
1484  Assert(ctl->PagePrecedes((lhs + 3 * per_page) / per_page, rhs / per_page));
1485  Assert(!ctl->PagePrecedes(rhs / per_page, (lhs + per_page) / per_page));
1486 
1487  /*
1488  * GetNewTransactionId() has assigned the last XID it can safely use, and
1489  * that XID is in the *LAST* page of the second segment. We must not
1490  * delete that segment.
1491  */
1492  newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1;
1493  newestXact = newestPage * per_page + offset;
1494  Assert(newestXact / per_page == newestPage);
1495  oldestXact = newestXact + 1;
1496  oldestXact -= 1U << 31;
1497  oldestPage = oldestXact / per_page;
1499  (newestPage -
1500  newestPage % SLRU_PAGES_PER_SEGMENT),
1501  oldestPage));
1502 
1503  /*
1504  * GetNewTransactionId() has assigned the last XID it can safely use, and
1505  * that XID is in the *FIRST* page of the second segment. We must not
1506  * delete that segment.
1507  */
1508  newestPage = SLRU_PAGES_PER_SEGMENT;
1509  newestXact = newestPage * per_page + offset;
1510  Assert(newestXact / per_page == newestPage);
1511  oldestXact = newestXact + 1;
1512  oldestXact -= 1U << 31;
1513  oldestPage = oldestXact / per_page;
1515  (newestPage -
1516  newestPage % SLRU_PAGES_PER_SEGMENT),
1517  oldestPage));
1518 }
1519 
1520 /*
1521  * Unit-test a PagePrecedes function.
1522  *
1523  * This assumes every uint32 >= FirstNormalTransactionId is a valid key. It
1524  * assumes each value occupies a contiguous, fixed-size region of SLRU bytes.
1525  * (MultiXactMemberCtl separates flags from XIDs. NotifyCtl has
1526  * variable-length entries, no keys, and no random access. These unit tests
1527  * do not apply to them.)
1528  */
1529 void
1530 SlruPagePrecedesUnitTests(SlruCtl ctl, int per_page)
1531 {
1532  /* Test first, middle and last entries of a page. */
1533  SlruPagePrecedesTestOffset(ctl, per_page, 0);
1534  SlruPagePrecedesTestOffset(ctl, per_page, per_page / 2);
1535  SlruPagePrecedesTestOffset(ctl, per_page, per_page - 1);
1536 }
1537 #endif
1538 
1539 /*
1540  * SlruScanDirectory callback
1541  * This callback reports true if there's any segment wholly prior to the
1542  * one containing the page passed as "data".
1543  */
1544 bool
1546  void *data)
1547 {
1548  int64 cutoffPage = *(int64 *) data;
1549 
1550  if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
1551  return true; /* found one; don't iterate any more */
1552 
1553  return false; /* keep going */
1554 }
1555 
1556 /*
1557  * SlruScanDirectory callback.
1558  * This callback deletes segments prior to the one passed in as "data".
1559  */
1560 static bool
1561 SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int64 segpage,
1562  void *data)
1563 {
1564  int64 cutoffPage = *(int64 *) data;
1565 
1566  if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
1568 
1569  return false; /* keep going */
1570 }
1571 
1572 /*
1573  * SlruScanDirectory callback.
1574  * This callback deletes all segments.
1575  */
1576 bool
1577 SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int64 segpage, void *data)
1578 {
1580 
1581  return false; /* keep going */
1582 }
1583 
1584 /*
1585  * An internal function used by SlruScanDirectory().
1586  *
1587  * Returns true if a file with a name of a given length may be a correct
1588  * SLRU segment.
1589  */
1590 static inline bool
1592 {
1593  if (ctl->long_segment_names)
1594  return (len == 15); /* see SlruFileName() */
1595  else
1596 
1597  /*
1598  * Commit 638cf09e76d allowed 5-character lengths. Later commit
1599  * 73c986adde5 allowed 6-character length.
1600  *
1601  * Note: There is an ongoing plan to migrate all SLRUs to 64-bit page
1602  * numbers, and the corresponding 15-character file names, which may
1603  * eventually deprecate the support for 4, 5, and 6-character names.
1604  */
1605  return (len == 4 || len == 5 || len == 6);
1606 }
1607 
1608 /*
1609  * Scan the SimpleLru directory and apply a callback to each file found in it.
1610  *
1611  * If the callback returns true, the scan is stopped. The last return value
1612  * from the callback is returned.
1613  *
1614  * The callback receives the following arguments: 1. the SlruCtl struct for the
1615  * slru being truncated; 2. the filename being considered; 3. the page number
1616  * for the first page of that file; 4. a pointer to the opaque data given to us
1617  * by the caller.
1618  *
1619  * Note that the ordering in which the directory is scanned is not guaranteed.
1620  *
1621  * Note that no locking is applied.
1622  */
1623 bool
1625 {
1626  bool retval = false;
1627  DIR *cldir;
1628  struct dirent *clde;
1629  int64 segno;
1630  int64 segpage;
1631 
1632  cldir = AllocateDir(ctl->Dir);
1633  while ((clde = ReadDir(cldir, ctl->Dir)) != NULL)
1634  {
1635  size_t len;
1636 
1637  len = strlen(clde->d_name);
1638 
1640  strspn(clde->d_name, "0123456789ABCDEF") == len)
1641  {
1642  segno = strtoi64(clde->d_name, NULL, 16);
1643  segpage = segno * SLRU_PAGES_PER_SEGMENT;
1644 
1645  elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s",
1646  ctl->Dir, clde->d_name);
1647  retval = callback(ctl, clde->d_name, segpage, data);
1648  if (retval)
1649  break;
1650  }
1651  }
1652  FreeDir(cldir);
1653 
1654  return retval;
1655 }
1656 
1657 /*
1658  * Individual SLRUs (clog, ...) have to provide a sync.c handler function so
1659  * that they can provide the correct "SlruCtl" (otherwise we don't know how to
1660  * build the path), but they just forward to this common implementation that
1661  * performs the fsync.
1662  */
1663 int
1664 SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path)
1665 {
1666  int fd;
1667  int save_errno;
1668  int result;
1669 
1670  SlruFileName(ctl, path, ftag->segno);
1671 
1672  fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
1673  if (fd < 0)
1674  return -1;
1675 
1676  pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC);
1677  result = pg_fsync(fd);
1679  save_errno = errno;
1680 
1682 
1683  errno = save_errno;
1684  return result;
1685 }
static void pg_atomic_write_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition: atomics.h:433
static void pg_atomic_init_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition: atomics.h:410
static uint64 pg_atomic_read_u64(volatile pg_atomic_uint64 *ptr)
Definition: atomics.h:424
unsigned int uint32
Definition: c.h:495
#define MAXALIGN(LEN)
Definition: c.h:800
#define strtoi64(str, endptr, base)
Definition: c.h:1286
#define BUFFERALIGN(LEN)
Definition: c.h:802
#define PG_BINARY
Definition: c.h:1262
#define MemSet(start, val, len)
Definition: c.h:1009
uint32 TransactionId
Definition: c.h:641
size_t Size
Definition: c.h:594
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1162
int errcode_for_file_access(void)
Definition: elog.c:883
int errdetail(const char *fmt,...)
Definition: elog.c:1208
int errmsg(const char *fmt,...)
Definition: elog.c:1075
#define LOG
Definition: elog.h:31
#define DEBUG2
Definition: elog.h:29
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2909
int FreeDir(DIR *dir)
Definition: fd.c:2961
int CloseTransientFile(int fd)
Definition: fd.c:2809
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:756
int data_sync_elevel(int elevel)
Definition: fd.c:3936
int pg_fsync(int fd)
Definition: fd.c:386
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2633
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2843
bool IsUnderPostmaster
Definition: globals.c:116
int i
Definition: isn.c:73
Assert(fmt[strlen(fmt) - 1] !='\n')
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1168
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1781
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition: lwlock.c:703
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1339
@ LW_SHARED
Definition: lwlock.h:117
@ LW_EXCLUSIVE
Definition: lwlock.h:116
#define START_CRIT_SECTION()
Definition: miscadmin.h:149
#define END_CRIT_SECTION()
Definition: miscadmin.h:151
#define MAXPGPATH
const void size_t len
const void * data
static char * filename
Definition: pg_dumpall.c:121
static XLogRecPtr endpos
Definition: pg_receivewal.c:56
void pgstat_count_slru_page_exists(int slru_idx)
Definition: pgstat_slru.c:71
void pgstat_count_slru_page_read(int slru_idx)
Definition: pgstat_slru.c:77
int pgstat_get_slru_index(const char *name)
Definition: pgstat_slru.c:132
void pgstat_count_slru_page_hit(int slru_idx)
Definition: pgstat_slru.c:65
void pgstat_count_slru_page_zeroed(int slru_idx)
Definition: pgstat_slru.c:59
void pgstat_count_slru_truncate(int slru_idx)
Definition: pgstat_slru.c:95
void pgstat_count_slru_page_written(int slru_idx)
Definition: pgstat_slru.c:83
void pgstat_count_slru_flush(int slru_idx)
Definition: pgstat_slru.c:89
#define pg_pwrite
Definition: port.h:226
#define pg_pread
Definition: port.h:225
#define snprintf
Definition: port.h:238
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static int fd(const char *x, int i)
Definition: preproc-init.c:105
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:388
static int SlruFileName(SlruCtl ctl, char *path, int64 segno)
Definition: slru.c:65
static bool SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
Definition: slru.c:719
int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int64 pageno, TransactionId xid)
Definition: slru.c:530
#define INIT_SLRUFILETAG(a, xx_handler, xx_segno)
Definition: slru.c:114
void SimpleLruWritePage(SlruCtl ctl, int slotno)
Definition: slru.c:649
void SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied)
Definition: slru.c:1199
static bool SlruMayDeleteSegment(SlruCtl ctl, int64 segpage, int64 cutoffPage)
Definition: slru.c:1436
static void SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid)
Definition: slru.c:966
#define SlruRecentlyUsed(shared, slotno)
Definition: slru.c:140
struct SlruWriteAllData SlruWriteAllData
static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno)
Definition: slru.c:359
void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, LWLock *ctllock, const char *subdir, int tranche_id, SyncRequestHandler sync_handler, bool long_segment_names)
Definition: slru.c:215
static bool SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
Definition: slru.c:791
static bool SlruCorrectSegmentFilenameLength(SlruCtl ctl, size_t len)
Definition: slru.c:1591
static SlruErrorCause slru_errcause
Definition: slru.c:160
#define MAX_WRITEALL_BUFFERS
Definition: slru.c:98
static void SimpleLruWaitIO(SlruCtl ctl, int slotno)
Definition: slru.c:376
static int slru_errno
Definition: slru.c:161
bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int64 pageno)
Definition: slru.c:661
void SlruDeleteSegment(SlruCtl ctl, int64 segno)
Definition: slru.c:1372
static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata)
Definition: slru.c:574
bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
Definition: slru.c:1624
bool SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int64 segpage, void *data)
Definition: slru.c:1577
int SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok, TransactionId xid)
Definition: slru.c:430
int SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path)
Definition: slru.c:1664
static int SlruSelectLRUPage(SlruCtl ctl, int64 pageno)
Definition: slru.c:1051
int SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
Definition: slru.c:308
void SimpleLruTruncate(SlruCtl ctl, int64 cutoffPage)
Definition: slru.c:1269
static void SlruInternalDeleteSegment(SlruCtl ctl, int64 segno)
Definition: slru.c:1349
struct SlruWriteAllData * SlruWriteAll
Definition: slru.c:107
SlruErrorCause
Definition: slru.c:151
@ SLRU_WRITE_FAILED
Definition: slru.c:155
@ SLRU_FSYNC_FAILED
Definition: slru.c:156
@ SLRU_SEEK_FAILED
Definition: slru.c:153
@ SLRU_OPEN_FAILED
Definition: slru.c:152
@ SLRU_CLOSE_FAILED
Definition: slru.c:157
@ SLRU_READ_FAILED
Definition: slru.c:154
Size SimpleLruShmemSize(int nslots, int nlsns)
Definition: slru.c:183
bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int64 segpage, void *data)
Definition: slru.c:1545
static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int64 segpage, void *data)
Definition: slru.c:1561
SlruSharedData * SlruShared
Definition: slru.h:107
#define SlruPagePrecedesUnitTests(ctl, per_page)
Definition: slru.h:168
bool(* SlruScanCallback)(SlruCtl ctl, char *filename, int64 segpage, void *data)
Definition: slru.h:173
#define SLRU_PAGES_PER_SEGMENT
Definition: slru.h:34
SlruPageStatus
Definition: slru.h:43
@ SLRU_PAGE_VALID
Definition: slru.h:46
@ SLRU_PAGE_WRITE_IN_PROGRESS
Definition: slru.h:47
@ SLRU_PAGE_READ_IN_PROGRESS
Definition: slru.h:45
@ SLRU_PAGE_EMPTY
Definition: slru.h:44
int ckpt_bufs_written
Definition: xlog.h:165
Definition: dirent.c:26
Definition: sync.h:51
uint64 segno
Definition: sync.h:55
Definition: lwlock.h:41
bool(* PagePrecedes)(int64, int64)
Definition: slru.h:133
bool long_segment_names
Definition: slru.h:141
SyncRequestHandler sync_handler
Definition: slru.h:121
SlruShared shared
Definition: slru.h:115
char Dir[64]
Definition: slru.h:147
int slru_stats_idx
Definition: slru.h:104
int64 * page_number
Definition: slru.h:70
int num_slots
Definition: slru.h:61
int * page_lru_count
Definition: slru.h:71
pg_atomic_uint64 latest_page_number
Definition: slru.h:101
XLogRecPtr * group_lsn
Definition: slru.h:82
int cur_lru_count
Definition: slru.h:94
int lsn_groups_per_page
Definition: slru.h:83
SlruPageStatus * page_status
Definition: slru.h:68
LWLock * ControlLock
Definition: slru.h:58
bool * page_dirty
Definition: slru.h:69
LWLockPadded * buffer_locks
Definition: slru.h:72
char ** page_buffer
Definition: slru.h:67
int num_files
Definition: slru.c:102
int fd[MAX_WRITEALL_BUFFERS]
Definition: slru.c:103
int64 segno[MAX_WRITEALL_BUFFERS]
Definition: slru.c:104
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, bool retryOnError)
Definition: sync.c:585
SyncRequestHandler
Definition: sync.h:36
@ SYNC_HANDLER_NONE
Definition: sync.h:42
@ SYNC_FORGET_REQUEST
Definition: sync.h:27
@ SYNC_REQUEST
Definition: sync.h:25
static void callback(struct sockaddr *addr, struct sockaddr *mask, void *unused)
Definition: test_ifaddrs.c:46
bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.c:280
bool TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2)
Definition: transam.c:329
#define InvalidTransactionId
Definition: transam.h:31
LWLock lock
Definition: lwlock.h:69
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:88
static void pgstat_report_wait_end(void)
Definition: wait_event.h:104
const char * name
CheckpointStatsData CheckpointStats
Definition: xlog.c:213
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2733
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
uint64 XLogRecPtr
Definition: xlogdefs.h:21
bool InRecovery
Definition: xlogutils.c:53