PostgreSQL Source Code git master
Loading...
Searching...
No Matches
slru.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * slru.c
4 * Simple LRU buffering for wrap-around-able permanent metadata
5 *
6 * This module is used to maintain various pieces of transaction status
7 * indexed by TransactionId (such as commit status, parent transaction ID,
8 * commit timestamp), as well as storage for multixacts, serializable
9 * isolation locks and NOTIFY traffic. Extensions can define their own
10 * SLRUs, too.
11 *
12 * Under ordinary circumstances we expect that write traffic will occur
13 * mostly to the latest page (and to the just-prior page, soon after a
14 * page transition). Read traffic will probably touch a larger span of
15 * pages, but a relatively small number of buffers should be sufficient.
16 *
17 * We use a simple least-recently-used scheme to manage a pool of shared
18 * page buffers, split in banks by the lowest bits of the page number, and
19 * the management algorithm only processes the bank to which the desired
20 * page belongs, so a linear search is sufficient; there's no need for a
21 * hashtable or anything fancy. The algorithm is straight LRU except that
22 * we will never swap out the latest page (since we know it's going to be
23 * hit again eventually).
24 *
25 * We use per-bank control LWLocks to protect the shared data structures,
26 * plus per-buffer LWLocks that synchronize I/O for each buffer. The
27 * bank's control lock must be held to examine or modify any of the bank's
28 * shared state. A process that is reading in or writing out a page
29 * buffer does not hold the control lock, only the per-buffer lock for the
30 * buffer it is working on. One exception is latest_page_number, which is
31 * read and written using atomic ops.
32 *
33 * "Holding the bank control lock" means exclusive lock in all cases
34 * except for SimpleLruReadPage_ReadOnly(); see comments for
35 * SlruRecentlyUsed() for the implications of that.
36 *
37 * When initiating I/O on a buffer, we acquire the per-buffer lock exclusively
38 * before releasing the control lock. The per-buffer lock is released after
39 * completing the I/O, re-acquiring the control lock, and updating the shared
40 * state. (Deadlock is not possible here, because we never try to initiate
41 * I/O when someone else is already doing I/O on the same buffer.)
42 * To wait for I/O to complete, release the control lock, acquire the
43 * per-buffer lock in shared mode, immediately release the per-buffer lock,
44 * reacquire the control lock, and then recheck state (since arbitrary things
45 * could have happened while we didn't have the lock).
46 *
47 * As with the regular buffer manager, it is possible for another process
48 * to re-dirty a page that is currently being written out. This is handled
49 * by re-setting the page's page_dirty flag.
50 *
51 *
52 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
53 * Portions Copyright (c) 1994, Regents of the University of California
54 *
55 * src/backend/access/transam/slru.c
56 *
57 *-------------------------------------------------------------------------
58 */
59#include "postgres.h"
60
61#include <fcntl.h>
62#include <sys/stat.h>
63#include <unistd.h>
64
65#include "access/slru.h"
66#include "access/transam.h"
67#include "access/xlog.h"
68#include "access/xlogutils.h"
69#include "miscadmin.h"
70#include "pgstat.h"
71#include "storage/fd.h"
72#include "storage/shmem.h"
74#include "utils/guc.h"
75#include "utils/memutils.h"
76#include "utils/wait_event.h"
77
78/*
79 * Converts segment number to the filename of the segment.
80 *
81 * "path" should point to a buffer at least MAXPGPATH characters long.
82 *
83 * If ctl->long_segment_names is true, segno can be in the range [0, 2^60-1].
84 * The resulting file name is made of 15 characters, e.g. dir/123456789ABCDEF.
85 *
86 * If ctl->long_segment_names is false, segno can be in the range [0, 2^24-1].
87 * The resulting file name is made of 4 to 6 characters, as of:
88 *
89 * dir/1234 for [0, 2^16-1]
90 * dir/12345 for [2^16, 2^20-1]
91 * dir/123456 for [2^20, 2^24-1]
92 */
93static inline int
94SlruFileName(SlruDesc *ctl, char *path, int64 segno)
95{
96 if (ctl->options.long_segment_names)
97 {
98 /*
99 * We could use 16 characters here but the disadvantage would be that
100 * the SLRU segments will be hard to distinguish from WAL segments.
101 *
102 * For this reason we use 15 characters. It is enough but also means
103 * that in the future we can't decrease SLRU_PAGES_PER_SEGMENT easily.
104 */
105 Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFFFFFFFFFFF));
106 return snprintf(path, MAXPGPATH, "%s/%015" PRIX64, ctl->options.Dir, segno);
107 }
108 else
109 {
110 /*
111 * Despite the fact that %04X format string is used up to 24 bit
112 * integers are allowed. See SlruCorrectSegmentFilenameLength()
113 */
114 Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFF));
115 return snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->options.Dir,
116 (unsigned int) segno);
117 }
118}
119
120/*
121 * During SimpleLruWriteAll(), we will usually not need to write more than one
122 * or two physical files, but we may need to write several pages per file. We
123 * can consolidate the I/O requests by leaving files open until control returns
124 * to SimpleLruWriteAll(). This data structure remembers which files are open.
125 */
126#define MAX_WRITEALL_BUFFERS 16
127
128typedef struct SlruWriteAllData
129{
130 int num_files; /* # files actually open */
131 int fd[MAX_WRITEALL_BUFFERS]; /* their FD's */
132 int64 segno[MAX_WRITEALL_BUFFERS]; /* their log seg#s */
134
136
137
/*
 * Bank size for the slot array.  Pages are assigned a bank according to their
 * page number, with each bank being this size.  We want a power of 2 so that
 * we can determine the bank number for a page with just bit shifting; we also
 * want to keep the bank size small so that LRU victim search is fast.  16
 * buffers per bank seems a good number.
 */
#define SLRU_BANK_BITSHIFT		4
#define SLRU_BANK_SIZE			(1 << SLRU_BANK_BITSHIFT)

/*
 * Macro to get the bank number to which the slot belongs.  This is the slot
 * number shifted right by SLRU_BANK_BITSHIFT, i.e. slotno / SLRU_BANK_SIZE
 * for non-negative slot numbers.
 */
#define SlotGetBankNumber(slotno)	((slotno) >> SLRU_BANK_BITSHIFT)
152
153
/*
 * Populate a file tag describing a segment file.  We only use the segment
 * number, since we can derive everything else we need by having separate
 * sync handler functions for clog, multixact etc.
 *
 * "a" is the FileTag object itself (not a pointer to it).  The whole tag is
 * zeroed first, so every field other than handler and segno ends up zero.
 * "a" is evaluated more than once, so it must not have side effects.
 */
#define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \
( \
	memset(&(a), 0, sizeof(FileTag)), \
	(a).handler = (xx_handler), \
	(a).segno = (xx_segno) \
)
165
166/* Saved info for SlruReportIOError */
176
178static int slru_errno;
179
180
181static void SimpleLruZeroLSNs(SlruDesc *ctl, int slotno);
182static void SimpleLruWaitIO(SlruDesc *ctl, int slotno);
184static bool SlruPhysicalReadPage(SlruDesc *ctl, int64 pageno, int slotno);
185static bool SlruPhysicalWritePage(SlruDesc *ctl, int64 pageno, int slotno,
187static void SlruReportIOError(SlruDesc *ctl, int64 pageno,
188 const void *opaque_data);
189static int SlruSelectLRUPage(SlruDesc *ctl, int64 pageno);
190
192 int64 segpage, void *data);
194static inline void SlruRecentlyUsed(SlruShared shared, int slotno);
195
196
197/*
198 * Initialization of shared memory
199 */
200
201static Size
202SimpleLruShmemSize(int nslots, int nlsns)
203{
204 int nbanks = nslots / SLRU_BANK_SIZE;
205 Size sz;
206
208 Assert(nslots % SLRU_BANK_SIZE == 0);
209
210 /* we assume nslots isn't so large as to risk overflow */
211 sz = MAXALIGN(sizeof(SlruSharedData));
212 sz += MAXALIGN(nslots * sizeof(char *)); /* page_buffer[] */
213 sz += MAXALIGN(nslots * sizeof(SlruPageStatus)); /* page_status[] */
214 sz += MAXALIGN(nslots * sizeof(bool)); /* page_dirty[] */
215 sz += MAXALIGN(nslots * sizeof(int64)); /* page_number[] */
216 sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */
217 sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */
218 sz += MAXALIGN(nbanks * sizeof(LWLockPadded)); /* bank_locks[] */
219 sz += MAXALIGN(nbanks * sizeof(int)); /* bank_cur_lru_count[] */
220
221 if (nlsns > 0)
222 sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */
223
224 return BUFFERALIGN(sz) + BLCKSZ * nslots;
225}
226
227/*
228 * Determine a number of SLRU buffers to use.
229 *
230 * We simply divide shared_buffers by the divisor given and cap
231 * that at the maximum given; but always at least SLRU_BANK_SIZE.
232 * Round down to the nearest multiple of SLRU_BANK_SIZE.
233 */
234int
236{
237 return Min(max - (max % SLRU_BANK_SIZE),
240}
241
242/*
243 * Register a simple LRU cache in shared memory.
244 */
245void
247{
249
250 Assert(options->name != NULL);
251 Assert(options->nslots > 0);
252 Assert(options->PagePrecedes != NULL);
253 Assert(options->errdetail_for_io_error != NULL);
254
256 sizeof(SlruOpts));
258
259 options_copy->base.name = options->name;
260 options_copy->base.size = SimpleLruShmemSize(options_copy->nslots, options_copy->nlsns);
261
263}
264
265/* Initialize locks and shared memory area */
266void
268{
270 SlruDesc *desc = (SlruDesc *) options->desc;
271 char namebuf[NAMEDATALEN];
272 SlruShared shared;
273 int nslots = options->nslots;
274 int nbanks = nslots / SLRU_BANK_SIZE;
275 int nlsns = options->nlsns;
276 char *ptr;
277 Size offset;
278
279 shared = (SlruShared) location;
280 desc->shared = shared;
281 desc->nbanks = nbanks;
282 memcpy(&desc->options, options, sizeof(SlruOpts));
283
284 /* assign new tranche IDs, if not given */
285 if (desc->options.buffer_tranche_id == 0)
286 {
287 snprintf(namebuf, sizeof(namebuf), "%s buffer", desc->options.name);
289 }
290 if (desc->options.bank_tranche_id == 0)
291 {
292 snprintf(namebuf, sizeof(namebuf), "%s bank", desc->options.name);
294 }
295
297
298 memset(shared, 0, sizeof(SlruSharedData));
299
300 shared->num_slots = nslots;
301 shared->lsn_groups_per_page = nlsns;
302
304
306
307 ptr = (char *) shared;
308 offset = MAXALIGN(sizeof(SlruSharedData));
309 shared->page_buffer = (char **) (ptr + offset);
310 offset += MAXALIGN(nslots * sizeof(char *));
311 shared->page_status = (SlruPageStatus *) (ptr + offset);
312 offset += MAXALIGN(nslots * sizeof(SlruPageStatus));
313 shared->page_dirty = (bool *) (ptr + offset);
314 offset += MAXALIGN(nslots * sizeof(bool));
315 shared->page_number = (int64 *) (ptr + offset);
316 offset += MAXALIGN(nslots * sizeof(int64));
317 shared->page_lru_count = (int *) (ptr + offset);
318 offset += MAXALIGN(nslots * sizeof(int));
319
320 /* Initialize LWLocks */
321 shared->buffer_locks = (LWLockPadded *) (ptr + offset);
322 offset += MAXALIGN(nslots * sizeof(LWLockPadded));
323 shared->bank_locks = (LWLockPadded *) (ptr + offset);
324 offset += MAXALIGN(nbanks * sizeof(LWLockPadded));
325 shared->bank_cur_lru_count = (int *) (ptr + offset);
326 offset += MAXALIGN(nbanks * sizeof(int));
327
328 if (nlsns > 0)
329 {
330 shared->group_lsn = (XLogRecPtr *) (ptr + offset);
331 offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
332 }
333
334 ptr += BUFFERALIGN(offset);
335 for (int slotno = 0; slotno < nslots; slotno++)
336 {
339
340 shared->page_buffer[slotno] = ptr;
342 shared->page_dirty[slotno] = false;
343 shared->page_lru_count[slotno] = 0;
344 ptr += BLCKSZ;
345 }
346
347 /* Initialize the slot banks. */
348 for (int bankno = 0; bankno < nbanks; bankno++)
349 {
351 shared->bank_cur_lru_count[bankno] = 0;
352 }
353
354 /* Should fit to estimated shmem size */
355 Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
356}
357
358void
360{
362 SlruDesc *desc = (SlruDesc *) options->desc;
363 int nslots = options->nslots;
364 int nbanks = nslots / SLRU_BANK_SIZE;
365
366 desc->shared = (SlruShared) location;
367 desc->nbanks = nbanks;
368 memcpy(&desc->options, options, sizeof(SlruOpts));
369}
370
371
372/*
373 * Helper function for GUC check_hook to check whether slru buffers are in
374 * multiples of SLRU_BANK_SIZE.
375 */
376bool
378{
379 /* Valid values are multiples of SLRU_BANK_SIZE */
380 if (*newval % SLRU_BANK_SIZE == 0)
381 return true;
382
383 GUC_check_errdetail("\"%s\" must be a multiple of %d.", name,
385 return false;
386}
387
388/*
389 * Initialize (or reinitialize) a page to zeroes.
390 *
391 * The page is not actually written, just set up in shared memory.
392 * The slot number of the new page is returned.
393 *
394 * Bank lock must be held at entry, and will be held at exit.
395 */
396int
398{
399 SlruShared shared = ctl->shared;
400 int slotno;
401
403
404 /* Find a suitable buffer slot for the page */
405 slotno = SlruSelectLRUPage(ctl, pageno);
407 (shared->page_status[slotno] == SLRU_PAGE_VALID &&
408 !shared->page_dirty[slotno]) ||
409 shared->page_number[slotno] == pageno);
410
411 /* Mark the slot as containing this page */
412 shared->page_number[slotno] = pageno;
414 shared->page_dirty[slotno] = true;
415 SlruRecentlyUsed(shared, slotno);
416
417 /* Set the buffer to zeroes */
418 MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
419
420 /* Set the LSNs for this new page to zero */
422
423 /*
424 * Assume this page is now the latest active page.
425 *
426 * Note that because both this routine and SlruSelectLRUPage run with a
427 * SLRU bank lock held, it is not possible for this to be zeroing a page
428 * that SlruSelectLRUPage is going to evict simultaneously. Therefore,
429 * there's no memory barrier here.
430 */
431 pg_atomic_write_u64(&shared->latest_page_number, pageno);
432
433 /* update the stats counter of zeroed pages */
435
436 return slotno;
437}
438
439/*
440 * Zero all the LSNs we store for this slru page.
441 *
442 * This should be called each time we create a new page, and each time we read
443 * in a page from disk into an existing buffer. (Such an old page cannot
444 * have any interesting LSNs, since we'd have flushed them before writing
445 * the page in the first place.)
446 *
447 * This assumes that InvalidXLogRecPtr is bitwise-all-0.
448 */
449static void
451{
452 SlruShared shared = ctl->shared;
453
454 if (shared->lsn_groups_per_page > 0)
455 MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
456 shared->lsn_groups_per_page * sizeof(XLogRecPtr));
457}
458
459/*
460 * This is a convenience wrapper for the common case of zeroing a page and
461 * immediately flushing it to disk.
462 *
463 * SLRU bank lock is acquired and released here.
464 */
465void
467{
468 int slotno;
469 LWLock *lock;
470
471 lock = SimpleLruGetBankLock(ctl, pageno);
473
474 /* Create and zero the page */
475 slotno = SimpleLruZeroPage(ctl, pageno);
476
477 /* Make sure it's written out */
479 Assert(!ctl->shared->page_dirty[slotno]);
480
481 LWLockRelease(lock);
482}
483
484/*
485 * Wait for any active I/O on a page slot to finish. (This does not
486 * guarantee that new I/O hasn't been started before we return, though.
487 * In fact the slot might not even contain the same page anymore.)
488 *
489 * Bank lock must be held at entry, and will be held at exit.
490 */
491static void
493{
494 SlruShared shared = ctl->shared;
496
498
499 /* See notes at top of file */
504
505 /*
506 * If the slot is still in an io-in-progress state, then either someone
507 * already started a new I/O on the slot, or a previous I/O failed and
508 * neglected to reset the page state. That shouldn't happen, really, but
509 * it seems worth a few extra cycles to check and recover from it. We can
510 * cheaply test for failure by seeing if the buffer lock is still held (we
511 * assume that transaction abort would release the lock).
512 */
515 {
517 {
518 /* indeed, the I/O must have failed */
521 else /* write_in_progress */
522 {
524 shared->page_dirty[slotno] = true;
525 }
527 }
528 }
529}
530
531/*
532 * Find a page in a shared buffer, reading it in if necessary.
533 * The page number must correspond to an already-initialized page.
534 *
535 * If write_ok is true then it is OK to return a page that is in
536 * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure
537 * that modification of the page is safe. If write_ok is false then we
538 * will not return the page until it is not undergoing active I/O.
539 *
540 * On error, the passed-in 'opaque_data' is passed to the
541 * 'errdetail_for_io_error' callback, to provide details on the operation that
542 * failed. It is only used for error reporting.
543 *
544 * Return value is the shared-buffer slot number now holding the page.
545 * The buffer's LRU access info is updated.
546 *
547 * The correct bank lock must be held at entry, and will be held at exit.
548 */
549int
551 const void *opaque_data)
552{
553 SlruShared shared = ctl->shared;
555
557
558 /* Outer loop handles restart if we must wait for someone else's I/O */
559 for (;;)
560 {
561 int slotno;
562 bool ok;
563
564 /* See if page already is in memory; if not, pick victim slot */
565 slotno = SlruSelectLRUPage(ctl, pageno);
566
567 /* Did we find the page in memory? */
568 if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
569 shared->page_number[slotno] == pageno)
570 {
571 /*
572 * If page is still being read in, we must wait for I/O. Likewise
573 * if the page is being written and the caller said that's not OK.
574 */
577 !write_ok))
578 {
580 /* Now we must recheck state from the top */
581 continue;
582 }
583 /* Otherwise, it's ready to use */
584 SlruRecentlyUsed(shared, slotno);
585
586 /* update the stats counter of pages found in the SLRU */
588
589 return slotno;
590 }
591
592 /* We found no match; assert we selected a freeable slot */
594 (shared->page_status[slotno] == SLRU_PAGE_VALID &&
595 !shared->page_dirty[slotno]));
596
597 /* Mark the slot read-busy */
598 shared->page_number[slotno] = pageno;
600 shared->page_dirty[slotno] = false;
601
602 /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
604
605 /* Release bank lock while doing I/O */
607
608 /* Do the read */
609 ok = SlruPhysicalReadPage(ctl, pageno, slotno);
610
611 /* Set the LSNs for this newly read-in page to zero */
613
614 /* Re-acquire bank control lock and update page state */
616
617 Assert(shared->page_number[slotno] == pageno &&
619 !shared->page_dirty[slotno]);
620
622
624
625 /* Now it's okay to ereport if we failed */
626 if (!ok)
628
629 SlruRecentlyUsed(shared, slotno);
630
631 /* update the stats counter of pages not found in SLRU */
633
634 return slotno;
635 }
636}
637
638/*
639 * Find a page in a shared buffer, reading it in if necessary.
640 * The page number must correspond to an already-initialized page.
641 * The caller must intend only read-only access to the page.
642 *
643 * On error, the passed-in 'opaque_data' is passed to the
644 * 'errdetail_for_io_error' callback, to provide details on the operation that
645 * failed. It is only used for error reporting.
646 *
647 * Return value is the shared-buffer slot number now holding the page.
648 * The buffer's LRU access info is updated.
649 *
650 * Bank control lock must NOT be held at entry, but will be held at exit.
651 * It is unspecified whether the lock will be shared or exclusive.
652 */
653int
655{
656 SlruShared shared = ctl->shared;
658 int bankno = pageno % ctl->nbanks;
661
662 /* Try to find the page while holding only shared lock */
664
665 /* See if page is already in a buffer */
666 for (int slotno = bankstart; slotno < bankend; slotno++)
667 {
668 if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
669 shared->page_number[slotno] == pageno &&
671 {
672 /* See comments for SlruRecentlyUsed() */
673 SlruRecentlyUsed(shared, slotno);
674
675 /* update the stats counter of pages found in the SLRU */
677
678 return slotno;
679 }
680 }
681
682 /* No luck, so switch to normal exclusive lock and do regular read */
685
686 return SimpleLruReadPage(ctl, pageno, true, opaque_data);
687}
688
689/*
690 * Write a page from a shared buffer, if necessary.
691 * Does nothing if the specified slot is not dirty.
692 *
693 * NOTE: only one write attempt is made here. Hence, it is possible that
694 * the page is still dirty at exit (if someone else re-dirtied it during
695 * the write). However, we *do* attempt a fresh write even if the page
696 * is already being written; this is for checkpoints.
697 *
698 * Bank lock must be held at entry, and will be held at exit.
699 */
700static void
702{
703 SlruShared shared = ctl->shared;
704 int64 pageno = shared->page_number[slotno];
706 bool ok;
707
710
711 /* If a write is in progress, wait for it to finish */
713 shared->page_number[slotno] == pageno)
714 {
716 }
717
718 /*
719 * Do nothing if page is not dirty, or if buffer no longer contains the
720 * same page we were called for.
721 */
722 if (!shared->page_dirty[slotno] ||
723 shared->page_status[slotno] != SLRU_PAGE_VALID ||
724 shared->page_number[slotno] != pageno)
725 return;
726
727 /*
728 * Mark the slot write-busy, and clear the dirtybit. After this point, a
729 * transaction status update on this page will mark it dirty again.
730 */
732 shared->page_dirty[slotno] = false;
733
734 /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
736
737 /* Release bank lock while doing I/O */
739
740 /* Do the write */
742
743 /* If we failed, and we're in a flush, better close the files */
744 if (!ok && fdata)
745 {
746 for (int i = 0; i < fdata->num_files; i++)
748 }
749
750 /* Re-acquire bank lock and update page state */
752
753 Assert(shared->page_number[slotno] == pageno &&
755
756 /* If we failed to write, mark the page dirty again */
757 if (!ok)
758 shared->page_dirty[slotno] = true;
759
761
763
764 /* Now it's okay to ereport if we failed */
765 if (!ok)
766 SlruReportIOError(ctl, pageno, NULL);
767
768 /* If part of a checkpoint, count this as a SLRU buffer written. */
769 if (fdata)
770 {
773 }
774}
775
776/*
777 * Wrapper of SlruInternalWritePage, for external callers.
778 * fdata is always passed a NULL here.
779 */
780void
782{
783 Assert(ctl->shared->page_status[slotno] != SLRU_PAGE_EMPTY);
784
786}
787
788/*
789 * Return whether the given page exists on disk.
790 *
791 * A false return means that either the file does not exist, or that it's not
792 * large enough to contain the given page.
793 */
794bool
796{
798 int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
799 int offset = rpageno * BLCKSZ;
800 char path[MAXPGPATH];
801 int fd;
802 bool result;
804
805 /* update the stats counter of checked pages */
806 pgstat_count_slru_blocks_exists(ctl->shared->slru_stats_idx);
807
808 SlruFileName(ctl, path, segno);
809
811 if (fd < 0)
812 {
813 /* expected: file doesn't exist */
814 if (errno == ENOENT)
815 return false;
816
817 /* report error normally */
820 SlruReportIOError(ctl, pageno, NULL);
821 }
822
823 if ((endpos = lseek(fd, 0, SEEK_END)) < 0)
824 {
827 SlruReportIOError(ctl, pageno, NULL);
828 }
829
830 result = endpos >= (off_t) (offset + BLCKSZ);
831
832 if (CloseTransientFile(fd) != 0)
833 {
836 return false;
837 }
838
839 return result;
840}
841
842/*
843 * Physical read of a (previously existing) page into a buffer slot
844 *
845 * On failure, we cannot just ereport(ERROR) since caller has put state in
846 * shared memory that must be undone. So, we return false and save enough
847 * info in static variables to let SlruReportIOError make the report.
848 *
849 * For now, assume it's not worth keeping a file pointer open across
850 * read/write operations. We could cache one virtual file pointer ...
851 */
852static bool
854{
855 SlruShared shared = ctl->shared;
857 int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
858 off_t offset = rpageno * BLCKSZ;
859 char path[MAXPGPATH];
860 int fd;
861
862 SlruFileName(ctl, path, segno);
863
864 /*
865 * In a crash-and-restart situation, it's possible for us to receive
866 * commands to set the commit status of transactions whose bits are in
867 * already-truncated segments of the commit log (see notes in
868 * SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case
869 * where the file doesn't exist, and return zeroes instead.
870 */
872 if (fd < 0)
873 {
874 if (errno != ENOENT || !InRecovery)
875 {
878 return false;
879 }
880
881 ereport(LOG,
882 (errmsg("file \"%s\" doesn't exist, reading as zeroes",
883 path)));
884 MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
885 return true;
886 }
887
888 errno = 0;
890 if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
891 {
896 return false;
897 }
899
900 if (CloseTransientFile(fd) != 0)
901 {
904 return false;
905 }
906
907 return true;
908}
909
910/*
911 * Physical write of a page from a buffer slot
912 *
913 * On failure, we cannot just ereport(ERROR) since caller has put state in
914 * shared memory that must be undone. So, we return false and save enough
915 * info in static variables to let SlruReportIOError make the report.
916 *
917 * For now, assume it's not worth keeping a file pointer open across
918 * independent read/write operations. We do batch operations during
919 * SimpleLruWriteAll, though.
920 *
921 * fdata is NULL for a standalone write, pointer to open-file info during
922 * SimpleLruWriteAll.
923 */
924static bool
926{
927 SlruShared shared = ctl->shared;
929 int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
930 off_t offset = rpageno * BLCKSZ;
931 char path[MAXPGPATH];
932 int fd = -1;
933
934 /* update the stats counter of written pages */
936
937 /*
938 * Honor the write-WAL-before-data rule, if appropriate, so that we do not
939 * write out data before associated WAL records. This is the same action
940 * performed during FlushBuffer() in the main buffer manager.
941 */
942 if (shared->group_lsn != NULL)
943 {
944 /*
945 * We must determine the largest async-commit LSN for the page. This
946 * is a bit tedious, but since this entire function is a slow path
947 * anyway, it seems better to do this here than to maintain a per-page
948 * LSN variable (which'd need an extra comparison in the
949 * transaction-commit path).
950 */
952 int lsnindex;
953
955 max_lsn = shared->group_lsn[lsnindex++];
956 for (int lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
957 {
959
960 if (max_lsn < this_lsn)
962 }
963
965 {
966 /*
967 * As noted above, elog(ERROR) is not acceptable here, so if
968 * XLogFlush were to fail, we must PANIC. This isn't much of a
969 * restriction because XLogFlush is just about all critical
970 * section anyway, but let's make sure.
971 */
975 }
976 }
977
978 /*
979 * During a SimpleLruWriteAll, we may already have the desired file open.
980 */
981 if (fdata)
982 {
983 for (int i = 0; i < fdata->num_files; i++)
984 {
985 if (fdata->segno[i] == segno)
986 {
987 fd = fdata->fd[i];
988 break;
989 }
990 }
991 }
992
993 if (fd < 0)
994 {
995 /*
996 * If the file doesn't already exist, we should create it. It is
997 * possible for this to need to happen when writing a page that's not
998 * first in its segment; we assume the OS can cope with that. (Note:
999 * it might seem that it'd be okay to create files only when
1000 * SimpleLruZeroPage is called for the first page of a segment.
1001 * However, if after a crash and restart the REDO logic elects to
1002 * replay the log from a checkpoint before the latest one, then it's
1003 * possible that we will get commands to set transaction status of
1004 * transactions that have already been truncated from the commit log.
1005 * Easiest way to deal with that is to accept references to
1006 * nonexistent files here and in SlruPhysicalReadPage.)
1007 *
1008 * Note: it is possible for more than one backend to be executing this
1009 * code simultaneously for different pages of the same file. Hence,
1010 * don't use O_EXCL or O_TRUNC or anything like that.
1011 */
1012 SlruFileName(ctl, path, segno);
1014 if (fd < 0)
1015 {
1017 slru_errno = errno;
1018 return false;
1019 }
1020
1021 if (fdata)
1022 {
1023 if (fdata->num_files < MAX_WRITEALL_BUFFERS)
1024 {
1025 fdata->fd[fdata->num_files] = fd;
1026 fdata->segno[fdata->num_files] = segno;
1027 fdata->num_files++;
1028 }
1029 else
1030 {
1031 /*
1032 * In the unlikely event that we exceed MAX_WRITEALL_BUFFERS,
1033 * fall back to treating it as a standalone write.
1034 */
1035 fdata = NULL;
1036 }
1037 }
1038 }
1039
1040 errno = 0;
1042 if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
1043 {
1045 /* if write didn't set errno, assume problem is no disk space */
1046 if (errno == 0)
1047 errno = ENOSPC;
1049 slru_errno = errno;
1050 if (!fdata)
1052 return false;
1053 }
1055
1056 /* Queue up a sync request for the checkpointer. */
1057 if (ctl->options.sync_handler != SYNC_HANDLER_NONE)
1058 {
1059 FileTag tag;
1060
1061 INIT_SLRUFILETAG(tag, ctl->options.sync_handler, segno);
1062 if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
1063 {
1064 /* No space to enqueue sync request. Do it synchronously. */
1066 if (pg_fsync(fd) != 0)
1067 {
1070 slru_errno = errno;
1072 return false;
1073 }
1075 }
1076 }
1077
1078 /* Close file, unless part of flush request. */
1079 if (!fdata)
1080 {
1081 if (CloseTransientFile(fd) != 0)
1082 {
1084 slru_errno = errno;
1085 return false;
1086 }
1087 }
1088
1089 return true;
1090}
1091
1092/*
1093 * Issue the error message after failure of SlruPhysicalReadPage or
1094 * SlruPhysicalWritePage. Call this after cleaning up shared-memory state.
1095 */
1096static void
1098{
1100 int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
1101 int offset = rpageno * BLCKSZ;
1102 char path[MAXPGPATH];
1103
1104 SlruFileName(ctl, path, segno);
1105 errno = slru_errno;
1106 switch (slru_errcause)
1107 {
1108 case SLRU_OPEN_FAILED:
1109 ereport(ERROR,
1111 errmsg("could not open file \"%s\": %m", path),
1112 opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
1113 break;
1114 case SLRU_SEEK_FAILED:
1115 ereport(ERROR,
1117 errmsg("could not seek in file \"%s\" to offset %d: %m",
1118 path, offset),
1119 opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
1120 break;
1121 case SLRU_READ_FAILED:
1122 if (errno)
1123 ereport(ERROR,
1125 errmsg("could not read from file \"%s\" at offset %d: %m",
1126 path, offset),
1127 opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
1128 else
1129 ereport(ERROR,
1130 (errmsg("could not read from file \"%s\" at offset %d: read too few bytes",
1131 path, offset),
1132 opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
1133 break;
1134 case SLRU_WRITE_FAILED:
1135 if (errno)
1136 ereport(ERROR,
1138 errmsg("Could not write to file \"%s\" at offset %d: %m",
1139 path, offset),
1140 opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
1141 else
1142 ereport(ERROR,
1143 (errmsg("Could not write to file \"%s\" at offset %d: wrote too few bytes.",
1144 path, offset),
1145 opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
1146 break;
1147 case SLRU_FSYNC_FAILED:
1150 errmsg("could not fsync file \"%s\": %m",
1151 path),
1152 opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
1153 break;
1154 case SLRU_CLOSE_FAILED:
1155 ereport(ERROR,
1157 errmsg("could not close file \"%s\": %m",
1158 path),
1159 opaque_data ? ctl->options.errdetail_for_io_error(opaque_data) : 0));
1160 break;
1161 default:
1162 /* can't get here, we trust */
1163 elog(ERROR, "unrecognized SimpleLru error cause: %d",
1164 (int) slru_errcause);
1165 break;
1166 }
1167}
1168
1169/*
1170 * Mark a buffer slot "most recently used".
1171 */
1172static inline void
1174{
1177
1179
1180 /*
1181 * The reason for the if-test is that there are often many consecutive
1182 * accesses to the same page (particularly the latest page). By
1183 * suppressing useless increments of bank_cur_lru_count, we reduce the
1184 * probability that old pages' counts will "wrap around" and make them
1185 * appear recently used.
1186 *
1187 * We allow this code to be executed concurrently by multiple processes
1188 * within SimpleLruReadPage_ReadOnly(). As long as int reads and writes
1189 * are atomic, this should not cause any completely-bogus values to enter
1190 * the computation. However, it is possible for either bank_cur_lru_count
1191 * or individual page_lru_count entries to be "reset" to lower values than
1192 * they should have, in case a process is delayed while it executes this
1193 * function. With care in SlruSelectLRUPage(), this does little harm, and
1194 * in any case the absolute worst possible consequence is a nonoptimal
1195 * choice of page to evict. The gain from allowing concurrent reads of
1196 * SLRU pages seems worth it.
1197 */
1198 if (new_lru_count != shared->page_lru_count[slotno])
1199 {
1202 }
1203}
1204
1205/*
1206 * Select the slot to re-use when we need a free slot for the given page.
1207 *
1208 * The target page number is passed not only because we need to know the
1209 * correct bank to use, but also because we need to consider the possibility
1210 * that some other process reads in the target page while we are doing I/O to
1211 * free a slot. Hence, check or recheck to see if any slot already holds the
1212 * target page, and return that slot if so. Thus, the returned slot is
1213 * *either* a slot already holding the pageno (could be any state except
1214 * EMPTY), *or* a freeable slot (state EMPTY or CLEAN).
1215 *
1216 * The correct bank lock must be held at entry, and will be held at exit.
1217 */
1218static int
1220{
1221 SlruShared shared = ctl->shared;
1222
1223 /* Outer loop handles restart after I/O */
1224 for (;;)
1225 {
1226 int cur_count;
1227 int bestvalidslot = 0; /* keep compiler quiet */
1228 int best_valid_delta = -1;
1229 int64 best_valid_page_number = 0; /* keep compiler quiet */
1230 int bestinvalidslot = 0; /* keep compiler quiet */
1231 int best_invalid_delta = -1;
1232 int64 best_invalid_page_number = 0; /* keep compiler quiet */
1233 int bankno = pageno % ctl->nbanks;
1236
1238
1239 /* See if page already has a buffer assigned */
1240 for (int slotno = bankstart; slotno < bankend; slotno++)
1241 {
1242 if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
1243 shared->page_number[slotno] == pageno)
1244 return slotno;
1245 }
1246
1247 /*
1248 * If we find any EMPTY slot, just select that one. Else choose a
1249 * victim page to replace. We normally take the least recently used
1250 * valid page, but we will never take the slot containing
1251 * latest_page_number, even if it appears least recently used. We
1252 * will select a slot that is already I/O busy only if there is no
1253 * other choice: a read-busy slot will not be least recently used once
1254 * the read finishes, and waiting for an I/O on a write-busy slot is
1255 * inferior to just picking some other slot. Testing shows the slot
1256 * we pick instead will often be clean, allowing us to begin a read at
1257 * once.
1258 *
1259 * Normally the page_lru_count values will all be different and so
1260 * there will be a well-defined LRU page. But since we allow
1261 * concurrent execution of SlruRecentlyUsed() within
1262 * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages
1263 * acquire the same lru_count values. In that case we break ties by
1264 * choosing the furthest-back page.
1265 *
1266 * Notice that this next line forcibly advances cur_lru_count to a
1267 * value that is certainly beyond any value that will be in the
1268 * page_lru_count array after the loop finishes. This ensures that
1269 * the next execution of SlruRecentlyUsed will mark the page newly
1270 * used, even if it's for a page that has the current counter value.
1271 * That gets us back on the path to having good data when there are
1272 * multiple pages with the same lru_count.
1273 */
1274 cur_count = (shared->bank_cur_lru_count[bankno])++;
1275 for (int slotno = bankstart; slotno < bankend; slotno++)
1276 {
1277 int this_delta;
1279
1280 if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1281 return slotno;
1282
1284 if (this_delta < 0)
1285 {
1286 /*
1287 * Clean up in case shared updates have caused cur_count
1288 * increments to get "lost". We back off the page counts,
1289 * rather than trying to increase cur_count, to avoid any
1290 * question of infinite loops or failure in the presence of
1291 * wrapped-around counts.
1292 */
1293 shared->page_lru_count[slotno] = cur_count;
1294 this_delta = 0;
1295 }
1296
1297 /*
1298 * If this page is the one most recently zeroed, don't consider it
1299 * an eviction candidate. See comments in SimpleLruZeroPage for an
1300 * explanation about the lack of a memory barrier here.
1301 */
1303 if (this_page_number ==
1305 continue;
1306
1307 if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1308 {
1311 ctl->options.PagePrecedes(this_page_number,
1313 {
1317 }
1318 }
1319 else
1320 {
1323 ctl->options.PagePrecedes(this_page_number,
1325 {
1329 }
1330 }
1331 }
1332
1333 /*
1334 * If all pages (except possibly the latest one) are I/O busy, we'll
1335 * have to wait for an I/O to complete and then retry. In that
1336 * unhappy case, we choose to wait for the I/O on the least recently
1337 * used slot, on the assumption that it was likely initiated first of
1338 * all the I/Os in progress and may therefore finish first.
1339 */
1340 if (best_valid_delta < 0)
1341 {
1343 continue;
1344 }
1345
1346 /*
1347 * If the selected page is clean, we're set.
1348 */
1349 if (!shared->page_dirty[bestvalidslot])
1350 return bestvalidslot;
1351
1352 /*
1353 * Write the page.
1354 */
1356
1357 /*
1358 * Now loop back and try again. This is the easiest way of dealing
1359 * with corner cases such as the victim page being re-dirtied while we
1360 * wrote it.
1361 */
1362 }
1363}
1364
1365/*
1366 * Write dirty pages to disk during checkpoint or database shutdown. Flushing
1367 * is deferred until the next call to ProcessSyncRequests(), though we do fsync
1368 * the containing directory here to make sure that newly created directory
1369 * entries are on disk.
1370 */
1371void
1373{
1374 SlruShared shared = ctl->shared;
1376 int64 pageno = 0;
1377 int prevbank = SlotGetBankNumber(0);
1378 bool ok;
1379
1380 /* update the stats counter of flushes */
1382
1383 /*
1384 * Find and write dirty pages
1385 */
1386 fdata.num_files = 0;
1387
1389
1390 for (int slotno = 0; slotno < shared->num_slots; slotno++)
1391 {
1393
1394 /*
1395 * If the current bank lock is not the same as the previous bank lock then
1396 * release the previous lock and acquire the new lock.
1397 */
1398 if (curbank != prevbank)
1399 {
1402 prevbank = curbank;
1403 }
1404
1405 /* Do nothing if slot is unused */
1406 if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1407 continue;
1408
1410
1411 /*
1412 * In some places (e.g. checkpoints), we cannot assert that the slot
1413 * is clean now, since another process might have re-dirtied it
1414 * already. That's okay.
1415 */
1417 shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
1418 (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1419 !shared->page_dirty[slotno]));
1420 }
1421
1423
1424 /*
1425 * Now close any files that were open
1426 */
1427 ok = true;
1428 for (int i = 0; i < fdata.num_files; i++)
1429 {
1430 if (CloseTransientFile(fdata.fd[i]) != 0)
1431 {
1433 slru_errno = errno;
1434 pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
1435 ok = false;
1436 }
1437 }
1438 if (!ok)
1439 SlruReportIOError(ctl, pageno, NULL);
1440
1441 /* Ensure that directory entries for new files are on disk. */
1442 if (ctl->options.sync_handler != SYNC_HANDLER_NONE)
1443 fsync_fname(ctl->options.Dir, true);
1444}
1445
1446/*
1447 * Remove all segments before the one holding the passed page number
1448 *
1449 * All SLRUs prevent concurrent calls to this function, either with an LWLock
1450 * or by calling it only as part of a checkpoint. Mutual exclusion must begin
1451 * before computing cutoffPage. Mutual exclusion must end after any limit
1452 * update that would permit other backends to write fresh data into the
1453 * segment immediately preceding the one containing cutoffPage. Otherwise,
1454 * when the SLRU is quite full, SimpleLruTruncate() might delete that segment
1455 * after it has accrued freshly-written data.
1456 */
1457void
1459{
1460 SlruShared shared = ctl->shared;
1461 int prevbank;
1462
1463 /* update the stats counter of truncates */
1465
1466 /*
1467 * Scan shared memory and remove any pages preceding the cutoff page, to
1468 * ensure we won't rewrite them later. (Since this is normally called in
1469 * or just after a checkpoint, any dirty pages should have been flushed
1470 * already ... we're just being extra careful here.)
1471 */
1472restart:
1473
1474 /*
1475 * An important safety check: the current endpoint page must not be
1476 * eligible for removal. This check is just a backstop against wraparound
1477 * bugs elsewhere in SLRU handling, so we don't care if we read a slightly
1478 * outdated value; therefore we don't add a memory barrier.
1479 */
1480 if (ctl->options.PagePrecedes(pg_atomic_read_u64(&shared->latest_page_number),
1481 cutoffPage))
1482 {
1483 ereport(LOG,
1484 (errmsg("could not truncate directory \"%s\": apparent wraparound",
1485 ctl->options.Dir)));
1486 return;
1487 }
1488
1491 for (int slotno = 0; slotno < shared->num_slots; slotno++)
1492 {
1494
1495 /*
1496 * If the current bank lock is not the same as the previous bank lock then
1497 * release the previous lock and acquire the new lock.
1498 */
1499 if (curbank != prevbank)
1500 {
1503 prevbank = curbank;
1504 }
1505
1506 if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1507 continue;
1508 if (!ctl->options.PagePrecedes(shared->page_number[slotno], cutoffPage))
1509 continue;
1510
1511 /*
1512 * If page is clean, just change state to EMPTY (expected case).
1513 */
1514 if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1515 !shared->page_dirty[slotno])
1516 {
1518 continue;
1519 }
1520
1521 /*
1522 * Hmm, we have (or may have) I/O operations acting on the page, so
1523 * we've got to wait for them to finish and then start again. This is
1524 * the same logic as in SlruSelectLRUPage. (XXX if page is dirty,
1525 * wouldn't it be OK to just discard it without writing it?
1526 * SlruMayDeleteSegment() uses a stricter qualification, so we might
1527 * not delete this page in the end; even if we don't delete it, we
1528 * won't have cause to read its data again. For now, keep the logic
1529 * the same as it was.)
1530 */
1531 if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1533 else
1535
1537 goto restart;
1538 }
1539
1541
1542 /* Now we can remove the old segment(s) */
1544}
1545
1546/*
1547 * Delete an individual SLRU segment.
1548 *
1549 * NB: This does not touch the SLRU buffers themselves, callers have to ensure
1550 * they either can't yet contain anything, or have already been cleaned out.
1551 */
1552static void
1554{
1555 char path[MAXPGPATH];
1556
1557 /* Forget any fsync requests queued for this segment. */
1558 if (ctl->options.sync_handler != SYNC_HANDLER_NONE)
1559 {
1560 FileTag tag;
1561
1562 INIT_SLRUFILETAG(tag, ctl->options.sync_handler, segno);
1564 }
1565
1566 /* Unlink the file. */
1567 SlruFileName(ctl, path, segno);
1568 ereport(DEBUG2, (errmsg_internal("removing file \"%s\"", path)));
1569 unlink(path);
1570}
1571
1572/*
1573 * Delete an individual SLRU segment, identified by the segment number.
1574 */
1575void
1577{
1578 SlruShared shared = ctl->shared;
1579 int prevbank = SlotGetBankNumber(0);
1580 bool did_write;
1581
1582 /* Clean out any possibly existing references to the segment. */
1584restart:
1585 did_write = false;
1586 for (int slotno = 0; slotno < shared->num_slots; slotno++)
1587 {
1590
1591 /*
1592 * If the current bank lock is not the same as the previous bank lock then
1593 * release the previous lock and acquire the new lock.
1594 */
1595 if (curbank != prevbank)
1596 {
1599 prevbank = curbank;
1600 }
1601
1602 if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1603 continue;
1604
1606 /* not the segment we're looking for */
1607 if (pagesegno != segno)
1608 continue;
1609
1610 /* If page is clean, just change state to EMPTY (expected case). */
1611 if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1612 !shared->page_dirty[slotno])
1613 {
1615 continue;
1616 }
1617
1618 /* Same logic as SimpleLruTruncate() */
1619 if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1621 else
1623
1624 did_write = true;
1625 }
1626
1627 /*
1628 * Be extra careful and re-check. The IO functions release the control
1629 * lock, so new pages could have been read in.
1630 */
1631 if (did_write)
1632 goto restart;
1633
1635
1637}
1638
1639/*
1640 * Determine whether a segment is okay to delete.
1641 *
1642 * segpage is the first page of the segment, and cutoffPage is the oldest (in
1643 * PagePrecedes order) page in the SLRU containing still-useful data. Since
1644 * every core PagePrecedes callback implements "wrap around", check the
1645 * segment's first and last pages:
1646 *
1647 * first<cutoff && last<cutoff: yes
1648 * first<cutoff && last>=cutoff: no; cutoff falls inside this segment
1649 * first>=cutoff && last<cutoff: no; wrap point falls inside this segment
1650 * first>=cutoff && last>=cutoff: no; every page of this segment is too young
1651 */
1652static bool
1654{
1656
1658
1659 return (ctl->options.PagePrecedes(segpage, cutoffPage) &&
1660 ctl->options.PagePrecedes(seg_last_page, cutoffPage));
1661}
1662
1663#ifdef USE_ASSERT_CHECKING
1664static void
1666{
1668 rhs;
1670 oldestPage;
1672 oldestXact;
1673
1674 /* This must be called after the Slru has been initialized */
1675 Assert(ctl->options.PagePrecedes);
1676
1677 /*
1678 * Compare an XID pair having undefined order (see RFC 1982), a pair at
1679 * "opposite ends" of the XID space. TransactionIdPrecedes() treats each
1680 * as preceding the other. If RHS is oldestXact, LHS is the first XID we
1681 * must not assign.
1682 */
1683 lhs = per_page + offset; /* skip first page to avoid non-normal XIDs */
1684 rhs = lhs + (1U << 31);
1693 Assert(!ctl->options.PagePrecedes(lhs / per_page, lhs / per_page));
1694 Assert(!ctl->options.PagePrecedes(lhs / per_page, rhs / per_page));
1695 Assert(!ctl->options.PagePrecedes(rhs / per_page, lhs / per_page));
1696 Assert(!ctl->options.PagePrecedes((lhs - per_page) / per_page, rhs / per_page));
1697 Assert(ctl->options.PagePrecedes(rhs / per_page, (lhs - 3 * per_page) / per_page));
1698 Assert(ctl->options.PagePrecedes(rhs / per_page, (lhs - 2 * per_page) / per_page));
1699 Assert(ctl->options.PagePrecedes(rhs / per_page, (lhs - 1 * per_page) / per_page)
1700 || (1U << 31) % per_page != 0); /* See CommitTsPagePrecedes() */
1701 Assert(ctl->options.PagePrecedes((lhs + 1 * per_page) / per_page, rhs / per_page)
1702 || (1U << 31) % per_page != 0);
1703 Assert(ctl->options.PagePrecedes((lhs + 2 * per_page) / per_page, rhs / per_page));
1704 Assert(ctl->options.PagePrecedes((lhs + 3 * per_page) / per_page, rhs / per_page));
1705 Assert(!ctl->options.PagePrecedes(rhs / per_page, (lhs + per_page) / per_page));
1706
1707 /*
1708 * GetNewTransactionId() has assigned the last XID it can safely use, and
1709 * that XID is in the *LAST* page of the second segment. We must not
1710 * delete that segment.
1711 */
1713 newestXact = newestPage * per_page + offset;
1715 oldestXact = newestXact + 1;
1716 oldestXact -= 1U << 31;
1717 oldestPage = oldestXact / per_page;
1719 (newestPage -
1721 oldestPage));
1722
1723 /*
1724 * GetNewTransactionId() has assigned the last XID it can safely use, and
1725 * that XID is in the *FIRST* page of the second segment. We must not
1726 * delete that segment.
1727 */
1729 newestXact = newestPage * per_page + offset;
1731 oldestXact = newestXact + 1;
1732 oldestXact -= 1U << 31;
1733 oldestPage = oldestXact / per_page;
1735 (newestPage -
1737 oldestPage));
1738}
1739
1740/*
1741 * Unit-test a PagePrecedes function.
1742 *
1743 * This assumes every uint32 >= FirstNormalTransactionId is a valid key. It
1744 * assumes each value occupies a contiguous, fixed-size region of SLRU bytes.
1745 * (MultiXactMemberCtl separates flags from XIDs. NotifyCtl has
1746 * variable-length entries, no keys, and no random access. These unit tests
1747 * do not apply to them.)
1748 */
1749void
1751{
1752 /* Test first, middle and last entries of a page. */
1756}
1757#endif
1758
1759/*
1760 * SlruScanDirectory callback
1761 * This callback reports true if there's any segment wholly prior to the
1762 * one containing the page passed as "data".
1763 */
1764bool
1766 void *data)
1767{
1768 int64 cutoffPage = *(int64 *) data;
1769
1771 return true; /* found one; don't iterate any more */
1772
1773 return false; /* keep going */
1774}
1775
1776/*
1777 * SlruScanDirectory callback.
1778 * This callback deletes segments prior to the one passed in as "data".
1779 */
1780static bool
1782 void *data)
1783{
1784 int64 cutoffPage = *(int64 *) data;
1785
1788
1789 return false; /* keep going */
1790}
1791
1792/*
1793 * SlruScanDirectory callback.
1794 * This callback deletes all segments.
1795 */
1796bool
1798{
1800
1801 return false; /* keep going */
1802}
1803
1804/*
1805 * An internal function used by SlruScanDirectory().
1806 *
1807 * Returns true if a file with a name of a given length may be a correct
1808 * SLRU segment.
1809 */
1810static inline bool
1812{
1813 if (ctl->options.long_segment_names)
1814 return (len == 15); /* see SlruFileName() */
1815 else
1816
1817 /*
1818 * Commit 638cf09e76d allowed 5-character lengths. Later commit
1819 * 73c986adde5 allowed 6-character length.
1820 *
1821 * Note: There is an ongoing plan to migrate all SLRUs to 64-bit page
1822 * numbers, and the corresponding 15-character file names, which may
1823 * eventually deprecate the support for 4, 5, and 6-character names.
1824 */
1825 return (len == 4 || len == 5 || len == 6);
1826}
1827
1828/*
1829 * Scan the SimpleLru directory and apply a callback to each file found in it.
1830 *
1831 * If the callback returns true, the scan is stopped. The last return value
1832 * from the callback is returned.
1833 *
1834 * The callback receives the following arguments: 1. the SlruCtl struct for the
1835 * slru being truncated; 2. the filename being considered; 3. the page number
1836 * for the first page of that file; 4. a pointer to the opaque data given to us
1837 * by the caller.
1838 *
1839 * Note that the ordering in which the directory is scanned is not guaranteed.
1840 *
1841 * Note that no locking is applied.
1842 */
1843bool
1845{
1846 bool retval = false;
1847 DIR *cldir;
1848 struct dirent *clde;
1849 int64 segno;
1850 int64 segpage;
1851
1852 cldir = AllocateDir(ctl->options.Dir);
1853 while ((clde = ReadDir(cldir, ctl->options.Dir)) != NULL)
1854 {
1855 size_t len;
1856
1857 len = strlen(clde->d_name);
1858
1860 strspn(clde->d_name, "0123456789ABCDEF") == len)
1861 {
1862 segno = strtoi64(clde->d_name, NULL, 16);
1864
1865 elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s",
1866 ctl->options.Dir, clde->d_name);
1867 retval = callback(ctl, clde->d_name, segpage, data);
1868 if (retval)
1869 break;
1870 }
1871 }
1872 FreeDir(cldir);
1873
1874 return retval;
1875}
1876
1877/*
1878 * Individual SLRUs (clog, ...) have to provide a sync.c handler function so
1879 * that they can provide the correct "SlruCtl" (otherwise we don't know how to
1880 * build the path), but they just forward to this common implementation that
1881 * performs the fsync.
1882 */
1883int
1884SlruSyncFileTag(SlruDesc *ctl, const FileTag *ftag, char *path)
1885{
1886 int fd;
1887 int save_errno;
1888 int result;
1889
1890 SlruFileName(ctl, path, ftag->segno);
1891
1893 if (fd < 0)
1894 return -1;
1895
1897 result = pg_fsync(fd);
1899 save_errno = errno;
1900
1902
1903 errno = save_errno;
1904 return result;
1905}
static void pg_atomic_write_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition atomics.h:485
static void pg_atomic_init_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition atomics.h:453
static uint64 pg_atomic_read_u64(volatile pg_atomic_uint64 *ptr)
Definition atomics.h:467
#define INT64CONST(x)
Definition c.h:630
#define Min(x, y)
Definition c.h:1091
#define MAXALIGN(LEN)
Definition c.h:896
#define Max(x, y)
Definition c.h:1085
#define BUFFERALIGN(LEN)
Definition c.h:898
#define Assert(condition)
Definition c.h:943
int64_t int64
Definition c.h:621
#define PG_BINARY
Definition c.h:1374
uint32_t uint32
Definition c.h:624
#define MemSet(start, val, len)
Definition c.h:1107
uint32 TransactionId
Definition c.h:736
size_t Size
Definition c.h:689
uint32 result
memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets))
int errcode_for_file_access(void)
Definition elog.c:897
#define LOG
Definition elog.h:32
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
#define DEBUG2
Definition elog.h:30
#define ERROR
Definition elog.h:40
#define elog(elevel,...)
Definition elog.h:228
#define ereport(elevel,...)
Definition elog.h:152
int FreeDir(DIR *dir)
Definition fd.c:3009
int CloseTransientFile(int fd)
Definition fd.c:2855
void fsync_fname(const char *fname, bool isdir)
Definition fd.c:757
int data_sync_elevel(int elevel)
Definition fd.c:3986
DIR * AllocateDir(const char *dirname)
Definition fd.c:2891
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition fd.c:2957
int pg_fsync(int fd)
Definition fd.c:390
int OpenTransientFile(const char *fileName, int fileFlags)
Definition fd.c:2678
int NBuffers
Definition globals.c:144
#define newval
#define GUC_check_errdetail
Definition guc.h:507
int i
Definition isn.c:77
bool LWLockHeldByMe(LWLock *lock)
Definition lwlock.c:1885
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1150
int LWLockNewTrancheId(const char *name)
Definition lwlock.c:562
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1929
void LWLockRelease(LWLock *lock)
Definition lwlock.c:1767
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition lwlock.c:670
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1321
@ LW_SHARED
Definition lwlock.h:105
@ LW_EXCLUSIVE
Definition lwlock.h:104
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition mcxt.c:1232
MemoryContext TopMemoryContext
Definition mcxt.c:166
#define START_CRIT_SECTION()
Definition miscadmin.h:152
#define END_CRIT_SECTION()
Definition miscadmin.h:154
static char * errmsg
#define NAMEDATALEN
#define MAXPGPATH
#define SLRU_PAGES_PER_SEGMENT
const void size_t len
const void * data
static char * filename
Definition pg_dumpall.c:133
static XLogRecPtr endpos
void pgstat_count_slru_blocks_zeroed(int slru_idx)
void pgstat_count_slru_blocks_hit(int slru_idx)
void pgstat_count_slru_truncate(int slru_idx)
void pgstat_count_slru_blocks_read(int slru_idx)
void pgstat_count_slru_blocks_written(int slru_idx)
void pgstat_count_slru_flush(int slru_idx)
void pgstat_count_slru_blocks_exists(int slru_idx)
PgStat_CheckpointerStats PendingCheckpointerStats
int pgstat_get_slru_index(const char *name)
#define pg_pwrite
Definition port.h:248
#define pg_pread
Definition port.h:247
#define snprintf
Definition port.h:260
static int fd(const char *x, int i)
static int fb(int x)
tree ctl
Definition radixtree.h:1838
void ShmemRequestInternal(ShmemStructOpts *options, ShmemRequestKind kind)
Definition shmem.c:337
@ SHMEM_KIND_SLRU
static void SimpleLruZeroLSNs(SlruDesc *ctl, int slotno)
Definition slru.c:450
bool SlruScanDirectory(SlruDesc *ctl, SlruScanCallback callback, void *data)
Definition slru.c:1844
static bool SlruMayDeleteSegment(SlruDesc *ctl, int64 segpage, int64 cutoffPage)
Definition slru.c:1653
#define INIT_SLRUFILETAG(a, xx_handler, xx_segno)
Definition slru.c:159
int SimpleLruReadPage_ReadOnly(SlruDesc *ctl, int64 pageno, const void *opaque_data)
Definition slru.c:654
static int SlruSelectLRUPage(SlruDesc *ctl, int64 pageno)
Definition slru.c:1219
#define SLRU_BANK_SIZE
Definition slru.c:146
static void SlruReportIOError(SlruDesc *ctl, int64 pageno, const void *opaque_data)
Definition slru.c:1097
int SimpleLruAutotuneBuffers(int divisor, int max)
Definition slru.c:235
bool SlruScanDirCbReportPresence(SlruDesc *ctl, char *filename, int64 segpage, void *data)
Definition slru.c:1765
static SlruErrorCause slru_errcause
Definition slru.c:177
static bool SlruScanDirCbDeleteCutoff(SlruDesc *ctl, char *filename, int64 segpage, void *data)
Definition slru.c:1781
static int SlruFileName(SlruDesc *ctl, char *path, int64 segno)
Definition slru.c:94
#define MAX_WRITEALL_BUFFERS
Definition slru.c:126
static bool SlruCorrectSegmentFilenameLength(SlruDesc *ctl, size_t len)
Definition slru.c:1811
static void SlruInternalDeleteSegment(SlruDesc *ctl, int64 segno)
Definition slru.c:1553
static bool SlruPhysicalWritePage(SlruDesc *ctl, int64 pageno, int slotno, SlruWriteAll fdata)
Definition slru.c:925
static int slru_errno
Definition slru.c:178
void shmem_slru_init(void *location, ShmemStructOpts *base_options)
Definition slru.c:267
void SimpleLruTruncate(SlruDesc *ctl, int64 cutoffPage)
Definition slru.c:1458
static bool SlruPhysicalReadPage(SlruDesc *ctl, int64 pageno, int slotno)
Definition slru.c:853
void SimpleLruZeroAndWritePage(SlruDesc *ctl, int64 pageno)
Definition slru.c:466
static void SimpleLruWaitIO(SlruDesc *ctl, int slotno)
Definition slru.c:492
static void SlruInternalWritePage(SlruDesc *ctl, int slotno, SlruWriteAll fdata)
Definition slru.c:701
void SlruDeleteSegment(SlruDesc *ctl, int64 segno)
Definition slru.c:1576
int SimpleLruZeroPage(SlruDesc *ctl, int64 pageno)
Definition slru.c:397
#define SlotGetBankNumber(slotno)
Definition slru.c:151
void shmem_slru_attach(void *location, ShmemStructOpts *base_options)
Definition slru.c:359
bool SimpleLruDoesPhysicalPageExist(SlruDesc *ctl, int64 pageno)
Definition slru.c:795
static Size SimpleLruShmemSize(int nslots, int nlsns)
Definition slru.c:202
struct SlruWriteAllData * SlruWriteAll
Definition slru.c:135
SlruErrorCause
Definition slru.c:168
@ SLRU_WRITE_FAILED
Definition slru.c:172
@ SLRU_FSYNC_FAILED
Definition slru.c:173
@ SLRU_SEEK_FAILED
Definition slru.c:170
@ SLRU_OPEN_FAILED
Definition slru.c:169
@ SLRU_CLOSE_FAILED
Definition slru.c:174
@ SLRU_READ_FAILED
Definition slru.c:171
bool SlruScanDirCbDeleteAll(SlruDesc *ctl, char *filename, int64 segpage, void *data)
Definition slru.c:1797
void SimpleLruWritePage(SlruDesc *ctl, int slotno)
Definition slru.c:781
void SimpleLruWriteAll(SlruDesc *ctl, bool allow_redirtied)
Definition slru.c:1372
int SimpleLruReadPage(SlruDesc *ctl, int64 pageno, bool write_ok, const void *opaque_data)
Definition slru.c:550
static void SlruRecentlyUsed(SlruShared shared, int slotno)
Definition slru.c:1173
void SimpleLruRequestWithOpts(const SlruOpts *options)
Definition slru.c:246
bool check_slru_buffers(const char *name, int *newval)
Definition slru.c:377
int SlruSyncFileTag(SlruDesc *ctl, const FileTag *ftag, char *path)
Definition slru.c:1884
SlruSharedData * SlruShared
Definition slru.h:108
#define SlruPagePrecedesUnitTests(ctl, per_page)
Definition slru.h:233
bool(* SlruScanCallback)(SlruDesc *ctl, char *filename, int64 segpage, void *data)
Definition slru.h:238
#define SLRU_MAX_ALLOWED_BUFFERS
Definition slru.h:26
static LWLock * SimpleLruGetBankLock(SlruDesc *ctl, int64 pageno)
Definition slru.h:207
SlruPageStatus
Definition slru.h:35
@ SLRU_PAGE_VALID
Definition slru.h:38
@ SLRU_PAGE_WRITE_IN_PROGRESS
Definition slru.h:39
@ SLRU_PAGE_READ_IN_PROGRESS
Definition slru.h:37
@ SLRU_PAGE_EMPTY
Definition slru.h:36
int ckpt_slru_written
Definition xlog.h:180
Definition dirent.c:26
Definition sync.h:51
uint64 segno
Definition sync.h:55
PgStat_Counter slru_written
Definition pgstat.h:271
uint16 nbanks
Definition slru.h:197
SlruOpts options
Definition slru.h:192
SlruShared shared
Definition slru.h:194
int bank_tranche_id
Definition slru.h:183
const char * name
Definition slru.h:123
int buffer_tranche_id
Definition slru.h:182
int slru_stats_idx
Definition slru.h:105
int64 * page_number
Definition slru.h:60
int num_slots
Definition slru.h:51
LWLockPadded * bank_locks
Definition slru.h:67
int * page_lru_count
Definition slru.h:61
pg_atomic_uint64 latest_page_number
Definition slru.h:102
XLogRecPtr * group_lsn
Definition slru.h:94
int * bank_cur_lru_count
Definition slru.h:84
int lsn_groups_per_page
Definition slru.h:95
SlruPageStatus * page_status
Definition slru.h:58
bool * page_dirty
Definition slru.h:59
LWLockPadded * buffer_locks
Definition slru.h:64
char ** page_buffer
Definition slru.h:57
int fd[MAX_WRITEALL_BUFFERS]
Definition slru.c:131
int64 segno[MAX_WRITEALL_BUFFERS]
Definition slru.c:132
bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, bool retryOnError)
Definition sync.c:581
@ SYNC_HANDLER_NONE
Definition sync.h:42
@ SYNC_FORGET_REQUEST
Definition sync.h:27
@ SYNC_REQUEST
Definition sync.h:25
static void callback(struct sockaddr *addr, struct sockaddr *mask, void *unused)
static bool TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2)
Definition transam.h:312
static bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition transam.h:263
LWLock lock
Definition lwlock.h:70
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:67
static void pgstat_report_wait_end(void)
Definition wait_event.h:83
const char * name
CheckpointStatsData CheckpointStats
Definition xlog.c:216
void XLogFlush(XLogRecPtr record)
Definition xlog.c:2801
#define XLogRecPtrIsValid(r)
Definition xlogdefs.h:29
uint64 XLogRecPtr
Definition xlogdefs.h:21
bool InRecovery
Definition xlogutils.c:50