PostgreSQL Source Code git master
Loading...
Searching...
No Matches
slru.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * slru.c
4 * Simple LRU buffering for wrap-around-able permanent metadata
5 *
6 * This module is used to maintain various pieces of transaction status
7 * indexed by TransactionId (such as commit status, parent transaction ID,
8 * commit timestamp), as well as storage for multixacts, serializable
9 * isolation locks and NOTIFY traffic. Extensions can define their own
10 * SLRUs, too.
11 *
12 * Under ordinary circumstances we expect that write traffic will occur
13 * mostly to the latest page (and to the just-prior page, soon after a
14 * page transition). Read traffic will probably touch a larger span of
15 * pages, but a relatively small number of buffers should be sufficient.
16 *
17 * We use a simple least-recently-used scheme to manage a pool of shared
18 * page buffers, split in banks by the lowest bits of the page number, and
19 * the management algorithm only processes the bank to which the desired
20 * page belongs, so a linear search is sufficient; there's no need for a
21 * hashtable or anything fancy. The algorithm is straight LRU except that
22 * we will never swap out the latest page (since we know it's going to be
23 * hit again eventually).
24 *
25 * We use per-bank control LWLocks to protect the shared data structures,
26 * plus per-buffer LWLocks that synchronize I/O for each buffer. The
27 * bank's control lock must be held to examine or modify any of the bank's
28 * shared state. A process that is reading in or writing out a page
29 * buffer does not hold the control lock, only the per-buffer lock for the
30 * buffer it is working on. One exception is latest_page_number, which is
31 * read and written using atomic ops.
32 *
33 * "Holding the bank control lock" means exclusive lock in all cases
34 * except for SimpleLruReadPage_ReadOnly(); see comments for
35 * SlruRecentlyUsed() for the implications of that.
36 *
37 * When initiating I/O on a buffer, we acquire the per-buffer lock exclusively
38 * before releasing the control lock. The per-buffer lock is released after
39 * completing the I/O, re-acquiring the control lock, and updating the shared
40 * state. (Deadlock is not possible here, because we never try to initiate
41 * I/O when someone else is already doing I/O on the same buffer.)
42 * To wait for I/O to complete, release the control lock, acquire the
43 * per-buffer lock in shared mode, immediately release the per-buffer lock,
44 * reacquire the control lock, and then recheck state (since arbitrary things
45 * could have happened while we didn't have the lock).
46 *
47 * As with the regular buffer manager, it is possible for another process
48 * to re-dirty a page that is currently being written out. This is handled
49 * by re-setting the page's page_dirty flag.
50 *
51 *
52 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
53 * Portions Copyright (c) 1994, Regents of the University of California
54 *
55 * src/backend/access/transam/slru.c
56 *
57 *-------------------------------------------------------------------------
58 */
59#include "postgres.h"
60
61#include <fcntl.h>
62#include <sys/stat.h>
63#include <unistd.h>
64
65#include "access/slru.h"
66#include "access/transam.h"
67#include "access/xlog.h"
68#include "access/xlogutils.h"
69#include "miscadmin.h"
70#include "pgstat.h"
71#include "storage/fd.h"
72#include "storage/shmem.h"
73#include "utils/guc.h"
74#include "utils/wait_event.h"
75
76/*
77 * Converts segment number to the filename of the segment.
78 *
79 * "path" should point to a buffer at least MAXPGPATH characters long.
80 *
81 * If ctl->long_segment_names is true, segno can be in the range [0, 2^60-1].
82 * The resulting file name is made of 15 characters, e.g. dir/123456789ABCDEF.
83 *
84 * If ctl->long_segment_names is false, segno can be in the range [0, 2^24-1].
85 * The resulting file name is made of 4 to 6 characters, as of:
86 *
87 * dir/1234 for [0, 2^16-1]
88 * dir/12345 for [2^16, 2^20-1]
89 * dir/123456 for [2^20, 2^24-1]
90 */
91static inline int
92SlruFileName(SlruCtl ctl, char *path, int64 segno)
93{
94 if (ctl->long_segment_names)
95 {
96 /*
97 * We could use 16 characters here but the disadvantage would be that
98 * the SLRU segments will be hard to distinguish from WAL segments.
99 *
100 * For this reason we use 15 characters. It is enough but also means
101 * that in the future we can't decrease SLRU_PAGES_PER_SEGMENT easily.
102 */
103 Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFFFFFFFFFFF));
104 return snprintf(path, MAXPGPATH, "%s/%015" PRIX64, ctl->Dir, segno);
105 }
106 else
107 {
108 /*
109 * Despite the fact that %04X format string is used up to 24 bit
110 * integers are allowed. See SlruCorrectSegmentFilenameLength()
111 */
112 Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFF));
113 return snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir,
114 (unsigned int) segno);
115 }
116}
117
118/*
119 * During SimpleLruWriteAll(), we will usually not need to write more than one
120 * or two physical files, but we may need to write several pages per file. We
121 * can consolidate the I/O requests by leaving files open until control returns
122 * to SimpleLruWriteAll(). This data structure remembers which files are open.
123 */
124#define MAX_WRITEALL_BUFFERS 16
125
126typedef struct SlruWriteAllData
127{
128 int num_files; /* # files actually open */
129 int fd[MAX_WRITEALL_BUFFERS]; /* their FD's */
130 int64 segno[MAX_WRITEALL_BUFFERS]; /* their log seg#s */
132
134
135
136/*
137 * Bank size for the slot array. Pages are assigned a bank according to their
138 * page number, with each bank being this size. We want a power of 2 so that
139 * we can determine the bank number for a page with just bit shifting; we also
140 * want to keep the bank size small so that LRU victim search is fast. 16
141 * buffers per bank seems a good number.
142 */
143#define SLRU_BANK_BITSHIFT 4
144#define SLRU_BANK_SIZE (1 << SLRU_BANK_BITSHIFT)
145
146/*
147 * Macro to get the bank number to which the slot belongs.
148 */
149#define SlotGetBankNumber(slotno) ((slotno) >> SLRU_BANK_BITSHIFT)
150
151
152/*
153 * Populate a file tag describing a segment file. We only use the segment
154 * number, since we can derive everything else we need by having separate
155 * sync handler functions for clog, multixact etc.
156 */
157#define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \
158( \
159 memset(&(a), 0, sizeof(FileTag)), \
160 (a).handler = (xx_handler), \
161 (a).segno = (xx_segno) \
162)
163
164/* Saved info for SlruReportIOError */
174
176static int slru_errno;
177
178
179static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
180static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
182static bool SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno);
183static bool SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno,
185static void SlruReportIOError(SlruCtl ctl, int64 pageno,
186 const void *opaque_data);
187static int SlruSelectLRUPage(SlruCtl ctl, int64 pageno);
188
190 int64 segpage, void *data);
192static inline void SlruRecentlyUsed(SlruShared shared, int slotno);
193
194
195/*
196 * Initialization of shared memory
197 */
198
199Size
200SimpleLruShmemSize(int nslots, int nlsns)
201{
202 int nbanks = nslots / SLRU_BANK_SIZE;
203 Size sz;
204
206 Assert(nslots % SLRU_BANK_SIZE == 0);
207
208 /* we assume nslots isn't so large as to risk overflow */
209 sz = MAXALIGN(sizeof(SlruSharedData));
210 sz += MAXALIGN(nslots * sizeof(char *)); /* page_buffer[] */
211 sz += MAXALIGN(nslots * sizeof(SlruPageStatus)); /* page_status[] */
212 sz += MAXALIGN(nslots * sizeof(bool)); /* page_dirty[] */
213 sz += MAXALIGN(nslots * sizeof(int64)); /* page_number[] */
214 sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */
215 sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */
216 sz += MAXALIGN(nbanks * sizeof(LWLockPadded)); /* bank_locks[] */
217 sz += MAXALIGN(nbanks * sizeof(int)); /* bank_cur_lru_count[] */
218
219 if (nlsns > 0)
220 sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */
221
222 return BUFFERALIGN(sz) + BLCKSZ * nslots;
223}
224
225/*
226 * Determine a number of SLRU buffers to use.
227 *
228 * We simply divide shared_buffers by the divisor given and cap
229 * that at the maximum given; but always at least SLRU_BANK_SIZE.
230 * Round down to the nearest multiple of SLRU_BANK_SIZE.
231 */
232int
234{
235 return Min(max - (max % SLRU_BANK_SIZE),
238}
239
240/*
241 * Initialize, or attach to, a simple LRU cache in shared memory.
242 *
243 * ctl: address of local (unshared) control structure.
244 * name: name of SLRU. (This is user-visible, pick with care!)
245 * nslots: number of page slots to use.
246 * nlsns: number of LSN groups per page (set to zero if not relevant).
247 * subdir: PGDATA-relative subdirectory that will contain the files.
248 * buffer_tranche_id: tranche ID to use for the SLRU's per-buffer LWLocks.
249 * bank_tranche_id: tranche ID to use for the bank LWLocks.
250 * sync_handler: which set of functions to use to handle sync requests
251 * long_segment_names: use short or long segment names
252 */
253void
254SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
255 const char *subdir, int buffer_tranche_id, int bank_tranche_id,
256 SyncRequestHandler sync_handler, bool long_segment_names)
257{
258 SlruShared shared;
259 bool found;
260 int nbanks = nslots / SLRU_BANK_SIZE;
261
263
264 Assert(ctl->PagePrecedes != NULL);
265 Assert(ctl->errdetail_for_io_error != NULL);
266
268 SimpleLruShmemSize(nslots, nlsns),
269 &found);
270
272 {
273 /* Initialize locks and shared memory area */
274 char *ptr;
275 Size offset;
276
277 Assert(!found);
278
279 memset(shared, 0, sizeof(SlruSharedData));
280
281 shared->num_slots = nslots;
282 shared->lsn_groups_per_page = nlsns;
283
285
287
288 ptr = (char *) shared;
289 offset = MAXALIGN(sizeof(SlruSharedData));
290 shared->page_buffer = (char **) (ptr + offset);
291 offset += MAXALIGN(nslots * sizeof(char *));
292 shared->page_status = (SlruPageStatus *) (ptr + offset);
293 offset += MAXALIGN(nslots * sizeof(SlruPageStatus));
294 shared->page_dirty = (bool *) (ptr + offset);
295 offset += MAXALIGN(nslots * sizeof(bool));
296 shared->page_number = (int64 *) (ptr + offset);
297 offset += MAXALIGN(nslots * sizeof(int64));
298 shared->page_lru_count = (int *) (ptr + offset);
299 offset += MAXALIGN(nslots * sizeof(int));
300
301 /* Initialize LWLocks */
302 shared->buffer_locks = (LWLockPadded *) (ptr + offset);
303 offset += MAXALIGN(nslots * sizeof(LWLockPadded));
304 shared->bank_locks = (LWLockPadded *) (ptr + offset);
305 offset += MAXALIGN(nbanks * sizeof(LWLockPadded));
306 shared->bank_cur_lru_count = (int *) (ptr + offset);
307 offset += MAXALIGN(nbanks * sizeof(int));
308
309 if (nlsns > 0)
310 {
311 shared->group_lsn = (XLogRecPtr *) (ptr + offset);
312 offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
313 }
314
315 ptr += BUFFERALIGN(offset);
316 for (int slotno = 0; slotno < nslots; slotno++)
317 {
320
321 shared->page_buffer[slotno] = ptr;
323 shared->page_dirty[slotno] = false;
324 shared->page_lru_count[slotno] = 0;
325 ptr += BLCKSZ;
326 }
327
328 /* Initialize the slot banks. */
329 for (int bankno = 0; bankno < nbanks; bankno++)
330 {
332 shared->bank_cur_lru_count[bankno] = 0;
333 }
334
335 /* Should fit to estimated shmem size */
336 Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
337 }
338 else
339 {
340 Assert(found);
341 Assert(shared->num_slots == nslots);
342 }
343
344 /*
345 * Initialize the unshared control struct, including directory path. We
346 * assume caller set PagePrecedes.
347 */
348 ctl->shared = shared;
349 ctl->sync_handler = sync_handler;
350 ctl->long_segment_names = long_segment_names;
351 ctl->nbanks = nbanks;
352 strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir));
353}
354
355/*
356 * Helper function for GUC check_hook to check whether slru buffers are in
357 * multiples of SLRU_BANK_SIZE.
358 */
359bool
361{
362 /* Valid values are multiples of SLRU_BANK_SIZE */
363 if (*newval % SLRU_BANK_SIZE == 0)
364 return true;
365
366 GUC_check_errdetail("\"%s\" must be a multiple of %d.", name,
368 return false;
369}
370
371/*
372 * Initialize (or reinitialize) a page to zeroes.
373 *
374 * The page is not actually written, just set up in shared memory.
375 * The slot number of the new page is returned.
376 *
377 * Bank lock must be held at entry, and will be held at exit.
378 */
379int
381{
382 SlruShared shared = ctl->shared;
383 int slotno;
384
386
387 /* Find a suitable buffer slot for the page */
388 slotno = SlruSelectLRUPage(ctl, pageno);
390 (shared->page_status[slotno] == SLRU_PAGE_VALID &&
391 !shared->page_dirty[slotno]) ||
392 shared->page_number[slotno] == pageno);
393
394 /* Mark the slot as containing this page */
395 shared->page_number[slotno] = pageno;
397 shared->page_dirty[slotno] = true;
398 SlruRecentlyUsed(shared, slotno);
399
400 /* Set the buffer to zeroes */
401 MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
402
403 /* Set the LSNs for this new page to zero */
405
406 /*
407 * Assume this page is now the latest active page.
408 *
409 * Note that because both this routine and SlruSelectLRUPage run with a
410 * SLRU bank lock held, it is not possible for this to be zeroing a page
411 * that SlruSelectLRUPage is going to evict simultaneously. Therefore,
412 * there's no memory barrier here.
413 */
414 pg_atomic_write_u64(&shared->latest_page_number, pageno);
415
416 /* update the stats counter of zeroed pages */
418
419 return slotno;
420}
421
422/*
423 * Zero all the LSNs we store for this slru page.
424 *
425 * This should be called each time we create a new page, and each time we read
426 * in a page from disk into an existing buffer. (Such an old page cannot
427 * have any interesting LSNs, since we'd have flushed them before writing
428 * the page in the first place.)
429 *
430 * This assumes that InvalidXLogRecPtr is bitwise-all-0.
431 */
432static void
434{
435 SlruShared shared = ctl->shared;
436
437 if (shared->lsn_groups_per_page > 0)
438 MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
439 shared->lsn_groups_per_page * sizeof(XLogRecPtr));
440}
441
442/*
443 * This is a convenience wrapper for the common case of zeroing a page and
444 * immediately flushing it to disk.
445 *
446 * SLRU bank lock is acquired and released here.
447 */
448void
450{
451 int slotno;
452 LWLock *lock;
453
454 lock = SimpleLruGetBankLock(ctl, pageno);
456
457 /* Create and zero the page */
458 slotno = SimpleLruZeroPage(ctl, pageno);
459
460 /* Make sure it's written out */
462 Assert(!ctl->shared->page_dirty[slotno]);
463
464 LWLockRelease(lock);
465}
466
467/*
468 * Wait for any active I/O on a page slot to finish. (This does not
469 * guarantee that new I/O hasn't been started before we return, though.
470 * In fact the slot might not even contain the same page anymore.)
471 *
472 * Bank lock must be held at entry, and will be held at exit.
473 */
474static void
476{
477 SlruShared shared = ctl->shared;
479
481
482 /* See notes at top of file */
487
488 /*
489 * If the slot is still in an io-in-progress state, then either someone
490 * already started a new I/O on the slot, or a previous I/O failed and
491 * neglected to reset the page state. That shouldn't happen, really, but
492 * it seems worth a few extra cycles to check and recover from it. We can
493 * cheaply test for failure by seeing if the buffer lock is still held (we
494 * assume that transaction abort would release the lock).
495 */
498 {
500 {
501 /* indeed, the I/O must have failed */
504 else /* write_in_progress */
505 {
507 shared->page_dirty[slotno] = true;
508 }
510 }
511 }
512}
513
514/*
515 * Find a page in a shared buffer, reading it in if necessary.
516 * The page number must correspond to an already-initialized page.
517 *
518 * If write_ok is true then it is OK to return a page that is in
519 * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure
520 * that modification of the page is safe. If write_ok is false then we
521 * will not return the page until it is not undergoing active I/O.
522 *
523 * On error, the passed-in 'opaque_data' is passed to the
524 * 'errdetail_for_io_error' callback, to provide details on the operation that
525 * failed. It is only used for error reporting.
526 *
527 * Return value is the shared-buffer slot number now holding the page.
528 * The buffer's LRU access info is updated.
529 *
530 * The correct bank lock must be held at entry, and will be held at exit.
531 */
532int
534 const void *opaque_data)
535{
536 SlruShared shared = ctl->shared;
538
540
541 /* Outer loop handles restart if we must wait for someone else's I/O */
542 for (;;)
543 {
544 int slotno;
545 bool ok;
546
547 /* See if page already is in memory; if not, pick victim slot */
548 slotno = SlruSelectLRUPage(ctl, pageno);
549
550 /* Did we find the page in memory? */
551 if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
552 shared->page_number[slotno] == pageno)
553 {
554 /*
555 * If page is still being read in, we must wait for I/O. Likewise
556 * if the page is being written and the caller said that's not OK.
557 */
560 !write_ok))
561 {
563 /* Now we must recheck state from the top */
564 continue;
565 }
566 /* Otherwise, it's ready to use */
567 SlruRecentlyUsed(shared, slotno);
568
569 /* update the stats counter of pages found in the SLRU */
571
572 return slotno;
573 }
574
575 /* We found no match; assert we selected a freeable slot */
577 (shared->page_status[slotno] == SLRU_PAGE_VALID &&
578 !shared->page_dirty[slotno]));
579
580 /* Mark the slot read-busy */
581 shared->page_number[slotno] = pageno;
583 shared->page_dirty[slotno] = false;
584
585 /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
587
588 /* Release bank lock while doing I/O */
590
591 /* Do the read */
592 ok = SlruPhysicalReadPage(ctl, pageno, slotno);
593
594 /* Set the LSNs for this newly read-in page to zero */
596
597 /* Re-acquire bank control lock and update page state */
599
600 Assert(shared->page_number[slotno] == pageno &&
602 !shared->page_dirty[slotno]);
603
605
607
608 /* Now it's okay to ereport if we failed */
609 if (!ok)
611
612 SlruRecentlyUsed(shared, slotno);
613
614 /* update the stats counter of pages not found in SLRU */
616
617 return slotno;
618 }
619}
620
621/*
622 * Find a page in a shared buffer, reading it in if necessary.
623 * The page number must correspond to an already-initialized page.
624 * The caller must intend only read-only access to the page.
625 *
626 * On error, the passed-in 'opaque_data' is passed to the
627 * 'errdetail_for_io_error' callback, to provide details on the operation that
628 * failed. It is only used for error reporting.
629 *
630 * Return value is the shared-buffer slot number now holding the page.
631 * The buffer's LRU access info is updated.
632 *
633 * Bank control lock must NOT be held at entry, but will be held at exit.
634 * It is unspecified whether the lock will be shared or exclusive.
635 */
636int
638{
639 SlruShared shared = ctl->shared;
641 int bankno = pageno % ctl->nbanks;
644
645 /* Try to find the page while holding only shared lock */
647
648 /* See if page is already in a buffer */
649 for (int slotno = bankstart; slotno < bankend; slotno++)
650 {
651 if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
652 shared->page_number[slotno] == pageno &&
654 {
655 /* See comments for SlruRecentlyUsed() */
656 SlruRecentlyUsed(shared, slotno);
657
658 /* update the stats counter of pages found in the SLRU */
660
661 return slotno;
662 }
663 }
664
665 /* No luck, so switch to normal exclusive lock and do regular read */
668
669 return SimpleLruReadPage(ctl, pageno, true, opaque_data);
670}
671
672/*
673 * Write a page from a shared buffer, if necessary.
674 * Does nothing if the specified slot is not dirty.
675 *
676 * NOTE: only one write attempt is made here. Hence, it is possible that
677 * the page is still dirty at exit (if someone else re-dirtied it during
678 * the write). However, we *do* attempt a fresh write even if the page
679 * is already being written; this is for checkpoints.
680 *
681 * Bank lock must be held at entry, and will be held at exit.
682 */
683static void
685{
686 SlruShared shared = ctl->shared;
687 int64 pageno = shared->page_number[slotno];
689 bool ok;
690
693
694 /* If a write is in progress, wait for it to finish */
696 shared->page_number[slotno] == pageno)
697 {
699 }
700
701 /*
702 * Do nothing if page is not dirty, or if buffer no longer contains the
703 * same page we were called for.
704 */
705 if (!shared->page_dirty[slotno] ||
706 shared->page_status[slotno] != SLRU_PAGE_VALID ||
707 shared->page_number[slotno] != pageno)
708 return;
709
710 /*
711 * Mark the slot write-busy, and clear the dirtybit. After this point, a
712 * transaction status update on this page will mark it dirty again.
713 */
715 shared->page_dirty[slotno] = false;
716
717 /* Acquire per-buffer lock (cannot deadlock, see notes at top) */
719
720 /* Release bank lock while doing I/O */
722
723 /* Do the write */
725
726 /* If we failed, and we're in a flush, better close the files */
727 if (!ok && fdata)
728 {
729 for (int i = 0; i < fdata->num_files; i++)
731 }
732
733 /* Re-acquire bank lock and update page state */
735
736 Assert(shared->page_number[slotno] == pageno &&
738
739 /* If we failed to write, mark the page dirty again */
740 if (!ok)
741 shared->page_dirty[slotno] = true;
742
744
746
747 /* Now it's okay to ereport if we failed */
748 if (!ok)
749 SlruReportIOError(ctl, pageno, NULL);
750
751 /* If part of a checkpoint, count this as a SLRU buffer written. */
752 if (fdata)
753 {
756 }
757}
758
759/*
760 * Wrapper of SlruInternalWritePage, for external callers.
761 * fdata is always passed a NULL here.
762 */
763void
765{
766 Assert(ctl->shared->page_status[slotno] != SLRU_PAGE_EMPTY);
767
769}
770
771/*
772 * Return whether the given page exists on disk.
773 *
774 * A false return means that either the file does not exist, or that it's not
775 * large enough to contain the given page.
776 */
777bool
779{
781 int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
782 int offset = rpageno * BLCKSZ;
783 char path[MAXPGPATH];
784 int fd;
785 bool result;
787
788 /* update the stats counter of checked pages */
789 pgstat_count_slru_blocks_exists(ctl->shared->slru_stats_idx);
790
791 SlruFileName(ctl, path, segno);
792
794 if (fd < 0)
795 {
796 /* expected: file doesn't exist */
797 if (errno == ENOENT)
798 return false;
799
800 /* report error normally */
803 SlruReportIOError(ctl, pageno, NULL);
804 }
805
806 if ((endpos = lseek(fd, 0, SEEK_END)) < 0)
807 {
810 SlruReportIOError(ctl, pageno, NULL);
811 }
812
813 result = endpos >= (off_t) (offset + BLCKSZ);
814
815 if (CloseTransientFile(fd) != 0)
816 {
819 return false;
820 }
821
822 return result;
823}
824
825/*
826 * Physical read of a (previously existing) page into a buffer slot
827 *
828 * On failure, we cannot just ereport(ERROR) since caller has put state in
829 * shared memory that must be undone. So, we return false and save enough
830 * info in static variables to let SlruReportIOError make the report.
831 *
832 * For now, assume it's not worth keeping a file pointer open across
833 * read/write operations. We could cache one virtual file pointer ...
834 */
835static bool
837{
838 SlruShared shared = ctl->shared;
840 int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
841 off_t offset = rpageno * BLCKSZ;
842 char path[MAXPGPATH];
843 int fd;
844
845 SlruFileName(ctl, path, segno);
846
847 /*
848 * In a crash-and-restart situation, it's possible for us to receive
849 * commands to set the commit status of transactions whose bits are in
850 * already-truncated segments of the commit log (see notes in
851 * SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case
852 * where the file doesn't exist, and return zeroes instead.
853 */
855 if (fd < 0)
856 {
857 if (errno != ENOENT || !InRecovery)
858 {
861 return false;
862 }
863
864 ereport(LOG,
865 (errmsg("file \"%s\" doesn't exist, reading as zeroes",
866 path)));
867 MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
868 return true;
869 }
870
871 errno = 0;
873 if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
874 {
879 return false;
880 }
882
883 if (CloseTransientFile(fd) != 0)
884 {
887 return false;
888 }
889
890 return true;
891}
892
893/*
894 * Physical write of a page from a buffer slot
895 *
896 * On failure, we cannot just ereport(ERROR) since caller has put state in
897 * shared memory that must be undone. So, we return false and save enough
898 * info in static variables to let SlruReportIOError make the report.
899 *
900 * For now, assume it's not worth keeping a file pointer open across
901 * independent read/write operations. We do batch operations during
902 * SimpleLruWriteAll, though.
903 *
904 * fdata is NULL for a standalone write, pointer to open-file info during
905 * SimpleLruWriteAll.
906 */
907static bool
909{
910 SlruShared shared = ctl->shared;
912 int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
913 off_t offset = rpageno * BLCKSZ;
914 char path[MAXPGPATH];
915 int fd = -1;
916
917 /* update the stats counter of written pages */
919
920 /*
921 * Honor the write-WAL-before-data rule, if appropriate, so that we do not
922 * write out data before associated WAL records. This is the same action
923 * performed during FlushBuffer() in the main buffer manager.
924 */
925 if (shared->group_lsn != NULL)
926 {
927 /*
928 * We must determine the largest async-commit LSN for the page. This
929 * is a bit tedious, but since this entire function is a slow path
930 * anyway, it seems better to do this here than to maintain a per-page
931 * LSN variable (which'd need an extra comparison in the
932 * transaction-commit path).
933 */
935 int lsnindex;
936
938 max_lsn = shared->group_lsn[lsnindex++];
939 for (int lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
940 {
942
943 if (max_lsn < this_lsn)
945 }
946
948 {
949 /*
950 * As noted above, elog(ERROR) is not acceptable here, so if
951 * XLogFlush were to fail, we must PANIC. This isn't much of a
952 * restriction because XLogFlush is just about all critical
953 * section anyway, but let's make sure.
954 */
958 }
959 }
960
961 /*
962 * During a SimpleLruWriteAll, we may already have the desired file open.
963 */
964 if (fdata)
965 {
966 for (int i = 0; i < fdata->num_files; i++)
967 {
968 if (fdata->segno[i] == segno)
969 {
970 fd = fdata->fd[i];
971 break;
972 }
973 }
974 }
975
976 if (fd < 0)
977 {
978 /*
979 * If the file doesn't already exist, we should create it. It is
980 * possible for this to need to happen when writing a page that's not
981 * first in its segment; we assume the OS can cope with that. (Note:
982 * it might seem that it'd be okay to create files only when
983 * SimpleLruZeroPage is called for the first page of a segment.
984 * However, if after a crash and restart the REDO logic elects to
985 * replay the log from a checkpoint before the latest one, then it's
986 * possible that we will get commands to set transaction status of
987 * transactions that have already been truncated from the commit log.
988 * Easiest way to deal with that is to accept references to
989 * nonexistent files here and in SlruPhysicalReadPage.)
990 *
991 * Note: it is possible for more than one backend to be executing this
992 * code simultaneously for different pages of the same file. Hence,
993 * don't use O_EXCL or O_TRUNC or anything like that.
994 */
995 SlruFileName(ctl, path, segno);
997 if (fd < 0)
998 {
1000 slru_errno = errno;
1001 return false;
1002 }
1003
1004 if (fdata)
1005 {
1006 if (fdata->num_files < MAX_WRITEALL_BUFFERS)
1007 {
1008 fdata->fd[fdata->num_files] = fd;
1009 fdata->segno[fdata->num_files] = segno;
1010 fdata->num_files++;
1011 }
1012 else
1013 {
1014 /*
1015 * In the unlikely event that we exceed MAX_WRITEALL_BUFFERS,
1016 * fall back to treating it as a standalone write.
1017 */
1018 fdata = NULL;
1019 }
1020 }
1021 }
1022
1023 errno = 0;
1025 if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
1026 {
1028 /* if write didn't set errno, assume problem is no disk space */
1029 if (errno == 0)
1030 errno = ENOSPC;
1032 slru_errno = errno;
1033 if (!fdata)
1035 return false;
1036 }
1038
1039 /* Queue up a sync request for the checkpointer. */
1040 if (ctl->sync_handler != SYNC_HANDLER_NONE)
1041 {
1042 FileTag tag;
1043
1044 INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
1045 if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
1046 {
1047 /* No space to enqueue sync request. Do it synchronously. */
1049 if (pg_fsync(fd) != 0)
1050 {
1053 slru_errno = errno;
1055 return false;
1056 }
1058 }
1059 }
1060
1061 /* Close file, unless part of flush request. */
1062 if (!fdata)
1063 {
1064 if (CloseTransientFile(fd) != 0)
1065 {
1067 slru_errno = errno;
1068 return false;
1069 }
1070 }
1071
1072 return true;
1073}
1074
1075/*
1076 * Issue the error message after failure of SlruPhysicalReadPage or
1077 * SlruPhysicalWritePage. Call this after cleaning up shared-memory state.
1078 */
1079static void
1081{
1083 int rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
1084 int offset = rpageno * BLCKSZ;
1085 char path[MAXPGPATH];
1086
1087 SlruFileName(ctl, path, segno);
1088 errno = slru_errno;
1089 switch (slru_errcause)
1090 {
1091 case SLRU_OPEN_FAILED:
1092 ereport(ERROR,
1094 errmsg("could not open file \"%s\": %m", path),
1095 opaque_data ? ctl->errdetail_for_io_error(opaque_data) : 0));
1096 break;
1097 case SLRU_SEEK_FAILED:
1098 ereport(ERROR,
1100 errmsg("could not seek in file \"%s\" to offset %d: %m",
1101 path, offset),
1102 opaque_data ? ctl->errdetail_for_io_error(opaque_data) : 0));
1103 break;
1104 case SLRU_READ_FAILED:
1105 if (errno)
1106 ereport(ERROR,
1108 errmsg("could not read from file \"%s\" at offset %d: %m",
1109 path, offset),
1110 opaque_data ? ctl->errdetail_for_io_error(opaque_data) : 0));
1111 else
1112 ereport(ERROR,
1113 (errmsg("could not read from file \"%s\" at offset %d: read too few bytes",
1114 path, offset),
1115 opaque_data ? ctl->errdetail_for_io_error(opaque_data) : 0));
1116 break;
1117 case SLRU_WRITE_FAILED:
1118 if (errno)
1119 ereport(ERROR,
1121 errmsg("Could not write to file \"%s\" at offset %d: %m",
1122 path, offset),
1123 opaque_data ? ctl->errdetail_for_io_error(opaque_data) : 0));
1124 else
1125 ereport(ERROR,
1126 (errmsg("Could not write to file \"%s\" at offset %d: wrote too few bytes.",
1127 path, offset),
1128 opaque_data ? ctl->errdetail_for_io_error(opaque_data) : 0));
1129 break;
1130 case SLRU_FSYNC_FAILED:
1133 errmsg("could not fsync file \"%s\": %m",
1134 path),
1135 opaque_data ? ctl->errdetail_for_io_error(opaque_data) : 0));
1136 break;
1137 case SLRU_CLOSE_FAILED:
1138 ereport(ERROR,
1140 errmsg("could not close file \"%s\": %m",
1141 path),
1142 opaque_data ? ctl->errdetail_for_io_error(opaque_data) : 0));
1143 break;
1144 default:
1145 /* can't get here, we trust */
1146 elog(ERROR, "unrecognized SimpleLru error cause: %d",
1147 (int) slru_errcause);
1148 break;
1149 }
1150}
1151
1152/*
1153 * Mark a buffer slot "most recently used".
1154 */
1155static inline void
1157{
1160
1162
1163 /*
1164 * The reason for the if-test is that there are often many consecutive
1165 * accesses to the same page (particularly the latest page). By
1166 * suppressing useless increments of bank_cur_lru_count, we reduce the
1167 * probability that old pages' counts will "wrap around" and make them
1168 * appear recently used.
1169 *
1170 * We allow this code to be executed concurrently by multiple processes
1171 * within SimpleLruReadPage_ReadOnly(). As long as int reads and writes
1172 * are atomic, this should not cause any completely-bogus values to enter
1173 * the computation. However, it is possible for either bank_cur_lru_count
1174 * or individual page_lru_count entries to be "reset" to lower values than
1175 * they should have, in case a process is delayed while it executes this
1176 * function. With care in SlruSelectLRUPage(), this does little harm, and
1177 * in any case the absolute worst possible consequence is a nonoptimal
1178 * choice of page to evict. The gain from allowing concurrent reads of
1179 * SLRU pages seems worth it.
1180 */
1181 if (new_lru_count != shared->page_lru_count[slotno])
1182 {
1185 }
1186}
1187
1188/*
1189 * Select the slot to re-use when we need a free slot for the given page.
1190 *
1191 * The target page number is passed not only because we need to know the
1192 * correct bank to use, but also because we need to consider the possibility
1193 * that some other process reads in the target page while we are doing I/O to
1194 * free a slot. Hence, check or recheck to see if any slot already holds the
1195 * target page, and return that slot if so. Thus, the returned slot is
1196 * *either* a slot already holding the pageno (could be any state except
1197 * EMPTY), *or* a freeable slot (state EMPTY or CLEAN).
1198 *
1199 * The correct bank lock must be held at entry, and will be held at exit.
1200 */
1201static int
1203{
1204 SlruShared shared = ctl->shared;
1205
1206 /* Outer loop handles restart after I/O */
1207 for (;;)
1208 {
1209 int cur_count;
1210 int bestvalidslot = 0; /* keep compiler quiet */
1211 int best_valid_delta = -1;
1212 int64 best_valid_page_number = 0; /* keep compiler quiet */
1213 int bestinvalidslot = 0; /* keep compiler quiet */
1214 int best_invalid_delta = -1;
1215 int64 best_invalid_page_number = 0; /* keep compiler quiet */
1216 int bankno = pageno % ctl->nbanks;
1219
1221
1222 /* See if page already has a buffer assigned */
1223 for (int slotno = bankstart; slotno < bankend; slotno++)
1224 {
1225 if (shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
1226 shared->page_number[slotno] == pageno)
1227 return slotno;
1228 }
1229
1230 /*
1231 * If we find any EMPTY slot, just select that one. Else choose a
1232 * victim page to replace. We normally take the least recently used
1233 * valid page, but we will never take the slot containing
1234 * latest_page_number, even if it appears least recently used. We
1235 * will select a slot that is already I/O busy only if there is no
1236 * other choice: a read-busy slot will not be least recently used once
1237 * the read finishes, and waiting for an I/O on a write-busy slot is
1238 * inferior to just picking some other slot. Testing shows the slot
1239 * we pick instead will often be clean, allowing us to begin a read at
1240 * once.
1241 *
1242 * Normally the page_lru_count values will all be different and so
1243 * there will be a well-defined LRU page. But since we allow
1244 * concurrent execution of SlruRecentlyUsed() within
1245 * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages
1246 * acquire the same lru_count values. In that case we break ties by
1247 * choosing the furthest-back page.
1248 *
1249 * Notice that this next line forcibly advances bank_cur_lru_count to a
1250 * value that is certainly beyond any value that will be in the
1251 * page_lru_count array after the loop finishes. This ensures that
1252 * the next execution of SlruRecentlyUsed will mark the page newly
1253 * used, even if it's for a page that has the current counter value.
1254 * That gets us back on the path to having good data when there are
1255 * multiple pages with the same lru_count.
1256 */
1257 cur_count = (shared->bank_cur_lru_count[bankno])++;
1258 for (int slotno = bankstart; slotno < bankend; slotno++)
1259 {
1260 int this_delta;
1262
1263 if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1264 return slotno;
1265
1267 if (this_delta < 0)
1268 {
1269 /*
1270 * Clean up in case shared updates have caused cur_count
1271 * increments to get "lost". We back off the page counts,
1272 * rather than trying to increase cur_count, to avoid any
1273 * question of infinite loops or failure in the presence of
1274 * wrapped-around counts.
1275 */
1276 shared->page_lru_count[slotno] = cur_count;
1277 this_delta = 0;
1278 }
1279
1280 /*
1281 * If this page is the one most recently zeroed, don't consider it
1282 * an eviction candidate. See comments in SimpleLruZeroPage for an
1283 * explanation about the lack of a memory barrier here.
1284 */
1286 if (this_page_number ==
1288 continue;
1289
1290 if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1291 {
1294 ctl->PagePrecedes(this_page_number,
1296 {
1300 }
1301 }
1302 else
1303 {
1306 ctl->PagePrecedes(this_page_number,
1308 {
1312 }
1313 }
1314 }
1315
1316 /*
1317 * If all pages (except possibly the latest one) are I/O busy, we'll
1318 * have to wait for an I/O to complete and then retry. In that
1319 * unhappy case, we choose to wait for the I/O on the least recently
1320 * used slot, on the assumption that it was likely initiated first of
1321 * all the I/Os in progress and may therefore finish first.
1322 */
1323 if (best_valid_delta < 0)
1324 {
1326 continue;
1327 }
1328
1329 /*
1330 * If the selected page is clean, we're set.
1331 */
1332 if (!shared->page_dirty[bestvalidslot])
1333 return bestvalidslot;
1334
1335 /*
1336 * Write the page.
1337 */
1339
1340 /*
1341 * Now loop back and try again. This is the easiest way of dealing
1342 * with corner cases such as the victim page being re-dirtied while we
1343 * wrote it.
1344 */
1345 }
1346}
1347
1348/*
1349 * Write dirty pages to disk during checkpoint or database shutdown. Flushing
1350 * is deferred until the next call to ProcessSyncRequests(), though we do fsync
1351 * the containing directory here to make sure that newly created directory
1352 * entries are on disk.
1353 */
1354void
1356{
1357 SlruShared shared = ctl->shared;
1359 int64 pageno = 0;
1360 int prevbank = SlotGetBankNumber(0);
1361 bool ok;
1362
1363 /* update the stats counter of flushes */
1365
1366 /*
1367 * Find and write dirty pages
1368 */
1369 fdata.num_files = 0;
1370
1372
1373 for (int slotno = 0; slotno < shared->num_slots; slotno++)
1374 {
1376
1377 /*
1378 * If the current bank lock is not the same as the previous bank lock,
1379 * then release the previous lock and acquire the new lock.
1380 */
1381 if (curbank != prevbank)
1382 {
1385 prevbank = curbank;
1386 }
1387
1388 /* Do nothing if slot is unused */
1389 if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1390 continue;
1391
1393
1394 /*
1395 * In some places (e.g. checkpoints), we cannot assert that the slot
1396 * is clean now, since another process might have re-dirtied it
1397 * already. That's okay.
1398 */
1400 shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
1401 (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1402 !shared->page_dirty[slotno]));
1403 }
1404
1406
1407 /*
1408 * Now close any files that were open
1409 */
1410 ok = true;
1411 for (int i = 0; i < fdata.num_files; i++)
1412 {
1413 if (CloseTransientFile(fdata.fd[i]) != 0)
1414 {
1416 slru_errno = errno;
1417 pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
1418 ok = false;
1419 }
1420 }
1421 if (!ok)
1422 SlruReportIOError(ctl, pageno, NULL);
1423
1424 /* Ensure that directory entries for new files are on disk. */
1425 if (ctl->sync_handler != SYNC_HANDLER_NONE)
1426 fsync_fname(ctl->Dir, true);
1427}
1428
1429/*
1430 * Remove all segments before the one holding the passed page number
1431 *
1432 * All SLRUs prevent concurrent calls to this function, either with an LWLock
1433 * or by calling it only as part of a checkpoint. Mutual exclusion must begin
1434 * before computing cutoffPage. Mutual exclusion must end after any limit
1435 * update that would permit other backends to write fresh data into the
1436 * segment immediately preceding the one containing cutoffPage. Otherwise,
1437 * when the SLRU is quite full, SimpleLruTruncate() might delete that segment
1438 * after it has accrued freshly-written data.
1439 */
1440void
1442{
1443 SlruShared shared = ctl->shared;
1444 int prevbank;
1445
1446 /* update the stats counter of truncates */
1448
1449 /*
1450 * Scan shared memory and remove any pages preceding the cutoff page, to
1451 * ensure we won't rewrite them later. (Since this is normally called in
1452 * or just after a checkpoint, any dirty pages should have been flushed
1453 * already ... we're just being extra careful here.)
1454 */
1455restart:
1456
1457 /*
1458 * An important safety check: the current endpoint page must not be
1459 * eligible for removal. This check is just a backstop against wraparound
1460 * bugs elsewhere in SLRU handling, so we don't care if we read a slightly
1461 * outdated value; therefore we don't add a memory barrier.
1462 */
1463 if (ctl->PagePrecedes(pg_atomic_read_u64(&shared->latest_page_number),
1464 cutoffPage))
1465 {
1466 ereport(LOG,
1467 (errmsg("could not truncate directory \"%s\": apparent wraparound",
1468 ctl->Dir)));
1469 return;
1470 }
1471
1474 for (int slotno = 0; slotno < shared->num_slots; slotno++)
1475 {
1477
1478 /*
1479 * If the current bank lock is not the same as the previous bank lock,
1480 * then release the previous lock and acquire the new lock.
1481 */
1482 if (curbank != prevbank)
1483 {
1486 prevbank = curbank;
1487 }
1488
1489 if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1490 continue;
1491 if (!ctl->PagePrecedes(shared->page_number[slotno], cutoffPage))
1492 continue;
1493
1494 /*
1495 * If page is clean, just change state to EMPTY (expected case).
1496 */
1497 if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1498 !shared->page_dirty[slotno])
1499 {
1501 continue;
1502 }
1503
1504 /*
1505 * Hmm, we have (or may have) I/O operations acting on the page, so
1506 * we've got to wait for them to finish and then start again. This is
1507 * the same logic as in SlruSelectLRUPage. (XXX if page is dirty,
1508 * wouldn't it be OK to just discard it without writing it?
1509 * SlruMayDeleteSegment() uses a stricter qualification, so we might
1510 * not delete this page in the end; even if we don't delete it, we
1511 * won't have cause to read its data again. For now, keep the logic
1512 * the same as it was.)
1513 */
1514 if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1516 else
1518
1520 goto restart;
1521 }
1522
1524
1525 /* Now we can remove the old segment(s) */
1527}
1528
1529/*
1530 * Delete an individual SLRU segment.
1531 *
1532 * NB: This does not touch the SLRU buffers themselves, callers have to ensure
1533 * they either can't yet contain anything, or have already been cleaned out.
1534 */
1535static void
1537{
1538 char path[MAXPGPATH];
1539
1540 /* Forget any fsync requests queued for this segment. */
1541 if (ctl->sync_handler != SYNC_HANDLER_NONE)
1542 {
1543 FileTag tag;
1544
1545 INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
1547 }
1548
1549 /* Unlink the file. */
1550 SlruFileName(ctl, path, segno);
1551 ereport(DEBUG2, (errmsg_internal("removing file \"%s\"", path)));
1552 unlink(path);
1553}
1554
1555/*
1556 * Delete an individual SLRU segment, identified by the segment number.
1557 */
1558void
1560{
1561 SlruShared shared = ctl->shared;
1562 int prevbank = SlotGetBankNumber(0);
1563 bool did_write;
1564
1565 /* Clean out any possibly existing references to the segment. */
1567restart:
1568 did_write = false;
1569 for (int slotno = 0; slotno < shared->num_slots; slotno++)
1570 {
1573
1574 /*
1575 * If the current bank lock is not the same as the previous bank lock,
1576 * then release the previous lock and acquire the new lock.
1577 */
1578 if (curbank != prevbank)
1579 {
1582 prevbank = curbank;
1583 }
1584
1585 if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
1586 continue;
1587
1589 /* not the segment we're looking for */
1590 if (pagesegno != segno)
1591 continue;
1592
1593 /* If page is clean, just change state to EMPTY (expected case). */
1594 if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
1595 !shared->page_dirty[slotno])
1596 {
1598 continue;
1599 }
1600
1601 /* Same logic as SimpleLruTruncate() */
1602 if (shared->page_status[slotno] == SLRU_PAGE_VALID)
1604 else
1606
1607 did_write = true;
1608 }
1609
1610 /*
1611 * Be extra careful and re-check. The IO functions release the control
1612 * lock, so new pages could have been read in.
1613 */
1614 if (did_write)
1615 goto restart;
1616
1618
1620}
1621
1622/*
1623 * Determine whether a segment is okay to delete.
1624 *
1625 * segpage is the first page of the segment, and cutoffPage is the oldest (in
1626 * PagePrecedes order) page in the SLRU containing still-useful data. Since
1627 * every core PagePrecedes callback implements "wrap around", check the
1628 * segment's first and last pages:
1629 *
1630 * first<cutoff && last<cutoff: yes
1631 * first<cutoff && last>=cutoff: no; cutoff falls inside this segment
1632 * first>=cutoff && last<cutoff: no; wrap point falls inside this segment
1633 * first>=cutoff && last>=cutoff: no; every page of this segment is too young
1634 */
1635static bool
1637{
1639
1641
1642 return (ctl->PagePrecedes(segpage, cutoffPage) &&
1643 ctl->PagePrecedes(seg_last_page, cutoffPage));
1644}
1645
1646#ifdef USE_ASSERT_CHECKING
1647static void
1649{
1651 rhs;
1653 oldestPage;
1655 oldestXact;
1656
1657 /*
1658 * Compare an XID pair having undefined order (see RFC 1982), a pair at
1659 * "opposite ends" of the XID space. TransactionIdPrecedes() treats each
1660 * as preceding the other. If RHS is oldestXact, LHS is the first XID we
1661 * must not assign.
1662 */
1663 lhs = per_page + offset; /* skip first page to avoid non-normal XIDs */
1664 rhs = lhs + (1U << 31);
1673 Assert(!ctl->PagePrecedes(lhs / per_page, lhs / per_page));
1674 Assert(!ctl->PagePrecedes(lhs / per_page, rhs / per_page));
1675 Assert(!ctl->PagePrecedes(rhs / per_page, lhs / per_page));
1676 Assert(!ctl->PagePrecedes((lhs - per_page) / per_page, rhs / per_page));
1677 Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 3 * per_page) / per_page));
1678 Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 2 * per_page) / per_page));
1679 Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 1 * per_page) / per_page)
1680 || (1U << 31) % per_page != 0); /* See CommitTsPagePrecedes() */
1681 Assert(ctl->PagePrecedes((lhs + 1 * per_page) / per_page, rhs / per_page)
1682 || (1U << 31) % per_page != 0);
1683 Assert(ctl->PagePrecedes((lhs + 2 * per_page) / per_page, rhs / per_page));
1684 Assert(ctl->PagePrecedes((lhs + 3 * per_page) / per_page, rhs / per_page));
1685 Assert(!ctl->PagePrecedes(rhs / per_page, (lhs + per_page) / per_page));
1686
1687 /*
1688 * GetNewTransactionId() has assigned the last XID it can safely use, and
1689 * that XID is in the *LAST* page of the second segment. We must not
1690 * delete that segment.
1691 */
1693 newestXact = newestPage * per_page + offset;
1695 oldestXact = newestXact + 1;
1696 oldestXact -= 1U << 31;
1697 oldestPage = oldestXact / per_page;
1699 (newestPage -
1701 oldestPage));
1702
1703 /*
1704 * GetNewTransactionId() has assigned the last XID it can safely use, and
1705 * that XID is in the *FIRST* page of the second segment. We must not
1706 * delete that segment.
1707 */
1709 newestXact = newestPage * per_page + offset;
1711 oldestXact = newestXact + 1;
1712 oldestXact -= 1U << 31;
1713 oldestPage = oldestXact / per_page;
1715 (newestPage -
1717 oldestPage));
1718}
1719
1720/*
1721 * Unit-test a PagePrecedes function.
1722 *
1723 * This assumes every uint32 >= FirstNormalTransactionId is a valid key. It
1724 * assumes each value occupies a contiguous, fixed-size region of SLRU bytes.
1725 * (MultiXactMemberCtl separates flags from XIDs. NotifyCtl has
1726 * variable-length entries, no keys, and no random access. These unit tests
1727 * do not apply to them.)
1728 */
1729void
1731{
1732 /* Test first, middle and last entries of a page. */
1736}
1737#endif
1738
1739/*
1740 * SlruScanDirectory callback
1741 * This callback reports true if there's any segment wholly prior to the
1742 * one containing the page passed as "data".
1743 */
1744bool
1746 void *data)
1747{
1748 int64 cutoffPage = *(int64 *) data;
1749
1751 return true; /* found one; don't iterate any more */
1752
1753 return false; /* keep going */
1754}
1755
1756/*
1757 * SlruScanDirectory callback.
1758 * This callback deletes segments prior to the one passed in as "data".
1759 */
1760static bool
1762 void *data)
1763{
1764 int64 cutoffPage = *(int64 *) data;
1765
1768
1769 return false; /* keep going */
1770}
1771
1772/*
1773 * SlruScanDirectory callback.
1774 * This callback deletes all segments.
1775 */
1776bool
1778{
1780
1781 return false; /* keep going */
1782}
1783
1784/*
1785 * An internal function used by SlruScanDirectory().
1786 *
1787 * Returns true if a file with a name of a given length may be a correct
1788 * SLRU segment.
1789 */
1790static inline bool
1792{
1793 if (ctl->long_segment_names)
1794 return (len == 15); /* see SlruFileName() */
1795 else
1796
1797 /*
1798 * Commit 638cf09e76d allowed 5-character names. Later commit
1799 * 73c986adde5 allowed 6-character names.
1800 *
1801 * Note: There is an ongoing plan to migrate all SLRUs to 64-bit page
1802 * numbers, and the corresponding 15-character file names, which may
1803 * eventually deprecate the support for 4, 5, and 6-character names.
1804 */
1805 return (len == 4 || len == 5 || len == 6);
1806}
1807
1808/*
1809 * Scan the SimpleLru directory and apply a callback to each file found in it.
1810 *
1811 * If the callback returns true, the scan is stopped. The last return value
1812 * from the callback is returned.
1813 *
1814 * The callback receives the following arguments: 1. the SlruCtl struct for the
1815 * slru being truncated; 2. the filename being considered; 3. the page number
1816 * for the first page of that file; 4. a pointer to the opaque data given to us
1817 * by the caller.
1818 *
1819 * Note that the ordering in which the directory is scanned is not guaranteed.
1820 *
1821 * Note that no locking is applied.
1822 */
1823bool
1825{
1826 bool retval = false;
1827 DIR *cldir;
1828 struct dirent *clde;
1829 int64 segno;
1830 int64 segpage;
1831
1832 cldir = AllocateDir(ctl->Dir);
1833 while ((clde = ReadDir(cldir, ctl->Dir)) != NULL)
1834 {
1835 size_t len;
1836
1837 len = strlen(clde->d_name);
1838
1840 strspn(clde->d_name, "0123456789ABCDEF") == len)
1841 {
1842 segno = strtoi64(clde->d_name, NULL, 16);
1844
1845 elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s",
1846 ctl->Dir, clde->d_name);
1847 retval = callback(ctl, clde->d_name, segpage, data);
1848 if (retval)
1849 break;
1850 }
1851 }
1852 FreeDir(cldir);
1853
1854 return retval;
1855}
1856
1857/*
1858 * Individual SLRUs (clog, ...) have to provide a sync.c handler function so
1859 * that they can provide the correct "SlruCtl" (otherwise we don't know how to
1860 * build the path), but they just forward to this common implementation that
1861 * performs the fsync.
1862 */
1863int
1864SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path)
1865{
1866 int fd;
1867 int save_errno;
1868 int result;
1869
1870 SlruFileName(ctl, path, ftag->segno);
1871
1873 if (fd < 0)
1874 return -1;
1875
1877 result = pg_fsync(fd);
1879 save_errno = errno;
1880
1882
1883 errno = save_errno;
1884 return result;
1885}
static void pg_atomic_write_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition atomics.h:485
static void pg_atomic_init_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition atomics.h:453
static uint64 pg_atomic_read_u64(volatile pg_atomic_uint64 *ptr)
Definition atomics.h:467
#define INT64CONST(x)
Definition c.h:632
#define Min(x, y)
Definition c.h:1093
#define MAXALIGN(LEN)
Definition c.h:898
#define Max(x, y)
Definition c.h:1087
#define BUFFERALIGN(LEN)
Definition c.h:900
#define Assert(condition)
Definition c.h:945
int64_t int64
Definition c.h:615
#define PG_BINARY
Definition c.h:1376
uint32_t uint32
Definition c.h:618
#define MemSet(start, val, len)
Definition c.h:1109
uint32 TransactionId
Definition c.h:738
size_t Size
Definition c.h:691
int errcode_for_file_access(void)
Definition elog.c:897
#define LOG
Definition elog.h:31
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
#define DEBUG2
Definition elog.h:29
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
int FreeDir(DIR *dir)
Definition fd.c:3009
int CloseTransientFile(int fd)
Definition fd.c:2855
void fsync_fname(const char *fname, bool isdir)
Definition fd.c:757
int data_sync_elevel(int elevel)
Definition fd.c:3986
DIR * AllocateDir(const char *dirname)
Definition fd.c:2891
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition fd.c:2957
int pg_fsync(int fd)
Definition fd.c:390
int OpenTransientFile(const char *fileName, int fileFlags)
Definition fd.c:2678
int NBuffers
Definition globals.c:142
bool IsUnderPostmaster
Definition globals.c:120
#define newval
#define GUC_check_errdetail
Definition guc.h:507
int i
Definition isn.c:77
bool LWLockHeldByMe(LWLock *lock)
Definition lwlock.c:1912
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1177
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1956
void LWLockRelease(LWLock *lock)
Definition lwlock.c:1794
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition lwlock.c:699
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1348
@ LW_SHARED
Definition lwlock.h:113
@ LW_EXCLUSIVE
Definition lwlock.h:112
#define START_CRIT_SECTION()
Definition miscadmin.h:150
#define END_CRIT_SECTION()
Definition miscadmin.h:152
static char * errmsg
#define MAXPGPATH
#define SLRU_PAGES_PER_SEGMENT
const void size_t len
const void * data
static char * filename
Definition pg_dumpall.c:133
static XLogRecPtr endpos
void pgstat_count_slru_blocks_zeroed(int slru_idx)
void pgstat_count_slru_blocks_hit(int slru_idx)
void pgstat_count_slru_truncate(int slru_idx)
void pgstat_count_slru_blocks_read(int slru_idx)
void pgstat_count_slru_blocks_written(int slru_idx)
void pgstat_count_slru_flush(int slru_idx)
void pgstat_count_slru_blocks_exists(int slru_idx)
PgStat_CheckpointerStats PendingCheckpointerStats
int pgstat_get_slru_index(const char *name)
#define pg_pwrite
Definition port.h:248
#define pg_pread
Definition port.h:247
#define snprintf
Definition port.h:260
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition strlcpy.c:45
static int fd(const char *x, int i)
static int fb(int x)
tree ctl
Definition radixtree.h:1838
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition shmem.c:381
void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, const char *subdir, int buffer_tranche_id, int bank_tranche_id, SyncRequestHandler sync_handler, bool long_segment_names)
Definition slru.c:254
static int SlruFileName(SlruCtl ctl, char *path, int64 segno)
Definition slru.c:92
static bool SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
Definition slru.c:836
#define INIT_SLRUFILETAG(a, xx_handler, xx_segno)
Definition slru.c:157
void SimpleLruWritePage(SlruCtl ctl, int slotno)
Definition slru.c:764
void SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied)
Definition slru.c:1355
static bool SlruMayDeleteSegment(SlruCtl ctl, int64 segpage, int64 cutoffPage)
Definition slru.c:1636
static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno)
Definition slru.c:433
#define SLRU_BANK_SIZE
Definition slru.c:144
int SimpleLruAutotuneBuffers(int divisor, int max)
Definition slru.c:233
static void SlruReportIOError(SlruCtl ctl, int64 pageno, const void *opaque_data)
Definition slru.c:1080
static bool SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
Definition slru.c:908
static bool SlruCorrectSegmentFilenameLength(SlruCtl ctl, size_t len)
Definition slru.c:1791
static SlruErrorCause slru_errcause
Definition slru.c:175
#define MAX_WRITEALL_BUFFERS
Definition slru.c:124
int SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok, const void *opaque_data)
Definition slru.c:533
static void SimpleLruWaitIO(SlruCtl ctl, int slotno)
Definition slru.c:475
static int slru_errno
Definition slru.c:176
bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int64 pageno)
Definition slru.c:778
void SlruDeleteSegment(SlruCtl ctl, int64 segno)
Definition slru.c:1559
static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata)
Definition slru.c:684
bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
Definition slru.c:1824
bool SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int64 segpage, void *data)
Definition slru.c:1777
int SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path)
Definition slru.c:1864
static int SlruSelectLRUPage(SlruCtl ctl, int64 pageno)
Definition slru.c:1202
#define SlotGetBankNumber(slotno)
Definition slru.c:149
int SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
Definition slru.c:380
void SimpleLruZeroAndWritePage(SlruCtl ctl, int64 pageno)
Definition slru.c:449
void SimpleLruTruncate(SlruCtl ctl, int64 cutoffPage)
Definition slru.c:1441
static void SlruInternalDeleteSegment(SlruCtl ctl, int64 segno)
Definition slru.c:1536
int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int64 pageno, const void *opaque_data)
Definition slru.c:637
struct SlruWriteAllData * SlruWriteAll
Definition slru.c:133
SlruErrorCause
Definition slru.c:166
@ SLRU_WRITE_FAILED
Definition slru.c:170
@ SLRU_FSYNC_FAILED
Definition slru.c:171
@ SLRU_SEEK_FAILED
Definition slru.c:168
@ SLRU_OPEN_FAILED
Definition slru.c:167
@ SLRU_CLOSE_FAILED
Definition slru.c:172
@ SLRU_READ_FAILED
Definition slru.c:169
Size SimpleLruShmemSize(int nslots, int nlsns)
Definition slru.c:200
bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int64 segpage, void *data)
Definition slru.c:1745
static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int64 segpage, void *data)
Definition slru.c:1761
static void SlruRecentlyUsed(SlruShared shared, int slotno)
Definition slru.c:1156
bool check_slru_buffers(const char *name, int *newval)
Definition slru.c:360
static LWLock * SimpleLruGetBankLock(SlruCtl ctl, int64 pageno)
Definition slru.h:171
SlruSharedData * SlruShared
Definition slru.h:107
#define SlruPagePrecedesUnitTests(ctl, per_page)
Definition slru.h:196
bool(* SlruScanCallback)(SlruCtl ctl, char *filename, int64 segpage, void *data)
Definition slru.h:201
#define SLRU_MAX_ALLOWED_BUFFERS
Definition slru.h:25
SlruPageStatus
Definition slru.h:34
@ SLRU_PAGE_VALID
Definition slru.h:37
@ SLRU_PAGE_WRITE_IN_PROGRESS
Definition slru.h:38
@ SLRU_PAGE_READ_IN_PROGRESS
Definition slru.h:36
@ SLRU_PAGE_EMPTY
Definition slru.h:35
int ckpt_slru_written
Definition xlog.h:179
Definition dirent.c:26
Definition sync.h:51
uint64 segno
Definition sync.h:55
PgStat_Counter slru_written
Definition pgstat.h:270
int slru_stats_idx
Definition slru.h:104
int64 * page_number
Definition slru.h:59
int num_slots
Definition slru.h:50
LWLockPadded * bank_locks
Definition slru.h:66
int * page_lru_count
Definition slru.h:60
pg_atomic_uint64 latest_page_number
Definition slru.h:101
XLogRecPtr * group_lsn
Definition slru.h:93
int * bank_cur_lru_count
Definition slru.h:83
int lsn_groups_per_page
Definition slru.h:94
SlruPageStatus * page_status
Definition slru.h:57
bool * page_dirty
Definition slru.h:58
LWLockPadded * buffer_locks
Definition slru.h:63
char ** page_buffer
Definition slru.h:56
int fd[MAX_WRITEALL_BUFFERS]
Definition slru.c:129
int64 segno[MAX_WRITEALL_BUFFERS]
Definition slru.c:130
bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, bool retryOnError)
Definition sync.c:581
SyncRequestHandler
Definition sync.h:36
@ SYNC_HANDLER_NONE
Definition sync.h:42
@ SYNC_FORGET_REQUEST
Definition sync.h:27
@ SYNC_REQUEST
Definition sync.h:25
static void callback(struct sockaddr *addr, struct sockaddr *mask, void *unused)
static bool TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2)
Definition transam.h:312
static bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition transam.h:263
LWLock lock
Definition lwlock.h:70
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:69
static void pgstat_report_wait_end(void)
Definition wait_event.h:85
const char * name
CheckpointStatsData CheckpointStats
Definition xlog.c:213
void XLogFlush(XLogRecPtr record)
Definition xlog.c:2767
#define XLogRecPtrIsValid(r)
Definition xlogdefs.h:29
uint64 XLogRecPtr
Definition xlogdefs.h:21
bool InRecovery
Definition xlogutils.c:50