PostgreSQL Source Code git master
walsummarizer.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * walsummarizer.c
4 *
5 * Background process to perform WAL summarization, if it is enabled.
6 * It continuously scans the write-ahead log and periodically emits a
7 * summary file which indicates which blocks in which relation forks
8 * were modified by WAL records in the LSN range covered by the summary
9 * file. See walsummary.c and blkreftable.c for more details on the
10 * naming and contents of WAL summary files.
11 *
12 * If configured to do so, this background process will also remove WAL
13 * summary files when the file timestamp is older than a configurable
14 * threshold (but only if the WAL has been removed first).
15 *
16 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
17 *
18 * IDENTIFICATION
19 * src/backend/postmaster/walsummarizer.c
20 *
21 *-------------------------------------------------------------------------
22 */
23#include "postgres.h"
24
25#include "access/timeline.h"
26#include "access/xlog.h"
28#include "access/xlogrecovery.h"
29#include "access/xlogutils.h"
30#include "backup/walsummary.h"
33#include "common/blkreftable.h"
34#include "libpq/pqsignal.h"
35#include "miscadmin.h"
40#include "storage/fd.h"
41#include "storage/ipc.h"
42#include "storage/latch.h"
43#include "storage/lwlock.h"
44#include "storage/proc.h"
45#include "storage/procsignal.h"
46#include "storage/shmem.h"
47#include "utils/guc.h"
48#include "utils/memutils.h"
49#include "utils/wait_event.h"
50
51/*
52 * Data in shared memory related to WAL summarization.
53 */
54typedef struct
55{
56 /*
57 * These fields are protected by WALSummarizerLock.
58 *
59 * Until we've discovered what summary files already exist on disk and
60 * stored that information in shared memory, initialized is false and the
61 * other fields here contain no meaningful information. After that has
62 * been done, initialized is true.
63 *
64 * summarized_tli and summarized_lsn indicate the TLI and LSN at which
65 * the next summary file will start. Normally, these are the LSN and
66 * TLI at which the last file ended; in such case, lsn_is_exact is true.
67 * If, however, the LSN is just an approximation, then lsn_is_exact is
68 * false. This can happen if, for example, there are no existing WAL
69 * summary files at startup. In that case, we have to derive the position
70 * at which to start summarizing from the WAL files that exist on disk,
71 * and so the LSN might point to the start of the next file even though
72 * that might happen to be in the middle of a WAL record.
73 *
74 * summarizer_pgprocno is the proc number of the summarizer process, if
75 * one is running, or else INVALID_PROC_NUMBER.
76 *
77 * pending_lsn is used by the summarizer to advertise the ending LSN of a
78 * record it has recently read. It shouldn't ever be less than
79 * summarized_lsn, but might be greater, because the summarizer buffers
80 * data for a range of LSNs in memory before writing out a new file.
81 */
88
89 /*
90 * This field handles its own synchronization.
91 */
94
95/*
96 * Private data for our xlogreader's page read callback.
97 */
98typedef struct
99{
105
106/* Pointer to shared memory state. */
108
109/*
110 * When we reach end of WAL and need to read more, we sleep for a number of
111 * milliseconds that is an integer multiple of MS_PER_SLEEP_QUANTUM. This is
112 * the multiplier. It should vary between 1 and MAX_SLEEP_QUANTA, depending
113 * on system activity. See summarizer_wait_for_wal() for how we adjust this.
114 */
115static long sleep_quanta = 1; /* initial value is the minimum multiplier */
116
117/*
118 * The sleep time will always be a multiple of 200ms and will not exceed
119 * thirty seconds (150 * 200 = 30 * 1000). Note that the timeout here needs
120 * to be substantially less than the maximum amount of time for which an
121 * incremental backup will wait for this process to catch up. Otherwise, an
122 * incremental backup might time out on an idle system just because we sleep
123 * for too long.
124 */
125#define MAX_SLEEP_QUANTA 150 /* upper bound for sleep_quanta */
126#define MS_PER_SLEEP_QUANTUM 200 /* length of one sleep quantum, in ms */
127
128/*
129 * This is a count of the number of pages of WAL that we've read since the
130 * last time we waited for more WAL to appear.
131 */
133
134/*
135 * Most recent RedoRecPtr value observed by MaybeRemoveOldWalSummaries.
136 */
138
139/*
140 * GUC parameters
141 */
142bool summarize_wal = false; /* while false, GetOldestUnsummarizedLSN() returns InvalidXLogRecPtr and WAL-summary waiters give up */
144
145static void WalSummarizerShutdown(int code, Datum arg);
147static void HandleWalSummarizerInterrupts(void);
148static XLogRecPtr SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn,
149 bool exact, XLogRecPtr switch_lsn,
150 XLogRecPtr maximum_lsn);
152 BlockRefTable *brtab);
154 BlockRefTable *brtab);
156 BlockRefTable *brtab);
158 bool *new_fast_forward);
160 XLogRecPtr targetPagePtr,
161 int reqLen,
162 XLogRecPtr targetRecPtr,
163 char *cur_page);
164static void summarizer_wait_for_wal(void);
165static void MaybeRemoveOldWalSummaries(void);
166
167/*
168 * Amount of shared memory required for this module.
169 */
170Size
172{
 /* The module's shared state is one fixed-size WalSummarizerData struct. */
173 return sizeof(WalSummarizerData);
174}
175
176/*
177 * Create or attach to shared memory segment for this module.
178 */
179void
181{
182 bool found;
183
 /*
  * The struct is requested under the name "Wal Summarizer Ctl" with the
  * size reported by WalSummarizerShmemSize(); ShmemInitStruct() sets
  * 'found', which is tested below to detect the first call.
  */
185 ShmemInitStruct("Wal Summarizer Ctl", WalSummarizerShmemSize(),
186 &found);
187
188 if (!found)
189 {
190 /*
191 * First time through, so initialize.
192 *
193 * We're just filling in dummy values here -- the real initialization
194 * will happen when GetOldestUnsummarizedLSN() is called for the first
195 * time.
196 */
204 }
205}
206
207/*
208 * Entry point for walsummarizer process.
209 */
210void
211WalSummarizerMain(char *startup_data, size_t startup_data_len)
212{
213 sigjmp_buf local_sigjmp_buf;
214 MemoryContext context;
215
216 /*
217 * Within this function, 'current_lsn' and 'current_tli' refer to the
218 * point from which the next WAL summary file should start. 'exact' is
219 * true if 'current_lsn' is known to be the start of a WAL record or WAL
220 * segment, and false if it might be in the middle of a record someplace.
221 *
222 * 'switch_lsn' and 'switch_tli', if set, are the LSN at which we need to
223 * switch to a new timeline and the timeline to which we need to switch.
224 * If not set, we either haven't figured out the answers yet or we're
225 * already on the latest timeline.
226 */
227 XLogRecPtr current_lsn;
228 TimeLineID current_tli;
229 bool exact;
230 XLogRecPtr switch_lsn = InvalidXLogRecPtr;
231 TimeLineID switch_tli = 0;
232
 /* This process expects to be started without any startup data. */
233 Assert(startup_data_len == 0);
234
237
239 (errmsg_internal("WAL summarizer started")));
240
241 /*
242 * Properly accept or ignore signals the postmaster might send us
243 *
244 * We have no particular use for SIGINT at the moment, but it seems
245 * reasonable to treat it like SIGTERM.
246 */
250 /* SIGQUIT handler was already set up by InitPostmasterChild */
251 pqsignal(SIGALRM, SIG_IGN);
252 pqsignal(SIGPIPE, SIG_IGN);
254 pqsignal(SIGUSR2, SIG_IGN); /* not used */
255
256 /* Advertise ourselves. */
258 LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
260 LWLockRelease(WALSummarizerLock);
261
262 /* Create and switch to a memory context that we can reset on error. */
264 "Wal Summarizer",
266 MemoryContextSwitchTo(context);
267
268 /*
269 * Reset some signals that are accepted by postmaster but not here
270 */
271 pqsignal(SIGCHLD, SIG_DFL);
272
273 /*
274 * If an exception is encountered, processing resumes here.
 *
 * The direct call to sigsetjmp() returns zero, so this block is skipped
 * on the normal path and only runs when we arrive here via an error.
275 */
276 if (sigsetjmp(local_sigjmp_buf, 1) != 0)
277 {
278 /* Since not using PG_TRY, must reset error stack by hand */
279 error_context_stack = NULL;
280
281 /* Prevent interrupts while cleaning up */
283
284 /* Report the error to the server log */
286
287 /* Release resources we might have acquired. */
292 AtEOXact_Files(false);
293 AtEOXact_HashTables(false);
294
295 /*
296 * Now return to normal top-level context and clear ErrorContext for
297 * next time.
298 */
299 MemoryContextSwitchTo(context);
301
302 /* Flush any leaked data in the top-level context */
303 MemoryContextReset(context);
304
305 /* Now we can allow interrupts again */
307
308 /*
309 * Sleep for 10 seconds before attempting to resume operations in
310 * order to avoid excessive logging.
311 *
312 * Many of the likely error conditions are things that will repeat
313 * every time. For example, if the WAL can't be read or the summary
314 * can't be written, only administrator action will cure the problem.
315 * So a really fast retry time doesn't seem to be especially
316 * beneficial, and it will clutter the logs.
317 */
318 (void) WaitLatch(NULL,
320 10000,
321 WAIT_EVENT_WAL_SUMMARIZER_ERROR);
 /* Fall through: the setup below is repeated after every error. */
322 }
323
324 /* We can now handle ereport(ERROR) */
325 PG_exception_stack = &local_sigjmp_buf;
326
327 /*
328 * Unblock signals (they were blocked when the postmaster forked us)
329 */
330 sigprocmask(SIG_SETMASK, &UnBlockSig, NULL);
331
332 /*
333 * Fetch information about previous progress from shared memory, and ask
334 * GetOldestUnsummarizedLSN to reset pending_lsn to summarized_lsn. We
335 * might be recovering from an error, and if so, pending_lsn might have
336 * advanced past summarized_lsn, but any WAL we read previously has been
337 * lost and will need to be reread.
338 *
339 * If we discover that WAL summarization is not enabled, just exit.
 * (GetOldestUnsummarizedLSN reports that case by returning
 * InvalidXLogRecPtr.)
340 */
341 current_lsn = GetOldestUnsummarizedLSN(&current_tli, &exact);
342 if (XLogRecPtrIsInvalid(current_lsn))
343 proc_exit(0);
344
345 /*
346 * Loop forever
347 */
348 for (;;)
349 {
350 XLogRecPtr latest_lsn;
351 TimeLineID latest_tli;
352 XLogRecPtr end_of_summary_lsn;
353
354 /* Flush any leaked data in the top-level context */
355 MemoryContextReset(context);
356
357 /* Process any signals received recently. */
359
360 /* If it's time to remove any old WAL summaries, do that now. */
362
363 /* Find the LSN and TLI up to which we can safely summarize. */
364 latest_lsn = GetLatestLSN(&latest_tli);
365
366 /*
367 * If we're summarizing a historic timeline and we haven't yet
368 * computed the point at which to switch to the next timeline, do that
369 * now.
370 *
371 * Note that if this is a standby, what was previously the current
372 * timeline could become historic at any time.
373 *
374 * We could try to make this more efficient by caching the results of
375 * readTimeLineHistory when latest_tli has not changed, but since we
376 * only have to do this once per timeline switch, we probably wouldn't
377 * save any significant amount of work in practice.
378 */
379 if (current_tli != latest_tli && XLogRecPtrIsInvalid(switch_lsn))
380 {
381 List *tles = readTimeLineHistory(latest_tli);
382
383 switch_lsn = tliSwitchPoint(current_tli, tles, &switch_tli);
385 errmsg_internal("switch point from TLI %u to TLI %u is at %X/%X",
386 current_tli, switch_tli, LSN_FORMAT_ARGS(switch_lsn)));
387 }
388
389 /*
390 * If we've reached the switch LSN, we can't summarize anything else
391 * on this timeline. Switch to the next timeline and go around again,
392 * backing up to the exact switch point if we passed it.
393 */
394 if (!XLogRecPtrIsInvalid(switch_lsn) && current_lsn >= switch_lsn)
395 {
396 /* Restart summarization from switch point. */
397 current_tli = switch_tli;
398 current_lsn = switch_lsn;
399
400 /* Next timeline and switch point, if any, not yet known. */
401 switch_lsn = InvalidXLogRecPtr;
402 switch_tli = 0;
403
404 /* Update (really, rewind, if needed) state in shared memory. */
405 LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
406 WalSummarizerCtl->summarized_lsn = current_lsn;
407 WalSummarizerCtl->summarized_tli = current_tli;
409 WalSummarizerCtl->pending_lsn = current_lsn;
410 LWLockRelease(WALSummarizerLock);
411
 /* Nothing was summarized in this iteration; start the next one. */
412 continue;
413 }
414
415 /* Summarize WAL. */
416 end_of_summary_lsn = SummarizeWAL(current_tli,
417 current_lsn, exact,
418 switch_lsn, latest_lsn);
419 Assert(!XLogRecPtrIsInvalid(end_of_summary_lsn));
420 Assert(end_of_summary_lsn >= current_lsn);
421
422 /*
423 * Update state for next loop iteration.
424 *
425 * Next summary file should start from exactly where this one ended.
426 */
427 current_lsn = end_of_summary_lsn;
428 exact = true;
429
430 /* Update state in shared memory. */
431 LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
432 WalSummarizerCtl->summarized_lsn = end_of_summary_lsn;
433 WalSummarizerCtl->summarized_tli = current_tli;
435 WalSummarizerCtl->pending_lsn = end_of_summary_lsn;
436 LWLockRelease(WALSummarizerLock);
437
438 /* Wake up anyone waiting for more summary files to be written. */
440 }
441}
442
443/*
444 * Get information about the state of the WAL summarizer.
445 */
446void
447GetWalSummarizerState(TimeLineID *summarized_tli, XLogRecPtr *summarized_lsn,
448 XLogRecPtr *pending_lsn, int *summarizer_pid)
449{
 /*
  * All four output pointers are written unconditionally below, so callers
  * must pass a valid pointer for each of them. The shared state is only
  * read here, hence the shared lock mode.
  */
450 LWLockAcquire(WALSummarizerLock, LW_SHARED);
452 {
453 /*
454 * If initialized is false, the rest of the structure contents are
455 * undefined.
 *
 * Report "nothing known": TLI 0, invalid LSNs, and PID -1.
456 */
457 *summarized_tli = 0;
458 *summarized_lsn = InvalidXLogRecPtr;
459 *pending_lsn = InvalidXLogRecPtr;
460 *summarizer_pid = -1;
461 }
462 else
463 {
464 int summarizer_pgprocno = WalSummarizerCtl->summarizer_pgprocno;
465
466 *summarized_tli = WalSummarizerCtl->summarized_tli;
467 *summarized_lsn = WalSummarizerCtl->summarized_lsn;
468 if (summarizer_pgprocno == INVALID_PROC_NUMBER)
469 {
470 /*
471 * If the summarizer has exited, the fact that it had processed
472 * beyond summarized_lsn is irrelevant now.
 *
 * So report summarized_lsn as the pending LSN too.
473 */
474 *pending_lsn = WalSummarizerCtl->summarized_lsn;
475 *summarizer_pid = -1;
476 }
477 else
478 {
479 *pending_lsn = WalSummarizerCtl->pending_lsn;
480
481 /*
482 * We're not fussed about inexact answers here, since they could
483 * become stale instantly, so we don't bother taking the lock, but
484 * make sure that invalid PID values are normalized to -1.
485 */
486 *summarizer_pid = GetPGProcByNumber(summarizer_pgprocno)->pid;
487 if (*summarizer_pid <= 0)
488 *summarizer_pid = -1;
489 }
490 }
491 LWLockRelease(WALSummarizerLock);
492}
493
494/*
495 * Get the oldest LSN in this server's timeline history that has not yet been
496 * summarized, and update shared memory state as appropriate.
497 *
498 * If tli != NULL, *tli will be set to the TLI for the LSN that is returned.
499 *
500 * If lsn_is_exact != NULL, *lsn_is_exact will be set to true if the returned
501 * LSN is necessarily the start of a WAL record and false if it's just the
502 * beginning of a WAL segment.
503 */
505GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact)
506{
507 TimeLineID latest_tli;
508 int n;
509 List *tles;
510 XLogRecPtr unsummarized_lsn = InvalidXLogRecPtr;
511 TimeLineID unsummarized_tli = 0;
512 bool should_make_exact = false;
513 List *existing_summaries;
514 ListCell *lc;
515 bool am_wal_summarizer = AmWalSummarizerProcess();
516
517 /* If not summarizing WAL, do nothing; *tli and *lsn_is_exact are left unset. */
518 if (!summarize_wal)
519 return InvalidXLogRecPtr;
520
521 /*
522 * If we are not the WAL summarizer process, then we normally just want to
523 * read the values from shared memory. However, as an exception, if shared
524 * memory hasn't been initialized yet, then we need to do that so that we
525 * can read legal values and not remove any WAL too early.
526 */
527 if (!am_wal_summarizer)
528 {
529 LWLockAcquire(WALSummarizerLock, LW_SHARED);
530
532 {
533 unsummarized_lsn = WalSummarizerCtl->summarized_lsn;
534 if (tli != NULL)
536 if (lsn_is_exact != NULL)
537 *lsn_is_exact = WalSummarizerCtl->lsn_is_exact;
538 LWLockRelease(WALSummarizerLock);
539 return unsummarized_lsn;
540 }
541
 /* Not answered from shared memory; drop the lock and compute below. */
542 LWLockRelease(WALSummarizerLock);
543 }
544
545 /*
546 * Find the oldest timeline on which WAL still exists, and the earliest
547 * segment for which it exists.
548 *
549 * Note that we do this every time the WAL summarizer process restarts or
550 * recovers from an error, in case the contents of pg_wal have changed
551 * under us e.g. if some files were removed, either manually - which
552 * shouldn't really happen, but might - or by postgres itself, if
553 * summarize_wal was turned off and then back on again.
 *
 * The history list is walked from its last entry toward its first, and
 * the walk stops at the first timeline for which XLogGetOldestSegno()
 * returns a nonzero segment number.
554 */
555 (void) GetLatestLSN(&latest_tli);
556 tles = readTimeLineHistory(latest_tli);
557 for (n = list_length(tles) - 1; n >= 0; --n)
558 {
559 TimeLineHistoryEntry *tle = list_nth(tles, n);
560 XLogSegNo oldest_segno;
561
562 oldest_segno = XLogGetOldestSegno(tle->tli);
563 if (oldest_segno != 0)
564 {
565 /* Compute oldest LSN that still exists on disk. */
567 unsummarized_lsn);
568
569 unsummarized_tli = tle->tli;
570 break;
571 }
572 }
573
574 /*
575 * Don't try to summarize anything older than the end LSN of the newest
576 * summary file that exists for this timeline.
577 */
578 existing_summaries =
579 GetWalSummaries(unsummarized_tli,
581 foreach(lc, existing_summaries)
582 {
583 WalSummaryFile *ws = lfirst(lc);
584
585 if (ws->end_lsn > unsummarized_lsn)
586 {
 /* Starting where an existing summary ended, so the LSN is exact. */
587 unsummarized_lsn = ws->end_lsn;
588 should_make_exact = true;
589 }
590 }
591
592 /* It really should not be possible for us to find no WAL. */
593 if (unsummarized_tli == 0)
595 errcode(ERRCODE_INTERNAL_ERROR),
596 errmsg_internal("no WAL found on timeline %u", latest_tli));
597
598 /*
599 * If we're the WAL summarizer, we always want to store the values we just
600 * computed into shared memory, because those are the values we're going
601 * to use to drive our operation, and so they are the authoritative
602 * values. Otherwise, we only store values into shared memory if shared
603 * memory is uninitialized. Our values are not canonical in such a case,
604 * but it's better to have something than nothing, to guide WAL retention.
 *
 * The initialized flag is rechecked here because no lock was held while
 * the values above were being computed.
605 */
606 LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
607 if (am_wal_summarizer || !WalSummarizerCtl->initialized)
608 {
610 WalSummarizerCtl->summarized_lsn = unsummarized_lsn;
611 WalSummarizerCtl->summarized_tli = unsummarized_tli;
612 WalSummarizerCtl->lsn_is_exact = should_make_exact;
613 WalSummarizerCtl->pending_lsn = unsummarized_lsn;
614 }
615 else
616 unsummarized_lsn = WalSummarizerCtl->summarized_lsn;
617
618 /* Also return the values to the caller as required. */
619 if (tli != NULL)
621 if (lsn_is_exact != NULL)
622 *lsn_is_exact = WalSummarizerCtl->lsn_is_exact;
623 LWLockRelease(WALSummarizerLock);
624
625 return unsummarized_lsn;
626}
627
628/*
629 * Wake up the WAL summarizer process.
630 *
631 * This might not work, because there's no guarantee that the WAL summarizer
632 * process was successfully started, and it also might have started but
633 * subsequently terminated. So, under normal circumstances, this will get the
634 * latch set, but there's no guarantee.
635 */
636void
638{
639 ProcNumber pgprocno;
640
 /* Without a shared-memory pointer there is nothing we can look up. */
641 if (WalSummarizerCtl == NULL)
642 return;
643
644 LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
646 LWLockRelease(WALSummarizerLock);
647
 /* Act only if a summarizer proc number is currently advertised. */
648 if (pgprocno != INVALID_PROC_NUMBER)
650}
651
652/*
653 * Wait until WAL summarization reaches the given LSN, but time out with an
654 * error if the summarizer seems to be stuck.
655 *
656 * Returns immediately if summarize_wal is turned off while we wait. Caller
657 * is expected to handle this case, if necessary.
658 */
659void
661{
662 TimestampTz initial_time,
663 cycle_time,
664 current_time;
665 XLogRecPtr prior_pending_lsn = InvalidXLogRecPtr;
666 int deadcycles = 0;
667
 /*
  * initial_time anchors the total elapsed time reported to the user;
  * cycle_time marks the start of the current 10-second wait cycle.
  */
668 initial_time = cycle_time = GetCurrentTimestamp();
669
670 while (1)
671 {
 /* Length of one wait cycle; reduced below by time already spent in it. */
672 long timeout_in_ms = 10000;
673 XLogRecPtr summarized_lsn;
674 XLogRecPtr pending_lsn;
675
677
678 /* If WAL summarization is disabled while we're waiting, give up. */
679 if (!summarize_wal)
680 return;
681
682 /*
683 * Read the on-disk and in-memory progress together under the lock.
684 */
685 LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
686 summarized_lsn = WalSummarizerCtl->summarized_lsn;
687 pending_lsn = WalSummarizerCtl->pending_lsn;
688 LWLockRelease(WALSummarizerLock);
689
690 /* If WAL summarization has progressed sufficiently, stop waiting. */
691 if (summarized_lsn >= lsn)
692 break;
693
694 /* Recheck current time. */
695 current_time = GetCurrentTimestamp();
696
697 /* Have we finished the current cycle of waiting? */
698 if (TimestampDifferenceMilliseconds(cycle_time,
699 current_time) >= timeout_in_ms)
700 {
701 long elapsed_seconds;
702
703 /* Begin new wait cycle. */
704 cycle_time = TimestampTzPlusMilliseconds(cycle_time,
705 timeout_in_ms);
706
707 /*
708 * Keep track of the number of cycles during which there has been
709 * no progression of pending_lsn. If pending_lsn is not advancing,
710 * that means that not only are no new files appearing on disk,
711 * but we're not even incorporating new records into the in-memory
712 * state.
713 */
714 if (pending_lsn > prior_pending_lsn)
715 {
716 prior_pending_lsn = pending_lsn;
717 deadcycles = 0;
718 }
719 else
720 ++deadcycles;
721
722 /*
723 * If we've managed to wait for an entire minute without the WAL
724 * summarizer absorbing a single WAL record, error out; probably
725 * something is wrong.
726 *
727 * We could consider also erroring out if the summarizer is taking
728 * too long to catch up, but it's not clear what rate of progress
729 * would be acceptable and what would be too slow. So instead, we
730 * just try to error out in the case where there's no progress at
731 * all. That seems likely to catch a reasonable number of the
732 * things that can go wrong in practice (e.g. the summarizer
733 * process is completely hung, say because somebody hooked up a
734 * debugger to it or something) without giving up too quickly when
735 * the system is just slow.
 *
 * Six consecutive 10-second cycles without progress make up that
 * minute.
736 */
737 if (deadcycles >= 6)
739 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
740 errmsg("WAL summarization is not progressing"),
741 errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.",
742 LSN_FORMAT_ARGS(lsn),
743 LSN_FORMAT_ARGS(summarized_lsn),
744 LSN_FORMAT_ARGS(pending_lsn))));
745
746
747 /*
748 * Otherwise, just let the user know what's happening.
749 */
750 elapsed_seconds =
752 current_time) / 1000;
754 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
755 errmsg_plural("still waiting for WAL summarization through %X/%X after %ld second",
756 "still waiting for WAL summarization through %X/%X after %ld seconds",
757 elapsed_seconds,
758 LSN_FORMAT_ARGS(lsn),
759 elapsed_seconds),
760 errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.",
761 LSN_FORMAT_ARGS(summarized_lsn),
762 LSN_FORMAT_ARGS(pending_lsn))));
763 }
764
765 /*
766 * Align the wait time to prevent drift. This doesn't really matter,
767 * but we'd like the warnings about how long we've been waiting to say
768 * 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever
769 * drifting to something that is not a multiple of ten.
770 */
771 timeout_in_ms -=
772 TimestampDifferenceMilliseconds(cycle_time, current_time);
773
774 /* Wait and see. */
776 timeout_in_ms,
777 WAIT_EVENT_WAL_SUMMARY_READY);
778 }
779
781}
782
783/*
784 * On exit, update shared memory to make it clear that we're no longer
785 * running.
786 */
787static void
789{
 /* The shared state is protected by WALSummarizerLock, taken exclusively here. */
790 LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
792 LWLockRelease(WALSummarizerLock);
793}
794
795/*
796 * Get the latest LSN that is eligible to be summarized, and set *tli to the
797 * corresponding timeline.
798 */
799static XLogRecPtr
801{
802 if (!RecoveryInProgress())
803 {
804 /* Don't summarize WAL before it's flushed. */
805 return GetFlushRecPtr(tli);
806 }
807 else
808 {
809 XLogRecPtr flush_lsn;
810 TimeLineID flush_tli;
811 XLogRecPtr replay_lsn;
812 TimeLineID replay_tli;
813 TimeLineID insert_tli;
814
815 /*
816 * After the insert TLI has been set and before the control file has
817 * been updated to show the DB in production, RecoveryInProgress()
818 * will return true, because it's not yet safe for all backends to
819 * begin writing WAL. However, replay has already ceased, so from our
820 * point of view, recovery is already over. We should summarize up to
821 * where replay stopped and then prepare to resume at the start of the
822 * insert timeline.
 *
 * A return value of 0 from GetWALInsertionTimeLineIfSet() is treated
 * as "not set yet".
823 */
824 if ((insert_tli = GetWALInsertionTimeLineIfSet()) != 0)
825 {
 /* Report the insert TLI; the replay TLI is not requested (NULL). */
826 *tli = insert_tli;
827 return GetXLogReplayRecPtr(NULL);
828 }
829
830 /*
831 * What we really want to know is how much WAL has been flushed to
832 * disk, but the only flush position available is the one provided by
833 * the walreceiver, which may not be running, because this could be
834 * crash recovery or recovery via restore_command. So use either the
835 * WAL receiver's flush position or the replay position, whichever is
836 * further ahead, on the theory that if the WAL has been replayed then
837 * it must also have been flushed to disk.
 *
 * If the two positions are equal, the replay position and its TLI are
 * the ones returned.
838 */
839 flush_lsn = GetWalRcvFlushRecPtr(NULL, &flush_tli);
840 replay_lsn = GetXLogReplayRecPtr(&replay_tli);
841 if (flush_lsn > replay_lsn)
842 {
843 *tli = flush_tli;
844 return flush_lsn;
845 }
846 else
847 {
848 *tli = replay_tli;
849 return replay_lsn;
850 }
851 }
852}
853
854/*
855 * Interrupt handler for main loop of WAL summarizer process.
856 */
857static void
859{
862
864 {
 /* Clear the flag so this reload request is not acted on twice. */
865 ConfigReloadPending = false;
867 }
868
870 {
 /* Leave the process with exit status 0. */
872 errmsg_internal("WAL summarizer shutting down"));
873 proc_exit(0);
874 }
875
876 /* Perform logging of memory contexts of this process */
879}
880
881/*
882 * Summarize a range of WAL records on a single timeline.
883 *
884 * 'tli' is the timeline to be summarized.
885 *
886 * 'start_lsn' is the point at which we should start summarizing. If this
887 * value comes from the end LSN of the previous record as returned by the
888 * xlogreader machinery, 'exact' should be true; otherwise, 'exact' should
889 * be false, and this function will search forward for the start of a valid
890 * WAL record.
891 *
892 * 'switch_lsn' is the point at which we should switch to a later timeline,
893 * if we're summarizing a historic timeline.
894 *
895 * 'maximum_lsn' identifies the point beyond which we can't count on being
896 * able to read any more WAL. It should be the switch point when reading a
897 * historic timeline, or the most-recently-measured end of WAL when reading
898 * the current timeline.
899 *
900 * The return value is the LSN at which the WAL summary actually ends. Most
901 * often, a summary file ends because we notice that a checkpoint has
902 * occurred and reach the redo pointer of that checkpoint, but sometimes
903 * we stop for other reasons, such as a timeline switch.
904 */
905static XLogRecPtr
906SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
907 XLogRecPtr switch_lsn, XLogRecPtr maximum_lsn)
908{
909 SummarizerReadLocalXLogPrivate *private_data;
911 XLogRecPtr summary_start_lsn;
912 XLogRecPtr summary_end_lsn = switch_lsn;
913 char temp_path[MAXPGPATH];
914 char final_path[MAXPGPATH];
915 WalSummaryIO io;
917 bool fast_forward = true;
918
919 /* Initialize private data for xlogreader. */
920 private_data = (SummarizerReadLocalXLogPrivate *)
922 private_data->tli = tli;
923 private_data->historic = !XLogRecPtrIsInvalid(switch_lsn);
924 private_data->read_upto = maximum_lsn;
925
926 /* Create xlogreader. */
929 .segment_open = &wal_segment_open,
930 .segment_close = &wal_segment_close),
931 private_data);
932 if (xlogreader == NULL)
934 (errcode(ERRCODE_OUT_OF_MEMORY),
935 errmsg("out of memory"),
936 errdetail("Failed while allocating a WAL reading processor.")));
937
938 /*
939 * When exact = false, we're starting from an arbitrary point in the WAL
940 * and must search forward for the start of the next record.
941 *
942 * When exact = true, start_lsn should be either the LSN where a record
943 * begins, or the LSN of a page where the page header is immediately
944 * followed by the start of a new record. XLogBeginRead should tolerate
945 * either case.
946 *
947 * We need to allow for both cases because the behavior of xlogreader
948 * varies. When a record spans two or more xlog pages, the ending LSN
949 * reported by xlogreader will be the starting LSN of the following
950 * record, but when an xlog page boundary falls between two records, the
951 * end LSN for the first will be reported as the first byte of the
952 * following page. We can't know until we read that page how large the
953 * header will be, but we'll have to skip over it to find the next record.
954 */
955 if (exact)
956 {
957 /*
958 * Even if start_lsn is the beginning of a page rather than the
959 * beginning of the first record on that page, we should still use it
960 * as the start LSN for the summary file. That's because we detect
961 * missing summary files by looking for cases where the end LSN of one
962 * file is less than the start LSN of the next file. When only a page
963 * header is skipped, nothing has been missed.
964 */
965 XLogBeginRead(xlogreader, start_lsn);
966 summary_start_lsn = start_lsn;
967 }
968 else
969 {
970 summary_start_lsn = XLogFindNextRecord(xlogreader, start_lsn);
971 if (XLogRecPtrIsInvalid(summary_start_lsn))
972 {
973 /*
974 * If we hit end-of-WAL while trying to find the next valid
975 * record, we must be on a historic timeline that has no valid
976 * records that begin after start_lsn and before end of WAL.
977 */
978 if (private_data->end_of_wal)
979 {
981 errmsg_internal("could not read WAL from timeline %u at %X/%X: end of WAL at %X/%X",
982 tli,
983 LSN_FORMAT_ARGS(start_lsn),
984 LSN_FORMAT_ARGS(private_data->read_upto)));
985
986 /*
987 * The timeline ends at or after start_lsn, without containing
988 * any records. Thus, we must make sure the main loop does not
989 * iterate. If start_lsn is the end of the timeline, then we
990 * won't actually emit an empty summary file, but otherwise,
991 * we must, to capture the fact that the LSN range in question
992 * contains no interesting WAL records.
993 */
994 summary_start_lsn = start_lsn;
995 summary_end_lsn = private_data->read_upto;
996 switch_lsn = xlogreader->EndRecPtr;
997 }
998 else
1000 (errmsg("could not find a valid record after %X/%X",
1001 LSN_FORMAT_ARGS(start_lsn))));
1002 }
1003
1004 /* We shouldn't go backward. */
1005 Assert(summary_start_lsn >= start_lsn);
1006 }
1007
1008 /*
1009 * Main loop: read xlog records one by one.
1010 */
1011 while (1)
1012 {
1013 int block_id;
1014 char *errormsg;
1015 XLogRecord *record;
1016 uint8 rmid;
1017
1019
1020 /* We shouldn't go backward. */
1021 Assert(summary_start_lsn <= xlogreader->EndRecPtr);
1022
1023 /* Now read the next record. */
1024 record = XLogReadRecord(xlogreader, &errormsg);
1025 if (record == NULL)
1026 {
1027 if (private_data->end_of_wal)
1028 {
1029 /*
1030 * This timeline must be historic and must end before we were
1031 * able to read a complete record.
1032 */
1034 errmsg_internal("could not read WAL from timeline %u at %X/%X: end of WAL at %X/%X",
1035 tli,
1037 LSN_FORMAT_ARGS(private_data->read_upto)));
1038 /* Summary ends at end of WAL. */
1039 summary_end_lsn = private_data->read_upto;
1040 break;
1041 }
1042 if (errormsg)
1043 ereport(ERROR,
1045 errmsg("could not read WAL from timeline %u at %X/%X: %s",
1047 errormsg)));
1048 else
1049 ereport(ERROR,
1051 errmsg("could not read WAL from timeline %u at %X/%X",
1053 }
1054
1055 /* We shouldn't go backward. */
1056 Assert(summary_start_lsn <= xlogreader->EndRecPtr);
1057
1058 if (!XLogRecPtrIsInvalid(switch_lsn) &&
1059 xlogreader->ReadRecPtr >= switch_lsn)
1060 {
1061 /*
1062 * Whoops! We've read a record that *starts* after the switch LSN,
1063 * contrary to our goal of reading only until we hit the first
1064 * record that ends at or after the switch LSN. Pretend we didn't
1065 * read it after all by bailing out of this loop right here,
1066 * before we do anything with this record.
1067 *
1068 * This can happen because the last record before the switch LSN
1069 * might be continued across multiple pages, and then we might
1070 * come to a page with XLP_FIRST_IS_OVERWRITE_CONTRECORD set. In
1071 * that case, the record that was continued across multiple pages
1072 * is incomplete and will be disregarded, and the read will
1073 * restart from the beginning of the page that is flagged
1074 * XLP_FIRST_IS_OVERWRITE_CONTRECORD.
1075 *
1076 * If this case occurs, we can fairly say that the current summary
1077 * file ends at the switch LSN exactly. The first record on the
1078 * page marked XLP_FIRST_IS_OVERWRITE_CONTRECORD will be
1079 * discovered when generating the next summary file.
1080 */
1081 summary_end_lsn = switch_lsn;
1082 break;
1083 }
1084
1085 /*
1086 * Certain types of records require special handling. Redo points and
1087 * shutdown checkpoints trigger creation of new summary files and can
1088 * also cause us to enter or exit "fast forward" mode. Other types of
1089 * records can require special updates to the block reference table.
1090 */
1091 rmid = XLogRecGetRmid(xlogreader);
1092 if (rmid == RM_XLOG_ID)
1093 {
1094 bool new_fast_forward;
1095
1096 /*
1097 * If we've already processed some WAL records when we hit a redo
1098 * point or shutdown checkpoint, then we stop summarization before
1099 * including this record in the current file, so that it will be
1100 * the first record in the next file.
1101 *
1102 * When we hit one of those record types as the first record in a
1103 * file, we adjust our notion of whether we're fast-forwarding.
1104 * Any WAL generated with wal_level=minimal must be skipped
1105 * without actually generating any summary file, because an
1106 * incremental backup that crosses such WAL would be unsafe.
1107 */
1108 if (SummarizeXlogRecord(xlogreader, &new_fast_forward))
1109 {
1110 if (xlogreader->ReadRecPtr > summary_start_lsn)
1111 {
1112 summary_end_lsn = xlogreader->ReadRecPtr;
1113 break;
1114 }
1115 else
1116 fast_forward = new_fast_forward;
1117 }
1118 }
1119 else if (!fast_forward)
1120 {
1121 /*
1122 * This switch handles record types that require extra updates to
1123 * the contents of the block reference table.
1124 */
1125 switch (rmid)
1126 {
1127 case RM_DBASE_ID:
1129 break;
1130 case RM_SMGR_ID:
1132 break;
1133 case RM_XACT_ID:
1135 break;
1136 }
1137 }
1138
1139 /*
1140 * If we're in fast-forward mode, we don't really need to do anything.
1141 * Otherwise, feed block references from xlog record to block
1142 * reference table.
1143 */
1144 if (!fast_forward)
1145 {
1146 for (block_id = 0; block_id <= XLogRecMaxBlockId(xlogreader);
1147 block_id++)
1148 {
1149 RelFileLocator rlocator;
1150 ForkNumber forknum;
1151 BlockNumber blocknum;
1152
1153 if (!XLogRecGetBlockTagExtended(xlogreader, block_id, &rlocator,
1154 &forknum, &blocknum, NULL))
1155 continue;
1156
1157 /*
1158 * As we do elsewhere, ignore the FSM fork, because it's not
1159 * fully WAL-logged.
1160 */
1161 if (forknum != FSM_FORKNUM)
1162 BlockRefTableMarkBlockModified(brtab, &rlocator, forknum,
1163 blocknum);
1164 }
1165 }
1166
1167 /* Update our notion of where this summary file ends. */
1168 summary_end_lsn = xlogreader->EndRecPtr;
1169
1170 /* Also update shared memory. */
1171 LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
1172 Assert(summary_end_lsn >= WalSummarizerCtl->summarized_lsn);
1173 WalSummarizerCtl->pending_lsn = summary_end_lsn;
1174 LWLockRelease(WALSummarizerLock);
1175
1176 /*
1177 * If we have a switch LSN and have reached it, stop before reading
1178 * the next record.
1179 */
1180 if (!XLogRecPtrIsInvalid(switch_lsn) &&
1181 xlogreader->EndRecPtr >= switch_lsn)
1182 break;
1183 }
1184
1185 /* Destroy xlogreader. */
1188
1189 /*
1190 * If a timeline switch occurs, we may fail to make any progress at all
1191 * before exiting the loop above. If that happens, we don't write a WAL
1192 * summary file at all. We can also skip writing a file if we're in
1193 * fast-forward mode.
1194 */
1195 if (summary_end_lsn > summary_start_lsn && !fast_forward)
1196 {
1197 /* Generate temporary and final path name. */
1198 snprintf(temp_path, MAXPGPATH,
1199 XLOGDIR "/summaries/temp.summary");
1200 snprintf(final_path, MAXPGPATH,
1201 XLOGDIR "/summaries/%08X%08X%08X%08X%08X.summary",
1202 tli,
1203 LSN_FORMAT_ARGS(summary_start_lsn),
1204 LSN_FORMAT_ARGS(summary_end_lsn));
1205
1206 /* Open the temporary file for writing. */
1207 io.filepos = 0;
1208 io.file = PathNameOpenFile(temp_path, O_WRONLY | O_CREAT | O_TRUNC);
1209 if (io.file < 0)
1210 ereport(ERROR,
1212 errmsg("could not create file \"%s\": %m", temp_path)));
1213
1214 /* Write the data. */
1216
1217 /* Close the temporary file. */
1218 FileClose(io.file);
1219
1220 /* Tell the user what we did. */
1222 errmsg_internal("summarized WAL on TLI %u from %X/%X to %X/%X",
1223 tli,
1224 LSN_FORMAT_ARGS(summary_start_lsn),
1225 LSN_FORMAT_ARGS(summary_end_lsn)));
1226
1227 /* Durably rename the new summary into place. */
1228 durable_rename(temp_path, final_path, ERROR);
1229 }
1230
1231 /* If we skipped a non-zero amount of WAL, log a debug message. */
1232 if (summary_end_lsn > summary_start_lsn && fast_forward)
1234 errmsg_internal("skipped summarizing WAL on TLI %u from %X/%X to %X/%X",
1235 tli,
1236 LSN_FORMAT_ARGS(summary_start_lsn),
1237 LSN_FORMAT_ARGS(summary_end_lsn)));
1238
1239 return summary_end_lsn;
1240}
1241
1242/*
1243 * Special handling for WAL records with RM_DBASE_ID.
1244 */
1245static void
1247{
1248 uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK;
1249
1250 /*
1251 * We use relfilenode zero for a given database OID and tablespace OID to
1252 * indicate that all relations with that pair of IDs have been recreated
1253 * if they exist at all. Effectively, we're setting a limit block of 0 for
1254 * all such relfilenodes.
1255 *
1256 * Technically, this special handling is only needed in the case of
1257 * XLOG_DBASE_CREATE_FILE_COPY, because that can create a whole bunch of
1258 * relation files in a directory without logging anything specific to each
1259 * one. If we didn't mark the whole DB OID/TS OID combination in some way,
1260 * then a tablespace that was dropped after the reference backup and
1261 * recreated using the FILE_COPY method prior to the incremental backup
1262 * would look just like one that was never touched at all, which would be
1263 * catastrophic.
1264 *
1265 * But it seems best to adopt this treatment for all records that drop or
1266 * create a DB OID/TS OID combination. That's similar to how we treat the
1267 * limit block for individual relations, and it's an extra layer of safety
1268 * here. We can never lose data by marking more stuff as needing to be
1269 * backed up in full.
1270 */
1271 if (info == XLOG_DBASE_CREATE_FILE_COPY)
1272 {
1274 RelFileLocator rlocator;
1275
1276 xlrec =
1278 rlocator.spcOid = xlrec->tablespace_id;
1279 rlocator.dbOid = xlrec->db_id;
1280 rlocator.relNumber = 0;
1281 BlockRefTableSetLimitBlock(brtab, &rlocator, MAIN_FORKNUM, 0);
1282 }
1283 else if (info == XLOG_DBASE_CREATE_WAL_LOG)
1284 {
1286 RelFileLocator rlocator;
1287
1289 rlocator.spcOid = xlrec->tablespace_id;
1290 rlocator.dbOid = xlrec->db_id;
1291 rlocator.relNumber = 0;
1292 BlockRefTableSetLimitBlock(brtab, &rlocator, MAIN_FORKNUM, 0);
1293 }
1294 else if (info == XLOG_DBASE_DROP)
1295 {
1296 xl_dbase_drop_rec *xlrec;
1297 RelFileLocator rlocator;
1298 int i;
1299
1301 rlocator.dbOid = xlrec->db_id;
1302 rlocator.relNumber = 0;
1303 for (i = 0; i < xlrec->ntablespaces; ++i)
1304 {
1305 rlocator.spcOid = xlrec->tablespace_ids[i];
1306 BlockRefTableSetLimitBlock(brtab, &rlocator, MAIN_FORKNUM, 0);
1307 }
1308 }
1309}
1310
1311/*
1312 * Special handling for WAL records with RM_SMGR_ID.
1313 */
1314static void
1316{
1317 uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK;
1318
1319 if (info == XLOG_SMGR_CREATE)
1320 {
1321 xl_smgr_create *xlrec;
1322
1323 /*
1324 * If a new relation fork is created on disk, there is no point
1325 * tracking anything about which blocks have been modified, because
1326 * the whole thing will be new. Hence, set the limit block for this
1327 * fork to 0.
1328 *
1329 * Ignore the FSM fork, which is not fully WAL-logged.
1330 */
1332
1333 if (xlrec->forkNum != FSM_FORKNUM)
1334 BlockRefTableSetLimitBlock(brtab, &xlrec->rlocator,
1335 xlrec->forkNum, 0);
1336 }
1337 else if (info == XLOG_SMGR_TRUNCATE)
1338 {
1339 xl_smgr_truncate *xlrec;
1340
1342
1343 /*
1344 * If a relation fork is truncated on disk, there is no point in
1345 * tracking anything about block modifications beyond the truncation
1346 * point.
1347 *
1348 * We ignore SMGR_TRUNCATE_FSM here because the FSM isn't fully
1349 * WAL-logged and thus we can't track modified blocks for it anyway.
1350 */
1351 if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
1352 BlockRefTableSetLimitBlock(brtab, &xlrec->rlocator,
1353 MAIN_FORKNUM, xlrec->blkno);
1354 if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0)
1355 BlockRefTableSetLimitBlock(brtab, &xlrec->rlocator,
1357 }
1358}
1359
1360/*
1361 * Special handling for WAL records with RM_XACT_ID.
1362 */
1363static void
1365{
1366 uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK;
1367 uint8 xact_info = info & XLOG_XACT_OPMASK;
1368
1369 if (xact_info == XLOG_XACT_COMMIT ||
1370 xact_info == XLOG_XACT_COMMIT_PREPARED)
1371 {
1373 xl_xact_parsed_commit parsed;
1374 int i;
1375
1376 /*
1377 * Don't track modified blocks for any relations that were removed on
1378 * commit.
1379 */
1381 for (i = 0; i < parsed.nrels; ++i)
1382 {
1383 ForkNumber forknum;
1384
1385 for (forknum = 0; forknum <= MAX_FORKNUM; ++forknum)
1386 if (forknum != FSM_FORKNUM)
1387 BlockRefTableSetLimitBlock(brtab, &parsed.xlocators[i],
1388 forknum, 0);
1389 }
1390 }
1391 else if (xact_info == XLOG_XACT_ABORT ||
1392 xact_info == XLOG_XACT_ABORT_PREPARED)
1393 {
1395 xl_xact_parsed_abort parsed;
1396 int i;
1397
1398 /*
1399 * Don't track modified blocks for any relations that were removed on
1400 * abort.
1401 */
1402 ParseAbortRecord(XLogRecGetInfo(xlogreader), xlrec, &parsed);
1403 for (i = 0; i < parsed.nrels; ++i)
1404 {
1405 ForkNumber forknum;
1406
1407 for (forknum = 0; forknum <= MAX_FORKNUM; ++forknum)
1408 if (forknum != FSM_FORKNUM)
1409 BlockRefTableSetLimitBlock(brtab, &parsed.xlocators[i],
1410 forknum, 0);
1411 }
1412 }
1413}
1414
1415/*
1416 * Special handling for WAL records with RM_XLOG_ID.
1417 *
1418 * The return value is true if WAL summarization should stop before this
1419 * record and false otherwise. When the return value is true,
1420 * *new_fast_forward indicates whether future processing should be done
1421 * in fast forward mode (i.e. read WAL without emitting summaries) or not.
1422 */
1423static bool
1425{
1426 uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK;
1427 int record_wal_level;
1428
1429 if (info == XLOG_CHECKPOINT_REDO)
1430 {
1431 /* Payload is wal_level at the time record was written. */
1432 memcpy(&record_wal_level, XLogRecGetData(xlogreader), sizeof(int));
1433 }
1434 else if (info == XLOG_CHECKPOINT_SHUTDOWN)
1435 {
1436 CheckPoint rec_ckpt;
1437
1438 /* Extract wal_level at time record was written from payload. */
1439 memcpy(&rec_ckpt, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1440 record_wal_level = rec_ckpt.wal_level;
1441 }
1442 else if (info == XLOG_PARAMETER_CHANGE)
1443 {
1444 xl_parameter_change xlrec;
1445
1446 /* Extract wal_level at time record was written from payload. */
1447 memcpy(&xlrec, XLogRecGetData(xlogreader),
1448 sizeof(xl_parameter_change));
1449 record_wal_level = xlrec.wal_level;
1450 }
1451 else if (info == XLOG_END_OF_RECOVERY)
1452 {
1453 xl_end_of_recovery xlrec;
1454
1455 /* Extract wal_level at time record was written from payload. */
1456 memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1457 record_wal_level = xlrec.wal_level;
1458 }
1459 else
1460 {
1461 /* No special handling required. Return false. */
1462 return false;
1463 }
1464
1465 /*
1466 * Redo can only begin at an XLOG_CHECKPOINT_REDO or
1467 * XLOG_CHECKPOINT_SHUTDOWN record, so we want WAL summarization to begin
1468 * at those points. Hence, when those records are encountered, return
1469 * true, so that we stop just before summarizing either of those records.
1470 *
1471 * We also reach here if we just saw XLOG_END_OF_RECOVERY or
1472 * XLOG_PARAMETER_CHANGE. These are not places where recovery can start,
1473 * but they're still relevant here. A new timeline can begin with
1474 * XLOG_END_OF_RECOVERY, so we need to confirm the WAL level at that
1475 * point; and a restart can provoke XLOG_PARAMETER_CHANGE after an
1476 * intervening change to postgresql.conf, which might force us to stop
1477 * summarizing.
1478 */
1479 *new_fast_forward = (record_wal_level == WAL_LEVEL_MINIMAL);
1480 return true;
1481}
1482
1483/*
1484 * Similar to read_local_xlog_page, but limited to read from one particular
1485 * timeline. If the end of WAL is reached, it will wait for more if reading
1486 * from the current timeline, or give up if reading from a historic timeline.
1487 * In the latter case, it will also set private_data->end_of_wal = true.
1488 *
1489 * Caller must set private_data->tli to the TLI of interest,
1490 * private_data->read_upto to the lowest LSN that is not known to be safe
1491 * to read on that timeline, and private_data->historic to true if and only
1492 * if the timeline is not the current timeline. This function will update
1493 * private_data->read_upto and private_data->historic if more WAL appears
1494 * on the current timeline or if the current timeline becomes historic.
1495 */
1496static int
1498 XLogRecPtr targetPagePtr, int reqLen,
1499 XLogRecPtr targetRecPtr, char *cur_page)
1500{
1501 int count;
1502 WALReadError errinfo;
1503 SummarizerReadLocalXLogPrivate *private_data;
1504
1506
1507 private_data = (SummarizerReadLocalXLogPrivate *)
1508 state->private_data;
1509
1510 while (1)
1511 {
1512 if (targetPagePtr + XLOG_BLCKSZ <= private_data->read_upto)
1513 {
1514 /*
1515 * more than one block available; read only that block, have
1516 * caller come back if they need more.
1517 */
1518 count = XLOG_BLCKSZ;
1519 break;
1520 }
1521 else if (targetPagePtr + reqLen > private_data->read_upto)
1522 {
1523 /* We don't seem to have enough data. */
1524 if (private_data->historic)
1525 {
1526 /*
1527 * This is a historic timeline, so there will never be any
1528 * more data than we have currently.
1529 */
1530 private_data->end_of_wal = true;
1531 return -1;
1532 }
1533 else
1534 {
1535 XLogRecPtr latest_lsn;
1536 TimeLineID latest_tli;
1537
1538 /*
1539 * This is - or at least was up until very recently - the
1540 * current timeline, so more data might show up. Delay here
1541 * so we don't tight-loop.
1542 */
1545
1546 /* Recheck end-of-WAL. */
1547 latest_lsn = GetLatestLSN(&latest_tli);
1548 if (private_data->tli == latest_tli)
1549 {
1550 /* Still the current timeline, update max LSN. */
1551 Assert(latest_lsn >= private_data->read_upto);
1552 private_data->read_upto = latest_lsn;
1553 }
1554 else
1555 {
1556 List *tles = readTimeLineHistory(latest_tli);
1557 XLogRecPtr switchpoint;
1558
1559 /*
1560 * The timeline we're scanning is no longer the latest
1561 * one. Figure out when it ended.
1562 */
1563 private_data->historic = true;
1564 switchpoint = tliSwitchPoint(private_data->tli, tles,
1565 NULL);
1566
1567 /*
1568 * Allow reads up to exactly the switch point.
1569 *
1570 * It's possible that this will cause read_upto to move
1571 * backwards, because we might have been promoted before
1572 * reaching the end of the previous timeline. In that
1573 * case, the next loop iteration will likely conclude that
1574 * we've reached end of WAL.
1575 */
1576 private_data->read_upto = switchpoint;
1577
1578 /* Debugging output. */
1580 errmsg_internal("timeline %u became historic, can read up to %X/%X",
1581 private_data->tli, LSN_FORMAT_ARGS(private_data->read_upto)));
1582 }
1583
1584 /* Go around and try again. */
1585 }
1586 }
1587 else
1588 {
1589 /* enough bytes available to satisfy the request */
1590 count = private_data->read_upto - targetPagePtr;
1591 break;
1592 }
1593 }
1594
1595 if (!WALRead(state, cur_page, targetPagePtr, count,
1596 private_data->tli, &errinfo))
1597 WALReadRaiseError(&errinfo);
1598
1599 /* Track that we read a page, for sleep time calculation. */
1601
1602 /* number of valid bytes in the buffer */
1603 return count;
1604}
1605
1606/*
1607 * Sleep for long enough that we believe it's likely that more WAL will
1608 * be available afterwards.
1609 */
1610static void
1612{
1614 {
1615 /*
1616 * No pages were read since the last sleep, so double the sleep time,
1617 * but not beyond the maximum allowable value.
1618 */
1620 }
1621 else if (pages_read_since_last_sleep > 1)
1622 {
1623 /*
1624 * Multiple pages were read since the last sleep, so reduce the sleep
1625 * time.
1626 *
1627 * A large burst of activity should be able to quickly reduce the
1628 * sleep time to the minimum, but we don't want a handful of extra WAL
1629 * records to provoke a strong reaction. We choose to reduce the sleep
1630 * time by 1 quantum for each page read beyond the first, which is a
1631 * fairly arbitrary way of trying to be reactive without overreacting.
1632 */
1634 sleep_quanta = 1;
1635 else
1637 }
1638
1639 /* OK, now sleep. */
1640 (void) WaitLatch(MyLatch,
1643 WAIT_EVENT_WAL_SUMMARIZER_WAL);
1645
1646 /* Reset count of pages read. */
1648}
1649
1650/*
1651 * Remove WAL summaries whose mtimes are older than wal_summary_keep_time.
1652 */
1653static void
1655{
1656 XLogRecPtr redo_pointer = GetRedoRecPtr();
1657 List *wslist;
1658 time_t cutoff_time;
1659
1660 /* If WAL summary removal is disabled, don't do anything. */
1661 if (wal_summary_keep_time == 0)
1662 return;
1663
1664 /*
1665 * If the redo pointer has not advanced, don't do anything.
1666 *
1667 * This has the effect that we only try to remove old WAL summary files
1668 * once per checkpoint cycle.
1669 */
1670 if (redo_pointer == redo_pointer_at_last_summary_removal)
1671 return;
1673
1674 /*
1675 * Files should only be removed if the last modification time precedes the
1676 * cutoff time we compute here.
1677 */
1678 cutoff_time = time(NULL) - wal_summary_keep_time * SECS_PER_MINUTE;
1679
1680 /* Get all the summaries that currently exist. */
1682
1683 /* Loop until all summaries have been considered for removal. */
1684 while (wslist != NIL)
1685 {
1686 ListCell *lc;
1687 XLogSegNo oldest_segno;
1688 XLogRecPtr oldest_lsn = InvalidXLogRecPtr;
1689 TimeLineID selected_tli;
1690
1692
1693 /*
1694 * Pick a timeline for which some summary files still exist on disk,
1695 * and find the oldest LSN that still exists on disk for that
1696 * timeline.
1697 */
1698 selected_tli = ((WalSummaryFile *) linitial(wslist))->tli;
1699 oldest_segno = XLogGetOldestSegno(selected_tli);
1700 if (oldest_segno != 0)
1702 oldest_lsn);
1703
1704
1705 /* Consider each WAL file on the selected timeline in turn. */
1706 foreach(lc, wslist)
1707 {
1708 WalSummaryFile *ws = lfirst(lc);
1709
1711
1712 /* If it's not on this timeline, it's not time to consider it. */
1713 if (selected_tli != ws->tli)
1714 continue;
1715
1716 /*
1717 * If the WAL doesn't exist any more, we can remove it if the file
1718 * modification time is old enough.
1719 */
1720 if (XLogRecPtrIsInvalid(oldest_lsn) || ws->end_lsn <= oldest_lsn)
1721 RemoveWalSummaryIfOlderThan(ws, cutoff_time);
1722
1723 /*
1724 * Whether we removed the file or not, we need not consider it
1725 * again.
1726 */
1727 wslist = foreach_delete_current(wslist, lc);
1728 pfree(ws);
1729 }
1730 }
1731}
void AuxiliaryProcessMainCommon(void)
Definition: auxprocess.c:39
List * readTimeLineHistory(TimeLineID targetTLI)
Definition: timeline.c:76
XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI)
Definition: timeline.c:572
sigset_t UnBlockSig
Definition: pqsignal.c:22
long TimestampDifferenceMilliseconds(TimestampTz start_time, TimestampTz stop_time)
Definition: timestamp.c:1756
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1644
void BlockRefTableMarkBlockModified(BlockRefTable *brtab, const RelFileLocator *rlocator, ForkNumber forknum, BlockNumber blknum)
Definition: blkreftable.c:297
void BlockRefTableSetLimitBlock(BlockRefTable *brtab, const RelFileLocator *rlocator, ForkNumber forknum, BlockNumber limit_block)
Definition: blkreftable.c:262
void WriteBlockRefTable(BlockRefTable *brtab, io_callback_fn write_callback, void *write_callback_arg)
Definition: blkreftable.c:474
void(*) BlockRefTable CreateEmptyBlockRefTable)(void)
uint32 BlockNumber
Definition: block.h:31
#define Min(x, y)
Definition: c.h:961
uint8_t uint8
Definition: c.h:486
#define Assert(condition)
Definition: c.h:815
size_t Size
Definition: c.h:562
bool ConditionVariableCancelSleep(void)
bool ConditionVariableTimedSleep(ConditionVariable *cv, long timeout, uint32 wait_event_info)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
int64 TimestampTz
Definition: timestamp.h:39
#define MINS_PER_HOUR
Definition: timestamp.h:129
#define SECS_PER_MINUTE
Definition: timestamp.h:128
#define HOURS_PER_DAY
Definition: timestamp.h:118
#define XLOG_DBASE_CREATE_WAL_LOG
#define XLOG_DBASE_DROP
#define XLOG_DBASE_CREATE_FILE_COPY
void AtEOXact_HashTables(bool isCommit)
Definition: dynahash.c:1912
int errmsg_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...)
Definition: elog.c:1180
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1157
void EmitErrorReport(void)
Definition: elog.c:1687
int errcode_for_file_access(void)
Definition: elog.c:876
int errdetail(const char *fmt,...)
Definition: elog.c:1203
ErrorContextCallback * error_context_stack
Definition: elog.c:94
void FlushErrorState(void)
Definition: elog.c:1867
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
sigjmp_buf * PG_exception_stack
Definition: elog.c:96
#define WARNING
Definition: elog.h:36
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:781
void AtEOXact_Files(bool isCommit)
Definition: fd.c:3187
void FileClose(File file)
Definition: fd.c:1977
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1574
volatile sig_atomic_t LogMemoryContextPending
Definition: globals.c:40
volatile sig_atomic_t ProcSignalBarrierPending
Definition: globals.c:39
ProcNumber MyProcNumber
Definition: globals.c:89
struct Latch * MyLatch
Definition: globals.c:62
void ProcessConfigFile(GucContext context)
Definition: guc-file.l:120
@ PGC_SIGHUP
Definition: guc.h:71
void SignalHandlerForShutdownRequest(SIGNAL_ARGS)
Definition: interrupt.c:105
volatile sig_atomic_t ShutdownRequestPending
Definition: interrupt.c:28
volatile sig_atomic_t ConfigReloadPending
Definition: interrupt.c:27
void SignalHandlerForConfigReload(SIGNAL_ARGS)
Definition: interrupt.c:61
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:365
void proc_exit(int code)
Definition: ipc.c:104
int i
Definition: isn.c:72
void SetLatch(Latch *latch)
Definition: latch.c:632
void ResetLatch(Latch *latch)
Definition: latch.c:724
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:517
#define WL_TIMEOUT
Definition: latch.h:130
#define WL_EXIT_ON_PM_DEATH
Definition: latch.h:132
#define WL_LATCH_SET
Definition: latch.h:127
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1168
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1781
void LWLockReleaseAll(void)
Definition: lwlock.c:1876
@ LW_SHARED
Definition: lwlock.h:115
@ LW_EXCLUSIVE
Definition: lwlock.h:114
void MemoryContextReset(MemoryContext context)
Definition: mcxt.c:383
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc0(Size size)
Definition: mcxt.c:1347
MemoryContext TopMemoryContext
Definition: mcxt.c:149
void ProcessLogMemoryContextInterrupt(void)
Definition: mcxt.c:1289
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:160
#define RESUME_INTERRUPTS()
Definition: miscadmin.h:135
#define AmWalSummarizerProcess()
Definition: miscadmin.h:390
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
#define HOLD_INTERRUPTS()
Definition: miscadmin.h:133
@ B_WAL_SUMMARIZER
Definition: miscadmin.h:365
BackendType MyBackendType
Definition: miscinit.c:64
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124
void * arg
#define MAXPGPATH
#define XLOG_CHECKPOINT_REDO
Definition: pg_control.h:82
#define XLOG_CHECKPOINT_SHUTDOWN
Definition: pg_control.h:68
#define XLOG_PARAMETER_CHANGE
Definition: pg_control.h:74
#define XLOG_END_OF_RECOVERY
Definition: pg_control.h:77
while(p+4<=pend)
#define lfirst(lc)
Definition: pg_list.h:172
static int list_length(const List *l)
Definition: pg_list.h:152
#define NIL
Definition: pg_list.h:68
#define foreach_delete_current(lst, var_or_cell)
Definition: pg_list.h:391
static void * list_nth(const List *list, int n)
Definition: pg_list.h:299
#define linitial(l)
Definition: pg_list.h:178
#define pqsignal
Definition: port.h:520
#define snprintf
Definition: port.h:238
uintptr_t Datum
Definition: postgres.h:69
#define GetPGProcByNumber(n)
Definition: proc.h:423
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
int ProcNumber
Definition: procnumber.h:24
void ProcessProcSignalBarrier(void)
Definition: procsignal.c:496
void procsignal_sigusr1_handler(SIGNAL_ARGS)
Definition: procsignal.c:671
ForkNumber
Definition: relpath.h:56
@ FSM_FORKNUM
Definition: relpath.h:59
@ VISIBILITYMAP_FORKNUM
Definition: relpath.h:60
@ MAIN_FORKNUM
Definition: relpath.h:58
#define MAX_FORKNUM
Definition: relpath.h:70
void ReleaseAuxProcessResources(bool isCommit)
Definition: resowner.c:1002
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:382
PROC_HDR * ProcGlobal
Definition: proc.c:78
#define SMGR_TRUNCATE_VM
Definition: storage_xlog.h:41
#define XLOG_SMGR_CREATE
Definition: storage_xlog.h:30
#define XLOG_SMGR_TRUNCATE
Definition: storage_xlog.h:31
#define SMGR_TRUNCATE_HEAP
Definition: storage_xlog.h:40
int wal_level
Definition: pg_control.h:43
Definition: pg_list.h:54
Latch procLatch
Definition: proc.h:169
PGPROC * allProcs
Definition: proc.h:371
RelFileNumber relNumber
TimeLineID tli
Definition: timeline.h:27
XLogRecPtr summarized_lsn
Definition: walsummarizer.c:84
TimeLineID summarized_tli
Definition: walsummarizer.c:83
ConditionVariable summary_file_cv
Definition: walsummarizer.c:92
ProcNumber summarizer_pgprocno
Definition: walsummarizer.c:86
XLogRecPtr pending_lsn
Definition: walsummarizer.c:87
XLogRecPtr end_lsn
Definition: walsummary.h:30
TimeLineID tli
Definition: walsummary.h:31
off_t filepos
Definition: walsummary.h:24
XLogRecPtr EndRecPtr
Definition: xlogreader.h:207
XLogRecPtr ReadRecPtr
Definition: xlogreader.h:206
void * private_data
Definition: xlogreader.h:196
Definition: regguts.h:323
Oid tablespace_ids[FLEXIBLE_ARRAY_MEMBER]
ForkNumber forkNum
Definition: storage_xlog.h:36
RelFileLocator rlocator
Definition: storage_xlog.h:35
RelFileLocator rlocator
Definition: storage_xlog.h:49
BlockNumber blkno
Definition: storage_xlog.h:48
RelFileLocator * xlocators
Definition: xact.h:422
RelFileLocator * xlocators
Definition: xact.h:389
#define TimestampTzPlusMilliseconds(tz, ms)
Definition: timestamp.h:85
static void pgstat_report_wait_end(void)
Definition: wait_event.h:101
XLogRecPtr GetWalRcvFlushRecPtr(XLogRecPtr *latestChunkStart, TimeLineID *receiveTLI)
static XLogRecPtr redo_pointer_at_last_summary_removal
#define MAX_SLEEP_QUANTA
static long pages_read_since_last_sleep
void WalSummarizerMain(char *startup_data, size_t startup_data_len)
Size WalSummarizerShmemSize(void)
static XLogRecPtr GetLatestLSN(TimeLineID *tli)
static bool SummarizeXlogRecord(XLogReaderState *xlogreader, bool *new_fast_forward)
static XLogRecPtr SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, XLogRecPtr switch_lsn, XLogRecPtr maximum_lsn)
static WalSummarizerData * WalSummarizerCtl
static void SummarizeXactRecord(XLogReaderState *xlogreader, BlockRefTable *brtab)
bool summarize_wal
void WaitForWalSummarization(XLogRecPtr lsn)
static void SummarizeDbaseRecord(XLogReaderState *xlogreader, BlockRefTable *brtab)
static void HandleWalSummarizerInterrupts(void)
#define MS_PER_SLEEP_QUANTUM
void GetWalSummarizerState(TimeLineID *summarized_tli, XLogRecPtr *summarized_lsn, XLogRecPtr *pending_lsn, int *summarizer_pid)
static long sleep_quanta
int wal_summary_keep_time
static int summarizer_read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *cur_page)
static void WalSummarizerShutdown(int code, Datum arg)
static void SummarizeSmgrRecord(XLogReaderState *xlogreader, BlockRefTable *brtab)
static void MaybeRemoveOldWalSummaries(void)
void WakeupWalSummarizer(void)
XLogRecPtr GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact)
void WalSummarizerShmemInit(void)
static void summarizer_wait_for_wal(void)
void RemoveWalSummaryIfOlderThan(WalSummaryFile *ws, time_t cutoff_time)
Definition: walsummary.c:230
List * GetWalSummaries(TimeLineID tli, XLogRecPtr start_lsn, XLogRecPtr end_lsn)
Definition: walsummary.c:43
int WriteWalSummary(void *wal_summary_io, void *data, int length)
Definition: walsummary.c:294
#define SIGCHLD
Definition: win32_port.h:168
#define SIGHUP
Definition: win32_port.h:158
#define SIGPIPE
Definition: win32_port.h:163
#define SIGUSR1
Definition: win32_port.h:170
#define SIGALRM
Definition: win32_port.h:164
#define SIGUSR2
Definition: win32_port.h:171
#define XLOG_XACT_COMMIT_PREPARED
Definition: xact.h:172
#define XLOG_XACT_COMMIT
Definition: xact.h:169
#define XLOG_XACT_OPMASK
Definition: xact.h:179
#define XLOG_XACT_ABORT
Definition: xact.h:171
#define XLOG_XACT_ABORT_PREPARED
Definition: xact.h:173
void ParseCommitRecord(uint8 info, xl_xact_commit *xlrec, xl_xact_parsed_commit *parsed)
Definition: xactdesc.c:35
void ParseAbortRecord(uint8 info, xl_xact_abort *xlrec, xl_xact_parsed_abort *parsed)
Definition: xactdesc.c:141
bool RecoveryInProgress(void)
Definition: xlog.c:6334
XLogRecPtr GetRedoRecPtr(void)
Definition: xlog.c:6437
int wal_segment_size
Definition: xlog.c:143
XLogRecPtr GetFlushRecPtr(TimeLineID *insertTLI)
Definition: xlog.c:6499
XLogSegNo XLogGetOldestSegno(TimeLineID tli)
Definition: xlog.c:3774
TimeLineID GetWALInsertionTimeLineIfSet(void)
Definition: xlog.c:6536
@ WAL_LEVEL_MINIMAL
Definition: xlog.h:74
#define XLogSegNoOffsetToRecPtr(segno, offset, wal_segsz_bytes, dest)
#define XLOGDIR
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:43
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
uint32 TimeLineID
Definition: xlogdefs.h:59
uint64 XLogSegNo
Definition: xlogdefs.h:48
bool XLogRecGetBlockTagExtended(XLogReaderState *record, uint8 block_id, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum, Buffer *prefetch_buffer)
Definition: xlogreader.c:1997
XLogReaderState * XLogReaderAllocate(int wal_segment_size, const char *waldir, XLogReaderRoutine *routine, void *private_data)
Definition: xlogreader.c:106
bool WALRead(XLogReaderState *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli, WALReadError *errinfo)
Definition: xlogreader.c:1503
XLogRecord * XLogReadRecord(XLogReaderState *state, char **errormsg)
Definition: xlogreader.c:389
void XLogReaderFree(XLogReaderState *state)
Definition: xlogreader.c:161
XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr)
Definition: xlogreader.c:1383
void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr)
Definition: xlogreader.c:231
#define XLogRecGetInfo(decoder)
Definition: xlogreader.h:410
#define XLogRecGetRmid(decoder)
Definition: xlogreader.h:411
#define XLogRecGetData(decoder)
Definition: xlogreader.h:415
#define XL_ROUTINE(...)
Definition: xlogreader.h:117
#define XLogRecMaxBlockId(decoder)
Definition: xlogreader.h:418
XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI)
static XLogReaderState * xlogreader
Definition: xlogrecovery.c:188
void wal_segment_close(XLogReaderState *state)
Definition: xlogutils.c:842
void wal_segment_open(XLogReaderState *state, XLogSegNo nextSegNo, TimeLineID *tli_p)
Definition: xlogutils.c:817
void WALReadRaiseError(WALReadError *errinfo)
Definition: xlogutils.c:1020