PostgreSQL Source Code git master
Loading...
Searching...
No Matches
xlogrecovery.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * xlogrecovery.c
4 * Functions for WAL recovery, standby mode
5 *
6 * This source file contains functions controlling WAL recovery.
7 * InitWalRecovery() initializes the system for crash or archive recovery,
8 * or standby mode, depending on configuration options and the state of
9 * the control file and possible backup label file. PerformWalRecovery()
10 * performs the actual WAL replay, calling the rmgr-specific redo routines.
11 * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12 * and prepares information needed to initialize the WAL for writes. In
13 * addition to these three main functions, there are a bunch of functions
14 * for interrogating recovery state and controlling the recovery process.
15 *
16 *
17 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
18 * Portions Copyright (c) 1994, Regents of the University of California
19 *
20 * src/backend/access/transam/xlogrecovery.c
21 *
22 *-------------------------------------------------------------------------
23 */
24
25#include "postgres.h"
26
27#include <ctype.h>
28#include <time.h>
29#include <sys/stat.h>
30#include <sys/time.h>
31#include <unistd.h>
32
33#include "access/timeline.h"
34#include "access/transam.h"
35#include "access/xact.h"
37#include "access/xlogarchive.h"
39#include "access/xlogreader.h"
40#include "access/xlogrecovery.h"
41#include "access/xlogutils.h"
42#include "access/xlogwait.h"
43#include "backup/basebackup.h"
44#include "catalog/pg_control.h"
45#include "commands/tablespace.h"
46#include "common/file_utils.h"
47#include "miscadmin.h"
48#include "nodes/miscnodes.h"
49#include "pgstat.h"
50#include "postmaster/bgwriter.h"
51#include "postmaster/startup.h"
52#include "replication/slot.h"
55#include "storage/fd.h"
56#include "storage/ipc.h"
57#include "storage/latch.h"
58#include "storage/pmsignal.h"
59#include "storage/procarray.h"
60#include "storage/spin.h"
61#include "utils/datetime.h"
62#include "utils/fmgrprotos.h"
63#include "utils/guc_hooks.h"
65#include "utils/pg_lsn.h"
66#include "utils/ps_status.h"
67#include "utils/pg_rusage.h"
68
69/* Unsupported old recovery command file names (relative to $PGDATA) */
70#define RECOVERY_COMMAND_FILE "recovery.conf"
71#define RECOVERY_COMMAND_DONE "recovery.done"
72
73/*
74 * GUC support
75 */
77 {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
78 {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
79 {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
80 {NULL, 0, false}
81};
82
83/* options formerly taken from recovery.conf for archive recovery */
96
97/* options formerly taken from recovery.conf for XLOG streaming */
101
102/*
103 * recoveryTargetTimeLineGoal: what the user requested, if any
104 *
105 * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
106 *
107 * recoveryTargetTLI: the currently understood target timeline (may change)
108 *
109 * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
110 * the timelines of its known parents, newest first (so recoveryTargetTLI is
111 * always the first list member). Only these TLIs are expected to be seen in
112 * the WAL segments we read, and indeed only these TLIs will be considered as
113 * candidate WAL files to open at all.
114 *
115 * curFileTLI: the TLI appearing in the name of the current input WAL file.
116 * (This is not necessarily the same as the timeline from which we are
117 * replaying WAL, which StartupXLOG calls replayTLI, because we could be
118 * scanning data that was copied from an ancestor timeline when the current
119 * file was created.) During a sequential scan we do not allow this value
120 * to decrease.
121 */
127
128/*
129 * When ArchiveRecoveryRequested is set, archive recovery was requested,
130 * ie. signal files were present. When InArchiveRecovery is set, we are
131 * currently recovering using offline XLOG archives. These variables are only
132 * valid in the startup process.
133 *
134 * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
135 * currently performing crash recovery using only XLOG files in pg_wal, but
136 * will switch to using offline XLOG archives as soon as we reach the end of
137 * WAL in pg_wal.
138 */
140bool InArchiveRecovery = false;
141
142/*
143 * When StandbyModeRequested is set, standby mode was requested, i.e.
144 * standby.signal file was present. When StandbyMode is set, we are currently
145 * in standby mode. These variables are only valid in the startup process.
146 * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
147 */
148static bool StandbyModeRequested = false;
149bool StandbyMode = false;
150
151/* was a signal file present at startup? */
152static bool standby_signal_file_found = false;
153static bool recovery_signal_file_found = false;
154
155/*
156 * CheckPointLoc is the position of the checkpoint record that determines
157 * where to start the replay. It comes from the backup label file or the
158 * control file.
159 *
160 * RedoStartLSN is the checkpoint's REDO location, also from the backup label
161 * file or the control file. In standby mode, XLOG streaming usually starts
162 * from the position where an invalid record was found. But if we fail to
163 * read even the initial checkpoint record, we use the REDO location instead
164 * of the checkpoint location as the start position of XLOG streaming.
165 * Otherwise we would have to jump backwards to the REDO location after
166 * reading the checkpoint record, because the REDO record can precede the
167 * checkpoint record.
168 */
173
174/*
175 * Local copy of SharedHotStandbyActive variable. False actually means "not
176 * known, need to check the shared state".
177 */
178static bool LocalHotStandbyActive = false;
179
180/*
181 * Local copy of SharedPromoteIsTriggered variable. False actually means "not
182 * known, need to check the shared state".
183 */
184static bool LocalPromoteIsTriggered = false;
185
186/* Has the recovery code requested a walreceiver wakeup? */
188
189/* XLogReader object used to parse the WAL records */
191
192/* XLogPrefetcher object used to consume WAL records with read-ahead */
194
195/* Parameters passed down from ReadRecord to the XLogPageRead callback. */
197{
198 int emode;
199 bool fetching_ckpt; /* are we fetching a checkpoint record? */
203
204/* flag to tell XLogPageRead that we have started replaying */
205static bool InRedo = false;
206
207/*
208 * Codes indicating where we got a WAL file from during recovery, or where
209 * to attempt to get one.
210 */
211typedef enum
212{
213 XLOG_FROM_ANY = 0, /* request to read WAL from any source */
214 XLOG_FROM_ARCHIVE, /* restored using restore_command */
215 XLOG_FROM_PG_WAL, /* existing file in pg_wal */
216 XLOG_FROM_STREAM, /* streamed from primary */
217} XLogSource;
218
219/*
 * human-readable names for XLogSources, for debugging output; the entries
 * are listed in the same order as the XLogSource enumerators above, so keep
 * the two in sync when adding a source
 */
220static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
221
222/*
223 * readFile is -1 or a kernel FD for the log file segment that's currently
224 * open for reading. readSegNo identifies the segment. readOff is the offset
225 * of the page just read, readLen indicates how much of it has been read into
226 * readBuf, and readSource indicates where we got the currently open file from.
227 *
228 * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
229 * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
230 * worthwhile, since the XLOG is not read by general-purpose sessions.
231 */
232static int readFile = -1;
234static uint32 readOff = 0;
235static uint32 readLen = 0;
237
238/*
239 * Keeps track of which source we're currently reading from. This is
240 * different from readSource in that this is always set, even when we don't
241 * currently have a WAL file open. If lastSourceFailed is set, our last
242 * attempt to read from currentSource failed, and we should try another source
243 * next.
244 *
245 * pendingWalRcvRestart is set when a config change occurs that requires a
246 * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
247 */
249static bool lastSourceFailed = false;
250static bool pendingWalRcvRestart = false;
251
252/*
253 * These variables track when we last obtained some WAL data to process,
254 * and where we got it from. (XLogReceiptSource is initially the same as
255 * readSource, but readSource gets reset to zero when we don't have data
256 * to process right now. It is also different from currentSource, which
257 * also changes when we try to read from a source and fail, while
258 * XLogReceiptSource tracks where we last successfully read some WAL.)
259 */
262
263/* Local copy of WalRcv->flushedUpto */
266
267/*
268 * Copy of minRecoveryPoint and backupEndPoint from the control file.
269 *
270 * In order to reach consistency, we must replay the WAL up to
271 * minRecoveryPoint. If backupEndRequired is true, we must also reach
272 * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
273 * to backupStartPoint.
274 *
275 * Note: In archive recovery, after consistency has been reached, the
276 * functions in xlog.c will start updating minRecoveryPoint in the control
277 * file. But this copy of minRecoveryPoint variable reflects the value at the
278 * beginning of recovery, and is *not* updated after consistency is reached.
279 */
282
285static bool backupEndRequired = false;
286
287/*
288 * Have we reached a consistent database state? In crash recovery, we have
289 * to replay all the WAL, so reachedConsistency is never set. During archive
290 * recovery, the database is consistent once minRecoveryPoint is reached.
291 *
292 * Consistent state means that the system is internally consistent, all
293 * the WAL has been replayed up to a certain point, and importantly, there
294 * is no trace of later actions on disk.
295 *
296 * This flag is used only by the startup process and postmaster. When
297 * minRecoveryPoint is reached, the startup process sets it to true and
298 * sends a PMSIGNAL_RECOVERY_CONSISTENT signal to the postmaster,
299 * which then sets it to true upon receiving the signal.
300 */
302
303/* Buffers dedicated to consistency checks of size BLCKSZ */
306
307
308/*
309 * Shared-memory state for WAL recovery.
310 */
312{
313 /*
314 * SharedHotStandbyActive indicates if we allow hot standby queries to be
315 * run. Protected by info_lck.
316 */
318
319 /*
320 * SharedPromoteIsTriggered indicates if a standby promotion has been
321 * triggered. Protected by info_lck.
322 */
324
325 /*
326 * recoveryWakeupLatch is used to wake up the startup process to continue
327 * WAL replay, if it is waiting for WAL to arrive or promotion to be
328 * requested.
329 *
330 * Note that the startup process also uses another latch, its procLatch,
331 * to wait for recovery conflict. We could get rid of recoveryWakeupLatch
332 * for signaling the startup process in favor of using its procLatch, which
333 * would comport better with possible generic signal handlers using that
334 * latch. But we should not do that because the startup process doesn't
335 * assume that it's woken up by the walreceiver process or the SIGHUP signal
336 * handler while it's waiting for recovery conflict. The separate latches,
337 * recoveryWakeupLatch and procLatch, should be used for inter-process
338 * communication for WAL replay and recovery conflict, respectively.
339 */
341
342 /*
343 * Last record successfully replayed.
344 */
345 XLogRecPtr lastReplayedReadRecPtr; /* start position */
346 XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */
348
349 /*
350 * When we're currently replaying a record, ie. in a redo function,
351 * replayEndRecPtr points to the end+1 of the record being replayed,
352 * otherwise it's equal to lastReplayedEndRecPtr.
353 */
356 /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
358
359 /*
360 * timestamp of when we started replaying the current chunk of WAL data,
361 * only relevant for replication or archive recovery
362 */
364 /* Recovery pause state */
367
368 slock_t info_lck; /* locks shared variables shown above */
370
372
373/*
374 * abortedRecPtr is the start pointer of a broken record at end of WAL when
375 * recovery completes; missingContrecPtr is the location of the first
376 * contrecord that went missing. See CreateOverwriteContrecordRecord for
377 * details.
378 */
381
382/*
383 * if recoveryStopsBefore/After returns true, it saves information of the stop
384 * point here
385 */
391
392/* prototypes for local functions */
393static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
394
395static void EnableStandbyMode(void);
396static void readRecoverySignalFile(void);
397static void validateRecoveryParameters(void);
401static bool read_tablespace_map(List **tablespaces);
402
403static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
404static void CheckRecoveryConsistency(void);
405static void rm_redo_error_callback(void *arg);
406#ifdef WAL_DEBUG
407static void xlog_outrec(StringInfo buf, XLogReaderState *record);
408#endif
409static void xlog_block_info(StringInfo buf, XLogReaderState *record);
411 TimeLineID prevTLI, TimeLineID replayTLI);
414
415static bool recoveryStopsBefore(XLogReaderState *record);
416static bool recoveryStopsAfter(XLogReaderState *record);
417static char *getRecoveryStopReason(void);
418static void recoveryPausesHere(bool endOfRecovery);
419static bool recoveryApplyDelay(XLogReaderState *record);
420static void ConfirmRecoveryPaused(void);
421
423 int emode, bool fetching_ckpt,
424 TimeLineID replayTLI);
425
427 int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
429 bool randAccess,
430 bool fetching_ckpt,
432 TimeLineID replayTLI,
434 bool nonblocking);
435static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
437 XLogRecPtr RecPtr, TimeLineID replayTLI);
439static int XLogFileRead(XLogSegNo segno, TimeLineID tli,
442
443static bool CheckForStandbyTrigger(void);
444static void SetPromoteIsTriggered(void);
445static bool HotStandbyActiveInReplay(void);
446
448static void SetLatestXTime(TimestampTz xtime);
449
450/*
451 * Initialization of shared memory for WAL recovery
452 */
453Size
455{
456 Size size;
457
458 /* XLogRecoveryCtl */
459 size = sizeof(XLogRecoveryCtlData);
460
461 return size;
462}
463
464void
479
480/*
481 * A thin wrapper to enable StandbyMode and do other preparatory work as
482 * needed.
483 */
484static void
486{
487 StandbyMode = true;
488
489 /*
490 * To avoid server log bloat, we don't report recovery progress in a
491 * standby as it will always be in recovery unless promoted. We disable
492 * startup progress timeout in standby mode to avoid calling
493 * startup_progress_timeout_handler() unnecessarily.
494 */
496}
497
498/*
499 * Prepare the system for WAL recovery, if needed.
500 *
501 * This is called by StartupXLOG() which coordinates the server startup
502 * sequence. This function analyzes the control file and the backup label
503 * file, if any, and figures out whether we need to perform crash recovery or
504 * archive recovery, and how far we need to replay the WAL to reach a
505 * consistent state.
506 *
507 * This doesn't yet change the on-disk state, except for creating the symlinks
508 * from table space map file if any, and for fetching WAL files needed to find
509 * the checkpoint record. On entry, the caller has already read the control
510 * file into memory, and passes it as argument. This function updates it to
511 * reflect the recovery state, and the caller is expected to write it back to
512 * disk after initializing other subsystems, but before calling
513 * PerformWalRecovery().
514 *
515 * This initializes some global variables like ArchiveRecoveryRequested,
516 * StandbyModeRequested and InRecovery.
517 */
518void
521{
522 XLogPageReadPrivate *private;
523 struct stat st;
524 bool wasShutdown;
525 XLogRecord *record;
527 bool haveTblspcMap = false;
528 bool haveBackupLabel = false;
529 CheckPoint checkPoint;
530 bool backupFromStandby = false;
531
533
534 /*
535 * Initialize on the assumption we want to recover to the latest timeline
536 * that's active according to pg_control.
537 */
541 else
543
544 /*
545 * Check for signal files, and if so set up state for offline recovery
546 */
549
550 /*
551 * Take ownership of the wakeup latch if we're going to sleep during
552 * recovery.
553 */
556
557 /*
558 * Set the WAL reading processor now, as it will be needed when reading
559 * the checkpoint record required (backup_label or not).
560 */
562 xlogreader =
564 XL_ROUTINE(.page_read = &XLogPageRead,
565 .segment_open = NULL,
566 .segment_close = wal_segment_close),
567 private);
568 if (!xlogreader)
571 errmsg("out of memory"),
572 errdetail("Failed while allocating a WAL reading processor.")));
574
575 /*
576 * Set the WAL decode buffer size. This limits how far ahead we can read
577 * in the WAL.
578 */
580
581 /* Create a WAL prefetcher. */
583
584 /*
585 * Allocate two page buffers dedicated to WAL consistency checks. We do
586 * it this way, rather than just making static arrays, for two reasons:
587 * (1) no need to waste the storage in most instantiations of the backend;
588 * (2) a static char array isn't guaranteed to have any particular
589 * alignment, whereas palloc() will provide MAXALIGN'd storage.
590 */
593
594 /*
595 * Read the backup_label file. We want to run this part of the recovery
596 * process after checking for signal files and after performing validation
597 * of the recovery parameters.
598 */
601 {
602 List *tablespaces = NIL;
603
604 /*
605 * Archive recovery was requested, and thanks to the backup label
606 * file, we know how far we need to replay to reach consistency. Enter
607 * archive recovery directly.
608 */
609 InArchiveRecovery = true;
612
613 /*
614 * Omitting backup_label when creating a new replica, PITR node etc.
615 * unfortunately is a common cause of corruption. Logging that
616 * backup_label was used makes it a bit easier to exclude that as the
617 * cause of observed corruption.
618 *
619 * Do so before we try to read the checkpoint record (which can fail),
620 * as otherwise it can be hard to understand why a checkpoint other
621 * than ControlFile->checkPoint is used.
622 */
623 ereport(LOG,
624 errmsg("starting backup recovery with redo LSN %X/%08X, checkpoint LSN %X/%08X, on timeline ID %u",
628
629 /*
630 * When a backup_label file is present, we want to roll forward from
631 * the checkpoint it identifies, rather than using pg_control.
632 */
635 if (record != NULL)
636 {
637 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
640 errmsg_internal("checkpoint record is at %X/%08X",
642 InRecovery = true; /* force recovery even if SHUTDOWNED */
643
644 /*
645 * Make sure that REDO location exists. This may not be the case
646 * if there was a crash during an online backup, which left a
647 * backup_label around that references a WAL segment that's
648 * already been archived.
649 */
650 if (checkPoint.redo < CheckPointLoc)
651 {
653 if (!ReadRecord(xlogprefetcher, LOG, false,
654 checkPoint.ThisTimeLineID))
656 errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
658 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
659 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
660 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
662 }
663 }
664 else
665 {
667 errmsg("could not locate required checkpoint record at %X/%08X",
669 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
670 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
671 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
673 wasShutdown = false; /* keep compiler quiet */
674 }
675
676 /* Read the tablespace_map file if present and create symlinks. */
677 if (read_tablespace_map(&tablespaces))
678 {
679 ListCell *lc;
680
681 foreach(lc, tablespaces)
682 {
684 char *linkloc;
685
686 linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
687
688 /*
689 * Remove the existing symlink, if any, and create the symlink
690 * under PGDATA.
691 */
693
694 if (symlink(ti->path, linkloc) < 0)
697 errmsg("could not create symbolic link \"%s\": %m",
698 linkloc)));
699
700 pfree(ti->path);
701 pfree(ti);
702 }
703
704 /* tell the caller to delete it later */
705 haveTblspcMap = true;
706 }
707
708 /* tell the caller to delete it later */
709 haveBackupLabel = true;
710 }
711 else
712 {
713 /* No backup_label file has been found if we are here. */
714
715 /*
716 * If tablespace_map file is present without backup_label file, there
717 * is no use of such file. There is no harm in retaining it, but it
718 * is better to get rid of the map file so that we don't have any
719 * redundant file in data directory and it will avoid any sort of
720 * confusion. It seems prudent though to just rename the file out of
721 * the way rather than delete it completely, also we ignore any error
722 * that occurs in rename operation as even if map file is present
723 * without backup_label file, it is harmless.
724 */
725 if (stat(TABLESPACE_MAP, &st) == 0)
726 {
729 ereport(LOG,
730 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
732 errdetail("File \"%s\" was renamed to \"%s\".",
734 else
735 ereport(LOG,
736 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
738 errdetail("Could not rename file \"%s\" to \"%s\": %m.",
740 }
741
742 /*
743 * It's possible that archive recovery was requested, but we don't
744 * know how far we need to replay the WAL before we reach consistency.
745 * This can happen for example if a base backup is taken from a
746 * running server using an atomic filesystem snapshot, without calling
747 * pg_backup_start/stop. Or if you just kill a running primary server
748 * and put it into archive recovery by creating a recovery signal
749 * file.
750 *
751 * Our strategy in that case is to perform crash recovery first,
752 * replaying all the WAL present in pg_wal, and only enter archive
753 * recovery after that.
754 *
755 * But usually we already know how far we need to replay the WAL (up
756 * to minRecoveryPoint, up to backupEndPoint, or until we see an
757 * end-of-backup record), and we can enter archive recovery directly.
758 */
764 {
765 InArchiveRecovery = true;
768 }
769
770 /*
771 * For the same reason as when starting up with backup_label present,
772 * emit a log message when we continue initializing from a base
773 * backup.
774 */
776 ereport(LOG,
777 errmsg("restarting backup recovery with redo LSN %X/%08X",
779
780 /* Get the last valid checkpoint record. */
787 if (record != NULL)
788 {
790 errmsg_internal("checkpoint record is at %X/%08X",
792 }
793 else
794 {
795 /*
796 * We used to attempt to go back to a secondary checkpoint record
797 * here, but only when not in standby mode. We now just fail if we
798 * can't read the last checkpoint because this allows us to
799 * simplify processing around checkpoints.
800 */
802 errmsg("could not locate a valid checkpoint record at %X/%08X",
804 }
805 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
807
808 /* Make sure that REDO location exists. */
809 if (checkPoint.redo < CheckPointLoc)
810 {
812 if (!ReadRecord(xlogprefetcher, LOG, false, checkPoint.ThisTimeLineID))
814 errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
816 }
817 }
818
820 {
822 ereport(LOG,
823 (errmsg("entering standby mode")));
825 ereport(LOG,
826 (errmsg("starting point-in-time recovery to XID %u",
829 ereport(LOG,
830 (errmsg("starting point-in-time recovery to %s",
833 ereport(LOG,
834 (errmsg("starting point-in-time recovery to \"%s\"",
837 ereport(LOG,
838 errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%08X\"",
841 ereport(LOG,
842 (errmsg("starting point-in-time recovery to earliest consistent point")));
843 else
844 ereport(LOG,
845 (errmsg("starting archive recovery")));
846 }
847
848 /*
849 * If the location of the checkpoint record is not on the expected
850 * timeline in the history of the requested timeline, we cannot proceed:
851 * the backup is not part of the history of the requested timeline.
852 */
853 Assert(expectedTLEs); /* was initialized by reading checkpoint
854 * record */
857 {
859
860 /*
861 * tliSwitchPoint will throw an error if the checkpoint's timeline is
862 * not in expectedTLEs at all.
863 */
866 (errmsg("requested timeline %u is not a child of this server's history",
868 /* translator: %s is a backup_label file or a pg_control file */
869 errdetail("Latest checkpoint in file \"%s\" is at %X/%08X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%08X.",
870 haveBackupLabel ? "backup_label" : "pg_control",
874 }
875
876 /*
877 * The min recovery point should be part of the requested timeline's
878 * history, too.
879 */
884 errmsg("requested timeline %u does not contain minimum recovery point %X/%08X on timeline %u",
888
890 errmsg_internal("redo record is at %X/%08X; shutdown %s",
891 LSN_FORMAT_ARGS(checkPoint.redo),
892 wasShutdown ? "true" : "false"));
894 (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
896 checkPoint.nextOid)));
898 (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %" PRIu64,
899 checkPoint.nextMulti, checkPoint.nextMultiOffset)));
901 (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
902 checkPoint.oldestXid, checkPoint.oldestXidDB)));
904 (errmsg_internal("oldest MultiXactId: %u, in database %u",
905 checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
907 (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
908 checkPoint.oldestCommitTsXid,
909 checkPoint.newestCommitTsXid)));
912 (errmsg("invalid next transaction ID")));
913
914 /* sanity check */
915 if (checkPoint.redo > CheckPointLoc)
917 (errmsg("invalid redo in checkpoint record")));
918
919 /*
920 * Check whether we need to force recovery from WAL. If it appears to
921 * have been a clean shutdown and we did not have a recovery signal file,
922 * then assume no recovery needed.
923 */
924 if (checkPoint.redo < CheckPointLoc)
925 {
926 if (wasShutdown)
928 (errmsg("invalid redo record in shutdown checkpoint")));
929 InRecovery = true;
930 }
931 else if (ControlFile->state != DB_SHUTDOWNED)
932 InRecovery = true;
934 {
935 /* force recovery due to presence of recovery signal file */
936 InRecovery = true;
937 }
938
939 /*
940 * If recovery is needed, update our in-memory copy of pg_control to show
941 * that we are recovering and to show the selected checkpoint as the place
942 * we are starting from. We also mark pg_control with any minimum recovery
943 * stop point obtained from a backup history file.
944 *
945 * We don't write the changes to disk yet, though. Only do that after
946 * initializing various subsystems.
947 */
948 if (InRecovery)
949 {
951 {
953 }
954 else
955 {
956 ereport(LOG,
957 (errmsg("database system was not properly shut down; "
958 "automatic recovery in progress")));
960 ereport(LOG,
961 (errmsg("crash recovery starts in timeline %u "
962 "and has target timeline %u",
966 }
968 ControlFile->checkPointCopy = checkPoint;
970 {
971 /* initialize minRecoveryPoint if not set yet */
972 if (ControlFile->minRecoveryPoint < checkPoint.redo)
973 {
974 ControlFile->minRecoveryPoint = checkPoint.redo;
976 }
977 }
978
979 /*
980 * Set backupStartPoint if we're starting recovery from a base backup.
981 *
982 * Also set backupEndPoint and use minRecoveryPoint as the backup end
983 * location if we're starting recovery from a base backup which was
984 * taken from a standby. In this case, the database system status in
985 * pg_control must indicate that the database was already in recovery.
986 * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
987 * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
988 * before reaching this point; e.g. because restore_command or
989 * primary_conninfo were faulty.
990 *
991 * Any other state indicates that the backup somehow became corrupted
992 * and we can't sensibly continue with recovery.
993 */
994 if (haveBackupLabel)
995 {
996 ControlFile->backupStartPoint = checkPoint.redo;
998
1000 {
1003 ereport(FATAL,
1004 (errmsg("backup_label contains data inconsistent with control file"),
1005 errhint("This means that the backup is corrupted and you will "
1006 "have to use another backup for recovery.")));
1008 }
1009 }
1010 }
1011
1012 /* remember these, so that we know when we have reached consistency */
1017 {
1020 }
1021 else
1022 {
1025 }
1026
1027 /*
1028 * Start recovery assuming that the final record isn't lost.
1029 */
1032
1036}
1037
1038/*
1039 * See if there are any recovery signal files and if so, set state for
1040 * recovery.
1041 *
1042 * See if there is a recovery command file (recovery.conf), and if so
1043 * report a FATAL error, since as of PG12 we no longer recognize that.
1044 */
1045static void
1047{
1048 struct stat stat_buf;
1049
1051 return;
1052
1053 /*
1054 * Check for old recovery API file: recovery.conf
1055 */
1057 ereport(FATAL,
1059 errmsg("using recovery command file \"%s\" is not supported",
1061
1062 /*
1063 * Remove unused .done file, if present. Ignore if absent.
1064 */
1066
1067 /*
1068 * Check for recovery signal files and if found, fsync them since they
1069 * represent server state information. We don't sweat too much about the
1070 * possibility of fsync failure, however.
1071 */
1072 if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1073 {
1074 int fd;
1075
1077 S_IRUSR | S_IWUSR);
1078 if (fd >= 0)
1079 {
1080 (void) pg_fsync(fd);
1081 close(fd);
1082 }
1084 }
1085
1087 {
1088 int fd;
1089
1091 S_IRUSR | S_IWUSR);
1092 if (fd >= 0)
1093 {
1094 (void) pg_fsync(fd);
1095 close(fd);
1096 }
1098 }
1099
1100 /*
1101 * If both signal files are present, standby signal file takes precedence.
1102 * If neither is present then we won't enter archive recovery.
1103 */
1104 StandbyModeRequested = false;
1107 {
1108 StandbyModeRequested = true;
1110 }
1112 {
1113 StandbyModeRequested = false;
1115 }
1116 else
1117 return;
1118
1119 /*
1120 * We don't support standby mode in standalone backends; that requires
1121 * other processes such as the WAL receiver to be alive.
1122 */
1124 ereport(FATAL,
1126 errmsg("standby mode is not supported by single-user servers")));
1127}
1128
1129static void
1131{
1133 return;
1134
1135 /*
1136 * Check for compulsory parameters
1137 */
1139 {
1140 if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1143 (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1144 errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1145 }
1146 else
1147 {
1150 ereport(FATAL,
1152 errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1153 }
1154
1155 /*
1156 * Override any inconsistent requests. Note that this is a change of
1157 * behaviour in 9.5; prior to this we simply ignored a request to pause if
1158 * hot_standby = off, which was surprising behaviour.
1159 */
1163
1164 /*
1165 * Final parsing of recovery_target_time string; see also
1166 * check_recovery_target_time().
1167 */
1169 {
1173 Int32GetDatum(-1)));
1174 }
1175
1176 /*
1177 * If user specified recovery_target_timeline, validate it or compute the
1178 * "latest" value. We can't do this until after we've gotten the restore
1179 * command and set InArchiveRecovery, because we need to fetch timeline
1180 * history files from the archive.
1181 */
1183 {
1185
1186 /* Timeline 1 does not have a history file, all else should */
1187 if (rtli != 1 && !existsTimeLineHistory(rtli))
1188 ereport(FATAL,
1190 errmsg("recovery target timeline %u does not exist",
1191 rtli)));
1193 }
1195 {
1196 /* We start the "latest" search from pg_control's timeline */
1198 }
1199 else
1200 {
1201 /*
1202 * else we just use the recoveryTargetTLI as already read from
1203 * ControlFile
1204 */
1206 }
1207}
1208
1209/*
1210 * read_backup_label: check to see if a backup_label file is present
1211 *
1212 * If we see a backup_label during recovery, we assume that we are recovering
1213 * from a backup dump file, and we therefore roll forward from the checkpoint
1214 * identified by the label file, NOT what pg_control says. This avoids the
1215 * problem that pg_control might have been archived one or more checkpoints
1216 * later than the start of the dump, and so if we rely on it as the start
1217 * point, we will fail to restore a consistent database state.
1218 *
1219 * Returns true if a backup_label was found (and fills the checkpoint
1220 * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1221 * returns false if not. If this backup_label came from a streamed backup,
1222 * *backupEndRequired is set to true. If this backup_label was created during
1223 * recovery, *backupFromStandby is set to true.
1224 *
1225 * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1226 * and TLI read from the backup file.
1227 */
1228static bool
1231{
1235 FILE *lfp;
1236 char ch;
1237 char backuptype[20];
1238 char backupfrom[20];
1239 char backuplabel[MAXPGPATH];
1240 char backuptime[128];
1241 uint32 hi,
1242 lo;
1243
1244 /* suppress possible uninitialized-variable warnings */
1246 *backupLabelTLI = 0;
1247 *backupEndRequired = false;
1248 *backupFromStandby = false;
1249
1250 /*
1251 * See if label file is present
1252 */
1254 if (!lfp)
1255 {
1256 if (errno != ENOENT)
1257 ereport(FATAL,
1259 errmsg("could not read file \"%s\": %m",
1261 return false; /* it's not there, all is fine */
1262 }
1263
1264 /*
1265 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1266 * is pretty crude, but we are not expecting any variability in the file
1267 * format).
1268 */
1269 if (fscanf(lfp, "START WAL LOCATION: %X/%08X (file %08X%16s)%c",
1270 &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1271 ereport(FATAL,
1273 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1274 RedoStartLSN = ((uint64) hi) << 32 | lo;
1276 if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%08X%c",
1277 &hi, &lo, &ch) != 3 || ch != '\n')
1278 ereport(FATAL,
1280 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1281 *checkPointLoc = ((uint64) hi) << 32 | lo;
1283
1284 /*
1285 * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1286 * which could mean either pg_basebackup or the pg_backup_start/stop
1287 * method was used) or if this label came from somewhere else (the only
1288 * other option today being from pg_rewind). If this was a streamed
1289 * backup then we know that we need to play through until we get to the
1290 * end of the WAL which was generated during the backup (at which point we
1291 * will have reached consistency and backupEndRequired will be reset to be
1292 * false).
1293 */
1294 if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1295 {
1296 if (strcmp(backuptype, "streamed") == 0)
1297 *backupEndRequired = true;
1298 }
1299
1300 /*
1301 * BACKUP FROM lets us know if this was from a primary or a standby. If
1302 * it was from a standby, we'll double-check that the control file state
1303 * matches that of a standby.
1304 */
1305 if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1306 {
1307 if (strcmp(backupfrom, "standby") == 0)
1308 *backupFromStandby = true;
1309 }
1310
1311 /*
1312 * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1313 * but checking for their presence is useful for debugging and the next
1314 * sanity checks. Note that the result buffers have a fixed size, so if
1315 * the backup_label file was generated with strings longer than the
1316 * maximum assumed here, those fields will be parsed incorrectly.
1317 * That's fine, as only minor consistency checks are done
1318 * afterwards.
1319 */
1320 if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1322 (errmsg_internal("backup time %s in file \"%s\"",
1324
1325 if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1327 (errmsg_internal("backup label %s in file \"%s\"",
1329
1330 /*
1331 * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1332 * it as a sanity check if present.
1333 */
1334 if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1335 {
1337 ereport(FATAL,
1339 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1340 errdetail("Timeline ID parsed is %u, but expected %u.",
1342
1344 (errmsg_internal("backup timeline %u in file \"%s\"",
1346 }
1347
1348 if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%08X\n", &hi, &lo) > 0)
1349 ereport(FATAL,
1351 errmsg("this is an incremental backup, not a data directory"),
1352 errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1353
1354 if (ferror(lfp) || FreeFile(lfp))
1355 ereport(FATAL,
1357 errmsg("could not read file \"%s\": %m",
1359
1360 return true;
1361}
1362
1363/*
1364 * read_tablespace_map: check to see if a tablespace_map file is present
1365 *
1366 * If we see a tablespace_map file during recovery, we assume that we are
1367 * recovering from a backup dump file, and we therefore need to create symlinks
1368 * as per the information present in tablespace_map file.
1369 *
1370 * Returns true if a tablespace_map file was found (and fills *tablespaces
1371 * with a tablespaceinfo struct for each tablespace listed in the file);
1372 * returns false if not.
1373 */
1374static bool
1376{
1378 FILE *lfp;
1379 char str[MAXPGPATH];
1380 int ch,
1381 i,
1382 n;
1383 bool was_backslash;
1384
1385 /*
1386 * See if tablespace_map file is present
1387 */
1389 if (!lfp)
1390 {
1391 if (errno != ENOENT)
1392 ereport(FATAL,
1394 errmsg("could not read file \"%s\": %m",
1395 TABLESPACE_MAP)));
1396 return false; /* it's not there, all is fine */
1397 }
1398
1399 /*
1400 * Read and parse the link name and path lines from tablespace_map file
1401 * (this code is pretty crude, but we are not expecting any variability in
1402 * the file format). De-escape any backslashes that were inserted.
1403 */
1404 i = 0;
1405 was_backslash = false;
1406 while ((ch = fgetc(lfp)) != EOF)
1407 {
1408 if (!was_backslash && (ch == '\n' || ch == '\r'))
1409 {
1410 char *endp;
1411
1412 if (i == 0)
1413 continue; /* \r immediately followed by \n */
1414
1415 /*
1416 * The de-escaped line should contain an OID followed by exactly
1417 * one space followed by a path. The path might start with
1418 * spaces, so don't be too liberal about parsing.
1419 */
1420 str[i] = '\0';
1421 n = 0;
1422 while (str[n] && str[n] != ' ')
1423 n++;
1424 if (n < 1 || n >= i - 1)
1425 ereport(FATAL,
1427 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1428 str[n++] = '\0';
1429
1431 errno = 0;
1432 ti->oid = strtoul(str, &endp, 10);
1433 if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1434 ereport(FATAL,
1436 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1437 ti->path = pstrdup(str + n);
1438 *tablespaces = lappend(*tablespaces, ti);
1439
1440 i = 0;
1441 continue;
1442 }
1443 else if (!was_backslash && ch == '\\')
1444 was_backslash = true;
1445 else
1446 {
1447 if (i < sizeof(str) - 1)
1448 str[i++] = ch;
1449 was_backslash = false;
1450 }
1451 }
1452
1453 if (i != 0 || was_backslash) /* last line not terminated? */
1454 ereport(FATAL,
1456 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1457
1458 if (ferror(lfp) || FreeFile(lfp))
1459 ereport(FATAL,
1461 errmsg("could not read file \"%s\": %m",
1462 TABLESPACE_MAP)));
1463
1464 return true;
1465}
1466
1467/*
1468 * Finish WAL recovery.
1469 *
1470 * This does not close the 'xlogreader' yet, because in some cases the caller
1471 * still wants to re-read the last checkpoint record by calling
1472 * ReadCheckpointRecord().
1473 *
1474 * Returns the position of the last valid or applied record, after which new
1475 * WAL should be appended, information about why recovery was ended, and some
1476 * other things. See the EndOfWalRecoveryInfo struct for details.
1477 */
1480{
1482 XLogRecPtr lastRec;
1483 TimeLineID lastRecTLI;
1484 XLogRecPtr endOfLog;
1485
1486 /*
1487 * Kill WAL receiver, if it's still running, before we continue to write
1488 * the startup checkpoint and aborted-contrecord records. It would overwrite
1489 * these records and subsequent ones if it's still alive when we
1490 * start writing WAL.
1491 */
1493
1494 /*
1495 * Shut down the slot sync worker to drop any temporary slots acquired by
1496 * it and to stop it from continuing to fetch the failover slots.
1497 *
1498 * We do not update the 'synced' column in 'pg_replication_slots' system
1499 * view from true to false here, as any failed update could leave 'synced'
1500 * column false for some slots. This could cause issues during slot sync
1501 * after restarting the server as a standby. While updating the 'synced'
1502 * column after switching to the new timeline is an option, it does not
1503 * simplify the handling for the 'synced' column. Therefore, we retain the
1504 * 'synced' column as true after promotion as it may provide useful
1505 * information about the slot origin.
1506 */
1508
1509 /*
1510 * We are now done reading the xlog from stream. Turn off streaming
1511 * recovery to force fetching the files (which would be required at end of
1512 * recovery, e.g., timeline history file) from archive or pg_wal.
1513 *
1514 * Note that standby mode must be turned off after killing WAL receiver,
1515 * i.e., calling XLogShutdownWalRcv().
1516 */
1518 StandbyMode = false;
1519
1520 /*
1521 * Determine where to start writing WAL next.
1522 *
1523 * Re-fetch the last valid or last applied record, so we can identify the
1524 * exact endpoint of what we consider the valid portion of WAL. There may
1525 * be an incomplete continuation record after that, in which case
1526 * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1527 * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1528 * it is intentionally missing. See CreateOverwriteContrecordRecord().
1529 *
1530 * An important side-effect of this is to load the last page into
1531 * xlogreader. The caller uses it to initialize the WAL for writing.
1532 */
1533 if (!InRecovery)
1534 {
1535 lastRec = CheckPointLoc;
1536 lastRecTLI = CheckPointTLI;
1537 }
1538 else
1539 {
1541 lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1542 }
1544 (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1545 endOfLog = xlogreader->EndRecPtr;
1546
1547 /*
1548 * Remember the TLI in the filename of the XLOG segment containing the
1549 * end-of-log. It could be different from the timeline that endOfLog
1550 * nominally belongs to, if there was a timeline switch in that segment,
1551 * and we were reading the old WAL from a segment belonging to a higher
1552 * timeline.
1553 */
1554 result->endOfLogTLI = xlogreader->seg.ws_tli;
1555
1557 {
1558 /*
1559 * We are no longer in archive recovery state.
1560 *
1561 * We are now done reading the old WAL. Turn off archive fetching if
1562 * it was active.
1563 */
1565 InArchiveRecovery = false;
1566
1567 /*
1568 * If the ending log segment is still open, close it (to avoid
1569 * problems on Windows with trying to rename or delete an open file).
1570 */
1571 if (readFile >= 0)
1572 {
1573 close(readFile);
1574 readFile = -1;
1575 }
1576 }
1577
1578 /*
1579 * Copy the last partial block to the caller, for initializing the WAL
1580 * buffer for appending new WAL.
1581 */
1582 if (endOfLog % XLOG_BLCKSZ != 0)
1583 {
1584 char *page;
1585 int len;
1587
1588 pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1590
1591 /* Copy the valid part of the last block */
1592 len = endOfLog % XLOG_BLCKSZ;
1593 page = palloc(len);
1594 memcpy(page, xlogreader->readBuf, len);
1595
1597 result->lastPage = page;
1598 }
1599 else
1600 {
1601 /* There is no partial block to copy. */
1602 result->lastPageBeginPtr = endOfLog;
1603 result->lastPage = NULL;
1604 }
1605
1606 /*
1607 * Create a comment for the history file to explain why and where timeline
1608 * changed.
1609 */
1611
1612 result->lastRec = lastRec;
1613 result->lastRecTLI = lastRecTLI;
1614 result->endOfLog = endOfLog;
1615
1616 result->abortedRecPtr = abortedRecPtr;
1618
1621
1622 return result;
1623}
1624
1625/*
1626 * Clean up the WAL reader and leftovers from restoring WAL from archive
1627 */
1628void
1630{
1631 char recoveryPath[MAXPGPATH];
1632
1633 /* Final update of pg_stat_recovery_prefetch. */
1635
1636 /* Shut down xlogreader */
1637 if (readFile >= 0)
1638 {
1639 close(readFile);
1640 readFile = -1;
1641 }
1645
1647 {
1648 /*
1649 * Since there might be a partial WAL segment named RECOVERYXLOG, get
1650 * rid of it.
1651 */
1652 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1653 unlink(recoveryPath); /* ignore any error */
1654
1655 /* Get rid of any remaining recovered timeline-history file, too */
1656 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1657 unlink(recoveryPath); /* ignore any error */
1658 }
1659
1660 /*
1661 * We don't need the latch anymore. It's not strictly necessary to disown
1662 * it, but let's do it for the sake of tidiness.
1663 */
1666}
1667
1668/*
1669 * Perform WAL recovery.
1670 *
1671 * If the system was shut down cleanly, this is never called.
1672 */
1673void
1675{
1676 XLogRecord *record;
1677 bool reachedRecoveryTarget = false;
1678 TimeLineID replayTLI;
1679
1680 /*
1681 * Initialize shared variables for tracking progress of WAL replay, as if
1682 * we had just replayed the record before the REDO location (or the
1683 * checkpoint record itself, if it's a shutdown checkpoint).
1684 */
1687 {
1691 }
1692 else
1693 {
1697 }
1704
1705 /* Also ensure XLogReceiptTime has a sane value */
1707
1708 /*
1709 * Let postmaster know we've started redo now, so that it can launch the
1710 * archiver if necessary.
1711 */
1714
1715 /*
1716 * Allow read-only connections immediately if we're consistent already.
1717 */
1719
1720 /*
1721 * Find the first record that logically follows the checkpoint --- it
1722 * might physically precede it, though.
1723 */
1725 {
1726 /* back up to find the record */
1727 replayTLI = RedoStartTLI;
1729 record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1730
1731 /*
1732 * If a checkpoint record's redo pointer points back to an earlier
1733 * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1734 * record.
1735 */
1736 if (record->xl_rmid != RM_XLOG_ID ||
1738 ereport(FATAL,
1739 errmsg("unexpected record type found at redo point %X/%08X",
1741 }
1742 else
1743 {
1744 /* just have to read next record after CheckPoint */
1746 replayTLI = CheckPointTLI;
1747 record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1748 }
1749
1750 if (record != NULL)
1751 {
1753 PGRUsage ru0;
1754
1756
1757 InRedo = true;
1758
1759 RmgrStartup();
1760
1761 ereport(LOG,
1762 errmsg("redo starts at %X/%08X",
1764
1765 /* Prepare to report progress of the redo phase. */
1766 if (!StandbyMode)
1768
1769 /*
1770 * main redo apply loop
1771 */
1772 do
1773 {
1774 if (!StandbyMode)
1775 ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%08X",
1777
1778#ifdef WAL_DEBUG
1779 if (XLOG_DEBUG)
1780 {
1782
1784 appendStringInfo(&buf, "REDO @ %X/%08X; LSN %X/%08X: ",
1788 appendStringInfoString(&buf, " - ");
1790 elog(LOG, "%s", buf.data);
1791 pfree(buf.data);
1792 }
1793#endif
1794
1795 /* Handle interrupt signals of startup process */
1797
1798 /*
1799 * Pause WAL replay, if requested by a hot-standby session via
1800 * SetRecoveryPause().
1801 *
1802 * Note that we intentionally don't take the info_lck spinlock
1803 * here. We might therefore read a slightly stale value of the
1804 * recoveryPause flag, but it can't be very stale (no worse than
1805 * the last spinlock we did acquire). Since a pause request is a
1806 * pretty asynchronous thing anyway, possibly responding to it one
1807 * WAL record later than we otherwise would is a minor issue, so
1808 * it doesn't seem worth adding another spinlock cycle to prevent
1809 * that.
1810 */
1811 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1813 recoveryPausesHere(false);
1814
1815 /*
1816 * Have we reached our recovery target?
1817 */
1819 {
1820 reachedRecoveryTarget = true;
1821 break;
1822 }
1823
1824 /*
1825 * If we've been asked to lag the primary, wait on latch until
1826 * enough time has passed.
1827 */
1829 {
1830 /*
1831 * We test for paused recovery again here. If user sets
1832 * delayed apply, it may be because they expect to pause
1833 * recovery in case of problems, so we must test again here
1834 * otherwise pausing during the delay-wait wouldn't work.
1835 */
1836 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1838 recoveryPausesHere(false);
1839 }
1840
1841 /*
1842 * Apply the record
1843 */
1844 ApplyWalRecord(xlogreader, record, &replayTLI);
1845
1846 /*
1847 * If we replayed an LSN that someone was waiting for then walk
1848 * over the shared memory array and set latches to notify the
1849 * waiters.
1850 */
1851 if (waitLSNState &&
1855
1856 /* Exit loop if we reached inclusive recovery target */
1858 {
1859 reachedRecoveryTarget = true;
1860 break;
1861 }
1862
1863 /* Else, try to fetch the next WAL record */
1864 record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1865 } while (record != NULL);
1866
1867 /*
1868 * end of main redo apply loop
1869 */
1870
1872 {
1873 if (!reachedConsistency)
1874 ereport(FATAL,
1875 (errmsg("requested recovery stop point is before consistent recovery point")));
1876
1877 /*
1878 * This is the last point where we can restart recovery with a new
1879 * recovery target, if we shutdown and begin again. After this,
1880 * Resource Managers may choose to do permanent corrective actions
1881 * at end of recovery.
1882 */
1883 switch (recoveryTargetAction)
1884 {
1886
1887 /*
1888 * exit with special return code to request shutdown of
1889 * postmaster. Log messages issued from postmaster.
1890 */
1891 proc_exit(3);
1892
1894 SetRecoveryPause(true);
1895 recoveryPausesHere(true);
1896
1897 /* drop into promote */
1899
1901 break;
1902 }
1903 }
1904
1905 RmgrCleanup();
1906
1907 ereport(LOG,
1908 errmsg("redo done at %X/%08X system usage: %s",
1910 pg_rusage_show(&ru0)));
1912 if (xtime)
1913 ereport(LOG,
1914 (errmsg("last completed transaction was at log time %s",
1916
1917 InRedo = false;
1918 }
1919 else
1920 {
1921 /* there are no WAL records following the checkpoint */
1922 ereport(LOG,
1923 (errmsg("redo is not required")));
1924 }
1925
1926 /*
1927 * This check is intentionally after the above log messages that indicate
1928 * how far recovery went.
1929 */
1933 ereport(FATAL,
1935 errmsg("recovery ended before configured recovery target was reached")));
1936}
1937
1938/*
1939 * Subroutine of PerformWalRecovery, to apply one WAL record.
1940 */
1941static void
1943{
1944 ErrorContextCallback errcallback;
1945 bool switchedTLI = false;
1946
1947 /* Setup error traceback support for ereport() */
1948 errcallback.callback = rm_redo_error_callback;
1949 errcallback.arg = xlogreader;
1950 errcallback.previous = error_context_stack;
1951 error_context_stack = &errcallback;
1952
1953 /*
1954 * TransamVariables->nextXid must be beyond record's xid.
1955 */
1957
1958 /*
1959 * Before replaying this record, check if this record causes the current
1960 * timeline to change. The record is already considered to be part of the
1961 * new timeline, so we update replayTLI before replaying it. That's
1962 * important so that replayEndTLI, which is recorded as the minimum
1963 * recovery point's TLI if recovery stops after this record, is set
1964 * correctly.
1965 */
1966 if (record->xl_rmid == RM_XLOG_ID)
1967 {
1968 TimeLineID newReplayTLI = *replayTLI;
1969 TimeLineID prevReplayTLI = *replayTLI;
1970 uint8 info = record->xl_info & ~XLR_INFO_MASK;
1971
1972 if (info == XLOG_CHECKPOINT_SHUTDOWN)
1973 {
1974 CheckPoint checkPoint;
1975
1976 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1977 newReplayTLI = checkPoint.ThisTimeLineID;
1978 prevReplayTLI = checkPoint.PrevTimeLineID;
1979 }
1980 else if (info == XLOG_END_OF_RECOVERY)
1981 {
1983
1985 newReplayTLI = xlrec.ThisTimeLineID;
1986 prevReplayTLI = xlrec.PrevTimeLineID;
1987 }
1988
1989 if (newReplayTLI != *replayTLI)
1990 {
1991 /* Check that it's OK to switch to this TLI */
1993 newReplayTLI, prevReplayTLI, *replayTLI);
1994
1995 /* Following WAL records should be run with new TLI */
1996 *replayTLI = newReplayTLI;
1997 switchedTLI = true;
1998 }
1999 }
2000
2001 /*
2002 * Update shared replayEndRecPtr before replaying this record, so that
2003 * XLogFlush will update minRecoveryPoint correctly.
2004 */
2007 XLogRecoveryCtl->replayEndTLI = *replayTLI;
2009
2010 /*
2011 * If we are attempting to enter Hot Standby mode, process XIDs we see
2012 */
2016
2017 /*
2018 * Some XLOG record types that are related to recovery are processed
2019 * directly here, rather than in xlog_redo()
2020 */
2021 if (record->xl_rmid == RM_XLOG_ID)
2022 xlogrecovery_redo(xlogreader, *replayTLI);
2023
2024 /* Now apply the WAL record itself */
2026
2027 /*
2028 * After redo, check whether the backup pages associated with the WAL
2029 * record are consistent with the existing pages. This check is done only
2030 * if consistency check is enabled for this record.
2031 */
2032 if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
2034
2035 /* Pop the error context stack */
2036 error_context_stack = errcallback.previous;
2037
2038 /*
2039 * Update lastReplayedEndRecPtr after this record has been successfully
2040 * replayed.
2041 */
2045 XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
2047
2048 /* ------
2049 * Wakeup walsenders:
2050 *
2051 * On the standby, the WAL is flushed first (which will only wake up
2052 * physical walsenders) and then applied, which will only wake up logical
2053 * walsenders.
2054 *
2055 * Indeed, logical walsenders on standby can't decode and send data until
2056 * it's been applied.
2057 *
2058 * Physical walsenders don't need to be woken up during replay unless
2059 * cascading replication is allowed and time line change occurred (so that
2060 * they can notice that they are on a new time line).
2061 *
2062 * That's why the wake up conditions are for:
2063 *
2064 * - physical walsenders in case of new time line and cascade
2065 * replication is allowed
2066 * - logical walsenders in case cascade replication is allowed (could not
2067 * be created otherwise)
2068 * ------
2069 */
2072
2073 /*
2074 * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2075 * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2076 * a reply to the primary.
2077 */
2079 {
2082 }
2083
2084 /* Allow read-only connections if we're consistent now */
2086
2087 /* Is this a timeline switch? */
2088 if (switchedTLI)
2089 {
2090 /*
2091 * Before we continue on the new timeline, clean up any (possibly
2092 * bogus) future WAL segments on the old timeline.
2093 */
2095
2096 /* Reset the prefetcher. */
2098 }
2099}
2100
2101/*
2102 * Some XLOG RM record types that are directly related to WAL recovery are
2103 * handled here rather than in xlog_redo()
2104 */
2105static void
2107{
2108 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2109 XLogRecPtr lsn = record->EndRecPtr;
2110
2111 Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2112
2113 if (info == XLOG_OVERWRITE_CONTRECORD)
2114 {
2115 /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2117
2119 if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2120 elog(FATAL, "mismatching overwritten LSN %X/%08X -> %X/%08X",
2121 LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2123
2124 /* We have safely skipped the aborted record */
2127
2128 ereport(LOG,
2129 errmsg("successfully skipped missing contrecord at %X/%08X, overwritten at %s",
2130 LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2131 timestamptz_to_str(xlrec.overwrite_time)));
2132
2133 /* Verifying the record should only happen once */
2135 }
2136 else if (info == XLOG_BACKUP_END)
2137 {
2138 XLogRecPtr startpoint;
2139
2140 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2141
2142 if (backupStartPoint == startpoint)
2143 {
2144 /*
2145 * We have reached the end of base backup, the point where
2146 * pg_backup_stop() was done. The data on disk is now consistent
2147 * (assuming we have also reached minRecoveryPoint). Set
2148 * backupEndPoint to the current LSN, so that the next call to
2149 * CheckRecoveryConsistency() will notice it and do the
2150 * end-of-backup processing.
2151 */
2152 elog(DEBUG1, "end of backup record reached");
2153
2154 backupEndPoint = lsn;
2155 }
2156 else
2157 elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%08X, waiting for %X/%08X",
2159 }
2160}
2161
2162/*
2163 * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2164 * directories.
2165 *
2166 * Replay of database creation XLOG records for databases that were later
2167 * dropped can create fake directories in pg_tblspc. By the time consistency
2168 * is reached these directories should have been removed; here we verify
2169 * that this did indeed happen. This is to be called at the point where
2170 * consistent state is reached.
2171 *
2172 * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2173 * useful for testing purposes, and also allows for an escape hatch in case
2174 * things go south.
2175 */
2176static void
2178{
2179 DIR *dir;
2180 struct dirent *de;
2181
2183 while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
2184 {
2185 char path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
2186
2187 /* Skip entries of non-oid names */
2188 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2189 continue;
2190
2191 snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
2192
2193 if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2196 errmsg("unexpected directory entry \"%s\" found in %s",
2197 de->d_name, PG_TBLSPC_DIR),
2198 errdetail("All directory entries in %s/ should be symbolic links.",
2200 errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2201 }
2202}
2203
2204/*
2205 * Checks if recovery has reached a consistent state. When consistency is
2206 * reached and we have a valid starting standby snapshot, tell postmaster
2207 * that it can start accepting read-only connections.
2208 */
2209static void
2211{
2212 XLogRecPtr lastReplayedEndRecPtr;
2213 TimeLineID lastReplayedTLI;
2214
2215 /*
2216 * During crash recovery, we don't reach a consistent state until we've
2217 * replayed all the WAL.
2218 */
2220 return;
2221
2223
2224 /*
2225 * assume that we are called in the startup process, and hence don't need
2226 * a lock to read lastReplayedEndRecPtr
2227 */
2228 lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2229 lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2230
2231 /*
2232 * Have we reached the point where our base backup was completed?
2233 */
2235 backupEndPoint <= lastReplayedEndRecPtr)
2236 {
2239
2240 elog(DEBUG1, "end of backup reached");
2241
2242 /*
2243 * We have reached the end of base backup, as indicated by pg_control.
2244 * Update the control file accordingly.
2245 */
2246 ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2249 backupEndRequired = false;
2250
2251 ereport(LOG,
2252 errmsg("completed backup recovery with redo LSN %X/%08X and end LSN %X/%08X",
2255 }
2256
2257 /*
2258 * Have we passed our safe starting point? Note that minRecoveryPoint is
2259 * known to be incorrectly set if recovering from a backup, until the
2260 * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2261 * All we know prior to that is that we're not consistent yet.
2262 */
2264 minRecoveryPoint <= lastReplayedEndRecPtr)
2265 {
2266 /*
2267 * Check to see if the XLOG sequence contained any unresolved
2268 * references to uninitialized pages.
2269 */
2271
2272 /*
2273 * Check that pg_tblspc doesn't contain any real directories. Replay
2274 * of Database/CREATE_* records may have created fictitious tablespace
2275 * directories that should have been removed by the time consistency
2276 * was reached.
2277 */
2279
2280 reachedConsistency = true;
2282 ereport(LOG,
2283 errmsg("consistent recovery state reached at %X/%08X",
2284 LSN_FORMAT_ARGS(lastReplayedEndRecPtr)));
2285 }
2286
2287 /*
2288 * Have we got a valid starting snapshot that will allow queries to be
2289 * run? If so, we can tell postmaster that the database is consistent now,
2290 * enabling connections.
2291 */
2296 {
2300
2301 LocalHotStandbyActive = true;
2302
2304 }
2305}
2306
2307/*
2308 * Error context callback for errors occurring during rm_redo().
2309 */
2310static void
2312{
2313 XLogReaderState *record = (XLogReaderState *) arg;
2315
2317 xlog_outdesc(&buf, record);
2318 xlog_block_info(&buf, record);
2319
2320 /* translator: %s is a WAL record description */
2321 errcontext("WAL redo at %X/%08X for %s",
2322 LSN_FORMAT_ARGS(record->ReadRecPtr),
2323 buf.data);
2324
2325 pfree(buf.data);
2326}
2327
2328/*
2329 * Appends to 'buf' a string describing an XLogRecord: its identity, then a
2330 * colon and a space, then any further description added by rm_desc().
2331 */
2332void
2334{
2336 uint8 info = XLogRecGetInfo(record);
2337 const char *id;
2338
2341
2342 id = rmgr.rm_identify(info);
2343 if (id == NULL)
2344 appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2345 else
2346 appendStringInfo(buf, "%s: ", id);
2347
2348 rmgr.rm_desc(buf, record);
2349}
2350
2351#ifdef WAL_DEBUG
2352
2353static void
2355{
2356 appendStringInfo(buf, "prev %X/%08X; xid %u",
2358 XLogRecGetXid(record));
2359
2360 appendStringInfo(buf, "; len %u",
2361 XLogRecGetDataLen(record));
2362
2363 xlog_block_info(buf, record);
2364}
2365#endif /* WAL_DEBUG */
2366
2367/*
2368 * Appends to 'buf' a string giving information about all the blocks
2369 * referenced by an XLogRecord.
2370 */
2371static void
2373{
2374 int block_id;
2375
2376 /* decode block references */
2377 for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2378 {
2379 RelFileLocator rlocator;
2380 ForkNumber forknum;
2382
2384 &rlocator, &forknum, &blk, NULL))
2385 continue;
2386
2387 if (forknum != MAIN_FORKNUM)
2388 appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2389 block_id,
2390 rlocator.spcOid, rlocator.dbOid,
2391 rlocator.relNumber,
2392 forknum,
2393 blk);
2394 else
2395 appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2396 block_id,
2397 rlocator.spcOid, rlocator.dbOid,
2398 rlocator.relNumber,
2399 blk);
2400 if (XLogRecHasBlockImage(record, block_id))
2401 appendStringInfoString(buf, " FPW");
2402 }
2403}
2404
2405
2406/*
2407 * Check that it's OK to switch to new timeline during recovery.
2408 *
2409 * 'lsn' is the address of the timeline-switching record we're about to replay
2410 * (either a shutdown checkpoint or an end-of-recovery record).
2411 */
2412static void
2414 TimeLineID replayTLI)
2415{
 /*
  * Three checks follow; each one that fails is reported at PANIC level.
  * Falling off the end of the function means the switch is acceptable.
  */
2416 /* Check that the record agrees on what the current (old) timeline is */
2417 if (prevTLI != replayTLI)
2418 ereport(PANIC,
2419 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2420 prevTLI, replayTLI)));
2421
2422 /*
2423 * The new timeline had better be in the list of timelines we expect to
2424 * see, according to the timeline history. It should also not decrease.
2425 */
2426 if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2427 ereport(PANIC,
2428 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2429 newTLI, replayTLI)));
2430
2431 /*
2432 * If we have not yet reached min recovery point, and we're about to
2433 * switch to a timeline greater than the timeline of the min recovery
2434 * point: trouble. After switching to the new timeline, we could not
2435 * possibly visit the min recovery point on the correct timeline anymore.
2436 * This can happen if there is a newer timeline in the archive that
2437 * branched before the timeline the min recovery point is on, and you
2438 * attempt to do PITR to the new timeline.
2439 */
 /*
  * NOTE(review): parts of this condition and of the message arguments are
  * missing from this listing; only the lsn < minRecoveryPoint term is
  * visible.
  */
2441 lsn < minRecoveryPoint &&
2443 ereport(PANIC,
2444 errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%08X on timeline %u",
2445 newTLI,
2448
2449 /* Looks good */
2450}
2451
2452
2453/*
2454 * Extract timestamp from WAL record.
2455 *
2456 * If the record contains a timestamp, returns true, and saves the timestamp
2457 * in *recordXtime. If the record type has no timestamp, returns false.
2458 * Currently, only transaction commit/abort records and restore points contain
2459 * timestamps.
2460 */
2461static bool
2463{
 /* info holds the record's info bits with the XLR_INFO_MASK bits cleared. */
2464 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2466 uint8 rmid = XLogRecGetRmid(record);
2467
 /* Restore point: the timestamp is the rp_time field of the record data. */
2468 if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2469 {
2470 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2471 return true;
2472 }
 /* Commit-type transaction record: read xact_time from xl_xact_commit. */
2473 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2475 {
2476 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2477 return true;
2478 }
 /* Abort-type transaction record: read xact_time from xl_xact_abort. */
2479 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2481 {
2482 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2483 return true;
2484 }
 /* Any other record: *recordXtime is left untouched. */
2485 return false;
2486}
2487
2488/*
2489 * Checks whether the current buffer page and backup page stored in the
2490 * WAL record are consistent or not. Before comparing the two pages, a
2491 * masking can be applied to the pages to ignore certain areas like hint bits,
2492 * unused space between pd_lower and pd_upper among other things. This
2493 * function should be called once WAL replay has been completed for a
2494 * given record.
2495 */
2496static void
2498{
2500 RelFileLocator rlocator;
2501 ForkNumber forknum;
2502 BlockNumber blkno;
2503 int block_id;
2504
2505 /* Records with no backup blocks have no need for consistency checks. */
2506 if (!XLogRecHasAnyBlockRefs(record))
2507 return;
2508
2510
 /* Each block id up to the record's maximum is examined independently. */
2511 for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2512 {
2513 Buffer buf;
2514 Page page;
2515
2517 &rlocator, &forknum, &blkno, NULL))
2518 {
2519 /*
2520 * WAL record doesn't contain a block reference with the given id.
2521 * Do nothing.
2522 */
2523 continue;
2524 }
2525
2527
2528 if (XLogRecBlockImageApply(record, block_id))
2529 {
2530 /*
2531 * WAL record has already applied the page, so bypass the
2532 * consistency check as that would result in comparing the full
2533 * page stored in the record with itself.
2534 */
2535 continue;
2536 }
2537
2538 /*
2539 * Read the contents from the current buffer and store it in a
2540 * temporary page.
2541 */
2542 buf = XLogReadBufferExtended(rlocator, forknum, blkno,
 /* An invalid buffer is silently skipped rather than reported. */
2545 if (!BufferIsValid(buf))
2546 continue;
2547
2549 page = BufferGetPage(buf);
2550
2551 /*
2552 * Take a copy of the local page where WAL has been applied to have a
2553 * comparison base before masking it...
2554 */
2556
2557 /* No need for this page anymore now that a copy is in. */
2559
2560 /*
2561 * If the block LSN is already ahead of this WAL record, we can't
2562 * expect contents to match. This can happen if recovery is
2563 * restarted.
2564 */
2566 continue;
2567
2568 /*
2569 * Read the contents from the backup copy, stored in WAL record and
2570 * store it in a temporary page. There is no need to allocate a new
2571 * page here, a local buffer is fine to hold its contents and a mask
2572 * can be directly applied on it.
2573 */
 /*
  * On failure the text in record->errormsg_buf is reported at ERROR
  * level.
  */
2575 ereport(ERROR,
2577 errmsg_internal("%s", record->errormsg_buf)));
2578
2579 /*
2580 * If a masking function is defined, mask both the primary and replay
2581 * images.
2582 */
2583 if (rmgr.rm_mask != NULL)
2584 {
2585 rmgr.rm_mask(replay_image_masked, blkno);
2586 rmgr.rm_mask(primary_image_masked, blkno);
2587 }
2588
2589 /* Time to compare the primary and replay images. */
 /*
  * A mismatch is reported at FATAL level, identifying the relation,
  * fork and block number.
  */
2591 {
2592 elog(FATAL,
2593 "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2594 rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2595 forknum, blkno);
2596 }
2597 }
2598}
2599
2600/*
2601 * For point-in-time recovery, this function decides whether we want to
2602 * stop applying the XLOG before the current record.
2603 *
2604 * Returns true if we are stopping, false otherwise. If stopping, some
2605 * information is saved in recoveryStopXid et al for use in annotating the
2606 * new timeline's history file.
2607 */
2608static bool
2610{
2611 bool stopsHere = false;
2613 bool isCommit;
2616
2617 /*
2618 * Ignore recovery target settings when not in archive recovery (meaning
2619 * we are in crash recovery).
2620 */
2622 return false;
2623
2624 /* Check if we should stop as soon as reaching consistency */
2626 {
2627 ereport(LOG,
2628 (errmsg("recovery stopping after reaching consistency")));
2629
2630 recoveryStopAfter = false;
2633 recoveryStopTime = 0;
2634 recoveryStopName[0] = '\0';
2635 return true;
2636 }
2637
2638 /* Check if target LSN has been reached */
2641 record->ReadRecPtr >= recoveryTargetLSN)
2642 {
 /* The stop position recorded is the start of this record. */
2643 recoveryStopAfter = false;
2645 recoveryStopLSN = record->ReadRecPtr;
2646 recoveryStopTime = 0;
2647 recoveryStopName[0] = '\0';
2648 ereport(LOG,
2649 errmsg("recovery stopping before WAL location (LSN) \"%X/%08X\"",
2651 return true;
2652 }
2653
2654 /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2655 if (XLogRecGetRmid(record) != RM_XACT_ID)
2656 return false;
2657
2659
 /*
  * Classify the record as commit or abort and pick the xid to compare.
  * Two branches take the xid from XLogRecGetXid(); the other two parse
  * the record and use parsed.twophase_xid instead. Anything else is not
  * a stop candidate.
  */
2661 {
2662 isCommit = true;
2663 recordXid = XLogRecGetXid(record);
2664 }
2666 {
2669
2670 isCommit = true;
2672 xlrec,
2673 &parsed);
2674 recordXid = parsed.twophase_xid;
2675 }
2676 else if (xact_info == XLOG_XACT_ABORT)
2677 {
2678 isCommit = false;
2679 recordXid = XLogRecGetXid(record);
2680 }
2682 {
2685
2686 isCommit = false;
2688 xlrec,
2689 &parsed);
2690 recordXid = parsed.twophase_xid;
2691 }
2692 else
2693 return false;
2694
2696 {
2697 /*
2698 * There can be only one transaction end record with this exact
2699 * transaction ID.
2700 *
2701 * When testing for an xid, we MUST test for equality only, since
2702 * transactions are numbered in the order they start, not the order
2703 * they complete. A higher-numbered xid will complete before a
2704 * lower-numbered one about 50% of the time...
2705 */
2707 }
2708
2709 /*
2710 * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2711 * We don't expect getRecordTimestamp ever to fail, since we already know
2712 * this is a commit or abort record; but test its result anyway.
2713 */
2714 if (getRecordTimestamp(record, &recordXtime) &&
2716 {
2717 /*
2718 * There can be many transactions that share the same commit time, so
2719 * we stop after the last one, if we are inclusive, or stop at the
2720 * first one if we are exclusive
2721 */
2724 else
2726 }
2727
 /* On a stop, mark it as "before" this record and clear the stop name. */
2728 if (stopsHere)
2729 {
2730 recoveryStopAfter = false;
2734 recoveryStopName[0] = '\0';
2735
2736 if (isCommit)
2737 {
2738 ereport(LOG,
2739 (errmsg("recovery stopping before commit of transaction %u, time %s",
2742 }
2743 else
2744 {
2745 ereport(LOG,
2746 (errmsg("recovery stopping before abort of transaction %u, time %s",
2749 }
2750 }
2751
2752 return stopsHere;
2753}
2754
2755/*
2756 * Same as recoveryStopsBefore, but called after applying the record.
2757 *
2758 * We also track the timestamp of the latest applied COMMIT/ABORT
2759 * record in XLogRecoveryCtl->recoveryLastXTime.
2760 */
2761static bool
2763{
2764 uint8 info;
2766 uint8 rmid;
2768
2769 /*
2770 * Ignore recovery target settings when not in archive recovery (meaning
2771 * we are in crash recovery).
2772 */
2774 return false;
2775
 /* info holds the record's info bits with the XLR_INFO_MASK bits cleared. */
2776 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2777 rmid = XLogRecGetRmid(record);
2778
2779 /*
2780 * There can be many restore points that share the same name; we stop at
2781 * the first one.
2782 */
2784 rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2785 {
2787
2789
2791 {
2792 recoveryStopAfter = true;
2797
2798 ereport(LOG,
2799 (errmsg("recovery stopping at restore point \"%s\", time %s",
2802 return true;
2803 }
2804 }
2805
2806 /* Check if the target LSN has been reached */
2809 record->ReadRecPtr >= recoveryTargetLSN)
2810 {
 /* The stop position recorded is the start of this record. */
2811 recoveryStopAfter = true;
2813 recoveryStopLSN = record->ReadRecPtr;
2814 recoveryStopTime = 0;
2815 recoveryStopName[0] = '\0';
2816 ereport(LOG,
2817 errmsg("recovery stopping after WAL location (LSN) \"%X/%08X\"",
2819 return true;
2820 }
2821
 /* The remaining xid-based check applies to transaction records only. */
2822 if (rmid != RM_XACT_ID)
2823 return false;
2824
2825 xact_info = info & XLOG_XACT_OPMASK;
2826
2827 if (xact_info == XLOG_XACT_COMMIT ||
2831 {
2833
2834 /* Update the last applied transaction timestamp */
2835 if (getRecordTimestamp(record, &recordXtime))
2837
2838 /* Extract the XID of the committed/aborted transaction */
 /*
  * Two branches parse the record and use parsed.twophase_xid; every
  * other record type uses the xid from XLogRecGetXid().
  */
2840 {
2843
2845 xlrec,
2846 &parsed);
2847 recordXid = parsed.twophase_xid;
2848 }
2850 {
2853
2855 xlrec,
2856 &parsed);
2857 recordXid = parsed.twophase_xid;
2858 }
2859 else
2860 recordXid = XLogRecGetXid(record);
2861
2862 /*
2863 * There can be only one transaction end record with this exact
2864 * transaction ID.
2865 *
2866 * When testing for an xid, we MUST test for equality only, since
2867 * transactions are numbered in the order they start, not the order
2868 * they complete. A higher-numbered xid will complete before a
2869 * lower-numbered one about 50% of the time...
2870 */
2873 {
2874 recoveryStopAfter = true;
2878 recoveryStopName[0] = '\0';
2879
2880 if (xact_info == XLOG_XACT_COMMIT ||
2882 {
2883 ereport(LOG,
2884 (errmsg("recovery stopping after commit of transaction %u, time %s",
2887 }
2888 else if (xact_info == XLOG_XACT_ABORT ||
2890 {
2891 ereport(LOG,
2892 (errmsg("recovery stopping after abort of transaction %u, time %s",
2895 }
2896 return true;
2897 }
2898 }
2899
2900 /* Check if we should stop as soon as reaching consistency */
2902 {
2903 ereport(LOG,
2904 (errmsg("recovery stopping after reaching consistency")));
2905
2906 recoveryStopAfter = true;
2908 recoveryStopTime = 0;
2910 recoveryStopName[0] = '\0';
2911 return true;
2912 }
2913
2914 return false;
2915}
2916
2917/*
2918 * Create a comment for the history file to explain why and where
2919 * timeline changed.
2920 */
2921static char *
2923{
 /* Scratch buffer; every snprintf below is bounded by sizeof(reason). */
2924 char reason[200];
2925
 /*
  * Exactly one of the messages below is formatted. The conditions that
  * select between them are missing from this listing.
  *
  * NOTE(review): only the second and third formats end in a newline;
  * confirm whether that difference is intended.
  */
2927 snprintf(reason, sizeof(reason),
2928 "%s transaction %u",
2929 recoveryStopAfter ? "after" : "before",
2932 snprintf(reason, sizeof(reason),
2933 "%s %s\n",
2934 recoveryStopAfter ? "after" : "before",
2937 snprintf(reason, sizeof(reason),
2938 "%s LSN %X/%08X\n",
2939 recoveryStopAfter ? "after" : "before",
2942 snprintf(reason, sizeof(reason),
2943 "at restore point \"%s\"",
2946 snprintf(reason, sizeof(reason), "reached consistency");
2947 else
2948 snprintf(reason, sizeof(reason), "no recovery target specified");
2949
 /* Return a pstrdup'd copy, since reason is a local array. */
2950 return pstrdup(reason);
2951}
2952
2953/*
2954 * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2955 *
2956 * endOfRecovery is true if the recovery target is reached and
2957 * the paused state starts at the end of recovery because of
2958 * recovery_target_action=pause, and false otherwise.
2959 */
2960static void
2962{
2963 /* Don't pause unless users can connect! */
2965 return;
2966
2967 /* Don't pause after standby promotion has been triggered */
2969 return;
2970
 /*
  * Announce the pause at LOG level. The hint depends on endOfRecovery:
  * resuming promotes when pausing at the end of recovery, and continues
  * replay otherwise.
  */
2971 if (endOfRecovery)
2972 ereport(LOG,
2973 (errmsg("pausing at the end of recovery"),
2974 errhint("Execute pg_wal_replay_resume() to promote.")));
2975 else
2976 ereport(LOG,
2977 (errmsg("recovery has paused"),
2978 errhint("Execute pg_wal_replay_resume() to continue.")));
2979
2980 /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2982 {
 /*
  * NOTE(review): the condition guarding this early return is missing
  * from this listing.
  */
2985 return;
2986
2987 /*
2988 * If recovery pause is requested then set it paused. While we are in
2989 * the loop, user might resume and pause again so set this every time.
2990 */
2992
2993 /*
2994 * We wait on a condition variable that will wake us as soon as the
2995 * pause ends, but we use a timeout so we can check the above exit
2996 * condition periodically too.
2997 */
3000 }
3002}
3003
3004/*
3005 * When recovery_min_apply_delay is set, we wait long enough to make sure
3006 * certain record types are applied at least that interval behind the primary.
3007 *
3008 * Returns true if we waited.
3009 *
3010 * Note that the delay is calculated between the WAL record log time and
3011 * the current time on standby. We would prefer to keep track of when this
3012 * standby received each WAL record, which would allow a more consistent
3013 * approach and one not affected by time synchronisation issues, but that
3014 * is significantly more effort and complexity for little actual gain in
3015 * usability.
3016 */
3017static bool
3019{
3023 long msecs;
3024
 /*
  * Every early exit below returns false (no wait happened). Once the wait
  * loop has been entered the function returns true.
  */
3025 /* nothing to do if no delay configured */
3026 if (recovery_min_apply_delay <= 0)
3027 return false;
3028
3029 /* no delay is applied on a database not yet consistent */
3030 if (!reachedConsistency)
3031 return false;
3032
3033 /* nothing to do if crash recovery is requested */
3035 return false;
3036
3037 /*
3038 * Is it a COMMIT record?
3039 *
3040 * We deliberately choose not to delay aborts since they have no effect on
3041 * MVCC. We already allow replay of records that don't have a timestamp,
3042 * so there is already opportunity for issues caused by early conflicts on
3043 * standbys.
3044 */
3045 if (XLogRecGetRmid(record) != RM_XACT_ID)
3046 return false;
3047
3049
3050 if (xact_info != XLOG_XACT_COMMIT &&
3052 return false;
3053
 /* Without a timestamp there is nothing to measure the delay against. */
3054 if (!getRecordTimestamp(record, &xtime))
3055 return false;
3056
3058
3059 /*
3060 * Exit without arming the latch if it's already past time to apply this
3061 * record
3062 */
3064 if (msecs <= 0)
3065 return false;
3066
3067 while (true)
3068 {
3070
3071 /* This might change recovery_min_apply_delay. */
3073
3075 break;
3076
3077 /*
3078 * Recalculate delayUntil as recovery_min_apply_delay could have
3079 * changed while waiting in this loop.
3080 */
3082
3083 /*
3084 * Wait for difference between GetCurrentTimestamp() and delayUntil.
3085 */
3087 delayUntil);
3088
 /* The deadline has passed: stop waiting. */
3089 if (msecs <= 0)
3090 break;
3091
3092 elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3093
3096 msecs,
3098 }
 /* Reaching this point means the wait loop was entered. */
3099 return true;
3100}
3101
3102/*
3103 * Get the current state of the recovery pause request.
3104 */
3116
3117/*
3118 * Set the recovery pause state.
3119 *
3120 * If recovery pause is requested then sets the recovery pause state to
3121 * 'pause requested' if it is not already 'paused'. Otherwise, sets it
3122 * to 'not paused' to resume the recovery. The recovery pause will be
3123 * confirmed by ConfirmRecoveryPaused().
3124 */
3125void
3140
3141/*
3142 * Confirm the recovery pause by setting the recovery pause state to
3143 * RECOVERY_PAUSED.
3144 */
3145static void
3154
3155
3156/*
3157 * Attempt to read the next XLOG record.
3158 *
3159 * Before first call, the reader needs to be positioned to the first record
3160 * by calling XLogPrefetcherBeginRead().
3161 *
3162 * If no valid record is available, returns NULL, or fails if emode is PANIC.
3163 * (emode must be either PANIC or LOG). In standby mode, retries until a valid
3164 * record is available.
3165 */
3166static XLogRecord *
3168 bool fetching_ckpt, TimeLineID replayTLI)
3169{
3170 XLogRecord *record;
3173
3175
3176 /* Pass through parameters to XLogPageRead */
3177 private->fetching_ckpt = fetching_ckpt;
3178 private->emode = emode;
 /* randAccess is set when the reader has no valid ReadRecPtr yet. */
3179 private->randAccess = !XLogRecPtrIsValid(xlogreader->ReadRecPtr);
3180 private->replayTLI = replayTLI;
3181
3182 /* This is the first attempt to read this page. */
3183 lastSourceFailed = false;
3184
 /*
  * Loop until a record is returned or a NULL return gives up; the
  * "continue" statements below retry the read.
  */
3185 for (;;)
3186 {
3187 char *errormsg;
3188
3189 record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3190 if (record == NULL)
3191 {
3192 /*
3193 * When we find that WAL ends in an incomplete record, keep track
3194 * of that record. After recovery is done, we'll write a record
3195 * to indicate to downstream WAL readers that that portion is to
3196 * be ignored.
3197 *
3198 * However, when ArchiveRecoveryRequested = true, we're going to
3199 * switch to a new timeline at the end of recovery. We will only
3200 * copy WAL over to the new timeline up to the end of the last
3201 * complete record, so if we did this, we would later create an
3202 * overwrite contrecord in the wrong place, breaking everything.
3203 */
3206 {
3209 }
3210
 /* Close the currently open segment file, if any. */
3211 if (readFile >= 0)
3212 {
3213 close(readFile);
3214 readFile = -1;
3215 }
3216
3217 /*
3218 * We only end up here without a message when XLogPageRead()
3219 * failed - in that case we already logged something. In
3220 * StandbyMode that only happens if we have been triggered, so we
3221 * shouldn't loop anymore in that case.
3222 */
3223 if (errormsg)
3225 (errmsg_internal("%s", errormsg) /* already translated */ ));
3226 }
3227
3228 /*
3229 * Check page TLI is one of the expected values.
3230 */
 /*
  * A record failing this check is discarded (record = NULL) and then
  * handled like any other read failure below.
  */
3232 {
3233 char fname[MAXFNAMELEN];
3234 XLogSegNo segno;
3235 int32 offset;
3236
3240 XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3243 errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%08X, offset %u",
3245 fname,
3247 offset));
3248 record = NULL;
3249 }
3250
3251 if (record)
3252 {
3253 /* Great, got a record */
3254 return record;
3255 }
3256 else
3257 {
3258 /* No valid record available from this source */
3259 lastSourceFailed = true;
3260
3261 /*
3262 * If archive recovery was requested, but we were still doing
3263 * crash recovery, switch to archive recovery and retry using the
3264 * offline archive. We have now replayed all the valid WAL in
3265 * pg_wal, so we are presumably now consistent.
3266 *
3267 * We require that there's at least some valid WAL present in
3268 * pg_wal, however (!fetching_ckpt). We could recover using the
3269 * WAL from the archive, even if pg_wal is completely empty, but
3270 * we'd have no idea how far we'd have to replay to reach
3271 * consistency. So err on the safe side and give up.
3272 */
3274 !fetching_ckpt)
3275 {
3277 (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3278 InArchiveRecovery = true;
3281
3284 minRecoveryPointTLI = replayTLI;
3285
3287
3288 /*
3289 * Before we retry, reset lastSourceFailed and currentSource
3290 * so that we will check the archive next.
3291 */
3292 lastSourceFailed = false;
3294
3295 continue;
3296 }
3297
3298 /* In standby mode, loop back to retry. Otherwise, give up. */
3300 continue;
3301 else
3302 return NULL;
3303 }
3304 }
3305}
3306
3307/*
3308 * Read the XLOG page containing targetPagePtr into readBuf (if not read
3309 * already). Returns number of bytes read, if the page is read successfully,
3310 * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3311 * but only if they have not been previously reported.
3312 *
3313 * See XLogReaderRoutine.page_read for more details.
3314 *
3315 * While prefetching, xlogreader->nonblocking may be set. In that case,
3316 * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3317 *
3318 * This is responsible for restoring files from archive as needed, as well
3319 * as for waiting for the requested WAL record to arrive in standby mode.
3320 *
3321 * xlogreader->private_data->emode specifies the log level used for reporting
3322 * "file not found" or "end of WAL" situations in archive recovery, or in
3323 * standby mode when promotion is triggered. If set to WARNING or below,
3324 * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
3325 * levels the ereport() won't return.
3326 *
3327 * In standby mode, if after a successful return of XLogPageRead() the
3328 * caller finds the record it's interested in to be broken, it should
3329 * ereport the error with the level determined by
3330 * emode_for_corrupt_record(), and then set lastSourceFailed
3331 * and call XLogPageRead() again with the same arguments. This lets
3332 * XLogPageRead() try fetching the record from another source, or sleep
3333 * and retry.
3334 */
3335static int
3337 XLogRecPtr targetRecPtr, char *readBuf)
3338{
3339 XLogPageReadPrivate *private =
3341 int emode = private->emode;
3344 int r;
3346
3348
3351
3352 /*
3353 * See if we need to switch to a new segment because the requested record
3354 * is not in the currently open one.
3355 */
3356 if (readFile >= 0 &&
3358 {
3359 /*
3360 * Request a restartpoint if we've replayed too much xlog since the
3361 * last one.
3362 */
3364 {
3366 {
3367 (void) GetRedoRecPtr();
3370 }
3371 }
3372
3373 close(readFile);
3374 readFile = -1;
3376 }
3377
3379
 /* Re-entered from the failure path at the bottom when in standby mode. */
3380retry:
3381 /* See if we need to retrieve more data */
3382 if (readFile < 0 ||
3385 {
3386 if (readFile >= 0 &&
3390 return XLREAD_WOULDBLOCK;
3391
 /*
  * The switch below acts on an XLREAD_* result: WOULDBLOCK is passed
  * straight back, FAIL closes the file and resets the read state
  * before returning, and SUCCESS falls through to the page read.
  */
3393 private->randAccess,
3394 private->fetching_ckpt,
3396 private->replayTLI,
3399 {
3400 case XLREAD_WOULDBLOCK:
3401 return XLREAD_WOULDBLOCK;
3402 case XLREAD_FAIL:
3403 if (readFile >= 0)
3404 close(readFile);
3405 readFile = -1;
3406 readLen = 0;
3408 return XLREAD_FAIL;
3409 case XLREAD_SUCCESS:
3410 break;
3411 }
3412 }
3413
3414 /*
3415 * At this point, we have the right segment open and if we're streaming we
3416 * know the requested record is in it.
3417 */
3418 Assert(readFile != -1);
3419
3420 /*
3421 * If the current segment is being streamed from the primary, calculate
3422 * how much of the current page we have received already. We know the
3423 * requested record has been received, but this is for the benefit of
3424 * future calls, to allow quick exit at the top of this function.
3425 */
3427 {
3430 else
3433 }
3434 else
3436
3437 /* Read the requested page */
3439
3440 /* Measure I/O timing when reading segment */
3442
 /* Anything other than a full XLOG_BLCKSZ read is treated as a failure. */
3444 r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (pgoff_t) readOff);
3445 if (r != XLOG_BLCKSZ)
3446 {
3447 char fname[MAXFNAMELEN];
 /*
  * Capture errno right away; it is restored just before the %m
  * report so the calls in between cannot clobber it.
  */
3448 int save_errno = errno;
3449
3451
3453 io_start, 1, r);
3454
3456 if (r < 0)
3457 {
3458 errno = save_errno;
3461 errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: %m",
3463 readOff)));
3464 }
3465 else
 /* Short read: report how many bytes arrived out of XLOG_BLCKSZ. */
3468 errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: read %d of %zu",
3470 readOff, r, (Size) XLOG_BLCKSZ)));
3472 }
3474
3476 io_start, 1, r);
3477
3480 Assert(reqLen <= readLen);
3481
3483
3484 /*
3485 * Check the page header immediately, so that we can retry immediately if
3486 * it's not valid. This may seem unnecessary, because ReadPageInternal()
3487 * validates the page header anyway, and would propagate the failure up to
3488 * ReadRecord(), which would retry. However, there's a corner case with
3489 * continuation records, if a record is split across two pages such that
3490 * we would need to read the two pages from different sources across two
3491 * WAL segments.
3492 *
3493 * The first page is only available locally, in pg_wal, because it's
3494 * already been recycled on the primary. The second page, however, is not
3495 * present in pg_wal, and we should stream it from the primary. There is a
3496 * recycled WAL segment present in pg_wal, with garbage contents, however.
3497 * We would read the first page from the local WAL segment, but when
3498 * reading the second page, we would read the bogus, recycled, WAL
3499 * segment. If we didn't catch that case here, we would never recover,
3500 * because ReadRecord() would retry reading the whole record from the
3501 * beginning.
3502 *
3503 * Of course, this only catches errors in the page header, which is what
3504 * happens in the case of a recycled WAL segment. Other kinds of errors or
3505 * corruption still have the same problem. But this at least fixes the
3506 * common case, which can happen as part of normal operation.
3507 *
3508 * Validating the page header is cheap enough that doing it twice
3509 * shouldn't be a big deal from a performance point of view.
3510 *
3511 * When not in standby mode, an invalid page header should cause recovery
3512 * to end, not retry reading the page, so we don't need to validate the
3513 * page header here for the retry. Instead, ReadPageInternal() is
3514 * responsible for the validation.
3515 */
3516 if (StandbyMode &&
3519 {
3520 /*
3521 * Emit this error right now then retry this page immediately. Use
3522 * errmsg_internal() because the message was already translated.
3523 */
3524 if (xlogreader->errormsg_buf[0])
3527
3528 /* reset any error XLogReaderValidatePageHeader() might have set */
3531 }
3532
 /* Success: report how many bytes of the page are available. */
3533 return readLen;
3534
3536
3537 /*
3538 * If we're reading ahead, give up fast. Retries and error reporting will
3539 * be handled by a later read when recovery catches up to this point.
3540 */
3542 return XLREAD_WOULDBLOCK;
3543
 /*
  * Failure path: mark the current source as failed and drop the open
  * file and read state before retrying or giving up.
  */
3544 lastSourceFailed = true;
3545
3546 if (readFile >= 0)
3547 close(readFile);
3548 readFile = -1;
3549 readLen = 0;
3551
3552 /* In standby-mode, keep trying */
3553 if (StandbyMode)
3554 goto retry;
3555 else
3556 return XLREAD_FAIL;
3557}
3558
3559/*
3560 * Open the WAL segment containing WAL location 'RecPtr'.
3561 *
3562 * The segment can be fetched via restore_command, or via walreceiver having
3563 * streamed the record, or it can already be present in pg_wal. Checking
3564 * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3565 * too, in case someone copies a new segment directly to pg_wal. That is not
3566 * documented or recommended, though.
3567 *
3568 * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3569 * prepare to read WAL starting from RedoStartLSN after this.
3570 *
3571 * 'RecPtr' might not point to the beginning of the record we're interested
3572 * in, it might also point to the page or segment header. In that case,
3573 * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3574 * used to decide which timeline to stream the requested WAL from.
3575 *
3576 * 'replayLSN' is the current replay LSN, so that if we scan for new
3577 * timelines, we can reject a switch to a timeline that branched off before
3578 * this point.
3579 *
3580 * If the record is not immediately available, the function returns XLREAD_FAIL
3581 * if we're not in standby mode. In standby mode, the function waits for it to
3582 * become available.
3583 *
3584 * When the requested record becomes available, the function opens the file
3585 * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3586 * of standby mode is triggered by the user, and there is no more WAL
3587 * available, returns XLREAD_FAIL.
3588 *
3589 * If nonblocking is true, then give up immediately if we can't satisfy the
3590 * request, returning XLREAD_WOULDBLOCK instead of waiting.
3591 */
3592static XLogPageReadResult
3594 bool fetching_ckpt, XLogRecPtr tliRecPtr,
3595 TimeLineID replayTLI, XLogRecPtr replayLSN,
3596 bool nonblocking)
3597{
3598 static TimestampTz last_fail_time = 0;
3600 bool streaming_reply_sent = false;
3601
3602 /*-------
3603 * Standby mode is implemented by a state machine:
3604 *
3605 * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3606 * pg_wal (XLOG_FROM_PG_WAL)
3607 * 2. Check for promotion trigger request
3608 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3609 * 4. Rescan timelines
3610 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3611 *
3612 * Failure to read from the current source advances the state machine to
3613 * the next state.
3614 *
3615 * 'currentSource' indicates the current state. There are no currentSource
3616 * values for "check trigger", "rescan timelines", and "sleep" states,
3617 * those actions are taken when reading from the previous source fails, as
3618 * part of advancing to the next state.
3619 *
3620 * If standby mode is turned off while reading WAL from stream, we move
3621 * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3622 * the files (which would be required at end of recovery, e.g., timeline
3623 * history file) from archive or pg_wal. We don't need to kill WAL receiver
3624 * here because it's already stopped when standby mode is turned off at
3625 * the end of recovery.
3626 *-------
3627 */
3628 if (!InArchiveRecovery)
3630 else if (currentSource == XLOG_FROM_ANY ||
3632 {
3633 lastSourceFailed = false;
3635 }
3636
3637 for (;;)
3638 {
3640 bool startWalReceiver = false;
3641
3642 /*
3643 * First check if we failed to read from the current source, and
3644 * advance the state machine if so. The failure to read might've
3645 * happened outside this function, e.g when a CRC check fails on a
3646 * record, or within this loop.
3647 */
3648 if (lastSourceFailed)
3649 {
3650 /*
3651 * Don't allow any retry loops to occur during nonblocking
3652 * readahead. Let the caller process everything that has been
3653 * decoded already first.
3654 */
3655 if (nonblocking)
3656 return XLREAD_WOULDBLOCK;
3657
3658 switch (currentSource)
3659 {
3660 case XLOG_FROM_ARCHIVE:
3661 case XLOG_FROM_PG_WAL:
3662
3663 /*
3664 * Check to see if promotion is requested. Note that we do
3665 * this only after failure, so when you promote, we still
3666 * finish replaying as much as we can from archive and
3667 * pg_wal before failover.
3668 */
3670 {
3672 return XLREAD_FAIL;
3673 }
3674
3675 /*
3676 * Not in standby mode, and we've now tried the archive
3677 * and pg_wal.
3678 */
3679 if (!StandbyMode)
3680 return XLREAD_FAIL;
3681
3682 /*
3683 * Move to XLOG_FROM_STREAM state, and set to start a
3684 * walreceiver if necessary.
3685 */
3687 startWalReceiver = true;
3688 break;
3689
3690 case XLOG_FROM_STREAM:
3691
3692 /*
3693 * Failure while streaming. Most likely, we got here
3694 * because streaming replication was terminated, or
3695 * promotion was triggered. But we also get here if we
3696 * find an invalid record in the WAL streamed from the
3697 * primary, in which case something is seriously wrong.
3698 * There's little chance that the problem will just go
3699 * away, but PANIC is not good for availability either,
3700 * especially in hot standby mode. So, we treat that the
3701 * same as disconnection, and retry from archive/pg_wal
3702 * again. The WAL in the archive should be identical to
3703 * what was streamed, so it's unlikely that it helps, but
3704 * one can hope...
3705 */
3706
3707 /*
3708 * We should be able to move to XLOG_FROM_STREAM only in
3709 * standby mode.
3710 */
3712
3713 /*
3714 * Before we leave XLOG_FROM_STREAM state, make sure that
3715 * walreceiver is not active, so that it won't overwrite
3716 * WAL that we restore from archive.
3717 *
3718 * If walreceiver is actively streaming (or attempting to
3719 * connect), we must shut it down. However, if it's
3720 * already in WAITING state (e.g., due to timeline
3721 * divergence), we only need to reset the install flag to
3722 * allow archive restoration.
3723 */
3724 if (WalRcvStreaming())
3726 else
3727 {
3728 /*
3729 * WALRCV_STOPPING state is a transient state while
3730 * the startup process is in ShutdownWalRcv(). It
3731 * should never appear here since we would be waiting
3732 * for the walreceiver to reach WALRCV_STOPPED in that
3733 * case.
3734 */
3737 }
3738
3739 /*
3740 * Before we sleep, re-scan for possible new timelines if
3741 * we were requested to recover to the latest timeline.
3742 */
3744 {
3745 if (rescanLatestTimeLine(replayTLI, replayLSN))
3746 {
3748 break;
3749 }
3750 }
3751
3752 /*
3753 * XLOG_FROM_STREAM is the last state in our state
3754 * machine, so we've exhausted all the options for
3755 * obtaining the requested WAL. We're going to loop back
3756 * and retry from the archive, but if it hasn't been long
3757 * since last attempt, sleep wal_retrieve_retry_interval
3758 * milliseconds to avoid busy-waiting.
3759 */
3763 {
3764 long wait_time;
3765
3768
3769 elog(LOG, "waiting for WAL to become available at %X/%08X",
3771
3772 /* Do background tasks that might benefit us later. */
3774
3778 wait_time,
3782
3783 /* Handle interrupt signals of startup process */
3785 }
3788 break;
3789
3790 default:
3791 elog(ERROR, "unexpected WAL source %d", currentSource);
3792 }
3793 }
3794 else if (currentSource == XLOG_FROM_PG_WAL)
3795 {
3796 /*
3797 * We just successfully read a file in pg_wal. We prefer files in
3798 * the archive over ones in pg_wal, so try the next file again
3799 * from the archive first.
3800 */
3803 }
3804
3805 if (currentSource != oldSource)
3806 elog(DEBUG2, "switched WAL source from %s to %s after %s",
3808 lastSourceFailed ? "failure" : "success");
3809
3810 /*
3811 * We've now handled possible failure. Try to read from the chosen
3812 * source.
3813 */
3814 lastSourceFailed = false;
3815
3816 switch (currentSource)
3817 {
3818 case XLOG_FROM_ARCHIVE:
3819 case XLOG_FROM_PG_WAL:
3820
3821 /*
3822 * WAL receiver must not be running when reading WAL from
3823 * archive or pg_wal.
3824 */
3826
3827 /* Close any old file we might have open. */
3828 if (readFile >= 0)
3829 {
3830 close(readFile);
3831 readFile = -1;
3832 }
3833 /* Reset curFileTLI if random fetch. */
3834 if (randAccess)
3835 curFileTLI = 0;
3836
3837 /*
3838 * Try to restore the file from archive, or read an existing
3839 * file from pg_wal.
3840 */
3844 if (readFile >= 0)
3845 return XLREAD_SUCCESS; /* success! */
3846
3847 /*
3848 * Nope, not found in archive or pg_wal.
3849 */
3850 lastSourceFailed = true;
3851 break;
3852
3853 case XLOG_FROM_STREAM:
3854 {
3855 bool havedata;
3856
3857 /*
3858 * We should be able to move to XLOG_FROM_STREAM only in
3859 * standby mode.
3860 */
3862
3863 /*
3864 * First, shutdown walreceiver if its restart has been
3865 * requested -- but no point if we're already slated for
3866 * starting it.
3867 */
3869 {
3871
3872 /*
3873 * Re-scan for possible new timelines if we were
3874 * requested to recover to the latest timeline.
3875 */
3878 rescanLatestTimeLine(replayTLI, replayLSN);
3879
3880 startWalReceiver = true;
3881 }
3882 pendingWalRcvRestart = false;
3883
3884 /*
3885 * Launch walreceiver if needed.
3886 *
3887 * If fetching_ckpt is true, RecPtr points to the initial
3888 * checkpoint location. In that case, we use RedoStartLSN
3889 * as the streaming start position instead of RecPtr, so
3890 * that when we later jump backwards to start redo at
3891 * RedoStartLSN, we will have the logs streamed already.
3892 */
3893 if (startWalReceiver &&
3895 {
3896 XLogRecPtr ptr;
3897 TimeLineID tli;
3898
3899 if (fetching_ckpt)
3900 {
3901 ptr = RedoStartLSN;
3902 tli = RedoStartTLI;
3903 }
3904 else
3905 {
3906 ptr = RecPtr;
3907
3908 /*
3909 * Use the record begin position to determine the
3910 * TLI, rather than the position we're reading.
3911 */
3913
3914 if (curFileTLI > 0 && tli < curFileTLI)
3915 elog(ERROR, "according to history file, WAL location %X/%08X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3917 tli, curFileTLI);
3918 }
3919 curFileTLI = tli;
3925 }
3926
3927 /*
3928 * Check if WAL receiver is active or wait to start up.
3929 */
3930 if (!WalRcvStreaming())
3931 {
3932 lastSourceFailed = true;
3933 break;
3934 }
3935
3936 /*
3937 * Walreceiver is active, so see if new data has arrived.
3938 *
3939 * We only advance XLogReceiptTime when we obtain fresh
3940 * WAL from walreceiver and observe that we had already
3941 * processed everything before the most recent "chunk"
3942 * that it flushed to disk. In steady state where we are
3943 * keeping up with the incoming data, XLogReceiptTime will
3944 * be updated on each cycle. When we are behind,
3945 * XLogReceiptTime will not advance, so the grace time
3946 * allotted to conflicting queries will decrease.
3947 */
3948 if (RecPtr < flushedUpto)
3949 havedata = true;
3950 else
3951 {
3952 XLogRecPtr latestChunkStart;
3953
3954 flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3956 {
3957 havedata = true;
3958 if (latestChunkStart <= RecPtr)
3959 {
3962 }
3963 }
3964 else
3965 havedata = false;
3966 }
3967 if (havedata)
3968 {
3969 /*
3970 * Great, streamed far enough. Open the file if it's
3971 * not open already. Also read the timeline history
3972 * file if we haven't initialized timeline history
3973 * yet; it should be streamed over and present in
3974 * pg_wal by now. Use XLOG_FROM_STREAM so that source
3975 * info is set correctly and XLogReceiptTime isn't
3976 * changed.
3977 *
3978 * NB: We must set readTimeLineHistory based on
3979 * recoveryTargetTLI, not receiveTLI. Normally they'll
3980 * be the same, but if recovery_target_timeline is
3981 * 'latest' and archiving is configured, then it's
3982 * possible that we managed to retrieve one or more
3983 * new timeline history files from the archive,
3984 * updating recoveryTargetTLI.
3985 */
3986 if (readFile < 0)
3987 {
3988 if (!expectedTLEs)
3991 XLOG_FROM_STREAM, false);
3992 Assert(readFile >= 0);
3993 }
3994 else
3995 {
3996 /* just make sure source info is correct... */
3999 return XLREAD_SUCCESS;
4000 }
4001 break;
4002 }
4003
4004 /* In nonblocking mode, return rather than sleeping. */
4005 if (nonblocking)
4006 return XLREAD_WOULDBLOCK;
4007
4008 /*
4009 * Data not here yet. Check for trigger, then wait for
4010 * walreceiver to wake us up when new WAL arrives.
4011 */
4013 {
4014 /*
4015 * Note that we don't return XLREAD_FAIL immediately
4016 * here. After being triggered, we still want to
4017 * replay all the WAL that was already streamed. It's
4018 * in pg_wal now, so we just treat this as a failure,
4019 * and the state machine will move on to replay the
4020 * streamed WAL from pg_wal, and then recheck the
4021 * trigger and exit replay.
4022 */
4023 lastSourceFailed = true;
4024 break;
4025 }
4026
4027 /*
4028 * Since we have replayed everything we have received so
4029 * far and are about to start waiting for more WAL, let's
4030 * tell the upstream server our replay location now so
4031 * that pg_stat_replication doesn't show stale
4032 * information.
4033 */
4035 {
4037 streaming_reply_sent = true;
4038 }
4039
4040 /* Do any background tasks that might benefit us later. */
4042
4043 /* Update pg_stat_recovery_prefetch before sleeping. */
4045
4046 /*
4047 * Wait for more WAL to arrive, when we will be woken
4048 * immediately by the WAL receiver.
4049 */
4052 -1L,
4055 break;
4056 }
4057
4058 default:
4059 elog(ERROR, "unexpected WAL source %d", currentSource);
4060 }
4061
4062 /*
4063 * Check for recovery pause here so that we can confirm more quickly
4064 * that a requested pause has actually taken effect.
4065 */
4066 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
4068 recoveryPausesHere(false);
4069
4070 /*
4071 * This possibly-long loop needs to handle interrupts of startup
4072 * process.
4073 */
4075 }
4076
4077 return XLREAD_FAIL; /* not reached */
4078}
4079
4080
4081/*
4082 * Determine what log level should be used to report a corrupt WAL record
4083 * in the current WAL page, previously read by XLogPageRead().
4084 *
4085 * 'emode' is the error mode that would be used to report a file-not-found
4086 * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4087 * we're retrying the exact same record that we've tried previously, only
4088 * complain the first time to keep the noise down. However, we only do so when
4089 * reading from pg_wal, because we don't expect any invalid records in archive
4090 * or in records streamed from the primary. Files in the archive should be complete,
4091 * and we should never hit the end of WAL because we stop and wait for more WAL
4092 * to arrive before replaying it.
4093 *
4094 * NOTE: This function remembers the RecPtr value it was last called with,
4095 * to suppress repeated messages about the same record. Only call this when
4096 * you are about to ereport(), or you might cause a later message to be
4097 * erroneously suppressed.
4098 */
4099static int
4101{
4103
4104 if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
4105 {
4106 if (RecPtr == lastComplaint)
4107 emode = DEBUG1;
4108 else
4110 }
4111 return emode;
4112}
4113
4114
4115/*
4116 * Subroutine to try to fetch and validate a prior checkpoint record.
4117 */
4118static XLogRecord *
4120 TimeLineID replayTLI)
4121{
4122 XLogRecord *record;
4123 uint8 info;
4124
4126
4127 if (!XRecOffIsValid(RecPtr))
4128 {
4129 ereport(LOG,
4130 (errmsg("invalid checkpoint location")));
4131 return NULL;
4132 }
4133
4135 record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4136
4137 if (record == NULL)
4138 {
4139 ereport(LOG,
4140 (errmsg("invalid checkpoint record")));
4141 return NULL;
4142 }
4143 if (record->xl_rmid != RM_XLOG_ID)
4144 {
4145 ereport(LOG,
4146 (errmsg("invalid resource manager ID in checkpoint record")));
4147 return NULL;
4148 }
4149 info = record->xl_info & ~XLR_INFO_MASK;
4150 if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4151 info != XLOG_CHECKPOINT_ONLINE)
4152 {
4153 ereport(LOG,
4154 (errmsg("invalid xl_info in checkpoint record")));
4155 return NULL;
4156 }
4158 {
4159 ereport(LOG,
4160 (errmsg("invalid length of checkpoint record")));
4161 return NULL;
4162 }
4163 return record;
4164}
4165
4166/*
4167 * Scan for new timelines that might have appeared in the archive since we
4168 * started recovery.
4169 *
4170 * If there are any, the function changes recovery target TLI to the latest
4171 * one and returns 'true'.
4172 */
4173static bool
4175{
4177 bool found;
4178 ListCell *cell;
4182
4185 {
4186 /* No new timelines found */
4187 return false;
4188 }
4189
4190 /*
4191 * Determine the list of expected TLIs for the new TLI
4192 */
4193
4195
4196 /*
4197 * If the current timeline is not part of the history of the new timeline,
4198 * we cannot proceed to it.
4199 */
4200 found = false;
4201 foreach(cell, newExpectedTLEs)
4202 {
4204
4205 if (currentTle->tli == recoveryTargetTLI)
4206 {
4207 found = true;
4208 break;
4209 }
4210 }
4211 if (!found)
4212 {
4213 ereport(LOG,
4214 (errmsg("new timeline %u is not a child of database system timeline %u",
4215 newtarget,
4216 replayTLI)));
4217 return false;
4218 }
4219
4220 /*
4221 * The current timeline was found in the history file, but check that the
4222 * next timeline was forked off from it *after* the current recovery
4223 * location.
4224 */
4225 if (currentTle->end < replayLSN)
4226 {
4227 ereport(LOG,
4228 errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%08X",
4229 newtarget,
4230 replayTLI,
4232 return false;
4233 }
4234
4235 /* The new timeline history seems valid. Switch target */
4239
4240 /*
4241 * As in StartupXLOG(), try to ensure we have all the history files
4242 * between the old target and new target in pg_wal.
4243 */
4245
4246 ereport(LOG,
4247 (errmsg("new target timeline is %u",
4249
4250 return true;
4251}
4252
4253
4254/*
4255 * Open a logfile segment for reading (during recovery).
4256 *
4257 * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4258 * Otherwise, it's assumed to be already available in pg_wal.
4259 */
4260static int
4263{
4264 char xlogfname[MAXFNAMELEN];
4265 char activitymsg[MAXFNAMELEN + 16];
4266 char path[MAXPGPATH];
4267 int fd;
4268
4270
4271 switch (source)
4272 {
4273 case XLOG_FROM_ARCHIVE:
4274 /* Report recovery progress in PS display */
4275 snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4276 xlogfname);
4278
4279 if (!RestoreArchivedFile(path, xlogfname,
4280 "RECOVERYXLOG",
4282 InRedo))
4283 return -1;
4284 break;
4285
4286 case XLOG_FROM_PG_WAL:
4287 case XLOG_FROM_STREAM:
4288 XLogFilePath(path, tli, segno, wal_segment_size);
4289 break;
4290
4291 default:
4292 elog(ERROR, "invalid XLogFileRead source %d", source);
4293 }
4294
4295 /*
4296 * If the segment was fetched from archival storage, replace the existing
4297 * xlog segment (if any) with the archival version.
4298 */
4300 {
4303
4304 /*
4305 * Set path to point at the new file in pg_wal.
4306 */
4307 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4308 }
4309
4311 if (fd >= 0)
4312 {
4313 /* Success! */
4314 curFileTLI = tli;
4315
4316 /* Report recovery progress in PS display */
4317 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4318 xlogfname);
4320
4321 /* Track source of data in assorted state variables */
4324 /* In FROM_STREAM case, caller tracks receipt time, not me */
4325 if (source != XLOG_FROM_STREAM)
4327
4328 return fd;
4329 }
4330 if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4331 ereport(PANIC,
4333 errmsg("could not open file \"%s\": %m", path)));
4334 return -1;
4335}
4336
4337/*
4338 * Open a logfile segment for reading (during recovery).
4339 *
4340 * This version searches for the segment with any TLI listed in expectedTLEs.
4341 */
4342static int
4344{
4345 char path[MAXPGPATH];
4346 ListCell *cell;
4347 int fd;
4348 List *tles;
4349
4350 /*
4351 * Loop looking for a suitable timeline ID: we might need to read any of
4352 * the timelines listed in expectedTLEs.
4353 *
4354 * We expect curFileTLI on entry to be the TLI of the preceding file in
4355 * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4356 * to go backwards; this prevents us from picking up the wrong file when a
4357 * parent timeline extends to higher segment numbers than the child we
4358 * want to read.
4359 *
4360 * If we haven't read the timeline history file yet, read it now, so that
4361 * we know which TLIs to scan. We don't save the list in expectedTLEs,
4362 * however, unless we actually find a valid segment. That way if there is
4363 * neither a timeline history file nor a WAL segment in the archive, and
4364 * streaming replication is set up, we'll read the timeline history file
4365 * streamed from the primary when we start streaming, instead of
4366 * recovering with a dummy history generated here.
4367 */
4368 if (expectedTLEs)
4370 else
4372
4373 foreach(cell, tles)
4374 {
4376 TimeLineID tli = hent->tli;
4377
4378 if (tli < curFileTLI)
4379 break; /* don't bother looking at too-old TLIs */
4380
4381 /*
4382 * Skip scanning the timeline ID that the logfile segment to read
4383 * doesn't belong to
4384 */
4385 if (XLogRecPtrIsValid(hent->begin))
4386 {
4387 XLogSegNo beginseg = 0;
4388
4390
4391 /*
4392 * The logfile segment that doesn't belong to the timeline is
4393 * older or newer than the segment that the timeline started or
4394 * ended at, respectively. It's sufficient to check only the
4395 * starting segment of the timeline here. Since the timelines are
4396 * scanned in descending order in this loop, any segments newer
4397 * than the ending segment should belong to newer timeline and
4398 * have already been read before. So it's not necessary to check
4399 * the ending segment of the timeline here.
4400 */
4401 if (segno < beginseg)
4402 continue;
4403 }
4404
4406 {
4407 fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
4408 if (fd != -1)
4409 {
4410 elog(DEBUG1, "got WAL segment from archive");
4411 if (!expectedTLEs)
4413 return fd;
4414 }
4415 }
4416
4418 {
4419 fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
4420 if (fd != -1)
4421 {
4422 if (!expectedTLEs)
4424 return fd;
4425 }
4426 }
4427 }
4428
4429 /* Couldn't find it. For simplicity, complain about front timeline */
4431 errno = ENOENT;
4434 errmsg("could not open file \"%s\": %m", path)));
4435 return -1;
4436}
4437
4438/*
4439 * Set flag to signal the walreceiver to restart. (The startup process calls
4440 * this on noticing a relevant configuration change.)
4441 */
4442void
4444{
4446 {
4447 ereport(LOG,
4448 (errmsg("WAL receiver process shutdown requested")));
4449
4450 pendingWalRcvRestart = true;
4451 }
4452}
4453
4454
4455/*
4456 * Has a standby promotion already been triggered?
4457 *
4458 * Unlike CheckForStandbyTrigger(), this works in any process
4459 * that's connected to shared memory.
4460 */
4461bool
4463{
4464 /*
4465 * We check shared state each time only until a standby promotion is
4466 * triggered. We can't trigger a promotion again, so there's no need to
4467 * keep checking after the shared variable has once been seen true.
4468 */
4470 return true;
4471
4475
4477}
4478
4479static void
4481{
4485
4486 /*
4487 * Mark the recovery pause state as 'not paused' because the paused state
4488 * ends and promotion continues if a promotion is triggered while recovery
4489 * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4490 * return 'paused' while a promotion is ongoing.
4491 */
4492 SetRecoveryPause(false);
4493
4495}
4496
4497/*
4498 * Check whether a promote request has arrived.
4499 */
4500static bool
4502{
4504 return true;
4505
4507 {
4508 ereport(LOG, (errmsg("received promote request")));
4512 return true;
4513 }
4514
4515 return false;
4516}
4517
4518/*
4519 * Remove the files signaling a standby promotion request.
4520 */
4521void
4526
4527/*
4528 * Check to see if a promote request has arrived.
4529 */
4530bool
4532{
4533 struct stat stat_buf;
4534
4535 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4536 return true;
4537
4538 return false;
4539}
4540
4541/*
4542 * Wake up startup process to replay newly arrived WAL, or to notice that
4543 * failover has been requested.
4544 */
4545void
4550
4551/*
4552 * Schedule a walreceiver wakeup in the main recovery loop.
4553 */
4554void
4559
4560/*
4561 * Is HotStandby active yet? This is only important in special backends
4562 * since normal backends won't ever be able to connect until this returns
4563 * true. Postmaster knows this by way of signal, not via shared memory.
4564 *
4565 * Unlike testing standbyState, this works in any process that's connected to
4566 * shared memory. (And note that standbyState alone doesn't tell the truth
4567 * anyway.)
4568 */
4569bool
4571{
4572 /*
4573 * We check shared state each time only until Hot Standby is active. We
4574 * can't de-activate Hot Standby, so there's no need to keep checking
4575 * after the shared variable has once been seen true.
4576 */
4578 return true;
4579 else
4580 {
4581 /* spinlock is essential on machines with weak memory ordering! */
4585
4586 return LocalHotStandbyActive;
4587 }
4588}
4589
4590/*
4591 * Like HotStandbyActive(), but to be used only in WAL replay code,
4592 * where we don't need to ask any other process what the state is.
4593 */
4594static bool
4600
4601/*
4602 * Get latest redo apply position.
4603 *
4604 * Exported to allow WALReceiver to read the pointer directly.
4605 */
4608{
4610 TimeLineID tli;
4611
4616
4617 if (replayTLI)
4618 *replayTLI = tli;
4619 return recptr;
4620}
4621
4622
4623/*
4624 * Get position of last applied, or the record being applied.
4625 *
4626 * This is different from GetXLogReplayRecPtr() in that if a WAL
4627 * record is currently being applied, this includes that record.
4628 */
4631{
4633 TimeLineID tli;
4634
4639
4640 if (replayEndTLI)
4641 *replayEndTLI = tli;
4642 return recptr;
4643}
4644
4645/*
4646 * Save timestamp of latest processed commit/abort record.
4647 *
4648 * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4649 * seen by processes other than the startup process. Note in particular
4650 * that CreateRestartPoint is executed in the checkpointer.
4651 */
4652static void
4659
4660/*
4661 * Fetch timestamp of latest processed commit/abort record.
4662 */
4674
4675/*
4676 * Save timestamp of the next chunk of WAL records to apply.
4677 *
4678 * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4679 * seen by all backends.
4680 */
4681static void
4688
4689/*
4690 * Fetch timestamp of the current chunk of WAL records to apply.
4691 * Startup process maintains an accurate local copy in XLogReceiptTime
4692 */
4704
4705/*
4706 * Returns time of receipt of current chunk of XLOG data, as well as
4707 * whether it was received from streaming replication or from archives.
4708 */
4709void
4711{
4712 /*
4713 * This must be executed in the startup process, since we don't export the
4714 * relevant state to shared memory.
4715 */
4717
4720}
4721
4722/*
4723 * Note that the text field supplied is a parameter name and does not require
4724 * translation.
4725 */
4726void
4728{
4729 if (currValue < minValue)
4730 {
4732 {
4733 bool warned_for_promote = false;
4734
4737 errmsg("hot standby is not possible because of insufficient parameter settings"),
4738 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4739 param_name,
4740 currValue,
4741 minValue)));
4742
4743 SetRecoveryPause(true);
4744
4745 ereport(LOG,
4746 (errmsg("recovery has paused"),
4747 errdetail("If recovery is unpaused, the server will shut down."),
4748 errhint("You can then restart the server after making the necessary configuration changes.")));
4749
4751 {
4753
4755 {
4756 if (!warned_for_promote)
4759 errmsg("promotion is not possible because of insufficient parameter settings"),
4760
4761 /*
4762 * Repeat the detail from above so it's easy to find
4763 * in the log.
4764 */
4765 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4766 param_name,
4767 currValue,
4768 minValue),
4769 errhint("Restart the server after making the necessary configuration changes.")));
4770 warned_for_promote = true;
4771 }
4772
4773 /*
4774 * If recovery pause is requested then set it paused. While
4775 * we are in the loop, user might resume and pause again so
4776 * set this every time.
4777 */
4779
4780 /*
4781 * We wait on a condition variable that will wake us as soon
4782 * as the pause ends, but we use a timeout so we can check the
4783 * above conditions periodically too.
4784 */
4787 }
4789 }
4790
4791 ereport(FATAL,
4793 errmsg("recovery aborted because of insufficient parameter settings"),
4794 /* Repeat the detail from above so it's easy to find in the log. */
4795 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4796 param_name,
4797 currValue,
4798 minValue),
4799 errhint("You can restart the server after making the necessary configuration changes.")));
4800 }
4801}
4802
4803
4804/*
4805 * GUC check_hook for primary_slot_name
4806 */
4807bool
4809{
4810 int err_code;
4811 char *err_msg = NULL;
4812 char *err_hint = NULL;
4813
4814 if (*newval && strcmp(*newval, "") != 0 &&
4816 &err_msg, &err_hint))
4817 {
4819 GUC_check_errdetail("%s", err_msg);
4820 if (err_hint != NULL)
4822 return false;
4823 }
4824
4825 return true;
4826}
4827
4828/*
4829 * Recovery target settings: Only one of the several recovery_target* settings
4830 * may be set. Setting a second one results in an error. The global variable
4831 * recoveryTarget tracks which kind of recovery target was chosen. Other
4832 * variables store the actual target value (for example a string or an xid).
4833 * The assign functions of the parameters check whether a competing parameter
4834 * was already set. But we want to allow setting the same parameter multiple
4835 * times. We also want to allow unsetting a parameter and setting a different
4836 * one, so we unset recoveryTarget when the parameter is set to an empty
4837 * string.
4838 *
4839 * XXX this code is broken by design. Throwing an error from a GUC assign
4840 * hook breaks fundamental assumptions of guc.c. So long as all the variables
4841 * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4842 * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4843 * that we have odd behaviors such as unexpected GUC ordering dependencies.
4844 */
4845
4846pg_noreturn static void
4848{
4849 ereport(ERROR,
4851 errmsg("multiple recovery targets specified"),
4852 errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4853}
4854
4855/*
4856 * GUC check_hook for recovery_target
4857 */
4858bool
4860{
4861 if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4862 {
4863 GUC_check_errdetail("The only allowed value is \"immediate\".");
4864 return false;
4865 }
4866 return true;
4867}
4868
4869/*
4870 * GUC assign_hook for recovery_target
4871 */
4872void
4884
4885/*
4886 * GUC check_hook for recovery_target_lsn
4887 */
4888bool
4890{
4891 if (strcmp(*newval, "") != 0)
4892 {
4893 XLogRecPtr lsn;
4896
4897 lsn = pg_lsn_in_safe(*newval, (Node *) &escontext);
4898 if (escontext.error_occurred)
4899 return false;
4900
4901 myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr));
4902 if (!myextra)
4903 return false;
4904 *myextra = lsn;
4905 *extra = myextra;
4906 }
4907 return true;
4908}
4909
4910/*
4911 * GUC assign_hook for recovery_target_lsn
4912 */
4913void
4914assign_recovery_target_lsn(const char *newval, void *extra)
4915{
4919
4920 if (newval && strcmp(newval, "") != 0)
4921 {
4923 recoveryTargetLSN = *((XLogRecPtr *) extra);
4924 }
4925 else
4927}
4928
4929/*
4930 * GUC check_hook for recovery_target_name
4931 */
4932bool
4934{
4935 /* Use the value of newval directly */
4936 if (strlen(*newval) >= MAXFNAMELEN)
4937 {
4938 GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4939 "recovery_target_name", MAXFNAMELEN - 1);
4940 return false;
4941 }
4942 return true;
4943}
4944
4945/*
4946 * GUC assign_hook for recovery_target_name
4947 */
4948void
4963
4964/*
4965 * GUC check_hook for recovery_target_time
4966 *
4967 * The interpretation of the recovery_target_time string can depend on the
4968 * time zone setting, so we need to wait until after all GUC processing is
4969 * done before we can do the final parsing of the string. This check function
4970 * only does a parsing pass to catch syntax errors, but we store the string
4971 * and parse it again when we need to use it.
4972 */
4973bool
4975{
4976 if (strcmp(*newval, "") != 0)
4977 {
4978 /* reject some special values */
4979 if (strcmp(*newval, "now") == 0 ||
4980 strcmp(*newval, "today") == 0 ||
4981 strcmp(*newval, "tomorrow") == 0 ||
4982 strcmp(*newval, "yesterday") == 0)
4983 {
4984 return false;
4985 }
4986
4987 /*
4988 * parse timestamp value (see also timestamptz_in())
4989 */
4990 {
4991 char *str = *newval;
4992 fsec_t fsec;
4993 struct pg_tm tt,
4994 *tm = &tt;
4995 int tz;
4996 int dtype;
4997 int nf;
4998 int dterr;
4999 char *field[MAXDATEFIELDS];
5000 int ftype[MAXDATEFIELDS];
5004
5006 field, ftype, MAXDATEFIELDS, &nf);
5007 if (dterr == 0)
5008 dterr = DecodeDateTime(field, ftype, nf,
5009 &dtype, tm, &fsec, &tz, &dtextra);
5010 if (dterr != 0)
5011 return false;
5012 if (dtype != DTK_DATE)
5013 return false;
5014
5015 if (tm2timestamp(tm, fsec, &tz, &timestamp) != 0)
5016 {
5017 GUC_check_errdetail("Timestamp out of range: \"%s\".", str);
5018 return false;
5019 }
5020 }
5021 }
5022 return true;
5023}
5024
5025/*
5026 * GUC assign_hook for recovery_target_time
5027 */
5028void
5040
5041/*
5042 * GUC check_hook for recovery_target_timeline
5043 */
5044bool
5046{
5049
5050 if (strcmp(*newval, "current") == 0)
5052 else if (strcmp(*newval, "latest") == 0)
5054 else
5055 {
5056 char *endp;
5057 uint64 timeline;
5058
5060
5061 errno = 0;
5062 timeline = strtou64(*newval, &endp, 0);
5063
5064 if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
5065 {
5066 GUC_check_errdetail("\"%s\" is not a valid number.",
5067 "recovery_target_timeline");
5068 return false;
5069 }
5070
5072 {
5073 GUC_check_errdetail("\"%s\" must be between %u and %u.",
5074 "recovery_target_timeline", 1, PG_UINT32_MAX);
5075 return false;
5076 }
5077 }
5078
5080 if (!myextra)
5081 return false;
5082 *myextra = rttg;
5083 *extra = myextra;
5084
5085 return true;
5086}
5087
5088/*
5089 * GUC assign_hook for recovery_target_timeline
5090 */
5091void
5100
5101/*
5102 * GUC check_hook for recovery_target_xid
5103 */
5104bool
5106{
5107 if (strcmp(*newval, "") != 0)
5108 {
5109 TransactionId xid;
5111
5112 errno = 0;
5113 xid = (TransactionId) strtou64(*newval, NULL, 0);
5114 if (errno == EINVAL || errno == ERANGE)
5115 return false;
5116
5118 if (!myextra)
5119 return false;
5120 *myextra = xid;
5121 *extra = myextra;
5122 }
5123 return true;
5124}
5125
5126/*
5127 * GUC assign_hook for recovery_target_xid
5128 */
5129void
5130assign_recovery_target_xid(const char *newval, void *extra)
5131{
5135
5136 if (newval && strcmp(newval, "") != 0)
5137 {
5139 recoveryTargetXid = *((TransactionId *) extra);
5140 }
5141 else
5143}
static uint64 pg_atomic_read_u64(volatile pg_atomic_uint64 *ptr)
Definition atomics.h:467
List * readTimeLineHistory(TimeLineID targetTLI)
Definition timeline.c:76
TimeLineID findNewestTimeLine(TimeLineID startTLI)
Definition timeline.c:264
TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history)
Definition timeline.c:544
XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI)
Definition timeline.c:572
bool existsTimeLineHistory(TimeLineID probeTLI)
Definition timeline.c:222
void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end)
Definition timeline.c:50
bool tliInHistory(TimeLineID tli, List *expectedTLEs)
Definition timeline.c:526
void remove_tablespace_symlink(const char *linkloc)
Definition tablespace.c:890
bool allow_in_place_tablespaces
Definition tablespace.c:86
void disable_startup_progress_timeout(void)
Definition startup.c:308
bool IsPromoteSignaled(void)
Definition startup.c:287
void begin_startup_progress_phase(void)
Definition startup.c:342
void ProcessStartupProcInterrupts(void)
Definition startup.c:154
void ResetPromoteSignaled(void)
Definition startup.c:293
int ParseDateTime(const char *timestr, char *workbuf, size_t buflen, char **field, int *ftype, int maxfields, int *numfields)
Definition datetime.c:773
int DecodeDateTime(char **field, int *ftype, int nf, int *dtype, struct pg_tm *tm, fsec_t *fsec, int *tzp, DateTimeErrorExtra *extra)
Definition datetime.c:997
long TimestampDifferenceMilliseconds(TimestampTz start_time, TimestampTz stop_time)
Definition timestamp.c:1757
int tm2timestamp(struct pg_tm *tm, fsec_t fsec, int *tzp, Timestamp *result)
Definition timestamp.c:2006
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition timestamp.c:1781
Datum timestamptz_in(PG_FUNCTION_ARGS)
Definition timestamp.c:418
TimestampTz GetCurrentTimestamp(void)
Definition timestamp.c:1645
const char * timestamptz_to_str(TimestampTz t)
Definition timestamp.c:1862
Datum now(PG_FUNCTION_ARGS)
Definition timestamp.c:1609
uint32 BlockNumber
Definition block.h:31
int Buffer
Definition buf.h:23
#define InvalidBuffer
Definition buf.h:25
void UnlockReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5519
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:466
@ BUFFER_LOCK_EXCLUSIVE
Definition bufmgr.h:220
static void LockBuffer(Buffer buffer, BufferLockMode mode)
Definition bufmgr.h:328
@ RBM_NORMAL_NO_LOG
Definition bufmgr.h:52
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:417
PageData * Page
Definition bufpage.h:81
static XLogRecPtr PageGetLSN(const PageData *page)
Definition bufpage.h:385
uint8_t uint8
Definition c.h:556
#define PG_UINT32_MAX
Definition c.h:616
#define pg_noreturn
Definition c.h:176
#define PG_USED_FOR_ASSERTS_ONLY
Definition c.h:235
#define Assert(condition)
Definition c.h:885
#define PG_BINARY
Definition c.h:1309
#define UINT64_FORMAT
Definition c.h:577
int32_t int32
Definition c.h:554
uint64_t uint64
Definition c.h:559
uint32_t uint32
Definition c.h:558
#define pg_fallthrough
Definition c.h:144
uint32 TransactionId
Definition c.h:678
size_t Size
Definition c.h:631
void RequestCheckpoint(int flags)
bool ConditionVariableCancelSleep(void)
bool ConditionVariableTimedSleep(ConditionVariable *cv, long timeout, uint32 wait_event_info)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
int64 TimestampTz
Definition timestamp.h:39
int32 fsec_t
Definition timestamp.h:41
Datum arg
Definition elog.c:1322
int errcode_for_file_access(void)
Definition elog.c:897
ErrorContextCallback * error_context_stack
Definition elog.c:99
int errcode(int sqlerrcode)
Definition elog.c:874
int errmsg(const char *fmt,...)
Definition elog.c:1093
#define LOG
Definition elog.h:31
#define errcontext
Definition elog.h:198
int errhint(const char *fmt,...) pg_attribute_printf(1
int errdetail(const char *fmt,...) pg_attribute_printf(1
#define FATAL
Definition elog.h:41
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
#define WARNING
Definition elog.h:36
#define DEBUG2
Definition elog.h:29
#define PANIC
Definition elog.h:42
#define DEBUG1
Definition elog.h:30
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition fd.c:1111
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition fd.c:782
int BasicOpenFile(const char *fileName, int fileFlags)
Definition fd.c:1089
int FreeFile(FILE *file)
Definition fd.c:2826
DIR * AllocateDir(const char *dirname)
Definition fd.c:2890
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition fd.c:2956
int pg_fsync(int fd)
Definition fd.c:389
FILE * AllocateFile(const char *name, const char *mode)
Definition fd.c:2627
#define palloc_object(type)
Definition fe_memutils.h:74
#define palloc0_object(type)
Definition fe_memutils.h:75
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition file_utils.c:547
@ PGFILETYPE_LNK
Definition file_utils.h:24
#define DirectFunctionCall3(func, arg1, arg2, arg3)
Definition fmgr.h:688
bool IsUnderPostmaster
Definition globals.c:120
char * DataDir
Definition globals.c:71
bool IsPostmasterEnvironment
Definition globals.c:119
void GUC_check_errcode(int sqlerrcode)
Definition guc.c:6628
void * guc_malloc(int elevel, size_t size)
Definition guc.c:636
#define newval
#define GUC_check_errdetail
Definition guc.h:506
GucSource
Definition guc.h:112
#define GUC_check_errhint
Definition guc.h:510
const char * str
#define MAXDATEFIELDS
Definition datetime.h:202
#define DTK_DATE
Definition datetime.h:144
#define MAXDATELEN
Definition datetime.h:200
#define close(a)
Definition win32.h:12
void proc_exit(int code)
Definition ipc.c:105
int i
Definition isn.c:77
void OwnLatch(Latch *latch)
Definition latch.c:126
void DisownLatch(Latch *latch)
Definition latch.c:144
void InitSharedLatch(Latch *latch)
Definition latch.c:93
void SetLatch(Latch *latch)
Definition latch.c:290
void ResetLatch(Latch *latch)
Definition latch.c:374
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition latch.c:172
List * lappend(List *list, void *datum)
Definition list.c:339
void list_free_deep(List *list)
Definition list.c:1560
static struct pg_tm tm
Definition localtime.c:104
char * pstrdup(const char *in)
Definition mcxt.c:1781
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
#define AmStartupProcess()
Definition miscadmin.h:390
#define IsBootstrapProcessingMode()
Definition miscadmin.h:477
#define ERRCODE_DATA_CORRUPTED
#define MAXPGPATH
#define XLOG_RESTORE_POINT
Definition pg_control.h:76
#define XLOG_CHECKPOINT_REDO
Definition pg_control.h:83
#define XLOG_OVERWRITE_CONTRECORD
Definition pg_control.h:82
DBState
Definition pg_control.h:92
@ DB_IN_ARCHIVE_RECOVERY
Definition pg_control.h:98
@ DB_SHUTDOWNED_IN_RECOVERY
Definition pg_control.h:95
@ DB_SHUTDOWNED
Definition pg_control.h:94
@ DB_IN_CRASH_RECOVERY
Definition pg_control.h:97
#define XLOG_CHECKPOINT_SHUTDOWN
Definition pg_control.h:69
#define XLOG_BACKUP_END
Definition pg_control.h:74
#define XLOG_CHECKPOINT_ONLINE
Definition pg_control.h:70
#define XLOG_END_OF_RECOVERY
Definition pg_control.h:78
const void size_t len
#define lfirst(lc)
Definition pg_list.h:172
#define NIL
Definition pg_list.h:68
XLogRecPtr pg_lsn_in_safe(const char *str, Node *escontext)
Definition pg_lsn.c:32
static rewind_source * source
Definition pg_rewind.c:89
const char * pg_rusage_show(const PGRUsage *ru0)
Definition pg_rusage.c:40
void pg_rusage_init(PGRUsage *ru0)
Definition pg_rusage.c:27
static char buf[DEFAULT_XLOG_SEG_SIZE]
@ IOOBJECT_WAL
Definition pgstat.h:283
@ IOCONTEXT_NORMAL
Definition pgstat.h:293
@ IOOP_READ
Definition pgstat.h:319
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition pgstat_io.c:91
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:122
int64 timestamp
void SendPostmasterSignal(PMSignalReason reason)
Definition pmsignal.c:165
@ PMSIGNAL_RECOVERY_STARTED
Definition pmsignal.h:35
@ PMSIGNAL_BEGIN_HOT_STANDBY
Definition pmsignal.h:37
@ PMSIGNAL_RECOVERY_CONSISTENT
Definition pmsignal.h:36
#define pg_pread
Definition port.h:247
#define snprintf
Definition port.h:260
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition strlcpy.c:45
off_t pgoff_t
Definition port.h:421
static Datum ObjectIdGetDatum(Oid X)
Definition postgres.h:262
static Datum CStringGetDatum(const char *X)
Definition postgres.h:380
static Datum Int32GetDatum(int32 X)
Definition postgres.h:222
#define InvalidOid
static int fd(const char *x, int i)
static int fb(int x)
void RecordKnownAssignedTransactionIds(TransactionId xid)
Definition procarray.c:4414
void KnownAssignedTransactionIdsIdleMaintenance(void)
Definition procarray.c:4575
static void set_ps_display(const char *activity)
Definition ps_status.h:40
char * psprintf(const char *fmt,...)
Definition psprintf.c:43
ForkNumber
Definition relpath.h:56
@ MAIN_FORKNUM
Definition relpath.h:58
#define PG_TBLSPC_DIR
Definition relpath.h:41
void RmgrStartup(void)
Definition rmgr.c:58
void RmgrCleanup(void)
Definition rmgr.c:74
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition shmem.c:378
bool ReplicationSlotValidateNameInternal(const char *name, bool allow_reserved_name, int *err_code, char **err_msg, char **err_hint)
Definition slot.c:312
void ShutDownSlotSync(void)
Definition slotsync.c:1722
static void SpinLockRelease(volatile slock_t *lock)
Definition spin.h:62
static void SpinLockAcquire(volatile slock_t *lock)
Definition spin.h:56
static void SpinLockInit(volatile slock_t *lock)
Definition spin.h:50
#define ereport_startup_progress(msg,...)
Definition startup.h:18
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition stringinfo.c:145
void appendStringInfoString(StringInfo str, const char *s)
Definition stringinfo.c:230
void appendStringInfoChar(StringInfo str, char ch)
Definition stringinfo.c:242
void initStringInfo(StringInfo str)
Definition stringinfo.c:97
Oid oldestMultiDB
Definition pg_control.h:52
MultiXactId oldestMulti
Definition pg_control.h:51
MultiXactOffset nextMultiOffset
Definition pg_control.h:48
TransactionId newestCommitTsXid
Definition pg_control.h:56
TransactionId oldestXid
Definition pg_control.h:49
TimeLineID PrevTimeLineID
Definition pg_control.h:40
TimeLineID ThisTimeLineID
Definition pg_control.h:39
MultiXactId nextMulti
Definition pg_control.h:47
FullTransactionId nextXid
Definition pg_control.h:45
TransactionId oldestCommitTsXid
Definition pg_control.h:54
XLogRecPtr redo
Definition pg_control.h:37
Oid oldestXidDB
Definition pg_control.h:50
XLogRecPtr backupStartPoint
Definition pg_control.h:172
CheckPoint checkPointCopy
Definition pg_control.h:137
XLogRecPtr backupEndPoint
Definition pg_control.h:173
XLogRecPtr minRecoveryPoint
Definition pg_control.h:170
XLogRecPtr checkPoint
Definition pg_control.h:135
uint64 system_identifier
Definition pg_control.h:112
TimeLineID minRecoveryPointTLI
Definition pg_control.h:171
Definition dirent.c:26
XLogRecPtr lastPageBeginPtr
XLogRecPtr missingContrecPtr
struct ErrorContextCallback * previous
Definition elog.h:297
void(* callback)(void *arg)
Definition elog.h:298
Definition latch.h:114
Definition pg_list.h:54
Definition nodes.h:135
RelFileNumber relNumber
void(* rm_redo)(XLogReaderState *record)
TimeLineID ws_tli
Definition xlogreader.h:49
pg_atomic_uint64 minWaitedLSN[WAIT_LSN_TYPE_COUNT]
Definition xlogwait.h:85
XLogRecPtr missingContrecPtr
Definition xlogreader.h:214
char * errormsg_buf
Definition xlogreader.h:310
XLogRecPtr EndRecPtr
Definition xlogreader.h:206
uint64 system_identifier
Definition xlogreader.h:190
XLogRecPtr ReadRecPtr
Definition xlogreader.h:205
XLogRecPtr abortedRecPtr
Definition xlogreader.h:213
TimeLineID latestPageTLI
Definition xlogreader.h:279
XLogRecPtr overwrittenRecPtr
Definition xlogreader.h:216
XLogRecPtr latestPagePtr
Definition xlogreader.h:278
WALOpenSegment seg
Definition xlogreader.h:271
void * private_data
Definition xlogreader.h:195
uint8 xl_info
Definition xlogrecord.h:46
uint32 xl_tot_len
Definition xlogrecord.h:43
TransactionId xl_xid
Definition xlogrecord.h:44
RmgrId xl_rmid
Definition xlogrecord.h:47
ConditionVariable recoveryNotPausedCV
XLogRecPtr lastReplayedEndRecPtr
TimeLineID replayEndTLI
TimeLineID lastReplayedTLI
TimestampTz currentChunkStartTime
XLogRecPtr replayEndRecPtr
TimestampTz recoveryLastXTime
RecoveryPauseState recoveryPauseState
XLogRecPtr lastReplayedReadRecPtr
Definition guc.h:174
Definition pgtime.h:35
#define InvalidTransactionId
Definition transam.h:31
#define U64FromFullTransactionId(x)
Definition transam.h:49
#define XidFromFullTransactionId(x)
Definition transam.h:48
#define TransactionIdIsValid(xid)
Definition transam.h:41
#define TransactionIdIsNormal(xid)
Definition transam.h:42
#define TimestampTzPlusMilliseconds(tz, ms)
Definition timestamp.h:85
static TimestampTz DatumGetTimestampTz(Datum X)
Definition timestamp.h:34
void AdvanceNextFullTransactionIdPastXid(TransactionId xid)
Definition varsup.c:304
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:69
static void pgstat_report_wait_end(void)
Definition wait_event.h:85
#define WL_TIMEOUT
#define WL_EXIT_ON_PM_DEATH
#define WL_LATCH_SET
void WalRcvForceReply(void)
#define AllowCascadeReplication()
Definition walreceiver.h:40
@ WALRCV_STOPPING
Definition walreceiver.h:54
XLogRecPtr GetWalRcvFlushRecPtr(XLogRecPtr *latestChunkStart, TimeLineID *receiveTLI)
bool WalRcvStreaming(void)
void RequestXLogStreaming(TimeLineID tli, XLogRecPtr recptr, const char *conninfo, const char *slotname, bool create_temp_slot)
WalRcvState WalRcvGetState(void)
bool WalRcvRunning(void)
void WalSndWakeup(bool physical, bool logical)
Definition walsender.c:3810
#define stat
Definition win32_port.h:74
#define S_IRUSR
Definition win32_port.h:279
#define symlink(oldpath, newpath)
Definition win32_port.h:225
#define S_IWUSR
Definition win32_port.h:282
#define XLOG_XACT_COMMIT_PREPARED
Definition xact.h:173
#define XLOG_XACT_COMMIT
Definition xact.h:170
#define XLOG_XACT_OPMASK
Definition xact.h:180
#define XLOG_XACT_ABORT
Definition xact.h:172
#define XLOG_XACT_ABORT_PREPARED
Definition xact.h:174
void ParseCommitRecord(uint8 info, xl_xact_commit *xlrec, xl_xact_parsed_commit *parsed)
Definition xactdesc.c:35
void ParseAbortRecord(uint8 info, xl_xact_abort *xlrec, xl_xact_parsed_abort *parsed)
Definition xactdesc.c:141
int wal_decode_buffer_size
Definition xlog.c:139
bool EnableHotStandby
Definition xlog.c:124
XLogRecPtr GetRedoRecPtr(void)
Definition xlog.c:6563
void SetInstallXLogFileSegmentActive(void)
Definition xlog.c:9656
bool IsInstallXLogFileSegmentActive(void)
Definition xlog.c:9673
int wal_segment_size
Definition xlog.c:146
void SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI)
Definition xlog.c:6335
void RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
Definition xlog.c:3977
void ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
Definition xlog.c:6373
void ResetInstallXLogFileSegmentActive(void)
Definition xlog.c:9665
int wal_retrieve_retry_interval
Definition xlog.c:137
bool track_wal_io_timing
Definition xlog.c:140
static ControlFileData * ControlFile
Definition xlog.c:576
void XLogShutdownWalRcv(void)
Definition xlog.c:9646
bool XLogCheckpointNeeded(XLogSegNo new_segno)
Definition xlog.c:2283
#define TABLESPACE_MAP_OLD
Definition xlog.h:323
#define TABLESPACE_MAP
Definition xlog.h:322
#define STANDBY_SIGNAL_FILE
Definition xlog.h:318
#define CHECKPOINT_CAUSE_XLOG
Definition xlog.h:159
#define PROMOTE_SIGNAL_FILE
Definition xlog.h:326
#define BACKUP_LABEL_FILE
Definition xlog.h:319
#define RECOVERY_SIGNAL_FILE
Definition xlog.h:317
static RmgrData GetRmgr(RmgrId rmid)
#define XLogSegmentOffset(xlogptr, wal_segsz_bytes)
#define MAXFNAMELEN
#define XLOGDIR
#define XLByteToSeg(xlrp, logSegNo, wal_segsz_bytes)
static void XLogFilePath(char *path, TimeLineID tli, XLogSegNo logSegNo, int wal_segsz_bytes)
#define XRecOffIsValid(xlrp)
static void XLogFileName(char *fname, TimeLineID tli, XLogSegNo logSegNo, int wal_segsz_bytes)
#define XLByteInSeg(xlrp, logSegNo, wal_segsz_bytes)
bool RestoreArchivedFile(char *path, const char *xlogfname, const char *recovername, off_t expectedSize, bool cleanupEnabled)
Definition xlogarchive.c:54
void KeepFileRestoredFromArchive(const char *path, const char *xlogfname)
#define XLogRecPtrIsValid(r)
Definition xlogdefs.h:29
#define LSN_FORMAT_ARGS(lsn)
Definition xlogdefs.h:47
uint64 XLogRecPtr
Definition xlogdefs.h:21
#define InvalidXLogRecPtr
Definition xlogdefs.h:28
uint32 TimeLineID
Definition xlogdefs.h:63
uint64 XLogSegNo
Definition xlogdefs.h:52
void XLogPrefetcherComputeStats(XLogPrefetcher *prefetcher)
XLogPrefetcher * XLogPrefetcherAllocate(XLogReaderState *reader)
void XLogPrefetchReconfigure(void)
XLogRecord * XLogPrefetcherReadRecord(XLogPrefetcher *prefetcher, char **errmsg)
XLogReaderState * XLogPrefetcherGetReader(XLogPrefetcher *prefetcher)
void XLogPrefetcherBeginRead(XLogPrefetcher *prefetcher, XLogRecPtr recPtr)
void XLogPrefetcherFree(XLogPrefetcher *prefetcher)
bool XLogRecGetBlockTagExtended(XLogReaderState *record, uint8 block_id, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum, Buffer *prefetch_buffer)
XLogReaderState * XLogReaderAllocate(int wal_segment_size, const char *waldir, XLogReaderRoutine *routine, void *private_data)
Definition xlogreader.c:107
void XLogReaderSetDecodeBuffer(XLogReaderState *state, void *buffer, size_t size)
Definition xlogreader.c:91
void XLogReaderResetError(XLogReaderState *state)
bool XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, char *phdr)
void XLogReaderFree(XLogReaderState *state)
Definition xlogreader.c:162
bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
#define XLogRecGetDataLen(decoder)
Definition xlogreader.h:415
#define XLogRecGetInfo(decoder)
Definition xlogreader.h:409
#define XLogRecBlockImageApply(decoder, block_id)
Definition xlogreader.h:424
#define XLogRecGetRmid(decoder)
Definition xlogreader.h:410
#define XLogRecGetData(decoder)
Definition xlogreader.h:414
#define XLogRecGetXid(decoder)
Definition xlogreader.h:411
#define XL_ROUTINE(...)
Definition xlogreader.h:117
#define XLogRecMaxBlockId(decoder)
Definition xlogreader.h:417
XLogPageReadResult
Definition xlogreader.h:349
@ XLREAD_WOULDBLOCK
Definition xlogreader.h:352
@ XLREAD_SUCCESS
Definition xlogreader.h:350
@ XLREAD_FAIL
Definition xlogreader.h:351
#define XLogRecHasBlockImage(decoder, block_id)
Definition xlogreader.h:422
#define XLogRecGetPrev(decoder)
Definition xlogreader.h:408
#define XLogRecHasAnyBlockRefs(decoder)
Definition xlogreader.h:416
#define SizeOfXLogRecordDataHeaderShort
Definition xlogrecord.h:217
#define XLR_INFO_MASK
Definition xlogrecord.h:62
#define SizeOfXLogRecord
Definition xlogrecord.h:55
#define XLR_CHECK_CONSISTENCY
Definition xlogrecord.h:91
bool reachedConsistency
bool check_primary_slot_name(char **newval, void **extra, GucSource source)
static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
static XLogRecPtr recoveryStopLSN
static bool recoveryStopsBefore(XLogReaderState *record)
static TimestampTz recoveryStopTime
void assign_recovery_target_xid(const char *newval, void *extra)
static bool CheckForStandbyTrigger(void)
int recovery_min_apply_delay
bool check_recovery_target(char **newval, void **extra, GucSource source)
static bool backupEndRequired
bool HotStandbyActive(void)
static char * getRecoveryStopReason(void)
void ShutdownWalRecovery(void)
RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal
int recoveryTargetAction
static void rm_redo_error_callback(void *arg)
static bool recoveryApplyDelay(XLogReaderState *record)
bool ArchiveRecoveryRequested
const char * recoveryTargetName
static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
bool check_recovery_target_timeline(char **newval, void **extra, GucSource source)
static XLogRecPtr minRecoveryPoint
static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *readBuf)
static XLogRecPtr backupEndPoint
const struct config_enum_entry recovery_target_action_options[]
static void validateRecoveryParameters(void)
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI, TimeLineID replayTLI)
static XLogRecord * ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr, TimeLineID replayTLI)
void StartupRequestWalReceiverRestart(void)
bool InArchiveRecovery
static bool recoveryStopsAfter(XLogReaderState *record)
void RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
char * PrimarySlotName
static TimeLineID curFileTLI
static char recoveryStopName[MAXFNAMELEN]
static void CheckRecoveryConsistency(void)
static bool pendingWalRcvRestart
void PerformWalRecovery(void)
static XLogSource XLogReceiptSource
bool CheckPromoteSignal(void)
struct XLogPageReadPrivate XLogPageReadPrivate
static bool recoveryStopAfter
static const char *const xlogSourceNames[]
static TimeLineID RedoStartTLI
char * recoveryRestoreCommand
static void verifyBackupPageConsistency(XLogReaderState *record)
static int XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
void assign_recovery_target(const char *newval, void *extra)
void SetRecoveryPause(bool recoveryPause)
static bool lastSourceFailed
char * archiveCleanupCommand
XLogRecPtr GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
static TimeLineID receiveTLI
void WakeupRecovery(void)
void xlog_outdesc(StringInfo buf, XLogReaderState *record)
static bool LocalPromoteIsTriggered
bool PromoteIsTriggered(void)
TimestampTz GetCurrentChunkReplayStartTime(void)
static void ConfirmRecoveryPaused(void)
static void readRecoverySignalFile(void)
static XLogRecPtr missingContrecPtr
static XLogRecoveryCtlData * XLogRecoveryCtl
static uint32 readOff
static bool standby_signal_file_found
char * recovery_target_time_string
bool StandbyMode
static int readFile
static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, bool fetching_ckpt, XLogRecPtr tliRecPtr, TimeLineID replayTLI, XLogRecPtr replayLSN, bool nonblocking)
XLogRecPtr recoveryTargetLSN
RecoveryTargetType recoveryTarget
static bool read_tablespace_map(List **tablespaces)
static bool doRequestWalReceiverReply
static bool read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, bool *backupEndRequired, bool *backupFromStandby)
static int XLogFileRead(XLogSegNo segno, TimeLineID tli, XLogSource source, bool notfoundOk)
static XLogSource currentSource
XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI)
void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
static List * expectedTLEs
static XLogSegNo readSegNo
void assign_recovery_target_name(const char *newval, void *extra)
static XLogRecPtr abortedRecPtr
static char * primary_image_masked
static TimeLineID minRecoveryPointTLI
static XLogRecord * ReadRecord(XLogPrefetcher *xlogprefetcher, int emode, bool fetching_ckpt, TimeLineID replayTLI)
EndOfWalRecoveryInfo * FinishWalRecovery(void)
void assign_recovery_target_time(const char *newval, void *extra)
static void SetCurrentChunkStartTime(TimestampTz xtime)
static XLogRecPtr CheckPointLoc
bool check_recovery_target_xid(char **newval, void **extra, GucSource source)
static bool LocalHotStandbyActive
static bool HotStandbyActiveInReplay(void)
static bool InRedo
static TransactionId recoveryStopXid
bool check_recovery_target_time(char **newval, void **extra, GucSource source)
static XLogSource readSource
static void SetPromoteIsTriggered(void)
#define RECOVERY_COMMAND_FILE
TransactionId recoveryTargetXid
XLogSource
@ XLOG_FROM_PG_WAL
@ XLOG_FROM_STREAM
@ XLOG_FROM_ARCHIVE
@ XLOG_FROM_ANY
TimeLineID recoveryTargetTLIRequested
static pg_noreturn void error_multiple_recovery_targets(void)
void InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
static void xlog_block_info(StringInfo buf, XLogReaderState *record)
static TimestampTz XLogReceiptTime
static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
Size XLogRecoveryShmemSize(void)
static char * replay_image_masked
bool wal_receiver_create_temp_slot
static void CheckTablespaceDirectory(void)
char * recoveryEndCommand
RecoveryPauseState GetRecoveryPauseState(void)
TimeLineID recoveryTargetTLI
static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
void assign_recovery_target_lsn(const char *newval, void *extra)
bool check_recovery_target_lsn(char **newval, void **extra, GucSource source)
static XLogRecPtr RedoStartLSN
static XLogRecPtr flushedUpto
void XLogRecoveryShmemInit(void)
static void recoveryPausesHere(bool endOfRecovery)
static uint32 readLen
static void EnableStandbyMode(void)
#define RECOVERY_COMMAND_DONE
static bool recovery_signal_file_found
TimestampTz recoveryTargetTime
TimestampTz GetLatestXTime(void)
char * PrimaryConnInfo
void XLogRequestWalReceiverReply(void)
static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
static XLogPrefetcher * xlogprefetcher
static bool StandbyModeRequested
bool check_recovery_target_name(char **newval, void **extra, GucSource source)
bool recoveryTargetInclusive
static XLogReaderState * xlogreader
void RemovePromoteSignalFiles(void)
void assign_recovery_target_timeline(const char *newval, void *extra)
static XLogRecPtr backupStartPoint
static void SetLatestXTime(TimestampTz xtime)
static TimeLineID CheckPointTLI
@ RECOVERY_TARGET_ACTION_PAUSE
@ RECOVERY_TARGET_ACTION_PROMOTE
@ RECOVERY_TARGET_ACTION_SHUTDOWN
RecoveryTargetType
@ RECOVERY_TARGET_IMMEDIATE
@ RECOVERY_TARGET_TIME
@ RECOVERY_TARGET_UNSET
@ RECOVERY_TARGET_XID
@ RECOVERY_TARGET_LSN
@ RECOVERY_TARGET_NAME
RecoveryTargetTimeLineGoal
@ RECOVERY_TARGET_TIMELINE_NUMERIC
@ RECOVERY_TARGET_TIMELINE_CONTROLFILE
@ RECOVERY_TARGET_TIMELINE_LATEST
RecoveryPauseState
@ RECOVERY_PAUSED
@ RECOVERY_NOT_PAUSED
@ RECOVERY_PAUSE_REQUESTED
void wal_segment_close(XLogReaderState *state)
Definition xlogutils.c:831
Buffer XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum, BlockNumber blkno, ReadBufferMode mode, Buffer recent_buffer)
Definition xlogutils.c:460
HotStandbyState standbyState
Definition xlogutils.c:53
bool InRecovery
Definition xlogutils.c:50
void XLogCheckInvalidPages(void)
Definition xlogutils.c:234
@ STANDBY_SNAPSHOT_READY
Definition xlogutils.h:55
@ STANDBY_INITIALIZED
Definition xlogutils.h:53
struct WaitLSNState * waitLSNState
Definition xlogwait.c:68
void WaitLSNWakeup(WaitLSNType lsnType, XLogRecPtr currentLSN)
Definition xlogwait.c:317
@ WAIT_LSN_TYPE_STANDBY_REPLAY
Definition xlogwait.h:39