PostgreSQL Source Code git master
Loading...
Searching...
No Matches
xlogrecovery.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * xlogrecovery.c
4 * Functions for WAL recovery, standby mode
5 *
6 * This source file contains functions controlling WAL recovery.
7 * InitWalRecovery() initializes the system for crash or archive recovery,
8 * or standby mode, depending on configuration options and the state of
9 * the control file and possible backup label file. PerformWalRecovery()
10 * performs the actual WAL replay, calling the rmgr-specific redo routines.
11 * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12 * and prepares information needed to initialize the WAL for writes. In
13 * addition to these three main functions, there are a bunch of functions
14 * for interrogating recovery state and controlling the recovery process.
15 *
16 *
17 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
18 * Portions Copyright (c) 1994, Regents of the University of California
19 *
20 * src/backend/access/transam/xlogrecovery.c
21 *
22 *-------------------------------------------------------------------------
23 */
24
25#include "postgres.h"
26
27#include <ctype.h>
28#include <time.h>
29#include <sys/stat.h>
30#include <sys/time.h>
31#include <unistd.h>
32
33#include "access/timeline.h"
34#include "access/transam.h"
35#include "access/xact.h"
37#include "access/xlogarchive.h"
39#include "access/xlogreader.h"
40#include "access/xlogrecovery.h"
41#include "access/xlogutils.h"
42#include "access/xlogwait.h"
43#include "backup/basebackup.h"
44#include "catalog/pg_control.h"
45#include "commands/tablespace.h"
46#include "common/file_utils.h"
47#include "miscadmin.h"
48#include "nodes/miscnodes.h"
49#include "pgstat.h"
50#include "postmaster/bgwriter.h"
51#include "postmaster/startup.h"
52#include "replication/slot.h"
55#include "storage/fd.h"
56#include "storage/ipc.h"
57#include "storage/latch.h"
58#include "storage/pmsignal.h"
59#include "storage/procarray.h"
60#include "storage/spin.h"
61#include "utils/datetime.h"
62#include "utils/fmgrprotos.h"
63#include "utils/guc_hooks.h"
65#include "utils/pg_lsn.h"
66#include "utils/ps_status.h"
67#include "utils/pg_rusage.h"
68#include "utils/wait_event.h"
69
70/* Unsupported old recovery command file names (relative to $PGDATA) */
71#define RECOVERY_COMMAND_FILE "recovery.conf"
72#define RECOVERY_COMMAND_DONE "recovery.done"
73
74/*
75 * GUC support
76 */
78 {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
79 {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
80 {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
81 {NULL, 0, false}
82};
83
84/* options formerly taken from recovery.conf for archive recovery */
97
98/* options formerly taken from recovery.conf for XLOG streaming */
102
103/*
104 * recoveryTargetTimeLineGoal: what the user requested, if any
105 *
106 * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
107 *
108 * recoveryTargetTLI: the currently understood target timeline; may change during recovery
109 *
110 * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
111 * the timelines of its known parents, newest first (so recoveryTargetTLI is
112 * always the first list member). Only these TLIs are expected to be seen in
113 * the WAL segments we read, and indeed only these TLIs will be considered as
114 * candidate WAL files to open at all.
115 *
116 * curFileTLI: the TLI appearing in the name of the current input WAL file.
117 * (This is not necessarily the same as the timeline from which we are
118 * replaying WAL, which StartupXLOG calls replayTLI, because we could be
119 * scanning data that was copied from an ancestor timeline when the current
120 * file was created.) During a sequential scan we do not allow this value
121 * to decrease.
122 */
128
129/*
130 * When ArchiveRecoveryRequested is set, archive recovery was requested,
131 * i.e. signal files were present. When InArchiveRecovery is set, we are
132 * currently recovering using offline XLOG archives. These variables are only
133 * valid in the startup process.
134 *
135 * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
136 * currently performing crash recovery using only XLOG files in pg_wal, but
137 * will switch to using offline XLOG archives as soon as we reach the end of
138 * WAL in pg_wal.
139 */
141bool InArchiveRecovery = false;
142
143/*
144 * When StandbyModeRequested is set, standby mode was requested, i.e.
145 * standby.signal file was present. When StandbyMode is set, we are currently
146 * in standby mode. These variables are only valid in the startup process.
147 * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
148 */
149static bool StandbyModeRequested = false;
150bool StandbyMode = false;
151
152/* was a signal file present at startup? */
153static bool standby_signal_file_found = false;
154static bool recovery_signal_file_found = false;
155
156/*
157 * CheckPointLoc is the position of the checkpoint record that determines
158 * where to start the replay. It comes from the backup label file or the
159 * control file.
160 *
161 * RedoStartLSN is the checkpoint's REDO location, also from the backup label
162 * file or the control file. In standby mode, XLOG streaming usually starts
163 * from the position where an invalid record was found. But if we fail to
164 * read even the initial checkpoint record, we use the REDO location instead
165 * of the checkpoint location as the start position of XLOG streaming.
166 * Otherwise we would have to jump backwards to the REDO location after
167 * reading the checkpoint record, because the REDO record can precede the
168 * checkpoint record.
169 */
174
175/*
176 * Local copy of SharedHotStandbyActive variable. False actually means "not
177 * known, need to check the shared state".
178 */
179static bool LocalHotStandbyActive = false;
180
181/*
182 * Local copy of SharedPromoteIsTriggered variable. False actually means "not
183 * known, need to check the shared state".
184 */
185static bool LocalPromoteIsTriggered = false;
186
187/* Has the recovery code requested a walreceiver wakeup? */
189
190/* XLogReader object used to parse the WAL records */
192
193/* XLogPrefetcher object used to consume WAL records with read-ahead */
195
196/* Parameters passed down from ReadRecord to the XLogPageRead callback. */
198{
199 int emode;
200 bool fetching_ckpt; /* are we fetching a checkpoint record? */
204
205/* flag to tell XLogPageRead that we have started replaying */
206static bool InRedo = false;
207
208/*
209 * Codes indicating where we got a WAL file from during recovery, or where
210 * to attempt to get one.
211 */
212typedef enum
213{
 /*
  * Members are listed in the same order as the strings in
  * xlogSourceNames[]; XLOG_FROM_ANY is explicitly the zero value.
  */
214 XLOG_FROM_ANY = 0, /* request to read WAL from any source */
215 XLOG_FROM_ARCHIVE, /* restored using restore_command */
216 XLOG_FROM_PG_WAL, /* existing file in pg_wal */
217 XLOG_FROM_STREAM, /* streamed from primary */
218} XLogSource;
219
220/* human-readable names for XLogSources, for debugging output */
221static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
222
223/*
224 * readFile is -1 or a kernel FD for the log file segment that's currently
225 * open for reading. readSegNo identifies the segment. readOff is the offset
226 * of the page just read, readLen indicates how much of it has been read into
227 * readBuf, and readSource indicates where we got the currently open file from.
228 *
229 * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
230 * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
231 * worthwhile, since the XLOG is not read by general-purpose sessions.
232 */
233static int readFile = -1;
235static uint32 readOff = 0;
236static uint32 readLen = 0;
238
239/*
240 * Keeps track of which source we're currently reading from. This is
241 * different from readSource in that this is always set, even when we don't
242 * currently have a WAL file open. If lastSourceFailed is set, our last
243 * attempt to read from currentSource failed, and we should try another source
244 * next.
245 *
246 * pendingWalRcvRestart is set when a config change occurs that requires a
247 * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
248 */
250static bool lastSourceFailed = false;
251static bool pendingWalRcvRestart = false;
252
253/*
254 * These variables track when we last obtained some WAL data to process,
255 * and where we got it from. (XLogReceiptSource is initially the same as
256 * readSource, but readSource gets reset to zero when we don't have data
257 * to process right now. It is also different from currentSource, which
258 * also changes when we try to read from a source and fail, while
259 * XLogReceiptSource tracks where we last successfully read some WAL.)
260 */
263
264/* Local copy of WalRcv->flushedUpto */
267
268/*
269 * Copy of minRecoveryPoint and backupEndPoint from the control file.
270 *
271 * In order to reach consistency, we must replay the WAL up to
272 * minRecoveryPoint. If backupEndRequired is true, we must also reach
273 * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
274 * to backupStartPoint.
275 *
276 * Note: In archive recovery, after consistency has been reached, the
277 * functions in xlog.c will start updating minRecoveryPoint in the control
278 * file. But this copy of minRecoveryPoint variable reflects the value at the
279 * beginning of recovery, and is *not* updated after consistency is reached.
280 */
283
286static bool backupEndRequired = false;
287
288/*
289 * Have we reached a consistent database state? In crash recovery, we have
290 * to replay all the WAL, so reachedConsistency is never set. During archive
291 * recovery, the database is consistent once minRecoveryPoint is reached.
292 *
293 * Consistent state means that the system is internally consistent, all
294 * the WAL has been replayed up to a certain point, and importantly, there
295 * is no trace of later actions on disk.
296 *
297 * This flag is used only by the startup process and postmaster. When
298 * minRecoveryPoint is reached, the startup process sets it to true and
299 * sends a PMSIGNAL_RECOVERY_CONSISTENT signal to the postmaster,
300 * which then sets it to true upon receiving the signal.
301 */
303
304/* Buffers dedicated to consistency checks of size BLCKSZ */
307
309
310/*
311 * abortedRecPtr is the start pointer of a broken record at end of WAL when
312 * recovery completes; missingContrecPtr is the location of the first
313 * contrecord that went missing. See CreateOverwriteContrecordRecord for
314 * details.
315 */
318
319/*
320 * if recoveryStopsBefore/After returns true, it saves information of the stop
321 * point here
322 */
328
329/* prototypes for local functions */
330static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
331
332static void EnableStandbyMode(void);
333static void readRecoverySignalFile(void);
334static void validateRecoveryParameters(void);
338static bool read_tablespace_map(List **tablespaces);
339
340static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
341static void CheckRecoveryConsistency(void);
342static void rm_redo_error_callback(void *arg);
343#ifdef WAL_DEBUG
344static void xlog_outrec(StringInfo buf, XLogReaderState *record);
345#endif
346static void xlog_block_info(StringInfo buf, XLogReaderState *record);
348 TimeLineID prevTLI, TimeLineID replayTLI);
351
352static bool recoveryStopsBefore(XLogReaderState *record);
353static bool recoveryStopsAfter(XLogReaderState *record);
354static char *getRecoveryStopReason(void);
355static void recoveryPausesHere(bool endOfRecovery);
356static bool recoveryApplyDelay(XLogReaderState *record);
357static void ConfirmRecoveryPaused(void);
358
360 int emode, bool fetching_ckpt,
361 TimeLineID replayTLI);
362
364 int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
366 bool randAccess,
367 bool fetching_ckpt,
369 TimeLineID replayTLI,
371 bool nonblocking);
372static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
374 XLogRecPtr RecPtr, TimeLineID replayTLI);
376static int XLogFileRead(XLogSegNo segno, TimeLineID tli,
379
380static bool CheckForStandbyTrigger(void);
381static void SetPromoteIsTriggered(void);
382static bool HotStandbyActiveInReplay(void);
383
385static void SetLatestXTime(TimestampTz xtime);
386
387/*
388 * Initialization of shared memory for WAL recovery
389 */
390Size
392{
393 Size size;
394
395 /* XLogRecoveryCtl */
396 size = sizeof(XLogRecoveryCtlData);
397
398 return size;
399}
400
401void
416
417/*
418 * A thin wrapper to enable StandbyMode and do other preparatory work as
419 * needed.
420 */
421static void
423{
424 StandbyMode = true;
425
426 /*
427 * To avoid server log bloat, we don't report recovery progress in a
428 * standby as it will always be in recovery unless promoted. We disable
429 * startup progress timeout in standby mode to avoid calling
430 * startup_progress_timeout_handler() unnecessarily.
431 */
433}
434
435/*
436 * Prepare the system for WAL recovery, if needed.
437 *
438 * This is called by StartupXLOG() which coordinates the server startup
439 * sequence. This function analyzes the control file and the backup label
440 * file, if any, and figures out whether we need to perform crash recovery or
441 * archive recovery, and how far we need to replay the WAL to reach a
442 * consistent state.
443 *
444 * This doesn't yet change the on-disk state, except for creating the symlinks
445 * from table space map file if any, and for fetching WAL files needed to find
446 * the checkpoint record. On entry, the caller has already read the control
447 * file into memory, and passes it as argument. This function updates it to
448 * reflect the recovery state, and the caller is expected to write it back to
449 * disk after initializing other subsystems, but before calling
450 * PerformWalRecovery().
451 *
452 * This initializes some global variables like ArchiveRecoveryRequested,
453 * StandbyModeRequested and InRecovery.
454 */
455void
458{
459 XLogPageReadPrivate *private;
460 struct stat st;
461 bool wasShutdown;
462 XLogRecord *record;
464 bool haveTblspcMap = false;
465 bool haveBackupLabel = false;
466 CheckPoint checkPoint;
467 bool backupFromStandby = false;
468
470
471 /*
472 * Initialize on the assumption we want to recover to the latest timeline
473 * that's active according to pg_control.
474 */
478 else
480
481 /*
482 * Check for signal files, and if so set up state for offline recovery
483 */
486
487 /*
488 * Take ownership of the wakeup latch if we're going to sleep during
489 * recovery, if required.
490 */
493
494 /*
495 * Set the WAL reading processor now, as it will be needed when reading
496 * the checkpoint record required (backup_label or not).
497 */
499 xlogreader =
501 XL_ROUTINE(.page_read = &XLogPageRead,
502 .segment_open = NULL,
503 .segment_close = wal_segment_close),
504 private);
505 if (!xlogreader)
508 errmsg("out of memory"),
509 errdetail("Failed while allocating a WAL reading processor.")));
511
512 /*
513 * Set the WAL decode buffer size. This limits how far ahead we can read
514 * in the WAL.
515 */
517
518 /* Create a WAL prefetcher. */
520
521 /*
522 * Allocate two page buffers dedicated to WAL consistency checks. We do
523 * it this way, rather than just making static arrays, for two reasons:
524 * (1) no need to waste the storage in most instantiations of the backend;
525 * (2) a static char array isn't guaranteed to have any particular
526 * alignment, whereas palloc() will provide MAXALIGN'd storage.
527 */
530
531 /*
532 * Read the backup_label file. We want to run this part of the recovery
533 * process after checking for signal files and after performing validation
534 * of the recovery parameters.
535 */
538 {
539 List *tablespaces = NIL;
540
541 /*
542 * Archive recovery was requested, and thanks to the backup label
543 * file, we know how far we need to replay to reach consistency. Enter
544 * archive recovery directly.
545 */
546 InArchiveRecovery = true;
549
550 /*
551 * Omitting backup_label when creating a new replica, PITR node etc.
552 * unfortunately is a common cause of corruption. Logging that
553 * backup_label was used makes it a bit easier to exclude that as the
554 * cause of observed corruption.
555 *
556 * Do so before we try to read the checkpoint record (which can fail),
557 * as otherwise it can be hard to understand why a checkpoint other
558 * than ControlFile->checkPoint is used.
559 */
560 ereport(LOG,
561 errmsg("starting backup recovery with redo LSN %X/%08X, checkpoint LSN %X/%08X, on timeline ID %u",
565
566 /*
567 * When a backup_label file is present, we want to roll forward from
568 * the checkpoint it identifies, rather than using pg_control.
569 */
572 if (record != NULL)
573 {
574 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
577 errmsg_internal("checkpoint record is at %X/%08X",
579 InRecovery = true; /* force recovery even if SHUTDOWNED */
580
581 /*
582 * Make sure that REDO location exists. This may not be the case
583 * if there was a crash during an online backup, which left a
584 * backup_label around that references a WAL segment that's
585 * already been archived.
586 */
587 if (checkPoint.redo < CheckPointLoc)
588 {
590 if (!ReadRecord(xlogprefetcher, LOG, false,
591 checkPoint.ThisTimeLineID))
593 errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
595 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
596 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
597 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
599 }
600 }
601 else
602 {
604 errmsg("could not locate required checkpoint record at %X/%08X",
606 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
607 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
608 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
610 wasShutdown = false; /* keep compiler quiet */
611 }
612
613 /* Read the tablespace_map file if present and create symlinks. */
614 if (read_tablespace_map(&tablespaces))
615 {
616 ListCell *lc;
617
618 foreach(lc, tablespaces)
619 {
621 char *linkloc;
622
623 linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
624
625 /*
626 * Remove the existing symlink, if any, and create the symlink
627 * under PGDATA.
628 */
630
631 if (symlink(ti->path, linkloc) < 0)
634 errmsg("could not create symbolic link \"%s\": %m",
635 linkloc)));
636
637 pfree(ti->path);
638 pfree(ti);
639 }
640
641 /* tell the caller to delete it later */
642 haveTblspcMap = true;
643 }
644
645 /* tell the caller to delete it later */
646 haveBackupLabel = true;
647 }
648 else
649 {
650 /* No backup_label file has been found if we are here. */
651
652 /*
653 * If tablespace_map file is present without backup_label file, there
654 * is no use of such file. There is no harm in retaining it, but it
655 * is better to get rid of the map file so that we don't have any
656 * redundant file in data directory and it will avoid any sort of
657 * confusion. It seems prudent though to just rename the file out of
658 * the way rather than delete it completely, also we ignore any error
659 * that occurs in rename operation as even if map file is present
660 * without backup_label file, it is harmless.
661 */
662 if (stat(TABLESPACE_MAP, &st) == 0)
663 {
666 ereport(LOG,
667 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
669 errdetail("File \"%s\" was renamed to \"%s\".",
671 else
672 ereport(LOG,
673 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
675 errdetail("Could not rename file \"%s\" to \"%s\": %m.",
677 }
678
679 /*
680 * It's possible that archive recovery was requested, but we don't
681 * know how far we need to replay the WAL before we reach consistency.
682 * This can happen for example if a base backup is taken from a
683 * running server using an atomic filesystem snapshot, without calling
684 * pg_backup_start/stop. Or if you just kill a running primary server
685 * and put it into archive recovery by creating a recovery signal
686 * file.
687 *
688 * Our strategy in that case is to perform crash recovery first,
689 * replaying all the WAL present in pg_wal, and only enter archive
690 * recovery after that.
691 *
692 * But usually we already know how far we need to replay the WAL (up
693 * to minRecoveryPoint, up to backupEndPoint, or until we see an
694 * end-of-backup record), and we can enter archive recovery directly.
695 */
701 {
702 InArchiveRecovery = true;
705 }
706
707 /*
708 * For the same reason as when starting up with backup_label present,
709 * emit a log message when we continue initializing from a base
710 * backup.
711 */
713 ereport(LOG,
714 errmsg("restarting backup recovery with redo LSN %X/%08X",
716
717 /* Get the last valid checkpoint record. */
724 if (record != NULL)
725 {
727 errmsg_internal("checkpoint record is at %X/%08X",
729 }
730 else
731 {
732 /*
733 * We used to attempt to go back to a secondary checkpoint record
734 * here, but only when not in standby mode. We now just fail if we
735 * can't read the last checkpoint because this allows us to
736 * simplify processing around checkpoints.
737 */
739 errmsg("could not locate a valid checkpoint record at %X/%08X",
741 }
742 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
744
745 /* Make sure that REDO location exists. */
746 if (checkPoint.redo < CheckPointLoc)
747 {
749 if (!ReadRecord(xlogprefetcher, LOG, false, checkPoint.ThisTimeLineID))
751 errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
753 }
754 }
755
757 {
759 ereport(LOG,
760 (errmsg("entering standby mode")));
762 ereport(LOG,
763 (errmsg("starting point-in-time recovery to XID %u",
766 ereport(LOG,
767 (errmsg("starting point-in-time recovery to %s",
770 ereport(LOG,
771 (errmsg("starting point-in-time recovery to \"%s\"",
774 ereport(LOG,
775 errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%08X\"",
778 ereport(LOG,
779 (errmsg("starting point-in-time recovery to earliest consistent point")));
780 else
781 ereport(LOG,
782 (errmsg("starting archive recovery")));
783 }
784
785 /*
786 * If the location of the checkpoint record is not on the expected
787 * timeline in the history of the requested timeline, we cannot proceed:
788 * the backup is not part of the history of the requested timeline.
789 */
790 Assert(expectedTLEs); /* was initialized by reading checkpoint
791 * record */
794 {
796
797 /*
798 * tliSwitchPoint will throw an error if the checkpoint's timeline is
799 * not in expectedTLEs at all.
800 */
803 (errmsg("requested timeline %u is not a child of this server's history",
805 /* translator: %s is a backup_label file or a pg_control file */
806 errdetail("Latest checkpoint in file \"%s\" is at %X/%08X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%08X.",
807 haveBackupLabel ? "backup_label" : "pg_control",
811 }
812
813 /*
814 * The min recovery point should be part of the requested timeline's
815 * history, too.
816 */
821 errmsg("requested timeline %u does not contain minimum recovery point %X/%08X on timeline %u",
825
827 errmsg_internal("redo record is at %X/%08X; shutdown %s",
828 LSN_FORMAT_ARGS(checkPoint.redo),
829 wasShutdown ? "true" : "false"));
831 (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
833 checkPoint.nextOid)));
835 (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %" PRIu64,
836 checkPoint.nextMulti, checkPoint.nextMultiOffset)));
838 (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
839 checkPoint.oldestXid, checkPoint.oldestXidDB)));
841 (errmsg_internal("oldest MultiXactId: %u, in database %u",
842 checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
844 (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
845 checkPoint.oldestCommitTsXid,
846 checkPoint.newestCommitTsXid)));
849 (errmsg("invalid next transaction ID")));
850
851 /* sanity check */
852 if (checkPoint.redo > CheckPointLoc)
854 (errmsg("invalid redo in checkpoint record")));
855
856 /*
857 * Check whether we need to force recovery from WAL. If it appears to
858 * have been a clean shutdown and we did not have a recovery signal file,
859 * then assume no recovery needed.
860 */
861 if (checkPoint.redo < CheckPointLoc)
862 {
863 if (wasShutdown)
865 (errmsg("invalid redo record in shutdown checkpoint")));
866 InRecovery = true;
867 }
868 else if (ControlFile->state != DB_SHUTDOWNED)
869 InRecovery = true;
871 {
872 /* force recovery due to presence of recovery signal file */
873 InRecovery = true;
874 }
875
876 /*
877 * If recovery is needed, update our in-memory copy of pg_control to show
878 * that we are recovering and to show the selected checkpoint as the place
879 * we are starting from. We also mark pg_control with any minimum recovery
880 * stop point obtained from a backup history file.
881 *
882 * We don't write the changes to disk yet, though. Only do that after
883 * initializing various subsystems.
884 */
885 if (InRecovery)
886 {
888 {
890 }
891 else
892 {
893 ereport(LOG,
894 (errmsg("database system was not properly shut down; "
895 "automatic recovery in progress")));
897 ereport(LOG,
898 (errmsg("crash recovery starts in timeline %u "
899 "and has target timeline %u",
903 }
905 ControlFile->checkPointCopy = checkPoint;
907 {
908 /* initialize minRecoveryPoint if not set yet */
909 if (ControlFile->minRecoveryPoint < checkPoint.redo)
910 {
911 ControlFile->minRecoveryPoint = checkPoint.redo;
913 }
914 }
915
916 /*
917 * Set backupStartPoint if we're starting recovery from a base backup.
918 *
919 * Also set backupEndPoint and use minRecoveryPoint as the backup end
920 * location if we're starting recovery from a base backup which was
921 * taken from a standby. In this case, the database system status in
922 * pg_control must indicate that the database was already in recovery.
923 * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
924 * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
925 * before reaching this point; e.g. because restore_command or
926 * primary_conninfo were faulty.
927 *
928 * Any other state indicates that the backup somehow became corrupted
929 * and we can't sensibly continue with recovery.
930 */
931 if (haveBackupLabel)
932 {
933 ControlFile->backupStartPoint = checkPoint.redo;
935
937 {
941 (errmsg("backup_label contains data inconsistent with control file"),
942 errhint("This means that the backup is corrupted and you will "
943 "have to use another backup for recovery.")));
945 }
946 }
947 }
948
949 /* remember these, so that we know when we have reached consistency */
954 {
957 }
958 else
959 {
962 }
963
964 /*
965 * Start recovery assuming that the final record isn't lost.
966 */
969
973}
974
975/*
976 * See if there are any recovery signal files and if so, set state for
977 * recovery.
978 *
979 * See if there is a recovery command file (recovery.conf), and if so
980 * throw an ERROR since as of PG12 we no longer recognize that.
981 */
982static void
984{
985 struct stat stat_buf;
986
988 return;
989
990 /*
991 * Check for old recovery API file: recovery.conf
992 */
996 errmsg("using recovery command file \"%s\" is not supported",
998
999 /*
1000 * Remove unused .done file, if present. Ignore if absent.
1001 */
1003
1004 /*
1005 * Check for recovery signal files and if found, fsync them since they
1006 * represent server state information. We don't sweat too much about the
1007 * possibility of fsync failure, however.
1008 */
1009 if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1010 {
1011 int fd;
1012
1014 S_IRUSR | S_IWUSR);
1015 if (fd >= 0)
1016 {
1017 (void) pg_fsync(fd);
1018 close(fd);
1019 }
1021 }
1022
1024 {
1025 int fd;
1026
1028 S_IRUSR | S_IWUSR);
1029 if (fd >= 0)
1030 {
1031 (void) pg_fsync(fd);
1032 close(fd);
1033 }
1035 }
1036
1037 /*
1038 * If both signal files are present, standby signal file takes precedence.
1039 * If neither is present then we won't enter archive recovery.
1040 */
1041 StandbyModeRequested = false;
1044 {
1045 StandbyModeRequested = true;
1047 }
1049 {
1050 StandbyModeRequested = false;
1052 }
1053 else
1054 return;
1055
1056 /*
1057 * We don't support standby mode in standalone backends; that requires
1058 * other processes such as the WAL receiver to be alive.
1059 */
1061 ereport(FATAL,
1063 errmsg("standby mode is not supported by single-user servers")));
1064}
1065
1066static void
1068{
1070 return;
1071
1072 /*
1073 * Check for compulsory parameters
1074 */
1076 {
1077 if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1080 (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1081 errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1082 }
1083 else
1084 {
1087 ereport(FATAL,
1089 errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1090 }
1091
1092 /*
1093 * Override any inconsistent requests. Note that this is a change of
1094 * behaviour in 9.5; prior to this we simply ignored a request to pause if
1095 * hot_standby = off, which was surprising behaviour.
1096 */
1100
1101 /*
1102 * Final parsing of recovery_target_time string; see also
1103 * check_recovery_target_time().
1104 */
1106 {
1110 Int32GetDatum(-1)));
1111 }
1112
1113 /*
1114 * If user specified recovery_target_timeline, validate it or compute the
1115 * "latest" value. We can't do this until after we've gotten the restore
1116 * command and set InArchiveRecovery, because we need to fetch timeline
1117 * history files from the archive.
1118 */
1120 {
1122
1123 /* Timeline 1 does not have a history file, all else should */
1124 if (rtli != 1 && !existsTimeLineHistory(rtli))
1125 ereport(FATAL,
1127 errmsg("recovery target timeline %u does not exist",
1128 rtli)));
1130 }
1132 {
1133 /* We start the "latest" search from pg_control's timeline */
1135 }
1136 else
1137 {
1138 /*
1139 * else we just use the recoveryTargetTLI as already read from
1140 * ControlFile
1141 */
1143 }
1144}
1145
1146/*
1147 * read_backup_label: check to see if a backup_label file is present
1148 *
1149 * If we see a backup_label during recovery, we assume that we are recovering
1150 * from a backup dump file, and we therefore roll forward from the checkpoint
1151 * identified by the label file, NOT what pg_control says. This avoids the
1152 * problem that pg_control might have been archived one or more checkpoints
1153 * later than the start of the dump, and so if we rely on it as the start
1154 * point, we will fail to restore a consistent database state.
1155 *
1156 * Returns true if a backup_label was found (and fills the checkpoint
1157 * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1158 * returns false if not. If this backup_label came from a streamed backup,
1159 * *backupEndRequired is set to true. If this backup_label was created during
1160 * recovery, *backupFromStandby is set to true.
1161 *
1162 * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1163 * and TLI read from the backup file.
1164 */
1165static bool
1168{
1172 FILE *lfp;
1173 char ch;
1174 char backuptype[20];
1175 char backupfrom[20];
1176 char backuplabel[MAXPGPATH];
1177 char backuptime[128];
1178 uint32 hi,
1179 lo;
1180
1181 /* suppress possible uninitialized-variable warnings */
1183 *backupLabelTLI = 0;
1184 *backupEndRequired = false;
1185 *backupFromStandby = false;
1186
1187 /*
1188 * See if label file is present
1189 */
1191 if (!lfp)
1192 {
1193 if (errno != ENOENT)
1194 ereport(FATAL,
1196 errmsg("could not read file \"%s\": %m",
1198 return false; /* it's not there, all is fine */
1199 }
1200
1201 /*
1202 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1203 * is pretty crude, but we are not expecting any variability in the file
1204 * format).
1205 */
1206 if (fscanf(lfp, "START WAL LOCATION: %X/%08X (file %08X%16s)%c",
1207 &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1208 ereport(FATAL,
1210 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1211 RedoStartLSN = ((uint64) hi) << 32 | lo;
1213 if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%08X%c",
1214 &hi, &lo, &ch) != 3 || ch != '\n')
1215 ereport(FATAL,
1217 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1218 *checkPointLoc = ((uint64) hi) << 32 | lo;
1220
1221 /*
1222 * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1223 * which could mean either pg_basebackup or the pg_backup_start/stop
1224 * method was used) or if this label came from somewhere else (the only
1225 * other option today being from pg_rewind). If this was a streamed
1226 * backup then we know that we need to play through until we get to the
1227 * end of the WAL which was generated during the backup (at which point we
1228 * will have reached consistency and backupEndRequired will be reset to be
1229 * false).
1230 */
1231 if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1232 {
1233 if (strcmp(backuptype, "streamed") == 0)
1234 *backupEndRequired = true;
1235 }
1236
1237 /*
1238 * BACKUP FROM lets us know if this was from a primary or a standby. If
1239 * it was from a standby, we'll double-check that the control file state
1240 * matches that of a standby.
1241 */
1242 if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1243 {
1244 if (strcmp(backupfrom, "standby") == 0)
1245 *backupFromStandby = true;
1246 }
1247
1248 /*
1249 * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1250 * but checking for their presence is useful for debugging and the next
1251 * sanity checks. Note that the result buffers have a fixed size, so if
1252 * the backup_label file was generated with strings longer than the
1253 * maximum assumed here, those fields will be parsed incorrectly.
1254 * That's fine, as only minor consistency checks are done
1255 * afterwards.
1256 */
1257 if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1259 (errmsg_internal("backup time %s in file \"%s\"",
1261
1262 if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1264 (errmsg_internal("backup label %s in file \"%s\"",
1266
1267 /*
1268 * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1269 * it as a sanity check if present.
1270 */
1271 if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1272 {
1274 ereport(FATAL,
1276 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1277 errdetail("Timeline ID parsed is %u, but expected %u.",
1279
1281 (errmsg_internal("backup timeline %u in file \"%s\"",
1283 }
1284
1285 if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%08X\n", &hi, &lo) > 0)
1286 ereport(FATAL,
1288 errmsg("this is an incremental backup, not a data directory"),
1289 errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1290
1291 if (ferror(lfp) || FreeFile(lfp))
1292 ereport(FATAL,
1294 errmsg("could not read file \"%s\": %m",
1296
1297 return true;
1298}
1299
1300/*
1301 * read_tablespace_map: check to see if a tablespace_map file is present
1302 *
1303 * If we see a tablespace_map file during recovery, we assume that we are
1304 * recovering from a backup dump file, and we therefore need to create symlinks
1305 * as per the information present in tablespace_map file.
1306 *
1307 * Returns true if a tablespace_map file was found (and fills *tablespaces
1308 * with a tablespaceinfo struct for each tablespace listed in the file);
1309 * returns false if not.
1310 */
1311static bool
1313{
1315 FILE *lfp;
1316 char str[MAXPGPATH];
1317 int ch,
1318 i,
1319 n;
1320 bool was_backslash;
1321
1322 /*
1323 * See if tablespace_map file is present
1324 */
1326 if (!lfp)
1327 {
1328 if (errno != ENOENT)
1329 ereport(FATAL,
1331 errmsg("could not read file \"%s\": %m",
1332 TABLESPACE_MAP)));
1333 return false; /* it's not there, all is fine */
1334 }
1335
1336 /*
1337 * Read and parse the link name and path lines from tablespace_map file
1338 * (this code is pretty crude, but we are not expecting any variability in
1339 * the file format). De-escape any backslashes that were inserted.
1340 */
1341 i = 0;
1342 was_backslash = false;
1343 while ((ch = fgetc(lfp)) != EOF)
1344 {
1345 if (!was_backslash && (ch == '\n' || ch == '\r'))
1346 {
1347 char *endp;
1348
1349 if (i == 0)
1350 continue; /* \r immediately followed by \n */
1351
1352 /*
1353 * The de-escaped line should contain an OID followed by exactly
1354 * one space followed by a path. The path might start with
1355 * spaces, so don't be too liberal about parsing.
1356 */
1357 str[i] = '\0';
1358 n = 0;
1359 while (str[n] && str[n] != ' ')
1360 n++;
1361 if (n < 1 || n >= i - 1)
1362 ereport(FATAL,
1364 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1365 str[n++] = '\0';
1366
1368 errno = 0;
1369 ti->oid = strtoul(str, &endp, 10);
1370 if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1371 ereport(FATAL,
1373 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1374 ti->path = pstrdup(str + n);
1375 *tablespaces = lappend(*tablespaces, ti);
1376
1377 i = 0;
1378 continue;
1379 }
1380 else if (!was_backslash && ch == '\\')
1381 was_backslash = true;
1382 else
1383 {
1384 if (i < sizeof(str) - 1)
1385 str[i++] = ch;
1386 was_backslash = false;
1387 }
1388 }
1389
1390 if (i != 0 || was_backslash) /* last line not terminated? */
1391 ereport(FATAL,
1393 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1394
1395 if (ferror(lfp) || FreeFile(lfp))
1396 ereport(FATAL,
1398 errmsg("could not read file \"%s\": %m",
1399 TABLESPACE_MAP)));
1400
1401 return true;
1402}
1403
1404/*
1405 * Finish WAL recovery.
1406 *
1407 * This does not close the 'xlogreader' yet, because in some cases the caller
1408 * still wants to re-read the last checkpoint record by calling
1409 * ReadCheckpointRecord().
1410 *
1411 * Returns the position of the last valid or applied record, after which new
1412 * WAL should be appended, information about why recovery was ended, and some
1413 * other things. See the EndOfWalRecoveryInfo struct for details.
1414 */
1417{
1419 XLogRecPtr lastRec;
1420 TimeLineID lastRecTLI;
1421 XLogRecPtr endOfLog;
1422
1423 /*
1424 * Kill WAL receiver, if it's still running, before we continue to write
1425 * the startup checkpoint and aborted-contrecord records. It would trample
1426 * over these records and subsequent ones if it's still alive when we
1427 * start writing WAL.
1428 */
1430
1431 /*
1432 * Shut down the slot sync worker to drop any temporary slots acquired by
1433 * it and to prevent it from continuing to try to fetch the failover slots.
1434 *
1435 * We do not update the 'synced' column in 'pg_replication_slots' system
1436 * view from true to false here, as any failed update could leave 'synced'
1437 * column false for some slots. This could cause issues during slot sync
1438 * after restarting the server as a standby. While updating the 'synced'
1439 * column after switching to the new timeline is an option, it does not
1440 * simplify the handling for the 'synced' column. Therefore, we retain the
1441 * 'synced' column as true after promotion as it may provide useful
1442 * information about the slot origin.
1443 */
1445
1446 /*
1447 * We are now done reading the xlog from stream. Turn off streaming
1448 * recovery to force fetching the files (which would be required at end of
1449 * recovery, e.g., timeline history file) from archive or pg_wal.
1450 *
1451 * Note that standby mode must be turned off after killing WAL receiver,
1452 * i.e., calling XLogShutdownWalRcv().
1453 */
1455 StandbyMode = false;
1456
1457 /*
1458 * Determine where to start writing WAL next.
1459 *
1460 * Re-fetch the last valid or last applied record, so we can identify the
1461 * exact endpoint of what we consider the valid portion of WAL. There may
1462 * be an incomplete continuation record after that, in which case
1463 * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1464 * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1465 * it is intentionally missing. See CreateOverwriteContrecordRecord().
1466 *
1467 * An important side-effect of this is to load the last page into
1468 * xlogreader. The caller uses it to initialize the WAL for writing.
1469 */
1470 if (!InRecovery)
1471 {
1472 lastRec = CheckPointLoc;
1473 lastRecTLI = CheckPointTLI;
1474 }
1475 else
1476 {
1478 lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1479 }
1481 (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1482 endOfLog = xlogreader->EndRecPtr;
1483
1484 /*
1485 * Remember the TLI in the filename of the XLOG segment containing the
1486 * end-of-log. It could be different from the timeline that endOfLog
1487 * nominally belongs to, if there was a timeline switch in that segment,
1488 * and we were reading the old WAL from a segment belonging to a higher
1489 * timeline.
1490 */
1491 result->endOfLogTLI = xlogreader->seg.ws_tli;
1492
1494 {
1495 /*
1496 * We are no longer in archive recovery state.
1497 *
1498 * We are now done reading the old WAL. Turn off archive fetching if
1499 * it was active.
1500 */
1502 InArchiveRecovery = false;
1503
1504 /*
1505 * If the ending log segment is still open, close it (to avoid
1506 * problems on Windows with trying to rename or delete an open file).
1507 */
1508 if (readFile >= 0)
1509 {
1510 close(readFile);
1511 readFile = -1;
1512 }
1513 }
1514
1515 /*
1516 * Copy the last partial block to the caller, for initializing the WAL
1517 * buffer for appending new WAL.
1518 */
1519 if (endOfLog % XLOG_BLCKSZ != 0)
1520 {
1521 char *page;
1522 int len;
1524
1525 pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1527
1528 /* Copy the valid part of the last block */
1529 len = endOfLog % XLOG_BLCKSZ;
1530 page = palloc(len);
1531 memcpy(page, xlogreader->readBuf, len);
1532
1534 result->lastPage = page;
1535 }
1536 else
1537 {
1538 /* There is no partial block to copy. */
1539 result->lastPageBeginPtr = endOfLog;
1540 result->lastPage = NULL;
1541 }
1542
1543 /*
1544 * Create a comment for the history file to explain why and where timeline
1545 * changed.
1546 */
1548
1549 result->lastRec = lastRec;
1550 result->lastRecTLI = lastRecTLI;
1551 result->endOfLog = endOfLog;
1552
1553 result->abortedRecPtr = abortedRecPtr;
1555
1558
1559 return result;
1560}
1561
1562/*
1563 * Clean up the WAL reader and leftovers from restoring WAL from archive
1564 */
1565void
1567{
1568 char recoveryPath[MAXPGPATH];
1569
1570 /* Final update of pg_stat_recovery_prefetch. */
1572
1573 /* Shut down xlogreader */
1574 if (readFile >= 0)
1575 {
1576 close(readFile);
1577 readFile = -1;
1578 }
1582
1584 {
1585 /*
1586 * Since there might be a partial WAL segment named RECOVERYXLOG, get
1587 * rid of it.
1588 */
1589 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1590 unlink(recoveryPath); /* ignore any error */
1591
1592 /* Get rid of any remaining recovered timeline-history file, too */
1593 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1594 unlink(recoveryPath); /* ignore any error */
1595 }
1596
1597 /*
1598 * We don't need the latch anymore. It's not strictly necessary to disown
1599 * it, but let's do it for the sake of tidiness.
1600 */
1603}
1604
1605/*
1606 * Perform WAL recovery.
1607 *
1608 * If the system was shut down cleanly, this is never called.
1609 */
1610void
1612{
1613 XLogRecord *record;
1614 bool reachedRecoveryTarget = false;
1615 TimeLineID replayTLI;
1616
1617 /*
1618 * Initialize shared variables for tracking progress of WAL replay, as if
1619 * we had just replayed the record before the REDO location (or the
1620 * checkpoint record itself, if it's a shutdown checkpoint).
1621 */
1624 {
1628 }
1629 else
1630 {
1634 }
1641
1642 /* Also ensure XLogReceiptTime has a sane value */
1644
1645 /*
1646 * Let postmaster know we've started redo now, so that it can launch the
1647 * archiver if necessary.
1648 */
1651
1652 /*
1653 * Allow read-only connections immediately if we're consistent already.
1654 */
1656
1657 /*
1658 * Find the first record that logically follows the checkpoint --- it
1659 * might physically precede it, though.
1660 */
1662 {
1663 /* back up to find the record */
1664 replayTLI = RedoStartTLI;
1666 record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1667
1668 /*
1669 * If a checkpoint record's redo pointer points back to an earlier
1670 * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1671 * record.
1672 */
1673 if (record->xl_rmid != RM_XLOG_ID ||
1675 ereport(FATAL,
1676 errmsg("unexpected record type found at redo point %X/%08X",
1678 }
1679 else
1680 {
1681 /* just have to read next record after CheckPoint */
1683 replayTLI = CheckPointTLI;
1684 record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1685 }
1686
1687 if (record != NULL)
1688 {
1690 PGRUsage ru0;
1691
1693
1694 InRedo = true;
1695
1696 RmgrStartup();
1697
1698 ereport(LOG,
1699 errmsg("redo starts at %X/%08X",
1701
1702 /* Prepare to report progress of the redo phase. */
1703 if (!StandbyMode)
1705
1706 /*
1707 * main redo apply loop
1708 */
1709 do
1710 {
1711 if (!StandbyMode)
1712 ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%08X",
1714
1715#ifdef WAL_DEBUG
1716 if (XLOG_DEBUG)
1717 {
1719
1721 appendStringInfo(&buf, "REDO @ %X/%08X; LSN %X/%08X: ",
1725 appendStringInfoString(&buf, " - ");
1727 elog(LOG, "%s", buf.data);
1728 pfree(buf.data);
1729 }
1730#endif
1731
1732 /* Handle interrupt signals of startup process */
1734
1735 /*
1736 * Pause WAL replay, if requested by a hot-standby session via
1737 * SetRecoveryPause().
1738 *
1739 * Note that we intentionally don't take the info_lck spinlock
1740 * here. We might therefore read a slightly stale value of the
1741 * recoveryPause flag, but it can't be very stale (no worse than
1742 * the last spinlock we did acquire). Since a pause request is a
1743 * pretty asynchronous thing anyway, possibly responding to it one
1744 * WAL record later than we otherwise would is a minor issue, so
1745 * it doesn't seem worth adding another spinlock cycle to prevent
1746 * that.
1747 */
1748 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1750 recoveryPausesHere(false);
1751
1752 /*
1753 * Have we reached our recovery target?
1754 */
1756 {
1757 reachedRecoveryTarget = true;
1758 break;
1759 }
1760
1761 /*
1762 * If we've been asked to lag the primary, wait on latch until
1763 * enough time has passed.
1764 */
1766 {
1767 /*
1768 * We test for paused recovery again here. If user sets
1769 * delayed apply, it may be because they expect to pause
1770 * recovery in case of problems, so we must test again here
1771 * otherwise pausing during the delay-wait wouldn't work.
1772 */
1773 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1775 recoveryPausesHere(false);
1776 }
1777
1778 /*
1779 * Apply the record
1780 */
1781 ApplyWalRecord(xlogreader, record, &replayTLI);
1782
1783 /*
1784 * If we replayed an LSN that someone was waiting for then walk
1785 * over the shared memory array and set latches to notify the
1786 * waiters.
1787 */
1788 if (waitLSNState &&
1792
1793 /* Exit loop if we reached inclusive recovery target */
1795 {
1796 reachedRecoveryTarget = true;
1797 break;
1798 }
1799
1800 /* Else, try to fetch the next WAL record */
1801 record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1802 } while (record != NULL);
1803
1804 /*
1805 * end of main redo apply loop
1806 */
1807
1809 {
1810 if (!reachedConsistency)
1811 ereport(FATAL,
1812 (errmsg("requested recovery stop point is before consistent recovery point")));
1813
1814 /*
1815 * This is the last point where we can restart recovery with a new
1816 * recovery target, if we shutdown and begin again. After this,
1817 * Resource Managers may choose to do permanent corrective actions
1818 * at end of recovery.
1819 */
1820 switch (recoveryTargetAction)
1821 {
1823
1824 /*
1825 * exit with special return code to request shutdown of
1826 * postmaster. Log messages issued from postmaster.
1827 */
1828 proc_exit(3);
1829
1831 SetRecoveryPause(true);
1832 recoveryPausesHere(true);
1833
1834 /* drop into promote */
1836
1838 break;
1839 }
1840 }
1841
1842 RmgrCleanup();
1843
1844 ereport(LOG,
1845 errmsg("redo done at %X/%08X system usage: %s",
1847 pg_rusage_show(&ru0)));
1849 if (xtime)
1850 ereport(LOG,
1851 (errmsg("last completed transaction was at log time %s",
1853
1854 InRedo = false;
1855 }
1856 else
1857 {
1858 /* there are no WAL records following the checkpoint */
1859 ereport(LOG,
1860 (errmsg("redo is not required")));
1861 }
1862
1863 /*
1864 * This check is intentionally after the above log messages that indicate
1865 * how far recovery went.
1866 */
1870 ereport(FATAL,
1872 errmsg("recovery ended before configured recovery target was reached")));
1873}
1874
1875/*
1876 * Subroutine of PerformWalRecovery, to apply one WAL record.
1877 */
1878static void
1880{
1881 ErrorContextCallback errcallback;
1882 bool switchedTLI = false;
1883
1884 /* Setup error traceback support for ereport() */
1885 errcallback.callback = rm_redo_error_callback;
1886 errcallback.arg = xlogreader;
1887 errcallback.previous = error_context_stack;
1888 error_context_stack = &errcallback;
1889
1890 /*
1891 * TransamVariables->nextXid must be beyond record's xid.
1892 */
1894
1895 /*
1896 * Before replaying this record, check if this record causes the current
1897 * timeline to change. The record is already considered to be part of the
1898 * new timeline, so we update replayTLI before replaying it. That's
1899 * important so that replayEndTLI, which is recorded as the minimum
1900 * recovery point's TLI if recovery stops after this record, is set
1901 * correctly.
1902 */
1903 if (record->xl_rmid == RM_XLOG_ID)
1904 {
1905 TimeLineID newReplayTLI = *replayTLI;
1906 TimeLineID prevReplayTLI = *replayTLI;
1907 uint8 info = record->xl_info & ~XLR_INFO_MASK;
1908
1909 if (info == XLOG_CHECKPOINT_SHUTDOWN)
1910 {
1911 CheckPoint checkPoint;
1912
1913 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1914 newReplayTLI = checkPoint.ThisTimeLineID;
1915 prevReplayTLI = checkPoint.PrevTimeLineID;
1916 }
1917 else if (info == XLOG_END_OF_RECOVERY)
1918 {
1920
1922 newReplayTLI = xlrec.ThisTimeLineID;
1923 prevReplayTLI = xlrec.PrevTimeLineID;
1924 }
1925
1926 if (newReplayTLI != *replayTLI)
1927 {
1928 /* Check that it's OK to switch to this TLI */
1930 newReplayTLI, prevReplayTLI, *replayTLI);
1931
1932 /* Following WAL records should be run with new TLI */
1933 *replayTLI = newReplayTLI;
1934 switchedTLI = true;
1935 }
1936 }
1937
1938 /*
1939 * Update shared replayEndRecPtr before replaying this record, so that
1940 * XLogFlush will update minRecoveryPoint correctly.
1941 */
1944 XLogRecoveryCtl->replayEndTLI = *replayTLI;
1946
1947 /*
1948 * If we are attempting to enter Hot Standby mode, process XIDs we see
1949 */
1953
1954 /*
1955 * Some XLOG record types that are related to recovery are processed
1956 * directly here, rather than in xlog_redo()
1957 */
1958 if (record->xl_rmid == RM_XLOG_ID)
1959 xlogrecovery_redo(xlogreader, *replayTLI);
1960
1961 /* Now apply the WAL record itself */
1963
1964 /*
1965 * After redo, check whether the backup pages associated with the WAL
1966 * record are consistent with the existing pages. This check is done only
1967 * if consistency check is enabled for this record.
1968 */
1969 if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
1971
1972 /* Pop the error context stack */
1973 error_context_stack = errcallback.previous;
1974
1975 /*
1976 * Update lastReplayedEndRecPtr after this record has been successfully
1977 * replayed.
1978 */
1982 XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
1984
1985 /* ------
1986 * Wakeup walsenders:
1987 *
1988 * On the standby, the WAL is flushed first (which will only wake up
1989 * physical walsenders) and then applied, which will only wake up logical
1990 * walsenders.
1991 *
1992 * Indeed, logical walsenders on standby can't decode and send data until
1993 * it's been applied.
1994 *
1995 * Physical walsenders don't need to be woken up during replay unless
1996 * cascading replication is allowed and a timeline change occurred (so
1997 * that they can notice that they are on a new timeline).
1998 *
1999 * That's why the wake up conditions are for:
2000 *
2001 * - physical walsenders in case of a new timeline when cascading
2002 * replication is allowed
2003 * - logical walsenders in case cascading replication is allowed (could not
2004 * be created otherwise)
2005 * ------
2006 */
2009
2010 /*
2011 * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2012 * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2013 * a reply to the primary.
2014 */
2016 {
2019 }
2020
2021 /* Allow read-only connections if we're consistent now */
2023
2024 /* Is this a timeline switch? */
2025 if (switchedTLI)
2026 {
2027 /*
2028 * Before we continue on the new timeline, clean up any (possibly
2029 * bogus) future WAL segments on the old timeline.
2030 */
2032
2033 /* Reset the prefetcher. */
2035 }
2036}
2037
2038/*
2039 * Some XLOG RM record types that are directly related to WAL recovery are
2040 * handled here rather than in xlog_redo().
2041 */
2042static void
2044{
2045 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2046 XLogRecPtr lsn = record->EndRecPtr;
2047
2048 Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2049
2050 if (info == XLOG_OVERWRITE_CONTRECORD)
2051 {
2052 /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2054
2056 if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2057 elog(FATAL, "mismatching overwritten LSN %X/%08X -> %X/%08X",
2058 LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2060
2061 /* We have safely skipped the aborted record */
2064
2065 ereport(LOG,
2066 errmsg("successfully skipped missing contrecord at %X/%08X, overwritten at %s",
2067 LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2068 timestamptz_to_str(xlrec.overwrite_time)));
2069
2070 /* Verifying the record should only happen once */
2072 }
2073 else if (info == XLOG_BACKUP_END)
2074 {
2075 XLogRecPtr startpoint;
2076
2077 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2078
2079 if (backupStartPoint == startpoint)
2080 {
2081 /*
2082 * We have reached the end of base backup, the point where
2083 * pg_backup_stop() was done. The data on disk is now consistent
2084 * (assuming we have also reached minRecoveryPoint). Set
2085 * backupEndPoint to the current LSN, so that the next call to
2086 * CheckRecoveryConsistency() will notice it and do the
2087 * end-of-backup processing.
2088 */
2089 elog(DEBUG1, "end of backup record reached");
2090
2091 backupEndPoint = lsn;
2092 }
2093 else
2094 elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%08X, waiting for %X/%08X",
2096 }
2097}
2098
2099/*
2100 * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2101 * directories.
2102 *
2103 * Replay of database creation XLOG records for databases that were later
2104 * dropped can create fake directories in pg_tblspc. By the time consistency
2105 * is reached these directories should have been removed; here we verify
2106 * that this did indeed happen. This is to be called at the point where
2107 * consistent state is reached.
2108 *
2109 * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2110 * useful for testing purposes, and also allows for an escape hatch in case
2111 * things go south.
2112 */
2113static void
2115{
2116 DIR *dir;
2117 struct dirent *de;
2118
2120 while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
2121 {
2122 char path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
2123
2124 /* Skip entries of non-oid names */
2125 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2126 continue;
2127
2128 snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
2129
2130 if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2133 errmsg("unexpected directory entry \"%s\" found in %s",
2134 de->d_name, PG_TBLSPC_DIR),
2135 errdetail("All directory entries in %s/ should be symbolic links.",
2137 errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2138 }
2139}
2140
2141/*
2142 * Checks if recovery has reached a consistent state. When consistency is
2143 * reached and we have a valid starting standby snapshot, tell postmaster
2144 * that it can start accepting read-only connections.
2145 */
2146static void
2148{
2149 XLogRecPtr lastReplayedEndRecPtr;
2150 TimeLineID lastReplayedTLI;
2151
2152 /*
2153 * During crash recovery, we don't reach a consistent state until we've
2154 * replayed all the WAL.
2155 */
2157 return;
2158
2160
2161 /*
2162 * assume that we are called in the startup process, and hence don't need
2163 * a lock to read lastReplayedEndRecPtr
2164 */
2165 lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2166 lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2167
2168 /*
2169 * Have we reached the point where our base backup was completed?
2170 */
2172 backupEndPoint <= lastReplayedEndRecPtr)
2173 {
2176
2177 elog(DEBUG1, "end of backup reached");
2178
2179 /*
2180 * We have reached the end of base backup, as indicated by pg_control.
2181 * Update the control file accordingly.
2182 */
2183 ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2186 backupEndRequired = false;
2187
2188 ereport(LOG,
2189 errmsg("completed backup recovery with redo LSN %X/%08X and end LSN %X/%08X",
2192 }
2193
2194 /*
2195 * Have we passed our safe starting point? Note that minRecoveryPoint is
2196 * known to be incorrectly set if recovering from a backup, until the
2197 * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2198 * All we know prior to that is that we're not consistent yet.
2199 */
2201 minRecoveryPoint <= lastReplayedEndRecPtr)
2202 {
2203 /*
2204 * Check to see if the XLOG sequence contained any unresolved
2205 * references to uninitialized pages.
2206 */
2208
2209 /*
2210 * Check that pg_tblspc doesn't contain any real directories. Replay
2211 * of Database/CREATE_* records may have created fictitious tablespace
2212 * directories that should have been removed by the time consistency
2213 * was reached.
2214 */
2216
2217 reachedConsistency = true;
2219 ereport(LOG,
2220 errmsg("consistent recovery state reached at %X/%08X",
2221 LSN_FORMAT_ARGS(lastReplayedEndRecPtr)));
2222 }
2223
2224 /*
2225 * Have we got a valid starting snapshot that will allow queries to be
2226 * run? If so, we can tell postmaster that the database is consistent now,
2227 * enabling connections.
2228 */
2233 {
2237
2238 LocalHotStandbyActive = true;
2239
2241 }
2242}
2243
2244/*
2245 * Error context callback for errors occurring during rm_redo().
2246 */
2247static void
2249{
2250 XLogReaderState *record = (XLogReaderState *) arg;
2252
2254 xlog_outdesc(&buf, record);
2255 xlog_block_info(&buf, record);
2256
2257 /* translator: %s is a WAL record description */
2258 errcontext("WAL redo at %X/%08X for %s",
2259 LSN_FORMAT_ARGS(record->ReadRecPtr),
2260 buf.data);
2261
2262 pfree(buf.data);
2263}
2264
2265/*
2266 * Appends to 'buf' a description of an XLogRecord, consisting of its identity
2267 * optionally followed by a colon, a space, and a further description.
2268 */
2269void
2271{
2273 uint8 info = XLogRecGetInfo(record);
2274 const char *id;
2275
2278
2279 id = rmgr.rm_identify(info);
2280 if (id == NULL)
2281 appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2282 else
2283 appendStringInfo(buf, "%s: ", id);
2284
2285 rmgr.rm_desc(buf, record);
2286}
2287
2288#ifdef WAL_DEBUG
2289
2290static void
2292{
2293 appendStringInfo(buf, "prev %X/%08X; xid %u",
2295 XLogRecGetXid(record));
2296
2297 appendStringInfo(buf, "; len %u",
2298 XLogRecGetDataLen(record));
2299
2300 xlog_block_info(buf, record);
2301}
2302#endif /* WAL_DEBUG */
2303
2304/*
2305 * Appends to 'buf' a string giving information about all the blocks in an
2306 * XLogRecord.
2307 */
2308static void
2310{
2311 int block_id;
2312
2313 /* decode block references */
2314 for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2315 {
2316 RelFileLocator rlocator;
2317 ForkNumber forknum;
2319
2321 &rlocator, &forknum, &blk, NULL))
2322 continue;
2323
2324 if (forknum != MAIN_FORKNUM)
2325 appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2326 block_id,
2327 rlocator.spcOid, rlocator.dbOid,
2328 rlocator.relNumber,
2329 forknum,
2330 blk);
2331 else
2332 appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2333 block_id,
2334 rlocator.spcOid, rlocator.dbOid,
2335 rlocator.relNumber,
2336 blk);
2337 if (XLogRecHasBlockImage(record, block_id))
2338 appendStringInfoString(buf, " FPW");
2339 }
2340}
2341
2342
2343/*
2344 * Check that it's OK to switch to new timeline during recovery.
2345 *
2346 * 'lsn' is the address of the record we're about to replay. (Currently, the
2347 * timeline can only change at a shutdown checkpoint or end-of-recovery record).
2348 */
2349static void
2351 TimeLineID replayTLI)
2352{
2353 /* Check that the record agrees on what the current (old) timeline is */
2354 if (prevTLI != replayTLI)
2355 ereport(PANIC,
2356 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2357 prevTLI, replayTLI)));
2358
2359 /*
2360 * The new timeline better be in the list of timelines we expect to see,
2361 * according to the timeline history. It should also not decrease.
2362 */
2363 if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2364 ereport(PANIC,
2365 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2366 newTLI, replayTLI)));
2367
2368 /*
2369 * If we have not yet reached min recovery point, and we're about to
2370 * switch to a timeline greater than the timeline of the min recovery
2371 * point: trouble. After switching to the new timeline, we could not
2372 * possibly visit the min recovery point on the correct timeline anymore.
2373 * This can happen if there is a newer timeline in the archive that
2374 * branched before the timeline the min recovery point is on, and you
2375 * attempt to do PITR to the new timeline.
2376 */
2378 lsn < minRecoveryPoint &&
2380 ereport(PANIC,
2381 errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%08X on timeline %u",
2382 newTLI,
2385
2386 /* Looks good */
2387}
2388
2389
2390/*
2391 * Extract timestamp from WAL record.
2392 *
2393 * If the record contains a timestamp, returns true, and saves the timestamp
2394 * in *recordXtime. If the record type has no timestamp, returns false.
2395 * Currently, only transaction commit/abort records and restore points contain
2396 * timestamps.
2397 */
2398static bool
2400{
 /* Keep only the info bits outside XLR_INFO_MASK; they are matched against record-type codes below. */
2401 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2403 uint8 rmid = XLogRecGetRmid(record);
2404
 /* Restore point record: the timestamp is the rp_time field of xl_restore_point. */
2405 if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2406 {
2407 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2408 return true;
2409 }
 /*
  * Commit-class transaction record: the timestamp is xact_time, read
  * through xl_xact_commit.
  * NOTE(review): xact_info is not declared in the lines visible here;
  * presumably it is the info value masked with XLOG_XACT_OPMASK. Confirm.
  */
2410 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2412 {
2413 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2414 return true;
2415 }
 /* Abort-class transaction record: same idea, read through xl_xact_abort. */
2416 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2418 {
2419 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2420 return true;
2421 }
 /* Any other record type carries no timestamp; *recordXtime is left untouched. */
2422 return false;
2423}
2424
2425/*
2426 * Checks whether the current buffer page and backup page stored in the
2427 * WAL record are consistent or not. Before comparing the two pages, a
2428 * masking can be applied to the pages to ignore certain areas like hint bits,
2429 * unused space between pd_lower and pd_upper among other things. This
2430 * function should be called once WAL replay has been completed for a
2431 * given record.
2432 */
2433static void
2435{
2437 RelFileLocator rlocator;
2438 ForkNumber forknum;
2439 BlockNumber blkno;
2440 int block_id;
2441
2442 /* Records with no backup blocks have no need for consistency checks. */
2443 if (!XLogRecHasAnyBlockRefs(record))
2444 return;
2445
2447
 /* Visit every block id from 0 up to and including the record's maximum block id. */
2448 for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2449 {
2450 Buffer buf;
2451 Page page;
2452
2454 &rlocator, &forknum, &blkno, NULL))
2455 {
2456 /*
2457 * WAL record doesn't contain a block reference with the given id.
2458 * Do nothing.
2459 */
2460 continue;
2461 }
2462
2464
2465 if (XLogRecBlockImageApply(record, block_id))
2466 {
2467 /*
2468 * WAL record has already applied the page, so bypass the
2469 * consistency check as that would result in comparing the full
2470 * page stored in the record with itself.
2471 */
2472 continue;
2473 }
2474
2475 /*
2476 * Read the contents from the current buffer and store it in a
2477 * temporary page. Blocks for which no valid buffer comes back are
 * skipped.
2478 */
2479 buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2482 if (!BufferIsValid(buf))
2483 continue;
2484
2486 page = BufferGetPage(buf);
2487
2488 /*
2489 * Take a copy of the local page where WAL has been applied to have a
2490 * comparison base before masking it...
2491 */
2493
2494 /* No need for this page anymore now that a copy is in. */
2496
2497 /*
2498 * If the block LSN is already ahead of this WAL record, we can't
2499 * expect contents to match. This can happen if recovery is
2500 * restarted.
2501 */
2503 continue;
2504
2505 /*
2506 * Read the contents from the backup copy, stored in WAL record and
2507 * store it in a temporary page. There is no need to allocate a new
2508 * page here, a local buffer is fine to hold its contents and a mask
2509 * can be directly applied on it.
2510 */
 /*
  * The failure is raised as an ERROR carrying the text already stored in
  * record->errormsg_buf. NOTE(review): the guarding condition is not in
  * the lines visible here; presumably it is the image-restore call
  * failing. Confirm.
  */
2512 ereport(ERROR,
2514 errmsg_internal("%s", record->errormsg_buf)));
2515
2516 /*
2517 * If masking function is defined, mask both the primary and replay
2518 * images
2519 */
2520 if (rmgr.rm_mask != NULL)
2521 {
2522 rmgr.rm_mask(replay_image_masked, blkno);
2523 rmgr.rm_mask(primary_image_masked, blkno);
2524 }
2525
2526 /* Time to compare the primary and replay images. */
2528 {
 /* A mismatch is reported at FATAL, naming the relation locator, fork and block number. */
2529 elog(FATAL,
2530 "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2531 rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2532 forknum, blkno);
2533 }
2534 }
2535}
2536
2537/*
2538 * For point-in-time recovery, this function decides whether we want to
2539 * stop applying the XLOG before the current record.
2540 *
2541 * Returns true if we are stopping, false otherwise. If stopping, some
2542 * information is saved in recoveryStopXid et al for use in annotating the
2543 * new timeline's history file.
2544 */
2545static bool
2547{
 /* Set by the target checks below; also the function's final return value. */
2548 bool stopsHere = false;
2550 bool isCommit;
2553
2554 /*
2555 * Ignore recovery target settings when not in archive recovery (meaning
2556 * we are in crash recovery).
2557 */
2559 return false;
2560
2561 /* Check if we should stop as soon as reaching consistency */
2563 {
2564 ereport(LOG,
2565 (errmsg("recovery stopping after reaching consistency")));
2566
 /* Record a stop before this record; the time and name fields are cleared. */
2567 recoveryStopAfter = false;
2570 recoveryStopTime = 0;
2571 recoveryStopName[0] = '\0';
2572 return true;
2573 }
2574
2575 /* Check if target LSN has been reached */
2578 record->ReadRecPtr >= recoveryTargetLSN)
2579 {
 /* Stop before this record and remember its start position as the stop LSN. */
2580 recoveryStopAfter = false;
2582 recoveryStopLSN = record->ReadRecPtr;
2583 recoveryStopTime = 0;
2584 recoveryStopName[0] = '\0';
2585 ereport(LOG,
2586 errmsg("recovery stopping before WAL location (LSN) \"%X/%08X\"",
2588 return true;
2589 }
2590
2591 /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2592 if (XLogRecGetRmid(record) != RM_XACT_ID)
2593 return false;
2594
2596
2598 {
 /* Commit-type branch: the XID is obtained with XLogRecGetXid(). */
2599 isCommit = true;
2600 recordXid = XLogRecGetXid(record);
2601 }
2603 {
2606
 /* Commit-type branch where the XID comes from the parsed record (twophase_xid) instead. */
2607 isCommit = true;
2609 xlrec,
2610 &parsed);
2611 recordXid = parsed.twophase_xid;
2612 }
2613 else if (xact_info == XLOG_XACT_ABORT)
2614 {
2615 isCommit = false;
2616 recordXid = XLogRecGetXid(record);
2617 }
2619 {
2622
 /* Abort-type branch where the XID comes from the parsed record (twophase_xid). */
2623 isCommit = false;
2625 xlrec,
2626 &parsed);
2627 recordXid = parsed.twophase_xid;
2628 }
 /* Any other transaction record type is never a stop point here. */
2629 else
2630 return false;
2631
2633 {
2634 /*
2635 * There can be only one transaction end record with this exact
2636 * transactionid
2637 *
2638 * when testing for an xid, we MUST test for equality only, since
2639 * transactions are numbered in the order they start, not the order
2640 * they complete. A higher numbered xid will complete before you about
2641 * 50% of the time...
2642 */
2644 }
2645
2646 /*
2647 * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2648 * We don't expect getRecordTimestamp ever to fail, since we already know
2649 * this is a commit or abort record; but test its result anyway.
2650 */
2651 if (getRecordTimestamp(record, &recordXtime) &&
2653 {
2654 /*
2655 * There can be many transactions that share the same commit time, so
2656 * we stop after the last one, if we are inclusive, or stop at the
2657 * first one if we are exclusive
2658 */
2661 else
2663 }
2664
 /* A stop was decided above: record it as a stop before this record and log it. */
2665 if (stopsHere)
2666 {
2667 recoveryStopAfter = false;
2671 recoveryStopName[0] = '\0';
2672
 /* The log text differs only in naming the record a commit or an abort. */
2673 if (isCommit)
2674 {
2675 ereport(LOG,
2676 (errmsg("recovery stopping before commit of transaction %u, time %s",
2679 }
2680 else
2681 {
2682 ereport(LOG,
2683 (errmsg("recovery stopping before abort of transaction %u, time %s",
2686 }
2687 }
2688
2689 return stopsHere;
2690}
2691
2692/*
2693 * Same as recoveryStopsBefore, but called after applying the record.
2694 *
2695 * We also track the timestamp of the latest applied COMMIT/ABORT
2696 * record in XLogRecoveryCtl->recoveryLastXTime.
2697 */
2698static bool
2700{
2701 uint8 info;
2703 uint8 rmid;
2705
2706 /*
2707 * Ignore recovery target settings when not in archive recovery (meaning
2708 * we are in crash recovery).
2709 */
2711 return false;
2712
 /* Keep only the info bits outside XLR_INFO_MASK, and fetch the resource manager id. */
2713 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2714 rmid = XLogRecGetRmid(record);
2715
2716 /*
2717 * There can be many restore points that share the same name; we stop at
2718 * the first one.
2719 */
2721 rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2722 {
2724
2726
2728 {
 /* Matching restore point: record a stop after this record and log it. */
2729 recoveryStopAfter = true;
2734
2735 ereport(LOG,
2736 (errmsg("recovery stopping at restore point \"%s\", time %s",
2739 return true;
2740 }
2741 }
2742
2743 /* Check if the target LSN has been reached */
2746 record->ReadRecPtr >= recoveryTargetLSN)
2747 {
 /* Stop after this record; its start position is stored as the stop LSN. */
2748 recoveryStopAfter = true;
2750 recoveryStopLSN = record->ReadRecPtr;
2751 recoveryStopTime = 0;
2752 recoveryStopName[0] = '\0';
2753 ereport(LOG,
2754 errmsg("recovery stopping after WAL location (LSN) \"%X/%08X\"",
2756 return true;
2757 }
2758
 /* Everything below applies to transaction-manager records only. */
2759 if (rmid != RM_XACT_ID)
2760 return false;
2761
 /* Reduce the info bits to the transaction opcode. */
2762 xact_info = info & XLOG_XACT_OPMASK;
2763
2764 if (xact_info == XLOG_XACT_COMMIT ||
2768 {
2770
2771 /* Update the last applied transaction timestamp */
2772 if (getRecordTimestamp(record, &recordXtime))
2774
2775 /* Extract the XID of the committed/aborted transaction */
2777 {
2780
2782 xlrec,
2783 &parsed);
2784 recordXid = parsed.twophase_xid;
2785 }
2787 {
2790
2792 xlrec,
2793 &parsed);
2794 recordXid = parsed.twophase_xid;
2795 }
 /* Otherwise the XID is read with XLogRecGetXid(). */
2796 else
2797 recordXid = XLogRecGetXid(record);
2798
2799 /*
2800 * There can be only one transaction end record with this exact
2801 * transactionid
2802 *
2803 * when testing for an xid, we MUST test for equality only, since
2804 * transactions are numbered in the order they start, not the order
2805 * they complete. A higher numbered xid will complete before you about
2806 * 50% of the time...
2807 */
2810 {
 /* Record a stop after this record; the stop name is cleared. */
2811 recoveryStopAfter = true;
2815 recoveryStopName[0] = '\0';
2816
2817 if (xact_info == XLOG_XACT_COMMIT ||
2819 {
2820 ereport(LOG,
2821 (errmsg("recovery stopping after commit of transaction %u, time %s",
2824 }
2825 else if (xact_info == XLOG_XACT_ABORT ||
2827 {
2828 ereport(LOG,
2829 (errmsg("recovery stopping after abort of transaction %u, time %s",
2832 }
2833 return true;
2834 }
2835 }
2836
2837 /* Check if we should stop as soon as reaching consistency */
2839 {
2840 ereport(LOG,
2841 (errmsg("recovery stopping after reaching consistency")));
2842
2843 recoveryStopAfter = true;
2845 recoveryStopTime = 0;
2847 recoveryStopName[0] = '\0';
2848 return true;
2849 }
2850
 /* No stop condition matched this record. */
2851 return false;
2852}
2853
2854/*
2855 * Create a comment for the history file to explain why and where
2856 * timeline changed.
2857 */
2858static char *
2860{
 /* Fixed-size scratch buffer; every snprintf below is bounded by sizeof(reason). */
2861 char reason[200];
2862
 /*
  * Exactly one of the snprintf calls below fills the buffer. The
  * before/after wording follows recoveryStopAfter.
  * NOTE(review): two of the format strings end in a newline escape while
  * the others do not. Confirm whether the trailing newline is intended.
  */
2864 snprintf(reason, sizeof(reason),
2865 "%s transaction %u",
2866 recoveryStopAfter ? "after" : "before",
2869 snprintf(reason, sizeof(reason),
2870 "%s %s\n",
2871 recoveryStopAfter ? "after" : "before",
2874 snprintf(reason, sizeof(reason),
2875 "%s LSN %X/%08X\n",
2876 recoveryStopAfter ? "after" : "before",
2879 snprintf(reason, sizeof(reason),
2880 "at restore point \"%s\"",
2883 snprintf(reason, sizeof(reason), "reached consistency");
2884 else
2885 snprintf(reason, sizeof(reason), "no recovery target specified");
2886
 /* Hand back a pstrdup() copy, since reason is a local array. */
2887 return pstrdup(reason);
2888}
2889
2890/*
2891 * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2892 *
2893 * endOfRecovery is true if the recovery target is reached and
2894 * the paused state starts at the end of recovery because of
2895 * recovery_target_action=pause, and false otherwise.
2896 */
2897static void
2899{
2900 /* Don't pause unless users can connect! */
2902 return;
2903
2904 /* Don't pause after standby promotion has been triggered */
2906 return;
2907
 /*
  * Announce the pause. The message and hint differ depending on whether
  * this is the end-of-recovery pause (resuming promotes) or an ordinary
  * one (resuming continues replay).
  */
2908 if (endOfRecovery)
2909 ereport(LOG,
2910 (errmsg("pausing at the end of recovery"),
2911 errhint("Execute pg_wal_replay_resume() to promote.")));
2912 else
2913 ereport(LOG,
2914 (errmsg("recovery has paused"),
2915 errhint("Execute pg_wal_replay_resume() to continue.")));
2916
2917 /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2919 {
 /*
  * NOTE(review): the condition guarding this early return is not in the
  * lines visible here; presumably it re-checks the promotion trigger.
  * Confirm.
  */
2922 return;
2923
2924 /*
2925 * If recovery pause is requested then set it paused. While we are in
2926 * the loop, user might resume and pause again so set this every time.
2927 */
2929
2930 /*
2931 * We wait on a condition variable that will wake us as soon as the
2932 * pause ends, but we use a timeout so we can check the above exit
2933 * condition periodically too.
2934 */
2937 }
2939}
2940
2941/*
2942 * When recovery_min_apply_delay is set, we wait long enough to make sure
2943 * certain record types are applied at least that interval behind the primary.
2944 *
2945 * Returns true if we waited.
2946 *
2947 * Note that the delay is calculated between the WAL record log time and
2948 * the current time on standby. We would prefer to keep track of when this
2949 * standby received each WAL record, which would allow a more consistent
2950 * approach and one not affected by time synchronisation issues, but that
2951 * is significantly more effort and complexity for little actual gain in
2952 * usability.
2953 */
2954static bool
2956{
 /* Remaining wait in milliseconds; a value <= 0 means there is nothing left to wait for. */
2960 long msecs;
2961
2962 /* nothing to do if no delay configured */
2963 if (recovery_min_apply_delay <= 0)
2964 return false;
2965
2966 /* no delay is applied on a database not yet consistent */
2967 if (!reachedConsistency)
2968 return false;
2969
2970 /* nothing to do if crash recovery is requested */
2972 return false;
2973
2974 /*
2975 * Is it a COMMIT record?
2976 *
2977 * We deliberately choose not to delay aborts since they have no effect on
2978 * MVCC. We already allow replay of records that don't have a timestamp,
2979 * so there is already opportunity for issues caused by early conflicts on
2980 * standbys.
2981 */
2982 if (XLogRecGetRmid(record) != RM_XACT_ID)
2983 return false;
2984
2986
2987 if (xact_info != XLOG_XACT_COMMIT &&
2989 return false;
2990
 /* Without a timestamp in the record there is nothing to measure the delay against. */
2991 if (!getRecordTimestamp(record, &xtime))
2992 return false;
2993
2995
2996 /*
2997 * Exit without arming the latch if it's already past time to apply this
2998 * record
2999 */
3001 if (msecs <= 0)
3002 return false;
3003
 /* Loop until one of the break conditions below is met. */
3004 while (true)
3005 {
3007
3008 /* This might change recovery_min_apply_delay. */
3010
3012 break;
3013
3014 /*
3015 * Recalculate delayUntil as recovery_min_apply_delay could have
3016 * changed while waiting in this loop.
3017 */
3019
3020 /*
3021 * Wait for difference between GetCurrentTimestamp() and delayUntil.
3022 */
3024 delayUntil);
3025
3026 if (msecs <= 0)
3027 break;
3028
3029 elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3030
3033 msecs,
3035 }
 /*
  * Having entered the loop counts as having waited, even when the first
  * pass breaks out immediately.
  */
3036 return true;
3037}
3038
3039/*
3040 * Get the current state of the recovery pause request.
3041 */
3053
3054/*
3055 * Set the recovery pause state.
3056 *
3057 * If recovery pause is requested then sets the recovery pause state to
3058 * 'pause requested' if it is not already 'paused'. Otherwise, sets it
3059 * to 'not paused' to resume the recovery. The recovery pause will be
3060 * confirmed by ConfirmRecoveryPaused().
3061 */
3062void
3077
3078/*
3079 * Confirm the recovery pause by setting the recovery pause state to
3080 * RECOVERY_PAUSED.
3081 */
3082static void
3091
3092
3093/*
3094 * Attempt to read the next XLOG record.
3095 *
3096 * Before first call, the reader needs to be positioned to the first record
3097 * by calling XLogPrefetcherBeginRead().
3098 *
3099 * If no valid record is available, returns NULL, or fails if emode is PANIC.
3100 * (emode must be either PANIC or LOG). In standby mode, retries until a valid
3101 * record is available.
3102 */
3103static XLogRecord *
3105 bool fetching_ckpt, TimeLineID replayTLI)
3106{
3107 XLogRecord *record;
3110
3112
3113 /* Pass through parameters to XLogPageRead */
3114 private->fetching_ckpt = fetching_ckpt;
3115 private->emode = emode;
 /* Random access is flagged when the reader has no valid previous record position. */
3116 private->randAccess = !XLogRecPtrIsValid(xlogreader->ReadRecPtr);
3117 private->replayTLI = replayTLI;
3118
3119 /* This is the first attempt to read this page. */
3120 lastSourceFailed = false;
3121
 /* Retry loop: left only by returning a record or NULL. */
3122 for (;;)
3123 {
3124 char *errormsg;
3125
3126 record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3127 if (record == NULL)
3128 {
3129 /*
3130 * When we find that WAL ends in an incomplete record, keep track
3131 * of that record. After recovery is done, we'll write a record
3132 * to indicate to downstream WAL readers that that portion is to
3133 * be ignored.
3134 *
3135 * However, when ArchiveRecoveryRequested = true, we're going to
3136 * switch to a new timeline at the end of recovery. We will only
3137 * copy WAL over to the new timeline up to the end of the last
3138 * complete record, so if we did this, we would later create an
3139 * overwrite contrecord in the wrong place, breaking everything.
3140 */
3143 {
3146 }
3147
 /* Close the currently open segment file, if any, and mark it closed. */
3148 if (readFile >= 0)
3149 {
3150 close(readFile);
3151 readFile = -1;
3152 }
3153
3154 /*
3155 * We only end up here without a message when XLogPageRead()
3156 * failed - in that case we already logged something. In
3157 * StandbyMode that only happens if we have been triggered, so we
3158 * shouldn't loop anymore in that case.
3159 */
3160 if (errormsg)
3162 (errmsg_internal("%s", errormsg) /* already translated */ ));
3163 }
3164
3165 /*
3166 * Check page TLI is one of the expected values.
3167 */
3169 {
3170 char fname[MAXFNAMELEN];
3171 XLogSegNo segno;
3172 int32 offset;
3173
3177 XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3180 errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%08X, offset %u",
3182 fname,
3184 offset));
 /* Discard the record so that the failure path below is taken. */
3185 record = NULL;
3186 }
3187
3188 if (record)
3189 {
3190 /* Great, got a record */
3191 return record;
3192 }
3193 else
3194 {
3195 /* No valid record available from this source */
3196 lastSourceFailed = true;
3197
3198 /*
3199 * If archive recovery was requested, but we were still doing
3200 * crash recovery, switch to archive recovery and retry using the
3201 * offline archive. We have now replayed all the valid WAL in
3202 * pg_wal, so we are presumably now consistent.
3203 *
3204 * We require that there's at least some valid WAL present in
3205 * pg_wal, however (!fetching_ckpt). We could recover using the
3206 * WAL from the archive, even if pg_wal is completely empty, but
3207 * we'd have no idea how far we'd have to replay to reach
3208 * consistency. So err on the safe side and give up.
3209 */
3211 !fetching_ckpt)
3212 {
3214 (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3215 InArchiveRecovery = true;
3218
 /* The minimum recovery point's timeline is set to the timeline being replayed. */
3221 minRecoveryPointTLI = replayTLI;
3222
3224
3225 /*
3226 * Before we retry, reset lastSourceFailed and currentSource
3227 * so that we will check the archive next.
3228 */
3229 lastSourceFailed = false;
3231
3232 continue;
3233 }
3234
3235 /* In standby mode, loop back to retry. Otherwise, give up. */
3237 continue;
3238 else
3239 return NULL;
3240 }
3241 }
3242}
3243
3244/*
3245 * Read the XLOG page containing targetPagePtr into readBuf (if not read
3246 * already). Returns number of bytes read, if the page is read successfully,
3247 * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3248 * but only if they have not been previously reported.
3249 *
3250 * See XLogReaderRoutine.page_read for more details.
3251 *
3252 * While prefetching, xlogreader->nonblocking may be set. In that case,
3253 * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3254 *
3255 * This is responsible for restoring files from archive as needed, as well
3256 * as for waiting for the requested WAL record to arrive in standby mode.
3257 *
3258 * xlogreader->private_data->emode specifies the log level used for reporting
3259 * "file not found" or "end of WAL" situations in archive recovery, or in
3260 * standby mode when promotion is triggered. If set to WARNING or below,
3261 * XLogPageRead() returns XLREAD_FAIL in those situations; on higher log
3262 * levels the ereport() won't return.
3263 *
3264 * In standby mode, if after a successful return of XLogPageRead() the
3265 * caller finds the record it's interested in to be broken, it should
3266 * ereport the error with the level determined by
3267 * emode_for_corrupt_record(), and then set lastSourceFailed
3268 * and call XLogPageRead() again with the same arguments. This lets
3269 * XLogPageRead() try fetching the record from another source, or
3270 * sleep and retry.
3271 */
3272static int
3274 XLogRecPtr targetRecPtr, char *readBuf)
3275{
3276 XLogPageReadPrivate *private =
3278 int emode = private->emode;
 /* Byte count returned by pg_pread() below. */
3281 int r;
3283
3285
3288
3289 /*
3290 * See if we need to switch to a new segment because the requested record
3291 * is not in the currently open one.
3292 */
3293 if (readFile >= 0 &&
3295 {
3296 /*
3297 * Request a restartpoint if we've replayed too much xlog since the
3298 * last one.
3299 */
3301 {
3303 {
3304 (void) GetRedoRecPtr();
3307 }
3308 }
3309
 /* Close the current segment file; readFile = -1 marks it closed. */
3310 close(readFile);
3311 readFile = -1;
3313 }
3314
3316
3317retry:
3318 /* See if we need to retrieve more data */
3319 if (readFile < 0 ||
3322 {
3323 if (readFile >= 0 &&
3327 return XLREAD_WOULDBLOCK;
3328
3330 private->randAccess,
3331 private->fetching_ckpt,
3333 private->replayTLI,
3336 {
3337 case XLREAD_WOULDBLOCK:
3338 return XLREAD_WOULDBLOCK;
3339 case XLREAD_FAIL:
 /* Drop any open file and reset the read length before reporting failure. */
3340 if (readFile >= 0)
3341 close(readFile);
3342 readFile = -1;
3343 readLen = 0;
3345 return XLREAD_FAIL;
3346 case XLREAD_SUCCESS:
3347 break;
3348 }
3349 }
3350
3351 /*
3352 * At this point, we have the right segment open and if we're streaming we
3353 * know the requested record is in it.
3354 */
3355 Assert(readFile != -1);
3356
3357 /*
3358 * If the current segment is being streamed from the primary, calculate
3359 * how much of the current page we have received already. We know the
3360 * requested record has been received, but this is for the benefit of
3361 * future calls, to allow quick exit at the top of this function.
3362 */
3364 {
3367 else
3370 }
3371 else
3373
3374 /* Read the requested page */
3376
3377 /* Measure I/O timing when reading segment */
3379
 /* Read one full XLOG_BLCKSZ page from the open segment at offset readOff. */
3381 r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (pgoff_t) readOff);
 /* Anything other than a full page is a failure: r < 0 is an I/O error, otherwise a short read. */
3382 if (r != XLOG_BLCKSZ)
3383 {
3384 char fname[MAXFNAMELEN];
 /* Save errno right away so later calls cannot clobber it before it is reported. */
3385 int save_errno = errno;
3386
3388
3390 io_start, 1, r);
3391
3393 if (r < 0)
3394 {
 /* Restore errno for the %m in the message below. */
3395 errno = save_errno;
3398 errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: %m",
3400 readOff)));
3401 }
3402 else
3405 errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: read %d of %zu",
3407 readOff, r, (Size) XLOG_BLCKSZ)));
3409 }
3411
3413 io_start, 1, r);
3414
 /* Sanity check: the caller must not ask for more bytes than the page read provides. */
3417 Assert(reqLen <= readLen);
3418
3420
3421 /*
3422 * Check the page header immediately, so that we can retry immediately if
3423 * it's not valid. This may seem unnecessary, because ReadPageInternal()
3424 * validates the page header anyway, and would propagate the failure up to
3425 * ReadRecord(), which would retry. However, there's a corner case with
3426 * continuation records, if a record is split across two pages such that
3427 * we would need to read the two pages from different sources across two
3428 * WAL segments.
3429 *
3430 * The first page is only available locally, in pg_wal, because it's
3431 * already been recycled on the primary. The second page, however, is not
3432 * present in pg_wal, and we should stream it from the primary. There is a
3433 * recycled WAL segment present in pg_wal, with garbage contents, however.
3434 * We would read the first page from the local WAL segment, but when
3435 * reading the second page, we would read the bogus, recycled, WAL
3436 * segment. If we didn't catch that case here, we would never recover,
3437 * because ReadRecord() would retry reading the whole record from the
3438 * beginning.
3439 *
3440 * Of course, this only catches errors in the page header, which is what
3441 * happens in the case of a recycled WAL segment. Other kinds of errors or
3442 * corruption still have the same problem. But this at least fixes the
3443 * common case, which can happen as part of normal operation.
3444 *
3445 * Validating the page header is cheap enough that doing it twice
3446 * shouldn't be a big deal from a performance point of view.
3447 *
3448 * When not in standby mode, an invalid page header should cause recovery
3449 * to end, not retry reading the page, so we don't need to validate the
3450 * page header here for the retry. Instead, ReadPageInternal() is
3451 * responsible for the validation.
3452 */
3453 if (StandbyMode &&
3456 {
3457 /*
3458 * Emit this error right now then retry this page immediately. Use
3459 * errmsg_internal() because the message was already translated.
3460 */
3461 if (xlogreader->errormsg_buf[0])
3464
3465 /* reset any error XLogReaderValidatePageHeader() might have set */
3468 }
3469
 /* Success: report how many bytes of the page are available. */
3470 return readLen;
3471
 /*
  * NOTE(review): the statements below follow an unconditional return, so
  * they are presumably entered through a label that is not in the lines
  * visible here. Confirm.
  */
3473
3474 /*
3475 * If we're reading ahead, give up fast. Retries and error reporting will
3476 * be handled by a later read when recovery catches up to this point.
3477 */
3479 return XLREAD_WOULDBLOCK;
3480
 /* Mark the current source as failed. */
3481 lastSourceFailed = true;
3482
3483 if (readFile >= 0)
3484 close(readFile);
3485 readFile = -1;
3486 readLen = 0;
3488
3489 /* In standby-mode, keep trying */
3490 if (StandbyMode)
3491 goto retry;
3492 else
3493 return XLREAD_FAIL;
3494}
3495
3496/*
3497 * Open the WAL segment containing WAL location 'RecPtr'.
3498 *
3499 * The segment can be fetched via restore_command, or via walreceiver having
3500 * streamed the record, or it can already be present in pg_wal. Checking
3501 * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3502 * too, in case someone copies a new segment directly to pg_wal. That is not
3503 * documented or recommended, though.
3504 *
3505 * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3506 * prepare to read WAL starting from RedoStartLSN after this.
3507 *
3508 * 'RecPtr' might not point to the beginning of the record we're interested
3509 * in, it might also point to the page or segment header. In that case,
3510 * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3511 * used to decide which timeline to stream the requested WAL from.
3512 *
3513 * 'replayLSN' is the current replay LSN, so that if we scan for new
3514 * timelines, we can reject a switch to a timeline that branched off before
3515 * this point.
3516 *
3517 * If the record is not immediately available, the function returns XLREAD_FAIL
3518 * if we're not in standby mode. In standby mode, the function waits for it to
3519 * become available.
3520 *
3521 * When the requested record becomes available, the function opens the file
3522 * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3523 * of standby mode is triggered by the user, and there is no more WAL
3524 * available, returns XLREAD_FAIL.
3525 *
3526 * If nonblocking is true, then give up immediately if we can't satisfy the
3527 * request, returning XLREAD_WOULDBLOCK instead of waiting.
3528 */
3529static XLogPageReadResult
3531 bool fetching_ckpt, XLogRecPtr tliRecPtr,
3532 TimeLineID replayTLI, XLogRecPtr replayLSN,
3533 bool nonblocking)
3534{
3535 static TimestampTz last_fail_time = 0;
3537 bool streaming_reply_sent = false;
3538
3539 /*-------
3540 * Standby mode is implemented by a state machine:
3541 *
3542 * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3543 * pg_wal (XLOG_FROM_PG_WAL)
3544 * 2. Check for promotion trigger request
3545 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3546 * 4. Rescan timelines
3547 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3548 *
3549 * Failure to read from the current source advances the state machine to
3550 * the next state.
3551 *
3552 * 'currentSource' indicates the current state. There are no currentSource
3553 * values for "check trigger", "rescan timelines", and "sleep" states,
3554 * those actions are taken when reading from the previous source fails, as
3555 * part of advancing to the next state.
3556 *
3557 * If standby mode is turned off while reading WAL from stream, we move
3558 * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3559 * the files (which would be required at end of recovery, e.g., timeline
3560 * history file) from archive or pg_wal. We don't need to kill WAL receiver
3561 * here because it's already stopped when standby mode is turned off at
3562 * the end of recovery.
3563 *-------
3564 */
3565 if (!InArchiveRecovery)
3567 else if (currentSource == XLOG_FROM_ANY ||
3569 {
3570 lastSourceFailed = false;
3572 }
3573
3574 for (;;)
3575 {
3577 bool startWalReceiver = false;
3578
3579 /*
3580 * First check if we failed to read from the current source, and
3581 * advance the state machine if so. The failure to read might've
3582 * happened outside this function, e.g. when a CRC check fails on a
3583 * record, or within this loop.
3584 */
3585 if (lastSourceFailed)
3586 {
3587 /*
3588 * Don't allow any retry loops to occur during nonblocking
3589 * readahead. Let the caller process everything that has been
3590 * decoded already first.
3591 */
3592 if (nonblocking)
3593 return XLREAD_WOULDBLOCK;
3594
3595 switch (currentSource)
3596 {
3597 case XLOG_FROM_ARCHIVE:
3598 case XLOG_FROM_PG_WAL:
3599
3600 /*
3601 * Check to see if promotion is requested. Note that we do
3602 * this only after failure, so when you promote, we still
3603 * finish replaying as much as we can from archive and
3604 * pg_wal before failover.
3605 */
3607 {
3609 return XLREAD_FAIL;
3610 }
3611
3612 /*
3613 * Not in standby mode, and we've now tried the archive
3614 * and pg_wal.
3615 */
3616 if (!StandbyMode)
3617 return XLREAD_FAIL;
3618
3619 /*
3620 * Move to XLOG_FROM_STREAM state, and set to start a
3621 * walreceiver if necessary.
3622 */
3624 startWalReceiver = true;
3625 break;
3626
3627 case XLOG_FROM_STREAM:
3628
3629 /*
3630 * Failure while streaming. Most likely, we got here
3631 * because streaming replication was terminated, or
3632 * promotion was triggered. But we also get here if we
3633 * find an invalid record in the WAL streamed from the
3634 * primary, in which case something is seriously wrong.
3635 * There's little chance that the problem will just go
3636 * away, but PANIC is not good for availability either,
3637 * especially in hot standby mode. So, we treat that the
3638 * same as disconnection, and retry from archive/pg_wal
3639 * again. The WAL in the archive should be identical to
3640 * what was streamed, so it's unlikely that it helps, but
3641 * one can hope...
3642 */
3643
3644 /*
3645 * We should be able to move to XLOG_FROM_STREAM only in
3646 * standby mode.
3647 */
3649
3650 /*
3651 * Before we leave XLOG_FROM_STREAM state, make sure that
3652 * walreceiver is not active, so that it won't overwrite
3653 * WAL that we restore from archive.
3654 *
3655 * If walreceiver is actively streaming (or attempting to
3656 * connect), we must shut it down. However, if it's
3657 * already in WAITING state (e.g., due to timeline
3658 * divergence), we only need to reset the install flag to
3659 * allow archive restoration.
3660 */
3661 if (WalRcvStreaming())
3663 else
3664 {
3665 /*
3666 * WALRCV_STOPPING state is a transient state while
3667 * the startup process is in ShutdownWalRcv(). It
3668 * should never appear here since we would be waiting
3669 * for the walreceiver to reach WALRCV_STOPPED in that
3670 * case.
3671 */
3674 }
3675
3676 /*
3677 * Before we sleep, re-scan for possible new timelines if
3678 * we were requested to recover to the latest timeline.
3679 */
3681 {
3682 if (rescanLatestTimeLine(replayTLI, replayLSN))
3683 {
3685 break;
3686 }
3687 }
3688
3689 /*
3690 * XLOG_FROM_STREAM is the last state in our state
3691 * machine, so we've exhausted all the options for
3692 * obtaining the requested WAL. We're going to loop back
3693 * and retry from the archive, but if it hasn't been long
3694 * since last attempt, sleep wal_retrieve_retry_interval
3695 * milliseconds to avoid busy-waiting.
3696 */
3700 {
3701 long wait_time;
3702
3705
3706 elog(LOG, "waiting for WAL to become available at %X/%08X",
3708
3709 /* Do background tasks that might benefit us later. */
3711
3715 wait_time,
3719
3720 /* Handle interrupt signals of startup process */
3722 }
3725 break;
3726
3727 default:
3728 elog(ERROR, "unexpected WAL source %d", currentSource);
3729 }
3730 }
3731 else if (currentSource == XLOG_FROM_PG_WAL)
3732 {
3733 /*
3734 * We just successfully read a file in pg_wal. We prefer files in
3735 * the archive over ones in pg_wal, so try the next file again
3736 * from the archive first.
3737 */
3740 }
3741
3742 if (currentSource != oldSource)
3743 elog(DEBUG2, "switched WAL source from %s to %s after %s",
3745 lastSourceFailed ? "failure" : "success");
3746
3747 /*
3748 * We've now handled possible failure. Try to read from the chosen
3749 * source.
3750 */
3751 lastSourceFailed = false;
3752
3753 switch (currentSource)
3754 {
3755 case XLOG_FROM_ARCHIVE:
3756 case XLOG_FROM_PG_WAL:
3757
3758 /*
3759 * WAL receiver must not be running when reading WAL from
3760 * archive or pg_wal.
3761 */
3763
3764 /* Close any old file we might have open. */
3765 if (readFile >= 0)
3766 {
3767 close(readFile);
3768 readFile = -1;
3769 }
3770 /* Reset curFileTLI if random fetch. */
3771 if (randAccess)
3772 curFileTLI = 0;
3773
3774 /*
3775 * Try to restore the file from archive, or read an existing
3776 * file from pg_wal.
3777 */
3781 if (readFile >= 0)
3782 return XLREAD_SUCCESS; /* success! */
3783
3784 /*
3785 * Nope, not found in archive or pg_wal.
3786 */
3787 lastSourceFailed = true;
3788 break;
3789
3790 case XLOG_FROM_STREAM:
3791 {
3792 bool havedata;
3793
3794 /*
3795 * We should be able to move to XLOG_FROM_STREAM only in
3796 * standby mode.
3797 */
3799
3800 /*
3801 * First, shut down walreceiver if its restart has been
3802 * requested -- but no point if we're already slated for
3803 * starting it.
3804 */
3806 {
3808
3809 /*
3810 * Re-scan for possible new timelines if we were
3811 * requested to recover to the latest timeline.
3812 */
3815 rescanLatestTimeLine(replayTLI, replayLSN);
3816
3817 startWalReceiver = true;
3818 }
3819 pendingWalRcvRestart = false;
3820
3821 /*
3822 * Launch walreceiver if needed.
3823 *
3824 * If fetching_ckpt is true, RecPtr points to the initial
3825 * checkpoint location. In that case, we use RedoStartLSN
3826 * as the streaming start position instead of RecPtr, so
3827 * that when we later jump backwards to start redo at
3828 * RedoStartLSN, we will have the logs streamed already.
3829 */
3830 if (startWalReceiver &&
3832 {
3833 XLogRecPtr ptr;
3834 TimeLineID tli;
3835
3836 if (fetching_ckpt)
3837 {
3838 ptr = RedoStartLSN;
3839 tli = RedoStartTLI;
3840 }
3841 else
3842 {
3843 ptr = RecPtr;
3844
3845 /*
3846 * Use the record begin position to determine the
3847 * TLI, rather than the position we're reading.
3848 */
3850
3851 if (curFileTLI > 0 && tli < curFileTLI)
3852 elog(ERROR, "according to history file, WAL location %X/%08X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3854 tli, curFileTLI);
3855 }
3856 curFileTLI = tli;
3862 }
3863
3864 /*
3865 * Check if WAL receiver is active or wait to start up.
3866 */
3867 if (!WalRcvStreaming())
3868 {
3869 lastSourceFailed = true;
3870 break;
3871 }
3872
3873 /*
3874 * Walreceiver is active, so see if new data has arrived.
3875 *
3876 * We only advance XLogReceiptTime when we obtain fresh
3877 * WAL from walreceiver and observe that we had already
3878 * processed everything before the most recent "chunk"
3879 * that it flushed to disk. In steady state where we are
3880 * keeping up with the incoming data, XLogReceiptTime will
3881 * be updated on each cycle. When we are behind,
3882 * XLogReceiptTime will not advance, so the grace time
3883 * allotted to conflicting queries will decrease.
3884 */
3885 if (RecPtr < flushedUpto)
3886 havedata = true;
3887 else
3888 {
3889 XLogRecPtr latestChunkStart;
3890
3891 flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3893 {
3894 havedata = true;
3895 if (latestChunkStart <= RecPtr)
3896 {
3899 }
3900 }
3901 else
3902 havedata = false;
3903 }
3904 if (havedata)
3905 {
3906 /*
3907 * Great, streamed far enough. Open the file if it's
3908 * not open already. Also read the timeline history
3909 * file if we haven't initialized timeline history
3910 * yet; it should be streamed over and present in
3911 * pg_wal by now. Use XLOG_FROM_STREAM so that source
3912 * info is set correctly and XLogReceiptTime isn't
3913 * changed.
3914 *
3915 * NB: We must set readTimeLineHistory based on
3916 * recoveryTargetTLI, not receiveTLI. Normally they'll
3917 * be the same, but if recovery_target_timeline is
3918 * 'latest' and archiving is configured, then it's
3919 * possible that we managed to retrieve one or more
3920 * new timeline history files from the archive,
3921 * updating recoveryTargetTLI.
3922 */
3923 if (readFile < 0)
3924 {
3925 if (!expectedTLEs)
3928 XLOG_FROM_STREAM, false);
3929 Assert(readFile >= 0);
3930 }
3931 else
3932 {
3933 /* just make sure source info is correct... */
3936 return XLREAD_SUCCESS;
3937 }
3938 break;
3939 }
3940
3941 /* In nonblocking mode, return rather than sleeping. */
3942 if (nonblocking)
3943 return XLREAD_WOULDBLOCK;
3944
3945 /*
3946 * Data not here yet. Check for trigger, then wait for
3947 * walreceiver to wake us up when new WAL arrives.
3948 */
3950 {
3951 /*
3952 * Note that we don't return XLREAD_FAIL immediately
3953 * here. After being triggered, we still want to
3954 * replay all the WAL that was already streamed. It's
3955 * in pg_wal now, so we just treat this as a failure,
3956 * and the state machine will move on to replay the
3957 * streamed WAL from pg_wal, and then recheck the
3958 * trigger and exit replay.
3959 */
3960 lastSourceFailed = true;
3961 break;
3962 }
3963
3964 /*
3965 * Since we have replayed everything we have received so
3966 * far and are about to start waiting for more WAL, let's
3967 * tell the upstream server our replay location now so
3968 * that pg_stat_replication doesn't show stale
3969 * information.
3970 */
3972 {
3974 streaming_reply_sent = true;
3975 }
3976
3977 /* Do any background tasks that might benefit us later. */
3979
3980 /* Update pg_stat_recovery_prefetch before sleeping. */
3982
3983 /*
3984 * Wait for more WAL to arrive, when we will be woken
3985 * immediately by the WAL receiver.
3986 */
3989 -1L,
3992 break;
3993 }
3994
3995 default:
3996 elog(ERROR, "unexpected WAL source %d", currentSource);
3997 }
3998
3999 /*
4000 * Check for recovery pause here so that we can confirm more quickly
4001 * that a requested pause has actually taken effect.
4002 */
4003 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
4005 recoveryPausesHere(false);
4006
4007 /*
4008 * This possibly-long loop needs to handle interrupts of startup
4009 * process.
4010 */
4012 }
4013
4014 return XLREAD_FAIL; /* not reached */
4015}
4016
4017
4018/*
4019 * Determine what log level should be used to report a corrupt WAL record
4020 * in the current WAL page, previously read by XLogPageRead().
4021 *
4022 * 'emode' is the error mode that would be used to report a file-not-found
4023 * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4024 * we're retrying the exact same record that we've tried previously, only
4025 * complain the first time to keep the noise down. However, we only do so when
4026 * reading from pg_wal, because we don't expect any invalid records in archive
4027 * or in records streamed from the primary. Files in the archive should be complete,
4028 * and we should never hit the end of WAL because we stop and wait for more WAL
4029 * to arrive before replaying it.
4030 *
4031 * NOTE: This function remembers the RecPtr value it was last called with,
4032 * to suppress repeated messages about the same record. Only call this when
4033 * you are about to ereport(), or you might cause a later message to be
4034 * erroneously suppressed.
4035 */
4036static int
4038{
4040
4041 if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
4042 {
4043 if (RecPtr == lastComplaint)
4044 emode = DEBUG1;
4045 else
4047 }
4048 return emode;
4049}
4050
4051
4052/*
4053 * Subroutine to try to fetch and validate a prior checkpoint record.
4054 */
4055static XLogRecord *
4057 TimeLineID replayTLI)
4058{
4059 XLogRecord *record;
4060 uint8 info;
4061
4063
4064 if (!XRecOffIsValid(RecPtr))
4065 {
4066 ereport(LOG,
4067 (errmsg("invalid checkpoint location")));
4068 return NULL;
4069 }
4070
4072 record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4073
4074 if (record == NULL)
4075 {
4076 ereport(LOG,
4077 (errmsg("invalid checkpoint record")));
4078 return NULL;
4079 }
4080 if (record->xl_rmid != RM_XLOG_ID)
4081 {
4082 ereport(LOG,
4083 (errmsg("invalid resource manager ID in checkpoint record")));
4084 return NULL;
4085 }
4086 info = record->xl_info & ~XLR_INFO_MASK;
4087 if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4088 info != XLOG_CHECKPOINT_ONLINE)
4089 {
4090 ereport(LOG,
4091 (errmsg("invalid xl_info in checkpoint record")));
4092 return NULL;
4093 }
4095 {
4096 ereport(LOG,
4097 (errmsg("invalid length of checkpoint record")));
4098 return NULL;
4099 }
4100 return record;
4101}
4102
4103/*
4104 * Scan for new timelines that might have appeared in the archive since we
4105 * started recovery.
4106 *
4107 * If there are any, the function changes recovery target TLI to the latest
4108 * one and returns 'true'.
4109 */
4110static bool
4112{
4114 bool found;
4115 ListCell *cell;
4119
4122 {
4123 /* No new timelines found */
4124 return false;
4125 }
4126
4127 /*
4128 * Determine the list of expected TLIs for the new TLI
4129 */
4130
4132
4133 /*
4134 * If the current timeline is not part of the history of the new timeline,
4135 * we cannot proceed to it.
4136 */
4137 found = false;
4138 foreach(cell, newExpectedTLEs)
4139 {
4141
4142 if (currentTle->tli == recoveryTargetTLI)
4143 {
4144 found = true;
4145 break;
4146 }
4147 }
4148 if (!found)
4149 {
4150 ereport(LOG,
4151 (errmsg("new timeline %u is not a child of database system timeline %u",
4152 newtarget,
4153 replayTLI)));
4154 return false;
4155 }
4156
4157 /*
4158 * The current timeline was found in the history file, but check that the
4159 * next timeline was forked off from it *after* the current recovery
4160 * location.
4161 */
4162 if (currentTle->end < replayLSN)
4163 {
4164 ereport(LOG,
4165 errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%08X",
4166 newtarget,
4167 replayTLI,
4169 return false;
4170 }
4171
4172 /* The new timeline history seems valid. Switch target */
4176
4177 /*
4178 * As in StartupXLOG(), try to ensure we have all the history files
4179 * between the old target and new target in pg_wal.
4180 */
4182
4183 ereport(LOG,
4184 (errmsg("new target timeline is %u",
4186
4187 return true;
4188}
4189
4190
4191/*
4192 * Open a logfile segment for reading (during recovery).
4193 *
4194 * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4195 * Otherwise, it's assumed to be already available in pg_wal.
4196 */
4197static int
4200{
4201 char xlogfname[MAXFNAMELEN];
4202 char activitymsg[MAXFNAMELEN + 16];
4203 char path[MAXPGPATH];
4204 int fd;
4205
4207
4208 switch (source)
4209 {
4210 case XLOG_FROM_ARCHIVE:
4211 /* Report recovery progress in PS display */
4212 snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4213 xlogfname);
4215
4216 if (!RestoreArchivedFile(path, xlogfname,
4217 "RECOVERYXLOG",
4219 InRedo))
4220 return -1;
4221 break;
4222
4223 case XLOG_FROM_PG_WAL:
4224 case XLOG_FROM_STREAM:
4225 XLogFilePath(path, tli, segno, wal_segment_size);
4226 break;
4227
4228 default:
4229 elog(ERROR, "invalid XLogFileRead source %d", source);
4230 }
4231
4232 /*
4233 * If the segment was fetched from archival storage, replace the existing
4234 * xlog segment (if any) with the archival version.
4235 */
4237 {
4240
4241 /*
4242 * Set path to point at the new file in pg_wal.
4243 */
4244 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4245 }
4246
4248 if (fd >= 0)
4249 {
4250 /* Success! */
4251 curFileTLI = tli;
4252
4253 /* Report recovery progress in PS display */
4254 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4255 xlogfname);
4257
4258 /* Track source of data in assorted state variables */
4261 /* In FROM_STREAM case, caller tracks receipt time, not me */
4262 if (source != XLOG_FROM_STREAM)
4264
4265 return fd;
4266 }
4267 if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4268 ereport(PANIC,
4270 errmsg("could not open file \"%s\": %m", path)));
4271 return -1;
4272}
4273
4274/*
4275 * Open a logfile segment for reading (during recovery).
4276 *
4277 * This version searches for the segment with any TLI listed in expectedTLEs.
4278 */
4279static int
4281{
4282 char path[MAXPGPATH];
4283 ListCell *cell;
4284 int fd;
4285 List *tles;
4286
4287 /*
4288 * Loop looking for a suitable timeline ID: we might need to read any of
4289 * the timelines listed in expectedTLEs.
4290 *
4291 * We expect curFileTLI on entry to be the TLI of the preceding file in
4292 * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4293 * to go backwards; this prevents us from picking up the wrong file when a
4294 * parent timeline extends to higher segment numbers than the child we
4295 * want to read.
4296 *
4297 * If we haven't read the timeline history file yet, read it now, so that
4298 * we know which TLIs to scan. We don't save the list in expectedTLEs,
4299 * however, unless we actually find a valid segment. That way if there is
4300 * neither a timeline history file nor a WAL segment in the archive, and
4301 * streaming replication is set up, we'll read the timeline history file
4302 * streamed from the primary when we start streaming, instead of
4303 * recovering with a dummy history generated here.
4304 */
4305 if (expectedTLEs)
4307 else
4309
4310 foreach(cell, tles)
4311 {
4313 TimeLineID tli = hent->tli;
4314
4315 if (tli < curFileTLI)
4316 break; /* don't bother looking at too-old TLIs */
4317
4318 /*
4319 * Skip scanning the timeline ID that the logfile segment to read
4320 * doesn't belong to
4321 */
4322 if (XLogRecPtrIsValid(hent->begin))
4323 {
4324 XLogSegNo beginseg = 0;
4325
4327
4328 /*
4329 * The logfile segment that doesn't belong to the timeline is
4330 * older or newer than the segment that the timeline started or
4331 * ended at, respectively. It's sufficient to check only the
4332 * starting segment of the timeline here. Since the timelines are
4333 * scanned in descending order in this loop, any segments newer
4334 * than the ending segment should belong to a newer timeline and
4335 * have already been read before. So it's not necessary to check
4336 * the ending segment of the timeline here.
4337 */
4338 if (segno < beginseg)
4339 continue;
4340 }
4341
4343 {
4344 fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
4345 if (fd != -1)
4346 {
4347 elog(DEBUG1, "got WAL segment from archive");
4348 if (!expectedTLEs)
4350 return fd;
4351 }
4352 }
4353
4355 {
4356 fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
4357 if (fd != -1)
4358 {
4359 if (!expectedTLEs)
4361 return fd;
4362 }
4363 }
4364 }
4365
4366 /* Couldn't find it. For simplicity, complain about front timeline */
4368 errno = ENOENT;
4371 errmsg("could not open file \"%s\": %m", path)));
4372 return -1;
4373}
4374
4375/*
4376 * Set flag to signal the walreceiver to restart. (The startup process calls
4377 * this on noticing a relevant configuration change.)
4378 */
4379void
4381{
4383 {
4384 ereport(LOG,
4385 (errmsg("WAL receiver process shutdown requested")));
4386
4387 pendingWalRcvRestart = true;
4388 }
4389}
4390
4391
4392/*
4393 * Has a standby promotion already been triggered?
4394 *
4395 * Unlike CheckForStandbyTrigger(), this works in any process
4396 * that's connected to shared memory.
4397 */
4398bool
4400{
4401 /*
4402 * We check shared state each time only until a standby promotion is
4403 * triggered. We can't trigger a promotion again, so there's no need to
4404 * keep checking after the shared variable has once been seen true.
4405 */
4407 return true;
4408
4412
4414}
4415
4416static void
4418{
4422
4423 /*
4424 * Mark the recovery pause state as 'not paused' because the paused state
4425 * ends and promotion continues if a promotion is triggered while recovery
4426 * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4427 * return 'paused' while a promotion is ongoing.
4428 */
4429 SetRecoveryPause(false);
4430
4432}
4433
4434/*
4435 * Check whether a promote request has arrived.
4436 */
4437static bool
4439{
4441 return true;
4442
4444 {
4445 ereport(LOG, (errmsg("received promote request")));
4449 return true;
4450 }
4451
4452 return false;
4453}
4454
4455/*
4456 * Remove the files signaling a standby promotion request.
4457 */
4458void
4463
4464/*
4465 * Check to see if a promote request has arrived.
4466 */
4467bool
4469{
4470 struct stat stat_buf;
4471
4472 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4473 return true;
4474
4475 return false;
4476}
4477
4478/*
4479 * Wake up startup process to replay newly arrived WAL, or to notice that
4480 * failover has been requested.
4481 */
4482void
4487
4488/*
4489 * Schedule a walreceiver wakeup in the main recovery loop.
4490 */
4491void
4496
4497/*
4498 * Is HotStandby active yet? This is only important in special backends
4499 * since normal backends won't ever be able to connect until this returns
4500 * true. Postmaster knows this by way of signal, not via shared memory.
4501 *
4502 * Unlike testing standbyState, this works in any process that's connected to
4503 * shared memory. (And note that standbyState alone doesn't tell the truth
4504 * anyway.)
4505 */
4506bool
4508{
4509 /*
4510 * We check shared state each time only until Hot Standby is active. We
4511 * can't de-activate Hot Standby, so there's no need to keep checking
4512 * after the shared variable has once been seen true.
4513 */
4515 return true;
4516 else
4517 {
4518 /* spinlock is essential on machines with weak memory ordering! */
4522
4523 return LocalHotStandbyActive;
4524 }
4525}
4526
4527/*
4528 * Like HotStandbyActive(), but to be used only in WAL replay code,
4529 * where we don't need to ask any other process what the state is.
4530 */
4531static bool
4537
4538/*
4539 * Get latest redo apply position.
4540 *
4541 * Exported to allow WALReceiver to read the pointer directly.
4542 */
4545{
4547 TimeLineID tli;
4548
4553
4554 if (replayTLI)
4555 *replayTLI = tli;
4556 return recptr;
4557}
4558
4559
4560/*
4561 * Get position of last applied, or the record being applied.
4562 *
4563 * This is different from GetXLogReplayRecPtr() in that if a WAL
4564 * record is currently being applied, this includes that record.
4565 */
4568{
4570 TimeLineID tli;
4571
4576
4577 if (replayEndTLI)
4578 *replayEndTLI = tli;
4579 return recptr;
4580}
4581
4582/*
4583 * Save timestamp of latest processed commit/abort record.
4584 *
4585 * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4586 * seen by processes other than the startup process. Note in particular
4587 * that CreateRestartPoint is executed in the checkpointer.
4588 */
4589static void
4596
4597/*
4598 * Fetch timestamp of latest processed commit/abort record.
4599 */
4611
4612/*
4613 * Save timestamp of the next chunk of WAL records to apply.
4614 *
4615 * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4616 * seen by all backends.
4617 */
4618static void
4625
4626/*
4627 * Fetch timestamp of the next chunk of WAL records to apply.
4628 * Startup process maintains an accurate local copy in XLogReceiptTime
4629 */
4641
4642/*
4643 * Returns time of receipt of current chunk of XLOG data, as well as
4644 * whether it was received from streaming replication or from archives.
4645 */
4646void
4648{
4649 /*
4650 * This must be executed in the startup process, since we don't export the
4651 * relevant state to shared memory.
4652 */
4654
4657}
4658
4659/*
4660 * Note that the text field supplied is a parameter name and does not require
4661 * translation.
4662 */
4663void
4665{
4666 if (currValue < minValue)
4667 {
4669 {
4670 bool warned_for_promote = false;
4671
4674 errmsg("hot standby is not possible because of insufficient parameter settings"),
4675 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4676 param_name,
4677 currValue,
4678 minValue)));
4679
4680 SetRecoveryPause(true);
4681
4682 ereport(LOG,
4683 (errmsg("recovery has paused"),
4684 errdetail("If recovery is unpaused, the server will shut down."),
4685 errhint("You can then restart the server after making the necessary configuration changes.")));
4686
4688 {
4690
4692 {
4693 if (!warned_for_promote)
4696 errmsg("promotion is not possible because of insufficient parameter settings"),
4697
4698 /*
4699 * Repeat the detail from above so it's easy to find
4700 * in the log.
4701 */
4702 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4703 param_name,
4704 currValue,
4705 minValue),
4706 errhint("Restart the server after making the necessary configuration changes.")));
4707 warned_for_promote = true;
4708 }
4709
4710 /*
4711 * If recovery pause is requested then set it paused. While
4712 * we are in the loop, user might resume and pause again so
4713 * set this every time.
4714 */
4716
4717 /*
4718 * We wait on a condition variable that will wake us as soon
4719 * as the pause ends, but we use a timeout so we can check the
4720 * above conditions periodically too.
4721 */
4724 }
4726 }
4727
4728 ereport(FATAL,
4730 errmsg("recovery aborted because of insufficient parameter settings"),
4731 /* Repeat the detail from above so it's easy to find in the log. */
4732 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4733 param_name,
4734 currValue,
4735 minValue),
4736 errhint("You can restart the server after making the necessary configuration changes.")));
4737 }
4738}
4739
4740
4741/*
4742 * GUC check_hook for primary_slot_name
4743 */
4744bool
4746{
4747 int err_code;
4748 char *err_msg = NULL;
4749 char *err_hint = NULL;
4750
4751 if (*newval && strcmp(*newval, "") != 0 &&
4753 &err_msg, &err_hint))
4754 {
4756 GUC_check_errdetail("%s", err_msg);
4757 if (err_hint != NULL)
4759 return false;
4760 }
4761
4762 return true;
4763}
4764
4765/*
4766 * Recovery target settings: Only one of the several recovery_target* settings
4767 * may be set. Setting a second one results in an error. The global variable
4768 * recoveryTarget tracks which kind of recovery target was chosen. Other
4769 * variables store the actual target value (for example a string or a xid).
4770 * The assign functions of the parameters check whether a competing parameter
4771 * was already set. But we want to allow setting the same parameter multiple
4772 * times. We also want to allow unsetting a parameter and setting a different
4773 * one, so we unset recoveryTarget when the parameter is set to an empty
4774 * string.
4775 *
4776 * XXX this code is broken by design. Throwing an error from a GUC assign
4777 * hook breaks fundamental assumptions of guc.c. So long as all the variables
4778 * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4779 * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4780 * that we have odd behaviors such as unexpected GUC ordering dependencies.
4781 */
4782
4783pg_noreturn static void
4785{
4786 ereport(ERROR,
4788 errmsg("multiple recovery targets specified"),
4789 errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4790}
4791
4792/*
4793 * GUC check_hook for recovery_target
4794 */
4795bool
4797{
4798 if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4799 {
4800 GUC_check_errdetail("The only allowed value is \"immediate\".");
4801 return false;
4802 }
4803 return true;
4804}
4805
4806/*
4807 * GUC assign_hook for recovery_target
4808 */
4809void
4821
4822/*
4823 * GUC check_hook for recovery_target_lsn
4824 */
4825bool
4827{
4828 if (strcmp(*newval, "") != 0)
4829 {
4830 XLogRecPtr lsn;
4833
4834 lsn = pg_lsn_in_safe(*newval, (Node *) &escontext);
4835 if (escontext.error_occurred)
4836 return false;
4837
4838 myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr));
4839 if (!myextra)
4840 return false;
4841 *myextra = lsn;
4842 *extra = myextra;
4843 }
4844 return true;
4845}
4846
4847/*
4848 * GUC assign_hook for recovery_target_lsn
4849 */
4850void
4851assign_recovery_target_lsn(const char *newval, void *extra)
4852{
4856
4857 if (newval && strcmp(newval, "") != 0)
4858 {
4860 recoveryTargetLSN = *((XLogRecPtr *) extra);
4861 }
4862 else
4864}
4865
4866/*
4867 * GUC check_hook for recovery_target_name
4868 */
4869bool
4871{
4872 /* Use the value of newval directly */
4873 if (strlen(*newval) >= MAXFNAMELEN)
4874 {
4875 GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4876 "recovery_target_name", MAXFNAMELEN - 1);
4877 return false;
4878 }
4879 return true;
4880}
4881
4882/*
4883 * GUC assign_hook for recovery_target_name
4884 */
4885void
4900
4901/*
4902 * GUC check_hook for recovery_target_time
4903 *
4904 * The interpretation of the recovery_target_time string can depend on the
4905 * time zone setting, so we need to wait until after all GUC processing is
4906 * done before we can do the final parsing of the string. This check function
4907 * only does a parsing pass to catch syntax errors, but we store the string
4908 * and parse it again when we need to use it.
4909 */
4910bool
4912{
4913 if (strcmp(*newval, "") != 0)
4914 {
4915 /* reject some special values */
4916 if (strcmp(*newval, "now") == 0 ||
4917 strcmp(*newval, "today") == 0 ||
4918 strcmp(*newval, "tomorrow") == 0 ||
4919 strcmp(*newval, "yesterday") == 0)
4920 {
4921 return false;
4922 }
4923
4924 /*
4925 * parse timestamp value (see also timestamptz_in())
4926 */
4927 {
4928 char *str = *newval;
4929 fsec_t fsec;
4930 struct pg_tm tt,
4931 *tm = &tt;
4932 int tz;
4933 int dtype;
4934 int nf;
4935 int dterr;
4936 char *field[MAXDATEFIELDS];
4937 int ftype[MAXDATEFIELDS];
4941
4943 field, ftype, MAXDATEFIELDS, &nf);
4944 if (dterr == 0)
4945 dterr = DecodeDateTime(field, ftype, nf,
4946 &dtype, tm, &fsec, &tz, &dtextra);
4947 if (dterr != 0)
4948 return false;
4949 if (dtype != DTK_DATE)
4950 return false;
4951
4952 if (tm2timestamp(tm, fsec, &tz, &timestamp) != 0)
4953 {
4954 GUC_check_errdetail("Timestamp out of range: \"%s\".", str);
4955 return false;
4956 }
4957 }
4958 }
4959 return true;
4960}
4961
4962/*
4963 * GUC assign_hook for recovery_target_time
4964 */
4965void
4977
4978/*
4979 * GUC check_hook for recovery_target_timeline
4980 */
4981bool
4983{
4986
4987 if (strcmp(*newval, "current") == 0)
4989 else if (strcmp(*newval, "latest") == 0)
4991 else
4992 {
4993 char *endp;
4994 uint64 timeline;
4995
4997
4998 errno = 0;
4999 timeline = strtou64(*newval, &endp, 0);
5000
5001 if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
5002 {
5003 GUC_check_errdetail("\"%s\" is not a valid number.",
5004 "recovery_target_timeline");
5005 return false;
5006 }
5007
5009 {
5010 GUC_check_errdetail("\"%s\" must be between %u and %u.",
5011 "recovery_target_timeline", 1, PG_UINT32_MAX);
5012 return false;
5013 }
5014 }
5015
5017 if (!myextra)
5018 return false;
5019 *myextra = rttg;
5020 *extra = myextra;
5021
5022 return true;
5023}
5024
5025/*
5026 * GUC assign_hook for recovery_target_timeline
5027 */
5028void
5037
5038/*
5039 * GUC check_hook for recovery_target_xid
5040 */
5041bool
5043{
5044 if (strcmp(*newval, "") != 0)
5045 {
5046 TransactionId xid;
5048 char *endp;
5049 char *val;
5050
5051 errno = 0;
5052
5053 /*
5054 * Consume leading whitespace to determine if number is negative
5055 */
5056 val = *newval;
5057
5058 while (isspace((unsigned char) *val))
5059 val++;
5060
5061 /*
5062 * This cast will remove the epoch, if any
5063 */
5064 xid = (TransactionId) strtou64(val, &endp, 0);
5065
5066 if (*endp != '\0' || errno == EINVAL || errno == ERANGE || *val == '-')
5067 {
5068 GUC_check_errdetail("\"%s\" is not a valid number.",
5069 "recovery_target_xid");
5070 return false;
5071 }
5072
5073 if (xid < FirstNormalTransactionId)
5074 {
5075 GUC_check_errdetail("\"%s\" without epoch must be greater than or equal to %u.",
5076 "recovery_target_xid",
5078 return false;
5079 }
5080
5082 if (!myextra)
5083 return false;
5084 *myextra = xid;
5085 *extra = myextra;
5086 }
5087 return true;
5088}
5089
5090/*
5091 * GUC assign_hook for recovery_target_xid
5092 */
5093void
5094assign_recovery_target_xid(const char *newval, void *extra)
5095{
5099
5100 if (newval && strcmp(newval, "") != 0)
5101 {
5103 recoveryTargetXid = *((TransactionId *) extra);
5104 }
5105 else
5107}
static uint64 pg_atomic_read_u64(volatile pg_atomic_uint64 *ptr)
Definition atomics.h:467
List * readTimeLineHistory(TimeLineID targetTLI)
Definition timeline.c:77
TimeLineID findNewestTimeLine(TimeLineID startTLI)
Definition timeline.c:265
TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history)
Definition timeline.c:545
XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI)
Definition timeline.c:573
bool existsTimeLineHistory(TimeLineID probeTLI)
Definition timeline.c:223
void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end)
Definition timeline.c:51
bool tliInHistory(TimeLineID tli, List *expectedTLEs)
Definition timeline.c:527
void remove_tablespace_symlink(const char *linkloc)
Definition tablespace.c:890
bool allow_in_place_tablespaces
Definition tablespace.c:86
void disable_startup_progress_timeout(void)
Definition startup.c:308
bool IsPromoteSignaled(void)
Definition startup.c:287
void begin_startup_progress_phase(void)
Definition startup.c:342
void ProcessStartupProcInterrupts(void)
Definition startup.c:154
void ResetPromoteSignaled(void)
Definition startup.c:293
int ParseDateTime(const char *timestr, char *workbuf, size_t buflen, char **field, int *ftype, int maxfields, int *numfields)
Definition datetime.c:774
int DecodeDateTime(char **field, int *ftype, int nf, int *dtype, struct pg_tm *tm, fsec_t *fsec, int *tzp, DateTimeErrorExtra *extra)
Definition datetime.c:998
long TimestampDifferenceMilliseconds(TimestampTz start_time, TimestampTz stop_time)
Definition timestamp.c:1748
int tm2timestamp(struct pg_tm *tm, fsec_t fsec, int *tzp, Timestamp *result)
Definition timestamp.c:1997
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition timestamp.c:1772
Datum timestamptz_in(PG_FUNCTION_ARGS)
Definition timestamp.c:409
TimestampTz GetCurrentTimestamp(void)
Definition timestamp.c:1636
const char * timestamptz_to_str(TimestampTz t)
Definition timestamp.c:1853
Datum now(PG_FUNCTION_ARGS)
Definition timestamp.c:1600
uint32 BlockNumber
Definition block.h:31
int Buffer
Definition buf.h:23
#define InvalidBuffer
Definition buf.h:25
void UnlockReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5522
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:470
@ BUFFER_LOCK_EXCLUSIVE
Definition bufmgr.h:220
static void LockBuffer(Buffer buffer, BufferLockMode mode)
Definition bufmgr.h:332
@ RBM_NORMAL_NO_LOG
Definition bufmgr.h:52
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:421
PageData * Page
Definition bufpage.h:81
static XLogRecPtr PageGetLSN(const PageData *page)
Definition bufpage.h:411
uint8_t uint8
Definition c.h:616
#define PG_UINT32_MAX
Definition c.h:676
#define pg_noreturn
Definition c.h:184
#define PG_USED_FOR_ASSERTS_ONLY
Definition c.h:243
#define Assert(condition)
Definition c.h:945
#define PG_BINARY
Definition c.h:1376
#define UINT64_FORMAT
Definition c.h:637
int32_t int32
Definition c.h:614
uint64_t uint64
Definition c.h:619
uint32_t uint32
Definition c.h:618
#define pg_fallthrough
Definition c.h:152
uint32 TransactionId
Definition c.h:738
size_t Size
Definition c.h:691
void RequestCheckpoint(int flags)
bool ConditionVariableCancelSleep(void)
bool ConditionVariableTimedSleep(ConditionVariable *cv, long timeout, uint32 wait_event_info)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
int64 TimestampTz
Definition timestamp.h:39
int32 fsec_t
Definition timestamp.h:41
Datum arg
Definition elog.c:1322
int errcode_for_file_access(void)
Definition elog.c:897
ErrorContextCallback * error_context_stack
Definition elog.c:99
int errcode(int sqlerrcode)
Definition elog.c:874
#define LOG
Definition elog.h:31
#define errcontext
Definition elog.h:198
int errhint(const char *fmt,...) pg_attribute_printf(1
int errdetail(const char *fmt,...) pg_attribute_printf(1
#define FATAL
Definition elog.h:41
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
#define WARNING
Definition elog.h:36
#define DEBUG2
Definition elog.h:29
#define PANIC
Definition elog.h:42
#define DEBUG1
Definition elog.h:30
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition fd.c:1112
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition fd.c:783
int BasicOpenFile(const char *fileName, int fileFlags)
Definition fd.c:1090
int FreeFile(FILE *file)
Definition fd.c:2827
DIR * AllocateDir(const char *dirname)
Definition fd.c:2891
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition fd.c:2957
int pg_fsync(int fd)
Definition fd.c:390
FILE * AllocateFile(const char *name, const char *mode)
Definition fd.c:2628
#define palloc_object(type)
Definition fe_memutils.h:74
#define palloc0_object(type)
Definition fe_memutils.h:75
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition file_utils.c:547
@ PGFILETYPE_LNK
Definition file_utils.h:24
#define DirectFunctionCall3(func, arg1, arg2, arg3)
Definition fmgr.h:688
bool IsUnderPostmaster
Definition globals.c:120
char * DataDir
Definition globals.c:71
bool IsPostmasterEnvironment
Definition globals.c:119
void GUC_check_errcode(int sqlerrcode)
Definition guc.c:6660
void * guc_malloc(int elevel, size_t size)
Definition guc.c:637
#define newval
#define GUC_check_errdetail
Definition guc.h:507
GucSource
Definition guc.h:112
#define GUC_check_errhint
Definition guc.h:511
const char * str
#define MAXDATEFIELDS
Definition datetime.h:202
#define DTK_DATE
Definition datetime.h:144
#define MAXDATELEN
Definition datetime.h:200
long val
Definition informix.c:689
#define close(a)
Definition win32.h:12
void proc_exit(int code)
Definition ipc.c:105
int i
Definition isn.c:77
void OwnLatch(Latch *latch)
Definition latch.c:126
void DisownLatch(Latch *latch)
Definition latch.c:144
void InitSharedLatch(Latch *latch)
Definition latch.c:93
void SetLatch(Latch *latch)
Definition latch.c:290
void ResetLatch(Latch *latch)
Definition latch.c:374
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition latch.c:172
List * lappend(List *list, void *datum)
Definition list.c:339
void list_free_deep(List *list)
Definition list.c:1560
static struct pg_tm tm
Definition localtime.c:104
char * pstrdup(const char *in)
Definition mcxt.c:1781
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
#define AmStartupProcess()
Definition miscadmin.h:390
#define IsBootstrapProcessingMode()
Definition miscadmin.h:477
static char * errmsg
#define ERRCODE_DATA_CORRUPTED
#define MAXPGPATH
#define XLOG_RESTORE_POINT
Definition pg_control.h:76
#define XLOG_CHECKPOINT_REDO
Definition pg_control.h:83
#define XLOG_OVERWRITE_CONTRECORD
Definition pg_control.h:82
DBState
Definition pg_control.h:92
@ DB_IN_ARCHIVE_RECOVERY
Definition pg_control.h:98
@ DB_SHUTDOWNED_IN_RECOVERY
Definition pg_control.h:95
@ DB_SHUTDOWNED
Definition pg_control.h:94
@ DB_IN_CRASH_RECOVERY
Definition pg_control.h:97
#define XLOG_CHECKPOINT_SHUTDOWN
Definition pg_control.h:69
#define XLOG_BACKUP_END
Definition pg_control.h:74
#define XLOG_CHECKPOINT_ONLINE
Definition pg_control.h:70
#define XLOG_END_OF_RECOVERY
Definition pg_control.h:78
const void size_t len
#define lfirst(lc)
Definition pg_list.h:172
#define NIL
Definition pg_list.h:68
XLogRecPtr pg_lsn_in_safe(const char *str, Node *escontext)
Definition pg_lsn.c:32
static rewind_source * source
Definition pg_rewind.c:89
const char * pg_rusage_show(const PGRUsage *ru0)
Definition pg_rusage.c:40
void pg_rusage_init(PGRUsage *ru0)
Definition pg_rusage.c:27
static char buf[DEFAULT_XLOG_SEG_SIZE]
@ IOOBJECT_WAL
Definition pgstat.h:282
@ IOCONTEXT_NORMAL
Definition pgstat.h:292
@ IOOP_READ
Definition pgstat.h:318
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition pgstat_io.c:91
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:122
int64 timestamp
void SendPostmasterSignal(PMSignalReason reason)
Definition pmsignal.c:165
@ PMSIGNAL_RECOVERY_STARTED
Definition pmsignal.h:35
@ PMSIGNAL_BEGIN_HOT_STANDBY
Definition pmsignal.h:37
@ PMSIGNAL_RECOVERY_CONSISTENT
Definition pmsignal.h:36
#define pg_pread
Definition port.h:247
#define snprintf
Definition port.h:260
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition strlcpy.c:45
off_t pgoff_t
Definition port.h:421
static Datum ObjectIdGetDatum(Oid X)
Definition postgres.h:252
static Datum CStringGetDatum(const char *X)
Definition postgres.h:370
static Datum Int32GetDatum(int32 X)
Definition postgres.h:212
#define InvalidOid
static int fd(const char *x, int i)
static int fb(int x)
void RecordKnownAssignedTransactionIds(TransactionId xid)
Definition procarray.c:4410
void KnownAssignedTransactionIdsIdleMaintenance(void)
Definition procarray.c:4571
static void set_ps_display(const char *activity)
Definition ps_status.h:40
char * psprintf(const char *fmt,...)
Definition psprintf.c:43
ForkNumber
Definition relpath.h:56
@ MAIN_FORKNUM
Definition relpath.h:58
#define PG_TBLSPC_DIR
Definition relpath.h:41
void RmgrStartup(void)
Definition rmgr.c:58
void RmgrCleanup(void)
Definition rmgr.c:74
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition shmem.c:381
bool ReplicationSlotValidateNameInternal(const char *name, bool allow_reserved_name, int *err_code, char **err_msg, char **err_hint)
Definition slot.c:313
void ShutDownSlotSync(void)
Definition slotsync.c:1727
static void SpinLockRelease(volatile slock_t *lock)
Definition spin.h:62
static void SpinLockAcquire(volatile slock_t *lock)
Definition spin.h:56
static void SpinLockInit(volatile slock_t *lock)
Definition spin.h:50
#define ereport_startup_progress(msg,...)
Definition startup.h:18
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition stringinfo.c:145
void appendStringInfoString(StringInfo str, const char *s)
Definition stringinfo.c:230
void appendStringInfoChar(StringInfo str, char ch)
Definition stringinfo.c:242
void initStringInfo(StringInfo str)
Definition stringinfo.c:97
Oid oldestMultiDB
Definition pg_control.h:52
MultiXactId oldestMulti
Definition pg_control.h:51
MultiXactOffset nextMultiOffset
Definition pg_control.h:48
TransactionId newestCommitTsXid
Definition pg_control.h:56
TransactionId oldestXid
Definition pg_control.h:49
TimeLineID PrevTimeLineID
Definition pg_control.h:40
TimeLineID ThisTimeLineID
Definition pg_control.h:39
MultiXactId nextMulti
Definition pg_control.h:47
FullTransactionId nextXid
Definition pg_control.h:45
TransactionId oldestCommitTsXid
Definition pg_control.h:54
XLogRecPtr redo
Definition pg_control.h:37
Oid oldestXidDB
Definition pg_control.h:50
XLogRecPtr backupStartPoint
Definition pg_control.h:172
CheckPoint checkPointCopy
Definition pg_control.h:137
XLogRecPtr backupEndPoint
Definition pg_control.h:173
XLogRecPtr minRecoveryPoint
Definition pg_control.h:170
XLogRecPtr checkPoint
Definition pg_control.h:135
uint64 system_identifier
Definition pg_control.h:112
TimeLineID minRecoveryPointTLI
Definition pg_control.h:171
Definition dirent.c:26
XLogRecPtr lastPageBeginPtr
XLogRecPtr missingContrecPtr
struct ErrorContextCallback * previous
Definition elog.h:297
void(* callback)(void *arg)
Definition elog.h:298
Definition pg_list.h:54
Definition nodes.h:135
RelFileNumber relNumber
void(* rm_redo)(XLogReaderState *record)
TimeLineID ws_tli
Definition xlogreader.h:49
pg_atomic_uint64 minWaitedLSN[WAIT_LSN_TYPE_COUNT]
Definition xlogwait.h:85
XLogRecPtr missingContrecPtr
Definition xlogreader.h:214
char * errormsg_buf
Definition xlogreader.h:310
XLogRecPtr EndRecPtr
Definition xlogreader.h:206
uint64 system_identifier
Definition xlogreader.h:190
XLogRecPtr ReadRecPtr
Definition xlogreader.h:205
XLogRecPtr abortedRecPtr
Definition xlogreader.h:213
TimeLineID latestPageTLI
Definition xlogreader.h:279
XLogRecPtr overwrittenRecPtr
Definition xlogreader.h:216
XLogRecPtr latestPagePtr
Definition xlogreader.h:278
WALOpenSegment seg
Definition xlogreader.h:271
void * private_data
Definition xlogreader.h:195
uint8 xl_info
Definition xlogrecord.h:46
uint32 xl_tot_len
Definition xlogrecord.h:43
TransactionId xl_xid
Definition xlogrecord.h:44
RmgrId xl_rmid
Definition xlogrecord.h:47
ConditionVariable recoveryNotPausedCV
XLogRecPtr lastReplayedEndRecPtr
TimeLineID replayEndTLI
TimeLineID lastReplayedTLI
TimestampTz currentChunkStartTime
XLogRecPtr replayEndRecPtr
TimestampTz recoveryLastXTime
RecoveryPauseState recoveryPauseState
XLogRecPtr lastReplayedReadRecPtr
Definition guc.h:174
Definition pgtime.h:35
#define InvalidTransactionId
Definition transam.h:31
#define U64FromFullTransactionId(x)
Definition transam.h:49
#define XidFromFullTransactionId(x)
Definition transam.h:48
#define FirstNormalTransactionId
Definition transam.h:34
#define TransactionIdIsValid(xid)
Definition transam.h:41
#define TransactionIdIsNormal(xid)
Definition transam.h:42
#define TimestampTzPlusMilliseconds(tz, ms)
Definition timestamp.h:85
static TimestampTz DatumGetTimestampTz(Datum X)
Definition timestamp.h:34
void AdvanceNextFullTransactionIdPastXid(TransactionId xid)
Definition varsup.c:308
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:69
static void pgstat_report_wait_end(void)
Definition wait_event.h:85
#define WL_TIMEOUT
#define WL_EXIT_ON_PM_DEATH
#define WL_LATCH_SET
void WalRcvForceReply(void)
#define AllowCascadeReplication()
Definition walreceiver.h:40
@ WALRCV_STOPPING
Definition walreceiver.h:54
XLogRecPtr GetWalRcvFlushRecPtr(XLogRecPtr *latestChunkStart, TimeLineID *receiveTLI)
bool WalRcvStreaming(void)
void RequestXLogStreaming(TimeLineID tli, XLogRecPtr recptr, const char *conninfo, const char *slotname, bool create_temp_slot)
WalRcvState WalRcvGetState(void)
bool WalRcvRunning(void)
void WalSndWakeup(bool physical, bool logical)
Definition walsender.c:3822
#define stat
Definition win32_port.h:74
#define S_IRUSR
Definition win32_port.h:279
#define symlink(oldpath, newpath)
Definition win32_port.h:225
#define S_IWUSR
Definition win32_port.h:282
#define XLOG_XACT_COMMIT_PREPARED
Definition xact.h:173
#define XLOG_XACT_COMMIT
Definition xact.h:170
#define XLOG_XACT_OPMASK
Definition xact.h:180
#define XLOG_XACT_ABORT
Definition xact.h:172
#define XLOG_XACT_ABORT_PREPARED
Definition xact.h:174
void ParseCommitRecord(uint8 info, xl_xact_commit *xlrec, xl_xact_parsed_commit *parsed)
Definition xactdesc.c:35
void ParseAbortRecord(uint8 info, xl_xact_abort *xlrec, xl_xact_parsed_abort *parsed)
Definition xactdesc.c:141
int wal_decode_buffer_size
Definition xlog.c:140
bool EnableHotStandby
Definition xlog.c:125
XLogRecPtr GetRedoRecPtr(void)
Definition xlog.c:6547
void SetInstallXLogFileSegmentActive(void)
Definition xlog.c:9678
bool IsInstallXLogFileSegmentActive(void)
Definition xlog.c:9695
int wal_segment_size
Definition xlog.c:147
void SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI)
Definition xlog.c:6319
void RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
Definition xlog.c:3961
void ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
Definition xlog.c:6357
void ResetInstallXLogFileSegmentActive(void)
Definition xlog.c:9687
int wal_retrieve_retry_interval
Definition xlog.c:138
bool track_wal_io_timing
Definition xlog.c:141
static ControlFileData * ControlFile
Definition xlog.c:577
void XLogShutdownWalRcv(void)
Definition xlog.c:9668
bool XLogCheckpointNeeded(XLogSegNo new_segno)
Definition xlog.c:2267
#define TABLESPACE_MAP_OLD
Definition xlog.h:325
#define TABLESPACE_MAP
Definition xlog.h:324
#define STANDBY_SIGNAL_FILE
Definition xlog.h:320
#define CHECKPOINT_CAUSE_XLOG
Definition xlog.h:159
#define PROMOTE_SIGNAL_FILE
Definition xlog.h:328
#define BACKUP_LABEL_FILE
Definition xlog.h:321
#define RECOVERY_SIGNAL_FILE
Definition xlog.h:319
static RmgrData GetRmgr(RmgrId rmid)
#define XLogSegmentOffset(xlogptr, wal_segsz_bytes)
#define MAXFNAMELEN
#define XLOGDIR
#define XLByteToSeg(xlrp, logSegNo, wal_segsz_bytes)
static void XLogFilePath(char *path, TimeLineID tli, XLogSegNo logSegNo, int wal_segsz_bytes)
#define XRecOffIsValid(xlrp)
static void XLogFileName(char *fname, TimeLineID tli, XLogSegNo logSegNo, int wal_segsz_bytes)
#define XLByteInSeg(xlrp, logSegNo, wal_segsz_bytes)
bool RestoreArchivedFile(char *path, const char *xlogfname, const char *recovername, off_t expectedSize, bool cleanupEnabled)
Definition xlogarchive.c:55
void KeepFileRestoredFromArchive(const char *path, const char *xlogfname)
#define XLogRecPtrIsValid(r)
Definition xlogdefs.h:29
#define LSN_FORMAT_ARGS(lsn)
Definition xlogdefs.h:47
uint64 XLogRecPtr
Definition xlogdefs.h:21
#define InvalidXLogRecPtr
Definition xlogdefs.h:28
uint32 TimeLineID
Definition xlogdefs.h:63
uint64 XLogSegNo
Definition xlogdefs.h:52
void XLogPrefetcherComputeStats(XLogPrefetcher *prefetcher)
XLogPrefetcher * XLogPrefetcherAllocate(XLogReaderState *reader)
void XLogPrefetchReconfigure(void)
XLogRecord * XLogPrefetcherReadRecord(XLogPrefetcher *prefetcher, char **errmsg)
XLogReaderState * XLogPrefetcherGetReader(XLogPrefetcher *prefetcher)
void XLogPrefetcherBeginRead(XLogPrefetcher *prefetcher, XLogRecPtr recPtr)
void XLogPrefetcherFree(XLogPrefetcher *prefetcher)
bool XLogRecGetBlockTagExtended(XLogReaderState *record, uint8 block_id, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum, Buffer *prefetch_buffer)
XLogReaderState * XLogReaderAllocate(int wal_segment_size, const char *waldir, XLogReaderRoutine *routine, void *private_data)
Definition xlogreader.c:108
void XLogReaderSetDecodeBuffer(XLogReaderState *state, void *buffer, size_t size)
Definition xlogreader.c:92
void XLogReaderResetError(XLogReaderState *state)
bool XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, char *phdr)
void XLogReaderFree(XLogReaderState *state)
Definition xlogreader.c:163
bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
#define XLogRecGetDataLen(decoder)
Definition xlogreader.h:415
#define XLogRecGetInfo(decoder)
Definition xlogreader.h:409
#define XLogRecBlockImageApply(decoder, block_id)
Definition xlogreader.h:424
#define XLogRecGetRmid(decoder)
Definition xlogreader.h:410
#define XLogRecGetData(decoder)
Definition xlogreader.h:414
#define XLogRecGetXid(decoder)
Definition xlogreader.h:411
#define XL_ROUTINE(...)
Definition xlogreader.h:117
#define XLogRecMaxBlockId(decoder)
Definition xlogreader.h:417
XLogPageReadResult
Definition xlogreader.h:349
@ XLREAD_WOULDBLOCK
Definition xlogreader.h:352
@ XLREAD_SUCCESS
Definition xlogreader.h:350
@ XLREAD_FAIL
Definition xlogreader.h:351
#define XLogRecHasBlockImage(decoder, block_id)
Definition xlogreader.h:422
#define XLogRecGetPrev(decoder)
Definition xlogreader.h:408
#define XLogRecHasAnyBlockRefs(decoder)
Definition xlogreader.h:416
#define SizeOfXLogRecordDataHeaderShort
Definition xlogrecord.h:217
#define XLR_INFO_MASK
Definition xlogrecord.h:62
#define SizeOfXLogRecord
Definition xlogrecord.h:55
#define XLR_CHECK_CONSISTENCY
Definition xlogrecord.h:91
bool reachedConsistency
bool check_primary_slot_name(char **newval, void **extra, GucSource source)
static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
static XLogRecPtr recoveryStopLSN
static bool recoveryStopsBefore(XLogReaderState *record)
static TimestampTz recoveryStopTime
void assign_recovery_target_xid(const char *newval, void *extra)
static bool CheckForStandbyTrigger(void)
int recovery_min_apply_delay
bool check_recovery_target(char **newval, void **extra, GucSource source)
static bool backupEndRequired
bool HotStandbyActive(void)
static char * getRecoveryStopReason(void)
void ShutdownWalRecovery(void)
RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal
int recoveryTargetAction
static void rm_redo_error_callback(void *arg)
static bool recoveryApplyDelay(XLogReaderState *record)
bool ArchiveRecoveryRequested
const char * recoveryTargetName
static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
bool check_recovery_target_timeline(char **newval, void **extra, GucSource source)
static XLogRecPtr minRecoveryPoint
static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *readBuf)
static XLogRecPtr backupEndPoint
const struct config_enum_entry recovery_target_action_options[]
static void validateRecoveryParameters(void)
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI, TimeLineID replayTLI)
static XLogRecord * ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr, TimeLineID replayTLI)
void StartupRequestWalReceiverRestart(void)
bool InArchiveRecovery
static bool recoveryStopsAfter(XLogReaderState *record)
void RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
char * PrimarySlotName
static TimeLineID curFileTLI
static char recoveryStopName[MAXFNAMELEN]
static void CheckRecoveryConsistency(void)
static bool pendingWalRcvRestart
void PerformWalRecovery(void)
static XLogSource XLogReceiptSource
bool CheckPromoteSignal(void)
struct XLogPageReadPrivate XLogPageReadPrivate
static bool recoveryStopAfter
static const char *const xlogSourceNames[]
static TimeLineID RedoStartTLI
char * recoveryRestoreCommand
static void verifyBackupPageConsistency(XLogReaderState *record)
static int XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
void assign_recovery_target(const char *newval, void *extra)
void SetRecoveryPause(bool recoveryPause)
static bool lastSourceFailed
char * archiveCleanupCommand
XLogRecPtr GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
static TimeLineID receiveTLI
void WakeupRecovery(void)
void xlog_outdesc(StringInfo buf, XLogReaderState *record)
static bool LocalPromoteIsTriggered
bool PromoteIsTriggered(void)
TimestampTz GetCurrentChunkReplayStartTime(void)
static void ConfirmRecoveryPaused(void)
static void readRecoverySignalFile(void)
static XLogRecPtr missingContrecPtr
XLogRecoveryCtlData * XLogRecoveryCtl
static uint32 readOff
static bool standby_signal_file_found
char * recovery_target_time_string
bool StandbyMode
static int readFile
static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, bool fetching_ckpt, XLogRecPtr tliRecPtr, TimeLineID replayTLI, XLogRecPtr replayLSN, bool nonblocking)
XLogRecPtr recoveryTargetLSN
RecoveryTargetType recoveryTarget
static bool read_tablespace_map(List **tablespaces)
static bool doRequestWalReceiverReply
static bool read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, bool *backupEndRequired, bool *backupFromStandby)
static int XLogFileRead(XLogSegNo segno, TimeLineID tli, XLogSource source, bool notfoundOk)
static XLogSource currentSource
XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI)
void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
static List * expectedTLEs
static XLogSegNo readSegNo
void assign_recovery_target_name(const char *newval, void *extra)
static XLogRecPtr abortedRecPtr
static char * primary_image_masked
static TimeLineID minRecoveryPointTLI
static XLogRecord * ReadRecord(XLogPrefetcher *xlogprefetcher, int emode, bool fetching_ckpt, TimeLineID replayTLI)
EndOfWalRecoveryInfo * FinishWalRecovery(void)
void assign_recovery_target_time(const char *newval, void *extra)
static void SetCurrentChunkStartTime(TimestampTz xtime)
static XLogRecPtr CheckPointLoc
bool check_recovery_target_xid(char **newval, void **extra, GucSource source)
static bool LocalHotStandbyActive
static bool HotStandbyActiveInReplay(void)
static bool InRedo
static TransactionId recoveryStopXid
bool check_recovery_target_time(char **newval, void **extra, GucSource source)
static XLogSource readSource
static void SetPromoteIsTriggered(void)
#define RECOVERY_COMMAND_FILE
TransactionId recoveryTargetXid
XLogSource
@ XLOG_FROM_PG_WAL
@ XLOG_FROM_STREAM
@ XLOG_FROM_ARCHIVE
@ XLOG_FROM_ANY
TimeLineID recoveryTargetTLIRequested
static pg_noreturn void error_multiple_recovery_targets(void)
void InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
static void xlog_block_info(StringInfo buf, XLogReaderState *record)
static TimestampTz XLogReceiptTime
static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
Size XLogRecoveryShmemSize(void)
static char * replay_image_masked
bool wal_receiver_create_temp_slot
static void CheckTablespaceDirectory(void)
char * recoveryEndCommand
RecoveryPauseState GetRecoveryPauseState(void)
TimeLineID recoveryTargetTLI
static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
void assign_recovery_target_lsn(const char *newval, void *extra)
bool check_recovery_target_lsn(char **newval, void **extra, GucSource source)
static XLogRecPtr RedoStartLSN
static XLogRecPtr flushedUpto
void XLogRecoveryShmemInit(void)
static void recoveryPausesHere(bool endOfRecovery)
static uint32 readLen
static void EnableStandbyMode(void)
#define RECOVERY_COMMAND_DONE
static bool recovery_signal_file_found
TimestampTz recoveryTargetTime
TimestampTz GetLatestXTime(void)
char * PrimaryConnInfo
void XLogRequestWalReceiverReply(void)
static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
static XLogPrefetcher * xlogprefetcher
static bool StandbyModeRequested
bool check_recovery_target_name(char **newval, void **extra, GucSource source)
bool recoveryTargetInclusive
static XLogReaderState * xlogreader
void RemovePromoteSignalFiles(void)
void assign_recovery_target_timeline(const char *newval, void *extra)
static XLogRecPtr backupStartPoint
static void SetLatestXTime(TimestampTz xtime)
static TimeLineID CheckPointTLI
@ RECOVERY_TARGET_ACTION_PAUSE
@ RECOVERY_TARGET_ACTION_PROMOTE
@ RECOVERY_TARGET_ACTION_SHUTDOWN
RecoveryTargetType
@ RECOVERY_TARGET_IMMEDIATE
@ RECOVERY_TARGET_TIME
@ RECOVERY_TARGET_UNSET
@ RECOVERY_TARGET_XID
@ RECOVERY_TARGET_LSN
@ RECOVERY_TARGET_NAME
RecoveryTargetTimeLineGoal
@ RECOVERY_TARGET_TIMELINE_NUMERIC
@ RECOVERY_TARGET_TIMELINE_CONTROLFILE
@ RECOVERY_TARGET_TIMELINE_LATEST
RecoveryPauseState
@ RECOVERY_PAUSED
@ RECOVERY_NOT_PAUSED
@ RECOVERY_PAUSE_REQUESTED
void wal_segment_close(XLogReaderState *state)
Definition xlogutils.c:831
Buffer XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum, BlockNumber blkno, ReadBufferMode mode, Buffer recent_buffer)
Definition xlogutils.c:460
HotStandbyState standbyState
Definition xlogutils.c:53
bool InRecovery
Definition xlogutils.c:50
void XLogCheckInvalidPages(void)
Definition xlogutils.c:234
@ STANDBY_SNAPSHOT_READY
Definition xlogutils.h:55
@ STANDBY_INITIALIZED
Definition xlogutils.h:53
struct WaitLSNState * waitLSNState
Definition xlogwait.c:69
void WaitLSNWakeup(WaitLSNType lsnType, XLogRecPtr currentLSN)
Definition xlogwait.c:318
@ WAIT_LSN_TYPE_STANDBY_REPLAY
Definition xlogwait.h:39