PostgreSQL Source Code git master
xlogrecovery.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * xlogrecovery.c
4 * Functions for WAL recovery, standby mode
5 *
6 * This source file contains functions controlling WAL recovery.
7 * InitWalRecovery() initializes the system for crash or archive recovery,
8 * or standby mode, depending on configuration options and the state of
9 * the control file and possible backup label file. PerformWalRecovery()
10 * performs the actual WAL replay, calling the rmgr-specific redo routines.
11 * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12 * and prepares information needed to initialize the WAL for writes. In
13 * addition to these three main functions, there are a bunch of functions
14 * for interrogating recovery state and controlling the recovery process.
15 *
16 *
17 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
18 * Portions Copyright (c) 1994, Regents of the University of California
19 *
20 * src/backend/access/transam/xlogrecovery.c
21 *
22 *-------------------------------------------------------------------------
23 */
24
25#include "postgres.h"
26
27#include <ctype.h>
28#include <math.h>
29#include <time.h>
30#include <sys/stat.h>
31#include <sys/time.h>
32#include <unistd.h>
33
34#include "access/timeline.h"
35#include "access/transam.h"
36#include "access/xact.h"
38#include "access/xlogarchive.h"
40#include "access/xlogreader.h"
41#include "access/xlogrecovery.h"
42#include "access/xlogutils.h"
43#include "access/xlogwait.h"
44#include "backup/basebackup.h"
45#include "catalog/pg_control.h"
46#include "commands/tablespace.h"
47#include "common/file_utils.h"
48#include "miscadmin.h"
49#include "nodes/miscnodes.h"
50#include "pgstat.h"
51#include "postmaster/bgwriter.h"
52#include "postmaster/startup.h"
53#include "replication/slot.h"
56#include "storage/fd.h"
57#include "storage/ipc.h"
58#include "storage/latch.h"
59#include "storage/pmsignal.h"
60#include "storage/procarray.h"
61#include "storage/spin.h"
62#include "utils/datetime.h"
63#include "utils/fmgrprotos.h"
64#include "utils/guc_hooks.h"
66#include "utils/pg_lsn.h"
67#include "utils/ps_status.h"
68#include "utils/pg_rusage.h"
69
70/* Unsupported old recovery command file names (relative to $PGDATA) */
71#define RECOVERY_COMMAND_FILE "recovery.conf"
72#define RECOVERY_COMMAND_DONE "recovery.done"
73
74/*
75 * GUC support
76 */
78 {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
79 {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
80 {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
81 {NULL, 0, false}
82};
83
84/* options formerly taken from recovery.conf for archive recovery */
86char *recoveryEndCommand = NULL;
97
98/* options formerly taken from recovery.conf for XLOG streaming */
99char *PrimaryConnInfo = NULL;
100char *PrimarySlotName = NULL;
102
103/*
104 * recoveryTargetTimeLineGoal: what the user requested, if any
105 *
106 * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
107 *
108 * recoveryTargetTLI: the currently understood target timeline; may change during recovery
109 *
110 * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
111 * the timelines of its known parents, newest first (so recoveryTargetTLI is
112 * always the first list member). Only these TLIs are expected to be seen in
113 * the WAL segments we read, and indeed only these TLIs will be considered as
114 * candidate WAL files to open at all.
115 *
116 * curFileTLI: the TLI appearing in the name of the current input WAL file.
117 * (This is not necessarily the same as the timeline from which we are
118 * replaying WAL, which StartupXLOG calls replayTLI, because we could be
119 * scanning data that was copied from an ancestor timeline when the current
120 * file was created.) During a sequential scan we do not allow this value
121 * to decrease.
122 */
128
129/*
130 * When ArchiveRecoveryRequested is set, archive recovery was requested,
131 * i.e. signal files were present. When InArchiveRecovery is set, we are
132 * currently recovering using offline XLOG archives. These variables are only
133 * valid in the startup process.
134 *
135 * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
136 * currently performing crash recovery using only XLOG files in pg_wal, but
137 * will switch to using offline XLOG archives as soon as we reach the end of
138 * WAL in pg_wal.
139 */
141bool InArchiveRecovery = false;
142
143/*
144 * When StandbyModeRequested is set, standby mode was requested, i.e.
145 * standby.signal file was present. When StandbyMode is set, we are currently
146 * in standby mode. These variables are only valid in the startup process.
147 * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
148 */
149static bool StandbyModeRequested = false;
150bool StandbyMode = false;
151
152/* was a signal file present at startup? */
153static bool standby_signal_file_found = false;
154static bool recovery_signal_file_found = false;
155
156/*
157 * CheckPointLoc is the position of the checkpoint record that determines
158 * where to start the replay. It comes from the backup label file or the
159 * control file.
160 *
161 * RedoStartLSN is the checkpoint's REDO location, also from the backup label
162 * file or the control file. In standby mode, XLOG streaming usually starts
163 * from the position where an invalid record was found. But if we fail to
164 * read even the initial checkpoint record, we use the REDO location instead
165 * of the checkpoint location as the start position of XLOG streaming.
166 * Otherwise we would have to jump backwards to the REDO location after
167 * reading the checkpoint record, because the REDO record can precede the
168 * checkpoint record.
169 */
174
175/*
176 * Local copy of SharedHotStandbyActive variable. False actually means "not
177 * known, need to check the shared state".
178 */
179static bool LocalHotStandbyActive = false;
180
181/*
182 * Local copy of SharedPromoteIsTriggered variable. False actually means "not
183 * known, need to check the shared state".
184 */
185static bool LocalPromoteIsTriggered = false;
186
187/* Has the recovery code requested a walreceiver wakeup? */
189
190/* XLogReader object used to parse the WAL records */
192
193/* XLogPrefetcher object used to consume WAL records with read-ahead */
195
196/* Parameters passed down from ReadRecord to the XLogPageRead callback. */
198{
199 int emode;
200 bool fetching_ckpt; /* are we fetching a checkpoint record? */
204
205/* flag to tell XLogPageRead that we have started replaying */
206static bool InRedo = false;
207
208/*
209 * Codes indicating where we got a WAL file from during recovery, or where
210 * to attempt to get one. Member order matches xlogSourceNames[].
211 */
212typedef enum
213{
214 XLOG_FROM_ANY = 0, /* request to read WAL from any source */
215 XLOG_FROM_ARCHIVE, /* restored using restore_command */
216 XLOG_FROM_PG_WAL, /* existing file in pg_wal */
217 XLOG_FROM_STREAM, /* streamed from primary */
218} XLogSource;
219
220/* human-readable names for XLogSources (in enum order), for debugging output */
221static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
222
223/*
224 * readFile is -1 or a kernel FD for the log file segment that's currently
225 * open for reading. readSegNo identifies the segment. readOff is the offset
226 * of the page just read, readLen indicates how much of it has been read into
227 * readBuf, and readSource indicates where we got the currently open file from.
228 *
229 * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
230 * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
231 * worthwhile, since the XLOG is not read by general-purpose sessions.
232 */
233static int readFile = -1;
235static uint32 readOff = 0;
236static uint32 readLen = 0;
238
239/*
240 * Keeps track of which source we're currently reading from. This is
241 * different from readSource in that this is always set, even when we don't
242 * currently have a WAL file open. If lastSourceFailed is set, our last
243 * attempt to read from currentSource failed, and we should try another source
244 * next.
245 *
246 * pendingWalRcvRestart is set when a config change occurs that requires a
247 * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
248 */
250static bool lastSourceFailed = false;
251static bool pendingWalRcvRestart = false;
252
253/*
254 * These variables track when we last obtained some WAL data to process,
255 * and where we got it from. (XLogReceiptSource is initially the same as
256 * readSource, but readSource gets reset to zero when we don't have data
257 * to process right now. It is also different from currentSource, which
258 * also changes when we try to read from a source and fail, while
259 * XLogReceiptSource tracks where we last successfully read some WAL.)
260 */
263
264/* Local copy of WalRcv->flushedUpto */
267
268/*
269 * Copy of minRecoveryPoint and backupEndPoint from the control file.
270 *
271 * In order to reach consistency, we must replay the WAL up to
272 * minRecoveryPoint. If backupEndRequired is true, we must also reach
273 * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
274 * to backupStartPoint.
275 *
276 * Note: In archive recovery, after consistency has been reached, the
277 * functions in xlog.c will start updating minRecoveryPoint in the control
278 * file. But this copy of minRecoveryPoint variable reflects the value at the
279 * beginning of recovery, and is *not* updated after consistency is reached.
280 */
283
286static bool backupEndRequired = false;
287
288/*
289 * Have we reached a consistent database state? In crash recovery, we have
290 * to replay all the WAL, so reachedConsistency is never set. During archive
291 * recovery, the database is consistent once minRecoveryPoint is reached.
292 *
293 * Consistent state means that the system is internally consistent, all
294 * the WAL has been replayed up to a certain point, and importantly, there
295 * is no trace of later actions on disk.
296 *
297 * This flag is used only by the startup process and postmaster. When
298 * minRecoveryPoint is reached, the startup process sets it to true and
299 * sends a PMSIGNAL_RECOVERY_CONSISTENT signal to the postmaster,
300 * which then sets it to true upon receiving the signal.
301 */
303
304/* Buffers dedicated to consistency checks of size BLCKSZ */
305static char *replay_image_masked = NULL;
306static char *primary_image_masked = NULL;
307
308
309/*
310 * Shared-memory state for WAL recovery.
311 */
313{
314 /*
315 * SharedHotStandbyActive indicates if we allow hot standby queries to be
316 * run. Protected by info_lck.
317 */
319
320 /*
321 * SharedPromoteIsTriggered indicates if a standby promotion has been
322 * triggered. Protected by info_lck.
323 */
325
326 /*
327 * recoveryWakeupLatch is used to wake up the startup process to continue
328 * WAL replay, if it is waiting for WAL to arrive or promotion to be
329 * requested.
330 *
331 * Note that the startup process also uses another latch, its procLatch,
332 * to wait for recovery conflict. We could get rid of recoveryWakeupLatch for
333 * signaling the startup process in favor of using its procLatch, which would
334 * comport better with possible generic signal handlers using that latch.
335 * But we should not do that because the startup process doesn't assume
336 * that it's woken up by walreceiver process or SIGHUP signal handler
337 * while it's waiting for recovery conflict. The separate latches,
338 * recoveryWakeupLatch and procLatch, should be used for inter-process
339 * communication for WAL replay and recovery conflict, respectively.
340 */
342
343 /*
344 * Last record successfully replayed.
345 */
346 XLogRecPtr lastReplayedReadRecPtr; /* start position */
347 XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */
349
350 /*
351 * When we're currently replaying a record, i.e. in a redo function,
352 * replayEndRecPtr points to the end+1 of the record being replayed,
353 * otherwise it's equal to lastReplayedEndRecPtr.
354 */
357 /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
359
360 /*
361 * timestamp of when we started replaying the current chunk of WAL data,
362 * only relevant for replication or archive recovery
363 */
365 /* Recovery pause state */
368
369 slock_t info_lck; /* locks shared variables shown above */
371
373
374/*
375 * abortedRecPtr is the start pointer of a broken record at end of WAL when
376 * recovery completes; missingContrecPtr is the location of the first
377 * contrecord that went missing. See CreateOverwriteContrecordRecord for
378 * details.
379 */
382
383/*
384 * if recoveryStopsBefore/After returns true, it saves information of the stop
385 * point here
386 */
392
393/* prototypes for local functions */
394static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
395
396static void EnableStandbyMode(void);
397static void readRecoverySignalFile(void);
398static void validateRecoveryParameters(void);
399static bool read_backup_label(XLogRecPtr *checkPointLoc,
400 TimeLineID *backupLabelTLI,
401 bool *backupEndRequired, bool *backupFromStandby);
402static bool read_tablespace_map(List **tablespaces);
403
404static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
405static void CheckRecoveryConsistency(void);
406static void rm_redo_error_callback(void *arg);
407#ifdef WAL_DEBUG
408static void xlog_outrec(StringInfo buf, XLogReaderState *record);
409#endif
410static void xlog_block_info(StringInfo buf, XLogReaderState *record);
411static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
412 TimeLineID prevTLI, TimeLineID replayTLI);
413static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
415
416static bool recoveryStopsBefore(XLogReaderState *record);
417static bool recoveryStopsAfter(XLogReaderState *record);
418static char *getRecoveryStopReason(void);
419static void recoveryPausesHere(bool endOfRecovery);
420static bool recoveryApplyDelay(XLogReaderState *record);
421static void ConfirmRecoveryPaused(void);
422
424 int emode, bool fetching_ckpt,
425 TimeLineID replayTLI);
426
427static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
428 int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
430 bool randAccess,
431 bool fetching_ckpt,
432 XLogRecPtr tliRecPtr,
433 TimeLineID replayTLI,
434 XLogRecPtr replayLSN,
435 bool nonblocking);
436static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
438 XLogRecPtr RecPtr, TimeLineID replayTLI);
439static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
440static int XLogFileRead(XLogSegNo segno, TimeLineID tli,
441 XLogSource source, bool notfoundOk);
443
444static bool CheckForStandbyTrigger(void);
445static void SetPromoteIsTriggered(void);
446static bool HotStandbyActiveInReplay(void);
447
448static void SetCurrentChunkStartTime(TimestampTz xtime);
449static void SetLatestXTime(TimestampTz xtime);
450
451/*
452 * Initialization of shared memory for WAL recovery
453 */
454Size
456{
457 Size size;
458
459 /* XLogRecoveryCtl */
460 size = sizeof(XLogRecoveryCtlData);
461
462 return size;
463}
464
465void
467{
468 bool found;
469
471 ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
472 if (found)
473 return;
474 memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
475
479}
480
481/*
482 * A thin wrapper to enable StandbyMode and do other preparatory work as
483 * needed.
484 */
485static void
487{
488 StandbyMode = true;
489
490 /*
491 * To avoid server log bloat, we don't report recovery progress in a
492 * standby as it will always be in recovery unless promoted. We disable
493 * startup progress timeout in standby mode to avoid calling
494 * startup_progress_timeout_handler() unnecessarily.
495 */
497}
498
499/*
500 * Prepare the system for WAL recovery, if needed.
501 *
502 * This is called by StartupXLOG() which coordinates the server startup
503 * sequence. This function analyzes the control file and the backup label
504 * file, if any, and figures out whether we need to perform crash recovery or
505 * archive recovery, and how far we need to replay the WAL to reach a
506 * consistent state.
507 *
508 * This doesn't yet change the on-disk state, except for creating the symlinks
509 * from table space map file if any, and for fetching WAL files needed to find
510 * the checkpoint record. On entry, the caller has already read the control
511 * file into memory, and passes it as argument. This function updates it to
512 * reflect the recovery state, and the caller is expected to write it back to
513 * disk after initializing other subsystems, but before calling
514 * PerformWalRecovery().
515 *
516 * This initializes some global variables like ArchiveRecoveryRequested,
517 * StandbyModeRequested and InRecovery.
518 */
519void
521 bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
522{
523 XLogPageReadPrivate *private;
524 struct stat st;
525 bool wasShutdown;
526 XLogRecord *record;
527 DBState dbstate_at_startup;
528 bool haveTblspcMap = false;
529 bool haveBackupLabel = false;
530 CheckPoint checkPoint;
531 bool backupFromStandby = false;
532
533 dbstate_at_startup = ControlFile->state;
534
535 /*
536 * Initialize on the assumption we want to recover to the latest timeline
537 * that's active according to pg_control.
538 */
542 else
544
545 /*
546 * Check for signal files, and if so set up state for offline recovery
547 */
550
551 /*
552 * Take ownership of the wakeup latch if we may need to sleep during
553 * recovery.
554 */
557
558 /*
559 * Set the WAL reading processor now, as it will be needed when reading
560 * the checkpoint record required (backup_label or not).
561 */
563 xlogreader =
565 XL_ROUTINE(.page_read = &XLogPageRead,
566 .segment_open = NULL,
567 .segment_close = wal_segment_close),
568 private);
569 if (!xlogreader)
571 (errcode(ERRCODE_OUT_OF_MEMORY),
572 errmsg("out of memory"),
573 errdetail("Failed while allocating a WAL reading processor.")));
575
576 /*
577 * Set the WAL decode buffer size. This limits how far ahead we can read
578 * in the WAL.
579 */
581
582 /* Create a WAL prefetcher. */
584
585 /*
586 * Allocate two page buffers dedicated to WAL consistency checks. We do
587 * it this way, rather than just making static arrays, for two reasons:
588 * (1) no need to waste the storage in most instantiations of the backend;
589 * (2) a static char array isn't guaranteed to have any particular
590 * alignment, whereas palloc() will provide MAXALIGN'd storage.
591 */
592 replay_image_masked = (char *) palloc(BLCKSZ);
593 primary_image_masked = (char *) palloc(BLCKSZ);
594
595 /*
596 * Read the backup_label file. We want to run this part of the recovery
597 * process after checking for signal files and after performing validation
598 * of the recovery parameters.
599 */
601 &backupFromStandby))
602 {
603 List *tablespaces = NIL;
604
605 /*
606 * Archive recovery was requested, and thanks to the backup label
607 * file, we know how far we need to replay to reach consistency. Enter
608 * archive recovery directly.
609 */
610 InArchiveRecovery = true;
613
614 /*
615 * Omitting backup_label when creating a new replica, PITR node etc.
616 * unfortunately is a common cause of corruption. Logging that
617 * backup_label was used makes it a bit easier to exclude that as the
618 * cause of observed corruption.
619 *
620 * Do so before we try to read the checkpoint record (which can fail),
621 * as otherwise it can be hard to understand why a checkpoint other
622 * than ControlFile->checkPoint is used.
623 */
624 ereport(LOG,
625 errmsg("starting backup recovery with redo LSN %X/%08X, checkpoint LSN %X/%08X, on timeline ID %u",
629
630 /*
631 * When a backup_label file is present, we want to roll forward from
632 * the checkpoint it identifies, rather than using pg_control.
633 */
636 if (record != NULL)
637 {
638 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
639 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
641 errmsg_internal("checkpoint record is at %X/%08X",
643 InRecovery = true; /* force recovery even if SHUTDOWNED */
644
645 /*
646 * Make sure that REDO location exists. This may not be the case
647 * if there was a crash during an online backup, which left a
648 * backup_label around that references a WAL segment that's
649 * already been archived.
650 */
651 if (checkPoint.redo < CheckPointLoc)
652 {
654 if (!ReadRecord(xlogprefetcher, LOG, false,
655 checkPoint.ThisTimeLineID))
657 errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
659 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
660 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
661 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
663 }
664 }
665 else
666 {
668 errmsg("could not locate required checkpoint record at %X/%08X",
670 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
671 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
672 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
674 wasShutdown = false; /* keep compiler quiet */
675 }
676
677 /* Read the tablespace_map file if present and create symlinks. */
678 if (read_tablespace_map(&tablespaces))
679 {
680 ListCell *lc;
681
682 foreach(lc, tablespaces)
683 {
684 tablespaceinfo *ti = lfirst(lc);
685 char *linkloc;
686
687 linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
688
689 /*
690 * Remove the existing symlink, if any, and create the symlink
691 * under PGDATA.
692 */
694
695 if (symlink(ti->path, linkloc) < 0)
698 errmsg("could not create symbolic link \"%s\": %m",
699 linkloc)));
700
701 pfree(ti->path);
702 pfree(ti);
703 }
704
705 /* tell the caller to delete it later */
706 haveTblspcMap = true;
707 }
708
709 /* tell the caller to delete it later */
710 haveBackupLabel = true;
711 }
712 else
713 {
714 /* No backup_label file has been found if we are here. */
715
716 /*
717 * If tablespace_map file is present without backup_label file, there
718 * is no use for such a file. There is no harm in retaining it, but it
719 * is better to get rid of the map file so that we don't have any
720 * redundant file in data directory and it will avoid any sort of
721 * confusion. It seems prudent though to just rename the file out of
722 * the way rather than delete it completely, also we ignore any error
723 * that occurs in rename operation as even if map file is present
724 * without backup_label file, it is harmless.
725 */
726 if (stat(TABLESPACE_MAP, &st) == 0)
727 {
728 unlink(TABLESPACE_MAP_OLD);
730 ereport(LOG,
731 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
733 errdetail("File \"%s\" was renamed to \"%s\".",
735 else
736 ereport(LOG,
737 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
739 errdetail("Could not rename file \"%s\" to \"%s\": %m.",
741 }
742
743 /*
744 * It's possible that archive recovery was requested, but we don't
745 * know how far we need to replay the WAL before we reach consistency.
746 * This can happen for example if a base backup is taken from a
747 * running server using an atomic filesystem snapshot, without calling
748 * pg_backup_start/stop. Or if you just kill a running primary server
749 * and put it into archive recovery by creating a recovery signal
750 * file.
751 *
752 * Our strategy in that case is to perform crash recovery first,
753 * replaying all the WAL present in pg_wal, and only enter archive
754 * recovery after that.
755 *
756 * But usually we already know how far we need to replay the WAL (up
757 * to minRecoveryPoint, up to backupEndPoint, or until we see an
758 * end-of-backup record), and we can enter archive recovery directly.
759 */
765 {
766 InArchiveRecovery = true;
769 }
770
771 /*
772 * For the same reason as when starting up with backup_label present,
773 * emit a log message when we continue initializing from a base
774 * backup.
775 */
777 ereport(LOG,
778 errmsg("restarting backup recovery with redo LSN %X/%08X",
780
781 /* Get the last valid checkpoint record. */
788 if (record != NULL)
789 {
791 errmsg_internal("checkpoint record is at %X/%08X",
793 }
794 else
795 {
796 /*
797 * We used to attempt to go back to a secondary checkpoint record
798 * here, but only when not in standby mode. We now just fail if we
799 * can't read the last checkpoint because this allows us to
800 * simplify processing around checkpoints.
801 */
803 errmsg("could not locate a valid checkpoint record at %X/%08X",
805 }
806 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
807 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
808
809 /* Make sure that REDO location exists. */
810 if (checkPoint.redo < CheckPointLoc)
811 {
813 if (!ReadRecord(xlogprefetcher, LOG, false, checkPoint.ThisTimeLineID))
815 errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
817 }
818 }
819
821 {
823 ereport(LOG,
824 (errmsg("entering standby mode")));
826 ereport(LOG,
827 (errmsg("starting point-in-time recovery to XID %u",
830 ereport(LOG,
831 (errmsg("starting point-in-time recovery to %s",
834 ereport(LOG,
835 (errmsg("starting point-in-time recovery to \"%s\"",
838 ereport(LOG,
839 errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%08X\"",
842 ereport(LOG,
843 (errmsg("starting point-in-time recovery to earliest consistent point")));
844 else
845 ereport(LOG,
846 (errmsg("starting archive recovery")));
847 }
848
849 /*
850 * If the location of the checkpoint record is not on the expected
851 * timeline in the history of the requested timeline, we cannot proceed:
852 * the backup is not part of the history of the requested timeline.
853 */
854 Assert(expectedTLEs); /* was initialized by reading checkpoint
855 * record */
858 {
859 XLogRecPtr switchpoint;
860
861 /*
862 * tliSwitchPoint will throw an error if the checkpoint's timeline is
863 * not in expectedTLEs at all.
864 */
865 switchpoint = tliSwitchPoint(CheckPointTLI, expectedTLEs, NULL);
867 (errmsg("requested timeline %u is not a child of this server's history",
869 /* translator: %s is a backup_label file or a pg_control file */
870 errdetail("Latest checkpoint in file \"%s\" is at %X/%08X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%08X.",
871 haveBackupLabel ? "backup_label" : "pg_control",
874 LSN_FORMAT_ARGS(switchpoint))));
875 }
876
877 /*
878 * The min recovery point should be part of the requested timeline's
879 * history, too.
880 */
885 errmsg("requested timeline %u does not contain minimum recovery point %X/%08X on timeline %u",
889
891 errmsg_internal("redo record is at %X/%08X; shutdown %s",
892 LSN_FORMAT_ARGS(checkPoint.redo),
893 wasShutdown ? "true" : "false"));
895 (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
897 checkPoint.nextOid)));
899 (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %" PRIu64,
900 checkPoint.nextMulti, checkPoint.nextMultiOffset)));
902 (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
903 checkPoint.oldestXid, checkPoint.oldestXidDB)));
905 (errmsg_internal("oldest MultiXactId: %u, in database %u",
906 checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
908 (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
909 checkPoint.oldestCommitTsXid,
910 checkPoint.newestCommitTsXid)));
913 (errmsg("invalid next transaction ID")));
914
915 /* sanity check */
916 if (checkPoint.redo > CheckPointLoc)
918 (errmsg("invalid redo in checkpoint record")));
919
920 /*
921 * Check whether we need to force recovery from WAL. If it appears to
922 * have been a clean shutdown and we did not have a recovery signal file,
923 * then assume no recovery needed.
924 */
925 if (checkPoint.redo < CheckPointLoc)
926 {
927 if (wasShutdown)
929 (errmsg("invalid redo record in shutdown checkpoint")));
930 InRecovery = true;
931 }
932 else if (ControlFile->state != DB_SHUTDOWNED)
933 InRecovery = true;
935 {
936 /* force recovery due to presence of recovery signal file */
937 InRecovery = true;
938 }
939
940 /*
941 * If recovery is needed, update our in-memory copy of pg_control to show
942 * that we are recovering and to show the selected checkpoint as the place
943 * we are starting from. We also mark pg_control with any minimum recovery
944 * stop point obtained from a backup history file.
945 *
946 * We don't write the changes to disk yet, though. Only do that after
947 * initializing various subsystems.
948 */
949 if (InRecovery)
950 {
952 {
954 }
955 else
956 {
957 ereport(LOG,
958 (errmsg("database system was not properly shut down; "
959 "automatic recovery in progress")));
961 ereport(LOG,
962 (errmsg("crash recovery starts in timeline %u "
963 "and has target timeline %u",
967 }
969 ControlFile->checkPointCopy = checkPoint;
971 {
972 /* initialize minRecoveryPoint if not set yet */
973 if (ControlFile->minRecoveryPoint < checkPoint.redo)
974 {
975 ControlFile->minRecoveryPoint = checkPoint.redo;
977 }
978 }
979
980 /*
981 * Set backupStartPoint if we're starting recovery from a base backup.
982 *
983 * Also set backupEndPoint and use minRecoveryPoint as the backup end
984 * location if we're starting recovery from a base backup which was
985 * taken from a standby. In this case, the database system status in
986 * pg_control must indicate that the database was already in recovery.
987 * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
988 * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
989 * before reaching this point; e.g. because restore_command or
990 * primary_conninfo were faulty.
991 *
992 * Any other state indicates that the backup somehow became corrupted
993 * and we can't sensibly continue with recovery.
994 */
995 if (haveBackupLabel)
996 {
997 ControlFile->backupStartPoint = checkPoint.redo;
999
1000 if (backupFromStandby)
1001 {
1002 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
1003 dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
1004 ereport(FATAL,
1005 (errmsg("backup_label contains data inconsistent with control file"),
1006 errhint("This means that the backup is corrupted and you will "
1007 "have to use another backup for recovery.")));
1009 }
1010 }
1011 }
1012
1013 /* remember these, so that we know when we have reached consistency */
1018 {
1021 }
1022 else
1023 {
1026 }
1027
1028 /*
1029 * Start recovery assuming that the final record isn't lost.
1030 */
1033
1034 *wasShutdown_ptr = wasShutdown;
1035 *haveBackupLabel_ptr = haveBackupLabel;
1036 *haveTblspcMap_ptr = haveTblspcMap;
1037}
1038
1039/*
1040 * See if there are any recovery signal files and if so, set state for
1041 * recovery.
1042 *
1043 * See if there is a recovery command file (recovery.conf), and if so
1044 * throw a FATAL error since as of PG12 we no longer recognize that.
1045 */
1046static void
1048{
1049 struct stat stat_buf;
1050
1052 return;
1053
1054 /*
1055 * Check for old recovery API file: recovery.conf
1056 */
1057 if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
1058 ereport(FATAL,
1060 errmsg("using recovery command file \"%s\" is not supported",
1062
1063 /*
1064 * Remove unused .done file, if present. Ignore if absent.
1065 */
1066 unlink(RECOVERY_COMMAND_DONE);
1067
1068 /*
1069 * Check for recovery signal files and if found, fsync them since they
1070 * represent server state information. We don't sweat too much about the
1071 * possibility of fsync failure, however.
1072 *
1073 * If present, standby signal file takes precedence. If neither is present
1074 * then we won't enter archive recovery.
1075 */
1076 if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1077 {
1078 int fd;
1079
1081 S_IRUSR | S_IWUSR);
1082 if (fd >= 0)
1083 {
1084 (void) pg_fsync(fd);
1085 close(fd);
1086 }
1088 }
1089 else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1090 {
1091 int fd;
1092
1094 S_IRUSR | S_IWUSR);
1095 if (fd >= 0)
1096 {
1097 (void) pg_fsync(fd);
1098 close(fd);
1099 }
1101 }
1102
1103 StandbyModeRequested = false;
1106 {
1107 StandbyModeRequested = true;
1109 }
1111 {
1112 StandbyModeRequested = false;
1114 }
1115 else
1116 return;
1117
1118 /*
1119 * We don't support standby mode in standalone backends; that requires
1120 * other processes such as the WAL receiver to be alive.
1121 */
1123 ereport(FATAL,
1124 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1125 errmsg("standby mode is not supported by single-user servers")));
1126}
1127
1128static void
1130{
1132 return;
1133
1134 /*
1135 * Check for compulsory parameters
1136 */
1138 {
1139 if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1140 (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1142 (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1143 errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1144 }
1145 else
1146 {
1147 if (recoveryRestoreCommand == NULL ||
1148 strcmp(recoveryRestoreCommand, "") == 0)
1149 ereport(FATAL,
1150 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1151 errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1152 }
1153
1154 /*
1155 * Override any inconsistent requests. Note that this is a change of
1156 * behaviour in 9.5; prior to this we simply ignored a request to pause if
1157 * hot_standby = off, which was surprising behaviour.
1158 */
1162
1163 /*
1164 * Final parsing of recovery_target_time string; see also
1165 * check_recovery_target_time().
1166 */
1168 {
1172 Int32GetDatum(-1)));
1173 }
1174
1175 /*
1176 * If user specified recovery_target_timeline, validate it or compute the
1177 * "latest" value. We can't do this until after we've gotten the restore
1178 * command and set InArchiveRecovery, because we need to fetch timeline
1179 * history files from the archive.
1180 */
1182 {
1184
1185 /* Timeline 1 does not have a history file, all else should */
1186 if (rtli != 1 && !existsTimeLineHistory(rtli))
1187 ereport(FATAL,
1188 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1189 errmsg("recovery target timeline %u does not exist",
1190 rtli)));
1191 recoveryTargetTLI = rtli;
1192 }
1194 {
1195 /* We start the "latest" search from pg_control's timeline */
1197 }
1198 else
1199 {
1200 /*
1201 * else we just use the recoveryTargetTLI as already read from
1202 * ControlFile
1203 */
1205 }
1206}
1207
1208/*
1209 * read_backup_label: check to see if a backup_label file is present
1210 *
1211 * If we see a backup_label during recovery, we assume that we are recovering
1212 * from a backup dump file, and we therefore roll forward from the checkpoint
1213 * identified by the label file, NOT what pg_control says. This avoids the
1214 * problem that pg_control might have been archived one or more checkpoints
1215 * later than the start of the dump, and so if we rely on it as the start
1216 * point, we will fail to restore a consistent database state.
1217 *
1218 * Returns true if a backup_label was found (and fills the checkpoint
1219 * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1220 * returns false if not. If this backup_label came from a streamed backup,
1221 * *backupEndRequired is set to true. If this backup_label was created during
1222 * recovery, *backupFromStandby is set to true.
1223 *
1224 * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1225 * and TLI read from the backup file.
1226 */
1227static bool
1228read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1229 bool *backupEndRequired, bool *backupFromStandby)
1230{
     /*
      * Only written by the first fscanf() below; nothing else in this
      * function reads it back.
      */
1231 char startxlogfilename[MAXFNAMELEN];
1232 TimeLineID tli_from_walseg,
1233 tli_from_file;
1234 FILE *lfp;
1235 char ch;
     /* 20-byte buffers, paired with the %19s conversions further down */
1236 char backuptype[20];
1237 char backupfrom[20];
     /* filled through %1023[^\n] below -- assumes MAXPGPATH >= 1024, TODO confirm */
1238 char backuplabel[MAXPGPATH];
     /* 128-byte buffer, paired with the %127[^\n] conversion below */
1239 char backuptime[128];
1240 uint32 hi,
1241 lo;
1242
1243 /* suppress possible uninitialized-variable warnings */
1244 *checkPointLoc = InvalidXLogRecPtr;
1245 *backupLabelTLI = 0;
1246 *backupEndRequired = false;
1247 *backupFromStandby = false;
1248
1249 /*
1250 * See if label file is present
1251 */
1252 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1253 if (!lfp)
1254 {
     /* Only a missing file is tolerated; any other open failure is FATAL. */
1255 if (errno != ENOENT)
1256 ereport(FATAL,
1258 errmsg("could not read file \"%s\": %m",
1260 return false; /* it's not there, all is fine */
1261 }
1262
1263 /*
1264 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1265 * is pretty crude, but we are not expecting any variability in the file
1266 * format).
1267 */
     /*
      * Expected shape of the first line:
      * "START WAL LOCATION: <hi>/<lo> (file <8 hex digits><rest of name>)".
      * The 8 hex digits are taken as the timeline ID, and up to 16 further
      * characters are stored in startxlogfilename (assumes MAXFNAMELEN > 16
      * -- TODO confirm). The trailing %c must pick up a newline, so any
      * extra text after the closing parenthesis is rejected.
      */
1268 if (fscanf(lfp, "START WAL LOCATION: %X/%08X (file %08X%16s)%c",
1269 &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1270 ereport(FATAL,
1271 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1272 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
     /* Recombine the two 32-bit halves into one 64-bit LSN. */
1273 RedoStartLSN = ((uint64) hi) << 32 | lo;
1274 RedoStartTLI = tli_from_walseg;
     /* Same strictness as above: the line must end right after the LSN. */
1275 if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%08X%c",
1276 &hi, &lo, &ch) != 3 || ch != '\n')
1277 ereport(FATAL,
1278 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1279 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1280 *checkPointLoc = ((uint64) hi) << 32 | lo;
     /*
      * The CHECKPOINT LOCATION line carries no timeline in this format, so
      * the TLI parsed from the START WAL LOCATION file name is reused.
      */
1281 *backupLabelTLI = tli_from_walseg;
1282
1283 /*
1284 * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1285 * which could mean either pg_basebackup or the pg_backup_start/stop
1286 * method was used) or if this label came from somewhere else (the only
1287 * other option today being from pg_rewind). If this was a streamed
1288 * backup then we know that we need to play through until we get to the
1289 * end of the WAL which was generated during the backup (at which point we
1290 * will have reached consistency and backupEndRequired will be reset to be
1291 * false).
1292 */
     /*
      * From here on every field is optional: a non-matching fscanf() simply
      * leaves the corresponding output at its default.
      *
      * NOTE(review): a failed fscanf() still consumes whatever part of the
      * literal prefix did match. "BACKUP METHOD:" and "BACKUP FROM:" share
      * "BACKUP ", and "START TIME:" is a prefix of "START TIMELINE:", so a
      * label file lacking the earlier field would presumably make the later
      * one fail to match as well -- confirm that the code writing
      * backup_label always emits these fields together and in this order.
      */
1293 if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1294 {
1295 if (strcmp(backuptype, "streamed") == 0)
1296 *backupEndRequired = true;
1297 }
1298
1299 /*
1300 * BACKUP FROM lets us know if this was from a primary or a standby. If
1301 * it was from a standby, we'll double-check that the control file state
1302 * matches that of a standby.
1303 */
1304 if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1305 {
1306 if (strcmp(backupfrom, "standby") == 0)
1307 *backupFromStandby = true;
1308 }
1309
1310 /*
1311 * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1312 * but checking for their presence is useful for debugging and the next
1313 * sanity checks. Cope also with the fact that the result buffers have a
1314 * pre-allocated size, hence if the backup_label file has been generated
1315 * with strings longer than the maximum assumed here an incorrect parsing
1316 * happens. That's fine as only minor consistency checks are done
1317 * afterwards.
1318 */
     /* Both values are only reported in a message; neither is returned. */
1319 if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1321 (errmsg_internal("backup time %s in file \"%s\"",
1322 backuptime, BACKUP_LABEL_FILE)));
1323
1324 if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1326 (errmsg_internal("backup label %s in file \"%s\"",
1327 backuplabel, BACKUP_LABEL_FILE)));
1328
1329 /*
1330 * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1331 * it as a sanity check if present.
1332 */
1333 if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1334 {
     /* Must agree with the TLI embedded in the START WAL LOCATION file name. */
1335 if (tli_from_walseg != tli_from_file)
1336 ereport(FATAL,
1337 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1338 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1339 errdetail("Timeline ID parsed is %u, but expected %u.",
1340 tli_from_file, tli_from_walseg)));
1341
1343 (errmsg_internal("backup timeline %u in file \"%s\"",
1344 tli_from_file, BACKUP_LABEL_FILE)));
1345 }
1346
     /*
      * An INCREMENTAL FROM LSN line marks an incremental backup, which cannot
      * be recovered from directly. The "> 0" test means even a partially
      * converted LSN is enough to stop with FATAL.
      */
1347 if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%08X\n", &hi, &lo) > 0)
1348 ereport(FATAL,
1349 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1350 errmsg("this is an incremental backup, not a data directory"),
1351 errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1352
     /* A stream error seen while parsing, or a failed close, is FATAL too. */
1353 if (ferror(lfp) || FreeFile(lfp))
1354 ereport(FATAL,
1356 errmsg("could not read file \"%s\": %m",
1358
1359 return true;
1360}
1361
1362/*
1363 * read_tablespace_map: check to see if a tablespace_map file is present
1364 *
1365 * If we see a tablespace_map file during recovery, we assume that we are
1366 * recovering from a backup dump file, and we therefore need to create symlinks
1367 * as per the information present in tablespace_map file.
1368 *
1369 * Returns true if a tablespace_map file was found (and fills *tablespaces
1370 * with a tablespaceinfo struct for each tablespace listed in the file);
1371 * returns false if not.
1372 */
1373static bool
1375{
1376 tablespaceinfo *ti;
1377 FILE *lfp;
1378 char str[MAXPGPATH];
1379 int ch,
1380 i,
1381 n;
1382 bool was_backslash;
1383
1384 /*
1385 * See if tablespace_map file is present
1386 */
1387 lfp = AllocateFile(TABLESPACE_MAP, "r");
1388 if (!lfp)
1389 {
1390 if (errno != ENOENT)
1391 ereport(FATAL,
1393 errmsg("could not read file \"%s\": %m",
1394 TABLESPACE_MAP)));
1395 return false; /* it's not there, all is fine */
1396 }
1397
1398 /*
1399 * Read and parse the link name and path lines from tablespace_map file
1400 * (this code is pretty crude, but we are not expecting any variability in
1401 * the file format). De-escape any backslashes that were inserted.
1402 */
1403 i = 0;
1404 was_backslash = false;
1405 while ((ch = fgetc(lfp)) != EOF)
1406 {
1407 if (!was_backslash && (ch == '\n' || ch == '\r'))
1408 {
1409 char *endp;
1410
1411 if (i == 0)
1412 continue; /* \r immediately followed by \n */
1413
1414 /*
1415 * The de-escaped line should contain an OID followed by exactly
1416 * one space followed by a path. The path might start with
1417 * spaces, so don't be too liberal about parsing.
1418 */
1419 str[i] = '\0';
1420 n = 0;
1421 while (str[n] && str[n] != ' ')
1422 n++;
1423 if (n < 1 || n >= i - 1)
1424 ereport(FATAL,
1425 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1426 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1427 str[n++] = '\0';
1428
1430 errno = 0;
1431 ti->oid = strtoul(str, &endp, 10);
1432 if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1433 ereport(FATAL,
1434 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1435 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1436 ti->path = pstrdup(str + n);
1437 *tablespaces = lappend(*tablespaces, ti);
1438
1439 i = 0;
1440 continue;
1441 }
1442 else if (!was_backslash && ch == '\\')
1443 was_backslash = true;
1444 else
1445 {
1446 if (i < sizeof(str) - 1)
1447 str[i++] = ch;
1448 was_backslash = false;
1449 }
1450 }
1451
1452 if (i != 0 || was_backslash) /* last line not terminated? */
1453 ereport(FATAL,
1454 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1455 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1456
1457 if (ferror(lfp) || FreeFile(lfp))
1458 ereport(FATAL,
1460 errmsg("could not read file \"%s\": %m",
1461 TABLESPACE_MAP)));
1462
1463 return true;
1464}
1465
1466/*
1467 * Finish WAL recovery.
1468 *
1469 * This does not close the 'xlogreader' yet, because in some cases the caller
1470 * still wants to re-read the last checkpoint record by calling
1471 * ReadCheckpointRecord().
1472 *
1473 * Returns the position of the last valid or applied record, after which new
1474 * WAL should be appended, information about why recovery was ended, and some
1475 * other things. See the EndOfWalRecoveryInfo struct for details.
1476 */
1479{
1481 XLogRecPtr lastRec;
1482 TimeLineID lastRecTLI;
1483 XLogRecPtr endOfLog;
1484
1485 /*
1486 * Kill WAL receiver, if it's still running, before we continue to write
1487 * the startup checkpoint and aborted-contrecord records. Otherwise it
1488 * could trample over these records and subsequent ones if it is still
1489 * alive when we start writing WAL.
1490 */
1492
1493 /*
1494 * Shutdown the slot sync worker to drop any temporary slots acquired by
1495 * it and to prevent it from continuing to fetch the failover slots.
1496 *
1497 * We do not update the 'synced' column in 'pg_replication_slots' system
1498 * view from true to false here, as any failed update could leave 'synced'
1499 * column false for some slots. This could cause issues during slot sync
1500 * after restarting the server as a standby. While updating the 'synced'
1501 * column after switching to the new timeline is an option, it does not
1502 * simplify the handling for the 'synced' column. Therefore, we retain the
1503 * 'synced' column as true after promotion as it may provide useful
1504 * information about the slot origin.
1505 */
1507
1508 /*
1509 * We are now done reading the xlog from stream. Turn off streaming
1510 * recovery to force fetching the files (which would be required at end of
1511 * recovery, e.g., timeline history file) from archive or pg_wal.
1512 *
1513 * Note that standby mode must be turned off after killing WAL receiver,
1514 * i.e., calling XLogShutdownWalRcv().
1515 */
1517 StandbyMode = false;
1518
1519 /*
1520 * Determine where to start writing WAL next.
1521 *
1522 * Re-fetch the last valid or last applied record, so we can identify the
1523 * exact endpoint of what we consider the valid portion of WAL. There may
1524 * be an incomplete continuation record after that, in which case
1525 * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1526 * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1527 * it is intentionally missing. See CreateOverwriteContrecordRecord().
1528 *
1529 * An important side-effect of this is to load the last page into
1530 * xlogreader. The caller uses it to initialize the WAL for writing.
1531 */
1532 if (!InRecovery)
1533 {
1534 lastRec = CheckPointLoc;
1535 lastRecTLI = CheckPointTLI;
1536 }
1537 else
1538 {
1540 lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1541 }
1543 (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1544 endOfLog = xlogreader->EndRecPtr;
1545
1546 /*
1547 * Remember the TLI in the filename of the XLOG segment containing the
1548 * end-of-log. It could be different from the timeline that endOfLog
1549 * nominally belongs to, if there was a timeline switch in that segment,
1550 * and we were reading the old WAL from a segment belonging to a higher
1551 * timeline.
1552 */
1553 result->endOfLogTLI = xlogreader->seg.ws_tli;
1554
1556 {
1557 /*
1558 * We are no longer in archive recovery state.
1559 *
1560 * We are now done reading the old WAL. Turn off archive fetching if
1561 * it was active.
1562 */
1564 InArchiveRecovery = false;
1565
1566 /*
1567 * If the ending log segment is still open, close it (to avoid
1568 * problems on Windows with trying to rename or delete an open file).
1569 */
1570 if (readFile >= 0)
1571 {
1572 close(readFile);
1573 readFile = -1;
1574 }
1575 }
1576
1577 /*
1578 * Copy the last partial block to the caller, for initializing the WAL
1579 * buffer for appending new WAL.
1580 */
1581 if (endOfLog % XLOG_BLCKSZ != 0)
1582 {
1583 char *page;
1584 int len;
1585 XLogRecPtr pageBeginPtr;
1586
1587 pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1589
1590 /* Copy the valid part of the last block */
1591 len = endOfLog % XLOG_BLCKSZ;
1592 page = palloc(len);
1593 memcpy(page, xlogreader->readBuf, len);
1594
1595 result->lastPageBeginPtr = pageBeginPtr;
1596 result->lastPage = page;
1597 }
1598 else
1599 {
1600 /* There is no partial block to copy. */
1601 result->lastPageBeginPtr = endOfLog;
1602 result->lastPage = NULL;
1603 }
1604
1605 /*
1606 * Create a comment for the history file to explain why and where timeline
1607 * changed.
1608 */
1610
1611 result->lastRec = lastRec;
1612 result->lastRecTLI = lastRecTLI;
1613 result->endOfLog = endOfLog;
1614
1615 result->abortedRecPtr = abortedRecPtr;
1617
1620
1621 return result;
1622}
1623
1624/*
1625 * Clean up the WAL reader and leftovers from restoring WAL from archive
1626 */
1627void
1629{
1630 char recoveryPath[MAXPGPATH];
1631
1632 /* Final update of pg_stat_recovery_prefetch. */
1634
1635 /* Shut down xlogreader */
1636 if (readFile >= 0)
1637 {
1638 close(readFile);
1639 readFile = -1;
1640 }
1644
1646 {
1647 /*
1648 * Since there might be a partial WAL segment named RECOVERYXLOG, get
1649 * rid of it.
1650 */
1651 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1652 unlink(recoveryPath); /* ignore any error */
1653
1654 /* Get rid of any remaining recovered timeline-history file, too */
1655 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1656 unlink(recoveryPath); /* ignore any error */
1657 }
1658
1659 /*
1660 * We don't need the latch anymore. It's not strictly necessary to disown
1661 * it, but let's do it for the sake of tidiness.
1662 */
1665}
1666
1667/*
1668 * Perform WAL recovery.
1669 *
1670 * If the system was shut down cleanly, this is never called.
1671 */
1672void
1674{
1675 XLogRecord *record;
1676 bool reachedRecoveryTarget = false;
1677 TimeLineID replayTLI;
1678
1679 /*
1680 * Initialize shared variables for tracking progress of WAL replay, as if
1681 * we had just replayed the record before the REDO location (or the
1682 * checkpoint record itself, if it's a shutdown checkpoint).
1683 */
1686 {
1690 }
1691 else
1692 {
1696 }
1703
1704 /* Also ensure XLogReceiptTime has a sane value */
1706
1707 /*
1708 * Let postmaster know we've started redo now, so that it can launch the
1709 * archiver if necessary.
1710 */
1713
1714 /*
1715 * Allow read-only connections immediately if we're consistent already.
1716 */
1718
1719 /*
1720 * Find the first record that logically follows the checkpoint --- it
1721 * might physically precede it, though.
1722 */
1724 {
1725 /* back up to find the record */
1726 replayTLI = RedoStartTLI;
1728 record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1729
1730 /*
1731 * If a checkpoint record's redo pointer points back to an earlier
1732 * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1733 * record.
1734 */
1735 if (record->xl_rmid != RM_XLOG_ID ||
1737 ereport(FATAL,
1738 errmsg("unexpected record type found at redo point %X/%08X",
1740 }
1741 else
1742 {
1743 /* just have to read next record after CheckPoint */
1745 replayTLI = CheckPointTLI;
1746 record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1747 }
1748
1749 if (record != NULL)
1750 {
1751 TimestampTz xtime;
1752 PGRUsage ru0;
1753
1754 pg_rusage_init(&ru0);
1755
1756 InRedo = true;
1757
1758 RmgrStartup();
1759
1760 ereport(LOG,
1761 errmsg("redo starts at %X/%08X",
1763
1764 /* Prepare to report progress of the redo phase. */
1765 if (!StandbyMode)
1767
1768 /*
1769 * main redo apply loop
1770 */
1771 do
1772 {
1773 if (!StandbyMode)
1774 ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%08X",
1776
1777#ifdef WAL_DEBUG
1778 if (XLOG_DEBUG)
1779 {
1781
1783 appendStringInfo(&buf, "REDO @ %X/%08X; LSN %X/%08X: ",
1786 xlog_outrec(&buf, xlogreader);
1787 appendStringInfoString(&buf, " - ");
1789 elog(LOG, "%s", buf.data);
1790 pfree(buf.data);
1791 }
1792#endif
1793
1794 /* Handle interrupt signals of startup process */
1796
1797 /*
1798 * Pause WAL replay, if requested by a hot-standby session via
1799 * SetRecoveryPause().
1800 *
1801 * Note that we intentionally don't take the info_lck spinlock
1802 * here. We might therefore read a slightly stale value of the
1803 * recoveryPause flag, but it can't be very stale (no worse than
1804 * the last spinlock we did acquire). Since a pause request is a
1805 * pretty asynchronous thing anyway, possibly responding to it one
1806 * WAL record later than we otherwise would is a minor issue, so
1807 * it doesn't seem worth adding another spinlock cycle to prevent
1808 * that.
1809 */
1810 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1812 recoveryPausesHere(false);
1813
1814 /*
1815 * Have we reached our recovery target?
1816 */
1818 {
1819 reachedRecoveryTarget = true;
1820 break;
1821 }
1822
1823 /*
1824 * If we've been asked to lag the primary, wait on latch until
1825 * enough time has passed.
1826 */
1828 {
1829 /*
1830 * We test for paused recovery again here. If user sets
1831 * delayed apply, it may be because they expect to pause
1832 * recovery in case of problems, so we must test again here
1833 * otherwise pausing during the delay-wait wouldn't work.
1834 */
1835 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1837 recoveryPausesHere(false);
1838 }
1839
1840 /*
1841 * Apply the record
1842 */
1843 ApplyWalRecord(xlogreader, record, &replayTLI);
1844
1845 /* Exit loop if we reached inclusive recovery target */
1847 {
1848 reachedRecoveryTarget = true;
1849 break;
1850 }
1851
1852 /*
1853 * If we replayed an LSN that someone was waiting for then walk
1854 * over the shared memory array and set latches to notify the
1855 * waiters.
1856 */
1857 if (waitLSNState &&
1861
1862 /* Else, try to fetch the next WAL record */
1863 record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1864 } while (record != NULL);
1865
1866 /*
1867 * end of main redo apply loop
1868 */
1869
1870 if (reachedRecoveryTarget)
1871 {
1872 if (!reachedConsistency)
1873 ereport(FATAL,
1874 (errmsg("requested recovery stop point is before consistent recovery point")));
1875
1876 /*
1877 * This is the last point where we can restart recovery with a new
1878 * recovery target, if we shutdown and begin again. After this,
1879 * Resource Managers may choose to do permanent corrective actions
1880 * at end of recovery.
1881 */
1882 switch (recoveryTargetAction)
1883 {
1885
1886 /*
1887 * exit with special return code to request shutdown of
1888 * postmaster. Log messages issued from postmaster.
1889 */
1890 proc_exit(3);
1891
1893 SetRecoveryPause(true);
1894 recoveryPausesHere(true);
1895
1896 /* drop into promote */
1897
1899 break;
1900 }
1901 }
1902
1903 RmgrCleanup();
1904
1905 ereport(LOG,
1906 errmsg("redo done at %X/%08X system usage: %s",
1908 pg_rusage_show(&ru0)));
1909 xtime = GetLatestXTime();
1910 if (xtime)
1911 ereport(LOG,
1912 (errmsg("last completed transaction was at log time %s",
1913 timestamptz_to_str(xtime))));
1914
1915 InRedo = false;
1916 }
1917 else
1918 {
1919 /* there are no WAL records following the checkpoint */
1920 ereport(LOG,
1921 (errmsg("redo is not required")));
1922 }
1923
1924 /*
1925 * This check is intentionally after the above log messages that indicate
1926 * how far recovery went.
1927 */
1930 !reachedRecoveryTarget)
1931 ereport(FATAL,
1932 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1933 errmsg("recovery ended before configured recovery target was reached")));
1934}
1935
1936/*
1937 * Subroutine of PerformWalRecovery, to apply one WAL record.
1938 */
1939static void
1941{
1942 ErrorContextCallback errcallback;
1943 bool switchedTLI = false;
1944
1945 /* Setup error traceback support for ereport() */
1946 errcallback.callback = rm_redo_error_callback;
1947 errcallback.arg = xlogreader;
1948 errcallback.previous = error_context_stack;
1949 error_context_stack = &errcallback;
1950
1951 /*
1952 * TransamVariables->nextXid must be beyond record's xid.
1953 */
1955
1956 /*
1957 * Before replaying this record, check if this record causes the current
1958 * timeline to change. The record is already considered to be part of the
1959 * new timeline, so we update replayTLI before replaying it. That's
1960 * important so that replayEndTLI, which is recorded as the minimum
1961 * recovery point's TLI if recovery stops after this record, is set
1962 * correctly.
1963 */
1964 if (record->xl_rmid == RM_XLOG_ID)
1965 {
1966 TimeLineID newReplayTLI = *replayTLI;
1967 TimeLineID prevReplayTLI = *replayTLI;
1968 uint8 info = record->xl_info & ~XLR_INFO_MASK;
1969
1970 if (info == XLOG_CHECKPOINT_SHUTDOWN)
1971 {
1972 CheckPoint checkPoint;
1973
1974 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1975 newReplayTLI = checkPoint.ThisTimeLineID;
1976 prevReplayTLI = checkPoint.PrevTimeLineID;
1977 }
1978 else if (info == XLOG_END_OF_RECOVERY)
1979 {
1980 xl_end_of_recovery xlrec;
1981
1982 memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1983 newReplayTLI = xlrec.ThisTimeLineID;
1984 prevReplayTLI = xlrec.PrevTimeLineID;
1985 }
1986
1987 if (newReplayTLI != *replayTLI)
1988 {
1989 /* Check that it's OK to switch to this TLI */
1991 newReplayTLI, prevReplayTLI, *replayTLI);
1992
1993 /* Following WAL records should be run with new TLI */
1994 *replayTLI = newReplayTLI;
1995 switchedTLI = true;
1996 }
1997 }
1998
1999 /*
2000 * Update shared replayEndRecPtr before replaying this record, so that
2001 * XLogFlush will update minRecoveryPoint correctly.
2002 */
2005 XLogRecoveryCtl->replayEndTLI = *replayTLI;
2007
2008 /*
2009 * If we are attempting to enter Hot Standby mode, process XIDs we see
2010 */
2014
2015 /*
2016 * Some XLOG record types that are related to recovery are processed
2017 * directly here, rather than in xlog_redo()
2018 */
2019 if (record->xl_rmid == RM_XLOG_ID)
2020 xlogrecovery_redo(xlogreader, *replayTLI);
2021
2022 /* Now apply the WAL record itself */
2024
2025 /*
2026 * After redo, check whether the backup pages associated with the WAL
2027 * record are consistent with the existing pages. This check is done only
2028 * if consistency check is enabled for this record.
2029 */
2030 if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
2032
2033 /* Pop the error context stack */
2034 error_context_stack = errcallback.previous;
2035
2036 /*
2037 * Update lastReplayedEndRecPtr after this record has been successfully
2038 * replayed.
2039 */
2043 XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
2045
2046 /* ------
2047 * Wakeup walsenders:
2048 *
2049 * On the standby, the WAL is flushed first (which will only wake up
2050 * physical walsenders) and then applied, which will only wake up logical
2051 * walsenders.
2052 *
2053 * Indeed, logical walsenders on standby can't decode and send data until
2054 * it's been applied.
2055 *
2056 * Physical walsenders don't need to be woken up during replay unless
2057 * cascading replication is allowed and time line change occurred (so that
2058 * they can notice that they are on a new time line).
2059 *
2060 * That's why the wake up conditions are for:
2061 *
2062 * - physical walsenders in case of new time line and cascade
2063 * replication is allowed
2064 * - logical walsenders in case cascade replication is allowed (could not
2065 * be created otherwise)
2066 * ------
2067 */
2069 WalSndWakeup(switchedTLI, true);
2070
2071 /*
2072 * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2073 * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2074 * a reply to the primary.
2075 */
2077 {
2080 }
2081
2082 /* Allow read-only connections if we're consistent now */
2084
2085 /* Is this a timeline switch? */
2086 if (switchedTLI)
2087 {
2088 /*
2089 * Before we continue on the new timeline, clean up any (possibly
2090 * bogus) future WAL segments on the old timeline.
2091 */
2093
2094 /* Reset the prefetcher. */
2096 }
2097}
2098
2099/*
2100 * Some XLOG RM record types that are directly related to WAL recovery are
2101 * handled here rather than in xlog_redo().
2102 */
2103static void
2105{
2106 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2107 XLogRecPtr lsn = record->EndRecPtr;
2108
2109 Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2110
2111 if (info == XLOG_OVERWRITE_CONTRECORD)
2112 {
2113 /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2115
2116 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
2117 if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2118 elog(FATAL, "mismatching overwritten LSN %X/%08X -> %X/%08X",
2121
2122 /* We have safely skipped the aborted record */
2125
2126 ereport(LOG,
2127 errmsg("successfully skipped missing contrecord at %X/%08X, overwritten at %s",
2130
2131 /* Verifying the record should only happen once */
2133 }
2134 else if (info == XLOG_BACKUP_END)
2135 {
2136 XLogRecPtr startpoint;
2137
2138 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2139
2140 if (backupStartPoint == startpoint)
2141 {
2142 /*
2143 * We have reached the end of base backup, the point where
2144 * pg_backup_stop() was done. The data on disk is now consistent
2145 * (assuming we have also reached minRecoveryPoint). Set
2146 * backupEndPoint to the current LSN, so that the next call to
2147 * CheckRecoveryConsistency() will notice it and do the
2148 * end-of-backup processing.
2149 */
2150 elog(DEBUG1, "end of backup record reached");
2151
2152 backupEndPoint = lsn;
2153 }
2154 else
2155 elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%08X, waiting for %X/%08X",
2157 }
2158}
2159
2160/*
2161 * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2162 * directories.
2163 *
2164 * Replay of database creation XLOG records for databases that were later
2165 * dropped can create fake directories in pg_tblspc. By the time consistency
2166 * is reached these directories should have been removed; here we verify
2167 * that this did indeed happen. This is to be called at the point where
2168 * consistent state is reached.
2169 *
2170 * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2171 * useful for testing purposes, and also allows for an escape hatch in case
2172 * things go south.
2173 */
2174static void
2176{
2177 DIR *dir;
2178 struct dirent *de;
2179
2181 while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
2182 {
2183 char path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
2184
2185 /* Skip entries of non-oid names */
2186 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2187 continue;
2188
2189 snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
2190
2191 if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2194 errmsg("unexpected directory entry \"%s\" found in %s",
2195 de->d_name, PG_TBLSPC_DIR),
2196 errdetail("All directory entries in %s/ should be symbolic links.",
2198 errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2199 }
2200}
2201
2202/*
2203 * Checks if recovery has reached a consistent state. When consistency is
2204 * reached and we have a valid starting standby snapshot, tell postmaster
2205 * that it can start accepting read-only connections.
2206 */
2207static void
2209{
 /*
  * Runs three independent stages in order: detect that the end of the base
  * backup has been replayed, mark recovery as consistent, and finally
  * enable hot standby.
  *
  * NOTE(review): the conditions guarding the early return and each stage
  * are partly missing from this listing; only their visible halves are
  * described here.
  */
2210 XLogRecPtr lastReplayedEndRecPtr;
2211 TimeLineID lastReplayedTLI;
2212
2213 /*
2214 * During crash recovery, we don't reach a consistent state until we've
2215 * replayed all the WAL.
2216 */
2218 return;
2219
2221
2222 /*
2223 * assume that we are called in the startup process, and hence don't need
2224 * a lock to read lastReplayedEndRecPtr
2225 */
2226 lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2227 lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2228
2229 /*
2230 * Have we reached the point where our base backup was completed?
2231 */
2233 backupEndPoint <= lastReplayedEndRecPtr)
2234 {
 /*
  * Copy the backup start/end points into locals; the copies are what the
  * LOG message after the control-file update reports.
  */
2235 XLogRecPtr saveBackupStartPoint = backupStartPoint;
2236 XLogRecPtr saveBackupEndPoint = backupEndPoint;
2237
2238 elog(DEBUG1, "end of backup reached");
2239
2240 /*
2241 * We have reached the end of base backup, as indicated by pg_control.
2242 * Update the control file accordingly.
2243 */
2244 ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
 /* The end of backup has now been seen, so it is no longer required. */
2247 backupEndRequired = false;
2248
2249 ereport(LOG,
2250 errmsg("completed backup recovery with redo LSN %X/%08X and end LSN %X/%08X",
2251 LSN_FORMAT_ARGS(saveBackupStartPoint),
2252 LSN_FORMAT_ARGS(saveBackupEndPoint)));
2253 }
2254
2255 /*
2256 * Have we passed our safe starting point? Note that minRecoveryPoint is
2257 * known to be incorrectly set if recovering from a backup, until the
2258 * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2259 * All we know prior to that is that we're not consistent yet.
2260 */
2262 minRecoveryPoint <= lastReplayedEndRecPtr)
2263 {
2264 /*
2265 * Check to see if the XLOG sequence contained any unresolved
2266 * references to uninitialized pages.
2267 */
2269
2270 /*
2271 * Check that pg_tblspc doesn't contain any real directories. Replay
2272 * of Database/CREATE_* records may have created fictitious tablespace
2273 * directories that should have been removed by the time consistency
2274 * was reached.
2275 */
2277
 /* Record the transition and announce where it happened. */
2278 reachedConsistency = true;
2280 ereport(LOG,
2281 errmsg("consistent recovery state reached at %X/%08X",
2282 LSN_FORMAT_ARGS(lastReplayedEndRecPtr)));
2283 }
2284
2285 /*
2286 * Have we got a valid starting snapshot that will allow queries to be
2287 * run? If so, we can tell postmaster that the database is consistent now,
2288 * enabling connections.
2289 */
2294 {
2298
 /* Remember in this process that hot standby has been activated. */
2299 LocalHotStandbyActive = true;
2300
2302 }
2303}
2304
2305/*
2306 * Error context callback for errors occurring during rm_redo().
2307 */
2308static void
2310{
 /* The callback argument is the reader state of the record being described. */
2311 XLogReaderState *record = (XLogReaderState *) arg;
2313
 /* Describe the record first, then append its block references. */
2315 xlog_outdesc(&buf, record);
2316 xlog_block_info(&buf, record);
2317
2318 /* translator: %s is a WAL record description */
2319 errcontext("WAL redo at %X/%08X for %s",
2320 LSN_FORMAT_ARGS(record->ReadRecPtr),
2321 buf.data);
2322
 /* The description text is not needed once it is in the error context. */
2323 pfree(buf.data);
2324}
2325
2326/*
2327 * Returns a string describing an XLogRecord, consisting of its identity
2328 * optionally followed by a colon, a space, and a further description.
2329 */
2330void
2332{
 /* Look up the resource manager that owns this record. */
2333 RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2334 uint8 info = XLogRecGetInfo(record);
2335 const char *id;
2336
2339
 /*
  * Ask the resource manager for the record's name. If it does not know
  * this info value, print the info bits outside XLR_INFO_MASK in hex
  * instead.
  */
2340 id = rmgr.rm_identify(info);
2341 if (id == NULL)
2342 appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2343 else
2344 appendStringInfo(buf, "%s: ", id);
2345
 /* Let the resource manager append its own description of the record. */
2346 rmgr.rm_desc(buf, record);
2347}
2348
2349#ifdef WAL_DEBUG
2350
2351static void
2352xlog_outrec(StringInfo buf, XLogReaderState *record)
2353{
 /*
  * Append a summary of the record to buf: the "prev" pointer and xid, then
  * the data length, then the block references.
  */
2354 appendStringInfo(buf, "prev %X/%08X; xid %u",
2356 XLogRecGetXid(record));
2357
2358 appendStringInfo(buf, "; len %u",
2359 XLogRecGetDataLen(record));
2360
2361 xlog_block_info(buf, record);
2362}
2363#endif /* WAL_DEBUG */
2364
2365/*
2366 * Returns a string giving information about all the blocks in an
2367 * XLogRecord.
2368 */
2369static void
2371{
2372 int block_id;
2373
2374 /* decode block references */
2375 for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2376 {
2377 RelFileLocator rlocator;
2378 ForkNumber forknum;
2379 BlockNumber blk;
2380
 /* Ids for which no block tag can be fetched are skipped. */
2381 if (!XLogRecGetBlockTagExtended(record, block_id,
2382 &rlocator, &forknum, &blk, NULL))
2383 continue;
2384
 /* The fork number is printed only when it is not the main fork. */
2385 if (forknum != MAIN_FORKNUM)
2386 appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2387 block_id,
2388 rlocator.spcOid, rlocator.dbOid,
2389 rlocator.relNumber,
2390 forknum,
2391 blk);
2392 else
2393 appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2394 block_id,
2395 rlocator.spcOid, rlocator.dbOid,
2396 rlocator.relNumber,
2397 blk);
 /* Tag block references that come with a block image. */
2398 if (XLogRecHasBlockImage(record, block_id))
2399 appendStringInfoString(buf, " FPW");
2400 }
2401}
2402
2403
2404/*
2405 * Check that it's OK to switch to new timeline during recovery.
2406 *
2407 * 'lsn' is the address of the shutdown checkpoint record we're about to
2408 * replay. (Currently, timeline can only change at a shutdown checkpoint).
2409 */
2410static void
2412 TimeLineID replayTLI)
2413{
 /*
  * Three checks follow and each one PANICs on failure, so returning
  * normally means the switch is acceptable.
  */
2414 /* Check that the record agrees on what the current (old) timeline is */
2415 if (prevTLI != replayTLI)
2416 ereport(PANIC,
2417 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2418 prevTLI, replayTLI)));
2419
2420 /*
2421 * The new timeline better be in the list of timelines we expect to see,
2422 * according to the timeline history. It should also not decrease.
2423 */
2424 if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2425 ereport(PANIC,
2426 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2427 newTLI, replayTLI)));
2428
2429 /*
2430 * If we have not yet reached min recovery point, and we're about to
2431 * switch to a timeline greater than the timeline of the min recovery
2432 * point: trouble. After switching to the new timeline, we could not
2433 * possibly visit the min recovery point on the correct timeline anymore.
2434 * This can happen if there is a newer timeline in the archive that
2435 * branched before the timeline the min recovery point is on, and you
2436 * attempt to do PITR to the new timeline.
2437 */
 /* NOTE(review): the first line of this condition is missing from the listing. */
2439 lsn < minRecoveryPoint &&
2440 newTLI > minRecoveryPointTLI)
2441 ereport(PANIC,
2442 errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%08X on timeline %u",
2443 newTLI,
2446
2447 /* Looks good */
2448}
2449
2450
2451/*
2452 * Extract timestamp from WAL record.
2453 *
2454 * If the record contains a timestamp, returns true, and saves the timestamp
2455 * in *recordXtime. If the record type has no timestamp, returns false.
2456 * Currently, only transaction commit/abort records and restore points contain
2457 * timestamps.
2458 */
2459static bool
2461{
 /*
  * "info" keeps only the bits outside XLR_INFO_MASK so it can be compared
  * with record opcodes; "xact_info" narrows that further to the
  * transaction opcode bits.
  */
2462 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2463 uint8 xact_info = info & XLOG_XACT_OPMASK;
2464 uint8 rmid = XLogRecGetRmid(record);
2465
 /* Restore points carry their timestamp in rp_time. */
2466 if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2467 {
2468 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2469 return true;
2470 }
 /* Plain and prepared commits are both read as xl_xact_commit. */
2471 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2472 xact_info == XLOG_XACT_COMMIT_PREPARED))
2473 {
2474 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2475 return true;
2476 }
 /* Plain and prepared aborts are both read as xl_xact_abort. */
2477 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2478 xact_info == XLOG_XACT_ABORT_PREPARED))
2479 {
2480 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2481 return true;
2482 }
 /* Any other record type: *recordXtime is left untouched. */
2483 return false;
2484}
2485
2486/*
2487 * Checks whether the current buffer page and backup page stored in the
2488 * WAL record are consistent or not. Before comparing the two pages, a
2489 * masking can be applied to the pages to ignore certain areas like hint bits,
2490 * unused space between pd_lower and pd_upper among other things. This
2491 * function should be called once WAL replay has been completed for a
2492 * given record.
2493 */
2494static void
2496{
 /*
  * For every block referenced by the record, compare the replayed page
  * with the image stored in the record and raise FATAL on any difference
  * that remains after masking.
  */
2497 RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2498 RelFileLocator rlocator;
2499 ForkNumber forknum;
2500 BlockNumber blkno;
2501 int block_id;
2502
2503 /* Records with no backup blocks have no need for consistency checks. */
2504 if (!XLogRecHasAnyBlockRefs(record))
2505 return;
2506
2508
2509 for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2510 {
2511 Buffer buf;
2512 Page page;
2513
2514 if (!XLogRecGetBlockTagExtended(record, block_id,
2515 &rlocator, &forknum, &blkno, NULL))
2516 {
2517 /*
2518 * WAL record doesn't contain a block reference with the given id.
2519 * Do nothing.
2520 */
2521 continue;
2522 }
2523
 /* Every block reference reaching this point is expected to carry an image. */
2524 Assert(XLogRecHasBlockImage(record, block_id));
2525
2526 if (XLogRecBlockImageApply(record, block_id))
2527 {
2528 /*
2529 * WAL record has already applied the page, so bypass the
2530 * consistency check as that would result in comparing the full
2531 * page stored in the record with itself.
2532 */
2533 continue;
2534 }
2535
2536 /*
2537 * Read the contents from the current buffer and store it in a
2538 * temporary page.
2539 */
2540 buf = XLogReadBufferExtended(rlocator, forknum, blkno,
 /* No valid buffer was returned for this block, so there is nothing to compare. */
2543 if (!BufferIsValid(buf))
2544 continue;
2545
2547 page = BufferGetPage(buf);
2548
2549 /*
2550 * Take a copy of the local page where WAL has been applied to have a
2551 * comparison base before masking it...
2552 */
2553 memcpy(replay_image_masked, page, BLCKSZ);
2554
2555 /* No need for this page anymore now that a copy is in. */
2557
2558 /*
2559 * If the block LSN is already ahead of this WAL record, we can't
2560 * expect contents to match. This can happen if recovery is
2561 * restarted.
2562 */
2564 continue;
2565
2566 /*
2567 * Read the contents from the backup copy, stored in WAL record and
2568 * store it in a temporary page. There is no need to allocate a new
2569 * page here, a local buffer is fine to hold its contents and a mask
2570 * can be directly applied on it.
2571 */
2572 if (!RestoreBlockImage(record, block_id, primary_image_masked))
2573 ereport(ERROR,
2574 (errcode(ERRCODE_INTERNAL_ERROR),
2575 errmsg_internal("%s", record->errormsg_buf)));
2576
2577 /*
2578 * If masking function is defined, mask both the primary and replay
2579 * images
2580 */
2581 if (rmgr.rm_mask != NULL)
2582 {
2583 rmgr.rm_mask(replay_image_masked, blkno);
2584 rmgr.rm_mask(primary_image_masked, blkno);
2585 }
2586
2587 /* Time to compare the primary and replay images. */
2588 if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2589 {
 /* Identify the offending page by relation, fork and block number. */
2590 elog(FATAL,
2591 "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2592 rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2593 forknum, blkno);
2594 }
2595 }
2596}
2597
2598/*
2599 * For point-in-time recovery, this function decides whether we want to
2600 * stop applying the XLOG before the current record.
2601 *
2602 * Returns true if we are stopping, false otherwise. If stopping, some
2603 * information is saved in recoveryStopXid et al for use in annotating the
2604 * new timeline's history file.
2605 */
2606static bool
2608{
 /*
  * The checks run in a fixed order: stop on reaching consistency, stop at
  * a target LSN, then (for transaction commit/abort records only) stop at
  * a target xid or target time. Every stopping path sets recoveryStopAfter
  * to false.
  *
  * NOTE(review): most of the conditions that select the recovery target
  * type are missing from this listing -- confirm against the full source.
  */
2609 bool stopsHere = false;
2610 uint8 xact_info;
2611 bool isCommit;
2612 TimestampTz recordXtime = 0;
2613 TransactionId recordXid;
2614
2615 /*
2616 * Ignore recovery target settings when not in archive recovery (meaning
2617 * we are in crash recovery).
2618 */
2620 return false;
2621
2622 /* Check if we should stop as soon as reaching consistency */
2624 {
2625 ereport(LOG,
2626 (errmsg("recovery stopping after reaching consistency")));
2627
2628 recoveryStopAfter = false;
2631 recoveryStopTime = 0;
2632 recoveryStopName[0] = '\0';
2633 return true;
2634 }
2635
2636 /* Check if target LSN has been reached */
2639 record->ReadRecPtr >= recoveryTargetLSN)
2640 {
2641 recoveryStopAfter = false;
 /* The stop position is the start of the record about to be applied. */
2643 recoveryStopLSN = record->ReadRecPtr;
2644 recoveryStopTime = 0;
2645 recoveryStopName[0] = '\0';
2646 ereport(LOG,
2647 errmsg("recovery stopping before WAL location (LSN) \"%X/%08X\"",
2649 return true;
2650 }
2651
2652 /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2653 if (XLogRecGetRmid(record) != RM_XACT_ID)
2654 return false;
2655
2656 xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2657
 /*
  * Work out whether this is a commit or an abort and which xid it ends.
  * For the *_PREPARED variants the xid comes from the parsed record
  * (twophase_xid) rather than from the record header.
  */
2658 if (xact_info == XLOG_XACT_COMMIT)
2659 {
2660 isCommit = true;
2661 recordXid = XLogRecGetXid(record);
2662 }
2663 else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2664 {
2665 xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2666 xl_xact_parsed_commit parsed;
2667
2668 isCommit = true;
2670 xlrec,
2671 &parsed);
2672 recordXid = parsed.twophase_xid;
2673 }
2674 else if (xact_info == XLOG_XACT_ABORT)
2675 {
2676 isCommit = false;
2677 recordXid = XLogRecGetXid(record);
2678 }
2679 else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2680 {
2681 xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2682 xl_xact_parsed_abort parsed;
2683
2684 isCommit = false;
2686 xlrec,
2687 &parsed);
2688 recordXid = parsed.twophase_xid;
2689 }
2690 else
2691 return false;
2692
2694 {
2695 /*
2696 * There can be only one transaction end record with this exact
2697 * transactionid
2698 *
2699 * when testing for an xid, we MUST test for equality only, since
2700 * transactions are numbered in the order they start, not the order
2701 * they complete. A higher numbered xid will complete before a lower
2702 * numbered one about 50% of the time...
2703 */
2704 stopsHere = (recordXid == recoveryTargetXid);
2705 }
2706
2707 /*
2708 * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2709 * We don't expect getRecordTimestamp ever to fail, since we already know
2710 * this is a commit or abort record; but test its result anyway.
2711 */
2712 if (getRecordTimestamp(record, &recordXtime) &&
2714 {
2715 /*
2716 * There can be many transactions that share the same commit time, so
2717 * we stop after the last one, if we are inclusive, or stop at the
2718 * first one if we are exclusive
2719 */
2721 stopsHere = (recordXtime > recoveryTargetTime);
2722 else
2723 stopsHere = (recordXtime >= recoveryTargetTime);
2724 }
2725
2726 if (stopsHere)
2727 {
 /* Save where and why we stopped, then log it. */
2728 recoveryStopAfter = false;
2729 recoveryStopXid = recordXid;
2730 recoveryStopTime = recordXtime;
2732 recoveryStopName[0] = '\0';
2733
2734 if (isCommit)
2735 {
2736 ereport(LOG,
2737 (errmsg("recovery stopping before commit of transaction %u, time %s",
2740 }
2741 else
2742 {
2743 ereport(LOG,
2744 (errmsg("recovery stopping before abort of transaction %u, time %s",
2747 }
2748 }
2749
2750 return stopsHere;
2751}
2752
2753/*
2754 * Same as recoveryStopsBefore, but called after applying the record.
2755 *
2756 * We also track the timestamp of the latest applied COMMIT/ABORT
2757 * record in XLogRecoveryCtl->recoveryLastXTime.
2758 */
2759static bool
2761{
 /*
  * The checks run in a fixed order: named restore point, target LSN,
  * target xid (commit/abort records only), and finally "stop on reaching
  * consistency". Every stopping path sets recoveryStopAfter to true.
  *
  * NOTE(review): most of the conditions that select the recovery target
  * type are missing from this listing -- confirm against the full source.
  */
2762 uint8 info;
2763 uint8 xact_info;
2764 uint8 rmid;
2765 TimestampTz recordXtime = 0;
2766
2767 /*
2768 * Ignore recovery target settings when not in archive recovery (meaning
2769 * we are in crash recovery).
2770 */
2772 return false;
2773
2774 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2775 rmid = XLogRecGetRmid(record);
2776
2777 /*
2778 * There can be many restore points that share the same name; we stop at
2779 * the first one.
2780 */
2782 rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2783 {
2784 xl_restore_point *recordRestorePointData;
2785
2786 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2787
 /* Only an exact name match counts. */
2788 if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2789 {
2790 recoveryStopAfter = true;
 /* The return value is deliberately ignored here. */
2793 (void) getRecordTimestamp(record, &recoveryStopTime);
2794 strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2795
2796 ereport(LOG,
2797 (errmsg("recovery stopping at restore point \"%s\", time %s",
2800 return true;
2801 }
2802 }
2803
2804 /* Check if the target LSN has been reached */
2807 record->ReadRecPtr >= recoveryTargetLSN)
2808 {
2809 recoveryStopAfter = true;
2811 recoveryStopLSN = record->ReadRecPtr;
2812 recoveryStopTime = 0;
2813 recoveryStopName[0] = '\0';
2814 ereport(LOG,
2815 errmsg("recovery stopping after WAL location (LSN) \"%X/%08X\"",
2817 return true;
2818 }
2819
 /* Everything below applies only to transaction-manager records. */
2820 if (rmid != RM_XACT_ID)
2821 return false;
2822
2823 xact_info = info & XLOG_XACT_OPMASK;
2824
2825 if (xact_info == XLOG_XACT_COMMIT ||
2826 xact_info == XLOG_XACT_COMMIT_PREPARED ||
2827 xact_info == XLOG_XACT_ABORT ||
2828 xact_info == XLOG_XACT_ABORT_PREPARED)
2829 {
2830 TransactionId recordXid;
2831
2832 /* Update the last applied transaction timestamp */
2833 if (getRecordTimestamp(record, &recordXtime))
2834 SetLatestXTime(recordXtime);
2835
2836 /* Extract the XID of the committed/aborted transaction */
2837 if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2838 {
2839 xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2840 xl_xact_parsed_commit parsed;
2841
2843 xlrec,
2844 &parsed);
2845 recordXid = parsed.twophase_xid;
2846 }
2847 else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2848 {
2849 xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2850 xl_xact_parsed_abort parsed;
2851
2853 xlrec,
2854 &parsed);
2855 recordXid = parsed.twophase_xid;
2856 }
2857 else
2858 recordXid = XLogRecGetXid(record);
2859
2860 /*
2861 * There can be only one transaction end record with this exact
2862 * transactionid
2863 *
2864 * when testing for an xid, we MUST test for equality only, since
2865 * transactions are numbered in the order they start, not the order
2866 * they complete. A higher numbered xid will complete before a lower
2867 * numbered one about 50% of the time...
2868 */
2870 recordXid == recoveryTargetXid)
2871 {
2872 recoveryStopAfter = true;
2873 recoveryStopXid = recordXid;
2874 recoveryStopTime = recordXtime;
2876 recoveryStopName[0] = '\0';
2877
2878 if (xact_info == XLOG_XACT_COMMIT ||
2879 xact_info == XLOG_XACT_COMMIT_PREPARED)
2880 {
2881 ereport(LOG,
2882 (errmsg("recovery stopping after commit of transaction %u, time %s",
2885 }
2886 else if (xact_info == XLOG_XACT_ABORT ||
2887 xact_info == XLOG_XACT_ABORT_PREPARED)
2888 {
2889 ereport(LOG,
2890 (errmsg("recovery stopping after abort of transaction %u, time %s",
2893 }
2894 return true;
2895 }
2896 }
2897
2898 /* Check if we should stop as soon as reaching consistency */
2900 {
2901 ereport(LOG,
2902 (errmsg("recovery stopping after reaching consistency")));
2903
2904 recoveryStopAfter = true;
2906 recoveryStopTime = 0;
2908 recoveryStopName[0] = '\0';
2909 return true;
2910 }
2911
2912 return false;
2913}
2914
2915/*
2916 * Create a comment for the history file to explain why and where
2917 * timeline changed.
2918 */
2919static char *
2921{
 /*
  * The text is formatted into a fixed-size local buffer; each snprintf
  * call bounds its output to sizeof(reason).
  *
  * NOTE(review): the conditions choosing between the branches below are
  * missing from this listing.
  */
2922 char reason[200];
2923
2925 snprintf(reason, sizeof(reason),
2926 "%s transaction %u",
2927 recoveryStopAfter ? "after" : "before",
 /*
  * NOTE(review): this format and the LSN format below end in a newline,
  * while the other reasons do not -- confirm that this is intended.
  */
2930 snprintf(reason, sizeof(reason),
2931 "%s %s\n",
2932 recoveryStopAfter ? "after" : "before",
2935 snprintf(reason, sizeof(reason),
2936 "%s LSN %X/%08X\n",
2937 recoveryStopAfter ? "after" : "before",
2940 snprintf(reason, sizeof(reason),
2941 "at restore point \"%s\"",
2944 snprintf(reason, sizeof(reason), "reached consistency");
2945 else
2946 snprintf(reason, sizeof(reason), "no recovery target specified");
2947
 /* Return a pstrdup'd copy, since "reason" is a local array. */
2948 return pstrdup(reason);
2949}
2950
2951/*
2952 * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2953 *
2954 * endOfRecovery is true if the recovery target is reached and
2955 * the paused state starts at the end of recovery because of
2956 * recovery_target_action=pause, and false otherwise.
2957 */
2958static void
2959recoveryPausesHere(bool endOfRecovery)
2960{
 /*
  * NOTE(review): the two early-return conditions and the loop header are
  * missing from this listing; the comments beside them describe their
  * intent.
  */
2961 /* Don't pause unless users can connect! */
2963 return;
2964
2965 /* Don't pause after standby promotion has been triggered */
2967 return;
2968
 /*
  * The two messages differ in what resuming will do: promote when paused
  * at the end of recovery, continue otherwise.
  */
2969 if (endOfRecovery)
2970 ereport(LOG,
2971 (errmsg("pausing at the end of recovery"),
2972 errhint("Execute pg_wal_replay_resume() to promote.")));
2973 else
2974 ereport(LOG,
2975 (errmsg("recovery has paused"),
2976 errhint("Execute pg_wal_replay_resume() to continue.")));
2977
2978 /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2980 {
2983 return;
2984
2985 /*
2986 * If recovery pause is requested then set it paused. While we are in
2987 * the loop, user might resume and pause again so set this every time.
2988 */
2990
2991 /*
2992 * We wait on a condition variable that will wake us as soon as the
2993 * pause ends, but we use a timeout so we can check the above exit
2994 * condition periodically too.
2995 */
2997 WAIT_EVENT_RECOVERY_PAUSE);
2998 }
3000}
3001
3002/*
3003 * When recovery_min_apply_delay is set, we wait long enough to make sure
3004 * certain record types are applied at least that interval behind the primary.
3005 *
3006 * Returns true if we waited.
3007 *
3008 * Note that the delay is calculated between the WAL record log time and
3009 * the current time on standby. We would prefer to keep track of when this
3010 * standby received each WAL record, which would allow a more consistent
3011 * approach and one not affected by time synchronisation issues, but that
3012 * is significantly more effort and complexity for little actual gain in
3013 * usability.
3014 */
3015static bool
3017{
 /*
  * Returns false on every early exit before the wait loop. Once the loop
  * has been entered the result is true, even if the loop breaks on its
  * first pass.
  */
3018 uint8 xact_info;
3019 TimestampTz xtime;
3020 TimestampTz delayUntil;
3021 long msecs;
3022
3023 /* nothing to do if no delay configured */
3024 if (recovery_min_apply_delay <= 0)
3025 return false;
3026
3027 /* no delay is applied on a database not yet consistent */
3028 if (!reachedConsistency)
3029 return false;
3030
3031 /* nothing to do if crash recovery is requested */
3033 return false;
3034
3035 /*
3036 * Is it a COMMIT record?
3037 *
3038 * We deliberately choose not to delay aborts since they have no effect on
3039 * MVCC. We already allow replay of records that don't have a timestamp,
3040 * so there is already opportunity for issues caused by early conflicts on
3041 * standbys.
3042 */
3043 if (XLogRecGetRmid(record) != RM_XACT_ID)
3044 return false;
3045
3046 xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
3047
 /* Only plain and prepared commits are delayed. */
3048 if (xact_info != XLOG_XACT_COMMIT &&
3049 xact_info != XLOG_XACT_COMMIT_PREPARED)
3050 return false;
3051
 /* Without a timestamp there is nothing to measure the delay against. */
3052 if (!getRecordTimestamp(record, &xtime))
3053 return false;
3054
3056
3057 /*
3058 * Exit without arming the latch if it's already past time to apply this
3059 * record
3060 */
3062 if (msecs <= 0)
3063 return false;
3064
3065 while (true)
3066 {
3068
3069 /* This might change recovery_min_apply_delay. */
3071
3073 break;
3074
3075 /*
3076 * Recalculate delayUntil as recovery_min_apply_delay could have
3077 * changed while waiting in this loop.
3078 */
3080
3081 /*
3082 * Wait for difference between GetCurrentTimestamp() and delayUntil.
3083 */
3085 delayUntil);
3086
 /* The target time has already passed; stop waiting. */
3087 if (msecs <= 0)
3088 break;
3089
3090 elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3091
3094 msecs,
3095 WAIT_EVENT_RECOVERY_APPLY_DELAY);
3096 }
3097 return true;
3098}
3099
3100/*
3101 * Get the current state of the recovery pause request.
3102 */
3105{
 /*
  * NOTE(review): the declaration of "state" and the statements that fill
  * it in are missing from this listing; only the return of the fetched
  * value is visible.
  */
3107
3111
3112 return state;
3113}
3114
3115/*
3116 * Set the recovery pause state.
3117 *
3118 * If recovery pause is requested then sets the recovery pause state to
3119 * 'pause requested' if it is not already 'paused'. Otherwise, sets it
3120 * to 'not paused' to resume the recovery. The recovery pause will be
3121 * confirmed by ConfirmRecoveryPaused().
3122 */
3123void
3124SetRecoveryPause(bool recoveryPause)
3125{
 /*
  * recoveryPause == false is tested twice below, so the resume case takes
  * two separate actions.
  *
  * NOTE(review): the statements controlled by both tests are missing from
  * this listing -- confirm against the full source.
  */
3127
3128 if (!recoveryPause)
3132
3134
3135 if (!recoveryPause)
3137}
3138
3139/*
3140 * Confirm the recovery pause by setting the recovery pause state to
3141 * RECOVERY_PAUSED.
3142 */
3143static void
3145{
 /* NOTE(review): the statements implementing the comment below are missing from this listing. */
3146 /* If recovery pause is requested then set it paused */
3151}
3152
3153
3154/*
3155 * Attempt to read the next XLOG record.
3156 *
3157 * Before first call, the reader needs to be positioned to the first record
3158 * by calling XLogPrefetcherBeginRead().
3159 *
3160 * If no valid record is available, returns NULL, or fails if emode is PANIC.
3161 * (emode must be either PANIC or LOG). In standby mode, retries until a valid
3162 * record is available.
3163 */
3164static XLogRecord *
3166 bool fetching_ckpt, TimeLineID replayTLI)
3167{
 /*
  * Loops until a record is obtained or the function gives up and returns
  * NULL. Each failed attempt sets lastSourceFailed before deciding whether
  * to retry.
  */
3168 XLogRecord *record;
3171
3173
3174 /* Pass through parameters to XLogPageRead */
3175 private->fetching_ckpt = fetching_ckpt;
3176 private->emode = emode;
 /* randAccess is set when the reader has no valid ReadRecPtr yet. */
3177 private->randAccess = !XLogRecPtrIsValid(xlogreader->ReadRecPtr);
3178 private->replayTLI = replayTLI;
3179
3180 /* This is the first attempt to read this page. */
3181 lastSourceFailed = false;
3182
3183 for (;;)
3184 {
3185 char *errormsg;
3186
3187 record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3188 if (record == NULL)
3189 {
3190 /*
3191 * When we find that WAL ends in an incomplete record, keep track
3192 * of that record. After recovery is done, we'll write a record
3193 * to indicate to downstream WAL readers that that portion is to
3194 * be ignored.
3195 *
3196 * However, when ArchiveRecoveryRequested = true, we're going to
3197 * switch to a new timeline at the end of recovery. We will only
3198 * copy WAL over to the new timeline up to the end of the last
3199 * complete record, so if we did this, we would later create an
3200 * overwrite contrecord in the wrong place, breaking everything.
3201 */
3204 {
3207 }
3208
 /* Close the currently open WAL file, if any, after the failed read. */
3209 if (readFile >= 0)
3210 {
3211 close(readFile);
3212 readFile = -1;
3213 }
3214
3215 /*
3216 * We only end up here without a message when XLogPageRead()
3217 * failed - in that case we already logged something. In
3218 * StandbyMode that only happens if we have been triggered, so we
3219 * shouldn't loop anymore in that case.
3220 */
3221 if (errormsg)
3223 (errmsg_internal("%s", errormsg) /* already translated */ ));
3224 }
3225
3226 /*
3227 * Check page TLI is one of the expected values.
3228 */
3230 {
3231 char fname[MAXFNAMELEN];
3232 XLogSegNo segno;
3233 int32 offset;
3234
3238 XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3241 errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%08X, offset %u",
3243 fname,
3245 offset));
 /* Treat a record on an unexpected timeline as if none had been read. */
3246 record = NULL;
3247 }
3248
3249 if (record)
3250 {
3251 /* Great, got a record */
3252 return record;
3253 }
3254 else
3255 {
3256 /* No valid record available from this source */
3257 lastSourceFailed = true;
3258
3259 /*
3260 * If archive recovery was requested, but we were still doing
3261 * crash recovery, switch to archive recovery and retry using the
3262 * offline archive. We have now replayed all the valid WAL in
3263 * pg_wal, so we are presumably now consistent.
3264 *
3265 * We require that there's at least some valid WAL present in
3266 * pg_wal, however (!fetching_ckpt). We could recover using the
3267 * WAL from the archive, even if pg_wal is completely empty, but
3268 * we'd have no idea how far we'd have to replay to reach
3269 * consistency. So err on the safe side and give up.
3270 */
3272 !fetching_ckpt)
3273 {
3275 (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3276 InArchiveRecovery = true;
3279
3282 minRecoveryPointTLI = replayTLI;
3283
3285
3286 /*
3287 * Before we retry, reset lastSourceFailed and currentSource
3288 * so that we will check the archive next.
3289 */
3290 lastSourceFailed = false;
3292
3293 continue;
3294 }
3295
3296 /* In standby mode, loop back to retry. Otherwise, give up. */
3298 continue;
3299 else
3300 return NULL;
3301 }
3302 }
3303}
3304
3305/*
3306 * Read the XLOG page containing targetPagePtr into readBuf (if not read
3307 * already). Returns number of bytes read, if the page is read successfully,
3308 * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3309 * but only if they have not been previously reported.
3310 *
3311 * See XLogReaderRoutine.page_read for more details.
3312 *
3313 * While prefetching, xlogreader->nonblocking may be set. In that case,
3314 * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3315 *
3316 * This is responsible for restoring files from archive as needed, as well
3317 * as for waiting for the requested WAL record to arrive in standby mode.
3318 *
3319 * xlogreader->private_data->emode specifies the log level used for reporting
3320 * "file not found" or "end of WAL" situations in archive recovery, or in
3321 * standby mode when promotion is triggered. If set to WARNING or below,
3322 * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
3323 * levels the ereport() won't return.
3324 *
3325 * In standby mode, if after a successful return of XLogPageRead() the
3326 * caller finds the record it's interested in to be broken, it should
3327 * ereport the error with the level determined by
3328 * emode_for_corrupt_record(), and then set lastSourceFailed
3329 * and call XLogPageRead() again with the same arguments. This lets
3330 * XLogPageRead() try fetching the record from another source, or
3331 * sleep and retry.
3332 */
3333static int
3335 XLogRecPtr targetRecPtr, char *readBuf)
3336{
3337 XLogPageReadPrivate *private =
3339 int emode = private->emode;
3340 uint32 targetPageOff;
3342 int r;
3343 instr_time io_start;
3344
3346
3347 XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3348 targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3349
3350 /*
3351 * See if we need to switch to a new segment because the requested record
3352 * is not in the currently open one.
3353 */
3354 if (readFile >= 0 &&
3355 !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3356 {
3357 /*
3358 * Request a restartpoint if we've replayed too much xlog since the
3359 * last one.
3360 */
3362 {
3364 {
3365 (void) GetRedoRecPtr();
3368 }
3369 }
3370
3371 close(readFile);
3372 readFile = -1;
3374 }
3375
3376 XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3377
3378retry:
3379 /* See if we need to retrieve more data */
3380 if (readFile < 0 ||
3382 flushedUpto < targetPagePtr + reqLen))
3383 {
3384 if (readFile >= 0 &&
3387 flushedUpto < targetPagePtr + reqLen)
3388 return XLREAD_WOULDBLOCK;
3389
3390 switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3391 private->randAccess,
3392 private->fetching_ckpt,
3393 targetRecPtr,
3394 private->replayTLI,
3397 {
3398 case XLREAD_WOULDBLOCK:
3399 return XLREAD_WOULDBLOCK;
3400 case XLREAD_FAIL:
3401 if (readFile >= 0)
3402 close(readFile);
3403 readFile = -1;
3404 readLen = 0;
3406 return XLREAD_FAIL;
3407 case XLREAD_SUCCESS:
3408 break;
3409 }
3410 }
3411
3412 /*
3413 * At this point, we have the right segment open and if we're streaming we
3414 * know the requested record is in it.
3415 */
3416 Assert(readFile != -1);
3417
3418 /*
3419 * If the current segment is being streamed from the primary, calculate
3420 * how much of the current page we have received already. We know the
3421 * requested record has been received, but this is for the benefit of
3422 * future calls, to allow quick exit at the top of this function.
3423 */
3425 {
3426 if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3427 readLen = XLOG_BLCKSZ;
3428 else
3430 targetPageOff;
3431 }
3432 else
3433 readLen = XLOG_BLCKSZ;
3434
3435 /* Read the requested page */
3436 readOff = targetPageOff;
3437
3438 /* Measure I/O timing when reading segment */
3440
3441 pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3442 r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (pgoff_t) readOff);
3443 if (r != XLOG_BLCKSZ)
3444 {
3445 char fname[MAXFNAMELEN];
3446 int save_errno = errno;
3447
3449
3451 io_start, 1, r);
3452
3454 if (r < 0)
3455 {
3456 errno = save_errno;
3457 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3459 errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: %m",
3460 fname, LSN_FORMAT_ARGS(targetPagePtr),
3461 readOff)));
3462 }
3463 else
3464 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3466 errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: read %d of %zu",
3467 fname, LSN_FORMAT_ARGS(targetPagePtr),
3468 readOff, r, (Size) XLOG_BLCKSZ)));
3469 goto next_record_is_invalid;
3470 }
3472
3474 io_start, 1, r);
3475
3476 Assert(targetSegNo == readSegNo);
3477 Assert(targetPageOff == readOff);
3478 Assert(reqLen <= readLen);
3479
3481
3482 /*
3483 * Check the page header immediately, so that we can retry immediately if
3484 * it's not valid. This may seem unnecessary, because ReadPageInternal()
3485 * validates the page header anyway, and would propagate the failure up to
3486 * ReadRecord(), which would retry. However, there's a corner case with
3487 * continuation records, if a record is split across two pages such that
3488 * we would need to read the two pages from different sources across two
3489 * WAL segments.
3490 *
3491 * The first page is only available locally, in pg_wal, because it's
3492 * already been recycled on the primary. The second page, however, is not
3493 * present in pg_wal, and we should stream it from the primary. There is a
3494 * recycled WAL segment present in pg_wal, with garbage contents, however.
3495 * We would read the first page from the local WAL segment, but when
3496 * reading the second page, we would read the bogus, recycled, WAL
3497 * segment. If we didn't catch that case here, we would never recover,
3498 * because ReadRecord() would retry reading the whole record from the
3499 * beginning.
3500 *
3501 * Of course, this only catches errors in the page header, which is what
3502 * happens in the case of a recycled WAL segment. Other kinds of errors or
3503 * corruption still have the same problem. But this at least fixes the
3504 * common case, which can happen as part of normal operation.
3505 *
3506 * Validating the page header is cheap enough that doing it twice
3507 * shouldn't be a big deal from a performance point of view.
3508 *
3509 * When not in standby mode, an invalid page header should cause recovery
3510 * to end, not retry reading the page, so we don't need to validate the
3511 * page header here for the retry. Instead, ReadPageInternal() is
3512 * responsible for the validation.
3513 */
3514 if (StandbyMode &&
3515 (targetPagePtr % wal_segment_size) == 0 &&
3516 !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3517 {
3518 /*
3519 * Emit this error right now then retry this page immediately. Use
3520 * errmsg_internal() because the message was already translated.
3521 */
3522 if (xlogreader->errormsg_buf[0])
3525
3526 /* reset any error XLogReaderValidatePageHeader() might have set */
3528 goto next_record_is_invalid;
3529 }
3530
3531 return readLen;
3532
3533next_record_is_invalid:
3534
3535 /*
3536 * If we're reading ahead, give up fast. Retries and error reporting will
3537 * be handled by a later read when recovery catches up to this point.
3538 */
3540 return XLREAD_WOULDBLOCK;
3541
3542 lastSourceFailed = true;
3543
3544 if (readFile >= 0)
3545 close(readFile);
3546 readFile = -1;
3547 readLen = 0;
3549
3550 /* In standby-mode, keep trying */
3551 if (StandbyMode)
3552 goto retry;
3553 else
3554 return XLREAD_FAIL;
3555}
3556
3557/*
3558 * Open the WAL segment containing WAL location 'RecPtr'.
3559 *
3560 * The segment can be fetched via restore_command, or via walreceiver having
3561 * streamed the record, or it can already be present in pg_wal. Checking
3562 * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3563 * too, in case someone copies a new segment directly to pg_wal. That is not
3564 * documented or recommended, though.
3565 *
3566 * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3567 * prepare to read WAL starting from RedoStartLSN after this.
3568 *
3569 * 'RecPtr' might not point to the beginning of the record we're interested
3570 * in, it might also point to the page or segment header. In that case,
3571 * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3572 * used to decide which timeline to stream the requested WAL from.
3573 *
3574 * 'replayLSN' is the current replay LSN, so that if we scan for new
3575 * timelines, we can reject a switch to a timeline that branched off before
3576 * this point.
3577 *
3578 * If the record is not immediately available, the function returns
3579 * XLREAD_FAIL if we're not in standby mode. In standby mode, it waits for
3580 * it to become available.
3581 *
3582 * When the requested record becomes available, the function opens the file
3583 * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3584 * of standby mode is triggered by the user, and there is no more WAL
3585 * available, returns XLREAD_FAIL.
3586 *
3587 * If nonblocking is true, then give up immediately if we can't satisfy the
3588 * request, returning XLREAD_WOULDBLOCK instead of waiting.
3589 */
3590static XLogPageReadResult
3592 bool fetching_ckpt, XLogRecPtr tliRecPtr,
3593 TimeLineID replayTLI, XLogRecPtr replayLSN,
3594 bool nonblocking)
3595{
3596 static TimestampTz last_fail_time = 0;
3598 bool streaming_reply_sent = false;
3599
3600 /*-------
3601 * Standby mode is implemented by a state machine:
3602 *
3603 * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3604 * pg_wal (XLOG_FROM_PG_WAL)
3605 * 2. Check for promotion trigger request
3606 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3607 * 4. Rescan timelines
3608 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3609 *
3610 * Failure to read from the current source advances the state machine to
3611 * the next state.
3612 *
3613 * 'currentSource' indicates the current state. There are no currentSource
3614 * values for "check trigger", "rescan timelines", and "sleep" states,
3615 * those actions are taken when reading from the previous source fails, as
3616 * part of advancing to the next state.
3617 *
3618 * If standby mode is turned off while reading WAL from stream, we move
3619 * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3620 * the files (which would be required at end of recovery, e.g., timeline
3621 * history file) from archive or pg_wal. We don't need to kill WAL receiver
3622 * here because it's already stopped when standby mode is turned off at
3623 * the end of recovery.
3624 *-------
3625 */
3626 if (!InArchiveRecovery)
3628 else if (currentSource == XLOG_FROM_ANY ||
3630 {
3631 lastSourceFailed = false;
3633 }
3634
3635 for (;;)
3636 {
3637 XLogSource oldSource = currentSource;
3638 bool startWalReceiver = false;
3639
3640 /*
3641 * First check if we failed to read from the current source, and
3642 * advance the state machine if so. The failure to read might've
3643 * happened outside this function, e.g. when a CRC check fails on a
3644 * record, or within this loop.
3645 */
3646 if (lastSourceFailed)
3647 {
3648 /*
3649 * Don't allow any retry loops to occur during nonblocking
3650 * readahead. Let the caller process everything that has been
3651 * decoded already first.
3652 */
3653 if (nonblocking)
3654 return XLREAD_WOULDBLOCK;
3655
3656 switch (currentSource)
3657 {
3658 case XLOG_FROM_ARCHIVE:
3659 case XLOG_FROM_PG_WAL:
3660
3661 /*
3662 * Check to see if promotion is requested. Note that we do
3663 * this only after failure, so when you promote, we still
3664 * finish replaying as much as we can from archive and
3665 * pg_wal before failover.
3666 */
3668 {
3670 return XLREAD_FAIL;
3671 }
3672
3673 /*
3674 * Not in standby mode, and we've now tried the archive
3675 * and pg_wal.
3676 */
3677 if (!StandbyMode)
3678 return XLREAD_FAIL;
3679
3680 /*
3681 * Move to XLOG_FROM_STREAM state, and set to start a
3682 * walreceiver if necessary.
3683 */
3685 startWalReceiver = true;
3686 break;
3687
3688 case XLOG_FROM_STREAM:
3689
3690 /*
3691 * Failure while streaming. Most likely, we got here
3692 * because streaming replication was terminated, or
3693 * promotion was triggered. But we also get here if we
3694 * find an invalid record in the WAL streamed from the
3695 * primary, in which case something is seriously wrong.
3696 * There's little chance that the problem will just go
3697 * away, but PANIC is not good for availability either,
3698 * especially in hot standby mode. So, we treat that the
3699 * same as disconnection, and retry from archive/pg_wal
3700 * again. The WAL in the archive should be identical to
3701 * what was streamed, so it's unlikely that it helps, but
3702 * one can hope...
3703 */
3704
3705 /*
3706 * We should be able to move to XLOG_FROM_STREAM only in
3707 * standby mode.
3708 */
3710
3711 /*
3712 * Before we leave XLOG_FROM_STREAM state, make sure that
3713 * walreceiver is not active, so that it won't overwrite
3714 * WAL that we restore from archive.
3715 *
3716 * If walreceiver is actively streaming (or attempting to
3717 * connect), we must shut it down. However, if it's
3718 * already in WAITING state (e.g., due to timeline
3719 * divergence), we only need to reset the install flag to
3720 * allow archive restoration.
3721 */
3722 if (WalRcvStreaming())
3724 else
3725 {
3726 /*
3727 * WALRCV_STOPPING state is a transient state while
3728 * the startup process is in ShutdownWalRcv(). It
3729 * should never appear here since we would be waiting
3730 * for the walreceiver to reach WALRCV_STOPPED in that
3731 * case.
3732 */
3735 }
3736
3737 /*
3738 * Before we sleep, re-scan for possible new timelines if
3739 * we were requested to recover to the latest timeline.
3740 */
3742 {
3743 if (rescanLatestTimeLine(replayTLI, replayLSN))
3744 {
3746 break;
3747 }
3748 }
3749
3750 /*
3751 * XLOG_FROM_STREAM is the last state in our state
3752 * machine, so we've exhausted all the options for
3753 * obtaining the requested WAL. We're going to loop back
3754 * and retry from the archive, but if it hasn't been long
3755 * since last attempt, sleep wal_retrieve_retry_interval
3756 * milliseconds to avoid busy-waiting.
3757 */
3759 if (!TimestampDifferenceExceeds(last_fail_time, now,
3761 {
3762 long wait_time;
3763
3764 wait_time = wal_retrieve_retry_interval -
3765 TimestampDifferenceMilliseconds(last_fail_time, now);
3766
3767 elog(LOG, "waiting for WAL to become available at %X/%08X",
3768 LSN_FORMAT_ARGS(RecPtr));
3769
3770 /* Do background tasks that might benefit us later. */
3772
3776 wait_time,
3777 WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
3780
3781 /* Handle interrupt signals of startup process */
3783 }
3784 last_fail_time = now;
3786 break;
3787
3788 default:
3789 elog(ERROR, "unexpected WAL source %d", currentSource);
3790 }
3791 }
3792 else if (currentSource == XLOG_FROM_PG_WAL)
3793 {
3794 /*
3795 * We just successfully read a file in pg_wal. We prefer files in
3796 * the archive over ones in pg_wal, so try the next file again
3797 * from the archive first.
3798 */
3801 }
3802
3803 if (currentSource != oldSource)
3804 elog(DEBUG2, "switched WAL source from %s to %s after %s",
3806 lastSourceFailed ? "failure" : "success");
3807
3808 /*
3809 * We've now handled possible failure. Try to read from the chosen
3810 * source.
3811 */
3812 lastSourceFailed = false;
3813
3814 switch (currentSource)
3815 {
3816 case XLOG_FROM_ARCHIVE:
3817 case XLOG_FROM_PG_WAL:
3818
3819 /*
3820 * WAL receiver must not be running when reading WAL from
3821 * archive or pg_wal.
3822 */
3824
3825 /* Close any old file we might have open. */
3826 if (readFile >= 0)
3827 {
3828 close(readFile);
3829 readFile = -1;
3830 }
3831 /* Reset curFileTLI if random fetch. */
3832 if (randAccess)
3833 curFileTLI = 0;
3834
3835 /*
3836 * Try to restore the file from archive, or read an existing
3837 * file from pg_wal.
3838 */
3842 if (readFile >= 0)
3843 return XLREAD_SUCCESS; /* success! */
3844
3845 /*
3846 * Nope, not found in archive or pg_wal.
3847 */
3848 lastSourceFailed = true;
3849 break;
3850
3851 case XLOG_FROM_STREAM:
3852 {
3853 bool havedata;
3854
3855 /*
3856 * We should be able to move to XLOG_FROM_STREAM only in
3857 * standby mode.
3858 */
3860
3861 /*
3862 * First, shutdown walreceiver if its restart has been
3863 * requested -- but no point if we're already slated for
3864 * starting it.
3865 */
3866 if (pendingWalRcvRestart && !startWalReceiver)
3867 {
3869
3870 /*
3871 * Re-scan for possible new timelines if we were
3872 * requested to recover to the latest timeline.
3873 */
3876 rescanLatestTimeLine(replayTLI, replayLSN);
3877
3878 startWalReceiver = true;
3879 }
3880 pendingWalRcvRestart = false;
3881
3882 /*
3883 * Launch walreceiver if needed.
3884 *
3885 * If fetching_ckpt is true, RecPtr points to the initial
3886 * checkpoint location. In that case, we use RedoStartLSN
3887 * as the streaming start position instead of RecPtr, so
3888 * that when we later jump backwards to start redo at
3889 * RedoStartLSN, we will have the logs streamed already.
3890 */
3891 if (startWalReceiver &&
3892 PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
3893 {
3894 XLogRecPtr ptr;
3895 TimeLineID tli;
3896
3897 if (fetching_ckpt)
3898 {
3899 ptr = RedoStartLSN;
3900 tli = RedoStartTLI;
3901 }
3902 else
3903 {
3904 ptr = RecPtr;
3905
3906 /*
3907 * Use the record begin position to determine the
3908 * TLI, rather than the position we're reading.
3909 */
3910 tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
3911
3912 if (curFileTLI > 0 && tli < curFileTLI)
3913 elog(ERROR, "according to history file, WAL location %X/%08X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3914 LSN_FORMAT_ARGS(tliRecPtr),
3915 tli, curFileTLI);
3916 }
3917 curFileTLI = tli;
3922 flushedUpto = 0;
3923 }
3924
3925 /*
3926 * Check if WAL receiver is active or wait to start up.
3927 */
3928 if (!WalRcvStreaming())
3929 {
3930 lastSourceFailed = true;
3931 break;
3932 }
3933
3934 /*
3935 * Walreceiver is active, so see if new data has arrived.
3936 *
3937 * We only advance XLogReceiptTime when we obtain fresh
3938 * WAL from walreceiver and observe that we had already
3939 * processed everything before the most recent "chunk"
3940 * that it flushed to disk. In steady state where we are
3941 * keeping up with the incoming data, XLogReceiptTime will
3942 * be updated on each cycle. When we are behind,
3943 * XLogReceiptTime will not advance, so the grace time
3944 * allotted to conflicting queries will decrease.
3945 */
3946 if (RecPtr < flushedUpto)
3947 havedata = true;
3948 else
3949 {
3950 XLogRecPtr latestChunkStart;
3951
3952 flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3953 if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
3954 {
3955 havedata = true;
3956 if (latestChunkStart <= RecPtr)
3957 {
3960 }
3961 }
3962 else
3963 havedata = false;
3964 }
3965 if (havedata)
3966 {
3967 /*
3968 * Great, streamed far enough. Open the file if it's
3969 * not open already. Also read the timeline history
3970 * file if we haven't initialized timeline history
3971 * yet; it should be streamed over and present in
3972 * pg_wal by now. Use XLOG_FROM_STREAM so that source
3973 * info is set correctly and XLogReceiptTime isn't
3974 * changed.
3975 *
3976 * NB: We must set readTimeLineHistory based on
3977 * recoveryTargetTLI, not receiveTLI. Normally they'll
3978 * be the same, but if recovery_target_timeline is
3979 * 'latest' and archiving is configured, then it's
3980 * possible that we managed to retrieve one or more
3981 * new timeline history files from the archive,
3982 * updating recoveryTargetTLI.
3983 */
3984 if (readFile < 0)
3985 {
3986 if (!expectedTLEs)
3989 XLOG_FROM_STREAM, false);
3990 Assert(readFile >= 0);
3991 }
3992 else
3993 {
3994 /* just make sure source info is correct... */
3997 return XLREAD_SUCCESS;
3998 }
3999 break;
4000 }
4001
4002 /* In nonblocking mode, return rather than sleeping. */
4003 if (nonblocking)
4004 return XLREAD_WOULDBLOCK;
4005
4006 /*
4007 * Data not here yet. Check for trigger, then wait for
4008 * walreceiver to wake us up when new WAL arrives.
4009 */
4011 {
4012 /*
4013 * Note that we don't return XLREAD_FAIL immediately
4014 * here. After being triggered, we still want to
4015 * replay all the WAL that was already streamed. It's
4016 * in pg_wal now, so we just treat this as a failure,
4017 * and the state machine will move on to replay the
4018 * streamed WAL from pg_wal, and then recheck the
4019 * trigger and exit replay.
4020 */
4021 lastSourceFailed = true;
4022 break;
4023 }
4024
4025 /*
4026 * Since we have replayed everything we have received so
4027 * far and are about to start waiting for more WAL, let's
4028 * tell the upstream server our replay location now so
4029 * that pg_stat_replication doesn't show stale
4030 * information.
4031 */
4032 if (!streaming_reply_sent)
4033 {
4035 streaming_reply_sent = true;
4036 }
4037
4038 /* Do any background tasks that might benefit us later. */
4040
4041 /* Update pg_stat_recovery_prefetch before sleeping. */
4043
4044 /*
4045 * Wait for more WAL to arrive, when we will be woken
4046 * immediately by the WAL receiver.
4047 */
4050 -1L,
4051 WAIT_EVENT_RECOVERY_WAL_STREAM);
4053 break;
4054 }
4055
4056 default:
4057 elog(ERROR, "unexpected WAL source %d", currentSource);
4058 }
4059
4060 /*
4061 * Check for recovery pause here so that we can confirm more quickly
4062 * that a requested pause has actually taken effect.
4063 */
4064 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
4066 recoveryPausesHere(false);
4067
4068 /*
4069 * This possibly-long loop needs to handle interrupts of startup
4070 * process.
4071 */
4073 }
4074
4075 return XLREAD_FAIL; /* not reached */
4076}
4077
4078
4079/*
4080 * Determine what log level should be used to report a corrupt WAL record
4081 * in the current WAL page, previously read by XLogPageRead().
4082 *
4083 * 'emode' is the error mode that would be used to report a file-not-found
4084 * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4085 * we're retrying the exact same record that we've tried previously, only
4086 * complain the first time to keep the noise down. However, we only do so when
4087 * reading from pg_wal, because we don't expect any invalid records in archive
4088 * or in records streamed from the primary. Files in the archive should be complete,
4089 * and we should never hit the end of WAL because we stop and wait for more WAL
4090 * to arrive before replaying it.
 *
 * Concretely: the demotion to DEBUG1 happens only when readSource is
 * XLOG_FROM_PG_WAL, 'emode' is LOG, and 'RecPtr' equals the position recorded
 * by the previous such call. In every other case 'emode' is returned
 * unchanged.
4091 *
4092 * NOTE: This function remembers the RecPtr value it was last called with,
4093 * to suppress repeated messages about the same record. Only call this when
4094 * you are about to ereport(), or you might cause a later message to be
4095 * erroneously suppressed.
4096 */
4097static int
4099{
4100 static XLogRecPtr lastComplaint = 0;  /* persists across calls */
4101
4102 if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
4103 {
4104 if (RecPtr == lastComplaint)
4105 emode = DEBUG1;  /* same position as last time: keep the noise down */
4106 else
4107 lastComplaint = RecPtr;  /* first complaint here: remember it */
4108 }
4109 return emode;
4110}
4111
4112
4113/*
4114 * Subroutine to try to fetch and validate a prior checkpoint record.
4115 */
4116static XLogRecord *
4118 TimeLineID replayTLI)
4119{
4120 XLogRecord *record;
4121 uint8 info;
4122
4123 Assert(xlogreader != NULL);
4124
4125 if (!XRecOffIsValid(RecPtr))
4126 {
4127 ereport(LOG,
4128 (errmsg("invalid checkpoint location")));
4129 return NULL;
4130 }
4131
4133 record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4134
4135 if (record == NULL)
4136 {
4137 ereport(LOG,
4138 (errmsg("invalid checkpoint record")));
4139 return NULL;
4140 }
4141 if (record->xl_rmid != RM_XLOG_ID)
4142 {
4143 ereport(LOG,
4144 (errmsg("invalid resource manager ID in checkpoint record")));
4145 return NULL;
4146 }
4147 info = record->xl_info & ~XLR_INFO_MASK;
4148 if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4149 info != XLOG_CHECKPOINT_ONLINE)
4150 {
4151 ereport(LOG,
4152 (errmsg("invalid xl_info in checkpoint record")));
4153 return NULL;
4154 }
4156 {
4157 ereport(LOG,
4158 (errmsg("invalid length of checkpoint record")));
4159 return NULL;
4160 }
4161 return record;
4162}
4163
4164/*
4165 * Scan for new timelines that might have appeared in the archive since we
4166 * started recovery.
4167 *
4168 * If there are any, the function changes recovery target TLI to the latest
4169 * one and returns 'true'.
 *
 * Returns 'false' when no new timeline is found, when recoveryTargetTLI does
 * not appear in the new timeline's history, or when the matching history
 * entry ends before 'replayLSN'. In the latter two cases a LOG message is
 * emitted. recoveryTargetTLI and expectedTLEs are assigned only on the
 * success path.
4170 */
4171static bool
4173{
4174 List *newExpectedTLEs;
4175 bool found;
4176 ListCell *cell;
4177 TimeLineID newtarget;
4178 TimeLineID oldtarget = recoveryTargetTLI;
4179 TimeLineHistoryEntry *currentTle = NULL;
4180
4182 if (newtarget == recoveryTargetTLI)
4183 {
4184 /* No new timelines found */
4185 return false;
4186 }
4187
4188 /*
4189 * Determine the list of expected TLIs for the new TLI
4190 */
4191
4192 newExpectedTLEs = readTimeLineHistory(newtarget);
4193
4194 /*
4195 * If the current timeline is not part of the history of the new timeline,
4196 * we cannot proceed to it.
 *
 * NOTE(review): the search below matches history entries against
 * recoveryTargetTLI, while the log messages further down report
 * 'replayTLI' as the current timeline -- confirm the two always agree
 * at this point.
4197 */
4198 found = false;
4199 foreach(cell, newExpectedTLEs)
4200 {
4201 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4202
4203 if (currentTle->tli == recoveryTargetTLI)
4204 {
4205 found = true;
4206 break;
4207 }
4208 }
4209 if (!found)
4210 {
4211 ereport(LOG,
4212 (errmsg("new timeline %u is not a child of database system timeline %u",
4213 newtarget,
4214 replayTLI)));
4215 return false;
4216 }
4217
4218 /*
4219 * The current timeline was found in the history file, but check that the
4220 * next timeline was forked off from it *after* the current recovery
4221 * location.
4222 */
4223 if (currentTle->end < replayLSN)  /* currentTle is the entry matched in the loop above */
4224 {
4225 ereport(LOG,
4226 errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%08X",
4227 newtarget,
4228 replayTLI,
4229 LSN_FORMAT_ARGS(replayLSN)));
4230 return false;
4231 }
4232
4233 /* The new timeline history seems valid. Switch target */
4234 recoveryTargetTLI = newtarget;
4236 expectedTLEs = newExpectedTLEs;
4237
4238 /*
4239 * As in StartupXLOG(), try to ensure we have all the history files
4240 * between the old target and new target in pg_wal.
4241 */
4242 restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4243
4244 ereport(LOG,
4245 (errmsg("new target timeline is %u",
4247
4248 return true;
4249}
4250
4251
4252/*
4253 * Open a logfile segment for reading (during recovery).
4254 *
4255 * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4256 * Otherwise, it's assumed to be already available in pg_wal.
4257 */
4258static int
4260 XLogSource source, bool notfoundOk)
4261{
4262 char xlogfname[MAXFNAMELEN];
4263 char activitymsg[MAXFNAMELEN + 16];
4264 char path[MAXPGPATH];
4265 int fd;
4266
4267 XLogFileName(xlogfname, tli, segno, wal_segment_size);
4268
4269 switch (source)
4270 {
4271 case XLOG_FROM_ARCHIVE:
4272 /* Report recovery progress in PS display */
4273 snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4274 xlogfname);
4275 set_ps_display(activitymsg);
4276
4277 if (!RestoreArchivedFile(path, xlogfname,
4278 "RECOVERYXLOG",
4280 InRedo))
4281 return -1;
4282 break;
4283
4284 case XLOG_FROM_PG_WAL:
4285 case XLOG_FROM_STREAM:
4286 XLogFilePath(path, tli, segno, wal_segment_size);
4287 break;
4288
4289 default:
4290 elog(ERROR, "invalid XLogFileRead source %d", source);
4291 }
4292
4293 /*
4294 * If the segment was fetched from archival storage, replace the existing
4295 * xlog segment (if any) with the archival version.
4296 */
4298 {
4300 KeepFileRestoredFromArchive(path, xlogfname);
4301
4302 /*
4303 * Set path to point at the new file in pg_wal.
4304 */
4305 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4306 }
4307
4308 fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4309 if (fd >= 0)
4310 {
4311 /* Success! */
4312 curFileTLI = tli;
4313
4314 /* Report recovery progress in PS display */
4315 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4316 xlogfname);
4317 set_ps_display(activitymsg);
4318
4319 /* Track source of data in assorted state variables */
4322 /* In FROM_STREAM case, caller tracks receipt time, not me */
4323 if (source != XLOG_FROM_STREAM)
4325
4326 return fd;
4327 }
4328 if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4329 ereport(PANIC,
4331 errmsg("could not open file \"%s\": %m", path)));
4332 return -1;
4333}
4334
4335/*
4336 * Open a logfile segment for reading (during recovery).
4337 *
4338 * This version searches for the segment with any TLI listed in expectedTLEs.
4339 */
4340static int
4342{
4343 char path[MAXPGPATH];
4344 ListCell *cell;
4345 int fd;
4346 List *tles;
4347
4348 /*
4349 * Loop looking for a suitable timeline ID: we might need to read any of
4350 * the timelines listed in expectedTLEs.
4351 *
4352 * We expect curFileTLI on entry to be the TLI of the preceding file in
4353 * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4354 * to go backwards; this prevents us from picking up the wrong file when a
4355 * parent timeline extends to higher segment numbers than the child we
4356 * want to read.
4357 *
4358 * If we haven't read the timeline history file yet, read it now, so that
4359 * we know which TLIs to scan. We don't save the list in expectedTLEs,
4360 * however, unless we actually find a valid segment. That way if there is
4361 * neither a timeline history file nor a WAL segment in the archive, and
4362 * streaming replication is set up, we'll read the timeline history file
4363 * streamed from the primary when we start streaming, instead of
4364 * recovering with a dummy history generated here.
4365 */
4366 if (expectedTLEs)
4367 tles = expectedTLEs;
4368 else
4370
4371 foreach(cell, tles)
4372 {
4374 TimeLineID tli = hent->tli;
4375
4376 if (tli < curFileTLI)
4377 break; /* don't bother looking at too-old TLIs */
4378
4379 /*
4380 * Skip scanning the timeline ID that the logfile segment to read
4381 * doesn't belong to
4382 */
4383 if (XLogRecPtrIsValid(hent->begin))
4384 {
4385 XLogSegNo beginseg = 0;
4386
4387 XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4388
4389 /*
4390 * The logfile segment that doesn't belong to the timeline is
4391 * older or newer than the segment that the timeline started or
4392 * ended at, respectively. It's sufficient to check only the
4393 * starting segment of the timeline here. Since the timelines are
4394 * scanned in descending order in this loop, any segments newer
4395 * than the ending segment should belong to newer timeline and
4396 * have already been read before. So it's not necessary to check
4397 * the ending segment of the timeline here.
4398 */
4399 if (segno < beginseg)
4400 continue;
4401 }
4402
4404 {
4405 fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
4406 if (fd != -1)
4407 {
4408 elog(DEBUG1, "got WAL segment from archive");
4409 if (!expectedTLEs)
4410 expectedTLEs = tles;
4411 return fd;
4412 }
4413 }
4414
4416 {
4417 fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
4418 if (fd != -1)
4419 {
4420 if (!expectedTLEs)
4421 expectedTLEs = tles;
4422 return fd;
4423 }
4424 }
4425 }
4426
4427 /* Couldn't find it. For simplicity, complain about front timeline */
4429 errno = ENOENT;
4432 errmsg("could not open file \"%s\": %m", path)));
4433 return -1;
4434}
4435
4436/*
4437 * Set flag to signal the walreceiver to restart. (The startup process calls
4438 * this on noticing a relevant configuration change.)
4439 */
4440void
4442{
4444 {
4445 ereport(LOG,
4446 (errmsg("WAL receiver process shutdown requested")));
4447
4448 pendingWalRcvRestart = true;
4449 }
4450}
4451
4452
4453/*
4454 * Has a standby promotion already been triggered?
4455 *
4456 * Unlike CheckForStandbyTrigger(), this works in any process
4457 * that's connected to shared memory.
4458 */
4459bool
4461{
4462 /*
4463 * We check shared state each time only until a standby promotion is
4464 * triggered. We can't trigger a promotion again, so there's no need to
4465 * keep checking after the shared variable has once been seen true.
4466 */
4468 return true;
4469
4473
4475}
4476
4477static void
4479{
4483
4484 /*
4485 * Mark the recovery pause state as 'not paused' because the paused state
4486 * ends and promotion continues if a promotion is triggered while recovery
4487 * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4488 * return 'paused' while a promotion is ongoing.
4489 */
4490 SetRecoveryPause(false);
4491
4493}
4494
4495/*
4496 * Check whether a promote request has arrived.
4497 */
4498static bool
4500{
4502 return true;
4503
4505 {
4506 ereport(LOG, (errmsg("received promote request")));
4510 return true;
4511 }
4512
4513 return false;
4514}
4515
4516/*
4517 * Remove the files signaling a standby promotion request.
 *
 * Only PROMOTE_SIGNAL_FILE is unlinked here. The result of unlink() is
 * deliberately not checked, so a file that does not exist (or cannot be
 * removed) is silently ignored.
4518 */
4519void
4521{
4522 unlink(PROMOTE_SIGNAL_FILE);
4523}
4524
4525/*
4526 * Check to see if a promote request has arrived.
 *
 * Returns true if stat() on PROMOTE_SIGNAL_FILE succeeds, i.e. the file is
 * present. Any stat() failure, whatever the errno, is reported as false.
 * The file itself is left in place by this function.
4527 */
4528bool
4530{
4531 struct stat stat_buf;  /* contents unused; only stat()'s result matters */
4532
4533 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4534 return true;
4535
4536 return false;
4537}
4538
4539/*
4540 * Wake up startup process to replay newly arrived WAL, or to notice that
4541 * failover has been requested.
4542 */
4543void
4545{
4547}
4548
4549/*
4550 * Schedule a walreceiver wakeup in the main recovery loop.
4551 */
4552void
4554{
4556}
4557
4558/*
4559 * Is HotStandby active yet? This is only important in special backends
4560 * since normal backends won't ever be able to connect until this returns
4561 * true. Postmaster knows this by way of signal, not via shared memory.
4562 *
4563 * Unlike testing standbyState, this works in any process that's connected to
4564 * shared memory. (And note that standbyState alone doesn't tell the truth
4565 * anyway.)
4566 */
4567bool
4569{
4570 /*
4571 * We check shared state each time only until Hot Standby is active. We
4572 * can't de-activate Hot Standby, so there's no need to keep checking
4573 * after the shared variable has once been seen true.
4574 */
4576 return true;
4577 else
4578 {
4579 /* spinlock is essential on machines with weak memory ordering! */
4583
4584 return LocalHotStandbyActive;
4585 }
4586}
4587
4588/*
4589 * Like HotStandbyActive(), but to be used only in WAL replay code,
4590 * where we don't need to ask any other process what the state is.
4591 */
4592static bool
4594{
4596 return LocalHotStandbyActive;
4597}
4598
4599/*
4600 * Get latest redo apply position.
4601 *
4602 * Exported to allow WALReceiver to read the pointer directly.
4603 */
4606{
4607 XLogRecPtr recptr;
4608 TimeLineID tli;
4609
4614
4615 if (replayTLI)
4616 *replayTLI = tli;
4617 return recptr;
4618}
4619
4620
4621/*
4622 * Get position of last applied, or the record being applied.
4623 *
4624 * This is different from GetXLogReplayRecPtr() in that if a WAL
4625 * record is currently being applied, this includes that record.
4626 */
4629{
4630 XLogRecPtr recptr;
4631 TimeLineID tli;
4632
4637
4638 if (replayEndTLI)
4639 *replayEndTLI = tli;
4640 return recptr;
4641}
4642
4643/*
4644 * Save timestamp of latest processed commit/abort record.
4645 *
4646 * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4647 * seen by processes other than the startup process. Note in particular
4648 * that CreateRestartPoint is executed in the checkpointer.
4649 */
4650static void
4652{
4656}
4657
4658/*
4659 * Fetch timestamp of latest processed commit/abort record.
4660 */
4663{
4664 TimestampTz xtime;
4665
4669
4670 return xtime;
4671}
4672
4673/*
4674 * Save timestamp of the next chunk of WAL records to apply.
4675 *
4676 * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4677 * seen by all backends.
4678 */
4679static void
4681{
4685}
4686
4687/*
4688 * Fetch timestamp of latest processed commit/abort record.
4689 * Startup process maintains an accurate local copy in XLogReceiptTime
4690 */
4693{
4694 TimestampTz xtime;
4695
4699
4700 return xtime;
4701}
4702
4703/*
4704 * Returns time of receipt of current chunk of XLOG data, as well as
4705 * whether it was received from streaming replication or from archives.
 *
 * '*rtime' receives the process-local XLogReceiptTime. '*fromStream' is set
 * to true only when XLogReceiptSource is XLOG_FROM_STREAM; any other source
 * yields false. Both output pointers are written unconditionally, so neither
 * may be NULL.
4706 */
4707void
4708GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4709{
4710 /*
4711 * This must be executed in the startup process, since we don't export the
4712 * relevant state to shared memory.
4713 */
4715
4716 *rtime = XLogReceiptTime;
4717 *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4718}
4719
4720/*
4721 * Note that text field supplied is a parameter name and does not require
4722 * translation
4723 */
4724void
4725RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4726{
4727 if (currValue < minValue)
4728 {
4730 {
4731 bool warned_for_promote = false;
4732
4734 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4735 errmsg("hot standby is not possible because of insufficient parameter settings"),
4736 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4737 param_name,
4738 currValue,
4739 minValue)));
4740
4741 SetRecoveryPause(true);
4742
4743 ereport(LOG,
4744 (errmsg("recovery has paused"),
4745 errdetail("If recovery is unpaused, the server will shut down."),
4746 errhint("You can then restart the server after making the necessary configuration changes.")));
4747
4749 {
4751
4753 {
4754 if (!warned_for_promote)
4756 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4757 errmsg("promotion is not possible because of insufficient parameter settings"),
4758
4759 /*
4760 * Repeat the detail from above so it's easy to find
4761 * in the log.
4762 */
4763 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4764 param_name,
4765 currValue,
4766 minValue),
4767 errhint("Restart the server after making the necessary configuration changes.")));
4768 warned_for_promote = true;
4769 }
4770
4771 /*
4772 * If recovery pause is requested then set it paused. While
4773 * we are in the loop, user might resume and pause again so
4774 * set this every time.
4775 */
4777
4778 /*
4779 * We wait on a condition variable that will wake us as soon
4780 * as the pause ends, but we use a timeout so we can check the
4781 * above conditions periodically too.
4782 */
4784 WAIT_EVENT_RECOVERY_PAUSE);
4785 }
4787 }
4788
4789 ereport(FATAL,
4790 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4791 errmsg("recovery aborted because of insufficient parameter settings"),
4792 /* Repeat the detail from above so it's easy to find in the log. */
4793 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4794 param_name,
4795 currValue,
4796 minValue),
4797 errhint("You can restart the server after making the necessary configuration changes.")));
4798 }
4799}
4800
4801
4802/*
4803 * GUC check_hook for primary_slot_name
4804 */
4805bool
4807{
4808 int err_code;
4809 char *err_msg = NULL;
4810 char *err_hint = NULL;
4811
4812 if (*newval && strcmp(*newval, "") != 0 &&
4813 !ReplicationSlotValidateNameInternal(*newval, false, &err_code,
4814 &err_msg, &err_hint))
4815 {
4816 GUC_check_errcode(err_code);
4817 GUC_check_errdetail("%s", err_msg);
4818 if (err_hint != NULL)
4819 GUC_check_errhint("%s", err_hint);
4820 return false;
4821 }
4822
4823 return true;
4824}
4825
4826/*
4827 * Recovery target settings: Only one of the several recovery_target* settings
4828 * may be set. Setting a second one results in an error. The global variable
4829 * recoveryTarget tracks which kind of recovery target was chosen. Other
4830 * variables store the actual target value (for example a string or an XID).
4831 * The assign functions of the parameters check whether a competing parameter
4832 * was already set. But we want to allow setting the same parameter multiple
4833 * times. We also want to allow unsetting a parameter and setting a different
4834 * one, so we unset recoveryTarget when the parameter is set to an empty
4835 * string.
4836 *
4837 * XXX this code is broken by design. Throwing an error from a GUC assign
4838 * hook breaks fundamental assumptions of guc.c. So long as all the variables
4839 * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4840 * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4841 * that we have odd behaviors such as unexpected GUC ordering dependencies.
4842 */
4843
4844pg_noreturn static void
4846{
4847 ereport(ERROR,
4848 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4849 errmsg("multiple recovery targets specified"),
4850 errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4851}
4852
4853/*
4854 * GUC check_hook for recovery_target
4855 */
4856bool
4858{
4859 if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4860 {
4861 GUC_check_errdetail("The only allowed value is \"immediate\".");
4862 return false;
4863 }
4864 return true;
4865}
4866
4867/*
4868 * GUC assign_hook for recovery_target
4869 */
4870void
4871assign_recovery_target(const char *newval, void *extra)
4872{
4876
4877 if (newval && strcmp(newval, "") != 0)
4879 else
4881}
4882
4883/*
4884 * GUC check_hook for recovery_target_lsn
4885 */
4886bool
4888{
4889 if (strcmp(*newval, "") != 0)
4890 {
4891 XLogRecPtr lsn;
4892 XLogRecPtr *myextra;
4893 ErrorSaveContext escontext = {T_ErrorSaveContext};
4894
4895 lsn = pg_lsn_in_safe(*newval, (Node *) &escontext);
4896 if (escontext.error_occurred)
4897 return false;
4898
4899 myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr));
4900 if (!myextra)
4901 return false;
4902 *myextra = lsn;
4903 *extra = myextra;
4904 }
4905 return true;
4906}
4907
4908/*
4909 * GUC assign_hook for recovery_target_lsn
4910 */
4911void
4912assign_recovery_target_lsn(const char *newval, void *extra)
4913{
4917
4918 if (newval && strcmp(newval, "") != 0)
4919 {
4921 recoveryTargetLSN = *((XLogRecPtr *) extra);
4922 }
4923 else
4925}
4926
4927/*
4928 * GUC check_hook for recovery_target_name
4929 */
4930bool
4932{
4933 /* Use the value of newval directly */
4934 if (strlen(*newval) >= MAXFNAMELEN)
4935 {
4936 GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4937 "recovery_target_name", MAXFNAMELEN - 1);
4938 return false;
4939 }
4940 return true;
4941}
4942
4943/*
4944 * GUC assign_hook for recovery_target_name
4945 */
4946void
4947assign_recovery_target_name(const char *newval, void *extra)
4948{
4952
4953 if (newval && strcmp(newval, "") != 0)
4954 {
4957 }
4958 else
4960}
4961
4962/*
4963 * GUC check_hook for recovery_target_time
4964 *
4965 * The interpretation of the recovery_target_time string can depend on the
4966 * time zone setting, so we need to wait until after all GUC processing is
4967 * done before we can do the final parsing of the string. This check function
4968 * only does a parsing pass to catch syntax errors, but we store the string
4969 * and parse it again when we need to use it.
4970 */
4971bool
4973{
4974 if (strcmp(*newval, "") != 0)
4975 {
4976 /* reject some special values */
4977 if (strcmp(*newval, "now") == 0 ||
4978 strcmp(*newval, "today") == 0 ||
4979 strcmp(*newval, "tomorrow") == 0 ||
4980 strcmp(*newval, "yesterday") == 0)
4981 {
4982 return false;
4983 }
4984
4985 /*
4986 * parse timestamp value (see also timestamptz_in())
4987 */
4988 {
4989 char *str = *newval;
4990 fsec_t fsec;
4991 struct pg_tm tt,
4992 *tm = &tt;
4993 int tz;
4994 int dtype;
4995 int nf;
4996 int dterr;
4997 char *field[MAXDATEFIELDS];
4998 int ftype[MAXDATEFIELDS];
4999 char workbuf[MAXDATELEN + MAXDATEFIELDS];
5000 DateTimeErrorExtra dtextra;
5002
5003 dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
5004 field, ftype, MAXDATEFIELDS, &nf);
5005 if (dterr == 0)
5006 dterr = DecodeDateTime(field, ftype, nf,
5007 &dtype, tm, &fsec, &tz, &dtextra);
5008 if (dterr != 0)
5009 return false;
5010 if (dtype != DTK_DATE)
5011 return false;
5012
5013 if (tm2timestamp(tm, fsec, &tz, &timestamp) != 0)
5014 {
5015 GUC_check_errdetail("Timestamp out of range: \"%s\".", str);
5016 return false;
5017 }
5018 }
5019 }
5020 return true;
5021}
5022
5023/*
5024 * GUC assign_hook for recovery_target_time
5025 */
5026void
5027assign_recovery_target_time(const char *newval, void *extra)
5028{
5032
5033 if (newval && strcmp(newval, "") != 0)
5035 else
5037}
5038
5039/*
5040 * GUC check_hook for recovery_target_timeline
5041 */
5042bool
5044{
5047
5048 if (strcmp(*newval, "current") == 0)
5050 else if (strcmp(*newval, "latest") == 0)
5052 else
5053 {
5054 char *endp;
5055 uint64 timeline;
5056
5058
5059 errno = 0;
5060 timeline = strtou64(*newval, &endp, 0);
5061
5062 if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
5063 {
5064 GUC_check_errdetail("\"%s\" is not a valid number.",
5065 "recovery_target_timeline");
5066 return false;
5067 }
5068
5069 if (timeline < 1 || timeline > PG_UINT32_MAX)
5070 {
5071 GUC_check_errdetail("\"%s\" must be between %u and %u.",
5072 "recovery_target_timeline", 1, UINT_MAX);
5073 return false;
5074 }
5075 }
5076
5078 if (!myextra)
5079 return false;
5080 *myextra = rttg;
5081 *extra = myextra;
5082
5083 return true;
5084}
5085
5086/*
5087 * GUC assign_hook for recovery_target_timeline
5088 */
5089void
5091{
5094 recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
5095 else
5097}
5098
5099/*
5100 * GUC check_hook for recovery_target_xid
5101 */
5102bool
5104{
5105 if (strcmp(*newval, "") != 0)
5106 {
5107 TransactionId xid;
5108 TransactionId *myextra;
5109
5110 errno = 0;
5111 xid = (TransactionId) strtou64(*newval, NULL, 0);
5112 if (errno == EINVAL || errno == ERANGE)
5113 return false;
5114
5115 myextra = (TransactionId *) guc_malloc(LOG, sizeof(TransactionId));
5116 if (!myextra)
5117 return false;
5118 *myextra = xid;
5119 *extra = myextra;
5120 }
5121 return true;
5122}
5123
5124/*
5125 * GUC assign_hook for recovery_target_xid
5126 */
5127void
5128assign_recovery_target_xid(const char *newval, void *extra)
5129{
5133
5134 if (newval && strcmp(newval, "") != 0)
5135 {
5137 recoveryTargetXid = *((TransactionId *) extra);
5138 }
5139 else
5141}
static uint64 pg_atomic_read_u64(volatile pg_atomic_uint64 *ptr)
Definition: atomics.h:467
List * readTimeLineHistory(TimeLineID targetTLI)
Definition: timeline.c:76
TimeLineID findNewestTimeLine(TimeLineID startTLI)
Definition: timeline.c:264
TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history)
Definition: timeline.c:544
XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI)
Definition: timeline.c:572
bool existsTimeLineHistory(TimeLineID probeTLI)
Definition: timeline.c:222
void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end)
Definition: timeline.c:50
bool tliInHistory(TimeLineID tli, List *expectedTLEs)
Definition: timeline.c:526
void remove_tablespace_symlink(const char *linkloc)
Definition: tablespace.c:883
bool allow_in_place_tablespaces
Definition: tablespace.c:85
void disable_startup_progress_timeout(void)
Definition: startup.c:309
bool IsPromoteSignaled(void)
Definition: startup.c:288
void begin_startup_progress_phase(void)
Definition: startup.c:343
void ProcessStartupProcInterrupts(void)
Definition: startup.c:154
void ResetPromoteSignaled(void)
Definition: startup.c:294
int ParseDateTime(const char *timestr, char *workbuf, size_t buflen, char **field, int *ftype, int maxfields, int *numfields)
Definition: datetime.c:773
int DecodeDateTime(char **field, int *ftype, int nf, int *dtype, struct pg_tm *tm, fsec_t *fsec, int *tzp, DateTimeErrorExtra *extra)
Definition: datetime.c:997
long TimestampDifferenceMilliseconds(TimestampTz start_time, TimestampTz stop_time)
Definition: timestamp.c:1757
int tm2timestamp(struct pg_tm *tm, fsec_t fsec, int *tzp, Timestamp *result)
Definition: timestamp.c:2006
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1781
Datum timestamptz_in(PG_FUNCTION_ARGS)
Definition: timestamp.c:418
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1645
const char * timestamptz_to_str(TimestampTz t)
Definition: timestamp.c:1862
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1609
uint32 BlockNumber
Definition: block.h:31
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
void LockBuffer(Buffer buffer, BufferLockMode mode)
Definition: bufmgr.c:5699
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5478
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:436
@ BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:207
@ RBM_NORMAL_NO_LOG
Definition: bufmgr.h:52
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:387
PageData * Page
Definition: bufpage.h:81
static XLogRecPtr PageGetLSN(const PageData *page)
Definition: bufpage.h:385
uint8_t uint8
Definition: c.h:550
#define PG_UINT32_MAX
Definition: c.h:609
#define pg_noreturn
Definition: c.h:170
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:229
#define PG_BINARY
Definition: c.h:1250
#define UINT64_FORMAT
Definition: c.h:571
int32_t int32
Definition: c.h:548
uint64_t uint64
Definition: c.h:553
uint32_t uint32
Definition: c.h:552
uint32 TransactionId
Definition: c.h:671
size_t Size
Definition: c.h:624
void RequestCheckpoint(int flags)
bool ConditionVariableCancelSleep(void)
bool ConditionVariableTimedSleep(ConditionVariable *cv, long timeout, uint32 wait_event_info)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
int64 TimestampTz
Definition: timestamp.h:39
int32 fsec_t
Definition: timestamp.h:41
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1170
int errcode_for_file_access(void)
Definition: elog.c:886
int errdetail(const char *fmt,...)
Definition: elog.c:1216
ErrorContextCallback * error_context_stack
Definition: elog.c:95
int errhint(const char *fmt,...)
Definition: elog.c:1330
int errcode(int sqlerrcode)
Definition: elog.c:863
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define LOG
Definition: elog.h:31
#define errcontext
Definition: elog.h:198
#define FATAL
Definition: elog.h:41
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1108
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:779
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1086
int FreeFile(FILE *file)
Definition: fd.c:2823
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2887
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2953
int pg_fsync(int fd)
Definition: fd.c:386
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2624
#define palloc_object(type)
Definition: fe_memutils.h:74
#define palloc0_object(type)
Definition: fe_memutils.h:75
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:547
@ PGFILETYPE_LNK
Definition: file_utils.h:24
#define DirectFunctionCall3(func, arg1, arg2, arg3)
Definition: fmgr.h:686
bool IsUnderPostmaster
Definition: globals.c:120
char * DataDir
Definition: globals.c:71
bool IsPostmasterEnvironment
Definition: globals.c:119
void GUC_check_errcode(int sqlerrcode)
Definition: guc.c:6628
void * guc_malloc(int elevel, size_t size)
Definition: guc.c:636
#define newval
#define GUC_check_errdetail
Definition: guc.h:505
GucSource
Definition: guc.h:112
#define GUC_check_errhint
Definition: guc.h:509
Assert(PointerIsAligned(start, uint64))
const char * str
#define MAXDATEFIELDS
Definition: datetime.h:202
#define DTK_DATE
Definition: datetime.h:144
#define MAXDATELEN
Definition: datetime.h:200
#define close(a)
Definition: win32.h:12
void proc_exit(int code)
Definition: ipc.c:104
int i
Definition: isn.c:77
void OwnLatch(Latch *latch)
Definition: latch.c:126
void DisownLatch(Latch *latch)
Definition: latch.c:144
void InitSharedLatch(Latch *latch)
Definition: latch.c:93
void SetLatch(Latch *latch)
Definition: latch.c:290
void ResetLatch(Latch *latch)
Definition: latch.c:374
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:172
List * lappend(List *list, void *datum)
Definition: list.c:339
void list_free_deep(List *list)
Definition: list.c:1560
static struct pg_tm tm
Definition: localtime.c:104
char * pstrdup(const char *in)
Definition: mcxt.c:1781
void pfree(void *pointer)
Definition: mcxt.c:1616
void * palloc(Size size)
Definition: mcxt.c:1387
#define AmStartupProcess()
Definition: miscadmin.h:390
#define IsBootstrapProcessingMode()
Definition: miscadmin.h:477
void * arg
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:42
#define MAXPGPATH
#define XLOG_RESTORE_POINT
Definition: pg_control.h:76
#define XLOG_CHECKPOINT_REDO
Definition: pg_control.h:83
#define XLOG_OVERWRITE_CONTRECORD
Definition: pg_control.h:82
DBState
Definition: pg_control.h:92
@ DB_IN_ARCHIVE_RECOVERY
Definition: pg_control.h:98
@ DB_SHUTDOWNED_IN_RECOVERY
Definition: pg_control.h:95
@ DB_SHUTDOWNED
Definition: pg_control.h:94
@ DB_IN_CRASH_RECOVERY
Definition: pg_control.h:97
#define XLOG_CHECKPOINT_SHUTDOWN
Definition: pg_control.h:69
#define XLOG_BACKUP_END
Definition: pg_control.h:74
#define XLOG_CHECKPOINT_ONLINE
Definition: pg_control.h:70
#define XLOG_END_OF_RECOVERY
Definition: pg_control.h:78
const void size_t len
#define lfirst(lc)
Definition: pg_list.h:172
#define NIL
Definition: pg_list.h:68
XLogRecPtr pg_lsn_in_safe(const char *str, Node *escontext)
Definition: pg_lsn.c:32
static rewind_source * source
Definition: pg_rewind.c:89
const char * pg_rusage_show(const PGRUsage *ru0)
Definition: pg_rusage.c:40
void pg_rusage_init(PGRUsage *ru0)
Definition: pg_rusage.c:27
static char buf[DEFAULT_XLOG_SEG_SIZE]
Definition: pg_test_fsync.c:71
@ IOOBJECT_WAL
Definition: pgstat.h:279
@ IOCONTEXT_NORMAL
Definition: pgstat.h:289
@ IOOP_READ
Definition: pgstat.h:315
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition: pgstat_io.c:91
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:122
int64 timestamp
void SendPostmasterSignal(PMSignalReason reason)
Definition: pmsignal.c:165
@ PMSIGNAL_RECOVERY_STARTED
Definition: pmsignal.h:35
@ PMSIGNAL_BEGIN_HOT_STANDBY
Definition: pmsignal.h:37
@ PMSIGNAL_RECOVERY_CONSISTENT
Definition: pmsignal.h:36
#define pg_pread
Definition: port.h:247
#define snprintf
Definition: port.h:260
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
off_t pgoff_t
Definition: port.h:421
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:262
static Datum CStringGetDatum(const char *X)
Definition: postgres.h:360
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:222
#define InvalidOid
Definition: postgres_ext.h:37
static int fd(const char *x, int i)
Definition: preproc-init.c:105
void RecordKnownAssignedTransactionIds(TransactionId xid)
Definition: procarray.c:4382
void KnownAssignedTransactionIdsIdleMaintenance(void)
Definition: procarray.c:4543
static void set_ps_display(const char *activity)
Definition: ps_status.h:40
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
ForkNumber
Definition: relpath.h:56
@ MAIN_FORKNUM
Definition: relpath.h:58
#define PG_TBLSPC_DIR
Definition: relpath.h:41
void RmgrStartup(void)
Definition: rmgr.c:58
void RmgrCleanup(void)
Definition: rmgr.c:74
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:389
bool ReplicationSlotValidateNameInternal(const char *name, bool allow_reserved_name, int *err_code, char **err_msg, char **err_hint)
Definition: slot.c:311
void ShutDownSlotSync(void)
Definition: slotsync.c:1784
#define SpinLockInit(lock)
Definition: spin.h:57
#define SpinLockRelease(lock)
Definition: spin.h:61
#define SpinLockAcquire(lock)
Definition: spin.h:59
#define ereport_startup_progress(msg,...)
Definition: startup.h:18
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition: stringinfo.c:145
void appendStringInfoString(StringInfo str, const char *s)
Definition: stringinfo.c:230
void appendStringInfoChar(StringInfo str, char ch)
Definition: stringinfo.c:242
void initStringInfo(StringInfo str)
Definition: stringinfo.c:97
Oid oldestMultiDB
Definition: pg_control.h:52
MultiXactId oldestMulti
Definition: pg_control.h:51
MultiXactOffset nextMultiOffset
Definition: pg_control.h:48
TransactionId newestCommitTsXid
Definition: pg_control.h:56
TransactionId oldestXid
Definition: pg_control.h:49
TimeLineID PrevTimeLineID
Definition: pg_control.h:40
TimeLineID ThisTimeLineID
Definition: pg_control.h:39
Oid nextOid
Definition: pg_control.h:46
MultiXactId nextMulti
Definition: pg_control.h:47
FullTransactionId nextXid
Definition: pg_control.h:45
TransactionId oldestCommitTsXid
Definition: pg_control.h:54
XLogRecPtr redo
Definition: pg_control.h:37
Oid oldestXidDB
Definition: pg_control.h:50
XLogRecPtr backupStartPoint
Definition: pg_control.h:172
bool backupEndRequired
Definition: pg_control.h:174
CheckPoint checkPointCopy
Definition: pg_control.h:137
XLogRecPtr backupEndPoint
Definition: pg_control.h:173
XLogRecPtr minRecoveryPoint
Definition: pg_control.h:170
XLogRecPtr checkPoint
Definition: pg_control.h:135
uint64 system_identifier
Definition: pg_control.h:112
TimeLineID minRecoveryPointTLI
Definition: pg_control.h:171
Definition: dirent.c:26
XLogRecPtr lastPageBeginPtr
Definition: xlogrecovery.h:121
XLogRecPtr abortedRecPtr
Definition: xlogrecovery.h:130
XLogRecPtr missingContrecPtr
Definition: xlogrecovery.h:131
TimeLineID endOfLogTLI
Definition: xlogrecovery.h:119
struct ErrorContextCallback * previous
Definition: elog.h:297
void(* callback)(void *arg)
Definition: elog.h:298
bool error_occurred
Definition: miscnodes.h:47
Definition: latch.h:114
Definition: pg_list.h:54
Definition: nodes.h:135
RelFileNumber relNumber
const char *(* rm_identify)(uint8 info)
void(* rm_mask)(char *pagedata, BlockNumber blkno)
void(* rm_redo)(XLogReaderState *record)
const char * rm_name
void(* rm_desc)(StringInfo buf, XLogReaderState *record)
XLogRecPtr begin
Definition: timeline.h:28
TimeLineID tli
Definition: timeline.h:27
XLogRecPtr end
Definition: timeline.h:29
TimeLineID ws_tli
Definition: xlogreader.h:49
pg_atomic_uint64 minWaitedLSN[WAIT_LSN_TYPE_COUNT]
Definition: xlogwait.h:85
TimeLineID replayTLI
Definition: xlogrecovery.c:202
XLogRecPtr missingContrecPtr
Definition: xlogreader.h:214
char * errormsg_buf
Definition: xlogreader.h:310
XLogRecPtr EndRecPtr
Definition: xlogreader.h:206
uint64 system_identifier
Definition: xlogreader.h:190
XLogRecPtr ReadRecPtr
Definition: xlogreader.h:205
XLogRecPtr abortedRecPtr
Definition: xlogreader.h:213
TimeLineID latestPageTLI
Definition: xlogreader.h:279
XLogRecPtr overwrittenRecPtr
Definition: xlogreader.h:216
XLogRecPtr latestPagePtr
Definition: xlogreader.h:278
WALOpenSegment seg
Definition: xlogreader.h:271
void * private_data
Definition: xlogreader.h:195
uint8 xl_info
Definition: xlogrecord.h:46
uint32 xl_tot_len
Definition: xlogrecord.h:43
TransactionId xl_xid
Definition: xlogrecord.h:44
RmgrId xl_rmid
Definition: xlogrecord.h:47
ConditionVariable recoveryNotPausedCV
Definition: xlogrecovery.c:367
XLogRecPtr lastReplayedEndRecPtr
Definition: xlogrecovery.c:347
TimeLineID replayEndTLI
Definition: xlogrecovery.c:356
TimeLineID lastReplayedTLI
Definition: xlogrecovery.c:348
TimestampTz currentChunkStartTime
Definition: xlogrecovery.c:364
XLogRecPtr replayEndRecPtr
Definition: xlogrecovery.c:355
TimestampTz recoveryLastXTime
Definition: xlogrecovery.c:358
RecoveryPauseState recoveryPauseState
Definition: xlogrecovery.c:366
XLogRecPtr lastReplayedReadRecPtr
Definition: xlogrecovery.c:346
Definition: guc.h:174
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
Definition: pgtime.h:35
Definition: regguts.h:323
TimeLineID PrevTimeLineID
TimeLineID ThisTimeLineID
char rp_name[MAXFNAMELEN]
TransactionId twophase_xid
Definition: xact.h:428
TransactionId twophase_xid
Definition: xact.h:398
#define InvalidTransactionId
Definition: transam.h:31
#define U64FromFullTransactionId(x)
Definition: transam.h:49
#define XidFromFullTransactionId(x)
Definition: transam.h:48
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define TransactionIdIsNormal(xid)
Definition: transam.h:42
#define TimestampTzPlusMilliseconds(tz, ms)
Definition: timestamp.h:85
static TimestampTz DatumGetTimestampTz(Datum X)
Definition: timestamp.h:34
void AdvanceNextFullTransactionIdPastXid(TransactionId xid)
Definition: varsup.c:304
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:69
static void pgstat_report_wait_end(void)
Definition: wait_event.h:85
#define WL_TIMEOUT
Definition: waiteventset.h:37
#define WL_EXIT_ON_PM_DEATH
Definition: waiteventset.h:39
#define WL_LATCH_SET
Definition: waiteventset.h:34
void WalRcvForceReply(void)
Definition: walreceiver.c:1368
#define AllowCascadeReplication()
Definition: walreceiver.h:40
@ WALRCV_STOPPING
Definition: walreceiver.h:53
XLogRecPtr GetWalRcvFlushRecPtr(XLogRecPtr *latestChunkStart, TimeLineID *receiveTLI)
bool WalRcvStreaming(void)
void RequestXLogStreaming(TimeLineID tli, XLogRecPtr recptr, const char *conninfo, const char *slotname, bool create_temp_slot)
WalRcvState WalRcvGetState(void)
bool WalRcvRunning(void)
void WalSndWakeup(bool physical, bool logical)
Definition: walsender.c:3800
#define stat
Definition: win32_port.h:274
#define S_IRUSR
Definition: win32_port.h:279
#define symlink(oldpath, newpath)
Definition: win32_port.h:225
#define S_IWUSR
Definition: win32_port.h:282
#define XLOG_XACT_COMMIT_PREPARED
Definition: xact.h:173
#define XLOG_XACT_COMMIT
Definition: xact.h:170
#define XLOG_XACT_OPMASK
Definition: xact.h:180
#define XLOG_XACT_ABORT
Definition: xact.h:172
#define XLOG_XACT_ABORT_PREPARED
Definition: xact.h:174
void ParseCommitRecord(uint8 info, xl_xact_commit *xlrec, xl_xact_parsed_commit *parsed)
Definition: xactdesc.c:35
void ParseAbortRecord(uint8 info, xl_xact_abort *xlrec, xl_xact_parsed_abort *parsed)
Definition: xactdesc.c:141
int wal_decode_buffer_size
Definition: xlog.c:139
bool EnableHotStandby
Definition: xlog.c:124
XLogRecPtr GetRedoRecPtr(void)
Definition: xlog.c:6564
void SetInstallXLogFileSegmentActive(void)
Definition: xlog.c:9644
bool IsInstallXLogFileSegmentActive(void)
Definition: xlog.c:9661
int wal_segment_size
Definition: xlog.c:146
void SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI)
Definition: xlog.c:6336
void RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
Definition: xlog.c:3978
void ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
Definition: xlog.c:6374
void ResetInstallXLogFileSegmentActive(void)
Definition: xlog.c:9653
int wal_retrieve_retry_interval
Definition: xlog.c:137
bool track_wal_io_timing
Definition: xlog.c:140
static ControlFileData * ControlFile
Definition: xlog.c:576
void XLogShutdownWalRcv(void)
Definition: xlog.c:9634
bool XLogCheckpointNeeded(XLogSegNo new_segno)
Definition: xlog.c:2284
#define TABLESPACE_MAP_OLD
Definition: xlog.h:322
#define TABLESPACE_MAP
Definition: xlog.h:321
#define STANDBY_SIGNAL_FILE
Definition: xlog.h:317
#define CHECKPOINT_CAUSE_XLOG
Definition: xlog.h:159
#define PROMOTE_SIGNAL_FILE
Definition: xlog.h:325
#define BACKUP_LABEL_FILE
Definition: xlog.h:318
#define RECOVERY_SIGNAL_FILE
Definition: xlog.h:316
static RmgrData GetRmgr(RmgrId rmid)
#define XLogSegmentOffset(xlogptr, wal_segsz_bytes)
#define MAXFNAMELEN
#define XLOGDIR
#define XLByteToSeg(xlrp, logSegNo, wal_segsz_bytes)
static void XLogFilePath(char *path, TimeLineID tli, XLogSegNo logSegNo, int wal_segsz_bytes)
#define XRecOffIsValid(xlrp)
static void XLogFileName(char *fname, TimeLineID tli, XLogSegNo logSegNo, int wal_segsz_bytes)
#define XLByteInSeg(xlrp, logSegNo, wal_segsz_bytes)
bool RestoreArchivedFile(char *path, const char *xlogfname, const char *recovername, off_t expectedSize, bool cleanupEnabled)
Definition: xlogarchive.c:54
void KeepFileRestoredFromArchive(const char *path, const char *xlogfname)
Definition: xlogarchive.c:358
#define XLogRecPtrIsValid(r)
Definition: xlogdefs.h:29
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:47
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
uint32 TimeLineID
Definition: xlogdefs.h:63
uint64 XLogSegNo
Definition: xlogdefs.h:52
void XLogPrefetcherComputeStats(XLogPrefetcher *prefetcher)
XLogPrefetcher * XLogPrefetcherAllocate(XLogReaderState *reader)
void XLogPrefetchReconfigure(void)
XLogRecord * XLogPrefetcherReadRecord(XLogPrefetcher *prefetcher, char **errmsg)
XLogReaderState * XLogPrefetcherGetReader(XLogPrefetcher *prefetcher)
void XLogPrefetcherBeginRead(XLogPrefetcher *prefetcher, XLogRecPtr recPtr)
void XLogPrefetcherFree(XLogPrefetcher *prefetcher)
bool XLogRecGetBlockTagExtended(XLogReaderState *record, uint8 block_id, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum, Buffer *prefetch_buffer)
Definition: xlogreader.c:2017
XLogReaderState * XLogReaderAllocate(int wal_segment_size, const char *waldir, XLogReaderRoutine *routine, void *private_data)
Definition: xlogreader.c:107
void XLogReaderSetDecodeBuffer(XLogReaderState *state, void *buffer, size_t size)
Definition: xlogreader.c:91
void XLogReaderResetError(XLogReaderState *state)
Definition: xlogreader.c:1376
bool XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, char *phdr)
Definition: xlogreader.c:1235
void XLogReaderFree(XLogReaderState *state)
Definition: xlogreader.c:162
bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
Definition: xlogreader.c:2076
#define XLogRecGetDataLen(decoder)
Definition: xlogreader.h:415
#define XLogRecGetInfo(decoder)
Definition: xlogreader.h:409
#define XLogRecBlockImageApply(decoder, block_id)
Definition: xlogreader.h:424
#define XLogRecGetRmid(decoder)
Definition: xlogreader.h:410
#define XLogRecGetData(decoder)
Definition: xlogreader.h:414
#define XLogRecGetXid(decoder)
Definition: xlogreader.h:411
#define XL_ROUTINE(...)
Definition: xlogreader.h:117
#define XLogRecMaxBlockId(decoder)
Definition: xlogreader.h:417
XLogPageReadResult
Definition: xlogreader.h:349
@ XLREAD_WOULDBLOCK
Definition: xlogreader.h:352
@ XLREAD_SUCCESS
Definition: xlogreader.h:350
@ XLREAD_FAIL
Definition: xlogreader.h:351
#define XLogRecHasBlockImage(decoder, block_id)
Definition: xlogreader.h:422
#define XLogRecGetPrev(decoder)
Definition: xlogreader.h:408
#define XLogRecHasAnyBlockRefs(decoder)
Definition: xlogreader.h:416
#define SizeOfXLogRecordDataHeaderShort
Definition: xlogrecord.h:217
#define XLR_INFO_MASK
Definition: xlogrecord.h:62
#define SizeOfXLogRecord
Definition: xlogrecord.h:55
#define XLR_CHECK_CONSISTENCY
Definition: xlogrecord.h:91
bool reachedConsistency
Definition: xlogrecovery.c:302
bool check_primary_slot_name(char **newval, void **extra, GucSource source)
static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
static XLogRecPtr recoveryStopLSN
Definition: xlogrecovery.c:389
static bool recoveryStopsBefore(XLogReaderState *record)
static TimestampTz recoveryStopTime
Definition: xlogrecovery.c:388
void assign_recovery_target_xid(const char *newval, void *extra)
static bool CheckForStandbyTrigger(void)
int recovery_min_apply_delay
Definition: xlogrecovery.c:96
bool check_recovery_target(char **newval, void **extra, GucSource source)
static bool backupEndRequired
Definition: xlogrecovery.c:286
bool HotStandbyActive(void)
static char * getRecoveryStopReason(void)
void ShutdownWalRecovery(void)
RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal
Definition: xlogrecovery.c:123
int recoveryTargetAction
Definition: xlogrecovery.c:90
static void rm_redo_error_callback(void *arg)
static bool recoveryApplyDelay(XLogReaderState *record)
bool ArchiveRecoveryRequested
Definition: xlogrecovery.c:140
const char * recoveryTargetName
Definition: xlogrecovery.c:94
static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
bool check_recovery_target_timeline(char **newval, void **extra, GucSource source)
static XLogRecPtr minRecoveryPoint
Definition: xlogrecovery.c:281
static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *readBuf)
static XLogRecPtr backupEndPoint
Definition: xlogrecovery.c:285
const struct config_enum_entry recovery_target_action_options[]
Definition: xlogrecovery.c:77
static void validateRecoveryParameters(void)
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI, TimeLineID replayTLI)
static XLogRecord * ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr, TimeLineID replayTLI)
void StartupRequestWalReceiverRestart(void)
bool InArchiveRecovery
Definition: xlogrecovery.c:141
static bool recoveryStopsAfter(XLogReaderState *record)
void RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
char * PrimarySlotName
Definition: xlogrecovery.c:100
static TimeLineID curFileTLI
Definition: xlogrecovery.c:127
static char recoveryStopName[MAXFNAMELEN]
Definition: xlogrecovery.c:390
static void CheckRecoveryConsistency(void)
static bool pendingWalRcvRestart
Definition: xlogrecovery.c:251
void PerformWalRecovery(void)
static XLogSource XLogReceiptSource
Definition: xlogrecovery.c:262
bool CheckPromoteSignal(void)
struct XLogPageReadPrivate XLogPageReadPrivate
static bool recoveryStopAfter
Definition: xlogrecovery.c:391
static const char *const xlogSourceNames[]
Definition: xlogrecovery.c:221
static TimeLineID RedoStartTLI
Definition: xlogrecovery.c:173
char * recoveryRestoreCommand
Definition: xlogrecovery.c:85
static void verifyBackupPageConsistency(XLogReaderState *record)
static int XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
void assign_recovery_target(const char *newval, void *extra)
void SetRecoveryPause(bool recoveryPause)
static bool lastSourceFailed
Definition: xlogrecovery.c:250
char * archiveCleanupCommand
Definition: xlogrecovery.c:87
XLogRecPtr GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
static TimeLineID receiveTLI
Definition: xlogrecovery.c:266
void WakeupRecovery(void)
void xlog_outdesc(StringInfo buf, XLogReaderState *record)
static bool LocalPromoteIsTriggered
Definition: xlogrecovery.c:185
bool PromoteIsTriggered(void)
TimestampTz GetCurrentChunkReplayStartTime(void)
static void ConfirmRecoveryPaused(void)
static void readRecoverySignalFile(void)
static XLogRecPtr missingContrecPtr
Definition: xlogrecovery.c:381
static XLogRecoveryCtlData * XLogRecoveryCtl
Definition: xlogrecovery.c:372
static uint32 readOff
Definition: xlogrecovery.c:235
static bool standby_signal_file_found
Definition: xlogrecovery.c:153
char * recovery_target_time_string
Definition: xlogrecovery.c:92
bool StandbyMode
Definition: xlogrecovery.c:150
static int readFile
Definition: xlogrecovery.c:233
static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, bool fetching_ckpt, XLogRecPtr tliRecPtr, TimeLineID replayTLI, XLogRecPtr replayLSN, bool nonblocking)
XLogRecPtr recoveryTargetLSN
Definition: xlogrecovery.c:95
RecoveryTargetType recoveryTarget
Definition: xlogrecovery.c:88
static bool read_tablespace_map(List **tablespaces)
static bool doRequestWalReceiverReply
Definition: xlogrecovery.c:188
static bool read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, bool *backupEndRequired, bool *backupFromStandby)
static int XLogFileRead(XLogSegNo segno, TimeLineID tli, XLogSource source, bool notfoundOk)
static XLogSource currentSource
Definition: xlogrecovery.c:249
XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI)
void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
static List * expectedTLEs
Definition: xlogrecovery.c:126
static XLogSegNo readSegNo
Definition: xlogrecovery.c:234
void assign_recovery_target_name(const char *newval, void *extra)
static XLogRecPtr abortedRecPtr
Definition: xlogrecovery.c:380
static char * primary_image_masked
Definition: xlogrecovery.c:306
static TimeLineID minRecoveryPointTLI
Definition: xlogrecovery.c:282
static XLogRecord * ReadRecord(XLogPrefetcher *xlogprefetcher, int emode, bool fetching_ckpt, TimeLineID replayTLI)
EndOfWalRecoveryInfo * FinishWalRecovery(void)
void assign_recovery_target_time(const char *newval, void *extra)
static void SetCurrentChunkStartTime(TimestampTz xtime)
static XLogRecPtr CheckPointLoc
Definition: xlogrecovery.c:170
bool check_recovery_target_xid(char **newval, void **extra, GucSource source)
static bool LocalHotStandbyActive
Definition: xlogrecovery.c:179
struct XLogRecoveryCtlData XLogRecoveryCtlData
static bool HotStandbyActiveInReplay(void)
static bool InRedo
Definition: xlogrecovery.c:206
static TransactionId recoveryStopXid
Definition: xlogrecovery.c:387
bool check_recovery_target_time(char **newval, void **extra, GucSource source)
static XLogSource readSource
Definition: xlogrecovery.c:237
static void SetPromoteIsTriggered(void)
#define RECOVERY_COMMAND_FILE
Definition: xlogrecovery.c:71
TransactionId recoveryTargetXid
Definition: xlogrecovery.c:91
XLogSource
Definition: xlogrecovery.c:213
@ XLOG_FROM_PG_WAL
Definition: xlogrecovery.c:216
@ XLOG_FROM_STREAM
Definition: xlogrecovery.c:217
@ XLOG_FROM_ARCHIVE
Definition: xlogrecovery.c:215
@ XLOG_FROM_ANY
Definition: xlogrecovery.c:214
TimeLineID recoveryTargetTLIRequested
Definition: xlogrecovery.c:124
static pg_noreturn void error_multiple_recovery_targets(void)
void InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
Definition: xlogrecovery.c:520
static void xlog_block_info(StringInfo buf, XLogReaderState *record)
static TimestampTz XLogReceiptTime
Definition: xlogrecovery.c:261
static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
Size XLogRecoveryShmemSize(void)
Definition: xlogrecovery.c:455
static char * replay_image_masked
Definition: xlogrecovery.c:305
bool wal_receiver_create_temp_slot
Definition: xlogrecovery.c:101
static void CheckTablespaceDirectory(void)
char * recoveryEndCommand
Definition: xlogrecovery.c:86
RecoveryPauseState GetRecoveryPauseState(void)
TimeLineID recoveryTargetTLI
Definition: xlogrecovery.c:125
static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
void assign_recovery_target_lsn(const char *newval, void *extra)
bool check_recovery_target_lsn(char **newval, void **extra, GucSource source)
static XLogRecPtr RedoStartLSN
Definition: xlogrecovery.c:172
static XLogRecPtr flushedUpto
Definition: xlogrecovery.c:265
void XLogRecoveryShmemInit(void)
Definition: xlogrecovery.c:466
static void recoveryPausesHere(bool endOfRecovery)
static uint32 readLen
Definition: xlogrecovery.c:236
static void EnableStandbyMode(void)
Definition: xlogrecovery.c:486
#define RECOVERY_COMMAND_DONE
Definition: xlogrecovery.c:72
static bool recovery_signal_file_found
Definition: xlogrecovery.c:154
TimestampTz recoveryTargetTime
Definition: xlogrecovery.c:93
TimestampTz GetLatestXTime(void)
char * PrimaryConnInfo
Definition: xlogrecovery.c:99
void XLogRequestWalReceiverReply(void)
static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
static XLogPrefetcher * xlogprefetcher
Definition: xlogrecovery.c:194
static bool StandbyModeRequested
Definition: xlogrecovery.c:149
bool check_recovery_target_name(char **newval, void **extra, GucSource source)
bool recoveryTargetInclusive
Definition: xlogrecovery.c:89
static XLogReaderState * xlogreader
Definition: xlogrecovery.c:191
void RemovePromoteSignalFiles(void)
void assign_recovery_target_timeline(const char *newval, void *extra)
static XLogRecPtr backupStartPoint
Definition: xlogrecovery.c:284
static void SetLatestXTime(TimestampTz xtime)
static TimeLineID CheckPointTLI
Definition: xlogrecovery.c:171
@ RECOVERY_TARGET_ACTION_PAUSE
Definition: xlogrecovery.h:48
@ RECOVERY_TARGET_ACTION_PROMOTE
Definition: xlogrecovery.h:49
@ RECOVERY_TARGET_ACTION_SHUTDOWN
Definition: xlogrecovery.h:50
RecoveryTargetType
Definition: xlogrecovery.h:24
@ RECOVERY_TARGET_IMMEDIATE
Definition: xlogrecovery.h:30
@ RECOVERY_TARGET_TIME
Definition: xlogrecovery.h:27
@ RECOVERY_TARGET_UNSET
Definition: xlogrecovery.h:25
@ RECOVERY_TARGET_XID
Definition: xlogrecovery.h:26
@ RECOVERY_TARGET_LSN
Definition: xlogrecovery.h:29
@ RECOVERY_TARGET_NAME
Definition: xlogrecovery.h:28
RecoveryTargetTimeLineGoal
Definition: xlogrecovery.h:37
@ RECOVERY_TARGET_TIMELINE_NUMERIC
Definition: xlogrecovery.h:40
@ RECOVERY_TARGET_TIMELINE_CONTROLFILE
Definition: xlogrecovery.h:38
@ RECOVERY_TARGET_TIMELINE_LATEST
Definition: xlogrecovery.h:39
RecoveryPauseState
Definition: xlogrecovery.h:55
@ RECOVERY_PAUSED
Definition: xlogrecovery.h:58
@ RECOVERY_NOT_PAUSED
Definition: xlogrecovery.h:56
@ RECOVERY_PAUSE_REQUESTED
Definition: xlogrecovery.h:57
void wal_segment_close(XLogReaderState *state)
Definition: xlogutils.c:831
Buffer XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum, BlockNumber blkno, ReadBufferMode mode, Buffer recent_buffer)
Definition: xlogutils.c:460
HotStandbyState standbyState
Definition: xlogutils.c:53
bool InRecovery
Definition: xlogutils.c:50
void XLogCheckInvalidPages(void)
Definition: xlogutils.c:234
@ STANDBY_SNAPSHOT_READY
Definition: xlogutils.h:55
@ STANDBY_INITIALIZED
Definition: xlogutils.h:53
struct WaitLSNState * waitLSNState
Definition: xlogwait.c:69
void WaitLSNWakeup(WaitLSNType lsnType, XLogRecPtr currentLSN)
Definition: xlogwait.c:318
@ WAIT_LSN_TYPE_STANDBY_REPLAY
Definition: xlogwait.h:39