PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
xlogrecovery.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * xlogrecovery.c
4 * Functions for WAL recovery, standby mode
5 *
6 * This source file contains functions controlling WAL recovery.
7 * InitWalRecovery() initializes the system for crash or archive recovery,
8 * or standby mode, depending on configuration options and the state of
9 * the control file and possible backup label file. PerformWalRecovery()
10 * performs the actual WAL replay, calling the rmgr-specific redo routines.
11 * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12 * and prepares information needed to initialize the WAL for writes. In
13 * addition to these three main functions, there are a bunch of functions
14 * for interrogating recovery state and controlling the recovery process.
15 *
16 *
17 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
18 * Portions Copyright (c) 1994, Regents of the University of California
19 *
20 * src/backend/access/transam/xlogrecovery.c
21 *
22 *-------------------------------------------------------------------------
23 */
24
25#include "postgres.h"
26
27#include <ctype.h>
28#include <math.h>
29#include <time.h>
30#include <sys/stat.h>
31#include <sys/time.h>
32#include <unistd.h>
33
34#include "access/timeline.h"
35#include "access/transam.h"
36#include "access/xact.h"
38#include "access/xlogarchive.h"
40#include "access/xlogreader.h"
41#include "access/xlogrecovery.h"
42#include "access/xlogutils.h"
43#include "backup/basebackup.h"
44#include "catalog/pg_control.h"
45#include "commands/tablespace.h"
46#include "common/file_utils.h"
47#include "miscadmin.h"
48#include "pgstat.h"
49#include "postmaster/bgwriter.h"
50#include "postmaster/startup.h"
51#include "replication/slot.h"
54#include "storage/fd.h"
55#include "storage/ipc.h"
56#include "storage/latch.h"
57#include "storage/pmsignal.h"
58#include "storage/procarray.h"
59#include "storage/spin.h"
60#include "utils/datetime.h"
61#include "utils/fmgrprotos.h"
62#include "utils/guc_hooks.h"
64#include "utils/pg_lsn.h"
65#include "utils/ps_status.h"
66#include "utils/pg_rusage.h"
67
68/* Unsupported old recovery command file names (relative to $PGDATA) */
69#define RECOVERY_COMMAND_FILE "recovery.conf"
70#define RECOVERY_COMMAND_DONE "recovery.done"
71
72/*
73 * GUC support
74 */
76 {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
77 {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
78 {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
79 {NULL, 0, false}
80};
81
82/* options formerly taken from recovery.conf for archive recovery */
84char *recoveryEndCommand = NULL;
95
96/* options formerly taken from recovery.conf for XLOG streaming */
97char *PrimaryConnInfo = NULL;
98char *PrimarySlotName = NULL;
100
101/*
102 * recoveryTargetTimeLineGoal: what the user requested, if any
103 *
104 * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
105 *
106 * recoveryTargetTLI: the currently understood target timeline; this may change during recovery
107 *
108 * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
109 * the timelines of its known parents, newest first (so recoveryTargetTLI is
110 * always the first list member). Only these TLIs are expected to be seen in
111 * the WAL segments we read, and indeed only these TLIs will be considered as
112 * candidate WAL files to open at all.
113 *
114 * curFileTLI: the TLI appearing in the name of the current input WAL file.
115 * (This is not necessarily the same as the timeline from which we are
116 * replaying WAL, which StartupXLOG calls replayTLI, because we could be
117 * scanning data that was copied from an ancestor timeline when the current
118 * file was created.) During a sequential scan we do not allow this value
119 * to decrease.
120 */
126
127/*
128 * When ArchiveRecoveryRequested is set, archive recovery was requested,
129 * i.e. signal files were present. When InArchiveRecovery is set, we are
130 * currently recovering using offline XLOG archives. These variables are only
131 * valid in the startup process.
132 *
133 * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
134 * currently performing crash recovery using only XLOG files in pg_wal, but
135 * will switch to using offline XLOG archives as soon as we reach the end of
136 * WAL in pg_wal.
137 */
139bool InArchiveRecovery = false;
140
141/*
142 * When StandbyModeRequested is set, standby mode was requested, i.e.
143 * standby.signal file was present. When StandbyMode is set, we are currently
144 * in standby mode. These variables are only valid in the startup process.
145 * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
146 */
147static bool StandbyModeRequested = false;
148bool StandbyMode = false;
149
150/* was a signal file present at startup? */
151static bool standby_signal_file_found = false;
152static bool recovery_signal_file_found = false;
153
154/*
155 * CheckPointLoc is the position of the checkpoint record that determines
156 * where to start the replay. It comes from the backup label file or the
157 * control file.
158 *
159 * RedoStartLSN is the checkpoint's REDO location, also from the backup label
160 * file or the control file. In standby mode, XLOG streaming usually starts
161 * from the position where an invalid record was found. But if we fail to
162 * read even the initial checkpoint record, we use the REDO location instead
163 * of the checkpoint location as the start position of XLOG streaming.
164 * Otherwise we would have to jump backwards to the REDO location after
165 * reading the checkpoint record, because the REDO record can precede the
166 * checkpoint record.
167 */
172
173/*
174 * Local copy of SharedHotStandbyActive variable. False actually means "not
175 * known, need to check the shared state".
176 */
177static bool LocalHotStandbyActive = false;
178
179/*
180 * Local copy of SharedPromoteIsTriggered variable. False actually means "not
181 * known, need to check the shared state".
182 */
183static bool LocalPromoteIsTriggered = false;
184
185/* Has the recovery code requested a walreceiver wakeup? */
187
188/* XLogReader object used to parse the WAL records */
190
191/* XLogPrefetcher object used to consume WAL records with read-ahead */
193
194/* Parameters passed down from ReadRecord to the XLogPageRead callback. */
196{
197 int emode;
198 bool fetching_ckpt; /* are we fetching a checkpoint record? */
202
203/* flag to tell XLogPageRead that we have started replaying */
204static bool InRedo = false;
205
206/*
207 * Codes indicating where we got a WAL file from during recovery, or where
208 * to attempt to get one.
209 */
210typedef enum
211{
 /*
  * NB: the enumerators are declared in the same order as the strings in
  * xlogSourceNames[] ("any", "archive", "pg_wal", "stream"); presumably the
  * enum value is used as an index into that array, so keep both in sync.
  */
212 XLOG_FROM_ANY = 0, /* request to read WAL from any source; explicitly zero */
213 XLOG_FROM_ARCHIVE, /* restored using restore_command */
214 XLOG_FROM_PG_WAL, /* existing file in pg_wal */
215 XLOG_FROM_STREAM, /* streamed from primary */
216} XLogSource;
217
218/* human-readable names for XLogSources, for debugging output */
219static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
220
221/*
222 * readFile is -1 or a kernel FD for the log file segment that's currently
223 * open for reading. readSegNo identifies the segment. readOff is the offset
224 * of the page just read, readLen indicates how much of it has been read into
225 * readBuf, and readSource indicates where we got the currently open file from.
226 *
227 * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
228 * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
229 * worthwhile, since the XLOG is not read by general-purpose sessions.
230 */
231static int readFile = -1;
233static uint32 readOff = 0;
234static uint32 readLen = 0;
236
237/*
238 * Keeps track of which source we're currently reading from. This is
239 * different from readSource in that this is always set, even when we don't
240 * currently have a WAL file open. If lastSourceFailed is set, our last
241 * attempt to read from currentSource failed, and we should try another source
242 * next.
243 *
244 * pendingWalRcvRestart is set when a config change occurs that requires a
245 * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
246 */
248static bool lastSourceFailed = false;
249static bool pendingWalRcvRestart = false;
250
251/*
252 * These variables track when we last obtained some WAL data to process,
253 * and where we got it from. (XLogReceiptSource is initially the same as
254 * readSource, but readSource gets reset to zero when we don't have data
255 * to process right now. It is also different from currentSource, which
256 * also changes when we try to read from a source and fail, while
257 * XLogReceiptSource tracks where we last successfully read some WAL.)
258 */
261
262/* Local copy of WalRcv->flushedUpto */
265
266/*
267 * Copy of minRecoveryPoint and backupEndPoint from the control file.
268 *
269 * In order to reach consistency, we must replay the WAL up to
270 * minRecoveryPoint. If backupEndRequired is true, we must also reach
271 * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
272 * to backupStartPoint.
273 *
274 * Note: In archive recovery, after consistency has been reached, the
275 * functions in xlog.c will start updating minRecoveryPoint in the control
276 * file. But this copy of minRecoveryPoint variable reflects the value at the
277 * beginning of recovery, and is *not* updated after consistency is reached.
278 */
281
284static bool backupEndRequired = false;
285
286/*
287 * Have we reached a consistent database state? In crash recovery, we have
288 * to replay all the WAL, so reachedConsistency is never set. During archive
289 * recovery, the database is consistent once minRecoveryPoint is reached.
290 *
291 * Consistent state means that the system is internally consistent, all
292 * the WAL has been replayed up to a certain point, and importantly, there
293 * is no trace of later actions on disk.
294 *
295 * This flag is used only by the startup process and postmaster. When
296 * minRecoveryPoint is reached, the startup process sets it to true and
297 * sends a PMSIGNAL_RECOVERY_CONSISTENT signal to the postmaster,
298 * which then sets it to true upon receiving the signal.
299 */
301
302/* Buffers dedicated to consistency checks of size BLCKSZ */
303static char *replay_image_masked = NULL;
304static char *primary_image_masked = NULL;
305
306
307/*
308 * Shared-memory state for WAL recovery.
309 */
311{
312 /*
313 * SharedHotStandbyActive indicates if we allow hot standby queries to be
314 * run. Protected by info_lck.
315 */
317
318 /*
319 * SharedPromoteIsTriggered indicates if a standby promotion has been
320 * triggered. Protected by info_lck.
321 */
323
324 /*
325 * recoveryWakeupLatch is used to wake up the startup process to continue
326 * WAL replay, if it is waiting for WAL to arrive or promotion to be
327 * requested.
328 *
329 * Note that the startup process also uses another latch, its procLatch,
330 * to wait for recovery conflict. We could get rid of recoveryWakeupLatch
331 * for signaling the startup process in favor of using its procLatch, which
332 * would comport better with possible generic signal handlers using that
333 * latch. But we should not do that, because the startup process doesn't
334 * expect to be woken up by the walreceiver process or the SIGHUP signal
335 * handler while it's waiting for recovery conflict. The separate latches,
336 * recoveryWakeupLatch and procLatch, should be used for inter-process
337 * communication for WAL replay and recovery conflict, respectively.
338 */
340
341 /*
342 * Last record successfully replayed.
343 */
344 XLogRecPtr lastReplayedReadRecPtr; /* start position */
345 XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */
347
348 /*
349 * When we're currently replaying a record, ie. in a redo function,
350 * replayEndRecPtr points to the end+1 of the record being replayed,
351 * otherwise it's equal to lastReplayedEndRecPtr.
352 */
355 /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
357
358 /*
359 * timestamp of when we started replaying the current chunk of WAL data,
360 * only relevant for replication or archive recovery
361 */
363 /* Recovery pause state */
366
367 slock_t info_lck; /* locks shared variables shown above */
369
371
372/*
373 * abortedRecPtr is the start pointer of a broken record at end of WAL when
374 * recovery completes; missingContrecPtr is the location of the first
375 * contrecord that went missing. See CreateOverwriteContrecordRecord for
376 * details.
377 */
380
381/*
382 * if recoveryStopsBefore/After returns true, it saves information of the stop
383 * point here
384 */
390
391/* prototypes for local functions */
392static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
393
394static void EnableStandbyMode(void);
395static void readRecoverySignalFile(void);
396static void validateRecoveryParameters(void);
397static bool read_backup_label(XLogRecPtr *checkPointLoc,
398 TimeLineID *backupLabelTLI,
399 bool *backupEndRequired, bool *backupFromStandby);
400static bool read_tablespace_map(List **tablespaces);
401
402static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
403static void CheckRecoveryConsistency(void);
404static void rm_redo_error_callback(void *arg);
405#ifdef WAL_DEBUG
406static void xlog_outrec(StringInfo buf, XLogReaderState *record);
407#endif
408static void xlog_block_info(StringInfo buf, XLogReaderState *record);
409static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
410 TimeLineID prevTLI, TimeLineID replayTLI);
411static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
413
414static bool recoveryStopsBefore(XLogReaderState *record);
415static bool recoveryStopsAfter(XLogReaderState *record);
416static char *getRecoveryStopReason(void);
417static void recoveryPausesHere(bool endOfRecovery);
418static bool recoveryApplyDelay(XLogReaderState *record);
419static void ConfirmRecoveryPaused(void);
420
422 int emode, bool fetching_ckpt,
423 TimeLineID replayTLI);
424
425static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
426 int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
428 bool randAccess,
429 bool fetching_ckpt,
430 XLogRecPtr tliRecPtr,
431 TimeLineID replayTLI,
432 XLogRecPtr replayLSN,
433 bool nonblocking);
434static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
436 XLogRecPtr RecPtr, TimeLineID replayTLI);
437static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
438static int XLogFileRead(XLogSegNo segno, TimeLineID tli,
439 XLogSource source, bool notfoundOk);
441
442static bool CheckForStandbyTrigger(void);
443static void SetPromoteIsTriggered(void);
444static bool HotStandbyActiveInReplay(void);
445
446static void SetCurrentChunkStartTime(TimestampTz xtime);
447static void SetLatestXTime(TimestampTz xtime);
448
449/*
450 * Initialization of shared memory for WAL recovery
451 */
452Size
454{
455 Size size;
456
457 /* XLogRecoveryCtl: a single XLogRecoveryCtlData is all that is counted */
458 size = sizeof(XLogRecoveryCtlData);
459
 /* number of bytes of shared memory needed for the WAL recovery state */
460 return size;
461}
462
463void
465{
466 bool found;
467
 /*
  * Request the shared area named "XLOG Recovery Ctl", sized by
  * XLogRecoveryShmemSize(). ShmemInitStruct() fills in "found".
  */
469 ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
 /*
  * When "found" is set, skip the zeroing below -- presumably the area
  * already exists and was initialized earlier; confirm against
  * ShmemInitStruct()'s contract.
  */
470 if (found)
471 return;
 /* otherwise start from an all-zeroes XLogRecoveryCtlData */
472 memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
473
477}
478
479/*
480 * A thin wrapper to enable StandbyMode and do other preparatory work as
481 * needed.
482 */
483static void
485{
 /* set the file-level flag that says we are currently in standby mode */
486 StandbyMode = true;
487
488 /*
489 * To avoid server log bloat, we don't report recovery progress in a
490 * standby as it will always be in recovery unless promoted. We disable
491 * startup progress timeout in standby mode to avoid calling
492 * startup_progress_timeout_handler() unnecessarily.
493 */
495}
496
497/*
498 * Prepare the system for WAL recovery, if needed.
499 *
500 * This is called by StartupXLOG() which coordinates the server startup
501 * sequence. This function analyzes the control file and the backup label
502 * file, if any, and figures out whether we need to perform crash recovery or
503 * archive recovery, and how far we need to replay the WAL to reach a
504 * consistent state.
505 *
506 * This doesn't yet change the on-disk state, except for creating the symlinks
507 * from table space map file if any, and for fetching WAL files needed to find
508 * the checkpoint record. On entry, the caller has already read the control
509 * file into memory, and passes it as argument. This function updates it to
510 * reflect the recovery state, and the caller is expected to write it back to
511 * disk after initializing other subsystems, but before calling
512 * PerformWalRecovery().
513 *
514 * This initializes some global variables like ArchiveRecoveryRequested,
515 * StandbyModeRequested and InRecovery.
516 */
517void
519 bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
520{
521 XLogPageReadPrivate *private;
522 struct stat st;
523 bool wasShutdown;
524 XLogRecord *record;
525 DBState dbstate_at_startup;
526 bool haveTblspcMap = false;
527 bool haveBackupLabel = false;
528 CheckPoint checkPoint;
529 bool backupFromStandby = false;
530
 /* remember the control file state as found on entry; checked again below */
531 dbstate_at_startup = ControlFile->state;
532
533 /*
534 * Initialize on the assumption we want to recover to the latest timeline
535 * that's active according to pg_control.
536 */
540 else
542
543 /*
544 * Check for signal files, and if so set up state for offline recovery
545 */
548
549 /*
550 * Take ownership of the wakeup latch if we're going to sleep during
551 * recovery.
552 */
555
556 /*
557 * Set the WAL reading processor now, as it will be needed when reading
558 * the checkpoint record required (backup_label or not).
559 */
 /* zero-filled private state, handed to the reader as its last argument */
560 private = palloc0(sizeof(XLogPageReadPrivate));
561 xlogreader =
563 XL_ROUTINE(.page_read = &XLogPageRead,
564 .segment_open = NULL,
565 .segment_close = wal_segment_close),
566 private);
567 if (!xlogreader)
569 (errcode(ERRCODE_OUT_OF_MEMORY),
570 errmsg("out of memory"),
571 errdetail("Failed while allocating a WAL reading processor.")));
573
574 /*
575 * Set the WAL decode buffer size. This limits how far ahead we can read
576 * in the WAL.
577 */
579
580 /* Create a WAL prefetcher. */
582
583 /*
584 * Allocate two page buffers dedicated to WAL consistency checks. We do
585 * it this way, rather than just making static arrays, for two reasons:
586 * (1) no need to waste the storage in most instantiations of the backend;
587 * (2) a static char array isn't guaranteed to have any particular
588 * alignment, whereas palloc() will provide MAXALIGN'd storage.
589 */
590 replay_image_masked = (char *) palloc(BLCKSZ);
591 primary_image_masked = (char *) palloc(BLCKSZ);
592
593 /*
594 * Read the backup_label file. We want to run this part of the recovery
595 * process after checking for signal files and after performing validation
596 * of the recovery parameters.
597 */
599 &backupFromStandby))
600 {
601 List *tablespaces = NIL;
602
603 /*
604 * Archive recovery was requested, and thanks to the backup label
605 * file, we know how far we need to replay to reach consistency. Enter
606 * archive recovery directly.
607 */
608 InArchiveRecovery = true;
611
612 /*
613 * Omitting backup_label when creating a new replica, PITR node etc.
614 * unfortunately is a common cause of corruption. Logging that
615 * backup_label was used makes it a bit easier to exclude that as the
616 * cause of observed corruption.
617 *
618 * Do so before we try to read the checkpoint record (which can fail),
619 * as otherwise it can be hard to understand why a checkpoint other
620 * than ControlFile->checkPoint is used.
621 */
622 ereport(LOG,
623 (errmsg("starting backup recovery with redo LSN %X/%X, checkpoint LSN %X/%X, on timeline ID %u",
626 CheckPointTLI)));
627
628 /*
629 * When a backup_label file is present, we want to roll forward from
630 * the checkpoint it identifies, rather than using pg_control.
631 */
634 if (record != NULL)
635 {
 /* take a private copy of the checkpoint payload of the record just read */
636 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
637 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
639 (errmsg_internal("checkpoint record is at %X/%X",
641 InRecovery = true; /* force recovery even if SHUTDOWNED */
642
643 /*
644 * Make sure that REDO location exists. This may not be the case
645 * if there was a crash during an online backup, which left a
646 * backup_label around that references a WAL segment that's
647 * already been archived.
648 */
649 if (checkPoint.redo < CheckPointLoc)
650 {
652 if (!ReadRecord(xlogprefetcher, LOG, false,
653 checkPoint.ThisTimeLineID))
655 (errmsg("could not find redo location %X/%X referenced by checkpoint record at %X/%X",
657 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
658 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
659 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
661 }
662 }
663 else
664 {
666 (errmsg("could not locate required checkpoint record at %X/%X",
668 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
669 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
670 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
672 wasShutdown = false; /* keep compiler quiet */
673 }
674
675 /* Read the tablespace_map file if present and create symlinks. */
676 if (read_tablespace_map(&tablespaces))
677 {
678 ListCell *lc;
679
680 foreach(lc, tablespaces)
681 {
682 tablespaceinfo *ti = lfirst(lc);
683 char *linkloc;
684
 /* link name is "<PG_TBLSPC_DIR>/<tablespace oid>" */
685 linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
686
687 /*
688 * Remove the existing symlink, if any, and create the new symlink
689 * under PGDATA.
690 */
692
693 if (symlink(ti->path, linkloc) < 0)
696 errmsg("could not create symbolic link \"%s\": %m",
697 linkloc)));
698
 /* this list entry is not used again in this function; free it */
699 pfree(ti->path);
700 pfree(ti);
701 }
702
703 /* tell the caller to delete it later */
704 haveTblspcMap = true;
705 }
706
707 /* tell the caller to delete it later */
708 haveBackupLabel = true;
709 }
710 else
711 {
712 /* No backup_label file has been found if we are here. */
713
714 /*
715 * If a tablespace_map file is present without a backup_label file,
716 * it is of no use. There is no harm in retaining it, but it
717 * is better to get rid of the map file so that we don't have any
718 * redundant file in data directory and it will avoid any sort of
719 * confusion. It seems prudent though to just rename the file out of
720 * the way rather than delete it completely, also we ignore any error
721 * that occurs in rename operation as even if map file is present
722 * without backup_label file, it is harmless.
723 */
724 if (stat(TABLESPACE_MAP, &st) == 0)
725 {
 /* the result of this unlink is not checked */
726 unlink(TABLESPACE_MAP_OLD);
728 ereport(LOG,
729 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
731 errdetail("File \"%s\" was renamed to \"%s\".",
733 else
734 ereport(LOG,
735 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
737 errdetail("Could not rename file \"%s\" to \"%s\": %m.",
739 }
740
741 /*
742 * It's possible that archive recovery was requested, but we don't
743 * know how far we need to replay the WAL before we reach consistency.
744 * This can happen for example if a base backup is taken from a
745 * running server using an atomic filesystem snapshot, without calling
746 * pg_backup_start/stop. Or if you just kill a running primary server
747 * and put it into archive recovery by creating a recovery signal
748 * file.
749 *
750 * Our strategy in that case is to perform crash recovery first,
751 * replaying all the WAL present in pg_wal, and only enter archive
752 * recovery after that.
753 *
754 * But usually we already know how far we need to replay the WAL (up
755 * to minRecoveryPoint, up to backupEndPoint, or until we see an
756 * end-of-backup record), and we can enter archive recovery directly.
757 */
763 {
764 InArchiveRecovery = true;
767 }
768
769 /*
770 * For the same reason as when starting up with backup_label present,
771 * emit a log message when we continue initializing from a base
772 * backup.
773 */
775 ereport(LOG,
776 (errmsg("restarting backup recovery with redo LSN %X/%X",
778
779 /* Get the last valid checkpoint record. */
786 if (record != NULL)
787 {
789 (errmsg_internal("checkpoint record is at %X/%X",
791 }
792 else
793 {
794 /*
795 * We used to attempt to go back to a secondary checkpoint record
796 * here, but only when not in standby mode. We now just fail if we
797 * can't read the last checkpoint because this allows us to
798 * simplify processing around checkpoints.
799 */
801 (errmsg("could not locate a valid checkpoint record at %X/%X",
803 }
 /*
  * NOTE(review): record is dereferenced below without a NULL check. That
  * is only safe if the error report in the else branch above does not
  * return; its severity level is not visible here -- TODO confirm.
  */
804 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
805 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
806 }
807
809 {
811 ereport(LOG,
812 (errmsg("entering standby mode")));
814 ereport(LOG,
815 (errmsg("starting point-in-time recovery to XID %u",
818 ereport(LOG,
819 (errmsg("starting point-in-time recovery to %s",
822 ereport(LOG,
823 (errmsg("starting point-in-time recovery to \"%s\"",
826 ereport(LOG,
827 (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
830 ereport(LOG,
831 (errmsg("starting point-in-time recovery to earliest consistent point")));
832 else
833 ereport(LOG,
834 (errmsg("starting archive recovery")));
835 }
836
837 /*
838 * If the location of the checkpoint record is not on the expected
839 * timeline in the history of the requested timeline, we cannot proceed:
840 * the backup is not part of the history of the requested timeline.
841 */
842 Assert(expectedTLEs); /* was initialized by reading checkpoint
843 * record */
846 {
847 XLogRecPtr switchpoint;
848
849 /*
850 * tliSwitchPoint will throw an error if the checkpoint's timeline is
851 * not in expectedTLEs at all.
852 */
853 switchpoint = tliSwitchPoint(CheckPointTLI, expectedTLEs, NULL);
855 (errmsg("requested timeline %u is not a child of this server's history",
857 /* translator: %s is a backup_label file or a pg_control file */
858 errdetail("Latest checkpoint in file \"%s\" is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
859 haveBackupLabel ? "backup_label" : "pg_control",
862 LSN_FORMAT_ARGS(switchpoint))));
863 }
864
865 /*
866 * The min recovery point should be part of the requested timeline's
867 * history, too.
868 */
873 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
877
879 (errmsg_internal("redo record is at %X/%X; shutdown %s",
880 LSN_FORMAT_ARGS(checkPoint.redo),
881 wasShutdown ? "true" : "false")));
883 (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
885 checkPoint.nextOid)));
887 (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
888 checkPoint.nextMulti, checkPoint.nextMultiOffset)));
890 (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
891 checkPoint.oldestXid, checkPoint.oldestXidDB)));
893 (errmsg_internal("oldest MultiXactId: %u, in database %u",
894 checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
896 (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
897 checkPoint.oldestCommitTsXid,
898 checkPoint.newestCommitTsXid)));
901 (errmsg("invalid next transaction ID")));
902
903 /* sanity check: the REDO location must not lie beyond the checkpoint record */
904 if (checkPoint.redo > CheckPointLoc)
906 (errmsg("invalid redo in checkpoint record")));
907
908 /*
909 * Check whether we need to force recovery from WAL. If it appears to
910 * have been a clean shutdown and we did not have a recovery signal file,
911 * then assume no recovery needed.
912 */
913 if (checkPoint.redo < CheckPointLoc)
914 {
915 if (wasShutdown)
917 (errmsg("invalid redo record in shutdown checkpoint")));
918 InRecovery = true;
919 }
920 else if (ControlFile->state != DB_SHUTDOWNED)
921 InRecovery = true;
923 {
924 /* force recovery due to presence of recovery signal file */
925 InRecovery = true;
926 }
927
928 /*
929 * If recovery is needed, update our in-memory copy of pg_control to show
930 * that we are recovering and to show the selected checkpoint as the place
931 * we are starting from. We also mark pg_control with any minimum recovery
932 * stop point obtained from a backup history file.
933 *
934 * We don't write the changes to disk yet, though. Only do that after
935 * initializing various subsystems.
936 */
937 if (InRecovery)
938 {
940 {
942 }
943 else
944 {
945 ereport(LOG,
946 (errmsg("database system was not properly shut down; "
947 "automatic recovery in progress")));
949 ereport(LOG,
950 (errmsg("crash recovery starts in timeline %u "
951 "and has target timeline %u",
955 }
957 ControlFile->checkPointCopy = checkPoint;
959 {
960 /* initialize minRecoveryPoint if not set yet */
961 if (ControlFile->minRecoveryPoint < checkPoint.redo)
962 {
963 ControlFile->minRecoveryPoint = checkPoint.redo;
965 }
966 }
967
968 /*
969 * Set backupStartPoint if we're starting recovery from a base backup.
970 *
971 * Also set backupEndPoint and use minRecoveryPoint as the backup end
972 * location if we're starting recovery from a base backup which was
973 * taken from a standby. In this case, the database system status in
974 * pg_control must indicate that the database was already in recovery.
975 * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
976 * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
977 * before reaching this point; e.g. because restore_command or
978 * primary_conninfo were faulty.
979 *
980 * Any other state indicates that the backup somehow became corrupted
981 * and we can't sensibly continue with recovery.
982 */
983 if (haveBackupLabel)
984 {
985 ControlFile->backupStartPoint = checkPoint.redo;
987
988 if (backupFromStandby)
989 {
 /* compare against the state captured on entry to this function */
990 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
991 dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
993 (errmsg("backup_label contains data inconsistent with control file"),
994 errhint("This means that the backup is corrupted and you will "
995 "have to use another backup for recovery.")));
997 }
998 }
999 }
1000
1001 /* remember these, so that we know when we have reached consistency */
1006 {
1009 }
1010 else
1011 {
1014 }
1015
1016 /*
1017 * Start recovery assuming that the final record isn't lost.
1018 */
1021
 /* hand the results back through the caller-supplied output pointers */
1022 *wasShutdown_ptr = wasShutdown;
1023 *haveBackupLabel_ptr = haveBackupLabel;
1024 *haveTblspcMap_ptr = haveTblspcMap;
1025}
1026
1027/*
1028 * See if there are any recovery signal files and if so, set state for
1029 * recovery.
1030 *
1031 * See if there is a recovery command file (recovery.conf), and if so
1032 * report a FATAL error, since as of PG12 we no longer recognize that.
1033 */
1034static void
1036{
1037 struct stat stat_buf;
1038
1040 return;
1041
1042 /*
1043 * Check for old recovery API file: recovery.conf
1044 */
1045 if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
1046 ereport(FATAL,
1048 errmsg("using recovery command file \"%s\" is not supported",
1050
1051 /*
1052 * Remove unused .done file, if present. Ignore if absent.
1053 */
 /* the return value is not checked, so a missing file is not an error */
1054 unlink(RECOVERY_COMMAND_DONE);
1055
1056 /*
1057 * Check for recovery signal files and if found, fsync them since they
1058 * represent server state information. We don't sweat too much about the
1059 * possibility of fsync failure, however.
1060 *
1061 * If present, standby signal file takes precedence. If neither is present
1062 * then we won't enter archive recovery.
1063 */
1064 if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1065 {
1066 int fd;
1067
1069 S_IRUSR | S_IWUSR);
 /* a failed open is tolerated: the fsync is simply skipped */
1070 if (fd >= 0)
1071 {
1072 (void) pg_fsync(fd); /* best effort; failure is deliberately ignored */
1073 close(fd);
1074 }
1076 }
 /* the recovery signal file is only looked at when no standby signal file exists */
1077 else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1078 {
1079 int fd;
1080
1082 S_IRUSR | S_IWUSR);
 /* same best-effort open and fsync as for the standby signal file */
1083 if (fd >= 0)
1084 {
1085 (void) pg_fsync(fd); /* best effort; failure is deliberately ignored */
1086 close(fd);
1087 }
1089 }
1090
 /* default; the first branch below switches it to true */
1091 StandbyModeRequested = false;
1094 {
1095 StandbyModeRequested = true;
1097 }
1099 {
1100 StandbyModeRequested = false;
1102 }
1103 else
 /* presumably no signal file was found: nothing further to set up */
1104 return;
1105
1106 /*
1107 * We don't support standby mode in standalone backends; that requires
1108 * other processes such as the WAL receiver to be alive.
1109 */
1111 ereport(FATAL,
1112 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1113 errmsg("standby mode is not supported by single-user servers")));
1114}
1115
1116static void
1118{
1120 return;
1121
1122 /*
1123 * Check for compulsory parameters
1124 */
1126 {
1127 if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1128 (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1130 (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1131 errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1132 }
1133 else
1134 {
1135 if (recoveryRestoreCommand == NULL ||
1136 strcmp(recoveryRestoreCommand, "") == 0)
1137 ereport(FATAL,
1138 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1139 errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1140 }
1141
1142 /*
1143 * Override any inconsistent requests. Note that this is a change of
1144 * behaviour in 9.5; prior to this we simply ignored a request to pause if
1145 * hot_standby = off, which was surprising behaviour.
1146 */
1150
1151 /*
1152 * Final parsing of recovery_target_time string; see also
1153 * check_recovery_target_time().
1154 */
1156 {
1160 Int32GetDatum(-1)));
1161 }
1162
1163 /*
1164 * If user specified recovery_target_timeline, validate it or compute the
1165 * "latest" value. We can't do this until after we've gotten the restore
1166 * command and set InArchiveRecovery, because we need to fetch timeline
1167 * history files from the archive.
1168 */
1170 {
1172
1173 /* Timeline 1 does not have a history file, all else should */
1174 if (rtli != 1 && !existsTimeLineHistory(rtli))
1175 ereport(FATAL,
1176 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1177 errmsg("recovery target timeline %u does not exist",
1178 rtli)));
1179 recoveryTargetTLI = rtli;
1180 }
1182 {
1183 /* We start the "latest" search from pg_control's timeline */
1185 }
1186 else
1187 {
1188 /*
1189 * else we just use the recoveryTargetTLI as already read from
1190 * ControlFile
1191 */
1193 }
1194}
1195
1196/*
1197 * read_backup_label: check to see if a backup_label file is present
1198 *
1199 * If we see a backup_label during recovery, we assume that we are recovering
1200 * from a backup dump file, and we therefore roll forward from the checkpoint
1201 * identified by the label file, NOT what pg_control says. This avoids the
1202 * problem that pg_control might have been archived one or more checkpoints
1203 * later than the start of the dump, and so if we rely on it as the start
1204 * point, we will fail to restore a consistent database state.
1205 *
1206 * Returns true if a backup_label was found (and fills the checkpoint
1207 * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1208 * returns false if not. If this backup_label came from a streamed backup,
1209 * *backupEndRequired is set to true. If this backup_label was created during
1210 * recovery, *backupFromStandby is set to true.
1211 *
1212 * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1213 * and TLI read from the backup file.
1214 */
1215static bool
1216read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1217 bool *backupEndRequired, bool *backupFromStandby)
1218{
1219 char startxlogfilename[MAXFNAMELEN];
1220 TimeLineID tli_from_walseg,
1221 tli_from_file;
1222 FILE *lfp;
1223 char ch;
1224 char backuptype[20];
1225 char backupfrom[20];
1226 char backuplabel[MAXPGPATH];
1227 char backuptime[128];
1228 uint32 hi,
1229 lo;
1230
1231 /* suppress possible uninitialized-variable warnings */
1232 *checkPointLoc = InvalidXLogRecPtr;
1233 *backupLabelTLI = 0;
1234 *backupEndRequired = false;
1235 *backupFromStandby = false;
1236
1237 /*
1238 * See if label file is present
1239 */
1240 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1241 if (!lfp)
1242 {
1243 if (errno != ENOENT)
1244 ereport(FATAL,
1246 errmsg("could not read file \"%s\": %m",
1248 return false; /* it's not there, all is fine */
1249 }
1250
1251 /*
1252 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1253 * is pretty crude, but we are not expecting any variability in the file
1254 * format).
1255 */
1256 if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
1257 &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1258 ereport(FATAL,
1259 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1260 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1261 RedoStartLSN = ((uint64) hi) << 32 | lo;
1262 RedoStartTLI = tli_from_walseg;
1263 if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
1264 &hi, &lo, &ch) != 3 || ch != '\n')
1265 ereport(FATAL,
1266 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1267 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1268 *checkPointLoc = ((uint64) hi) << 32 | lo;
1269 *backupLabelTLI = tli_from_walseg;
1270
1271 /*
1272 * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1273 * which could mean either pg_basebackup or the pg_backup_start/stop
1274 * method was used) or if this label came from somewhere else (the only
1275 * other option today being from pg_rewind). If this was a streamed
1276 * backup then we know that we need to play through until we get to the
1277 * end of the WAL which was generated during the backup (at which point we
1278 * will have reached consistency and backupEndRequired will be reset to be
1279 * false).
1280 */
1281 if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1282 {
1283 if (strcmp(backuptype, "streamed") == 0)
1284 *backupEndRequired = true;
1285 }
1286
1287 /*
1288 * BACKUP FROM lets us know if this was from a primary or a standby. If
1289 * it was from a standby, we'll double-check that the control file state
1290 * matches that of a standby.
1291 */
1292 if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1293 {
1294 if (strcmp(backupfrom, "standby") == 0)
1295 *backupFromStandby = true;
1296 }
1297
1298 /*
1299 * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1300 * but checking for their presence is useful for debugging and the next
1301 * sanity checks. Cope also with the fact that the result buffers have a
1302 * pre-allocated size, hence if the backup_label file has been generated
1303 * with strings longer than the maximum assumed here an incorrect parsing
1304 * happens. That's fine as only minor consistency checks are done
1305 * afterwards.
1306 */
1307 if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1309 (errmsg_internal("backup time %s in file \"%s\"",
1310 backuptime, BACKUP_LABEL_FILE)));
1311
1312 if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1314 (errmsg_internal("backup label %s in file \"%s\"",
1315 backuplabel, BACKUP_LABEL_FILE)));
1316
1317 /*
1318 * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1319 * it as a sanity check if present.
1320 */
1321 if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1322 {
1323 if (tli_from_walseg != tli_from_file)
1324 ereport(FATAL,
1325 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1326 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1327 errdetail("Timeline ID parsed is %u, but expected %u.",
1328 tli_from_file, tli_from_walseg)));
1329
1331 (errmsg_internal("backup timeline %u in file \"%s\"",
1332 tli_from_file, BACKUP_LABEL_FILE)));
1333 }
1334
1335 if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%X\n", &hi, &lo) > 0)
1336 ereport(FATAL,
1337 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1338 errmsg("this is an incremental backup, not a data directory"),
1339 errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1340
1341 if (ferror(lfp) || FreeFile(lfp))
1342 ereport(FATAL,
1344 errmsg("could not read file \"%s\": %m",
1346
1347 return true;
1348}
1349
1350/*
1351 * read_tablespace_map: check to see if a tablespace_map file is present
1352 *
1353 * If we see a tablespace_map file during recovery, we assume that we are
1354 * recovering from a backup dump file, and we therefore need to create symlinks
1355 * as per the information present in tablespace_map file.
1356 *
1357 * Returns true if a tablespace_map file was found (and fills *tablespaces
1358 * with a tablespaceinfo struct for each tablespace listed in the file);
1359 * returns false if not.
1360 */
1361static bool
1363{
1364 tablespaceinfo *ti;
1365 FILE *lfp;
1366 char str[MAXPGPATH];
1367 int ch,
1368 i,
1369 n;
1370 bool was_backslash;
1371
1372 /*
1373 * See if tablespace_map file is present
1374 */
1375 lfp = AllocateFile(TABLESPACE_MAP, "r");
1376 if (!lfp)
1377 {
1378 if (errno != ENOENT)
1379 ereport(FATAL,
1381 errmsg("could not read file \"%s\": %m",
1382 TABLESPACE_MAP)));
1383 return false; /* it's not there, all is fine */
1384 }
1385
1386 /*
1387 * Read and parse the link name and path lines from tablespace_map file
1388 * (this code is pretty crude, but we are not expecting any variability in
1389 * the file format). De-escape any backslashes that were inserted.
1390 */
1391 i = 0;
1392 was_backslash = false;
1393 while ((ch = fgetc(lfp)) != EOF)
1394 {
1395 if (!was_backslash && (ch == '\n' || ch == '\r'))
1396 {
1397 char *endp;
1398
1399 if (i == 0)
1400 continue; /* \r immediately followed by \n */
1401
1402 /*
1403 * The de-escaped line should contain an OID followed by exactly
1404 * one space followed by a path. The path might start with
1405 * spaces, so don't be too liberal about parsing.
1406 */
1407 str[i] = '\0';
1408 n = 0;
1409 while (str[n] && str[n] != ' ')
1410 n++;
1411 if (n < 1 || n >= i - 1)
1412 ereport(FATAL,
1413 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1414 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1415 str[n++] = '\0';
1416
1417 ti = palloc0(sizeof(tablespaceinfo));
1418 errno = 0;
1419 ti->oid = strtoul(str, &endp, 10);
1420 if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1421 ereport(FATAL,
1422 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1423 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1424 ti->path = pstrdup(str + n);
1425 *tablespaces = lappend(*tablespaces, ti);
1426
1427 i = 0;
1428 continue;
1429 }
1430 else if (!was_backslash && ch == '\\')
1431 was_backslash = true;
1432 else
1433 {
1434 if (i < sizeof(str) - 1)
1435 str[i++] = ch;
1436 was_backslash = false;
1437 }
1438 }
1439
1440 if (i != 0 || was_backslash) /* last line not terminated? */
1441 ereport(FATAL,
1442 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1443 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1444
1445 if (ferror(lfp) || FreeFile(lfp))
1446 ereport(FATAL,
1448 errmsg("could not read file \"%s\": %m",
1449 TABLESPACE_MAP)));
1450
1451 return true;
1452}
1453
1454/*
1455 * Finish WAL recovery.
1456 *
1457 * This does not close the 'xlogreader' yet, because in some cases the caller
1458 * still wants to re-read the last checkpoint record by calling
1459 * ReadCheckpointRecord().
1460 *
1461 * Returns the position of the last valid or applied record, after which new
1462 * WAL should be appended, information about why recovery was ended, and some
1463 * other things. See the EndOfWalRecoveryInfo struct for details.
1464 */
1467{
1469 XLogRecPtr lastRec;
1470 TimeLineID lastRecTLI;
1471 XLogRecPtr endOfLog;
1472
1473 /*
1474 * Kill WAL receiver, if it's still running, before we continue to write
1475 * the startup checkpoint and aborted-contrecord records. It will trump
1476 * over these records and subsequent ones if it's still alive when we
1477 * start writing WAL.
1478 */
1480
1481 /*
1482 * Shutdown the slot sync worker to drop any temporary slots acquired by
1483 * it and to prevent it from keep trying to fetch the failover slots.
1484 *
1485 * We do not update the 'synced' column in 'pg_replication_slots' system
1486 * view from true to false here, as any failed update could leave 'synced'
1487 * column false for some slots. This could cause issues during slot sync
1488 * after restarting the server as a standby. While updating the 'synced'
1489 * column after switching to the new timeline is an option, it does not
1490 * simplify the handling for the 'synced' column. Therefore, we retain the
1491 * 'synced' column as true after promotion as it may provide useful
1492 * information about the slot origin.
1493 */
1495
1496 /*
1497 * We are now done reading the xlog from stream. Turn off streaming
1498 * recovery to force fetching the files (which would be required at end of
1499 * recovery, e.g., timeline history file) from archive or pg_wal.
1500 *
1501 * Note that standby mode must be turned off after killing WAL receiver,
1502 * i.e., calling XLogShutdownWalRcv().
1503 */
1505 StandbyMode = false;
1506
1507 /*
1508 * Determine where to start writing WAL next.
1509 *
1510 * Re-fetch the last valid or last applied record, so we can identify the
1511 * exact endpoint of what we consider the valid portion of WAL. There may
1512 * be an incomplete continuation record after that, in which case
1513 * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1514 * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1515 * it is intentionally missing. See CreateOverwriteContrecordRecord().
1516 *
1517 * An important side-effect of this is to load the last page into
1518 * xlogreader. The caller uses it to initialize the WAL for writing.
1519 */
1520 if (!InRecovery)
1521 {
1522 lastRec = CheckPointLoc;
1523 lastRecTLI = CheckPointTLI;
1524 }
1525 else
1526 {
1528 lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1529 }
1531 (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1532 endOfLog = xlogreader->EndRecPtr;
1533
1534 /*
1535 * Remember the TLI in the filename of the XLOG segment containing the
1536 * end-of-log. It could be different from the timeline that endOfLog
1537 * nominally belongs to, if there was a timeline switch in that segment,
1538 * and we were reading the old WAL from a segment belonging to a higher
1539 * timeline.
1540 */
1541 result->endOfLogTLI = xlogreader->seg.ws_tli;
1542
1544 {
1545 /*
1546 * We are no longer in archive recovery state.
1547 *
1548 * We are now done reading the old WAL. Turn off archive fetching if
1549 * it was active.
1550 */
1552 InArchiveRecovery = false;
1553
1554 /*
1555 * If the ending log segment is still open, close it (to avoid
1556 * problems on Windows with trying to rename or delete an open file).
1557 */
1558 if (readFile >= 0)
1559 {
1560 close(readFile);
1561 readFile = -1;
1562 }
1563 }
1564
1565 /*
1566 * Copy the last partial block to the caller, for initializing the WAL
1567 * buffer for appending new WAL.
1568 */
1569 if (endOfLog % XLOG_BLCKSZ != 0)
1570 {
1571 char *page;
1572 int len;
1573 XLogRecPtr pageBeginPtr;
1574
1575 pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1577
1578 /* Copy the valid part of the last block */
1579 len = endOfLog % XLOG_BLCKSZ;
1580 page = palloc(len);
1581 memcpy(page, xlogreader->readBuf, len);
1582
1583 result->lastPageBeginPtr = pageBeginPtr;
1584 result->lastPage = page;
1585 }
1586 else
1587 {
1588 /* There is no partial block to copy. */
1589 result->lastPageBeginPtr = endOfLog;
1590 result->lastPage = NULL;
1591 }
1592
1593 /*
1594 * Create a comment for the history file to explain why and where timeline
1595 * changed.
1596 */
1598
1599 result->lastRec = lastRec;
1600 result->lastRecTLI = lastRecTLI;
1601 result->endOfLog = endOfLog;
1602
1603 result->abortedRecPtr = abortedRecPtr;
1605
1608
1609 return result;
1610}
1611
1612/*
1613 * Clean up the WAL reader and leftovers from restoring WAL from archive
1614 */
1615void
1617{
1618 char recoveryPath[MAXPGPATH];
1619
1620 /* Final update of pg_stat_recovery_prefetch. */
1622
1623 /* Shut down xlogreader */
1624 if (readFile >= 0)
1625 {
1626 close(readFile);
1627 readFile = -1;
1628 }
1631
1633 {
1634 /*
1635 * Since there might be a partial WAL segment named RECOVERYXLOG, get
1636 * rid of it.
1637 */
1638 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1639 unlink(recoveryPath); /* ignore any error */
1640
1641 /* Get rid of any remaining recovered timeline-history file, too */
1642 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1643 unlink(recoveryPath); /* ignore any error */
1644 }
1645
1646 /*
1647 * We don't need the latch anymore. It's not strictly necessary to disown
1648 * it, but let's do it for the sake of tidiness.
1649 */
1652}
1653
1654/*
1655 * Perform WAL recovery.
1656 *
1657 * If the system was shut down cleanly, this is never called.
1658 */
1659void
1661{
1662 XLogRecord *record;
1663 bool reachedRecoveryTarget = false;
1664 TimeLineID replayTLI;
1665
1666 /*
1667 * Initialize shared variables for tracking progress of WAL replay, as if
1668 * we had just replayed the record before the REDO location (or the
1669 * checkpoint record itself, if it's a shutdown checkpoint).
1670 */
1673 {
1677 }
1678 else
1679 {
1683 }
1690
1691 /* Also ensure XLogReceiptTime has a sane value */
1693
1694 /*
1695 * Let postmaster know we've started redo now, so that it can launch the
1696 * archiver if necessary.
1697 */
1700
1701 /*
1702 * Allow read-only connections immediately if we're consistent already.
1703 */
1705
1706 /*
1707 * Find the first record that logically follows the checkpoint --- it
1708 * might physically precede it, though.
1709 */
1711 {
1712 /* back up to find the record */
1713 replayTLI = RedoStartTLI;
1715 record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1716
1717 /*
1718 * If a checkpoint record's redo pointer points back to an earlier
1719 * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1720 * record.
1721 */
1722 if (record->xl_rmid != RM_XLOG_ID ||
1724 ereport(FATAL,
1725 (errmsg("unexpected record type found at redo point %X/%X",
1727 }
1728 else
1729 {
1730 /* just have to read next record after CheckPoint */
1732 replayTLI = CheckPointTLI;
1733 record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1734 }
1735
1736 if (record != NULL)
1737 {
1738 TimestampTz xtime;
1739 PGRUsage ru0;
1740
1741 pg_rusage_init(&ru0);
1742
1743 InRedo = true;
1744
1745 RmgrStartup();
1746
1747 ereport(LOG,
1748 (errmsg("redo starts at %X/%X",
1750
1751 /* Prepare to report progress of the redo phase. */
1752 if (!StandbyMode)
1754
1755 /*
1756 * main redo apply loop
1757 */
1758 do
1759 {
1760 if (!StandbyMode)
1761 ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
1763
1764#ifdef WAL_DEBUG
1765 if (XLOG_DEBUG)
1766 {
1768
1770 appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
1773 xlog_outrec(&buf, xlogreader);
1774 appendStringInfoString(&buf, " - ");
1776 elog(LOG, "%s", buf.data);
1777 pfree(buf.data);
1778 }
1779#endif
1780
1781 /* Handle interrupt signals of startup process */
1783
1784 /*
1785 * Pause WAL replay, if requested by a hot-standby session via
1786 * SetRecoveryPause().
1787 *
1788 * Note that we intentionally don't take the info_lck spinlock
1789 * here. We might therefore read a slightly stale value of the
1790 * recoveryPause flag, but it can't be very stale (no worse than
1791 * the last spinlock we did acquire). Since a pause request is a
1792 * pretty asynchronous thing anyway, possibly responding to it one
1793 * WAL record later than we otherwise would is a minor issue, so
1794 * it doesn't seem worth adding another spinlock cycle to prevent
1795 * that.
1796 */
1797 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1799 recoveryPausesHere(false);
1800
1801 /*
1802 * Have we reached our recovery target?
1803 */
1805 {
1806 reachedRecoveryTarget = true;
1807 break;
1808 }
1809
1810 /*
1811 * If we've been asked to lag the primary, wait on latch until
1812 * enough time has passed.
1813 */
1815 {
1816 /*
1817 * We test for paused recovery again here. If user sets
1818 * delayed apply, it may be because they expect to pause
1819 * recovery in case of problems, so we must test again here
1820 * otherwise pausing during the delay-wait wouldn't work.
1821 */
1822 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1824 recoveryPausesHere(false);
1825 }
1826
1827 /*
1828 * Apply the record
1829 */
1830 ApplyWalRecord(xlogreader, record, &replayTLI);
1831
1832 /* Exit loop if we reached inclusive recovery target */
1834 {
1835 reachedRecoveryTarget = true;
1836 break;
1837 }
1838
1839 /* Else, try to fetch the next WAL record */
1840 record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1841 } while (record != NULL);
1842
1843 /*
1844 * end of main redo apply loop
1845 */
1846
1847 if (reachedRecoveryTarget)
1848 {
1849 if (!reachedConsistency)
1850 ereport(FATAL,
1851 (errmsg("requested recovery stop point is before consistent recovery point")));
1852
1853 /*
1854 * This is the last point where we can restart recovery with a new
1855 * recovery target, if we shutdown and begin again. After this,
1856 * Resource Managers may choose to do permanent corrective actions
1857 * at end of recovery.
1858 */
1859 switch (recoveryTargetAction)
1860 {
1862
1863 /*
1864 * exit with special return code to request shutdown of
1865 * postmaster. Log messages issued from postmaster.
1866 */
1867 proc_exit(3);
1868
1870 SetRecoveryPause(true);
1871 recoveryPausesHere(true);
1872
1873 /* drop into promote */
1874
1876 break;
1877 }
1878 }
1879
1880 RmgrCleanup();
1881
1882 ereport(LOG,
1883 (errmsg("redo done at %X/%X system usage: %s",
1885 pg_rusage_show(&ru0))));
1886 xtime = GetLatestXTime();
1887 if (xtime)
1888 ereport(LOG,
1889 (errmsg("last completed transaction was at log time %s",
1890 timestamptz_to_str(xtime))));
1891
1892 InRedo = false;
1893 }
1894 else
1895 {
1896 /* there are no WAL records following the checkpoint */
1897 ereport(LOG,
1898 (errmsg("redo is not required")));
1899 }
1900
1901 /*
1902 * This check is intentionally after the above log messages that indicate
1903 * how far recovery went.
1904 */
1907 !reachedRecoveryTarget)
1908 ereport(FATAL,
1909 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1910 errmsg("recovery ended before configured recovery target was reached")));
1911}
1912
1913/*
1914 * Subroutine of PerformWalRecovery, to apply one WAL record.
1915 */
1916static void
1918{
1919 ErrorContextCallback errcallback;
1920 bool switchedTLI = false;
1921
1922 /* Setup error traceback support for ereport() */
1923 errcallback.callback = rm_redo_error_callback;
1924 errcallback.arg = xlogreader;
1925 errcallback.previous = error_context_stack;
1926 error_context_stack = &errcallback;
1927
1928 /*
1929 * TransamVariables->nextXid must be beyond record's xid.
1930 */
1932
1933 /*
1934 * Before replaying this record, check if this record causes the current
1935 * timeline to change. The record is already considered to be part of the
1936 * new timeline, so we update replayTLI before replaying it. That's
1937 * important so that replayEndTLI, which is recorded as the minimum
1938 * recovery point's TLI if recovery stops after this record, is set
1939 * correctly.
1940 */
1941 if (record->xl_rmid == RM_XLOG_ID)
1942 {
1943 TimeLineID newReplayTLI = *replayTLI;
1944 TimeLineID prevReplayTLI = *replayTLI;
1945 uint8 info = record->xl_info & ~XLR_INFO_MASK;
1946
1947 if (info == XLOG_CHECKPOINT_SHUTDOWN)
1948 {
1949 CheckPoint checkPoint;
1950
1951 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1952 newReplayTLI = checkPoint.ThisTimeLineID;
1953 prevReplayTLI = checkPoint.PrevTimeLineID;
1954 }
1955 else if (info == XLOG_END_OF_RECOVERY)
1956 {
1957 xl_end_of_recovery xlrec;
1958
1959 memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1960 newReplayTLI = xlrec.ThisTimeLineID;
1961 prevReplayTLI = xlrec.PrevTimeLineID;
1962 }
1963
1964 if (newReplayTLI != *replayTLI)
1965 {
1966 /* Check that it's OK to switch to this TLI */
1968 newReplayTLI, prevReplayTLI, *replayTLI);
1969
1970 /* Following WAL records should be run with new TLI */
1971 *replayTLI = newReplayTLI;
1972 switchedTLI = true;
1973 }
1974 }
1975
1976 /*
1977 * Update shared replayEndRecPtr before replaying this record, so that
1978 * XLogFlush will update minRecoveryPoint correctly.
1979 */
1982 XLogRecoveryCtl->replayEndTLI = *replayTLI;
1984
1985 /*
1986 * If we are attempting to enter Hot Standby mode, process XIDs we see
1987 */
1991
1992 /*
1993 * Some XLOG record types that are related to recovery are processed
1994 * directly here, rather than in xlog_redo()
1995 */
1996 if (record->xl_rmid == RM_XLOG_ID)
1997 xlogrecovery_redo(xlogreader, *replayTLI);
1998
1999 /* Now apply the WAL record itself */
2001
2002 /*
2003 * After redo, check whether the backup pages associated with the WAL
2004 * record are consistent with the existing pages. This check is done only
2005 * if consistency check is enabled for this record.
2006 */
2007 if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
2009
2010 /* Pop the error context stack */
2011 error_context_stack = errcallback.previous;
2012
2013 /*
2014 * Update lastReplayedEndRecPtr after this record has been successfully
2015 * replayed.
2016 */
2020 XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
2022
2023 /* ------
2024 * Wakeup walsenders:
2025 *
2026 * On the standby, the WAL is flushed first (which will only wake up
2027 * physical walsenders) and then applied, which will only wake up logical
2028 * walsenders.
2029 *
2030 * Indeed, logical walsenders on standby can't decode and send data until
2031 * it's been applied.
2032 *
2033 * Physical walsenders don't need to be woken up during replay unless
2034 * cascading replication is allowed and time line change occurred (so that
2035 * they can notice that they are on a new time line).
2036 *
2037 * That's why the wake up conditions are for:
2038 *
2039 * - physical walsenders in case of new time line and cascade
2040 * replication is allowed
2041 * - logical walsenders in case cascade replication is allowed (could not
2042 * be created otherwise)
2043 * ------
2044 */
2046 WalSndWakeup(switchedTLI, true);
2047
2048 /*
2049 * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2050 * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2051 * a reply to the primary.
2052 */
2054 {
2057 }
2058
2059 /* Allow read-only connections if we're consistent now */
2061
2062 /* Is this a timeline switch? */
2063 if (switchedTLI)
2064 {
2065 /*
2066 * Before we continue on the new timeline, clean up any (possibly
2067 * bogus) future WAL segments on the old timeline.
2068 */
2070
2071 /* Reset the prefetcher. */
2073 }
2074}
2075
2076/*
2077 * Some XLOG RM record types that are directly related to WAL recovery are
2078 * handled here rather than in the xlog_redo()
2079 */
2080static void
2082{
2083 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2084 XLogRecPtr lsn = record->EndRecPtr;
2085
2086 Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2087
2088 if (info == XLOG_OVERWRITE_CONTRECORD)
2089 {
2090 /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2092
2093 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
2094 if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2095 elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
2098
2099 /* We have safely skipped the aborted record */
2102
2103 ereport(LOG,
2104 (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
2107
2108 /* Verifying the record should only happen once */
2110 }
2111 else if (info == XLOG_BACKUP_END)
2112 {
2113 XLogRecPtr startpoint;
2114
2115 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2116
2117 if (backupStartPoint == startpoint)
2118 {
2119 /*
2120 * We have reached the end of base backup, the point where
2121 * pg_backup_stop() was done. The data on disk is now consistent
2122 * (assuming we have also reached minRecoveryPoint). Set
2123 * backupEndPoint to the current LSN, so that the next call to
2124 * CheckRecoveryConsistency() will notice it and do the
2125 * end-of-backup processing.
2126 */
2127 elog(DEBUG1, "end of backup record reached");
2128
2129 backupEndPoint = lsn;
2130 }
2131 else
2132 elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X",
2134 }
2135}
2136
2137/*
2138 * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2139 * directories.
2140 *
2141 * Replay of database creation XLOG records for databases that were later
2142 * dropped can create fake directories in pg_tblspc. By the time consistency
2143 * is reached these directories should have been removed; here we verify
2144 * that this did indeed happen. This is to be called at the point where
2145 * consistent state is reached.
2146 *
2147 * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2148 * useful for testing purposes, and also allows for an escape hatch in case
2149 * things go south.
2150 */
2151static void
2153{
2154 DIR *dir;
2155 struct dirent *de;
2156
2158 while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
2159 {
2160 char path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
2161
2162 /* Skip entries of non-oid names */
2163 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2164 continue;
2165
2166 snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
2167
2168 if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2171 errmsg("unexpected directory entry \"%s\" found in %s",
2172 de->d_name, PG_TBLSPC_DIR),
2173 errdetail("All directory entries in %s/ should be symbolic links.",
2175 errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2176 }
2177}
2178
2179/*
2180 * Checks if recovery has reached a consistent state. When consistency is
2181 * reached and we have a valid starting standby snapshot, tell postmaster
2182 * that it can start accepting read-only connections.
 *
 * The body works in three steps, in this order: (1) notice that replay has
 * passed backupEndPoint and report it through ReachedEndOfBackup(); (2)
 * notice that replay has passed minRecoveryPoint and set reachedConsistency;
 * (3) decide whether hot standby can be activated and, if so, set
 * LocalHotStandbyActive.
 *
 * NOTE(review): this listing has dropped source lines (the embedded line
 * numbers skip), including the function-name line that belongs between
 * "static void" and the opening brace, and the first line of most
 * conditionals. Consult the complete file before changing any logic here.
2183 */
2184static void
2186{
2187 XLogRecPtr lastReplayedEndRecPtr;
2188 TimeLineID lastReplayedTLI;
2189
2190 /*
2191 * During crash recovery, we don't reach a consistent state until we've
2192 * replayed all the WAL.
2193 */
 /* NOTE(review): the condition guarding this return (listing line 2194) is missing. */
2195 return;
2196
2198
2199 /*
2200 * assume that we are called in the startup process, and hence don't need
2201 * a lock to read lastReplayedEndRecPtr
2202 */
2203 lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2204 lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2205
2206 /*
2207 * Have we reached the point where our base backup was completed?
2208 */
 /* NOTE(review): the opening "if (" line (listing line 2209) is missing; only its last operand is visible. */
2210 backupEndPoint <= lastReplayedEndRecPtr)
2211 {
 /* Keep copies for the LOG message below, which is emitted after the control file update. */
2212 XLogRecPtr saveBackupStartPoint = backupStartPoint;
2213 XLogRecPtr saveBackupEndPoint = backupEndPoint;
2214
2215 elog(DEBUG1, "end of backup reached");
2216
2217 /*
2218 * We have reached the end of base backup, as indicated by pg_control.
2219 * Update the control file accordingly.
2220 */
2221 ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
 /* NOTE(review): listing lines 2222-2223 are missing here. */
2224 backupEndRequired = false;
2225
2226 ereport(LOG,
2227 (errmsg("completed backup recovery with redo LSN %X/%X and end LSN %X/%X",
2228 LSN_FORMAT_ARGS(saveBackupStartPoint),
2229 LSN_FORMAT_ARGS(saveBackupEndPoint))));
2230 }
2231
2232 /*
2233 * Have we passed our safe starting point? Note that minRecoveryPoint is
2234 * known to be incorrectly set if recovering from a backup, until the
2235 * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2236 * All we know prior to that is that we're not consistent yet.
2237 */
 /* NOTE(review): the opening "if (" line (listing line 2238) is missing; only its last operand is visible. */
2239 minRecoveryPoint <= lastReplayedEndRecPtr)
2240 {
2241 /*
2242 * Check to see if the XLOG sequence contained any unresolved
2243 * references to uninitialized pages.
2244 */
 /* NOTE(review): the call performing this check (listing line 2245) is missing. */
2246
2247 /*
2248 * Check that pg_tblspc doesn't contain any real directories. Replay
2249 * of Database/CREATE_* records may have created fictitious tablespace
2250 * directories that should have been removed by the time consistency
2251 * was reached.
2252 */
 /* NOTE(review): the call performing this check (listing line 2253) is missing. */
2254
2255 reachedConsistency = true;
2257 ereport(LOG,
2258 (errmsg("consistent recovery state reached at %X/%X",
2259 LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
2260 }
2261
2262 /*
2263 * Have we got a valid starting snapshot that will allow queries to be
2264 * run? If so, we can tell postmaster that the database is consistent now,
2265 * enabling connections.
2266 */
 /* NOTE(review): the whole condition (listing lines 2267-2270) and most of the body are missing. */
2271 {
2275
2276 LocalHotStandbyActive = true;
2277
2279 }
2280}
2281
2282/*
2283 * Error context callback for errors occurring during rm_redo().
 *
 * 'arg' is cast to the XLogReaderState of the record being replayed. The
 * callback formats a description of that record (xlog_outdesc() followed
 * by xlog_block_info()) and attaches it to the error being reported via
 * errcontext(), then frees the formatted text.
 *
 * NOTE(review): the function-name line and the declaration and
 * initialization of 'buf' (listing lines 2286, 2289, 2291) are missing from
 * this listing.
2284 */
2285static void
2287{
2288 XLogReaderState *record = (XLogReaderState *) arg;
2290
2292 xlog_outdesc(&buf, record);
2293 xlog_block_info(&buf, record);
2294
2295 /* translator: %s is a WAL record description */
2296 errcontext("WAL redo at %X/%X for %s",
2297 LSN_FORMAT_ARGS(record->ReadRecPtr),
2298 buf.data);
2299
 /* The context line has been emitted; release the temporary description. */
2300 pfree(buf.data);
2301}
2302
2303/*
2304 * Appends to buf a string describing an XLogRecord, consisting of its identity
2305 * optionally followed by a colon, a space, and a further description.
 *
 * Nothing is returned (the function is void); all output goes into 'buf'.
 * The identity comes from the resource manager's rm_identify callback; when
 * that yields NULL, "UNKNOWN (<info bits>)" is written instead. The further
 * description is produced by the resource manager's rm_desc callback.
 *
 * NOTE(review): the function-name line and listing lines 2314-2315 are
 * missing from this listing.
2306 */
2307void
2309{
2310 RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2311 uint8 info = XLogRecGetInfo(record);
2312 const char *id;
2313
2316
2317 id = rmgr.rm_identify(info);
2318 if (id == NULL)
 /* Unrecognized record type: print the info bits with the XLR_INFO_MASK bits cleared. */
2319 appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2320 else
2321 appendStringInfo(buf, "%s: ", id);
2322
2323 rmgr.rm_desc(buf, record);
2324}
2325
2326#ifdef WAL_DEBUG
2327
/*
 * Appends to buf a short summary of a WAL record: the "prev" pointer and
 * xid, the length reported by XLogRecGetDataLen(), and then the per-block
 * details produced by xlog_block_info().
 *
 * Only compiled when WAL_DEBUG is defined (see the surrounding #ifdef).
 *
 * NOTE(review): the argument line supplying the two values for "%X/%X"
 * (listing line 2332) is missing from this listing.
 */
2328static void
2329xlog_outrec(StringInfo buf, XLogReaderState *record)
2330{
2331 appendStringInfo(buf, "prev %X/%X; xid %u",
2333 XLogRecGetXid(record));
2334
2335 appendStringInfo(buf, "; len %u",
2336 XLogRecGetDataLen(record));
2337
2338 xlog_block_info(buf, record);
2339}
2340#endif /* WAL_DEBUG */
2341
2342/*
2343 * Appends to buf a string giving information about all the blocks in an
2344 * XLogRecord.
 *
 * Nothing is returned (the function is void). For every block id from 0 to
 * XLogRecMaxBlockId(record) that carries a block reference, one
 * "; blkref #N: rel spc/db/rel, ..." item is appended; the fork number is
 * printed only when it is not MAIN_FORKNUM, and " FPW" is added when the
 * block has an image attached.
 *
 * NOTE(review): the function-name/parameter line (listing line 2347) is
 * missing from this listing.
2345 */
2346static void
2348{
2349 int block_id;
2350
2351 /* decode block references */
2352 for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2353 {
2354 RelFileLocator rlocator;
2355 ForkNumber forknum;
2356 BlockNumber blk;
2357
 /* Block ids may be sparse; skip ids for which no tag can be fetched. */
2358 if (!XLogRecGetBlockTagExtended(record, block_id,
2359 &rlocator, &forknum, &blk, NULL))
2360 continue;
2361
2362 if (forknum != MAIN_FORKNUM)
2363 appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2364 block_id,
2365 rlocator.spcOid, rlocator.dbOid,
2366 rlocator.relNumber,
2367 forknum,
2368 blk);
2369 else
2370 appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2371 block_id,
2372 rlocator.spcOid, rlocator.dbOid,
2373 rlocator.relNumber,
2374 blk);
2375 if (XLogRecHasBlockImage(record, block_id))
2376 appendStringInfoString(buf, " FPW");
2377 }
2378}
2379
2380
2381/*
2382 * Check that it's OK to switch to new timeline during recovery.
2383 *
2384 * 'lsn' is the address of the shutdown checkpoint record we're about to
2385 * replay. (Currently, timeline can only change at a shutdown checkpoint).
 *
 * Three checks are made; each failure is reported at PANIC level:
 * the record's previous timeline must equal replayTLI, the new timeline
 * must not be lower than replayTLI and must appear in expectedTLEs, and the
 * switch must not skip over a not-yet-reached minimum recovery point.
 * Returns normally (no value) when all checks pass.
 *
 * NOTE(review): the first line of the signature (listing line 2388) is
 * missing; only the trailing 'replayTLI' parameter is visible. The body
 * also uses lsn, newTLI and prevTLI, which are presumably the other
 * parameters -- confirm against the complete file.
2386 */
2387static void
2389 TimeLineID replayTLI)
2390{
2391 /* Check that the record agrees on what the current (old) timeline is */
2392 if (prevTLI != replayTLI)
2393 ereport(PANIC,
2394 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2395 prevTLI, replayTLI)));
2396
2397 /*
2398 * The new timeline better be in the list of timelines we expect to see,
2399 * according to the timeline history. It should also not decrease.
2400 */
2401 if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2402 ereport(PANIC,
2403 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2404 newTLI, replayTLI)));
2405
2406 /*
2407 * If we have not yet reached min recovery point, and we're about to
2408 * switch to a timeline greater than the timeline of the min recovery
2409 * point: trouble. After switching to the new timeline, we could not
2410 * possibly visit the min recovery point on the correct timeline anymore.
2411 * This can happen if there is a newer timeline in the archive that
2412 * branched before the timeline the min recovery point is on, and you
2413 * attempt to do PITR to the new timeline.
2414 */
 /* NOTE(review): the opening "if (" line (listing line 2415) and the last two errmsg arguments (2421-2422) are missing. */
2416 lsn < minRecoveryPoint &&
2417 newTLI > minRecoveryPointTLI)
2418 ereport(PANIC,
2419 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
2420 newTLI,
2423
2424 /* Looks good */
2425}
2426
2427
2428/*
2429 * Extract timestamp from WAL record.
2430 *
2431 * If the record contains a timestamp, returns true, and saves the timestamp
2432 * in *recordXtime. If the record type has no timestamp, returns false.
2433 * Currently, only transaction commit/abort records and restore points contain
2434 * timestamps.
 *
 * *recordXtime is left untouched when false is returned.
 *
 * NOTE(review): the function-name/parameter line (listing line 2437) is
 * missing from this listing; the body uses 'record' and 'recordXtime'.
2435 */
2436static bool
2438{
 /* 'info' drops the XLR_INFO_MASK bits; 'xact_info' keeps only the XLOG_XACT_OPMASK bits of that. */
2439 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2440 uint8 xact_info = info & XLOG_XACT_OPMASK;
2441 uint8 rmid = XLogRecGetRmid(record);
2442
 /* Restore point: timestamp is the rp_time field of the record data. */
2443 if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2444 {
2445 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2446 return true;
2447 }
 /* Commit (plain or prepared): timestamp is xl_xact_commit.xact_time. */
2448 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2449 xact_info == XLOG_XACT_COMMIT_PREPARED))
2450 {
2451 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2452 return true;
2453 }
 /* Abort (plain or prepared): timestamp is xl_xact_abort.xact_time. */
2454 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2455 xact_info == XLOG_XACT_ABORT_PREPARED))
2456 {
2457 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2458 return true;
2459 }
2460 return false;
2461}
2462
2463/*
2464 * Checks whether the current buffer page and backup page stored in the
2465 * WAL record are consistent or not. Before comparing the two pages, a
2466 * masking can be applied to the pages to ignore certain areas like hint bits,
2467 * unused space between pd_lower and pd_upper among other things. This
2468 * function should be called once WAL replay has been completed for a
2469 * given record.
 *
 * A mismatch after masking is reported with elog(FATAL); failure to restore
 * the image from the record is reported with ereport(ERROR). Blocks are
 * skipped (not compared) when the record has no reference for that id,
 * when the image was already applied during replay, when no valid buffer
 * is obtained, or when the LSN check below says the page is already newer.
 *
 * NOTE(review): replay_image_masked and primary_image_masked are not
 * declared in the visible part of this function. The function-name line
 * and several statements (listing lines 2472, 2484, 2518-2519, 2523, 2533,
 * 2540) are missing from this listing.
2470 */
2471static void
2473{
2474 RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2475 RelFileLocator rlocator;
2476 ForkNumber forknum;
2477 BlockNumber blkno;
2478 int block_id;
2479
2480 /* Records with no backup blocks have no need for consistency checks. */
2481 if (!XLogRecHasAnyBlockRefs(record))
2482 return;
2483
2485
2486 for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2487 {
2488 Buffer buf;
2489 Page page;
2490
2491 if (!XLogRecGetBlockTagExtended(record, block_id,
2492 &rlocator, &forknum, &blkno, NULL))
2493 {
2494 /*
2495 * WAL record doesn't contain a block reference with the given id.
2496 * Do nothing.
2497 */
2498 continue;
2499 }
2500
2501 Assert(XLogRecHasBlockImage(record, block_id));
2502
2503 if (XLogRecBlockImageApply(record, block_id))
2504 {
2505 /*
2506 * WAL record has already applied the page, so bypass the
2507 * consistency check as that would result in comparing the full
2508 * page stored in the record with itself.
2509 */
2510 continue;
2511 }
2512
2513 /*
2514 * Read the contents from the current buffer and store it in a
2515 * temporary page.
2516 */
 /* NOTE(review): the remaining arguments of this call (listing lines 2518-2519) are missing. */
2517 buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2520 if (!BufferIsValid(buf))
2521 continue;
2522
2524 page = BufferGetPage(buf);
2525
2526 /*
2527 * Take a copy of the local page where WAL has been applied to have a
2528 * comparison base before masking it...
2529 */
2530 memcpy(replay_image_masked, page, BLCKSZ);
2531
2532 /* No need for this page anymore now that a copy is in. */
 /* NOTE(review): the statement that lets go of the buffer (listing line 2533) is missing. */
2534
2535 /*
2536 * If the block LSN is already ahead of this WAL record, we can't
2537 * expect contents to match. This can happen if recovery is
2538 * restarted.
2539 */
 /* NOTE(review): the LSN comparison guarding this "continue" (listing line 2540) is missing. */
2541 continue;
2542
2543 /*
2544 * Read the contents from the backup copy, stored in WAL record and
2545 * store it in a temporary page. There is no need to allocate a new
2546 * page here, a local buffer is fine to hold its contents and a mask
2547 * can be directly applied on it.
2548 */
2549 if (!RestoreBlockImage(record, block_id, primary_image_masked))
2550 ereport(ERROR,
2551 (errcode(ERRCODE_INTERNAL_ERROR),
2552 errmsg_internal("%s", record->errormsg_buf)));
2553
2554 /*
2555 * If masking function is defined, mask both the primary and replay
2556 * images
2557 */
2558 if (rmgr.rm_mask != NULL)
2559 {
2560 rmgr.rm_mask(replay_image_masked, blkno);
2561 rmgr.rm_mask(primary_image_masked, blkno);
2562 }
2563
2564 /* Time to compare the primary and replay images. */
2565 if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2566 {
2567 elog(FATAL,
2568 "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2569 rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2570 forknum, blkno);
2571 }
2572 }
2573}
2574
2575/*
2576 * For point-in-time recovery, this function decides whether we want to
2577 * stop applying the XLOG before the current record.
2578 *
2579 * Returns true if we are stopping, false otherwise. If stopping, some
2580 * information is saved in recoveryStopXid et al for use in annotating the
2581 * new timeline's history file.
 *
 * Order of checks in the body: the "stop at consistency" target, then the
 * LSN target, then (for commit/abort records of RM_XACT_ID only) the xid
 * and timestamp targets. Whenever this function decides to stop it sets
 * recoveryStopAfter = false.
 *
 * NOTE(review): this listing has dropped source lines (the embedded line
 * numbers skip): the function-name line, the headers of most conditionals,
 * the first line of the two-phase parse calls and the trailing arguments of
 * several ereport() calls. Consult the complete file before editing.
2582 */
2583static bool
2585{
2586 bool stopsHere = false;
2587 uint8 xact_info;
2588 bool isCommit;
2589 TimestampTz recordXtime = 0;
2590 TransactionId recordXid;
2591
2592 /*
2593 * Ignore recovery target settings when not in archive recovery (meaning
2594 * we are in crash recovery).
2595 */
 /* NOTE(review): the condition guarding this return (listing line 2596) is missing. */
2597 return false;
2598
2599 /* Check if we should stop as soon as reaching consistency */
 /* NOTE(review): the "if (...)" header (listing line 2600) is missing. */
2601 {
2602 ereport(LOG,
2603 (errmsg("recovery stopping after reaching consistency")));
2604
2605 recoveryStopAfter = false;
2608 recoveryStopTime = 0;
2609 recoveryStopName[0] = '\0';
2610 return true;
2611 }
2612
2613 /* Check if target LSN has been reached */
 /* NOTE(review): the first lines of this condition (listing lines 2614-2615) are missing. */
2616 record->ReadRecPtr >= recoveryTargetLSN)
2617 {
2618 recoveryStopAfter = false;
2620 recoveryStopLSN = record->ReadRecPtr;
2621 recoveryStopTime = 0;
2622 recoveryStopName[0] = '\0';
2623 ereport(LOG,
2624 (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
2626 return true;
2627 }
2628
2629 /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2630 if (XLogRecGetRmid(record) != RM_XACT_ID)
2631 return false;
2632
2633 xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2634
 /*
 * Classify the record and find the xid it ends. For plain commit/abort
 * the xid is the record's own; for the *_PREPARED variants it is taken
 * from the parsed record's twophase_xid field.
 */
2635 if (xact_info == XLOG_XACT_COMMIT)
2636 {
2637 isCommit = true;
2638 recordXid = XLogRecGetXid(record);
2639 }
2640 else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2641 {
2642 xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2643 xl_xact_parsed_commit parsed;
2644
2645 isCommit = true;
 /* NOTE(review): the first line of the call that fills 'parsed' (listing line 2646) is missing. */
2647 xlrec,
2648 &parsed);
2649 recordXid = parsed.twophase_xid;
2650 }
2651 else if (xact_info == XLOG_XACT_ABORT)
2652 {
2653 isCommit = false;
2654 recordXid = XLogRecGetXid(record);
2655 }
2656 else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2657 {
2658 xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2659 xl_xact_parsed_abort parsed;
2660
2661 isCommit = false;
 /* NOTE(review): the first line of the call that fills 'parsed' (listing line 2662) is missing. */
2663 xlrec,
2664 &parsed);
2665 recordXid = parsed.twophase_xid;
2666 }
2667 else
2668 return false;
2669
 /* NOTE(review): the "if (...)" header for the xid-target check (listing line 2670) is missing. */
2671 {
2672 /*
2673 * There can be only one transaction end record with this exact
2674 * transactionid
2675 *
2676 * when testing for an xid, we MUST test for equality only, since
2677 * transactions are numbered in the order they start, not the order
2678 * they complete. A higher-numbered xid will complete before the
2679 * target xid about 50% of the time...
2680 */
2681 stopsHere = (recordXid == recoveryTargetXid);
2682 }
2683
2684 /*
2685 * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2686 * We don't expect getRecordTimestamp ever to fail, since we already know
2687 * this is a commit or abort record; but test its result anyway.
2688 */
 /* NOTE(review): the second operand of this condition (listing line 2690) is missing. */
2689 if (getRecordTimestamp(record, &recordXtime) &&
2691 {
2692 /*
2693 * There can be many transactions that share the same commit time, so
2694 * we stop after the last one, if we are inclusive, or stop at the
2695 * first one if we are exclusive
2696 */
 /* NOTE(review): the "if (...)" selecting between the two comparisons (listing line 2697) is missing. */
2698 stopsHere = (recordXtime > recoveryTargetTime);
2699 else
2700 stopsHere = (recordXtime >= recoveryTargetTime);
2701 }
2702
2703 if (stopsHere)
2704 {
2705 recoveryStopAfter = false;
2706 recoveryStopXid = recordXid;
2707 recoveryStopTime = recordXtime;
2709 recoveryStopName[0] = '\0';
2710
2711 if (isCommit)
2712 {
2713 ereport(LOG,
2714 (errmsg("recovery stopping before commit of transaction %u, time %s",
2717 }
2718 else
2719 {
2720 ereport(LOG,
2721 (errmsg("recovery stopping before abort of transaction %u, time %s",
2724 }
2725 }
2726
2727 return stopsHere;
2728}
2729
2730/*
2731 * Same as recoveryStopsBefore, but called after applying the record.
2732 *
2733 * We also track the timestamp of the latest applied COMMIT/ABORT
2734 * record in XLogRecoveryCtl->recoveryLastXTime.
 *
 * Returns true if recovery should stop after this record, false otherwise.
 * Order of checks in the body: named restore point, LSN target, xid target
 * (commit/abort records only), and finally the "stop at consistency"
 * target. Whenever this function decides to stop it sets
 * recoveryStopAfter = true.
 *
 * NOTE(review): this listing has dropped source lines (the embedded line
 * numbers skip): the function-name line, the headers of most conditionals,
 * the first line of the two-phase parse calls and the trailing arguments of
 * several ereport() calls. Consult the complete file before editing.
2735 */
2736static bool
2738{
2739 uint8 info;
2740 uint8 xact_info;
2741 uint8 rmid;
2742 TimestampTz recordXtime = 0;
2743
2744 /*
2745 * Ignore recovery target settings when not in archive recovery (meaning
2746 * we are in crash recovery).
2747 */
 /* NOTE(review): the condition guarding this return (listing line 2748) is missing. */
2749 return false;
2750
2751 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2752 rmid = XLogRecGetRmid(record);
2753
2754 /*
2755 * There can be many restore points that share the same name; we stop at
2756 * the first one.
2757 */
 /* NOTE(review): the opening "if (" line (listing line 2758) is missing; only its last operands are visible. */
2759 rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2760 {
2761 xl_restore_point *recordRestorePointData;
2762
2763 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2764
2765 if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2766 {
2767 recoveryStopAfter = true;
 /* The boolean result is deliberately discarded; only the timestamp side effect is wanted. */
2770 (void) getRecordTimestamp(record, &recoveryStopTime);
2771 strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2772
2773 ereport(LOG,
2774 (errmsg("recovery stopping at restore point \"%s\", time %s",
2777 return true;
2778 }
2779 }
2780
2781 /* Check if the target LSN has been reached */
 /* NOTE(review): the first lines of this condition (listing lines 2782-2783) are missing. */
2784 record->ReadRecPtr >= recoveryTargetLSN)
2785 {
2786 recoveryStopAfter = true;
2788 recoveryStopLSN = record->ReadRecPtr;
2789 recoveryStopTime = 0;
2790 recoveryStopName[0] = '\0';
2791 ereport(LOG,
2792 (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
2794 return true;
2795 }
2796
 /* The remaining xid check applies only to transaction-manager records. */
2797 if (rmid != RM_XACT_ID)
2798 return false;
2799
2800 xact_info = info & XLOG_XACT_OPMASK;
2801
2802 if (xact_info == XLOG_XACT_COMMIT ||
2803 xact_info == XLOG_XACT_COMMIT_PREPARED ||
2804 xact_info == XLOG_XACT_ABORT ||
2805 xact_info == XLOG_XACT_ABORT_PREPARED)
2806 {
2807 TransactionId recordXid;
2808
2809 /* Update the last applied transaction timestamp */
2810 if (getRecordTimestamp(record, &recordXtime))
2811 SetLatestXTime(recordXtime);
2812
2813 /* Extract the XID of the committed/aborted transaction */
2814 if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2815 {
2816 xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2817 xl_xact_parsed_commit parsed;
2818
 /* NOTE(review): the first line of the call that fills 'parsed' (listing line 2819) is missing. */
2820 xlrec,
2821 &parsed);
2822 recordXid = parsed.twophase_xid;
2823 }
2824 else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2825 {
2826 xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2827 xl_xact_parsed_abort parsed;
2828
 /* NOTE(review): the first line of the call that fills 'parsed' (listing line 2829) is missing. */
2830 xlrec,
2831 &parsed);
2832 recordXid = parsed.twophase_xid;
2833 }
2834 else
2835 recordXid = XLogRecGetXid(record);
2836
2837 /*
2838 * There can be only one transaction end record with this exact
2839 * transactionid
2840 *
2841 * when testing for an xid, we MUST test for equality only, since
2842 * transactions are numbered in the order they start, not the order
2843 * they complete. A higher-numbered xid will complete before the
2844 * target xid about 50% of the time...
2845 */
 /* NOTE(review): the opening "if (" line (listing line 2846) is missing; only its last operand is visible. */
2847 recordXid == recoveryTargetXid)
2848 {
2849 recoveryStopAfter = true;
2850 recoveryStopXid = recordXid;
2851 recoveryStopTime = recordXtime;
2853 recoveryStopName[0] = '\0';
2854
2855 if (xact_info == XLOG_XACT_COMMIT ||
2856 xact_info == XLOG_XACT_COMMIT_PREPARED)
2857 {
2858 ereport(LOG,
2859 (errmsg("recovery stopping after commit of transaction %u, time %s",
2862 }
2863 else if (xact_info == XLOG_XACT_ABORT ||
2864 xact_info == XLOG_XACT_ABORT_PREPARED)
2865 {
2866 ereport(LOG,
2867 (errmsg("recovery stopping after abort of transaction %u, time %s",
2870 }
2871 return true;
2872 }
2873 }
2874
2875 /* Check if we should stop as soon as reaching consistency */
 /* NOTE(review): the "if (...)" header (listing line 2876) is missing. */
2877 {
2878 ereport(LOG,
2879 (errmsg("recovery stopping after reaching consistency")));
2880
2881 recoveryStopAfter = true;
2883 recoveryStopTime = 0;
2885 recoveryStopName[0] = '\0';
2886 return true;
2887 }
2888
2889 return false;
2890}
2891
2892/*
2893 * Create a comment for the history file to explain why and where
2894 * timeline changed.
 *
 * The text is formatted into a 200-byte local buffer with snprintf() and
 * returned as a pstrdup()'d copy, so the caller receives separately
 * allocated storage.
 *
 * NOTE(review): the function-name line, every branch condition of the
 * if/else-if chain and the trailing snprintf() arguments are missing from
 * this listing, so which recovery target selects which message cannot be
 * read from here. Also note that two of the visible formats ("%s %s\n" and
 * "%s LSN %X/%X\n") end in a newline while the others do not -- confirm
 * that this is intended before relying on the exact text.
2895 */
2896static char *
2898{
2899 char reason[200];
2900
2902 snprintf(reason, sizeof(reason),
2903 "%s transaction %u",
2904 recoveryStopAfter ? "after" : "before",
2907 snprintf(reason, sizeof(reason),
2908 "%s %s\n",
2909 recoveryStopAfter ? "after" : "before",
2912 snprintf(reason, sizeof(reason),
2913 "%s LSN %X/%X\n",
2914 recoveryStopAfter ? "after" : "before",
2917 snprintf(reason, sizeof(reason),
2918 "at restore point \"%s\"",
2921 snprintf(reason, sizeof(reason), "reached consistency");
2922 else
2923 snprintf(reason, sizeof(reason), "no recovery target specified");
2924
2925 return pstrdup(reason);
2926}
2927
2928/*
2929 * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2930 *
2931 * endOfRecovery is true if the recovery target is reached and
2932 * the paused state starts at the end of recovery because of
2933 * recovery_target_action=pause, and false otherwise.
 *
 * endOfRecovery only selects which LOG message and hint are emitted before
 * the wait loop starts.
 *
 * NOTE(review): the conditions of both early returns, the loop header, the
 * statements inside the loop and the statement after it (listing lines
 * 2939, 2943, 2956, 2958-2959, 2966, 2973, 2976) are missing from this
 * listing; only their comments remain.
2934 */
2935static void
2936recoveryPausesHere(bool endOfRecovery)
2937{
2938 /* Don't pause unless users can connect! */
2940 return;
2941
2942 /* Don't pause after standby promotion has been triggered */
2944 return;
2945
2946 if (endOfRecovery)
2947 ereport(LOG,
2948 (errmsg("pausing at the end of recovery"),
2949 errhint("Execute pg_wal_replay_resume() to promote.")));
2950 else
2951 ereport(LOG,
2952 (errmsg("recovery has paused"),
2953 errhint("Execute pg_wal_replay_resume() to continue.")));
2954
2955 /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2957 {
 /* NOTE(review): the condition guarding this return (listing lines 2958-2959) is missing. */
2960 return;
2961
2962 /*
2963 * If recovery pause is requested then set it paused. While we are in
2964 * the loop, user might resume and pause again so set this every time.
2965 */
2967
2968 /*
2969 * We wait on a condition variable that will wake us as soon as the
2970 * pause ends, but we use a timeout so we can check the above exit
2971 * condition periodically too.
2972 */
2974 WAIT_EVENT_RECOVERY_PAUSE);
2975 }
2977}
2978
2979/*
2980 * When recovery_min_apply_delay is set, we wait long enough to make sure
2981 * certain record types are applied at least that interval behind the primary.
2982 *
2983 * Returns true if we waited.
2984 *
2985 * Note that the delay is calculated between the WAL record log time and
2986 * the current time on standby. We would prefer to keep track of when this
2987 * standby received each WAL record, which would allow a more consistent
2988 * approach and one not affected by time synchronisation issues, but that
2989 * is significantly more effort and complexity for little actual gain in
2990 * usability.
 *
 * Returns false without waiting when no delay is configured, when
 * consistency has not been reached, when the record is not a commit
 * (plain or prepared) of RM_XACT_ID, when the record has no timestamp, or
 * when the computed remaining time is already <= 0.
 *
 * NOTE(review): the function-name line, the computation of delayUntil and
 * msecs, and most statements inside the wait loop (listing lines 2993,
 * 3009, 3032, 3038, 3044, 3047, 3049, 3056, 3061, 3069-3070) are missing
 * from this listing.
2991 */
2992static bool
2994{
2995 uint8 xact_info;
2996 TimestampTz xtime;
2997 TimestampTz delayUntil;
2998 long msecs;
2999
3000 /* nothing to do if no delay configured */
3001 if (recovery_min_apply_delay <= 0)
3002 return false;
3003
3004 /* no delay is applied on a database not yet consistent */
3005 if (!reachedConsistency)
3006 return false;
3007
3008 /* nothing to do if crash recovery is requested */
 /* NOTE(review): the condition guarding this return (listing line 3009) is missing. */
3010 return false;
3011
3012 /*
3013 * Is it a COMMIT record?
3014 *
3015 * We deliberately choose not to delay aborts since they have no effect on
3016 * MVCC. We already allow replay of records that don't have a timestamp,
3017 * so there is already opportunity for issues caused by early conflicts on
3018 * standbys.
3019 */
3020 if (XLogRecGetRmid(record) != RM_XACT_ID)
3021 return false;
3022
3023 xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
3024
3025 if (xact_info != XLOG_XACT_COMMIT &&
3026 xact_info != XLOG_XACT_COMMIT_PREPARED)
3027 return false;
3028
3029 if (!getRecordTimestamp(record, &xtime))
3030 return false;
3031
3033
3034 /*
3035 * Exit without arming the latch if it's already past time to apply this
3036 * record
3037 */
 /* NOTE(review): the statement that computes msecs (listing line 3038) is missing. */
3039 if (msecs <= 0)
3040 return false;
3041
3042 while (true)
3043 {
3045
3046 /* This might change recovery_min_apply_delay. */
3048
 /* NOTE(review): the condition guarding this break (listing line 3049) is missing. */
3050 break;
3051
3052 /*
3053 * Recalculate delayUntil as recovery_min_apply_delay could have
3054 * changed while waiting in this loop.
3055 */
3057
3058 /*
3059 * Wait for difference between GetCurrentTimestamp() and delayUntil.
3060 */
3062 delayUntil);
3063
3064 if (msecs <= 0)
3065 break;
3066
3067 elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3068
 /* NOTE(review): the start of the wait call taking these arguments (listing lines 3069-3070) is missing. */
3071 msecs,
3072 WAIT_EVENT_RECOVERY_APPLY_DELAY);
3073 }
 /* Reaching here means the loop was entered, i.e. we did wait (or were asked to stop waiting). */
3074 return true;
3075}
3076
3077/*
3078 * Get the current state of the recovery pause request.
 *
 * Returns the value held in the local variable 'state'.
 *
 * NOTE(review): the return type, the function-name line, the declaration of
 * 'state' and the statements that load it (listing lines 3080-3081, 3083,
 * 3085-3087) are missing from this listing, so how the value is read
 * (e.g. whether a lock is taken) cannot be determined from here.
3079 */
3082{
3084
3088
3089 return state;
3090}
3091
3092/*
3093 * Set the recovery pause state.
3094 *
3095 * If recovery pause is requested then sets the recovery pause state to
3096 * 'pause requested' if it is not already 'paused'. Otherwise, sets it
3097 * to 'not paused' to resume the recovery. The recovery pause will be
3098 * confirmed by the ConfirmRecoveryPaused.
 *
 * recoveryPause: true to request a pause, false to resume.
 *
 * NOTE(review): only the two "if (!recoveryPause)" tests are visible; the
 * statements they guard and everything around them (listing lines 3103,
 * 3106-3108, 3110, 3113) are missing from this listing.
3099 */
3100void
3101SetRecoveryPause(bool recoveryPause)
3102{
3104
3105 if (!recoveryPause)
3109
3111
3112 if (!recoveryPause)
3114}
3115
3116/*
3117 * Confirm the recovery pause by setting the recovery pause state to
3118 * RECOVERY_PAUSED.
 *
 * NOTE(review): the function-name line and the entire body apart from its
 * leading comment (listing lines 3121, 3124-3127) are missing from this
 * listing.
3119 */
3120static void
3122{
3123 /* If recovery pause is requested then set it paused */
3128}
3129
3130
3131/*
3132 * Attempt to read the next XLOG record.
3133 *
3134 * Before first call, the reader needs to be positioned to the first record
3135 * by calling XLogPrefetcherBeginRead().
3136 *
3137 * If no valid record is available, returns NULL, or fails if emode is PANIC.
3138 * (emode must be either PANIC or LOG). In standby mode, retries until a valid
3139 * record is available.
 *
 * fetching_ckpt, emode and replayTLI are stored into the reader's private
 * struct before reading so that the page-read callback can see them.
 * lastSourceFailed is cleared on entry and set whenever an attempt yields
 * no usable record.
 *
 * NOTE(review): the first line of the signature, the declaration of
 * 'private' and many statements in the loop (listing lines 3142, 3146-3147,
 * 3177-3181, 3197, 3204, 3210-3219, 3246-3259, 3266, 3272) are missing from
 * this listing. Consult the complete file before editing.
3140 */
3141static XLogRecord *
3143 bool fetching_ckpt, TimeLineID replayTLI)
3144{
3145 XLogRecord *record;
3148
3149 /* Pass through parameters to XLogPageRead */
3150 private->fetching_ckpt = fetching_ckpt;
3151 private->emode = emode;
 /* randAccess is true when the reader has no previously read record position. */
3152 private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
3153 private->replayTLI = replayTLI;
3154
3155 /* This is the first attempt to read this page. */
3156 lastSourceFailed = false;
3157
3158 for (;;)
3159 {
3160 char *errormsg;
3161
3162 record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3163 if (record == NULL)
3164 {
3165 /*
3166 * When we find that WAL ends in an incomplete record, keep track
3167 * of that record. After recovery is done, we'll write a record
3168 * to indicate to downstream WAL readers that that portion is to
3169 * be ignored.
3170 *
3171 * However, when ArchiveRecoveryRequested = true, we're going to
3172 * switch to a new timeline at the end of recovery. We will only
3173 * copy WAL over to the new timeline up to the end of the last
3174 * complete record, so if we did this, we would later create an
3175 * overwrite contrecord in the wrong place, breaking everything.
3176 */
 /* NOTE(review): the condition and body of this block (listing lines 3177-3178, 3180-3181) are missing. */
3179 {
3182 }
3183
 /* Close the currently open segment file, if any, before retrying or giving up. */
3184 if (readFile >= 0)
3185 {
3186 close(readFile);
3187 readFile = -1;
3188 }
3189
3190 /*
3191 * We only end up here without a message when XLogPageRead()
3192 * failed - in that case we already logged something. In
3193 * StandbyMode that only happens if we have been triggered, so we
3194 * shouldn't loop anymore in that case.
3195 */
 /* NOTE(review): the first line of the ereport() call (listing line 3197) is missing. */
3196 if (errormsg)
3198 (errmsg_internal("%s", errormsg) /* already translated */ ));
3199 }
3200
3201 /*
3202 * Check page TLI is one of the expected values.
3203 */
 /* NOTE(review): the condition (listing line 3204) and several statements of this block are missing. */
3205 {
3206 char fname[MAXFNAMELEN];
3207 XLogSegNo segno;
3208 int32 offset;
3209
3213 XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3216 (errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%X, offset %u",
3218 fname,
3220 offset)));
 /* Treat a record on an unexpected timeline as if no record had been read. */
3221 record = NULL;
3222 }
3223
3224 if (record)
3225 {
3226 /* Great, got a record */
3227 return record;
3228 }
3229 else
3230 {
3231 /* No valid record available from this source */
3232 lastSourceFailed = true;
3233
3234 /*
3235 * If archive recovery was requested, but we were still doing
3236 * crash recovery, switch to archive recovery and retry using the
3237 * offline archive. We have now replayed all the valid WAL in
3238 * pg_wal, so we are presumably now consistent.
3239 *
3240 * We require that there's at least some valid WAL present in
3241 * pg_wal, however (!fetching_ckpt). We could recover using the
3242 * WAL from the archive, even if pg_wal is completely empty, but
3243 * we'd have no idea how far we'd have to replay to reach
3244 * consistency. So err on the safe side and give up.
3245 */
 /* NOTE(review): the opening "if (" line (listing line 3246) is missing; only its last operand is visible. */
3247 !fetching_ckpt)
3248 {
3250 (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3251 InArchiveRecovery = true;
3254
3257 minRecoveryPointTLI = replayTLI;
3258
3260
3261 /*
3262 * Before we retry, reset lastSourceFailed and currentSource
3263 * so that we will check the archive next.
3264 */
3265 lastSourceFailed = false;
3267
3268 continue;
3269 }
3270
3271 /* In standby mode, loop back to retry. Otherwise, give up. */
 /* NOTE(review): the "if (...)" header (listing line 3272) is missing. */
3273 continue;
3274 else
3275 return NULL;
3276 }
3277 }
3278}
3279
3280/*
3281 * Read the XLOG page containing targetPagePtr into readBuf (if not read
3282 * already). Returns number of bytes read, if the page is read successfully,
3283 * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3284 * but only if they have not been previously reported.
3285 *
3286 * See XLogReaderRoutine.page_read for more details.
3287 *
3288 * While prefetching, xlogreader->nonblocking may be set. In that case,
3289 * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3290 *
3291 * This is responsible for restoring files from archive as needed, as well
3292 * as for waiting for the requested WAL record to arrive in standby mode.
3293 *
3294 * xlogreader->private_data->emode specifies the log level used for reporting
3295 * "file not found" or "end of WAL" situations in archive recovery, or in
3296 * standby mode when promotion is triggered. If set to WARNING or below,
3297 * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
3298 * levels the ereport() won't return.
3299 *
3300 * In standby mode, if after a successful return of XLogPageRead() the
3301 * caller finds the record it's interested in to be broken, it should
3302 * ereport the error with the level determined by
3303 * emode_for_corrupt_record(), and then set lastSourceFailed
3304 * and call XLogPageRead() again with the same arguments. This lets
3305 * XLogPageRead() to try fetching the record from another source, or to
3306 * sleep and retry.
3307 */
3308static int
3310 XLogRecPtr targetRecPtr, char *readBuf)
3311{
3312 XLogPageReadPrivate *private =
3314 int emode = private->emode;
3315 uint32 targetPageOff;
3317 int r;
3318 instr_time io_start;
3319
3320 XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3321 targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3322
3323 /*
3324 * See if we need to switch to a new segment because the requested record
3325 * is not in the currently open one.
3326 */
3327 if (readFile >= 0 &&
3328 !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3329 {
3330 /*
3331 * Request a restartpoint if we've replayed too much xlog since the
3332 * last one.
3333 */
3335 {
3337 {
3338 (void) GetRedoRecPtr();
3341 }
3342 }
3343
3344 close(readFile);
3345 readFile = -1;
3347 }
3348
3349 XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3350
3351retry:
3352 /* See if we need to retrieve more data */
3353 if (readFile < 0 ||
3355 flushedUpto < targetPagePtr + reqLen))
3356 {
3357 if (readFile >= 0 &&
3360 flushedUpto < targetPagePtr + reqLen)
3361 return XLREAD_WOULDBLOCK;
3362
3363 switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3364 private->randAccess,
3365 private->fetching_ckpt,
3366 targetRecPtr,
3367 private->replayTLI,
3370 {
3371 case XLREAD_WOULDBLOCK:
3372 return XLREAD_WOULDBLOCK;
3373 case XLREAD_FAIL:
3374 if (readFile >= 0)
3375 close(readFile);
3376 readFile = -1;
3377 readLen = 0;
3379 return XLREAD_FAIL;
3380 case XLREAD_SUCCESS:
3381 break;
3382 }
3383 }
3384
3385 /*
3386 * At this point, we have the right segment open and if we're streaming we
3387 * know the requested record is in it.
3388 */
3389 Assert(readFile != -1);
3390
3391 /*
3392 * If the current segment is being streamed from the primary, calculate
3393 * how much of the current page we have received already. We know the
3394 * requested record has been received, but this is for the benefit of
3395 * future calls, to allow quick exit at the top of this function.
3396 */
3398 {
3399 if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3400 readLen = XLOG_BLCKSZ;
3401 else
3403 targetPageOff;
3404 }
3405 else
3406 readLen = XLOG_BLCKSZ;
3407
3408 /* Read the requested page */
3409 readOff = targetPageOff;
3410
3411 /* Measure I/O timing when reading segment */
3413
3414 pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3415 r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
3416 if (r != XLOG_BLCKSZ)
3417 {
3418 char fname[MAXFNAMELEN];
3419 int save_errno = errno;
3420
3422
3424 io_start, 1, r);
3425
3427 if (r < 0)
3428 {
3429 errno = save_errno;
3430 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3432 errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: %m",
3433 fname, LSN_FORMAT_ARGS(targetPagePtr),
3434 readOff)));
3435 }
3436 else
3437 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3439 errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: read %d of %zu",
3440 fname, LSN_FORMAT_ARGS(targetPagePtr),
3441 readOff, r, (Size) XLOG_BLCKSZ)));
3442 goto next_record_is_invalid;
3443 }
3445
3447 io_start, 1, r);
3448
3449 Assert(targetSegNo == readSegNo);
3450 Assert(targetPageOff == readOff);
3451 Assert(reqLen <= readLen);
3452
3454
3455 /*
3456 * Check the page header immediately, so that we can retry immediately if
3457 * it's not valid. This may seem unnecessary, because ReadPageInternal()
3458 * validates the page header anyway, and would propagate the failure up to
3459 * ReadRecord(), which would retry. However, there's a corner case with
3460 * continuation records, if a record is split across two pages such that
3461 * we would need to read the two pages from different sources across two
3462 * WAL segments.
3463 *
3464 * The first page is only available locally, in pg_wal, because it's
3465 * already been recycled on the primary. The second page, however, is not
3466 * present in pg_wal, and we should stream it from the primary. There is a
3467 * recycled WAL segment present in pg_wal, with garbage contents, however.
3468 * We would read the first page from the local WAL segment, but when
3469 * reading the second page, we would read the bogus, recycled, WAL
3470 * segment. If we didn't catch that case here, we would never recover,
3471 * because ReadRecord() would retry reading the whole record from the
3472 * beginning.
3473 *
3474 * Of course, this only catches errors in the page header, which is what
3475 * happens in the case of a recycled WAL segment. Other kinds of errors or
3476 * corruption still have the same problem. But this at least fixes the
3477 * common case, which can happen as part of normal operation.
3478 *
3479 * Validating the page header is cheap enough that doing it twice
3480 * shouldn't be a big deal from a performance point of view.
3481 *
3482 * When not in standby mode, an invalid page header should cause recovery
3483 * to end, not retry reading the page, so we don't need to validate the
3484 * page header here for the retry. Instead, ReadPageInternal() is
3485 * responsible for the validation.
3486 */
3487 if (StandbyMode &&
3488 (targetPagePtr % wal_segment_size) == 0 &&
3489 !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3490 {
3491 /*
3492 * Emit this error right now then retry this page immediately. Use
3493 * errmsg_internal() because the message was already translated.
3494 */
3495 if (xlogreader->errormsg_buf[0])
3498
3499 /* reset any error XLogReaderValidatePageHeader() might have set */
3501 goto next_record_is_invalid;
3502 }
3503
3504 return readLen;
3505
3506next_record_is_invalid:
3507
3508 /*
3509 * If we're reading ahead, give up fast. Retries and error reporting will
3510 * be handled by a later read when recovery catches up to this point.
3511 */
3513 return XLREAD_WOULDBLOCK;
3514
3515 lastSourceFailed = true;
3516
3517 if (readFile >= 0)
3518 close(readFile);
3519 readFile = -1;
3520 readLen = 0;
3522
3523 /* In standby-mode, keep trying */
3524 if (StandbyMode)
3525 goto retry;
3526 else
3527 return XLREAD_FAIL;
3528}
3529
3530/*
3531 * Open the WAL segment containing WAL location 'RecPtr'.
3532 *
3533 * The segment can be fetched via restore_command, or via walreceiver having
3534 * streamed the record, or it can already be present in pg_wal. Checking
3535 * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3536 * too, in case someone copies a new segment directly to pg_wal. That is not
3537 * documented or recommended, though.
3538 *
3539 * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3540 * prepare to read WAL starting from RedoStartLSN after this.
3541 *
3542 * 'RecPtr' might not point to the beginning of the record we're interested
3543 * in, it might also point to the page or segment header. In that case,
3544 * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3545 * used to decide which timeline to stream the requested WAL from.
3546 *
3547 * 'replayLSN' is the current replay LSN, so that if we scan for new
3548 * timelines, we can reject a switch to a timeline that branched off before
3549 * this point.
3550 *
3551 * If the record is not immediately available, the function returns
3552 * XLREAD_FAIL if we're not in standby mode. In standby mode, waits for it to
3553 * become available.
3554 *
3555 * When the requested record becomes available, the function opens the file
3556 * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3557 * of standby mode is triggered by the user, and there is no more WAL
3558 * available, returns XLREAD_FAIL.
3559 *
3560 * If nonblocking is true, then give up immediately if we can't satisfy the
3561 * request, returning XLREAD_WOULDBLOCK instead of waiting.
3562 */
3563static XLogPageReadResult
3565 bool fetching_ckpt, XLogRecPtr tliRecPtr,
3566 TimeLineID replayTLI, XLogRecPtr replayLSN,
3567 bool nonblocking)
3568{
3569 static TimestampTz last_fail_time = 0;
3571 bool streaming_reply_sent = false;
3572
3573 /*-------
3574 * Standby mode is implemented by a state machine:
3575 *
3576 * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3577 * pg_wal (XLOG_FROM_PG_WAL)
3578 * 2. Check for promotion trigger request
3579 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3580 * 4. Rescan timelines
3581 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3582 *
3583 * Failure to read from the current source advances the state machine to
3584 * the next state.
3585 *
3586 * 'currentSource' indicates the current state. There are no currentSource
3587 * values for "check trigger", "rescan timelines", and "sleep" states,
3588 * those actions are taken when reading from the previous source fails, as
3589 * part of advancing to the next state.
3590 *
3591 * If standby mode is turned off while reading WAL from stream, we move
3592 * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3593 * the files (which would be required at end of recovery, e.g., timeline
3594 * history file) from archive or pg_wal. We don't need to kill WAL receiver
3595 * here because it's already stopped when standby mode is turned off at
3596 * the end of recovery.
3597 *-------
3598 */
3599 if (!InArchiveRecovery)
3601 else if (currentSource == XLOG_FROM_ANY ||
3603 {
3604 lastSourceFailed = false;
3606 }
3607
3608 for (;;)
3609 {
3610 XLogSource oldSource = currentSource;
3611 bool startWalReceiver = false;
3612
3613 /*
3614 * First check if we failed to read from the current source, and
3615 * advance the state machine if so. The failure to read might've
3616 * happened outside this function, e.g. when a CRC check fails on a
3617 * record, or within this loop.
3618 */
3619 if (lastSourceFailed)
3620 {
3621 /*
3622 * Don't allow any retry loops to occur during nonblocking
3623 * readahead. Let the caller process everything that has been
3624 * decoded already first.
3625 */
3626 if (nonblocking)
3627 return XLREAD_WOULDBLOCK;
3628
3629 switch (currentSource)
3630 {
3631 case XLOG_FROM_ARCHIVE:
3632 case XLOG_FROM_PG_WAL:
3633
3634 /*
3635 * Check to see if promotion is requested. Note that we do
3636 * this only after failure, so when you promote, we still
3637 * finish replaying as much as we can from archive and
3638 * pg_wal before failover.
3639 */
3641 {
3643 return XLREAD_FAIL;
3644 }
3645
3646 /*
3647 * Not in standby mode, and we've now tried the archive
3648 * and pg_wal.
3649 */
3650 if (!StandbyMode)
3651 return XLREAD_FAIL;
3652
3653 /*
3654 * Move to XLOG_FROM_STREAM state, and set to start a
3655 * walreceiver if necessary.
3656 */
3658 startWalReceiver = true;
3659 break;
3660
3661 case XLOG_FROM_STREAM:
3662
3663 /*
3664 * Failure while streaming. Most likely, we got here
3665 * because streaming replication was terminated, or
3666 * promotion was triggered. But we also get here if we
3667 * find an invalid record in the WAL streamed from the
3668 * primary, in which case something is seriously wrong.
3669 * There's little chance that the problem will just go
3670 * away, but PANIC is not good for availability either,
3671 * especially in hot standby mode. So, we treat that the
3672 * same as disconnection, and retry from archive/pg_wal
3673 * again. The WAL in the archive should be identical to
3674 * what was streamed, so it's unlikely that it helps, but
3675 * one can hope...
3676 */
3677
3678 /*
3679 * We should be able to move to XLOG_FROM_STREAM only in
3680 * standby mode.
3681 */
3683
3684 /*
3685 * Before we leave XLOG_FROM_STREAM state, make sure that
3686 * walreceiver is not active, so that it won't overwrite
3687 * WAL that we restore from archive.
3688 */
3690
3691 /*
3692 * Before we sleep, re-scan for possible new timelines if
3693 * we were requested to recover to the latest timeline.
3694 */
3696 {
3697 if (rescanLatestTimeLine(replayTLI, replayLSN))
3698 {
3700 break;
3701 }
3702 }
3703
3704 /*
3705 * XLOG_FROM_STREAM is the last state in our state
3706 * machine, so we've exhausted all the options for
3707 * obtaining the requested WAL. We're going to loop back
3708 * and retry from the archive, but if it hasn't been long
3709 * since last attempt, sleep wal_retrieve_retry_interval
3710 * milliseconds to avoid busy-waiting.
3711 */
3713 if (!TimestampDifferenceExceeds(last_fail_time, now,
3715 {
3716 long wait_time;
3717
3718 wait_time = wal_retrieve_retry_interval -
3719 TimestampDifferenceMilliseconds(last_fail_time, now);
3720
3721 elog(LOG, "waiting for WAL to become available at %X/%X",
3722 LSN_FORMAT_ARGS(RecPtr));
3723
3724 /* Do background tasks that might benefit us later. */
3726
3730 wait_time,
3731 WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
3734
3735 /* Handle interrupt signals of startup process */
3737 }
3738 last_fail_time = now;
3740 break;
3741
3742 default:
3743 elog(ERROR, "unexpected WAL source %d", currentSource);
3744 }
3745 }
3746 else if (currentSource == XLOG_FROM_PG_WAL)
3747 {
3748 /*
3749 * We just successfully read a file in pg_wal. We prefer files in
3750 * the archive over ones in pg_wal, so try the next file again
3751 * from the archive first.
3752 */
3755 }
3756
3757 if (currentSource != oldSource)
3758 elog(DEBUG2, "switched WAL source from %s to %s after %s",
3760 lastSourceFailed ? "failure" : "success");
3761
3762 /*
3763 * We've now handled possible failure. Try to read from the chosen
3764 * source.
3765 */
3766 lastSourceFailed = false;
3767
3768 switch (currentSource)
3769 {
3770 case XLOG_FROM_ARCHIVE:
3771 case XLOG_FROM_PG_WAL:
3772
3773 /*
3774 * WAL receiver must not be running when reading WAL from
3775 * archive or pg_wal.
3776 */
3778
3779 /* Close any old file we might have open. */
3780 if (readFile >= 0)
3781 {
3782 close(readFile);
3783 readFile = -1;
3784 }
3785 /* Reset curFileTLI if random fetch. */
3786 if (randAccess)
3787 curFileTLI = 0;
3788
3789 /*
3790 * Try to restore the file from archive, or read an existing
3791 * file from pg_wal.
3792 */
3796 if (readFile >= 0)
3797 return XLREAD_SUCCESS; /* success! */
3798
3799 /*
3800 * Nope, not found in archive or pg_wal.
3801 */
3802 lastSourceFailed = true;
3803 break;
3804
3805 case XLOG_FROM_STREAM:
3806 {
3807 bool havedata;
3808
3809 /*
3810 * We should be able to move to XLOG_FROM_STREAM only in
3811 * standby mode.
3812 */
3814
3815 /*
3816 * First, shutdown walreceiver if its restart has been
3817 * requested -- but no point if we're already slated for
3818 * starting it.
3819 */
3820 if (pendingWalRcvRestart && !startWalReceiver)
3821 {
3823
3824 /*
3825 * Re-scan for possible new timelines if we were
3826 * requested to recover to the latest timeline.
3827 */
3830 rescanLatestTimeLine(replayTLI, replayLSN);
3831
3832 startWalReceiver = true;
3833 }
3834 pendingWalRcvRestart = false;
3835
3836 /*
3837 * Launch walreceiver if needed.
3838 *
3839 * If fetching_ckpt is true, RecPtr points to the initial
3840 * checkpoint location. In that case, we use RedoStartLSN
3841 * as the streaming start position instead of RecPtr, so
3842 * that when we later jump backwards to start redo at
3843 * RedoStartLSN, we will have the logs streamed already.
3844 */
3845 if (startWalReceiver &&
3846 PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
3847 {
3848 XLogRecPtr ptr;
3849 TimeLineID tli;
3850
3851 if (fetching_ckpt)
3852 {
3853 ptr = RedoStartLSN;
3854 tli = RedoStartTLI;
3855 }
3856 else
3857 {
3858 ptr = RecPtr;
3859
3860 /*
3861 * Use the record begin position to determine the
3862 * TLI, rather than the position we're reading.
3863 */
3864 tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
3865
3866 if (curFileTLI > 0 && tli < curFileTLI)
3867 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3868 LSN_FORMAT_ARGS(tliRecPtr),
3869 tli, curFileTLI);
3870 }
3871 curFileTLI = tli;
3876 flushedUpto = 0;
3877 }
3878
3879 /*
3880 * Check if WAL receiver is active or wait to start up.
3881 */
3882 if (!WalRcvStreaming())
3883 {
3884 lastSourceFailed = true;
3885 break;
3886 }
3887
3888 /*
3889 * Walreceiver is active, so see if new data has arrived.
3890 *
3891 * We only advance XLogReceiptTime when we obtain fresh
3892 * WAL from walreceiver and observe that we had already
3893 * processed everything before the most recent "chunk"
3894 * that it flushed to disk. In steady state where we are
3895 * keeping up with the incoming data, XLogReceiptTime will
3896 * be updated on each cycle. When we are behind,
3897 * XLogReceiptTime will not advance, so the grace time
3898 * allotted to conflicting queries will decrease.
3899 */
3900 if (RecPtr < flushedUpto)
3901 havedata = true;
3902 else
3903 {
3904 XLogRecPtr latestChunkStart;
3905
3906 flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3907 if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
3908 {
3909 havedata = true;
3910 if (latestChunkStart <= RecPtr)
3911 {
3914 }
3915 }
3916 else
3917 havedata = false;
3918 }
3919 if (havedata)
3920 {
3921 /*
3922 * Great, streamed far enough. Open the file if it's
3923 * not open already. Also read the timeline history
3924 * file if we haven't initialized timeline history
3925 * yet; it should be streamed over and present in
3926 * pg_wal by now. Use XLOG_FROM_STREAM so that source
3927 * info is set correctly and XLogReceiptTime isn't
3928 * changed.
3929 *
3930 * NB: We must set readTimeLineHistory based on
3931 * recoveryTargetTLI, not receiveTLI. Normally they'll
3932 * be the same, but if recovery_target_timeline is
3933 * 'latest' and archiving is configured, then it's
3934 * possible that we managed to retrieve one or more
3935 * new timeline history files from the archive,
3936 * updating recoveryTargetTLI.
3937 */
3938 if (readFile < 0)
3939 {
3940 if (!expectedTLEs)
3943 XLOG_FROM_STREAM, false);
3944 Assert(readFile >= 0);
3945 }
3946 else
3947 {
3948 /* just make sure source info is correct... */
3951 return XLREAD_SUCCESS;
3952 }
3953 break;
3954 }
3955
3956 /* In nonblocking mode, return rather than sleeping. */
3957 if (nonblocking)
3958 return XLREAD_WOULDBLOCK;
3959
3960 /*
3961 * Data not here yet. Check for trigger, then wait for
3962 * walreceiver to wake us up when new WAL arrives.
3963 */
3965 {
3966 /*
3967 * Note that we don't return XLREAD_FAIL immediately
3968 * here. After being triggered, we still want to
3969 * replay all the WAL that was already streamed. It's
3970 * in pg_wal now, so we just treat this as a failure,
3971 * and the state machine will move on to replay the
3972 * streamed WAL from pg_wal, and then recheck the
3973 * trigger and exit replay.
3974 */
3975 lastSourceFailed = true;
3976 break;
3977 }
3978
3979 /*
3980 * Since we have replayed everything we have received so
3981 * far and are about to start waiting for more WAL, let's
3982 * tell the upstream server our replay location now so
3983 * that pg_stat_replication doesn't show stale
3984 * information.
3985 */
3986 if (!streaming_reply_sent)
3987 {
3989 streaming_reply_sent = true;
3990 }
3991
3992 /* Do any background tasks that might benefit us later. */
3994
3995 /* Update pg_stat_recovery_prefetch before sleeping. */
3997
3998 /*
3999 * Wait for more WAL to arrive, when we will be woken
4000 * immediately by the WAL receiver.
4001 */
4004 -1L,
4005 WAIT_EVENT_RECOVERY_WAL_STREAM);
4007 break;
4008 }
4009
4010 default:
4011 elog(ERROR, "unexpected WAL source %d", currentSource);
4012 }
4013
4014 /*
4015 * Check for recovery pause here so that we can confirm more quickly
4016 * that a requested pause has actually taken effect.
4017 */
4018 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
4020 recoveryPausesHere(false);
4021
4022 /*
4023 * This possibly-long loop needs to handle interrupts of startup
4024 * process.
4025 */
4027 }
4028
4029 return XLREAD_FAIL; /* not reached */
4030}
4031
4032
4033/*
4034 * Determine what log level should be used to report a corrupt WAL record
4035 * in the current WAL page, previously read by XLogPageRead().
4036 *
4037 * 'emode' is the error mode that would be used to report a file-not-found
4038 * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4039 * we're retrying the exact same record that we've tried previously, only
4040 * complain the first time to keep the noise down. However, we only do so when
4041 * reading from pg_wal, because we don't expect any invalid records in archive
4042 * or in records streamed from the primary. Files in the archive should be complete,
4043 * and we should never hit the end of WAL because we stop and wait for more WAL
4044 * to arrive before replaying it.
4045 *
4046 * NOTE: This function remembers the RecPtr value it was last called with,
4047 * to suppress repeated messages about the same record. Only call this when
4048 * you are about to ereport(), or you might cause a later message to be
4049 * erroneously suppressed.
4050 */
4051static int
4053{
4054 static XLogRecPtr lastComplaint = 0; /* RecPtr last reported at LOG level */
4055
4056 if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
4057 {
4058 if (RecPtr == lastComplaint)
4059 emode = DEBUG1; /* same record as last time: demote the repeat */
4060 else
4061 lastComplaint = RecPtr; /* new record: report as-is and remember it */
4062 }
4063 return emode;
4064}
4065
4066
4067/*
4068 * Subroutine to try to fetch and validate a prior checkpoint record.
4069 */
4070static XLogRecord *
4072 TimeLineID replayTLI)
4073{
4074 XLogRecord *record;
4075 uint8 info;
4076
4077 Assert(xlogreader != NULL);
4078
4079 if (!XRecOffIsValid(RecPtr))
4080 {
4081 ereport(LOG,
4082 (errmsg("invalid checkpoint location")));
4083 return NULL;
4084 }
4085
4087 record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4088
4089 if (record == NULL)
4090 {
4091 ereport(LOG,
4092 (errmsg("invalid checkpoint record")));
4093 return NULL;
4094 }
4095 if (record->xl_rmid != RM_XLOG_ID)
4096 {
4097 ereport(LOG,
4098 (errmsg("invalid resource manager ID in checkpoint record")));
4099 return NULL;
4100 }
4101 info = record->xl_info & ~XLR_INFO_MASK;
4102 if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4103 info != XLOG_CHECKPOINT_ONLINE)
4104 {
4105 ereport(LOG,
4106 (errmsg("invalid xl_info in checkpoint record")));
4107 return NULL;
4108 }
4110 {
4111 ereport(LOG,
4112 (errmsg("invalid length of checkpoint record")));
4113 return NULL;
4114 }
4115 return record;
4116}
4117
4118/*
4119 * Scan for new timelines that might have appeared in the archive since we
4120 * started recovery.
4121 *
4122 * If there are any, the function changes recovery target TLI to the latest
4123 * one and returns 'true'.
4124 */
4125static bool
4127{
4128 List *newExpectedTLEs;
4129 bool found;
4130 ListCell *cell;
4131 TimeLineID newtarget;
4132 TimeLineID oldtarget = recoveryTargetTLI;
4133 TimeLineHistoryEntry *currentTle = NULL;
4134
4136 if (newtarget == recoveryTargetTLI)
4137 {
4138 /* No new timelines found */
4139 return false;
4140 }
4141
4142 /*
4143 * Determine the list of expected TLIs for the new TLI
4144 */
4145
4146 newExpectedTLEs = readTimeLineHistory(newtarget);
4147
4148 /*
4149 * If the current timeline is not part of the history of the new timeline,
4150 * we cannot proceed to it.
4151 */
4152 found = false;
4153 foreach(cell, newExpectedTLEs)
4154 {
4155 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4156
4157 if (currentTle->tli == recoveryTargetTLI)
4158 {
4159 found = true;
4160 break;
4161 }
4162 }
4163 if (!found)
4164 {
4165 ereport(LOG,
4166 (errmsg("new timeline %u is not a child of database system timeline %u",
4167 newtarget,
4168 replayTLI)));
4169 return false;
4170 }
4171
4172 /*
4173 * The current timeline was found in the history file, but check that the
4174 * next timeline was forked off from it *after* the current recovery
4175 * location.
4176 */
4177 if (currentTle->end < replayLSN)
4178 {
4179 ereport(LOG,
4180 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4181 newtarget,
4182 replayTLI,
4183 LSN_FORMAT_ARGS(replayLSN))));
4184 return false;
4185 }
4186
4187 /* The new timeline history seems valid. Switch target */
4188 recoveryTargetTLI = newtarget;
4190 expectedTLEs = newExpectedTLEs;
4191
4192 /*
4193 * As in StartupXLOG(), try to ensure we have all the history files
4194 * between the old target and new target in pg_wal.
4195 */
4196 restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4197
4198 ereport(LOG,
4199 (errmsg("new target timeline is %u",
4201
4202 return true;
4203}
4204
4205
4206/*
4207 * Open a logfile segment for reading (during recovery).
4208 *
4209 * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4210 * Otherwise, it's assumed to be already available in pg_wal.
4211 */
4212static int
4214 XLogSource source, bool notfoundOk)
4215{
4216 char xlogfname[MAXFNAMELEN];
4217 char activitymsg[MAXFNAMELEN + 16];
4218 char path[MAXPGPATH];
4219 int fd;
4220
4221 XLogFileName(xlogfname, tli, segno, wal_segment_size);
4222
4223 switch (source)
4224 {
4225 case XLOG_FROM_ARCHIVE:
4226 /* Report recovery progress in PS display */
4227 snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4228 xlogfname);
4229 set_ps_display(activitymsg);
4230
4231 if (!RestoreArchivedFile(path, xlogfname,
4232 "RECOVERYXLOG",
4234 InRedo))
4235 return -1;
4236 break;
4237
4238 case XLOG_FROM_PG_WAL:
4239 case XLOG_FROM_STREAM:
4240 XLogFilePath(path, tli, segno, wal_segment_size);
4241 break;
4242
4243 default:
4244 elog(ERROR, "invalid XLogFileRead source %d", source);
4245 }
4246
4247 /*
4248 * If the segment was fetched from archival storage, replace the existing
4249 * xlog segment (if any) with the archival version.
4250 */
4252 {
4254 KeepFileRestoredFromArchive(path, xlogfname);
4255
4256 /*
4257 * Set path to point at the new file in pg_wal.
4258 */
4259 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4260 }
4261
4262 fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4263 if (fd >= 0)
4264 {
4265 /* Success! */
4266 curFileTLI = tli;
4267
4268 /* Report recovery progress in PS display */
4269 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4270 xlogfname);
4271 set_ps_display(activitymsg);
4272
4273 /* Track source of data in assorted state variables */
4276 /* In FROM_STREAM case, caller tracks receipt time, not me */
4277 if (source != XLOG_FROM_STREAM)
4279
4280 return fd;
4281 }
4282 if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4283 ereport(PANIC,
4285 errmsg("could not open file \"%s\": %m", path)));
4286 return -1;
4287}
4288
4289/*
4290 * Open a logfile segment for reading (during recovery).
4291 *
4292 * This version searches for the segment with any TLI listed in expectedTLEs.
4293 */
4294static int
4296{
4297 char path[MAXPGPATH];
4298 ListCell *cell;
4299 int fd;
4300 List *tles;
4301
4302 /*
4303 * Loop looking for a suitable timeline ID: we might need to read any of
4304 * the timelines listed in expectedTLEs.
4305 *
4306 * We expect curFileTLI on entry to be the TLI of the preceding file in
4307 * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4308 * to go backwards; this prevents us from picking up the wrong file when a
4309 * parent timeline extends to higher segment numbers than the child we
4310 * want to read.
4311 *
4312 * If we haven't read the timeline history file yet, read it now, so that
4313 * we know which TLIs to scan. We don't save the list in expectedTLEs,
4314 * however, unless we actually find a valid segment. That way if there is
4315 * neither a timeline history file nor a WAL segment in the archive, and
4316 * streaming replication is set up, we'll read the timeline history file
4317 * streamed from the primary when we start streaming, instead of
4318 * recovering with a dummy history generated here.
4319 */
4320 if (expectedTLEs)
4321 tles = expectedTLEs;
4322 else
4324
4325 foreach(cell, tles)
4326 {
4328 TimeLineID tli = hent->tli;
4329
4330 if (tli < curFileTLI)
4331 break; /* don't bother looking at too-old TLIs */
4332
4333 /*
4334 * Skip scanning the timeline ID that the logfile segment to read
4335 * doesn't belong to
4336 */
4337 if (hent->begin != InvalidXLogRecPtr)
4338 {
4339 XLogSegNo beginseg = 0;
4340
4341 XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4342
4343 /*
4344 * The logfile segment that doesn't belong to the timeline is
4345 * older or newer than the segment that the timeline started or
4346 * ended at, respectively. It's sufficient to check only the
4347 * starting segment of the timeline here. Since the timelines are
4348 * scanned in descending order in this loop, any segments newer
4349 * than the ending segment should belong to newer timeline and
4350 * have already been read before. So it's not necessary to check
4351 * the ending segment of the timeline here.
4352 */
4353 if (segno < beginseg)
4354 continue;
4355 }
4356
4358 {
4359 fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
4360 if (fd != -1)
4361 {
4362 elog(DEBUG1, "got WAL segment from archive");
4363 if (!expectedTLEs)
4364 expectedTLEs = tles;
4365 return fd;
4366 }
4367 }
4368
4370 {
4371 fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
4372 if (fd != -1)
4373 {
4374 if (!expectedTLEs)
4375 expectedTLEs = tles;
4376 return fd;
4377 }
4378 }
4379 }
4380
4381 /* Couldn't find it. For simplicity, complain about front timeline */
4383 errno = ENOENT;
4386 errmsg("could not open file \"%s\": %m", path)));
4387 return -1;
4388}
4389
4390/*
4391 * Set flag to signal the walreceiver to restart. (The startup process calls
4392 * this on noticing a relevant configuration change.)
4393 */
4394void
4396{
4398 {
4399 ereport(LOG,
4400 (errmsg("WAL receiver process shutdown requested")));
4401
4402 pendingWalRcvRestart = true;
4403 }
4404}
4405
4406
4407/*
4408 * Has a standby promotion already been triggered?
4409 *
4410 * Unlike CheckForStandbyTrigger(), this works in any process
4411 * that's connected to shared memory.
4412 */
4413bool
4415{
4416 /*
4417 * We check shared state each time only until a standby promotion is
4418 * triggered. We can't trigger a promotion again, so there's no need to
4419 * keep checking after the shared variable has once been seen true.
4420 */
4422 return true;
4423
4427
4429}
4430
4431static void
4433{
4437
4438 /*
4439 * Mark the recovery pause state as 'not paused' because the paused state
4440 * ends and promotion continues if a promotion is triggered while recovery
4441 * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4442 * return 'paused' while a promotion is ongoing.
4443 */
4444 SetRecoveryPause(false);
4445
4447}
4448
4449/*
4450 * Check whether a promote request has arrived.
4451 */
4452static bool
4454{
4456 return true;
4457
4459 {
4460 ereport(LOG, (errmsg("received promote request")));
4464 return true;
4465 }
4466
4467 return false;
4468}
4469
4470/*
4471 * Remove the file signaling a standby promotion request.
4472 */
4473void
4475{
4476 unlink(PROMOTE_SIGNAL_FILE);
4477}
4478
4479/*
4480 * Check to see if a promote request has arrived.
4481 */
4482bool
4484{
4485 struct stat stat_buf;
4486
4487 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4488 return true;
4489
4490 return false;
4491}
4492
4493/*
4494 * Wake up startup process to replay newly arrived WAL, or to notice that
4495 * failover has been requested.
4496 */
4497void
4499{
4501}
4502
4503/*
4504 * Schedule a walreceiver wakeup in the main recovery loop.
4505 */
4506void
4508{
4510}
4511
4512/*
4513 * Is HotStandby active yet? This is only important in special backends
4514 * since normal backends won't ever be able to connect until this returns
4515 * true. Postmaster knows this by way of signal, not via shared memory.
4516 *
4517 * Unlike testing standbyState, this works in any process that's connected to
4518 * shared memory. (And note that standbyState alone doesn't tell the truth
4519 * anyway.)
4520 */
4521bool
4523{
4524 /*
4525 * We check shared state each time only until Hot Standby is active. We
4526 * can't de-activate Hot Standby, so there's no need to keep checking
4527 * after the shared variable has once been seen true.
4528 */
4530 return true;
4531 else
4532 {
4533 /* spinlock is essential on machines with weak memory ordering! */
4537
4538 return LocalHotStandbyActive;
4539 }
4540}
4541
4542/*
4543 * Like HotStandbyActive(), but to be used only in WAL replay code,
4544 * where we don't need to ask any other process what the state is.
4545 */
4546static bool
4548{
4550 return LocalHotStandbyActive;
4551}
4552
4553/*
4554 * Get latest redo apply position.
4555 *
4556 * Exported to allow WALReceiver to read the pointer directly.
4557 */
4560{
4561 XLogRecPtr recptr;
4562 TimeLineID tli;
4563
4568
4569 if (replayTLI)
4570 *replayTLI = tli;
4571 return recptr;
4572}
4573
4574
4575/*
4576 * Get position of last applied, or the record being applied.
4577 *
4578 * This is different from GetXLogReplayRecPtr() in that if a WAL
4579 * record is currently being applied, this includes that record.
4580 */
4583{
4584 XLogRecPtr recptr;
4585 TimeLineID tli;
4586
4591
4592 if (replayEndTLI)
4593 *replayEndTLI = tli;
4594 return recptr;
4595}
4596
4597/*
4598 * Save timestamp of latest processed commit/abort record.
4599 *
4600 * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4601 * seen by processes other than the startup process. Note in particular
4602 * that CreateRestartPoint is executed in the checkpointer.
4603 */
4604static void
4606{
4610}
4611
4612/*
4613 * Fetch timestamp of latest processed commit/abort record.
4614 */
4617{
4618 TimestampTz xtime;
4619
4623
4624 return xtime;
4625}
4626
4627/*
4628 * Save timestamp of the next chunk of WAL records to apply.
4629 *
4630 * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4631 * seen by all backends.
4632 */
4633static void
4635{
4639}
4640
4641/*
4642 * Fetch timestamp of the current chunk of WAL records being applied.
4643 * Startup process maintains an accurate local copy in XLogReceiptTime
4644 */
4647{
4648 TimestampTz xtime;
4649
4653
4654 return xtime;
4655}
4656
4657/*
4658 * Returns time of receipt of current chunk of XLOG data, as well as
4659 * whether it was received from streaming replication or from archives.
4660 */
4661void
4662GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4663{
4664 /*
4665 * This must be executed in the startup process, since we don't export the
4666 * relevant state to shared memory.
4667 */
4669
4670 *rtime = XLogReceiptTime; /* out: receipt time of the current chunk */
4671 *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM); /* out: true only if the source was streaming */
4672}
4673
4674/*
4675 * Note that the text field supplied (param_name) is a parameter name and does
4676 * not require translation.
4677 */
4678void
4679RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4680{
4681 if (currValue < minValue)
4682 {
4684 {
4685 bool warned_for_promote = false;
4686
4688 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4689 errmsg("hot standby is not possible because of insufficient parameter settings"),
4690 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4691 param_name,
4692 currValue,
4693 minValue)));
4694
4695 SetRecoveryPause(true);
4696
4697 ereport(LOG,
4698 (errmsg("recovery has paused"),
4699 errdetail("If recovery is unpaused, the server will shut down."),
4700 errhint("You can then restart the server after making the necessary configuration changes.")));
4701
4703 {
4705
4707 {
4708 if (!warned_for_promote)
4710 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4711 errmsg("promotion is not possible because of insufficient parameter settings"),
4712
4713 /*
4714 * Repeat the detail from above so it's easy to find
4715 * in the log.
4716 */
4717 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4718 param_name,
4719 currValue,
4720 minValue),
4721 errhint("Restart the server after making the necessary configuration changes.")));
4722 warned_for_promote = true;
4723 }
4724
4725 /*
4726 * If recovery pause is requested then set it paused. While
4727 * we are in the loop, user might resume and pause again so
4728 * set this every time.
4729 */
4731
4732 /*
4733 * We wait on a condition variable that will wake us as soon
4734 * as the pause ends, but we use a timeout so we can check the
4735 * above conditions periodically too.
4736 */
4738 WAIT_EVENT_RECOVERY_PAUSE);
4739 }
4741 }
4742
4743 ereport(FATAL,
4744 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4745 errmsg("recovery aborted because of insufficient parameter settings"),
4746 /* Repeat the detail from above so it's easy to find in the log. */
4747 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4748 param_name,
4749 currValue,
4750 minValue),
4751 errhint("You can restart the server after making the necessary configuration changes.")));
4752 }
4753}
4754
4755
4756/*
4757 * GUC check_hook for primary_slot_name
4758 */
4759bool
4761{
4762 if (*newval && strcmp(*newval, "") != 0 &&
4764 return false;
4765
4766 return true;
4767}
4768
/*
 * Recovery target settings: Only one of the several recovery_target* settings
 * may be set.  Setting a second one results in an error.  The global variable
 * recoveryTarget tracks which kind of recovery target was chosen.  Other
 * variables store the actual target value (for example a string or an xid).
 * The assign functions of the parameters check whether a competing parameter
 * was already set.  But we want to allow setting the same parameter multiple
 * times.  We also want to allow unsetting a parameter and setting a different
 * one, so we unset recoveryTarget when the parameter is set to an empty
 * string.
 *
 * XXX this code is broken by design.  Throwing an error from a GUC assign
 * hook breaks fundamental assumptions of guc.c.  So long as all the variables
 * for which this can happen are PGC_POSTMASTER, the consequences are limited,
 * since we'd just abort postmaster startup anyway.  Nonetheless it's likely
 * that we have odd behaviors such as unexpected GUC ordering dependencies.
 */
4786
4787pg_noreturn static void
4789{
4790 ereport(ERROR,
4791 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4792 errmsg("multiple recovery targets specified"),
4793 errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4794}
4795
4796/*
4797 * GUC check_hook for recovery_target
4798 */
4799bool
4801{
4802 if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4803 {
4804 GUC_check_errdetail("The only allowed value is \"immediate\".");
4805 return false;
4806 }
4807 return true;
4808}
4809
4810/*
4811 * GUC assign_hook for recovery_target
4812 */
4813void
4814assign_recovery_target(const char *newval, void *extra)
4815{
4819
4820 if (newval && strcmp(newval, "") != 0)
4822 else
4824}
4825
4826/*
4827 * GUC check_hook for recovery_target_lsn
4828 */
4829bool
4831{
4832 if (strcmp(*newval, "") != 0)
4833 {
4834 XLogRecPtr lsn;
4835 XLogRecPtr *myextra;
4836 bool have_error = false;
4837
4838 lsn = pg_lsn_in_internal(*newval, &have_error);
4839 if (have_error)
4840 return false;
4841
4842 myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr));
4843 if (!myextra)
4844 return false;
4845 *myextra = lsn;
4846 *extra = myextra;
4847 }
4848 return true;
4849}
4850
4851/*
4852 * GUC assign_hook for recovery_target_lsn
4853 */
4854void
4855assign_recovery_target_lsn(const char *newval, void *extra)
4856{
4860
4861 if (newval && strcmp(newval, "") != 0)
4862 {
4864 recoveryTargetLSN = *((XLogRecPtr *) extra);
4865 }
4866 else
4868}
4869
4870/*
4871 * GUC check_hook for recovery_target_name
4872 */
4873bool
4875{
4876 /* Use the value of newval directly */
4877 if (strlen(*newval) >= MAXFNAMELEN)
4878 {
4879 GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4880 "recovery_target_name", MAXFNAMELEN - 1);
4881 return false;
4882 }
4883 return true;
4884}
4885
4886/*
4887 * GUC assign_hook for recovery_target_name
4888 */
4889void
4890assign_recovery_target_name(const char *newval, void *extra)
4891{
4895
4896 if (newval && strcmp(newval, "") != 0)
4897 {
4900 }
4901 else
4903}
4904
4905/*
4906 * GUC check_hook for recovery_target_time
4907 *
4908 * The interpretation of the recovery_target_time string can depend on the
4909 * time zone setting, so we need to wait until after all GUC processing is
4910 * done before we can do the final parsing of the string. This check function
4911 * only does a parsing pass to catch syntax errors, but we store the string
4912 * and parse it again when we need to use it.
4913 */
4914bool
4916{
4917 if (strcmp(*newval, "") != 0)
4918 {
4919 /* reject some special values */
4920 if (strcmp(*newval, "now") == 0 ||
4921 strcmp(*newval, "today") == 0 ||
4922 strcmp(*newval, "tomorrow") == 0 ||
4923 strcmp(*newval, "yesterday") == 0)
4924 {
4925 return false;
4926 }
4927
4928 /*
4929 * parse timestamp value (see also timestamptz_in())
4930 */
4931 {
4932 char *str = *newval;
4933 fsec_t fsec;
4934 struct pg_tm tt,
4935 *tm = &tt;
4936 int tz;
4937 int dtype;
4938 int nf;
4939 int dterr;
4940 char *field[MAXDATEFIELDS];
4941 int ftype[MAXDATEFIELDS];
4942 char workbuf[MAXDATELEN + MAXDATEFIELDS];
4943 DateTimeErrorExtra dtextra;
4945
4946 dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4947 field, ftype, MAXDATEFIELDS, &nf);
4948 if (dterr == 0)
4949 dterr = DecodeDateTime(field, ftype, nf,
4950 &dtype, tm, &fsec, &tz, &dtextra);
4951 if (dterr != 0)
4952 return false;
4953 if (dtype != DTK_DATE)
4954 return false;
4955
4956 if (tm2timestamp(tm, fsec, &tz, &timestamp) != 0)
4957 {
4958 GUC_check_errdetail("Timestamp out of range: \"%s\".", str);
4959 return false;
4960 }
4961 }
4962 }
4963 return true;
4964}
4965
4966/*
4967 * GUC assign_hook for recovery_target_time
4968 */
4969void
4970assign_recovery_target_time(const char *newval, void *extra)
4971{
4975
4976 if (newval && strcmp(newval, "") != 0)
4978 else
4980}
4981
4982/*
4983 * GUC check_hook for recovery_target_timeline
4984 */
4985bool
4987{
4990
4991 if (strcmp(*newval, "current") == 0)
4993 else if (strcmp(*newval, "latest") == 0)
4995 else
4996 {
4998
4999 errno = 0;
5000 strtoul(*newval, NULL, 0);
5001 if (errno == EINVAL || errno == ERANGE)
5002 {
5003 GUC_check_errdetail("\"recovery_target_timeline\" is not a valid number.");
5004 return false;
5005 }
5006 }
5007
5009 if (!myextra)
5010 return false;
5011 *myextra = rttg;
5012 *extra = myextra;
5013
5014 return true;
5015}
5016
5017/*
5018 * GUC assign_hook for recovery_target_timeline
5019 */
5020void
5022{
5025 recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
5026 else
5028}
5029
5030/*
5031 * GUC check_hook for recovery_target_xid
5032 */
5033bool
5035{
5036 if (strcmp(*newval, "") != 0)
5037 {
5038 TransactionId xid;
5039 TransactionId *myextra;
5040
5041 errno = 0;
5042 xid = (TransactionId) strtou64(*newval, NULL, 0);
5043 if (errno == EINVAL || errno == ERANGE)
5044 return false;
5045
5046 myextra = (TransactionId *) guc_malloc(LOG, sizeof(TransactionId));
5047 if (!myextra)
5048 return false;
5049 *myextra = xid;
5050 *extra = myextra;
5051 }
5052 return true;
5053}
5054
5055/*
5056 * GUC assign_hook for recovery_target_xid
5057 */
5058void
5059assign_recovery_target_xid(const char *newval, void *extra)
5060{
5064
5065 if (newval && strcmp(newval, "") != 0)
5066 {
5068 recoveryTargetXid = *((TransactionId *) extra);
5069 }
5070 else
5072}
List * readTimeLineHistory(TimeLineID targetTLI)
Definition: timeline.c:76
TimeLineID findNewestTimeLine(TimeLineID startTLI)
Definition: timeline.c:264
TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history)
Definition: timeline.c:544
XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI)
Definition: timeline.c:572
bool existsTimeLineHistory(TimeLineID probeTLI)
Definition: timeline.c:222
void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end)
Definition: timeline.c:50
bool tliInHistory(TimeLineID tli, List *expectedTLEs)
Definition: timeline.c:526
void remove_tablespace_symlink(const char *linkloc)
Definition: tablespace.c:883
bool allow_in_place_tablespaces
Definition: tablespace.c:85
void disable_startup_progress_timeout(void)
Definition: startup.c:313
bool IsPromoteSignaled(void)
Definition: startup.c:292
void begin_startup_progress_phase(void)
Definition: startup.c:347
void ProcessStartupProcInterrupts(void)
Definition: startup.c:154
void ResetPromoteSignaled(void)
Definition: startup.c:298
int ParseDateTime(const char *timestr, char *workbuf, size_t buflen, char **field, int *ftype, int maxfields, int *numfields)
Definition: datetime.c:764
int DecodeDateTime(char **field, int *ftype, int nf, int *dtype, struct pg_tm *tm, fsec_t *fsec, int *tzp, DateTimeErrorExtra *extra)
Definition: datetime.c:988
long TimestampDifferenceMilliseconds(TimestampTz start_time, TimestampTz stop_time)
Definition: timestamp.c:1757
int tm2timestamp(struct pg_tm *tm, fsec_t fsec, int *tzp, Timestamp *result)
Definition: timestamp.c:2006
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1781
Datum timestamptz_in(PG_FUNCTION_ARGS)
Definition: timestamp.c:418
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1645
const char * timestamptz_to_str(TimestampTz t)
Definition: timestamp.c:1862
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1609
uint32 BlockNumber
Definition: block.h:31
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5320
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5537
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:414
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:198
@ RBM_NORMAL_NO_LOG
Definition: bufmgr.h:52
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:365
PageData * Page
Definition: bufpage.h:82
static XLogRecPtr PageGetLSN(const PageData *page)
Definition: bufpage.h:386
uint8_t uint8
Definition: c.h:500
#define pg_noreturn
Definition: c.h:165
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:224
#define PG_BINARY
Definition: c.h:1244
#define UINT64_FORMAT
Definition: c.h:521
int32_t int32
Definition: c.h:498
uint64_t uint64
Definition: c.h:503
uint32_t uint32
Definition: c.h:502
uint32 TransactionId
Definition: c.h:623
size_t Size
Definition: c.h:576
void RequestCheckpoint(int flags)
bool ConditionVariableCancelSleep(void)
bool ConditionVariableTimedSleep(ConditionVariable *cv, long timeout, uint32 wait_event_info)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
int64 TimestampTz
Definition: timestamp.h:39
int32 fsec_t
Definition: timestamp.h:41
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1158
int errcode_for_file_access(void)
Definition: elog.c:877
int errdetail(const char *fmt,...)
Definition: elog.c:1204
ErrorContextCallback * error_context_stack
Definition: elog.c:95
int errhint(const char *fmt,...)
Definition: elog.c:1318
int errcode(int sqlerrcode)
Definition: elog.c:854
int errmsg(const char *fmt,...)
Definition: elog.c:1071
#define LOG
Definition: elog.h:31
#define errcontext
Definition: elog.h:197
#define FATAL
Definition: elog.h:41
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:149
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1111
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:782
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1089
int FreeFile(FILE *file)
Definition: fd.c:2843
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2907
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2973
int pg_fsync(int fd)
Definition: fd.c:386
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2644
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:547
@ PGFILETYPE_LNK
Definition: file_utils.h:24
#define DirectFunctionCall3(func, arg1, arg2, arg3)
Definition: fmgr.h:686
bool IsUnderPostmaster
Definition: globals.c:121
char * DataDir
Definition: globals.c:72
bool IsPostmasterEnvironment
Definition: globals.c:120
void * guc_malloc(int elevel, size_t size)
Definition: guc.c:638
#define newval
#define GUC_check_errdetail
Definition: guc.h:481
GucSource
Definition: guc.h:112
Assert(PointerIsAligned(start, uint64))
const char * str
#define MAXDATEFIELDS
Definition: datetime.h:202
#define DTK_DATE
Definition: datetime.h:144
#define MAXDATELEN
Definition: datetime.h:200
#define close(a)
Definition: win32.h:12
void proc_exit(int code)
Definition: ipc.c:104
int i
Definition: isn.c:77
void OwnLatch(Latch *latch)
Definition: latch.c:126
void DisownLatch(Latch *latch)
Definition: latch.c:144
void InitSharedLatch(Latch *latch)
Definition: latch.c:93
void SetLatch(Latch *latch)
Definition: latch.c:288
void ResetLatch(Latch *latch)
Definition: latch.c:372
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:172
List * lappend(List *list, void *datum)
Definition: list.c:339
void list_free_deep(List *list)
Definition: list.c:1560
static struct pg_tm tm
Definition: localtime.c:104
char * pstrdup(const char *in)
Definition: mcxt.c:2322
void pfree(void *pointer)
Definition: mcxt.c:2147
void * palloc0(Size size)
Definition: mcxt.c:1970
void * palloc(Size size)
Definition: mcxt.c:1940
#define AmStartupProcess()
Definition: miscadmin.h:390
#define IsBootstrapProcessingMode()
Definition: miscadmin.h:477
void * arg
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
#define MAXPGPATH
#define XLOG_RESTORE_POINT
Definition: pg_control.h:75
#define XLOG_CHECKPOINT_REDO
Definition: pg_control.h:82
#define XLOG_OVERWRITE_CONTRECORD
Definition: pg_control.h:81
DBState
Definition: pg_control.h:90
@ DB_IN_ARCHIVE_RECOVERY
Definition: pg_control.h:96
@ DB_SHUTDOWNED_IN_RECOVERY
Definition: pg_control.h:93
@ DB_SHUTDOWNED
Definition: pg_control.h:92
@ DB_IN_CRASH_RECOVERY
Definition: pg_control.h:95
#define XLOG_CHECKPOINT_SHUTDOWN
Definition: pg_control.h:68
#define XLOG_BACKUP_END
Definition: pg_control.h:73
#define XLOG_CHECKPOINT_ONLINE
Definition: pg_control.h:69
#define XLOG_END_OF_RECOVERY
Definition: pg_control.h:77
const void size_t len
#define lfirst(lc)
Definition: pg_list.h:172
#define NIL
Definition: pg_list.h:68
XLogRecPtr pg_lsn_in_internal(const char *str, bool *have_error)
Definition: pg_lsn.c:29
static rewind_source * source
Definition: pg_rewind.c:89
const char * pg_rusage_show(const PGRUsage *ru0)
Definition: pg_rusage.c:40
void pg_rusage_init(PGRUsage *ru0)
Definition: pg_rusage.c:27
static char * buf
Definition: pg_test_fsync.c:72
@ IOOBJECT_WAL
Definition: pgstat.h:276
@ IOCONTEXT_NORMAL
Definition: pgstat.h:286
@ IOOP_READ
Definition: pgstat.h:312
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition: pgstat_io.c:90
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:121
int64 timestamp
void SendPostmasterSignal(PMSignalReason reason)
Definition: pmsignal.c:165
@ PMSIGNAL_RECOVERY_STARTED
Definition: pmsignal.h:35
@ PMSIGNAL_BEGIN_HOT_STANDBY
Definition: pmsignal.h:37
@ PMSIGNAL_RECOVERY_CONSISTENT
Definition: pmsignal.h:36
#define pg_pread
Definition: port.h:226
#define snprintf
Definition: port.h:239
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:257
static Datum CStringGetDatum(const char *X)
Definition: postgres.h:355
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:217
#define InvalidOid
Definition: postgres_ext.h:35
static int fd(const char *x, int i)
Definition: preproc-init.c:105
void RecordKnownAssignedTransactionIds(TransactionId xid)
Definition: procarray.c:4403
void KnownAssignedTransactionIdsIdleMaintenance(void)
Definition: procarray.c:4564
static void set_ps_display(const char *activity)
Definition: ps_status.h:40
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
ForkNumber
Definition: relpath.h:56
@ MAIN_FORKNUM
Definition: relpath.h:58
#define PG_TBLSPC_DIR
Definition: relpath.h:41
void RmgrStartup(void)
Definition: rmgr.c:58
void RmgrCleanup(void)
Definition: rmgr.c:74
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:387
bool ReplicationSlotValidateName(const char *name, int elevel)
Definition: slot.c:267
void ShutDownSlotSync(void)
Definition: slotsync.c:1565
#define SpinLockInit(lock)
Definition: spin.h:57
#define SpinLockRelease(lock)
Definition: spin.h:61
#define SpinLockAcquire(lock)
Definition: spin.h:59
#define ereport_startup_progress(msg,...)
Definition: startup.h:18
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition: stringinfo.c:145
void appendStringInfoString(StringInfo str, const char *s)
Definition: stringinfo.c:230
void appendStringInfoChar(StringInfo str, char ch)
Definition: stringinfo.c:242
void initStringInfo(StringInfo str)
Definition: stringinfo.c:97
Oid oldestMultiDB
Definition: pg_control.h:51
MultiXactId oldestMulti
Definition: pg_control.h:50
MultiXactOffset nextMultiOffset
Definition: pg_control.h:47
TransactionId newestCommitTsXid
Definition: pg_control.h:55
TransactionId oldestXid
Definition: pg_control.h:48
TimeLineID PrevTimeLineID
Definition: pg_control.h:40
TimeLineID ThisTimeLineID
Definition: pg_control.h:39
Oid nextOid
Definition: pg_control.h:45
MultiXactId nextMulti
Definition: pg_control.h:46
FullTransactionId nextXid
Definition: pg_control.h:44
TransactionId oldestCommitTsXid
Definition: pg_control.h:53
XLogRecPtr redo
Definition: pg_control.h:37
Oid oldestXidDB
Definition: pg_control.h:49
XLogRecPtr backupStartPoint
Definition: pg_control.h:170
bool backupEndRequired
Definition: pg_control.h:172
CheckPoint checkPointCopy
Definition: pg_control.h:135
XLogRecPtr backupEndPoint
Definition: pg_control.h:171
XLogRecPtr minRecoveryPoint
Definition: pg_control.h:168
XLogRecPtr checkPoint
Definition: pg_control.h:133
uint64 system_identifier
Definition: pg_control.h:110
TimeLineID minRecoveryPointTLI
Definition: pg_control.h:169
Definition: dirent.c:26
XLogRecPtr lastPageBeginPtr
Definition: xlogrecovery.h:111
XLogRecPtr abortedRecPtr
Definition: xlogrecovery.h:120
XLogRecPtr missingContrecPtr
Definition: xlogrecovery.h:121
TimeLineID endOfLogTLI
Definition: xlogrecovery.h:109
struct ErrorContextCallback * previous
Definition: elog.h:297
void(* callback)(void *arg)
Definition: elog.h:298
Definition: latch.h:114
Definition: pg_list.h:54
RelFileNumber relNumber
const char *(* rm_identify)(uint8 info)
void(* rm_mask)(char *pagedata, BlockNumber blkno)
void(* rm_redo)(XLogReaderState *record)
const char * rm_name
void(* rm_desc)(StringInfo buf, XLogReaderState *record)
XLogRecPtr begin
Definition: timeline.h:28
TimeLineID tli
Definition: timeline.h:27
XLogRecPtr end
Definition: timeline.h:29
TimeLineID ws_tli
Definition: xlogreader.h:49
TimeLineID replayTLI
Definition: xlogrecovery.c:200
XLogRecPtr missingContrecPtr
Definition: xlogreader.h:215
char * errormsg_buf
Definition: xlogreader.h:311
XLogRecPtr EndRecPtr
Definition: xlogreader.h:207
uint64 system_identifier
Definition: xlogreader.h:191
XLogRecPtr ReadRecPtr
Definition: xlogreader.h:206
XLogRecPtr abortedRecPtr
Definition: xlogreader.h:214
TimeLineID latestPageTLI
Definition: xlogreader.h:280
XLogRecPtr overwrittenRecPtr
Definition: xlogreader.h:217
XLogRecPtr latestPagePtr
Definition: xlogreader.h:279
WALOpenSegment seg
Definition: xlogreader.h:272
void * private_data
Definition: xlogreader.h:196
uint8 xl_info
Definition: xlogrecord.h:46
uint32 xl_tot_len
Definition: xlogrecord.h:43
TransactionId xl_xid
Definition: xlogrecord.h:44
RmgrId xl_rmid
Definition: xlogrecord.h:47
ConditionVariable recoveryNotPausedCV
Definition: xlogrecovery.c:365
XLogRecPtr lastReplayedEndRecPtr
Definition: xlogrecovery.c:345
TimeLineID replayEndTLI
Definition: xlogrecovery.c:354
TimeLineID lastReplayedTLI
Definition: xlogrecovery.c:346
TimestampTz currentChunkStartTime
Definition: xlogrecovery.c:362
XLogRecPtr replayEndRecPtr
Definition: xlogrecovery.c:353
TimestampTz recoveryLastXTime
Definition: xlogrecovery.c:356
RecoveryPauseState recoveryPauseState
Definition: xlogrecovery.c:364
XLogRecPtr lastReplayedReadRecPtr
Definition: xlogrecovery.c:344
Definition: guc.h:174
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
Definition: pgtime.h:35
Definition: regguts.h:323
TimeLineID PrevTimeLineID
TimeLineID ThisTimeLineID
char rp_name[MAXFNAMELEN]
TransactionId twophase_xid
Definition: xact.h:427
TransactionId twophase_xid
Definition: xact.h:397
#define InvalidTransactionId
Definition: transam.h:31
#define U64FromFullTransactionId(x)
Definition: transam.h:49
#define XidFromFullTransactionId(x)
Definition: transam.h:48
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define TransactionIdIsNormal(xid)
Definition: transam.h:42
#define TimestampTzPlusMilliseconds(tz, ms)
Definition: timestamp.h:85
static TimestampTz DatumGetTimestampTz(Datum X)
Definition: timestamp.h:34
void AdvanceNextFullTransactionIdPastXid(TransactionId xid)
Definition: varsup.c:304
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:85
static void pgstat_report_wait_end(void)
Definition: wait_event.h:101
#define WL_TIMEOUT
Definition: waiteventset.h:37
#define WL_EXIT_ON_PM_DEATH
Definition: waiteventset.h:39
#define WL_LATCH_SET
Definition: waiteventset.h:34
void WalRcvForceReply(void)
Definition: walreceiver.c:1350
#define AllowCascadeReplication()
Definition: walreceiver.h:40
XLogRecPtr GetWalRcvFlushRecPtr(XLogRecPtr *latestChunkStart, TimeLineID *receiveTLI)
bool WalRcvStreaming(void)
void RequestXLogStreaming(TimeLineID tli, XLogRecPtr recptr, const char *conninfo, const char *slotname, bool create_temp_slot)
bool WalRcvRunning(void)
void WalSndWakeup(bool physical, bool logical)
Definition: walsender.c:3671
#define stat
Definition: win32_port.h:274
#define S_IRUSR
Definition: win32_port.h:279
#define symlink(oldpath, newpath)
Definition: win32_port.h:225
#define S_IWUSR
Definition: win32_port.h:282
#define XLOG_XACT_COMMIT_PREPARED
Definition: xact.h:172
#define XLOG_XACT_COMMIT
Definition: xact.h:169
#define XLOG_XACT_OPMASK
Definition: xact.h:179
#define XLOG_XACT_ABORT
Definition: xact.h:171
#define XLOG_XACT_ABORT_PREPARED
Definition: xact.h:173
void ParseCommitRecord(uint8 info, xl_xact_commit *xlrec, xl_xact_parsed_commit *parsed)
Definition: xactdesc.c:35
void ParseAbortRecord(uint8 info, xl_xact_abort *xlrec, xl_xact_parsed_abort *parsed)
Definition: xactdesc.c:141
int wal_decode_buffer_size
Definition: xlog.c:136
bool EnableHotStandby
Definition: xlog.c:121
XLogRecPtr GetRedoRecPtr(void)
Definition: xlog.c:6625
void SetInstallXLogFileSegmentActive(void)
Definition: xlog.c:9661
bool IsInstallXLogFileSegmentActive(void)
Definition: xlog.c:9669
int wal_segment_size
Definition: xlog.c:143
void SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI)
Definition: xlog.c:6397
void RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
Definition: xlog.c:4079
void ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
Definition: xlog.c:6435
int wal_retrieve_retry_interval
Definition: xlog.c:134
bool track_wal_io_timing
Definition: xlog.c:137
static ControlFileData * ControlFile
Definition: xlog.c:585
void XLogShutdownWalRcv(void)
Definition: xlog.c:9650
bool XLogCheckpointNeeded(XLogSegNo new_segno)
Definition: xlog.c:2423
#define TABLESPACE_MAP_OLD
Definition: xlog.h:307
#define TABLESPACE_MAP
Definition: xlog.h:306
#define STANDBY_SIGNAL_FILE
Definition: xlog.h:302
#define CHECKPOINT_CAUSE_XLOG
Definition: xlog.h:148
#define PROMOTE_SIGNAL_FILE
Definition: xlog.h:310
#define BACKUP_LABEL_FILE
Definition: xlog.h:303
#define RECOVERY_SIGNAL_FILE
Definition: xlog.h:301
static RmgrData GetRmgr(RmgrId rmid)
@ RECOVERY_TARGET_ACTION_PAUSE
@ RECOVERY_TARGET_ACTION_PROMOTE
@ RECOVERY_TARGET_ACTION_SHUTDOWN
#define XLogSegmentOffset(xlogptr, wal_segsz_bytes)
#define MAXFNAMELEN
#define XLOGDIR
#define XLByteToSeg(xlrp, logSegNo, wal_segsz_bytes)
static void XLogFilePath(char *path, TimeLineID tli, XLogSegNo logSegNo, int wal_segsz_bytes)
#define XRecOffIsValid(xlrp)
static void XLogFileName(char *fname, TimeLineID tli, XLogSegNo logSegNo, int wal_segsz_bytes)
#define XLByteInSeg(xlrp, logSegNo, wal_segsz_bytes)
bool RestoreArchivedFile(char *path, const char *xlogfname, const char *recovername, off_t expectedSize, bool cleanupEnabled)
Definition: xlogarchive.c:54
void KeepFileRestoredFromArchive(const char *path, const char *xlogfname)
Definition: xlogarchive.c:358
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:43
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
uint32 TimeLineID
Definition: xlogdefs.h:59
uint64 XLogSegNo
Definition: xlogdefs.h:48
void XLogPrefetcherComputeStats(XLogPrefetcher *prefetcher)
XLogPrefetcher * XLogPrefetcherAllocate(XLogReaderState *reader)
void XLogPrefetchReconfigure(void)
XLogRecord * XLogPrefetcherReadRecord(XLogPrefetcher *prefetcher, char **errmsg)
XLogReaderState * XLogPrefetcherGetReader(XLogPrefetcher *prefetcher)
void XLogPrefetcherBeginRead(XLogPrefetcher *prefetcher, XLogRecPtr recPtr)
void XLogPrefetcherFree(XLogPrefetcher *prefetcher)
bool XLogRecGetBlockTagExtended(XLogReaderState *record, uint8 block_id, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum, Buffer *prefetch_buffer)
Definition: xlogreader.c:2007
XLogReaderState * XLogReaderAllocate(int wal_segment_size, const char *waldir, XLogReaderRoutine *routine, void *private_data)
Definition: xlogreader.c:107
void XLogReaderSetDecodeBuffer(XLogReaderState *state, void *buffer, size_t size)
Definition: xlogreader.c:91
void XLogReaderResetError(XLogReaderState *state)
Definition: xlogreader.c:1366
bool XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, char *phdr)
Definition: xlogreader.c:1225
void XLogReaderFree(XLogReaderState *state)
Definition: xlogreader.c:162
bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
Definition: xlogreader.c:2066
#define XLogRecGetDataLen(decoder)
Definition: xlogreader.h:416
#define XLogRecGetInfo(decoder)
Definition: xlogreader.h:410
#define XLogRecBlockImageApply(decoder, block_id)
Definition: xlogreader.h:425
#define XLogRecGetRmid(decoder)
Definition: xlogreader.h:411
#define XLogRecGetData(decoder)
Definition: xlogreader.h:415
#define XLogRecGetXid(decoder)
Definition: xlogreader.h:412
#define XL_ROUTINE(...)
Definition: xlogreader.h:117
#define XLogRecMaxBlockId(decoder)
Definition: xlogreader.h:418
XLogPageReadResult
Definition: xlogreader.h:350
@ XLREAD_WOULDBLOCK
Definition: xlogreader.h:353
@ XLREAD_SUCCESS
Definition: xlogreader.h:351
@ XLREAD_FAIL
Definition: xlogreader.h:352
#define XLogRecHasBlockImage(decoder, block_id)
Definition: xlogreader.h:423
#define XLogRecGetPrev(decoder)
Definition: xlogreader.h:409
#define XLogRecHasAnyBlockRefs(decoder)
Definition: xlogreader.h:417
#define SizeOfXLogRecordDataHeaderShort
Definition: xlogrecord.h:217
#define XLR_INFO_MASK
Definition: xlogrecord.h:62
#define SizeOfXLogRecord
Definition: xlogrecord.h:55
#define XLR_CHECK_CONSISTENCY
Definition: xlogrecord.h:91
bool reachedConsistency
Definition: xlogrecovery.c:300
bool check_primary_slot_name(char **newval, void **extra, GucSource source)
static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
static XLogRecPtr recoveryStopLSN
Definition: xlogrecovery.c:387
static bool recoveryStopsBefore(XLogReaderState *record)
static TimestampTz recoveryStopTime
Definition: xlogrecovery.c:386
void assign_recovery_target_xid(const char *newval, void *extra)
static bool CheckForStandbyTrigger(void)
int recovery_min_apply_delay
Definition: xlogrecovery.c:94
bool check_recovery_target(char **newval, void **extra, GucSource source)
static bool backupEndRequired
Definition: xlogrecovery.c:284
bool HotStandbyActive(void)
static char * getRecoveryStopReason(void)
void ShutdownWalRecovery(void)
RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal
Definition: xlogrecovery.c:121
int recoveryTargetAction
Definition: xlogrecovery.c:88
static void rm_redo_error_callback(void *arg)
static bool recoveryApplyDelay(XLogReaderState *record)
bool ArchiveRecoveryRequested
Definition: xlogrecovery.c:138
const char * recoveryTargetName
Definition: xlogrecovery.c:92
static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
bool check_recovery_target_timeline(char **newval, void **extra, GucSource source)
static XLogRecPtr minRecoveryPoint
Definition: xlogrecovery.c:279
static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *readBuf)
static XLogRecPtr backupEndPoint
Definition: xlogrecovery.c:283
const struct config_enum_entry recovery_target_action_options[]
Definition: xlogrecovery.c:75
static void validateRecoveryParameters(void)
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI, TimeLineID replayTLI)
static XLogRecord * ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr, TimeLineID replayTLI)
void StartupRequestWalReceiverRestart(void)
bool InArchiveRecovery
Definition: xlogrecovery.c:139
static bool recoveryStopsAfter(XLogReaderState *record)
void RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
char * PrimarySlotName
Definition: xlogrecovery.c:98
static TimeLineID curFileTLI
Definition: xlogrecovery.c:125
static char recoveryStopName[MAXFNAMELEN]
Definition: xlogrecovery.c:388
static void CheckRecoveryConsistency(void)
static bool pendingWalRcvRestart
Definition: xlogrecovery.c:249
void PerformWalRecovery(void)
static XLogSource XLogReceiptSource
Definition: xlogrecovery.c:260
bool CheckPromoteSignal(void)
struct XLogPageReadPrivate XLogPageReadPrivate
static bool recoveryStopAfter
Definition: xlogrecovery.c:389
static const char *const xlogSourceNames[]
Definition: xlogrecovery.c:219
static TimeLineID RedoStartTLI
Definition: xlogrecovery.c:171
char * recoveryRestoreCommand
Definition: xlogrecovery.c:83
static void verifyBackupPageConsistency(XLogReaderState *record)
static int XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
void assign_recovery_target(const char *newval, void *extra)
void SetRecoveryPause(bool recoveryPause)
static bool lastSourceFailed
Definition: xlogrecovery.c:248
char * archiveCleanupCommand
Definition: xlogrecovery.c:85
XLogRecPtr GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
static TimeLineID receiveTLI
Definition: xlogrecovery.c:264
void WakeupRecovery(void)
void xlog_outdesc(StringInfo buf, XLogReaderState *record)
static bool LocalPromoteIsTriggered
Definition: xlogrecovery.c:183
bool PromoteIsTriggered(void)
TimestampTz GetCurrentChunkReplayStartTime(void)
static void ConfirmRecoveryPaused(void)
static void readRecoverySignalFile(void)
static XLogRecPtr missingContrecPtr
Definition: xlogrecovery.c:379
static XLogRecoveryCtlData * XLogRecoveryCtl
Definition: xlogrecovery.c:370
static uint32 readOff
Definition: xlogrecovery.c:233
static bool standby_signal_file_found
Definition: xlogrecovery.c:151
char * recovery_target_time_string
Definition: xlogrecovery.c:90
bool StandbyMode
Definition: xlogrecovery.c:148
static int readFile
Definition: xlogrecovery.c:231
static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, bool fetching_ckpt, XLogRecPtr tliRecPtr, TimeLineID replayTLI, XLogRecPtr replayLSN, bool nonblocking)
XLogRecPtr recoveryTargetLSN
Definition: xlogrecovery.c:93
RecoveryTargetType recoveryTarget
Definition: xlogrecovery.c:86
static bool read_tablespace_map(List **tablespaces)
static bool doRequestWalReceiverReply
Definition: xlogrecovery.c:186
static bool read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, bool *backupEndRequired, bool *backupFromStandby)
static int XLogFileRead(XLogSegNo segno, TimeLineID tli, XLogSource source, bool notfoundOk)
static XLogSource currentSource
Definition: xlogrecovery.c:247
XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI)
void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
static List * expectedTLEs
Definition: xlogrecovery.c:124
static XLogSegNo readSegNo
Definition: xlogrecovery.c:232
void assign_recovery_target_name(const char *newval, void *extra)
static XLogRecPtr abortedRecPtr
Definition: xlogrecovery.c:378
static char * primary_image_masked
Definition: xlogrecovery.c:304
static TimeLineID minRecoveryPointTLI
Definition: xlogrecovery.c:280
static XLogRecord * ReadRecord(XLogPrefetcher *xlogprefetcher, int emode, bool fetching_ckpt, TimeLineID replayTLI)
EndOfWalRecoveryInfo * FinishWalRecovery(void)
void assign_recovery_target_time(const char *newval, void *extra)
static void SetCurrentChunkStartTime(TimestampTz xtime)
static XLogRecPtr CheckPointLoc
Definition: xlogrecovery.c:168
bool check_recovery_target_xid(char **newval, void **extra, GucSource source)
static bool LocalHotStandbyActive
Definition: xlogrecovery.c:177
struct XLogRecoveryCtlData XLogRecoveryCtlData
static bool HotStandbyActiveInReplay(void)
static bool InRedo
Definition: xlogrecovery.c:204
static TransactionId recoveryStopXid
Definition: xlogrecovery.c:385
bool check_recovery_target_time(char **newval, void **extra, GucSource source)
static XLogSource readSource
Definition: xlogrecovery.c:235
static void SetPromoteIsTriggered(void)
#define RECOVERY_COMMAND_FILE
Definition: xlogrecovery.c:69
TransactionId recoveryTargetXid
Definition: xlogrecovery.c:89
XLogSource
Definition: xlogrecovery.c:211
@ XLOG_FROM_PG_WAL
Definition: xlogrecovery.c:214
@ XLOG_FROM_STREAM
Definition: xlogrecovery.c:215
@ XLOG_FROM_ARCHIVE
Definition: xlogrecovery.c:213
@ XLOG_FROM_ANY
Definition: xlogrecovery.c:212
TimeLineID recoveryTargetTLIRequested
Definition: xlogrecovery.c:122
static pg_noreturn void error_multiple_recovery_targets(void)
void InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
Definition: xlogrecovery.c:518
static void xlog_block_info(StringInfo buf, XLogReaderState *record)
static TimestampTz XLogReceiptTime
Definition: xlogrecovery.c:259
static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
Size XLogRecoveryShmemSize(void)
Definition: xlogrecovery.c:453
static char * replay_image_masked
Definition: xlogrecovery.c:303
bool wal_receiver_create_temp_slot
Definition: xlogrecovery.c:99
static void CheckTablespaceDirectory(void)
char * recoveryEndCommand
Definition: xlogrecovery.c:84
RecoveryPauseState GetRecoveryPauseState(void)
TimeLineID recoveryTargetTLI
Definition: xlogrecovery.c:123
static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
void assign_recovery_target_lsn(const char *newval, void *extra)
bool check_recovery_target_lsn(char **newval, void **extra, GucSource source)
static XLogRecPtr RedoStartLSN
Definition: xlogrecovery.c:170
static XLogRecPtr flushedUpto
Definition: xlogrecovery.c:263
void XLogRecoveryShmemInit(void)
Definition: xlogrecovery.c:464
static void recoveryPausesHere(bool endOfRecovery)
static uint32 readLen
Definition: xlogrecovery.c:234
static void EnableStandbyMode(void)
Definition: xlogrecovery.c:484
#define RECOVERY_COMMAND_DONE
Definition: xlogrecovery.c:70
static bool recovery_signal_file_found
Definition: xlogrecovery.c:152
TimestampTz recoveryTargetTime
Definition: xlogrecovery.c:91
TimestampTz GetLatestXTime(void)
char * PrimaryConnInfo
Definition: xlogrecovery.c:97
void XLogRequestWalReceiverReply(void)
static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
static XLogPrefetcher * xlogprefetcher
Definition: xlogrecovery.c:192
static bool StandbyModeRequested
Definition: xlogrecovery.c:147
bool check_recovery_target_name(char **newval, void **extra, GucSource source)
bool recoveryTargetInclusive
Definition: xlogrecovery.c:87
static XLogReaderState * xlogreader
Definition: xlogrecovery.c:189
void RemovePromoteSignalFiles(void)
void assign_recovery_target_timeline(const char *newval, void *extra)
static XLogRecPtr backupStartPoint
Definition: xlogrecovery.c:282
static void SetLatestXTime(TimestampTz xtime)
static TimeLineID CheckPointTLI
Definition: xlogrecovery.c:169
RecoveryTargetType
Definition: xlogrecovery.h:24
@ RECOVERY_TARGET_IMMEDIATE
Definition: xlogrecovery.h:30
@ RECOVERY_TARGET_TIME
Definition: xlogrecovery.h:27
@ RECOVERY_TARGET_UNSET
Definition: xlogrecovery.h:25
@ RECOVERY_TARGET_XID
Definition: xlogrecovery.h:26
@ RECOVERY_TARGET_LSN
Definition: xlogrecovery.h:29
@ RECOVERY_TARGET_NAME
Definition: xlogrecovery.h:28
RecoveryTargetTimeLineGoal
Definition: xlogrecovery.h:37
@ RECOVERY_TARGET_TIMELINE_NUMERIC
Definition: xlogrecovery.h:40
@ RECOVERY_TARGET_TIMELINE_CONTROLFILE
Definition: xlogrecovery.h:38
@ RECOVERY_TARGET_TIMELINE_LATEST
Definition: xlogrecovery.h:39
RecoveryPauseState
Definition: xlogrecovery.h:45
@ RECOVERY_PAUSED
Definition: xlogrecovery.h:48
@ RECOVERY_NOT_PAUSED
Definition: xlogrecovery.h:46
@ RECOVERY_PAUSE_REQUESTED
Definition: xlogrecovery.h:47
void wal_segment_close(XLogReaderState *state)
Definition: xlogutils.c:831
Buffer XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum, BlockNumber blkno, ReadBufferMode mode, Buffer recent_buffer)
Definition: xlogutils.c:460
HotStandbyState standbyState
Definition: xlogutils.c:53
bool InRecovery
Definition: xlogutils.c:50
void XLogCheckInvalidPages(void)
Definition: xlogutils.c:234
@ STANDBY_SNAPSHOT_READY
Definition: xlogutils.h:55
@ STANDBY_INITIALIZED
Definition: xlogutils.h:53