PostgreSQL Source Code git master
Loading...
Searching...
No Matches
xlogrecovery.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * xlogrecovery.c
4 * Functions for WAL recovery, standby mode
5 *
6 * This source file contains functions controlling WAL recovery.
7 * InitWalRecovery() initializes the system for crash or archive recovery,
8 * or standby mode, depending on configuration options and the state of
9 * the control file and possible backup label file. PerformWalRecovery()
10 * performs the actual WAL replay, calling the rmgr-specific redo routines.
11 * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12 * and prepares information needed to initialize the WAL for writes. In
13 * addition to these three main functions, there are a bunch of functions
14 * for interrogating recovery state and controlling the recovery process.
15 *
16 *
17 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
18 * Portions Copyright (c) 1994, Regents of the University of California
19 *
20 * src/backend/access/transam/xlogrecovery.c
21 *
22 *-------------------------------------------------------------------------
23 */
24
25#include "postgres.h"
26
27#include <ctype.h>
28#include <time.h>
29#include <sys/stat.h>
30#include <sys/time.h>
31#include <unistd.h>
32
33#include "access/timeline.h"
34#include "access/transam.h"
35#include "access/xact.h"
37#include "access/xlogarchive.h"
39#include "access/xlogreader.h"
40#include "access/xlogrecovery.h"
41#include "access/xlogutils.h"
42#include "access/xlogwait.h"
43#include "backup/basebackup.h"
44#include "catalog/pg_control.h"
45#include "commands/tablespace.h"
46#include "common/file_utils.h"
47#include "miscadmin.h"
48#include "nodes/miscnodes.h"
49#include "pgstat.h"
50#include "postmaster/bgwriter.h"
51#include "postmaster/startup.h"
52#include "replication/slot.h"
55#include "storage/fd.h"
56#include "storage/ipc.h"
57#include "storage/latch.h"
58#include "storage/pmsignal.h"
59#include "storage/procarray.h"
60#include "storage/spin.h"
61#include "storage/subsystems.h"
62#include "utils/datetime.h"
63#include "utils/fmgrprotos.h"
64#include "utils/guc_hooks.h"
66#include "utils/pg_lsn.h"
67#include "utils/ps_status.h"
68#include "utils/pg_rusage.h"
69#include "utils/wait_event.h"
70
71/* Unsupported old recovery command file names (relative to $PGDATA) */
72#define RECOVERY_COMMAND_FILE "recovery.conf"
73#define RECOVERY_COMMAND_DONE "recovery.done"
74
75/*
76 * GUC support
77 */
79 {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
80 {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
81 {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
82 {NULL, 0, false}
83};
84
85/* options formerly taken from recovery.conf for archive recovery */
87char *recoveryEndCommand = NULL;
98
99/* options formerly taken from recovery.conf for XLOG streaming */
100char *PrimaryConnInfo = NULL;
101char *PrimarySlotName = NULL;
103
104/*
105 * recoveryTargetTimeLineGoal: what the user requested, if any
106 *
107 * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
108 *
109 * recoveryTargetTLI: the currently understood target timeline; this can change during recovery
110 *
111 * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
112 * the timelines of its known parents, newest first (so recoveryTargetTLI is
113 * always the first list member). Only these TLIs are expected to be seen in
114 * the WAL segments we read, and indeed only these TLIs will be considered as
115 * candidate WAL files to open at all.
116 *
117 * curFileTLI: the TLI appearing in the name of the current input WAL file.
118 * (This is not necessarily the same as the timeline from which we are
119 * replaying WAL, which StartupXLOG calls replayTLI, because we could be
120 * scanning data that was copied from an ancestor timeline when the current
121 * file was created.) During a sequential scan we do not allow this value
122 * to decrease.
123 */
129
130/*
131 * When ArchiveRecoveryRequested is set, archive recovery was requested,
132 * i.e. signal files were present. When InArchiveRecovery is set, we are
133 * currently recovering using offline XLOG archives. These variables are only
134 * valid in the startup process.
135 *
136 * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
137 * currently performing crash recovery using only XLOG files in pg_wal, but
138 * will switch to using offline XLOG archives as soon as we reach the end of
139 * WAL in pg_wal.
140 */
142bool InArchiveRecovery = false;
143
144/*
145 * When StandbyModeRequested is set, standby mode was requested, i.e.
146 * standby.signal file was present. When StandbyMode is set, we are currently
147 * in standby mode. These variables are only valid in the startup process.
148 * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
149 */
150static bool StandbyModeRequested = false;
151bool StandbyMode = false;
152
153/* was a signal file present at startup? */
154static bool standby_signal_file_found = false;
155static bool recovery_signal_file_found = false;
156
157/*
158 * CheckPointLoc is the position of the checkpoint record that determines
159 * where to start the replay. It comes from the backup label file or the
160 * control file.
161 *
162 * RedoStartLSN is the checkpoint's REDO location, also from the backup label
163 * file or the control file. In standby mode, XLOG streaming usually starts
164 * from the position where an invalid record was found. But if we fail to
165 * read even the initial checkpoint record, we use the REDO location instead
166 * of the checkpoint location as the start position of XLOG streaming.
167 * Otherwise we would have to jump backwards to the REDO location after
168 * reading the checkpoint record, because the REDO record can precede the
169 * checkpoint record.
170 */
175
176/*
177 * Local copy of SharedHotStandbyActive variable. False actually means "not
178 * known, need to check the shared state".
179 */
180static bool LocalHotStandbyActive = false;
181
182/*
183 * Local copy of SharedPromoteIsTriggered variable. False actually means "not
184 * known, need to check the shared state".
185 */
186static bool LocalPromoteIsTriggered = false;
187
188/* Has the recovery code requested a walreceiver wakeup? */
190
191/* XLogReader object used to parse the WAL records */
193
194/* XLogPrefetcher object used to consume WAL records with read-ahead */
196
197/* Parameters passed down from ReadRecord to the XLogPageRead callback. */
199{
200 int emode;
201 bool fetching_ckpt; /* are we fetching a checkpoint record? */
205
206/* flag to tell XLogPageRead that we have started replaying */
207static bool InRedo = false;
208
209/*
210 * Codes indicating where we got a WAL file from during recovery, or where
211 * to attempt to get one.
 *
 * XLOG_FROM_ANY is explicitly assigned 0; the remaining enumerators take
 * consecutive values. The enumerators are declared in the same order as
 * the entries of xlogSourceNames[] below.
212 */
213typedef enum
214{
215 XLOG_FROM_ANY = 0, /* request to read WAL from any source */
216 XLOG_FROM_ARCHIVE, /* restored using restore_command */
217 XLOG_FROM_PG_WAL, /* existing file in pg_wal */
218 XLOG_FROM_STREAM, /* streamed from primary */
219} XLogSource;
220
221/* human-readable names for XLogSources, for debugging output; entries are listed in XLogSource enumerator order */
222static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
223
224/*
225 * readFile is -1 or a kernel FD for the log file segment that's currently
226 * open for reading. readSegNo identifies the segment. readOff is the offset
227 * of the page just read, readLen indicates how much of it has been read into
228 * readBuf, and readSource indicates where we got the currently open file from.
229 *
230 * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
231 * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
232 * worthwhile, since the XLOG is not read by general-purpose sessions.
233 */
234static int readFile = -1;
236static uint32 readOff = 0;
237static uint32 readLen = 0;
239
240/*
241 * Keeps track of which source we're currently reading from. This is
242 * different from readSource in that this is always set, even when we don't
243 * currently have a WAL file open. If lastSourceFailed is set, our last
244 * attempt to read from currentSource failed, and we should try another source
245 * next.
246 *
247 * pendingWalRcvRestart is set when a config change occurs that requires a
248 * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
249 */
251static bool lastSourceFailed = false;
252static bool pendingWalRcvRestart = false;
253
254/*
255 * These variables track when we last obtained some WAL data to process,
256 * and where we got it from. (XLogReceiptSource is initially the same as
257 * readSource, but readSource gets reset to zero when we don't have data
258 * to process right now. It is also different from currentSource, which
259 * also changes when we try to read from a source and fail, while
260 * XLogReceiptSource tracks where we last successfully read some WAL.)
261 */
264
265/* Local copy of WalRcv->flushedUpto */
268
269/*
270 * Copy of minRecoveryPoint and backupEndPoint from the control file.
271 *
272 * In order to reach consistency, we must replay the WAL up to
273 * minRecoveryPoint. If backupEndRequired is true, we must also reach
274 * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
275 * to backupStartPoint.
276 *
277 * Note: In archive recovery, after consistency has been reached, the
278 * functions in xlog.c will start updating minRecoveryPoint in the control
279 * file. But this copy of minRecoveryPoint variable reflects the value at the
280 * beginning of recovery, and is *not* updated after consistency is reached.
281 */
284
287static bool backupEndRequired = false;
288
289/*
290 * Have we reached a consistent database state? In crash recovery, we have
291 * to replay all the WAL, so reachedConsistency is never set. During archive
292 * recovery, the database is consistent once minRecoveryPoint is reached.
293 *
294 * Consistent state means that the system is internally consistent, all
295 * the WAL has been replayed up to a certain point, and importantly, there
296 * is no trace of later actions on disk.
297 *
298 * This flag is used only by the startup process and postmaster. When
299 * minRecoveryPoint is reached, the startup process sets it to true and
300 * sends a PMSIGNAL_RECOVERY_CONSISTENT signal to the postmaster,
301 * which then sets it to true upon receiving the signal.
302 */
304
305/* Buffers dedicated to consistency checks of size BLCKSZ */
306static char *replay_image_masked = NULL;
307static char *primary_image_masked = NULL;
308
310
311static void XLogRecoveryShmemRequest(void *arg);
312static void XLogRecoveryShmemInit(void *arg);
313
318
319/*
320 * abortedRecPtr is the start pointer of a broken record at end of WAL when
321 * recovery completes; missingContrecPtr is the location of the first
322 * contrecord that went missing. See CreateOverwriteContrecordRecord for
323 * details.
324 */
327
328/*
329 * if recoveryStopsBefore/After returns true, it saves information of the stop
330 * point here
331 */
337
338/* prototypes for local functions */
339static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
340
341static void EnableStandbyMode(void);
342static void readRecoverySignalFile(void);
343static void validateRecoveryParameters(void);
344static bool read_backup_label(XLogRecPtr *checkPointLoc,
345 TimeLineID *backupLabelTLI,
346 bool *backupEndRequired, bool *backupFromStandby);
347static bool read_tablespace_map(List **tablespaces);
348
349static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
350static void CheckRecoveryConsistency(void);
351static void rm_redo_error_callback(void *arg);
352#ifdef WAL_DEBUG
353static void xlog_outrec(StringInfo buf, XLogReaderState *record);
354#endif
355static void xlog_block_info(StringInfo buf, XLogReaderState *record);
356static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
357 TimeLineID prevTLI, TimeLineID replayTLI);
358static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
360
361static bool recoveryStopsBefore(XLogReaderState *record);
362static bool recoveryStopsAfter(XLogReaderState *record);
363static char *getRecoveryStopReason(void);
364static void recoveryPausesHere(bool endOfRecovery);
365static bool recoveryApplyDelay(XLogReaderState *record);
366static void ConfirmRecoveryPaused(void);
367
369 int emode, bool fetching_ckpt,
370 TimeLineID replayTLI);
371
372static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
373 int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
375 bool randAccess,
376 bool fetching_ckpt,
377 XLogRecPtr tliRecPtr,
378 TimeLineID replayTLI,
379 XLogRecPtr replayLSN,
380 bool nonblocking);
381static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
383 XLogRecPtr RecPtr, TimeLineID replayTLI);
384static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
385static int XLogFileRead(XLogSegNo segno, TimeLineID tli,
386 XLogSource source, bool notfoundOk);
388
389static bool CheckForStandbyTrigger(void);
390static void SetPromoteIsTriggered(void);
391static bool HotStandbyActiveInReplay(void);
392
393static void SetCurrentChunkStartTime(TimestampTz xtime);
394static void SetLatestXTime(TimestampTz xtime);
395
396/*
397 * Register shared memory for WAL recovery
398 */
399static void
401{
402 ShmemRequestStruct(.name = "XLOG Recovery Ctl",
403 .size = sizeof(XLogRecoveryCtlData),
404 .ptr = (void **) &XLogRecoveryCtl,
405 );
406}
407
408static void
417
418/*
419 * A thin wrapper to enable StandbyMode and do other preparatory work as
420 * needed.
421 */
422static void
424{
425 StandbyMode = true;
426
427 /*
428 * To avoid server log bloat, we don't report recovery progress in a
429 * standby as it will always be in recovery unless promoted. We disable
430 * startup progress timeout in standby mode to avoid calling
431 * startup_progress_timeout_handler() unnecessarily.
432 */
434}
435
436/*
437 * Prepare the system for WAL recovery, if needed.
438 *
439 * This is called by StartupXLOG() which coordinates the server startup
440 * sequence. This function analyzes the control file and the backup label
441 * file, if any, and figures out whether we need to perform crash recovery or
442 * archive recovery, and how far we need to replay the WAL to reach a
443 * consistent state.
444 *
445 * This doesn't yet change the on-disk state, except for creating the symlinks
446 * from table space map file if any, and for fetching WAL files needed to find
447 * the checkpoint record. On entry, the caller has already read the control
448 * file into memory, and passes it as argument. This function updates it to
449 * reflect the recovery state, and the caller is expected to write it back to
450 * disk after initializing other subsystems, but before calling
451 * PerformWalRecovery().
452 *
453 * This initializes some global variables, such as ArchiveRecoveryRequested,
454 * StandbyModeRequested and InRecovery.
455 */
456void
458 bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
459{
460 XLogPageReadPrivate *private;
461 struct stat st;
462 bool wasShutdown;
463 XLogRecord *record;
464 DBState dbstate_at_startup;
465 bool haveTblspcMap = false;
466 bool haveBackupLabel = false;
467 CheckPoint checkPoint;
468 bool backupFromStandby = false;
469
470 dbstate_at_startup = ControlFile->state;
471
472 /*
473 * Initialize on the assumption we want to recover to the latest timeline
474 * that's active according to pg_control.
475 */
479 else
481
482 /*
483 * Check for signal files, and if so set up state for offline recovery
484 */
487
488 /*
489 * Take ownership of the wakeup latch if we might need to sleep during
490 * recovery.
491 */
494
495 /*
496 * Set the WAL reading processor now, as it will be needed when reading
497 * the checkpoint record required (backup_label or not).
498 */
500 xlogreader =
502 XL_ROUTINE(.page_read = &XLogPageRead,
503 .segment_open = NULL,
504 .segment_close = wal_segment_close),
505 private);
506 if (!xlogreader)
508 (errcode(ERRCODE_OUT_OF_MEMORY),
509 errmsg("out of memory"),
510 errdetail("Failed while allocating a WAL reading processor.")));
512
513 /*
514 * Set the WAL decode buffer size. This limits how far ahead we can read
515 * in the WAL.
516 */
518
519 /* Create a WAL prefetcher. */
521
522 /*
523 * Allocate two page buffers dedicated to WAL consistency checks. We do
524 * it this way, rather than just making static arrays, for two reasons:
525 * (1) no need to waste the storage in most instantiations of the backend;
526 * (2) a static char array isn't guaranteed to have any particular
527 * alignment, whereas palloc() will provide MAXALIGN'd storage.
528 */
529 replay_image_masked = (char *) palloc(BLCKSZ);
530 primary_image_masked = (char *) palloc(BLCKSZ);
531
532 /*
533 * Read the backup_label file. We want to run this part of the recovery
534 * process after checking for signal files and after performing validation
535 * of the recovery parameters.
536 */
538 &backupFromStandby))
539 {
540 List *tablespaces = NIL;
541
542 /*
543 * Archive recovery was requested, and thanks to the backup label
544 * file, we know how far we need to replay to reach consistency. Enter
545 * archive recovery directly.
546 */
547 InArchiveRecovery = true;
550
551 /*
552 * Omitting backup_label when creating a new replica, PITR node etc.
553 * unfortunately is a common cause of corruption. Logging that
554 * backup_label was used makes it a bit easier to exclude that as the
555 * cause of observed corruption.
556 *
557 * Do so before we try to read the checkpoint record (which can fail),
558 * as otherwise it can be hard to understand why a checkpoint other
559 * than ControlFile->checkPoint is used.
560 */
561 ereport(LOG,
562 errmsg("starting backup recovery with redo LSN %X/%08X, checkpoint LSN %X/%08X, on timeline ID %u",
566
567 /*
568 * When a backup_label file is present, we want to roll forward from
569 * the checkpoint it identifies, rather than using pg_control.
570 */
573 if (record != NULL)
574 {
575 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
576 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
578 errmsg_internal("checkpoint record is at %X/%08X",
580 InRecovery = true; /* force recovery even if SHUTDOWNED */
581
582 /*
583 * Make sure that REDO location exists. This may not be the case
584 * if there was a crash during an online backup, which left a
585 * backup_label around that references a WAL segment that's
586 * already been archived.
587 */
588 if (checkPoint.redo < CheckPointLoc)
589 {
591 if (!ReadRecord(xlogprefetcher, LOG, false,
592 checkPoint.ThisTimeLineID))
594 errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
596 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
597 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
598 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
600 }
601 }
602 else
603 {
605 errmsg("could not locate required checkpoint record at %X/%08X",
607 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
608 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
609 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
611 wasShutdown = false; /* keep compiler quiet */
612 }
613
614 /* Read the tablespace_map file if present and create symlinks. */
615 if (read_tablespace_map(&tablespaces))
616 {
617 ListCell *lc;
618
619 foreach(lc, tablespaces)
620 {
621 tablespaceinfo *ti = lfirst(lc);
622 char *linkloc;
623
624 linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
625
626 /*
627 * Remove the existing symlink, if any, and create the new symlink
628 * under PGDATA.
629 */
631
632 if (symlink(ti->path, linkloc) < 0)
635 errmsg("could not create symbolic link \"%s\": %m",
636 linkloc)));
637
638 pfree(ti->path);
639 pfree(ti);
640 }
641
642 /* tell the caller to delete it later */
643 haveTblspcMap = true;
644 }
645
646 /* tell the caller to delete it later */
647 haveBackupLabel = true;
648 }
649 else
650 {
651 /* No backup_label file has been found if we are here. */
652
653 /*
654 * If a tablespace_map file is present without a backup_label file, the
655 * map file is of no use. There is no harm in retaining it, but it is
656 * better to get rid of it so that the data directory does not contain a
657 * redundant file that could cause confusion. It seems prudent, though,
658 * to just rename the file out of the way rather than delete it
659 * completely. We also ignore any error that occurs in the rename
660 * operation, because a map file that is present without a backup_label
661 * file is harmless.
662 */
663 if (stat(TABLESPACE_MAP, &st) == 0)
664 {
665 unlink(TABLESPACE_MAP_OLD);
667 ereport(LOG,
668 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
670 errdetail("File \"%s\" was renamed to \"%s\".",
672 else
673 ereport(LOG,
674 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
676 errdetail("Could not rename file \"%s\" to \"%s\": %m.",
678 }
679
680 /*
681 * It's possible that archive recovery was requested, but we don't
682 * know how far we need to replay the WAL before we reach consistency.
683 * This can happen for example if a base backup is taken from a
684 * running server using an atomic filesystem snapshot, without calling
685 * pg_backup_start/stop. Or if you just kill a running primary server
686 * and put it into archive recovery by creating a recovery signal
687 * file.
688 *
689 * Our strategy in that case is to perform crash recovery first,
690 * replaying all the WAL present in pg_wal, and only enter archive
691 * recovery after that.
692 *
693 * But usually we already know how far we need to replay the WAL (up
694 * to minRecoveryPoint, up to backupEndPoint, or until we see an
695 * end-of-backup record), and we can enter archive recovery directly.
696 */
702 {
703 InArchiveRecovery = true;
706 }
707
708 /*
709 * For the same reason as when starting up with backup_label present,
710 * emit a log message when we continue initializing from a base
711 * backup.
712 */
714 ereport(LOG,
715 errmsg("restarting backup recovery with redo LSN %X/%08X",
717
718 /* Get the last valid checkpoint record. */
725 if (record != NULL)
726 {
728 errmsg_internal("checkpoint record is at %X/%08X",
730 }
731 else
732 {
733 /*
734 * We used to attempt to go back to a secondary checkpoint record
735 * here, but only when not in standby mode. We now just fail if we
736 * can't read the last checkpoint because this allows us to
737 * simplify processing around checkpoints.
738 */
740 errmsg("could not locate a valid checkpoint record at %X/%08X",
742 }
743 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
744 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
745
746 /* Make sure that REDO location exists. */
747 if (checkPoint.redo < CheckPointLoc)
748 {
750 if (!ReadRecord(xlogprefetcher, LOG, false, checkPoint.ThisTimeLineID))
752 errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
754 }
755 }
756
758 {
760 ereport(LOG,
761 (errmsg("entering standby mode")));
763 ereport(LOG,
764 (errmsg("starting point-in-time recovery to XID %u",
767 ereport(LOG,
768 (errmsg("starting point-in-time recovery to %s",
771 ereport(LOG,
772 (errmsg("starting point-in-time recovery to \"%s\"",
775 ereport(LOG,
776 errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%08X\"",
779 ereport(LOG,
780 (errmsg("starting point-in-time recovery to earliest consistent point")));
781 else
782 ereport(LOG,
783 (errmsg("starting archive recovery")));
784 }
785
786 /*
787 * If the location of the checkpoint record is not on the expected
788 * timeline in the history of the requested timeline, we cannot proceed:
789 * the backup is not part of the history of the requested timeline.
790 */
791 Assert(expectedTLEs); /* was initialized by reading checkpoint
792 * record */
795 {
796 XLogRecPtr switchpoint;
797
798 /*
799 * tliSwitchPoint will throw an error if the checkpoint's timeline is
800 * not in expectedTLEs at all.
801 */
802 switchpoint = tliSwitchPoint(CheckPointTLI, expectedTLEs, NULL);
804 (errmsg("requested timeline %u is not a child of this server's history",
806 /* translator: %s is a backup_label file or a pg_control file */
807 errdetail("Latest checkpoint in file \"%s\" is at %X/%08X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%08X.",
808 haveBackupLabel ? "backup_label" : "pg_control",
811 LSN_FORMAT_ARGS(switchpoint))));
812 }
813
814 /*
815 * The min recovery point should be part of the requested timeline's
816 * history, too.
817 */
822 errmsg("requested timeline %u does not contain minimum recovery point %X/%08X on timeline %u",
826
828 errmsg_internal("redo record is at %X/%08X; shutdown %s",
829 LSN_FORMAT_ARGS(checkPoint.redo),
830 wasShutdown ? "true" : "false"));
832 (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
834 checkPoint.nextOid)));
836 (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %" PRIu64,
837 checkPoint.nextMulti, checkPoint.nextMultiOffset)));
839 (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
840 checkPoint.oldestXid, checkPoint.oldestXidDB)));
842 (errmsg_internal("oldest MultiXactId: %u, in database %u",
843 checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
845 (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
846 checkPoint.oldestCommitTsXid,
847 checkPoint.newestCommitTsXid)));
850 (errmsg("invalid next transaction ID")));
851
852 /* sanity check */
853 if (checkPoint.redo > CheckPointLoc)
855 (errmsg("invalid redo in checkpoint record")));
856
857 /*
858 * Check whether we need to force recovery from WAL. If it appears to
859 * have been a clean shutdown and we did not have a recovery signal file,
860 * then assume no recovery needed.
861 */
862 if (checkPoint.redo < CheckPointLoc)
863 {
864 if (wasShutdown)
866 (errmsg("invalid redo record in shutdown checkpoint")));
867 InRecovery = true;
868 }
869 else if (ControlFile->state != DB_SHUTDOWNED)
870 InRecovery = true;
872 {
873 /* force recovery due to presence of recovery signal file */
874 InRecovery = true;
875 }
876
877 /*
878 * If recovery is needed, update our in-memory copy of pg_control to show
879 * that we are recovering and to show the selected checkpoint as the place
880 * we are starting from. We also mark pg_control with any minimum recovery
881 * stop point obtained from a backup history file.
882 *
883 * We don't write the changes to disk yet, though. Only do that after
884 * initializing various subsystems.
885 */
886 if (InRecovery)
887 {
889 {
891 }
892 else
893 {
894 ereport(LOG,
895 (errmsg("database system was not properly shut down; "
896 "automatic recovery in progress")));
898 ereport(LOG,
899 (errmsg("crash recovery starts in timeline %u "
900 "and has target timeline %u",
904 }
906 ControlFile->checkPointCopy = checkPoint;
908 {
909 /* initialize minRecoveryPoint if not set yet */
910 if (ControlFile->minRecoveryPoint < checkPoint.redo)
911 {
912 ControlFile->minRecoveryPoint = checkPoint.redo;
914 }
915 }
916
917 /*
918 * Set backupStartPoint if we're starting recovery from a base backup.
919 *
920 * Also set backupEndPoint and use minRecoveryPoint as the backup end
921 * location if we're starting recovery from a base backup which was
922 * taken from a standby. In this case, the database system status in
923 * pg_control must indicate that the database was already in recovery.
924 * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
925 * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
926 * before reaching this point; e.g. because restore_command or
927 * primary_conninfo were faulty.
928 *
929 * Any other state indicates that the backup somehow became corrupted
930 * and we can't sensibly continue with recovery.
931 */
932 if (haveBackupLabel)
933 {
934 ControlFile->backupStartPoint = checkPoint.redo;
936
937 if (backupFromStandby)
938 {
939 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
940 dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
942 (errmsg("backup_label contains data inconsistent with control file"),
943 errhint("This means that the backup is corrupted and you will "
944 "have to use another backup for recovery.")));
946 }
947 }
948 }
949
950 /* remember these, so that we know when we have reached consistency */
955 {
958 }
959 else
960 {
963 }
964
965 /*
966 * Start recovery assuming that the final record isn't lost.
967 */
970
971 *wasShutdown_ptr = wasShutdown;
972 *haveBackupLabel_ptr = haveBackupLabel;
973 *haveTblspcMap_ptr = haveTblspcMap;
974}
975
976/*
977 * See if there are any recovery signal files and if so, set state for
978 * recovery.
979 *
980 * See if there is a recovery command file (recovery.conf), and if so
981 * throw an ERROR since as of PG12 we no longer recognize that.
982 */
983static void
985{
986 struct stat stat_buf;
987
989 return;
990
991 /*
992 * Check for old recovery API file: recovery.conf
993 */
994 if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
997 errmsg("using recovery command file \"%s\" is not supported",
999
1000 /*
1001 * Remove unused .done file, if present. Ignore if absent.
1002 */
1003 unlink(RECOVERY_COMMAND_DONE);
1004
1005 /*
1006 * Check for recovery signal files and if found, fsync them since they
1007 * represent server state information. We don't sweat too much about the
1008 * possibility of fsync failure, however.
1009 */
1010 if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1011 {
1012 int fd;
1013
1015 S_IRUSR | S_IWUSR);
1016 if (fd >= 0)
1017 {
1018 (void) pg_fsync(fd);
1019 close(fd);
1020 }
1022 }
1023
1024 if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1025 {
1026 int fd;
1027
1029 S_IRUSR | S_IWUSR);
1030 if (fd >= 0)
1031 {
1032 (void) pg_fsync(fd);
1033 close(fd);
1034 }
1036 }
1037
1038 /*
1039 * If both signal files are present, standby signal file takes precedence.
1040 * If neither is present then we won't enter archive recovery.
1041 */
1042 StandbyModeRequested = false;
1045 {
1046 StandbyModeRequested = true;
1048 }
1050 {
1051 StandbyModeRequested = false;
1053 }
1054 else
1055 return;
1056
1057 /*
1058 * We don't support standby mode in standalone backends; that requires
1059 * other processes such as the WAL receiver to be alive.
1060 */
1062 ereport(FATAL,
1063 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1064 errmsg("standby mode is not supported by single-user servers")));
1065}
1066
1067static void
1069{
1071 return;
1072
1073 /*
1074 * Check for compulsory parameters
1075 */
1077 {
1078 if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1079 (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1081 (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1082 errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1083 }
1084 else
1085 {
1086 if (recoveryRestoreCommand == NULL ||
1087 strcmp(recoveryRestoreCommand, "") == 0)
1088 ereport(FATAL,
1089 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1090 errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1091 }
1092
1093 /*
1094 * Override any inconsistent requests. Note that this is a change of
1095 * behaviour in 9.5; prior to this we simply ignored a request to pause if
1096 * hot_standby = off, which was surprising behaviour.
1097 */
1101
1102 /*
1103 * Final parsing of recovery_target_time string; see also
1104 * check_recovery_target_time().
1105 */
1107 {
1111 Int32GetDatum(-1)));
1112 }
1113
1114 /*
1115 * If user specified recovery_target_timeline, validate it or compute the
1116 * "latest" value. We can't do this until after we've gotten the restore
1117 * command and set InArchiveRecovery, because we need to fetch timeline
1118 * history files from the archive.
1119 */
1121 {
1123
1124 /* Timeline 1 does not have a history file, all else should */
1125 if (rtli != 1 && !existsTimeLineHistory(rtli))
1126 ereport(FATAL,
1127 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1128 errmsg("recovery target timeline %u does not exist",
1129 rtli)));
1130 recoveryTargetTLI = rtli;
1131 }
1133 {
1134 /* We start the "latest" search from pg_control's timeline */
1136 }
1137 else
1138 {
1139 /*
1140 * else we just use the recoveryTargetTLI as already read from
1141 * ControlFile
1142 */
1144 }
1145}
1146
1147/*
1148 * read_backup_label: check to see if a backup_label file is present
1149 *
1150 * If we see a backup_label during recovery, we assume that we are recovering
1151 * from a backup dump file, and we therefore roll forward from the checkpoint
1152 * identified by the label file, NOT what pg_control says. This avoids the
1153 * problem that pg_control might have been archived one or more checkpoints
1154 * later than the start of the dump, and so if we rely on it as the start
1155 * point, we will fail to restore a consistent database state.
1156 *
1157 * Returns true if a backup_label was found (and fills the checkpoint
1158 * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1159 * returns false if not. If this backup_label came from a streamed backup,
1160 * *backupEndRequired is set to true. If this backup_label was created during
1161 * recovery, *backupFromStandby is set to true.
1162 *
1163 * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1164 * and TLI read from the backup file.
1165 */
1166static bool
1167read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1168 bool *backupEndRequired, bool *backupFromStandby)
1169{
1170 char startxlogfilename[MAXFNAMELEN];
1171 TimeLineID tli_from_walseg,
1172 tli_from_file;
1173 FILE *lfp;
1174 char ch;
1175 char backuptype[20];
1176 char backupfrom[20];
1177 char backuplabel[MAXPGPATH];
1178 char backuptime[128];
1179 uint32 hi,
1180 lo;
1181
1182 /* suppress possible uninitialized-variable warnings */
1183 *checkPointLoc = InvalidXLogRecPtr;
1184 *backupLabelTLI = 0;
1185 *backupEndRequired = false;
1186 *backupFromStandby = false;
1187
1188 /*
1189 * See if label file is present
1190 */
1191 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1192 if (!lfp)
1193 {
1194 if (errno != ENOENT)
1195 ereport(FATAL,
1197 errmsg("could not read file \"%s\": %m",
1199 return false; /* it's not there, all is fine */
1200 }
1201
1202 /*
1203 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1204 * is pretty crude, but we are not expecting any variability in the file
1205 * format).
1206 */
1207 if (fscanf(lfp, "START WAL LOCATION: %X/%08X (file %08X%16s)%c",
1208 &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1209 ereport(FATAL,
1210 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1211 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1212 RedoStartLSN = ((uint64) hi) << 32 | lo;
1213 RedoStartTLI = tli_from_walseg;
1214 if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%08X%c",
1215 &hi, &lo, &ch) != 3 || ch != '\n')
1216 ereport(FATAL,
1217 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1218 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1219 *checkPointLoc = ((uint64) hi) << 32 | lo;
1220 *backupLabelTLI = tli_from_walseg;
1221
1222 /*
1223 * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1224 * which could mean either pg_basebackup or the pg_backup_start/stop
1225 * method was used) or if this label came from somewhere else (the only
1226 * other option today being from pg_rewind). If this was a streamed
1227 * backup then we know that we need to play through until we get to the
1228 * end of the WAL which was generated during the backup (at which point we
1229 * will have reached consistency and backupEndRequired will be reset to be
1230 * false).
1231 */
1232 if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1233 {
1234 if (strcmp(backuptype, "streamed") == 0)
1235 *backupEndRequired = true;
1236 }
1237
1238 /*
1239 * BACKUP FROM lets us know if this was from a primary or a standby. If
1240 * it was from a standby, we'll double-check that the control file state
1241 * matches that of a standby.
1242 */
1243 if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1244 {
1245 if (strcmp(backupfrom, "standby") == 0)
1246 *backupFromStandby = true;
1247 }
1248
1249 /*
1250 * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1251 * but checking for their presence is useful for debugging and the next
1252 * sanity checks. Cope also with the fact that the result buffers have a
1253 * pre-allocated size, hence if the backup_label file has been generated
1254 * with strings longer than the maximum assumed here an incorrect parsing
1255 * happens. That's fine as only minor consistency checks are done
1256 * afterwards.
1257 */
1258 if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1260 (errmsg_internal("backup time %s in file \"%s\"",
1261 backuptime, BACKUP_LABEL_FILE)));
1262
1263 if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1265 (errmsg_internal("backup label %s in file \"%s\"",
1266 backuplabel, BACKUP_LABEL_FILE)));
1267
1268 /*
1269 * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1270 * it as a sanity check if present.
1271 */
1272 if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1273 {
1274 if (tli_from_walseg != tli_from_file)
1275 ereport(FATAL,
1276 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1277 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1278 errdetail("Timeline ID parsed is %u, but expected %u.",
1279 tli_from_file, tli_from_walseg)));
1280
1282 (errmsg_internal("backup timeline %u in file \"%s\"",
1283 tli_from_file, BACKUP_LABEL_FILE)));
1284 }
1285
1286 if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%08X\n", &hi, &lo) > 0)
1287 ereport(FATAL,
1288 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1289 errmsg("this is an incremental backup, not a data directory"),
1290 errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1291
1292 if (ferror(lfp) || FreeFile(lfp))
1293 ereport(FATAL,
1295 errmsg("could not read file \"%s\": %m",
1297
1298 return true;
1299}
1300
1301/*
1302 * read_tablespace_map: check to see if a tablespace_map file is present
1303 *
1304 * If we see a tablespace_map file during recovery, we assume that we are
1305 * recovering from a backup dump file, and we therefore need to create symlinks
1306 * as per the information present in tablespace_map file.
1307 *
1308 * Returns true if a tablespace_map file was found (and fills *tablespaces
1309 * with a tablespaceinfo struct for each tablespace listed in the file);
1310 * returns false if not.
1311 */
1312static bool
1314{
1315 tablespaceinfo *ti;
1316 FILE *lfp;
1317 char str[MAXPGPATH];
1318 int ch,
1319 i,
1320 n;
1321 bool was_backslash;
1322
1323 /*
1324 * See if tablespace_map file is present
1325 */
1326 lfp = AllocateFile(TABLESPACE_MAP, "r");
1327 if (!lfp)
1328 {
1329 if (errno != ENOENT)
1330 ereport(FATAL,
1332 errmsg("could not read file \"%s\": %m",
1333 TABLESPACE_MAP)));
1334 return false; /* it's not there, all is fine */
1335 }
1336
1337 /*
1338 * Read and parse the link name and path lines from tablespace_map file
1339 * (this code is pretty crude, but we are not expecting any variability in
1340 * the file format). De-escape any backslashes that were inserted.
1341 */
1342 i = 0;
1343 was_backslash = false;
1344 while ((ch = fgetc(lfp)) != EOF)
1345 {
1346 if (!was_backslash && (ch == '\n' || ch == '\r'))
1347 {
1348 char *endp;
1349
1350 if (i == 0)
1351 continue; /* \r immediately followed by \n */
1352
1353 /*
1354 * The de-escaped line should contain an OID followed by exactly
1355 * one space followed by a path. The path might start with
1356 * spaces, so don't be too liberal about parsing.
1357 */
1358 str[i] = '\0';
1359 n = 0;
1360 while (str[n] && str[n] != ' ')
1361 n++;
1362 if (n < 1 || n >= i - 1)
1363 ereport(FATAL,
1364 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1365 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1366 str[n++] = '\0';
1367
1369 errno = 0;
1370 ti->oid = strtoul(str, &endp, 10);
1371 if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1372 ereport(FATAL,
1373 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1374 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1375 ti->path = pstrdup(str + n);
1376 *tablespaces = lappend(*tablespaces, ti);
1377
1378 i = 0;
1379 continue;
1380 }
1381 else if (!was_backslash && ch == '\\')
1382 was_backslash = true;
1383 else
1384 {
1385 if (i < sizeof(str) - 1)
1386 str[i++] = ch;
1387 was_backslash = false;
1388 }
1389 }
1390
1391 if (i != 0 || was_backslash) /* last line not terminated? */
1392 ereport(FATAL,
1393 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1394 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1395
1396 if (ferror(lfp) || FreeFile(lfp))
1397 ereport(FATAL,
1399 errmsg("could not read file \"%s\": %m",
1400 TABLESPACE_MAP)));
1401
1402 return true;
1403}
1404
1405/*
1406 * Finish WAL recovery.
1407 *
1408 * This does not close the 'xlogreader' yet, because in some cases the caller
1409 * still wants to re-read the last checkpoint record by calling
1410 * ReadCheckpointRecord().
1411 *
1412 * Returns the position of the last valid or applied record, after which new
1413 * WAL should be appended, information about why recovery was ended, and some
1414 * other things. See the EndOfWalRecoveryInfo struct for details.
1415 */
1418{
1420 XLogRecPtr lastRec;
1421 TimeLineID lastRecTLI;
1422 XLogRecPtr endOfLog;
1423
1424 /*
1425 * Kill WAL receiver, if it's still running, before we continue to write
1426 * the startup checkpoint and aborted-contrecord records. It will trump
1427 * over these records and subsequent ones if it's still alive when we
1428 * start writing WAL.
1429 */
1431
1432 /*
1433 * Shutdown the slot sync worker to drop any temporary slots acquired by
1434 * it and to prevent it from keep trying to fetch the failover slots.
1435 *
1436 * We do not update the 'synced' column in 'pg_replication_slots' system
1437 * view from true to false here, as any failed update could leave 'synced'
1438 * column false for some slots. This could cause issues during slot sync
1439 * after restarting the server as a standby. While updating the 'synced'
1440 * column after switching to the new timeline is an option, it does not
1441 * simplify the handling for the 'synced' column. Therefore, we retain the
1442 * 'synced' column as true after promotion as it may provide useful
1443 * information about the slot origin.
1444 */
1446
1447 /*
1448 * We are now done reading the xlog from stream. Turn off streaming
1449 * recovery to force fetching the files (which would be required at end of
1450 * recovery, e.g., timeline history file) from archive or pg_wal.
1451 *
1452 * Note that standby mode must be turned off after killing WAL receiver,
1453 * i.e., calling XLogShutdownWalRcv().
1454 */
1456 StandbyMode = false;
1457
1458 /*
1459 * Determine where to start writing WAL next.
1460 *
1461 * Re-fetch the last valid or last applied record, so we can identify the
1462 * exact endpoint of what we consider the valid portion of WAL. There may
1463 * be an incomplete continuation record after that, in which case
1464 * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1465 * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1466 * it is intentionally missing. See CreateOverwriteContrecordRecord().
1467 *
1468 * An important side-effect of this is to load the last page into
1469 * xlogreader. The caller uses it to initialize the WAL for writing.
1470 */
1471 if (!InRecovery)
1472 {
1473 lastRec = CheckPointLoc;
1474 lastRecTLI = CheckPointTLI;
1475 }
1476 else
1477 {
1479 lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1480 }
1482 (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1483 endOfLog = xlogreader->EndRecPtr;
1484
1485 /*
1486 * Remember the TLI in the filename of the XLOG segment containing the
1487 * end-of-log. It could be different from the timeline that endOfLog
1488 * nominally belongs to, if there was a timeline switch in that segment,
1489 * and we were reading the old WAL from a segment belonging to a higher
1490 * timeline.
1491 */
1492 result->endOfLogTLI = xlogreader->seg.ws_tli;
1493
1495 {
1496 /*
1497 * We are no longer in archive recovery state.
1498 *
1499 * We are now done reading the old WAL. Turn off archive fetching if
1500 * it was active.
1501 */
1503 InArchiveRecovery = false;
1504
1505 /*
1506 * If the ending log segment is still open, close it (to avoid
1507 * problems on Windows with trying to rename or delete an open file).
1508 */
1509 if (readFile >= 0)
1510 {
1511 close(readFile);
1512 readFile = -1;
1513 }
1514 }
1515
1516 /*
1517 * Copy the last partial block to the caller, for initializing the WAL
1518 * buffer for appending new WAL.
1519 */
1520 if (endOfLog % XLOG_BLCKSZ != 0)
1521 {
1522 char *page;
1523 int len;
1524 XLogRecPtr pageBeginPtr;
1525
1526 pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1528
1529 /* Copy the valid part of the last block */
1530 len = endOfLog % XLOG_BLCKSZ;
1531 page = palloc(len);
1532 memcpy(page, xlogreader->readBuf, len);
1533
1534 result->lastPageBeginPtr = pageBeginPtr;
1535 result->lastPage = page;
1536 }
1537 else
1538 {
1539 /* There is no partial block to copy. */
1540 result->lastPageBeginPtr = endOfLog;
1541 result->lastPage = NULL;
1542 }
1543
1544 /*
1545 * Create a comment for the history file to explain why and where timeline
1546 * changed.
1547 */
1548 result->recoveryStopReason = getRecoveryStopReason();
1549
1550 result->lastRec = lastRec;
1551 result->lastRecTLI = lastRecTLI;
1552 result->endOfLog = endOfLog;
1553
1554 result->abortedRecPtr = abortedRecPtr;
1555 result->missingContrecPtr = missingContrecPtr;
1556
1557 result->standby_signal_file_found = standby_signal_file_found;
1558 result->recovery_signal_file_found = recovery_signal_file_found;
1559
1560 return result;
1561}
1562
1563/*
1564 * Clean up the WAL reader and leftovers from restoring WAL from archive
1565 */
1566void
1568{
1569 char recoveryPath[MAXPGPATH];
1570
1571 /* Final update of pg_stat_recovery_prefetch. */
1573
1574 /* Shut down xlogreader */
1575 if (readFile >= 0)
1576 {
1577 close(readFile);
1578 readFile = -1;
1579 }
1583
1585 {
1586 /*
1587 * Since there might be a partial WAL segment named RECOVERYXLOG, get
1588 * rid of it.
1589 */
1590 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1591 unlink(recoveryPath); /* ignore any error */
1592
1593 /* Get rid of any remaining recovered timeline-history file, too */
1594 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1595 unlink(recoveryPath); /* ignore any error */
1596 }
1597
1598 /*
1599 * We don't need the latch anymore. It's not strictly necessary to disown
1600 * it, but let's do it for the sake of tidiness.
1601 */
1604}
1605
1606/*
1607 * Perform WAL recovery.
1608 *
1609 * If the system was shut down cleanly, this is never called.
1610 */
/*
 * Overview of the visible logic: read the first record to replay; if one is
 * found, set InRedo, call RmgrStartup() and run the redo loop, which applies
 * each record with ApplyWalRecord() until ReadRecord() returns NULL or a
 * recovery target is reached.  Reaching the target before consistency is
 * FATAL.  Afterwards RmgrCleanup() is called and the end of redo is logged.
 * If no record is found, "redo is not required" is logged instead.
 *
 * NOTE(review): this listing is missing a number of source lines (function
 * name, several conditions and call arguments); restore them from the
 * upstream file before compiling.
 */
1611 void
1613 {
1614 XLogRecord *record;
1615 bool reachedRecoveryTarget = false;
1616 TimeLineID replayTLI;
1617
1618 /*
1619 * Initialize shared variables for tracking progress of WAL replay, as if
1620 * we had just replayed the record before the REDO location (or the
1621 * checkpoint record itself, if it's a shutdown checkpoint).
1622 */
1625 {
1629 }
1630 else
1631 {
1635 }
1642
1643 /* Also ensure XLogReceiptTime has a sane value */
1645
1646 /*
1647 * Let postmaster know we've started redo now, so that it can launch the
1648 * archiver if necessary.
1649 */
1652
1653 /*
1654 * Allow read-only connections immediately if we're consistent already.
1655 */
1657
1658 /*
1659 * Find the first record that logically follows the checkpoint --- it
1660 * might physically precede it, though.
1661 */
1663 {
1664 /* back up to find the record */
1665 replayTLI = RedoStartTLI;
/* PANIC emode: failure to read the record at the redo point is not tolerated */
1667 record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1668
1669 /*
1670 * If a checkpoint record's redo pointer points back to an earlier
1671 * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1672 * record.
1673 */
1674 if (record->xl_rmid != RM_XLOG_ID ||
1676 ereport(FATAL,
1677 errmsg("unexpected record type found at redo point %X/%08X",
1679 }
1680 else
1681 {
1682 /* just have to read next record after CheckPoint */
1684 replayTLI = CheckPointTLI;
/* LOG emode here: a NULL result is handled below as "redo is not required" */
1685 record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1686 }
1687
1688 if (record != NULL)
1689 {
1690 TimestampTz xtime;
1691 PGRUsage ru0;
1692
1693 pg_rusage_init(&ru0);
1694
1695 InRedo = true;
1696
1697 RmgrStartup();
1698
1699 ereport(LOG,
1700 errmsg("redo starts at %X/%08X",
1702
1703 /* Prepare to report progress of the redo phase. */
1704 if (!StandbyMode)
1706
1707 /*
1708 * main redo apply loop
1709 */
1710 do
1711 {
1712 if (!StandbyMode)
1713 ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%08X",
1715
1716#ifdef WAL_DEBUG
1717 if (XLOG_DEBUG)
1718 {
1720
1722 appendStringInfo(&buf, "REDO @ %X/%08X; LSN %X/%08X: ",
1725 xlog_outrec(&buf, xlogreader);
1726 appendStringInfoString(&buf, " - ");
1728 elog(LOG, "%s", buf.data);
1729 pfree(buf.data);
1730 }
1731#endif
1732
1733 /* Handle interrupt signals of startup process */
1735
1736 /*
1737 * Pause WAL replay, if requested by a hot-standby session via
1738 * SetRecoveryPause().
1739 *
1740 * Note that we intentionally don't take the info_lck spinlock
1741 * here. We might therefore read a slightly stale value of the
1742 * recoveryPause flag, but it can't be very stale (no worse than
1743 * the last spinlock we did acquire). Since a pause request is a
1744 * pretty asynchronous thing anyway, possibly responding to it one
1745 * WAL record later than we otherwise would is a minor issue, so
1746 * it doesn't seem worth adding another spinlock cycle to prevent
1747 * that.
1748 */
1749 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1751 recoveryPausesHere(false);
1752
1753 /*
1754 * Have we reached our recovery target?
1755 */
1757 {
1758 reachedRecoveryTarget = true;
1759 break;
1760 }
1761
1762 /*
1763 * If we've been asked to lag the primary, wait on latch until
1764 * enough time has passed.
1765 */
1767 {
1768 /*
1769 * We test for paused recovery again here. If user sets
1770 * delayed apply, it may be because they expect to pause
1771 * recovery in case of problems, so we must test again here
1772 * otherwise pausing during the delay-wait wouldn't work.
1773 */
1774 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1776 recoveryPausesHere(false);
1777 }
1778
1779 /*
1780 * Apply the record
1781 */
/* replayTLI is passed by address and may be updated by ApplyWalRecord() */
1782 ApplyWalRecord(xlogreader, record, &replayTLI);
1783
1784 /*
1785 * If we replayed an LSN that someone was waiting for then walk
1786 * over the shared memory array and set latches to notify the
1787 * waiters.
1788 */
1789 if (waitLSNState &&
1793
1794 /* Exit loop if we reached inclusive recovery target */
1796 {
1797 reachedRecoveryTarget = true;
1798 break;
1799 }
1800
1801 /* Else, try to fetch the next WAL record */
1802 record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1803 } while (record != NULL);
1804
1805 /*
1806 * end of main redo apply loop
1807 */
1808
1809 if (reachedRecoveryTarget)
1810 {
1811 if (!reachedConsistency)
1812 ereport(FATAL,
1813 (errmsg("requested recovery stop point is before consistent recovery point")));
1814
1815 /*
1816 * This is the last point where we can restart recovery with a new
1817 * recovery target, if we shutdown and begin again. After this,
1818 * Resource Managers may choose to do permanent corrective actions
1819 * at end of recovery.
1820 */
1821 switch (recoveryTargetAction)
1822 {
1824
1825 /*
1826 * exit with special return code to request shutdown of
1827 * postmaster. Log messages issued from postmaster.
1828 */
1829 proc_exit(3);
1830
1832 SetRecoveryPause(true);
1833 recoveryPausesHere(true);
1834
1835 /* drop into promote */
1837
1839 break;
1840 }
1841 }
1842
1843 RmgrCleanup();
1844
1845 ereport(LOG,
1846 errmsg("redo done at %X/%08X system usage: %s",
1848 pg_rusage_show(&ru0)));
1849 xtime = GetLatestXTime();
/* only log the last-transaction time when GetLatestXTime() returned nonzero */
1850 if (xtime)
1851 ereport(LOG,
1852 (errmsg("last completed transaction was at log time %s",
1853 timestamptz_to_str(xtime))));
1854
1855 InRedo = false;
1856 }
1857 else
1858 {
1859 /* there are no WAL records following the checkpoint */
1860 ereport(LOG,
1861 (errmsg("redo is not required")));
1862 }
1863
1864 /*
1865 * This check is intentionally after the above log messages that indicate
1866 * how far recovery went.
1867 */
1870 !reachedRecoveryTarget)
1871 ereport(FATAL,
1872 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1873 errmsg("recovery ended before configured recovery target was reached")));
1874}
1875
1876/*
1877 * Subroutine of PerformWalRecovery, to apply one WAL record.
1878 */
1879static void
1881{
1882 ErrorContextCallback errcallback;
1883 bool switchedTLI = false;
1884
1885 /* Setup error traceback support for ereport() */
1886 errcallback.callback = rm_redo_error_callback;
1887 errcallback.arg = xlogreader;
1888 errcallback.previous = error_context_stack;
1889 error_context_stack = &errcallback;
1890
1891 /*
1892 * TransamVariables->nextXid must be beyond record's xid.
1893 */
1895
1896 /*
1897 * Before replaying this record, check if this record causes the current
1898 * timeline to change. The record is already considered to be part of the
1899 * new timeline, so we update replayTLI before replaying it. That's
1900 * important so that replayEndTLI, which is recorded as the minimum
1901 * recovery point's TLI if recovery stops after this record, is set
1902 * correctly.
1903 */
1904 if (record->xl_rmid == RM_XLOG_ID)
1905 {
1906 TimeLineID newReplayTLI = *replayTLI;
1907 TimeLineID prevReplayTLI = *replayTLI;
1908 uint8 info = record->xl_info & ~XLR_INFO_MASK;
1909
1910 if (info == XLOG_CHECKPOINT_SHUTDOWN)
1911 {
1912 CheckPoint checkPoint;
1913
1914 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1915 newReplayTLI = checkPoint.ThisTimeLineID;
1916 prevReplayTLI = checkPoint.PrevTimeLineID;
1917 }
1918 else if (info == XLOG_END_OF_RECOVERY)
1919 {
1920 xl_end_of_recovery xlrec;
1921
1923 newReplayTLI = xlrec.ThisTimeLineID;
1924 prevReplayTLI = xlrec.PrevTimeLineID;
1925 }
1926
1927 if (newReplayTLI != *replayTLI)
1928 {
1929 /* Check that it's OK to switch to this TLI */
1931 newReplayTLI, prevReplayTLI, *replayTLI);
1932
1933 /* Following WAL records should be run with new TLI */
1934 *replayTLI = newReplayTLI;
1935 switchedTLI = true;
1936 }
1937 }
1938
1939 /*
1940 * Update shared replayEndRecPtr before replaying this record, so that
1941 * XLogFlush will update minRecoveryPoint correctly.
1942 */
1945 XLogRecoveryCtl->replayEndTLI = *replayTLI;
1947
1948 /*
1949 * If we are attempting to enter Hot Standby mode, process XIDs we see
1950 */
1954
1955 /*
1956 * Some XLOG record types that are related to recovery are processed
1957 * directly here, rather than in xlog_redo()
1958 */
1959 if (record->xl_rmid == RM_XLOG_ID)
1960 xlogrecovery_redo(xlogreader, *replayTLI);
1961
1962 /* Now apply the WAL record itself */
1964
1965 /*
1966 * After redo, check whether the backup pages associated with the WAL
1967 * record are consistent with the existing pages. This check is done only
1968 * if consistency check is enabled for this record.
1969 */
1970 if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
1972
1973 /* Pop the error context stack */
1974 error_context_stack = errcallback.previous;
1975
1976 /*
1977 * Update lastReplayedEndRecPtr after this record has been successfully
1978 * replayed.
1979 */
1983 XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
1985
1986 /* ------
1987 * Wakeup walsenders:
1988 *
1989 * On the standby, the WAL is flushed first (which will only wake up
1990 * physical walsenders) and then applied, which will only wake up logical
1991 * walsenders.
1992 *
1993 * Indeed, logical walsenders on standby can't decode and send data until
1994 * it's been applied.
1995 *
1996 * Physical walsenders don't need to be woken up during replay unless
1997 * cascading replication is allowed and time line change occurred (so that
1998 * they can notice that they are on a new time line).
1999 *
2000 * That's why the wake up conditions are for:
2001 *
2002 * - physical walsenders in case of new time line and cascade
2003 * replication is allowed
2004 * - logical walsenders in case cascade replication is allowed (could not
2005 * be created otherwise)
2006 * ------
2007 */
2009 WalSndWakeup(switchedTLI, true);
2010
2011 /*
2012 * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2013 * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2014 * a reply to the primary.
2015 */
2017 {
2020 }
2021
2022 /* Allow read-only connections if we're consistent now */
2024
2025 /* Is this a timeline switch? */
2026 if (switchedTLI)
2027 {
2028 /*
2029 * Before we continue on the new timeline, clean up any (possibly
2030 * bogus) future WAL segments on the old timeline.
2031 */
2033
2034 /* Reset the prefetcher. */
2036 }
2037}
2038
2039/*
2040 * Some XLOG RM record types that are directly related to WAL recovery are
2041 * handled here rather than in the xlog_redo()
2042 */
2043static void
2045{
2046 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2047 XLogRecPtr lsn = record->EndRecPtr;
2048
2049 Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2050
2051 if (info == XLOG_OVERWRITE_CONTRECORD)
2052 {
2053 /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2055
2056 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
2057 if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2058 elog(FATAL, "mismatching overwritten LSN %X/%08X -> %X/%08X",
2061
2062 /* We have safely skipped the aborted record */
2065
2066 ereport(LOG,
2067 errmsg("successfully skipped missing contrecord at %X/%08X, overwritten at %s",
2070
2071 /* Verifying the record should only happen once */
2073 }
2074 else if (info == XLOG_BACKUP_END)
2075 {
2076 XLogRecPtr startpoint;
2077
2078 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2079
2080 if (backupStartPoint == startpoint)
2081 {
2082 /*
2083 * We have reached the end of base backup, the point where
2084 * pg_backup_stop() was done. The data on disk is now consistent
2085 * (assuming we have also reached minRecoveryPoint). Set
2086 * backupEndPoint to the current LSN, so that the next call to
2087 * CheckRecoveryConsistency() will notice it and do the
2088 * end-of-backup processing.
2089 */
2090 elog(DEBUG1, "end of backup record reached");
2091
2092 backupEndPoint = lsn;
2093 }
2094 else
2095 elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%08X, waiting for %X/%08X",
2097 }
2098}
2099
2100/*
2101 * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2102 * directories.
2103 *
2104 * Replay of database creation XLOG records for databases that were later
2105 * dropped can create fake directories in pg_tblspc. By the time consistency
2106 * is reached these directories should have been removed; here we verify
2107 * that this did indeed happen. This is to be called at the point where
2108 * consistent state is reached.
2109 *
2110 * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2111 * useful for testing purposes, and also allows for an escape hatch in case
2112 * things go south.
2113 */
2114static void
2116{
2117 DIR *dir;
2118 struct dirent *de;
2119
2121 while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
2122 {
2123 char path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
2124
2125 /* Skip entries of non-oid names */
2126 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2127 continue;
2128
2129 snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
2130
2131 if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2134 errmsg("unexpected directory entry \"%s\" found in %s",
2135 de->d_name, PG_TBLSPC_DIR),
2136 errdetail("All directory entries in %s/ should be symbolic links.",
2138 errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2139 }
2140}
2141
2142/*
2143 * Checks if recovery has reached a consistent state. When consistency is
2144 * reached and we have a valid starting standby snapshot, tell postmaster
2145 * that it can start accepting read-only connections.
2146 */
2147static void
2149{
2150 XLogRecPtr lastReplayedEndRecPtr;
2151 TimeLineID lastReplayedTLI;
2152
2153 /*
2154 * During crash recovery, we don't reach a consistent state until we've
2155 * replayed all the WAL.
2156 */
2158 return;
2159
2161
2162 /*
2163 * assume that we are called in the startup process, and hence don't need
2164 * a lock to read lastReplayedEndRecPtr
2165 */
2166 lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2167 lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2168
2169 /*
2170 * Have we reached the point where our base backup was completed?
2171 */
2173 backupEndPoint <= lastReplayedEndRecPtr)
2174 {
2175 XLogRecPtr saveBackupStartPoint = backupStartPoint;
2176 XLogRecPtr saveBackupEndPoint = backupEndPoint;
2177
2178 elog(DEBUG1, "end of backup reached");
2179
2180 /*
2181 * We have reached the end of base backup, as indicated by pg_control.
2182 * Update the control file accordingly.
2183 */
2184 ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2187 backupEndRequired = false;
2188
2189 ereport(LOG,
2190 errmsg("completed backup recovery with redo LSN %X/%08X and end LSN %X/%08X",
2191 LSN_FORMAT_ARGS(saveBackupStartPoint),
2192 LSN_FORMAT_ARGS(saveBackupEndPoint)));
2193 }
2194
2195 /*
2196 * Have we passed our safe starting point? Note that minRecoveryPoint is
2197 * known to be incorrectly set if recovering from a backup, until the
2198 * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2199 * All we know prior to that is that we're not consistent yet.
2200 */
2202 minRecoveryPoint <= lastReplayedEndRecPtr)
2203 {
2204 /*
2205 * Check to see if the XLOG sequence contained any unresolved
2206 * references to uninitialized pages.
2207 */
2209
2210 /*
2211 * Check that pg_tblspc doesn't contain any real directories. Replay
2212 * of Database/CREATE_* records may have created fictitious tablespace
2213 * directories that should have been removed by the time consistency
2214 * was reached.
2215 */
2217
2218 reachedConsistency = true;
2220 ereport(LOG,
2221 errmsg("consistent recovery state reached at %X/%08X",
2222 LSN_FORMAT_ARGS(lastReplayedEndRecPtr)));
2223 }
2224
2225 /*
2226 * Have we got a valid starting snapshot that will allow queries to be
2227 * run? If so, we can tell postmaster that the database is consistent now,
2228 * enabling connections.
2229 */
2234 {
2238
2239 LocalHotStandbyActive = true;
2240
2242 }
2243}
2244
2245/*
2246 * Error context callback for errors occurring during rm_redo().
 *
 * 'arg' is cast to the XLogReaderState of the record being replayed.  The
 * callback describes that record with xlog_outdesc() and xlog_block_info(),
 * reports the description together with the record's ReadRecPtr through
 * errcontext(), and then frees the description buffer.
 *
 * NOTE(review): the declaration and initialization of 'buf' are not visible
 * here; from its use it is presumably a StringInfoData -- confirm.
2247 */
2248static void
2250{
2251 XLogReaderState *record = (XLogReaderState *) arg;
2253
 /* Build "identity: description" followed by the block references. */
2255 xlog_outdesc(&buf, record);
2256 xlog_block_info(&buf, record);
2257
2258 /* translator: %s is a WAL record description */
2259 errcontext("WAL redo at %X/%08X for %s",
2260 LSN_FORMAT_ARGS(record->ReadRecPtr),
2261 buf.data);
2262
 /* The description is no longer needed once it has been reported. */
2263 pfree(buf.data);
2264}
2265
2266/*
2267 * Appends to 'buf' a string describing an XLogRecord, consisting of its
2268 * identity followed by a colon, a space, and a further description.
 *
 * The identity comes from the resource manager's rm_identify() callback.  If
 * that callback returns NULL, "UNKNOWN (<hex>)" is emitted instead, where the
 * hex value is the record's info byte with the XLR_INFO_MASK bits cleared.
 * The further description is whatever the resource manager's rm_desc()
 * callback appends.  Nothing is returned; all output accumulates in 'buf'.
2269 */
2270void
2272{
2273 RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2274 uint8 info = XLogRecGetInfo(record);
2275 const char *id;
2276
2279
 /* Ask the resource manager for a symbolic name of this record type. */
2280 id = rmgr.rm_identify(info);
2281 if (id == NULL)
2282 appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2283 else
2284 appendStringInfo(buf, "%s: ", id);
2285
 /* Let the resource manager append its own description of the record. */
2286 rmgr.rm_desc(buf, record);
2287}
2288
2289#ifdef WAL_DEBUG
2290
/*
 * Appends to 'buf' a short summary of a WAL record: a "prev" location, the
 * record's xid, its data length, and finally the block references added by
 * xlog_block_info().  Only compiled when WAL_DEBUG is defined.
 */
2291static void
2292xlog_outrec(StringInfo buf, XLogReaderState *record)
2293{
2294 appendStringInfo(buf, "prev %X/%08X; xid %u",
2296 XLogRecGetXid(record));
2297
2298 appendStringInfo(buf, "; len %u",
2299 XLogRecGetDataLen(record));
2300
2301 xlog_block_info(buf, record);
2302}
2303#endif /* WAL_DEBUG */
2304
2305/*
2306 * Appends to 'buf' information about all the blocks referenced by an
2307 * XLogRecord.
 *
 * Every block id from 0 up to XLogRecMaxBlockId(record) is examined; ids
 * for which no block tag can be fetched are skipped.  For each remaining
 * block the relation (tablespace/database/relation number) and block number
 * are appended, and the fork number is included only when it is not
 * MAIN_FORKNUM.  " FPW" is appended when the record has a block image for
 * that block.  Nothing is returned; all output accumulates in 'buf'.
2308 */
2309static void
2311{
2312 int block_id;
2313
2314 /* decode block references */
2315 for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2316 {
2317 RelFileLocator rlocator;
2318 ForkNumber forknum;
2319 BlockNumber blk;
2320
 /* Skip block ids that are not in use in this record. */
2321 if (!XLogRecGetBlockTagExtended(record, block_id,
2322 &rlocator, &forknum, &blk, NULL))
2323 continue;
2324
 /* The fork number is only spelled out for non-main forks. */
2325 if (forknum != MAIN_FORKNUM)
2326 appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2327 block_id,
2328 rlocator.spcOid, rlocator.dbOid,
2329 rlocator.relNumber,
2330 forknum,
2331 blk);
2332 else
2333 appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2334 block_id,
2335 rlocator.spcOid, rlocator.dbOid,
2336 rlocator.relNumber,
2337 blk);
2338 if (XLogRecHasBlockImage(record, block_id))
2339 appendStringInfoString(buf, " FPW");
2340 }
2341}
2342
2343
2344/*
2345 * Check that it's OK to switch to new timeline during recovery.
2346 *
2347 * 'lsn' is the address of the shutdown checkpoint record we're about to
2348 * replay. (Currently, timeline can only change at a shutdown checkpoint).
 *
 * 'prevTLI' and 'newTLI' are the previous and new timeline IDs being checked
 * (the error messages describe them as coming from the checkpoint record),
 * and 'replayTLI' is the timeline currently being replayed.  Three things
 * are verified: prevTLI must equal replayTLI; newTLI must not be lower than
 * replayTLI and must appear in expectedTLEs; and the switch must not jump
 * past the minimum recovery point's timeline before that point has been
 * reached.  Any violation is reported with PANIC; otherwise the function
 * simply returns.
2349 */
2350static void
2352 TimeLineID replayTLI)
2353{
2354 /* Check that the record agrees on what the current (old) timeline is */
2355 if (prevTLI != replayTLI)
2356 ereport(PANIC,
2357 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2358 prevTLI, replayTLI)));
2359
2360 /*
2361 * The new timeline better be in the list of timelines we expect to see,
2362 * according to the timeline history. It should also not decrease.
2363 */
2364 if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2365 ereport(PANIC,
2366 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2367 newTLI, replayTLI)));
2368
2369 /*
2370 * If we have not yet reached min recovery point, and we're about to
2371 * switch to a timeline greater than the timeline of the min recovery
2372 * point: trouble. After switching to the new timeline, we could not
2373 * possibly visit the min recovery point on the correct timeline anymore.
2374 * This can happen if there is a newer timeline in the archive that
2375 * branched before the timeline the min recovery point is on, and you
2376 * attempt to do PITR to the new timeline.
2377 */
2379 lsn < minRecoveryPoint &&
2380 newTLI > minRecoveryPointTLI)
2381 ereport(PANIC,
2382 errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%08X on timeline %u",
2383 newTLI,
2386
2387 /* Looks good */
2388}
2389
2390
2391/*
2392 * Extract timestamp from WAL record.
2393 *
2394 * If the record contains a timestamp, returns true, and saves the timestamp
2395 * in *recordXtime. If the record type has no timestamp, returns false.
2396 * Currently, only transaction commit/abort records and restore points contain
2397 * timestamps.
 *
 * Restore points are read as xl_restore_point (rp_time), commit and
 * commit-prepared records as xl_xact_commit (xact_time), and abort and
 * abort-prepared records as xl_xact_abort (xact_time).  *recordXtime is
 * left untouched when false is returned.
2398 */
2399static bool
2401{
 /* Record type with the XLR_INFO_MASK bits cleared. */
2402 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
 /* Transaction opcode: only the XLOG_XACT_OPMASK bits of 'info'. */
2403 uint8 xact_info = info & XLOG_XACT_OPMASK;
2404 uint8 rmid = XLogRecGetRmid(record);
2405
2406 if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2407 {
2408 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2409 return true;
2410 }
2411 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2412 xact_info == XLOG_XACT_COMMIT_PREPARED))
2413 {
2414 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2415 return true;
2416 }
2417 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2418 xact_info == XLOG_XACT_ABORT_PREPARED))
2419 {
2420 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2421 return true;
2422 }
 /* Any other record type carries no timestamp. */
2423 return false;
2424}
2425
2426/*
2427 * Checks whether the current buffer page and backup page stored in the
2428 * WAL record are consistent or not. Before comparing the two pages, a
2429 * masking can be applied to the pages to ignore certain areas like hint bits,
2430 * unused space between pd_lower and pd_upper among other things. This
2431 * function should be called once WAL replay has been completed for a
2432 * given record.
 *
 * Each block id up to XLogRecMaxBlockId(record) is considered.  A block is
 * skipped when the record has no tag for it, when its image was already
 * applied during replay, or when no valid buffer can be read for it.  If
 * the record's resource manager provides an rm_mask callback, it is applied
 * to both copies before they are compared with memcmp().  Any difference is
 * reported with elog(FATAL); a failure to restore the block image from the
 * record is reported with ereport(ERROR).
 *
 * NOTE(review): replay_image_masked and primary_image_masked are not
 * declared in the visible part of this function; presumably they are
 * BLCKSZ-sized buffers defined elsewhere in the file -- confirm.
2433 */
2434static void
2436{
2437 RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2438 RelFileLocator rlocator;
2439 ForkNumber forknum;
2440 BlockNumber blkno;
2441 int block_id;
2442
2443 /* Records with no backup blocks have no need for consistency checks. */
2444 if (!XLogRecHasAnyBlockRefs(record))
2445 return;
2446
2448
2449 for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2450 {
2451 Buffer buf;
2452 Page page;
2453
2454 if (!XLogRecGetBlockTagExtended(record, block_id,
2455 &rlocator, &forknum, &blkno, NULL))
2456 {
2457 /*
2458 * WAL record doesn't contain a block reference with the given id.
2459 * Do nothing.
2460 */
2461 continue;
2462 }
2463
2464 Assert(XLogRecHasBlockImage(record, block_id));
2465
2466 if (XLogRecBlockImageApply(record, block_id))
2467 {
2468 /*
2469 * WAL record has already applied the page, so bypass the
2470 * consistency check as that would result in comparing the full
2471 * page stored in the record with itself.
2472 */
2473 continue;
2474 }
2475
2476 /*
2477 * Read the contents from the current buffer and store it in a
2478 * temporary page.
2479 */
2480 buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2483 if (!BufferIsValid(buf))
2484 continue;
2485
2487 page = BufferGetPage(buf);
2488
2489 /*
2490 * Take a copy of the local page where WAL has been applied to have a
2491 * comparison base before masking it...
2492 */
2493 memcpy(replay_image_masked, page, BLCKSZ);
2494
2495 /* No need for this page anymore now that a copy is in. */
2497
2498 /*
2499 * If the block LSN is already ahead of this WAL record, we can't
2500 * expect contents to match. This can happen if recovery is
2501 * restarted.
2502 */
2504 continue;
2505
2506 /*
2507 * Read the contents from the backup copy, stored in WAL record and
2508 * store it in a temporary page. There is no need to allocate a new
2509 * page here, a local buffer is fine to hold its contents and a mask
2510 * can be directly applied on it.
2511 */
2512 if (!RestoreBlockImage(record, block_id, primary_image_masked))
2513 ereport(ERROR,
2514 (errcode(ERRCODE_INTERNAL_ERROR),
2515 errmsg_internal("%s", record->errormsg_buf)));
2516
2517 /*
2518 * If masking function is defined, mask both the primary and replay
2519 * images
2520 */
2521 if (rmgr.rm_mask != NULL)
2522 {
2523 rmgr.rm_mask(replay_image_masked, blkno);
2524 rmgr.rm_mask(primary_image_masked, blkno);
2525 }
2526
2527 /* Time to compare the primary and replay images. */
2528 if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2529 {
2530 elog(FATAL,
2531 "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2532 rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2533 forknum, blkno);
2534 }
2535 }
2536}
2537
2538/*
2539 * For point-in-time recovery, this function decides whether we want to
2540 * stop applying the XLOG before the current record.
2541 *
2542 * Returns true if we are stopping, false otherwise. If stopping, some
2543 * information is saved in recoveryStopXid et al for use in annotating the
2544 * new timeline's history file.
 *
 * On every stopping path recoveryStopAfter is set to false, recording that
 * the stop happens before the record rather than after it.  A LOG message
 * naming the reason for stopping is emitted on each of those paths.
2545 */
2546static bool
2548{
2549 bool stopsHere = false;
2550 uint8 xact_info;
2551 bool isCommit;
2552 TimestampTz recordXtime = 0;
2553 TransactionId recordXid;
2554
2555 /*
2556 * Ignore recovery target settings when not in archive recovery (meaning
2557 * we are in crash recovery).
2558 */
2560 return false;
2561
2562 /* Check if we should stop as soon as reaching consistency */
2564 {
2565 ereport(LOG,
2566 (errmsg("recovery stopping after reaching consistency")));
2567
2568 recoveryStopAfter = false;
2571 recoveryStopTime = 0;
2572 recoveryStopName[0] = '\0';
2573 return true;
2574 }
2575
2576 /* Check if target LSN has been reached */
2579 record->ReadRecPtr >= recoveryTargetLSN)
2580 {
2581 recoveryStopAfter = false;
2583 recoveryStopLSN = record->ReadRecPtr;
2584 recoveryStopTime = 0;
2585 recoveryStopName[0] = '\0';
2586 ereport(LOG,
2587 errmsg("recovery stopping before WAL location (LSN) \"%X/%08X\"",
2589 return true;
2590 }
2591
2592 /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2593 if (XLogRecGetRmid(record) != RM_XACT_ID)
2594 return false;
2595
2596 xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2597
 /*
  * Work out whether this is a commit or an abort and which XID it ends.
  * For plain commit/abort records the XID is taken from the record itself;
  * for the *_PREPARED variants it is the twophase_xid of the parsed record.
  * Any other transaction record type is not a stopping candidate.
  */
2598 if (xact_info == XLOG_XACT_COMMIT)
2599 {
2600 isCommit = true;
2601 recordXid = XLogRecGetXid(record);
2602 }
2603 else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2604 {
2605 xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2606 xl_xact_parsed_commit parsed;
2607
2608 isCommit = true;
2610 xlrec,
2611 &parsed);
2612 recordXid = parsed.twophase_xid;
2613 }
2614 else if (xact_info == XLOG_XACT_ABORT)
2615 {
2616 isCommit = false;
2617 recordXid = XLogRecGetXid(record);
2618 }
2619 else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2620 {
2621 xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2622 xl_xact_parsed_abort parsed;
2623
2624 isCommit = false;
2626 xlrec,
2627 &parsed);
2628 recordXid = parsed.twophase_xid;
2629 }
2630 else
2631 return false;
2632
2634 {
2635 /*
2636 * There can be only one transaction end record with this exact
2637 * transaction ID.
2638 *
2639 * When testing for an xid, we MUST test for equality only, since
2640 * transactions are numbered in the order they start, not the order
2641 * they complete. A higher-numbered xid will complete before a
2642 * lower-numbered one about 50% of the time...
2643 */
2644 stopsHere = (recordXid == recoveryTargetXid);
2645 }
2646
2647 /*
2648 * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2649 * We don't expect getRecordTimestamp ever to fail, since we already know
2650 * this is a commit or abort record; but test its result anyway.
2651 */
2652 if (getRecordTimestamp(record, &recordXtime) &&
2654 {
2655 /*
2656 * There can be many transactions that share the same commit time, so
2657 * we stop after the last one, if we are inclusive, or stop at the
2658 * first one if we are exclusive
2659 */
2661 stopsHere = (recordXtime > recoveryTargetTime);
2662 else
2663 stopsHere = (recordXtime >= recoveryTargetTime);
2664 }
2665
2666 if (stopsHere)
2667 {
2668 recoveryStopAfter = false;
2669 recoveryStopXid = recordXid;
2670 recoveryStopTime = recordXtime;
2672 recoveryStopName[0] = '\0';
2673
2674 if (isCommit)
2675 {
2676 ereport(LOG,
2677 (errmsg("recovery stopping before commit of transaction %u, time %s",
2680 }
2681 else
2682 {
2683 ereport(LOG,
2684 (errmsg("recovery stopping before abort of transaction %u, time %s",
2687 }
2688 }
2689
2690 return stopsHere;
2691}
2692
2693/*
2694 * Same as recoveryStopsBefore, but called after applying the record.
2695 *
2696 * We also track the timestamp of the latest applied COMMIT/ABORT
2697 * record in XLogRecoveryCtl->recoveryLastXTime.
 *
 * Returns true if recovery should stop after this record, false otherwise.
 * On every stopping path recoveryStopAfter is set to true and a LOG message
 * naming the reason for stopping is emitted.
2698 */
2699static bool
2701{
2702 uint8 info;
2703 uint8 xact_info;
2704 uint8 rmid;
2705 TimestampTz recordXtime = 0;
2706
2707 /*
2708 * Ignore recovery target settings when not in archive recovery (meaning
2709 * we are in crash recovery).
2710 */
2712 return false;
2713
2714 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2715 rmid = XLogRecGetRmid(record);
2716
2717 /*
2718 * There can be many restore points that share the same name; we stop at
2719 * the first one.
2720 */
2722 rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2723 {
2724 xl_restore_point *recordRestorePointData;
2725
2726 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2727
2728 if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2729 {
2730 recoveryStopAfter = true;
 /* Result deliberately ignored; the time is stored directly. */
2733 (void) getRecordTimestamp(record, &recoveryStopTime);
2734 strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2735
2736 ereport(LOG,
2737 (errmsg("recovery stopping at restore point \"%s\", time %s",
2740 return true;
2741 }
2742 }
2743
2744 /* Check if the target LSN has been reached */
2747 record->ReadRecPtr >= recoveryTargetLSN)
2748 {
2749 recoveryStopAfter = true;
2751 recoveryStopLSN = record->ReadRecPtr;
2752 recoveryStopTime = 0;
2753 recoveryStopName[0] = '\0';
2754 ereport(LOG,
2755 errmsg("recovery stopping after WAL location (LSN) \"%X/%08X\"",
2757 return true;
2758 }
2759
 /* Beyond this point only transaction records are of interest. */
2760 if (rmid != RM_XACT_ID)
2761 return false;
2762
2763 xact_info = info & XLOG_XACT_OPMASK;
2764
2765 if (xact_info == XLOG_XACT_COMMIT ||
2766 xact_info == XLOG_XACT_COMMIT_PREPARED ||
2767 xact_info == XLOG_XACT_ABORT ||
2768 xact_info == XLOG_XACT_ABORT_PREPARED)
2769 {
2770 TransactionId recordXid;
2771
2772 /* Update the last applied transaction timestamp */
2773 if (getRecordTimestamp(record, &recordXtime))
2774 SetLatestXTime(recordXtime);
2775
2776 /* Extract the XID of the committed/aborted transaction */
2777 if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2778 {
2779 xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2780 xl_xact_parsed_commit parsed;
2781
2783 xlrec,
2784 &parsed);
2785 recordXid = parsed.twophase_xid;
2786 }
2787 else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2788 {
2789 xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2790 xl_xact_parsed_abort parsed;
2791
2793 xlrec,
2794 &parsed);
2795 recordXid = parsed.twophase_xid;
2796 }
2797 else
2798 recordXid = XLogRecGetXid(record);
2799
2800 /*
2801 * There can be only one transaction end record with this exact
2802 * transaction ID.
2803 *
2804 * When testing for an xid, we MUST test for equality only, since
2805 * transactions are numbered in the order they start, not the order
2806 * they complete. A higher-numbered xid will complete before a
2807 * lower-numbered one about 50% of the time...
2808 */
2810 recordXid == recoveryTargetXid)
2811 {
2812 recoveryStopAfter = true;
2813 recoveryStopXid = recordXid;
2814 recoveryStopTime = recordXtime;
2816 recoveryStopName[0] = '\0';
2817
2818 if (xact_info == XLOG_XACT_COMMIT ||
2819 xact_info == XLOG_XACT_COMMIT_PREPARED)
2820 {
2821 ereport(LOG,
2822 (errmsg("recovery stopping after commit of transaction %u, time %s",
2825 }
2826 else if (xact_info == XLOG_XACT_ABORT ||
2827 xact_info == XLOG_XACT_ABORT_PREPARED)
2828 {
2829 ereport(LOG,
2830 (errmsg("recovery stopping after abort of transaction %u, time %s",
2833 }
2834 return true;
2835 }
2836 }
2837
2838 /* Check if we should stop as soon as reaching consistency */
2840 {
2841 ereport(LOG,
2842 (errmsg("recovery stopping after reaching consistency")));
2843
2844 recoveryStopAfter = true;
2846 recoveryStopTime = 0;
2848 recoveryStopName[0] = '\0';
2849 return true;
2850 }
2851
2852 return false;
2853}
2854
2855/*
2856 * Create a comment for the history file to explain why and where
2857 * timeline changed.
 *
 * The text is formatted into a local 200-byte buffer and a pstrdup()'d copy
 * of it is returned.  In the first three messages recoveryStopAfter selects
 * between the words "after" and "before".
 *
 * NOTE(review): the conditions that choose between the messages are not
 * visible here.  Also, two of the format strings end in a newline while the
 * others do not -- confirm whether that difference is intentional.
2858 */
2859static char *
2861{
2862 char reason[200];
2863
2865 snprintf(reason, sizeof(reason),
2866 "%s transaction %u",
2867 recoveryStopAfter ? "after" : "before",
2870 snprintf(reason, sizeof(reason),
2871 "%s %s\n",
2872 recoveryStopAfter ? "after" : "before",
2875 snprintf(reason, sizeof(reason),
2876 "%s LSN %X/%08X\n",
2877 recoveryStopAfter ? "after" : "before",
2880 snprintf(reason, sizeof(reason),
2881 "at restore point \"%s\"",
2884 snprintf(reason, sizeof(reason), "reached consistency");
2885 else
2886 snprintf(reason, sizeof(reason), "no recovery target specified");
2887
 /* Hand back an allocated copy; 'reason' itself is a local array. */
2888 return pstrdup(reason);
2889}
2890
2891/*
2892 * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2893 *
2894 * endOfRecovery is true if the recovery target is reached and
2895 * the paused state starts at the end of recovery because of
2896 * recovery_target_action=pause, and false otherwise.
 *
 * Before waiting, a LOG message is emitted whose hint tells the user to
 * execute pg_wal_replay_resume(); the wording depends on endOfRecovery
 * ("to promote" at the end of recovery, "to continue" otherwise).  The
 * function returns without pausing when either of the two guard checks at
 * the top applies.
 *
 * NOTE(review): the guard conditions, the loop condition and the calls made
 * inside the loop are not visible here; the description above relies on the
 * accompanying comments -- confirm against the complete function.
2897 */
2898static void
2899recoveryPausesHere(bool endOfRecovery)
2900{
2901 /* Don't pause unless users can connect! */
2903 return;
2904
2905 /* Don't pause after standby promotion has been triggered */
2907 return;
2908
2909 if (endOfRecovery)
2910 ereport(LOG,
2911 (errmsg("pausing at the end of recovery"),
2912 errhint("Execute pg_wal_replay_resume() to promote.")));
2913 else
2914 ereport(LOG,
2915 (errmsg("recovery has paused"),
2916 errhint("Execute pg_wal_replay_resume() to continue.")));
2917
2918 /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2920 {
2923 return;
2924
2925 /*
2926 * If recovery pause is requested then set it paused. While we are in
2927 * the loop, user might resume and pause again so set this every time.
2928 */
2930
2931 /*
2932 * We wait on a condition variable that will wake us as soon as the
2933 * pause ends, but we use a timeout so we can check the above exit
2934 * condition periodically too.
2935 */
2937 WAIT_EVENT_RECOVERY_PAUSE);
2938 }
2940}
2941
2942/*
2943 * When recovery_min_apply_delay is set, we wait long enough to make sure
2944 * certain record types are applied at least that interval behind the primary.
2945 *
2946 * Returns true if we waited.
2947 *
2948 * Note that the delay is calculated between the WAL record log time and
2949 * the current time on standby. We would prefer to keep track of when this
2950 * standby received each WAL record, which would allow a more consistent
2951 * approach and one not affected by time synchronisation issues, but that
2952 * is significantly more effort and complexity for little actual gain in
2953 * usability.
 *
 * Only commit records (XLOG_XACT_COMMIT and XLOG_XACT_COMMIT_PREPARED) of
 * the RM_XACT_ID resource manager are delayed, and only once
 * reachedConsistency is set.  Every early exit returns false, including
 * the case where the computed wait is already zero or negative.  Once the
 * wait loop has been entered the function returns true.
2954 */
2955static bool
2957{
2958 uint8 xact_info;
2959 TimestampTz xtime;
2960 TimestampTz delayUntil;
2961 long msecs;
2962
2963 /* nothing to do if no delay configured */
2964 if (recovery_min_apply_delay <= 0)
2965 return false;
2966
2967 /* no delay is applied on a database not yet consistent */
2968 if (!reachedConsistency)
2969 return false;
2970
2971 /* nothing to do if crash recovery is requested */
2973 return false;
2974
2975 /*
2976 * Is it a COMMIT record?
2977 *
2978 * We deliberately choose not to delay aborts since they have no effect on
2979 * MVCC. We already allow replay of records that don't have a timestamp,
2980 * so there is already opportunity for issues caused by early conflicts on
2981 * standbys.
2982 */
2983 if (XLogRecGetRmid(record) != RM_XACT_ID)
2984 return false;
2985
2986 xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2987
2988 if (xact_info != XLOG_XACT_COMMIT &&
2989 xact_info != XLOG_XACT_COMMIT_PREPARED)
2990 return false;
2991
 /* Without a timestamp there is nothing to measure the delay against. */
2992 if (!getRecordTimestamp(record, &xtime))
2993 return false;
2994
2996
2997 /*
2998 * Exit without arming the latch if it's already past time to apply this
2999 * record
3000 */
3002 if (msecs <= 0)
3003 return false;
3004
3005 while (true)
3006 {
3008
3009 /* This might change recovery_min_apply_delay. */
3011
3013 break;
3014
3015 /*
3016 * Recalculate delayUntil as recovery_min_apply_delay could have
3017 * changed while waiting in this loop.
3018 */
3020
3021 /*
3022 * Wait for difference between GetCurrentTimestamp() and delayUntil.
3023 */
3025 delayUntil);
3026
 /* The target time has been reached (or passed): stop waiting. */
3027 if (msecs <= 0)
3028 break;
3029
3030 elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3031
3034 msecs,
3035 WAIT_EVENT_RECOVERY_APPLY_DELAY);
3036 }
3037 return true;
3038}
3039
3040/*
3041 * Get the current state of the recovery pause request.
3042 */
3054
3055/*
3056 * Set the recovery pause state.
3057 *
3058 * If recovery pause is requested then sets the recovery pause state to
3059 * 'pause requested' if it is not already 'paused'. Otherwise, sets it
3060 * to 'not paused' to resume the recovery. The recovery pause will be
3061 * confirmed by the ConfirmRecoveryPaused.
3062 */
3063void
3078
3079/*
3080 * Confirm the recovery pause by setting the recovery pause state to
3081 * RECOVERY_PAUSED.
3082 */
3083static void
3092
3093
3094/*
3095 * Attempt to read the next XLOG record.
3096 *
3097 * Before first call, the reader needs to be positioned to the first record
3098 * by calling XLogPrefetcherBeginRead().
3099 *
3100 * If no valid record is available, returns NULL, or fails if emode is PANIC.
3101 * (emode must be either PANIC, LOG). In standby mode, retries until a valid
3102 * record is available.
 *
 * 'emode', 'fetching_ckpt' and 'replayTLI' are stored in the reader's
 * private data so that XLogPageRead() can see them.  lastSourceFailed is
 * reset to false on entry and set to true whenever no valid record could be
 * obtained from the current source.  If a record is read but fails the
 * timeline check, it is discarded and treated like a missing record.
3103 */
3104static XLogRecord *
3106 bool fetching_ckpt, TimeLineID replayTLI)
3107{
3108 XLogRecord *record;
3111
3113
3114 /* Pass through parameters to XLogPageRead */
3115 private->fetching_ckpt = fetching_ckpt;
3116 private->emode = emode;
3117 private->randAccess = !XLogRecPtrIsValid(xlogreader->ReadRecPtr);
3118 private->replayTLI = replayTLI;
3119
3120 /* This is the first attempt to read this page. */
3121 lastSourceFailed = false;
3122
3123 for (;;)
3124 {
3125 char *errormsg;
3126
3127 record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3128 if (record == NULL)
3129 {
3130 /*
3131 * When we find that WAL ends in an incomplete record, keep track
3132 * of that record. After recovery is done, we'll write a record
3133 * to indicate to downstream WAL readers that that portion is to
3134 * be ignored.
3135 *
3136 * However, when ArchiveRecoveryRequested = true, we're going to
3137 * switch to a new timeline at the end of recovery. We will only
3138 * copy WAL over to the new timeline up to the end of the last
3139 * complete record, so if we did this, we would later create an
3140 * overwrite contrecord in the wrong place, breaking everything.
3141 */
3144 {
3147 }
3148
 /* Close the currently open segment file, if any. */
3149 if (readFile >= 0)
3150 {
3151 close(readFile);
3152 readFile = -1;
3153 }
3154
3155 /*
3156 * We only end up here without a message when XLogPageRead()
3157 * failed - in that case we already logged something. In
3158 * StandbyMode that only happens if we have been triggered, so we
3159 * shouldn't loop anymore in that case.
3160 */
3161 if (errormsg)
3163 (errmsg_internal("%s", errormsg) /* already translated */ ));
3164 }
3165
3166 /*
3167 * Check page TLI is one of the expected values.
3168 */
3170 {
3171 char fname[MAXFNAMELEN];
3172 XLogSegNo segno;
3173 int32 offset;
3174
3178 XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3181 errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%08X, offset %u",
3183 fname,
3185 offset));
 /* Treat the record as unusable so the failure path below runs. */
3186 record = NULL;
3187 }
3188
3189 if (record)
3190 {
3191 /* Great, got a record */
3192 return record;
3193 }
3194 else
3195 {
3196 /* No valid record available from this source */
3197 lastSourceFailed = true;
3198
3199 /*
3200 * If archive recovery was requested, but we were still doing
3201 * crash recovery, switch to archive recovery and retry using the
3202 * offline archive. We have now replayed all the valid WAL in
3203 * pg_wal, so we are presumably now consistent.
3204 *
3205 * We require that there's at least some valid WAL present in
3206 * pg_wal, however (!fetching_ckpt). We could recover using the
3207 * WAL from the archive, even if pg_wal is completely empty, but
3208 * we'd have no idea how far we'd have to replay to reach
3209 * consistency. So err on the safe side and give up.
3210 */
3212 !fetching_ckpt)
3213 {
3215 (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3216 InArchiveRecovery = true;
3219
3222 minRecoveryPointTLI = replayTLI;
3223
3225
3226 /*
3227 * Before we retry, reset lastSourceFailed and currentSource
3228 * so that we will check the archive next.
3229 */
3230 lastSourceFailed = false;
3232
3233 continue;
3234 }
3235
3236 /* In standby mode, loop back to retry. Otherwise, give up. */
3238 continue;
3239 else
3240 return NULL;
3241 }
3242 }
3243}
3244
3245/*
3246 * Read the XLOG page containing targetPagePtr into readBuf (if not read
3247 * already). Returns number of bytes read, if the page is read successfully,
3248 * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3249 * but only if they have not been previously reported.
3250 *
3251 * See XLogReaderRoutine.page_read for more details.
3252 *
3253 * While prefetching, xlogreader->nonblocking may be set. In that case,
3254 * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3255 *
3256 * This is responsible for restoring files from archive as needed, as well
3257 * as for waiting for the requested WAL record to arrive in standby mode.
3258 *
3259 * xlogreader->private_data->emode specifies the log level used for reporting
3260 * "file not found" or "end of WAL" situations in archive recovery, or in
3261 * standby mode when promotion is triggered. If set to WARNING or below,
3262 * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
3263 * levels the ereport() won't return.
3264 *
3265 * In standby mode, if after a successful return of XLogPageRead() the
3266 * caller finds the record it's interested in to be broken, it should
3267 * ereport the error with the level determined by
3268 * emode_for_corrupt_record(), and then set lastSourceFailed
3269 * and call XLogPageRead() again with the same arguments. This lets
3270 * XLogPageRead() try fetching the record from another source, or
3271 * sleep and retry.
3272 */
3273static int
3275 XLogRecPtr targetRecPtr, char *readBuf)
3276{
3277 XLogPageReadPrivate *private =
3279 int emode = private->emode;
3280 uint32 targetPageOff;
3282 int r;
3283 instr_time io_start;
3284
3286
 /* Locate the segment and the offset within it that hold the page. */
3287 XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3288 targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3289
3290 /*
3291 * See if we need to switch to a new segment because the requested record
3292 * is not in the currently open one.
3293 */
3294 if (readFile >= 0 &&
3295 !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3296 {
3297 /*
3298 * Request a restartpoint if we've replayed too much xlog since the
3299 * last one.
3300 */
3302 {
3304 {
3305 (void) GetRedoRecPtr();
3308 }
3309 }
3310
3311 close(readFile);
3312 readFile = -1;
3314 }
3315
3316 XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3317
3318retry:
3319 /* See if we need to retrieve more data */
3320 if (readFile < 0 ||
3322 flushedUpto < targetPagePtr + reqLen))
3323 {
3324 if (readFile >= 0 &&
3327 flushedUpto < targetPagePtr + reqLen)
3328 return XLREAD_WOULDBLOCK;
3329
3330 switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3331 private->randAccess,
3332 private->fetching_ckpt,
3333 targetRecPtr,
3334 private->replayTLI,
3337 {
3338 case XLREAD_WOULDBLOCK:
3339 return XLREAD_WOULDBLOCK;
3340 case XLREAD_FAIL:
3341 if (readFile >= 0)
3342 close(readFile);
3343 readFile = -1;
3344 readLen = 0;
3346 return XLREAD_FAIL;
3347 case XLREAD_SUCCESS:
3348 break;
3349 }
3350 }
3351
3352 /*
3353 * At this point, we have the right segment open and if we're streaming we
3354 * know the requested record is in it.
3355 */
3356 Assert(readFile != -1);
3357
3358 /*
3359 * If the current segment is being streamed from the primary, calculate
3360 * how much of the current page we have received already. We know the
3361 * requested record has been received, but this is for the benefit of
3362 * future calls, to allow quick exit at the top of this function.
3363 */
3365 {
3366 if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3367 readLen = XLOG_BLCKSZ;
3368 else
3370 targetPageOff;
3371 }
3372 else
3373 readLen = XLOG_BLCKSZ;
3374
3375 /* Read the requested page */
3376 readOff = targetPageOff;
3377
3378 /* Measure I/O timing when reading segment */
3380
3381 pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3382 r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (pgoff_t) readOff);
3383 if (r != XLOG_BLCKSZ)
3384 {
3385 char fname[MAXFNAMELEN];
 /* Preserve errno so the %m in the message below reports the read error. */
3386 int save_errno = errno;
3387
3389
3391 io_start, 1, r);
3392
3394 if (r < 0)
3395 {
 /* Read error: restore the errno captured right after pg_pread(). */
3396 errno = save_errno;
3397 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3399 errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: %m",
3400 fname, LSN_FORMAT_ARGS(targetPagePtr),
3401 readOff)));
3402 }
3403 else
 /* Short read: fewer than XLOG_BLCKSZ bytes were returned. */
3404 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3406 errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: read %d of %zu",
3407 fname, LSN_FORMAT_ARGS(targetPagePtr),
3408 readOff, r, (Size) XLOG_BLCKSZ)));
3409 goto next_record_is_invalid;
3410 }
3412
3414 io_start, 1, r);
3415
3416 Assert(targetSegNo == readSegNo);
3417 Assert(targetPageOff == readOff);
3418 Assert(reqLen <= readLen);
3419
3421
3422 /*
3423 * Check the page header immediately, so that we can retry immediately if
3424 * it's not valid. This may seem unnecessary, because ReadPageInternal()
3425 * validates the page header anyway, and would propagate the failure up to
3426 * ReadRecord(), which would retry. However, there's a corner case with
3427 * continuation records, if a record is split across two pages such that
3428 * we would need to read the two pages from different sources across two
3429 * WAL segments.
3430 *
3431 * The first page is only available locally, in pg_wal, because it's
3432 * already been recycled on the primary. The second page, however, is not
3433 * present in pg_wal, and we should stream it from the primary. There is a
3434 * recycled WAL segment present in pg_wal, with garbage contents, however.
3435 * We would read the first page from the local WAL segment, but when
3436 * reading the second page, we would read the bogus, recycled, WAL
3437 * segment. If we didn't catch that case here, we would never recover,
3438 * because ReadRecord() would retry reading the whole record from the
3439 * beginning.
3440 *
3441 * Of course, this only catches errors in the page header, which is what
3442 * happens in the case of a recycled WAL segment. Other kinds of errors or
3443 * corruption still have the same problem. But this at least fixes the
3444 * common case, which can happen as part of normal operation.
3445 *
3446 * Validating the page header is cheap enough that doing it twice
3447 * shouldn't be a big deal from a performance point of view.
3448 *
3449 * When not in standby mode, an invalid page header should cause recovery
3450 * to end, not retry reading the page, so we don't need to validate the
3451 * page header here for the retry. Instead, ReadPageInternal() is
3452 * responsible for the validation.
3453 */
3454 if (StandbyMode &&
3455 (targetPagePtr % wal_segment_size) == 0 &&
3456 !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3457 {
3458 /*
3459 * Emit this error right now then retry this page immediately. Use
3460 * errmsg_internal() because the message was already translated.
3461 */
3462 if (xlogreader->errormsg_buf[0])
3465
3466 /* reset any error XLogReaderValidatePageHeader() might have set */
3468 goto next_record_is_invalid;
3469 }
3470
3471 return readLen;
3472
3473next_record_is_invalid:
3474
3475 /*
3476 * If we're reading ahead, give up fast. Retries and error reporting will
3477 * be handled by a later read when recovery catches up to this point.
3478 */
3480 return XLREAD_WOULDBLOCK;
3481
3482 lastSourceFailed = true;
3483
 /* Forget the open segment so the next attempt starts from scratch. */
3484 if (readFile >= 0)
3485 close(readFile);
3486 readFile = -1;
3487 readLen = 0;
3489
3490 /* In standby-mode, keep trying */
3491 if (StandbyMode)
3492 goto retry;
3493 else
3494 return XLREAD_FAIL;
3495}
3496
3497/*
3498 * Open the WAL segment containing WAL location 'RecPtr'.
3499 *
3500 * The segment can be fetched via restore_command, or via walreceiver having
3501 * streamed the record, or it can already be present in pg_wal. Checking
3502 * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3503 * too, in case someone copies a new segment directly to pg_wal. That is not
3504 * documented or recommended, though.
3505 *
3506 * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3507 * prepare to read WAL starting from RedoStartLSN after this.
3508 *
3509 * 'RecPtr' might not point to the beginning of the record we're interested
3510 * in; it might also point to the page or segment header. In that case,
3511 * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3512 * used to decide which timeline to stream the requested WAL from.
3513 *
3514 * 'replayLSN' is the current replay LSN, so that if we scan for new
3515 * timelines, we can reject a switch to a timeline that branched off before
3516 * this point.
3517 *
3518 * If the record is not immediately available, the function returns XLREAD_FAIL
3519 * if we're not in standby mode. In standby mode, the function waits for it to
3520 * become available.
3521 *
3522 * When the requested record becomes available, the function opens the file
3523 * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3524 * of standby mode is triggered by the user, and there is no more WAL
3525 * available, returns XLREAD_FAIL.
3526 *
3527 * If nonblocking is true, then give up immediately if we can't satisfy the
3528 * request, returning XLREAD_WOULDBLOCK instead of waiting.
3529 */
3530static XLogPageReadResult
3532 bool fetching_ckpt, XLogRecPtr tliRecPtr,
3533 TimeLineID replayTLI, XLogRecPtr replayLSN,
3534 bool nonblocking)
3535{
3536 static TimestampTz last_fail_time = 0;
3538 bool streaming_reply_sent = false;
3539
3540 /*-------
3541 * Standby mode is implemented by a state machine:
3542 *
3543 * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3544 * pg_wal (XLOG_FROM_PG_WAL)
3545 * 2. Check for promotion trigger request
3546 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3547 * 4. Rescan timelines
3548 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3549 *
3550 * Failure to read from the current source advances the state machine to
3551 * the next state.
3552 *
3553 * 'currentSource' indicates the current state. There are no currentSource
3554 * values for "check trigger", "rescan timelines", and "sleep" states;
3555 * those actions are taken when reading from the previous source fails, as
3556 * part of advancing to the next state.
3557 *
3558 * If standby mode is turned off while reading WAL from stream, we move
3559 * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3560 * the files (which would be required at end of recovery, e.g., timeline
3561 * history file) from archive or pg_wal. We don't need to kill WAL receiver
3562 * here because it's already stopped when standby mode is turned off at
3563 * the end of recovery.
3564 *-------
3565 */
3566 if (!InArchiveRecovery)
3568 else if (currentSource == XLOG_FROM_ANY ||
3570 {
3571 lastSourceFailed = false;
3573 }
3574
3575 for (;;)
3576 {
3577 XLogSource oldSource = currentSource;
3578 bool startWalReceiver = false;
3579
3580 /*
3581 * First check if we failed to read from the current source, and
3582 * advance the state machine if so. The failure to read might've
3583 * happened outside this function, e.g. when a CRC check fails on a
3584 * record, or within this loop.
3585 */
3586 if (lastSourceFailed)
3587 {
3588 /*
3589 * Don't allow any retry loops to occur during nonblocking
3590 * readahead. Let the caller process everything that has been
3591 * decoded already first.
3592 */
3593 if (nonblocking)
3594 return XLREAD_WOULDBLOCK;
3595
3596 switch (currentSource)
3597 {
3598 case XLOG_FROM_ARCHIVE:
3599 case XLOG_FROM_PG_WAL:
3600
3601 /*
3602 * Check to see if promotion is requested. Note that we do
3603 * this only after failure, so when you promote, we still
3604 * finish replaying as much as we can from archive and
3605 * pg_wal before failover.
3606 */
3608 {
3610 return XLREAD_FAIL;
3611 }
3612
3613 /*
3614 * Not in standby mode, and we've now tried the archive
3615 * and pg_wal.
3616 */
3617 if (!StandbyMode)
3618 return XLREAD_FAIL;
3619
3620 /*
3621 * Move to XLOG_FROM_STREAM state, and set to start a
3622 * walreceiver if necessary.
3623 */
3625 startWalReceiver = true;
3626 break;
3627
3628 case XLOG_FROM_STREAM:
3629
3630 /*
3631 * Failure while streaming. Most likely, we got here
3632 * because streaming replication was terminated, or
3633 * promotion was triggered. But we also get here if we
3634 * find an invalid record in the WAL streamed from the
3635 * primary, in which case something is seriously wrong.
3636 * There's little chance that the problem will just go
3637 * away, but PANIC is not good for availability either,
3638 * especially in hot standby mode. So, we treat that the
3639 * same as disconnection, and retry from archive/pg_wal
3640 * again. The WAL in the archive should be identical to
3641 * what was streamed, so it's unlikely that it helps, but
3642 * one can hope...
3643 */
3644
3645 /*
3646 * We should be able to move to XLOG_FROM_STREAM only in
3647 * standby mode.
3648 */
3650
3651 /*
3652 * Before we leave XLOG_FROM_STREAM state, make sure that
3653 * walreceiver is not active, so that it won't overwrite
3654 * WAL that we restore from archive.
3655 *
3656 * If walreceiver is actively streaming (or attempting to
3657 * connect), we must shut it down. However, if it's
3658 * already in WAITING state (e.g., due to timeline
3659 * divergence), we only need to reset the install flag to
3660 * allow archive restoration.
3661 */
3662 if (WalRcvStreaming())
3664 else
3665 {
3666 /*
3667 * WALRCV_STOPPING state is a transient state while
3668 * the startup process is in ShutdownWalRcv(). It
3669 * should never appear here since we would be waiting
3670 * for the walreceiver to reach WALRCV_STOPPED in that
3671 * case.
3672 */
3675 }
3676
3677 /*
3678 * Before we sleep, re-scan for possible new timelines if
3679 * we were requested to recover to the latest timeline.
3680 */
3682 {
3683 if (rescanLatestTimeLine(replayTLI, replayLSN))
3684 {
3686 break;
3687 }
3688 }
3689
3690 /*
3691 * XLOG_FROM_STREAM is the last state in our state
3692 * machine, so we've exhausted all the options for
3693 * obtaining the requested WAL. We're going to loop back
3694 * and retry from the archive, but if it hasn't been long
3695 * since last attempt, sleep wal_retrieve_retry_interval
3696 * milliseconds to avoid busy-waiting.
3697 */
3699 if (!TimestampDifferenceExceeds(last_fail_time, now,
3701 {
3702 long wait_time;
3703
3704 wait_time = wal_retrieve_retry_interval -
3705 TimestampDifferenceMilliseconds(last_fail_time, now);
3706
3707 elog(LOG, "waiting for WAL to become available at %X/%08X",
3708 LSN_FORMAT_ARGS(RecPtr));
3709
3710 /* Do background tasks that might benefit us later. */
3712
3716 wait_time,
3717 WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
3720
3721 /* Handle interrupt signals of startup process */
3723 }
3724 last_fail_time = now;
3726 break;
3727
3728 default:
3729 elog(ERROR, "unexpected WAL source %d", currentSource);
3730 }
3731 }
3732 else if (currentSource == XLOG_FROM_PG_WAL)
3733 {
3734 /*
3735 * We just successfully read a file in pg_wal. We prefer files in
3736 * the archive over ones in pg_wal, so try the next file again
3737 * from the archive first.
3738 */
3741 }
3742
3743 if (currentSource != oldSource)
3744 elog(DEBUG2, "switched WAL source from %s to %s after %s",
3746 lastSourceFailed ? "failure" : "success");
3747
3748 /*
3749 * We've now handled possible failure. Try to read from the chosen
3750 * source.
3751 */
3752 lastSourceFailed = false;
3753
3754 switch (currentSource)
3755 {
3756 case XLOG_FROM_ARCHIVE:
3757 case XLOG_FROM_PG_WAL:
3758
3759 /*
3760 * WAL receiver must not be running when reading WAL from
3761 * archive or pg_wal.
3762 */
3764
3765 /* Close any old file we might have open. */
3766 if (readFile >= 0)
3767 {
3768 close(readFile);
3769 readFile = -1;
3770 }
3771 /* Reset curFileTLI if random fetch. */
3772 if (randAccess)
3773 curFileTLI = 0;
3774
3775 /*
3776 * Try to restore the file from archive, or read an existing
3777 * file from pg_wal.
3778 */
3782 if (readFile >= 0)
3783 return XLREAD_SUCCESS; /* success! */
3784
3785 /*
3786 * Nope, not found in archive or pg_wal.
3787 */
3788 lastSourceFailed = true;
3789 break;
3790
3791 case XLOG_FROM_STREAM:
3792 {
3793 bool havedata;
3794
3795 /*
3796 * We should be able to move to XLOG_FROM_STREAM only in
3797 * standby mode.
3798 */
3800
3801 /*
3802 * First, shutdown walreceiver if its restart has been
3803 * requested -- but no point if we're already slated for
3804 * starting it.
3805 */
3806 if (pendingWalRcvRestart && !startWalReceiver)
3807 {
3809
3810 /*
3811 * Re-scan for possible new timelines if we were
3812 * requested to recover to the latest timeline.
3813 */
3816 rescanLatestTimeLine(replayTLI, replayLSN);
3817
3818 startWalReceiver = true;
3819 }
3820 pendingWalRcvRestart = false;
3821
3822 /*
3823 * Launch walreceiver if needed.
3824 *
3825 * If fetching_ckpt is true, RecPtr points to the initial
3826 * checkpoint location. In that case, we use RedoStartLSN
3827 * as the streaming start position instead of RecPtr, so
3828 * that when we later jump backwards to start redo at
3829 * RedoStartLSN, we will have the logs streamed already.
3830 */
3831 if (startWalReceiver &&
3832 PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
3833 {
3834 XLogRecPtr ptr;
3835 TimeLineID tli;
3836
3837 if (fetching_ckpt)
3838 {
3839 ptr = RedoStartLSN;
3840 tli = RedoStartTLI;
3841 }
3842 else
3843 {
3844 ptr = RecPtr;
3845
3846 /*
3847 * Use the record begin position to determine the
3848 * TLI, rather than the position we're reading.
3849 */
3850 tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
3851
3852 if (curFileTLI > 0 && tli < curFileTLI)
3853 elog(ERROR, "according to history file, WAL location %X/%08X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3854 LSN_FORMAT_ARGS(tliRecPtr),
3855 tli, curFileTLI);
3856 }
3857 curFileTLI = tli;
3863 }
3864
3865 /*
3866 * Check if WAL receiver is active or wait to start up.
3867 */
3868 if (!WalRcvStreaming())
3869 {
3870 lastSourceFailed = true;
3871 break;
3872 }
3873
3874 /*
3875 * Walreceiver is active, so see if new data has arrived.
3876 *
3877 * We only advance XLogReceiptTime when we obtain fresh
3878 * WAL from walreceiver and observe that we had already
3879 * processed everything before the most recent "chunk"
3880 * that it flushed to disk. In steady state where we are
3881 * keeping up with the incoming data, XLogReceiptTime will
3882 * be updated on each cycle. When we are behind,
3883 * XLogReceiptTime will not advance, so the grace time
3884 * allotted to conflicting queries will decrease.
3885 */
3886 if (RecPtr < flushedUpto)
3887 havedata = true;
3888 else
3889 {
3890 XLogRecPtr latestChunkStart;
3891
3892 flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3893 if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
3894 {
3895 havedata = true;
3896 if (latestChunkStart <= RecPtr)
3897 {
3900 }
3901 }
3902 else
3903 havedata = false;
3904 }
3905 if (havedata)
3906 {
3907 /*
3908 * Great, streamed far enough. Open the file if it's
3909 * not open already. Also read the timeline history
3910 * file if we haven't initialized timeline history
3911 * yet; it should be streamed over and present in
3912 * pg_wal by now. Use XLOG_FROM_STREAM so that source
3913 * info is set correctly and XLogReceiptTime isn't
3914 * changed.
3915 *
3916 * NB: We must set readTimeLineHistory based on
3917 * recoveryTargetTLI, not receiveTLI. Normally they'll
3918 * be the same, but if recovery_target_timeline is
3919 * 'latest' and archiving is configured, then it's
3920 * possible that we managed to retrieve one or more
3921 * new timeline history files from the archive,
3922 * updating recoveryTargetTLI.
3923 */
3924 if (readFile < 0)
3925 {
3926 if (!expectedTLEs)
3929 XLOG_FROM_STREAM, false);
3930 Assert(readFile >= 0);
3931 }
3932 else
3933 {
3934 /* just make sure source info is correct... */
3937 return XLREAD_SUCCESS;
3938 }
3939 break;
3940 }
3941
3942 /* In nonblocking mode, return rather than sleeping. */
3943 if (nonblocking)
3944 return XLREAD_WOULDBLOCK;
3945
3946 /*
3947 * Data not here yet. Check for trigger, then wait for
3948 * walreceiver to wake us up when new WAL arrives.
3949 */
3951 {
3952 /*
3953 * Note that we don't return XLREAD_FAIL immediately
3954 * here. After being triggered, we still want to
3955 * replay all the WAL that was already streamed. It's
3956 * in pg_wal now, so we just treat this as a failure,
3957 * and the state machine will move on to replay the
3958 * streamed WAL from pg_wal, and then recheck the
3959 * trigger and exit replay.
3960 */
3961 lastSourceFailed = true;
3962 break;
3963 }
3964
3965 /*
3966 * Since we have replayed everything we have received so
3967 * far and are about to start waiting for more WAL, let's
3968 * tell the upstream server our replay location now so
3969 * that pg_stat_replication doesn't show stale
3970 * information.
3971 */
3972 if (!streaming_reply_sent)
3973 {
3975 streaming_reply_sent = true;
3976 }
3977
3978 /* Do any background tasks that might benefit us later. */
3980
3981 /* Update pg_stat_recovery_prefetch before sleeping. */
3983
3984 /*
3985 * Wait for more WAL to arrive, when we will be woken
3986 * immediately by the WAL receiver.
3987 */
3990 -1L,
3991 WAIT_EVENT_RECOVERY_WAL_STREAM);
3993 break;
3994 }
3995
3996 default:
3997 elog(ERROR, "unexpected WAL source %d", currentSource);
3998 }
3999
4000 /*
4001 * Check for recovery pause here so that we can confirm more quickly
4002 * that a requested pause has actually taken effect.
4003 */
4004 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
4006 recoveryPausesHere(false);
4007
4008 /*
4009 * This possibly-long loop needs to handle interrupts of startup
4010 * process.
4011 */
4013 }
4014
4015 return XLREAD_FAIL; /* not reached */
4016}
4017
4018
4019/*
4020 * Determine what log level should be used to report a corrupt WAL record
4021 * in the current WAL page, previously read by XLogPageRead().
4022 *
4023 * 'emode' is the error mode that would be used to report a file-not-found
4024 * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4025 * we're retrying the exact same record that we've tried previously, only
4026 * complain the first time to keep the noise down. However, we only do so when
4027 * reading from pg_wal, because we don't expect any invalid records in archive
4028 * or in records streamed from the primary. Files in the archive should be complete,
4029 * and we should never hit the end of WAL because we stop and wait for more WAL
4030 * to arrive before replaying it.
4031 *
 * Only an 'emode' of LOG is ever demoted (to DEBUG1). Any other level is
 * returned unchanged and does not touch the remembered position.
 *
 * Returns the log level the caller should use.
 *
4032 * NOTE: This function remembers the RecPtr value it was last called with,
4033 * to suppress repeated messages about the same record. Only call this when
4034 * you are about to ereport(), or you might cause a later message to be
4035 * erroneously suppressed.
4036 */
4037static int
4039{
 /* Record position of the most recent LOG-level complaint made for pg_wal. */
4040 static XLogRecPtr lastComplaint = InvalidXLogRecPtr;
4041
4042 if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
4043 {
 /* Same record as last time: quieten it. Otherwise remember this one. */
4044 if (RecPtr == lastComplaint)
4045 emode = DEBUG1;
4046 else
4047 lastComplaint = RecPtr;
4048 }
4049 return emode;
4050}
4051
4052
4053/*
4054 * Subroutine to try to fetch and validate a prior checkpoint record.
 *
 * The record at 'RecPtr' is read with ReadRecord() and then checked step by
 * step. Every failed check is reported at LOG level with its own message
 * and makes the function return NULL. If all checks pass, the record
 * returned by ReadRecord() is handed back to the caller.
 *
 * The global 'xlogreader' must already be set up (asserted below).
4055 */
4056static XLogRecord *
4058 TimeLineID replayTLI)
4059{
4060 XLogRecord *record;
4061 uint8 info;
4062
4063 Assert(xlogreader != NULL);
4064
 /* The requested location must be a valid record offset. */
4065 if (!XRecOffIsValid(RecPtr))
4066 {
4067 ereport(LOG,
4068 (errmsg("invalid checkpoint location")));
4069 return NULL;
4070 }
4071
 /* Read the record, asking ReadRecord() to report problems at LOG. */
4073 record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4074
4075 if (record == NULL)
4076 {
4077 ereport(LOG,
4078 (errmsg("invalid checkpoint record")));
4079 return NULL;
4080 }
 /* A checkpoint record must belong to the XLOG resource manager. */
4081 if (record->xl_rmid != RM_XLOG_ID)
4082 {
4083 ereport(LOG,
4084 (errmsg("invalid resource manager ID in checkpoint record")));
4085 return NULL;
4086 }
 /* Strip the XLR_INFO_MASK bits; what remains must name a checkpoint type. */
4087 info = record->xl_info & ~XLR_INFO_MASK;
4088 if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4089 info != XLOG_CHECKPOINT_ONLINE)
4090 {
4091 ereport(LOG,
4092 (errmsg("invalid xl_info in checkpoint record")));
4093 return NULL;
4094 }
 /* Last check: the record length must be valid for a checkpoint record. */
4096 {
4097 ereport(LOG,
4098 (errmsg("invalid length of checkpoint record")));
4099 return NULL;
4100 }
4101 return record;
4102}
4103
4104/*
4105 * Scan for new timelines that might have appeared in the archive since we
4106 * started recovery.
4107 *
4108 * If there are any, the function changes recovery target TLI to the latest
4109 * one and returns 'true'.
 *
 * Returns 'false', leaving recoveryTargetTLI and expectedTLEs untouched, when
 * - the newest timeline is the one we are already targeting,
 * - the current target timeline does not appear in the new timeline's
 *   history, or
 * - the new timeline branched off before 'replayLSN'.
 * The last two cases are reported at LOG level. 'replayTLI' is the timeline
 * named in those messages.
4110 */
4111static bool
4113{
4114 List *newExpectedTLEs;
4115 bool found;
4116 ListCell *cell;
4117 TimeLineID newtarget;
 /* Kept so the history files between old and new target can be restored. */
4118 TimeLineID oldtarget = recoveryTargetTLI;
4119 TimeLineHistoryEntry *currentTle = NULL;
4120
4122 if (newtarget == recoveryTargetTLI)
4123 {
4124 /* No new timelines found */
4125 return false;
4126 }
4127
4128 /*
4129 * Determine the list of expected TLIs for the new TLI
4130 */
4131
4132 newExpectedTLEs = readTimeLineHistory(newtarget);
4133
4134 /*
4135 * If the current timeline is not part of the history of the new timeline,
4136 * we cannot proceed to it.
4137 */
4138 found = false;
4139 foreach(cell, newExpectedTLEs)
4140 {
4141 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4142
4143 if (currentTle->tli == recoveryTargetTLI)
4144 {
4145 found = true;
4146 break;
4147 }
4148 }
4149 if (!found)
4150 {
4151 ereport(LOG,
4152 (errmsg("new timeline %u is not a child of database system timeline %u",
4153 newtarget,
4154 replayTLI)));
4155 return false;
4156 }
4157
4158 /*
4159 * The current timeline was found in the history file, but check that the
4160 * next timeline was forked off from it *after* the current recovery
4161 * location.
 *
 * currentTle still points at the entry matched in the loop above, so its
 * 'end' is compared against replayLSN here.
4162 */
4163 if (currentTle->end < replayLSN)
4164 {
4165 ereport(LOG,
4166 errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%08X",
4167 newtarget,
4168 replayTLI,
4169 LSN_FORMAT_ARGS(replayLSN)));
4170 return false;
4171 }
4172
4173 /* The new timeline history seems valid. Switch target */
4174 recoveryTargetTLI = newtarget;
4176 expectedTLEs = newExpectedTLEs;
4177
4178 /*
4179 * As in StartupXLOG(), try to ensure we have all the history files
4180 * between the old target and new target in pg_wal.
4181 */
4182 restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4183
4184 ereport(LOG,
4185 (errmsg("new target timeline is %u",
4187
4188 return true;
4189}
4190
4191
4192/*
4193 * Open a logfile segment for reading (during recovery).
4194 *
4195 * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4196 * Otherwise, it's assumed to be already available in pg_wal.
 *
 * Returns the open file descriptor on success. Returns -1 if the segment
 * could not be restored from the archive, or if opening it failed with
 * ENOENT and 'notfoundOk' is true. Any other open failure is reported at
 * PANIC level.
 *
 * On success curFileTLI is set to 'tli' and the PS display is updated to
 * show the segment being recovered.
4197 */
4198static int
4200 XLogSource source, bool notfoundOk)
4201{
4202 char xlogfname[MAXFNAMELEN];
4203 char activitymsg[MAXFNAMELEN + 16];
4204 char path[MAXPGPATH];
4205 int fd;
4206
 /* Build the segment's file name from its timeline and segment number. */
4207 XLogFileName(xlogfname, tli, segno, wal_segment_size);
4208
4209 switch (source)
4210 {
4211 case XLOG_FROM_ARCHIVE:
4212 /* Report recovery progress in PS display */
4213 snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4214 xlogfname);
4215 set_ps_display(activitymsg);
4216
 /* Give up quietly if the archive cannot supply the file. */
4217 if (!RestoreArchivedFile(path, xlogfname,
4218 "RECOVERYXLOG",
4220 InRedo))
4221 return -1;
4222 break;
4223
4224 case XLOG_FROM_PG_WAL:
4225 case XLOG_FROM_STREAM:
 /* Nothing to fetch; just compute where the segment should be. */
4226 XLogFilePath(path, tli, segno, wal_segment_size);
4227 break;
4228
4229 default:
4230 elog(ERROR, "invalid XLogFileRead source %d", source);
4231 }
4232
4233 /*
4234 * If the segment was fetched from archival storage, replace the existing
4235 * xlog segment (if any) with the archival version.
4236 */
4238 {
4240 KeepFileRestoredFromArchive(path, xlogfname);
4241
4242 /*
4243 * Set path to point at the new file in pg_wal.
4244 */
4245 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4246 }
4247
 /* Open the segment read-only. */
4248 fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4249 if (fd >= 0)
4250 {
4251 /* Success! */
4252 curFileTLI = tli;
4253
4254 /* Report recovery progress in PS display */
4255 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4256 xlogfname);
4257 set_ps_display(activitymsg);
4258
4259 /* Track source of data in assorted state variables */
4262 /* In FROM_STREAM case, caller tracks receipt time, not me */
4263 if (source != XLOG_FROM_STREAM)
4265
4266 return fd;
4267 }
 /* Only a missing file that the caller said it can tolerate is not fatal. */
4268 if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4269 ereport(PANIC,
4271 errmsg("could not open file \"%s\": %m", path)));
4272 return -1;
4273}
4274
4275/*
4276 * Open a logfile segment for reading (during recovery).
4277 *
4278 * This version searches for the segment with any TLI listed in expectedTLEs.
 *
 * Candidate timelines are tried in list order. The loop stops at the first
 * timeline older than curFileTLI and skips any timeline that begins in a
 * later segment than 'segno'. Within one timeline the archive attempt comes
 * before the pg_wal attempt. A missing file is tolerated in both attempts
 * (last argument 'true' to XLogFileRead()).
 *
 * Returns the open file descriptor of the first segment found, or -1 if no
 * candidate timeline yields the segment. On success the timeline list that
 * was scanned is stored in expectedTLEs if that was not set yet.
4279 */
4280static int
4282{
4283 char path[MAXPGPATH];
4284 ListCell *cell;
4285 int fd;
4286 List *tles;
4287
4288 /*
4289 * Loop looking for a suitable timeline ID: we might need to read any of
4290 * the timelines listed in expectedTLEs.
4291 *
4292 * We expect curFileTLI on entry to be the TLI of the preceding file in
4293 * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4294 * to go backwards; this prevents us from picking up the wrong file when a
4295 * parent timeline extends to higher segment numbers than the child we
4296 * want to read.
4297 *
4298 * If we haven't read the timeline history file yet, read it now, so that
4299 * we know which TLIs to scan. We don't save the list in expectedTLEs,
4300 * however, unless we actually find a valid segment. That way if there is
4301 * neither a timeline history file nor a WAL segment in the archive, and
4302 * streaming replication is set up, we'll read the timeline history file
4303 * streamed from the primary when we start streaming, instead of
4304 * recovering with a dummy history generated here.
4305 */
4306 if (expectedTLEs)
4307 tles = expectedTLEs;
4308 else
4310
4311 foreach(cell, tles)
4312 {
4314 TimeLineID tli = hent->tli;
4315
4316 if (tli < curFileTLI)
4317 break; /* don't bother looking at too-old TLIs */
4318
4319 /*
4320 * Skip timelines that the requested logfile segment cannot belong
4321 * to.
4322 */
4323 if (XLogRecPtrIsValid(hent->begin))
4324 {
4325 XLogSegNo beginseg = 0;
4326
 /* Segment number in which this timeline begins. */
4327 XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4328
4329 /*
4330 * The logfile segment that doesn't belong to the timeline is
4331 * older or newer than the segment that the timeline started or
4332 * ended at, respectively. It's sufficient to check only the
4333 * starting segment of the timeline here. Since the timelines are
4334 * scanned in descending order in this loop, any segments newer
4335 * than the ending segment should belong to newer timeline and
4336 * have already been read before. So it's not necessary to check
4337 * the ending segment of the timeline here.
4338 */
4339 if (segno < beginseg)
4340 continue;
4341 }
4342
 /* First choice: the archive. */
4344 {
4345 fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
4346 if (fd != -1)
4347 {
4348 elog(DEBUG1, "got WAL segment from archive");
4349 if (!expectedTLEs)
4350 expectedTLEs = tles;
4351 return fd;
4352 }
4353 }
4354
 /* Second choice: a file already present in pg_wal. */
4356 {
4357 fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
4358 if (fd != -1)
4359 {
4360 if (!expectedTLEs)
4361 expectedTLEs = tles;
4362 return fd;
4363 }
4364 }
4365 }
4366
4367 /* Couldn't find it. For simplicity, complain about front timeline */
 /* Set errno explicitly so that %m below reports "file not found". */
4369 errno = ENOENT;
4372 errmsg("could not open file \"%s\": %m", path)));
4373 return -1;
4374}
4375
4376/*
4377 * Set flag to signal the walreceiver to restart. (The startup process calls
4378 * this on noticing a relevant configuration change.)
4379 */
4380void
4382{
4384 {
4385 ereport(LOG,
4386 (errmsg("WAL receiver process shutdown requested")));
4387
4388 pendingWalRcvRestart = true;
4389 }
4390}
4391
4392
4393/*
4394 * Has a standby promotion already been triggered?
4395 *
4396 * Unlike CheckForStandbyTrigger(), this works in any process
4397 * that's connected to shared memory.
4398 */
4399bool
4401{
4402 /*
4403 * We check shared state each time only until a standby promotion is
4404 * triggered. We can't trigger a promotion again, so there's no need to
4405 * keep checking after the shared variable has once been seen true.
4406 */
4408 return true;
4409
4413
4415}
4416
4417static void
4419{
4423
4424 /*
4425 * Mark the recovery pause state as 'not paused' because the paused state
4426 * ends and promotion continues if a promotion is triggered while recovery
4427 * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4428 * return 'paused' while a promotion is ongoing.
4429 */
4430 SetRecoveryPause(false);
4431
4433}
4434
4435/*
4436 * Check whether a promote request has arrived.
4437 */
4438static bool
4440{
4442 return true;
4443
4445 {
4446 ereport(LOG, (errmsg("received promote request")));
4450 return true;
4451 }
4452
4453 return false;
4454}
4455
4456/*
4457 * Remove the files signaling a standby promotion request.
 *
 * Currently that is the single file PROMOTE_SIGNAL_FILE. The result of
 * unlink() is deliberately not checked, so a file that does not exist (or
 * cannot be removed) is silently ignored.
4458 */
4459void
4461{
4462 unlink(PROMOTE_SIGNAL_FILE);
4463}
4464
4465/*
4466 * Check to see if a promote request has arrived.
 *
 * A request counts as present when stat() on PROMOTE_SIGNAL_FILE succeeds.
 * Any stat() failure, whatever the errno, is treated as "no request".
 * The file is only probed here; it is not removed.
 *
 * Returns true if the signal file exists, false otherwise.
4467 */
4468bool
4470{
 /* Only the success of stat() matters; the buffer contents are unused. */
4471 struct stat stat_buf;
4472
4473 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4474 return true;
4475
4476 return false;
4477}
4478
4479/*
4480 * Wake up startup process to replay newly arrived WAL, or to notice that
4481 * failover has been requested.
4482 */
4483void
4488
4489/*
4490 * Schedule a walreceiver wakeup in the main recovery loop.
4491 */
4492void
4497
4498/*
4499 * Is HotStandby active yet? This is only important in special backends
4500 * since normal backends won't ever be able to connect until this returns
4501 * true. Postmaster knows this by way of signal, not via shared memory.
4502 *
4503 * Unlike testing standbyState, this works in any process that's connected to
4504 * shared memory. (And note that standbyState alone doesn't tell the truth
4505 * anyway.)
4506 */
4507bool
4509{
4510 /*
4511 * We check shared state each time only until Hot Standby is active. We
4512 * can't de-activate Hot Standby, so there's no need to keep checking
4513 * after the shared variable has once been seen true.
4514 */
4516 return true;
4517 else
4518 {
4519 /* spinlock is essential on machines with weak memory ordering! */
4523
4524 return LocalHotStandbyActive;
4525 }
4526}
4527
4528/*
4529 * Like HotStandbyActive(), but to be used only in WAL replay code,
4530 * where we don't need to ask any other process what the state is.
4531 */
4532static bool
4538
4539/*
4540 * Get latest redo apply position.
4541 *
4542 * Exported to allow WALReceiver to read the pointer directly.
4543 */
4546{
4547 XLogRecPtr recptr;
4548 TimeLineID tli;
4549
4554
4555 if (replayTLI)
4556 *replayTLI = tli;
4557 return recptr;
4558}
4559
4560
4561/*
4562 * Get position of last applied, or the record being applied.
4563 *
4564 * This is different from GetXLogReplayRecPtr() in that if a WAL
4565 * record is currently being applied, this includes that record.
4566 */
4569{
4570 XLogRecPtr recptr;
4571 TimeLineID tli;
4572
4577
4578 if (replayEndTLI)
4579 *replayEndTLI = tli;
4580 return recptr;
4581}
4582
4583/*
4584 * Save timestamp of latest processed commit/abort record.
4585 *
4586 * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4587 * seen by processes other than the startup process. Note in particular
4588 * that CreateRestartPoint is executed in the checkpointer.
4589 */
4590static void
4597
4598/*
4599 * Fetch timestamp of latest processed commit/abort record.
4600 */
4603{
4604 TimestampTz xtime;
4605
4609
4610 return xtime;
4611}
4612
4613/*
4614 * Save timestamp of the next chunk of WAL records to apply.
4615 *
4616 * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4617 * seen by all backends.
4618 */
4619static void
4626
4627/*
4628 * Fetch timestamp of latest processed commit/abort record.
4629 * Startup process maintains an accurate local copy in XLogReceiptTime
4630 */
4642
4643/*
4644 * Returns time of receipt of current chunk of XLOG data, as well as
4645 * whether it was received from streaming replication or from archives.
 *
 * '*rtime' receives XLogReceiptTime. '*fromStream' is set to true only if
 * XLogReceiptSource is XLOG_FROM_STREAM; every other source yields false.
 * Both output pointers are written unconditionally, so neither may be NULL.
4646 */
4647void
4648GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4649{
4650 /*
4651 * This must be executed in the startup process, since we don't export the
4652 * relevant state to shared memory.
4653 */
4655
4656 *rtime = XLogReceiptTime;
4657 *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4658}
4659
4660/*
 * Verify that an integer parameter is set at least as high as recovery
 * requires.
 *
 * 'param_name' is the name of the setting, 'currValue' its value on this
 * server, and 'minValue' the value it had on the primary server (as the
 * messages below state). Nothing happens if currValue >= minValue.
 *
 * If the setting is too low, the problem is reported, recovery may be
 * paused first, and the function finally ends in ereport(FATAL), so it does
 * not return in that case.
 *
4661 * Note that the text field supplied is a parameter name and does not require
4662 * translation.
4663 */
4664void
4665RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4666{
4667 if (currValue < minValue)
4668 {
4670 {
 /* Ensures the "promotion is not possible" message is emitted once. */
4671 bool warned_for_promote = false;
4672
4674 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4675 errmsg("hot standby is not possible because of insufficient parameter settings"),
4676 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4677 param_name,
4678 currValue,
4679 minValue)));
4680
 /* Request a pause and tell the administrator what unpausing will do. */
4681 SetRecoveryPause(true);
4682
4683 ereport(LOG,
4684 (errmsg("recovery has paused"),
4685 errdetail("If recovery is unpaused, the server will shut down."),
4686 errhint("You can then restart the server after making the necessary configuration changes.")));
4687
4689 {
4691
4693 {
4694 if (!warned_for_promote)
4696 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4697 errmsg("promotion is not possible because of insufficient parameter settings"),
4698
4699 /*
4700 * Repeat the detail from above so it's easy to find
4701 * in the log.
4702 */
4703 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4704 param_name,
4705 currValue,
4706 minValue),
4707 errhint("Restart the server after making the necessary configuration changes.")));
4708 warned_for_promote = true;
4709 }
4710
4711 /*
4712 * If recovery pause is requested then set it paused. While
4713 * we are in the loop, user might resume and pause again so
4714 * set this every time.
4715 */
4717
4718 /*
4719 * We wait on a condition variable that will wake us as soon
4720 * as the pause ends, but we use a timeout so we can check the
4721 * above conditions periodically too.
4722 */
4724 WAIT_EVENT_RECOVERY_PAUSE);
4725 }
4727 }
4728
 /*
 * The setting is too low, so recovery cannot continue. This point
 * is reached once any pause handled above is over.
 */
4729 ereport(FATAL,
4730 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4731 errmsg("recovery aborted because of insufficient parameter settings"),
4732 /* Repeat the detail from above so it's easy to find in the log. */
4733 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4734 param_name,
4735 currValue,
4736 minValue),
4737 errhint("You can restart the server after making the necessary configuration changes.")));
4738 }
4739}
4740
4741
4742/*
4743 * GUC check_hook for primary_slot_name
 *
 * A NULL or empty '*newval' is accepted without further checks. Any other
 * value is validated with ReplicationSlotValidateNameInternal(). If that
 * fails, the error code, detail message and (when present) hint it
 * produced are handed to the GUC machinery and the hook returns false.
 *
 * Returns true if the value is acceptable.
4744 */
4745bool
4747{
 /* Filled in by the validator only when it rejects the name. */
4748 int err_code;
4749 char *err_msg = NULL;
4750 char *err_hint = NULL;
4751
4752 if (*newval && strcmp(*newval, "") != 0 &&
4753 !ReplicationSlotValidateNameInternal(*newval, false, &err_code,
4754 &err_msg, &err_hint))
4755 {
4756 GUC_check_errcode(err_code);
4757 GUC_check_errdetail("%s", err_msg);
 /* The hint is optional. */
4758 if (err_hint != NULL)
4759 GUC_check_errhint("%s", err_hint);
4760 return false;
4761 }
4762
4763 return true;
4764}
4765
/*
 * Recovery target settings: Only one of the several recovery_target* settings
 * may be set. Setting a second one results in an error. The global variable
 * recoveryTarget tracks which kind of recovery target was chosen. Other
 * variables store the actual target value (for example a string or an xid).
 * The assign functions of the parameters check whether a competing parameter
 * was already set. But we want to allow setting the same parameter multiple
 * times. We also want to allow unsetting a parameter and setting a different
 * one, so we unset recoveryTarget when the parameter is set to an empty
 * string.
 *
 * XXX this code is broken by design. Throwing an error from a GUC assign
 * hook breaks fundamental assumptions of guc.c. So long as all the variables
 * for which this can happen are PGC_POSTMASTER, the consequences are limited,
 * since we'd just abort postmaster startup anyway. Nonetheless it's likely
 * that we have odd behaviors such as unexpected GUC ordering dependencies.
 */
4783
4784pg_noreturn static void
4786{
4787 ereport(ERROR,
4788 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4789 errmsg("multiple recovery targets specified"),
4790 errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4791}
4792
4793/*
4794 * GUC check_hook for recovery_target
4795 */
4796bool
4798{
4799 if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4800 {
4801 GUC_check_errdetail("The only allowed value is \"immediate\".");
4802 return false;
4803 }
4804 return true;
4805}
4806
4807/*
4808 * GUC assign_hook for recovery_target
4809 */
4810void
4822
4823/*
4824 * GUC check_hook for recovery_target_lsn
4825 */
4826bool
4828{
4829 if (strcmp(*newval, "") != 0)
4830 {
4831 XLogRecPtr lsn;
4832 XLogRecPtr *myextra;
4833 ErrorSaveContext escontext = {T_ErrorSaveContext};
4834
4835 lsn = pg_lsn_in_safe(*newval, (Node *) &escontext);
4836 if (escontext.error_occurred)
4837 return false;
4838
4839 myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr));
4840 if (!myextra)
4841 return false;
4842 *myextra = lsn;
4843 *extra = myextra;
4844 }
4845 return true;
4846}
4847
4848/*
4849 * GUC assign_hook for recovery_target_lsn
 *
 * When newval is a non-empty string, the XLogRecPtr that extra points to is
 * copied into recoveryTargetLSN.  Presumably extra is the value prepared by
 * check_recovery_target_lsn() -- confirm against the GUC table entry.
 *
 * NOTE(review): the leading numbers are listing line numbers fused into the
 * text, and they skip 4854-4856, 4860 and 4864.  Statements are therefore
 * missing from this copy (the "else" branch has no body at all); restore
 * them from the authoritative file before relying on this function.
4850 */
4851void
4852assign_recovery_target_lsn(const char *newval, void *extra)
4853{
4857
4858 if (newval && strcmp(newval, "") != 0)
4859 {
4861 recoveryTargetLSN = *((XLogRecPtr *) extra);
4862 }
4863 else
4865}
4866
4867/*
4868 * GUC check_hook for recovery_target_name
4869 */
4870bool
4872{
4873 /* Use the value of newval directly */
4874 if (strlen(*newval) >= MAXFNAMELEN)
4875 {
4876 GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4877 "recovery_target_name", MAXFNAMELEN - 1);
4878 return false;
4879 }
4880 return true;
4881}
4882
4883/*
4884 * GUC assign_hook for recovery_target_name
 *
 * Distinguishes a non-empty newval from a NULL or empty one.
 *
 * NOTE(review): the leading numbers are listing line numbers fused into the
 * text, and they skip 4889-4891, 4895-4896 and 4899.  Both branches of the
 * "if" below have lost their statements in this copy; restore them from the
 * authoritative file before relying on this function.
4885 */
4886void
4887assign_recovery_target_name(const char *newval, void *extra)
4888{
4892
4893 if (newval && strcmp(newval, "") != 0)
4894 {
4897 }
4898 else
4900}
4901
4902/*
4903 * GUC check_hook for recovery_target_time
4904 *
4905 * The interpretation of the recovery_target_time string can depend on the
4906 * time zone setting, so we need to wait until after all GUC processing is
4907 * done before we can do the final parsing of the string. This check function
4908 * only does a parsing pass to catch syntax errors, but we store the string
4909 * and parse it again when we need to use it.
4910 */
4911bool
4913{
4914 if (strcmp(*newval, "") != 0)
4915 {
4916 /* reject some special values */
4917 if (strcmp(*newval, "now") == 0 ||
4918 strcmp(*newval, "today") == 0 ||
4919 strcmp(*newval, "tomorrow") == 0 ||
4920 strcmp(*newval, "yesterday") == 0)
4921 {
4922 return false;
4923 }
4924
4925 /*
4926 * parse timestamp value (see also timestamptz_in())
4927 */
4928 {
4929 char *str = *newval;
4930 fsec_t fsec;
4931 struct pg_tm tt,
4932 *tm = &tt;
4933 int tz;
4934 int dtype;
4935 int nf;
4936 int dterr;
4937 char *field[MAXDATEFIELDS];
4938 int ftype[MAXDATEFIELDS];
4939 char workbuf[MAXDATELEN + MAXDATEFIELDS];
4940 DateTimeErrorExtra dtextra;
4942
4943 dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4944 field, ftype, MAXDATEFIELDS, &nf);
4945 if (dterr == 0)
4946 dterr = DecodeDateTime(field, ftype, nf,
4947 &dtype, tm, &fsec, &tz, &dtextra);
4948 if (dterr != 0)
4949 return false;
4950 if (dtype != DTK_DATE)
4951 return false;
4952
4953 if (tm2timestamp(tm, fsec, &tz, &timestamp) != 0)
4954 {
4955 GUC_check_errdetail("Timestamp out of range: \"%s\".", str);
4956 return false;
4957 }
4958 }
4959 }
4960 return true;
4961}
4962
4963/*
4964 * GUC assign_hook for recovery_target_time
4965 */
4966void
4978
4979/*
4980 * GUC check_hook for recovery_target_timeline
 *
 * The keywords "current" and "latest" are recognized by string comparison.
 * Any other value is parsed by strtou64() with base 0; it is rejected with
 * a detail message if unparsed characters remain, if errno is EINVAL or
 * ERANGE, or if the number lies outside 1..PG_UINT32_MAX.  On success the
 * value of rttg is stored in *myextra and handed back through *extra; a
 * NULL myextra rejects the setting.
 *
 * NOTE(review): the leading numbers are listing line numbers fused into the
 * text, and they skip 4983, 4985-4986, 4989, 4991, 4997 and 5017.  The
 * function name line, the declarations of rttg and myextra, the statements
 * for the two keyword branches and the allocation of myextra are therefore
 * missing from this copy; restore them from the authoritative file before
 * relying on this function.
4981 */
4982bool
4984{
4987
4988 if (strcmp(*newval, "current") == 0)
4990 else if (strcmp(*newval, "latest") == 0)
4992 else
4993 {
4994 char *endp;
4995 uint64 timeline;
4996
4998
4999 errno = 0;
5000 timeline = strtou64(*newval, &endp, 0);
5001
5002 if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
5003 {
5004 GUC_check_errdetail("\"%s\" is not a valid number.",
5005 "recovery_target_timeline");
5006 return false;
5007 }
5008
5009 if (timeline < 1 || timeline > PG_UINT32_MAX)
5010 {
5011 GUC_check_errdetail("\"%s\" must be between %u and %u.",
5012 "recovery_target_timeline", 1, PG_UINT32_MAX);
5013 return false;
5014 }
5015 }
5016
5018 if (!myextra)
5019 return false;
5020 *myextra = rttg;
5021 *extra = myextra;
5022
5023 return true;
5024}
5025
5026/*
5027 * GUC assign_hook for recovery_target_timeline
5028 */
5029void
5038
5039/*
5040 * GUC check_hook for recovery_target_xid
5041 */
5042bool
5044{
5045 if (strcmp(*newval, "") != 0)
5046 {
5047 TransactionId xid;
5048 TransactionId *myextra;
5049 char *endp;
5050 char *val;
5051
5052 errno = 0;
5053
5054 /*
5055 * Consume leading whitespace to determine if number is negative
5056 */
5057 val = *newval;
5058
5059 while (isspace((unsigned char) *val))
5060 val++;
5061
5062 /*
5063 * This cast will remove the epoch, if any
5064 */
5065 xid = (TransactionId) strtou64(val, &endp, 0);
5066
5067 if (*endp != '\0' || errno == EINVAL || errno == ERANGE || *val == '-')
5068 {
5069 GUC_check_errdetail("\"%s\" is not a valid number.",
5070 "recovery_target_xid");
5071 return false;
5072 }
5073
5074 if (xid < FirstNormalTransactionId)
5075 {
5076 GUC_check_errdetail("\"%s\" without epoch must be greater than or equal to %u.",
5077 "recovery_target_xid",
5079 return false;
5080 }
5081
5082 myextra = (TransactionId *) guc_malloc(LOG, sizeof(TransactionId));
5083 if (!myextra)
5084 return false;
5085 *myextra = xid;
5086 *extra = myextra;
5087 }
5088 return true;
5089}
5090
5091/*
5092 * GUC assign_hook for recovery_target_xid
 *
 * When newval is a non-empty string, the TransactionId that extra points to
 * is copied into recoveryTargetXid.  Presumably extra is the value prepared
 * by check_recovery_target_xid() -- confirm against the GUC table entry.
 *
 * NOTE(review): the leading numbers are listing line numbers fused into the
 * text, and they skip 5097-5099, 5103 and 5107.  Statements are therefore
 * missing from this copy (the "else" branch has no body at all); restore
 * them from the authoritative file before relying on this function.
5093 */
5094void
5095assign_recovery_target_xid(const char *newval, void *extra)
5096{
5100
5101 if (newval && strcmp(newval, "") != 0)
5102 {
5104 recoveryTargetXid = *((TransactionId *) extra);
5105 }
5106 else
5108}
static uint64 pg_atomic_read_u64(volatile pg_atomic_uint64 *ptr)
Definition atomics.h:467
List * readTimeLineHistory(TimeLineID targetTLI)
Definition timeline.c:77
TimeLineID findNewestTimeLine(TimeLineID startTLI)
Definition timeline.c:265
TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history)
Definition timeline.c:545
XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI)
Definition timeline.c:573
bool existsTimeLineHistory(TimeLineID probeTLI)
Definition timeline.c:223
void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end)
Definition timeline.c:51
bool tliInHistory(TimeLineID tli, List *expectedTLEs)
Definition timeline.c:527
void remove_tablespace_symlink(const char *linkloc)
Definition tablespace.c:891
bool allow_in_place_tablespaces
Definition tablespace.c:87
void disable_startup_progress_timeout(void)
Definition startup.c:308
bool IsPromoteSignaled(void)
Definition startup.c:287
void begin_startup_progress_phase(void)
Definition startup.c:342
void ProcessStartupProcInterrupts(void)
Definition startup.c:154
void ResetPromoteSignaled(void)
Definition startup.c:293
int ParseDateTime(const char *timestr, char *workbuf, size_t buflen, char **field, int *ftype, int maxfields, int *numfields)
Definition datetime.c:774
int DecodeDateTime(char **field, int *ftype, int nf, int *dtype, struct pg_tm *tm, fsec_t *fsec, int *tzp, DateTimeErrorExtra *extra)
Definition datetime.c:998
long TimestampDifferenceMilliseconds(TimestampTz start_time, TimestampTz stop_time)
Definition timestamp.c:1751
int tm2timestamp(struct pg_tm *tm, fsec_t fsec, int *tzp, Timestamp *result)
Definition timestamp.c:2000
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition timestamp.c:1775
Datum timestamptz_in(PG_FUNCTION_ARGS)
Definition timestamp.c:410
TimestampTz GetCurrentTimestamp(void)
Definition timestamp.c:1639
const char * timestamptz_to_str(TimestampTz t)
Definition timestamp.c:1856
Datum now(PG_FUNCTION_ARGS)
Definition timestamp.c:1603
uint32 BlockNumber
Definition block.h:31
int Buffer
Definition buf.h:23
#define InvalidBuffer
Definition buf.h:25
void UnlockReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5603
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:468
@ BUFFER_LOCK_EXCLUSIVE
Definition bufmgr.h:222
static void LockBuffer(Buffer buffer, BufferLockMode mode)
Definition bufmgr.h:334
@ RBM_NORMAL_NO_LOG
Definition bufmgr.h:52
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:419
PageData * Page
Definition bufpage.h:81
static XLogRecPtr PageGetLSN(const PageData *page)
Definition bufpage.h:410
uint8_t uint8
Definition c.h:622
#define PG_UINT32_MAX
Definition c.h:674
#define pg_noreturn
Definition c.h:190
#define PG_USED_FOR_ASSERTS_ONLY
Definition c.h:249
#define Assert(condition)
Definition c.h:943
#define PG_BINARY
Definition c.h:1374
#define UINT64_FORMAT
Definition c.h:635
int32_t int32
Definition c.h:620
uint64_t uint64
Definition c.h:625
uint32_t uint32
Definition c.h:624
#define pg_fallthrough
Definition c.h:161
uint32 TransactionId
Definition c.h:736
size_t Size
Definition c.h:689
void RequestCheckpoint(int flags)
uint32 result
memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets))
bool ConditionVariableCancelSleep(void)
bool ConditionVariableTimedSleep(ConditionVariable *cv, long timeout, uint32 wait_event_info)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
int64 TimestampTz
Definition timestamp.h:39
int32 fsec_t
Definition timestamp.h:41
Datum arg
Definition elog.c:1322
int errcode_for_file_access(void)
Definition elog.c:897
ErrorContextCallback * error_context_stack
Definition elog.c:99
int errcode(int sqlerrcode)
Definition elog.c:874
#define LOG
Definition elog.h:32
#define errcontext
Definition elog.h:200
int errhint(const char *fmt,...) pg_attribute_printf(1
int errdetail(const char *fmt,...) pg_attribute_printf(1
#define FATAL
Definition elog.h:42
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
#define WARNING
Definition elog.h:37
#define DEBUG2
Definition elog.h:30
#define PANIC
Definition elog.h:44
#define DEBUG1
Definition elog.h:31
#define ERROR
Definition elog.h:40
#define elog(elevel,...)
Definition elog.h:228
#define ereport(elevel,...)
Definition elog.h:152
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition fd.c:1112
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition fd.c:783
int BasicOpenFile(const char *fileName, int fileFlags)
Definition fd.c:1090
int FreeFile(FILE *file)
Definition fd.c:2827
DIR * AllocateDir(const char *dirname)
Definition fd.c:2891
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition fd.c:2957
int pg_fsync(int fd)
Definition fd.c:390
FILE * AllocateFile(const char *name, const char *mode)
Definition fd.c:2628
#define palloc_object(type)
Definition fe_memutils.h:74
#define palloc0_object(type)
Definition fe_memutils.h:75
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition file_utils.c:547
@ PGFILETYPE_LNK
Definition file_utils.h:24
#define DirectFunctionCall3(func, arg1, arg2, arg3)
Definition fmgr.h:688
bool IsUnderPostmaster
Definition globals.c:122
char * DataDir
Definition globals.c:73
bool IsPostmasterEnvironment
Definition globals.c:121
void GUC_check_errcode(int sqlerrcode)
Definition guc.c:6666
void * guc_malloc(int elevel, size_t size)
Definition guc.c:637
#define newval
#define GUC_check_errdetail
Definition guc.h:507
GucSource
Definition guc.h:112
#define GUC_check_errhint
Definition guc.h:511
const char * str
#define MAXDATEFIELDS
Definition datetime.h:202
#define DTK_DATE
Definition datetime.h:144
#define MAXDATELEN
Definition datetime.h:200
long val
Definition informix.c:689
#define close(a)
Definition win32.h:12
void proc_exit(int code)
Definition ipc.c:105
int i
Definition isn.c:77
void OwnLatch(Latch *latch)
Definition latch.c:126
void DisownLatch(Latch *latch)
Definition latch.c:144
void InitSharedLatch(Latch *latch)
Definition latch.c:93
void SetLatch(Latch *latch)
Definition latch.c:290
void ResetLatch(Latch *latch)
Definition latch.c:374
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition latch.c:172
List * lappend(List *list, void *datum)
Definition list.c:339
void list_free_deep(List *list)
Definition list.c:1560
static struct pg_tm tm
Definition localtime.c:104
char * pstrdup(const char *in)
Definition mcxt.c:1781
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
#define AmStartupProcess()
Definition miscadmin.h:405
#define IsBootstrapProcessingMode()
Definition miscadmin.h:495
static char * errmsg
#define ERRCODE_DATA_CORRUPTED
#define MAXPGPATH
#define XLOG_RESTORE_POINT
Definition pg_control.h:79
#define XLOG_CHECKPOINT_REDO
Definition pg_control.h:86
#define XLOG_OVERWRITE_CONTRECORD
Definition pg_control.h:85
DBState
Definition pg_control.h:98
@ DB_IN_ARCHIVE_RECOVERY
Definition pg_control.h:104
@ DB_SHUTDOWNED_IN_RECOVERY
Definition pg_control.h:101
@ DB_SHUTDOWNED
Definition pg_control.h:100
@ DB_IN_CRASH_RECOVERY
Definition pg_control.h:103
#define XLOG_CHECKPOINT_SHUTDOWN
Definition pg_control.h:72
#define XLOG_BACKUP_END
Definition pg_control.h:77
#define XLOG_CHECKPOINT_ONLINE
Definition pg_control.h:73
#define XLOG_END_OF_RECOVERY
Definition pg_control.h:81
const void size_t len
#define lfirst(lc)
Definition pg_list.h:172
#define NIL
Definition pg_list.h:68
XLogRecPtr pg_lsn_in_safe(const char *str, Node *escontext)
Definition pg_lsn.c:32
static rewind_source * source
Definition pg_rewind.c:89
const char * pg_rusage_show(const PGRUsage *ru0)
Definition pg_rusage.c:40
void pg_rusage_init(PGRUsage *ru0)
Definition pg_rusage.c:27
static char buf[DEFAULT_XLOG_SEG_SIZE]
@ IOOBJECT_WAL
Definition pgstat.h:283
@ IOCONTEXT_NORMAL
Definition pgstat.h:293
@ IOOP_READ
Definition pgstat.h:319
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition pgstat_io.c:91
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:122
int64 timestamp
void SendPostmasterSignal(PMSignalReason reason)
Definition pmsignal.c:164
@ PMSIGNAL_RECOVERY_STARTED
Definition pmsignal.h:35
@ PMSIGNAL_BEGIN_HOT_STANDBY
Definition pmsignal.h:37
@ PMSIGNAL_RECOVERY_CONSISTENT
Definition pmsignal.h:36
#define pg_pread
Definition port.h:247
#define snprintf
Definition port.h:260
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition strlcpy.c:45
off_t pgoff_t
Definition port.h:421
static Datum ObjectIdGetDatum(Oid X)
Definition postgres.h:252
static Datum CStringGetDatum(const char *X)
Definition postgres.h:370
static Datum Int32GetDatum(int32 X)
Definition postgres.h:212
#define InvalidOid
static int fd(const char *x, int i)
void RecordKnownAssignedTransactionIds(TransactionId xid)
Definition procarray.c:4456
void KnownAssignedTransactionIdsIdleMaintenance(void)
Definition procarray.c:4617
static void set_ps_display(const char *activity)
Definition ps_status.h:40
char * psprintf(const char *fmt,...)
Definition psprintf.c:43
ForkNumber
Definition relpath.h:56
@ MAIN_FORKNUM
Definition relpath.h:58
#define PG_TBLSPC_DIR
Definition relpath.h:41
void RmgrStartup(void)
Definition rmgr.c:58
void RmgrCleanup(void)
Definition rmgr.c:74
#define ShmemRequestStruct(...)
Definition shmem.h:176
bool ReplicationSlotValidateNameInternal(const char *name, bool allow_reserved_name, int *err_code, char **err_msg, char **err_hint)
Definition slot.c:310
void ShutDownSlotSync(void)
Definition slotsync.c:1799
static void SpinLockRelease(volatile slock_t *lock)
Definition spin.h:62
static void SpinLockAcquire(volatile slock_t *lock)
Definition spin.h:56
static void SpinLockInit(volatile slock_t *lock)
Definition spin.h:50
#define ereport_startup_progress(msg,...)
Definition startup.h:18
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition stringinfo.c:145
void appendStringInfoString(StringInfo str, const char *s)
Definition stringinfo.c:230
void appendStringInfoChar(StringInfo str, char ch)
Definition stringinfo.c:242
void initStringInfo(StringInfo str)
Definition stringinfo.c:97
Oid oldestMultiDB
Definition pg_control.h:52
MultiXactId oldestMulti
Definition pg_control.h:51
MultiXactOffset nextMultiOffset
Definition pg_control.h:48
TransactionId newestCommitTsXid
Definition pg_control.h:56
TransactionId oldestXid
Definition pg_control.h:49
TimeLineID PrevTimeLineID
Definition pg_control.h:40
TimeLineID ThisTimeLineID
Definition pg_control.h:39
MultiXactId nextMulti
Definition pg_control.h:47
FullTransactionId nextXid
Definition pg_control.h:45
TransactionId oldestCommitTsXid
Definition pg_control.h:54
XLogRecPtr redo
Definition pg_control.h:37
Oid oldestXidDB
Definition pg_control.h:50
XLogRecPtr backupStartPoint
Definition pg_control.h:178
CheckPoint checkPointCopy
Definition pg_control.h:143
XLogRecPtr backupEndPoint
Definition pg_control.h:179
XLogRecPtr minRecoveryPoint
Definition pg_control.h:176
XLogRecPtr checkPoint
Definition pg_control.h:141
uint64 system_identifier
Definition pg_control.h:118
TimeLineID minRecoveryPointTLI
Definition pg_control.h:177
Definition dirent.c:26
struct ErrorContextCallback * previous
Definition elog.h:299
void(* callback)(void *arg)
Definition elog.h:300
Definition pg_list.h:54
Definition nodes.h:135
RelFileNumber relNumber
const char *(* rm_identify)(uint8 info)
void(* rm_mask)(char *pagedata, BlockNumber blkno)
void(* rm_redo)(XLogReaderState *record)
const char * rm_name
void(* rm_desc)(StringInfo buf, XLogReaderState *record)
ShmemRequestCallback request_fn
Definition shmem.h:133
XLogRecPtr begin
Definition timeline.h:28
TimeLineID ws_tli
Definition xlogreader.h:49
pg_atomic_uint64 minWaitedLSN[WAIT_LSN_TYPE_COUNT]
Definition xlogwait.h:85
XLogRecPtr missingContrecPtr
Definition xlogreader.h:214
char * errormsg_buf
Definition xlogreader.h:310
XLogRecPtr EndRecPtr
Definition xlogreader.h:206
uint64 system_identifier
Definition xlogreader.h:190
XLogRecPtr ReadRecPtr
Definition xlogreader.h:205
XLogRecPtr abortedRecPtr
Definition xlogreader.h:213
TimeLineID latestPageTLI
Definition xlogreader.h:279
XLogRecPtr overwrittenRecPtr
Definition xlogreader.h:216
XLogRecPtr latestPagePtr
Definition xlogreader.h:278
WALOpenSegment seg
Definition xlogreader.h:271
void * private_data
Definition xlogreader.h:195
uint8 xl_info
Definition xlogrecord.h:46
uint32 xl_tot_len
Definition xlogrecord.h:43
TransactionId xl_xid
Definition xlogrecord.h:44
RmgrId xl_rmid
Definition xlogrecord.h:47
ConditionVariable recoveryNotPausedCV
XLogRecPtr lastReplayedEndRecPtr
TimeLineID replayEndTLI
TimeLineID lastReplayedTLI
TimestampTz currentChunkStartTime
XLogRecPtr replayEndRecPtr
TimestampTz recoveryLastXTime
RecoveryPauseState recoveryPauseState
XLogRecPtr lastReplayedReadRecPtr
Definition guc.h:174
char d_name[MAX_PATH]
Definition dirent.h:15
Definition pgtime.h:35
TimeLineID PrevTimeLineID
TimeLineID ThisTimeLineID
char rp_name[MAXFNAMELEN]
TransactionId twophase_xid
Definition xact.h:428
TransactionId twophase_xid
Definition xact.h:398
#define InvalidTransactionId
Definition transam.h:31
#define U64FromFullTransactionId(x)
Definition transam.h:49
#define XidFromFullTransactionId(x)
Definition transam.h:48
#define FirstNormalTransactionId
Definition transam.h:34
#define TransactionIdIsValid(xid)
Definition transam.h:41
#define TransactionIdIsNormal(xid)
Definition transam.h:42
#define TimestampTzPlusMilliseconds(tz, ms)
Definition timestamp.h:85
static TimestampTz DatumGetTimestampTz(Datum X)
Definition timestamp.h:34
void AdvanceNextFullTransactionIdPastXid(TransactionId xid)
Definition varsup.c:299
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:67
static void pgstat_report_wait_end(void)
Definition wait_event.h:83
const char * name
#define WL_TIMEOUT
#define WL_EXIT_ON_PM_DEATH
#define WL_LATCH_SET
void WalRcvRequestApplyReply(void)
#define AllowCascadeReplication()
Definition walreceiver.h:40
@ WALRCV_STOPPING
Definition walreceiver.h:54
XLogRecPtr GetWalRcvFlushRecPtr(XLogRecPtr *latestChunkStart, TimeLineID *receiveTLI)
bool WalRcvStreaming(void)
void RequestXLogStreaming(TimeLineID tli, XLogRecPtr recptr, const char *conninfo, const char *slotname, bool create_temp_slot)
WalRcvState WalRcvGetState(void)
bool WalRcvRunning(void)
void WalSndWakeup(bool physical, bool logical)
Definition walsender.c:3958
#define stat
Definition win32_port.h:74
#define S_IRUSR
Definition win32_port.h:279
#define symlink(oldpath, newpath)
Definition win32_port.h:225
#define S_IWUSR
Definition win32_port.h:282
#define XLOG_XACT_COMMIT_PREPARED
Definition xact.h:173
#define XLOG_XACT_COMMIT
Definition xact.h:170
#define XLOG_XACT_OPMASK
Definition xact.h:180
#define XLOG_XACT_ABORT
Definition xact.h:172
#define XLOG_XACT_ABORT_PREPARED
Definition xact.h:174
void ParseCommitRecord(uint8 info, xl_xact_commit *xlrec, xl_xact_parsed_commit *parsed)
Definition xactdesc.c:35
void ParseAbortRecord(uint8 info, xl_xact_abort *xlrec, xl_xact_parsed_abort *parsed)
Definition xactdesc.c:141
int wal_decode_buffer_size
Definition xlog.c:143
bool EnableHotStandby
Definition xlog.c:128
XLogRecPtr GetRedoRecPtr(void)
Definition xlog.c:6933
void SetInstallXLogFileSegmentActive(void)
Definition xlog.c:10172
bool IsInstallXLogFileSegmentActive(void)
Definition xlog.c:10189
int wal_segment_size
Definition xlog.c:150
void SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI)
Definition xlog.c:6705
void RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
Definition xlog.c:3995
void ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
Definition xlog.c:6743
void ResetInstallXLogFileSegmentActive(void)
Definition xlog.c:10181
int wal_retrieve_retry_interval
Definition xlog.c:141
bool track_wal_io_timing
Definition xlog.c:144
static ControlFileData * ControlFile
Definition xlog.c:584
void XLogShutdownWalRcv(void)
Definition xlog.c:10162
bool XLogCheckpointNeeded(XLogSegNo new_segno)
Definition xlog.c:2301
#define TABLESPACE_MAP_OLD
Definition xlog.h:336
#define TABLESPACE_MAP
Definition xlog.h:335
#define STANDBY_SIGNAL_FILE
Definition xlog.h:331
#define CHECKPOINT_CAUSE_XLOG
Definition xlog.h:160
#define PROMOTE_SIGNAL_FILE
Definition xlog.h:339
#define BACKUP_LABEL_FILE
Definition xlog.h:332
#define RECOVERY_SIGNAL_FILE
Definition xlog.h:330
static RmgrData GetRmgr(RmgrId rmid)
#define XLogSegmentOffset(xlogptr, wal_segsz_bytes)
#define MAXFNAMELEN
#define XLOGDIR
#define XLByteToSeg(xlrp, logSegNo, wal_segsz_bytes)
static void XLogFilePath(char *path, TimeLineID tli, XLogSegNo logSegNo, int wal_segsz_bytes)
#define XRecOffIsValid(xlrp)
static void XLogFileName(char *fname, TimeLineID tli, XLogSegNo logSegNo, int wal_segsz_bytes)
#define XLByteInSeg(xlrp, logSegNo, wal_segsz_bytes)
bool RestoreArchivedFile(char *path, const char *xlogfname, const char *recovername, off_t expectedSize, bool cleanupEnabled)
Definition xlogarchive.c:55
void KeepFileRestoredFromArchive(const char *path, const char *xlogfname)
#define XLogRecPtrIsValid(r)
Definition xlogdefs.h:29
#define LSN_FORMAT_ARGS(lsn)
Definition xlogdefs.h:47
uint64 XLogRecPtr
Definition xlogdefs.h:21
#define InvalidXLogRecPtr
Definition xlogdefs.h:28
uint32 TimeLineID
Definition xlogdefs.h:63
uint64 XLogSegNo
Definition xlogdefs.h:52
void XLogPrefetcherComputeStats(XLogPrefetcher *prefetcher)
XLogPrefetcher * XLogPrefetcherAllocate(XLogReaderState *reader)
void XLogPrefetchReconfigure(void)
XLogRecord * XLogPrefetcherReadRecord(XLogPrefetcher *prefetcher, char **errmsg)
XLogReaderState * XLogPrefetcherGetReader(XLogPrefetcher *prefetcher)
void XLogPrefetcherBeginRead(XLogPrefetcher *prefetcher, XLogRecPtr recPtr)
void XLogPrefetcherFree(XLogPrefetcher *prefetcher)
bool XLogRecGetBlockTagExtended(XLogReaderState *record, uint8 block_id, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum, Buffer *prefetch_buffer)
XLogReaderState * XLogReaderAllocate(int wal_segment_size, const char *waldir, XLogReaderRoutine *routine, void *private_data)
Definition xlogreader.c:108
void XLogReaderSetDecodeBuffer(XLogReaderState *state, void *buffer, size_t size)
Definition xlogreader.c:92
void XLogReaderResetError(XLogReaderState *state)
bool XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, char *phdr)
void XLogReaderFree(XLogReaderState *state)
Definition xlogreader.c:163
bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
#define XLogRecGetDataLen(decoder)
Definition xlogreader.h:416
#define XLogRecGetInfo(decoder)
Definition xlogreader.h:410
#define XLogRecBlockImageApply(decoder, block_id)
Definition xlogreader.h:425
#define XLogRecGetRmid(decoder)
Definition xlogreader.h:411
#define XLogRecGetData(decoder)
Definition xlogreader.h:415
#define XLogRecGetXid(decoder)
Definition xlogreader.h:412
#define XL_ROUTINE(...)
Definition xlogreader.h:117
#define XLogRecMaxBlockId(decoder)
Definition xlogreader.h:418
XLogPageReadResult
Definition xlogreader.h:350
@ XLREAD_WOULDBLOCK
Definition xlogreader.h:353
@ XLREAD_SUCCESS
Definition xlogreader.h:351
@ XLREAD_FAIL
Definition xlogreader.h:352
#define XLogRecHasBlockImage(decoder, block_id)
Definition xlogreader.h:423
#define XLogRecGetPrev(decoder)
Definition xlogreader.h:409
#define XLogRecHasAnyBlockRefs(decoder)
Definition xlogreader.h:417
#define SizeOfXLogRecordDataHeaderShort
Definition xlogrecord.h:217
#define XLR_INFO_MASK
Definition xlogrecord.h:62
#define SizeOfXLogRecord
Definition xlogrecord.h:55
#define XLR_CHECK_CONSISTENCY
Definition xlogrecord.h:91
bool reachedConsistency
bool check_primary_slot_name(char **newval, void **extra, GucSource source)
static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
static XLogRecPtr recoveryStopLSN
static bool recoveryStopsBefore(XLogReaderState *record)
static TimestampTz recoveryStopTime
void assign_recovery_target_xid(const char *newval, void *extra)
static bool CheckForStandbyTrigger(void)
int recovery_min_apply_delay
bool check_recovery_target(char **newval, void **extra, GucSource source)
static bool backupEndRequired
bool HotStandbyActive(void)
static char * getRecoveryStopReason(void)
void ShutdownWalRecovery(void)
RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal
int recoveryTargetAction
static void rm_redo_error_callback(void *arg)
static bool recoveryApplyDelay(XLogReaderState *record)
bool ArchiveRecoveryRequested
const char * recoveryTargetName
static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
bool check_recovery_target_timeline(char **newval, void **extra, GucSource source)
static XLogRecPtr minRecoveryPoint
static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *readBuf)
static XLogRecPtr backupEndPoint
const struct config_enum_entry recovery_target_action_options[]
static void validateRecoveryParameters(void)
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI, TimeLineID replayTLI)
static XLogRecord * ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr, TimeLineID replayTLI)
void StartupRequestWalReceiverRestart(void)
bool InArchiveRecovery
static bool recoveryStopsAfter(XLogReaderState *record)
void RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
char * PrimarySlotName
static TimeLineID curFileTLI
static char recoveryStopName[MAXFNAMELEN]
static void CheckRecoveryConsistency(void)
static bool pendingWalRcvRestart
void PerformWalRecovery(void)
static XLogSource XLogReceiptSource
bool CheckPromoteSignal(void)
struct XLogPageReadPrivate XLogPageReadPrivate
static bool recoveryStopAfter
static const char *const xlogSourceNames[]
static TimeLineID RedoStartTLI
char * recoveryRestoreCommand
static void verifyBackupPageConsistency(XLogReaderState *record)
static int XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
void assign_recovery_target(const char *newval, void *extra)
void SetRecoveryPause(bool recoveryPause)
static bool lastSourceFailed
char * archiveCleanupCommand
XLogRecPtr GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
const ShmemCallbacks XLogRecoveryShmemCallbacks
static TimeLineID receiveTLI
void WakeupRecovery(void)
void xlog_outdesc(StringInfo buf, XLogReaderState *record)
static bool LocalPromoteIsTriggered
bool PromoteIsTriggered(void)
TimestampTz GetCurrentChunkReplayStartTime(void)
static void ConfirmRecoveryPaused(void)
static void readRecoverySignalFile(void)
static XLogRecPtr missingContrecPtr
XLogRecoveryCtlData * XLogRecoveryCtl
static uint32 readOff
static bool standby_signal_file_found
char * recovery_target_time_string
bool StandbyMode
static int readFile
static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, bool fetching_ckpt, XLogRecPtr tliRecPtr, TimeLineID replayTLI, XLogRecPtr replayLSN, bool nonblocking)
XLogRecPtr recoveryTargetLSN
RecoveryTargetType recoveryTarget
static bool read_tablespace_map(List **tablespaces)
static bool doRequestWalReceiverReply
static bool read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, bool *backupEndRequired, bool *backupFromStandby)
static int XLogFileRead(XLogSegNo segno, TimeLineID tli, XLogSource source, bool notfoundOk)
static XLogSource currentSource
XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI)
void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
static List * expectedTLEs
static XLogSegNo readSegNo
void assign_recovery_target_name(const char *newval, void *extra)
static XLogRecPtr abortedRecPtr
static char * primary_image_masked
static TimeLineID minRecoveryPointTLI
static XLogRecord * ReadRecord(XLogPrefetcher *xlogprefetcher, int emode, bool fetching_ckpt, TimeLineID replayTLI)
EndOfWalRecoveryInfo * FinishWalRecovery(void)
void assign_recovery_target_time(const char *newval, void *extra)
static void SetCurrentChunkStartTime(TimestampTz xtime)
static XLogRecPtr CheckPointLoc
bool check_recovery_target_xid(char **newval, void **extra, GucSource source)
static bool LocalHotStandbyActive
static bool HotStandbyActiveInReplay(void)
static bool InRedo
static TransactionId recoveryStopXid
bool check_recovery_target_time(char **newval, void **extra, GucSource source)
static XLogSource readSource
static void SetPromoteIsTriggered(void)
#define RECOVERY_COMMAND_FILE
TransactionId recoveryTargetXid
XLogSource
@ XLOG_FROM_PG_WAL
@ XLOG_FROM_STREAM
@ XLOG_FROM_ARCHIVE
@ XLOG_FROM_ANY
TimeLineID recoveryTargetTLIRequested
static pg_noreturn void error_multiple_recovery_targets(void)
void InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
static void xlog_block_info(StringInfo buf, XLogReaderState *record)
static TimestampTz XLogReceiptTime
static void XLogRecoveryShmemInit(void *arg)
static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
static char * replay_image_masked
bool wal_receiver_create_temp_slot
static void CheckTablespaceDirectory(void)
char * recoveryEndCommand
RecoveryPauseState GetRecoveryPauseState(void)
TimeLineID recoveryTargetTLI
static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
void assign_recovery_target_lsn(const char *newval, void *extra)
bool check_recovery_target_lsn(char **newval, void **extra, GucSource source)
static XLogRecPtr RedoStartLSN
static XLogRecPtr flushedUpto
static void recoveryPausesHere(bool endOfRecovery)
static uint32 readLen
static void EnableStandbyMode(void)
#define RECOVERY_COMMAND_DONE
static bool recovery_signal_file_found
TimestampTz recoveryTargetTime
TimestampTz GetLatestXTime(void)
char * PrimaryConnInfo
void XLogRequestWalReceiverReply(void)
static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
static XLogPrefetcher * xlogprefetcher
static bool StandbyModeRequested
bool check_recovery_target_name(char **newval, void **extra, GucSource source)
bool recoveryTargetInclusive
static XLogReaderState * xlogreader
void RemovePromoteSignalFiles(void)
void assign_recovery_target_timeline(const char *newval, void *extra)
static XLogRecPtr backupStartPoint
static void SetLatestXTime(TimestampTz xtime)
static TimeLineID CheckPointTLI
static void XLogRecoveryShmemRequest(void *arg)
@ RECOVERY_TARGET_ACTION_PAUSE
@ RECOVERY_TARGET_ACTION_PROMOTE
@ RECOVERY_TARGET_ACTION_SHUTDOWN
RecoveryTargetType
@ RECOVERY_TARGET_IMMEDIATE
@ RECOVERY_TARGET_TIME
@ RECOVERY_TARGET_UNSET
@ RECOVERY_TARGET_XID
@ RECOVERY_TARGET_LSN
@ RECOVERY_TARGET_NAME
RecoveryTargetTimeLineGoal
@ RECOVERY_TARGET_TIMELINE_NUMERIC
@ RECOVERY_TARGET_TIMELINE_CONTROLFILE
@ RECOVERY_TARGET_TIMELINE_LATEST
RecoveryPauseState
@ RECOVERY_PAUSED
@ RECOVERY_NOT_PAUSED
@ RECOVERY_PAUSE_REQUESTED
void wal_segment_close(XLogReaderState *state)
Definition xlogutils.c:831
Buffer XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum, BlockNumber blkno, ReadBufferMode mode, Buffer recent_buffer)
Definition xlogutils.c:460
HotStandbyState standbyState
Definition xlogutils.c:53
bool InRecovery
Definition xlogutils.c:50
void XLogCheckInvalidPages(void)
Definition xlogutils.c:234
@ STANDBY_SNAPSHOT_READY
Definition xlogutils.h:55
@ STANDBY_INITIALIZED
Definition xlogutils.h:53
struct WaitLSNState * waitLSNState
Definition xlogwait.c:70
void WaitLSNWakeup(WaitLSNType lsnType, XLogRecPtr currentLSN)
Definition xlogwait.c:320
@ WAIT_LSN_TYPE_STANDBY_REPLAY
Definition xlogwait.h:39