PostgreSQL Source Code git master
xlogrecovery.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * xlogrecovery.c
4 * Functions for WAL recovery, standby mode
5 *
6 * This source file contains functions controlling WAL recovery.
7 * InitWalRecovery() initializes the system for crash or archive recovery,
8 * or standby mode, depending on configuration options and the state of
9 * the control file and possible backup label file. PerformWalRecovery()
10 * performs the actual WAL replay, calling the rmgr-specific redo routines.
11 * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12 * and prepares information needed to initialize the WAL for writes. In
13 * addition to these three main functions, there are a bunch of functions
14 * for interrogating recovery state and controlling the recovery process.
15 *
16 *
17 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
18 * Portions Copyright (c) 1994, Regents of the University of California
19 *
20 * src/backend/access/transam/xlogrecovery.c
21 *
22 *-------------------------------------------------------------------------
23 */
24
25#include "postgres.h"
26
27#include <ctype.h>
28#include <math.h>
29#include <time.h>
30#include <sys/stat.h>
31#include <sys/time.h>
32#include <unistd.h>
33
34#include "access/timeline.h"
35#include "access/transam.h"
36#include "access/xact.h"
38#include "access/xlogarchive.h"
40#include "access/xlogreader.h"
41#include "access/xlogrecovery.h"
42#include "access/xlogutils.h"
43#include "backup/basebackup.h"
44#include "catalog/pg_control.h"
45#include "commands/tablespace.h"
46#include "common/file_utils.h"
47#include "miscadmin.h"
48#include "pgstat.h"
49#include "postmaster/bgwriter.h"
50#include "postmaster/startup.h"
51#include "replication/slot.h"
54#include "storage/fd.h"
55#include "storage/ipc.h"
56#include "storage/latch.h"
57#include "storage/pmsignal.h"
58#include "storage/procarray.h"
59#include "storage/spin.h"
60#include "utils/datetime.h"
61#include "utils/fmgrprotos.h"
62#include "utils/guc_hooks.h"
63#include "utils/pg_lsn.h"
64#include "utils/ps_status.h"
65#include "utils/pg_rusage.h"
66
67/* Unsupported old recovery command file names (relative to $PGDATA) */
68#define RECOVERY_COMMAND_FILE "recovery.conf"
69#define RECOVERY_COMMAND_DONE "recovery.done"
70
71/*
72 * GUC support
73 */
75 {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
76 {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
77 {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
78 {NULL, 0, false}
79};
80
81/* options formerly taken from recovery.conf for archive recovery */
83char *recoveryEndCommand = NULL;
94
95/* options formerly taken from recovery.conf for XLOG streaming */
96char *PrimaryConnInfo = NULL;
97char *PrimarySlotName = NULL;
99
100/*
101 * recoveryTargetTimeLineGoal: what the user requested, if any
102 *
103 * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
104 *
105 * recoveryTargetTLI: the currently understood target timeline; it can change
106 *
107 * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
108 * the timelines of its known parents, newest first (so recoveryTargetTLI is
109 * always the first list member). Only these TLIs are expected to be seen in
110 * the WAL segments we read, and indeed only these TLIs will be considered as
111 * candidate WAL files to open at all.
112 *
113 * curFileTLI: the TLI appearing in the name of the current input WAL file.
114 * (This is not necessarily the same as the timeline from which we are
115 * replaying WAL, which StartupXLOG calls replayTLI, because we could be
116 * scanning data that was copied from an ancestor timeline when the current
117 * file was created.) During a sequential scan we do not allow this value
118 * to decrease.
119 */
125
126/*
127 * When ArchiveRecoveryRequested is set, archive recovery was requested,
128 * i.e. signal files were present. When InArchiveRecovery is set, we are
129 * currently recovering using offline XLOG archives. These variables are only
130 * valid in the startup process.
131 *
132 * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
133 * currently performing crash recovery using only XLOG files in pg_wal, but
134 * will switch to using offline XLOG archives as soon as we reach the end of
135 * WAL in pg_wal.
136 */
138bool InArchiveRecovery = false;
139
140/*
141 * When StandbyModeRequested is set, standby mode was requested, i.e.
142 * standby.signal file was present. When StandbyMode is set, we are currently
143 * in standby mode. These variables are only valid in the startup process.
144 * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
145 */
146static bool StandbyModeRequested = false;
147bool StandbyMode = false;
148
149/* was a signal file present at startup? */
150static bool standby_signal_file_found = false;
151static bool recovery_signal_file_found = false;
152
153/*
154 * CheckPointLoc is the position of the checkpoint record that determines
155 * where to start the replay. It comes from the backup label file or the
156 * control file.
157 *
158 * RedoStartLSN is the checkpoint's REDO location, also from the backup label
159 * file or the control file. In standby mode, XLOG streaming usually starts
160 * from the position where an invalid record was found. But if we fail to
161 * read even the initial checkpoint record, we use the REDO location instead
162 * of the checkpoint location as the start position of XLOG streaming.
163 * Otherwise we would have to jump backwards to the REDO location after
164 * reading the checkpoint record, because the REDO record can precede the
165 * checkpoint record.
166 */
171
172/*
173 * Local copy of SharedHotStandbyActive variable. False actually means "not
174 * known, need to check the shared state".
175 */
176static bool LocalHotStandbyActive = false;
177
178/*
179 * Local copy of SharedPromoteIsTriggered variable. False actually means "not
180 * known, need to check the shared state".
181 */
182static bool LocalPromoteIsTriggered = false;
183
184/* Has the recovery code requested a walreceiver wakeup? */
186
187/* XLogReader object used to parse the WAL records */
189
190/* XLogPrefetcher object used to consume WAL records with read-ahead */
192
193/* Parameters passed down from ReadRecord to the XLogPageRead callback. */
195{
196 int emode;
197 bool fetching_ckpt; /* are we fetching a checkpoint record? */
201
202/* flag to tell XLogPageRead that we have started replaying */
203static bool InRedo = false;
204
205/*
206 * Codes indicating where we got a WAL file from during recovery, or where
207 * to attempt to get one. Declared in the same order as xlogSourceNames[].
208 */
209typedef enum
210{
211 XLOG_FROM_ANY = 0, /* request to read WAL from any source */
212 XLOG_FROM_ARCHIVE, /* restored using restore_command */
213 XLOG_FROM_PG_WAL, /* existing file in pg_wal */
214 XLOG_FROM_STREAM, /* streamed from primary */
215} XLogSource;
216
217/* human-readable names for XLogSources, for debugging output; listed in enum declaration order */
218static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
219
220/*
221 * readFile is -1 or a kernel FD for the log file segment that's currently
222 * open for reading. readSegNo identifies the segment. readOff is the offset
223 * of the page just read, readLen indicates how much of it has been read into
224 * readBuf, and readSource indicates where we got the currently open file from.
225 *
226 * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
227 * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
228 * worthwhile, since the XLOG is not read by general-purpose sessions.
229 */
230static int readFile = -1;
232static uint32 readOff = 0;
233static uint32 readLen = 0;
235
236/*
237 * Keeps track of which source we're currently reading from. This is
238 * different from readSource in that this is always set, even when we don't
239 * currently have a WAL file open. If lastSourceFailed is set, our last
240 * attempt to read from currentSource failed, and we should try another source
241 * next.
242 *
243 * pendingWalRcvRestart is set when a config change occurs that requires a
244 * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
245 */
247static bool lastSourceFailed = false;
248static bool pendingWalRcvRestart = false;
249
250/*
251 * These variables track when we last obtained some WAL data to process,
252 * and where we got it from. (XLogReceiptSource is initially the same as
253 * readSource, but readSource gets reset to zero when we don't have data
254 * to process right now. It is also different from currentSource, which
255 * also changes when we try to read from a source and fail, while
256 * XLogReceiptSource tracks where we last successfully read some WAL.)
257 */
260
261/* Local copy of WalRcv->flushedUpto */
264
265/*
266 * Copy of minRecoveryPoint and backupEndPoint from the control file.
267 *
268 * In order to reach consistency, we must replay the WAL up to
269 * minRecoveryPoint. If backupEndRequired is true, we must also reach
270 * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
271 * to backupStartPoint.
272 *
273 * Note: In archive recovery, after consistency has been reached, the
274 * functions in xlog.c will start updating minRecoveryPoint in the control
275 * file. But this copy of minRecoveryPoint variable reflects the value at the
276 * beginning of recovery, and is *not* updated after consistency is reached.
277 */
280
283static bool backupEndRequired = false;
284
285/*
286 * Have we reached a consistent database state? In crash recovery, we have
287 * to replay all the WAL, so reachedConsistency is never set. During archive
288 * recovery, the database is consistent once minRecoveryPoint is reached.
289 *
290 * Consistent state means that the system is internally consistent, all
291 * the WAL has been replayed up to a certain point, and importantly, there
292 * is no trace of later actions on disk.
293 */
295
296/* Buffers dedicated to consistency checks of size BLCKSZ */
297static char *replay_image_masked = NULL;
298static char *primary_image_masked = NULL;
299
300
301/*
302 * Shared-memory state for WAL recovery.
303 */
305{
306 /*
307 * SharedHotStandbyActive indicates if we allow hot standby queries to be
308 * run. Protected by info_lck.
309 */
311
312 /*
313 * SharedPromoteIsTriggered indicates if a standby promotion has been
314 * triggered. Protected by info_lck.
315 */
317
318 /*
319 * recoveryWakeupLatch is used to wake up the startup process to continue
320 * WAL replay, if it is waiting for WAL to arrive or promotion to be
321 * requested.
322 *
323 * Note that the startup process also uses another latch, its procLatch,
324 * to wait for recovery conflict. We could get rid of recoveryWakeupLatch for
325 * signaling the startup process in favor of using its procLatch, which
326 * would comport better with possible generic signal handlers using that latch.
327 * But we should not do that, because the startup process doesn't expect
328 * to be woken up by the walreceiver process or the SIGHUP signal handler
329 * while it's waiting for recovery conflict. The separate latches,
330 * recoveryWakeupLatch and procLatch, should be used for inter-process
331 * communication for WAL replay and recovery conflict, respectively.
332 */
334
335 /*
336 * Last record successfully replayed.
337 */
338 XLogRecPtr lastReplayedReadRecPtr; /* start position */
339 XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */
341
342 /*
343 * When we're currently replaying a record, i.e. in a redo function,
344 * replayEndRecPtr points to the end+1 of the record being replayed,
345 * otherwise it's equal to lastReplayedEndRecPtr.
346 */
349 /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
351
352 /*
353 * timestamp of when we started replaying the current chunk of WAL data,
354 * only relevant for replication or archive recovery
355 */
357 /* Recovery pause state */
360
361 slock_t info_lck; /* locks shared variables shown above */
363
365
366/*
367 * abortedRecPtr is the start pointer of a broken record at end of WAL when
368 * recovery completes; missingContrecPtr is the location of the first
369 * contrecord that went missing. See CreateOverwriteContrecordRecord for
370 * details.
371 */
374
375/*
376 * if recoveryStopsBefore/After returns true, it saves information of the stop
377 * point here
378 */
384
385/* prototypes for local functions */
386static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
387
388static void EnableStandbyMode(void);
389static void readRecoverySignalFile(void);
390static void validateRecoveryParameters(void);
391static bool read_backup_label(XLogRecPtr *checkPointLoc,
392 TimeLineID *backupLabelTLI,
393 bool *backupEndRequired, bool *backupFromStandby);
394static bool read_tablespace_map(List **tablespaces);
395
396static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
397static void CheckRecoveryConsistency(void);
398static void rm_redo_error_callback(void *arg);
399#ifdef WAL_DEBUG
400static void xlog_outrec(StringInfo buf, XLogReaderState *record);
401#endif
402static void xlog_block_info(StringInfo buf, XLogReaderState *record);
403static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
404 TimeLineID prevTLI, TimeLineID replayTLI);
405static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
407
408static bool recoveryStopsBefore(XLogReaderState *record);
409static bool recoveryStopsAfter(XLogReaderState *record);
410static char *getRecoveryStopReason(void);
411static void recoveryPausesHere(bool endOfRecovery);
412static bool recoveryApplyDelay(XLogReaderState *record);
413static void ConfirmRecoveryPaused(void);
414
416 int emode, bool fetching_ckpt,
417 TimeLineID replayTLI);
418
419static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
420 int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
422 bool randAccess,
423 bool fetching_ckpt,
424 XLogRecPtr tliRecPtr,
425 TimeLineID replayTLI,
426 XLogRecPtr replayLSN,
427 bool nonblocking);
428static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
430 XLogRecPtr RecPtr, TimeLineID replayTLI);
431static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
432static int XLogFileRead(XLogSegNo segno, TimeLineID tli,
433 XLogSource source, bool notfoundOk);
435
436static bool CheckForStandbyTrigger(void);
437static void SetPromoteIsTriggered(void);
438static bool HotStandbyActiveInReplay(void);
439
440static void SetCurrentChunkStartTime(TimestampTz xtime);
441static void SetLatestXTime(TimestampTz xtime);
442
443/*
444 * Initialization of shared memory for WAL recovery
445 */
446Size
448{
449 Size size;
450
451 /* XLogRecoveryCtl */
452 size = sizeof(XLogRecoveryCtlData);
453
454 return size;
455}
456
457void
459{
460 bool found;
461
463 ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
464 if (found)
465 return;
466 memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
467
471}
472
473/*
474 * A thin wrapper to enable StandbyMode and do other preparatory work as
475 * needed.
476 */
477static void
479{
480 StandbyMode = true;
481
482 /*
483 * To avoid server log bloat, we don't report recovery progress in a
484 * standby as it will always be in recovery unless promoted. We disable
485 * startup progress timeout in standby mode to avoid calling
486 * startup_progress_timeout_handler() unnecessarily.
487 */
489}
490
491/*
492 * Prepare the system for WAL recovery, if needed.
493 *
494 * This is called by StartupXLOG() which coordinates the server startup
495 * sequence. This function analyzes the control file and the backup label
496 * file, if any, and figures out whether we need to perform crash recovery or
497 * archive recovery, and how far we need to replay the WAL to reach a
498 * consistent state.
499 *
500 * This doesn't yet change the on-disk state, except for creating the symlinks
501 * from table space map file if any, and for fetching WAL files needed to find
502 * the checkpoint record. On entry, the caller has already read the control
503 * file into memory, and passes it as argument. This function updates it to
504 * reflect the recovery state, and the caller is expected to write it back to
505 * disk after initializing other subsystems, but before calling
506 * PerformWalRecovery().
507 *
508 * This initializes some global variables, such as ArchiveRecoveryRequested,
509 * StandbyModeRequested and InRecovery.
510 */
511void
513 bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
514{
515 XLogPageReadPrivate *private;
516 struct stat st;
517 bool wasShutdown;
518 XLogRecord *record;
519 DBState dbstate_at_startup;
520 bool haveTblspcMap = false;
521 bool haveBackupLabel = false;
522 CheckPoint checkPoint;
523 bool backupFromStandby = false;
524
525 dbstate_at_startup = ControlFile->state;
526
527 /*
528 * Initialize on the assumption we want to recover to the latest timeline
529 * that's active according to pg_control.
530 */
534 else
536
537 /*
538 * Check for signal files, and if so set up state for offline recovery
539 */
542
543 /*
544 * Take ownership of the wakeup latch if we're going to sleep during
545 * recovery, if required.
546 */
549
550 /*
551 * Set the WAL reading processor now, as it will be needed when reading
552 * the checkpoint record required (backup_label or not).
553 */
554 private = palloc0(sizeof(XLogPageReadPrivate));
555 xlogreader =
557 XL_ROUTINE(.page_read = &XLogPageRead,
558 .segment_open = NULL,
559 .segment_close = wal_segment_close),
560 private);
561 if (!xlogreader)
563 (errcode(ERRCODE_OUT_OF_MEMORY),
564 errmsg("out of memory"),
565 errdetail("Failed while allocating a WAL reading processor.")));
567
568 /*
569 * Set the WAL decode buffer size. This limits how far ahead we can read
570 * in the WAL.
571 */
573
574 /* Create a WAL prefetcher. */
576
577 /*
578 * Allocate two page buffers dedicated to WAL consistency checks. We do
579 * it this way, rather than just making static arrays, for two reasons:
580 * (1) no need to waste the storage in most instantiations of the backend;
581 * (2) a static char array isn't guaranteed to have any particular
582 * alignment, whereas palloc() will provide MAXALIGN'd storage.
583 */
584 replay_image_masked = (char *) palloc(BLCKSZ);
585 primary_image_masked = (char *) palloc(BLCKSZ);
586
587 /*
588 * Read the backup_label file. We want to run this part of the recovery
589 * process after checking for signal files and after performing validation
590 * of the recovery parameters.
591 */
593 &backupFromStandby))
594 {
595 List *tablespaces = NIL;
596
597 /*
598 * Archive recovery was requested, and thanks to the backup label
599 * file, we know how far we need to replay to reach consistency. Enter
600 * archive recovery directly.
601 */
602 InArchiveRecovery = true;
605
606 /*
607 * Omitting backup_label when creating a new replica, PITR node etc.
608 * unfortunately is a common cause of corruption. Logging that
609 * backup_label was used makes it a bit easier to exclude that as the
610 * cause of observed corruption.
611 *
612 * Do so before we try to read the checkpoint record (which can fail),
613 * as otherwise it can be hard to understand why a checkpoint other
614 * than ControlFile->checkPoint is used.
615 */
616 ereport(LOG,
617 (errmsg("starting backup recovery with redo LSN %X/%X, checkpoint LSN %X/%X, on timeline ID %u",
620 CheckPointTLI)));
621
622 /*
623 * When a backup_label file is present, we want to roll forward from
624 * the checkpoint it identifies, rather than using pg_control.
625 */
628 if (record != NULL)
629 {
630 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
631 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
633 (errmsg_internal("checkpoint record is at %X/%X",
635 InRecovery = true; /* force recovery even if SHUTDOWNED */
636
637 /*
638 * Make sure that REDO location exists. This may not be the case
639 * if there was a crash during an online backup, which left a
640 * backup_label around that references a WAL segment that's
641 * already been archived.
642 */
643 if (checkPoint.redo < CheckPointLoc)
644 {
646 if (!ReadRecord(xlogprefetcher, LOG, false,
647 checkPoint.ThisTimeLineID))
649 (errmsg("could not find redo location %X/%X referenced by checkpoint record at %X/%X",
651 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
652 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
653 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
655 }
656 }
657 else
658 {
660 (errmsg("could not locate required checkpoint record at %X/%X",
662 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
663 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
664 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
666 wasShutdown = false; /* keep compiler quiet */
667 }
668
669 /* Read the tablespace_map file if present and create symlinks. */
670 if (read_tablespace_map(&tablespaces))
671 {
672 ListCell *lc;
673
674 foreach(lc, tablespaces)
675 {
676 tablespaceinfo *ti = lfirst(lc);
677 char *linkloc;
678
679 linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
680
681 /*
682 * Remove the existing symlink, if any, and create the symlink
683 * under PGDATA.
684 */
686
687 if (symlink(ti->path, linkloc) < 0)
690 errmsg("could not create symbolic link \"%s\": %m",
691 linkloc)));
692
693 pfree(ti->path);
694 pfree(ti);
695 }
696
697 /* tell the caller to delete it later */
698 haveTblspcMap = true;
699 }
700
701 /* tell the caller to delete it later */
702 haveBackupLabel = true;
703 }
704 else
705 {
706 /* No backup_label file has been found if we are here. */
707
708 /*
709 * If a tablespace_map file is present without a backup_label file, it
710 * serves no purpose. There is no harm in retaining it, but it
711 * is better to get rid of the map file so that we don't have any
712 * redundant file in data directory and it will avoid any sort of
713 * confusion. It seems prudent though to just rename the file out of
714 * the way rather than delete it completely, also we ignore any error
715 * that occurs in rename operation as even if map file is present
716 * without backup_label file, it is harmless.
717 */
718 if (stat(TABLESPACE_MAP, &st) == 0)
719 {
720 unlink(TABLESPACE_MAP_OLD);
722 ereport(LOG,
723 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
725 errdetail("File \"%s\" was renamed to \"%s\".",
727 else
728 ereport(LOG,
729 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
731 errdetail("Could not rename file \"%s\" to \"%s\": %m.",
733 }
734
735 /*
736 * It's possible that archive recovery was requested, but we don't
737 * know how far we need to replay the WAL before we reach consistency.
738 * This can happen for example if a base backup is taken from a
739 * running server using an atomic filesystem snapshot, without calling
740 * pg_backup_start/stop. Or if you just kill a running primary server
741 * and put it into archive recovery by creating a recovery signal
742 * file.
743 *
744 * Our strategy in that case is to perform crash recovery first,
745 * replaying all the WAL present in pg_wal, and only enter archive
746 * recovery after that.
747 *
748 * But usually we already know how far we need to replay the WAL (up
749 * to minRecoveryPoint, up to backupEndPoint, or until we see an
750 * end-of-backup record), and we can enter archive recovery directly.
751 */
757 {
758 InArchiveRecovery = true;
761 }
762
763 /*
764 * For the same reason as when starting up with backup_label present,
765 * emit a log message when we continue initializing from a base
766 * backup.
767 */
769 ereport(LOG,
770 (errmsg("restarting backup recovery with redo LSN %X/%X",
772
773 /* Get the last valid checkpoint record. */
780 if (record != NULL)
781 {
783 (errmsg_internal("checkpoint record is at %X/%X",
785 }
786 else
787 {
788 /*
789 * We used to attempt to go back to a secondary checkpoint record
790 * here, but only when not in standby mode. We now just fail if we
791 * can't read the last checkpoint because this allows us to
792 * simplify processing around checkpoints.
793 */
795 (errmsg("could not locate a valid checkpoint record at %X/%X",
797 }
798 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
799 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
800 }
801
803 {
805 ereport(LOG,
806 (errmsg("entering standby mode")));
808 ereport(LOG,
809 (errmsg("starting point-in-time recovery to XID %u",
812 ereport(LOG,
813 (errmsg("starting point-in-time recovery to %s",
816 ereport(LOG,
817 (errmsg("starting point-in-time recovery to \"%s\"",
820 ereport(LOG,
821 (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
824 ereport(LOG,
825 (errmsg("starting point-in-time recovery to earliest consistent point")));
826 else
827 ereport(LOG,
828 (errmsg("starting archive recovery")));
829 }
830
831 /*
832 * If the location of the checkpoint record is not on the expected
833 * timeline in the history of the requested timeline, we cannot proceed:
834 * the backup is not part of the history of the requested timeline.
835 */
836 Assert(expectedTLEs); /* was initialized by reading checkpoint
837 * record */
840 {
841 XLogRecPtr switchpoint;
842
843 /*
844 * tliSwitchPoint will throw an error if the checkpoint's timeline is
845 * not in expectedTLEs at all.
846 */
849 (errmsg("requested timeline %u is not a child of this server's history",
851 errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
854 LSN_FORMAT_ARGS(switchpoint))));
855 }
856
857 /*
858 * The min recovery point should be part of the requested timeline's
859 * history, too.
860 */
865 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
869
871 (errmsg_internal("redo record is at %X/%X; shutdown %s",
872 LSN_FORMAT_ARGS(checkPoint.redo),
873 wasShutdown ? "true" : "false")));
875 (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
877 checkPoint.nextOid)));
879 (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
880 checkPoint.nextMulti, checkPoint.nextMultiOffset)));
882 (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
883 checkPoint.oldestXid, checkPoint.oldestXidDB)));
885 (errmsg_internal("oldest MultiXactId: %u, in database %u",
886 checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
888 (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
889 checkPoint.oldestCommitTsXid,
890 checkPoint.newestCommitTsXid)));
893 (errmsg("invalid next transaction ID")));
894
895 /* sanity check */
896 if (checkPoint.redo > CheckPointLoc)
898 (errmsg("invalid redo in checkpoint record")));
899
900 /*
901 * Check whether we need to force recovery from WAL. If it appears to
902 * have been a clean shutdown and we did not have a recovery signal file,
903 * then assume no recovery needed.
904 */
905 if (checkPoint.redo < CheckPointLoc)
906 {
907 if (wasShutdown)
909 (errmsg("invalid redo record in shutdown checkpoint")));
910 InRecovery = true;
911 }
912 else if (ControlFile->state != DB_SHUTDOWNED)
913 InRecovery = true;
915 {
916 /* force recovery due to presence of recovery signal file */
917 InRecovery = true;
918 }
919
920 /*
921 * If recovery is needed, update our in-memory copy of pg_control to show
922 * that we are recovering and to show the selected checkpoint as the place
923 * we are starting from. We also mark pg_control with any minimum recovery
924 * stop point obtained from a backup history file.
925 *
926 * We don't write the changes to disk yet, though. Only do that after
927 * initializing various subsystems.
928 */
929 if (InRecovery)
930 {
932 {
934 }
935 else
936 {
937 ereport(LOG,
938 (errmsg("database system was not properly shut down; "
939 "automatic recovery in progress")));
941 ereport(LOG,
942 (errmsg("crash recovery starts in timeline %u "
943 "and has target timeline %u",
947 }
949 ControlFile->checkPointCopy = checkPoint;
951 {
952 /* initialize minRecoveryPoint if not set yet */
953 if (ControlFile->minRecoveryPoint < checkPoint.redo)
954 {
955 ControlFile->minRecoveryPoint = checkPoint.redo;
957 }
958 }
959
960 /*
961 * Set backupStartPoint if we're starting recovery from a base backup.
962 *
963 * Also set backupEndPoint and use minRecoveryPoint as the backup end
964 * location if we're starting recovery from a base backup which was
965 * taken from a standby. In this case, the database system status in
966 * pg_control must indicate that the database was already in recovery.
967 * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
968 * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
969 * before reaching this point; e.g. because restore_command or
970 * primary_conninfo were faulty.
971 *
972 * Any other state indicates that the backup somehow became corrupted
973 * and we can't sensibly continue with recovery.
974 */
975 if (haveBackupLabel)
976 {
977 ControlFile->backupStartPoint = checkPoint.redo;
979
980 if (backupFromStandby)
981 {
982 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
983 dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
985 (errmsg("backup_label contains data inconsistent with control file"),
986 errhint("This means that the backup is corrupted and you will "
987 "have to use another backup for recovery.")));
989 }
990 }
991 }
992
993 /* remember these, so that we know when we have reached consistency */
998 {
1001 }
1002 else
1003 {
1006 }
1007
1008 /*
1009 * Start recovery assuming that the final record isn't lost.
1010 */
1013
1014 *wasShutdown_ptr = wasShutdown;
1015 *haveBackupLabel_ptr = haveBackupLabel;
1016 *haveTblspcMap_ptr = haveTblspcMap;
1017}
1018
1019/*
1020 * See if there are any recovery signal files and if so, set state for
1021 * recovery.
1022 *
1023 * See if there is a recovery command file (recovery.conf), and if so
1024 * throw a FATAL error, since as of PG12 we no longer recognize that.
1025 */
1026static void
1028{
1029 struct stat stat_buf;
1030
1032 return;
1033
1034 /*
1035 * Check for old recovery API file: recovery.conf
1036 */
1037 if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
1038 ereport(FATAL,
1040 errmsg("using recovery command file \"%s\" is not supported",
1042
1043 /*
1044 * Remove unused .done file, if present. Ignore if absent.
1045 */
1046 unlink(RECOVERY_COMMAND_DONE);
1047
1048 /*
1049 * Check for recovery signal files and if found, fsync them since they
1050 * represent server state information. We don't sweat too much about the
1051 * possibility of fsync failure, however.
1052 *
1053 * If present, standby signal file takes precedence. If neither is present
1054 * then we won't enter archive recovery.
1055 */
1056 if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1057 {
1058 int fd;
1059
1061 S_IRUSR | S_IWUSR);
1062 if (fd >= 0)
1063 {
1064 (void) pg_fsync(fd);
1065 close(fd);
1066 }
1068 }
1069 else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1070 {
1071 int fd;
1072
1074 S_IRUSR | S_IWUSR);
1075 if (fd >= 0)
1076 {
1077 (void) pg_fsync(fd);
1078 close(fd);
1079 }
1081 }
1082
1083 StandbyModeRequested = false;
1086 {
1087 StandbyModeRequested = true;
1089 }
1091 {
1092 StandbyModeRequested = false;
1094 }
1095 else
1096 return;
1097
1098 /*
1099 * We don't support standby mode in standalone backends; that requires
1100 * other processes such as the WAL receiver to be alive.
1101 */
1103 ereport(FATAL,
1104 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1105 errmsg("standby mode is not supported by single-user servers")));
1106}
1107
1108static void
1110{
1112 return;
1113
1114 /*
1115 * Check for compulsory parameters
1116 */
1118 {
1119 if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1120 (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1122 (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1123 errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1124 }
1125 else
1126 {
1127 if (recoveryRestoreCommand == NULL ||
1128 strcmp(recoveryRestoreCommand, "") == 0)
1129 ereport(FATAL,
1130 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1131 errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1132 }
1133
1134 /*
1135 * Override any inconsistent requests. Note that this is a change of
1136 * behaviour in 9.5; prior to this we simply ignored a request to pause if
1137 * hot_standby = off, which was surprising behaviour.
1138 */
1142
1143 /*
1144 * Final parsing of recovery_target_time string; see also
1145 * check_recovery_target_time().
1146 */
1148 {
1152 Int32GetDatum(-1)));
1153 }
1154
1155 /*
1156 * If user specified recovery_target_timeline, validate it or compute the
1157 * "latest" value. We can't do this until after we've gotten the restore
1158 * command and set InArchiveRecovery, because we need to fetch timeline
1159 * history files from the archive.
1160 */
1162 {
1164
1165 /* Timeline 1 does not have a history file, all else should */
1166 if (rtli != 1 && !existsTimeLineHistory(rtli))
1167 ereport(FATAL,
1168 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1169 errmsg("recovery target timeline %u does not exist",
1170 rtli)));
1171 recoveryTargetTLI = rtli;
1172 }
1174 {
1175 /* We start the "latest" search from pg_control's timeline */
1177 }
1178 else
1179 {
1180 /*
1181 * else we just use the recoveryTargetTLI as already read from
1182 * ControlFile
1183 */
1185 }
1186}
1187
1188/*
1189 * read_backup_label: check to see if a backup_label file is present
1190 *
1191 * If we see a backup_label during recovery, we assume that we are recovering
1192 * from a backup dump file, and we therefore roll forward from the checkpoint
1193 * identified by the label file, NOT what pg_control says. This avoids the
1194 * problem that pg_control might have been archived one or more checkpoints
1195 * later than the start of the dump, and so if we rely on it as the start
1196 * point, we will fail to restore a consistent database state.
1197 *
1198 * Returns true if a backup_label was found (and fills the checkpoint
1199 * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1200 * returns false if not. If this backup_label came from a streamed backup,
1201 * *backupEndRequired is set to true. If this backup_label was created during
1202 * recovery, *backupFromStandby is set to true.
1203 *
1204 * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1205 * and TLI read from the backup file.
1206 */
1207static bool
1208read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1209 bool *backupEndRequired, bool *backupFromStandby)
1210{
	/*
	 * The file is parsed strictly sequentially: each fscanf() below tries to
	 * consume one expected line, in the order the fields are checked here.
	 * Only the first two lines (START WAL LOCATION and CHECKPOINT LOCATION)
	 * are mandatory; the remaining fields are acted on only when they match.
	 *
	 * 'ch' receives the character that follows each mandatory line so that
	 * it can be verified to be a newline.  'startxlogfilename' is parsed
	 * only as part of matching the first line; it is not used afterwards in
	 * this function.
	 */
1211 char startxlogfilename[MAXFNAMELEN];
1212 TimeLineID tli_from_walseg,
1213 tli_from_file;
1214 FILE *lfp;
1215 char ch;
1216 char backuptype[20];
1217 char backupfrom[20];
1218 char backuplabel[MAXPGPATH];
1219 char backuptime[128];
1220 uint32 hi,
1221 lo;
1222
1223 /*
	 * suppress possible uninitialized-variable warnings; these are also the
	 * values the caller sees when we return false below
	 */
1224 *checkPointLoc = InvalidXLogRecPtr;
1225 *backupLabelTLI = 0;
1226 *backupEndRequired = false;
1227 *backupFromStandby = false;
1228
1229 /*
1230 * See if label file is present
1231 */
1232 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1233 if (!lfp)
1234 {
	/* Only a missing file is tolerated; any other open failure is FATAL. */
1235 if (errno != ENOENT)
1236 ereport(FATAL,
1238 errmsg("could not read file \"%s\": %m",
1240 return false; /* it's not there, all is fine */
1241 }
1242
1243 /*
1244 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1245 * is pretty crude, but we are not expecting any variability in the file
1246 * format).
1247 */
	/*
	 * The first eight hex digits of the WAL file name are read as the
	 * timeline ID (%08X); the rest of the name goes to startxlogfilename
	 * (%16s).  All five conversions must succeed and the line must end
	 * right after the closing parenthesis.
	 */
1248 if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
1249 &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1250 ereport(FATAL,
1251 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1252 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	/* Combine the two 32-bit halves of the LSN into a 64-bit value. */
1253 RedoStartLSN = ((uint64) hi) << 32 | lo;
1254 RedoStartTLI = tli_from_walseg;
1255 if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
1256 &hi, &lo, &ch) != 3 || ch != '\n')
1257 ereport(FATAL,
1258 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1259 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1260 *checkPointLoc = ((uint64) hi) << 32 | lo;
	/*
	 * The CHECKPOINT LOCATION line carries no timeline, so the TLI handed
	 * back to the caller is the one parsed from the WAL file name above.
	 */
1261 *backupLabelTLI = tli_from_walseg;
1262
1263 /*
1264 * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1265 * which could mean either pg_basebackup or the pg_backup_start/stop
1266 * method was used) or if this label came from somewhere else (the only
1267 * other option today being from pg_rewind). If this was a streamed
1268 * backup then we know that we need to play through until we get to the
1269 * end of the WAL which was generated during the backup (at which point we
1270 * will have reached consistency and backupEndRequired will be reset to be
1271 * false).
1272 */
	/* %19s leaves room for the terminator in the 20-byte buffer. */
1273 if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1274 {
1275 if (strcmp(backuptype, "streamed") == 0)
1276 *backupEndRequired = true;
1277 }
1278
1279 /*
1280 * BACKUP FROM lets us know if this was from a primary or a standby. If
1281 * it was from a standby, we'll double-check that the control file state
1282 * matches that of a standby.
1283 */
1284 if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1285 {
1286 if (strcmp(backupfrom, "standby") == 0)
1287 *backupFromStandby = true;
1288 }
1289
1290 /*
1291 * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1292 * but checking for their presence is useful for debugging and the next
1293 * sanity checks. The result buffers have a fixed size, so if the
1294 * backup_label file was generated with strings longer than the maximum
1295 * assumed here, they are parsed incorrectly. That's fine, as only minor
1296 * consistency checks are done afterwards.
1297 * (The LABEL width of 1023 assumes MAXPGPATH >= 1024 -- TODO confirm.)
1298 */
1299 if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1301 (errmsg_internal("backup time %s in file \"%s\"",
1302 backuptime, BACKUP_LABEL_FILE)));
1303
1304 if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1306 (errmsg_internal("backup label %s in file \"%s\"",
1307 backuplabel, BACKUP_LABEL_FILE)));
1308
1309 /*
1310 * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1311 * it as a sanity check if present.
1312 */
1313 if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1314 {
	/* Must agree with the timeline embedded in the WAL file name. */
1315 if (tli_from_walseg != tli_from_file)
1316 ereport(FATAL,
1317 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1318 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1319 errdetail("Timeline ID parsed is %u, but expected %u.",
1320 tli_from_file, tli_from_walseg)));
1321
1323 (errmsg_internal("backup timeline %u in file \"%s\"",
1324 tli_from_file, BACKUP_LABEL_FILE)));
1325 }
1326
	/*
	 * Any successful conversion is enough to recognize the INCREMENTAL FROM
	 * LSN line; the parsed values themselves are not used.  Its presence
	 * means this is an incremental backup, which cannot be recovered from
	 * directly.
	 */
1327 if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%X\n", &hi, &lo) > 0)
1328 ereport(FATAL,
1329 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1330 errmsg("this is an incremental backup, not a data directory"),
1331 errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1332
	/*
	 * Report a read error noticed on the stream, or a failure to close it.
	 * FreeFile() is only called when ferror() reports no error.
	 */
1333 if (ferror(lfp) || FreeFile(lfp))
1334 ereport(FATAL,
1336 errmsg("could not read file \"%s\": %m",
1338
1339 return true;
1340}
1341
1342/*
1343 * read_tablespace_map: check to see if a tablespace_map file is present
1344 *
1345 * If we see a tablespace_map file during recovery, we assume that we are
1346 * recovering from a backup dump file, and we therefore need to create symlinks
1347 * as per the information present in tablespace_map file.
1348 *
1349 * Returns true if a tablespace_map file was found (and fills *tablespaces
1350 * with a tablespaceinfo struct for each tablespace listed in the file);
1351 * returns false if not.
1352 */
1353static bool
1355{
	/*
	 * The file is read one character at a time.  'str' accumulates the
	 * de-escaped text of the current line, 'i' is the number of characters
	 * stored in it so far, and 'was_backslash' is true when the previous
	 * character was an unescaped backslash.
	 */
1356 tablespaceinfo *ti;
1357 FILE *lfp;
1358 char str[MAXPGPATH];
1359 int ch,
1360 i,
1361 n;
1362 bool was_backslash;
1363
1364 /*
1365 * See if tablespace_map file is present
1366 */
1367 lfp = AllocateFile(TABLESPACE_MAP, "r");
1368 if (!lfp)
1369 {
	/* Only a missing file is tolerated; any other open failure is FATAL. */
1370 if (errno != ENOENT)
1371 ereport(FATAL,
1373 errmsg("could not read file \"%s\": %m",
1374 TABLESPACE_MAP)));
1375 return false; /* it's not there, all is fine */
1376 }
1377
1378 /*
1379 * Read and parse the link name and path lines from tablespace_map file
1380 * (this code is pretty crude, but we are not expecting any variability in
1381 * the file format). De-escape any backslashes that were inserted.
1382 */
1383 i = 0;
1384 was_backslash = false;
1385 while ((ch = fgetc(lfp)) != EOF)
1386 {
	/* An unescaped \n or \r ends the current line. */
1387 if (!was_backslash && (ch == '\n' || ch == '\r'))
1388 {
1389 char *endp;
1390
1391 if (i == 0)
1392 continue; /* \r immediately followed by \n */
1393
1394 /*
1395 * The de-escaped line should contain an OID followed by exactly
1396 * one space followed by a path. The path might start with
1397 * spaces, so don't be too liberal about parsing.
1398 */
1399 str[i] = '\0';
	/* Find the first space; n ends up equal to i if there is none. */
1400 n = 0;
1401 while (str[n] && str[n] != ' ')
1402 n++;
	/*
	 * Reject an empty OID field (n < 1), and a missing or empty path (no
	 * space found, or the space is the last character of the line).
	 */
1403 if (n < 1 || n >= i - 1)
1404 ereport(FATAL,
1405 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1406 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
	/* Split in place: str is now the OID text, str + n the path. */
1407 str[n++] = '\0';
1408
1409 ti = palloc0(sizeof(tablespaceinfo));
	/* Clear errno so that a conversion failure can be told apart. */
1410 errno = 0;
1411 ti->oid = strtoul(str, &endp, 10);
	/* The whole OID field must have been consumed by the conversion. */
1412 if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1413 ereport(FATAL,
1414 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1415 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1416 ti->path = pstrdup(str + n);
1417 *tablespaces = lappend(*tablespaces, ti);
1418
	/* Start accumulating the next line. */
1419 i = 0;
1420 continue;
1421 }
1422 else if (!was_backslash && ch == '\\')
1423 was_backslash = true;
1424 else
1425 {
	/*
	 * Ordinary character, or any character following a backslash
	 * (including a newline or a second backslash): store it literally.
	 * Characters that would not fit in str are silently dropped.
	 */
1426 if (i < sizeof(str) - 1)
1427 str[i++] = ch;
1428 was_backslash = false;
1429 }
1430 }
1431
	/* A backslash as the very last character of the file is rejected too. */
1432 if (i != 0 || was_backslash) /* last line not terminated? */
1433 ereport(FATAL,
1434 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1435 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1436
	/*
	 * Report a read error noticed on the stream, or a failure to close it.
	 * FreeFile() is only called when ferror() reports no error.
	 */
1437 if (ferror(lfp) || FreeFile(lfp))
1438 ereport(FATAL,
1440 errmsg("could not read file \"%s\": %m",
1441 TABLESPACE_MAP)));
1442
1443 return true;
1444}
1445
1446/*
1447 * Finish WAL recovery.
1448 *
1449 * This does not close the 'xlogreader' yet, because in some cases the caller
1450 * still wants to re-read the last checkpoint record by calling
1451 * ReadCheckpointRecord().
1452 *
1453 * Returns the position of the last valid or applied record, after which new
1454 * WAL should be appended, information about why recovery was ended, and some
1455 * other things. See the EndOfWalRecoveryInfo struct for details.
1456 */
1459{
1461 XLogRecPtr lastRec;
1462 TimeLineID lastRecTLI;
1463 XLogRecPtr endOfLog;
1464
1465 /*
1466 * Kill WAL receiver, if it's still running, before we continue to write
1467 * the startup checkpoint and aborted-contrecord records. Otherwise it
1468 * would trample over these records and subsequent ones if it's still
1469 * alive when we start writing WAL.
1470 */
1472
1473 /*
1474 * Shut down the slot sync worker to drop any temporary slots acquired by
1475 * it and to prevent it from continuing to fetch the failover slots.
1476 *
1477 * We do not update the 'synced' column in 'pg_replication_slots' system
1478 * view from true to false here, as any failed update could leave 'synced'
1479 * column false for some slots. This could cause issues during slot sync
1480 * after restarting the server as a standby. While updating the 'synced'
1481 * column after switching to the new timeline is an option, it does not
1482 * simplify the handling for the 'synced' column. Therefore, we retain the
1483 * 'synced' column as true after promotion as it may provide useful
1484 * information about the slot origin.
1485 */
1487
1488 /*
1489 * We are now done reading the xlog from stream. Turn off streaming
1490 * recovery to force fetching the files (which would be required at end of
1491 * recovery, e.g., timeline history file) from archive or pg_wal.
1492 *
1493 * Note that standby mode must be turned off after killing WAL receiver,
1494 * i.e., calling XLogShutdownWalRcv().
1495 */
1497 StandbyMode = false;
1498
1499 /*
1500 * Determine where to start writing WAL next.
1501 *
1502 * Re-fetch the last valid or last applied record, so we can identify the
1503 * exact endpoint of what we consider the valid portion of WAL. There may
1504 * be an incomplete continuation record after that, in which case
1505 * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1506 * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1507 * it is intentionally missing. See CreateOverwriteContrecordRecord().
1508 *
1509 * An important side-effect of this is to load the last page into
1510 * xlogreader. The caller uses it to initialize the WAL for writing.
1511 */
1512 if (!InRecovery)
1513 {
1514 lastRec = CheckPointLoc;
1515 lastRecTLI = CheckPointTLI;
1516 }
1517 else
1518 {
1520 lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1521 }
1523 (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1524 endOfLog = xlogreader->EndRecPtr;
1525
1526 /*
1527 * Remember the TLI in the filename of the XLOG segment containing the
1528 * end-of-log. It could be different from the timeline that endOfLog
1529 * nominally belongs to, if there was a timeline switch in that segment,
1530 * and we were reading the old WAL from a segment belonging to a higher
1531 * timeline.
1532 */
1533 result->endOfLogTLI = xlogreader->seg.ws_tli;
1534
1536 {
1537 /*
1538 * We are no longer in archive recovery state.
1539 *
1540 * We are now done reading the old WAL. Turn off archive fetching if
1541 * it was active.
1542 */
1544 InArchiveRecovery = false;
1545
1546 /*
1547 * If the ending log segment is still open, close it (to avoid
1548 * problems on Windows with trying to rename or delete an open file).
1549 */
1550 if (readFile >= 0)
1551 {
1552 close(readFile);
1553 readFile = -1;
1554 }
1555 }
1556
1557 /*
1558 * Copy the last partial block to the caller, for initializing the WAL
1559 * buffer for appending new WAL.
1560 */
1561 if (endOfLog % XLOG_BLCKSZ != 0)
1562 {
1563 char *page;
1564 int len;
1565 XLogRecPtr pageBeginPtr;
1566
1567 pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1569
1570 /* Copy the valid part of the last block */
1571 len = endOfLog % XLOG_BLCKSZ;
1572 page = palloc(len);
1573 memcpy(page, xlogreader->readBuf, len);
1574
1575 result->lastPageBeginPtr = pageBeginPtr;
1576 result->lastPage = page;
1577 }
1578 else
1579 {
1580 /* There is no partial block to copy. */
1581 result->lastPageBeginPtr = endOfLog;
1582 result->lastPage = NULL;
1583 }
1584
1585 /*
1586 * Create a comment for the history file to explain why and where timeline
1587 * changed.
1588 */
1590
1591 result->lastRec = lastRec;
1592 result->lastRecTLI = lastRecTLI;
1593 result->endOfLog = endOfLog;
1594
1595 result->abortedRecPtr = abortedRecPtr;
1597
1600
1601 return result;
1602}
1603
1604/*
1605 * Clean up the WAL reader and leftovers from restoring WAL from archive
1606 */
1607void
1609{
1610 char recoveryPath[MAXPGPATH];
1611
1612 /* Final update of pg_stat_recovery_prefetch. */
1614
1615 /* Shut down xlogreader */
1616 if (readFile >= 0)
1617 {
1618 close(readFile);
1619 readFile = -1;
1620 }
1623
1625 {
1626 /*
1627 * Since there might be a partial WAL segment named RECOVERYXLOG, get
1628 * rid of it.
1629 */
1630 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1631 unlink(recoveryPath); /* ignore any error */
1632
1633 /* Get rid of any remaining recovered timeline-history file, too */
1634 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1635 unlink(recoveryPath); /* ignore any error */
1636 }
1637
1638 /*
1639 * We don't need the latch anymore. It's not strictly necessary to disown
1640 * it, but let's do it for the sake of tidiness.
1641 */
1644}
1645
1646/*
1647 * Perform WAL recovery.
1648 *
1649 * If the system was shut down cleanly, this is never called.
1650 */
1651void
1653{
1654 XLogRecord *record;
1655 bool reachedRecoveryTarget = false;
1656 TimeLineID replayTLI;
1657
1658 /*
1659 * Initialize shared variables for tracking progress of WAL replay, as if
1660 * we had just replayed the record before the REDO location (or the
1661 * checkpoint record itself, if it's a shutdown checkpoint).
1662 */
1665 {
1669 }
1670 else
1671 {
1675 }
1682
1683 /* Also ensure XLogReceiptTime has a sane value */
1685
1686 /*
1687 * Let postmaster know we've started redo now, so that it can launch the
1688 * archiver if necessary.
1689 */
1692
1693 /*
1694 * Allow read-only connections immediately if we're consistent already.
1695 */
1697
1698 /*
1699 * Find the first record that logically follows the checkpoint --- it
1700 * might physically precede it, though.
1701 */
1703 {
1704 /* back up to find the record */
1705 replayTLI = RedoStartTLI;
1707 record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1708
1709 /*
1710 * If a checkpoint record's redo pointer points back to an earlier
1711 * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1712 * record.
1713 */
1714 if (record->xl_rmid != RM_XLOG_ID ||
1716 ereport(FATAL,
1717 (errmsg("unexpected record type found at redo point %X/%X",
1719 }
1720 else
1721 {
1722 /* just have to read next record after CheckPoint */
1724 replayTLI = CheckPointTLI;
1725 record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1726 }
1727
1728 if (record != NULL)
1729 {
1730 TimestampTz xtime;
1731 PGRUsage ru0;
1732
1733 pg_rusage_init(&ru0);
1734
1735 InRedo = true;
1736
1737 RmgrStartup();
1738
1739 ereport(LOG,
1740 (errmsg("redo starts at %X/%X",
1742
1743 /* Prepare to report progress of the redo phase. */
1744 if (!StandbyMode)
1746
1747 /*
1748 * main redo apply loop
1749 */
1750 do
1751 {
1752 if (!StandbyMode)
1753 ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
1755
1756#ifdef WAL_DEBUG
1757 if (XLOG_DEBUG)
1758 {
1760
1762 appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
1765 xlog_outrec(&buf, xlogreader);
1766 appendStringInfoString(&buf, " - ");
1768 elog(LOG, "%s", buf.data);
1769 pfree(buf.data);
1770 }
1771#endif
1772
1773 /* Handle interrupt signals of startup process */
1775
1776 /*
1777 * Pause WAL replay, if requested by a hot-standby session via
1778 * SetRecoveryPause().
1779 *
1780 * Note that we intentionally don't take the info_lck spinlock
1781 * here. We might therefore read a slightly stale value of the
1782 * recoveryPause flag, but it can't be very stale (no worse than
1783 * the last spinlock we did acquire). Since a pause request is a
1784 * pretty asynchronous thing anyway, possibly responding to it one
1785 * WAL record later than we otherwise would is a minor issue, so
1786 * it doesn't seem worth adding another spinlock cycle to prevent
1787 * that.
1788 */
1789 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1791 recoveryPausesHere(false);
1792
1793 /*
1794 * Have we reached our recovery target?
1795 */
1797 {
1798 reachedRecoveryTarget = true;
1799 break;
1800 }
1801
1802 /*
1803 * If we've been asked to lag the primary, wait on latch until
1804 * enough time has passed.
1805 */
1807 {
1808 /*
1809 * We test for paused recovery again here. If user sets
1810 * delayed apply, it may be because they expect to pause
1811 * recovery in case of problems, so we must test again here
1812 * otherwise pausing during the delay-wait wouldn't work.
1813 */
1814 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1816 recoveryPausesHere(false);
1817 }
1818
1819 /*
1820 * Apply the record
1821 */
1822 ApplyWalRecord(xlogreader, record, &replayTLI);
1823
1824 /* Exit loop if we reached inclusive recovery target */
1826 {
1827 reachedRecoveryTarget = true;
1828 break;
1829 }
1830
1831 /* Else, try to fetch the next WAL record */
1832 record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1833 } while (record != NULL);
1834
1835 /*
1836 * end of main redo apply loop
1837 */
1838
1839 if (reachedRecoveryTarget)
1840 {
1841 if (!reachedConsistency)
1842 ereport(FATAL,
1843 (errmsg("requested recovery stop point is before consistent recovery point")));
1844
1845 /*
1846 * This is the last point where we can restart recovery with a new
1847 * recovery target, if we shutdown and begin again. After this,
1848 * Resource Managers may choose to do permanent corrective actions
1849 * at end of recovery.
1850 */
1851 switch (recoveryTargetAction)
1852 {
1854
1855 /*
1856 * exit with special return code to request shutdown of
1857 * postmaster. Log messages issued from postmaster.
1858 */
1859 proc_exit(3);
1860
1862 SetRecoveryPause(true);
1863 recoveryPausesHere(true);
1864
1865 /* drop into promote */
1866
1868 break;
1869 }
1870 }
1871
1872 RmgrCleanup();
1873
1874 ereport(LOG,
1875 (errmsg("redo done at %X/%X system usage: %s",
1877 pg_rusage_show(&ru0))));
1878 xtime = GetLatestXTime();
1879 if (xtime)
1880 ereport(LOG,
1881 (errmsg("last completed transaction was at log time %s",
1882 timestamptz_to_str(xtime))));
1883
1884 InRedo = false;
1885 }
1886 else
1887 {
1888 /* there are no WAL records following the checkpoint */
1889 ereport(LOG,
1890 (errmsg("redo is not required")));
1891 }
1892
1893 /*
1894 * This check is intentionally after the above log messages that indicate
1895 * how far recovery went.
1896 */
1899 !reachedRecoveryTarget)
1900 ereport(FATAL,
1901 (errcode(ERRCODE_CONFIG_FILE_ERROR),
1902 errmsg("recovery ended before configured recovery target was reached")));
1903}
1904
1905/*
1906 * Subroutine of PerformWalRecovery, to apply one WAL record.
1907 */
1908static void
1910{
1911 ErrorContextCallback errcallback;
1912 bool switchedTLI = false;
1913
1914 /* Setup error traceback support for ereport() */
1915 errcallback.callback = rm_redo_error_callback;
1916 errcallback.arg = xlogreader;
1917 errcallback.previous = error_context_stack;
1918 error_context_stack = &errcallback;
1919
1920 /*
1921 * TransamVariables->nextXid must be beyond record's xid.
1922 */
1924
1925 /*
1926 * Before replaying this record, check if this record causes the current
1927 * timeline to change. The record is already considered to be part of the
1928 * new timeline, so we update replayTLI before replaying it. That's
1929 * important so that replayEndTLI, which is recorded as the minimum
1930 * recovery point's TLI if recovery stops after this record, is set
1931 * correctly.
1932 */
1933 if (record->xl_rmid == RM_XLOG_ID)
1934 {
1935 TimeLineID newReplayTLI = *replayTLI;
1936 TimeLineID prevReplayTLI = *replayTLI;
1937 uint8 info = record->xl_info & ~XLR_INFO_MASK;
1938
1939 if (info == XLOG_CHECKPOINT_SHUTDOWN)
1940 {
1941 CheckPoint checkPoint;
1942
1943 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1944 newReplayTLI = checkPoint.ThisTimeLineID;
1945 prevReplayTLI = checkPoint.PrevTimeLineID;
1946 }
1947 else if (info == XLOG_END_OF_RECOVERY)
1948 {
1949 xl_end_of_recovery xlrec;
1950
1951 memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1952 newReplayTLI = xlrec.ThisTimeLineID;
1953 prevReplayTLI = xlrec.PrevTimeLineID;
1954 }
1955
1956 if (newReplayTLI != *replayTLI)
1957 {
1958 /* Check that it's OK to switch to this TLI */
1960 newReplayTLI, prevReplayTLI, *replayTLI);
1961
1962 /* Following WAL records should be run with new TLI */
1963 *replayTLI = newReplayTLI;
1964 switchedTLI = true;
1965 }
1966 }
1967
1968 /*
1969 * Update shared replayEndRecPtr before replaying this record, so that
1970 * XLogFlush will update minRecoveryPoint correctly.
1971 */
1974 XLogRecoveryCtl->replayEndTLI = *replayTLI;
1976
1977 /*
1978 * If we are attempting to enter Hot Standby mode, process XIDs we see
1979 */
1983
1984 /*
1985 * Some XLOG record types that are related to recovery are processed
1986 * directly here, rather than in xlog_redo()
1987 */
1988 if (record->xl_rmid == RM_XLOG_ID)
1989 xlogrecovery_redo(xlogreader, *replayTLI);
1990
1991 /* Now apply the WAL record itself */
1993
1994 /*
1995 * After redo, check whether the backup pages associated with the WAL
1996 * record are consistent with the existing pages. This check is done only
1997 * if consistency check is enabled for this record.
1998 */
1999 if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
2001
2002 /* Pop the error context stack */
2003 error_context_stack = errcallback.previous;
2004
2005 /*
2006 * Update lastReplayedEndRecPtr after this record has been successfully
2007 * replayed.
2008 */
2012 XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
2014
2015 /* ------
2016 * Wakeup walsenders:
2017 *
2018 * On the standby, the WAL is flushed first (which will only wake up
2019 * physical walsenders) and then applied, which will only wake up logical
2020 * walsenders.
2021 *
2022 * Indeed, logical walsenders on standby can't decode and send data until
2023 * it's been applied.
2024 *
2025 * Physical walsenders don't need to be woken up during replay unless
2026 * cascading replication is allowed and time line change occurred (so that
2027 * they can notice that they are on a new time line).
2028 *
2029 * That's why the wake up conditions are for:
2030 *
2031 * - physical walsenders in case of new time line and cascade
2032 * replication is allowed
2033 * - logical walsenders in case cascade replication is allowed (could not
2034 * be created otherwise)
2035 * ------
2036 */
2038 WalSndWakeup(switchedTLI, true);
2039
2040 /*
2041 * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2042 * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2043 * a reply to the primary.
2044 */
2046 {
2049 }
2050
2051 /* Allow read-only connections if we're consistent now */
2053
2054 /* Is this a timeline switch? */
2055 if (switchedTLI)
2056 {
2057 /*
2058 * Before we continue on the new timeline, clean up any (possibly
2059 * bogus) future WAL segments on the old timeline.
2060 */
2062
2063 /* Reset the prefetcher. */
2065 }
2066}
2067
2068/*
2069 * Some XLOG RM record types that are directly related to WAL recovery are
2070 * handled here rather than in xlog_redo().
2071 */
2072static void
2074{
2075 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2076 XLogRecPtr lsn = record->EndRecPtr;
2077
2078 Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2079
2080 if (info == XLOG_OVERWRITE_CONTRECORD)
2081 {
2082 /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2084
2085 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
2086 if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2087 elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
2090
2091 /* We have safely skipped the aborted record */
2094
2095 ereport(LOG,
2096 (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
2099
2100 /* Verifying the record should only happen once */
2102 }
2103 else if (info == XLOG_BACKUP_END)
2104 {
2105 XLogRecPtr startpoint;
2106
2107 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2108
2109 if (backupStartPoint == startpoint)
2110 {
2111 /*
2112 * We have reached the end of base backup, the point where
2113 * pg_backup_stop() was done. The data on disk is now consistent
2114 * (assuming we have also reached minRecoveryPoint). Set
2115 * backupEndPoint to the current LSN, so that the next call to
2116 * CheckRecoveryConsistency() will notice it and do the
2117 * end-of-backup processing.
2118 */
2119 elog(DEBUG1, "end of backup record reached");
2120
2121 backupEndPoint = lsn;
2122 }
2123 else
2124 elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X",
2126 }
2127}
2128
2129/*
2130 * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2131 * directories.
2132 *
2133 * Replay of database creation XLOG records for databases that were later
2134 * dropped can create fake directories in pg_tblspc. By the time consistency
2135 * is reached these directories should have been removed; here we verify
2136 * that this did indeed happen. This is to be called at the point where
2137 * consistent state is reached.
2138 *
2139 * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2140 * useful for testing purposes, and also allows for an escape hatch in case
2141 * things go south.
2142 */
2143static void
2145{
2146 DIR *dir;
2147 struct dirent *de;
2148
2150 while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
2151 {
2152 char path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
2153
2154 /* Skip entries of non-oid names */
2155 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2156 continue;
2157
2158 snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
2159
2160 if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2163 errmsg("unexpected directory entry \"%s\" found in %s",
2164 de->d_name, PG_TBLSPC_DIR),
2165 errdetail("All directory entries in %s/ should be symbolic links.",
2167 errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2168 }
2169}
2170
2171/*
2172 * Checks if recovery has reached a consistent state. When consistency is
2173 * reached and we have a valid starting standby snapshot, tell postmaster
2174 * that it can start accepting read-only connections.
2175 */
2176static void
2178{
2179 XLogRecPtr lastReplayedEndRecPtr;
2180 TimeLineID lastReplayedTLI;
2181
2182 /*
2183 * During crash recovery, we don't reach a consistent state until we've
2184 * replayed all the WAL.
2185 */
2187 return;
2188
2190
2191 /*
2192 * assume that we are called in the startup process, and hence don't need
2193 * a lock to read lastReplayedEndRecPtr
2194 */
2195 lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2196 lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2197
2198 /*
2199 * Have we reached the point where our base backup was completed?
2200 */
2202 backupEndPoint <= lastReplayedEndRecPtr)
2203 {
2204 XLogRecPtr saveBackupStartPoint = backupStartPoint;
2205 XLogRecPtr saveBackupEndPoint = backupEndPoint;
2206
2207 elog(DEBUG1, "end of backup reached");
2208
2209 /*
2210 * We have reached the end of base backup, as indicated by pg_control.
2211 * Update the control file accordingly.
2212 */
2213 ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2216 backupEndRequired = false;
2217
2218 ereport(LOG,
2219 (errmsg("completed backup recovery with redo LSN %X/%X and end LSN %X/%X",
2220 LSN_FORMAT_ARGS(saveBackupStartPoint),
2221 LSN_FORMAT_ARGS(saveBackupEndPoint))));
2222 }
2223
2224 /*
2225 * Have we passed our safe starting point? Note that minRecoveryPoint is
2226 * known to be incorrectly set if recovering from a backup, until the
2227 * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2228 * All we know prior to that is that we're not consistent yet.
2229 */
2231 minRecoveryPoint <= lastReplayedEndRecPtr)
2232 {
2233 /*
2234 * Check to see if the XLOG sequence contained any unresolved
2235 * references to uninitialized pages.
2236 */
2238
2239 /*
2240 * Check that pg_tblspc doesn't contain any real directories. Replay
2241 * of Database/CREATE_* records may have created fictitious tablespace
2242 * directories that should have been removed by the time consistency
2243 * was reached.
2244 */
2246
2247 reachedConsistency = true;
2248 ereport(LOG,
2249 (errmsg("consistent recovery state reached at %X/%X",
2250 LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
2251 }
2252
2253 /*
2254 * Have we got a valid starting snapshot that will allow queries to be
2255 * run? If so, we can tell postmaster that the database is consistent now,
2256 * enabling connections.
2257 */
2262 {
2266
2267 LocalHotStandbyActive = true;
2268
2270 }
2271}
2272
2273/*
2274 * Error context callback for errors occurring during rm_redo().
 *
 * 'arg' is cast to the XLogReaderState of the record in question. The
 * callback adds a context line giving that record's ReadRecPtr together
 * with the text produced by xlog_outdesc() and xlog_block_info().
2275 */
2276static void
2278{
2279 XLogReaderState *record = (XLogReaderState *) arg;
2281
 /* Build the record description followed by its block references. */
2283 xlog_outdesc(&buf, record);
2284 xlog_block_info(&buf, record);
2285
2286 /* translator: %s is a WAL record description */
2287 errcontext("WAL redo at %X/%X for %s",
2288 LSN_FORMAT_ARGS(record->ReadRecPtr),
2289 buf.data);
2290
 /* The description text has been reported; release it. */
2291 pfree(buf.data);
2292}
2293
2294/*
2295 * Appends to buf a string describing an XLogRecord, consisting of its identity
2296 * optionally followed by a colon, a space, and a further description.
 *
 * The identity comes from the resource manager's rm_identify() callback; if
 * that returns NULL, "UNKNOWN" plus the info bits (with XLR_INFO_MASK
 * cleared, printed in hex) is used instead. The further description is
 * whatever the resource manager's rm_desc() callback appends. Nothing is
 * returned; all output goes to buf.
2297 */
2298void
2300{
 /* Resource manager entry for this record's rmgr ID. */
2301 RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2302 uint8 info = XLogRecGetInfo(record);
2303 const char *id;
2304
2307
 /* Ask the resource manager for a name for this record's info bits. */
2308 id = rmgr.rm_identify(info);
2309 if (id == NULL)
2310 appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2311 else
2312 appendStringInfo(buf, "%s: ", id);
2313
 /* Let the resource manager append its own description of the record. */
2314 rmgr.rm_desc(buf, record);
2315}
2316
2317#ifdef WAL_DEBUG
2318
/*
 * Append a short summary of an XLogRecord to buf: a "prev" location and the
 * xid, then the record's data length, then its block references as
 * formatted by xlog_block_info(). This function sits inside the WAL_DEBUG
 * conditional section.
 *
 * NOTE(review): the argument supplying the "prev %X/%X" value is not
 * visible in this listing; confirm it against the upstream source.
 */
2319static void
2320xlog_outrec(StringInfo buf, XLogReaderState *record)
2321{
2322 appendStringInfo(buf, "prev %X/%X; xid %u",
2324 XLogRecGetXid(record));
2325
2326 appendStringInfo(buf, "; len %u",
2327 XLogRecGetDataLen(record));
2328
 /* Finish with one entry per block reference. */
2329 xlog_block_info(buf, record);
2330}
2331#endif /* WAL_DEBUG */
2332
2333/*
2334 * Appends to buf a string giving information about all the blocks in an
2335 * XLogRecord.
 *
 * Every block id from 0 through XLogRecMaxBlockId(record) is examined. Ids
 * for which XLogRecGetBlockTagExtended() reports no block tag are skipped.
 * For the others this appends "; blkref #N: rel spc/db/rel, blk B", with
 * an additional "fork F" field when the fork is not MAIN_FORKNUM, and then
 * " FPW" if XLogRecHasBlockImage() is true for that block. Nothing is
 * returned; all output goes to buf.
2336 */
2337static void
2339{
2340 int block_id;
2341
2342 /* decode block references */
2343 for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2344 {
2345 RelFileLocator rlocator;
2346 ForkNumber forknum;
2347 BlockNumber blk;
2348
 /* Block ids in the range need not all be in use; skip the gaps. */
2349 if (!XLogRecGetBlockTagExtended(record, block_id,
2350 &rlocator, &forknum, &blk, NULL))
2351 continue;
2352
 /* The fork number is only printed when it is not the main fork. */
2353 if (forknum != MAIN_FORKNUM)
2354 appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2355 block_id,
2356 rlocator.spcOid, rlocator.dbOid,
2357 rlocator.relNumber,
2358 forknum,
2359 blk);
2360 else
2361 appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2362 block_id,
2363 rlocator.spcOid, rlocator.dbOid,
2364 rlocator.relNumber,
2365 blk);
 /* Flag block references that carry a block image. */
2366 if (XLogRecHasBlockImage(record, block_id))
2367 appendStringInfoString(buf, " FPW");
2368 }
2369}
2370
2371
2372/*
2373 * Check that it's OK to switch to new timeline during recovery.
2374 *
2375 * 'lsn' is the address of the shutdown checkpoint record we're about to
2376 * replay. (Currently, timeline can only change at a shutdown checkpoint).
2377 */
2378static void
2380 TimeLineID replayTLI)
2381{
2382 /* Check that the record agrees on what the current (old) timeline is */
2383 if (prevTLI != replayTLI)
2384 ereport(PANIC,
2385 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2386 prevTLI, replayTLI)));
2387
2388 /*
2389 * The new timeline better be in the list of timelines we expect to see,
2390 * according to the timeline history. It should also not decrease.
2391 */
2392 if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2393 ereport(PANIC,
2394 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2395 newTLI, replayTLI)));
2396
2397 /*
2398 * If we have not yet reached min recovery point, and we're about to
2399 * switch to a timeline greater than the timeline of the min recovery
2400 * point: trouble. After switching to the new timeline, we could not
2401 * possibly visit the min recovery point on the correct timeline anymore.
2402 * This can happen if there is a newer timeline in the archive that
2403 * branched before the timeline the min recovery point is on, and you
2404 * attempt to do PITR to the new timeline.
2405 */
2407 lsn < minRecoveryPoint &&
2408 newTLI > minRecoveryPointTLI)
2409 ereport(PANIC,
2410 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
2411 newTLI,
2414
2415 /* Looks good */
2416}
2417
2418
2419/*
2420 * Extract timestamp from WAL record.
2421 *
2422 * If the record contains a timestamp, returns true, and saves the timestamp
2423 * in *recordXtime. If the record type has no timestamp, returns false.
2424 * Currently, only transaction commit/abort records and restore points contain
2425 * timestamps.
 *
 * When false is returned, *recordXtime is left unmodified.
2426 */
2427static bool
2429{
 /* Info bits with the XLR_INFO_MASK bits cleared. */
2430 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
 /* Only compared below when the record belongs to RM_XACT_ID. */
2431 uint8 xact_info = info & XLOG_XACT_OPMASK;
2432 uint8 rmid = XLogRecGetRmid(record);
2433
 /* Restore point: timestamp is rp_time in xl_restore_point. */
2434 if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2435 {
2436 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2437 return true;
2438 }
 /* Commit (plain or prepared): timestamp is xact_time in xl_xact_commit. */
2439 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2440 xact_info == XLOG_XACT_COMMIT_PREPARED))
2441 {
2442 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2443 return true;
2444 }
 /* Abort (plain or prepared): timestamp is xact_time in xl_xact_abort. */
2445 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2446 xact_info == XLOG_XACT_ABORT_PREPARED))
2447 {
2448 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2449 return true;
2450 }
 /* Any other record type carries no timestamp. */
2451 return false;
2452}
2453
2454/*
2455 * Checks whether the current buffer page and the backup page stored in the
2456 * WAL record are consistent. Before comparing the two pages, masking
2457 * can be applied to them to ignore certain areas, such as hint bits and
2458 * the unused space between pd_lower and pd_upper, among other things. This
2459 * function should be called once WAL replay has been completed for a
2460 * given record.
2461 */
2462static void
2464{
2465 RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2466 RelFileLocator rlocator;
2467 ForkNumber forknum;
2468 BlockNumber blkno;
2469 int block_id;
2470
2471 /* Records with no backup blocks have no need for consistency checks. */
2472 if (!XLogRecHasAnyBlockRefs(record))
2473 return;
2474
2476
2477 for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2478 {
2479 Buffer buf;
2480 Page page;
2481
2482 if (!XLogRecGetBlockTagExtended(record, block_id,
2483 &rlocator, &forknum, &blkno, NULL))
2484 {
2485 /*
2486 * WAL record doesn't contain a block reference with the given id.
2487 * Do nothing.
2488 */
2489 continue;
2490 }
2491
2492 Assert(XLogRecHasBlockImage(record, block_id));
2493
2494 if (XLogRecBlockImageApply(record, block_id))
2495 {
2496 /*
2497 * WAL record has already applied the page, so bypass the
2498 * consistency check as that would result in comparing the full
2499 * page stored in the record with itself.
2500 */
2501 continue;
2502 }
2503
2504 /*
2505 * Read the contents from the current buffer and store it in a
2506 * temporary page.
2507 */
2508 buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2511 if (!BufferIsValid(buf))
2512 continue;
2513
2515 page = BufferGetPage(buf);
2516
2517 /*
2518 * Take a copy of the local page where WAL has been applied to have a
2519 * comparison base before masking it...
2520 */
2521 memcpy(replay_image_masked, page, BLCKSZ);
2522
2523 /* No need for this page anymore now that a copy is in. */
2525
2526 /*
2527 * If the block LSN is already ahead of this WAL record, we can't
2528 * expect contents to match. This can happen if recovery is
2529 * restarted.
2530 */
2532 continue;
2533
2534 /*
2535 * Read the contents from the backup copy, stored in WAL record and
2536 * store it in a temporary page. There is no need to allocate a new
2537 * page here, a local buffer is fine to hold its contents and a mask
2538 * can be directly applied on it.
2539 */
2540 if (!RestoreBlockImage(record, block_id, primary_image_masked))
2541 ereport(ERROR,
2542 (errcode(ERRCODE_INTERNAL_ERROR),
2543 errmsg_internal("%s", record->errormsg_buf)));
2544
2545 /*
2546 * If masking function is defined, mask both the primary and replay
2547 * images
2548 */
2549 if (rmgr.rm_mask != NULL)
2550 {
2551 rmgr.rm_mask(replay_image_masked, blkno);
2552 rmgr.rm_mask(primary_image_masked, blkno);
2553 }
2554
2555 /* Time to compare the primary and replay images. */
2556 if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2557 {
2558 elog(FATAL,
2559 "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2560 rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2561 forknum, blkno);
2562 }
2563 }
2564}
2565
2566/*
2567 * For point-in-time recovery, this function decides whether we want to
2568 * stop applying the XLOG before the current record.
2569 *
2570 * Returns true if we are stopping, false otherwise. If stopping, some
2571 * information is saved in recoveryStopXid et al for use in annotating the
2572 * new timeline's history file.
2573 */
2574static bool
2576{
2577 bool stopsHere = false;
2578 uint8 xact_info;
2579 bool isCommit;
2580 TimestampTz recordXtime = 0;
2581 TransactionId recordXid;
2582
2583 /*
2584 * Ignore recovery target settings when not in archive recovery (meaning
2585 * we are in crash recovery).
2586 */
2588 return false;
2589
2590 /* Check if we should stop as soon as reaching consistency */
2592 {
2593 ereport(LOG,
2594 (errmsg("recovery stopping after reaching consistency")));
2595
2596 recoveryStopAfter = false;
2599 recoveryStopTime = 0;
2600 recoveryStopName[0] = '\0';
2601 return true;
2602 }
2603
2604 /* Check if target LSN has been reached */
2607 record->ReadRecPtr >= recoveryTargetLSN)
2608 {
2609 recoveryStopAfter = false;
2611 recoveryStopLSN = record->ReadRecPtr;
2612 recoveryStopTime = 0;
2613 recoveryStopName[0] = '\0';
2614 ereport(LOG,
2615 (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
2617 return true;
2618 }
2619
2620 /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2621 if (XLogRecGetRmid(record) != RM_XACT_ID)
2622 return false;
2623
2624 xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2625
2626 if (xact_info == XLOG_XACT_COMMIT)
2627 {
2628 isCommit = true;
2629 recordXid = XLogRecGetXid(record);
2630 }
2631 else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2632 {
2633 xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2634 xl_xact_parsed_commit parsed;
2635
2636 isCommit = true;
2638 xlrec,
2639 &parsed);
2640 recordXid = parsed.twophase_xid;
2641 }
2642 else if (xact_info == XLOG_XACT_ABORT)
2643 {
2644 isCommit = false;
2645 recordXid = XLogRecGetXid(record);
2646 }
2647 else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2648 {
2649 xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2650 xl_xact_parsed_abort parsed;
2651
2652 isCommit = false;
2654 xlrec,
2655 &parsed);
2656 recordXid = parsed.twophase_xid;
2657 }
2658 else
2659 return false;
2660
2662 {
2663 /*
2664 * There can be only one transaction end record with this exact
2665 * transactionid
2666 *
2667 * When testing for an xid, we MUST test for equality only, since
2668 * transactions are numbered in the order they start, not the order
2669 * they complete. A higher-numbered xid will complete before the target
2670 * xid about 50% of the time...
2671 */
2672 stopsHere = (recordXid == recoveryTargetXid);
2673 }
2674
2675 /*
2676 * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2677 * We don't expect getRecordTimestamp ever to fail, since we already know
2678 * this is a commit or abort record; but test its result anyway.
2679 */
2680 if (getRecordTimestamp(record, &recordXtime) &&
2682 {
2683 /*
2684 * There can be many transactions that share the same commit time, so
2685 * we stop after the last one, if we are inclusive, or stop at the
2686 * first one if we are exclusive
2687 */
2689 stopsHere = (recordXtime > recoveryTargetTime);
2690 else
2691 stopsHere = (recordXtime >= recoveryTargetTime);
2692 }
2693
2694 if (stopsHere)
2695 {
2696 recoveryStopAfter = false;
2697 recoveryStopXid = recordXid;
2698 recoveryStopTime = recordXtime;
2700 recoveryStopName[0] = '\0';
2701
2702 if (isCommit)
2703 {
2704 ereport(LOG,
2705 (errmsg("recovery stopping before commit of transaction %u, time %s",
2708 }
2709 else
2710 {
2711 ereport(LOG,
2712 (errmsg("recovery stopping before abort of transaction %u, time %s",
2715 }
2716 }
2717
2718 return stopsHere;
2719}
2720
2721/*
2722 * Same as recoveryStopsBefore, but called after applying the record.
2723 *
2724 * We also track the timestamp of the latest applied COMMIT/ABORT
2725 * record in XLogRecoveryCtl->recoveryLastXTime.
2726 */
2727static bool
2729{
2730 uint8 info;
2731 uint8 xact_info;
2732 uint8 rmid;
2733 TimestampTz recordXtime = 0;
2734
2735 /*
2736 * Ignore recovery target settings when not in archive recovery (meaning
2737 * we are in crash recovery).
2738 */
2740 return false;
2741
2742 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2743 rmid = XLogRecGetRmid(record);
2744
2745 /*
2746 * There can be many restore points that share the same name; we stop at
2747 * the first one.
2748 */
2750 rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2751 {
2752 xl_restore_point *recordRestorePointData;
2753
2754 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2755
2756 if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2757 {
2758 recoveryStopAfter = true;
2761 (void) getRecordTimestamp(record, &recoveryStopTime);
2762 strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2763
2764 ereport(LOG,
2765 (errmsg("recovery stopping at restore point \"%s\", time %s",
2768 return true;
2769 }
2770 }
2771
2772 /* Check if the target LSN has been reached */
2775 record->ReadRecPtr >= recoveryTargetLSN)
2776 {
2777 recoveryStopAfter = true;
2779 recoveryStopLSN = record->ReadRecPtr;
2780 recoveryStopTime = 0;
2781 recoveryStopName[0] = '\0';
2782 ereport(LOG,
2783 (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
2785 return true;
2786 }
2787
2788 if (rmid != RM_XACT_ID)
2789 return false;
2790
2791 xact_info = info & XLOG_XACT_OPMASK;
2792
2793 if (xact_info == XLOG_XACT_COMMIT ||
2794 xact_info == XLOG_XACT_COMMIT_PREPARED ||
2795 xact_info == XLOG_XACT_ABORT ||
2796 xact_info == XLOG_XACT_ABORT_PREPARED)
2797 {
2798 TransactionId recordXid;
2799
2800 /* Update the last applied transaction timestamp */
2801 if (getRecordTimestamp(record, &recordXtime))
2802 SetLatestXTime(recordXtime);
2803
2804 /* Extract the XID of the committed/aborted transaction */
2805 if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2806 {
2807 xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2808 xl_xact_parsed_commit parsed;
2809
2811 xlrec,
2812 &parsed);
2813 recordXid = parsed.twophase_xid;
2814 }
2815 else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2816 {
2817 xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2818 xl_xact_parsed_abort parsed;
2819
2821 xlrec,
2822 &parsed);
2823 recordXid = parsed.twophase_xid;
2824 }
2825 else
2826 recordXid = XLogRecGetXid(record);
2827
2828 /*
2829 * There can be only one transaction end record with this exact
2830 * transactionid
2831 *
2832 * When testing for an xid, we MUST test for equality only, since
2833 * transactions are numbered in the order they start, not the order
2834 * they complete. A higher-numbered xid will complete before the target
2835 * xid about 50% of the time...
2836 */
2838 recordXid == recoveryTargetXid)
2839 {
2840 recoveryStopAfter = true;
2841 recoveryStopXid = recordXid;
2842 recoveryStopTime = recordXtime;
2844 recoveryStopName[0] = '\0';
2845
2846 if (xact_info == XLOG_XACT_COMMIT ||
2847 xact_info == XLOG_XACT_COMMIT_PREPARED)
2848 {
2849 ereport(LOG,
2850 (errmsg("recovery stopping after commit of transaction %u, time %s",
2853 }
2854 else if (xact_info == XLOG_XACT_ABORT ||
2855 xact_info == XLOG_XACT_ABORT_PREPARED)
2856 {
2857 ereport(LOG,
2858 (errmsg("recovery stopping after abort of transaction %u, time %s",
2861 }
2862 return true;
2863 }
2864 }
2865
2866 /* Check if we should stop as soon as reaching consistency */
2868 {
2869 ereport(LOG,
2870 (errmsg("recovery stopping after reaching consistency")));
2871
2872 recoveryStopAfter = true;
2874 recoveryStopTime = 0;
2876 recoveryStopName[0] = '\0';
2877 return true;
2878 }
2879
2880 return false;
2881}
2882
2883/*
2884 * Create a comment for the history file to explain why and where
2885 * timeline changed.
2886 */
2887static char *
2889{
2890 char reason[200];
2891
2893 snprintf(reason, sizeof(reason),
2894 "%s transaction %u",
2895 recoveryStopAfter ? "after" : "before",
2898 snprintf(reason, sizeof(reason),
2899 "%s %s\n",
2900 recoveryStopAfter ? "after" : "before",
2903 snprintf(reason, sizeof(reason),
2904 "%s LSN %X/%X\n",
2905 recoveryStopAfter ? "after" : "before",
2908 snprintf(reason, sizeof(reason),
2909 "at restore point \"%s\"",
2912 snprintf(reason, sizeof(reason), "reached consistency");
2913 else
2914 snprintf(reason, sizeof(reason), "no recovery target specified");
2915
2916 return pstrdup(reason);
2917}
2918
2919/*
2920 * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2921 *
2922 * endOfRecovery is true if the recovery target is reached and
2923 * the paused state starts at the end of recovery because of
2924 * recovery_target_action=pause, and false otherwise.
2925 */
2926static void
2927recoveryPausesHere(bool endOfRecovery)
2928{
2929 /* Don't pause unless users can connect! */
2931 return;
2932
2933 /* Don't pause after standby promotion has been triggered */
2935 return;
2936
2937 if (endOfRecovery)
2938 ereport(LOG,
2939 (errmsg("pausing at the end of recovery"),
2940 errhint("Execute pg_wal_replay_resume() to promote.")));
2941 else
2942 ereport(LOG,
2943 (errmsg("recovery has paused"),
2944 errhint("Execute pg_wal_replay_resume() to continue.")));
2945
2946 /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2948 {
2951 return;
2952
2953 /*
2954 * If recovery pause is requested then set it paused. While we are in
2955 * the loop, user might resume and pause again so set this every time.
2956 */
2958
2959 /*
2960 * We wait on a condition variable that will wake us as soon as the
2961 * pause ends, but we use a timeout so we can check the above exit
2962 * condition periodically too.
2963 */
2965 WAIT_EVENT_RECOVERY_PAUSE);
2966 }
2968}
2969
2970/*
2971 * When recovery_min_apply_delay is set, we wait long enough to make sure
2972 * certain record types are applied at least that interval behind the primary.
2973 *
2974 * Returns true if we waited.
2975 *
2976 * Note that the delay is calculated between the WAL record log time and
2977 * the current time on standby. We would prefer to keep track of when this
2978 * standby received each WAL record, which would allow a more consistent
2979 * approach and one not affected by time synchronisation issues, but that
2980 * is significantly more effort and complexity for little actual gain in
2981 * usability.
2982 */
2983static bool
2985{
2986 uint8 xact_info;
2987 TimestampTz xtime;
2988 TimestampTz delayUntil;
2989 long msecs;
2990
2991 /* nothing to do if no delay configured */
2992 if (recovery_min_apply_delay <= 0)
2993 return false;
2994
2995 /* no delay is applied on a database not yet consistent */
2996 if (!reachedConsistency)
2997 return false;
2998
2999 /* nothing to do if crash recovery is requested */
3001 return false;
3002
3003 /*
3004 * Is it a COMMIT record?
3005 *
3006 * We deliberately choose not to delay aborts since they have no effect on
3007 * MVCC. We already allow replay of records that don't have a timestamp,
3008 * so there is already opportunity for issues caused by early conflicts on
3009 * standbys.
3010 */
3011 if (XLogRecGetRmid(record) != RM_XACT_ID)
3012 return false;
3013
3014 xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
3015
3016 if (xact_info != XLOG_XACT_COMMIT &&
3017 xact_info != XLOG_XACT_COMMIT_PREPARED)
3018 return false;
3019
3020 if (!getRecordTimestamp(record, &xtime))
3021 return false;
3022
3024
3025 /*
3026 * Exit without arming the latch if it's already past time to apply this
3027 * record
3028 */
3030 if (msecs <= 0)
3031 return false;
3032
3033 while (true)
3034 {
3036
3037 /* This might change recovery_min_apply_delay. */
3039
3041 break;
3042
3043 /*
3044 * Recalculate delayUntil as recovery_min_apply_delay could have
3045 * changed while waiting in this loop.
3046 */
3048
3049 /*
3050 * Wait for difference between GetCurrentTimestamp() and delayUntil.
3051 */
3053 delayUntil);
3054
3055 if (msecs <= 0)
3056 break;
3057
3058 elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3059
3062 msecs,
3063 WAIT_EVENT_RECOVERY_APPLY_DELAY);
3064 }
3065 return true;
3066}
3067
3068/*
3069 * Get the current state of the recovery pause request.
3070 */
3073{
3075
3079
3080 return state;
3081}
3082
3083/*
3084 * Set the recovery pause state.
3085 *
3086 * If recovery pause is requested then sets the recovery pause state to
3087 * 'pause requested' if it is not already 'paused'. Otherwise, sets it
3088 * to 'not paused' to resume the recovery. The recovery pause is later
3089 * confirmed by ConfirmRecoveryPaused().
3090 */
3091void
3092SetRecoveryPause(bool recoveryPause)
3093{
3095
3096 if (!recoveryPause)
3100
3102
3103 if (!recoveryPause)
3105}
3106
3107/*
3108 * Confirm the recovery pause by setting the recovery pause state to
3109 * RECOVERY_PAUSED.
3110 */
3111static void
3113{
3114 /* If recovery pause is requested then set it paused */
3119}
3120
3121
3122/*
3123 * Attempt to read the next XLOG record.
3124 *
3125 * Before first call, the reader needs to be positioned to the first record
3126 * by calling XLogPrefetcherBeginRead().
3127 *
3128 * If no valid record is available, returns NULL, or fails if emode is PANIC.
3129 * (emode must be either PANIC or LOG.) In standby mode, retries until a valid
3130 * record is available.
3131 */
3132static XLogRecord *
3134 bool fetching_ckpt, TimeLineID replayTLI)
3135{
3136 XLogRecord *record;
3139
3140 /* Pass through parameters to XLogPageRead */
3141 private->fetching_ckpt = fetching_ckpt;
3142 private->emode = emode;
3143 private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
3144 private->replayTLI = replayTLI;
3145
3146 /* This is the first attempt to read this page. */
3147 lastSourceFailed = false;
3148
3149 for (;;)
3150 {
3151 char *errormsg;
3152
3153 record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3154 if (record == NULL)
3155 {
3156 /*
3157 * When we find that WAL ends in an incomplete record, keep track
3158 * of that record. After recovery is done, we'll write a record
3159 * to indicate to downstream WAL readers that that portion is to
3160 * be ignored.
3161 *
3162 * However, when ArchiveRecoveryRequested = true, we're going to
3163 * switch to a new timeline at the end of recovery. We will only
3164 * copy WAL over to the new timeline up to the end of the last
3165 * complete record, so if we did this, we would later create an
3166 * overwrite contrecord in the wrong place, breaking everything.
3167 */
3170 {
3173 }
3174
3175 if (readFile >= 0)
3176 {
3177 close(readFile);
3178 readFile = -1;
3179 }
3180
3181 /*
3182 * We only end up here without a message when XLogPageRead()
3183 * failed - in that case we already logged something. In
3184 * StandbyMode that only happens if we have been triggered, so we
3185 * shouldn't loop anymore in that case.
3186 */
3187 if (errormsg)
3189 (errmsg_internal("%s", errormsg) /* already translated */ ));
3190 }
3191
3192 /*
3193 * Check page TLI is one of the expected values.
3194 */
3196 {
3197 char fname[MAXFNAMELEN];
3198 XLogSegNo segno;
3199 int32 offset;
3200
3204 XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3207 (errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%X, offset %u",
3209 fname,
3211 offset)));
3212 record = NULL;
3213 }
3214
3215 if (record)
3216 {
3217 /* Great, got a record */
3218 return record;
3219 }
3220 else
3221 {
3222 /* No valid record available from this source */
3223 lastSourceFailed = true;
3224
3225 /*
3226 * If archive recovery was requested, but we were still doing
3227 * crash recovery, switch to archive recovery and retry using the
3228 * offline archive. We have now replayed all the valid WAL in
3229 * pg_wal, so we are presumably now consistent.
3230 *
3231 * We require that there's at least some valid WAL present in
3232 * pg_wal, however (!fetching_ckpt). We could recover using the
3233 * WAL from the archive, even if pg_wal is completely empty, but
3234 * we'd have no idea how far we'd have to replay to reach
3235 * consistency. So err on the safe side and give up.
3236 */
3238 !fetching_ckpt)
3239 {
3241 (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3242 InArchiveRecovery = true;
3245
3248 minRecoveryPointTLI = replayTLI;
3249
3251
3252 /*
3253 * Before we retry, reset lastSourceFailed and currentSource
3254 * so that we will check the archive next.
3255 */
3256 lastSourceFailed = false;
3258
3259 continue;
3260 }
3261
3262 /* In standby mode, loop back to retry. Otherwise, give up. */
3264 continue;
3265 else
3266 return NULL;
3267 }
3268 }
3269}
3270
3271/*
3272 * Read the XLOG page containing targetPagePtr into readBuf (if not read
3273 * already). Returns number of bytes read, if the page is read successfully,
3274 * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3275 * but only if they have not been previously reported.
3276 *
3277 * See XLogReaderRoutine.page_read for more details.
3278 *
3279 * While prefetching, xlogreader->nonblocking may be set. In that case,
3280 * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3281 *
3282 * This is responsible for restoring files from archive as needed, as well
3283 * as for waiting for the requested WAL record to arrive in standby mode.
3284 *
3285 * xlogreader->private_data->emode specifies the log level used for reporting
3286 * "file not found" or "end of WAL" situations in archive recovery, or in
3287 * standby mode when promotion is triggered. If set to WARNING or below,
3288 * XLogPageRead() returns XLREAD_FAIL in those situations; at higher log
3289 * levels the ereport() won't return.
3290 *
3291 * In standby mode, if after a successful return of XLogPageRead() the
3292 * caller finds the record it's interested in to be broken, it should
3293 * ereport the error with the level determined by
3294 * emode_for_corrupt_record(), and then set lastSourceFailed
3295 * and call XLogPageRead() again with the same arguments. This lets
3296 * XLogPageRead() try fetching the record from another source, or
3297 * sleep and retry.
3298 */
3299static int
3301 XLogRecPtr targetRecPtr, char *readBuf)
3302{
3303 XLogPageReadPrivate *private =
3305 int emode = private->emode;
3306 uint32 targetPageOff;
3308 int r;
3309
3310 XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3311 targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3312
3313 /*
3314 * See if we need to switch to a new segment because the requested record
3315 * is not in the currently open one.
3316 */
3317 if (readFile >= 0 &&
3318 !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3319 {
3320 /*
3321 * Request a restartpoint if we've replayed too much xlog since the
3322 * last one.
3323 */
3325 {
3327 {
3328 (void) GetRedoRecPtr();
3331 }
3332 }
3333
3334 close(readFile);
3335 readFile = -1;
3337 }
3338
3339 XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3340
3341retry:
3342 /* See if we need to retrieve more data */
3343 if (readFile < 0 ||
3345 flushedUpto < targetPagePtr + reqLen))
3346 {
3347 if (readFile >= 0 &&
3350 flushedUpto < targetPagePtr + reqLen)
3351 return XLREAD_WOULDBLOCK;
3352
3353 switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3354 private->randAccess,
3355 private->fetching_ckpt,
3356 targetRecPtr,
3357 private->replayTLI,
3360 {
3361 case XLREAD_WOULDBLOCK:
3362 return XLREAD_WOULDBLOCK;
3363 case XLREAD_FAIL:
3364 if (readFile >= 0)
3365 close(readFile);
3366 readFile = -1;
3367 readLen = 0;
3369 return XLREAD_FAIL;
3370 case XLREAD_SUCCESS:
3371 break;
3372 }
3373 }
3374
3375 /*
3376 * At this point, we have the right segment open and if we're streaming we
3377 * know the requested record is in it.
3378 */
3379 Assert(readFile != -1);
3380
3381 /*
3382 * If the current segment is being streamed from the primary, calculate
3383 * how much of the current page we have received already. We know the
3384 * requested record has been received, but this is for the benefit of
3385 * future calls, to allow quick exit at the top of this function.
3386 */
3388 {
3389 if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3390 readLen = XLOG_BLCKSZ;
3391 else
3393 targetPageOff;
3394 }
3395 else
3396 readLen = XLOG_BLCKSZ;
3397
3398 /* Read the requested page */
3399 readOff = targetPageOff;
3400
3401 pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3402 r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
3403 if (r != XLOG_BLCKSZ)
3404 {
3405 char fname[MAXFNAMELEN];
3406 int save_errno = errno;
3407
3410 if (r < 0)
3411 {
3412 errno = save_errno;
3413 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3415 errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: %m",
3416 fname, LSN_FORMAT_ARGS(targetPagePtr),
3417 readOff)));
3418 }
3419 else
3420 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3422 errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: read %d of %zu",
3423 fname, LSN_FORMAT_ARGS(targetPagePtr),
3424 readOff, r, (Size) XLOG_BLCKSZ)));
3425 goto next_record_is_invalid;
3426 }
3428
3429 Assert(targetSegNo == readSegNo);
3430 Assert(targetPageOff == readOff);
3431 Assert(reqLen <= readLen);
3432
3434
3435 /*
3436 * Check the page header immediately, so that we can retry immediately if
3437 * it's not valid. This may seem unnecessary, because ReadPageInternal()
3438 * validates the page header anyway, and would propagate the failure up to
3439 * ReadRecord(), which would retry. However, there's a corner case with
3440 * continuation records, if a record is split across two pages such that
3441 * we would need to read the two pages from different sources across two
3442 * WAL segments.
3443 *
3444 * The first page is only available locally, in pg_wal, because it's
3445 * already been recycled on the primary. The second page, however, is not
3446 * present in pg_wal, and we should stream it from the primary. There is a
3447 * recycled WAL segment present in pg_wal, with garbage contents, however.
3448 * We would read the first page from the local WAL segment, but when
3449 * reading the second page, we would read the bogus, recycled, WAL
3450 * segment. If we didn't catch that case here, we would never recover,
3451 * because ReadRecord() would retry reading the whole record from the
3452 * beginning.
3453 *
3454 * Of course, this only catches errors in the page header, which is what
3455 * happens in the case of a recycled WAL segment. Other kinds of errors or
3456 * corruption still have the same problem. But this at least fixes the
3457 * common case, which can happen as part of normal operation.
3458 *
3459 * Validating the page header is cheap enough that doing it twice
3460 * shouldn't be a big deal from a performance point of view.
3461 *
3462 * When not in standby mode, an invalid page header should cause recovery
3463 * to end, not retry reading the page, so we don't need to validate the
3464 * page header here for the retry. Instead, ReadPageInternal() is
3465 * responsible for the validation.
3466 */
3467 if (StandbyMode &&
3468 (targetPagePtr % wal_segment_size) == 0 &&
3469 !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3470 {
3471 /*
3472 * Emit this error right now then retry this page immediately. Use
3473 * errmsg_internal() because the message was already translated.
3474 */
3475 if (xlogreader->errormsg_buf[0])
3478
3479 /* reset any error XLogReaderValidatePageHeader() might have set */
3481 goto next_record_is_invalid;
3482 }
3483
3484 return readLen;
3485
3486next_record_is_invalid:
3487
3488 /*
3489 * If we're reading ahead, give up fast. Retries and error reporting will
3490 * be handled by a later read when recovery catches up to this point.
3491 */
3493 return XLREAD_WOULDBLOCK;
3494
3495 lastSourceFailed = true;
3496
3497 if (readFile >= 0)
3498 close(readFile);
3499 readFile = -1;
3500 readLen = 0;
3502
3503 /* In standby-mode, keep trying */
3504 if (StandbyMode)
3505 goto retry;
3506 else
3507 return XLREAD_FAIL;
3508}
3509
3510/*
3511 * Open the WAL segment containing WAL location 'RecPtr'.
3512 *
3513 * The segment can be fetched via restore_command, or via walreceiver having
3514 * streamed the record, or it can already be present in pg_wal. Checking
3515 * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3516 * too, in case someone copies a new segment directly to pg_wal. That is not
3517 * documented or recommended, though.
3518 *
3519 * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3520 * prepare to read WAL starting from RedoStartLSN after this.
3521 *
3522 * 'RecPtr' might not point to the beginning of the record we're interested
3523 * in, it might also point to the page or segment header. In that case,
3524 * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3525 * used to decide which timeline to stream the requested WAL from.
3526 *
3527 * 'replayLSN' is the current replay LSN, so that if we scan for new
3528 * timelines, we can reject a switch to a timeline that branched off before
3529 * this point.
3530 *
3531 * If the record is not immediately available, the function returns
3532 * XLREAD_FAIL if we're not in standby mode. In standby mode, waits for it to become
3533 * available.
3534 *
3535 * When the requested record becomes available, the function opens the file
3536 * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3537 * of standby mode is triggered by the user, and there is no more WAL
3538 * available, returns XLREAD_FAIL.
3539 *
3540 * If nonblocking is true, then give up immediately if we can't satisfy the
3541 * request, returning XLREAD_WOULDBLOCK instead of waiting.
3542 */
3543static XLogPageReadResult
3545 bool fetching_ckpt, XLogRecPtr tliRecPtr,
3546 TimeLineID replayTLI, XLogRecPtr replayLSN,
3547 bool nonblocking)
3548{
3549 static TimestampTz last_fail_time = 0;
3551 bool streaming_reply_sent = false;
3552
3553 /*-------
3554 * Standby mode is implemented by a state machine:
3555 *
3556 * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3557 * pg_wal (XLOG_FROM_PG_WAL)
3558 * 2. Check for promotion trigger request
3559 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3560 * 4. Rescan timelines
3561 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3562 *
3563 * Failure to read from the current source advances the state machine to
3564 * the next state.
3565 *
3566 * 'currentSource' indicates the current state. There are no currentSource
3567 * values for "check trigger", "rescan timelines", and "sleep" states,
3568 * those actions are taken when reading from the previous source fails, as
3569 * part of advancing to the next state.
3570 *
3571 * If standby mode is turned off while reading WAL from stream, we move
3572 * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3573 * the files (which would be required at end of recovery, e.g., timeline
3574 * history file) from archive or pg_wal. We don't need to kill WAL receiver
3575 * here because it's already stopped when standby mode is turned off at
3576 * the end of recovery.
3577 *-------
3578 */
3579 if (!InArchiveRecovery)
3581 else if (currentSource == XLOG_FROM_ANY ||
3583 {
3584 lastSourceFailed = false;
3586 }
3587
3588 for (;;)
3589 {
3590 XLogSource oldSource = currentSource;
3591 bool startWalReceiver = false;
3592
3593 /*
3594 * First check if we failed to read from the current source, and
3595 * advance the state machine if so. The failure to read might've
3596 * happened outside this function, e.g. when a CRC check fails on a
3597 * record, or within this loop.
3598 */
3599 if (lastSourceFailed)
3600 {
3601 /*
3602 * Don't allow any retry loops to occur during nonblocking
3603 * readahead. Let the caller process everything that has been
3604 * decoded already first.
3605 */
3606 if (nonblocking)
3607 return XLREAD_WOULDBLOCK;
3608
3609 switch (currentSource)
3610 {
3611 case XLOG_FROM_ARCHIVE:
3612 case XLOG_FROM_PG_WAL:
3613
3614 /*
3615 * Check to see if promotion is requested. Note that we do
3616 * this only after failure, so when you promote, we still
3617 * finish replaying as much as we can from archive and
3618 * pg_wal before failover.
3619 */
3621 {
3623 return XLREAD_FAIL;
3624 }
3625
3626 /*
3627 * Not in standby mode, and we've now tried the archive
3628 * and pg_wal.
3629 */
3630 if (!StandbyMode)
3631 return XLREAD_FAIL;
3632
3633 /*
3634 * Move to XLOG_FROM_STREAM state, and set to start a
3635 * walreceiver if necessary.
3636 */
3638 startWalReceiver = true;
3639 break;
3640
3641 case XLOG_FROM_STREAM:
3642
3643 /*
3644 * Failure while streaming. Most likely, we got here
3645 * because streaming replication was terminated, or
3646 * promotion was triggered. But we also get here if we
3647 * find an invalid record in the WAL streamed from the
3648 * primary, in which case something is seriously wrong.
3649 * There's little chance that the problem will just go
3650 * away, but PANIC is not good for availability either,
3651 * especially in hot standby mode. So, we treat that the
3652 * same as disconnection, and retry from archive/pg_wal
3653 * again. The WAL in the archive should be identical to
3654 * what was streamed, so it's unlikely that it helps, but
3655 * one can hope...
3656 */
3657
3658 /*
3659 * We should be able to move to XLOG_FROM_STREAM only in
3660 * standby mode.
3661 */
3663
3664 /*
3665 * Before we leave XLOG_FROM_STREAM state, make sure that
3666 * walreceiver is not active, so that it won't overwrite
3667 * WAL that we restore from archive.
3668 */
3670
3671 /*
3672 * Before we sleep, re-scan for possible new timelines if
3673 * we were requested to recover to the latest timeline.
3674 */
3676 {
3677 if (rescanLatestTimeLine(replayTLI, replayLSN))
3678 {
3680 break;
3681 }
3682 }
3683
3684 /*
3685 * XLOG_FROM_STREAM is the last state in our state
3686 * machine, so we've exhausted all the options for
3687 * obtaining the requested WAL. We're going to loop back
3688 * and retry from the archive, but if it hasn't been long
3689 * since last attempt, sleep wal_retrieve_retry_interval
3690 * milliseconds to avoid busy-waiting.
3691 */
3693 if (!TimestampDifferenceExceeds(last_fail_time, now,
3695 {
3696 long wait_time;
3697
3698 wait_time = wal_retrieve_retry_interval -
3699 TimestampDifferenceMilliseconds(last_fail_time, now);
3700
3701 elog(LOG, "waiting for WAL to become available at %X/%X",
3702 LSN_FORMAT_ARGS(RecPtr));
3703
3704 /* Do background tasks that might benefit us later. */
3706
3710 wait_time,
3711 WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
3714
3715 /* Handle interrupt signals of startup process */
3717 }
3718 last_fail_time = now;
3720 break;
3721
3722 default:
3723 elog(ERROR, "unexpected WAL source %d", currentSource);
3724 }
3725 }
3726 else if (currentSource == XLOG_FROM_PG_WAL)
3727 {
3728 /*
3729 * We just successfully read a file in pg_wal. We prefer files in
3730 * the archive over ones in pg_wal, so try the next file again
3731 * from the archive first.
3732 */
3735 }
3736
3737 if (currentSource != oldSource)
3738 elog(DEBUG2, "switched WAL source from %s to %s after %s",
3740 lastSourceFailed ? "failure" : "success");
3741
3742 /*
3743 * We've now handled possible failure. Try to read from the chosen
3744 * source.
3745 */
3746 lastSourceFailed = false;
3747
3748 switch (currentSource)
3749 {
3750 case XLOG_FROM_ARCHIVE:
3751 case XLOG_FROM_PG_WAL:
3752
3753 /*
3754 * WAL receiver must not be running when reading WAL from
3755 * archive or pg_wal.
3756 */
3758
3759 /* Close any old file we might have open. */
3760 if (readFile >= 0)
3761 {
3762 close(readFile);
3763 readFile = -1;
3764 }
3765 /* Reset curFileTLI if random fetch. */
3766 if (randAccess)
3767 curFileTLI = 0;
3768
3769 /*
3770 * Try to restore the file from archive, or read an existing
3771 * file from pg_wal.
3772 */
3776 if (readFile >= 0)
3777 return XLREAD_SUCCESS; /* success! */
3778
3779 /*
3780 * Nope, not found in archive or pg_wal.
3781 */
3782 lastSourceFailed = true;
3783 break;
3784
3785 case XLOG_FROM_STREAM:
3786 {
3787 bool havedata;
3788
3789 /*
3790 * We should be able to move to XLOG_FROM_STREAM only in
3791 * standby mode.
3792 */
3794
3795 /*
3796 * First, shut down walreceiver if its restart has been
3797 * requested -- but no point if we're already slated for
3798 * starting it.
3799 */
3800 if (pendingWalRcvRestart && !startWalReceiver)
3801 {
3803
3804 /*
3805 * Re-scan for possible new timelines if we were
3806 * requested to recover to the latest timeline.
3807 */
3810 rescanLatestTimeLine(replayTLI, replayLSN);
3811
3812 startWalReceiver = true;
3813 }
3814 pendingWalRcvRestart = false;
3815
3816 /*
3817 * Launch walreceiver if needed.
3818 *
3819 * If fetching_ckpt is true, RecPtr points to the initial
3820 * checkpoint location. In that case, we use RedoStartLSN
3821 * as the streaming start position instead of RecPtr, so
3822 * that when we later jump backwards to start redo at
3823 * RedoStartLSN, we will have the logs streamed already.
3824 */
3825 if (startWalReceiver &&
3826 PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
3827 {
3828 XLogRecPtr ptr;
3829 TimeLineID tli;
3830
3831 if (fetching_ckpt)
3832 {
3833 ptr = RedoStartLSN;
3834 tli = RedoStartTLI;
3835 }
3836 else
3837 {
3838 ptr = RecPtr;
3839
3840 /*
3841 * Use the record begin position to determine the
3842 * TLI, rather than the position we're reading.
3843 */
3844 tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
3845
3846 if (curFileTLI > 0 && tli < curFileTLI)
3847 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3848 LSN_FORMAT_ARGS(tliRecPtr),
3849 tli, curFileTLI);
3850 }
3851 curFileTLI = tli;
3856 flushedUpto = 0;
3857 }
3858
3859 /*
3860 * Check if WAL receiver is active or wait to start up.
3861 */
3862 if (!WalRcvStreaming())
3863 {
3864 lastSourceFailed = true;
3865 break;
3866 }
3867
3868 /*
3869 * Walreceiver is active, so see if new data has arrived.
3870 *
3871 * We only advance XLogReceiptTime when we obtain fresh
3872 * WAL from walreceiver and observe that we had already
3873 * processed everything before the most recent "chunk"
3874 * that it flushed to disk. In steady state where we are
3875 * keeping up with the incoming data, XLogReceiptTime will
3876 * be updated on each cycle. When we are behind,
3877 * XLogReceiptTime will not advance, so the grace time
3878 * allotted to conflicting queries will decrease.
3879 */
3880 if (RecPtr < flushedUpto)
3881 havedata = true;
3882 else
3883 {
3884 XLogRecPtr latestChunkStart;
3885
3886 flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3887 if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
3888 {
3889 havedata = true;
3890 if (latestChunkStart <= RecPtr)
3891 {
3894 }
3895 }
3896 else
3897 havedata = false;
3898 }
3899 if (havedata)
3900 {
3901 /*
3902 * Great, streamed far enough. Open the file if it's
3903 * not open already. Also read the timeline history
3904 * file if we haven't initialized timeline history
3905 * yet; it should be streamed over and present in
3906 * pg_wal by now. Use XLOG_FROM_STREAM so that source
3907 * info is set correctly and XLogReceiptTime isn't
3908 * changed.
3909 *
3910 * NB: We must set readTimeLineHistory based on
3911 * recoveryTargetTLI, not receiveTLI. Normally they'll
3912 * be the same, but if recovery_target_timeline is
3913 * 'latest' and archiving is configured, then it's
3914 * possible that we managed to retrieve one or more
3915 * new timeline history files from the archive,
3916 * updating recoveryTargetTLI.
3917 */
3918 if (readFile < 0)
3919 {
3920 if (!expectedTLEs)
3923 XLOG_FROM_STREAM, false);
3924 Assert(readFile >= 0);
3925 }
3926 else
3927 {
3928 /* just make sure source info is correct... */
3931 return XLREAD_SUCCESS;
3932 }
3933 break;
3934 }
3935
3936 /* In nonblocking mode, return rather than sleeping. */
3937 if (nonblocking)
3938 return XLREAD_WOULDBLOCK;
3939
3940 /*
3941 * Data not here yet. Check for trigger, then wait for
3942 * walreceiver to wake us up when new WAL arrives.
3943 */
3945 {
3946 /*
3947 * Note that we don't return XLREAD_FAIL immediately
3948 * here. After being triggered, we still want to
3949 * replay all the WAL that was already streamed. It's
3950 * in pg_wal now, so we just treat this as a failure,
3951 * and the state machine will move on to replay the
3952 * streamed WAL from pg_wal, and then recheck the
3953 * trigger and exit replay.
3954 */
3955 lastSourceFailed = true;
3956 break;
3957 }
3958
3959 /*
3960 * Since we have replayed everything we have received so
3961 * far and are about to start waiting for more WAL, let's
3962 * tell the upstream server our replay location now so
3963 * that pg_stat_replication doesn't show stale
3964 * information.
3965 */
3966 if (!streaming_reply_sent)
3967 {
3969 streaming_reply_sent = true;
3970 }
3971
3972 /* Do any background tasks that might benefit us later. */
3974
3975 /* Update pg_stat_recovery_prefetch before sleeping. */
3977
3978 /*
3979 * Wait for more WAL to arrive, when we will be woken
3980 * immediately by the WAL receiver.
3981 */
3984 -1L,
3985 WAIT_EVENT_RECOVERY_WAL_STREAM);
3987 break;
3988 }
3989
3990 default:
3991 elog(ERROR, "unexpected WAL source %d", currentSource);
3992 }
3993
3994 /*
3995 * Check for recovery pause here so that we can confirm more quickly
3996 * that a requested pause has actually taken effect.
3997 */
3998 if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
4000 recoveryPausesHere(false);
4001
4002 /*
4003 * This possibly-long loop needs to handle interrupts of startup
4004 * process.
4005 */
4007 }
4008
4009 return XLREAD_FAIL; /* not reached */
4010}
4011
4012
4013/*
4014 * Determine what log level should be used to report a corrupt WAL record
4015 * in the current WAL page, previously read by XLogPageRead().
4016 *
4017 * 'emode' is the error mode that would be used to report a file-not-found
4018 * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4019 * we're retrying the exact same record that we've tried previously, only
4020 * complain the first time to keep the noise down. However, we only do this when
4021 * reading from pg_wal, because we don't expect any invalid records in archive
4022 * or in records streamed from the primary. Files in the archive should be complete,
4023 * and we should never hit the end of WAL because we stop and wait for more WAL
4024 * to arrive before replaying it.
4025 *
4026 * NOTE: This function remembers the RecPtr value it was last called with,
4027 * to suppress repeated messages about the same record. Only call this when
4028 * you are about to ereport(), or you might cause a later message to be
4029 * erroneously suppressed.
4030 */
4031static int
4033{
4034 static XLogRecPtr lastComplaint = 0;
4035
4036 if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
4037 {
4038 if (RecPtr == lastComplaint)
4039 emode = DEBUG1;
4040 else
4041 lastComplaint = RecPtr;
4042 }
4043 return emode;
4044}
4045
4046
4047/*
4048 * Subroutine to try to fetch and validate a prior checkpoint record.
4049 */
4050static XLogRecord *
4052 TimeLineID replayTLI)
4053{
4054 XLogRecord *record;
4055 uint8 info;
4056
4057 Assert(xlogreader != NULL);
4058
4059 if (!XRecOffIsValid(RecPtr))
4060 {
4061 ereport(LOG,
4062 (errmsg("invalid checkpoint location")));
4063 return NULL;
4064 }
4065
4067 record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4068
4069 if (record == NULL)
4070 {
4071 ereport(LOG,
4072 (errmsg("invalid checkpoint record")));
4073 return NULL;
4074 }
4075 if (record->xl_rmid != RM_XLOG_ID)
4076 {
4077 ereport(LOG,
4078 (errmsg("invalid resource manager ID in checkpoint record")));
4079 return NULL;
4080 }
4081 info = record->xl_info & ~XLR_INFO_MASK;
4082 if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4083 info != XLOG_CHECKPOINT_ONLINE)
4084 {
4085 ereport(LOG,
4086 (errmsg("invalid xl_info in checkpoint record")));
4087 return NULL;
4088 }
4090 {
4091 ereport(LOG,
4092 (errmsg("invalid length of checkpoint record")));
4093 return NULL;
4094 }
4095 return record;
4096}
4097
4098/*
4099 * Scan for new timelines that might have appeared in the archive since we
4100 * started recovery.
4101 *
4102 * If there are any, the function changes recovery target TLI to the latest
4103 * one and returns 'true'.
4104 */
4105static bool
4107{
4108 List *newExpectedTLEs;
4109 bool found;
4110 ListCell *cell;
4111 TimeLineID newtarget;
4112 TimeLineID oldtarget = recoveryTargetTLI;
4113 TimeLineHistoryEntry *currentTle = NULL;
4114
4116 if (newtarget == recoveryTargetTLI)
4117 {
4118 /* No new timelines found */
4119 return false;
4120 }
4121
4122 /*
4123 * Determine the list of expected TLIs for the new TLI
4124 */
4125
4126 newExpectedTLEs = readTimeLineHistory(newtarget);
4127
4128 /*
4129 * If the current timeline is not part of the history of the new timeline,
4130 * we cannot proceed to it.
4131 */
4132 found = false;
4133 foreach(cell, newExpectedTLEs)
4134 {
4135 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4136
4137 if (currentTle->tli == recoveryTargetTLI)
4138 {
4139 found = true;
4140 break;
4141 }
4142 }
4143 if (!found)
4144 {
4145 ereport(LOG,
4146 (errmsg("new timeline %u is not a child of database system timeline %u",
4147 newtarget,
4148 replayTLI)));
4149 return false;
4150 }
4151
4152 /*
4153 * The current timeline was found in the history file, but check that the
4154 * next timeline was forked off from it *after* the current recovery
4155 * location.
4156 */
4157 if (currentTle->end < replayLSN)
4158 {
4159 ereport(LOG,
4160 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4161 newtarget,
4162 replayTLI,
4163 LSN_FORMAT_ARGS(replayLSN))));
4164 return false;
4165 }
4166
4167 /* The new timeline history seems valid. Switch target */
4168 recoveryTargetTLI = newtarget;
4170 expectedTLEs = newExpectedTLEs;
4171
4172 /*
4173 * As in StartupXLOG(), try to ensure we have all the history files
4174 * between the old target and new target in pg_wal.
4175 */
4176 restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4177
4178 ereport(LOG,
4179 (errmsg("new target timeline is %u",
4181
4182 return true;
4183}
4184
4185
4186/*
4187 * Open a logfile segment for reading (during recovery).
4188 *
4189 * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4190 * Otherwise, it's assumed to be already available in pg_wal.
4191 */
4192static int
4194 XLogSource source, bool notfoundOk)
4195{
4196 char xlogfname[MAXFNAMELEN];
4197 char activitymsg[MAXFNAMELEN + 16];
4198 char path[MAXPGPATH];
4199 int fd;
4200
4201 XLogFileName(xlogfname, tli, segno, wal_segment_size);
4202
4203 switch (source)
4204 {
4205 case XLOG_FROM_ARCHIVE:
4206 /* Report recovery progress in PS display */
4207 snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4208 xlogfname);
4209 set_ps_display(activitymsg);
4210
4211 if (!RestoreArchivedFile(path, xlogfname,
4212 "RECOVERYXLOG",
4214 InRedo))
4215 return -1;
4216 break;
4217
4218 case XLOG_FROM_PG_WAL:
4219 case XLOG_FROM_STREAM:
4220 XLogFilePath(path, tli, segno, wal_segment_size);
4221 break;
4222
4223 default:
4224 elog(ERROR, "invalid XLogFileRead source %d", source);
4225 }
4226
4227 /*
4228 * If the segment was fetched from archival storage, replace the existing
4229 * xlog segment (if any) with the archival version.
4230 */
4232 {
4234 KeepFileRestoredFromArchive(path, xlogfname);
4235
4236 /*
4237 * Set path to point at the new file in pg_wal.
4238 */
4239 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4240 }
4241
4242 fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4243 if (fd >= 0)
4244 {
4245 /* Success! */
4246 curFileTLI = tli;
4247
4248 /* Report recovery progress in PS display */
4249 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4250 xlogfname);
4251 set_ps_display(activitymsg);
4252
4253 /* Track source of data in assorted state variables */
4256 /* In FROM_STREAM case, caller tracks receipt time, not me */
4257 if (source != XLOG_FROM_STREAM)
4259
4260 return fd;
4261 }
4262 if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4263 ereport(PANIC,
4265 errmsg("could not open file \"%s\": %m", path)));
4266 return -1;
4267}
4268
4269/*
4270 * Open a logfile segment for reading (during recovery).
4271 *
4272 * This version searches for the segment with any TLI listed in expectedTLEs.
4273 */
4274static int
4276{
4277 char path[MAXPGPATH];
4278 ListCell *cell;
4279 int fd;
4280 List *tles;
4281
4282 /*
4283 * Loop looking for a suitable timeline ID: we might need to read any of
4284 * the timelines listed in expectedTLEs.
4285 *
4286 * We expect curFileTLI on entry to be the TLI of the preceding file in
4287 * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4288 * to go backwards; this prevents us from picking up the wrong file when a
4289 * parent timeline extends to higher segment numbers than the child we
4290 * want to read.
4291 *
4292 * If we haven't read the timeline history file yet, read it now, so that
4293 * we know which TLIs to scan. We don't save the list in expectedTLEs,
4294 * however, unless we actually find a valid segment. That way if there is
4295 * neither a timeline history file nor a WAL segment in the archive, and
4296 * streaming replication is set up, we'll read the timeline history file
4297 * streamed from the primary when we start streaming, instead of
4298 * recovering with a dummy history generated here.
4299 */
4300 if (expectedTLEs)
4301 tles = expectedTLEs;
4302 else
4304
4305 foreach(cell, tles)
4306 {
4308 TimeLineID tli = hent->tli;
4309
4310 if (tli < curFileTLI)
4311 break; /* don't bother looking at too-old TLIs */
4312
4313 /*
4314 * Skip scanning the timeline ID that the logfile segment to read
4315 * doesn't belong to
4316 */
4317 if (hent->begin != InvalidXLogRecPtr)
4318 {
4319 XLogSegNo beginseg = 0;
4320
4321 XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4322
4323 /*
4324 * The logfile segment that doesn't belong to the timeline is
4325 * older or newer than the segment that the timeline started or
4326 * ended at, respectively. It's sufficient to check only the
4327 * starting segment of the timeline here. Since the timelines are
4328 * scanned in descending order in this loop, any segments newer
4329 * than the ending segment should belong to a newer timeline and
4330 * have already been read before. So it's not necessary to check
4331 * the ending segment of the timeline here.
4332 */
4333 if (segno < beginseg)
4334 continue;
4335 }
4336
4338 {
4339 fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
4340 if (fd != -1)
4341 {
4342 elog(DEBUG1, "got WAL segment from archive");
4343 if (!expectedTLEs)
4344 expectedTLEs = tles;
4345 return fd;
4346 }
4347 }
4348
4350 {
4351 fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
4352 if (fd != -1)
4353 {
4354 if (!expectedTLEs)
4355 expectedTLEs = tles;
4356 return fd;
4357 }
4358 }
4359 }
4360
4361 /* Couldn't find it. For simplicity, complain about front timeline */
4363 errno = ENOENT;
4366 errmsg("could not open file \"%s\": %m", path)));
4367 return -1;
4368}
4369
4370/*
4371 * Set flag to signal the walreceiver to restart. (The startup process calls
4372 * this on noticing a relevant configuration change.)
4373 */
4374void
4376{
4378 {
4379 ereport(LOG,
4380 (errmsg("WAL receiver process shutdown requested")));
4381
4382 pendingWalRcvRestart = true;
4383 }
4384}
4385
4386
4387/*
4388 * Has a standby promotion already been triggered?
4389 *
4390 * Unlike CheckForStandbyTrigger(), this works in any process
4391 * that's connected to shared memory.
4392 */
4393bool
4395{
4396 /*
4397 * We check shared state each time only until a standby promotion is
4398 * triggered. We can't trigger a promotion again, so there's no need to
4399 * keep checking after the shared variable has once been seen true.
4400 */
4402 return true;
4403
4407
4409}
4410
4411static void
4413{
4417
4418 /*
4419 * Mark the recovery pause state as 'not paused' because the paused state
4420 * ends and promotion continues if a promotion is triggered while recovery
4421 * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4422 * return 'paused' while a promotion is ongoing.
4423 */
4424 SetRecoveryPause(false);
4425
4427}
4428
4429/*
4430 * Check whether a promote request has arrived.
4431 */
4432static bool
4434{
4436 return true;
4437
4439 {
4440 ereport(LOG, (errmsg("received promote request")));
4444 return true;
4445 }
4446
4447 return false;
4448}
4449
4450/*
4451 * Remove the file signaling a standby promotion request.
4452 */
4453void
4455{
4456 unlink(PROMOTE_SIGNAL_FILE);
4457}
4458
4459/*
4460 * Check to see if a promote request has arrived.
4461 */
4462bool
4464{
4465 struct stat stat_buf;
4466
4467 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4468 return true;
4469
4470 return false;
4471}
4472
4473/*
4474 * Wake up startup process to replay newly arrived WAL, or to notice that
4475 * failover has been requested.
4476 */
4477void
4479{
4481}
4482
4483/*
4484 * Schedule a walreceiver wakeup in the main recovery loop.
4485 */
4486void
4488{
4490}
4491
4492/*
4493 * Is HotStandby active yet? This is only important in special backends
4494 * since normal backends won't ever be able to connect until this returns
4495 * true. Postmaster knows this by way of signal, not via shared memory.
4496 *
4497 * Unlike testing standbyState, this works in any process that's connected to
4498 * shared memory. (And note that standbyState alone doesn't tell the truth
4499 * anyway.)
4500 */
4501bool
4503{
4504 /*
4505 * We check shared state each time only until Hot Standby is active. We
4506 * can't de-activate Hot Standby, so there's no need to keep checking
4507 * after the shared variable has once been seen true.
4508 */
4510 return true;
4511 else
4512 {
4513 /* spinlock is essential on machines with weak memory ordering! */
4517
4518 return LocalHotStandbyActive;
4519 }
4520}
4521
4522/*
4523 * Like HotStandbyActive(), but to be used only in WAL replay code,
4524 * where we don't need to ask any other process what the state is.
4525 */
4526static bool
4528{
4530 return LocalHotStandbyActive;
4531}
4532
4533/*
4534 * Get latest redo apply position.
4535 *
4536 * Exported to allow WALReceiver to read the pointer directly.
4537 */
4540{
4541 XLogRecPtr recptr;
4542 TimeLineID tli;
4543
4548
4549 if (replayTLI)
4550 *replayTLI = tli;
4551 return recptr;
4552}
4553
4554
4555/*
4556 * Get position of last applied, or the record being applied.
4557 *
4558 * This is different from GetXLogReplayRecPtr() in that if a WAL
4559 * record is currently being applied, this includes that record.
4560 */
4563{
4564 XLogRecPtr recptr;
4565 TimeLineID tli;
4566
4571
4572 if (replayEndTLI)
4573 *replayEndTLI = tli;
4574 return recptr;
4575}
4576
4577/*
4578 * Save timestamp of latest processed commit/abort record.
4579 *
4580 * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4581 * seen by processes other than the startup process. Note in particular
4582 * that CreateRestartPoint is executed in the checkpointer.
4583 */
4584static void
4586{
4590}
4591
4592/*
4593 * Fetch timestamp of latest processed commit/abort record.
4594 */
4597{
4598 TimestampTz xtime;
4599
4603
4604 return xtime;
4605}
4606
4607/*
4608 * Save timestamp of the next chunk of WAL records to apply.
4609 *
4610 * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4611 * seen by all backends.
4612 */
4613static void
4615{
4619}
4620
4621/*
4622 * Fetch timestamp of the current chunk of WAL records to apply.
4623 * Startup process maintains an accurate local copy in XLogReceiptTime
4624 */
4627{
4628 TimestampTz xtime;
4629
4633
4634 return xtime;
4635}
4636
4637/*
4638 * Returns time of receipt of current chunk of XLOG data, as well as
4639 * whether it was received from streaming replication or from archives.
4640 */
4641void
4642GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4643{
4644 /*
4645 * This must be executed in the startup process, since we don't export the
4646 * relevant state to shared memory.
4647 */
4649
4650 *rtime = XLogReceiptTime;
4651 *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4652}
4653
4654/*
4655 * Note that the text field supplied is a parameter name and does not require
4656 * translation.
4657 */
4658void
4659RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4660{
4661 if (currValue < minValue)
4662 {
4664 {
4665 bool warned_for_promote = false;
4666
4668 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4669 errmsg("hot standby is not possible because of insufficient parameter settings"),
4670 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4671 param_name,
4672 currValue,
4673 minValue)));
4674
4675 SetRecoveryPause(true);
4676
4677 ereport(LOG,
4678 (errmsg("recovery has paused"),
4679 errdetail("If recovery is unpaused, the server will shut down."),
4680 errhint("You can then restart the server after making the necessary configuration changes.")));
4681
4683 {
4685
4687 {
4688 if (!warned_for_promote)
4690 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4691 errmsg("promotion is not possible because of insufficient parameter settings"),
4692
4693 /*
4694 * Repeat the detail from above so it's easy to find
4695 * in the log.
4696 */
4697 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4698 param_name,
4699 currValue,
4700 minValue),
4701 errhint("Restart the server after making the necessary configuration changes.")));
4702 warned_for_promote = true;
4703 }
4704
4705 /*
4706 * If recovery pause is requested then set it paused. While
4707 * we are in the loop, user might resume and pause again so
4708 * set this every time.
4709 */
4711
4712 /*
4713 * We wait on a condition variable that will wake us as soon
4714 * as the pause ends, but we use a timeout so we can check the
4715 * above conditions periodically too.
4716 */
4718 WAIT_EVENT_RECOVERY_PAUSE);
4719 }
4721 }
4722
4723 ereport(FATAL,
4724 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4725 errmsg("recovery aborted because of insufficient parameter settings"),
4726 /* Repeat the detail from above so it's easy to find in the log. */
4727 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4728 param_name,
4729 currValue,
4730 minValue),
4731 errhint("You can restart the server after making the necessary configuration changes.")));
4732 }
4733}
4734
4735
4736/*
4737 * GUC check_hook for primary_slot_name
4738 */
4739bool
4741{
4742 if (*newval && strcmp(*newval, "") != 0 &&
4744 return false;
4745
4746 return true;
4747}
4748
4749/*
4750 * Recovery target settings: Only one of the several recovery_target* settings
4751 * may be set. Setting a second one results in an error. The global variable
4752 * recoveryTarget tracks which kind of recovery target was chosen. Other
4753 * variables store the actual target value (for example a string or a xid).
4754 * The assign functions of the parameters check whether a competing parameter
4755 * was already set. But we want to allow setting the same parameter multiple
4756 * times. We also want to allow unsetting a parameter and setting a different
4757 * one, so we unset recoveryTarget when the parameter is set to an empty
4758 * string.
4759 *
4760 * XXX this code is broken by design. Throwing an error from a GUC assign
4761 * hook breaks fundamental assumptions of guc.c. So long as all the variables
4762 * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4763 * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4764 * that we have odd behaviors such as unexpected GUC ordering dependencies.
4765 */
4766
4767static void
4769error_multiple_recovery_targets(void)
4770{
4771 ereport(ERROR,
4772 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4773 errmsg("multiple recovery targets specified"),
4774 errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4775}
4776
4777/*
4778 * GUC check_hook for recovery_target
4779 */
4780bool
4782{
4783 if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4784 {
4785 GUC_check_errdetail("The only allowed value is \"immediate\".");
4786 return false;
4787 }
4788 return true;
4789}
4790
4791/*
4792 * GUC assign_hook for recovery_target
4793 */
4794void
4795assign_recovery_target(const char *newval, void *extra)
4796{
	/*
	 * NOTE(review): lines 4797-4798, 4802 and 4804 are absent from this
	 * listing.  Presumably 4797-4798 hold the condition guarding the error
	 * call below (a competing recovery target already being set, per the
	 * "Recovery target settings" comment earlier in this file) and 4802/4804
	 * are the bodies of the if/else -- confirm against the complete file.
	 */
4799 error_multiple_recovery_targets();
4800
	/* A non-NULL, non-empty value takes the first branch; anything else the second. */
4801 if (newval && strcmp(newval, "") != 0)
4803 else
4805}
4806
4807/*
4808 * GUC check_hook for recovery_target_lsn
4809 */
4810bool
4812{
4813 if (strcmp(*newval, "") != 0)
4814 {
4815 XLogRecPtr lsn;
4816 XLogRecPtr *myextra;
4817 bool have_error = false;
4818
4819 lsn = pg_lsn_in_internal(*newval, &have_error);
4820 if (have_error)
4821 return false;
4822
4823 myextra = (XLogRecPtr *) guc_malloc(ERROR, sizeof(XLogRecPtr));
4824 *myextra = lsn;
4825 *extra = myextra;
4826 }
4827 return true;
4828}
4829
4830/*
4831 * GUC assign_hook for recovery_target_lsn
4832 */
4833void
4834assign_recovery_target_lsn(const char *newval, void *extra)
4835{
	/*
	 * NOTE(review): lines 4836-4837, 4842 and 4846 are absent from this
	 * listing.  Presumably 4836-4837 hold the condition guarding the error
	 * call below (a competing recovery target already being set) and
	 * 4842/4846 update the chosen target kind -- confirm against the
	 * complete file.
	 */
4838 error_multiple_recovery_targets();
4839
4840 if (newval && strcmp(newval, "") != 0)
4841 {
	/*
	 * extra is read as a pointer to an XLogRecPtr; presumably this is the
	 * value stashed by the matching check hook -- verify.
	 */
4843 recoveryTargetLSN = *((XLogRecPtr *) extra);
4844 }
4845 else
4847}
4848
4849/*
4850 * GUC check_hook for recovery_target_name
4851 */
4852bool
4854{
4855 /* Use the value of newval directly */
4856 if (strlen(*newval) >= MAXFNAMELEN)
4857 {
4858 GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4859 "recovery_target_name", MAXFNAMELEN - 1);
4860 return false;
4861 }
4862 return true;
4863}
4864
4865/*
4866 * GUC assign_hook for recovery_target_name
4867 */
4868void
4869assign_recovery_target_name(const char *newval, void *extra)
4870{
	/*
	 * NOTE(review): lines 4871-4872, 4877-4878 and 4881 are absent from this
	 * listing.  Presumably 4871-4872 hold the condition guarding the error
	 * call below (a competing recovery target already being set), 4877-4878
	 * record the new target for a non-empty value, and 4881 handles the
	 * empty/NULL case -- confirm against the complete file.
	 */
4873 error_multiple_recovery_targets();
4874
	/* A non-NULL, non-empty value takes the braced branch; anything else the else. */
4875 if (newval && strcmp(newval, "") != 0)
4876 {
4879 }
4880 else
4882}
4883
4884/*
4885 * GUC check_hook for recovery_target_time
4886 *
4887 * The interpretation of the recovery_target_time string can depend on the
4888 * time zone setting, so we need to wait until after all GUC processing is
4889 * done before we can do the final parsing of the string. This check function
4890 * only does a parsing pass to catch syntax errors, but we store the string
4891 * and parse it again when we need to use it.
4892 */
4893bool
4895{
4896 if (strcmp(*newval, "") != 0)
4897 {
4898 /* reject some special values */
4899 if (strcmp(*newval, "now") == 0 ||
4900 strcmp(*newval, "today") == 0 ||
4901 strcmp(*newval, "tomorrow") == 0 ||
4902 strcmp(*newval, "yesterday") == 0)
4903 {
4904 return false;
4905 }
4906
4907 /*
4908 * parse timestamp value (see also timestamptz_in())
4909 */
4910 {
4911 char *str = *newval;
4912 fsec_t fsec;
4913 struct pg_tm tt,
4914 *tm = &tt;
4915 int tz;
4916 int dtype;
4917 int nf;
4918 int dterr;
4919 char *field[MAXDATEFIELDS];
4920 int ftype[MAXDATEFIELDS];
4921 char workbuf[MAXDATELEN + MAXDATEFIELDS];
4922 DateTimeErrorExtra dtextra;
4924
4925 dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4926 field, ftype, MAXDATEFIELDS, &nf);
4927 if (dterr == 0)
4928 dterr = DecodeDateTime(field, ftype, nf,
4929 &dtype, tm, &fsec, &tz, &dtextra);
4930 if (dterr != 0)
4931 return false;
4932 if (dtype != DTK_DATE)
4933 return false;
4934
4935 if (tm2timestamp(tm, fsec, &tz, &timestamp) != 0)
4936 {
4937 GUC_check_errdetail("Timestamp out of range: \"%s\".", str);
4938 return false;
4939 }
4940 }
4941 }
4942 return true;
4943}
4944
4945/*
4946 * GUC assign_hook for recovery_target_time
4947 */
4948void
4949assign_recovery_target_time(const char *newval, void *extra)
4950{
	/*
	 * NOTE(review): lines 4951-4952, 4956 and 4958 are absent from this
	 * listing.  Presumably 4951-4952 hold the condition guarding the error
	 * call below (a competing recovery target already being set) and
	 * 4956/4958 are the bodies of the if/else -- confirm against the
	 * complete file.
	 */
4953 error_multiple_recovery_targets();
4954
	/* A non-NULL, non-empty value takes the first branch; anything else the second. */
4955 if (newval && strcmp(newval, "") != 0)
4957 else
4959}
4960
4961/*
4962 * GUC check_hook for recovery_target_timeline
4963 */
4964bool
4966{
	/*
	 * NOTE(review): the signature (4965), the local declarations (4967-4968),
	 * the assignments at 4971, 4973 and 4976, and the allocation at 4987 are
	 * absent from this listing.  Visible behavior: "current" and "latest"
	 * are recognized keywords; any other value goes through the numeric
	 * check below; on success a value named rttg is copied into *myextra and
	 * handed back through *extra.
	 */
4969
4970 if (strcmp(*newval, "current") == 0)
4972 else if (strcmp(*newval, "latest") == 0)
4974 else
4975 {
4977
	/*
	 * NOTE(review): strtoul's result is discarded and no end pointer is
	 * requested, so only errno decides validity.  Input with trailing
	 * non-numeric characters, or with no digits at all, is not necessarily
	 * flagged through errno -- consider checking an end pointer.
	 */
4978 errno = 0;
4979 strtoul(*newval, NULL, 0);
4980 if (errno == EINVAL || errno == ERANGE)
4981 {
4982 GUC_check_errdetail("\"recovery_target_timeline\" is not a valid number.");
4983 return false;
4984 }
4985 }
4986
4988 *myextra = rttg;
4989 *extra = myextra;
4990
4991 return true;
4992}
4993
4994/*
4995 * GUC assign_hook for recovery_target_timeline
4996 */
4997void
4999{
	/*
	 * NOTE(review): the signature (4998), lines 5000-5001 (including the
	 * condition of the if whose body follows) and the else body (5004) are
	 * absent from this listing.  Visible behavior: in the first branch the
	 * string is converted with strtoul (base auto-detected) and stored in
	 * recoveryTargetTLIRequested.
	 */
5002 recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
5003 else
5005}
5006
5007/*
5008 * GUC check_hook for recovery_target_xid
5009 */
5010bool
5012{
5013 if (strcmp(*newval, "") != 0)
5014 {
5015 TransactionId xid;
5016 TransactionId *myextra;
5017
5018 errno = 0;
5019 xid = (TransactionId) strtou64(*newval, NULL, 0);
5020 if (errno == EINVAL || errno == ERANGE)
5021 return false;
5022
5023 myextra = (TransactionId *) guc_malloc(ERROR, sizeof(TransactionId));
5024 *myextra = xid;
5025 *extra = myextra;
5026 }
5027 return true;
5028}
5029
5030/*
5031 * GUC assign_hook for recovery_target_xid
5032 */
5033void
5034assign_recovery_target_xid(const char *newval, void *extra)
5035{
	/*
	 * NOTE(review): lines 5036-5037, 5042 and 5046 are absent from this
	 * listing.  Presumably 5036-5037 hold the condition guarding the error
	 * call below (a competing recovery target already being set) and
	 * 5042/5046 update the chosen target kind -- confirm against the
	 * complete file.
	 */
5038 error_multiple_recovery_targets();
5039
5040 if (newval && strcmp(newval, "") != 0)
5041 {
	/*
	 * extra is read as a pointer to a TransactionId; presumably this is the
	 * value stashed by the matching check hook -- verify.
	 */
5043 recoveryTargetXid = *((TransactionId *) extra);
5044 }
5045 else
5047}
List * readTimeLineHistory(TimeLineID targetTLI)
Definition: timeline.c:76
TimeLineID findNewestTimeLine(TimeLineID startTLI)
Definition: timeline.c:264
TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history)
Definition: timeline.c:544
XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI)
Definition: timeline.c:572
bool existsTimeLineHistory(TimeLineID probeTLI)
Definition: timeline.c:222
void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end)
Definition: timeline.c:50
bool tliInHistory(TimeLineID tli, List *expectedTLEs)
Definition: timeline.c:526
void remove_tablespace_symlink(const char *linkloc)
Definition: tablespace.c:883
bool allow_in_place_tablespaces
Definition: tablespace.c:85
void HandleStartupProcInterrupts(void)
Definition: startup.c:154
void disable_startup_progress_timeout(void)
Definition: startup.c:309
bool IsPromoteSignaled(void)
Definition: startup.c:288
void begin_startup_progress_phase(void)
Definition: startup.c:343
void ResetPromoteSignaled(void)
Definition: startup.c:294
int ParseDateTime(const char *timestr, char *workbuf, size_t buflen, char **field, int *ftype, int maxfields, int *numfields)
Definition: datetime.c:764
int DecodeDateTime(char **field, int *ftype, int nf, int *dtype, struct pg_tm *tm, fsec_t *fsec, int *tzp, DateTimeErrorExtra *extra)
Definition: datetime.c:988
long TimestampDifferenceMilliseconds(TimestampTz start_time, TimestampTz stop_time)
Definition: timestamp.c:1756
int tm2timestamp(struct pg_tm *tm, fsec_t fsec, int *tzp, Timestamp *result)
Definition: timestamp.c:1987
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1780
Datum timestamptz_in(PG_FUNCTION_ARGS)
Definition: timestamp.c:417
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1644
const char * timestamptz_to_str(TimestampTz t)
Definition: timestamp.c:1843
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1608
uint32 BlockNumber
Definition: block.h:31
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4883
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5100
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:396
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:191
@ RBM_NORMAL_NO_LOG
Definition: bufmgr.h:51
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:347
PageData * Page
Definition: bufpage.h:82
static XLogRecPtr PageGetLSN(const PageData *page)
Definition: bufpage.h:386
uint8_t uint8
Definition: c.h:486
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:204
#define Assert(condition)
Definition: c.h:815
#define PG_BINARY
Definition: c.h:1230
#define UINT64_FORMAT
Definition: c.h:507
int32_t int32
Definition: c.h:484
uint64_t uint64
Definition: c.h:489
uint32_t uint32
Definition: c.h:488
uint32 TransactionId
Definition: c.h:609
size_t Size
Definition: c.h:562
void RequestCheckpoint(int flags)
Definition: checkpointer.c:952
bool ConditionVariableCancelSleep(void)
bool ConditionVariableTimedSleep(ConditionVariable *cv, long timeout, uint32 wait_event_info)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
int64 TimestampTz
Definition: timestamp.h:39
int32 fsec_t
Definition: timestamp.h:41
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1157