PostgreSQL Source Code  git master
xlogrecovery.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * xlogrecovery.c
4  * Functions for WAL recovery, standby mode
5  *
6  * This source file contains functions controlling WAL recovery.
7  * InitWalRecovery() initializes the system for crash or archive recovery,
8  * or standby mode, depending on configuration options and the state of
9  * the control file and possible backup label file. PerformWalRecovery()
10  * performs the actual WAL replay, calling the rmgr-specific redo routines.
11  * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12  * and prepares information needed to initialize the WAL for writes. In
13  * addition to these three main functions, there are a bunch of functions
14  * for interrogating recovery state and controlling the recovery process.
15  *
16  *
17  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
18  * Portions Copyright (c) 1994, Regents of the University of California
19  *
20  * src/backend/access/transam/xlogrecovery.c
21  *
22  *-------------------------------------------------------------------------
23  */
24 
25 #include "postgres.h"
26 
27 #include <ctype.h>
28 #include <math.h>
29 #include <time.h>
30 #include <sys/stat.h>
31 #include <sys/time.h>
32 #include <unistd.h>
33 
34 #include "access/timeline.h"
35 #include "access/transam.h"
36 #include "access/xact.h"
37 #include "access/xlog_internal.h"
38 #include "access/xlogarchive.h"
39 #include "access/xlogprefetcher.h"
40 #include "access/xlogreader.h"
41 #include "access/xlogrecovery.h"
42 #include "access/xlogutils.h"
43 #include "backup/basebackup.h"
44 #include "catalog/pg_control.h"
45 #include "commands/tablespace.h"
46 #include "common/file_utils.h"
47 #include "miscadmin.h"
48 #include "pgstat.h"
49 #include "postmaster/bgwriter.h"
50 #include "postmaster/startup.h"
51 #include "replication/slot.h"
52 #include "replication/slotsync.h"
54 #include "storage/fd.h"
55 #include "storage/ipc.h"
56 #include "storage/latch.h"
57 #include "storage/pmsignal.h"
58 #include "storage/proc.h"
59 #include "storage/procarray.h"
60 #include "storage/spin.h"
61 #include "utils/builtins.h"
62 #include "utils/datetime.h"
63 #include "utils/guc_hooks.h"
64 #include "utils/pg_lsn.h"
65 #include "utils/ps_status.h"
66 #include "utils/pg_rusage.h"
67 
68 /* Unsupported old recovery command file names (relative to $PGDATA) */
69 #define RECOVERY_COMMAND_FILE "recovery.conf"
70 #define RECOVERY_COMMAND_DONE "recovery.done"
71 
72 /*
73  * GUC support
74  */
76  {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
77  {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
78  {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
79  {NULL, 0, false}
80 };
81 
82 /* options formerly taken from recovery.conf for archive recovery */
84 char *recoveryEndCommand = NULL;
85 char *archiveCleanupCommand = NULL;
92 const char *recoveryTargetName;
95 
96 /* options formerly taken from recovery.conf for XLOG streaming */
97 char *PrimaryConnInfo = NULL;
98 char *PrimarySlotName = NULL;
100 
101 /*
102  * recoveryTargetTimeLineGoal: what the user requested, if any
103  *
104  * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
105  *
106  * recoveryTargetTLI: the currently understood target timeline; may change during recovery (see rescanLatestTimeLine)
107  *
108  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
109  * the timelines of its known parents, newest first (so recoveryTargetTLI is
110  * always the first list member). Only these TLIs are expected to be seen in
111  * the WAL segments we read, and indeed only these TLIs will be considered as
112  * candidate WAL files to open at all.
113  *
114  * curFileTLI: the TLI appearing in the name of the current input WAL file.
115  * (This is not necessarily the same as the timeline from which we are
116  * replaying WAL, which StartupXLOG calls replayTLI, because we could be
117  * scanning data that was copied from an ancestor timeline when the current
118  * file was created.) During a sequential scan we do not allow this value
119  * to decrease.
120  */
126 
127 /*
128  * When ArchiveRecoveryRequested is set, archive recovery was requested,
129  * i.e. signal files were present. When InArchiveRecovery is set, we are
130  * currently recovering using offline XLOG archives. These variables are only
131  * valid in the startup process.
132  *
133  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
134  * currently performing crash recovery using only XLOG files in pg_wal, but
135  * will switch to using offline XLOG archives as soon as we reach the end of
136  * WAL in pg_wal.
137  */
139 bool InArchiveRecovery = false;
140 
141 /*
142  * When StandbyModeRequested is set, standby mode was requested, i.e.
143  * standby.signal file was present. When StandbyMode is set, we are currently
144  * in standby mode. These variables are only valid in the startup process.
145  * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
146  */
147 static bool StandbyModeRequested = false;
148 bool StandbyMode = false;
149 
150 /* was a signal file present at startup? */
151 static bool standby_signal_file_found = false;
152 static bool recovery_signal_file_found = false;
153 
154 /*
155  * CheckPointLoc is the position of the checkpoint record that determines
156  * where to start the replay. It comes from the backup label file or the
157  * control file.
158  *
159  * RedoStartLSN is the checkpoint's REDO location, also from the backup label
160  * file or the control file. In standby mode, XLOG streaming usually starts
161  * from the position where an invalid record was found. But if we fail to
162  * read even the initial checkpoint record, we use the REDO location instead
163  * of the checkpoint location as the start position of XLOG streaming.
164  * Otherwise we would have to jump backwards to the REDO location after
165  * reading the checkpoint record, because the REDO record can precede the
166  * checkpoint record.
167  */
172 
173 /*
174  * Local copy of SharedHotStandbyActive variable. False actually means "not
175  * known, need to check the shared state".
176  */
177 static bool LocalHotStandbyActive = false;
178 
179 /*
180  * Local copy of SharedPromoteIsTriggered variable. False actually means "not
181  * known, need to check the shared state".
182  */
183 static bool LocalPromoteIsTriggered = false;
184 
185 /* Has the recovery code requested a walreceiver wakeup? */
187 
188 /* XLogReader object used to parse the WAL records */
190 
191 /* XLogPrefetcher object used to consume WAL records with read-ahead */
193 
194 /* Parameters passed down from ReadRecord to the XLogPageRead callback. */
195 typedef struct XLogPageReadPrivate
196 {
197  int emode;
198  bool fetching_ckpt; /* are we fetching a checkpoint record? */
202 
203 /* flag to tell XLogPageRead that we have started replaying */
204 static bool InRedo = false;
205 
206 /*
207  * Codes indicating where we got a WAL file from during recovery, or where
208  * to attempt to get one.
209  */
210 typedef enum
211 {
212  XLOG_FROM_ANY = 0, /* no specific source: request to read WAL from any source */
213  XLOG_FROM_ARCHIVE, /* WAL file restored using restore_command */
214  XLOG_FROM_PG_WAL, /* WAL file that already exists in pg_wal */
215  XLOG_FROM_STREAM, /* WAL streamed from the primary */
216 } XLogSource;
217 
218 /*
 * Human-readable names for XLogSources, for debugging output.  The strings
 * are listed in the same order as the XLogSource enumerators ("any" first,
 * "stream" last).  NOTE(review): presumably this array is indexed by an
 * XLogSource value -- confirm at the call sites before reordering either.
 */
219 static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
220 
221 /*
222  * readFile is -1 or a kernel FD for the log file segment that's currently
223  * open for reading. readSegNo identifies the segment. readOff is the offset
224  * of the page just read, readLen indicates how much of it has been read into
225  * readBuf, and readSource indicates where we got the currently open file from.
226  *
227  * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
228  * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
229  * worthwhile, since the XLOG is not read by general-purpose sessions.
230  */
231 static int readFile = -1;
232 static XLogSegNo readSegNo = 0;
233 static uint32 readOff = 0;
234 static uint32 readLen = 0;
236 
237 /*
238  * Keeps track of which source we're currently reading from. This is
239  * different from readSource in that this is always set, even when we don't
240  * currently have a WAL file open. If lastSourceFailed is set, our last
241  * attempt to read from currentSource failed, and we should try another source
242  * next.
243  *
244  * pendingWalRcvRestart is set when a config change occurs that requires a
245  * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
246  */
248 static bool lastSourceFailed = false;
249 static bool pendingWalRcvRestart = false;
250 
251 /*
252  * These variables track when we last obtained some WAL data to process,
253  * and where we got it from. (XLogReceiptSource is initially the same as
254  * readSource, but readSource gets reset to zero when we don't have data
255  * to process right now. It is also different from currentSource, which
256  * also changes when we try to read from a source and fail, while
257  * XLogReceiptSource tracks where we last successfully read some WAL.)
258  */
261 
262 /* Local copy of WalRcv->flushedUpto */
265 
266 /*
267  * Copy of minRecoveryPoint and backupEndPoint from the control file.
268  *
269  * In order to reach consistency, we must replay the WAL up to
270  * minRecoveryPoint. If backupEndRequired is true, we must also reach
271  * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
272  * to backupStartPoint.
273  *
274  * Note: In archive recovery, after consistency has been reached, the
275  * functions in xlog.c will start updating minRecoveryPoint in the control
276  * file. But this copy of minRecoveryPoint variable reflects the value at the
277  * beginning of recovery, and is *not* updated after consistency is reached.
278  */
281 
284 static bool backupEndRequired = false;
285 
286 /*
287  * Have we reached a consistent database state? In crash recovery, we have
288  * to replay all the WAL, so reachedConsistency is never set. During archive
289  * recovery, the database is consistent once minRecoveryPoint is reached.
290  *
291  * Consistent state means that the system is internally consistent, all
292  * the WAL has been replayed up to a certain point, and importantly, there
293  * is no trace of later actions on disk.
294  */
295 bool reachedConsistency = false;
296 
297 /* Buffers dedicated to consistency checks of size BLCKSZ */
298 static char *replay_image_masked = NULL;
299 static char *primary_image_masked = NULL;
300 
301 
302 /*
303  * Shared-memory state for WAL recovery.
304  */
305 typedef struct XLogRecoveryCtlData
306 {
307  /*
308  * SharedHotStandbyActive indicates if we allow hot standby queries to be
309  * run. Protected by info_lck.
310  */
312 
313  /*
314  * SharedPromoteIsTriggered indicates if a standby promotion has been
315  * triggered. Protected by info_lck.
316  */
318 
319  /*
320  * recoveryWakeupLatch is used to wake up the startup process to continue
321  * WAL replay, if it is waiting for WAL to arrive or promotion to be
322  * requested.
323  *
324  * Note that the startup process also uses another latch, its procLatch,
325  * to wait for recovery conflict. We could get rid of recoveryWakeupLatch for
326  * signaling the startup process in favor of using its procLatch, which
327  * comports better with possible generic signal handlers using that latch.
328  * But we should not do that because the startup process doesn't assume
329  * that it's woken up by walreceiver process or SIGHUP signal handler
330  * while it's waiting for recovery conflict. The separate latches,
331  * recoveryWakeupLatch and procLatch, should be used for inter-process
332  * communication for WAL replay and recovery conflict, respectively.
333  */
335 
336  /*
337  * Last record successfully replayed.
338  */
339  XLogRecPtr lastReplayedReadRecPtr; /* start position */
340  XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */
341  TimeLineID lastReplayedTLI; /* timeline */
342 
343  /*
344  * When we're currently replaying a record, ie. in a redo function,
345  * replayEndRecPtr points to the end+1 of the record being replayed,
346  * otherwise it's equal to lastReplayedEndRecPtr.
347  */
350  /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
352 
353  /*
354  * timestamp of when we started replaying the current chunk of WAL data,
355  * only relevant for replication or archive recovery
356  */
358  /* Recovery pause state */
361 
362  slock_t info_lck; /* locks shared variables shown above */
364 
366 
367 /*
368  * abortedRecPtr is the start pointer of a broken record at end of WAL when
369  * recovery completes; missingContrecPtr is the location of the first
370  * contrecord that went missing. See CreateOverwriteContrecordRecord for
371  * details.
372  */
375 
376 /*
377  * if recoveryStopsBefore/After returns true, it saves information of the stop
378  * point here
379  */
384 static bool recoveryStopAfter;
385 
386 /* prototypes for local functions */
387 static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
388 
389 static void EnableStandbyMode(void);
390 static void readRecoverySignalFile(void);
391 static void validateRecoveryParameters(void);
392 static bool read_backup_label(XLogRecPtr *checkPointLoc,
393  TimeLineID *backupLabelTLI,
394  bool *backupEndRequired, bool *backupFromStandby);
395 static bool read_tablespace_map(List **tablespaces);
396 
397 static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
398 static void CheckRecoveryConsistency(void);
399 static void rm_redo_error_callback(void *arg);
400 #ifdef WAL_DEBUG
401 static void xlog_outrec(StringInfo buf, XLogReaderState *record);
402 #endif
403 static void xlog_block_info(StringInfo buf, XLogReaderState *record);
404 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
405  TimeLineID prevTLI, TimeLineID replayTLI);
406 static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
407 static void verifyBackupPageConsistency(XLogReaderState *record);
408 
409 static bool recoveryStopsBefore(XLogReaderState *record);
410 static bool recoveryStopsAfter(XLogReaderState *record);
411 static char *getRecoveryStopReason(void);
412 static void recoveryPausesHere(bool endOfRecovery);
413 static bool recoveryApplyDelay(XLogReaderState *record);
414 static void ConfirmRecoveryPaused(void);
415 
417  int emode, bool fetching_ckpt,
418  TimeLineID replayTLI);
419 
420 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
421  int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
423  bool randAccess,
424  bool fetching_ckpt,
425  XLogRecPtr tliRecPtr,
426  TimeLineID replayTLI,
427  XLogRecPtr replayLSN,
428  bool nonblocking);
429 static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
431  XLogRecPtr RecPtr, TimeLineID replayTLI);
432 static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
433 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
434  XLogSource source, bool notfoundOk);
435 static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source);
436 
437 static bool CheckForStandbyTrigger(void);
438 static void SetPromoteIsTriggered(void);
439 static bool HotStandbyActiveInReplay(void);
440 
441 static void SetCurrentChunkStartTime(TimestampTz xtime);
442 static void SetLatestXTime(TimestampTz xtime);
443 
444 /*
445  * Initialization of shared memory for WAL recovery
446  */
447 Size
449 {
450  Size size;
451 
452  /* XLogRecoveryCtl */
453  size = sizeof(XLogRecoveryCtlData);
454 
455  return size;
456 }
457 
458 void
460 {
461  bool found;
462 
464  ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
465  if (found)
466  return;
467  memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
468 
472 }
473 
474 /*
475  * A thin wrapper to enable StandbyMode and do other preparatory work as
476  * needed.
477  */
478 static void
480 {
481  StandbyMode = true;
482 
483  /*
484  * To avoid server log bloat, we don't report recovery progress in a
485  * standby as it will always be in recovery unless promoted. We disable
486  * startup progress timeout in standby mode to avoid calling
487  * startup_progress_timeout_handler() unnecessarily.
488  */
490 }
491 
492 /*
493  * Prepare the system for WAL recovery, if needed.
494  *
495  * This is called by StartupXLOG() which coordinates the server startup
496  * sequence. This function analyzes the control file and the backup label
497  * file, if any, and figures out whether we need to perform crash recovery or
498  * archive recovery, and how far we need to replay the WAL to reach a
499  * consistent state.
500  *
501  * This doesn't yet change the on-disk state, except for creating the symlinks
502  * from table space map file if any, and for fetching WAL files needed to find
503  * the checkpoint record. On entry, the caller has already read the control
504  * file into memory, and passes it as argument. This function updates it to
505  * reflect the recovery state, and the caller is expected to write it back to
506  * disk after initializing other subsystems, but before calling
507  * PerformWalRecovery().
508  *
509  * This initializes some global variables like ArchiveRecoveryRequested, and
510  * StandbyModeRequested and InRecovery.
511  */
512 void
514  bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
515 {
516  XLogPageReadPrivate *private;
517  struct stat st;
518  bool wasShutdown;
519  XLogRecord *record;
520  DBState dbstate_at_startup;
521  bool haveTblspcMap = false;
522  bool haveBackupLabel = false;
523  CheckPoint checkPoint;
524  bool backupFromStandby = false;
525 
526  dbstate_at_startup = ControlFile->state;
527 
528  /*
529  * Initialize on the assumption we want to recover to the latest timeline
530  * that's active according to pg_control.
531  */
535  else
537 
538  /*
539  * Check for signal files, and if so set up state for offline recovery
540  */
543 
544  /*
545  * Take ownership of the wakeup latch if we're going to sleep during
546  * recovery, if required.
547  */
550 
551  /*
552  * Set the WAL reading processor now, as it will be needed when reading
553  * the checkpoint record required (backup_label or not).
554  */
555  private = palloc0(sizeof(XLogPageReadPrivate));
556  xlogreader =
558  XL_ROUTINE(.page_read = &XLogPageRead,
559  .segment_open = NULL,
560  .segment_close = wal_segment_close),
561  private);
562  if (!xlogreader)
563  ereport(ERROR,
564  (errcode(ERRCODE_OUT_OF_MEMORY),
565  errmsg("out of memory"),
566  errdetail("Failed while allocating a WAL reading processor.")));
568 
569  /*
570  * Set the WAL decode buffer size. This limits how far ahead we can read
571  * in the WAL.
572  */
574 
575  /* Create a WAL prefetcher. */
577 
578  /*
579  * Allocate two page buffers dedicated to WAL consistency checks. We do
580  * it this way, rather than just making static arrays, for two reasons:
581  * (1) no need to waste the storage in most instantiations of the backend;
582  * (2) a static char array isn't guaranteed to have any particular
583  * alignment, whereas palloc() will provide MAXALIGN'd storage.
584  */
585  replay_image_masked = (char *) palloc(BLCKSZ);
586  primary_image_masked = (char *) palloc(BLCKSZ);
587 
588  /*
589  * Read the backup_label file. We want to run this part of the recovery
590  * process after checking for signal files and after performing validation
591  * of the recovery parameters.
592  */
594  &backupFromStandby))
595  {
596  List *tablespaces = NIL;
597 
598  /*
599  * Archive recovery was requested, and thanks to the backup label
600  * file, we know how far we need to replay to reach consistency. Enter
601  * archive recovery directly.
602  */
603  InArchiveRecovery = true;
606 
607  /*
608  * Omitting backup_label when creating a new replica, PITR node etc.
609  * unfortunately is a common cause of corruption. Logging that
610  * backup_label was used makes it a bit easier to exclude that as the
611  * cause of observed corruption.
612  *
613  * Do so before we try to read the checkpoint record (which can fail),
614  * as otherwise it can be hard to understand why a checkpoint other
615  * than ControlFile->checkPoint is used.
616  */
617  ereport(LOG,
618  (errmsg("starting backup recovery with redo LSN %X/%X, checkpoint LSN %X/%X, on timeline ID %u",
621  CheckPointTLI)));
622 
623  /*
624  * When a backup_label file is present, we want to roll forward from
625  * the checkpoint it identifies, rather than using pg_control.
626  */
628  CheckPointTLI);
629  if (record != NULL)
630  {
631  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
632  wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
633  ereport(DEBUG1,
634  (errmsg_internal("checkpoint record is at %X/%X",
636  InRecovery = true; /* force recovery even if SHUTDOWNED */
637 
638  /*
639  * Make sure that REDO location exists. This may not be the case
640  * if there was a crash during an online backup, which left a
641  * backup_label around that references a WAL segment that's
642  * already been archived.
643  */
644  if (checkPoint.redo < CheckPointLoc)
645  {
647  if (!ReadRecord(xlogprefetcher, LOG, false,
648  checkPoint.ThisTimeLineID))
649  ereport(FATAL,
650  (errmsg("could not find redo location referenced by checkpoint record"),
651  errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
652  "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
653  "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
655  }
656  }
657  else
658  {
659  ereport(FATAL,
660  (errmsg("could not locate required checkpoint record"),
661  errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
662  "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
663  "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
665  wasShutdown = false; /* keep compiler quiet */
666  }
667 
668  /* Read the tablespace_map file if present and create symlinks. */
669  if (read_tablespace_map(&tablespaces))
670  {
671  ListCell *lc;
672 
673  foreach(lc, tablespaces)
674  {
675  tablespaceinfo *ti = lfirst(lc);
676  char *linkloc;
677 
678  linkloc = psprintf("pg_tblspc/%u", ti->oid);
679 
680  /*
681  * Remove the existing symlink, if any, and create the symlink
682  * under PGDATA.
683  */
684  remove_tablespace_symlink(linkloc);
685 
686  if (symlink(ti->path, linkloc) < 0)
687  ereport(ERROR,
689  errmsg("could not create symbolic link \"%s\": %m",
690  linkloc)));
691 
692  pfree(ti->path);
693  pfree(ti);
694  }
695 
696  /* tell the caller to delete it later */
697  haveTblspcMap = true;
698  }
699 
700  /* tell the caller to delete it later */
701  haveBackupLabel = true;
702  }
703  else
704  {
705  /* No backup_label file has been found if we are here. */
706 
707  /*
708  * If tablespace_map file is present without backup_label file, there
709  * is no use of such file. There is no harm in retaining it, but it
710  * is better to get rid of the map file so that we don't have any
711  * redundant file in data directory and it will avoid any sort of
712  * confusion. It seems prudent though to just rename the file out of
713  * the way rather than delete it completely, also we ignore any error
714  * that occurs in rename operation as even if map file is present
715  * without backup_label file, it is harmless.
716  */
717  if (stat(TABLESPACE_MAP, &st) == 0)
718  {
719  unlink(TABLESPACE_MAP_OLD);
721  ereport(LOG,
722  (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
724  errdetail("File \"%s\" was renamed to \"%s\".",
726  else
727  ereport(LOG,
728  (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
730  errdetail("Could not rename file \"%s\" to \"%s\": %m.",
732  }
733 
734  /*
735  * It's possible that archive recovery was requested, but we don't
736  * know how far we need to replay the WAL before we reach consistency.
737  * This can happen for example if a base backup is taken from a
738  * running server using an atomic filesystem snapshot, without calling
739  * pg_backup_start/stop. Or if you just kill a running primary server
740  * and put it into archive recovery by creating a recovery signal
741  * file.
742  *
743  * Our strategy in that case is to perform crash recovery first,
744  * replaying all the WAL present in pg_wal, and only enter archive
745  * recovery after that.
746  *
747  * But usually we already know how far we need to replay the WAL (up
748  * to minRecoveryPoint, up to backupEndPoint, or until we see an
749  * end-of-backup record), and we can enter archive recovery directly.
750  */
756  {
757  InArchiveRecovery = true;
760  }
761 
762  /*
763  * For the same reason as when starting up with backup_label present,
764  * emit a log message when we continue initializing from a base
765  * backup.
766  */
768  ereport(LOG,
769  (errmsg("restarting backup recovery with redo LSN %X/%X",
771 
772  /* Get the last valid checkpoint record. */
778  CheckPointTLI);
779  if (record != NULL)
780  {
781  ereport(DEBUG1,
782  (errmsg_internal("checkpoint record is at %X/%X",
784  }
785  else
786  {
787  /*
788  * We used to attempt to go back to a secondary checkpoint record
789  * here, but only when not in standby mode. We now just fail if we
790  * can't read the last checkpoint because this allows us to
791  * simplify processing around checkpoints.
792  */
793  ereport(PANIC,
794  (errmsg("could not locate a valid checkpoint record")));
795  }
796  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
797  wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
798  }
799 
801  {
803  ereport(LOG,
804  (errmsg("entering standby mode")));
806  ereport(LOG,
807  (errmsg("starting point-in-time recovery to XID %u",
810  ereport(LOG,
811  (errmsg("starting point-in-time recovery to %s",
814  ereport(LOG,
815  (errmsg("starting point-in-time recovery to \"%s\"",
818  ereport(LOG,
819  (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
822  ereport(LOG,
823  (errmsg("starting point-in-time recovery to earliest consistent point")));
824  else
825  ereport(LOG,
826  (errmsg("starting archive recovery")));
827  }
828 
829  /*
830  * If the location of the checkpoint record is not on the expected
831  * timeline in the history of the requested timeline, we cannot proceed:
832  * the backup is not part of the history of the requested timeline.
833  */
834  Assert(expectedTLEs); /* was initialized by reading checkpoint
835  * record */
838  {
839  XLogRecPtr switchpoint;
840 
841  /*
842  * tliSwitchPoint will throw an error if the checkpoint's timeline is
843  * not in expectedTLEs at all.
844  */
846  ereport(FATAL,
847  (errmsg("requested timeline %u is not a child of this server's history",
849  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
852  LSN_FORMAT_ARGS(switchpoint))));
853  }
854 
855  /*
856  * The min recovery point should be part of the requested timeline's
857  * history, too.
858  */
862  ereport(FATAL,
863  (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
867 
868  ereport(DEBUG1,
869  (errmsg_internal("redo record is at %X/%X; shutdown %s",
870  LSN_FORMAT_ARGS(checkPoint.redo),
871  wasShutdown ? "true" : "false")));
872  ereport(DEBUG1,
873  (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
874  U64FromFullTransactionId(checkPoint.nextXid),
875  checkPoint.nextOid)));
876  ereport(DEBUG1,
877  (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
878  checkPoint.nextMulti, checkPoint.nextMultiOffset)));
879  ereport(DEBUG1,
880  (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
881  checkPoint.oldestXid, checkPoint.oldestXidDB)));
882  ereport(DEBUG1,
883  (errmsg_internal("oldest MultiXactId: %u, in database %u",
884  checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
885  ereport(DEBUG1,
886  (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
887  checkPoint.oldestCommitTsXid,
888  checkPoint.newestCommitTsXid)));
890  ereport(PANIC,
891  (errmsg("invalid next transaction ID")));
892 
893  /* sanity check */
894  if (checkPoint.redo > CheckPointLoc)
895  ereport(PANIC,
896  (errmsg("invalid redo in checkpoint record")));
897 
898  /*
899  * Check whether we need to force recovery from WAL. If it appears to
900  * have been a clean shutdown and we did not have a recovery signal file,
901  * then assume no recovery needed.
902  */
903  if (checkPoint.redo < CheckPointLoc)
904  {
905  if (wasShutdown)
906  ereport(PANIC,
907  (errmsg("invalid redo record in shutdown checkpoint")));
908  InRecovery = true;
909  }
910  else if (ControlFile->state != DB_SHUTDOWNED)
911  InRecovery = true;
912  else if (ArchiveRecoveryRequested)
913  {
914  /* force recovery due to presence of recovery signal file */
915  InRecovery = true;
916  }
917 
918  /*
919  * If recovery is needed, update our in-memory copy of pg_control to show
920  * that we are recovering and to show the selected checkpoint as the place
921  * we are starting from. We also mark pg_control with any minimum recovery
922  * stop point obtained from a backup history file.
923  *
924  * We don't write the changes to disk yet, though. Only do that after
925  * initializing various subsystems.
926  */
927  if (InRecovery)
928  {
929  if (InArchiveRecovery)
930  {
932  }
933  else
934  {
935  ereport(LOG,
936  (errmsg("database system was not properly shut down; "
937  "automatic recovery in progress")));
939  ereport(LOG,
940  (errmsg("crash recovery starts in timeline %u "
941  "and has target timeline %u",
945  }
947  ControlFile->checkPointCopy = checkPoint;
948  if (InArchiveRecovery)
949  {
950  /* initialize minRecoveryPoint if not set yet */
951  if (ControlFile->minRecoveryPoint < checkPoint.redo)
952  {
953  ControlFile->minRecoveryPoint = checkPoint.redo;
955  }
956  }
957 
958  /*
959  * Set backupStartPoint if we're starting recovery from a base backup.
960  *
961  * Also set backupEndPoint and use minRecoveryPoint as the backup end
962  * location if we're starting recovery from a base backup which was
963  * taken from a standby. In this case, the database system status in
964  * pg_control must indicate that the database was already in recovery.
965  * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
966  * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
967  * before reaching this point; e.g. because restore_command or
968  * primary_conninfo were faulty.
969  *
970  * Any other state indicates that the backup somehow became corrupted
971  * and we can't sensibly continue with recovery.
972  */
973  if (haveBackupLabel)
974  {
975  ControlFile->backupStartPoint = checkPoint.redo;
977 
978  if (backupFromStandby)
979  {
980  if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
981  dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
982  ereport(FATAL,
983  (errmsg("backup_label contains data inconsistent with control file"),
984  errhint("This means that the backup is corrupted and you will "
985  "have to use another backup for recovery.")));
987  }
988  }
989  }
990 
991  /* remember these, so that we know when we have reached consistency */
995  if (InArchiveRecovery)
996  {
999  }
1000  else
1001  {
1003  minRecoveryPointTLI = 0;
1004  }
1005 
1006  /*
1007  * Start recovery assuming that the final record isn't lost.
1008  */
1011 
1012  *wasShutdown_ptr = wasShutdown;
1013  *haveBackupLabel_ptr = haveBackupLabel;
1014  *haveTblspcMap_ptr = haveTblspcMap;
1015 }
1016 
1017 /*
1018  * See if there are any recovery signal files and if so, set state for
1019  * recovery.
1020  *
1021  * See if there is a recovery command file (recovery.conf), and if so
1022  * fail with a FATAL error, since as of PG12 we no longer recognize that.
      *
      * StandbyModeRequested and ArchiveRecoveryRequested are both reset to
      * false and then set according to which signal file was found.
      *
      * NOTE(review): several lines of this function, including the one with
      * its name, are absent from this listing (see the gaps in the embedded
      * line numbers). The comments describe only the statements shown.
1023  */
1024 static void
1026 {
1027  struct stat stat_buf;
1028 
1030  return;
1031 
1032  /*
1033  * Check for old recovery API file: recovery.conf
1034  */
1035  if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
1036  ereport(FATAL,
1038  errmsg("using recovery command file \"%s\" is not supported",
1040 
1041  /*
1042  * Remove unused .done file, if present. Ignore if absent.
      * The result of unlink() is not checked at all.
1043  */
1044  unlink(RECOVERY_COMMAND_DONE);
1045 
1046  /*
1047  * Check for recovery signal files and if found, fsync them since they
1048  * represent server state information. We don't sweat too much about the
1049  * possibility of fsync failure, however.
1050  *
1051  * If present, standby signal file takes precedence. If neither is present
1052  * then we won't enter archive recovery.
      *
      * Because the second stat() is in an "else if", the recovery signal
      * file is not even looked at when the standby signal file exists.
1053  */
1054  if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1055  {
1056  int fd;
1057 
1059  S_IRUSR | S_IWUSR);
      /* best effort: a failed open is skipped, the fsync result is discarded */
1060  if (fd >= 0)
1061  {
1062  (void) pg_fsync(fd);
1063  close(fd);
1064  }
1066  }
1067  else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1068  {
1069  int fd;
1070 
1072  S_IRUSR | S_IWUSR);
      /* same best-effort fsync as for the standby signal file */
1073  if (fd >= 0)
1074  {
1075  (void) pg_fsync(fd);
1076  close(fd);
1077  }
1079  }
1080 
      /* Start from a clean slate; the branches below set what applies. */
1081  StandbyModeRequested = false;
1082  ArchiveRecoveryRequested = false;
1084  {
1085  StandbyModeRequested = true;
1086  ArchiveRecoveryRequested = true;
1087  }
1088  else if (recovery_signal_file_found)
1089  {
1090  StandbyModeRequested = false;
1091  ArchiveRecoveryRequested = true;
1092  }
1093  else
      /* neither branch applied: leave both requests off and skip the check below */
1094  return;
1095 
1096  /*
1097  * We don't support standby mode in standalone backends; that requires
1098  * other processes such as the WAL receiver to be alive.
1099  */
1101  ereport(FATAL,
1102  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1103  errmsg("standby mode is not supported by single-user servers")));
1104 }
1105 
/*
 * Sanity-check the recovery-related settings and settle the recovery target
 * timeline.
 *
 * As far as the visible code shows, this:
 *  - emits a WARNING when neither primary_conninfo nor restore_command is
 *    set (first branch), or fails with FATAL when restore_command is unset
 *    (other branch, whose message refers to standby mode not being enabled);
 *  - fails with FATAL when an explicitly requested recovery target timeline
 *    other than 1 has no timeline history file, and otherwise stores it in
 *    recoveryTargetTLI.
 *
 * NOTE(review): the line carrying the function name and most of the
 * condition lines are absent from this listing, so the exact tests guarding
 * each branch cannot be confirmed from here.
 */
1106 static void
1108 {
1110  return;
1111 
1112  /*
1113  * Check for compulsory parameters
1114  */
1116  {
      /* NULL and the empty string are both treated as "not set" */
1117  if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1118  (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1119  ereport(WARNING,
1120  (errmsg("specified neither primary_conninfo nor restore_command"),
1121  errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1122  }
1123  else
1124  {
      /* here a missing restore_command is fatal rather than a warning */
1125  if (recoveryRestoreCommand == NULL ||
1126  strcmp(recoveryRestoreCommand, "") == 0)
1127  ereport(FATAL,
1128  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1129  errmsg("must specify restore_command when standby mode is not enabled")));
1130  }
1131 
1132  /*
1133  * Override any inconsistent requests. Note that this is a change of
1134  * behaviour in 9.5; prior to this we simply ignored a request to pause if
1135  * hot_standby = off, which was surprising behaviour.
1136  */
1140 
1141  /*
1142  * Final parsing of recovery_target_time string; see also
1143  * check_recovery_target_time().
1144  */
1146  {
1150  Int32GetDatum(-1)));
1151  }
1152 
1153  /*
1154  * If user specified recovery_target_timeline, validate it or compute the
1155  * "latest" value. We can't do this until after we've gotten the restore
1156  * command and set InArchiveRecovery, because we need to fetch timeline
1157  * history files from the archive.
1158  */
1160  {
1162 
1163  /* Timeline 1 does not have a history file, all else should */
1164  if (rtli != 1 && !existsTimeLineHistory(rtli))
1165  ereport(FATAL,
1166  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1167  errmsg("recovery target timeline %u does not exist",
1168  rtli)));
      /* the requested timeline passed the check above; adopt it */
1169  recoveryTargetTLI = rtli;
1170  }
1172  {
1173  /* We start the "latest" search from pg_control's timeline */
1175  }
1176  else
1177  {
1178  /*
1179  * else we just use the recoveryTargetTLI as already read from
1180  * ControlFile
1181  */
1183  }
1184 }
1185 
1186 /*
1187  * read_backup_label: check to see if a backup_label file is present
1188  *
1189  * If we see a backup_label during recovery, we assume that we are recovering
1190  * from a backup dump file, and we therefore roll forward from the checkpoint
1191  * identified by the label file, NOT what pg_control says. This avoids the
1192  * problem that pg_control might have been archived one or more checkpoints
1193  * later than the start of the dump, and so if we rely on it as the start
1194  * point, we will fail to restore a consistent database state.
1195  *
1196  * Returns true if a backup_label was found (and fills the checkpoint
1197  * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1198  * returns false if not. If this backup_label came from a streamed backup,
1199  * *backupEndRequired is set to true. If this backup_label was created during
1200  * recovery, *backupFromStandby is set to true.
1201  *
1202  * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1203  * and TLI read from the backup file.
      *
      * All four output parameters are given default values before the file is
      * opened, so they are defined even when false is returned.
      *
      * Fails with FATAL if the file exists but cannot be opened or read, if
      * the START WAL LOCATION or CHECKPOINT LOCATION line does not parse, if
      * a START TIMELINE line disagrees with the timeline taken from the WAL
      * file name, or if an INCREMENTAL FROM LSN line is present.
1204  */
1205 static bool
1206 read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1207  bool *backupEndRequired, bool *backupFromStandby)
1208 {
1209  char startxlogfilename[MAXFNAMELEN];
1210  TimeLineID tli_from_walseg,
1211  tli_from_file;
1212  FILE *lfp;
1213  char ch;
1214  char backuptype[20];
1215  char backupfrom[20];
1216  char backuplabel[MAXPGPATH];
1217  char backuptime[128];
1218  uint32 hi,
1219  lo;
1220 
1221  /* suppress possible uninitialized-variable warnings */
1222  *checkPointLoc = InvalidXLogRecPtr;
1223  *backupLabelTLI = 0;
1224  *backupEndRequired = false;
1225  *backupFromStandby = false;
1226 
1227  /*
1228  * See if label file is present
1229  */
1230  lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1231  if (!lfp)
1232  {
      /* only a missing file (ENOENT) is tolerated; anything else is fatal */
1233  if (errno != ENOENT)
1234  ereport(FATAL,
1236  errmsg("could not read file \"%s\": %m",
1237  BACKUP_LABEL_FILE)));
1238  return false; /* it's not there, all is fine */
1239  }
1240 
1241  /*
1242  * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1243  * is pretty crude, but we are not expecting any variability in the file
1244  * format).
      *
      * The trailing %c captures the character that follows each line's
      * payload; requiring it to be '\n' rejects lines with trailing junk.
      * The first eight hex digits of the WAL file name are read as the
      * timeline ID (tli_from_walseg).
1245  */
1246  if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
1247  &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1248  ereport(FATAL,
1249  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1250  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
      /* combine the two 32-bit halves into one 64-bit LSN */
1251  RedoStartLSN = ((uint64) hi) << 32 | lo;
1252  RedoStartTLI = tli_from_walseg;
1253  if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
1254  &hi, &lo, &ch) != 3 || ch != '\n')
1255  ereport(FATAL,
1256  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1257  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1258  *checkPointLoc = ((uint64) hi) << 32 | lo;
1259  *backupLabelTLI = tli_from_walseg;
1260 
1261  /*
1262  * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1263  * which could mean either pg_basebackup or the pg_backup_start/stop
1264  * method was used) or if this label came from somewhere else (the only
1265  * other option today being from pg_rewind). If this was a streamed
1266  * backup then we know that we need to play through until we get to the
1267  * end of the WAL which was generated during the backup (at which point we
1268  * will have reached consistency and backupEndRequired will be reset to be
1269  * false).
      *
      * The line is optional: if it does not match, *backupEndRequired keeps
      * its default of false.
1270  */
1271  if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1272  {
1273  if (strcmp(backuptype, "streamed") == 0)
1274  *backupEndRequired = true;
1275  }
1276 
1277  /*
1278  * BACKUP FROM lets us know if this was from a primary or a standby. If
1279  * it was from a standby, we'll double-check that the control file state
1280  * matches that of a standby.
1281  */
1282  if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1283  {
1284  if (strcmp(backupfrom, "standby") == 0)
1285  *backupFromStandby = true;
1286  }
1287 
1288  /*
1289  * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1290  * but checking for their presence is useful for debugging and the next
1291  * sanity checks. The result buffers have a fixed size, so the scan
1292  * widths below (127 and 1023 characters) cap how much is read; if the
1293  * backup_label file was generated with longer strings, they are parsed
1294  * incorrectly. That's fine as only minor consistency checks are done
1295  * afterwards.
1296  */
1297  if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1298  ereport(DEBUG1,
1299  (errmsg_internal("backup time %s in file \"%s\"",
1300  backuptime, BACKUP_LABEL_FILE)));
1301 
1302  if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1303  ereport(DEBUG1,
1304  (errmsg_internal("backup label %s in file \"%s\"",
1305  backuplabel, BACKUP_LABEL_FILE)));
1306 
1307  /*
1308  * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1309  * it as a sanity check if present.
1310  */
1311  if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1312  {
1313  if (tli_from_walseg != tli_from_file)
1314  ereport(FATAL,
1315  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1316  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1317  errdetail("Timeline ID parsed is %u, but expected %u.",
1318  tli_from_file, tli_from_walseg)));
1319 
1320  ereport(DEBUG1,
1321  (errmsg_internal("backup timeline %u in file \"%s\"",
1322  tli_from_file, BACKUP_LABEL_FILE)));
1323  }
1324 
      /*
      * Refuse to start if the label carries an INCREMENTAL FROM LSN line.
      * A result > 0 means at least the first half of the LSN was scanned,
      * which is enough to know the line is there.
      */
1325  if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%X\n", &hi, &lo) > 0)
1326  ereport(FATAL,
1327  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1328  errmsg("this is an incremental backup, not a data directory"),
1329  errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1330 
      /* report a read error seen on the stream, or a failure to close it */
1331  if (ferror(lfp) || FreeFile(lfp))
1332  ereport(FATAL,
1334  errmsg("could not read file \"%s\": %m",
1335  BACKUP_LABEL_FILE)));
1336 
1337  return true;
1338 }
1339 
1340 /*
1341  * read_tablespace_map: check to see if a tablespace_map file is present
1342  *
1343  * If we see a tablespace_map file during recovery, we assume that we are
1344  * recovering from a backup dump file, and we therefore need to create symlinks
1345  * as per the information present in tablespace_map file.
1346  *
1347  * Returns true if a tablespace_map file was found (and fills *tablespaces
1348  * with a tablespaceinfo struct for each tablespace listed in the file);
1349  * returns false if not.
      *
      * Each entry is appended to *tablespaces with lappend(); the struct and
      * its path string are allocated with palloc0() and pstrdup().
      *
      * Fails with FATAL if the file exists but cannot be opened or read, if
      * a line is not "<oid> <path>", if the OID does not parse as a decimal
      * number, or if the last line is not terminated.
      *
      * NOTE(review): the line carrying the function name and parameter list
      * is absent from this listing.
1350  */
1351 static bool
1353 {
1354  tablespaceinfo *ti;
1355  FILE *lfp;
1356  char str[MAXPGPATH];
1357  int ch,
1358  i,
1359  n;
1360  bool was_backslash;
1361 
1362  /*
1363  * See if tablespace_map file is present
1364  */
1365  lfp = AllocateFile(TABLESPACE_MAP, "r");
1366  if (!lfp)
1367  {
      /* only a missing file (ENOENT) is tolerated; anything else is fatal */
1368  if (errno != ENOENT)
1369  ereport(FATAL,
1371  errmsg("could not read file \"%s\": %m",
1372  TABLESPACE_MAP)));
1373  return false; /* it's not there, all is fine */
1374  }
1375 
1376  /*
1377  * Read and parse the link name and path lines from tablespace_map file
1378  * (this code is pretty crude, but we are not expecting any variability in
1379  * the file format). De-escape any backslashes that were inserted.
      *
      * The file is consumed one character at a time. 'i' is the number of
      * de-escaped characters collected in 'str' for the current line, and
      * 'was_backslash' records that the previous character was an
      * unescaped backslash, so the current one is taken literally (even
      * if it is a newline or another backslash).
1380  */
1381  i = 0;
1382  was_backslash = false;
1383  while ((ch = fgetc(lfp)) != EOF)
1384  {
1385  if (!was_backslash && (ch == '\n' || ch == '\r'))
1386  {
1387  char *endp;
1388 
      /* nothing collected yet: skip the empty line */
1389  if (i == 0)
1390  continue; /* \r immediately followed by \n */
1391 
1392  /*
1393  * The de-escaped line should contain an OID followed by exactly
1394  * one space followed by a path. The path might start with
1395  * spaces, so don't be too liberal about parsing.
      *
      * 'n' ends up as the index of the first space, or 'i' if the
      * line has none. The check below rejects an empty OID field
      * (n < 1), a line with no space, and a line with nothing
      * after the space (both n >= i - 1).
1396  */
1397  str[i] = '\0';
1398  n = 0;
1399  while (str[n] && str[n] != ' ')
1400  n++;
1401  if (n < 1 || n >= i - 1)
1402  ereport(FATAL,
1403  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1404  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
      /* split in place: str is now the OID text, str + n the path */
1405  str[n++] = '\0';
1406 
1407  ti = palloc0(sizeof(tablespaceinfo));
      /* clear errno so a stale value cannot be mistaken for a strtoul failure */
1408  errno = 0;
1409  ti->oid = strtoul(str, &endp, 10);
      /* the whole OID field must be consumed by the conversion */
1410  if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1411  ereport(FATAL,
1412  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1413  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1414  ti->path = pstrdup(str + n);
1415  *tablespaces = lappend(*tablespaces, ti);
1416 
      /* start collecting the next line */
1417  i = 0;
1418  continue;
1419  }
1420  else if (!was_backslash && ch == '\\')
1421  was_backslash = true;
1422  else
1423  {
      /*
      * Ordinary (or escaped) character. Characters that do not fit in
      * 'str' are silently dropped, leaving room for the terminator.
      */
1424  if (i < sizeof(str) - 1)
1425  str[i++] = ch;
1426  was_backslash = false;
1427  }
1428  }
1429 
1430  if (i != 0 || was_backslash) /* last line not terminated? */
1431  ereport(FATAL,
1432  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1433  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1434 
      /* report a read error seen on the stream, or a failure to close it */
1435  if (ferror(lfp) || FreeFile(lfp))
1436  ereport(FATAL,
1438  errmsg("could not read file \"%s\": %m",
1439  TABLESPACE_MAP)));
1440 
1441  return true;
1442 }
1443 
1444 /*
1445  * Finish WAL recovery.
1446  *
1447  * This does not close the 'xlogreader' yet, because in some cases the caller
1448  * still wants to re-read the last checkpoint record by calling
1449  * ReadCheckpointRecord().
1450  *
1451  * Returns the position of the last valid or applied record, after which new
1452  * WAL should be appended, information about why recovery was ended, and some
1453  * other things. See the EndOfWalRecoveryInfo struct for details.
      *
      * In the returned struct, lastPage is either NULL (the end of WAL falls
      * exactly on a page boundary) or a palloc'd copy of the valid leading
      * part of the last page, holding endOfLog % XLOG_BLCKSZ bytes.
      *
      * NOTE(review): the lines carrying the return type, the function name
      * and the allocation of 'result' are absent from this listing.
1454  */
1457 {
1459  XLogRecPtr lastRec;
1460  TimeLineID lastRecTLI;
1461  XLogRecPtr endOfLog;
1462 
1463  /*
1464  * Kill WAL receiver, if it's still running, before we continue to write
1465  * the startup checkpoint and aborted-contrecord records. It will trample
1466  * over these records and subsequent ones if it's still alive when we
1467  * start writing WAL.
1468  */
1470 
1471  /*
1472  * Shutdown the slot sync worker to drop any temporary slots acquired by
1473  * it and to stop it from continuing to fetch the failover slots.
1474  *
1475  * We do not update the 'synced' column in 'pg_replication_slots' system
1476  * view from true to false here, as any failed update could leave 'synced'
1477  * column false for some slots. This could cause issues during slot sync
1478  * after restarting the server as a standby. While updating the 'synced'
1479  * column after switching to the new timeline is an option, it does not
1480  * simplify the handling for the 'synced' column. Therefore, we retain the
1481  * 'synced' column as true after promotion as it may provide useful
1482  * information about the slot origin.
1483  */
1484  ShutDownSlotSync();
1485 
1486  /*
1487  * We are now done reading the xlog from stream. Turn off streaming
1488  * recovery to force fetching the files (which would be required at end of
1489  * recovery, e.g., timeline history file) from archive or pg_wal.
1490  *
1491  * Note that standby mode must be turned off after killing WAL receiver,
1492  * i.e., calling XLogShutdownWalRcv().
1493  */
1494  Assert(!WalRcvStreaming());
1495  StandbyMode = false;
1496 
1497  /*
1498  * Determine where to start writing WAL next.
1499  *
1500  * Re-fetch the last valid or last applied record, so we can identify the
1501  * exact endpoint of what we consider the valid portion of WAL. There may
1502  * be an incomplete continuation record after that, in which case
1503  * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1504  * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1505  * it is intentionally missing. See CreateOverwriteContrecordRecord().
1506  *
1507  * An important side-effect of this is to load the last page into
1508  * xlogreader. The caller uses it to initialize the WAL for writing.
1509  */
1510  if (!InRecovery)
1511  {
      /* no recovery took place: the checkpoint record is the last record */
1512  lastRec = CheckPointLoc;
1513  lastRecTLI = CheckPointTLI;
1514  }
1515  else
1516  {
1518  lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1519  }
      /* the read is made at PANIC level and its return value is discarded */
1521  (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1522  endOfLog = xlogreader->EndRecPtr;
1523 
1524  /*
1525  * Remember the TLI in the filename of the XLOG segment containing the
1526  * end-of-log. It could be different from the timeline that endOfLog
1527  * nominally belongs to, if there was a timeline switch in that segment,
1528  * and we were reading the old WAL from a segment belonging to a higher
1529  * timeline.
1530  */
1531  result->endOfLogTLI = xlogreader->seg.ws_tli;
1532 
1534  {
1535  /*
1536  * We are no longer in archive recovery state.
1537  *
1538  * We are now done reading the old WAL. Turn off archive fetching if
1539  * it was active.
1540  */
1542  InArchiveRecovery = false;
1543 
1544  /*
1545  * If the ending log segment is still open, close it (to avoid
1546  * problems on Windows with trying to rename or delete an open file).
1547  */
1548  if (readFile >= 0)
1549  {
1550  close(readFile);
1551  readFile = -1;
1552  }
1553  }
1554 
1555  /*
1556  * Copy the last partial block to the caller, for initializing the WAL
1557  * buffer for appending new WAL.
1558  */
1559  if (endOfLog % XLOG_BLCKSZ != 0)
1560  {
1561  char *page;
1562  int len;
1563  XLogRecPtr pageBeginPtr;
1564 
      /* round endOfLog down to the start of the page that contains it */
1565  pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1566  Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
1567 
1568  /* Copy the valid part of the last block */
1569  len = endOfLog % XLOG_BLCKSZ;
1570  page = palloc(len);
1571  memcpy(page, xlogreader->readBuf, len);
1572 
1573  result->lastPageBeginPtr = pageBeginPtr;
1574  result->lastPage = page;
1575  }
1576  else
1577  {
1578  /* There is no partial block to copy. */
1579  result->lastPageBeginPtr = endOfLog;
1580  result->lastPage = NULL;
1581  }
1582 
1583  /*
1584  * Create a comment for the history file to explain why and where timeline
1585  * changed.
1586  */
1588 
1589  result->lastRec = lastRec;
1590  result->lastRecTLI = lastRecTLI;
1591  result->endOfLog = endOfLog;
1592 
1593  result->abortedRecPtr = abortedRecPtr;
1595 
1598 
1599  return result;
1600 }
1601 
1602 /*
1603  * Clean up the WAL reader and leftovers from restoring WAL from archive
      *
      * The visible steps are: close the currently open WAL segment file, if
      * any, and remove the temporary RECOVERYXLOG and RECOVERYHISTORY files
      * from the WAL directory (inside a conditional block).
      *
      * NOTE(review): the line carrying the function name and several
      * statements (the prefetcher statistics update, freeing the reader, the
      * condition on the file removal, disowning the latch) are absent from
      * this listing; only their comments remain.
1604  */
1605 void
1607 {
1608  char recoveryPath[MAXPGPATH];
1609 
1610  /* Final update of pg_stat_recovery_prefetch. */
1612 
1613  /* Shut down xlogreader */
1614  if (readFile >= 0)
1615  {
1616  close(readFile);
      /* mark the descriptor as closed */
1617  readFile = -1;
1618  }
1621 
1623  {
1624  /*
1625  * Since there might be a partial WAL segment named RECOVERYXLOG, get
1626  * rid of it.
1627  */
1628  snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1629  unlink(recoveryPath); /* ignore any error */
1630 
1631  /* Get rid of any remaining recovered timeline-history file, too */
1632  snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1633  unlink(recoveryPath); /* ignore any error */
1634  }
1635 
1636  /*
1637  * We don't need the latch anymore. It's not strictly necessary to disown
1638  * it, but let's do it for the sake of tidiness.
1639  */
1642 }
1643 
1644 /*
1645  * Perform WAL recovery.
1646  *
1647  * If the system was shut down cleanly, this is never called.
      *
      * Reads the first record to replay and, if there is one, applies records
      * one at a time with ApplyWalRecord() until either no further record can
      * be read or a recovery target is reached. Between records the loop
      * honours a requested recovery pause.
      *
      * Fails with FATAL if a recovery target is reached before recovery has
      * become consistent, or (final check, whose full condition is not
      * visible here) if recovery ended without reaching a configured target.
      *
      * NOTE(review): the line carrying the function name and many statements
      * (notably most conditions and the case labels of the switch) are
      * absent from this listing.
1648  */
1649 void
1651 {
1652  XLogRecord *record;
1653  bool reachedRecoveryTarget = false;
1654  TimeLineID replayTLI;
1655 
1656  /*
1657  * Initialize shared variables for tracking progress of WAL replay, as if
1658  * we had just replayed the record before the REDO location (or the
1659  * checkpoint record itself, if it's a shutdown checkpoint).
1660  */
1663  {
1667  }
1668  else
1669  {
1673  }
1680 
1681  /* Also ensure XLogReceiptTime has a sane value */
1683 
1684  /*
1685  * Let postmaster know we've started redo now, so that it can launch the
1686  * archiver if necessary.
1687  */
1688  if (IsUnderPostmaster)
1690 
1691  /*
1692  * Allow read-only connections immediately if we're consistent already.
1693  */
1695 
1696  /*
1697  * Find the first record that logically follows the checkpoint --- it
1698  * might physically precede it, though.
1699  */
1701  {
1702  /* back up to find the record */
1703  replayTLI = RedoStartTLI;
      /* read at PANIC level, so 'record' is dereferenced below unchecked */
1705  record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1706 
1707  /*
1708  * If a checkpoint record's redo pointer points back to an earlier
1709  * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1710  * record.
1711  */
1712  if (record->xl_rmid != RM_XLOG_ID ||
1713  (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
1714  ereport(FATAL,
1715  (errmsg("unexpected record type found at redo point %X/%X",
1717  }
1718  else
1719  {
1720  /* just have to read next record after CheckPoint */
1722  replayTLI = CheckPointTLI;
      /* read at LOG level: a NULL result is handled below as "no redo" */
1723  record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1724  }
1725 
1726  if (record != NULL)
1727  {
1728  TimestampTz xtime;
1729  PGRUsage ru0;
1730 
      /* start resource-usage accounting, reported in "redo done" below */
1731  pg_rusage_init(&ru0);
1732 
1733  InRedo = true;
1734 
1735  RmgrStartup();
1736 
1737  ereport(LOG,
1738  (errmsg("redo starts at %X/%X",
1740 
1741  /* Prepare to report progress of the redo phase. */
1742  if (!StandbyMode)
1744 
1745  /*
1746  * main redo apply loop
1747  */
1748  do
1749  {
1750  if (!StandbyMode)
1751  ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
1753 
1754 #ifdef WAL_DEBUG
1755  if (XLOG_DEBUG)
1756  {
1758 
1759  initStringInfo(&buf);
1760  appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
1763  xlog_outrec(&buf, xlogreader);
1764  appendStringInfoString(&buf, " - ");
1766  elog(LOG, "%s", buf.data);
1767  pfree(buf.data);
1768  }
1769 #endif
1770 
1771  /* Handle interrupt signals of startup process */
1773 
1774  /*
1775  * Pause WAL replay, if requested by a hot-standby session via
1776  * SetRecoveryPause().
1777  *
1778  * Note that we intentionally don't take the info_lck spinlock
1779  * here. We might therefore read a slightly stale value of the
1780  * recoveryPause flag, but it can't be very stale (no worse than
1781  * the last spinlock we did acquire). Since a pause request is a
1782  * pretty asynchronous thing anyway, possibly responding to it one
1783  * WAL record later than we otherwise would is a minor issue, so
1784  * it doesn't seem worth adding another spinlock cycle to prevent
1785  * that.
1786  */
1787  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1789  recoveryPausesHere(false);
1790 
1791  /*
1792  * Have we reached our recovery target?
      *
      * This check happens before the record is applied, so the
      * current record is not replayed when we break out here.
1793  */
1795  {
1796  reachedRecoveryTarget = true;
1797  break;
1798  }
1799 
1800  /*
1801  * If we've been asked to lag the primary, wait on latch until
1802  * enough time has passed.
1803  */
1805  {
1806  /*
1807  * We test for paused recovery again here. If user sets
1808  * delayed apply, it may be because they expect to pause
1809  * recovery in case of problems, so we must test again here
1810  * otherwise pausing during the delay-wait wouldn't work.
1811  */
1812  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1814  recoveryPausesHere(false);
1815  }
1816 
1817  /*
1818  * Apply the record
      *
      * replayTLI is passed by address because applying a record
      * that switches timelines updates it.
1819  */
1820  ApplyWalRecord(xlogreader, record, &replayTLI);
1821 
1822  /* Exit loop if we reached inclusive recovery target */
1824  {
1825  reachedRecoveryTarget = true;
1826  break;
1827  }
1828 
1829  /* Else, try to fetch the next WAL record */
1830  record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1831  } while (record != NULL);
1832 
1833  /*
1834  * end of main redo apply loop
1835  */
1836 
1837  if (reachedRecoveryTarget)
1838  {
1839  if (!reachedConsistency)
1840  ereport(FATAL,
1841  (errmsg("requested recovery stop point is before consistent recovery point")));
1842 
1843  /*
1844  * This is the last point where we can restart recovery with a new
1845  * recovery target, if we shutdown and begin again. After this,
1846  * Resource Managers may choose to do permanent corrective actions
1847  * at end of recovery.
      *
      * NOTE(review): the case labels of this switch are absent from
      * this listing. Judging by the remaining statements, the first
      * arm exits the process, the second pauses recovery and then
      * falls through to the last arm, which only breaks.
1848  */
1849  switch (recoveryTargetAction)
1850  {
1852 
1853  /*
1854  * exit with special return code to request shutdown of
1855  * postmaster. Log messages issued from postmaster.
1856  */
1857  proc_exit(3);
1858 
1860  SetRecoveryPause(true);
1861  recoveryPausesHere(true);
1862 
1863  /* drop into promote */
1864 
1866  break;
1867  }
1868  }
1869 
1870  RmgrCleanup();
1871 
1872  ereport(LOG,
1873  (errmsg("redo done at %X/%X system usage: %s",
1875  pg_rusage_show(&ru0))));
      /* log the latest transaction time only if one was recorded (nonzero) */
1876  xtime = GetLatestXTime();
1877  if (xtime)
1878  ereport(LOG,
1879  (errmsg("last completed transaction was at log time %s",
1880  timestamptz_to_str(xtime))));
1881 
1882  InRedo = false;
1883  }
1884  else
1885  {
1886  /* there are no WAL records following the checkpoint */
1887  ereport(LOG,
1888  (errmsg("redo is not required")));
1889  }
1890 
1891  /*
1892  * This check is intentionally after the above log messages that indicate
1893  * how far recovery went.
1894  */
1897  !reachedRecoveryTarget)
1898  ereport(FATAL,
1899  (errmsg("recovery ended before configured recovery target was reached")));
1900 }
1901 
1902 /*
1903  * Subroutine of PerformWalRecovery, to apply one WAL record.
      *
      * xlogreader: the reader holding the decoded record; handed to the
      *    resource manager's redo routine and used as the argument of the
      *    error-context callback.
      * record: the record header; its rmgr ID, info bits and xid drive the
      *    decisions made here.
      * replayTLI: in/out. On entry, the timeline being replayed; if the
      *    record is a shutdown checkpoint or end-of-recovery record that
      *    names a different timeline, it is updated to that timeline before
      *    the record is replayed.
      *
      * NOTE(review): the line carrying the function name and parameter list,
      * and a number of statements (spinlock handling, several conditions),
      * are absent from this listing.
1904  */
1905 static void
1907 {
1908  ErrorContextCallback errcallback;
1909  bool switchedTLI = false;
1910 
1911  /* Setup error traceback support for ereport() */
1912  errcallback.callback = rm_redo_error_callback;
1913  errcallback.arg = (void *) xlogreader;
1914  errcallback.previous = error_context_stack;
1915  error_context_stack = &errcallback;
1916 
1917  /*
1918  * TransamVariables->nextXid must be beyond record's xid.
1919  */
1921 
1922  /*
1923  * Before replaying this record, check if this record causes the current
1924  * timeline to change. The record is already considered to be part of the
1925  * new timeline, so we update replayTLI before replaying it. That's
1926  * important so that replayEndTLI, which is recorded as the minimum
1927  * recovery point's TLI if recovery stops after this record, is set
1928  * correctly.
1929  */
1930  if (record->xl_rmid == RM_XLOG_ID)
1931  {
      /* default to "no change"; only the two record types below override */
1932  TimeLineID newReplayTLI = *replayTLI;
1933  TimeLineID prevReplayTLI = *replayTLI;
1934  uint8 info = record->xl_info & ~XLR_INFO_MASK;
1935 
1936  if (info == XLOG_CHECKPOINT_SHUTDOWN)
1937  {
1938  CheckPoint checkPoint;
1939 
      /* copy the payload into a local struct before reading its fields */
1940  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1941  newReplayTLI = checkPoint.ThisTimeLineID;
1942  prevReplayTLI = checkPoint.PrevTimeLineID;
1943  }
1944  else if (info == XLOG_END_OF_RECOVERY)
1945  {
1946  xl_end_of_recovery xlrec;
1947 
1948  memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1949  newReplayTLI = xlrec.ThisTimeLineID;
1950  prevReplayTLI = xlrec.PrevTimeLineID;
1951  }
1952 
1953  if (newReplayTLI != *replayTLI)
1954  {
1955  /* Check that it's OK to switch to this TLI */
1957  newReplayTLI, prevReplayTLI, *replayTLI);
1958 
1959  /* Following WAL records should be run with new TLI */
1960  *replayTLI = newReplayTLI;
1961  switchedTLI = true;
1962  }
1963  }
1964 
1965  /*
1966  * Update shared replayEndRecPtr before replaying this record, so that
1967  * XLogFlush will update minRecoveryPoint correctly.
1968  */
1971  XLogRecoveryCtl->replayEndTLI = *replayTLI;
1973 
1974  /*
1975  * If we are attempting to enter Hot Standby mode, process XIDs we see
1976  */
1978  TransactionIdIsValid(record->xl_xid))
1980 
1981  /*
1982  * Some XLOG record types that are related to recovery are processed
1983  * directly here, rather than in xlog_redo()
      *
      * Note that this is in addition to, not instead of, the resource
      * manager's redo routine, which is still called below.
1984  */
1985  if (record->xl_rmid == RM_XLOG_ID)
1986  xlogrecovery_redo(xlogreader, *replayTLI);
1987 
1988  /* Now apply the WAL record itself */
1989  GetRmgr(record->xl_rmid).rm_redo(xlogreader);
1990 
1991  /*
1992  * After redo, check whether the backup pages associated with the WAL
1993  * record are consistent with the existing pages. This check is done only
1994  * if consistency check is enabled for this record.
1995  */
1996  if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
1998 
1999  /* Pop the error context stack */
2000  error_context_stack = errcallback.previous;
2001 
2002  /*
2003  * Update lastReplayedEndRecPtr after this record has been successfully
2004  * replayed.
2005  */
2009  XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
2011 
2012  /* ------
2013  * Wakeup walsenders:
2014  *
2015  * On the standby, the WAL is flushed first (which will only wake up
2016  * physical walsenders) and then applied, which will only wake up logical
2017  * walsenders.
2018  *
2019  * Indeed, logical walsenders on standby can't decode and send data until
2020  * it's been applied.
2021  *
2022  * Physical walsenders don't need to be woken up during replay unless
2023  * cascading replication is allowed and time line change occurred (so that
2024  * they can notice that they are on a new time line).
2025  *
2026  * That's why the wake up conditions are for:
2027  *
2028  * - physical walsenders in case of new time line and cascade
2029  * replication is allowed
2030  * - logical walsenders in case cascade replication is allowed (could not
2031  * be created otherwise)
2032  * ------
2033  */
2035  WalSndWakeup(switchedTLI, true);
2036 
2037  /*
2038  * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2039  * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2040  * a reply to the primary.
2041  */
2043  {
      /* clear the request before acting on it */
2044  doRequestWalReceiverReply = false;
2045  WalRcvForceReply();
2046  }
2047 
2048  /* Allow read-only connections if we're consistent now */
2050 
2051  /* Is this a timeline switch? */
2052  if (switchedTLI)
2053  {
2054  /*
2055  * Before we continue on the new timeline, clean up any (possibly
2056  * bogus) future WAL segments on the old timeline.
2057  */
2059 
2060  /* Reset the prefetcher. */
2062  }
2063 }
2064 
2065 /*
2066  * Some XLOG RM record types that are directly related to WAL recovery are
2067  * handled here rather than in xlog_redo().
      *
      * Two record types are acted on; every other type is ignored:
      *
      * - XLOG_OVERWRITE_CONTRECORD: the LSN stored in the record payload must
      *   match the overwritten-record pointer tracked by the reader, otherwise
      *   we fail with FATAL.
      * - XLOG_BACKUP_END: if the start point stored in the record matches
      *   backupStartPoint, backupEndPoint is set to the end LSN of this
      *   record; otherwise only a DEBUG1 message is emitted.
      *
      * 'record' is the reader state holding the decoded record. The caller
      * must pass an XLOG resource manager record (asserted below).
      *
      * NOTE(review): the line carrying the function name and parameter list
      * is absent from this listing, as are several statements in the
      * XLOG_OVERWRITE_CONTRECORD branch.
2068  */
2069 static void
2071 {
2072  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2073  XLogRecPtr lsn = record->EndRecPtr;
2074 
2075  Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2076 
2077  if (info == XLOG_OVERWRITE_CONTRECORD)
2078  {
2079  /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2081 
      /* copy the payload into a local struct before reading its fields */
2082  memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
2083  if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2084  elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
2087 
2088  /* We have safely skipped the aborted record */
2091 
2092  ereport(LOG,
2093  (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
2096 
2097  /* Verifying the record should only happen once */
2099  }
2100  else if (info == XLOG_BACKUP_END)
2101  {
2102  XLogRecPtr startpoint;
2103 
      /* the payload is the start LSN of the backup this record terminates */
2104  memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2105 
2106  if (backupStartPoint == startpoint)
2107  {
2108  /*
2109  * We have reached the end of base backup, the point where
2110  * pg_backup_stop() was done. The data on disk is now consistent
2111  * (assuming we have also reached minRecoveryPoint). Set
2112  * backupEndPoint to the current LSN, so that the next call to
2113  * CheckRecoveryConsistency() will notice it and do the
2114  * end-of-backup processing.
2115  */
2116  elog(DEBUG1, "end of backup record reached");
2117 
2118  backupEndPoint = lsn;
2119  }
2120  else
      /* end record of some other backup: nothing to do but note it */
2121  elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X",
2123  }
2124 }
2125 
2126 /*
2127  * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2128  * directories.
2129  *
2130  * Replay of database creation XLOG records for databases that were later
2131  * dropped can create fake directories in pg_tblspc. By the time consistency
2132  * is reached these directories should have been removed; here we verify
2133  * that this did indeed happen. This is to be called at the point where
2134  * consistent state is reached.
2135  *
2136  * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2137  * useful for testing purposes, and also allows for an escape hatch in case
2138  * things go south.
2139  */
2140 static void
2142 {
2143  DIR *dir;
2144  struct dirent *de;
2145 
2146  dir = AllocateDir("pg_tblspc");
2147  while ((de = ReadDir(dir, "pg_tblspc")) != NULL)
2148  {
2149  char path[MAXPGPATH + 10];
2150 
2151  /* Skip entries of non-oid names */
2152  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2153  continue;
2154 
2155  snprintf(path, sizeof(path), "pg_tblspc/%s", de->d_name);
2156 
2157  if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2160  errmsg("unexpected directory entry \"%s\" found in %s",
2161  de->d_name, "pg_tblspc/"),
2162  errdetail("All directory entries in pg_tblspc/ should be symbolic links."),
2163  errhint("Remove those directories, or set allow_in_place_tablespaces to ON transiently to let recovery complete.")));
2164  }
2165 }
2166 
2167 /*
2168  * Checks if recovery has reached a consistent state. When consistency is
2169  * reached and we have a valid starting standby snapshot, tell postmaster
2170  * that it can start accepting read-only connections.
2171  */
2172 static void
2174 {
2175  XLogRecPtr lastReplayedEndRecPtr;
2176  TimeLineID lastReplayedTLI;
2177 
2178  /*
2179  * During crash recovery, we don't reach a consistent state until we've
2180  * replayed all the WAL.
2181  */
2183  return;
2184 
2186 
2187  /*
2188  * assume that we are called in the startup process, and hence don't need
2189  * a lock to read lastReplayedEndRecPtr
2190  */
2191  lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2192  lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2193 
2194  /*
2195  * Have we reached the point where our base backup was completed?
2196  */
2198  backupEndPoint <= lastReplayedEndRecPtr)
2199  {
2200  XLogRecPtr saveBackupStartPoint = backupStartPoint;
2201  XLogRecPtr saveBackupEndPoint = backupEndPoint;
2202 
2203  elog(DEBUG1, "end of backup reached");
2204 
2205  /*
2206  * We have reached the end of base backup, as indicated by pg_control.
2207  * Update the control file accordingly.
2208  */
2209  ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2212  backupEndRequired = false;
2213 
2214  ereport(LOG,
2215  (errmsg("completed backup recovery with redo LSN %X/%X and end LSN %X/%X",
2216  LSN_FORMAT_ARGS(saveBackupStartPoint),
2217  LSN_FORMAT_ARGS(saveBackupEndPoint))));
2218  }
2219 
2220  /*
2221  * Have we passed our safe starting point? Note that minRecoveryPoint is
2222  * known to be incorrectly set if recovering from a backup, until the
2223  * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2224  * All we know prior to that is that we're not consistent yet.
2225  */
2227  minRecoveryPoint <= lastReplayedEndRecPtr)
2228  {
2229  /*
2230  * Check to see if the XLOG sequence contained any unresolved
2231  * references to uninitialized pages.
2232  */
2234 
2235  /*
2236  * Check that pg_tblspc doesn't contain any real directories. Replay
2237  * of Database/CREATE_* records may have created fictitious tablespace
2238  * directories that should have been removed by the time consistency
2239  * was reached.
2240  */
2242 
2243  reachedConsistency = true;
2244  ereport(LOG,
2245  (errmsg("consistent recovery state reached at %X/%X",
2246  LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
2247  }
2248 
2249  /*
2250  * Have we got a valid starting snapshot that will allow queries to be
2251  * run? If so, we can tell postmaster that the database is consistent now,
2252  * enabling connections.
2253  */
2258  {
2262 
2263  LocalHotStandbyActive = true;
2264 
2266  }
2267 }
2268 
2269 /*
2270  * Error context callback for errors occurring during rm_redo().
2271  */
2272 static void
2274 {
2275  XLogReaderState *record = (XLogReaderState *) arg;
2277 
2278  initStringInfo(&buf);
2279  xlog_outdesc(&buf, record);
2280  xlog_block_info(&buf, record);
2281 
2282  /* translator: %s is a WAL record description */
2283  errcontext("WAL redo at %X/%X for %s",
2284  LSN_FORMAT_ARGS(record->ReadRecPtr),
2285  buf.data);
2286 
2287  pfree(buf.data);
2288 }
2289 
2290 /*
2291  * Appends to buf a string describing an XLogRecord: its identity, followed
2292  * by a colon, a space, and any further description added by rm_desc.
2293  */
2294 void
2296 {
2297  RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2298  uint8 info = XLogRecGetInfo(record);
2299  const char *id;
2300 
2302  appendStringInfoChar(buf, '/');
2303 
2304  id = rmgr.rm_identify(info);
2305  if (id == NULL)
2306  appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2307  else
2308  appendStringInfo(buf, "%s: ", id);
2309 
2310  rmgr.rm_desc(buf, record);
2311 }
2312 
2313 #ifdef WAL_DEBUG
2314 
2315 static void
2316 xlog_outrec(StringInfo buf, XLogReaderState *record)
2317 {
2318  appendStringInfo(buf, "prev %X/%X; xid %u",
2320  XLogRecGetXid(record));
2321 
2322  appendStringInfo(buf, "; len %u",
2323  XLogRecGetDataLen(record));
2324 
2325  xlog_block_info(buf, record);
2326 }
2327 #endif /* WAL_DEBUG */
2328 
2329 /*
2330  * Appends to buf a string giving information about all the blocks referenced
2331  * by an XLogRecord.
2332  */
2333 static void
2335 {
2336  int block_id;
2337 
2338  /* decode block references */
2339  for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2340  {
2341  RelFileLocator rlocator;
2342  ForkNumber forknum;
2343  BlockNumber blk;
2344 
2345  if (!XLogRecGetBlockTagExtended(record, block_id,
2346  &rlocator, &forknum, &blk, NULL))
2347  continue;
2348 
2349  if (forknum != MAIN_FORKNUM)
2350  appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2351  block_id,
2352  rlocator.spcOid, rlocator.dbOid,
2353  rlocator.relNumber,
2354  forknum,
2355  blk);
2356  else
2357  appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2358  block_id,
2359  rlocator.spcOid, rlocator.dbOid,
2360  rlocator.relNumber,
2361  blk);
2362  if (XLogRecHasBlockImage(record, block_id))
2363  appendStringInfoString(buf, " FPW");
2364  }
2365 }
2366 
2367 
2368 /*
2369  * Check that it's OK to switch to new timeline during recovery.
2370  *
2371  * 'lsn' is the address of the shutdown checkpoint record we're about to
2372  * replay. (Currently, timeline can only change at a shutdown checkpoint).
2373  */
2374 static void
2376  TimeLineID replayTLI)
2377 {
2378  /* Check that the record agrees on what the current (old) timeline is */
2379  if (prevTLI != replayTLI)
2380  ereport(PANIC,
2381  (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2382  prevTLI, replayTLI)));
2383 
2384  /*
2385  * The new timeline better be in the list of timelines we expect to see,
2386  * according to the timeline history. It should also not decrease.
2387  */
2388  if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2389  ereport(PANIC,
2390  (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2391  newTLI, replayTLI)));
2392 
2393  /*
2394  * If we have not yet reached min recovery point, and we're about to
2395  * switch to a timeline greater than the timeline of the min recovery
2396  * point: trouble. After switching to the new timeline, we could not
2397  * possibly visit the min recovery point on the correct timeline anymore.
2398  * This can happen if there is a newer timeline in the archive that
2399  * branched before the timeline the min recovery point is on, and you
2400  * attempt to do PITR to the new timeline.
2401  */
2403  lsn < minRecoveryPoint &&
2404  newTLI > minRecoveryPointTLI)
2405  ereport(PANIC,
2406  (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
2407  newTLI,
2410 
2411  /* Looks good */
2412 }
2413 
2414 
2415 /*
2416  * Extract timestamp from WAL record.
2417  *
2418  * If the record contains a timestamp, returns true, and saves the timestamp
2419  * in *recordXtime. If the record type has no timestamp, returns false.
2420  * Currently, only transaction commit/abort records and restore points contain
2421  * timestamps.
2422  */
2423 static bool
2425 {
2426  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2427  uint8 xact_info = info & XLOG_XACT_OPMASK;
2428  uint8 rmid = XLogRecGetRmid(record);
2429 
2430  if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2431  {
2432  *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2433  return true;
2434  }
2435  if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2436  xact_info == XLOG_XACT_COMMIT_PREPARED))
2437  {
2438  *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2439  return true;
2440  }
2441  if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2442  xact_info == XLOG_XACT_ABORT_PREPARED))
2443  {
2444  *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2445  return true;
2446  }
2447  return false;
2448 }
2449 
2450 /*
2451  * Checks whether the current buffer page and backup page stored in the
2452  * WAL record are consistent or not. Before comparing the two pages, a
2453  * masking can be applied to the pages to ignore certain areas like hint bits,
2454  * unused space between pd_lower and pd_upper among other things. This
2455  * function should be called once WAL replay has been completed for a
2456  * given record.
2457  */
2458 static void
2460 {
2461  RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2462  RelFileLocator rlocator;
2463  ForkNumber forknum;
2464  BlockNumber blkno;
2465  int block_id;
2466 
2467  /* Records with no backup blocks have no need for consistency checks. */
2468  if (!XLogRecHasAnyBlockRefs(record))
2469  return;
2470 
2471  Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
2472 
2473  for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2474  {
2475  Buffer buf;
2476  Page page;
2477 
2478  if (!XLogRecGetBlockTagExtended(record, block_id,
2479  &rlocator, &forknum, &blkno, NULL))
2480  {
2481  /*
2482  * WAL record doesn't contain a block reference with the given id.
2483  * Do nothing.
2484  */
2485  continue;
2486  }
2487 
2488  Assert(XLogRecHasBlockImage(record, block_id));
2489 
2490  if (XLogRecBlockImageApply(record, block_id))
2491  {
2492  /*
2493  * WAL record has already applied the page, so bypass the
2494  * consistency check as that would result in comparing the full
2495  * page stored in the record with itself.
2496  */
2497  continue;
2498  }
2499 
2500  /*
2501  * Read the contents from the current buffer and store it in a
2502  * temporary page.
2503  */
2504  buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2506  InvalidBuffer);
2507  if (!BufferIsValid(buf))
2508  continue;
2509 
2511  page = BufferGetPage(buf);
2512 
2513  /*
2514  * Take a copy of the local page where WAL has been applied to have a
2515  * comparison base before masking it...
2516  */
2517  memcpy(replay_image_masked, page, BLCKSZ);
2518 
2519  /* No need for this page anymore now that a copy is in. */
2521 
2522  /*
2523  * If the block LSN is already ahead of this WAL record, we can't
2524  * expect contents to match. This can happen if recovery is
2525  * restarted.
2526  */
2527  if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
2528  continue;
2529 
2530  /*
2531  * Read the contents from the backup copy, stored in WAL record and
2532  * store it in a temporary page. There is no need to allocate a new
2533  * page here, a local buffer is fine to hold its contents and a mask
2534  * can be directly applied on it.
2535  */
2536  if (!RestoreBlockImage(record, block_id, primary_image_masked))
2537  ereport(ERROR,
2538  (errcode(ERRCODE_INTERNAL_ERROR),
2539  errmsg_internal("%s", record->errormsg_buf)));
2540 
2541  /*
2542  * If masking function is defined, mask both the primary and replay
2543  * images
2544  */
2545  if (rmgr.rm_mask != NULL)
2546  {
2547  rmgr.rm_mask(replay_image_masked, blkno);
2548  rmgr.rm_mask(primary_image_masked, blkno);
2549  }
2550 
2551  /* Time to compare the primary and replay images. */
2552  if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2553  {
2554  elog(FATAL,
2555  "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2556  rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2557  forknum, blkno);
2558  }
2559  }
2560 }
2561 
2562 /*
2563  * For point-in-time recovery, this function decides whether we want to
2564  * stop applying the XLOG before the current record.
2565  *
2566  * Returns true if we are stopping, false otherwise. If stopping, some
2567  * information is saved in recoveryStopXid et al for use in annotating the
2568  * new timeline's history file.
2569  */
2570 static bool
2572 {
2573  bool stopsHere = false;
2574  uint8 xact_info;
2575  bool isCommit;
2576  TimestampTz recordXtime = 0;
2577  TransactionId recordXid;
2578 
2579  /*
2580  * Ignore recovery target settings when not in archive recovery (meaning
2581  * we are in crash recovery).
2582  */
2584  return false;
2585 
2586  /* Check if we should stop as soon as reaching consistency */
2588  {
2589  ereport(LOG,
2590  (errmsg("recovery stopping after reaching consistency")));
2591 
2592  recoveryStopAfter = false;
2595  recoveryStopTime = 0;
2596  recoveryStopName[0] = '\0';
2597  return true;
2598  }
2599 
2600  /* Check if target LSN has been reached */
2603  record->ReadRecPtr >= recoveryTargetLSN)
2604  {
2605  recoveryStopAfter = false;
2607  recoveryStopLSN = record->ReadRecPtr;
2608  recoveryStopTime = 0;
2609  recoveryStopName[0] = '\0';
2610  ereport(LOG,
2611  (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
2613  return true;
2614  }
2615 
2616  /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2617  if (XLogRecGetRmid(record) != RM_XACT_ID)
2618  return false;
2619 
2620  xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2621 
2622  if (xact_info == XLOG_XACT_COMMIT)
2623  {
2624  isCommit = true;
2625  recordXid = XLogRecGetXid(record);
2626  }
2627  else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2628  {
2629  xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2630  xl_xact_parsed_commit parsed;
2631 
2632  isCommit = true;
2634  xlrec,
2635  &parsed);
2636  recordXid = parsed.twophase_xid;
2637  }
2638  else if (xact_info == XLOG_XACT_ABORT)
2639  {
2640  isCommit = false;
2641  recordXid = XLogRecGetXid(record);
2642  }
2643  else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2644  {
2645  xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2646  xl_xact_parsed_abort parsed;
2647 
2648  isCommit = false;
2650  xlrec,
2651  &parsed);
2652  recordXid = parsed.twophase_xid;
2653  }
2654  else
2655  return false;
2656 
2658  {
2659  /*
2660  * There can be only one transaction end record with this exact
2661  * transactionid
2662  *
2663  * When testing for an xid, we MUST test for equality only, since
2664  * transactions are numbered in the order they start, not the order
2665  * they complete. A higher numbered xid will complete before the target
2666  * xid about 50% of the time...
2667  */
2668  stopsHere = (recordXid == recoveryTargetXid);
2669  }
2670 
2671  /*
2672  * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2673  * We don't expect getRecordTimestamp ever to fail, since we already know
2674  * this is a commit or abort record; but test its result anyway.
2675  */
2676  if (getRecordTimestamp(record, &recordXtime) &&
2678  {
2679  /*
2680  * There can be many transactions that share the same commit time, so
2681  * we stop after the last one, if we are inclusive, or stop at the
2682  * first one if we are exclusive
2683  */
2685  stopsHere = (recordXtime > recoveryTargetTime);
2686  else
2687  stopsHere = (recordXtime >= recoveryTargetTime);
2688  }
2689 
2690  if (stopsHere)
2691  {
2692  recoveryStopAfter = false;
2693  recoveryStopXid = recordXid;
2694  recoveryStopTime = recordXtime;
2696  recoveryStopName[0] = '\0';
2697 
2698  if (isCommit)
2699  {
2700  ereport(LOG,
2701  (errmsg("recovery stopping before commit of transaction %u, time %s",
2704  }
2705  else
2706  {
2707  ereport(LOG,
2708  (errmsg("recovery stopping before abort of transaction %u, time %s",
2711  }
2712  }
2713 
2714  return stopsHere;
2715 }
2716 
2717 /*
2718  * Same as recoveryStopsBefore, but called after applying the record.
2719  *
2720  * We also track the timestamp of the latest applied COMMIT/ABORT
2721  * record in XLogRecoveryCtl->recoveryLastXTime.
2722  */
2723 static bool
2725 {
2726  uint8 info;
2727  uint8 xact_info;
2728  uint8 rmid;
2729  TimestampTz recordXtime = 0;
2730 
2731  /*
2732  * Ignore recovery target settings when not in archive recovery (meaning
2733  * we are in crash recovery).
2734  */
2736  return false;
2737 
2738  info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2739  rmid = XLogRecGetRmid(record);
2740 
2741  /*
2742  * There can be many restore points that share the same name; we stop at
2743  * the first one.
2744  */
2746  rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2747  {
2748  xl_restore_point *recordRestorePointData;
2749 
2750  recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2751 
2752  if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2753  {
2754  recoveryStopAfter = true;
2757  (void) getRecordTimestamp(record, &recoveryStopTime);
2758  strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2759 
2760  ereport(LOG,
2761  (errmsg("recovery stopping at restore point \"%s\", time %s",
2764  return true;
2765  }
2766  }
2767 
2768  /* Check if the target LSN has been reached */
2771  record->ReadRecPtr >= recoveryTargetLSN)
2772  {
2773  recoveryStopAfter = true;
2775  recoveryStopLSN = record->ReadRecPtr;
2776  recoveryStopTime = 0;
2777  recoveryStopName[0] = '\0';
2778  ereport(LOG,
2779  (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
2781  return true;
2782  }
2783 
2784  if (rmid != RM_XACT_ID)
2785  return false;
2786 
2787  xact_info = info & XLOG_XACT_OPMASK;
2788 
2789  if (xact_info == XLOG_XACT_COMMIT ||
2790  xact_info == XLOG_XACT_COMMIT_PREPARED ||
2791  xact_info == XLOG_XACT_ABORT ||
2792  xact_info == XLOG_XACT_ABORT_PREPARED)
2793  {
2794  TransactionId recordXid;
2795 
2796  /* Update the last applied transaction timestamp */
2797  if (getRecordTimestamp(record, &recordXtime))
2798  SetLatestXTime(recordXtime);
2799 
2800  /* Extract the XID of the committed/aborted transaction */
2801  if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2802  {
2803  xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2804  xl_xact_parsed_commit parsed;
2805 
2807  xlrec,
2808  &parsed);
2809  recordXid = parsed.twophase_xid;
2810  }
2811  else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2812  {
2813  xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2814  xl_xact_parsed_abort parsed;
2815 
2817  xlrec,
2818  &parsed);
2819  recordXid = parsed.twophase_xid;
2820  }
2821  else
2822  recordXid = XLogRecGetXid(record);
2823 
2824  /*
2825  * There can be only one transaction end record with this exact
2826  * transactionid
2827  *
2828  * When testing for an xid, we MUST test for equality only, since
2829  * transactions are numbered in the order they start, not the order
2830  * they complete. A higher numbered xid will complete before the target
2831  * xid about 50% of the time...
2832  */
2834  recordXid == recoveryTargetXid)
2835  {
2836  recoveryStopAfter = true;
2837  recoveryStopXid = recordXid;
2838  recoveryStopTime = recordXtime;
2840  recoveryStopName[0] = '\0';
2841 
2842  if (xact_info == XLOG_XACT_COMMIT ||
2843  xact_info == XLOG_XACT_COMMIT_PREPARED)
2844  {
2845  ereport(LOG,
2846  (errmsg("recovery stopping after commit of transaction %u, time %s",
2849  }
2850  else if (xact_info == XLOG_XACT_ABORT ||
2851  xact_info == XLOG_XACT_ABORT_PREPARED)
2852  {
2853  ereport(LOG,
2854  (errmsg("recovery stopping after abort of transaction %u, time %s",
2857  }
2858  return true;
2859  }
2860  }
2861 
2862  /* Check if we should stop as soon as reaching consistency */
2864  {
2865  ereport(LOG,
2866  (errmsg("recovery stopping after reaching consistency")));
2867 
2868  recoveryStopAfter = true;
2870  recoveryStopTime = 0;
2872  recoveryStopName[0] = '\0';
2873  return true;
2874  }
2875 
2876  return false;
2877 }
2878 
2879 /*
2880  * Create a comment for the history file to explain why and where
2881  * timeline changed.
2882  */
2883 static char *
2885 {
2886  char reason[200];
2887 
2889  snprintf(reason, sizeof(reason),
2890  "%s transaction %u",
2891  recoveryStopAfter ? "after" : "before",
2892  recoveryStopXid);
2894  snprintf(reason, sizeof(reason),
2895  "%s %s\n",
2896  recoveryStopAfter ? "after" : "before",
2898  else if (recoveryTarget == RECOVERY_TARGET_LSN)
2899  snprintf(reason, sizeof(reason),
2900  "%s LSN %X/%X\n",
2901  recoveryStopAfter ? "after" : "before",
2904  snprintf(reason, sizeof(reason),
2905  "at restore point \"%s\"",
2908  snprintf(reason, sizeof(reason), "reached consistency");
2909  else
2910  snprintf(reason, sizeof(reason), "no recovery target specified");
2911 
2912  return pstrdup(reason);
2913 }
2914 
2915 /*
2916  * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2917  *
2918  * endOfRecovery is true if the recovery target is reached and
2919  * the paused state starts at the end of recovery because of
2920  * recovery_target_action=pause, and false otherwise.
2921  */
2922 static void
2923 recoveryPausesHere(bool endOfRecovery)
2924 {
2925  /* Don't pause unless users can connect! */
2926  if (!LocalHotStandbyActive)
2927  return;
2928 
2929  /* Don't pause after standby promotion has been triggered */
2931  return;
2932 
2933  if (endOfRecovery)
2934  ereport(LOG,
2935  (errmsg("pausing at the end of recovery"),
2936  errhint("Execute pg_wal_replay_resume() to promote.")));
2937  else
2938  ereport(LOG,
2939  (errmsg("recovery has paused"),
2940  errhint("Execute pg_wal_replay_resume() to continue.")));
2941 
2942  /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2944  {
2946  if (CheckForStandbyTrigger())
2947  return;
2948 
2949  /*
2950  * If recovery pause is requested then set it paused. While we are in
2951  * the loop, user might resume and pause again so set this every time.
2952  */
2954 
2955  /*
2956  * We wait on a condition variable that will wake us as soon as the
2957  * pause ends, but we use a timeout so we can check the above exit
2958  * condition periodically too.
2959  */
2961  WAIT_EVENT_RECOVERY_PAUSE);
2962  }
2964 }
2965 
2966 /*
2967  * When recovery_min_apply_delay is set, we wait long enough to make sure
2968  * certain record types are applied at least that interval behind the primary.
2969  *
2970  * Returns true if we waited.
2971  *
2972  * Note that the delay is calculated between the WAL record log time and
2973  * the current time on standby. We would prefer to keep track of when this
2974  * standby received each WAL record, which would allow a more consistent
2975  * approach and one not affected by time synchronisation issues, but that
2976  * is significantly more effort and complexity for little actual gain in
2977  * usability.
2978  */
2979 static bool
2981 {
2982  uint8 xact_info;
2983  TimestampTz xtime;
2984  TimestampTz delayUntil;
2985  long msecs;
2986 
2987  /* nothing to do if no delay configured */
2988  if (recovery_min_apply_delay <= 0)
2989  return false;
2990 
2991  /* no delay is applied on a database not yet consistent */
2992  if (!reachedConsistency)
2993  return false;
2994 
2995  /* nothing to do if crash recovery is requested */
2997  return false;
2998 
2999  /*
3000  * Is it a COMMIT record?
3001  *
3002  * We deliberately choose not to delay aborts since they have no effect on
3003  * MVCC. We already allow replay of records that don't have a timestamp,
3004  * so there is already opportunity for issues caused by early conflicts on
3005  * standbys.
3006  */
3007  if (XLogRecGetRmid(record) != RM_XACT_ID)
3008  return false;
3009 
3010  xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
3011 
3012  if (xact_info != XLOG_XACT_COMMIT &&
3013  xact_info != XLOG_XACT_COMMIT_PREPARED)
3014  return false;
3015 
3016  if (!getRecordTimestamp(record, &xtime))
3017  return false;
3018 
3020 
3021  /*
3022  * Exit without arming the latch if it's already past time to apply this
3023  * record
3024  */
3026  if (msecs <= 0)
3027  return false;
3028 
3029  while (true)
3030  {
3032 
3033  /* This might change recovery_min_apply_delay. */
3035 
3036  if (CheckForStandbyTrigger())
3037  break;
3038 
3039  /*
3040  * Recalculate delayUntil as recovery_min_apply_delay could have
3041  * changed while waiting in this loop.
3042  */
3044 
3045  /*
3046  * Wait for difference between GetCurrentTimestamp() and delayUntil.
3047  */
3049  delayUntil);
3050 
3051  if (msecs <= 0)
3052  break;
3053 
3054  elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3055 
3058  msecs,
3059  WAIT_EVENT_RECOVERY_APPLY_DELAY);
3060  }
3061  return true;
3062 }
3063 
3064 /*
3065  * Get the current state of the recovery pause request.
3066  */
3069 {
3071 
3075 
3076  return state;
3077 }
3078 
3079 /*
3080  * Set the recovery pause state.
3081  *
3082  * If recovery pause is requested then sets the recovery pause state to
3083  * 'pause requested' if it is not already 'paused'. Otherwise, sets it
3084  * to 'not paused' to resume the recovery. The recovery pause will be
3085  * confirmed by ConfirmRecoveryPaused().
3086  */
3087 void
3088 SetRecoveryPause(bool recoveryPause)
3089 {
3091 
3092  if (!recoveryPause)
3096 
3098 
3099  if (!recoveryPause)
3101 }
3102 
3103 /*
3104  * Confirm the recovery pause by setting the recovery pause state to
3105  * RECOVERY_PAUSED.
3106  */
3107 static void
3109 {
3110  /* If recovery pause is requested then set it paused */
3115 }
3116 
3117 
3118 /*
3119  * Attempt to read the next XLOG record.
3120  *
3121  * Before first call, the reader needs to be positioned to the first record
3122  * by calling XLogPrefetcherBeginRead().
3123  *
3124  * If no valid record is available, returns NULL, or fails if emode is PANIC.
3125  * (emode must be either PANIC or LOG.) In standby mode, retries until a valid
3126  * record is available.
3127  */
3128 static XLogRecord *
3130  bool fetching_ckpt, TimeLineID replayTLI)
3131 {
3132  XLogRecord *record;
3135 
3136  /* Pass through parameters to XLogPageRead */
3137  private->fetching_ckpt = fetching_ckpt;
3138  private->emode = emode;
3139  private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
3140  private->replayTLI = replayTLI;
3141 
3142  /* This is the first attempt to read this page. */
3143  lastSourceFailed = false;
3144 
3145  for (;;)
3146  {
3147  char *errormsg;
3148 
3149  record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3150  if (record == NULL)
3151  {
3152  /*
3153  * When we find that WAL ends in an incomplete record, keep track
3154  * of that record. After recovery is done, we'll write a record
3155  * to indicate to downstream WAL readers that that portion is to
3156  * be ignored.
3157  *
3158  * However, when ArchiveRecoveryRequested = true, we're going to
3159  * switch to a new timeline at the end of recovery. We will only
3160  * copy WAL over to the new timeline up to the end of the last
3161  * complete record, so if we did this, we would later create an
3162  * overwrite contrecord in the wrong place, breaking everything.
3163  */
3164  if (!ArchiveRecoveryRequested &&
3166  {
3169  }
3170 
3171  if (readFile >= 0)
3172  {
3173  close(readFile);
3174  readFile = -1;
3175  }
3176 
3177  /*
3178  * We only end up here without a message when XLogPageRead()
3179  * failed - in that case we already logged something. In
3180  * StandbyMode that only happens if we have been triggered, so we
3181  * shouldn't loop anymore in that case.
3182  */
3183  if (errormsg)
3185  (errmsg_internal("%s", errormsg) /* already translated */ ));
3186  }
3187 
3188  /*
3189  * Check page TLI is one of the expected values.
3190  */
3192  {
3193  char fname[MAXFNAMELEN];
3194  XLogSegNo segno;
3195  int32 offset;
3196 
3200  XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3203  (errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%X, offset %u",
3205  fname,
3207  offset)));
3208  record = NULL;
3209  }
3210 
3211  if (record)
3212  {
3213  /* Great, got a record */
3214  return record;
3215  }
3216  else
3217  {
3218  /* No valid record available from this source */
3219  lastSourceFailed = true;
3220 
3221  /*
3222  * If archive recovery was requested, but we were still doing
3223  * crash recovery, switch to archive recovery and retry using the
3224  * offline archive. We have now replayed all the valid WAL in
3225  * pg_wal, so we are presumably now consistent.
3226  *
3227  * We require that there's at least some valid WAL present in
3228  * pg_wal, however (!fetching_ckpt). We could recover using the
3229  * WAL from the archive, even if pg_wal is completely empty, but
3230  * we'd have no idea how far we'd have to replay to reach
3231  * consistency. So err on the safe side and give up.
3232  */
3234  !fetching_ckpt)
3235  {
3236  ereport(DEBUG1,
3237  (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3238  InArchiveRecovery = true;
3241 
3244  minRecoveryPointTLI = replayTLI;
3245 
3247 
3248  /*
3249  * Before we retry, reset lastSourceFailed and currentSource
3250  * so that we will check the archive next.
3251  */
3252  lastSourceFailed = false;
3254 
3255  continue;
3256  }
3257 
3258  /* In standby mode, loop back to retry. Otherwise, give up. */
3260  continue;
3261  else
3262  return NULL;
3263  }
3264  }
3265 }
3266 
3267 /*
3268  * Read the XLOG page containing targetPagePtr into readBuf (if not read
3269  * already). Returns number of bytes read, if the page is read successfully,
3270  * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3271  * but only if they have not been previously reported.
3272  *
3273  * See XLogReaderRoutine.page_read for more details.
3274  *
3275  * While prefetching, xlogreader->nonblocking may be set. In that case,
3276  * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3277  *
3278  * This is responsible for restoring files from archive as needed, as well
3279  * as for waiting for the requested WAL record to arrive in standby mode.
3280  *
3281  * xlogreader->private_data->emode specifies the log level used for reporting
3282  * "file not found" or "end of WAL" situations in archive recovery, or in
3283  * standby mode when promotion is triggered. If set to WARNING or below,
3284  * XLogPageRead() returns XLREAD_FAIL in those situations; at higher log
3285  * levels the ereport() won't return.
3286  *
3287  * In standby mode, if after a successful return of XLogPageRead() the
3288  * caller finds the record it's interested in to be broken, it should
3289  * ereport the error with the level determined by
3290  * emode_for_corrupt_record(), and then set lastSourceFailed
3291  * and call XLogPageRead() again with the same arguments. This lets
3292  * XLogPageRead() try fetching the record from another source, or
3293  * sleep and retry.
3294  */
3295 static int
3297  XLogRecPtr targetRecPtr, char *readBuf)
3298 {
3299  XLogPageReadPrivate *private =
3301  int emode = private->emode;
3302  uint32 targetPageOff;
3303  XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
3304  int r;
3305 
3306  XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3307  targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3308 
3309  /*
3310  * See if we need to switch to a new segment because the requested record
3311  * is not in the currently open one.
3312  */
3313  if (readFile >= 0 &&
3314  !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3315  {
3316  /*
3317  * Request a restartpoint if we've replayed too much xlog since the
3318  * last one.
3319  */
3321  {
3323  {
3324  (void) GetRedoRecPtr();
3327  }
3328  }
3329 
3330  close(readFile);
3331  readFile = -1;
3333  }
3334 
3335  XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3336 
3337 retry:
3338  /* See if we need to retrieve more data */
3339  if (readFile < 0 ||
3341  flushedUpto < targetPagePtr + reqLen))
3342  {
3343  if (readFile >= 0 &&
3346  flushedUpto < targetPagePtr + reqLen)
3347  return XLREAD_WOULDBLOCK;
3348 
3349  switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3350  private->randAccess,
3351  private->fetching_ckpt,
3352  targetRecPtr,
3353  private->replayTLI,
3356  {
3357  case XLREAD_WOULDBLOCK:
3358  return XLREAD_WOULDBLOCK;
3359  case XLREAD_FAIL:
3360  if (readFile >= 0)
3361  close(readFile);
3362  readFile = -1;
3363  readLen = 0;
3365  return XLREAD_FAIL;
3366  case XLREAD_SUCCESS:
3367  break;
3368  }
3369  }
3370 
3371  /*
3372  * At this point, we have the right segment open and if we're streaming we
3373  * know the requested record is in it.
3374  */
3375  Assert(readFile != -1);
3376 
3377  /*
3378  * If the current segment is being streamed from the primary, calculate
3379  * how much of the current page we have received already. We know the
3380  * requested record has been received, but this is for the benefit of
3381  * future calls, to allow quick exit at the top of this function.
3382  */
3384  {
3385  if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3386  readLen = XLOG_BLCKSZ;
3387  else
3389  targetPageOff;
3390  }
3391  else
3392  readLen = XLOG_BLCKSZ;
3393 
3394  /* Read the requested page */
3395  readOff = targetPageOff;
3396 
3397  pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3398  r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
3399  if (r != XLOG_BLCKSZ)
3400  {
3401  char fname[MAXFNAMELEN];
3402  int save_errno = errno;
3403 
3406  if (r < 0)
3407  {
3408  errno = save_errno;
3409  ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3411  errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: %m",
3412  fname, LSN_FORMAT_ARGS(targetPagePtr),
3413  readOff)));
3414  }
3415  else
3416  ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3418  errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: read %d of %zu",
3419  fname, LSN_FORMAT_ARGS(targetPagePtr),
3420  readOff, r, (Size) XLOG_BLCKSZ)));
3421  goto next_record_is_invalid;
3422  }
3424 
3425  Assert(targetSegNo == readSegNo);
3426  Assert(targetPageOff == readOff);
3427  Assert(reqLen <= readLen);
3428 
3430 
3431  /*
3432  * Check the page header immediately, so that we can retry immediately if
3433  * it's not valid. This may seem unnecessary, because ReadPageInternal()
3434  * validates the page header anyway, and would propagate the failure up to
3435  * ReadRecord(), which would retry. However, there's a corner case with
3436  * continuation records, if a record is split across two pages such that
3437  * we would need to read the two pages from different sources. For
3438  * example, imagine a scenario where a streaming replica is started up,
3439  * and replay reaches a record that's split across two WAL segments. The
3440  * first page is only available locally, in pg_wal, because it's already
3441  * been recycled on the primary. The second page, however, is not present
3442  * in pg_wal, and we should stream it from the primary. There is a
3443  * recycled WAL segment present in pg_wal, with garbage contents, however.
3444  * We would read the first page from the local WAL segment, but when
3445  * reading the second page, we would read the bogus, recycled, WAL
3446  * segment. If we didn't catch that case here, we would never recover,
3447  * because ReadRecord() would retry reading the whole record from the
3448  * beginning.
3449  *
3450  * Of course, this only catches errors in the page header, which is what
3451  * happens in the case of a recycled WAL segment. Other kinds of errors or
3452  * corruption still has the same problem. But this at least fixes the
3453  * common case, which can happen as part of normal operation.
3454  *
3455  * Validating the page header is cheap enough that doing it twice
3456  * shouldn't be a big deal from a performance point of view.
3457  *
3458  * When not in standby mode, an invalid page header should cause recovery
3459  * to end, not retry reading the page, so we don't need to validate the
3460  * page header here for the retry. Instead, ReadPageInternal() is
3461  * responsible for the validation.
3462  */
3463  if (StandbyMode &&
3464  !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3465  {
3466  /*
3467  * Emit this error right now then retry this page immediately. Use
3468  * errmsg_internal() because the message was already translated.
3469  */
3470  if (xlogreader->errormsg_buf[0])
3473 
3474  /* reset any error XLogReaderValidatePageHeader() might have set */
3476  goto next_record_is_invalid;
3477  }
3478 
3479  return readLen;
3480 
3481 next_record_is_invalid:
3482 
3483  /*
3484  * If we're reading ahead, give up fast. Retries and error reporting will
3485  * be handled by a later read when recovery catches up to this point.
3486  */
3487  if (xlogreader->nonblocking)
3488  return XLREAD_WOULDBLOCK;
3489 
3490  lastSourceFailed = true;
3491 
3492  if (readFile >= 0)
3493  close(readFile);
3494  readFile = -1;
3495  readLen = 0;
3497 
3498  /* In standby-mode, keep trying */
3499  if (StandbyMode)
3500  goto retry;
3501  else
3502  return XLREAD_FAIL;
3503 }
3504 
3505 /*
3506  * Open the WAL segment containing WAL location 'RecPtr'.
3507  *
3508  * The segment can be fetched via restore_command, or via walreceiver having
3509  * streamed the record, or it can already be present in pg_wal. Checking
3510  * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3511  * too, in case someone copies a new segment directly to pg_wal. That is not
3512  * documented or recommended, though.
3513  *
3514  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3515  * prepare to read WAL starting from RedoStartLSN after this.
3516  *
3517  * 'RecPtr' might not point to the beginning of the record we're interested
3518  * in, it might also point to the page or segment header. In that case,
3519  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3520  * used to decide which timeline to stream the requested WAL from.
3521  *
3522  * 'replayLSN' is the current replay LSN, so that if we scan for new
3523  * timelines, we can reject a switch to a timeline that branched off before
3524  * this point.
3525  *
3526  * If the record is not immediately available, the function returns false
3527  * if we're not in standby mode. In standby mode, waits for it to become
3528  * available.
3529  *
3530  * When the requested record becomes available, the function opens the file
3531  * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3532  * of standby mode is triggered by the user, and there is no more WAL
3533  * available, returns XLREAD_FAIL.
3534  *
3535  * If nonblocking is true, then give up immediately if we can't satisfy the
3536  * request, returning XLREAD_WOULDBLOCK instead of waiting.
3537  */
3538 static XLogPageReadResult
3539 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
3540  bool fetching_ckpt, XLogRecPtr tliRecPtr,
3541  TimeLineID replayTLI, XLogRecPtr replayLSN,
3542  bool nonblocking)
3543 {
3544  static TimestampTz last_fail_time = 0;
3545  TimestampTz now;
3546  bool streaming_reply_sent = false;
3547 
3548  /*-------
3549  * Standby mode is implemented by a state machine:
3550  *
3551  * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3552  * pg_wal (XLOG_FROM_PG_WAL)
3553  * 2. Check for promotion trigger request
3554  * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3555  * 4. Rescan timelines
3556  * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3557  *
3558  * Failure to read from the current source advances the state machine to
3559  * the next state.
3560  *
3561  * 'currentSource' indicates the current state. There are no currentSource
3562  * values for "check trigger", "rescan timelines", and "sleep" states,
3563  * those actions are taken when reading from the previous source fails, as
3564  * part of advancing to the next state.
3565  *
3566  * If standby mode is turned off while reading WAL from stream, we move
3567  * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3568  * the files (which would be required at end of recovery, e.g., timeline
3569  * history file) from archive or pg_wal. We don't need to kill WAL receiver
3570  * here because it's already stopped when standby mode is turned off at
3571  * the end of recovery.
3572  *-------
3573  */
3574  if (!InArchiveRecovery)
3576  else if (currentSource == XLOG_FROM_ANY ||
3578  {
3579  lastSourceFailed = false;
3581  }
3582 
3583  for (;;)
3584  {
3585  XLogSource oldSource = currentSource;
3586  bool startWalReceiver = false;
3587 
3588  /*
3589  * First check if we failed to read from the current source, and
3590  * advance the state machine if so. The failure to read might've
3591  * happened outside this function, e.g when a CRC check fails on a
3592  * record, or within this loop.
3593  */
3594  if (lastSourceFailed)
3595  {
3596  /*
3597  * Don't allow any retry loops to occur during nonblocking
3598  * readahead. Let the caller process everything that has been
3599  * decoded already first.
3600  */
3601  if (nonblocking)
3602  return XLREAD_WOULDBLOCK;
3603 
3604  switch (currentSource)
3605  {
3606  case XLOG_FROM_ARCHIVE:
3607  case XLOG_FROM_PG_WAL:
3608 
3609  /*
3610  * Check to see if promotion is requested. Note that we do
3611  * this only after failure, so when you promote, we still
3612  * finish replaying as much as we can from archive and
3613  * pg_wal before failover.
3614  */
3616  {
3618  return XLREAD_FAIL;
3619  }
3620 
3621  /*
3622  * Not in standby mode, and we've now tried the archive
3623  * and pg_wal.
3624  */
3625  if (!StandbyMode)
3626  return XLREAD_FAIL;
3627 
3628  /*
3629  * Move to XLOG_FROM_STREAM state, and set to start a
3630  * walreceiver if necessary.
3631  */
3633  startWalReceiver = true;
3634  break;
3635 
3636  case XLOG_FROM_STREAM:
3637 
3638  /*
3639  * Failure while streaming. Most likely, we got here
3640  * because streaming replication was terminated, or
3641  * promotion was triggered. But we also get here if we
3642  * find an invalid record in the WAL streamed from the
3643  * primary, in which case something is seriously wrong.
3644  * There's little chance that the problem will just go
3645  * away, but PANIC is not good for availability either,
3646  * especially in hot standby mode. So, we treat that the
3647  * same as disconnection, and retry from archive/pg_wal
3648  * again. The WAL in the archive should be identical to
3649  * what was streamed, so it's unlikely that it helps, but
3650  * one can hope...
3651  */
3652 
3653  /*
3654  * We should be able to move to XLOG_FROM_STREAM only in
3655  * standby mode.
3656  */
3658 
3659  /*
3660  * Before we leave XLOG_FROM_STREAM state, make sure that
3661  * walreceiver is not active, so that it won't overwrite
3662  * WAL that we restore from archive.
3663  */
3665 
3666  /*
3667  * Before we sleep, re-scan for possible new timelines if
3668  * we were requested to recover to the latest timeline.
3669  */
3671  {
3672  if (rescanLatestTimeLine(replayTLI, replayLSN))
3673  {
3675  break;
3676  }
3677  }
3678 
3679  /*
3680  * XLOG_FROM_STREAM is the last state in our state
3681  * machine, so we've exhausted all the options for
3682  * obtaining the requested WAL. We're going to loop back
3683  * and retry from the archive, but if it hasn't been long
3684  * since last attempt, sleep wal_retrieve_retry_interval
3685  * milliseconds to avoid busy-waiting.
3686  */
3688  if (!TimestampDifferenceExceeds(last_fail_time, now,
3690  {
3691  long wait_time;
3692 
3693  wait_time = wal_retrieve_retry_interval -
3694  TimestampDifferenceMilliseconds(last_fail_time, now);
3695 
3696  elog(LOG, "waiting for WAL to become available at %X/%X",
3697  LSN_FORMAT_ARGS(RecPtr));
3698 
3699  /* Do background tasks that might benefit us later. */
3701 
3705  wait_time,
3706  WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
3709 
3710  /* Handle interrupt signals of startup process */
3712  }
3713  last_fail_time = now;
3715  break;
3716 
3717  default:
3718  elog(ERROR, "unexpected WAL source %d", currentSource);
3719  }
3720  }
3721  else if (currentSource == XLOG_FROM_PG_WAL)
3722  {
3723  /*
3724  * We just successfully read a file in pg_wal. We prefer files in
3725  * the archive over ones in pg_wal, so try the next file again
3726  * from the archive first.
3727  */
3728  if (InArchiveRecovery)
3730  }
3731 
3732  if (currentSource != oldSource)
3733  elog(DEBUG2, "switched WAL source from %s to %s after %s",
3735  lastSourceFailed ? "failure" : "success");
3736 
3737  /*
3738  * We've now handled possible failure. Try to read from the chosen
3739  * source.
3740  */
3741  lastSourceFailed = false;
3742 
3743  switch (currentSource)
3744  {
3745  case XLOG_FROM_ARCHIVE:
3746  case XLOG_FROM_PG_WAL:
3747 
3748  /*
3749  * WAL receiver must not be running when reading WAL from
3750  * archive or pg_wal.
3751  */
3752  Assert(!WalRcvStreaming());
3753 
3754  /* Close any old file we might have open. */
3755  if (readFile >= 0)
3756  {
3757  close(readFile);
3758  readFile = -1;
3759  }
3760  /* Reset curFileTLI if random fetch. */
3761  if (randAccess)
3762  curFileTLI = 0;
3763 
3764  /*
3765  * Try to restore the file from archive, or read an existing
3766  * file from pg_wal.
3767  */
3770  currentSource);
3771  if (readFile >= 0)
3772  return XLREAD_SUCCESS; /* success! */
3773 
3774  /*
3775  * Nope, not found in archive or pg_wal.
3776  */
3777  lastSourceFailed = true;
3778  break;
3779 
3780  case XLOG_FROM_STREAM:
3781  {
3782  bool havedata;
3783 
3784  /*
3785  * We should be able to move to XLOG_FROM_STREAM only in
3786  * standby mode.
3787  */
3789 
3790  /*
3791  * First, shutdown walreceiver if its restart has been
3792  * requested -- but no point if we're already slated for
3793  * starting it.
3794  */
3795  if (pendingWalRcvRestart && !startWalReceiver)
3796  {
3798 
3799  /*
3800  * Re-scan for possible new timelines if we were
3801  * requested to recover to the latest timeline.
3802  */
3805  rescanLatestTimeLine(replayTLI, replayLSN);
3806 
3807  startWalReceiver = true;
3808  }
3809  pendingWalRcvRestart = false;
3810 
3811  /*
3812  * Launch walreceiver if needed.
3813  *
3814  * If fetching_ckpt is true, RecPtr points to the initial
3815  * checkpoint location. In that case, we use RedoStartLSN
3816  * as the streaming start position instead of RecPtr, so
3817  * that when we later jump backwards to start redo at
3818  * RedoStartLSN, we will have the logs streamed already.
3819  */
3820  if (startWalReceiver &&
3821  PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
3822  {
3823  XLogRecPtr ptr;
3824  TimeLineID tli;
3825 
3826  if (fetching_ckpt)
3827  {
3828  ptr = RedoStartLSN;
3829  tli = RedoStartTLI;
3830  }
3831  else
3832  {
3833  ptr = RecPtr;
3834 
3835  /*
3836  * Use the record begin position to determine the
3837  * TLI, rather than the position we're reading.
3838  */
3839  tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
3840 
3841  if (curFileTLI > 0 && tli < curFileTLI)
3842  elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3843  LSN_FORMAT_ARGS(tliRecPtr),
3844  tli, curFileTLI);
3845  }
3846  curFileTLI = tli;
3851  flushedUpto = 0;
3852  }
3853 
3854  /*
3855  * Check if WAL receiver is active or wait to start up.
3856  */
3857  if (!WalRcvStreaming())
3858  {
3859  lastSourceFailed = true;
3860  break;
3861  }
3862 
3863  /*
3864  * Walreceiver is active, so see if new data has arrived.
3865  *
3866  * We only advance XLogReceiptTime when we obtain fresh
3867  * WAL from walreceiver and observe that we had already
3868  * processed everything before the most recent "chunk"
3869  * that it flushed to disk. In steady state where we are
3870  * keeping up with the incoming data, XLogReceiptTime will
3871  * be updated on each cycle. When we are behind,
3872  * XLogReceiptTime will not advance, so the grace time
3873  * allotted to conflicting queries will decrease.
3874  */
3875  if (RecPtr < flushedUpto)
3876  havedata = true;
3877  else
3878  {
3879  XLogRecPtr latestChunkStart;
3880 
3881  flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3882  if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
3883  {
3884  havedata = true;
3885  if (latestChunkStart <= RecPtr)
3886  {
3889  }
3890  }
3891  else
3892  havedata = false;
3893  }
3894  if (havedata)
3895  {
3896  /*
3897  * Great, streamed far enough. Open the file if it's
3898  * not open already. Also read the timeline history
3899  * file if we haven't initialized timeline history
3900  * yet; it should be streamed over and present in
3901  * pg_wal by now. Use XLOG_FROM_STREAM so that source
3902  * info is set correctly and XLogReceiptTime isn't
3903  * changed.
3904  *
3905  * NB: We must set readTimeLineHistory based on
3906  * recoveryTargetTLI, not receiveTLI. Normally they'll
3907  * be the same, but if recovery_target_timeline is
3908  * 'latest' and archiving is configured, then it's
3909  * possible that we managed to retrieve one or more
3910  * new timeline history files from the archive,
3911  * updating recoveryTargetTLI.
3912  */
3913  if (readFile < 0)
3914  {
3915  if (!expectedTLEs)
3918  receiveTLI,
3919  XLOG_FROM_STREAM, false);
3920  Assert(readFile >= 0);
3921  }
3922  else
3923  {
3924  /* just make sure source info is correct... */
3927  return XLREAD_SUCCESS;
3928  }
3929  break;
3930  }
3931 
3932  /* In nonblocking mode, return rather than sleeping. */
3933  if (nonblocking)
3934  return XLREAD_WOULDBLOCK;
3935 
3936  /*
3937  * Data not here yet. Check for trigger, then wait for
3938  * walreceiver to wake us up when new WAL arrives.
3939  */
3940  if (CheckForStandbyTrigger())
3941  {
3942  /*
3943  * Note that we don't return XLREAD_FAIL immediately
3944  * here. After being triggered, we still want to
3945  * replay all the WAL that was already streamed. It's
3946  * in pg_wal now, so we just treat this as a failure,
3947  * and the state machine will move on to replay the
3948  * streamed WAL from pg_wal, and then recheck the
3949  * trigger and exit replay.
3950  */
3951  lastSourceFailed = true;
3952  break;
3953  }
3954 
3955  /*
3956  * Since we have replayed everything we have received so
3957  * far and are about to start waiting for more WAL, let's
3958  * tell the upstream server our replay location now so
3959  * that pg_stat_replication doesn't show stale
3960  * information.
3961  */
3962  if (!streaming_reply_sent)
3963  {
3964  WalRcvForceReply();
3965  streaming_reply_sent = true;
3966  }
3967 
3968  /* Do any background tasks that might benefit us later. */
3970 
3971  /* Update pg_stat_recovery_prefetch before sleeping. */
3973 
3974  /*
3975  * Wait for more WAL to arrive, when we will be woken
3976  * immediately by the WAL receiver.
3977  */
3980  -1L,
3981  WAIT_EVENT_RECOVERY_WAL_STREAM);
3983  break;
3984  }
3985 
3986  default:
3987  elog(ERROR, "unexpected WAL source %d", currentSource);
3988  }
3989 
3990  /*
3991  * Check for recovery pause here so that we can confirm more quickly
3992  * that a requested pause has actually taken effect.
3993  */
3994  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
3996  recoveryPausesHere(false);
3997 
3998  /*
3999  * This possibly-long loop needs to handle interrupts of startup
4000  * process.
4001  */
4003  }
4004 
4005  return XLREAD_FAIL; /* not reached */
4006 }
4007 
4008 
4009 /*
4010  * Determine what log level should be used to report a corrupt WAL record
4011  * in the current WAL page, previously read by XLogPageRead().
4012  *
4013  * 'emode' is the error mode that would be used to report a file-not-found
4014  * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4015  * we're retrying the exact same record that we've tried previously, only
4016  * complain the first time to keep the noise down. However, we only do when
4017  * reading from pg_wal, because we don't expect any invalid records in archive
4018  * or in records streamed from the primary. Files in the archive should be complete,
4019  * and we should never hit the end of WAL because we stop and wait for more WAL
4020  * to arrive before replaying it.
4021  *
4022  * NOTE: This function remembers the RecPtr value it was last called with,
4023  * to suppress repeated messages about the same record. Only call this when
4024  * you are about to ereport(), or you might cause a later message to be
4025  * erroneously suppressed.
4026  */
4027 static int
4029 {
4030  static XLogRecPtr lastComplaint = 0;
4031 
4032  if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
4033  {
4034  if (RecPtr == lastComplaint)
4035  emode = DEBUG1;
4036  else
4037  lastComplaint = RecPtr;
4038  }
4039  return emode;
4040 }
4041 
4042 
4043 /*
4044  * Subroutine to try to fetch and validate a prior checkpoint record.
4045  */
4046 static XLogRecord *
4048  TimeLineID replayTLI)
4049 {
4050  XLogRecord *record;
4051  uint8 info;
4052 
4053  Assert(xlogreader != NULL);
4054 
4055  if (!XRecOffIsValid(RecPtr))
4056  {
4057  ereport(LOG,
4058  (errmsg("invalid checkpoint location")));
4059  return NULL;
4060  }
4061 
4063  record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4064 
4065  if (record == NULL)
4066  {
4067  ereport(LOG,
4068  (errmsg("invalid checkpoint record")));
4069  return NULL;
4070  }
4071  if (record->xl_rmid != RM_XLOG_ID)
4072  {
4073  ereport(LOG,
4074  (errmsg("invalid resource manager ID in checkpoint record")));
4075  return NULL;
4076  }
4077  info = record->xl_info & ~XLR_INFO_MASK;
4078  if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4079  info != XLOG_CHECKPOINT_ONLINE)
4080  {
4081  ereport(LOG,
4082  (errmsg("invalid xl_info in checkpoint record")));
4083  return NULL;
4084  }
4086  {
4087  ereport(LOG,
4088  (errmsg("invalid length of checkpoint record")));
4089  return NULL;
4090  }
4091  return record;
4092 }
4093 
4094 /*
4095  * Scan for new timelines that might have appeared in the archive since we
4096  * started recovery.
4097  *
4098  * If there are any, the function changes recovery target TLI to the latest
4099  * one and returns 'true'.
4100  */
4101 static bool
4103 {
4104  List *newExpectedTLEs;
4105  bool found;
4106  ListCell *cell;
4107  TimeLineID newtarget;
4108  TimeLineID oldtarget = recoveryTargetTLI;
4109  TimeLineHistoryEntry *currentTle = NULL;
4110 
4112  if (newtarget == recoveryTargetTLI)
4113  {
4114  /* No new timelines found */
4115  return false;
4116  }
4117 
4118  /*
4119  * Determine the list of expected TLIs for the new TLI
4120  */
4121 
4122  newExpectedTLEs = readTimeLineHistory(newtarget);
4123 
4124  /*
4125  * If the current timeline is not part of the history of the new timeline,
4126  * we cannot proceed to it.
4127  */
4128  found = false;
4129  foreach(cell, newExpectedTLEs)
4130  {
4131  currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4132 
4133  if (currentTle->tli == recoveryTargetTLI)
4134  {
4135  found = true;
4136  break;
4137  }
4138  }
4139  if (!found)
4140  {
4141  ereport(LOG,
4142  (errmsg("new timeline %u is not a child of database system timeline %u",
4143  newtarget,
4144  replayTLI)));
4145  return false;
4146  }
4147 
4148  /*
4149  * The current timeline was found in the history file, but check that the
4150  * next timeline was forked off from it *after* the current recovery
4151  * location.
4152  */
4153  if (currentTle->end < replayLSN)
4154  {
4155  ereport(LOG,
4156  (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4157  newtarget,
4158  replayTLI,
4159  LSN_FORMAT_ARGS(replayLSN))));
4160  return false;
4161  }
4162 
4163  /* The new timeline history seems valid. Switch target */
4164  recoveryTargetTLI = newtarget;
4166  expectedTLEs = newExpectedTLEs;
4167 
4168  /*
4169  * As in StartupXLOG(), try to ensure we have all the history files
4170  * between the old target and new target in pg_wal.
4171  */
4172  restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4173 
4174  ereport(LOG,
4175  (errmsg("new target timeline is %u",
4176  recoveryTargetTLI)));
4177 
4178  return true;
4179 }
4180 
4181 
4182 /*
4183  * Open a logfile segment for reading (during recovery).
4184  *
4185  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4186  * Otherwise, it's assumed to be already available in pg_wal.
4187  */
4188 static int
4189 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
4190  XLogSource source, bool notfoundOk)
4191 {
4192  char xlogfname[MAXFNAMELEN];
4193  char activitymsg[MAXFNAMELEN + 16];
4194  char path[MAXPGPATH];
4195  int fd;
4196 
4197  XLogFileName(xlogfname, tli, segno, wal_segment_size);
4198 
4199  switch (source)
4200  {
4201  case XLOG_FROM_ARCHIVE:
4202  /* Report recovery progress in PS display */
4203  snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4204  xlogfname);
4205  set_ps_display(activitymsg);
4206 
4207  if (!RestoreArchivedFile(path, xlogfname,
4208  "RECOVERYXLOG",
4210  InRedo))
4211  return -1;
4212  break;
4213 
4214  case XLOG_FROM_PG_WAL:
4215  case XLOG_FROM_STREAM:
4216  XLogFilePath(path, tli, segno, wal_segment_size);
4217  break;
4218 
4219  default:
4220  elog(ERROR, "invalid XLogFileRead source %d", source);
4221  }
4222 
4223  /*
4224  * If the segment was fetched from archival storage, replace the existing
4225  * xlog segment (if any) with the archival version.
4226  */
4227  if (source == XLOG_FROM_ARCHIVE)
4228  {
4230  KeepFileRestoredFromArchive(path, xlogfname);
4231 
4232  /*
4233  * Set path to point at the new file in pg_wal.
4234  */
4235  snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4236  }
4237 
4238  fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4239  if (fd >= 0)
4240  {
4241  /* Success! */
4242  curFileTLI = tli;
4243 
4244  /* Report recovery progress in PS display */
4245  snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4246  xlogfname);
4247  set_ps_display(activitymsg);
4248 
4249  /* Track source of data in assorted state variables */
4250  readSource = source;
4252  /* In FROM_STREAM case, caller tracks receipt time, not me */
4253  if (source != XLOG_FROM_STREAM)
4255 
4256  return fd;
4257  }
4258  if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4259  ereport(PANIC,
4261  errmsg("could not open file \"%s\": %m", path)));
4262  return -1;
4263 }
4264 
4265 /*
4266  * Open a logfile segment for reading (during recovery).
4267  *
4268  * This version searches for the segment with any TLI listed in expectedTLEs.
4269  */
4270 static int
4272 {
4273  char path[MAXPGPATH];
4274  ListCell *cell;
4275  int fd;
4276  List *tles;
4277 
4278  /*
4279  * Loop looking for a suitable timeline ID: we might need to read any of
4280  * the timelines listed in expectedTLEs.
4281  *
4282  * We expect curFileTLI on entry to be the TLI of the preceding file in
4283  * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4284  * to go backwards; this prevents us from picking up the wrong file when a
4285  * parent timeline extends to higher segment numbers than the child we
4286  * want to read.
4287  *
4288  * If we haven't read the timeline history file yet, read it now, so that
4289  * we know which TLIs to scan. We don't save the list in expectedTLEs,
4290  * however, unless we actually find a valid segment. That way if there is
4291  * neither a timeline history file nor a WAL segment in the archive, and
4292  * streaming replication is set up, we'll read the timeline history file
4293  * streamed from the primary when we start streaming, instead of
4294  * recovering with a dummy history generated here.
4295  */
4296  if (expectedTLEs)
4297  tles = expectedTLEs;
4298  else
4300 
4301  foreach(cell, tles)
4302  {
4304  TimeLineID tli = hent->tli;
4305 
4306  if (tli < curFileTLI)
4307  break; /* don't bother looking at too-old TLIs */
4308 
4309  /*
4310  * Skip scanning the timeline ID that the logfile segment to read
4311  * doesn't belong to
4312  */
4313  if (hent->begin != InvalidXLogRecPtr)
4314  {
4315  XLogSegNo beginseg = 0;
4316 
4317  XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4318 
4319  /*
4320  * The logfile segment that doesn't belong to the timeline is
4321  * older or newer than the segment that the timeline started or
4322  * ended at, respectively. It's sufficient to check only the
4323  * starting segment of the timeline here. Since the timelines are
4324  * scanned in descending order in this loop, any segments newer
4325  * than the ending segment should belong to newer timeline and
4326  * have already been read before. So it's not necessary to check
4327  * the ending segment of the timeline here.
4328  */
4329  if (segno < beginseg)
4330  continue;
4331  }
4332 
4334  {
4335  fd = XLogFileRead(segno, emode, tli,
4336  XLOG_FROM_ARCHIVE, true);
4337  if (fd != -1)
4338  {
4339  elog(DEBUG1, "got WAL segment from archive");
4340  if (!expectedTLEs)
4341  expectedTLEs = tles;
4342  return fd;
4343  }
4344  }
4345 
4347  {
4348  fd = XLogFileRead(segno, emode, tli,
4349  XLOG_FROM_PG_WAL, true);
4350  if (fd != -1)
4351  {
4352  if (!expectedTLEs)
4353  expectedTLEs = tles;
4354  return fd;
4355  }
4356  }
4357  }
4358 
4359  /* Couldn't find it. For simplicity, complain about front timeline */
4361  errno = ENOENT;
4362  ereport(emode,
4364  errmsg("could not open file \"%s\": %m", path)));
4365  return -1;
4366 }
4367 
4368 /*
4369  * Set flag to signal the walreceiver to restart. (The startup process calls
4370  * this on noticing a relevant configuration change.)
4371  */
4372 void
4374 {
4376  {
4377  ereport(LOG,
4378  (errmsg("WAL receiver process shutdown requested")));
4379 
4380  pendingWalRcvRestart = true;
4381  }
4382 }
4383 
4384 
4385 /*
4386  * Has a standby promotion already been triggered?
4387  *
4388  * Unlike CheckForStandbyTrigger(), this works in any process
4389  * that's connected to shared memory.
4390  */
4391 bool
4393 {
4394  /*
4395  * We check shared state each time only until a standby promotion is
4396  * triggered. We can't trigger a promotion again, so there's no need to
4397  * keep checking after the shared variable has once been seen true.
4398  */
4400  return true;
4401 
4405 
4406  return LocalPromoteIsTriggered;
4407 }
4408 
4409 static void
4411 {
4415 
4416  /*
4417  * Mark the recovery pause state as 'not paused' because the paused state
4418  * ends and promotion continues if a promotion is triggered while recovery
4419  * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4420  * return 'paused' while a promotion is ongoing.
4421  */
4422  SetRecoveryPause(false);
4423 
4424  LocalPromoteIsTriggered = true;
4425 }
4426 
4427 /*
4428  * Check whether a promote request has arrived.
4429  */
4430 static bool
4432 {
4434  return true;
4435 
4437  {
4438  ereport(LOG, (errmsg("received promote request")));
4442  return true;
4443  }
4444 
4445  return false;
4446 }
4447 
/*
 * Remove the file signaling a standby promotion request.
 */
4451 void
4453 {
4454  unlink(PROMOTE_SIGNAL_FILE);
4455 }
4456 
4457 /*
4458  * Check to see if a promote request has arrived.
4459  */
4460 bool
4462 {
4463  struct stat stat_buf;
4464 
4465  if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4466  return true;
4467 
4468  return false;
4469 }
4470 
4471 /*
4472  * Wake up startup process to replay newly arrived WAL, or to notice that
4473  * failover has been requested.
4474  */
4475 void
4477 {
4479 }
4480 
4481 /*
4482  * Schedule a walreceiver wakeup in the main recovery loop.
4483  */
4484 void
4486 {
4488 }
4489 
4490 /*
4491  * Is HotStandby active yet? This is only important in special backends
4492  * since normal backends won't ever be able to connect until this returns
4493  * true. Postmaster knows this by way of signal, not via shared memory.
4494  *
4495  * Unlike testing standbyState, this works in any process that's connected to
4496  * shared memory. (And note that standbyState alone doesn't tell the truth
4497  * anyway.)
4498  */
4499 bool
4501 {
4502  /*
4503  * We check shared state each time only until Hot Standby is active. We
4504  * can't de-activate Hot Standby, so there's no need to keep checking
4505  * after the shared variable has once been seen true.
4506  */
4508  return true;
4509  else
4510  {
4511  /* spinlock is essential on machines with weak memory ordering! */
4515 
4516  return LocalHotStandbyActive;
4517  }
4518 }
4519 
4520 /*
4521  * Like HotStandbyActive(), but to be used only in WAL replay code,
4522  * where we don't need to ask any other process what the state is.
4523  */
4524 static bool
4526 {
4528  return LocalHotStandbyActive;
4529 }
4530 
4531 /*
4532  * Get latest redo apply position.
4533  *
4534  * Exported to allow WALReceiver to read the pointer directly.
4535  */
4536 XLogRecPtr
4538 {
4539  XLogRecPtr recptr;
4540  TimeLineID tli;
4541 
4546 
4547  if (replayTLI)
4548  *replayTLI = tli;
4549  return recptr;
4550 }
4551 
4552 
4553 /*
4554  * Get position of last applied, or the record being applied.
4555  *
4556  * This is different from GetXLogReplayRecPtr() in that if a WAL
4557  * record is currently being applied, this includes that record.
4558  */
4559 XLogRecPtr
4561 {
4562  XLogRecPtr recptr;
4563  TimeLineID tli;
4564 
4566  recptr = XLogRecoveryCtl->replayEndRecPtr;
4569 
4570  if (replayEndTLI)
4571  *replayEndTLI = tli;
4572  return recptr;
4573 }
4574 
4575 /*
4576  * Save timestamp of latest processed commit/abort record.
4577  *
4578  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4579  * seen by processes other than the startup process. Note in particular
4580  * that CreateRestartPoint is executed in the checkpointer.
4581  */
4582 static void
4584 {
4588 }
4589 
4590 /*
4591  * Fetch timestamp of latest processed commit/abort record.
4592  */
4595 {
4596  TimestampTz xtime;
4597 
4601 
4602  return xtime;
4603 }
4604 
4605 /*
4606  * Save timestamp of the next chunk of WAL records to apply.
4607  *
4608  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4609  * seen by all backends.
4610  */
4611 static void
4613 {
4617 }
4618 
/*
 * Fetch timestamp of the current chunk of WAL records being applied.
 * Startup process maintains an accurate local copy in XLogReceiptTime.
 */
4625 {
4626  TimestampTz xtime;
4627 
4631 
4632  return xtime;
4633 }
4634 
4635 /*
4636  * Returns time of receipt of current chunk of XLOG data, as well as
4637  * whether it was received from streaming replication or from archives.
4638  */
4639 void
4640 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4641 {
4642  /*
4643  * This must be executed in the startup process, since we don't export the
4644  * relevant state to shared memory.
4645  */
4646  Assert(InRecovery);
4647 
4648  *rtime = XLogReceiptTime;
4649  *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4650 }
4651 
/*
 * Note that the text field supplied is a parameter name and does not
 * require translation.
 */
4656 void
4657 RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4658 {
4659  if (currValue < minValue)
4660  {
4662  {
4663  bool warned_for_promote = false;
4664 
4665  ereport(WARNING,
4666  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4667  errmsg("hot standby is not possible because of insufficient parameter settings"),
4668  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4669  param_name,
4670  currValue,
4671  minValue)));
4672 
4673  SetRecoveryPause(true);
4674 
4675  ereport(LOG,
4676  (errmsg("recovery has paused"),
4677  errdetail("If recovery is unpaused, the server will shut down."),
4678  errhint("You can then restart the server after making the necessary configuration changes.")));
4679 
4681  {
4683 
4684  if (CheckForStandbyTrigger())
4685  {
4686  if (!warned_for_promote)
4687  ereport(WARNING,
4688  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4689  errmsg("promotion is not possible because of insufficient parameter settings"),
4690 
4691  /*
4692  * Repeat the detail from above so it's easy to find
4693  * in the log.
4694  */
4695  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4696  param_name,
4697  currValue,
4698  minValue),
4699  errhint("Restart the server after making the necessary configuration changes.")));
4700  warned_for_promote = true;
4701  }
4702 
4703  /*
4704  * If recovery pause is requested then set it paused. While
4705  * we are in the loop, user might resume and pause again so
4706  * set this every time.
4707  */
4709 
4710  /*
4711  * We wait on a condition variable that will wake us as soon
4712  * as the pause ends, but we use a timeout so we can check the
4713  * above conditions periodically too.
4714  */
4716  WAIT_EVENT_RECOVERY_PAUSE);
4717  }
4719  }
4720 
4721  ereport(FATAL,
4722  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4723  errmsg("recovery aborted because of insufficient parameter settings"),
4724  /* Repeat the detail from above so it's easy to find in the log. */
4725  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4726  param_name,
4727  currValue,
4728  minValue),
4729  errhint("You can restart the server after making the necessary configuration changes.")));
4730  }
4731 }
4732 
4733 
4734 /*
4735  * GUC check_hook for primary_slot_name
4736  */
4737 bool
4739 {
4740  if (*newval && strcmp(*newval, "") != 0 &&
4742  return false;
4743 
4744  return true;
4745 }
4746 
4747 /*
4748  * Recovery target settings: Only one of the several recovery_target* settings
4749  * may be set. Setting a second one results in an error. The global variable
4750  * recoveryTarget tracks which kind of recovery target was chosen. Other
4751  * variables store the actual target value (for example a string or a xid).
4752  * The assign functions of the parameters check whether a competing parameter
4753  * was already set. But we want to allow setting the same parameter multiple
4754  * times. We also want to allow unsetting a parameter and setting a different
4755  * one, so we unset recoveryTarget when the parameter is set to an empty
4756  * string.
4757  *
4758  * XXX this code is broken by design. Throwing an error from a GUC assign
4759  * hook breaks fundamental assumptions of guc.c. So long as all the variables
4760  * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4761  * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4762  * that we have odd behaviors such as unexpected GUC ordering dependencies.
4763  */
4764 
4765 static void
4767 error_multiple_recovery_targets(void)
4768 {
4769  ereport(ERROR,
4770  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4771  errmsg("multiple recovery targets specified"),
4772  errdetail("At most one of recovery_target, recovery_target_lsn, recovery_target_name, recovery_target_time, recovery_target_xid may be set.")));
4773 }
4774 
4775 /*
4776  * GUC check_hook for recovery_target
4777  */
4778 bool
4780 {
4781  if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4782  {
4783  GUC_check_errdetail("The only allowed value is \"immediate\".");
4784  return false;
4785  }
4786  return true;
4787 }
4788 
4789 /*
4790  * GUC assign_hook for recovery_target
4791  */
4792 void
4793 assign_recovery_target(const char *newval, void *extra)
4794 {
4797  error_multiple_recovery_targets();
4798 
4799  if (newval && strcmp(newval, "") != 0)
4801  else
4803 }
4804 
4805 /*
4806  * GUC check_hook for recovery_target_lsn
4807  */
4808 bool
4810 {
4811  if (strcmp(*newval, "") != 0)
4812  {
4813  XLogRecPtr lsn;
4814  XLogRecPtr *myextra;
4815  bool have_error = false;
4816 
4817  lsn = pg_lsn_in_internal(*newval, &have_error);
4818  if (have_error)
4819  return false;
4820 
4821  myextra = (XLogRecPtr *) guc_malloc(ERROR, sizeof(XLogRecPtr));
4822  *myextra = lsn;
4823  *extra = (void *) myextra;
4824  }
4825  return true;
4826 }
4827 
4828 /*
4829  * GUC assign_hook for recovery_target_lsn
4830  */
4831 void
4832 assign_recovery_target_lsn(const char *newval, void *extra)
4833 {
4836  error_multiple_recovery_targets();
4837 
4838  if (newval && strcmp(newval, "") != 0)
4839  {
4841  recoveryTargetLSN = *((XLogRecPtr *) extra);
4842  }
4843  else
4845 }
4846 
4847 /*
4848  * GUC check_hook for recovery_target_name
4849  */
4850 bool
4852 {
4853  /* Use the value of newval directly */
4854  if (strlen(*newval) >= MAXFNAMELEN)
4855  {
4856  GUC_check_errdetail("%s is too long (maximum %d characters).",
4857  "recovery_target_name", MAXFNAMELEN - 1);
4858  return false;
4859  }
4860  return true;
4861 }
4862 
4863 /*
4864  * GUC assign_hook for recovery_target_name
4865  */
4866 void
4867 assign_recovery_target_name(const char *newval, void *extra)
4868 {
4871  error_multiple_recovery_targets();
4872 
4873  if (newval && strcmp(newval, "") != 0)
4874  {
4877  }
4878  else
4880 }
4881 
4882 /*
4883  * GUC check_hook for recovery_target_time
4884  *
4885  * The interpretation of the recovery_target_time string can depend on the
4886  * time zone setting, so we need to wait until after all GUC processing is
4887  * done before we can do the final parsing of the string. This check function
4888  * only does a parsing pass to catch syntax errors, but we store the string
4889  * and parse it again when we need to use it.
4890  */
4891 bool
4893 {
4894  if (strcmp(*newval, "") != 0)
4895  {
4896  /* reject some special values */
4897  if (strcmp(*newval, "now") == 0 ||
4898  strcmp(*newval, "today") == 0 ||
4899  strcmp(*newval, "tomorrow") == 0 ||
4900  strcmp(*newval, "yesterday") == 0)
4901  {
4902  return false;
4903  }
4904 
4905  /*
4906  * parse timestamp value (see also timestamptz_in())
4907  */
4908  {
4909  char *str = *newval;
4910  fsec_t fsec;
4911  struct pg_tm tt,
4912  *tm = &tt;
4913  int tz;
4914  int dtype;
4915  int nf;
4916  int dterr;
4917  char *field[MAXDATEFIELDS];
4918  int ftype[MAXDATEFIELDS];
4919  char workbuf[MAXDATELEN + MAXDATEFIELDS];
4920  DateTimeErrorExtra dtextra;
4922 
4923  dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4924  field, ftype, MAXDATEFIELDS, &nf);
4925  if (dterr == 0)
4926  dterr = DecodeDateTime(field, ftype, nf,
4927  &dtype, tm, &fsec, &tz, &dtextra);
4928  if (dterr != 0)
4929  return false;
4930  if (dtype != DTK_DATE)
4931  return false;
4932 
4933  if (tm2timestamp(tm, fsec, &tz, &timestamp) != 0)
4934  {
4935  GUC_check_errdetail("timestamp out of range: \"%s\"", str);
4936  return false;
4937  }
4938  }
4939  }
4940  return true;
4941 }
4942 
4943 /*
4944  * GUC assign_hook for recovery_target_time
4945  */
4946 void
4947 assign_recovery_target_time(const char *newval, void *extra)
4948 {
4951  error_multiple_recovery_targets();
4952 
4953  if (newval && strcmp(newval, "") != 0)
4955  else
4957 }
4958 
4959 /*
4960  * GUC check_hook for recovery_target_timeline
4961  */
4962 bool
4964 {
4966  RecoveryTargetTimeLineGoal *myextra;
4967 
4968  if (strcmp(*newval, "current") == 0)
4970  else if (strcmp(*newval, "latest") == 0)
4972  else
4973  {
4975 
4976  errno = 0;
4977  strtoul(*newval, NULL, 0);
4978  if (errno == EINVAL || errno == ERANGE)
4979  {
4980  GUC_check_errdetail("recovery_target_timeline is not a valid number.");
4981  return false;
4982  }
4983  }
4984 
4986  *myextra = rttg;
4987  *extra = (void *) myextra;
4988 
4989  return true;
4990 }
4991 
4992 /*
4993  * GUC assign_hook for recovery_target_timeline
4994  */
4995 void
4996 assign_recovery_target_timeline(const char *newval, void *extra)
4997 {
5000  recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
5001  else
5003 }
5004 
5005 /*
5006  * GUC check_hook for recovery_target_xid
5007  */
5008 bool
5010 {
5011  if (strcmp(*newval, "") != 0)
5012  {
5013  TransactionId xid;
5014  TransactionId *myextra;
5015 
5016  errno = 0;
5017  xid = (TransactionId) strtou64(*newval, NULL, 0);
5018  if (errno == EINVAL || errno == ERANGE)
5019  return false;
5020 
5021  myextra = (TransactionId *) guc_malloc(ERROR, sizeof(TransactionId));
5022  *myextra = xid;
5023  *extra = (void *) myextra;
5024  }
5025  return true;
5026 }
5027 
5028 /*
5029  * GUC assign_hook for recovery_target_xid
5030  */
5031 void
5032 assign_recovery_target_xid(const char *newval, void *extra)
5033 {
5036  error_multiple_recovery_targets();
5037 
5038  if (newval && strcmp(newval, "") != 0)
5039  {
5041  recoveryTargetXid = *((TransactionId *) extra);
5042  }
5043  else
5045 }
TimeLineID findNewestTimeLine(TimeLineID startTLI)
Definition: timeline.c:264
TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history)
Definition: timeline.c:544
XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI)
Definition: timeline.c:572
bool existsTimeLineHistory(TimeLineID probeTLI)
Definition: timeline.c:222
void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end)
Definition: timeline.c:50
List * readTimeLineHistory(TimeLineID targetTLI)
Definition: timeline.c:76
bool tliInHistory(TimeLineID tli, List *expectedTLEs)
Definition: timeline.c:526
void remove_tablespace_symlink(const char *linkloc)
Definition: tablespace.c:889
bool allow_in_place_tablespaces
Definition: tablespace.c:91
void HandleStartupProcInterrupts(void)
Definition: startup.c:155
void disable_startup_progress_timeout(void)
Definition: startup.c:305
bool IsPromoteSignaled(void)
Definition: startup.c:284
void begin_startup_progress_phase(void)
Definition: startup.c:339
void ResetPromoteSignaled(void)
Definition: startup.c:290
int ParseDateTime(const char *timestr, char *workbuf, size_t buflen, char **field, int *ftype, int maxfields, int *numfields)
Definition: datetime.c:756
int DecodeDateTime(char **field, int *ftype, int nf, int *dtype, struct pg_tm *tm, fsec_t *fsec, int *tzp, DateTimeErrorExtra *extra)
Definition: datetime.c:980
long TimestampDifferenceMilliseconds(TimestampTz start_time, TimestampTz stop_time)
Definition: timestamp.c:1767
int tm2timestamp(struct pg_tm *tm, fsec_t fsec, int *tzp, Timestamp *result)
Definition: timestamp.c:1998
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1791
Datum timestamptz_in(PG_FUNCTION_ARGS)
Definition: timestamp.c:417
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1655
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1619
const char * timestamptz_to_str(TimestampTz t)
Definition: timestamp.c:1854
uint32 BlockNumber
Definition: block.h:31
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4578
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:4796
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:350
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:159
@ RBM_NORMAL_NO_LOG
Definition: bufmgr.h:50
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:301
Pointer Page
Definition: bufpage.h:78
static XLogRecPtr PageGetLSN(Page page)
Definition: bufpage.h:383
unsigned int uint32
Definition: c.h:493
signed int int32
Definition: c.h:481
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:169
#define PG_BINARY
Definition: c.h:1260
#define UINT64_FORMAT
Definition: c.h:536
#define strtou64(str, endptr, base)
Definition: c.h:1285
unsigned char uint8
Definition: c.h:491
uint32 TransactionId
Definition: c.h:639
size_t Size
Definition: c.h:592
void RequestCheckpoint(int flags)
Definition: checkpointer.c:935
bool ConditionVariableCancelSleep(void)
bool ConditionVariableTimedSleep(ConditionVariable *cv, long timeout, uint32 wait_event_info)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
int64 TimestampTz
Definition: timestamp.h:39
int32 fsec_t
Definition: timestamp.h:41
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1160
int errcode_for_file_access(void)
Definition: elog.c:883
int errdetail(const char *fmt,...)
Definition: elog.c:1206
ErrorContextCallback * error_context_stack
Definition: elog.c:95
int errhint(const char *fmt,...)
Definition: elog.c:1320
int errcode(int sqlerrcode)
Definition: elog.c:860
int errmsg(const char *fmt,...)
Definition: elog.c:1073
#define LOG
Definition: elog.h:31
#define errcontext
Definition: elog.h:196
#define FATAL
Definition: elog.h:41
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:224
#define ereport(elevel,...)
Definition: elog.h:149
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2909
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2583
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1109
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:782
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1087
int FreeFile(FILE *file)
Definition: fd.c:2781
int pg_fsync(int fd)
Definition: fd.c:386
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2843
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:525
@ PGFILETYPE_LNK
Definition: file_utils.h:24
#define DirectFunctionCall3(func, arg1, arg2, arg3)
Definition: fmgr.h:646
bool IsUnderPostmaster
Definition: globals.c:116
char * DataDir
Definition: globals.c:67
bool IsPostmasterEnvironment
Definition: globals.c:115
void * guc_malloc(int elevel, size_t size)
Definition: guc.c:633
#define newval
#define GUC_check_errdetail
Definition: guc.h:446
GucSource
Definition: guc.h:108
#define MAXDATEFIELDS
Definition: datetime.h:202
#define DTK_DATE
Definition: datetime.h:144
#define MAXDATELEN
Definition: datetime.h:200
#define close(a)
Definition: win32.h:12
void proc_exit(int code)
Definition: ipc.c:104
int i
Definition: isn.c:73
void OwnLatch(Latch *latch)
Definition: latch.c:464
void DisownLatch(Latch *latch)
Definition: latch.c:490
void InitSharedLatch(Latch *latch)
Definition: latch.c:431
void SetLatch(Latch *latch)
Definition: latch.c:633
void ResetLatch(Latch *latch)
Definition: latch.c:725
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:518
#define WL_TIMEOUT
Definition: latch.h:130
#define WL_EXIT_ON_PM_DEATH
Definition: latch.h:132
#define WL_LATCH_SET
Definition: latch.h:127
Assert(fmt[strlen(fmt) - 1] !='\n')
List * lappend(List *list, void *datum)
Definition: list.c:339
void list_free_deep(List *list)
Definition: list.c:1560
static struct pg_tm tm
Definition: localtime.c:104
char * pstrdup(const char *in)
Definition: mcxt.c:1580
void pfree(void *pointer)
Definition: mcxt.c:1405
void * palloc0(Size size)
Definition: mcxt.c:1231
void * palloc(Size size)
Definition: mcxt.c:1201
#define AmStartupProcess()
Definition: miscadmin.h:456
#define IsBootstrapProcessingMode()
Definition: miscadmin.h:419
void * arg
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
#define MAXPGPATH
#define XLOG_RESTORE_POINT
Definition: pg_control.h:74
#define XLOG_CHECKPOINT_REDO
Definition: pg_control.h:81
#define XLOG_OVERWRITE_CONTRECORD
Definition: pg_control.h:80
DBState
Definition: pg_control.h:89
@ DB_IN_ARCHIVE_RECOVERY
Definition: pg_control.h:95
@ DB_SHUTDOWNED_IN_RECOVERY
Definition: pg_control.h:92
@ DB_SHUTDOWNED
Definition: pg_control.h:91
@ DB_IN_CRASH_RECOVERY
Definition: pg_control.h:94
#define XLOG_CHECKPOINT_SHUTDOWN
Definition: pg_control.h:67
#define XLOG_BACKUP_END
Definition: pg_control.h:72
#define XLOG_CHECKPOINT_ONLINE
Definition: pg_control.h:68
#define XLOG_END_OF_RECOVERY
Definition: pg_control.h:76
const void size_t len
#define lfirst(lc)
Definition: pg_list.h:172
#define NIL
Definition: pg_list.h:68
XLogRecPtr pg_lsn_in_internal(const char *str, bool *have_error)
Definition: pg_lsn.c:30
static rewind_source * source
Definition: pg_rewind.c:89
const char * pg_rusage_show(const PGRUsage *ru0)
Definition: pg_rusage.c:40
void pg_rusage_init(PGRUsage *ru0)
Definition: pg_rusage.c:27
static char * buf
Definition: pg_test_fsync.c:73
int64 timestamp
void SendPostmasterSignal(PMSignalReason reason)
Definition: pmsignal.c:181
@ PMSIGNAL_RECOVERY_STARTED
Definition: pmsignal.h:35
@ PMSIGNAL_BEGIN_HOT_STANDBY
Definition: pmsignal.h:36
#define pg_pread
Definition: port.h:225
#define snprintf
Definition: port.h:238
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:252
static Datum CStringGetDatum(const char *X)
Definition: postgres.h:350
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:212
#define InvalidOid
Definition: postgres_ext.h:36
static int fd(const char *x, int i)
Definition: preproc-init.c:105
void RecordKnownAssignedTransactionIds(TransactionId xid)
Definition: procarray.c:4407
void KnownAssignedTransactionIdsIdleMaintenance(void)
Definition: procarray.c:4544
static void set_ps_display(const char *activity)
Definition: ps_status.h:40
char * psprintf(const char *fmt,...)
Definition: psprintf.c:46
ForkNumber
Definition: relpath.h:48
@ MAIN_FORKNUM
Definition: relpath.h:50
void RmgrStartup(void)
Definition: rmgr.c:49
void RmgrCleanup(void)
Definition: rmgr.c:65
int slock_t
Definition: s_lock.h:735
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:388
static pg_noinline void Size size
Definition: slab.c:607
bool ReplicationSlotValidateName(const char *name, int elevel)
Definition: slot.c:215
void ShutDownSlotSync(void)
Definition: slotsync.c:1361
#define SpinLockInit(lock)
Definition: spin.h:60
#define SpinLockRelease(lock)
Definition: spin.h:64
#define SpinLockAcquire(lock)
Definition: spin.h:62
#define ereport_startup_progress(msg,...)
Definition: startup.h:18
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition: stringinfo.c:97
void appendStringInfoString(StringInfo str, const char *s)
Definition: stringinfo.c:182
void appendStringInfoChar(StringInfo str, char ch)
Definition: stringinfo.c:194
void initStringInfo(StringInfo str)
Definition: stringinfo.c:59
Oid oldestMultiDB
Definition: pg_control.h:50
MultiXactId oldestMulti
Definition: pg_control.h:49
MultiXactOffset nextMultiOffset
Definition: pg_control.h:46
TransactionId newestCommitTsXid
Definition: pg_control.h:54
TransactionId oldestXid
Definition: pg_control.h:47
TimeLineID PrevTimeLineID
Definition: pg_control.h:40
TimeLineID ThisTimeLineID
Definition: pg_control.h:39
Oid nextOid
Definition: pg_control.h:44
MultiXactId nextMulti
Definition: pg_control.h:45
FullTransactionId nextXid
Definition: pg_control.h:43
TransactionId oldestCommitTsXid
Definition: pg_control.h:52
XLogRecPtr redo
Definition: pg_control.h:37
Oid oldestXidDB
Definition: pg_control.h:48
XLogRecPtr backupStartPoint
Definition: pg_control.h:169
bool backupEndRequired
Definition: pg_control.h:171
CheckPoint checkPointCopy
Definition: pg_control.h:134
XLogRecPtr backupEndPoint
Definition: pg_control.h:170
XLogRecPtr minRecoveryPoint
Definition: pg_control.h:167
XLogRecPtr checkPoint
Definition: pg_control.h:132
uint64 system_identifier
Definition: pg_control.h:109
TimeLineID minRecoveryPointTLI
Definition: pg_control.h:168
Definition: dirent.c:26
XLogRecPtr lastPageBeginPtr
Definition: xlogrecovery.h:111
XLogRecPtr abortedRecPtr
Definition: xlogrecovery.h:120
XLogRecPtr missingContrecPtr
Definition: xlogrecovery.h:121
TimeLineID endOfLogTLI
Definition: xlogrecovery.h:109
struct ErrorContextCallback * previous
Definition: elog.h:295
void(* callback)(void *arg)
Definition: elog.h:296
Definition: latch.h:113
Definition: pg_list.h:54
RelFileNumber relNumber
void(* rm_mask)(char *pagedata, BlockNumber blkno)
void(* rm_redo)(XLogReaderState *record)
const char *(* rm_identify)(uint8 info)
const char * rm_name
void(* rm_desc)(StringInfo buf, XLogReaderState *record)
XLogRecPtr begin
Definition: timeline.h:28
TimeLineID tli
Definition: timeline.h:27
XLogRecPtr end
Definition: timeline.h:29
TimeLineID ws_tli
Definition: xlogreader.h:49
Definition: <