PostgreSQL Source Code  git master
xlogrecovery.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * xlogrecovery.c
4  * Functions for WAL recovery, standby mode
5  *
6  * This source file contains functions controlling WAL recovery.
7  * InitWalRecovery() initializes the system for crash or archive recovery,
8  * or standby mode, depending on configuration options and the state of
9  * the control file and possible backup label file. PerformWalRecovery()
10  * performs the actual WAL replay, calling the rmgr-specific redo routines.
11  * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12  * and prepares information needed to initialize the WAL for writes. In
13  * addition to these three main functions, there are a bunch of functions
14  * for interrogating recovery state and controlling the recovery process.
15  *
16  *
17  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
18  * Portions Copyright (c) 1994, Regents of the University of California
19  *
20  * src/backend/access/transam/xlogrecovery.c
21  *
22  *-------------------------------------------------------------------------
23  */
24 
25 #include "postgres.h"
26 
27 #include <ctype.h>
28 #include <math.h>
29 #include <time.h>
30 #include <sys/stat.h>
31 #include <sys/time.h>
32 #include <unistd.h>
33 
34 #include "access/timeline.h"
35 #include "access/transam.h"
36 #include "access/xact.h"
37 #include "access/xlog_internal.h"
38 #include "access/xlogarchive.h"
39 #include "access/xlogprefetcher.h"
40 #include "access/xlogreader.h"
41 #include "access/xlogrecovery.h"
42 #include "access/xlogutils.h"
43 #include "backup/basebackup.h"
44 #include "catalog/pg_control.h"
45 #include "commands/tablespace.h"
46 #include "common/file_utils.h"
47 #include "miscadmin.h"
48 #include "pgstat.h"
49 #include "postmaster/bgwriter.h"
50 #include "postmaster/startup.h"
51 #include "replication/slot.h"
53 #include "storage/fd.h"
54 #include "storage/ipc.h"
55 #include "storage/latch.h"
56 #include "storage/pmsignal.h"
57 #include "storage/proc.h"
58 #include "storage/procarray.h"
59 #include "storage/spin.h"
60 #include "utils/builtins.h"
61 #include "utils/datetime.h"
62 #include "utils/guc_hooks.h"
63 #include "utils/pg_lsn.h"
64 #include "utils/ps_status.h"
65 #include "utils/pg_rusage.h"
66 
67 /* Unsupported old recovery command file names (relative to $PGDATA) */
68 #define RECOVERY_COMMAND_FILE "recovery.conf"
69 #define RECOVERY_COMMAND_DONE "recovery.done"
70 
71 /*
72  * GUC support
73  */
75  {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
76  {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
77  {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
78  {NULL, 0, false}
79 };
80 
81 /* options formerly taken from recovery.conf for archive recovery */
83 char *recoveryEndCommand = NULL;
84 char *archiveCleanupCommand = NULL;
91 const char *recoveryTargetName;
94 
95 /* options formerly taken from recovery.conf for XLOG streaming */
96 char *PrimaryConnInfo = NULL;
97 char *PrimarySlotName = NULL;
99 
100 /*
101  * recoveryTargetTimeLineGoal: what the user requested, if any
102  *
103  * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
104  *
105  * recoveryTargetTLI: the currently understood target timeline; it may change during recovery
106  *
107  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
108  * the timelines of its known parents, newest first (so recoveryTargetTLI is
109  * always the first list member). Only these TLIs are expected to be seen in
110  * the WAL segments we read, and indeed only these TLIs will be considered as
111  * candidate WAL files to open at all.
112  *
113  * curFileTLI: the TLI appearing in the name of the current input WAL file.
114  * (This is not necessarily the same as the timeline from which we are
115  * replaying WAL, which StartupXLOG calls replayTLI, because we could be
116  * scanning data that was copied from an ancestor timeline when the current
117  * file was created.) During a sequential scan we do not allow this value
118  * to decrease.
119  */
125 
126 /*
127  * When ArchiveRecoveryRequested is set, archive recovery was requested,
128  * ie. signal files were present. When InArchiveRecovery is set, we are
129  * currently recovering using offline XLOG archives. These variables are only
130  * valid in the startup process.
131  *
132  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
133  * currently performing crash recovery using only XLOG files in pg_wal, but
134  * will switch to using offline XLOG archives as soon as we reach the end of
135  * WAL in pg_wal.
136 */
138 bool InArchiveRecovery = false;
139 
140 /*
141  * When StandbyModeRequested is set, standby mode was requested, i.e.
142  * standby.signal file was present. When StandbyMode is set, we are currently
143  * in standby mode. These variables are only valid in the startup process.
144  * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
145  */
146 static bool StandbyModeRequested = false;
147 bool StandbyMode = false;
148 
149 /* was a signal file present at startup? */
150 static bool standby_signal_file_found = false;
151 static bool recovery_signal_file_found = false;
152 
153 /*
154  * CheckPointLoc is the position of the checkpoint record that determines
155  * where to start the replay. It comes from the backup label file or the
156  * control file.
157  *
158  * RedoStartLSN is the checkpoint's REDO location, also from the backup label
159  * file or the control file. In standby mode, XLOG streaming usually starts
160  * from the position where an invalid record was found. But if we fail to
161  * read even the initial checkpoint record, we use the REDO location instead
162  * of the checkpoint location as the start position of XLOG streaming.
163  * Otherwise we would have to jump backwards to the REDO location after
164  * reading the checkpoint record, because the REDO record can precede the
165  * checkpoint record.
166  */
171 
172 /*
173  * Local copy of SharedHotStandbyActive variable. False actually means "not
174  * known, need to check the shared state".
175  */
176 static bool LocalHotStandbyActive = false;
177 
178 /*
179  * Local copy of SharedPromoteIsTriggered variable. False actually means "not
180  * known, need to check the shared state".
181  */
182 static bool LocalPromoteIsTriggered = false;
183 
184 /* Has the recovery code requested a walreceiver wakeup? */
186 
187 /* XLogReader object used to parse the WAL records */
189 
190 /* XLogPrefetcher object used to consume WAL records with read-ahead */
192 
193 /* Parameters passed down from ReadRecord to the XLogPageRead callback. */
194 typedef struct XLogPageReadPrivate
195 {
196  int emode;
197  bool fetching_ckpt; /* are we fetching a checkpoint record? */
201 
202 /* flag to tell XLogPageRead that we have started replaying */
203 static bool InRedo = false;
204 
205 /*
206  * Codes indicating where we got a WAL file from during recovery, or where
207  * to attempt to get one.
208  */
209 typedef enum
210 {
211  XLOG_FROM_ANY = 0, /* request to read WAL from any source */
212  XLOG_FROM_ARCHIVE, /* restored using restore_command */
213  XLOG_FROM_PG_WAL, /* existing file in pg_wal */
214  XLOG_FROM_STREAM /* streamed from primary */
216 
217 /* human-readable names for XLogSources, for debugging output */
218 static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
219 
220 /*
221  * readFile is -1 or a kernel FD for the log file segment that's currently
222  * open for reading. readSegNo identifies the segment. readOff is the offset
223  * of the page just read, readLen indicates how much of it has been read into
224  * readBuf, and readSource indicates where we got the currently open file from.
225  *
226  * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
227  * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
228  * worthwhile, since the XLOG is not read by general-purpose sessions.
229  */
230 static int readFile = -1;
231 static XLogSegNo readSegNo = 0;
232 static uint32 readOff = 0;
233 static uint32 readLen = 0;
235 
236 /*
237  * Keeps track of which source we're currently reading from. This is
238  * different from readSource in that this is always set, even when we don't
239  * currently have a WAL file open. If lastSourceFailed is set, our last
240  * attempt to read from currentSource failed, and we should try another source
241  * next.
242  *
243  * pendingWalRcvRestart is set when a config change occurs that requires a
244  * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
245  */
247 static bool lastSourceFailed = false;
248 static bool pendingWalRcvRestart = false;
249 
250 /*
251  * These variables track when we last obtained some WAL data to process,
252  * and where we got it from. (XLogReceiptSource is initially the same as
253  * readSource, but readSource gets reset to zero when we don't have data
254  * to process right now. It is also different from currentSource, which
255  * also changes when we try to read from a source and fail, while
256  * XLogReceiptSource tracks where we last successfully read some WAL.)
257  */
260 
261 /* Local copy of WalRcv->flushedUpto */
264 
265 /*
266  * Copy of minRecoveryPoint and backupEndPoint from the control file.
267  *
268  * In order to reach consistency, we must replay the WAL up to
269  * minRecoveryPoint. If backupEndRequired is true, we must also reach
270  * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
271  * to backupStartPoint.
272  *
273  * Note: In archive recovery, after consistency has been reached, the
274  * functions in xlog.c will start updating minRecoveryPoint in the control
275  * file. But this copy of minRecoveryPoint variable reflects the value at the
276  * beginning of recovery, and is *not* updated after consistency is reached.
277  */
280 
283 static bool backupEndRequired = false;
284 
285 /*
286  * Have we reached a consistent database state? In crash recovery, we have
287  * to replay all the WAL, so reachedConsistency is never set. During archive
288  * recovery, the database is consistent once minRecoveryPoint is reached.
289  *
290  * Consistent state means that the system is internally consistent, all
291  * the WAL has been replayed up to a certain point, and importantly, there
292  * is no trace of later actions on disk.
293  */
294 bool reachedConsistency = false;
295 
296 /* Buffers dedicated to consistency checks of size BLCKSZ */
297 static char *replay_image_masked = NULL;
298 static char *primary_image_masked = NULL;
299 
300 
301 /*
302  * Shared-memory state for WAL recovery.
303  */
304 typedef struct XLogRecoveryCtlData
305 {
306  /*
307  * SharedHotStandbyActive indicates if we allow hot standby queries to be
308  * run. Protected by info_lck.
309  */
311 
312  /*
313  * SharedPromoteIsTriggered indicates if a standby promotion has been
314  * triggered. Protected by info_lck.
315  */
317 
318  /*
319  * recoveryWakeupLatch is used to wake up the startup process to continue
320  * WAL replay, if it is waiting for WAL to arrive or promotion to be
321  * requested.
322  *
323  * Note that the startup process also uses another latch, its procLatch,
324  * to wait for recovery conflict. We could get rid of recoveryWakeupLatch
325  * and signal the startup process through its procLatch instead, which
326  * would comport better with possible generic signal handlers using that
327  * latch. But we should not do that, because the startup process doesn't
328  * expect to be woken up by the walreceiver process or the SIGHUP signal
329  * handler while it's waiting for recovery conflict. The separate latches,
330  * recoveryWakeupLatch and procLatch, should be used for inter-process
331  * communication for WAL replay and recovery conflict, respectively.
332  */
334 
335  /*
336  * Last record successfully replayed.
337  */
338  XLogRecPtr lastReplayedReadRecPtr; /* start position */
339  XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */
340  TimeLineID lastReplayedTLI; /* timeline */
341 
342  /*
343  * When we're currently replaying a record, ie. in a redo function,
344  * replayEndRecPtr points to the end+1 of the record being replayed,
345  * otherwise it's equal to lastReplayedEndRecPtr.
346  */
349  /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
351 
352  /*
353  * timestamp of when we started replaying the current chunk of WAL data,
354  * only relevant for replication or archive recovery
355  */
357  /* Recovery pause state */
360 
361  slock_t info_lck; /* locks shared variables shown above */
363 
365 
366 /*
367  * abortedRecPtr is the start pointer of a broken record at end of WAL when
368  * recovery completes; missingContrecPtr is the location of the first
369  * contrecord that went missing. See CreateOverwriteContrecordRecord for
370  * details.
371  */
374 
375 /*
376  * if recoveryStopsBefore/After returns true, it saves information of the stop
377  * point here
378  */
383 static bool recoveryStopAfter;
384 
385 /* prototypes for local functions */
386 static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
387 
388 static void EnableStandbyMode(void);
389 static void readRecoverySignalFile(void);
390 static void validateRecoveryParameters(void);
391 static bool read_backup_label(XLogRecPtr *checkPointLoc,
392  TimeLineID *backupLabelTLI,
393  bool *backupEndRequired, bool *backupFromStandby);
394 static bool read_tablespace_map(List **tablespaces);
395 
396 static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
397 static void CheckRecoveryConsistency(void);
398 static void rm_redo_error_callback(void *arg);
399 #ifdef WAL_DEBUG
400 static void xlog_outrec(StringInfo buf, XLogReaderState *record);
401 #endif
402 static void xlog_block_info(StringInfo buf, XLogReaderState *record);
403 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
404  TimeLineID prevTLI, TimeLineID replayTLI);
405 static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
406 static void verifyBackupPageConsistency(XLogReaderState *record);
407 
408 static bool recoveryStopsBefore(XLogReaderState *record);
409 static bool recoveryStopsAfter(XLogReaderState *record);
410 static char *getRecoveryStopReason(void);
411 static void recoveryPausesHere(bool endOfRecovery);
412 static bool recoveryApplyDelay(XLogReaderState *record);
413 static void ConfirmRecoveryPaused(void);
414 
416  int emode, bool fetching_ckpt,
417  TimeLineID replayTLI);
418 
419 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
420  int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
422  bool randAccess,
423  bool fetching_ckpt,
424  XLogRecPtr tliRecPtr,
425  TimeLineID replayTLI,
426  XLogRecPtr replayLSN,
427  bool nonblocking);
428 static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
430  XLogRecPtr RecPtr, TimeLineID replayTLI);
431 static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
432 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
433  XLogSource source, bool notfoundOk);
434 static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source);
435 
436 static bool CheckForStandbyTrigger(void);
437 static void SetPromoteIsTriggered(void);
438 static bool HotStandbyActiveInReplay(void);
439 
440 static void SetCurrentChunkStartTime(TimestampTz xtime);
441 static void SetLatestXTime(TimestampTz xtime);
442 
443 /*
444  * Initialization of shared memory for WAL recovery
445  */
446 Size
448 {
449  Size size;
450 
451  /* XLogRecoveryCtl */
452  size = sizeof(XLogRecoveryCtlData);
453 
454  return size;
455 }
456 
457 void
459 {
460  bool found;
461 
463  ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
464  if (found)
465  return;
466  memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
467 
471 }
472 
473 /*
474  * A thin wrapper to enable StandbyMode and do other preparatory work as
475  * needed.
476  */
477 static void
479 {
480  StandbyMode = true;
481 
482  /*
483  * To avoid server log bloat, we don't report recovery progress in a
484  * standby as it will always be in recovery unless promoted. We disable
485  * startup progress timeout in standby mode to avoid calling
486  * startup_progress_timeout_handler() unnecessarily.
487  */
489 }
490 
491 /*
492  * Prepare the system for WAL recovery, if needed.
493  *
494  * This is called by StartupXLOG() which coordinates the server startup
495  * sequence. This function analyzes the control file and the backup label
496  * file, if any, and figures out whether we need to perform crash recovery or
497  * archive recovery, and how far we need to replay the WAL to reach a
498  * consistent state.
499  *
500  * This doesn't yet change the on-disk state, except for creating the symlinks
501  * from table space map file if any, and for fetching WAL files needed to find
502  * the checkpoint record. On entry, the caller has already read the control
503  * file into memory, and passes it as argument. This function updates it to
504  * reflect the recovery state, and the caller is expected to write it back to
505  * disk after initializing other subsystems, but before calling
506  * PerformWalRecovery().
507  *
508  * This initializes some global variables, such as ArchiveRecoveryRequested,
509  * StandbyModeRequested and InRecovery.
510  */
511 void
513  bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
514 {
515  XLogPageReadPrivate *private;
516  struct stat st;
517  bool wasShutdown;
518  XLogRecord *record;
519  DBState dbstate_at_startup;
520  bool haveTblspcMap = false;
521  bool haveBackupLabel = false;
522  CheckPoint checkPoint;
523  bool backupFromStandby = false;
524 
525  dbstate_at_startup = ControlFile->state;
526 
527  /*
528  * Initialize on the assumption we want to recover to the latest timeline
529  * that's active according to pg_control.
530  */
534  else
536 
537  /*
538  * Check for signal files, and if so set up state for offline recovery
539  */
542 
544  {
546  ereport(LOG,
547  (errmsg("entering standby mode")));
549  ereport(LOG,
550  (errmsg("starting point-in-time recovery to XID %u",
553  ereport(LOG,
554  (errmsg("starting point-in-time recovery to %s",
557  ereport(LOG,
558  (errmsg("starting point-in-time recovery to \"%s\"",
561  ereport(LOG,
562  (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
565  ereport(LOG,
566  (errmsg("starting point-in-time recovery to earliest consistent point")));
567  else
568  ereport(LOG,
569  (errmsg("starting archive recovery")));
570  }
571 
572  /*
573  * Take ownership of the wakeup latch if we're going to sleep during
574  * recovery.
575  */
578 
579  private = palloc0(sizeof(XLogPageReadPrivate));
580  xlogreader =
582  XL_ROUTINE(.page_read = &XLogPageRead,
583  .segment_open = NULL,
584  .segment_close = wal_segment_close),
585  private);
586  if (!xlogreader)
587  ereport(ERROR,
588  (errcode(ERRCODE_OUT_OF_MEMORY),
589  errmsg("out of memory"),
590  errdetail("Failed while allocating a WAL reading processor.")));
592 
593  /*
594  * Set the WAL decode buffer size. This limits how far ahead we can read
595  * in the WAL.
596  */
598 
599  /* Create a WAL prefetcher. */
601 
602  /*
603  * Allocate two page buffers dedicated to WAL consistency checks. We do
604  * it this way, rather than just making static arrays, for two reasons:
605  * (1) no need to waste the storage in most instantiations of the backend;
606  * (2) a static char array isn't guaranteed to have any particular
607  * alignment, whereas palloc() will provide MAXALIGN'd storage.
608  */
609  replay_image_masked = (char *) palloc(BLCKSZ);
610  primary_image_masked = (char *) palloc(BLCKSZ);
611 
613  &backupFromStandby))
614  {
615  List *tablespaces = NIL;
616 
617  /*
618  * Archive recovery was requested, and thanks to the backup label
619  * file, we know how far we need to replay to reach consistency. Enter
620  * archive recovery directly.
621  */
622  InArchiveRecovery = true;
625 
626  /*
627  * When a backup_label file is present, we want to roll forward from
628  * the checkpoint it identifies, rather than using pg_control.
629  */
631  CheckPointTLI);
632  if (record != NULL)
633  {
634  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
635  wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
636  ereport(DEBUG1,
637  (errmsg_internal("checkpoint record is at %X/%X",
639  InRecovery = true; /* force recovery even if SHUTDOWNED */
640 
641  /*
642  * Make sure that REDO location exists. This may not be the case
643  * if there was a crash during an online backup, which left a
644  * backup_label around that references a WAL segment that's
645  * already been archived.
646  */
647  if (checkPoint.redo < CheckPointLoc)
648  {
650  if (!ReadRecord(xlogprefetcher, LOG, false,
651  checkPoint.ThisTimeLineID))
652  ereport(FATAL,
653  (errmsg("could not find redo location referenced by checkpoint record"),
654  errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
655  "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
656  "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
657  DataDir, DataDir, DataDir)));
658  }
659  }
660  else
661  {
662  ereport(FATAL,
663  (errmsg("could not locate required checkpoint record"),
664  errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
665  "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
666  "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
667  DataDir, DataDir, DataDir)));
668  wasShutdown = false; /* keep compiler quiet */
669  }
670 
671  /* Read the tablespace_map file if present and create symlinks. */
672  if (read_tablespace_map(&tablespaces))
673  {
674  ListCell *lc;
675 
676  foreach(lc, tablespaces)
677  {
678  tablespaceinfo *ti = lfirst(lc);
679  char *linkloc;
680 
681  linkloc = psprintf("pg_tblspc/%s", ti->oid);
682 
683  /*
684  * Remove the existing symlink, if any, and create the new symlink
685  * under PGDATA.
686  */
687  remove_tablespace_symlink(linkloc);
688 
689  if (symlink(ti->path, linkloc) < 0)
690  ereport(ERROR,
692  errmsg("could not create symbolic link \"%s\": %m",
693  linkloc)));
694 
695  pfree(ti->oid);
696  pfree(ti->path);
697  pfree(ti);
698  }
699 
700  /* tell the caller to delete it later */
701  haveTblspcMap = true;
702  }
703 
704  /* tell the caller to delete it later */
705  haveBackupLabel = true;
706  }
707  else
708  {
709  /*
710  * If tablespace_map file is present without backup_label file, there
711  * is no use of such file. There is no harm in retaining it, but it
712  * is better to get rid of the map file so that we don't have any
713  * redundant file in data directory and it will avoid any sort of
714  * confusion. It seems prudent though to just rename the file out of
715  * the way rather than delete it completely, also we ignore any error
716  * that occurs in rename operation as even if map file is present
717  * without backup_label file, it is harmless.
718  */
719  if (stat(TABLESPACE_MAP, &st) == 0)
720  {
721  unlink(TABLESPACE_MAP_OLD);
723  ereport(LOG,
724  (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
726  errdetail("File \"%s\" was renamed to \"%s\".",
728  else
729  ereport(LOG,
730  (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
732  errdetail("Could not rename file \"%s\" to \"%s\": %m.",
734  }
735 
736  /*
737  * It's possible that archive recovery was requested, but we don't
738  * know how far we need to replay the WAL before we reach consistency.
739  * This can happen for example if a base backup is taken from a
740  * running server using an atomic filesystem snapshot, without calling
741  * pg_backup_start/stop. Or if you just kill a running primary server
742  * and put it into archive recovery by creating a recovery signal
743  * file.
744  *
745  * Our strategy in that case is to perform crash recovery first,
746  * replaying all the WAL present in pg_wal, and only enter archive
747  * recovery after that.
748  *
749  * But usually we already know how far we need to replay the WAL (up
750  * to minRecoveryPoint, up to backupEndPoint, or until we see an
751  * end-of-backup record), and we can enter archive recovery directly.
752  */
758  {
759  InArchiveRecovery = true;
762  }
763 
764  /* Get the last valid checkpoint record. */
770  CheckPointTLI);
771  if (record != NULL)
772  {
773  ereport(DEBUG1,
774  (errmsg_internal("checkpoint record is at %X/%X",
776  }
777  else
778  {
779  /*
780  * We used to attempt to go back to a secondary checkpoint record
781  * here, but only when not in standby mode. We now just fail if we
782  * can't read the last checkpoint because this allows us to
783  * simplify processing around checkpoints.
784  */
785  ereport(PANIC,
786  (errmsg("could not locate a valid checkpoint record")));
787  }
788  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
789  wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
790  }
791 
792  /*
793  * If the location of the checkpoint record is not on the expected
794  * timeline in the history of the requested timeline, we cannot proceed:
795  * the backup is not part of the history of the requested timeline.
796  */
797  Assert(expectedTLEs); /* was initialized by reading checkpoint
798  * record */
801  {
802  XLogRecPtr switchpoint;
803 
804  /*
805  * tliSwitchPoint will throw an error if the checkpoint's timeline is
806  * not in expectedTLEs at all.
807  */
809  ereport(FATAL,
810  (errmsg("requested timeline %u is not a child of this server's history",
812  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
815  LSN_FORMAT_ARGS(switchpoint))));
816  }
817 
818  /*
819  * The min recovery point should be part of the requested timeline's
820  * history, too.
821  */
825  ereport(FATAL,
826  (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
830 
831  ereport(DEBUG1,
832  (errmsg_internal("redo record is at %X/%X; shutdown %s",
833  LSN_FORMAT_ARGS(checkPoint.redo),
834  wasShutdown ? "true" : "false")));
835  ereport(DEBUG1,
836  (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
837  U64FromFullTransactionId(checkPoint.nextXid),
838  checkPoint.nextOid)));
839  ereport(DEBUG1,
840  (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
841  checkPoint.nextMulti, checkPoint.nextMultiOffset)));
842  ereport(DEBUG1,
843  (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
844  checkPoint.oldestXid, checkPoint.oldestXidDB)));
845  ereport(DEBUG1,
846  (errmsg_internal("oldest MultiXactId: %u, in database %u",
847  checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
848  ereport(DEBUG1,
849  (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
850  checkPoint.oldestCommitTsXid,
851  checkPoint.newestCommitTsXid)));
853  ereport(PANIC,
854  (errmsg("invalid next transaction ID")));
855 
856  /* sanity check */
857  if (checkPoint.redo > CheckPointLoc)
858  ereport(PANIC,
859  (errmsg("invalid redo in checkpoint record")));
860 
861  /*
862  * Check whether we need to force recovery from WAL. If it appears to
863  * have been a clean shutdown and we did not have a recovery signal file,
864  * then assume no recovery needed.
865  */
866  if (checkPoint.redo < CheckPointLoc)
867  {
868  if (wasShutdown)
869  ereport(PANIC,
870  (errmsg("invalid redo record in shutdown checkpoint")));
871  InRecovery = true;
872  }
873  else if (ControlFile->state != DB_SHUTDOWNED)
874  InRecovery = true;
875  else if (ArchiveRecoveryRequested)
876  {
877  /* force recovery due to presence of recovery signal file */
878  InRecovery = true;
879  }
880 
881  /*
882  * If recovery is needed, update our in-memory copy of pg_control to show
883  * that we are recovering and to show the selected checkpoint as the place
884  * we are starting from. We also mark pg_control with any minimum recovery
885  * stop point obtained from a backup history file.
886  *
887  * We don't write the changes to disk yet, though. Only do that after
888  * initializing various subsystems.
889  */
890  if (InRecovery)
891  {
892  if (InArchiveRecovery)
893  {
895  }
896  else
897  {
898  ereport(LOG,
899  (errmsg("database system was not properly shut down; "
900  "automatic recovery in progress")));
902  ereport(LOG,
903  (errmsg("crash recovery starts in timeline %u "
904  "and has target timeline %u",
908  }
910  ControlFile->checkPointCopy = checkPoint;
911  if (InArchiveRecovery)
912  {
913  /* initialize minRecoveryPoint if not set yet */
914  if (ControlFile->minRecoveryPoint < checkPoint.redo)
915  {
916  ControlFile->minRecoveryPoint = checkPoint.redo;
918  }
919  }
920 
921  /*
922  * Set backupStartPoint if we're starting recovery from a base backup.
923  *
924  * Also set backupEndPoint and use minRecoveryPoint as the backup end
925  * location if we're starting recovery from a base backup which was
926  * taken from a standby. In this case, the database system status in
927  * pg_control must indicate that the database was already in recovery.
928  * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
929  * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
930  * before reaching this point; e.g. because restore_command or
931  * primary_conninfo were faulty.
932  *
933  * Any other state indicates that the backup somehow became corrupted
934  * and we can't sensibly continue with recovery.
935  */
936  if (haveBackupLabel)
937  {
938  ControlFile->backupStartPoint = checkPoint.redo;
940 
941  if (backupFromStandby)
942  {
943  if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
944  dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
945  ereport(FATAL,
946  (errmsg("backup_label contains data inconsistent with control file"),
947  errhint("This means that the backup is corrupted and you will "
948  "have to use another backup for recovery.")));
950  }
951  }
952  }
953 
954  /* remember these, so that we know when we have reached consistency */
958  if (InArchiveRecovery)
959  {
962  }
963  else
964  {
967  }
968 
969  /*
970  * Start recovery assuming that the final record isn't lost.
971  */
974 
975  *wasShutdown_ptr = wasShutdown;
976  *haveBackupLabel_ptr = haveBackupLabel;
977  *haveTblspcMap_ptr = haveTblspcMap;
978 }
979 
980 /*
981  * See if there are any recovery signal files and if so, set state for
982  * recovery.
983  *
984  * See if there is a recovery command file (recovery.conf), and if so
985  * throw a FATAL error, since as of PG12 we no longer recognize that.
986  */
987 static void
989 {
990  struct stat stat_buf;
991 
993  return;
994 
995  /*
996  * Check for old recovery API file: recovery.conf
997  */
998  if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
999  ereport(FATAL,
1001  errmsg("using recovery command file \"%s\" is not supported",
1003 
1004  /*
1005  * Remove unused .done file, if present. Ignore if absent.
1006  */
1007  unlink(RECOVERY_COMMAND_DONE);
1008 
1009  /*
1010  * Check for recovery signal files and if found, fsync them since they
1011  * represent server state information. We don't sweat too much about the
1012  * possibility of fsync failure, however.
1013  *
1014  * If present, standby signal file takes precedence. If neither is present
1015  * then we won't enter archive recovery.
1016  */
1017  if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1018  {
1019  int fd;
1020 
1022  S_IRUSR | S_IWUSR);
1023  if (fd >= 0)
1024  {
1025  (void) pg_fsync(fd);
1026  close(fd);
1027  }
1029  }
1030  else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1031  {
1032  int fd;
1033 
1035  S_IRUSR | S_IWUSR);
1036  if (fd >= 0)
1037  {
1038  (void) pg_fsync(fd);
1039  close(fd);
1040  }
1042  }
1043 
1044  StandbyModeRequested = false;
1045  ArchiveRecoveryRequested = false;
1047  {
1048  StandbyModeRequested = true;
1049  ArchiveRecoveryRequested = true;
1050  }
1051  else if (recovery_signal_file_found)
1052  {
1053  StandbyModeRequested = false;
1054  ArchiveRecoveryRequested = true;
1055  }
1056  else
1057  return;
1058 
1059  /*
1060  * We don't support standby mode in standalone backends; that requires
1061  * other processes such as the WAL receiver to be alive.
1062  */
1064  ereport(FATAL,
1065  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1066  errmsg("standby mode is not supported by single-user servers")));
1067 }
1068 
/*
 * Validate the recovery-related settings before WAL replay starts.
 *
 * As visible below: a WARNING is issued when neither primary_conninfo nor
 * restore_command is set in the first branch; FATAL is raised when
 * restore_command is empty in the other branch (the message says this is the
 * case where standby mode is not enabled); and FATAL is raised when a
 * requested recovery target timeline other than 1 has no history file. On
 * that numeric-timeline path recoveryTargetTLI is set to the validated value.
 *
 * NOTE(review): this listing skips numbered lines (e.g. 1070, 1072, 1078,
 * 1100-1102, 1122), so the function name line and several conditions and
 * assignments are not visible here.
 */
1069 static void
1071 {
1073  return;
1074 
1075  /*
1076  * Check for compulsory parameters
1077  */
1079  {
  /* An unset (NULL) and an empty-string setting are treated alike. */
1080  if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1081  (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1082  ereport(WARNING,
1083  (errmsg("specified neither primary_conninfo nor restore_command"),
1084  errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1085  }
1086  else
1087  {
  /* Here a missing restore_command is fatal rather than a warning. */
1088  if (recoveryRestoreCommand == NULL ||
1089  strcmp(recoveryRestoreCommand, "") == 0)
1090  ereport(FATAL,
1091  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1092  errmsg("must specify restore_command when standby mode is not enabled")));
1093  }
1094 
1095  /*
1096  * Override any inconsistent requests. Note that this is a change of
1097  * behaviour in 9.5; prior to this we simply ignored a request to pause if
1098  * hot_standby = off, which was surprising behaviour.
1099  */
1103 
1104  /*
1105  * Final parsing of recovery_target_time string; see also
1106  * check_recovery_target_time().
1107  */
1109  {
1113  Int32GetDatum(-1)));
1114  }
1115 
1116  /*
1117  * If user specified recovery_target_timeline, validate it or compute the
1118  * "latest" value. We can't do this until after we've gotten the restore
1119  * command and set InArchiveRecovery, because we need to fetch timeline
1120  * history files from the archive.
1121  */
1123  {
1125 
1126  /* Timeline 1 does not have a history file, all else should */
1127  if (rtli != 1 && !existsTimeLineHistory(rtli))
1128  ereport(FATAL,
1129  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1130  errmsg("recovery target timeline %u does not exist",
1131  rtli)));
1132  recoveryTargetTLI = rtli;
1133  }
1135  {
1136  /* We start the "latest" search from pg_control's timeline */
1138  }
1139  else
1140  {
1141  /*
1142  * else we just use the recoveryTargetTLI as already read from
1143  * ControlFile
1144  */
1146  }
1147 }
1148 
1149 /*
1150  * read_backup_label: check to see if a backup_label file is present
1151  *
1152  * If we see a backup_label during recovery, we assume that we are recovering
1153  * from a backup dump file, and we therefore roll forward from the checkpoint
1154  * identified by the label file, NOT what pg_control says. This avoids the
1155  * problem that pg_control might have been archived one or more checkpoints
1156  * later than the start of the dump, and so if we rely on it as the start
1157  * point, we will fail to restore a consistent database state.
1158  *
1159  * Returns true if a backup_label was found (and fills the checkpoint
1160  * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1161  * returns false if not. If this backup_label came from a streamed backup,
1162  * *backupEndRequired is set to true. If this backup_label was created during
1163  * recovery, *backupFromStandby is set to true.
1164  *
1165  * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1166  * and TLI read from the backup file.
 *
 * Only the START WAL LOCATION and CHECKPOINT LOCATION lines are mandatory;
 * a parse failure on either raises FATAL. The remaining lines are parsed
 * opportunistically, and a non-matching line is simply not consumed. A
 * failure to open the file for any reason other than ENOENT, or a read or
 * close error at the end, is also FATAL.
 *
 * NOTE(review): this listing skips numbered lines 1198 and 1290, so the
 * first argument of the two "could not read file" reports is not visible.
1167  */
1168 static bool
1169 read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1170  bool *backupEndRequired, bool *backupFromStandby)
1171 {
1172  char startxlogfilename[MAXFNAMELEN];
1173  TimeLineID tli_from_walseg,
1174  tli_from_file;
1175  FILE *lfp;
1176  char ch;
1177  char backuptype[20];
1178  char backupfrom[20];
1179  char backuplabel[MAXPGPATH];
1180  char backuptime[128];
1181  uint32 hi,
1182  lo;
1183 
1184  /* suppress possible uninitialized-variable warnings */
1185  *checkPointLoc = InvalidXLogRecPtr;
1186  *backupLabelTLI = 0;
1187  *backupEndRequired = false;
1188  *backupFromStandby = false;
1189 
1190  /*
1191  * See if label file is present
1192  */
1193  lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1194  if (!lfp)
1195  {
1196  if (errno != ENOENT)
1197  ereport(FATAL,
1199  errmsg("could not read file \"%s\": %m",
1200  BACKUP_LABEL_FILE)));
1201  return false; /* it's not there, all is fine */
1202  }
1203 
1204  /*
1205  * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1206  * is pretty crude, but we are not expecting any variability in the file
1207  * format).
1208  */
  /*
   * The trailing %c must capture the newline, which proves nothing else
   * follows on the line. The first 8 hex digits of the file name are read
   * as the timeline ID and the remaining 16 characters go into
   * startxlogfilename (assumes MAXFNAMELEN > 16 -- TODO confirm).
   */
1209  if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
1210  &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1211  ereport(FATAL,
1212  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1213  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
  /* Combine the two 32-bit halves of the LSN into one 64-bit value. */
1214  RedoStartLSN = ((uint64) hi) << 32 | lo;
1215  RedoStartTLI = tli_from_walseg;
1216  if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
1217  &hi, &lo, &ch) != 3 || ch != '\n')
1218  ereport(FATAL,
1219  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1220  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1221  *checkPointLoc = ((uint64) hi) << 32 | lo;
  /* The TLI reported to the caller is the one parsed from the file name. */
1222  *backupLabelTLI = tli_from_walseg;
1223 
1224  /*
1225  * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1226  * which could mean either pg_basebackup or the pg_backup_start/stop
1227  * method was used) or if this label came from somewhere else (the only
1228  * other option today being from pg_rewind). If this was a streamed
1229  * backup then we know that we need to play through until we get to the
1230  * end of the WAL which was generated during the backup (at which point we
1231  * will have reached consistency and backupEndRequired will be reset to be
1232  * false).
1233  */
  /* %19s leaves room for the terminator in the 20-byte buffer. */
1234  if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1235  {
1236  if (strcmp(backuptype, "streamed") == 0)
1237  *backupEndRequired = true;
1238  }
1239 
1240  /*
1241  * BACKUP FROM lets us know if this was from a primary or a standby. If
1242  * it was from a standby, we'll double-check that the control file state
1243  * matches that of a standby.
1244  */
1245  if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1246  {
1247  if (strcmp(backupfrom, "standby") == 0)
1248  *backupFromStandby = true;
1249  }
1250 
1251  /*
1252  * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1253  * but checking for their presence is useful for debugging and the next
1254  * sanity checks. Cope also with the fact that the result buffers have a
1255  * pre-allocated size, hence if the backup_label file has been generated
1256  * with strings longer than the maximum assumed here an incorrect parsing
1257  * happens. That's fine as only minor consistency checks are done
1258  * afterwards.
1259  */
  /* %127 matches the 128-byte backuptime buffer. */
1260  if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1261  ereport(DEBUG1,
1262  (errmsg_internal("backup time %s in file \"%s\"",
1263  backuptime, BACKUP_LABEL_FILE)));
1264 
  /* %1023 assumes MAXPGPATH is at least 1024 -- TODO confirm. */
1265  if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1266  ereport(DEBUG1,
1267  (errmsg_internal("backup label %s in file \"%s\"",
1268  backuplabel, BACKUP_LABEL_FILE)));
1269 
1270  /*
1271  * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1272  * it as a sanity check if present.
1273  */
1274  if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1275  {
1276  if (tli_from_walseg != tli_from_file)
1277  ereport(FATAL,
1278  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1279  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1280  errdetail("Timeline ID parsed is %u, but expected %u.",
1281  tli_from_file, tli_from_walseg)));
1282 
1283  ereport(DEBUG1,
1284  (errmsg_internal("backup timeline %u in file \"%s\"",
1285  tli_from_file, BACKUP_LABEL_FILE)));
1286  }
1287 
  /* FreeFile() is skipped when ferror() is already true; both are fatal. */
1288  if (ferror(lfp) || FreeFile(lfp))
1289  ereport(FATAL,
1291  errmsg("could not read file \"%s\": %m",
1292  BACKUP_LABEL_FILE)));
1293 
1294  return true;
1295 }
1296 
1297 /*
1298  * read_tablespace_map: check to see if a tablespace_map file is present
1299  *
1300  * If we see a tablespace_map file during recovery, we assume that we are
1301  * recovering from a backup dump file, and we therefore need to create symlinks
1302  * as per the information present in tablespace_map file.
1303  *
1304  * Returns true if a tablespace_map file was found (and fills *tablespaces
1305  * with a tablespaceinfo struct for each tablespace listed in the file);
1306  * returns false if not.
 *
 * Entries are appended to *tablespaces in file order. A malformed line, or
 * a final line without a terminating newline, raises FATAL, as does any
 * open failure other than ENOENT and any read or close error.
 *
 * NOTE(review): this listing skips numbered lines 1309, 1327 and 1387, so
 * the function name/parameter line and the first argument of the two
 * "could not read file" reports are not visible here.
1307  */
1308 static bool
1310 {
1311  tablespaceinfo *ti;
1312  FILE *lfp;
1313  char str[MAXPGPATH];
1314  int ch,
1315  i,
1316  n;
1317  bool was_backslash;
1318 
1319  /*
1320  * See if tablespace_map file is present
1321  */
1322  lfp = AllocateFile(TABLESPACE_MAP, "r");
1323  if (!lfp)
1324  {
1325  if (errno != ENOENT)
1326  ereport(FATAL,
1328  errmsg("could not read file \"%s\": %m",
1329  TABLESPACE_MAP)));
1330  return false; /* it's not there, all is fine */
1331  }
1332 
1333  /*
1334  * Read and parse the link name and path lines from tablespace_map file
1335  * (this code is pretty crude, but we are not expecting any variability in
1336  * the file format). De-escape any backslashes that were inserted.
1337  */
  /* i is the number of de-escaped characters collected for the current line. */
1338  i = 0;
1339  was_backslash = false;
1340  while ((ch = fgetc(lfp)) != EOF)
1341  {
  /* Only an unescaped newline or carriage return terminates a line. */
1342  if (!was_backslash && (ch == '\n' || ch == '\r'))
1343  {
1344  if (i == 0)
1345  continue; /* \r immediately followed by \n */
1346 
1347  /*
1348  * The de-escaped line should contain an OID followed by exactly
1349  * one space followed by a path. The path might start with
1350  * spaces, so don't be too liberal about parsing.
1351  */
1352  str[i] = '\0';
1353  n = 0;
1354  while (str[n] && str[n] != ' ')
1355  n++;
  /*
   * Reject an empty OID (n < 1), a line with no space at all (n == i),
   * and a line whose space is the last character, i.e. an empty path
   * (n == i - 1).
   */
1356  if (n < 1 || n >= i - 1)
1357  ereport(FATAL,
1358  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1359  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
  /* Split in place: the separator becomes the OID's terminator. */
1360  str[n++] = '\0';
1361 
1362  ti = palloc0(sizeof(tablespaceinfo));
1363  ti->oid = pstrdup(str);
1364  ti->path = pstrdup(str + n);
1365  *tablespaces = lappend(*tablespaces, ti);
1366 
1367  i = 0;
1368  continue;
1369  }
1370  else if (!was_backslash && ch == '\\')
1371  was_backslash = true;
1372  else
1373  {
  /*
   * Ordinary or escaped character (including an escaped newline or
   * backslash). Characters beyond the buffer capacity are dropped
   * silently, so an over-long line is truncated rather than rejected.
   */
1374  if (i < sizeof(str) - 1)
1375  str[i++] = ch;
1376  was_backslash = false;
1377  }
1378  }
1379 
1380  if (i != 0 || was_backslash) /* last line not terminated? */
1381  ereport(FATAL,
1382  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1383  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1384 
1385  if (ferror(lfp) || FreeFile(lfp))
1386  ereport(FATAL,
1388  errmsg("could not read file \"%s\": %m",
1389  TABLESPACE_MAP)));
1390 
1391  return true;
1392 }
1393 
1394 /*
1395  * Finish WAL recovery.
1396  *
1397  * This does not close the 'xlogreader' yet, because in some cases the caller
1398  * still wants to re-read the last checkpoint record by calling
1399  * ReadCheckpointRecord().
1400  *
1401  * Returns the position of the last valid or applied record, after which new
1402  * WAL should be appended, information about why recovery was ended, and some
1403  * other things. See the EndOfWalRecoveryInfo struct for details.
 *
 * Side effects visible below: StandbyMode is cleared, InArchiveRecovery is
 * cleared inside the archive-recovery branch, and readFile is closed there
 * if it is still open.
 *
 * NOTE(review): this listing skips numbered lines (e.g. 1405-1406, 1408,
 * 1419, 1452, 1455, 1468, 1522), so the signature, the declaration and
 * allocation of 'result', and several statements are not visible here.
1404  */
1407 {
1409  XLogRecPtr lastRec;
1410  TimeLineID lastRecTLI;
1411  XLogRecPtr endOfLog;
1412 
1413  /*
1414  * Kill WAL receiver, if it's still running, before we continue to write
1415  * the startup checkpoint and aborted-contrecord records. It will trump
1416  * over these records and subsequent ones if it's still alive when we
1417  * start writing WAL.
1418  */
1420 
1421  /*
1422  * We are now done reading the xlog from stream. Turn off streaming
1423  * recovery to force fetching the files (which would be required at end of
1424  * recovery, e.g., timeline history file) from archive or pg_wal.
1425  *
1426  * Note that standby mode must be turned off after killing WAL receiver,
1427  * i.e., calling XLogShutdownWalRcv().
1428  */
1429  Assert(!WalRcvStreaming());
1430  StandbyMode = false;
1431 
1432  /*
1433  * Determine where to start writing WAL next.
1434  *
1435  * Re-fetch the last valid or last applied record, so we can identify the
1436  * exact endpoint of what we consider the valid portion of WAL. There may
1437  * be an incomplete continuation record after that, in which case
1438  * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1439  * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1440  * it is intentionally missing. See CreateOverwriteContrecordRecord().
1441  *
1442  * An important side-effect of this is to load the last page into
1443  * xlogreader. The caller uses it to initialize the WAL for writing.
1444  */
1445  if (!InRecovery)
1446  {
  /* Nothing was replayed: the checkpoint record is the last record. */
1447  lastRec = CheckPointLoc;
1448  lastRecTLI = CheckPointTLI;
1449  }
1450  else
1451  {
1453  lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1454  }
  /* Only the side effects matter here; the returned record is discarded. */
1456  (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1457  endOfLog = xlogreader->EndRecPtr;
1458 
1459  /*
1460  * Remember the TLI in the filename of the XLOG segment containing the
1461  * end-of-log. It could be different from the timeline that endOfLog
1462  * nominally belongs to, if there was a timeline switch in that segment,
1463  * and we were reading the old WAL from a segment belonging to a higher
1464  * timeline.
1465  */
1466  result->endOfLogTLI = xlogreader->seg.ws_tli;
1467 
1469  {
1470  /*
1471  * We are no longer in archive recovery state.
1472  *
1473  * We are now done reading the old WAL. Turn off archive fetching if
1474  * it was active.
1475  */
1477  InArchiveRecovery = false;
1478 
1479  /*
1480  * If the ending log segment is still open, close it (to avoid
1481  * problems on Windows with trying to rename or delete an open file).
1482  */
1483  if (readFile >= 0)
1484  {
1485  close(readFile);
1486  readFile = -1;
1487  }
1488  }
1489 
1490  /*
1491  * Copy the last partial block to the caller, for initializing the WAL
1492  * buffer for appending new WAL.
1493  */
1494  if (endOfLog % XLOG_BLCKSZ != 0)
1495  {
1496  char *page;
1497  int len;
1498  XLogRecPtr pageBeginPtr;
1499 
  /* Round endOfLog down to the start of the page that contains it. */
1500  pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1501  Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
1502 
1503  /* Copy the valid part of the last block */
  /* The copy holds only 'len' bytes, not a full XLOG_BLCKSZ page. */
1504  len = endOfLog % XLOG_BLCKSZ;
1505  page = palloc(len);
1506  memcpy(page, xlogreader->readBuf, len);
1507 
1508  result->lastPageBeginPtr = pageBeginPtr;
1509  result->lastPage = page;
1510  }
1511  else
1512  {
1513  /* There is no partial block to copy. */
1514  result->lastPageBeginPtr = endOfLog;
1515  result->lastPage = NULL;
1516  }
1517 
1518  /*
1519  * Create a comment for the history file to explain why and where timeline
1520  * changed.
1521  */
1523 
1524  result->lastRec = lastRec;
1525  result->lastRecTLI = lastRecTLI;
1526  result->endOfLog = endOfLog;
1527 
1528  result->abortedRecPtr = abortedRecPtr;
1530 
1533 
1534  return result;
1535 }
1536 
1537 /*
1538  * Clean up the WAL reader and leftovers from restoring WAL from archive
 *
 * Closes readFile if it is still open and, inside the conditional block
 * below, removes the RECOVERYXLOG and RECOVERYHISTORY files under XLOGDIR.
 * Errors from those unlink() calls are ignored.
 *
 * NOTE(review): this listing skips numbered lines (e.g. 1541, 1546,
 * 1554-1555, 1557, 1575-1576), so the function name line, the reader
 * teardown calls, the condition guarding the file removal, and the latch
 * handling are not visible here.
1539  */
1540 void
1542 {
1543  char recoveryPath[MAXPGPATH];
1544 
1545  /* Final update of pg_stat_recovery_prefetch. */
1547 
1548  /* Shut down xlogreader */
1549  if (readFile >= 0)
1550  {
1551  close(readFile);
1552  readFile = -1;
1553  }
1556 
1558  {
1559  /*
1560  * Since there might be a partial WAL segment named RECOVERYXLOG, get
1561  * rid of it.
1562  */
1563  snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1564  unlink(recoveryPath); /* ignore any error */
1565 
1566  /* Get rid of any remaining recovered timeline-history file, too */
1567  snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1568  unlink(recoveryPath); /* ignore any error */
1569  }
1570 
1571  /*
1572  * We don't need the latch anymore. It's not strictly necessary to disown
1573  * it, but let's do it for the sake of tidiness.
1574  */
1577 }
1578 
1579 /*
1580  * Perform WAL recovery.
1581  *
1582  * If the system was shut down cleanly, this is never called.
 *
 * Reads the first record to replay and then loops: each record is applied
 * with ApplyWalRecord() and the next one is fetched with ReadRecord(), until
 * no further record is returned or a recovery target check breaks out of the
 * loop. Raises FATAL if the target is reached before reachedConsistency is
 * set, and (after the summary log messages) if recovery ended without
 * reaching a configured target.
 *
 * NOTE(review): this listing skips many numbered lines (e.g. 1585,
 * 1596-1597, 1635, 1720, 1730, 1749, 1777, 1785, 1791, 1821-1822), so the
 * function name line, several conditions, and the case labels of the switch
 * are not visible here.
1583  */
1584 void
1586 {
1587  XLogRecord *record;
1588  bool reachedRecoveryTarget = false;
1589  TimeLineID replayTLI;
1590 
1591  /*
1592  * Initialize shared variables for tracking progress of WAL replay, as if
1593  * we had just replayed the record before the REDO location (or the
1594  * checkpoint record itself, if it's a shutdown checkpoint).
1595  */
1598  {
1602  }
1603  else
1604  {
1608  }
1615 
1616  /* Also ensure XLogReceiptTime has a sane value */
1618 
1619  /*
1620  * Let postmaster know we've started redo now, so that it can launch the
1621  * archiver if necessary.
1622  */
1623  if (IsUnderPostmaster)
1625 
1626  /*
1627  * Allow read-only connections immediately if we're consistent already.
1628  */
1630 
1631  /*
1632  * Find the first record that logically follows the checkpoint --- it
1633  * might physically precede it, though.
1634  */
1636  {
1637  /* back up to find the record */
1638  replayTLI = RedoStartTLI;
1640  record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1641  }
1642  else
1643  {
1644  /* just have to read next record after CheckPoint */
1646  replayTLI = CheckPointTLI;
1647  record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1648  }
1649 
  /* A NULL record here means there is nothing to replay at all. */
1650  if (record != NULL)
1651  {
1652  TimestampTz xtime;
1653  PGRUsage ru0;
1654 
  /* Snapshot resource usage so the "redo done" message can report it. */
1655  pg_rusage_init(&ru0);
1656 
1657  InRedo = true;
1658 
1659  RmgrStartup();
1660 
1661  ereport(LOG,
1662  (errmsg("redo starts at %X/%X",
1664 
1665  /* Prepare to report progress of the redo phase. */
1666  if (!StandbyMode)
1668 
1669  /*
1670  * main redo apply loop
1671  */
1672  do
1673  {
1674  if (!StandbyMode)
1675  ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
1677 
1678 #ifdef WAL_DEBUG
1679  if (XLOG_DEBUG ||
1680  (record->xl_rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
1681  (record->xl_rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
1682  {
1684 
1685  initStringInfo(&buf);
1686  appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
1689  xlog_outrec(&buf, xlogreader);
1690  appendStringInfoString(&buf, " - ");
1692  elog(LOG, "%s", buf.data);
1693  pfree(buf.data);
1694  }
1695 #endif
1696 
1697  /* Handle interrupt signals of startup process */
1699 
1700  /*
1701  * Pause WAL replay, if requested by a hot-standby session via
1702  * SetRecoveryPause().
1703  *
1704  * Note that we intentionally don't take the info_lck spinlock
1705  * here. We might therefore read a slightly stale value of the
1706  * recoveryPause flag, but it can't be very stale (no worse than
1707  * the last spinlock we did acquire). Since a pause request is a
1708  * pretty asynchronous thing anyway, possibly responding to it one
1709  * WAL record later than we otherwise would is a minor issue, so
1710  * it doesn't seem worth adding another spinlock cycle to prevent
1711  * that.
1712  */
1713  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1715  recoveryPausesHere(false);
1716 
1717  /*
1718  * Have we reached our recovery target?
1719  */
1721  {
  /* Stop before applying this record. */
1722  reachedRecoveryTarget = true;
1723  break;
1724  }
1725 
1726  /*
1727  * If we've been asked to lag the primary, wait on latch until
1728  * enough time has passed.
1729  */
1731  {
1732  /*
1733  * We test for paused recovery again here. If user sets
1734  * delayed apply, it may be because they expect to pause
1735  * recovery in case of problems, so we must test again here
1736  * otherwise pausing during the delay-wait wouldn't work.
1737  */
1738  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1740  recoveryPausesHere(false);
1741  }
1742 
1743  /*
1744  * Apply the record
1745  */
  /* replayTLI is passed by address and may be updated by the callee. */
1746  ApplyWalRecord(xlogreader, record, &replayTLI);
1747 
1748  /* Exit loop if we reached inclusive recovery target */
1750  {
1751  reachedRecoveryTarget = true;
1752  break;
1753  }
1754 
1755  /* Else, try to fetch the next WAL record */
1756  record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1757  } while (record != NULL);
1758 
1759  /*
1760  * end of main redo apply loop
1761  */
1762 
1763  if (reachedRecoveryTarget)
1764  {
1765  if (!reachedConsistency)
1766  ereport(FATAL,
1767  (errmsg("requested recovery stop point is before consistent recovery point")));
1768 
1769  /*
1770  * This is the last point where we can restart recovery with a new
1771  * recovery target, if we shutdown and begin again. After this,
1772  * Resource Managers may choose to do permanent corrective actions
1773  * at end of recovery.
1774  */
1775  switch (recoveryTargetAction)
1776  {
1778 
1779  /*
1780  * exit with special return code to request shutdown of
1781  * postmaster. Log messages issued from postmaster.
1782  */
1783  proc_exit(3);
1784 
1786  SetRecoveryPause(true);
1787  recoveryPausesHere(true);
1788 
1789  /* drop into promote */
1790 
1792  break;
1793  }
1794  }
1795 
1796  RmgrCleanup();
1797 
1798  ereport(LOG,
1799  (errmsg("redo done at %X/%X system usage: %s",
1801  pg_rusage_show(&ru0))));
1802  xtime = GetLatestXTime();
  /* A zero timestamp suppresses the message. */
1803  if (xtime)
1804  ereport(LOG,
1805  (errmsg("last completed transaction was at log time %s",
1806  timestamptz_to_str(xtime))));
1807 
1808  InRedo = false;
1809  }
1810  else
1811  {
1812  /* there are no WAL records following the checkpoint */
1813  ereport(LOG,
1814  (errmsg("redo is not required")));
1815  }
1816 
1817  /*
1818  * This check is intentionally after the above log messages that indicate
1819  * how far recovery went.
1820  */
1823  !reachedRecoveryTarget)
1824  ereport(FATAL,
1825  (errmsg("recovery ended before configured recovery target was reached")));
1826 }
1827 
1828 /*
1829  * Subroutine of PerformWalRecovery, to apply one WAL record.
 *
 * As used in the body: 'xlogreader' is the reader state holding the decoded
 * record, 'record' is the record header, and '*replayTLI' is the timeline
 * being replayed. '*replayTLI' is updated in place when a shutdown
 * checkpoint or end-of-recovery record names a different timeline.
 *
 * An error context callback is installed for the duration of the redo call
 * and popped again afterwards.
 *
 * NOTE(review): this listing skips numbered lines (e.g. 1832, 1846, 1882,
 * 1895-1896, 1903, 1923, 1932-1934, 1960, 1968, 1984, 1987), so the
 * signature line, the locking around the shared-state updates, and several
 * conditions and calls are not visible here.
1830  */
1831 static void
1833 {
1834  ErrorContextCallback errcallback;
1835  bool switchedTLI = false;
1836 
1837  /* Setup error traceback support for ereport() */
1838  errcallback.callback = rm_redo_error_callback;
1839  errcallback.arg = (void *) xlogreader;
1840  errcallback.previous = error_context_stack;
1841  error_context_stack = &errcallback;
1842 
1843  /*
1844  * ShmemVariableCache->nextXid must be beyond record's xid.
1845  */
1847 
1848  /*
1849  * Before replaying this record, check if this record causes the current
1850  * timeline to change. The record is already considered to be part of the
1851  * new timeline, so we update replayTLI before replaying it. That's
1852  * important so that replayEndTLI, which is recorded as the minimum
1853  * recovery point's TLI if recovery stops after this record, is set
1854  * correctly.
1855  */
1856  if (record->xl_rmid == RM_XLOG_ID)
1857  {
  /* Default to "no change"; only the two record types below override. */
1858  TimeLineID newReplayTLI = *replayTLI;
1859  TimeLineID prevReplayTLI = *replayTLI;
1860  uint8 info = record->xl_info & ~XLR_INFO_MASK;
1861 
1862  if (info == XLOG_CHECKPOINT_SHUTDOWN)
1863  {
1864  CheckPoint checkPoint;
1865 
  /* Copy the payload into a local struct before reading its fields. */
1866  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1867  newReplayTLI = checkPoint.ThisTimeLineID;
1868  prevReplayTLI = checkPoint.PrevTimeLineID;
1869  }
1870  else if (info == XLOG_END_OF_RECOVERY)
1871  {
1872  xl_end_of_recovery xlrec;
1873 
1874  memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1875  newReplayTLI = xlrec.ThisTimeLineID;
1876  prevReplayTLI = xlrec.PrevTimeLineID;
1877  }
1878 
1879  if (newReplayTLI != *replayTLI)
1880  {
1881  /* Check that it's OK to switch to this TLI */
1883  newReplayTLI, prevReplayTLI, *replayTLI);
1884 
1885  /* Following WAL records should be run with new TLI */
1886  *replayTLI = newReplayTLI;
1887  switchedTLI = true;
1888  }
1889  }
1890 
1891  /*
1892  * Update shared replayEndRecPtr before replaying this record, so that
1893  * XLogFlush will update minRecoveryPoint correctly.
1894  */
1897  XLogRecoveryCtl->replayEndTLI = *replayTLI;
1899 
1900  /*
1901  * If we are attempting to enter Hot Standby mode, process XIDs we see
1902  */
1904  TransactionIdIsValid(record->xl_xid))
1906 
1907  /*
1908  * Some XLOG record types that are related to recovery are processed
1909  * directly here, rather than in xlog_redo()
1910  */
1911  if (record->xl_rmid == RM_XLOG_ID)
1912  xlogrecovery_redo(xlogreader, *replayTLI);
1913 
1914  /* Now apply the WAL record itself */
  /* Dispatch to the redo routine of the record's resource manager. */
1915  GetRmgr(record->xl_rmid).rm_redo(xlogreader);
1916 
1917  /*
1918  * After redo, check whether the backup pages associated with the WAL
1919  * record are consistent with the existing pages. This check is done only
1920  * if consistency check is enabled for this record.
1921  */
1922  if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
1924 
1925  /* Pop the error context stack */
1926  error_context_stack = errcallback.previous;
1927 
1928  /*
1929  * Update lastReplayedEndRecPtr after this record has been successfully
1930  * replayed.
1931  */
1935  XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
1937 
1938  /* ------
1939  * Wakeup walsenders:
1940  *
1941  * On the standby, the WAL is flushed first (which will only wake up
1942  * physical walsenders) and then applied, which will only wake up logical
1943  * walsenders.
1944  *
1945  * Indeed, logical walsenders on standby can't decode and send data until
1946  * it's been applied.
1947  *
1948  * Physical walsenders don't need to be woken up during replay unless
1949  * cascading replication is allowed and time line change occurred (so that
1950  * they can notice that they are on a new time line).
1951  *
1952  * That's why the wake up conditions are for:
1953  *
1954  * - physical walsenders in case of new time line and cascade
1955  * replication is allowed
1956  * - logical walsenders in case cascade replication is allowed (could not
1957  * be created otherwise)
1958  * ------
1959  */
1961  WalSndWakeup(switchedTLI, true);
1962 
1963  /*
1964  * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
1965  * receiver so that it notices the updated lastReplayedEndRecPtr and sends
1966  * a reply to the primary.
1967  */
1969  {
  /* Clear the request flag before forcing the reply. */
1970  doRequestWalReceiverReply = false;
1971  WalRcvForceReply();
1972  }
1973 
1974  /* Allow read-only connections if we're consistent now */
1976 
1977  /* Is this a timeline switch? */
1978  if (switchedTLI)
1979  {
1980  /*
1981  * Before we continue on the new timeline, clean up any (possibly
1982  * bogus) future WAL segments on the old timeline.
1983  */
1985 
1986  /* Reset the prefetcher. */
1988  }
1989 }
1990 
1991 /*
1992  * Some XLOG RM record types that are directly related to WAL recovery are
1993  * handled here rather than in the xlog_redo()
 *
 * Two record types are handled; any other type is ignored:
 *  - XLOG_OVERWRITE_CONTRECORD: the payload's overwritten_lsn must match the
 *    reader's overwrittenRecPtr, otherwise FATAL is raised.
 *  - XLOG_BACKUP_END: when its start point equals backupStartPoint,
 *    backupEndPoint is set to this record's end LSN.
 *
 * The caller must pass a record belonging to RM_XLOG_ID (asserted below).
 *
 * NOTE(review): this listing skips numbered lines (e.g. 1996, 2006,
 * 2011-2012, 2015-2016, 2020-2021, 2024, 2048), so the signature line, the
 * declaration of 'xlrec', and the arguments of several messages are not
 * visible here.
1994  */
1995 static void
1997 {
1998  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
1999  XLogRecPtr lsn = record->EndRecPtr;
2000 
2001  Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2002 
2003  if (info == XLOG_OVERWRITE_CONTRECORD)
2004  {
2005  /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2007 
2008  memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
2009  if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2010  elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
2013 
2014  /* We have safely skipped the aborted record */
2017 
2018  ereport(LOG,
2019  (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
2022 
2023  /* Verifying the record should only happen once */
2025  }
2026  else if (info == XLOG_BACKUP_END)
2027  {
2028  XLogRecPtr startpoint;
2029 
  /* The payload is the start LSN of the backup this record terminates. */
2030  memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2031 
2032  if (backupStartPoint == startpoint)
2033  {
2034  /*
2035  * We have reached the end of base backup, the point where
2036  * pg_backup_stop() was done. The data on disk is now consistent
2037  * (assuming we have also reached minRecoveryPoint). Set
2038  * backupEndPoint to the current LSN, so that the next call to
2039  * CheckRecoveryConsistency() will notice it and do the
2040  * end-of-backup processing.
2041  */
2042  elog(DEBUG1, "end of backup record reached");
2043 
2044  backupEndPoint = lsn;
2045  }
2046  else
  /* End record of a different backup: only log it and keep waiting. */
2047  elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X",
2049  }
2050 }
2051 
2052 /*
2053  * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2054  * directories.
2055  *
2056  * Replay of database creation XLOG records for databases that were later
2057  * dropped can create fake directories in pg_tblspc. By the time consistency
2058  * is reached these directories should have been removed; here we verify
2059  * that this did indeed happen. This is to be called at the point where
2060  * consistent state is reached.
2061  *
2062  * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2063  * useful for testing purposes, and also allows for an escape hatch in case
2064  * things go south.
 *
 * Only entries whose names consist entirely of decimal digits are examined;
 * every such entry that is not a symbolic link is reported.
 *
 * NOTE(review): this listing skips numbered lines 2067 and 2084-2085, so the
 * function name line and the start of the report call (including its
 * severity) are not visible here. No call releasing 'dir' is visible either;
 * confirm against the original file.
2065  */
2066 static void
2068 {
2069  DIR *dir;
2070  struct dirent *de;
2071 
2072  dir = AllocateDir("pg_tblspc");
2073  while ((de = ReadDir(dir, "pg_tblspc")) != NULL)
2074  {
  /* The extra 10 bytes match the length of the "pg_tblspc/" prefix. */
2075  char path[MAXPGPATH + 10];
2076 
2077  /* Skip entries of non-oid names */
2078  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2079  continue;
2080 
2081  snprintf(path, sizeof(path), "pg_tblspc/%s", de->d_name);
2082 
2083  if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2086  errmsg("unexpected directory entry \"%s\" found in %s",
2087  de->d_name, "pg_tblspc/"),
2088  errdetail("All directory entries in pg_tblspc/ should be symbolic links."),
2089  errhint("Remove those directories, or set allow_in_place_tablespaces to ON transiently to let recovery complete.")));
2090  }
2091 }
2092 
2093 /*
2094  * Checks if recovery has reached a consistent state. When consistency is
2095  * reached and we have a valid starting standby snapshot, tell postmaster
2096  * that it can start accepting read-only connections.
 *
 * The function works in three steps, each acting on state the previous one
 * may have just changed: end-of-backup processing once backupEndPoint has
 * been replayed, setting reachedConsistency once minRecoveryPoint has been
 * replayed, and finally setting LocalHotStandbyActive.
 *
 * NOTE(review): this listing skips numbered lines (e.g. 2099, 2108, 2111,
 * 2123, 2133-2134, 2144, 2151, 2159, 2172-2175, 2177-2179, 2183), so the
 * function name line and the first part of each condition are not visible
 * here.
2097  */
2098 static void
2100 {
2101  XLogRecPtr lastReplayedEndRecPtr;
2102  TimeLineID lastReplayedTLI;
2103 
2104  /*
2105  * During crash recovery, we don't reach a consistent state until we've
2106  * replayed all the WAL.
2107  */
2109  return;
2110 
2112 
2113  /*
2114  * assume that we are called in the startup process, and hence don't need
2115  * a lock to read lastReplayedEndRecPtr
2116  */
2117  lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2118  lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2119 
2120  /*
2121  * Have we reached the point where our base backup was completed?
2122  */
2124  backupEndPoint <= lastReplayedEndRecPtr)
2125  {
2126  elog(DEBUG1, "end of backup reached");
2127 
2128  /*
2129  * We have reached the end of base backup, as indicated by pg_control.
2130  * Update the control file accordingly.
2131  */
2132  ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
  /* The end-of-backup requirement is satisfied from here on. */
2135  backupEndRequired = false;
2136  }
2137 
2138  /*
2139  * Have we passed our safe starting point? Note that minRecoveryPoint is
2140  * known to be incorrectly set if recovering from a backup, until the
2141  * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2142  * All we know prior to that is that we're not consistent yet.
2143  */
2145  minRecoveryPoint <= lastReplayedEndRecPtr)
2146  {
2147  /*
2148  * Check to see if the XLOG sequence contained any unresolved
2149  * references to uninitialized pages.
2150  */
2152 
2153  /*
2154  * Check that pg_tblspc doesn't contain any real directories. Replay
2155  * of Database/CREATE_* records may have created fictitious tablespace
2156  * directories that should have been removed by the time consistency
2157  * was reached.
2158  */
2160 
  /* The flag is set only after the checks above have been issued. */
2161  reachedConsistency = true;
2162  ereport(LOG,
2163  (errmsg("consistent recovery state reached at %X/%X",
2164  LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
2165  }
2166 
2167  /*
2168  * Have we got a valid starting snapshot that will allow queries to be
2169  * run? If so, we can tell postmaster that the database is consistent now,
2170  * enabling connections.
2171  */
2176  {
2180 
2181  LocalHotStandbyActive = true;
2182 
2184  }
2185 }
2186 
2187 /*
2188  * Error context callback for errors occurring during rm_redo().
2189  */
/*
 * 'arg' is treated as the XLogReaderState of the record being replayed.
 * The callback builds a description of that record (xlog_outdesc() followed
 * by xlog_block_info()) and attaches it, together with the record's
 * ReadRecPtr, as an errcontext line.
 *
 * NOTE(review): listing lines 2191 (function name and parameters) and 2194
 * (presumably the declaration of 'buf') are missing from this copy.
 */
2190 static void
2192 {
2193  XLogReaderState *record = (XLogReaderState *) arg;
2195 
2196  initStringInfo(&buf);
2197  xlog_outdesc(&buf, record);
2198  xlog_block_info(&buf, record);
2199 
2200  /* translator: %s is a WAL record description */
2201  errcontext("WAL redo at %X/%X for %s",
2202  LSN_FORMAT_ARGS(record->ReadRecPtr),
2203  buf.data);
2204 
/* The description is only needed for the errcontext call above. */
2205  pfree(buf.data);
2206 }
2207 
2208 /*
2209  * Returns a string describing an XLogRecord, consisting of its identity
2210  * optionally followed by a colon, a space, and a further description.
2211  */
/*
 * Despite "Returns" above, the function is declared void: the description
 * is appended to 'buf' rather than returned.
 *
 * NOTE(review): listing lines 2213 (function name and parameters) and 2219
 * (presumably whatever is appended before the '/') are missing from this
 * copy.  Callers in this file invoke it as xlog_outdesc(&buf, record).
 */
2212 void
2214 {
2215  RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2216  uint8 info = XLogRecGetInfo(record);
2217  const char *id;
2218 
2220  appendStringInfoChar(buf, '/');
2221 
/*
 * Ask the resource manager to name the record type; when it cannot, print
 * the info bits with the XLR_INFO_MASK bits cleared instead.
 */
2222  id = rmgr.rm_identify(info);
2223  if (id == NULL)
2224  appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2225  else
2226  appendStringInfo(buf, "%s: ", id);
2227 
/* The resource manager appends its own record-specific description. */
2228  rmgr.rm_desc(buf, record);
2229 }
2230 
2231 #ifdef WAL_DEBUG
2232 
/*
 * Append a short summary of 'record' to 'buf': the previous-record pointer,
 * the xid, the data length, and then the block references produced by
 * xlog_block_info().  Only compiled when WAL_DEBUG is defined.
 *
 * NOTE(review): listing line 2237, which should supply the two arguments
 * for the "%X/%X" in the first format string, is missing from this copy.
 */
2233 static void
2234 xlog_outrec(StringInfo buf, XLogReaderState *record)
2235 {
2236  appendStringInfo(buf, "prev %X/%X; xid %u",
2238  XLogRecGetXid(record));
2239 
2240  appendStringInfo(buf, "; len %u",
2241  XLogRecGetDataLen(record));
2242 
2243  xlog_block_info(buf, record);
2244 }
2245 #endif /* WAL_DEBUG */
2246 
2247 /*
2248  * Returns a string giving information about all the blocks in an
2249  * XLogRecord.
2250  */
/*
 * Despite "Returns" above, the function is declared void: one
 * "; blkref #N: ..." entry per block reference is appended to 'buf'.
 *
 * NOTE(review): listing line 2252 (function name and parameters) is missing
 * from this copy.  Callers in this file pass a StringInfo and the
 * XLogReaderState of the record.
 */
2251 static void
2253 {
2254  int block_id;
2255 
2256  /* decode block references */
2257  for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2258  {
2259  RelFileLocator rlocator;
2260  ForkNumber forknum;
2261  BlockNumber blk;
2262 
/* Block ids with no reference in this record are skipped. */
2263  if (!XLogRecGetBlockTagExtended(record, block_id,
2264  &rlocator, &forknum, &blk, NULL))
2265  continue;
2266 
/* The fork number is printed only when it is not the main fork. */
2267  if (forknum != MAIN_FORKNUM)
2268  appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2269  block_id,
2270  rlocator.spcOid, rlocator.dbOid,
2271  rlocator.relNumber,
2272  forknum,
2273  blk);
2274  else
2275  appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2276  block_id,
2277  rlocator.spcOid, rlocator.dbOid,
2278  rlocator.relNumber,
2279  blk);
/* Flag references that carry a block image. */
2280  if (XLogRecHasBlockImage(record, block_id))
2281  appendStringInfoString(buf, " FPW");
2282  }
2283 }
2284 
2285 
2286 /*
2287  * Check that it's OK to switch to new timeline during recovery.
2288  *
2289  * 'lsn' is the address of the shutdown checkpoint record we're about to
2290  * replay. (Currently, timeline can only change at a shutdown checkpoint).
2291  */
/*
 * Every failed check below is reported at PANIC level; if all checks pass
 * the function simply returns.
 *
 * NOTE(review): listing line 2293 (function name and the first parameters;
 * the body uses 'lsn', 'newTLI' and 'prevTLI' in addition to the visible
 * 'replayTLI') is missing from this copy, as are lines 2320 and 2326-2327
 * in the last check.
 */
2292 static void
2294  TimeLineID replayTLI)
2295 {
2296  /* Check that the record agrees on what the current (old) timeline is */
2297  if (prevTLI != replayTLI)
2298  ereport(PANIC,
2299  (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2300  prevTLI, replayTLI)));
2301 
2302  /*
2303  * The new timeline better be in the list of timelines we expect to see,
2304  * according to the timeline history. It should also not decrease.
2305  */
2306  if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2307  ereport(PANIC,
2308  (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2309  newTLI, replayTLI)));
2310 
2311  /*
2312  * If we have not yet reached min recovery point, and we're about to
2313  * switch to a timeline greater than the timeline of the min recovery
2314  * point: trouble. After switching to the new timeline, we could not
2315  * possibly visit the min recovery point on the correct timeline anymore.
2316  * This can happen if there is a newer timeline in the archive that
2317  * branched before the timeline the min recovery point is on, and you
2318  * attempt to do PITR to the new timeline.
2319  */
/*
 * NOTE(review): the opening of this condition (listing line 2320) and the
 * remaining errmsg arguments (2326-2327) are not shown.
 */
2321  lsn < minRecoveryPoint &&
2322  newTLI > minRecoveryPointTLI)
2323  ereport(PANIC,
2324  (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
2325  newTLI,
2328 
2329  /* Looks good */
2330 }
2331 
2332 
2333 /*
2334  * Extract timestamp from WAL record.
2335  *
2336  * If the record contains a timestamp, returns true, and saves the timestamp
2337  * in *recordXtime. If the record type has no timestamp, returns false.
2338  * Currently, only transaction commit/abort records and restore points contain
2339  * timestamps.
2340  */
/*
 * *recordXtime is left untouched when false is returned.
 *
 * NOTE(review): listing line 2342 (function name and parameters) is missing
 * from this copy.  Callers in this file pass the XLogReaderState and a
 * pointer to a TimestampTz.
 */
2341 static bool
2343 {
2344  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2345  uint8 xact_info = info & XLOG_XACT_OPMASK;
2346  uint8 rmid = XLogRecGetRmid(record);
2347 
/* Restore point: the time is the rp_time field of the record data. */
2348  if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2349  {
2350  *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2351  return true;
2352  }
/* Commit, plain or prepared: the time is xl_xact_commit.xact_time. */
2353  if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2354  xact_info == XLOG_XACT_COMMIT_PREPARED))
2355  {
2356  *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2357  return true;
2358  }
/* Abort, plain or prepared: the time is xl_xact_abort.xact_time. */
2359  if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2360  xact_info == XLOG_XACT_ABORT_PREPARED))
2361  {
2362  *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2363  return true;
2364  }
2365  return false;
2366 }
2367 
2368 /*
2369  * Checks whether the current buffer page and backup page stored in the
2370  * WAL record are consistent or not. Before comparing the two pages, a
2371  * masking can be applied to the pages to ignore certain areas like hint bits,
2372  * unused space between pd_lower and pd_upper among other things. This
2373  * function should be called once WAL replay has been completed for a
2374  * given record.
2375  */
/*
 * A mismatch between the two (masked) images is reported with elog(FATAL);
 * a failure to restore the image from the record is reported at ERROR.
 * The images are held in replay_image_masked and primary_image_masked,
 * which are not declared in this function.
 *
 * NOTE(review): listing lines 2377 (function name and parameters), 2423
 * (an argument of XLogReadBufferExtended), 2428 and 2438 are missing from
 * this copy; the last two presumably acquire and release the buffer around
 * the page copy -- confirm against the full source.
 */
2376 static void
2378 {
2379  RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2380  RelFileLocator rlocator;
2381  ForkNumber forknum;
2382  BlockNumber blkno;
2383  int block_id;
2384 
2385  /* Records with no backup blocks have no need for consistency checks. */
2386  if (!XLogRecHasAnyBlockRefs(record))
2387  return;
2388 
2389  Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
2390 
2391  for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2392  {
2393  Buffer buf;
2394  Page page;
2395 
2396  if (!XLogRecGetBlockTagExtended(record, block_id,
2397  &rlocator, &forknum, &blkno, NULL))
2398  {
2399  /*
2400  * WAL record doesn't contain a block reference with the given id.
2401  * Do nothing.
2402  */
2403  continue;
2404  }
2405 
2406  Assert(XLogRecHasBlockImage(record, block_id));
2407 
2408  if (XLogRecBlockImageApply(record, block_id))
2409  {
2410  /*
2411  * WAL record has already applied the page, so bypass the
2412  * consistency check as that would result in comparing the full
2413  * page stored in the record with itself.
2414  */
2415  continue;
2416  }
2417 
2418  /*
2419  * Read the contents from the current buffer and store it in a
2420  * temporary page.
2421  */
2422  buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2424  InvalidBuffer);
/* No buffer could be obtained for this block: nothing to compare. */
2425  if (!BufferIsValid(buf))
2426  continue;
2427 
2429  page = BufferGetPage(buf);
2430 
2431  /*
2432  * Take a copy of the local page where WAL has been applied to have a
2433  * comparison base before masking it...
2434  */
2435  memcpy(replay_image_masked, page, BLCKSZ);
2436 
2437  /* No need for this page anymore now that a copy is in. */
2439 
2440  /*
2441  * If the block LSN is already ahead of this WAL record, we can't
2442  * expect contents to match. This can happen if recovery is
2443  * restarted.
2444  */
2445  if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
2446  continue;
2447 
2448  /*
2449  * Read the contents from the backup copy, stored in WAL record and
2450  * store it in a temporary page. There is no need to allocate a new
2451  * page here, a local buffer is fine to hold its contents and a mask
2452  * can be directly applied on it.
2453  */
2454  if (!RestoreBlockImage(record, block_id, primary_image_masked))
2455  ereport(ERROR,
2456  (errcode(ERRCODE_INTERNAL_ERROR),
2457  errmsg_internal("%s", record->errormsg_buf)));
2458 
2459  /*
2460  * If masking function is defined, mask both the primary and replay
2461  * images
2462  */
2463  if (rmgr.rm_mask != NULL)
2464  {
2465  rmgr.rm_mask(replay_image_masked, blkno);
2466  rmgr.rm_mask(primary_image_masked, blkno);
2467  }
2468 
2469  /* Time to compare the primary and replay images. */
2470  if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2471  {
2472  elog(FATAL,
2473  "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2474  rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2475  forknum, blkno);
2476  }
2477  }
2478 }
2479 
2480 /*
2481  * For point-in-time recovery, this function decides whether we want to
2482  * stop applying the XLOG before the current record.
2483  *
2484  * Returns true if we are stopping, false otherwise. If stopping, some
2485  * information is saved in recoveryStopXid et al for use in annotating the
2486  * new timeline's history file.
2487  */
/*
 * Every stop path in this function sets recoveryStopAfter to false.
 *
 * NOTE(review): this copy of the listing skips many source lines in this
 * function (for example 2489, the name and parameters, and the leading
 * parts of most 'if' conditions), so the exact target-type tests are not
 * visible here.  The name is taken from the header comment of the function
 * that follows in the file.
 */
2488 static bool
2490 {
2491  bool stopsHere = false;
2492  uint8 xact_info;
2493  bool isCommit;
2494  TimestampTz recordXtime = 0;
2495  TransactionId recordXid;
2496 
2497  /*
2498  * Ignore recovery target settings when not in archive recovery (meaning
2499  * we are in crash recovery).
2500  */
/* The guard for this early return (listing line 2501) is not shown. */
2502  return false;
2503 
2504  /* Check if we should stop as soon as reaching consistency */
2506  {
2507  ereport(LOG,
2508  (errmsg("recovery stopping after reaching consistency")));
2509 
2510  recoveryStopAfter = false;
2513  recoveryStopTime = 0;
2514  recoveryStopName[0] = '\0';
2515  return true;
2516  }
2517 
2518  /* Check if target LSN has been reached */
/* Only the last clause of this condition is visible (2519-2520 missing). */
2521  record->ReadRecPtr >= recoveryTargetLSN)
2522  {
2523  recoveryStopAfter = false;
2525  recoveryStopLSN = record->ReadRecPtr;
2526  recoveryStopTime = 0;
2527  recoveryStopName[0] = '\0';
2528  ereport(LOG,
2529  (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
2531  return true;
2532  }
2533 
2534  /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2535  if (XLogRecGetRmid(record) != RM_XACT_ID)
2536  return false;
2537 
2538  xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2539 
/*
 * Classify the record as commit or abort and find the xid it ends.  For
 * the *_PREPARED variants the xid is taken from the parsed record
 * (parsed.twophase_xid) rather than from the record header; the parse
 * calls themselves start on lines missing from this copy (2551, 2567).
 */
2540  if (xact_info == XLOG_XACT_COMMIT)
2541  {
2542  isCommit = true;
2543  recordXid = XLogRecGetXid(record);
2544  }
2545  else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2546  {
2547  xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2548  xl_xact_parsed_commit parsed;
2549 
2550  isCommit = true;
2552  xlrec,
2553  &parsed);
2554  recordXid = parsed.twophase_xid;
2555  }
2556  else if (xact_info == XLOG_XACT_ABORT)
2557  {
2558  isCommit = false;
2559  recordXid = XLogRecGetXid(record);
2560  }
2561  else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2562  {
2563  xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2564  xl_xact_parsed_abort parsed;
2565 
2566  isCommit = false;
2568  xlrec,
2569  &parsed);
2570  recordXid = parsed.twophase_xid;
2571  }
2572  else
2573  return false;
2574 
/* The condition opening this block (listing line 2575) is not shown. */
2576  {
2577  /*
2578  * There can be only one transaction end record with this exact
2579  * transactionid
2580  *
2581  * when testing for an xid, we MUST test for equality only, since
2582  * transactions are numbered in the order they start, not the order
2583  * they complete. A higher numbered xid will complete before you about
2584  * 50% of the time...
2585  */
2586  stopsHere = (recordXid == recoveryTargetXid);
2587  }
2588 
2589  /*
2590  * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2591  * We don't expect getRecordTimestamp ever to fail, since we already know
2592  * this is a commit or abort record; but test its result anyway.
2593  */
2594  if (getRecordTimestamp(record, &recordXtime) &&
2596  {
2597  /*
2598  * There can be many transactions that share the same commit time, so
2599  * we stop after the last one, if we are inclusive, or stop at the
2600  * first one if we are exclusive
2601  */
/* The test choosing between the two comparisons (2602) is not shown. */
2603  stopsHere = (recordXtime > recoveryTargetTime);
2604  else
2605  stopsHere = (recordXtime >= recoveryTargetTime);
2606  }
2607 
2608  if (stopsHere)
2609  {
2610  recoveryStopAfter = false;
2611  recoveryStopXid = recordXid;
2612  recoveryStopTime = recordXtime;
2614  recoveryStopName[0] = '\0';
2615 
2616  if (isCommit)
2617  {
2618  ereport(LOG,
2619  (errmsg("recovery stopping before commit of transaction %u, time %s",
2622  }
2623  else
2624  {
2625  ereport(LOG,
2626  (errmsg("recovery stopping before abort of transaction %u, time %s",
2629  }
2630  }
2631 
2632  return stopsHere;
2633 }
2634 
2635 /*
2636  * Same as recoveryStopsBefore, but called after applying the record.
2637  *
2638  * We also track the timestamp of the latest applied COMMIT/ABORT
2639  * record in XLogRecoveryCtl->recoveryLastXTime.
2640  */
/*
 * Returns true if recovery should stop after this record, false otherwise.
 * Every stop path sets recoveryStopAfter to true.  The timestamp tracking
 * mentioned above happens through SetLatestXTime() in the body.
 *
 * NOTE(review): this copy of the listing skips many source lines in this
 * function (for example 2642, the name and parameters, and the leading
 * parts of most 'if' conditions), so the exact target-type tests are not
 * visible here.
 */
2641 static bool
2643 {
2644  uint8 info;
2645  uint8 xact_info;
2646  uint8 rmid;
2647  TimestampTz recordXtime = 0;
2648 
2649  /*
2650  * Ignore recovery target settings when not in archive recovery (meaning
2651  * we are in crash recovery).
2652  */
/* The guard for this early return (listing line 2653) is not shown. */
2654  return false;
2655 
2656  info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2657  rmid = XLogRecGetRmid(record);
2658 
2659  /*
2660  * There can be many restore points that share the same name; we stop at
2661  * the first one.
2662  */
/* Only the last clause of this condition is visible (2663 missing). */
2664  rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2665  {
2666  xl_restore_point *recordRestorePointData;
2667 
2668  recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2669 
2670  if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2671  {
2672  recoveryStopAfter = true;
2675  (void) getRecordTimestamp(record, &recoveryStopTime);
2676  strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2677 
2678  ereport(LOG,
2679  (errmsg("recovery stopping at restore point \"%s\", time %s",
2682  return true;
2683  }
2684  }
2685 
2686  /* Check if the target LSN has been reached */
/* Only the last clause of this condition is visible (2687-2688 missing). */
2689  record->ReadRecPtr >= recoveryTargetLSN)
2690  {
2691  recoveryStopAfter = true;
2693  recoveryStopLSN = record->ReadRecPtr;
2694  recoveryStopTime = 0;
2695  recoveryStopName[0] = '\0';
2696  ereport(LOG,
2697  (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
2699  return true;
2700  }
2701 
2702  if (rmid != RM_XACT_ID)
2703  return false;
2704 
2705  xact_info = info & XLOG_XACT_OPMASK;
2706 
2707  if (xact_info == XLOG_XACT_COMMIT ||
2708  xact_info == XLOG_XACT_COMMIT_PREPARED ||
2709  xact_info == XLOG_XACT_ABORT ||
2710  xact_info == XLOG_XACT_ABORT_PREPARED)
2711  {
2712  TransactionId recordXid;
2713 
2714  /* Update the last applied transaction timestamp */
2715  if (getRecordTimestamp(record, &recordXtime))
2716  SetLatestXTime(recordXtime);
2717 
2718  /* Extract the XID of the committed/aborted transaction */
/*
 * For the *_PREPARED variants the xid comes from the parsed record; the
 * parse calls start on lines missing from this copy (2724, 2734).
 */
2719  if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2720  {
2721  xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2722  xl_xact_parsed_commit parsed;
2723 
2725  xlrec,
2726  &parsed);
2727  recordXid = parsed.twophase_xid;
2728  }
2729  else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2730  {
2731  xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2732  xl_xact_parsed_abort parsed;
2733 
2735  xlrec,
2736  &parsed);
2737  recordXid = parsed.twophase_xid;
2738  }
2739  else
2740  recordXid = XLogRecGetXid(record);
2741 
2742  /*
2743  * There can be only one transaction end record with this exact
2744  * transactionid
2745  *
2746  * when testing for an xid, we MUST test for equality only, since
2747  * transactions are numbered in the order they start, not the order
2748  * they complete. A higher numbered xid will complete before you about
2749  * 50% of the time...
2750  */
/* Only the last clause of this condition is visible (2751 missing). */
2752  recordXid == recoveryTargetXid)
2753  {
2754  recoveryStopAfter = true;
2755  recoveryStopXid = recordXid;
2756  recoveryStopTime = recordXtime;
2758  recoveryStopName[0] = '\0';
2759 
2760  if (xact_info == XLOG_XACT_COMMIT ||
2761  xact_info == XLOG_XACT_COMMIT_PREPARED)
2762  {
2763  ereport(LOG,
2764  (errmsg("recovery stopping after commit of transaction %u, time %s",
2767  }
2768  else if (xact_info == XLOG_XACT_ABORT ||
2769  xact_info == XLOG_XACT_ABORT_PREPARED)
2770  {
2771  ereport(LOG,
2772  (errmsg("recovery stopping after abort of transaction %u, time %s",
2775  }
2776  return true;
2777  }
2778  }
2779 
2780  /* Check if we should stop as soon as reaching consistency */
/* The condition opening this block (listing line 2781) is not shown. */
2782  {
2783  ereport(LOG,
2784  (errmsg("recovery stopping after reaching consistency")));
2785 
2786  recoveryStopAfter = true;
2788  recoveryStopTime = 0;
2790  recoveryStopName[0] = '\0';
2791  return true;
2792  }
2793 
2794  return false;
2795 }
2796 
2797 /*
2798  * Create a comment for the history file to explain why and where
2799  * timeline changed.
2800  */
/*
 * The text is formatted into a 200-byte local buffer with snprintf() and a
 * pstrdup()'d copy is returned.
 *
 * NOTE(review): the formats for the second and third branches end in "\n"
 * while the others do not; whether that difference is intentional cannot be
 * told from here.
 *
 * NOTE(review): listing lines 2802 (function name and parameters),
 * 2805-2806, 2811, 2815, 2820-2821 and 2824-2825 are missing from this
 * copy, so most branch conditions and some snprintf arguments are not
 * visible.
 */
2801 static char *
2803 {
2804  char reason[200];
2805 
2807  snprintf(reason, sizeof(reason),
2808  "%s transaction %u",
2809  recoveryStopAfter ? "after" : "before",
2810  recoveryStopXid);
2812  snprintf(reason, sizeof(reason),
2813  "%s %s\n",
2814  recoveryStopAfter ? "after" : "before",
2816  else if (recoveryTarget == RECOVERY_TARGET_LSN)
2817  snprintf(reason, sizeof(reason),
2818  "%s LSN %X/%X\n",
2819  recoveryStopAfter ? "after" : "before",
2822  snprintf(reason, sizeof(reason),
2823  "at restore point \"%s\"",
2826  snprintf(reason, sizeof(reason), "reached consistency");
2827  else
2828  snprintf(reason, sizeof(reason), "no recovery target specified");
2829 
2830  return pstrdup(reason);
2831 }
2832 
2833 /*
2834  * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2835  *
2836  * endOfRecovery is true if the recovery target is reached and
2837  * the paused state starts at the end of recovery because of
2838  * recovery_target_action=pause, and false otherwise.
2839  */
/*
 * Returns without waiting when LocalHotStandbyActive is false, and leaves
 * the wait loop early when CheckForStandbyTrigger() reports a trigger.
 * endOfRecovery only selects which LOG message and hint are emitted.
 *
 * NOTE(review): listing lines 2848, 2861, 2863, 2871, 2878 and 2881 are
 * missing from this copy: the promotion test, the loop header, and the
 * calls inside and after the loop are not visible here.
 */
2840 static void
2841 recoveryPausesHere(bool endOfRecovery)
2842 {
2843  /* Don't pause unless users can connect! */
2844  if (!LocalHotStandbyActive)
2845  return;
2846 
2847  /* Don't pause after standby promotion has been triggered */
/* The test guarding this return (listing line 2848) is not shown. */
2849  return;
2850 
2851  if (endOfRecovery)
2852  ereport(LOG,
2853  (errmsg("pausing at the end of recovery"),
2854  errhint("Execute pg_wal_replay_resume() to promote.")));
2855  else
2856  ereport(LOG,
2857  (errmsg("recovery has paused"),
2858  errhint("Execute pg_wal_replay_resume() to continue.")));
2859 
2860  /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2862  {
2864  if (CheckForStandbyTrigger())
2865  return;
2866 
2867  /*
2868  * If recovery pause is requested then set it paused. While we are in
2869  * the loop, user might resume and pause again so set this every time.
2870  */
2872 
2873  /*
2874  * We wait on a condition variable that will wake us as soon as the
2875  * pause ends, but we use a timeout so we can check the above exit
2876  * condition periodically too.
2877  */
2879  WAIT_EVENT_RECOVERY_PAUSE);
2880  }
2882 }
2883 
2884 /*
2885  * When recovery_min_apply_delay is set, we wait long enough to make sure
2886  * certain record types are applied at least that interval behind the primary.
2887  *
2888  * Returns true if we waited.
2889  *
2890  * Note that the delay is calculated between the WAL record log time and
2891  * the current time on standby. We would prefer to keep track of when this
2892  * standby received each WAL record, which would allow a more consistent
2893  * approach and one not affected by time synchronisation issues, but that
2894  * is significantly more effort and complexity for little actual gain in
2895  * usability.
2896  */
/*
 * More precisely, true is returned whenever the wait loop below is entered,
 * even if it exits on its first iteration; every earlier exit returns
 * false.
 *
 * NOTE(review): listing lines 2898 (function name and parameters), 2914,
 * 2937, 2943, 2949, 2952, 2961, 2966 and 2974-2975 are missing from this
 * copy, so the computations of delayUntil and msecs and the wait call
 * itself are only partly visible.
 */
2897 static bool
2899 {
2900  uint8 xact_info;
2901  TimestampTz xtime;
2902  TimestampTz delayUntil;
2903  long msecs;
2904 
2905  /* nothing to do if no delay configured */
2906  if (recovery_min_apply_delay <= 0)
2907  return false;
2908 
2909  /* no delay is applied on a database not yet consistent */
2910  if (!reachedConsistency)
2911  return false;
2912 
2913  /* nothing to do if crash recovery is requested */
/* The test guarding this return (listing line 2914) is not shown. */
2915  return false;
2916 
2917  /*
2918  * Is it a COMMIT record?
2919  *
2920  * We deliberately choose not to delay aborts since they have no effect on
2921  * MVCC. We already allow replay of records that don't have a timestamp,
2922  * so there is already opportunity for issues caused by early conflicts on
2923  * standbys.
2924  */
2925  if (XLogRecGetRmid(record) != RM_XACT_ID)
2926  return false;
2927 
2928  xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2929 
2930  if (xact_info != XLOG_XACT_COMMIT &&
2931  xact_info != XLOG_XACT_COMMIT_PREPARED)
2932  return false;
2933 
2934  if (!getRecordTimestamp(record, &xtime))
2935  return false;
2936 
2938 
2939  /*
2940  * Exit without arming the latch if it's already past time to apply this
2941  * record
2942  */
2944  if (msecs <= 0)
2945  return false;
2946 
2947  while (true)
2948  {
2950 
2951  /* This might change recovery_min_apply_delay. */
2953 
/* Stop delaying once a standby trigger is detected. */
2954  if (CheckForStandbyTrigger())
2955  break;
2956 
2957  /*
2958  * Recalculate delayUntil as recovery_min_apply_delay could have
2959  * changed while waiting in this loop.
2960  */
2962 
2963  /*
2964  * Wait for difference between GetCurrentTimestamp() and delayUntil.
2965  */
2967  delayUntil);
2968 
2969  if (msecs <= 0)
2970  break;
2971 
2972  elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
2973 
2976  msecs,
2977  WAIT_EVENT_RECOVERY_APPLY_DELAY);
2978  }
2979  return true;
2980 }
2981 
2982 /*
2983  * Get the current state of the recovery pause request.
2984  */
/*
 * NOTE(review): listing lines 2985-2986 (return type, function name and
 * parameters), 2988 (presumably the declaration of 'state') and 2990-2992
 * (the statements that read the state) are missing from this copy; only
 * the final 'return state' is visible.
 */
2987 {
2989 
2993 
2994  return state;
2995 }
2996 
2997 /*
2998  * Set the recovery pause state.
2999  *
3000  * If recovery pause is requested then sets the recovery pause state to
3001  * 'pause requested' if it is not already 'paused'. Otherwise, sets it
3002  * to 'not paused' to resume the recovery. The recovery pause will be
3003  * confirmed by the ConfirmRecoveryPaused.
3004  */
/*
 * recoveryPause: true to request a pause, false to resume.
 *
 * NOTE(review): listing lines 3008, 3011-3013, 3015 and 3018 are missing
 * from this copy, so every statement that reads or writes the shared state
 * is absent here; only the two tests of !recoveryPause remain.  What the
 * second one (line 3017) guards cannot be told from here.
 */
3005 void
3006 SetRecoveryPause(bool recoveryPause)
3007 {
3009 
3010  if (!recoveryPause)
3014 
3016 
3017  if (!recoveryPause)
3019 }
3020 
3021 /*
3022  * Confirm the recovery pause by setting the recovery pause state to
3023  * RECOVERY_PAUSED.
3024  */
/*
 * NOTE(review): listing lines 3026 (function name and parameters) and
 * 3029-3032 (the whole body after the comment below) are missing from this
 * copy.  The name is taken from the header comment of the preceding
 * function in the file.
 */
3025 static void
3027 {
3028  /* If recovery pause is requested then set it paused */
3033 }
3034 
3035 
3036 /*
3037  * Attempt to read the next XLOG record.
3038  *
3039  * Before first call, the reader needs to be positioned to the first record
3040  * by calling XLogPrefetcherBeginRead().
3041  *
3042  * If no valid record is available, returns NULL, or fails if emode is PANIC.
3043  * (emode must be either PANIC, LOG). In standby mode, retries until a valid
3044  * record is available.
3045  */
/*
 * fetching_ckpt, emode and replayTLI are stored in the reader's private
 * data before reading (see "Pass through parameters" below).
 *
 * NOTE(review): listing lines 3047 (function name and first parameters),
 * 3051-3052 (presumably the declarations of 'xlogreader' and 'private'),
 * 3083, 3085-3086, 3102, 3109, 3115-3117, 3119-3120, 3122, 3124, 3151,
 * 3157-3158, 3160-3161, 3164, 3171 and 3177 are missing from this copy, so
 * several conditions and calls in the loop are not visible.
 */
3046 static XLogRecord *
3048  bool fetching_ckpt, TimeLineID replayTLI)
3049 {
3050  XLogRecord *record;
3053 
3054  /* Pass through parameters to XLogPageRead */
3055  private->fetching_ckpt = fetching_ckpt;
3056  private->emode = emode;
3057  private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
3058  private->replayTLI = replayTLI;
3059 
3060  /* This is the first attempt to read this page. */
3061  lastSourceFailed = false;
3062 
3063  for (;;)
3064  {
3065  char *errormsg;
3066 
3067  record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3068  if (record == NULL)
3069  {
3070  /*
3071  * When we find that WAL ends in an incomplete record, keep track
3072  * of that record. After recovery is done, we'll write a record
3073  * to indicate to downstream WAL readers that that portion is to
3074  * be ignored.
3075  *
3076  * However, when ArchiveRecoveryRequested = true, we're going to
3077  * switch to a new timeline at the end of recovery. We will only
3078  * copy WAL over to the new timeline up to the end of the last
3079  * complete record, so if we did this, we would later create an
3080  * overwrite contrecord in the wrong place, breaking everything.
3081  */
3082  if (!ArchiveRecoveryRequested &&
3084  {
3087  }
3088 
/* Close the currently open WAL file, if any, after a failed read. */
3089  if (readFile >= 0)
3090  {
3091  close(readFile);
3092  readFile = -1;
3093  }
3094 
3095  /*
3096  * We only end up here without a message when XLogPageRead()
3097  * failed - in that case we already logged something. In
3098  * StandbyMode that only happens if we have been triggered, so we
3099  * shouldn't loop anymore in that case.
3100  */
3101  if (errormsg)
3103  (errmsg_internal("%s", errormsg) /* already translated */ ));
3104  }
3105 
3106  /*
3107  * Check page TLI is one of the expected values.
3108  */
/*
 * The test opening this block (listing line 3109) is not shown; when it
 * fires the record is discarded by setting 'record' to NULL below.
 */
3110  {
3111  char fname[MAXFNAMELEN];
3112  XLogSegNo segno;
3113  int32 offset;
3114 
3118  XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3121  (errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%X, offset %u",
3123  fname,
3125  offset)));
3126  record = NULL;
3127  }
3128 
3129  if (record)
3130  {
3131  /* Great, got a record */
3132  return record;
3133  }
3134  else
3135  {
3136  /* No valid record available from this source */
3137  lastSourceFailed = true;
3138 
3139  /*
3140  * If archive recovery was requested, but we were still doing
3141  * crash recovery, switch to archive recovery and retry using the
3142  * offline archive. We have now replayed all the valid WAL in
3143  * pg_wal, so we are presumably now consistent.
3144  *
3145  * We require that there's at least some valid WAL present in
3146  * pg_wal, however (!fetching_ckpt). We could recover using the
3147  * WAL from the archive, even if pg_wal is completely empty, but
3148  * we'd have no idea how far we'd have to replay to reach
3149  * consistency. So err on the safe side and give up.
3150  */
/* Only the last clause of this condition is visible (3151 missing). */
3152  !fetching_ckpt)
3153  {
3154  ereport(DEBUG1,
3155  (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3156  InArchiveRecovery = true;
3159 
3162  minRecoveryPointTLI = replayTLI;
3163 
3165 
3166  /*
3167  * Before we retry, reset lastSourceFailed and currentSource
3168  * so that we will check the archive next.
3169  */
3170  lastSourceFailed = false;
3172 
3173  continue;
3174  }
3175 
3176  /* In standby mode, loop back to retry. Otherwise, give up. */
/* The standby-mode test (listing line 3177) is not shown. */
3178  continue;
3179  else
3180  return NULL;
3181  }
3182  }
3183 }
3184 
3185 /*
3186  * Read the XLOG page containing targetPagePtr into readBuf (if not read
3187  * already). Returns number of bytes read, if the page is read successfully,
3188  * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3189  * but only if they have not been previously reported.
3190  *
3191  * See XLogReaderRoutine.page_read for more details.
3192  *
3193  * While prefetching, xlogreader->nonblocking may be set. In that case,
3194  * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3195  *
3196  * This is responsible for restoring files from archive as needed, as well
3197  * as for waiting for the requested WAL record to arrive in standby mode.
3198  *
3199  * xlogreader->private_data->emode specifies the log level used for reporting
3200  * "file not found" or "end of WAL" situations in archive recovery, or in
3201  * standby mode when promotion is triggered. If set to WARNING or below,
3202  * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
3203  * levels the ereport() won't return.
3204  *
3205  * In standby mode, if after a successful return of XLogPageRead() the
3206  * caller finds the record it's interested in to be broken, it should
3207  * ereport the error with the level determined by
3208  * emode_for_corrupt_record(), and then set lastSourceFailed
3209  * and call XLogPageRead() again with the same arguments. This lets
3210  * XLogPageRead() try fetching the record from another source, or
3211  * sleep and retry.
3212  */
3213 static int
3215  XLogRecPtr targetRecPtr, char *readBuf)
3216 {
3217  XLogPageReadPrivate *private =
3219  int emode = private->emode;
3220  uint32 targetPageOff;
3221  XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
3222  int r;
3223 
3224  XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3225  targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3226 
3227  /*
3228  * See if we need to switch to a new segment because the requested record
3229  * is not in the currently open one.
3230  */
3231  if (readFile >= 0 &&
3232  !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3233  {
3234  /*
3235  * Request a restartpoint if we've replayed too much xlog since the
3236  * last one.
3237  */
3239  {
3241  {
3242  (void) GetRedoRecPtr();
3245  }
3246  }
3247 
3248  close(readFile);
3249  readFile = -1;
3251  }
3252 
3253  XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3254 
3255 retry:
3256  /* See if we need to retrieve more data */
3257  if (readFile < 0 ||
3259  flushedUpto < targetPagePtr + reqLen))
3260  {
3261  if (readFile >= 0 &&
3264  flushedUpto < targetPagePtr + reqLen)
3265  return XLREAD_WOULDBLOCK;
3266 
3267  switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3268  private->randAccess,
3269  private->fetching_ckpt,
3270  targetRecPtr,
3271  private->replayTLI,
3274  {
3275  case XLREAD_WOULDBLOCK:
3276  return XLREAD_WOULDBLOCK;
3277  case XLREAD_FAIL:
3278  if (readFile >= 0)
3279  close(readFile);
3280  readFile = -1;
3281  readLen = 0;
3283  return XLREAD_FAIL;
3284  case XLREAD_SUCCESS:
3285  break;
3286  }
3287  }
3288 
3289  /*
3290  * At this point, we have the right segment open and if we're streaming we
3291  * know the requested record is in it.
3292  */
3293  Assert(readFile != -1);
3294 
3295  /*
3296  * If the current segment is being streamed from the primary, calculate
3297  * how much of the current page we have received already. We know the
3298  * requested record has been received, but this is for the benefit of
3299  * future calls, to allow quick exit at the top of this function.
3300  */
3302  {
3303  if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3304  readLen = XLOG_BLCKSZ;
3305  else
3307  targetPageOff;
3308  }
3309  else
3310  readLen = XLOG_BLCKSZ;
3311 
3312  /* Read the requested page */
3313  readOff = targetPageOff;
3314 
3315  pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3316  r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
3317  if (r != XLOG_BLCKSZ)
3318  {
3319  char fname[MAXFNAMELEN];
3320  int save_errno = errno;
3321 
3324  if (r < 0)
3325  {
3326  errno = save_errno;
3327  ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3329  errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: %m",
3330  fname, LSN_FORMAT_ARGS(targetPagePtr),
3331  readOff)));
3332  }
3333  else
3334  ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3336  errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: read %d of %zu",
3337  fname, LSN_FORMAT_ARGS(targetPagePtr),
3338  readOff, r, (Size) XLOG_BLCKSZ)));
3339  goto next_record_is_invalid;
3340  }
3342 
3343  Assert(targetSegNo == readSegNo);
3344  Assert(targetPageOff == readOff);
3345  Assert(reqLen <= readLen);
3346 
3348 
3349  /*
3350  * Check the page header immediately, so that we can retry immediately if
3351  * it's not valid. This may seem unnecessary, because ReadPageInternal()
3352  * validates the page header anyway, and would propagate the failure up to
3353  * ReadRecord(), which would retry. However, there's a corner case with
3354  * continuation records, if a record is split across two pages such that
3355  * we would need to read the two pages from different sources. For
3356  * example, imagine a scenario where a streaming replica is started up,
3357  * and replay reaches a record that's split across two WAL segments. The
3358  * first page is only available locally, in pg_wal, because it's already
3359  * been recycled on the primary. The second page, however, is not present
3360  * in pg_wal, and we should stream it from the primary. There is a
3361  * recycled WAL segment present in pg_wal, with garbage contents, however.
3362  * We would read the first page from the local WAL segment, but when
3363  * reading the second page, we would read the bogus, recycled, WAL
3364  * segment. If we didn't catch that case here, we would never recover,
3365  * because ReadRecord() would retry reading the whole record from the
3366  * beginning.
3367  *
3368  * Of course, this only catches errors in the page header, which is what
3369  * happens in the case of a recycled WAL segment. Other kinds of errors or
3370  * corruption still have the same problem. But this at least fixes the
3371  * common case, which can happen as part of normal operation.
3372  *
3373  * Validating the page header is cheap enough that doing it twice
3374  * shouldn't be a big deal from a performance point of view.
3375  *
3376  * When not in standby mode, an invalid page header should cause recovery
3377  * to end, not retry reading the page, so we don't need to validate the
3378  * page header here for the retry. Instead, ReadPageInternal() is
3379  * responsible for the validation.
3380  */
3381  if (StandbyMode &&
3382  !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3383  {
3384  /*
3385  * Emit this error right now then retry this page immediately. Use
3386  * errmsg_internal() because the message was already translated.
3387  */
3388  if (xlogreader->errormsg_buf[0])
3391 
3392  /* reset any error XLogReaderValidatePageHeader() might have set */
3394  goto next_record_is_invalid;
3395  }
3396 
3397  return readLen;
3398 
3399 next_record_is_invalid:
3400 
3401  /*
3402  * If we're reading ahead, give up fast. Retries and error reporting will
3403  * be handled by a later read when recovery catches up to this point.
3404  */
3405  if (xlogreader->nonblocking)
3406  return XLREAD_WOULDBLOCK;
3407 
3408  lastSourceFailed = true;
3409 
3410  if (readFile >= 0)
3411  close(readFile);
3412  readFile = -1;
3413  readLen = 0;
3415 
3416  /* In standby-mode, keep trying */
3417  if (StandbyMode)
3418  goto retry;
3419  else
3420  return XLREAD_FAIL;
3421 }
3422 
3423 /*
3424  * Open the WAL segment containing WAL location 'RecPtr'.
3425  *
3426  * The segment can be fetched via restore_command, or via walreceiver having
3427  * streamed the record, or it can already be present in pg_wal. Checking
3428  * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3429  * too, in case someone copies a new segment directly to pg_wal. That is not
3430  * documented or recommended, though.
3431  *
3432  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3433  * prepare to read WAL starting from RedoStartLSN after this.
3434  *
3435  * 'RecPtr' might not point to the beginning of the record we're interested
3436  * in, it might also point to the page or segment header. In that case,
3437  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3438  * used to decide which timeline to stream the requested WAL from.
3439  *
3440  * 'replayLSN' is the current replay LSN, so that if we scan for new
3441  * timelines, we can reject a switch to a timeline that branched off before
3442  * this point.
3443  *
3444  * If the record is not immediately available, the function returns XLREAD_FAIL
3445  * if we're not in standby mode. In standby mode, waits for it to become
3446  * available.
3447  *
3448  * When the requested record becomes available, the function opens the file
3449  * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3450  * of standby mode is triggered by the user, and there is no more WAL
3451  * available, returns XLREAD_FAIL.
3452  *
3453  * If nonblocking is true, then give up immediately if we can't satisfy the
3454  * request, returning XLREAD_WOULDBLOCK instead of waiting.
3455  */
3456 static XLogPageReadResult
3457 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
3458  bool fetching_ckpt, XLogRecPtr tliRecPtr,
3459  TimeLineID replayTLI, XLogRecPtr replayLSN,
3460  bool nonblocking)
3461 {
3462  static TimestampTz last_fail_time = 0;
3463  TimestampTz now;
3464  bool streaming_reply_sent = false;
3465 
3466  /*-------
3467  * Standby mode is implemented by a state machine:
3468  *
3469  * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3470  * pg_wal (XLOG_FROM_PG_WAL)
3471  * 2. Check for promotion trigger request
3472  * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3473  * 4. Rescan timelines
3474  * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3475  *
3476  * Failure to read from the current source advances the state machine to
3477  * the next state.
3478  *
3479  * 'currentSource' indicates the current state. There are no currentSource
3480  * values for "check trigger", "rescan timelines", and "sleep" states,
3481  * those actions are taken when reading from the previous source fails, as
3482  * part of advancing to the next state.
3483  *
3484  * If standby mode is turned off while reading WAL from stream, we move
3485  * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3486  * the files (which would be required at end of recovery, e.g., timeline
3487  * history file) from archive or pg_wal. We don't need to kill WAL receiver
3488  * here because it's already stopped when standby mode is turned off at
3489  * the end of recovery.
3490  *-------
3491  */
3492  if (!InArchiveRecovery)
3494  else if (currentSource == XLOG_FROM_ANY ||
3496  {
3497  lastSourceFailed = false;
3499  }
3500 
3501  for (;;)
3502  {
3503  XLogSource oldSource = currentSource;
3504  bool startWalReceiver = false;
3505 
3506  /*
3507  * First check if we failed to read from the current source, and
3508  * advance the state machine if so. The failure to read might've
3509  * happened outside this function, e.g. when a CRC check fails on a
3510  * record, or within this loop.
3511  */
3512  if (lastSourceFailed)
3513  {
3514  /*
3515  * Don't allow any retry loops to occur during nonblocking
3516  * readahead. Let the caller process everything that has been
3517  * decoded already first.
3518  */
3519  if (nonblocking)
3520  return XLREAD_WOULDBLOCK;
3521 
3522  switch (currentSource)
3523  {
3524  case XLOG_FROM_ARCHIVE:
3525  case XLOG_FROM_PG_WAL:
3526 
3527  /*
3528  * Check to see if promotion is requested. Note that we do
3529  * this only after failure, so when you promote, we still
3530  * finish replaying as much as we can from archive and
3531  * pg_wal before failover.
3532  */
3534  {
3536  return XLREAD_FAIL;
3537  }
3538 
3539  /*
3540  * Not in standby mode, and we've now tried the archive
3541  * and pg_wal.
3542  */
3543  if (!StandbyMode)
3544  return XLREAD_FAIL;
3545 
3546  /*
3547  * Move to XLOG_FROM_STREAM state, and set to start a
3548  * walreceiver if necessary.
3549  */
3551  startWalReceiver = true;
3552  break;
3553 
3554  case XLOG_FROM_STREAM:
3555 
3556  /*
3557  * Failure while streaming. Most likely, we got here
3558  * because streaming replication was terminated, or
3559  * promotion was triggered. But we also get here if we
3560  * find an invalid record in the WAL streamed from the
3561  * primary, in which case something is seriously wrong.
3562  * There's little chance that the problem will just go
3563  * away, but PANIC is not good for availability either,
3564  * especially in hot standby mode. So, we treat that the
3565  * same as disconnection, and retry from archive/pg_wal
3566  * again. The WAL in the archive should be identical to
3567  * what was streamed, so it's unlikely that it helps, but
3568  * one can hope...
3569  */
3570 
3571  /*
3572  * We should be able to move to XLOG_FROM_STREAM only in
3573  * standby mode.
3574  */
3576 
3577  /*
3578  * Before we leave XLOG_FROM_STREAM state, make sure that
3579  * walreceiver is not active, so that it won't overwrite
3580  * WAL that we restore from archive.
3581  */
3583 
3584  /*
3585  * Before we sleep, re-scan for possible new timelines if
3586  * we were requested to recover to the latest timeline.
3587  */
3589  {
3590  if (rescanLatestTimeLine(replayTLI, replayLSN))
3591  {
3593  break;
3594  }
3595  }
3596 
3597  /*
3598  * XLOG_FROM_STREAM is the last state in our state
3599  * machine, so we've exhausted all the options for
3600  * obtaining the requested WAL. We're going to loop back
3601  * and retry from the archive, but if it hasn't been long
3602  * since last attempt, sleep wal_retrieve_retry_interval
3603  * milliseconds to avoid busy-waiting.
3604  */
3606  if (!TimestampDifferenceExceeds(last_fail_time, now,
3608  {
3609  long wait_time;
3610 
3611  wait_time = wal_retrieve_retry_interval -
3612  TimestampDifferenceMilliseconds(last_fail_time, now);
3613 
3614  elog(LOG, "waiting for WAL to become available at %X/%X",
3615  LSN_FORMAT_ARGS(RecPtr));
3616 
3617  /* Do background tasks that might benefit us later. */
3619 
3623  wait_time,
3624  WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
3627 
3628  /* Handle interrupt signals of startup process */
3630  }
3631  last_fail_time = now;
3633  break;
3634 
3635  default:
3636  elog(ERROR, "unexpected WAL source %d", currentSource);
3637  }
3638  }
3639  else if (currentSource == XLOG_FROM_PG_WAL)
3640  {
3641  /*
3642  * We just successfully read a file in pg_wal. We prefer files in
3643  * the archive over ones in pg_wal, so try the next file again
3644  * from the archive first.
3645  */
3646  if (InArchiveRecovery)
3648  }
3649 
3650  if (currentSource != oldSource)
3651  elog(DEBUG2, "switched WAL source from %s to %s after %s",
3653  lastSourceFailed ? "failure" : "success");
3654 
3655  /*
3656  * We've now handled possible failure. Try to read from the chosen
3657  * source.
3658  */
3659  lastSourceFailed = false;
3660 
3661  switch (currentSource)
3662  {
3663  case XLOG_FROM_ARCHIVE:
3664  case XLOG_FROM_PG_WAL:
3665 
3666  /*
3667  * WAL receiver must not be running when reading WAL from
3668  * archive or pg_wal.
3669  */
3670  Assert(!WalRcvStreaming());
3671 
3672  /* Close any old file we might have open. */
3673  if (readFile >= 0)
3674  {
3675  close(readFile);
3676  readFile = -1;
3677  }
3678  /* Reset curFileTLI if random fetch. */
3679  if (randAccess)
3680  curFileTLI = 0;
3681 
3682  /*
3683  * Try to restore the file from archive, or read an existing
3684  * file from pg_wal.
3685  */
3688  currentSource);
3689  if (readFile >= 0)
3690  return XLREAD_SUCCESS; /* success! */
3691 
3692  /*
3693  * Nope, not found in archive or pg_wal.
3694  */
3695  lastSourceFailed = true;
3696  break;
3697 
3698  case XLOG_FROM_STREAM:
3699  {
3700  bool havedata;
3701 
3702  /*
3703  * We should be able to move to XLOG_FROM_STREAM only in
3704  * standby mode.
3705  */
3707 
3708  /*
3709  * First, shutdown walreceiver if its restart has been
3710  * requested -- but no point if we're already slated for
3711  * starting it.
3712  */
3713  if (pendingWalRcvRestart && !startWalReceiver)
3714  {
3716 
3717  /*
3718  * Re-scan for possible new timelines if we were
3719  * requested to recover to the latest timeline.
3720  */
3723  rescanLatestTimeLine(replayTLI, replayLSN);
3724 
3725  startWalReceiver = true;
3726  }
3727  pendingWalRcvRestart = false;
3728 
3729  /*
3730  * Launch walreceiver if needed.
3731  *
3732  * If fetching_ckpt is true, RecPtr points to the initial
3733  * checkpoint location. In that case, we use RedoStartLSN
3734  * as the streaming start position instead of RecPtr, so
3735  * that when we later jump backwards to start redo at
3736  * RedoStartLSN, we will have the logs streamed already.
3737  */
3738  if (startWalReceiver &&
3739  PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
3740  {
3741  XLogRecPtr ptr;
3742  TimeLineID tli;
3743 
3744  if (fetching_ckpt)
3745  {
3746  ptr = RedoStartLSN;
3747  tli = RedoStartTLI;
3748  }
3749  else
3750  {
3751  ptr = RecPtr;
3752 
3753  /*
3754  * Use the record begin position to determine the
3755  * TLI, rather than the position we're reading.
3756  */
3757  tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
3758 
3759  if (curFileTLI > 0 && tli < curFileTLI)
3760  elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3761  LSN_FORMAT_ARGS(tliRecPtr),
3762  tli, curFileTLI);
3763  }
3764  curFileTLI = tli;
3769  flushedUpto = 0;
3770  }
3771 
3772  /*
3773  * Check if WAL receiver is active or wait to start up.
3774  */
3775  if (!WalRcvStreaming())
3776  {
3777  lastSourceFailed = true;
3778  break;
3779  }
3780 
3781  /*
3782  * Walreceiver is active, so see if new data has arrived.
3783  *
3784  * We only advance XLogReceiptTime when we obtain fresh
3785  * WAL from walreceiver and observe that we had already
3786  * processed everything before the most recent "chunk"
3787  * that it flushed to disk. In steady state where we are
3788  * keeping up with the incoming data, XLogReceiptTime will
3789  * be updated on each cycle. When we are behind,
3790  * XLogReceiptTime will not advance, so the grace time
3791  * allotted to conflicting queries will decrease.
3792  */
3793  if (RecPtr < flushedUpto)
3794  havedata = true;
3795  else
3796  {
3797  XLogRecPtr latestChunkStart;
3798 
3799  flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3800  if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
3801  {
3802  havedata = true;
3803  if (latestChunkStart <= RecPtr)
3804  {
3807  }
3808  }
3809  else
3810  havedata = false;
3811  }
3812  if (havedata)
3813  {
3814  /*
3815  * Great, streamed far enough. Open the file if it's
3816  * not open already. Also read the timeline history
3817  * file if we haven't initialized timeline history
3818  * yet; it should be streamed over and present in
3819  * pg_wal by now. Use XLOG_FROM_STREAM so that source
3820  * info is set correctly and XLogReceiptTime isn't
3821  * changed.
3822  *
3823  * NB: We must set readTimeLineHistory based on
3824  * recoveryTargetTLI, not receiveTLI. Normally they'll
3825  * be the same, but if recovery_target_timeline is
3826  * 'latest' and archiving is configured, then it's
3827  * possible that we managed to retrieve one or more
3828  * new timeline history files from the archive,
3829  * updating recoveryTargetTLI.
3830  */
3831  if (readFile < 0)
3832  {
3833  if (!expectedTLEs)
3836  receiveTLI,
3837  XLOG_FROM_STREAM, false);
3838  Assert(readFile >= 0);
3839  }
3840  else
3841  {
3842  /* just make sure source info is correct... */
3845  return XLREAD_SUCCESS;
3846  }
3847  break;
3848  }
3849 
3850  /* In nonblocking mode, return rather than sleeping. */
3851  if (nonblocking)
3852  return XLREAD_WOULDBLOCK;
3853 
3854  /*
3855  * Data not here yet. Check for trigger, then wait for
3856  * walreceiver to wake us up when new WAL arrives.
3857  */
3858  if (CheckForStandbyTrigger())
3859  {
3860  /*
3861  * Note that we don't return XLREAD_FAIL immediately
3862  * here. After being triggered, we still want to
3863  * replay all the WAL that was already streamed. It's
3864  * in pg_wal now, so we just treat this as a failure,
3865  * and the state machine will move on to replay the
3866  * streamed WAL from pg_wal, and then recheck the
3867  * trigger and exit replay.
3868  */
3869  lastSourceFailed = true;
3870  break;
3871  }
3872 
3873  /*
3874  * Since we have replayed everything we have received so
3875  * far and are about to start waiting for more WAL, let's
3876  * tell the upstream server our replay location now so
3877  * that pg_stat_replication doesn't show stale
3878  * information.
3879  */
3880  if (!streaming_reply_sent)
3881  {
3882  WalRcvForceReply();
3883  streaming_reply_sent = true;
3884  }
3885 
3886  /* Do any background tasks that might benefit us later. */
3888 
3889  /* Update pg_stat_recovery_prefetch before sleeping. */
3891 
3892  /*
3893  * Wait for more WAL to arrive, when we will be woken
3894  * immediately by the WAL receiver.
3895  */
3898  -1L,
3899  WAIT_EVENT_RECOVERY_WAL_STREAM);
3901  break;
3902  }
3903 
3904  default:
3905  elog(ERROR, "unexpected WAL source %d", currentSource);
3906  }
3907 
3908  /*
3909  * Check for recovery pause here so that we can confirm more quickly
3910  * that a requested pause has actually taken effect.
3911  */
3912  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
3914  recoveryPausesHere(false);
3915 
3916  /*
3917  * This possibly-long loop needs to handle interrupts of startup
3918  * process.
3919  */
3921  }
3922 
3923  return XLREAD_FAIL; /* not reached */
3924 }
3925 
3926 
3927 /*
3928  * Determine what log level should be used to report a corrupt WAL record
3929  * in the current WAL page, previously read by XLogPageRead().
3930  *
3931  * 'emode' is the error mode that would be used to report a file-not-found
3932  * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
3933  * we're retrying the exact same record that we've tried previously, only
3934  * complain the first time to keep the noise down. However, we only do this when
3935  * reading from pg_wal, because we don't expect any invalid records in archive
3936  * or in records streamed from the primary. Files in the archive should be complete,
3937  * and we should never hit the end of WAL because we stop and wait for more WAL
3938  * to arrive before replaying it.
3939  *
3940  * NOTE: This function remembers the RecPtr value it was last called with,
3941  * to suppress repeated messages about the same record. Only call this when
3942  * you are about to ereport(), or you might cause a later message to be
3943  * erroneously suppressed.
3944  */
3945 static int
3947 {
3948  static XLogRecPtr lastComplaint = 0;
3949 
3950  if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
3951  {
3952  if (RecPtr == lastComplaint)
3953  emode = DEBUG1;
3954  else
3955  lastComplaint = RecPtr;
3956  }
3957  return emode;
3958 }
3959 
3960 
3961 /*
3962  * Subroutine to try to fetch and validate a prior checkpoint record.
3963  */
3964 static XLogRecord *
3966  TimeLineID replayTLI)
3967 {
3968  XLogRecord *record;
3969  uint8 info;
3970 
3971  Assert(xlogreader != NULL);
3972 
3973  if (!XRecOffIsValid(RecPtr))
3974  {
3975  ereport(LOG,
3976  (errmsg("invalid checkpoint location")));
3977  return NULL;
3978  }
3979 
3981  record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
3982 
3983  if (record == NULL)
3984  {
3985  ereport(LOG,
3986  (errmsg("invalid checkpoint record")));
3987  return NULL;
3988  }
3989  if (record->xl_rmid != RM_XLOG_ID)
3990  {
3991  ereport(LOG,
3992  (errmsg("invalid resource manager ID in checkpoint record")));
3993  return NULL;
3994  }
3995  info = record->xl_info & ~XLR_INFO_MASK;
3996  if (info != XLOG_CHECKPOINT_SHUTDOWN &&
3997  info != XLOG_CHECKPOINT_ONLINE)
3998  {
3999  ereport(LOG,
4000  (errmsg("invalid xl_info in checkpoint record")));
4001  return NULL;
4002  }
4004  {
4005  ereport(LOG,
4006  (errmsg("invalid length of checkpoint record")));
4007  return NULL;
4008  }
4009  return record;
4010 }
4011 
4012 /*
4013  * Scan for new timelines that might have appeared in the archive since we
4014  * started recovery.
4015  *
4016  * If there are any, the function changes recovery target TLI to the latest
4017  * one and returns 'true'.
4018  */
4019 static bool
4021 {
4022  List *newExpectedTLEs;
4023  bool found;
4024  ListCell *cell;
4025  TimeLineID newtarget;
4026  TimeLineID oldtarget = recoveryTargetTLI;
4027  TimeLineHistoryEntry *currentTle = NULL;
4028 
4030  if (newtarget == recoveryTargetTLI)
4031  {
4032  /* No new timelines found */
4033  return false;
4034  }
4035 
4036  /*
4037  * Determine the list of expected TLIs for the new TLI
4038  */
4039 
4040  newExpectedTLEs = readTimeLineHistory(newtarget);
4041 
4042  /*
4043  * If the current timeline is not part of the history of the new timeline,
4044  * we cannot proceed to it.
4045  */
4046  found = false;
4047  foreach(cell, newExpectedTLEs)
4048  {
4049  currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4050 
4051  if (currentTle->tli == recoveryTargetTLI)
4052  {
4053  found = true;
4054  break;
4055  }
4056  }
4057  if (!found)
4058  {
4059  ereport(LOG,
4060  (errmsg("new timeline %u is not a child of database system timeline %u",
4061  newtarget,
4062  replayTLI)));
4063  return false;
4064  }
4065 
4066  /*
4067  * The current timeline was found in the history file, but check that the
4068  * next timeline was forked off from it *after* the current recovery
4069  * location.
4070  */
4071  if (currentTle->end < replayLSN)
4072  {
4073  ereport(LOG,
4074  (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4075  newtarget,
4076  replayTLI,
4077  LSN_FORMAT_ARGS(replayLSN))));
4078  return false;
4079  }
4080 
4081  /* The new timeline history seems valid. Switch target */
4082  recoveryTargetTLI = newtarget;
4084  expectedTLEs = newExpectedTLEs;
4085 
4086  /*
4087  * As in StartupXLOG(), try to ensure we have all the history files
4088  * between the old target and new target in pg_wal.
4089  */
4090  restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4091 
4092  ereport(LOG,
4093  (errmsg("new target timeline is %u",
4094  recoveryTargetTLI)));
4095 
4096  return true;
4097 }
4098 
4099 
4100 /*
4101  * Open a logfile segment for reading (during recovery).
4102  *
4103  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4104  * Otherwise, it's assumed to be already available in pg_wal.
4105  */
4106 static int
4107 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
4108  XLogSource source, bool notfoundOk)
4109 {
4110  char xlogfname[MAXFNAMELEN];
4111  char activitymsg[MAXFNAMELEN + 16];
4112  char path[MAXPGPATH];
4113  int fd;
4114 
4115  XLogFileName(xlogfname, tli, segno, wal_segment_size);
4116 
4117  switch (source)
4118  {
4119  case XLOG_FROM_ARCHIVE:
4120  /* Report recovery progress in PS display */
4121  snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4122  xlogfname);
4123  set_ps_display(activitymsg);
4124 
4125  if (!RestoreArchivedFile(path, xlogfname,
4126  "RECOVERYXLOG",
4128  InRedo))
4129  return -1;
4130  break;
4131 
4132  case XLOG_FROM_PG_WAL:
4133  case XLOG_FROM_STREAM:
4134  XLogFilePath(path, tli, segno, wal_segment_size);
4135  break;
4136 
4137  default:
4138  elog(ERROR, "invalid XLogFileRead source %d", source);
4139  }
4140 
4141  /*
4142  * If the segment was fetched from archival storage, replace the existing
4143  * xlog segment (if any) with the archival version.
4144  */
4145  if (source == XLOG_FROM_ARCHIVE)
4146  {
4148  KeepFileRestoredFromArchive(path, xlogfname);
4149 
4150  /*
4151  * Set path to point at the new file in pg_wal.
4152  */
4153  snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4154  }
4155 
4156  fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4157  if (fd >= 0)
4158  {
4159  /* Success! */
4160  curFileTLI = tli;
4161 
4162  /* Report recovery progress in PS display */
4163  snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4164  xlogfname);
4165  set_ps_display(activitymsg);
4166 
4167  /* Track source of data in assorted state variables */
4168  readSource = source;
4170  /* In FROM_STREAM case, caller tracks receipt time, not me */
4171  if (source != XLOG_FROM_STREAM)
4173 
4174  return fd;
4175  }
4176  if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4177  ereport(PANIC,
4179  errmsg("could not open file \"%s\": %m", path)));
4180  return -1;
4181 }
4182 
4183 /*
4184  * Open a logfile segment for reading (during recovery).
4185  *
4186  * This version searches for the segment with any TLI listed in expectedTLEs.
4187  */
4188 static int
4190 {
4191  char path[MAXPGPATH];
4192  ListCell *cell;
4193  int fd;
4194  List *tles;
4195 
4196  /*
4197  * Loop looking for a suitable timeline ID: we might need to read any of
4198  * the timelines listed in expectedTLEs.
4199  *
4200  * We expect curFileTLI on entry to be the TLI of the preceding file in
4201  * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4202  * to go backwards; this prevents us from picking up the wrong file when a
4203  * parent timeline extends to higher segment numbers than the child we
4204  * want to read.
4205  *
4206  * If we haven't read the timeline history file yet, read it now, so that
4207  * we know which TLIs to scan. We don't save the list in expectedTLEs,
4208  * however, unless we actually find a valid segment. That way if there is
4209  * neither a timeline history file nor a WAL segment in the archive, and
4210  * streaming replication is set up, we'll read the timeline history file
4211  * streamed from the primary when we start streaming, instead of
4212  * recovering with a dummy history generated here.
4213  */
4214  if (expectedTLEs)
4215  tles = expectedTLEs;
4216  else
4218 
4219  foreach(cell, tles)
4220  {
4222  TimeLineID tli = hent->tli;
4223 
4224  if (tli < curFileTLI)
4225  break; /* don't bother looking at too-old TLIs */
4226 
4227  /*
4228  * Skip scanning the timeline ID that the logfile segment to read
4229  * doesn't belong to
4230  */
4231  if (hent->begin != InvalidXLogRecPtr)
4232  {
4233  XLogSegNo beginseg = 0;
4234 
4235  XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4236 
4237  /*
4238  * The logfile segment that doesn't belong to the timeline is
4239  * older or newer than the segment that the timeline started or
4240  * ended at, respectively. It's sufficient to check only the
4241  * starting segment of the timeline here. Since the timelines are
4242  * scanned in descending order in this loop, any segments newer
4243  * than the ending segment should belong to newer timeline and
4244  * have already been read before. So it's not necessary to check
4245  * the ending segment of the timeline here.
4246  */
4247  if (segno < beginseg)
4248  continue;
4249  }
4250 
4252  {
4253  fd = XLogFileRead(segno, emode, tli,
4254  XLOG_FROM_ARCHIVE, true);
4255  if (fd != -1)
4256  {
4257  elog(DEBUG1, "got WAL segment from archive");
4258  if (!expectedTLEs)
4259  expectedTLEs = tles;
4260  return fd;
4261  }
4262  }
4263 
4265  {
4266  fd = XLogFileRead(segno, emode, tli,
4267  XLOG_FROM_PG_WAL, true);
4268  if (fd != -1)
4269  {
4270  if (!expectedTLEs)
4271  expectedTLEs = tles;
4272  return fd;
4273  }
4274  }
4275  }
4276 
4277  /* Couldn't find it. For simplicity, complain about front timeline */
4279  errno = ENOENT;
4280  ereport(emode,
4282  errmsg("could not open file \"%s\": %m", path)));
4283  return -1;
4284 }
4285 
4286 /*
4287  * Set flag to signal the walreceiver to restart. (The startup process calls
4288  * this on noticing a relevant configuration change.)
4289  */
4290 void
4292 {
4294  {
4295  ereport(LOG,
4296  (errmsg("WAL receiver process shutdown requested")));
4297 
4298  pendingWalRcvRestart = true;
4299  }
4300 }
4301 
4302 
4303 /*
4304  * Has a standby promotion already been triggered?
4305  *
4306  * Unlike CheckForStandbyTrigger(), this works in any process
4307  * that's connected to shared memory.
4308  */
4309 bool
4311 {
4312  /*
4313  * We check shared state each time only until a standby promotion is
4314  * triggered. We can't trigger a promotion again, so there's no need to
4315  * keep checking after the shared variable has once been seen true.
4316  */
4318  return true;
4319 
4323 
4324  return LocalPromoteIsTriggered;
4325 }
4326 
4327 static void
4329 {
4333 
4334  /*
4335  * Mark the recovery pause state as 'not paused' because the paused state
4336  * ends and promotion continues if a promotion is triggered while recovery
4337  * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4338  * return 'paused' while a promotion is ongoing.
4339  */
4340  SetRecoveryPause(false);
4341 
4342  LocalPromoteIsTriggered = true;
4343 }
4344 
4345 /*
4346  * Check whether a promote request has arrived.
4347  */
4348 static bool
4350 {
4352  return true;
4353 
4355  {
4356  ereport(LOG, (errmsg("received promote request")));
4360  return true;
4361  }
4362 
4363  return false;
4364 }
4365 
4366 /*
4367  * Remove the files signaling a standby promotion request.
4368  */
4369 void
4371 {
4372  unlink(PROMOTE_SIGNAL_FILE);
4373 }
4374 
4375 /*
4376  * Check to see if a promote request has arrived.
4377  */
4378 bool
4380 {
4381  struct stat stat_buf;
4382 
4383  if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4384  return true;
4385 
4386  return false;
4387 }
4388 
4389 /*
4390  * Wake up startup process to replay newly arrived WAL, or to notice that
4391  * failover has been requested.
4392  */
4393 void
4395 {
4397 }
4398 
4399 /*
4400  * Schedule a walreceiver wakeup in the main recovery loop.
4401  */
4402 void
4404 {
4406 }
4407 
4408 /*
4409  * Is HotStandby active yet? This is only important in special backends
4410  * since normal backends won't ever be able to connect until this returns
4411  * true. Postmaster knows this by way of signal, not via shared memory.
4412  *
4413  * Unlike testing standbyState, this works in any process that's connected to
4414  * shared memory. (And note that standbyState alone doesn't tell the truth
4415  * anyway.)
4416  */
4417 bool
4419 {
4420  /*
4421  * We check shared state each time only until Hot Standby is active. We
4422  * can't de-activate Hot Standby, so there's no need to keep checking
4423  * after the shared variable has once been seen true.
4424  */
4426  return true;
4427  else
4428  {
4429  /* spinlock is essential on machines with weak memory ordering! */
4433 
4434  return LocalHotStandbyActive;
4435  }
4436 }
4437 
4438 /*
4439  * Like HotStandbyActive(), but to be used only in WAL replay code,
4440  * where we don't need to ask any other process what the state is.
4441  */
4442 static bool
4444 {
4446  return LocalHotStandbyActive;
4447 }
4448 
4449 /*
4450  * Get latest redo apply position.
4451  *
4452  * Exported to allow WALReceiver to read the pointer directly.
4453  */
4454 XLogRecPtr
4456 {
4457  XLogRecPtr recptr;
4458  TimeLineID tli;
4459 
4464 
4465  if (replayTLI)
4466  *replayTLI = tli;
4467  return recptr;
4468 }
4469 
4470 
4471 /*
4472  * Get position of last applied, or the record being applied.
4473  *
4474  * This is different from GetXLogReplayRecPtr() in that if a WAL
4475  * record is currently being applied, this includes that record.
4476  */
4477 XLogRecPtr
4479 {
4480  XLogRecPtr recptr;
4481  TimeLineID tli;
4482 
4484  recptr = XLogRecoveryCtl->replayEndRecPtr;
4487 
4488  if (replayEndTLI)
4489  *replayEndTLI = tli;
4490  return recptr;
4491 }
4492 
4493 /*
4494  * Save timestamp of latest processed commit/abort record.
4495  *
4496  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4497  * seen by processes other than the startup process. Note in particular
4498  * that CreateRestartPoint is executed in the checkpointer.
4499  */
4500 static void
4502 {
4506 }
4507 
4508 /*
4509  * Fetch timestamp of latest processed commit/abort record.
4510  */
4513 {
4514  TimestampTz xtime;
4515 
4519 
4520  return xtime;
4521 }
4522 
4523 /*
4524  * Save timestamp of the next chunk of WAL records to apply.
4525  *
4526  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4527  * seen by all backends.
4528  */
4529 static void
4531 {
4535 }
4536 
/*
 * Fetch the saved timestamp of the current chunk of WAL records being
 * applied.  The startup process maintains an accurate local copy in
 * XLogReceiptTime.
 */
4543 {
4544  TimestampTz xtime;
4545 
4549 
4550  return xtime;
4551 }
4552 
4553 /*
4554  * Returns time of receipt of current chunk of XLOG data, as well as
4555  * whether it was received from streaming replication or from archives.
4556  */
4557 void
4558 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4559 {
4560  /*
4561  * This must be executed in the startup process, since we don't export the
4562  * relevant state to shared memory.
4563  */
4564  Assert(InRecovery);
4565 
4566  *rtime = XLogReceiptTime;
4567  *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4568 }
4569 
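/*
 * Verify that an integer parameter is set at least as high as it was on the
 * primary server (minValue).  If it is lower, recovery is aborted with a
 * FATAL error, possibly after first pausing recovery.
 *
 * Note that the text supplied in param_name is a parameter name and does
 * not require translation.
 */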
4574 void
4575 RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4576 {
4577  if (currValue < minValue)
4578  {
4580  {
4581  bool warned_for_promote = false;
4582 
4583  ereport(WARNING,
4584  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4585  errmsg("hot standby is not possible because of insufficient parameter settings"),
4586  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4587  param_name,
4588  currValue,
4589  minValue)));
4590 
4591  SetRecoveryPause(true);
4592 
4593  ereport(LOG,
4594  (errmsg("recovery has paused"),
4595  errdetail("If recovery is unpaused, the server will shut down."),
4596  errhint("You can then restart the server after making the necessary configuration changes.")));
4597 
4599  {
4601 
4602  if (CheckForStandbyTrigger())
4603  {
4604  if (!warned_for_promote)
4605  ereport(WARNING,
4606  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4607  errmsg("promotion is not possible because of insufficient parameter settings"),
4608 
4609  /*
4610  * Repeat the detail from above so it's easy to find
4611  * in the log.
4612  */
4613  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4614  param_name,
4615  currValue,
4616  minValue),
4617  errhint("Restart the server after making the necessary configuration changes.")));
4618  warned_for_promote = true;
4619  }
4620 
4621  /*
4622  * If recovery pause is requested then set it paused. While
4623  * we are in the loop, user might resume and pause again so
4624  * set this every time.
4625  */
4627 
4628  /*
4629  * We wait on a condition variable that will wake us as soon
4630  * as the pause ends, but we use a timeout so we can check the
4631  * above conditions periodically too.
4632  */
4634  WAIT_EVENT_RECOVERY_PAUSE);
4635  }
4637  }
4638 
4639  ereport(FATAL,
4640  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4641  errmsg("recovery aborted because of insufficient parameter settings"),
4642  /* Repeat the detail from above so it's easy to find in the log. */
4643  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4644  param_name,
4645  currValue,
4646  minValue),
4647  errhint("You can restart the server after making the necessary configuration changes.")));
4648  }
4649 }
4650 
4651 
4652 /*
4653  * GUC check_hook for primary_slot_name
4654  */
4655 bool
4657 {
4658  if (*newval && strcmp(*newval, "") != 0 &&
4660  return false;
4661 
4662  return true;
4663 }
4664 
4665 /*
4666  * Recovery target settings: Only one of the several recovery_target* settings
4667  * may be set. Setting a second one results in an error. The global variable
4668  * recoveryTarget tracks which kind of recovery target was chosen. Other
4669  * variables store the actual target value (for example a string or a xid).
4670  * The assign functions of the parameters check whether a competing parameter
4671  * was already set. But we want to allow setting the same parameter multiple
4672  * times. We also want to allow unsetting a parameter and setting a different
4673  * one, so we unset recoveryTarget when the parameter is set to an empty
4674  * string.
4675  *
4676  * XXX this code is broken by design. Throwing an error from a GUC assign
4677  * hook breaks fundamental assumptions of guc.c. So long as all the variables
4678  * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4679  * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4680  * that we have odd behaviors such as unexpected GUC ordering dependencies.
4681  */
4682 
4683 static void
4685 error_multiple_recovery_targets(void)
4686 {
4687  ereport(ERROR,
4688  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4689  errmsg("multiple recovery targets specified"),
4690  errdetail("At most one of recovery_target, recovery_target_lsn, recovery_target_name, recovery_target_time, recovery_target_xid may be set.")));
4691 }
4692 
4693 /*
4694  * GUC check_hook for recovery_target
4695  */
4696 bool
4698 {
4699  if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4700  {
4701  GUC_check_errdetail("The only allowed value is \"immediate\".");
4702  return false;
4703  }
4704  return true;
4705 }
4706 
4707 /*
4708  * GUC assign_hook for recovery_target
4709  */
4710 void
4711 assign_recovery_target(const char *newval, void *extra)
4712 {
4715  error_multiple_recovery_targets();
4716 
4717  if (newval && strcmp(newval, "") != 0)
4719  else
4721 }
4722 
4723 /*
4724  * GUC check_hook for recovery_target_lsn
4725  */
4726 bool
4728 {
4729  if (strcmp(*newval, "") != 0)
4730  {
4731  XLogRecPtr lsn;
4732  XLogRecPtr *myextra;
4733  bool have_error = false;
4734 
4735  lsn = pg_lsn_in_internal(*newval, &have_error);
4736  if (have_error)
4737  return false;
4738 
4739  myextra = (XLogRecPtr *) guc_malloc(ERROR, sizeof(XLogRecPtr));
4740  *myextra = lsn;
4741  *extra = (void *) myextra;
4742  }
4743  return true;
4744 }
4745 
4746 /*
4747  * GUC assign_hook for recovery_target_lsn
4748  */
4749 void
4750 assign_recovery_target_lsn(const char *newval, void *extra)
4751 {
4754  error_multiple_recovery_targets();
4755 
4756  if (newval && strcmp(newval, "") != 0)
4757  {
4759  recoveryTargetLSN = *((XLogRecPtr *) extra);
4760  }
4761  else
4763 }
4764 
4765 /*
4766  * GUC check_hook for recovery_target_name
4767  */
4768 bool
4770 {
4771  /* Use the value of newval directly */
4772  if (strlen(*newval) >= MAXFNAMELEN)
4773  {
4774  GUC_check_errdetail("%s is too long (maximum %d characters).",
4775  "recovery_target_name", MAXFNAMELEN - 1);
4776  return false;
4777  }
4778  return true;
4779 }
4780 
4781 /*
4782  * GUC assign_hook for recovery_target_name
4783  */
4784 void
4785 assign_recovery_target_name(const char *newval, void *extra)
4786 {
4789  error_multiple_recovery_targets();
4790 
4791  if (newval && strcmp(newval, "") != 0)
4792  {
4795  }
4796  else
4798 }
4799 
4800 /*
4801  * GUC check_hook for recovery_target_time
4802  *
4803  * The interpretation of the recovery_target_time string can depend on the
4804  * time zone setting, so we need to wait until after all GUC processing is
4805  * done before we can do the final parsing of the string. This check function
4806  * only does a parsing pass to catch syntax errors, but we store the string
4807  * and parse it again when we need to use it.
4808  */
4809 bool
4811 {
4812  if (strcmp(*newval, "") != 0)
4813  {
4814  /* reject some special values */
4815  if (strcmp(*newval, "now") == 0 ||
4816  strcmp(*newval, "today") == 0 ||
4817  strcmp(*newval, "tomorrow") == 0 ||
4818  strcmp(*newval, "yesterday") == 0)
4819  {
4820  return false;
4821  }
4822 
4823  /*
4824  * parse timestamp value (see also timestamptz_in())
4825  */
4826  {
4827  char *str = *newval;
4828  fsec_t fsec;
4829  struct pg_tm tt,
4830  *tm = &tt;
4831  int tz;
4832  int dtype;
4833  int nf;
4834  int dterr;
4835  char *field[MAXDATEFIELDS];
4836  int ftype[MAXDATEFIELDS];
4837  char workbuf[MAXDATELEN + MAXDATEFIELDS];
4838  DateTimeErrorExtra dtextra;
4840 
4841  dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4842  field, ftype, MAXDATEFIELDS, &nf);
4843  if (dterr == 0)
4844  dterr = DecodeDateTime(field, ftype, nf,
4845  &dtype, tm, &fsec, &tz, &dtextra);
4846  if (dterr != 0)
4847  return false;
4848  if (dtype != DTK_DATE)
4849  return false;
4850 
4851  if (tm2timestamp(tm, fsec, &tz, &timestamp) != 0)
4852  {
4853  GUC_check_errdetail("timestamp out of range: \"%s\"", str);
4854  return false;
4855  }
4856  }
4857  }
4858  return true;
4859 }
4860 
4861 /*
4862  * GUC assign_hook for recovery_target_time
4863  */
4864 void
4865 assign_recovery_target_time(const char *newval, void *extra)
4866 {
4869  error_multiple_recovery_targets();
4870 
4871  if (newval && strcmp(newval, "") != 0)
4873  else
4875 }
4876 
4877 /*
4878  * GUC check_hook for recovery_target_timeline
4879  */
4880 bool
4882 {
4884  RecoveryTargetTimeLineGoal *myextra;
4885 
4886  if (strcmp(*newval, "current") == 0)
4888  else if (strcmp(*newval, "latest") == 0)
4890  else
4891  {
4893 
4894  errno = 0;
4895  strtoul(*newval, NULL, 0);
4896  if (errno == EINVAL || errno == ERANGE)
4897  {
4898  GUC_check_errdetail("recovery_target_timeline is not a valid number.");
4899  return false;
4900  }
4901  }
4902 
4904  *myextra = rttg;
4905  *extra = (void *) myextra;
4906 
4907  return true;
4908 }
4909 
4910 /*
4911  * GUC assign_hook for recovery_target_timeline
4912  */
4913 void
4914 assign_recovery_target_timeline(const char *newval, void *extra)
4915 {
4918  recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
4919  else
4921 }
4922 
4923 /*
4924  * GUC check_hook for recovery_target_xid
4925  */
4926 bool
4928 {
4929  if (strcmp(*newval, "") != 0)
4930  {
4931  TransactionId xid;
4932  TransactionId *myextra;
4933 
4934  errno = 0;
4935  xid = (TransactionId) strtou64(*newval, NULL, 0);
4936  if (errno == EINVAL || errno == ERANGE)
4937  return false;
4938 
4939  myextra = (TransactionId *) guc_malloc(ERROR, sizeof(TransactionId));
4940  *myextra = xid;
4941  *extra = (void *) myextra;
4942  }
4943  return true;
4944 }
4945 
4946 /*
4947  * GUC assign_hook for recovery_target_xid
4948  */
4949 void
4950 assign_recovery_target_xid(const char *newval, void *extra)
4951 {
4954  error_multiple_recovery_targets();
4955 
4956  if (newval && strcmp(newval, "") != 0)
4957  {
4959  recoveryTargetXid = *((TransactionId *) extra);
4960  }
4961  else
4963 }
TimeLineID findNewestTimeLine(TimeLineID startTLI)
Definition: timeline.c:264
TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history)
Definition: timeline.c:544
XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI)
Definition: timeline.c:572
bool existsTimeLineHistory(TimeLineID probeTLI)
Definition: timeline.c:222
void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end)
Definition: timeline.c:50
List * readTimeLineHistory(TimeLineID targetTLI)
Definition: timeline.c:76
bool tliInHistory(TimeLineID tli, List *expectedTLEs)
Definition: timeline.c:526
void remove_tablespace_symlink(const char *linkloc)
Definition: tablespace.c:889
bool allow_in_place_tablespaces
Definition: tablespace.c:91
void HandleStartupProcInterrupts(void)
Definition: startup.c:168
void disable_startup_progress_timeout(void)
Definition: startup.c:318
bool IsPromoteSignaled(void)
Definition: startup.c:297
void begin_startup_progress_phase(void)
Definition: startup.c:352
void ResetPromoteSignaled(void)
Definition: startup.c:303
int ParseDateTime(const char *timestr, char *workbuf, size_t buflen, char **field, int *ftype, int maxfields, int *numfields)
Definition: datetime.c:756
int DecodeDateTime(char **field, int *ftype, int nf, int *dtype, struct pg_tm *tm, fsec_t *fsec, int *tzp, DateTimeErrorExtra *extra)
Definition: datetime.c:980
long TimestampDifferenceMilliseconds(TimestampTz start_time, TimestampTz stop_time)
Definition: timestamp.c:1695
int tm2timestamp(struct pg_tm *tm, fsec_t fsec, int *tzp, Timestamp *result)
Definition: timestamp.c:1926
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1719
Datum timestamptz_in(PG_FUNCTION_ARGS)
Definition: timestamp.c:400
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1583
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1547
const char * timestamptz_to_str(TimestampTz t)
Definition: timestamp.c:1782
uint32 BlockNumber
Definition: block.h:31
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4497
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:4715
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:350
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:159
@ RBM_NORMAL_NO_LOG
Definition: bufmgr.h:50
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:301
Pointer Page
Definition: bufpage.h:78
static XLogRecPtr PageGetLSN(Page page)
Definition: bufpage.h:383
unsigned int uint32
Definition: c.h:495
signed int int32
Definition: c.h:483
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:171
#define PG_BINARY
Definition: c.h:1283
#define UINT64_FORMAT
Definition: c.h:538
#define strtou64(str, endptr, base)
Definition: c.h:1308
unsigned char uint8
Definition: c.h:493
uint32 TransactionId
Definition: c.h:641
size_t Size
Definition: c.h:594
void RequestCheckpoint(int flags)
Definition: checkpointer.c:930
bool ConditionVariableCancelSleep(void)
bool ConditionVariableTimedSleep(ConditionVariable *cv, long timeout, uint32 wait_event_info)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
int64 TimestampTz
Definition: timestamp.h:39
int32 fsec_t
Definition: timestamp.h:41
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1156
int errcode_for_file_access(void)
Definition: elog.c:881
int errdetail(const char *fmt,...)
Definition: elog.c:1202
ErrorContextCallback * error_context_stack
Definition: elog.c:95
int errhint(const char *fmt,...)
Definition: elog.c:1316
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define LOG
Definition: elog.h:31
#define errcontext
Definition: elog.h:196
#define DEBUG3
Definition: elog.h:28
#define FATAL
Definition: elog.h:41
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2854
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2528
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1061
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:734
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1039
int FreeFile(FILE *file)
Definition: fd.c:2726
int pg_fsync(int fd)
Definition: fd.c:361
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2788
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:525
@ PGFILETYPE_LNK
Definition: file_utils.h:24
#define DirectFunctionCall3(func, arg1, arg2, arg3)
Definition: fmgr.h:646
bool IsUnderPostmaster
Definition: globals.c:113
char * DataDir
Definition: globals.c:66
bool IsPostmasterEnvironment
Definition: globals.c:112
void * guc_malloc(int elevel, size_t size)
Definition: guc.c:631
#define newval
#define GUC_check_errdetail
Definition: guc.h:436
GucSource
Definition: guc.h:108
int trace_recovery_messages
Definition: guc_tables.c:528
#define MAXDATEFIELDS
Definition: datetime.h:202
#define DTK_DATE
Definition: datetime.h:144
#define MAXDATELEN
Definition: datetime.h:200
#define close(a)
Definition: win32.h:12
void proc_exit(int code)
Definition: ipc.c:104
int i
Definition: isn.c:73
void OwnLatch(Latch *latch)
Definition: latch.c:436
void DisownLatch(Latch *latch)
Definition: latch.c:462
void InitSharedLatch(Latch *latch)
Definition: latch.c:403
void SetLatch(Latch *latch)
Definition: latch.c:605
void ResetLatch(Latch *latch)
Definition: latch.c:697
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:490
#define WL_TIMEOUT
Definition: latch.h:128
#define WL_EXIT_ON_PM_DEATH
Definition: latch.h:130
#define WL_LATCH_SET
Definition: latch.h:125
Assert(fmt[strlen(fmt) - 1] !='\n')
List * lappend(List *list, void *datum)
Definition: list.c:338
void list_free_deep(List *list)
Definition: list.c:1559
static struct pg_tm tm
Definition: localtime.c:104
char * pstrdup(const char *in)
Definition: mcxt.c:1644
void pfree(void *pointer)
Definition: mcxt.c:1456
void * palloc0(Size size)
Definition: mcxt.c:1257
void * palloc(Size size)
Definition: mcxt.c:1226
#define AmStartupProcess()
Definition: miscadmin.h:452
#define IsBootstrapProcessingMode()
Definition: miscadmin.h:414
void * arg
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
#define MAXPGPATH
#define XLOG_RESTORE_POINT
Definition: pg_control.h:74
#define XLOG_OVERWRITE_CONTRECORD
Definition: pg_control.h:80
DBState
Definition: pg_control.h:88
@ DB_IN_ARCHIVE_RECOVERY
Definition: pg_control.h:94
@ DB_SHUTDOWNED_IN_RECOVERY
Definition: pg_control.h:91
@ DB_SHUTDOWNED
Definition: pg_control.h:90
@ DB_IN_CRASH_RECOVERY
Definition: pg_control.h:93
#define XLOG_CHECKPOINT_SHUTDOWN
Definition: pg_control.h:67
#define XLOG_BACKUP_END
Definition: pg_control.h:72
#define XLOG_CHECKPOINT_ONLINE
Definition: pg_control.h:68
#define XLOG_END_OF_RECOVERY
Definition: pg_control.h:76
const void size_t len
#define lfirst(lc)
Definition: pg_list.h:172
#define NIL
Definition: pg_list.h:68
XLogRecPtr pg_lsn_in_internal(const char *str, bool *have_error)
Definition: pg_lsn.c:30
static rewind_source * source
Definition: pg_rewind.c:89
const char * pg_rusage_show(const PGRUsage *ru0)
Definition: pg_rusage.c:40
void pg_rusage_init(PGRUsage *ru0)
Definition: pg_rusage.c:27
static char * buf
Definition: pg_test_fsync.c:67
int64 timestamp
void SendPostmasterSignal(PMSignalReason reason)
Definition: pmsignal.c:181
@ PMSIGNAL_RECOVERY_STARTED
Definition: pmsignal.h:35
@ PMSIGNAL_BEGIN_HOT_STANDBY
Definition: pmsignal.h:36
#define pg_pread
Definition: port.h:225
#define snprintf
Definition: port.h:238
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:252
static Datum CStringGetDatum(const char *X)
Definition: postgres.h:350
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:212
#define InvalidOid
Definition: postgres_ext.h:36
static int fd(const char *x, int i)
Definition: preproc-init.c:105
void RecordKnownAssignedTransactionIds(TransactionId xid)
Definition: procarray.c:4337
void KnownAssignedTransactionIdsIdleMaintenance(void)
Definition: procarray.c:4474
static void set_ps_display(const char *activity)
Definition: ps_status.h:40
char * psprintf(const char *fmt,...)
Definition: psprintf.c:46
ForkNumber
Definition: relpath.h:48
@ MAIN_FORKNUM
Definition: relpath.h:50
void RmgrStartup(void)
Definition: rmgr.c:49
void RmgrCleanup(void)
Definition: rmgr.c:65
int slock_t
Definition: s_lock.h:754
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:396
bool ReplicationSlotValidateName(const char *name, int elevel)
Definition: slot.c:199
#define SpinLockInit(lock)
Definition: spin.h:60
#define SpinLockRelease(lock)
Definition: spin.h:64
#define SpinLockAcquire(lock)
Definition: spin.h:62
#define ereport_startup_progress(msg,...)
Definition: startup.h:18
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition: stringinfo.c:91
void appendStringInfoString(StringInfo str, const char *s)
Definition: stringinfo.c:176
void appendStringInfoChar(StringInfo str, char ch)
Definition: stringinfo.c:188
void initStringInfo(StringInfo str)
Definition: stringinfo.c:59
Oid oldestMultiDB
Definition: pg_control.h:50
MultiXactId oldestMulti
Definition: pg_control.h:49
MultiXactOffset nextMultiOffset
Definition: pg_control.h:46
TransactionId newestCommitTsXid
Definition: pg_control.h:54
TransactionId oldestXid
Definition: pg_control.h:47
TimeLineID PrevTimeLineID
Definition: pg_control.h:40
TimeLineID ThisTimeLineID
Definition: pg_control.h:39
Oid nextOid
Definition: pg_control.h:44
MultiXactId nextMulti
Definition: pg_control.h:45
FullTransactionId nextXid
Definition: pg_control.h:43
TransactionId oldestCommitTsXid
Definition: pg_control.h:52
XLogRecPtr redo
Definition: pg_control.h:37
Oid oldestXidDB
Definition: pg_control.h:48
XLogRecPtr backupStartPoint
Definition: pg_control.h:168
bool backupEndRequired
Definition: pg_control.h:170
CheckPoint checkPointCopy
Definition: pg_control.h:133
XLogRecPtr backupEndPoint
Definition: pg_control.h:169
XLogRecPtr minRecoveryPoint
Definition: pg_control.h:166
XLogRecPtr checkPoint
Definition: pg_control.h:131
uint64 system_identifier
Definition: pg_control.h:108
TimeLineID minRecoveryPointTLI
Definition: pg_control.h:167
Definition: dirent.c:26
XLogRecPtr lastPageBeginPtr
Definition: xlogrecovery.h:111
XLogRecPtr abortedRecPtr
Definition: xlogrecovery.h:120
XLogRecPtr missingContrecPtr
Definition: xlogrecovery.h:121
TimeLineID endOfLogTLI
Definition: xlogrecovery.h:109
struct ErrorContextCallback * previous
Definition: elog.h:295
void(* callback)(void *arg)
Definition: elog.h:296
Definition: latch.h:111
Definition: pg_list.h:54
RelFileNumber relNumber
void(* rm_mask)(char *pagedata, BlockNumber blkno)
void(* rm_redo)(XLogReaderState *record)
const char *(* rm_identify)(uint8 info)
const char * rm_name
void(* rm_desc)(StringInfo buf, XLogReaderState *record)
XLogRecPtr begin
Definition: timeline.h:28
TimeLineID tli
Definition: timeline.h:27
XLogRecPtr end
Definition: timeline.h:29
TimeLineID ws_tli
Definition: xlogreader.h:49
TimeLineID replayTLI
Definition: xlogrecovery.c:199
XLogRecPtr missingContrecPtr
Definition: xlogreader.h:215
char * errormsg_buf
Definition: xlogreader.h:311
XLogRecPtr EndRecPtr
Definition: xlogreader.h:207
uint64 system_identifier
Definition: xlogreader.h:191
XLogRecPtr ReadRecPtr
Definition: xlogreader.h:206
XLogRecPtr abortedRecPtr
Definition: xlogreader.h:214
TimeLineID latestPageTLI
Definition: xlogreader.h:280
XLogRecPtr overwrittenRecPtr
Definition: xlogreader.h:217
XLogRecPtr latestPagePtr
Definition: xlogreader.h:279
WALOpenSegment seg
Definition: xlogreader.h:272
void * private_data
Definition: xlogreader.h:196
uint8 xl_info
Definition: xlogrecord.h:46
uint32 xl_tot_len
Definition: xlogrecord.h:43
TransactionId xl_xid
Definition: xlogrecord.h:44
RmgrId xl_rmid
Definition: xlogrecord.h:47
ConditionVariable recoveryNotPausedCV
Definition: xlogrecovery.c:359
XLogRecPtr lastReplayedEndRecPtr
Definition: xlogrecovery.c:339
TimeLineID replayEndTLI
Definition: xlogrecovery.c:348
TimeLineID lastReplayedTLI
Definition: xlogrecovery.c:340
TimestampTz currentChunkStartTime
Definition: xlogrecovery.c:356
XLogRecPtr replayEndRecPtr
Definition: xlogrecovery.c:347
TimestampTz recoveryLastXTime
Definition: xlogrecovery.c:350
RecoveryPauseState recoveryPauseState
Definition: xlogrecovery.c:358
XLogRecPtr lastReplayedReadRecPtr
Definition: xlogrecovery.c:338
Definition: guc.h:168
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
Definition: pgtime.h:35
Definition: regguts.h:323