PostgreSQL Source Code  git master
xlogrecovery.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * xlogrecovery.c
4  * Functions for WAL recovery, standby mode
5  *
6  * This source file contains functions controlling WAL recovery.
7  * InitWalRecovery() initializes the system for crash or archive recovery,
8  * or standby mode, depending on configuration options and the state of
9  * the control file and possible backup label file. PerformWalRecovery()
10  * performs the actual WAL replay, calling the rmgr-specific redo routines.
11  * EndWalRecovery() performs end-of-recovery checks and cleanup actions,
12  * and prepares information needed to initialize the WAL for writes. In
13  * addition to these three main functions, there are a bunch of functions
14  * for interrogating recovery state and controlling the recovery process.
15  *
16  *
17  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
18  * Portions Copyright (c) 1994, Regents of the University of California
19  *
20  * src/backend/access/transam/xlogrecovery.c
21  *
22  *-------------------------------------------------------------------------
23  */
24 
25 #include "postgres.h"
26 
27 #include <ctype.h>
28 #include <math.h>
29 #include <time.h>
30 #include <sys/stat.h>
31 #include <sys/time.h>
32 #include <unistd.h>
33 
34 #include "access/timeline.h"
35 #include "access/transam.h"
36 #include "access/xact.h"
37 #include "access/xlog_internal.h"
38 #include "access/xlogarchive.h"
39 #include "access/xlogprefetcher.h"
40 #include "access/xlogreader.h"
41 #include "access/xlogrecovery.h"
42 #include "access/xlogutils.h"
43 #include "backup/basebackup.h"
44 #include "catalog/pg_control.h"
45 #include "commands/tablespace.h"
46 #include "common/file_utils.h"
47 #include "miscadmin.h"
48 #include "pgstat.h"
49 #include "postmaster/bgwriter.h"
50 #include "postmaster/startup.h"
51 #include "replication/slot.h"
53 #include "storage/fd.h"
54 #include "storage/ipc.h"
55 #include "storage/latch.h"
56 #include "storage/pmsignal.h"
57 #include "storage/proc.h"
58 #include "storage/procarray.h"
59 #include "storage/spin.h"
60 #include "utils/builtins.h"
61 #include "utils/datetime.h"
62 #include "utils/guc_hooks.h"
63 #include "utils/pg_lsn.h"
64 #include "utils/ps_status.h"
65 #include "utils/pg_rusage.h"
66 
67 /* Unsupported old recovery command file names (relative to $PGDATA) */
68 #define RECOVERY_COMMAND_FILE "recovery.conf"
69 #define RECOVERY_COMMAND_DONE "recovery.done"
70 
71 /*
72  * GUC support
73  */
75  {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
76  {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
77  {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
78  {NULL, 0, false}
79 };
80 
81 /* options formerly taken from recovery.conf for archive recovery */
83 char *recoveryEndCommand = NULL;
84 char *archiveCleanupCommand = NULL;
91 const char *recoveryTargetName;
94 
95 /* options formerly taken from recovery.conf for XLOG streaming */
96 char *PrimaryConnInfo = NULL;
97 char *PrimarySlotName = NULL;
99 
100 /*
101  * recoveryTargetTimeLineGoal: what the user requested, if any
102  *
103  * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
104  *
105  * recoveryTargetTLI: the currently understood target timeline; may change during recovery
106  *
107  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
108  * the timelines of its known parents, newest first (so recoveryTargetTLI is
109  * always the first list member). Only these TLIs are expected to be seen in
110  * the WAL segments we read, and indeed only these TLIs will be considered as
111  * candidate WAL files to open at all.
112  *
113  * curFileTLI: the TLI appearing in the name of the current input WAL file.
114  * (This is not necessarily the same as the timeline from which we are
115  * replaying WAL, which StartupXLOG calls replayTLI, because we could be
116  * scanning data that was copied from an ancestor timeline when the current
117  * file was created.) During a sequential scan we do not allow this value
118  * to decrease.
119  */
125 
126 /*
127  * When ArchiveRecoveryRequested is set, archive recovery was requested,
128  * i.e. signal files were present. When InArchiveRecovery is set, we are
129  * currently recovering using offline XLOG archives. These variables are only
130  * valid in the startup process.
131  *
132  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
133  * currently performing crash recovery using only XLOG files in pg_wal, but
134  * will switch to using offline XLOG archives as soon as we reach the end of
135  * WAL in pg_wal.
136 */
138 bool InArchiveRecovery = false;
139 
140 /*
141  * When StandbyModeRequested is set, standby mode was requested, i.e.
142  * standby.signal file was present. When StandbyMode is set, we are currently
143  * in standby mode. These variables are only valid in the startup process.
144  * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
145  */
146 static bool StandbyModeRequested = false;
147 bool StandbyMode = false;
148 
149 /* was a signal file present at startup? */
150 static bool standby_signal_file_found = false;
151 static bool recovery_signal_file_found = false;
152 
153 /*
154  * CheckPointLoc is the position of the checkpoint record that determines
155  * where to start the replay. It comes from the backup label file or the
156  * control file.
157  *
158  * RedoStartLSN is the checkpoint's REDO location, also from the backup label
159  * file or the control file. In standby mode, XLOG streaming usually starts
160  * from the position where an invalid record was found. But if we fail to
161  * read even the initial checkpoint record, we use the REDO location instead
162  * of the checkpoint location as the start position of XLOG streaming.
163  * Otherwise we would have to jump backwards to the REDO location after
164  * reading the checkpoint record, because the REDO record can precede the
165  * checkpoint record.
166  */
171 
172 /*
173  * Local copy of SharedHotStandbyActive variable. False actually means "not
174  * known, need to check the shared state".
175  */
176 static bool LocalHotStandbyActive = false;
177 
178 /*
179  * Local copy of SharedPromoteIsTriggered variable. False actually means "not
180  * known, need to check the shared state".
181  */
182 static bool LocalPromoteIsTriggered = false;
183 
184 /* Has the recovery code requested a walreceiver wakeup? */
186 
187 /* XLogReader object used to parse the WAL records */
189 
190 /* XLogPrefetcher object used to consume WAL records with read-ahead */
192 
193 /* Parameters passed down from ReadRecord to the XLogPageRead callback. */
194 typedef struct XLogPageReadPrivate
195 {
196  int emode;
197  bool fetching_ckpt; /* are we fetching a checkpoint record? */
201 
202 /* flag to tell XLogPageRead that we have started replaying */
203 static bool InRedo = false;
204 
205 /*
206  * Codes indicating where we got a WAL file from during recovery, or where
207  * to attempt to get one.
208  */
209 typedef enum
210 {
211  XLOG_FROM_ANY = 0, /* request to read WAL from any source */
212  XLOG_FROM_ARCHIVE, /* restored using restore_command */
213  XLOG_FROM_PG_WAL, /* existing file in pg_wal */
214  XLOG_FROM_STREAM /* streamed from primary */
216 
217 /* human-readable names for XLogSources, for debugging output */
218 static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
219 
220 /*
221  * readFile is -1 or a kernel FD for the log file segment that's currently
222  * open for reading. readSegNo identifies the segment. readOff is the offset
223  * of the page just read, readLen indicates how much of it has been read into
224  * readBuf, and readSource indicates where we got the currently open file from.
225  *
226  * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
227  * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
228  * worthwhile, since the XLOG is not read by general-purpose sessions.
229  */
230 static int readFile = -1;
231 static XLogSegNo readSegNo = 0;
232 static uint32 readOff = 0;
233 static uint32 readLen = 0;
235 
236 /*
237  * Keeps track of which source we're currently reading from. This is
238  * different from readSource in that this is always set, even when we don't
239  * currently have a WAL file open. If lastSourceFailed is set, our last
240  * attempt to read from currentSource failed, and we should try another source
241  * next.
242  *
243  * pendingWalRcvRestart is set when a config change occurs that requires a
244  * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
245  */
247 static bool lastSourceFailed = false;
248 static bool pendingWalRcvRestart = false;
249 
250 /*
251  * These variables track when we last obtained some WAL data to process,
252  * and where we got it from. (XLogReceiptSource is initially the same as
253  * readSource, but readSource gets reset to zero when we don't have data
254  * to process right now. It is also different from currentSource, which
255  * also changes when we try to read from a source and fail, while
256  * XLogReceiptSource tracks where we last successfully read some WAL.)
257  */
260 
261 /* Local copy of WalRcv->flushedUpto */
264 
265 /*
266  * Copy of minRecoveryPoint and backupEndPoint from the control file.
267  *
268  * In order to reach consistency, we must replay the WAL up to
269  * minRecoveryPoint. If backupEndRequired is true, we must also reach
270  * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
271  * to backupStartPoint.
272  *
273  * Note: In archive recovery, after consistency has been reached, the
274  * functions in xlog.c will start updating minRecoveryPoint in the control
275  * file. But this copy of minRecoveryPoint variable reflects the value at the
276  * beginning of recovery, and is *not* updated after consistency is reached.
277  */
280 
283 static bool backupEndRequired = false;
284 
285 /*
286  * Have we reached a consistent database state? In crash recovery, we have
287  * to replay all the WAL, so reachedConsistency is never set. During archive
288  * recovery, the database is consistent once minRecoveryPoint is reached.
289  *
290  * Consistent state means that the system is internally consistent, all
291  * the WAL has been replayed up to a certain point, and importantly, there
292  * is no trace of later actions on disk.
293  */
294 bool reachedConsistency = false;
295 
296 /* Buffers dedicated to consistency checks of size BLCKSZ */
297 static char *replay_image_masked = NULL;
298 static char *primary_image_masked = NULL;
299 
300 
301 /*
302  * Shared-memory state for WAL recovery.
303  */
304 typedef struct XLogRecoveryCtlData
305 {
306  /*
307  * SharedHotStandbyActive indicates if we allow hot standby queries to be
308  * run. Protected by info_lck.
309  */
311 
312  /*
313  * SharedPromoteIsTriggered indicates if a standby promotion has been
314  * triggered. Protected by info_lck.
315  */
317 
318  /*
319  * recoveryWakeupLatch is used to wake up the startup process to continue
320  * WAL replay, if it is waiting for WAL to arrive or promotion to be
321  * requested.
322  *
323  * Note that the startup process also uses another latch, its procLatch,
324  * to wait for recovery conflict. Getting rid of recoveryWakeupLatch and
325  * signaling the startup process through its procLatch instead would
326  * comport better with possible generic signal handlers using that latch.
327  * But we should not do that, because the startup process doesn't assume
328  * that it can be woken up by the walreceiver process or the SIGHUP signal
329  * handler while it's waiting for recovery conflict. The separate latches,
330  * recoveryWakeupLatch and procLatch, should be used for inter-process
331  * communication for WAL replay and recovery conflict, respectively.
332  */
334 
335  /*
336  * Last record successfully replayed.
337  */
338  XLogRecPtr lastReplayedReadRecPtr; /* start position */
339  XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */
340  TimeLineID lastReplayedTLI; /* timeline */
341 
342  /*
343  * When we're currently replaying a record, ie. in a redo function,
344  * replayEndRecPtr points to the end+1 of the record being replayed,
345  * otherwise it's equal to lastReplayedEndRecPtr.
346  */
349  /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
351 
352  /*
353  * timestamp of when we started replaying the current chunk of WAL data,
354  * only relevant for replication or archive recovery
355  */
357  /* Recovery pause state */
360 
361  slock_t info_lck; /* locks shared variables shown above */
363 
365 
366 /*
367  * abortedRecPtr is the start pointer of a broken record at end of WAL when
368  * recovery completes; missingContrecPtr is the location of the first
369  * contrecord that went missing. See CreateOverwriteContrecordRecord for
370  * details.
371  */
374 
375 /*
376  * if recoveryStopsBefore/After returns true, it saves information of the stop
377  * point here
378  */
383 static bool recoveryStopAfter;
384 
385 /* prototypes for local functions */
386 static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
387 
388 static void EnableStandbyMode(void);
389 static void readRecoverySignalFile(void);
390 static void validateRecoveryParameters(void);
391 static bool read_backup_label(XLogRecPtr *checkPointLoc,
392  TimeLineID *backupLabelTLI,
393  bool *backupEndRequired, bool *backupFromStandby);
394 static bool read_tablespace_map(List **tablespaces);
395 
396 static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
397 static void CheckRecoveryConsistency(void);
398 static void rm_redo_error_callback(void *arg);
399 #ifdef WAL_DEBUG
400 static void xlog_outrec(StringInfo buf, XLogReaderState *record);
401 #endif
402 static void xlog_block_info(StringInfo buf, XLogReaderState *record);
403 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
404  TimeLineID prevTLI, TimeLineID replayTLI);
405 static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
406 static void verifyBackupPageConsistency(XLogReaderState *record);
407 
408 static bool recoveryStopsBefore(XLogReaderState *record);
409 static bool recoveryStopsAfter(XLogReaderState *record);
410 static char *getRecoveryStopReason(void);
411 static void recoveryPausesHere(bool endOfRecovery);
412 static bool recoveryApplyDelay(XLogReaderState *record);
413 static void ConfirmRecoveryPaused(void);
414 
416  int emode, bool fetching_ckpt,
417  TimeLineID replayTLI);
418 
419 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
420  int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
422  bool randAccess,
423  bool fetching_ckpt,
424  XLogRecPtr tliRecPtr,
425  TimeLineID replayTLI,
426  XLogRecPtr replayLSN,
427  bool nonblocking);
428 static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
430  XLogRecPtr RecPtr, TimeLineID replayTLI);
431 static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
432 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
433  XLogSource source, bool notfoundOk);
434 static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source);
435 
436 static bool CheckForStandbyTrigger(void);
437 static void SetPromoteIsTriggered(void);
438 static bool HotStandbyActiveInReplay(void);
439 
440 static void SetCurrentChunkStartTime(TimestampTz xtime);
441 static void SetLatestXTime(TimestampTz xtime);
442 
443 /*
444  * Initialization of shared memory for WAL recovery
445  */
/*
 * Report how many bytes of shared memory WAL recovery needs. The only
 * contribution visible here is one XLogRecoveryCtlData struct.
 *
 * NOTE(review): the line naming this function (listing line 447) is absent
 * from this extract. The ShmemInitStruct() call further down passes
 * XLogRecoveryShmemSize() as the size argument, so that is presumably the
 * name -- confirm against the real file.
 */
446 Size
448 {
449  Size size;
450 
451  /* XLogRecoveryCtl */
452  size = sizeof(XLogRecoveryCtlData);
453 
454  return size;
455 }
456 
/*
 * Set up the shared-memory struct registered under the name
 * XLOG Recovery Ctl, sized by XLogRecoveryShmemSize().
 *
 * NOTE(review): several listing lines are absent from this extract: 458
 * (the function name), 462 (presumably the assignment target of the
 * ShmemInitStruct() result) and 468-470 (whatever follows the memset).
 * Confirm against the real file before relying on this description.
 */
457 void
459 {
460  bool found;
461 
463  ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
 /* Nothing more to do when ShmemInitStruct() reports the struct as found. */
464  if (found)
465  return;
 /* Otherwise start from an all-zero struct. */
466  memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
467 
471 }
472 
473 /*
474  * A thin wrapper to enable StandbyMode and do other preparatory work as
475  * needed.
476  */
/*
 * NOTE(review): listing line 478 (the function name) and listing line 488
 * (the statement that the comment inside the body describes) are absent
 * from this extract. The prototype list earlier in the file declares
 * EnableStandbyMode(void), which is presumably this function -- confirm.
 */
477 static void
479 {
 /* Flip the file-level flag that marks standby mode as active. */
480  StandbyMode = true;
481 
482  /*
483  * To avoid server log bloat, we don't report recovery progress in a
484  * standby as it will always be in recovery unless promoted. We disable
485  * startup progress timeout in standby mode to avoid calling
486  * startup_progress_timeout_handler() unnecessarily.
487  */
489 }
490 
491 /*
492  * Prepare the system for WAL recovery, if needed.
493  *
494  * This is called by StartupXLOG() which coordinates the server startup
495  * sequence. This function analyzes the control file and the backup label
496  * file, if any, and figures out whether we need to perform crash recovery or
497  * archive recovery, and how far we need to replay the WAL to reach a
498  * consistent state.
499  *
500  * This doesn't yet change the on-disk state, except for creating the symlinks
501  * from table space map file if any, and for fetching WAL files needed to find
502  * the checkpoint record. On entry, the caller has already read the control
503  * file into memory, and passes it as argument. This function updates it to
504  * reflect the recovery state, and the caller is expected to write it back to
505  * disk after initializing other subsystems, but before calling
506  * PerformWalRecovery().
507  *
508  * This initializes some global variables like ArchiveRecoveryRequested,
509  * StandbyModeRequested and InRecovery.
510  */
/*
 * NOTE(review): this function body is an incomplete extract. The line with
 * the function name and first parameters (listing line 512) is missing, as
 * are many conditions and statements throughout (every gap in the listing
 * numbers marks a dropped line). Comments added below describe only what
 * the visible lines do; confirm against the real file before relying on
 * control flow.
 *
 * Visible outputs: *wasShutdown_ptr, *haveBackupLabel_ptr and
 * *haveTblspcMap_ptr are assigned from local variables at the very end.
 */
511 void
513  bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
514 {
515  XLogPageReadPrivate *private;
516  struct stat st;
517  bool wasShutdown;
518  XLogRecord *record;
519  DBState dbstate_at_startup;
520  bool haveTblspcMap = false;
521  bool haveBackupLabel = false;
522  CheckPoint checkPoint;
523  bool backupFromStandby = false;
524 
 /*
  * Remember the control-file state as found on entry; the
  * backup-from-standby sanity check near the end compares against it.
  */
525  dbstate_at_startup = ControlFile->state;
526 
527  /*
528  * Initialize on the assumption we want to recover to the latest timeline
529  * that's active according to pg_control.
530  */
534  else
536 
537  /*
538  * Check for signal files, and if so set up state for offline recovery
539  */
542 
544  {
546  ereport(LOG,
547  (errmsg("entering standby mode")));
549  ereport(LOG,
550  (errmsg("starting point-in-time recovery to XID %u",
553  ereport(LOG,
554  (errmsg("starting point-in-time recovery to %s",
557  ereport(LOG,
558  (errmsg("starting point-in-time recovery to \"%s\"",
561  ereport(LOG,
562  (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
565  ereport(LOG,
566  (errmsg("starting point-in-time recovery to earliest consistent point")));
567  else
568  ereport(LOG,
569  (errmsg("starting archive recovery")));
570  }
571 
572  /*
573  * Take ownership of the wakeup latch if we're going to sleep during
574  * recovery.
575  */
578 
 /*
  * Zero-filled private state for the page-read callback; it is passed as
  * the final argument of the reader allocation below (the first line of
  * that call is missing from this extract).
  */
579  private = palloc0(sizeof(XLogPageReadPrivate));
580  xlogreader =
582  XL_ROUTINE(.page_read = &XLogPageRead,
583  .segment_open = NULL,
584  .segment_close = wal_segment_close),
585  private);
586  if (!xlogreader)
587  ereport(ERROR,
588  (errcode(ERRCODE_OUT_OF_MEMORY),
589  errmsg("out of memory"),
590  errdetail("Failed while allocating a WAL reading processor.")));
592 
593  /*
594  * Set the WAL decode buffer size. This limits how far ahead we can read
595  * in the WAL.
596  */
598 
599  /* Create a WAL prefetcher. */
601 
602  /*
603  * Allocate two page buffers dedicated to WAL consistency checks. We do
604  * it this way, rather than just making static arrays, for two reasons:
605  * (1) no need to waste the storage in most instantiations of the backend;
606  * (2) a static char array isn't guaranteed to have any particular
607  * alignment, whereas palloc() will provide MAXALIGN'd storage.
608  */
609  replay_image_masked = (char *) palloc(BLCKSZ);
610  primary_image_masked = (char *) palloc(BLCKSZ);
611 
613  &backupFromStandby))
614  {
615  List *tablespaces = NIL;
616 
617  /*
618  * Archive recovery was requested, and thanks to the backup label
619  * file, we know how far we need to replay to reach consistency. Enter
620  * archive recovery directly.
621  */
622  InArchiveRecovery = true;
625 
626  /*
627  * When a backup_label file is present, we want to roll forward from
628  * the checkpoint it identifies, rather than using pg_control.
629  */
631  CheckPointTLI);
632  if (record != NULL)
633  {
 /*
  * Copy the checkpoint payload out of the reader and note whether the
  * record is a shutdown checkpoint.
  */
634  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
635  wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
636  ereport(DEBUG1,
637  (errmsg_internal("checkpoint record is at %X/%X",
639  InRecovery = true; /* force recovery even if SHUTDOWNED */
640 
641  /*
642  * Make sure that REDO location exists. This may not be the case
643  * if there was a crash during an online backup, which left a
644  * backup_label around that references a WAL segment that's
645  * already been archived.
646  */
647  if (checkPoint.redo < CheckPointLoc)
648  {
650  if (!ReadRecord(xlogprefetcher, LOG, false,
651  checkPoint.ThisTimeLineID))
652  ereport(FATAL,
653  (errmsg("could not find redo location referenced by checkpoint record"),
654  errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
655  "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
656  "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
657  DataDir, DataDir, DataDir)));
658  }
659  }
660  else
661  {
662  ereport(FATAL,
663  (errmsg("could not locate required checkpoint record"),
664  errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
665  "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
666  "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
667  DataDir, DataDir, DataDir)));
668  wasShutdown = false; /* keep compiler quiet */
669  }
670 
671  /* Read the tablespace_map file if present and create symlinks. */
672  if (read_tablespace_map(&tablespaces))
673  {
674  ListCell *lc;
675 
676  foreach(lc, tablespaces)
677  {
678  tablespaceinfo *ti = lfirst(lc);
679  char *linkloc;
680 
 /* Link path is relative: pg_tblspc/ followed by the entry's oid string. */
681  linkloc = psprintf("pg_tblspc/%s", ti->oid);
682 
683  /*
684  * Remove the existing symlink if any and create the symlink
685  * under PGDATA.
686  */
687  remove_tablespace_symlink(linkloc);
688 
689  if (symlink(ti->path, linkloc) < 0)
690  ereport(ERROR,
692  errmsg("could not create symbolic link \"%s\": %m",
693  linkloc)));
694 
 /* The list entry and its strings are released once the link exists. */
695  pfree(ti->oid);
696  pfree(ti->path);
697  pfree(ti);
698  }
699 
700  /* tell the caller to delete it later */
701  haveTblspcMap = true;
702  }
703 
704  /* tell the caller to delete it later */
705  haveBackupLabel = true;
706  }
707  else
708  {
709  /*
710  * If tablespace_map file is present without backup_label file, there
711  * is no use of such file. There is no harm in retaining it, but it
712  * is better to get rid of the map file so that we don't have any
713  * redundant file in data directory and it will avoid any sort of
714  * confusion. It seems prudent though to just rename the file out of
715  * the way rather than delete it completely, also we ignore any error
716  * that occurs in rename operation as even if map file is present
717  * without backup_label file, it is harmless.
718  */
719  if (stat(TABLESPACE_MAP, &st) == 0)
720  {
 /* Clear any stale old-map file first; the unlink result is ignored. */
721  unlink(TABLESPACE_MAP_OLD);
723  ereport(LOG,
724  (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
726  errdetail("File \"%s\" was renamed to \"%s\".",
728  else
729  ereport(LOG,
730  (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
732  errdetail("Could not rename file \"%s\" to \"%s\": %m.",
734  }
735 
736  /*
737  * It's possible that archive recovery was requested, but we don't
738  * know how far we need to replay the WAL before we reach consistency.
739  * This can happen for example if a base backup is taken from a
740  * running server using an atomic filesystem snapshot, without calling
741  * pg_backup_start/stop. Or if you just kill a running primary server
742  * and put it into archive recovery by creating a recovery signal
743  * file.
744  *
745  * Our strategy in that case is to perform crash recovery first,
746  * replaying all the WAL present in pg_wal, and only enter archive
747  * recovery after that.
748  *
749  * But usually we already know how far we need to replay the WAL (up
750  * to minRecoveryPoint, up to backupEndPoint, or until we see an
751  * end-of-backup record), and we can enter archive recovery directly.
752  */
758  {
759  InArchiveRecovery = true;
762  }
763 
764  /* Get the last valid checkpoint record. */
770  CheckPointTLI);
771  if (record != NULL)
772  {
773  ereport(DEBUG1,
774  (errmsg_internal("checkpoint record is at %X/%X",
776  }
777  else
778  {
779  /*
780  * We used to attempt to go back to a secondary checkpoint record
781  * here, but only when not in standby mode. We now just fail if we
782  * can't read the last checkpoint because this allows us to
783  * simplify processing around checkpoints.
784  */
785  ereport(PANIC,
786  (errmsg("could not locate a valid checkpoint record")));
787  }
 /*
  * NOTE(review): record is dereferenced below without a NULL test;
  * presumably the PANIC report above does not return -- confirm.
  */
788  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
789  wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
790  }
791 
792  /*
793  * If the location of the checkpoint record is not on the expected
794  * timeline in the history of the requested timeline, we cannot proceed:
795  * the backup is not part of the history of the requested timeline.
796  */
797  Assert(expectedTLEs); /* was initialized by reading checkpoint
798  * record */
801  {
802  XLogRecPtr switchpoint;
803 
804  /*
805  * tliSwitchPoint will throw an error if the checkpoint's timeline is
806  * not in expectedTLEs at all.
807  */
809  ereport(FATAL,
810  (errmsg("requested timeline %u is not a child of this server's history",
812  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
815  LSN_FORMAT_ARGS(switchpoint))));
816  }
817 
818  /*
819  * The min recovery point should be part of the requested timeline's
820  * history, too.
821  */
825  ereport(FATAL,
826  (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
830 
 /* Dump the fields of the chosen checkpoint at DEBUG1 for troubleshooting. */
831  ereport(DEBUG1,
832  (errmsg_internal("redo record is at %X/%X; shutdown %s",
833  LSN_FORMAT_ARGS(checkPoint.redo),
834  wasShutdown ? "true" : "false")));
835  ereport(DEBUG1,
836  (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
837  U64FromFullTransactionId(checkPoint.nextXid),
838  checkPoint.nextOid)));
839  ereport(DEBUG1,
840  (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
841  checkPoint.nextMulti, checkPoint.nextMultiOffset)));
842  ereport(DEBUG1,
843  (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
844  checkPoint.oldestXid, checkPoint.oldestXidDB)));
845  ereport(DEBUG1,
846  (errmsg_internal("oldest MultiXactId: %u, in database %u",
847  checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
848  ereport(DEBUG1,
849  (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
850  checkPoint.oldestCommitTsXid,
851  checkPoint.newestCommitTsXid)));
853  ereport(PANIC,
854  (errmsg("invalid next transaction ID")));
855 
856  /* sanity check */
857  if (checkPoint.redo > CheckPointLoc)
858  ereport(PANIC,
859  (errmsg("invalid redo in checkpoint record")));
860 
861  /*
862  * Check whether we need to force recovery from WAL. If it appears to
863  * have been a clean shutdown and we did not have a recovery signal file,
864  * then assume no recovery needed.
865  */
866  if (checkPoint.redo < CheckPointLoc)
867  {
 /* A shutdown checkpoint with redo before the checkpoint itself is rejected. */
868  if (wasShutdown)
869  ereport(PANIC,
870  (errmsg("invalid redo record in shutdown checkpoint")));
871  InRecovery = true;
872  }
873  else if (ControlFile->state != DB_SHUTDOWNED)
874  InRecovery = true;
875  else if (ArchiveRecoveryRequested)
876  {
877  /* force recovery due to presence of recovery signal file */
878  InRecovery = true;
879  }
880 
881  /*
882  * If recovery is needed, update our in-memory copy of pg_control to show
883  * that we are recovering and to show the selected checkpoint as the place
884  * we are starting from. We also mark pg_control with any minimum recovery
885  * stop point obtained from a backup history file.
886  *
887  * We don't write the changes to disk yet, though. Only do that after
888  * initializing various subsystems.
889  */
890  if (InRecovery)
891  {
892  if (InArchiveRecovery)
893  {
895  }
896  else
897  {
898  ereport(LOG,
899  (errmsg("database system was not properly shut down; "
900  "automatic recovery in progress")));
902  ereport(LOG,
903  (errmsg("crash recovery starts in timeline %u "
904  "and has target timeline %u",
908  }
910  ControlFile->checkPointCopy = checkPoint;
911  if (InArchiveRecovery)
912  {
913  /* initialize minRecoveryPoint if not set yet */
914  if (ControlFile->minRecoveryPoint < checkPoint.redo)
915  {
916  ControlFile->minRecoveryPoint = checkPoint.redo;
918  }
919  }
920 
921  /*
922  * Set backupStartPoint if we're starting recovery from a base backup.
923  *
924  * Also set backupEndPoint and use minRecoveryPoint as the backup end
925  * location if we're starting recovery from a base backup which was
926  * taken from a standby. In this case, the database system status in
927  * pg_control must indicate that the database was already in recovery.
928  * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
929  * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
930  * before reaching this point; e.g. because restore_command or
931  * primary_conninfo were faulty.
932  *
933  * Any other state indicates that the backup somehow became corrupted
934  * and we can't sensibly continue with recovery.
935  */
936  if (haveBackupLabel)
937  {
938  ControlFile->backupStartPoint = checkPoint.redo;
940 
941  if (backupFromStandby)
942  {
943  if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
944  dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
945  ereport(FATAL,
946  (errmsg("backup_label contains data inconsistent with control file"),
947  errhint("This means that the backup is corrupted and you will "
948  "have to use another backup for recovery.")));
950  }
951  }
952  }
953 
954  /* remember these, so that we know when we have reached consistency */
958  if (InArchiveRecovery)
959  {
962  }
963  else
964  {
967  }
968 
969  /*
970  * Start recovery assuming that the final record isn't lost.
971  */
974 
 /* Hand the three results back through the caller-supplied pointers. */
975  *wasShutdown_ptr = wasShutdown;
976  *haveBackupLabel_ptr = haveBackupLabel;
977  *haveTblspcMap_ptr = haveTblspcMap;
978 }
979 
980 /*
981  * See if there are any recovery signal files and if so, set state for
982  * recovery.
983  *
984  * See if there is a recovery command file (recovery.conf), and if so
985  * report a FATAL error, since as of PG12 we no longer recognize that.
986  */
/*
 * NOTE(review): this body is an incomplete extract. Listing line 988 (the
 * function name) is missing, as are lines 992, 1000, 1002, 1021, 1028,
 * 1034, 1041, 1046 and 1063. The prototype list earlier in the file
 * declares readRecoverySignalFile(void), which is presumably this function.
 * Comments added below describe only the visible lines.
 */
987 static void
989 {
990  struct stat stat_buf;
991 
 /* NOTE(review): the condition guarding this early return is missing here. */
993  return;
994 
995  /*
996  * Check for old recovery API file: recovery.conf
997  */
998  if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
999  ereport(FATAL,
1001  errmsg("using recovery command file \"%s\" is not supported",
1003 
1004  /*
1005  * Remove unused .done file, if present. Ignore if absent.
1006  */
1007  unlink(RECOVERY_COMMAND_DONE);
1008 
1009  /*
1010  * Check for recovery signal files and if found, fsync them since they
1011  * represent server state information. We don't sweat too much about the
1012  * possibility of fsync failure, however.
1013  *
1014  * If present, standby signal file takes precedence. If neither is present
1015  * then we won't enter archive recovery.
1016  */
1017  if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1018  {
1019  int fd;
1020 
1022  S_IRUSR | S_IWUSR);
 /* Best effort: a failed open is skipped and the fsync result is discarded. */
1023  if (fd >= 0)
1024  {
1025  (void) pg_fsync(fd);
1026  close(fd);
1027  }
1029  }
1030  else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1031  {
1032  int fd;
1033 
1035  S_IRUSR | S_IWUSR);
 /* Same best-effort fsync as for the standby signal file above. */
1036  if (fd >= 0)
1037  {
1038  (void) pg_fsync(fd);
1039  close(fd);
1040  }
1042  }
1043 
 /*
  * Reset both request flags, then derive them from which signal file was
  * found. The first condition of the chain below is missing from this
  * extract.
  */
1044  StandbyModeRequested = false;
1045  ArchiveRecoveryRequested = false;
1047  {
1048  StandbyModeRequested = true;
1049  ArchiveRecoveryRequested = true;
1050  }
1051  else if (recovery_signal_file_found)
1052  {
1053  StandbyModeRequested = false;
1054  ArchiveRecoveryRequested = true;
1055  }
1056  else
1057  return;
1058 
1059  /*
1060  * We don't support standby mode in standalone backends; that requires
1061  * other processes such as the WAL receiver to be alive.
1062  */
1064  ereport(FATAL,
1065  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1066  errmsg("standby mode is not supported by single-user servers")));
1067 }
1068 
/*
 * Validate recovery-related settings.
 *
 * Steps visible below: check the compulsory parameters (WARNING when
 * neither primary_conninfo nor restore_command is set in the first branch;
 * FATAL when restore_command is unset in the other branch, whose message
 * says standby mode is not enabled), do the final parsing of
 * recovery_target_time, and resolve the recovery target timeline (FATAL if
 * a requested timeline other than 1 has no history file).
 *
 * NOTE(review): this listing dropped several source lines (visible as gaps
 * in the embedded line numbers), including the line carrying the function
 * name and most of the branch conditions, so the block is not compilable as
 * shown. Restore the missing lines from the upstream file before changing
 * the code.
 */
1069 static void
1071 {
/* NOTE(review): the condition guarding this early return (listing line 1072) is missing. */
1073  return;
1074 
1075  /*
1076  * Check for compulsory parameters
1077  */
/* NOTE(review): the "if" condition (listing line 1078) is missing; judging by the messages it separates standby mode from plain archive recovery -- confirm. */
1079  {
1080  if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1081  (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1082  ereport(WARNING,
1083  (errmsg("specified neither primary_conninfo nor restore_command"),
1084  errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1085  }
1086  else
1087  {
1088  if (recoveryRestoreCommand == NULL ||
1089  strcmp(recoveryRestoreCommand, "") == 0)
1090  ereport(FATAL,
1091  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1092  errmsg("must specify restore_command when standby mode is not enabled")));
1093  }
1094 
1095  /*
1096  * Override any inconsistent requests. Note that this is a change of
1097  * behaviour in 9.5; prior to this we simply ignored a request to pause if
1098  * hot_standby = off, which was surprising behaviour.
1099  */
/* NOTE(review): the statements implementing the override (listing lines 1100-1102) are missing. */
1103 
1104  /*
1105  * Final parsing of recovery_target_time string; see also
1106  * check_recovery_target_time().
1107  */
/* NOTE(review): the condition and most of the conversion call (listing lines 1108, 1110-1112) are missing. */
1109  {
1113  Int32GetDatum(-1)));
1114  }
1115 
1116  /*
1117  * If user specified recovery_target_timeline, validate it or compute the
1118  * "latest" value. We can't do this until after we've gotten the restore
1119  * command and set InArchiveRecovery, because we need to fetch timeline
1120  * history files from the archive.
1121  */
/* NOTE(review): the three branch conditions and the declaration of rtli (listing lines 1122, 1124, 1134) are missing. */
1123  {
1125 
1126  /* Timeline 1 does not have a history file, all else should */
1127  if (rtli != 1 && !existsTimeLineHistory(rtli))
1128  ereport(FATAL,
1129  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1130  errmsg("recovery target timeline %u does not exist",
1131  rtli)));
1132  recoveryTargetTLI = rtli;
1133  }
1135  {
1136  /* We start the "latest" search from pg_control's timeline */
1138  }
1139  else
1140  {
1141  /*
1142  * else we just use the recoveryTargetTLI as already read from
1143  * ControlFile
1144  */
1146  }
1147 }
1148 
1149 /*
1150  * read_backup_label: check to see if a backup_label file is present
1151  *
1152  * If we see a backup_label during recovery, we assume that we are recovering
1153  * from a backup dump file, and we therefore roll forward from the checkpoint
1154  * identified by the label file, NOT what pg_control says. This avoids the
1155  * problem that pg_control might have been archived one or more checkpoints
1156  * later than the start of the dump, and so if we rely on it as the start
1157  * point, we will fail to restore a consistent database state.
1158  *
1159  * Returns true if a backup_label was found (and fills the checkpoint
1160  * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1161  * returns false if not. If this backup_label came from a streamed backup,
1162  * *backupEndRequired is set to true. If this backup_label was created during
1163  * recovery, *backupFromStandby is set to true.
1164  *
1165  * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1166  * and TLI read from the backup file.
1167  */
1168 static bool
1169 read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1170  bool *backupEndRequired, bool *backupFromStandby)
1171 {
1172  char startxlogfilename[MAXFNAMELEN];
1173  TimeLineID tli_from_walseg,
1174  tli_from_file;
1175  FILE *lfp;
1176  char ch;
1177  char backuptype[20];
1178  char backupfrom[20];
1179  char backuplabel[MAXPGPATH];
1180  char backuptime[128];
1181  uint32 hi,
1182  lo;
1183 
1184  /* suppress possible uninitialized-variable warnings */
1185  *checkPointLoc = InvalidXLogRecPtr;
1186  *backupLabelTLI = 0;
1187  *backupEndRequired = false;
1188  *backupFromStandby = false;
1189 
1190  /*
1191  * See if label file is present
1192  */
1193  lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1194  if (!lfp)
1195  {
1196  if (errno != ENOENT)
1197  ereport(FATAL,
1199  errmsg("could not read file \"%s\": %m",
1200  BACKUP_LABEL_FILE)));
1201  return false; /* it's not there, all is fine */
1202  }
1203 
1204  /*
1205  * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1206  * is pretty crude, but we are not expecting any variability in the file
1207  * format).
1208  */
1209  if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
1210  &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1211  ereport(FATAL,
1212  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1213  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1214  RedoStartLSN = ((uint64) hi) << 32 | lo;
1215  RedoStartTLI = tli_from_walseg;
1216  if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
1217  &hi, &lo, &ch) != 3 || ch != '\n')
1218  ereport(FATAL,
1219  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1220  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1221  *checkPointLoc = ((uint64) hi) << 32 | lo;
1222  *backupLabelTLI = tli_from_walseg;
1223 
1224  /*
1225  * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1226  * which could mean either pg_basebackup or the pg_backup_start/stop
1227  * method was used) or if this label came from somewhere else (the only
1228  * other option today being from pg_rewind). If this was a streamed
1229  * backup then we know that we need to play through until we get to the
1230  * end of the WAL which was generated during the backup (at which point we
1231  * will have reached consistency and backupEndRequired will be reset to be
1232  * false).
1233  */
1234  if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1235  {
1236  if (strcmp(backuptype, "streamed") == 0)
1237  *backupEndRequired = true;
1238  }
1239 
1240  /*
1241  * BACKUP FROM lets us know if this was from a primary or a standby. If
1242  * it was from a standby, we'll double-check that the control file state
1243  * matches that of a standby.
1244  */
1245  if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1246  {
1247  if (strcmp(backupfrom, "standby") == 0)
1248  *backupFromStandby = true;
1249  }
1250 
1251  /*
1252  * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1253  * but checking for their presence is useful for debugging and the next
1254  * sanity checks. Cope also with the fact that the result buffers have a
1255  * pre-allocated size, hence if the backup_label file has been generated
1256  * with strings longer than the maximum assumed here an incorrect parsing
1257  * happens. That's fine as only minor consistency checks are done
1258  * afterwards.
1259  */
1260  if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1261  ereport(DEBUG1,
1262  (errmsg_internal("backup time %s in file \"%s\"",
1263  backuptime, BACKUP_LABEL_FILE)));
1264 
1265  if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1266  ereport(DEBUG1,
1267  (errmsg_internal("backup label %s in file \"%s\"",
1268  backuplabel, BACKUP_LABEL_FILE)));
1269 
1270  /*
1271  * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1272  * it as a sanity check if present.
1273  */
1274  if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1275  {
1276  if (tli_from_walseg != tli_from_file)
1277  ereport(FATAL,
1278  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1279  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1280  errdetail("Timeline ID parsed is %u, but expected %u.",
1281  tli_from_file, tli_from_walseg)));
1282 
1283  ereport(DEBUG1,
1284  (errmsg_internal("backup timeline %u in file \"%s\"",
1285  tli_from_file, BACKUP_LABEL_FILE)));
1286  }
1287 
1288  if (ferror(lfp) || FreeFile(lfp))
1289  ereport(FATAL,
1291  errmsg("could not read file \"%s\": %m",
1292  BACKUP_LABEL_FILE)));
1293 
1294  return true;
1295 }
1296 
1297 /*
1298  * read_tablespace_map: check to see if a tablespace_map file is present
1299  *
1300  * If we see a tablespace_map file during recovery, we assume that we are
1301  * recovering from a backup dump file, and we therefore need to create symlinks
1302  * as per the information present in tablespace_map file.
1303  *
1304  * Returns true if a tablespace_map file was found (and fills *tablespaces
1305  * with a tablespaceinfo struct for each tablespace listed in the file);
1306  * returns false if not.
1307  */
1308 static bool
1310 {
1311  tablespaceinfo *ti;
1312  FILE *lfp;
1313  char str[MAXPGPATH];
1314  int ch,
1315  i,
1316  n;
1317  bool was_backslash;
1318 
1319  /*
1320  * See if tablespace_map file is present
1321  */
1322  lfp = AllocateFile(TABLESPACE_MAP, "r");
1323  if (!lfp)
1324  {
1325  if (errno != ENOENT)
1326  ereport(FATAL,
1328  errmsg("could not read file \"%s\": %m",
1329  TABLESPACE_MAP)));
1330  return false; /* it's not there, all is fine */
1331  }
1332 
1333  /*
1334  * Read and parse the link name and path lines from tablespace_map file
1335  * (this code is pretty crude, but we are not expecting any variability in
1336  * the file format). De-escape any backslashes that were inserted.
1337  */
1338  i = 0;
1339  was_backslash = false;
1340  while ((ch = fgetc(lfp)) != EOF)
1341  {
1342  if (!was_backslash && (ch == '\n' || ch == '\r'))
1343  {
1344  if (i == 0)
1345  continue; /* \r immediately followed by \n */
1346 
1347  /*
1348  * The de-escaped line should contain an OID followed by exactly
1349  * one space followed by a path. The path might start with
1350  * spaces, so don't be too liberal about parsing.
1351  */
1352  str[i] = '\0';
1353  n = 0;
1354  while (str[n] && str[n] != ' ')
1355  n++;
1356  if (n < 1 || n >= i - 1)
1357  ereport(FATAL,
1358  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1359  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1360  str[n++] = '\0';
1361 
1362  ti = palloc0(sizeof(tablespaceinfo));
1363  ti->oid = pstrdup(str);
1364  ti->path = pstrdup(str + n);
1365  *tablespaces = lappend(*tablespaces, ti);
1366 
1367  i = 0;
1368  continue;
1369  }
1370  else if (!was_backslash && ch == '\\')
1371  was_backslash = true;
1372  else
1373  {
1374  if (i < sizeof(str) - 1)
1375  str[i++] = ch;
1376  was_backslash = false;
1377  }
1378  }
1379 
1380  if (i != 0 || was_backslash) /* last line not terminated? */
1381  ereport(FATAL,
1382  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1383  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1384 
1385  if (ferror(lfp) || FreeFile(lfp))
1386  ereport(FATAL,
1388  errmsg("could not read file \"%s\": %m",
1389  TABLESPACE_MAP)));
1390 
1391  return true;
1392 }
1393 
1394 /*
1395  * Finish WAL recovery.
1396  *
1397  * This does not close the 'xlogreader' yet, because in some cases the caller
1398  * still wants to re-read the last checkpoint record by calling
1399  * ReadCheckPointRecord().
1400  *
1401  * Returns the position of the last valid or applied record, after which new
1402  * WAL should be appended, information about why recovery was ended, and some
1403  * other things. See the WalRecoveryResult struct for details.
 *
 * NOTE(review): this listing dropped several source lines (visible as gaps
 * in the embedded line numbers), including the return type, the function
 * name, the declaration/allocation of 'result' and several statements, so
 * the block is not compilable as shown. Restore the missing lines from the
 * upstream file before changing the code.
1404  */
1407 {
1409  XLogRecPtr lastRec;
1410  TimeLineID lastRecTLI;
1411  XLogRecPtr endOfLog;
1412 
1413  /*
1414  * Kill WAL receiver, if it's still running, before we continue to write
1415  * the startup checkpoint and aborted-contrecord records. It will trump
1416  * over these records and subsequent ones if it's still alive when we
1417  * start writing WAL.
1418  */
/* NOTE(review): the call that stops the WAL receiver (listing line 1419) is missing. */
1420 
1421  /*
1422  * We are now done reading the xlog from stream. Turn off streaming
1423  * recovery to force fetching the files (which would be required at end of
1424  * recovery, e.g., timeline history file) from archive or pg_wal.
1425  *
1426  * Note that standby mode must be turned off after killing WAL receiver,
1427  * i.e., calling XLogShutdownWalRcv().
1428  */
1429  Assert(!WalRcvStreaming());
1430  StandbyMode = false;
1431 
1432  /*
1433  * Determine where to start writing WAL next.
1434  *
1435  * Re-fetch the last valid or last applied record, so we can identify the
1436  * exact endpoint of what we consider the valid portion of WAL. There may
1437  * be an incomplete continuation record after that, in which case
1438  * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1439  * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1440  * it is intentionally missing. See CreateOverwriteContrecordRecord().
1441  *
1442  * An important side-effect of this is to load the last page into
1443  * xlogreader. The caller uses it to initialize the WAL for writing.
1444  */
1445  if (!InRecovery)
1446  {
1447  lastRec = CheckPointLoc;
1448  lastRecTLI = CheckPointTLI;
1449  }
1450  else
1451  {
/* NOTE(review): the assignment to lastRec in this branch (listing line 1452) is missing. */
1453  lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1454  }
/* NOTE(review): listing line 1455 is missing; presumably it positions the reader at lastRec before the re-read -- confirm. */
1456  (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1457  endOfLog = xlogreader->EndRecPtr;
1458 
1459  /*
1460  * Remember the TLI in the filename of the XLOG segment containing the
1461  * end-of-log. It could be different from the timeline that endOfLog
1462  * nominally belongs to, if there was a timeline switch in that segment,
1463  * and we were reading the old WAL from a segment belonging to a higher
1464  * timeline.
1465  */
1466  result->endOfLogTLI = xlogreader->seg.ws_tli;
1467 
/* NOTE(review): the "if" condition for this block (listing line 1468) is missing. */
1469  {
1470  /*
1471  * We are no longer in archive recovery state.
1472  *
1473  * We are now done reading the old WAL. Turn off archive fetching if
1474  * it was active.
1475  */
1477  InArchiveRecovery = false;
1478 
1479  /*
1480  * If the ending log segment is still open, close it (to avoid
1481  * problems on Windows with trying to rename or delete an open file).
1482  */
1483  if (readFile >= 0)
1484  {
1485  close(readFile);
1486  readFile = -1;
1487  }
1488  }
1489 
1490  /*
1491  * Copy the last partial block to the caller, for initializing the WAL
1492  * buffer for appending new WAL.
1493  */
1494  if (endOfLog % XLOG_BLCKSZ != 0)
1495  {
1496  char *page;
1497  int len;
1498  XLogRecPtr pageBeginPtr;
1499 
1500  pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1501  Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
1502 
1503  /* Copy the valid part of the last block */
1504  len = endOfLog % XLOG_BLCKSZ;
1505  page = palloc(len);
1506  memcpy(page, xlogreader->readBuf, len);
1507 
1508  result->lastPageBeginPtr = pageBeginPtr;
1509  result->lastPage = page;
1510  }
1511  else
1512  {
1513  /* There is no partial block to copy. */
1514  result->lastPageBeginPtr = endOfLog;
1515  result->lastPage = NULL;
1516  }
1517 
1518  /*
1519  * Create a comment for the history file to explain why and where timeline
1520  * changed.
1521  */
/* NOTE(review): the statement that builds the history-file comment (listing line 1522) is missing. */
1523 
1524  result->lastRec = lastRec;
1525  result->lastRecTLI = lastRecTLI;
1526  result->endOfLog = endOfLog;
1527 
1528  result->abortedRecPtr = abortedRecPtr;
/* NOTE(review): listing lines 1529, 1531 and 1532 (further result fields) are missing. */
1530 
1533 
1534  return result;
1535 }
1536 
1537 /*
1538  * Clean up the WAL reader and leftovers from restoring WAL from archive
 *
 * Steps visible below: close the currently open WAL segment file if any,
 * unlink the temporary pg_wal/RECOVERYXLOG and pg_wal/RECOVERYHISTORY files
 * (errors from unlink are ignored), and finally release the latch.
 *
 * NOTE(review): this listing dropped several source lines (visible as gaps
 * in the embedded line numbers), including the line carrying the function
 * name, so the block is not compilable as shown. Restore the missing lines
 * from the upstream file before changing the code.
1539  */
1540 void
1542 {
1543  char recoveryPath[MAXPGPATH];
1544 
1545  /* Final update of pg_stat_recovery_prefetch. */
/* NOTE(review): the statistics call (listing line 1546) is missing. */
1547 
1548  /* Shut down xlogreader */
1549  if (readFile >= 0)
1550  {
1551  close(readFile);
1552  readFile = -1;
1553  }
/* NOTE(review): listing lines 1554-1555 are missing; presumably they free the reader and prefetcher -- confirm. */
1556 
/* NOTE(review): the "if" condition for this block (listing line 1557) is missing. */
1558  {
1559  /*
1560  * Since there might be a partial WAL segment named RECOVERYXLOG, get
1561  * rid of it.
1562  */
1563  snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1564  unlink(recoveryPath); /* ignore any error */
1565 
1566  /* Get rid of any remaining recovered timeline-history file, too */
1567  snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1568  unlink(recoveryPath); /* ignore any error */
1569  }
1570 
1571  /*
1572  * We don't need the latch anymore. It's not strictly necessary to disown
1573  * it, but let's do it for the sake of tidiness.
1574  */
/* NOTE(review): the statements that disown the latch (listing lines 1575-1576) are missing. */
1577 }
1578 
1579 /*
1580  * Perform WAL recovery.
1581  *
1582  * If the system was shut down cleanly, this is never called.
 *
 * Outline of what is visible below: read the first record to replay; if one
 * exists, run the redo loop (pause handling, recovery-target checks,
 * ApplyWalRecord(), fetch next record) until no record is returned or a
 * recovery target is reached, then act on recoveryTargetAction and log how
 * far redo went. If recovery ends without reaching a configured target, a
 * FATAL error is reported after those log messages.
 *
 * NOTE(review): this listing dropped many source lines (visible as gaps in
 * the embedded line numbers), including the line carrying the function
 * name, several conditions and the case labels of the switch, so the block
 * is not compilable as shown. Restore the missing lines from the upstream
 * file before changing the code.
1583  */
1584 void
1586 {
1587  XLogRecord *record;
1588  bool reachedRecoveryTarget = false;
1589  TimeLineID replayTLI;
1590 
1591  /*
1592  * Initialize shared variables for tracking progress of WAL replay, as if
1593  * we had just replayed the record before the REDO location (or the
1594  * checkpoint record itself, if it's a shutdown checkpoint).
1595  */
/* NOTE(review): the whole initialization (listing lines 1596-1614) has lost its statements; only the braces remain. */
1598  {
1602  }
1603  else
1604  {
1608  }
1615 
1616  /* Also ensure XLogReceiptTime has a sane value */
1618 
1619  /*
1620  * Let postmaster know we've started redo now, so that it can launch the
1621  * archiver if necessary.
1622  */
1623  if (IsUnderPostmaster)
/* NOTE(review): the statement controlled by this "if" (listing line 1624) is missing. */
1625 
1626  /*
1627  * Allow read-only connections immediately if we're consistent already.
1628  */
1630 
1631  /*
1632  * Find the first record that logically follows the checkpoint --- it
1633  * might physically precede it, though.
1634  */
/* NOTE(review): the "if" condition (listing line 1635) and the reader positioning calls (1639, 1645) are missing. */
1636  {
1637  /* back up to find the record */
1638  replayTLI = RedoStartTLI;
1640  record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1641  }
1642  else
1643  {
1644  /* just have to read next record after CheckPoint */
1646  replayTLI = CheckPointTLI;
1647  record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1648  }
1649 
1650  if (record != NULL)
1651  {
1652  TimestampTz xtime;
1653  PGRUsage ru0;
1654 
1655  pg_rusage_init(&ru0);
1656 
1657  InRedo = true;
1658 
1659  RmgrStartup();
1660 
1661  ereport(LOG,
1662  (errmsg("redo starts at %X/%X",
1664 
1665  /* Prepare to report progress of the redo phase. */
1666  if (!StandbyMode)
1668 
1669  /*
1670  * main redo apply loop
1671  */
1672  do
1673  {
1674  if (!StandbyMode)
1675  ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
1677 
1678 #ifdef WAL_DEBUG
1679  if (XLOG_DEBUG ||
1680  (record->xl_rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
1681  (record->xl_rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
1682  {
1684 
1685  initStringInfo(&buf);
1686  appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
1689  xlog_outrec(&buf, xlogreader);
1690  appendStringInfoString(&buf, " - ");
1692  elog(LOG, "%s", buf.data);
1693  pfree(buf.data);
1694  }
1695 #endif
1696 
1697  /* Handle interrupt signals of startup process */
1699 
1700  /*
1701  * Pause WAL replay, if requested by a hot-standby session via
1702  * SetRecoveryPause().
1703  *
1704  * Note that we intentionally don't take the info_lck spinlock
1705  * here. We might therefore read a slightly stale value of the
1706  * recoveryPause flag, but it can't be very stale (no worse than
1707  * the last spinlock we did acquire). Since a pause request is a
1708  * pretty asynchronous thing anyway, possibly responding to it one
1709  * WAL record later than we otherwise would is a minor issue, so
1710  * it doesn't seem worth adding another spinlock cycle to prevent
1711  * that.
1712  */
1713  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1715  recoveryPausesHere(false);
1716 
1717  /*
1718  * Have we reached our recovery target?
1719  */
/* NOTE(review): the "if" condition that tests the stop-before target (listing line 1720) is missing. */
1721  {
1722  reachedRecoveryTarget = true;
1723  break;
1724  }
1725 
1726  /*
1727  * If we've been asked to lag the primary, wait on latch until
1728  * enough time has passed.
1729  */
/* NOTE(review): the "if" condition that performs the delay (listing line 1730) is missing. */
1731  {
1732  /*
1733  * We test for paused recovery again here. If user sets
1734  * delayed apply, it may be because they expect to pause
1735  * recovery in case of problems, so we must test again here
1736  * otherwise pausing during the delay-wait wouldn't work.
1737  */
1738  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1740  recoveryPausesHere(false);
1741  }
1742 
1743  /*
1744  * Apply the record
1745  */
1746  ApplyWalRecord(xlogreader, record, &replayTLI);
1747 
1748  /* Exit loop if we reached inclusive recovery target */
/* NOTE(review): the "if" condition that tests the stop-after target (listing line 1749) is missing. */
1750  {
1751  reachedRecoveryTarget = true;
1752  break;
1753  }
1754 
1755  /* Else, try to fetch the next WAL record */
1756  record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1757  } while (record != NULL);
1758 
1759  /*
1760  * end of main redo apply loop
1761  */
1762 
1763  if (reachedRecoveryTarget)
1764  {
1765  if (!reachedConsistency)
1766  ereport(FATAL,
1767  (errmsg("requested recovery stop point is before consistent recovery point")));
1768 
1769  /*
1770  * This is the last point where we can restart recovery with a new
1771  * recovery target, if we shutdown and begin again. After this,
1772  * Resource Managers may choose to do permanent corrective actions
1773  * at end of recovery.
1774  */
1775  switch (recoveryTargetAction)
1776  {
/* NOTE(review): the three case labels of this switch (listing lines 1777, 1785, 1791) are missing; the comments suggest shutdown, pause and promote, in that order -- confirm. */
1778 
1779  /*
1780  * exit with special return code to request shutdown of
1781  * postmaster. Log messages issued from postmaster.
1782  */
1783  proc_exit(3);
1784 
1786  SetRecoveryPause(true);
1787  recoveryPausesHere(true);
1788 
1789  /* drop into promote */
1790 
1792  break;
1793  }
1794  }
1795 
1796  RmgrCleanup();
1797 
1798  ereport(LOG,
1799  (errmsg("redo done at %X/%X system usage: %s",
1801  pg_rusage_show(&ru0))));
1802  xtime = GetLatestXTime();
1803  if (xtime)
1804  ereport(LOG,
1805  (errmsg("last completed transaction was at log time %s",
1806  timestamptz_to_str(xtime))));
1807 
1808  InRedo = false;
1809  }
1810  else
1811  {
1812  /* there are no WAL records following the checkpoint */
1813  ereport(LOG,
1814  (errmsg("redo is not required")));
1815  }
1816 
1817  /*
1818  * This check is intentionally after the above log messages that indicate
1819  * how far recovery went.
1820  */
/* NOTE(review): the first two lines of this condition (listing lines 1821-1822) are missing. */
1823  !reachedRecoveryTarget)
1824  ereport(FATAL,
1825  (errmsg("recovery ended before configured recovery target was reached")));
1826 }
1827 
1828 /*
1829  * Subroutine of PerformWalRecovery, to apply one WAL record.
 *
 * Outline of what is visible below: install an error-context callback for
 * the duration of the redo, detect a timeline switch carried by a shutdown
 * checkpoint or end-of-recovery record and update *replayTLI before the
 * record is replayed, publish the replay-end position, run
 * xlogrecovery_redo() for RM_XLOG_ID records followed by the resource
 * manager's rm_redo routine, publish the last-replayed position, and
 * finally do the follow-up work for a WAL receiver reply request and for a
 * timeline switch.
 *
 * NOTE(review): this listing dropped several source lines (visible as gaps
 * in the embedded line numbers), including the line carrying the function
 * name and parameter list, so the block is not compilable as shown. Restore
 * the missing lines from the upstream file before changing the code.
1830  */
1831 static void
1833 {
1834  ErrorContextCallback errcallback;
1835  bool switchedTLI = false;
1836 
1837  /* Setup error traceback support for ereport() */
1838  errcallback.callback = rm_redo_error_callback;
1839  errcallback.arg = (void *) xlogreader;
1840  errcallback.previous = error_context_stack;
1841  error_context_stack = &errcallback;
1842 
1843  /*
1844  * ShmemVariableCache->nextXid must be beyond record's xid.
1845  */
/* NOTE(review): the call that advances nextXid (listing line 1846) is missing. */
1847 
1848  /*
1849  * Before replaying this record, check if this record causes the current
1850  * timeline to change. The record is already considered to be part of the
1851  * new timeline, so we update replayTLI before replaying it. That's
1852  * important so that replayEndTLI, which is recorded as the minimum
1853  * recovery point's TLI if recovery stops after this record, is set
1854  * correctly.
1855  */
1856  if (record->xl_rmid == RM_XLOG_ID)
1857  {
1858  TimeLineID newReplayTLI = *replayTLI;
1859  TimeLineID prevReplayTLI = *replayTLI;
1860  uint8 info = record->xl_info & ~XLR_INFO_MASK;
1861 
1862  if (info == XLOG_CHECKPOINT_SHUTDOWN)
1863  {
1864  CheckPoint checkPoint;
1865 
1866  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1867  newReplayTLI = checkPoint.ThisTimeLineID;
1868  prevReplayTLI = checkPoint.PrevTimeLineID;
1869  }
1870  else if (info == XLOG_END_OF_RECOVERY)
1871  {
1872  xl_end_of_recovery xlrec;
1873 
1874  memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1875  newReplayTLI = xlrec.ThisTimeLineID;
1876  prevReplayTLI = xlrec.PrevTimeLineID;
1877  }
1878 
1879  if (newReplayTLI != *replayTLI)
1880  {
1881  /* Check that it's OK to switch to this TLI */
/* NOTE(review): the first line of the checking call (listing line 1882) is missing; only its trailing arguments survive. */
1883  newReplayTLI, prevReplayTLI, *replayTLI);
1884 
1885  /* Following WAL records should be run with new TLI */
1886  *replayTLI = newReplayTLI;
1887  switchedTLI = true;
1888  }
1889  }
1890 
1891  /*
1892  * Update shared replayEndRecPtr before replaying this record, so that
1893  * XLogFlush will update minRecoveryPoint correctly.
1894  */
/* NOTE(review): listing lines 1895, 1896 and 1898 are missing; presumably the lock acquire/release and the replayEndRecPtr store -- confirm. */
1897  XLogRecoveryCtl->replayEndTLI = *replayTLI;
1899 
1900  /*
1901  * If we are attempting to enter Hot Standby mode, process XIDs we see
1902  */
/* NOTE(review): the first half of this condition (listing line 1903) and the controlled statement (1905) are missing. */
1904  TransactionIdIsValid(record->xl_xid))
1906 
1907  /*
1908  * Some XLOG record types that are related to recovery are processed
1909  * directly here, rather than in xlog_redo()
1910  */
1911  if (record->xl_rmid == RM_XLOG_ID)
1912  xlogrecovery_redo(xlogreader, *replayTLI);
1913 
1914  /* Now apply the WAL record itself */
1915  GetRmgr(record->xl_rmid).rm_redo(xlogreader);
1916 
1917  /*
1918  * After redo, check whether the backup pages associated with the WAL
1919  * record are consistent with the existing pages. This check is done only
1920  * if consistency check is enabled for this record.
1921  */
1922  if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
/* NOTE(review): the consistency-check call controlled by this "if" (listing line 1923) is missing. */
1924 
1925  /* Pop the error context stack */
1926  error_context_stack = errcallback.previous;
1927 
1928  /*
1929  * Update lastReplayedEndRecPtr after this record has been successfully
1930  * replayed.
1931  */
/* NOTE(review): listing lines 1932-1934 and 1936 are missing; presumably the lock acquire/release and the LSN stores -- confirm. */
1935  XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
1937 
1938  /*
1939  * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
1940  * receiver so that it notices the updated lastReplayedEndRecPtr and sends
1941  * a reply to the primary.
1942  */
/* NOTE(review): the "if" condition (listing line 1943) is missing; the body suggests it tests doRequestWalReceiverReply -- confirm. */
1944  {
1945  doRequestWalReceiverReply = false;
1946  WalRcvForceReply();
1947  }
1948 
1949  /* Allow read-only connections if we're consistent now */
1951 
1952  /* Is this a timeline switch? */
1953  if (switchedTLI)
1954  {
1955  /*
1956  * Before we continue on the new timeline, clean up any (possibly
1957  * bogus) future WAL segments on the old timeline.
1958  */
1960 
1961  /*
1962  * Wake up any walsenders to notice that we are on a new timeline.
1963  */
/* NOTE(review): the condition guarding this wakeup (listing line 1964) is missing. */
1965  WalSndWakeup();
1966 
1967  /* Reset the prefetcher. */
1969  }
1970 }
1971 
1972 /*
1973  * Some XLOG RM record types that are directly related to WAL recovery are
1974  * handled here rather than in xlog_redo().
 *
 * Two record types are handled: XLOG_OVERWRITE_CONTRECORD, whose payload is
 * checked against the reader's overwrittenRecPtr (mismatch is FATAL), and
 * XLOG_BACKUP_END, which sets backupEndPoint to the record's end LSN when
 * its payload equals backupStartPoint. Other record types are ignored.
 *
 * NOTE(review): this listing dropped several source lines (visible as gaps
 * in the embedded line numbers), including the line carrying the function
 * name and parameter list, so the block is not compilable as shown. Restore
 * the missing lines from the upstream file before changing the code.
1975  */
1976 static void
1978 {
1979  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
1980  XLogRecPtr lsn = record->EndRecPtr;
1981 
1982  Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
1983 
1984  if (info == XLOG_OVERWRITE_CONTRECORD)
1985  {
1986  /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
/* NOTE(review): the declaration of xlrec (listing line 1987) is missing. */
1988 
1989  memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
1990  if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
1991  elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
/* NOTE(review): the arguments of this elog (listing lines 1992-1993) are missing. */
1994 
1995  /* We have safely skipped the aborted record */
1998 
1999  ereport(LOG,
2000  (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
/* NOTE(review): the arguments of this errmsg (listing lines 2001-2002) are missing. */
2003 
2004  /* Verifying the record should only happen once */
2006  }
2007  else if (info == XLOG_BACKUP_END)
2008  {
2009  XLogRecPtr startpoint;
2010 
2011  memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2012 
2013  if (backupStartPoint == startpoint)
2014  {
2015  /*
2016  * We have reached the end of base backup, the point where
2017  * pg_backup_stop() was done. The data on disk is now consistent
2018  * (assuming we have also reached minRecoveryPoint). Set
2019  * backupEndPoint to the current LSN, so that the next call to
2020  * CheckRecoveryConsistency() will notice it and do the
2021  * end-of-backup processing.
2022  */
2023  elog(DEBUG1, "end of backup record reached");
2024 
2025  backupEndPoint = lsn;
2026  }
2027  else
2028  elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X",
/* NOTE(review): the arguments of this elog (listing line 2029) are missing. */
2030  }
2031 }
2032 
2033 /*
2034  * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2035  * directories.
2036  *
2037  * Replay of database creation XLOG records for databases that were later
2038  * dropped can create fake directories in pg_tblspc. By the time consistency
2039  * is reached these directories should have been removed; here we verify
2040  * that this did indeed happen. This is to be called at the point where
2041  * consistent state is reached.
2042  *
2043  * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2044  * useful for testing purposes, and also allows for an escape hatch in case
2045  * things go south.
 *
 * Only entries whose names consist solely of decimal digits are examined;
 * each such entry that is not a symbolic link is reported.
 *
 * NOTE(review): this listing dropped several source lines (visible as gaps
 * in the embedded line numbers), including the line carrying the function
 * name and the opening of the ereport call that selects PANIC or WARNING,
 * so the block is not compilable as shown and the error level described
 * above cannot be checked against the visible code. Restore the missing
 * lines from the upstream file before changing the code.
2046  */
2047 static void
2049 {
2050  DIR *dir;
2051  struct dirent *de;
2052 
2053  dir = AllocateDir("pg_tblspc");
2054  while ((de = ReadDir(dir, "pg_tblspc")) != NULL)
2055  {
2056  char path[MAXPGPATH + 10];
2057 
2058  /* Skip entries of non-oid names */
2059  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2060  continue;
2061 
2062  snprintf(path, sizeof(path), "pg_tblspc/%s", de->d_name);
2063 
2064  if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
/* NOTE(review): the ereport opener with its error level and error code (listing lines 2065-2066) is missing. */
2067  errmsg("unexpected directory entry \"%s\" found in %s",
2068  de->d_name, "pg_tblspc/"),
2069  errdetail("All directory entries in pg_tblspc/ should be symbolic links."),
2070  errhint("Remove those directories, or set allow_in_place_tablespaces to ON transiently to let recovery complete.")));
2071  }
2072 }
2073 
2074 /*
2075  * Checks if recovery has reached a consistent state. When consistency is
2076  * reached and we have a valid starting standby snapshot, tell postmaster
2077  * that it can start accepting read-only connections.
 *
 * Three stages are visible below: end-of-backup detection (calls
 * ReachedEndOfBackup() and clears backupEndRequired), the consistency test
 * against minRecoveryPoint (sets reachedConsistency and logs the LSN), and
 * the hot-standby activation step (sets LocalHotStandbyActive).
 *
 * NOTE(review): this listing dropped several source lines (visible as gaps
 * in the embedded line numbers), including the line carrying the function
 * name and the leading part of every stage's condition, so the block is not
 * compilable as shown. Restore the missing lines from the upstream file
 * before changing the code.
2078  */
2079 static void
2081 {
2082  XLogRecPtr lastReplayedEndRecPtr;
2083  TimeLineID lastReplayedTLI;
2084 
2085  /*
2086  * During crash recovery, we don't reach a consistent state until we've
2087  * replayed all the WAL.
2088  */
/* NOTE(review): the condition guarding this early return (listing line 2089) is missing. */
2090  return;
2091 
2093 
2094  /*
2095  * assume that we are called in the startup process, and hence don't need
2096  * a lock to read lastReplayedEndRecPtr
2097  */
2098  lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2099  lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2100 
2101  /*
2102  * Have we reached the point where our base backup was completed?
2103  */
/* NOTE(review): the first line of this condition (listing line 2104) is missing. */
2105  backupEndPoint <= lastReplayedEndRecPtr)
2106  {
2107  elog(DEBUG1, "end of backup reached");
2108 
2109  /*
2110  * We have reached the end of base backup, as indicated by pg_control.
2111  * Update the control file accordingly.
2112  */
2113  ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
/* NOTE(review): listing lines 2114-2115 are missing; presumably they reset the backup start/end points -- confirm. */
2116  backupEndRequired = false;
2117  }
2118 
2119  /*
2120  * Have we passed our safe starting point? Note that minRecoveryPoint is
2121  * known to be incorrectly set if recovering from a backup, until the
2122  * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2123  * All we know prior to that is that we're not consistent yet.
2124  */
/* NOTE(review): the first line of this condition (listing line 2125) is missing. */
2126  minRecoveryPoint <= lastReplayedEndRecPtr)
2127  {
2128  /*
2129  * Check to see if the XLOG sequence contained any unresolved
2130  * references to uninitialized pages.
2131  */
2133 
2134  /*
2135  * Check that pg_tblspc doesn't contain any real directories. Replay
2136  * of Database/CREATE_* records may have created fictitious tablespace
2137  * directories that should have been removed by the time consistency
2138  * was reached.
2139  */
2141 
2142  reachedConsistency = true;
2143  ereport(LOG,
2144  (errmsg("consistent recovery state reached at %X/%X",
2145  LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
2146  }
2147 
2148  /*
2149  * Have we got a valid starting snapshot that will allow queries to be
2150  * run? If so, we can tell postmaster that the database is consistent now,
2151  * enabling connections.
2152  */
/* NOTE(review): the whole condition (listing lines 2153-2156) and the shared-state update and postmaster signal (2158-2160, 2164) are missing. */
2157  {
2161 
2162  LocalHotStandbyActive = true;
2163 
2165  }
2166 }
2167 
2168 /*
2169  * Error context callback for errors occurring during rm_redo().
      *
      * 'arg' is cast to the XLogReaderState of the record. The callback builds
      * a description of that record with xlog_outdesc(), appends its block
      * references with xlog_block_info(), and attaches the result as a
      * "WAL redo at <LSN> for <description>" context line, where the LSN is the
      * record's ReadRecPtr.
2170  */
2171 static void
2173 {
2174  XLogReaderState *record = (XLogReaderState *) arg;
2176 
2177  initStringInfo(&buf);
2178  xlog_outdesc(&buf, record);
2179  xlog_block_info(&buf, record);
2180 
2181  /* translator: %s is a WAL record description */
2182  errcontext("WAL redo at %X/%X for %s",
2183  LSN_FORMAT_ARGS(record->ReadRecPtr),
2184  buf.data);
2185 
      /* Release the temporary description buffer. */
2186  pfree(buf.data);
2187 }
2188 
2189 /*
2190  * Appends to 'buf' a description of an XLogRecord. Nothing is returned;
2191  * the text is accumulated in 'buf'.
      *
      * The output includes a '/' separator, then the record's identity as
      * reported by the resource manager's rm_identify callback (or
      * "UNKNOWN (<info bits in hex>)" if rm_identify returns NULL), then a
      * colon and a space, and finally whatever further description the
      * resource manager's rm_desc callback appends.
2192  */
2193 void
2195 {
2196  RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2197  uint8 info = XLogRecGetInfo(record);
2198  const char *id;
2199 
2201  appendStringInfoChar(buf, '/');
2202 
2203  id = rmgr.rm_identify(info);
2204  if (id == NULL)
      /* Unrecognized record type: show the info bits outside XLR_INFO_MASK. */
2205  appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2206  else
2207  appendStringInfo(buf, "%s: ", id);
2208 
2209  rmgr.rm_desc(buf, record);
2210 }
2211 
2212 #ifdef WAL_DEBUG
2213 
/*
 * Appends to 'buf' a summary of an XLogRecord: a "prev <LSN>; xid <xid>"
 * part, the record's data length ("; len <n>"), and then its block
 * references via xlog_block_info(). Only compiled when WAL_DEBUG is defined.
 */
2214 static void
2215 xlog_outrec(StringInfo buf, XLogReaderState *record)
2216 {
2217  appendStringInfo(buf, "prev %X/%X; xid %u",
2219  XLogRecGetXid(record));
2220 
2221  appendStringInfo(buf, "; len %u",
2222  XLogRecGetDataLen(record));
2223 
2224  xlog_block_info(buf, record);
2225 }
2226 #endif /* WAL_DEBUG */
2227 
2228 /*
2229  * Appends to 'buf' information about all the blocks referenced by an
2230  * XLogRecord. Nothing is returned; the text is accumulated in 'buf'.
      *
      * For every block id up to XLogRecMaxBlockId() that has a block tag, this
      * adds the relation identifier (spcOid/dbOid/relNumber), the fork number
      * when it is not MAIN_FORKNUM, and the block number, followed by " FPW"
      * when XLogRecHasBlockImage() reports an image for that block.
2231  */
2232 static void
2234 {
2235  int block_id;
2236 
2237  /* decode block references */
2238  for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2239  {
2240  RelFileLocator rlocator;
2241  ForkNumber forknum;
2242  BlockNumber blk;
2243 
      /* Skip block ids for which the record has no block tag. */
2244  if (!XLogRecGetBlockTagExtended(record, block_id,
2245  &rlocator, &forknum, &blk, NULL))
2246  continue;
2247 
      /* The fork number is only printed for non-main forks. */
2248  if (forknum != MAIN_FORKNUM)
2249  appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2250  block_id,
2251  rlocator.spcOid, rlocator.dbOid,
2252  rlocator.relNumber,
2253  forknum,
2254  blk);
2255  else
2256  appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2257  block_id,
2258  rlocator.spcOid, rlocator.dbOid,
2259  rlocator.relNumber,
2260  blk);
2261  if (XLogRecHasBlockImage(record, block_id))
2262  appendStringInfoString(buf, " FPW");
2263  }
2264 }
2265 
2266 
2267 /*
2268  * Check that it's OK to switch to new timeline during recovery.
2269  *
2270  * 'lsn' is the address of the shutdown checkpoint record we're about to
2271  * replay. (Currently, timeline can only change at a shutdown checkpoint).
      *
      * 'newTLI' is the timeline ID found in the checkpoint record, 'prevTLI' is
      * the previous timeline ID according to that record, and 'replayTLI' is
      * the timeline currently being replayed.
      *
      * Every failed check is reported at PANIC level; if all checks pass, the
      * function simply returns.
2272  */
2273 static void
2275  TimeLineID replayTLI)
2276 {
2277  /* Check that the record agrees on what the current (old) timeline is */
2278  if (prevTLI != replayTLI)
2279  ereport(PANIC,
2280  (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2281  prevTLI, replayTLI)));
2282 
2283  /*
2284  * The new timeline had better be in the list of timelines we expect to
2285  * see, according to the timeline history. It should also not decrease.
2286  */
2287  if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2288  ereport(PANIC,
2289  (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2290  newTLI, replayTLI)));
2291 
2292  /*
2293  * If we have not yet reached min recovery point, and we're about to
2294  * switch to a timeline greater than the timeline of the min recovery
2295  * point: trouble. After switching to the new timeline, we could not
2296  * possibly visit the min recovery point on the correct timeline anymore.
2297  * This can happen if there is a newer timeline in the archive that
2298  * branched before the timeline the min recovery point is on, and you
2299  * attempt to do PITR to the new timeline.
2300  */
2302  lsn < minRecoveryPoint &&
2303  newTLI > minRecoveryPointTLI)
2304  ereport(PANIC,
2305  (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
2306  newTLI,
2309 
2310  /* Looks good */
2311 }
2312 
2313 
2314 /*
2315  * Extract timestamp from WAL record.
2316  *
2317  * If the record contains a timestamp, returns true, and saves the timestamp
2318  * in *recordXtime. If the record type has no timestamp, returns false and
2319  * leaves *recordXtime untouched. Currently, only transaction commit/abort
2320  * records and restore points contain timestamps.
2321  */
2322 static bool
2324 {
      /* Record-type bits of the info byte, with the XLR_INFO_MASK bits cleared. */
2325  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2326  uint8 xact_info = info & XLOG_XACT_OPMASK;
2327  uint8 rmid = XLogRecGetRmid(record);
2328 
      /* Restore point records carry their time in rp_time. */
2329  if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2330  {
2331  *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2332  return true;
2333  }
      /* Commit records, including commits of prepared transactions. */
2334  if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2335  xact_info == XLOG_XACT_COMMIT_PREPARED))
2336  {
2337  *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2338  return true;
2339  }
      /* Abort records, including aborts of prepared transactions. */
2340  if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2341  xact_info == XLOG_XACT_ABORT_PREPARED))
2342  {
2343  *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2344  return true;
2345  }
      /* Any other record type carries no timestamp. */
2346  return false;
2347 }
2348 
2349 /*
2350  * Checks whether the current buffer page and backup page stored in the
2351  * WAL record are consistent or not. Before comparing the two pages, a
2352  * masking can be applied to the pages to ignore certain areas like hint bits,
2353  * unused space between pd_lower and pd_upper among other things. This
2354  * function should be called once WAL replay has been completed for a
2355  * given record.
      *
      * Block references whose image was already applied during replay are
      * skipped, as are pages for which no valid buffer is obtained and pages
      * whose LSN is already past the end of this record. A failure to restore
      * the block image from the record raises ERROR; a mismatch between the
      * (masked) images is reported with elog(FATAL).
2356  */
2357 static void
2359 {
2360  RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2361  RelFileLocator rlocator;
2362  ForkNumber forknum;
2363  BlockNumber blkno;
2364  int block_id;
2365 
2366  /* Records with no backup blocks have no need for consistency checks. */
2367  if (!XLogRecHasAnyBlockRefs(record))
2368  return;
2369 
      /* The record is expected to be flagged for consistency checking. */
2370  Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
2371 
2372  for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2373  {
2374  Buffer buf;
2375  Page page;
2376 
2377  if (!XLogRecGetBlockTagExtended(record, block_id,
2378  &rlocator, &forknum, &blkno, NULL))
2379  {
2380  /*
2381  * WAL record doesn't contain a block reference with the given id.
2382  * Do nothing.
2383  */
2384  continue;
2385  }
2386 
      /* Every block reference of such a record is expected to carry an image. */
2387  Assert(XLogRecHasBlockImage(record, block_id));
2388 
2389  if (XLogRecBlockImageApply(record, block_id))
2390  {
2391  /*
2392  * WAL record has already applied the page, so bypass the
2393  * consistency check as that would result in comparing the full
2394  * page stored in the record with itself.
2395  */
2396  continue;
2397  }
2398 
2399  /*
2400  * Read the contents from the current buffer and store it in a
2401  * temporary page.
2402  */
2403  buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2405  InvalidBuffer);
2406  if (!BufferIsValid(buf))
2407  continue;
2408 
2410  page = BufferGetPage(buf);
2411 
2412  /*
2413  * Take a copy of the local page where WAL has been applied to have a
2414  * comparison base before masking it...
2415  */
2416  memcpy(replay_image_masked, page, BLCKSZ);
2417 
2418  /* No need for this page anymore now that a copy has been taken. */
2420 
2421  /*
2422  * If the block LSN is already ahead of this WAL record, we can't
2423  * expect contents to match. This can happen if recovery is
2424  * restarted.
2425  */
2426  if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
2427  continue;
2428 
2429  /*
2430  * Read the contents from the backup copy, stored in WAL record and
2431  * store it in a temporary page. There is no need to allocate a new
2432  * page here, a local buffer is fine to hold its contents and a mask
2433  * can be directly applied on it.
2434  */
2435  if (!RestoreBlockImage(record, block_id, primary_image_masked))
2436  ereport(ERROR,
2437  (errcode(ERRCODE_INTERNAL_ERROR),
2438  errmsg_internal("%s", record->errormsg_buf)));
2439 
2440  /*
2441  * If a masking function is defined, mask both the primary and replay
2442  * images.
2443  */
2444  if (rmgr.rm_mask != NULL)
2445  {
2446  rmgr.rm_mask(replay_image_masked, blkno);
2447  rmgr.rm_mask(primary_image_masked, blkno);
2448  }
2449 
2450  /* Time to compare the primary and replay images. */
2451  if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2452  {
2453  elog(FATAL,
2454  "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2455  rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2456  forknum, blkno);
2457  }
2458  }
2459 }
2460 
2461 /*
2462  * For point-in-time recovery, this function decides whether we want to
2463  * stop applying the XLOG before the current record.
2464  *
2465  * Returns true if we are stopping, false otherwise. If stopping, some
2466  * information is saved in recoveryStopXid et al for use in annotating the
2467  * new timeline's history file. Whenever this function decides to stop,
      * recoveryStopAfter is set to false.
2468  */
2469 static bool
2471 {
2472  bool stopsHere = false;
2473  uint8 xact_info;
2474  bool isCommit;
2475  TimestampTz recordXtime = 0;
2476  TransactionId recordXid;
2477 
2478  /*
2479  * Ignore recovery target settings when not in archive recovery (meaning
2480  * we are in crash recovery).
2481  */
2483  return false;
2484 
2485  /* Check if we should stop as soon as reaching consistency */
2487  {
2488  ereport(LOG,
2489  (errmsg("recovery stopping after reaching consistency")));
2490 
2491  recoveryStopAfter = false;
2494  recoveryStopTime = 0;
2495  recoveryStopName[0] = '\0';
2496  return true;
2497  }
2498 
2499  /* Check if target LSN has been reached */
2502  record->ReadRecPtr >= recoveryTargetLSN)
2503  {
2504  recoveryStopAfter = false;
2506  recoveryStopLSN = record->ReadRecPtr;
2507  recoveryStopTime = 0;
2508  recoveryStopName[0] = '\0';
2509  ereport(LOG,
2510  (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
2512  return true;
2513  }
2514 
2515  /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2516  if (XLogRecGetRmid(record) != RM_XACT_ID)
2517  return false;
2518 
2519  xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2520 
      /*
       * Classify the record as commit or abort and work out the xid it ends.
       * For the *_PREPARED variants the xid is taken from the parsed record's
       * twophase_xid rather than from the record header.
       */
2521  if (xact_info == XLOG_XACT_COMMIT)
2522  {
2523  isCommit = true;
2524  recordXid = XLogRecGetXid(record);
2525  }
2526  else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2527  {
2528  xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2529  xl_xact_parsed_commit parsed;
2530 
2531  isCommit = true;
2533  xlrec,
2534  &parsed);
2535  recordXid = parsed.twophase_xid;
2536  }
2537  else if (xact_info == XLOG_XACT_ABORT)
2538  {
2539  isCommit = false;
2540  recordXid = XLogRecGetXid(record);
2541  }
2542  else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2543  {
2544  xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2545  xl_xact_parsed_abort parsed;
2546 
2547  isCommit = false;
2549  xlrec,
2550  &parsed);
2551  recordXid = parsed.twophase_xid;
2552  }
2553  else
2554  return false;
2555 
2557  {
2558  /*
2559  * There can be only one transaction end record with this exact
2560  * transactionid.
2561  *
2562  * When testing for an xid, we MUST test for equality only, since
2563  * transactions are numbered in the order they start, not the order
2564  * they complete. A higher-numbered xid will complete before the
2565  * target xid about 50% of the time...
2566  */
2567  stopsHere = (recordXid == recoveryTargetXid);
2568  }
2569 
2570  /*
2571  * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2572  * We don't expect getRecordTimestamp ever to fail, since we already know
2573  * this is a commit or abort record; but test its result anyway.
2574  */
2575  if (getRecordTimestamp(record, &recordXtime) &&
2577  {
2578  /*
2579  * There can be many transactions that share the same commit time, so
2580  * we stop after the last one, if we are inclusive, or stop at the
2581  * first one if we are exclusive.
2582  */
2584  stopsHere = (recordXtime > recoveryTargetTime);
2585  else
2586  stopsHere = (recordXtime >= recoveryTargetTime);
2587  }
2588 
      /* Save the details of the stop point and log why we are stopping. */
2589  if (stopsHere)
2590  {
2591  recoveryStopAfter = false;
2592  recoveryStopXid = recordXid;
2593  recoveryStopTime = recordXtime;
2595  recoveryStopName[0] = '\0';
2596 
2597  if (isCommit)
2598  {
2599  ereport(LOG,
2600  (errmsg("recovery stopping before commit of transaction %u, time %s",
2603  }
2604  else
2605  {
2606  ereport(LOG,
2607  (errmsg("recovery stopping before abort of transaction %u, time %s",
2610  }
2611  }
2612 
2613  return stopsHere;
2614 }
2615 
2616 /*
2617  * Same as recoveryStopsBefore, but called after applying the record.
2618  *
2619  * We also track the timestamp of the latest applied COMMIT/ABORT
2620  * record in XLogRecoveryCtl->recoveryLastXTime.
      *
      * Returns true if recovery should stop after this record, false otherwise.
      * Whenever this function decides to stop, recoveryStopAfter is set to true.
2621  */
2622 static bool
2624 {
2625  uint8 info;
2626  uint8 xact_info;
2627  uint8 rmid;
2628  TimestampTz recordXtime;
2629 
2630  /*
2631  * Ignore recovery target settings when not in archive recovery (meaning
2632  * we are in crash recovery).
2633  */
2635  return false;
2636 
2637  info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2638  rmid = XLogRecGetRmid(record);
2639 
2640  /*
2641  * There can be many restore points that share the same name; we stop at
2642  * the first one.
2643  */
2645  rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2646  {
2647  xl_restore_point *recordRestorePointData;
2648 
2649  recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2650 
2651  if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2652  {
2653  recoveryStopAfter = true;
2656  (void) getRecordTimestamp(record, &recoveryStopTime);
2657  strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2658 
2659  ereport(LOG,
2660  (errmsg("recovery stopping at restore point \"%s\", time %s",
2663  return true;
2664  }
2665  }
2666 
2667  /* Check if the target LSN has been reached */
2670  record->ReadRecPtr >= recoveryTargetLSN)
2671  {
2672  recoveryStopAfter = true;
2674  recoveryStopLSN = record->ReadRecPtr;
2675  recoveryStopTime = 0;
2676  recoveryStopName[0] = '\0';
2677  ereport(LOG,
2678  (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
2680  return true;
2681  }
2682 
      /* The remaining target checks only apply to transaction records. */
2683  if (rmid != RM_XACT_ID)
2684  return false;
2685 
2686  xact_info = info & XLOG_XACT_OPMASK;
2687 
2688  if (xact_info == XLOG_XACT_COMMIT ||
2689  xact_info == XLOG_XACT_COMMIT_PREPARED ||
2690  xact_info == XLOG_XACT_ABORT ||
2691  xact_info == XLOG_XACT_ABORT_PREPARED)
2692  {
2693  TransactionId recordXid;
2694 
2695  /*
       * Update the last applied transaction timestamp.
       *
       * NOTE(review): recordXtime has no initializer, so it is only defined
       * when getRecordTimestamp() returns true. getRecordTimestamp() does
       * return true for all four record types tested above, so the later
       * read of recordXtime looks safe, but recoveryStopsBefore() initializes
       * its copy to 0 - consider doing the same here.
       */
2696  if (getRecordTimestamp(record, &recordXtime))
2697  SetLatestXTime(recordXtime);
2698 
2699  /* Extract the XID of the committed/aborted transaction */
2700  if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2701  {
2702  xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2703  xl_xact_parsed_commit parsed;
2704 
2706  xlrec,
2707  &parsed);
2708  recordXid = parsed.twophase_xid;
2709  }
2710  else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2711  {
2712  xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2713  xl_xact_parsed_abort parsed;
2714 
2716  xlrec,
2717  &parsed);
2718  recordXid = parsed.twophase_xid;
2719  }
2720  else
2721  recordXid = XLogRecGetXid(record);
2722 
2723  /*
2724  * There can be only one transaction end record with this exact
2725  * transactionid.
2726  *
2727  * When testing for an xid, we MUST test for equality only, since
2728  * transactions are numbered in the order they start, not the order
2729  * they complete. A higher-numbered xid will complete before the
2730  * target xid about 50% of the time...
2731  */
2733  recordXid == recoveryTargetXid)
2734  {
2735  recoveryStopAfter = true;
2736  recoveryStopXid = recordXid;
2737  recoveryStopTime = recordXtime;
2739  recoveryStopName[0] = '\0';
2740 
2741  if (xact_info == XLOG_XACT_COMMIT ||
2742  xact_info == XLOG_XACT_COMMIT_PREPARED)
2743  {
2744  ereport(LOG,
2745  (errmsg("recovery stopping after commit of transaction %u, time %s",
2748  }
2749  else if (xact_info == XLOG_XACT_ABORT ||
2750  xact_info == XLOG_XACT_ABORT_PREPARED)
2751  {
2752  ereport(LOG,
2753  (errmsg("recovery stopping after abort of transaction %u, time %s",
2756  }
2757  return true;
2758  }
2759  }
2760 
2761  /* Check if we should stop as soon as reaching consistency */
2763  {
2764  ereport(LOG,
2765  (errmsg("recovery stopping after reaching consistency")));
2766 
2767  recoveryStopAfter = true;
2769  recoveryStopTime = 0;
2771  recoveryStopName[0] = '\0';
2772  return true;
2773  }
2774 
2775  return false;
2776 }
2777 
2778 /*
2779  * Create a comment for the history file to explain why and where the
2780  * timeline changed.
      *
      * The text is formatted into a local 200-byte buffer and a pstrdup'd copy
      * of it is returned.
      *
      * NOTE(review): the two format strings "%s %s\n" and "%s LSN %X/%X\n" end
      * with a newline, while the transaction, restore point, consistency and
      * "no recovery target" variants do not - confirm whether that difference
      * is intended.
2781  */
2782 static char *
2784 {
2785  char reason[200];
2786 
2788  snprintf(reason, sizeof(reason),
2789  "%s transaction %u",
2790  recoveryStopAfter ? "after" : "before",
2791  recoveryStopXid);
2793  snprintf(reason, sizeof(reason),
2794  "%s %s\n",
2795  recoveryStopAfter ? "after" : "before",
2797  else if (recoveryTarget == RECOVERY_TARGET_LSN)
2798  snprintf(reason, sizeof(reason),
2799  "%s LSN %X/%X\n",
2800  recoveryStopAfter ? "after" : "before",
2803  snprintf(reason, sizeof(reason),
2804  "at restore point \"%s\"",
2807  snprintf(reason, sizeof(reason), "reached consistency");
2808  else
2809  snprintf(reason, sizeof(reason), "no recovery target specified");
2810 
2811  return pstrdup(reason);
2812 }
2813 
2814 /*
2815  * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2816  *
2817  * endOfRecovery is true if the recovery target is reached and
2818  * the paused state starts at the end of recovery because of
2819  * recovery_target_action=pause, and false otherwise.
      *
      * Returns without waiting if hot standby is not active. The wait also
      * ends early if CheckForStandbyTrigger() reports a promotion request.
2820  */
2821 static void
2822 recoveryPausesHere(bool endOfRecovery)
2823 {
2824  /* Don't pause unless users can connect! */
2825  if (!LocalHotStandbyActive)
2826  return;
2827 
2828  /* Don't pause after standby promotion has been triggered */
2830  return;
2831 
      /* The hint differs: resuming at end of recovery promotes the server. */
2832  if (endOfRecovery)
2833  ereport(LOG,
2834  (errmsg("pausing at the end of recovery"),
2835  errhint("Execute pg_wal_replay_resume() to promote.")));
2836  else
2837  ereport(LOG,
2838  (errmsg("recovery has paused"),
2839  errhint("Execute pg_wal_replay_resume() to continue.")));
2840 
2841  /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2843  {
2845  if (CheckForStandbyTrigger())
2846  return;
2847 
2848  /*
2849  * If a recovery pause is requested then set it paused. While we are
2850  * in the loop, the user might resume and pause again, so set this
       * every time.
2851  */
2853 
2854  /*
2855  * We wait on a condition variable that will wake us as soon as the
2856  * pause ends, but we use a timeout so we can check the above exit
2857  * condition periodically too.
2858  */
2861  }
2863 }
2864 
2865 /*
2866  * When recovery_min_apply_delay is set, we wait long enough to make sure
2867  * certain record types are applied at least that interval behind the primary.
2868  *
2869  * Returns true if the wait loop was entered, false if one of the early
      * exits was taken (no delay configured, consistency not yet reached,
      * crash recovery, not a commit record, no timestamp, or the delay has
      * already elapsed).
      *
      * NOTE(review): the loop can be left on its first pass, either because
      * CheckForStandbyTrigger() returns true or because the recomputed delay
      * is no longer positive; true is still returned in that case.
2870  *
2871  * Note that the delay is calculated between the WAL record log time and
2872  * the current time on standby. We would prefer to keep track of when this
2873  * standby received each WAL record, which would allow a more consistent
2874  * approach and one not affected by time synchronisation issues, but that
2875  * is significantly more effort and complexity for little actual gain in
2876  * usability.
2877  */
2878 static bool
2880 {
2881  uint8 xact_info;
2882  TimestampTz xtime;
2883  TimestampTz delayUntil;
2884  long msecs;
2885 
2886  /* nothing to do if no delay configured */
2887  if (recovery_min_apply_delay <= 0)
2888  return false;
2889 
2890  /* no delay is applied on a database not yet consistent */
2891  if (!reachedConsistency)
2892  return false;
2893 
2894  /* nothing to do if crash recovery is requested */
2896  return false;
2897 
2898  /*
2899  * Is it a COMMIT record?
2900  *
2901  * We deliberately choose not to delay aborts since they have no effect on
2902  * MVCC. We already allow replay of records that don't have a timestamp,
2903  * so there is already opportunity for issues caused by early conflicts on
2904  * standbys.
2905  */
2906  if (XLogRecGetRmid(record) != RM_XACT_ID)
2907  return false;
2908 
2909  xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2910 
2911  if (xact_info != XLOG_XACT_COMMIT &&
2912  xact_info != XLOG_XACT_COMMIT_PREPARED)
2913  return false;
2914 
      /* The commit time is the reference point for the delay. */
2915  if (!getRecordTimestamp(record, &xtime))
2916  return false;
2917 
2919 
2920  /*
2921  * Exit without arming the latch if it's already past time to apply this
2922  * record
2923  */
2925  if (msecs <= 0)
2926  return false;
2927 
2928  while (true)
2929  {
2931 
2932  /* This might change recovery_min_apply_delay. */
2934 
      /* Stop delaying once promotion has been triggered. */
2935  if (CheckForStandbyTrigger())
2936  break;
2937 
2938  /*
2939  * Recalculate delayUntil as recovery_min_apply_delay could have
2940  * changed while waiting in this loop.
2941  */
2943 
2944  /*
2945  * Wait for difference between GetCurrentTimestamp() and delayUntil.
2946  */
2948  delayUntil);
2949 
2950  if (msecs <= 0)
2951  break;
2952 
2953  elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
2954 
2957  msecs,
2959  }
2960  return true;
2961 }
2962 
2963 /*
2964  * Get the current state of the recovery pause request.
      *
      * The state is returned by value from the local variable 'state'.
      * NOTE(review): how 'state' is obtained is not visible in this listing;
      * presumably it is a copy of the shared recoveryPauseState - confirm.
2965  */
2968 {
2970 
2974 
2975  return state;
2976 }
2977 
2978 /*
2979  * Set the recovery pause state.
2980  *
2981  * If a recovery pause is requested (recoveryPause is true), sets the
2982  * recovery pause state to 'pause requested' if it is not already 'paused'.
2983  * Otherwise, sets it to 'not paused' to resume the recovery. A requested
2984  * pause is later confirmed by ConfirmRecoveryPaused().
2985  */
2986 void
2987 SetRecoveryPause(bool recoveryPause)
2988 {
2990 
      /* Resume case: the state is reset here ... */
2991  if (!recoveryPause)
2995 
2997 
      /* ... and a follow-up action is taken for the resume case only. */
2998  if (!recoveryPause)
3000 }
3001 
3002 /*
3003  * Confirm the recovery pause by setting the recovery pause state to
3004  * RECOVERY_PAUSED.
      *
      * The state is only changed if a pause has actually been requested.
3005  */
3006 static void
3008 {
3009  /* If a recovery pause has been requested, mark it as paused */
3014 }
3015 
3016 
3017 /*
3018  * Attempt to read the next XLOG record.
3019  *
3020  * Before first call, the reader needs to be positioned to the first record
3021  * by calling XLogPrefetcherBeginRead().
3022  *
3023  * If no valid record is available, returns NULL, or fails if emode is PANIC.
3024  * (emode must be either PANIC or LOG.) In standby mode, retries until a
3025  * valid record is available.
      *
      * 'fetching_ckpt', 'emode' and 'replayTLI' are passed through to
      * XLogPageRead() via the reader's private data.
3026  */
3027 static XLogRecord *
3029  bool fetching_ckpt, TimeLineID replayTLI)
3030 {
3031  XLogRecord *record;
3034 
3035  /* Pass through parameters to XLogPageRead */
3036  private->fetching_ckpt = fetching_ckpt;
3037  private->emode = emode;
3038  private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
3039  private->replayTLI = replayTLI;
3040 
3041  /* This is the first attempt to read this page. */
3042  lastSourceFailed = false;
3043 
3044  for (;;)
3045  {
3046  char *errormsg;
3047 
3048  record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3049  if (record == NULL)
3050  {
3051  /*
3052  * When we find that WAL ends in an incomplete record, keep track
3053  * of that record. After recovery is done, we'll write a record to
3054  * indicate to downstream WAL readers that that portion is to be
3055  * ignored.
3056  *
3057  * However, when ArchiveRecoveryRequested = true, we're going to
3058  * switch to a new timeline at the end of recovery. We will only
3059  * copy WAL over to the new timeline up to the end of the last
3060  * complete record, so if we did this, we would later create an
3061  * overwrite contrecord in the wrong place, breaking everything.
3062  */
3063  if (!ArchiveRecoveryRequested &&
3065  {
3068  }
3069 
      /* Close the currently open WAL segment file, if any. */
3070  if (readFile >= 0)
3071  {
3072  close(readFile);
3073  readFile = -1;
3074  }
3075 
3076  /*
3077  * We only end up here without a message when XLogPageRead()
3078  * failed - in that case we already logged something. In
3079  * StandbyMode that only happens if we have been triggered, so we
3080  * shouldn't loop anymore in that case.
3081  */
3082  if (errormsg)
3084  (errmsg_internal("%s", errormsg) /* already translated */ ));
3085  }
3086 
3087  /*
3088  * Check that the page TLI is one of the expected values.
3089  */
3091  {
3092  char fname[MAXFNAMELEN];
3093  XLogSegNo segno;
3094  int32 offset;
      /* NOTE(review): offset is declared int32 but is printed with %u below. */
3095 
3099  XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3102  (errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%X, offset %u",
3104  fname,
3106  offset)));
      /* Treat the record as invalid so the failure handling below runs. */
3107  record = NULL;
3108  }
3109 
3110  if (record)
3111  {
3112  /* Great, got a record */
3113  return record;
3114  }
3115  else
3116  {
3117  /* No valid record available from this source */
3118  lastSourceFailed = true;
3119 
3120  /*
3121  * If archive recovery was requested, but we were still doing
3122  * crash recovery, switch to archive recovery and retry using the
3123  * offline archive. We have now replayed all the valid WAL in
3124  * pg_wal, so we are presumably now consistent.
3125  *
3126  * We require that there's at least some valid WAL present in
3127  * pg_wal, however (!fetching_ckpt). We could recover using the
3128  * WAL from the archive, even if pg_wal is completely empty, but
3129  * we'd have no idea how far we'd have to replay to reach
3130  * consistency. So err on the safe side and give up.
3131  */
3133  !fetching_ckpt)
3134  {
3135  ereport(DEBUG1,
3136  (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3137  InArchiveRecovery = true;
3140 
3143  minRecoveryPointTLI = replayTLI;
3144 
3146 
3147  /*
3148  * Before we retry, reset lastSourceFailed and currentSource
3149  * so that we will check the archive next.
3150  */
3151  lastSourceFailed = false;
3153 
3154  continue;
3155  }
3156 
3157  /* In standby mode, loop back to retry. Otherwise, give up. */
3159  continue;
3160  else
3161  return NULL;
3162  }
3163  }
3164 }
3165 
3166 /*
3167  * Read the XLOG page containing targetPagePtr into readBuf (if not read
3168  * already). Returns number of bytes read, if the page is read successfully,
3169  * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3170  * but only if they have not been previously reported.
3171  *
3172  * See XLogReaderRoutine.page_read for more details.
3173  *
3174  * While prefetching, xlogreader->nonblocking may be set. In that case,
3175  * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3176  *
3177  * This is responsible for restoring files from archive as needed, as well
3178  * as for waiting for the requested WAL record to arrive in standby mode.
3179  *
3180  * xlogreader->private_data->emode specifies the log level used for reporting
3181  * "file not found" or "end of WAL" situations in archive recovery, or in
3182  * standby mode when promotion is triggered. If set to WARNING or below,
3183  * XLogPageRead() returns XLREAD_FAIL in those situations; on higher log
3184  * levels the ereport() won't return.
3185  *
3186  * In standby mode, if after a successful return of XLogPageRead() the
3187  * caller finds the record it's interested in to be broken, it should
3188  * ereport the error with the level determined by
3189  * emode_for_corrupt_record(), and then set lastSourceFailed
3190  * and call XLogPageRead() again with the same arguments. This lets
3191  * XLogPageRead() try fetching the record from another source, or
3192  * sleep and retry.
3193  */
3194 static int
3196  XLogRecPtr targetRecPtr, char *readBuf)
3197 {
3198  XLogPageReadPrivate *private =
3200  int emode = private->emode;
3201  uint32 targetPageOff;
3202  XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
3203  int r;
3204 
3205  XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3206  targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3207 
3208  /*
3209  * See if we need to switch to a new segment because the requested record
3210  * is not in the currently open one.
3211  */
3212  if (readFile >= 0 &&
3213  !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3214  {
3215  /*
3216  * Request a restartpoint if we've replayed too much xlog since the
3217  * last one.
3218  */
3220  {
3222  {
3223  (void) GetRedoRecPtr();
3226  }
3227  }
3228 
3229  close(readFile);
3230  readFile = -1;
3232  }
3233 
3234  XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3235 
3236 retry:
3237  /* See if we need to retrieve more data */
3238  if (readFile < 0 ||
3240  flushedUpto < targetPagePtr + reqLen))
3241  {
3242  if (readFile >= 0 &&
3245  flushedUpto < targetPagePtr + reqLen)
3246  return XLREAD_WOULDBLOCK;
3247 
3248  switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3249  private->randAccess,
3250  private->fetching_ckpt,
3251  targetRecPtr,
3252  private->replayTLI,
3255  {
3256  case XLREAD_WOULDBLOCK:
3257  return XLREAD_WOULDBLOCK;
3258  case XLREAD_FAIL:
3259  if (readFile >= 0)
3260  close(readFile);
3261  readFile = -1;
3262  readLen = 0;
3264  return XLREAD_FAIL;
3265  case XLREAD_SUCCESS:
3266  break;
3267  }
3268  }
3269 
3270  /*
3271  * At this point, we have the right segment open and if we're streaming we
3272  * know the requested record is in it.
3273  */
3274  Assert(readFile != -1);
3275 
3276  /*
3277  * If the current segment is being streamed from the primary, calculate
3278  * how much of the current page we have received already. We know the
3279  * requested record has been received, but this is for the benefit of
3280  * future calls, to allow quick exit at the top of this function.
3281  */
3283  {
3284  if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3285  readLen = XLOG_BLCKSZ;
3286  else
3288  targetPageOff;
3289  }
3290  else
3291  readLen = XLOG_BLCKSZ;
3292 
3293  /* Read the requested page */
3294  readOff = targetPageOff;
3295 
3297  r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
3298  if (r != XLOG_BLCKSZ)
3299  {
3300  char fname[MAXFNAMELEN];
3301  int save_errno = errno;
3302 
3305  if (r < 0)
3306  {
3307  errno = save_errno;
3308  ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3310  errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: %m",
3311  fname, LSN_FORMAT_ARGS(targetPagePtr),
3312  readOff)));
3313  }
3314  else
3315  ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3317  errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: read %d of %zu",
3318  fname, LSN_FORMAT_ARGS(targetPagePtr),
3319  readOff, r, (Size) XLOG_BLCKSZ)));
3320  goto next_record_is_invalid;
3321  }
3323 
3324  Assert(targetSegNo == readSegNo);
3325  Assert(targetPageOff == readOff);
3326  Assert(reqLen <= readLen);
3327 
3329 
3330  /*
3331  * Check the page header immediately, so that we can retry immediately if
3332  * it's not valid. This may seem unnecessary, because ReadPageInternal()
3333  * validates the page header anyway, and would propagate the failure up to
3334  * ReadRecord(), which would retry. However, there's a corner case with
3335  * continuation records, if a record is split across two pages such that
3336  * we would need to read the two pages from different sources. For
3337  * example, imagine a scenario where a streaming replica is started up,
3338  * and replay reaches a record that's split across two WAL segments. The
3339  * first page is only available locally, in pg_wal, because it's already
3340  * been recycled on the primary. The second page, however, is not present
3341  * in pg_wal, and we should stream it from the primary. There is a
3342  * recycled WAL segment present in pg_wal, with garbage contents, however.
3343  * We would read the first page from the local WAL segment, but when
3344  * reading the second page, we would read the bogus, recycled, WAL
3345  * segment. If we didn't catch that case here, we would never recover,
3346  * because ReadRecord() would retry reading the whole record from the
3347  * beginning.
3348  *
3349  * Of course, this only catches errors in the page header, which is what
3350  * happens in the case of a recycled WAL segment. Other kinds of errors or
3351  * corruption still have the same problem. But this at least fixes the
3352  * common case, which can happen as part of normal operation.
3353  *
3354  * Validating the page header is cheap enough that doing it twice
3355  * shouldn't be a big deal from a performance point of view.
3356  *
3357  * When not in standby mode, an invalid page header should cause recovery
3358  * to end, not retry reading the page, so we don't need to validate the
3359  * page header here for the retry. Instead, ReadPageInternal() is
3360  * responsible for the validation.
3361  */
3362  if (StandbyMode &&
3363  !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3364  {
3365  /*
3366  * Emit this error right now then retry this page immediately. Use
3367  * errmsg_internal() because the message was already translated.
3368  */
3369  if (xlogreader->errormsg_buf[0])
3372 
3373  /* reset any error XLogReaderValidatePageHeader() might have set */
3375  goto next_record_is_invalid;
3376  }
3377 
3378  return readLen;
3379 
3380 next_record_is_invalid:
3381 
3382  /*
3383  * If we're reading ahead, give up fast. Retries and error reporting will
3384  * be handled by a later read when recovery catches up to this point.
3385  */
3386  if (xlogreader->nonblocking)
3387  return XLREAD_WOULDBLOCK;
3388 
3389  lastSourceFailed = true;
3390 
3391  if (readFile >= 0)
3392  close(readFile);
3393  readFile = -1;
3394  readLen = 0;
3396 
3397  /* In standby-mode, keep trying */
3398  if (StandbyMode)
3399  goto retry;
3400  else
3401  return XLREAD_FAIL;
3402 }
3403 
3404 /*
3405  * Open the WAL segment containing WAL location 'RecPtr'.
3406  *
3407  * The segment can be fetched via restore_command, or via walreceiver having
3408  * streamed the record, or it can already be present in pg_wal. Checking
3409  * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3410  * too, in case someone copies a new segment directly to pg_wal. That is not
3411  * documented or recommended, though.
3412  *
3413  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3414  * prepare to read WAL starting from RedoStartLSN after this.
3415  *
3416  * 'RecPtr' might not point to the beginning of the record we're interested
3417  * in, it might also point to the page or segment header. In that case,
3418  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3419  * used to decide which timeline to stream the requested WAL from.
3420  *
3421  * 'replayLSN' is the current replay LSN, so that if we scan for new
3422  * timelines, we can reject a switch to a timeline that branched off before
3423  * this point.
3424  *
3425  * If the record is not immediately available, the function returns
3426  * XLREAD_FAIL if we're not in standby mode. In standby mode, waits for it
3427  * to become available.
3428  *
3429  * When the requested record becomes available, the function opens the file
3430  * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3431  * of standby mode is triggered by the user, and there is no more WAL
3432  * available, returns XLREAD_FAIL.
3433  *
3434  * If nonblocking is true, then give up immediately if we can't satisfy the
3435  * request, returning XLREAD_WOULDBLOCK instead of waiting.
3436  */
3437 static XLogPageReadResult
3438 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
3439  bool fetching_ckpt, XLogRecPtr tliRecPtr,
3440  TimeLineID replayTLI, XLogRecPtr replayLSN,
3441  bool nonblocking)
3442 {
  /*
   * last_fail_time is static: it is written after every source has been
   * exhausted and read again on the next pass, so the retry throttle in
   * the XLOG_FROM_STREAM failure branch also works across calls.
   */
3443  static TimestampTz last_fail_time = 0;
3444  TimestampTz now;
  /* set after WalRcvForceReply(), so the reply is sent at most once per call */
3445  bool streaming_reply_sent = false;
3446 
3447  /*-------
3448  * Standby mode is implemented by a state machine:
3449  *
3450  * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3451  * pg_wal (XLOG_FROM_PG_WAL)
3452  * 2. Check for promotion trigger request
3453  * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3454  * 4. Rescan timelines
3455  * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3456  *
3457  * Failure to read from the current source advances the state machine to
3458  * the next state.
3459  *
3460  * 'currentSource' indicates the current state. There are no currentSource
3461  * values for "check trigger", "rescan timelines", and "sleep" states,
3462  * those actions are taken when reading from the previous source fails, as
3463  * part of advancing to the next state.
3464  *
3465  * If standby mode is turned off while reading WAL from stream, we move
3466  * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3467  * the files (which would be required at end of recovery, e.g., timeline
3468  * history file) from archive or pg_wal. We don't need to kill WAL receiver
3469  * here because it's already stopped when standby mode is turned off at
3470  * the end of recovery.
3471  *-------
3472  */
  /*
   * NOTE(review): this listing skips some embedded line numbers inside this
   * function (for example 3474, 3476 and 3479 just below), so several
   * conditions and statements are incomplete as shown. Restore them from the
   * original source before relying on the surviving lines.
   */
3473  if (!InArchiveRecovery)
3475  else if (currentSource == XLOG_FROM_ANY ||
3477  {
3478  lastSourceFailed = false;
3480  }
3481 
3482  for (;;)
3483  {
  /* compared with currentSource further down to report a source switch */
3484  XLogSource oldSource = currentSource;
3485  bool startWalReceiver = false;
3486 
3487  /*
3488  * First check if we failed to read from the current source, and
3489  * advance the state machine if so. The failure to read might've
3490  * happened outside this function, e.g when a CRC check fails on a
3491  * record, or within this loop.
3492  */
3493  if (lastSourceFailed)
3494  {
3495  /*
3496  * Don't allow any retry loops to occur during nonblocking
3497  * readahead. Let the caller process everything that has been
3498  * decoded already first.
3499  */
3500  if (nonblocking)
3501  return XLREAD_WOULDBLOCK;
3502 
3503  switch (currentSource)
3504  {
3505  case XLOG_FROM_ARCHIVE:
3506  case XLOG_FROM_PG_WAL:
3507 
3508  /*
3509  * Check to see if promotion is requested. Note that we do
3510  * this only after failure, so when you promote, we still
3511  * finish replaying as much as we can from archive and
3512  * pg_wal before failover.
3513  */
3515  {
3517  return XLREAD_FAIL;
3518  }
3519 
3520  /*
3521  * Not in standby mode, and we've now tried the archive
3522  * and pg_wal.
3523  */
3524  if (!StandbyMode)
3525  return XLREAD_FAIL;
3526 
3527  /*
3528  * Move to XLOG_FROM_STREAM state, and set to start a
3529  * walreceiver if necessary.
3530  */
3532  startWalReceiver = true;
3533  break;
3534 
3535  case XLOG_FROM_STREAM:
3536 
3537  /*
3538  * Failure while streaming. Most likely, we got here
3539  * because streaming replication was terminated, or
3540  * promotion was triggered. But we also get here if we
3541  * find an invalid record in the WAL streamed from the
3542  * primary, in which case something is seriously wrong.
3543  * There's little chance that the problem will just go
3544  * away, but PANIC is not good for availability either,
3545  * especially in hot standby mode. So, we treat that the
3546  * same as disconnection, and retry from archive/pg_wal
3547  * again. The WAL in the archive should be identical to
3548  * what was streamed, so it's unlikely that it helps, but
3549  * one can hope...
3550  */
3551 
3552  /*
3553  * We should be able to move to XLOG_FROM_STREAM only in
3554  * standby mode.
3555  */
3557 
3558  /*
3559  * Before we leave XLOG_FROM_STREAM state, make sure that
3560  * walreceiver is not active, so that it won't overwrite
3561  * WAL that we restore from archive.
3562  */
3564 
3565  /*
3566  * Before we sleep, re-scan for possible new timelines if
3567  * we were requested to recover to the latest timeline.
3568  */
3570  {
3571  if (rescanLatestTimeLine(replayTLI, replayLSN))
3572  {
3574  break;
3575  }
3576  }
3577 
3578  /*
3579  * XLOG_FROM_STREAM is the last state in our state
3580  * machine, so we've exhausted all the options for
3581  * obtaining the requested WAL. We're going to loop back
3582  * and retry from the archive, but if it hasn't been long
3583  * since last attempt, sleep wal_retrieve_retry_interval
3584  * milliseconds to avoid busy-waiting.
3585  */
3587  if (!TimestampDifferenceExceeds(last_fail_time, now,
3589  {
3590  long wait_time;
3591 
  /* sleep only for the part of the interval that has not yet elapsed */
3592  wait_time = wal_retrieve_retry_interval -
3593  TimestampDifferenceMilliseconds(last_fail_time, now);
3594 
3595  elog(LOG, "waiting for WAL to become available at %X/%X",
3596  LSN_FORMAT_ARGS(RecPtr));
3597 
3598  /* Do background tasks that might benefit us later. */
3600 
3604  wait_time,
3608 
3609  /* Handle interrupt signals of startup process */
3611  }
  /* remember when all sources were last exhausted, for the throttle above */
3612  last_fail_time = now;
3614  break;
3615 
3616  default:
3617  elog(ERROR, "unexpected WAL source %d", currentSource);
3618  }
3619  }
3620  else if (currentSource == XLOG_FROM_PG_WAL)
3621  {
3622  /*
3623  * We just successfully read a file in pg_wal. We prefer files in
3624  * the archive over ones in pg_wal, so try the next file again
3625  * from the archive first.
3626  */
3627  if (InArchiveRecovery)
3629  }
3630 
3631  if (currentSource != oldSource)
3632  elog(DEBUG2, "switched WAL source from %s to %s after %s",
3634  lastSourceFailed ? "failure" : "success");
3635 
3636  /*
3637  * We've now handled possible failure. Try to read from the chosen
3638  * source.
3639  */
3640  lastSourceFailed = false;
3641 
3642  switch (currentSource)
3643  {
3644  case XLOG_FROM_ARCHIVE:
3645  case XLOG_FROM_PG_WAL:
3646 
3647  /*
3648  * WAL receiver must not be running when reading WAL from
3649  * archive or pg_wal.
3650  */
3651  Assert(!WalRcvStreaming());
3652 
3653  /* Close any old file we might have open. */
3654  if (readFile >= 0)
3655  {
3656  close(readFile);
3657  readFile = -1;
3658  }
3659  /* Reset curFileTLI if random fetch. */
3660  if (randAccess)
3661  curFileTLI = 0;
3662 
3663  /*
3664  * Try to restore the file from archive, or read an existing
3665  * file from pg_wal.
3666  */
3669  currentSource);
3670  if (readFile >= 0)
3671  return XLREAD_SUCCESS; /* success! */
3672 
3673  /*
3674  * Nope, not found in archive or pg_wal.
3675  */
3676  lastSourceFailed = true;
3677  break;
3678 
3679  case XLOG_FROM_STREAM:
3680  {
3681  bool havedata;
3682 
3683  /*
3684  * We should be able to move to XLOG_FROM_STREAM only in
3685  * standby mode.
3686  */
3688 
3689  /*
3690  * First, shutdown walreceiver if its restart has been
3691  * requested -- but no point if we're already slated for
3692  * starting it.
3693  */
3694  if (pendingWalRcvRestart && !startWalReceiver)
3695  {
3697 
3698  /*
3699  * Re-scan for possible new timelines if we were
3700  * requested to recover to the latest timeline.
3701  */
3704  rescanLatestTimeLine(replayTLI, replayLSN);
3705 
3706  startWalReceiver = true;
3707  }
  /* the restart request is cleared whether or not it was acted on above */
3708  pendingWalRcvRestart = false;
3709 
3710  /*
3711  * Launch walreceiver if needed.
3712  *
3713  * If fetching_ckpt is true, RecPtr points to the initial
3714  * checkpoint location. In that case, we use RedoStartLSN
3715  * as the streaming start position instead of RecPtr, so
3716  * that when we later jump backwards to start redo at
3717  * RedoStartLSN, we will have the logs streamed already.
3718  */
3719  if (startWalReceiver &&
3720  PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
3721  {
3722  XLogRecPtr ptr;
3723  TimeLineID tli;
3724 
3725  if (fetching_ckpt)
3726  {
3727  ptr = RedoStartLSN;
3728  tli = RedoStartTLI;
3729  }
3730  else
3731  {
3732  ptr = RecPtr;
3733 
3734  /*
3735  * Use the record begin position to determine the
3736  * TLI, rather than the position we're reading.
3737  */
3738  tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
3739 
3740  if (curFileTLI > 0 && tli < curFileTLI)
3741  elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3742  LSN_FORMAT_ARGS(tliRecPtr),
3743  tli, curFileTLI);
3744  }
3745  curFileTLI = tli;
  /*
   * Reset the cached flush position; presumably so that it is re-read
   * via GetWalRcvFlushRecPtr() below -- confirm against the original.
   */
3750  flushedUpto = 0;
3751  }
3752 
3753  /*
3754  * Check if WAL receiver is active or wait to start up.
3755  */
3756  if (!WalRcvStreaming())
3757  {
3758  lastSourceFailed = true;
3759  break;
3760  }
3761 
3762  /*
3763  * Walreceiver is active, so see if new data has arrived.
3764  *
3765  * We only advance XLogReceiptTime when we obtain fresh
3766  * WAL from walreceiver and observe that we had already
3767  * processed everything before the most recent "chunk"
3768  * that it flushed to disk. In steady state where we are
3769  * keeping up with the incoming data, XLogReceiptTime will
3770  * be updated on each cycle. When we are behind,
3771  * XLogReceiptTime will not advance, so the grace time
3772  * allotted to conflicting queries will decrease.
3773  */
  /* fast path: the cached flush position already covers RecPtr */
3774  if (RecPtr < flushedUpto)
3775  havedata = true;
3776  else
3777  {
3778  XLogRecPtr latestChunkStart;
3779 
3780  flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
  /* flushed data only counts if it was received on the timeline being read */
3781  if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
3782  {
3783  havedata = true;
3784  if (latestChunkStart <= RecPtr)
3785  {
3788  }
3789  }
3790  else
3791  havedata = false;
3792  }
3793  if (havedata)
3794  {
3795  /*
3796  * Great, streamed far enough. Open the file if it's
3797  * not open already. Also read the timeline history
3798  * file if we haven't initialized timeline history
3799  * yet; it should be streamed over and present in
3800  * pg_wal by now. Use XLOG_FROM_STREAM so that source
3801  * info is set correctly and XLogReceiptTime isn't
3802  * changed.
3803  *
3804  * NB: We must set readTimeLineHistory based on
3805  * recoveryTargetTLI, not receiveTLI. Normally they'll
3806  * be the same, but if recovery_target_timeline is
3807  * 'latest' and archiving is configured, then it's
3808  * possible that we managed to retrieve one or more
3809  * new timeline history files from the archive,
3810  * updating recoveryTargetTLI.
3811  */
3812  if (readFile < 0)
3813  {
3814  if (!expectedTLEs)
3817  receiveTLI,
3818  XLOG_FROM_STREAM, false);
3819  Assert(readFile >= 0);
3820  }
3821  else
3822  {
3823  /* just make sure source info is correct... */
3826  return XLREAD_SUCCESS;
3827  }
3828  break;
3829  }
3830 
3831  /* In nonblocking mode, return rather than sleeping. */
3832  if (nonblocking)
3833  return XLREAD_WOULDBLOCK;
3834 
3835  /*
3836  * Data not here yet. Check for trigger, then wait for
3837  * walreceiver to wake us up when new WAL arrives.
3838  */
3839  if (CheckForStandbyTrigger())
3840  {
3841  /*
3842  * Note that we don't return XLREAD_FAIL immediately
3843  * here. After being triggered, we still want to
3844  * replay all the WAL that was already streamed. It's
3845  * in pg_wal now, so we just treat this as a failure,
3846  * and the state machine will move on to replay the
3847  * streamed WAL from pg_wal, and then recheck the
3848  * trigger and exit replay.
3849  */
3850  lastSourceFailed = true;
3851  break;
3852  }
3853 
3854  /*
3855  * Since we have replayed everything we have received so
3856  * far and are about to start waiting for more WAL, let's
3857  * tell the upstream server our replay location now so
3858  * that pg_stat_replication doesn't show stale
3859  * information.
3860  */
3861  if (!streaming_reply_sent)
3862  {
3863  WalRcvForceReply();
3864  streaming_reply_sent = true;
3865  }
3866 
3867  /* Do any background tasks that might benefit us later. */
3869 
3870  /* Update pg_stat_recovery_prefetch before sleeping. */
3872 
3873  /*
3874  * Wait for more WAL to arrive, when we will be woken
3875  * immediately by the WAL receiver.
3876  */
3879  -1L,
3882  break;
3883  }
3884 
3885  default:
3886  elog(ERROR, "unexpected WAL source %d", currentSource);
3887  }
3888 
3889  /*
3890  * Check for recovery pause here so that we can confirm more quickly
3891  * that a requested pause has actually taken effect.
3892  */
3893  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
3895  recoveryPausesHere(false);
3896 
3897  /*
3898  * This possibly-long loop needs to handle interrupts of startup
3899  * process.
3900  */
3902  }
3903 
3904  return XLREAD_FAIL; /* not reached */
3905 }
3906 
3907 
3908 /*
3909  * Determine what log level should be used to report a corrupt WAL record
3910  * in the current WAL page, previously read by XLogPageRead().
3911  *
3912  * 'emode' is the error mode that would be used to report a file-not-found
3913  * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
3914  * we're retrying the exact same record that we've tried previously, only
3915  * complain the first time to keep the noise down. However, we only do so when
3916  * reading from pg_wal, because we don't expect any invalid records in archive
3917  * or in records streamed from the primary. Files in the archive should be complete,
3918  * and we should never hit the end of WAL because we stop and wait for more WAL
3919  * to arrive before replaying it.
3920  *
3921  * NOTE: This function remembers the RecPtr value it was last called with,
3922  * to suppress repeated messages about the same record. Only call this when
3923  * you are about to ereport(), or you might cause a later message to be
3924  * erroneously suppressed.
3925  */
3926 static int
3928 {
  /* location of the most recent complaint let through at LOG; static, so it persists across calls */
3929  static XLogRecPtr lastComplaint = 0;
3930 
  /*
   * Only LOG-level reports about WAL read from pg_wal are candidates for
   * demotion; any other source or level is returned unchanged.
   */
3931  if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
3932  {
  /* same location as the previous complaint: demote the repeat to DEBUG1 */
3933  if (RecPtr == lastComplaint)
3934  emode = DEBUG1;
3935  else
3936  lastComplaint = RecPtr;
3937  }
3938  return emode;
3939 }
3940 
3941 
3942 /*
3943  * Subroutine to try to fetch and validate a prior checkpoint record.
3944  */
3945 static XLogRecord *
3947  TimeLineID replayTLI)
3948 {
  /*
   * Every validation failure below is logged at LOG level and reported to
   * the caller by returning NULL; the record itself is returned only when
   * all checks pass.
   */
3949  XLogRecord *record;
3950  uint8 info;
3951 
3952  Assert(xlogreader != NULL);
3953 
  /* reject a pointer that is not a valid record offset before any read is attempted */
3954  if (!XRecOffIsValid(RecPtr))
3955  {
3956  ereport(LOG,
3957  (errmsg("invalid checkpoint location")));
3958  return NULL;
3959  }
3960 
  /* NOTE(review): embedded line 3961 is missing from this listing */
3962  record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
3963 
3964  if (record == NULL)
3965  {
3966  ereport(LOG,
3967  (errmsg("invalid checkpoint record")));
3968  return NULL;
3969  }
3970  if (record->xl_rmid != RM_XLOG_ID)
3971  {
3972  ereport(LOG,
3973  (errmsg("invalid resource manager ID in checkpoint record")));
3974  return NULL;
3975  }
  /* clear the XLR_INFO_MASK bits so only the remaining xl_info bits are compared */
3976  info = record->xl_info & ~XLR_INFO_MASK;
3977  if (info != XLOG_CHECKPOINT_SHUTDOWN &&
3978  info != XLOG_CHECKPOINT_ONLINE)
3979  {
3980  ereport(LOG,
3981  (errmsg("invalid xl_info in checkpoint record")));
3982  return NULL;
3983  }
  /* NOTE(review): the condition for this length check (embedded line 3984) is missing from this listing */
3985  {
3986  ereport(LOG,
3987  (errmsg("invalid length of checkpoint record")));
3988  return NULL;
3989  }
3990  return record;
3991 }
3992 
3993 /*
3994  * Scan for new timelines that might have appeared in the archive since we
3995  * started recovery.
3996  *
3997  * If there are any, the function changes recovery target TLI to the latest
3998  * one and returns 'true'.
3999  */
4000 static bool
4002 {
4003  List *newExpectedTLEs;
4004  bool found;
4005  ListCell *cell;
4006  TimeLineID newtarget;
  /* saved so the history files between the old and new target can be restored at the end */
4007  TimeLineID oldtarget = recoveryTargetTLI;
4008  TimeLineHistoryEntry *currentTle = NULL;
4009 
  /* NOTE(review): the line that assigns newtarget (embedded line 4010) is missing from this listing */
4011  if (newtarget == recoveryTargetTLI)
4012  {
4013  /* No new timelines found */
4014  return false;
4015  }
4016 
4017  /*
4018  * Determine the list of expected TLIs for the new TLI
4019  */
4020 
4021  newExpectedTLEs = readTimeLineHistory(newtarget);
4022 
4023  /*
4024  * If the current timeline is not part of the history of the new timeline,
4025  * we cannot proceed to it.
4026  */
4027  found = false;
4028  foreach(cell, newExpectedTLEs)
4029  {
4030  currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4031 
  /* on a match, currentTle stays on the matching entry for the fork-point check below */
4032  if (currentTle->tli == recoveryTargetTLI)
4033  {
4034  found = true;
4035  break;
4036  }
4037  }
  /*
   * NOTE(review): the search above matches on recoveryTargetTLI, while the
   * two messages below print replayTLI -- confirm the two are expected to
   * agree whenever these messages are emitted.
   */
4038  if (!found)
4039  {
4040  ereport(LOG,
4041  (errmsg("new timeline %u is not a child of database system timeline %u",
4042  newtarget,
4043  replayTLI)));
4044  return false;
4045  }
4046 
4047  /*
4048  * The current timeline was found in the history file, but check that the
4049  * next timeline was forked off from it *after* the current recovery
4050  * location.
4051  */
4052  if (currentTle->end < replayLSN)
4053  {
4054  ereport(LOG,
4055  (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4056  newtarget,
4057  replayTLI,
4058  LSN_FORMAT_ARGS(replayLSN))));
4059  return false;
4060  }
4061 
4062  /* The new timeline history seems valid. Switch target */
4063  recoveryTargetTLI = newtarget;
  /* NOTE(review): embedded line 4064 is missing from this listing */
4065  expectedTLEs = newExpectedTLEs;
4066 
4067  /*
4068  * As in StartupXLOG(), try to ensure we have all the history files
4069  * between the old target and new target in pg_wal.
4070  */
4071  restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4072 
4073  ereport(LOG,
4074  (errmsg("new target timeline is %u",
4075  recoveryTargetTLI)));
4076 
4077  return true;
4078 }
4079 
4080 
4081 /*
4082  * Open a logfile segment for reading (during recovery).
4083  *
4084  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4085  * Otherwise, it's assumed to be already available in pg_wal.
4086  */
4087 static int
4088 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
4089  XLogSource source, bool notfoundOk)
4090 {
  /*
   * Returns an open file descriptor on success. Returns -1 when the archive
   * restore fails, or when the open fails with ENOENT and notfoundOk is set.
   * Any other open failure is reported at PANIC.
   *
   * NOTE(review): emode is not referenced in the lines visible in this
   * listing; confirm against the original source whether it is used.
   */
4091  char xlogfname[MAXFNAMELEN];
4092  char activitymsg[MAXFNAMELEN + 16];
4093  char path[MAXPGPATH];
4094  int fd;
4095 
  /* segment file name for this timeline; used for ps display, archive restore and the pg_wal path */
4096  XLogFileName(xlogfname, tli, segno, wal_segment_size);
4097 
4098  switch (source)
4099  {
4100  case XLOG_FROM_ARCHIVE:
4101  /* Report recovery progress in PS display */
4102  snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4103  xlogfname);
4104  set_ps_display(activitymsg);
4105 
4106  if (!RestoreArchivedFile(path, xlogfname,
4107  "RECOVERYXLOG",
4109  InRedo))
4110  return -1;
4111  break;
4112 
4113  case XLOG_FROM_PG_WAL:
4114  case XLOG_FROM_STREAM:
4115  XLogFilePath(path, tli, segno, wal_segment_size);
4116  break;
4117 
4118  default:
4119  elog(ERROR, "invalid XLogFileRead source %d", source);
4120  }
4121 
4122  /*
4123  * If the segment was fetched from archival storage, replace the existing
4124  * xlog segment (if any) with the archival version.
4125  */
4126  if (source == XLOG_FROM_ARCHIVE)
4127  {
4129  KeepFileRestoredFromArchive(path, xlogfname);
4130 
4131  /*
4132  * Set path to point at the new file in pg_wal.
4133  */
4134  snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4135  }
4136 
4137  fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4138  if (fd >= 0)
4139  {
4140  /* Success! */
4141  curFileTLI = tli;
4142 
4143  /* Report recovery progress in PS display */
4144  snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4145  xlogfname);
4146  set_ps_display(activitymsg);
4147 
4148  /* Track source of data in assorted state variables */
4149  readSource = source;
4151  /* In FROM_STREAM case, caller tracks receipt time, not me */
4152  if (source != XLOG_FROM_STREAM)
4154 
4155  return fd;
4156  }
  /* open failed: only ENOENT with notfoundOk set falls through to the quiet -1 return */
4157  if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4158  ereport(PANIC,
4160  errmsg("could not open file \"%s\": %m", path)));
4161  return -1;
4162 }
4163 
4164 /*
4165  * Open a logfile segment for reading (during recovery).
4166  *
4167  * This version searches for the segment with any TLI listed in expectedTLEs.
4168  */
4169 static int
4171 {
  /*
   * Returns the descriptor from the first XLogFileRead() call that succeeds,
   * or -1 after reporting "could not open file" at level emode.
   */
4172  char path[MAXPGPATH];
4173  ListCell *cell;
4174  int fd;
4175  List *tles;
4176 
4177  /*
4178  * Loop looking for a suitable timeline ID: we might need to read any of
4179  * the timelines listed in expectedTLEs.
4180  *
4181  * We expect curFileTLI on entry to be the TLI of the preceding file in
4182  * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4183  * to go backwards; this prevents us from picking up the wrong file when a
4184  * parent timeline extends to higher segment numbers than the child we
4185  * want to read.
4186  *
4187  * If we haven't read the timeline history file yet, read it now, so that
4188  * we know which TLIs to scan. We don't save the list in expectedTLEs,
4189  * however, unless we actually find a valid segment. That way if there is
4190  * neither a timeline history file nor a WAL segment in the archive, and
4191  * streaming replication is set up, we'll read the timeline history file
4192  * streamed from the primary when we start streaming, instead of
4193  * recovering with a dummy history generated here.
4194  */
4195  if (expectedTLEs)
4196  tles = expectedTLEs;
4197  else
  /* NOTE(review): the else-branch assignment to tles (embedded line 4198) is missing from this listing */
4199 
4200  foreach(cell, tles)
4201  {
  /* NOTE(review): the declaration of hent (embedded line 4202) is missing from this listing */
4203  TimeLineID tli = hent->tli;
4204 
4205  if (tli < curFileTLI)
4206  break; /* don't bother looking at too-old TLIs */
4207 
4208  /*
4209  * Skip scanning the timeline ID that the logfile segment to read
4210  * doesn't belong to
4211  */
4212  if (hent->begin != InvalidXLogRecPtr)
4213  {
4214  XLogSegNo beginseg = 0;
4215 
4216  XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4217 
4218  /*
4219  * The logfile segment that doesn't belong to the timeline is
4220  * older or newer than the segment that the timeline started or
4221  * ended at, respectively. It's sufficient to check only the
4222  * starting segment of the timeline here. Since the timelines are
4223  * scanned in descending order in this loop, any segments newer
4224  * than the ending segment should belong to newer timeline and
4225  * have already been read before. So it's not necessary to check
4226  * the ending segment of the timeline here.
4227  */
4228  if (segno < beginseg)
4229  continue;
4230  }
4231 
  /* NOTE(review): the conditions guarding the two attempts below (embedded lines 4232 and 4245) are missing from this listing */
4233  {
4234  fd = XLogFileRead(segno, emode, tli,
4235  XLOG_FROM_ARCHIVE, true);
4236  if (fd != -1)
4237  {
4238  elog(DEBUG1, "got WAL segment from archive");
  /* first successful open: adopt the list just scanned as expectedTLEs */
4239  if (!expectedTLEs)
4240  expectedTLEs = tles;
4241  return fd;
4242  }
4243  }
4244 
4246  {
4247  fd = XLogFileRead(segno, emode, tli,
4248  XLOG_FROM_PG_WAL, true);
4249  if (fd != -1)
4250  {
4251  if (!expectedTLEs)
4252  expectedTLEs = tles;
4253  return fd;
4254  }
4255  }
4256  }
4257 
4258  /* Couldn't find it. For simplicity, complain about front timeline */
  /* errno is forced to ENOENT so that %m in the message below reflects "not found" */
4260  errno = ENOENT;
4261  ereport(emode,
4263  errmsg("could not open file \"%s\": %m", path)));
4264  return -1;
4265 }
4266 
4267 /*
4268  * Set flag to signal the walreceiver to restart. (The startup process calls
4269  * this on noticing a relevant configuration change.)
4270  */
4271 void
4273 {
  /* NOTE(review): the condition guarding this block (embedded line 4274) is missing from this listing */
4275  {
4276  ereport(LOG,
4277  (errmsg("WAL receiver process shutdown requested")));
4278 
  /*
   * Only the request is recorded here. The XLOG_FROM_STREAM read path in
   * this file tests pendingWalRcvRestart and then clears it.
   */
4279  pendingWalRcvRestart = true;
4280  }
4281 }
4282 
4283 
4284 /*
4285  * Has a standby promotion already been triggered?
4286  *
4287  * Unlike CheckForStandbyTrigger(), this works in any process
4288  * that's connected to shared memory.
4289  */
4290 bool
4292 {
4293  /*
4294  * We check shared state each time only until a standby promotion is
4295  * triggered. We can't trigger a promotion again, so there's no need to
4296  * keep checking after the shared variable has once been seen true.
4297  */
4299  return true;
4300 
4304 
4305  return LocalPromoteIsTriggered;
4306 }
4307 
4308 static void
4310 {
4314 
4315  /*
4316  * Mark the recovery pause state as 'not paused' because the paused state
4317  * ends and promotion continues if a promotion is triggered while recovery
4318  * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4319  * return 'paused' while a promotion is ongoing.
4320  */
4321  SetRecoveryPause(false);
4322 
4323  LocalPromoteIsTriggered = true;
4324 }
4325 
4326 /*
4327  * Check whether a promote request has arrived.
4328  */
4329 static bool
4331 {
4333  return true;
4334 
4336  {
4337  ereport(LOG, (errmsg("received promote request")));
4341  return true;
4342  }
4343 
4344  return false;
4345 }
4346 
4347 /*
4348  * Remove the files signaling a standby promotion request.
4349  */
4350 void
4352 {
  /* the result of unlink() is not checked, so an already-absent file is not reported */
4353  unlink(PROMOTE_SIGNAL_FILE);
4354 }
4355 
4356 /*
4357  * Check to see if a promote request has arrived.
4358  */
4359 bool
4361 {
  /* only existence matters: stat_buf is filled in but never examined */
4362  struct stat stat_buf;
4363 
4364  if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4365  return true;
4366 
  /* any stat() failure, whatever the errno, is treated as "no request" */
4367  return false;
4368 }
4369 
4370 /*
4371  * Wake up startup process to replay newly arrived WAL, or to notice that
4372  * failover has been requested.
4373  */
4374 void
4376 {
4378 }
4379 
4380 /*
4381  * Schedule a walreceiver wakeup in the main recovery loop.
4382  */
4383 void
4385 {
4387 }
4388 
4389 /*
4390  * Is HotStandby active yet? This is only important in special backends
4391  * since normal backends won't ever be able to connect until this returns
4392  * true. Postmaster knows this by way of signal, not via shared memory.
4393  *
4394  * Unlike testing standbyState, this works in any process that's connected to
4395  * shared memory. (And note that standbyState alone doesn't tell the truth
4396  * anyway.)
4397  */
4398 bool
4400 {
4401  /*
4402  * We check shared state each time only until Hot Standby is active. We
4403  * can't de-activate Hot Standby, so there's no need to keep checking
4404  * after the shared variable has once been seen true.
4405  */
4407  return true;
4408  else
4409  {
4410  /* spinlock is essential on machines with weak memory ordering! */
4414 
4415  return LocalHotStandbyActive;
4416  }
4417 }
4418 
4419 /*
4420  * Like HotStandbyActive(), but to be used only in WAL replay code,
4421  * where we don't need to ask any other process what the state is.
4422  */
4423 static bool
4425 {
4427  return LocalHotStandbyActive;
4428 }
4429 
4430 /*
4431  * Get latest redo apply position.
4432  *
4433  * Exported to allow WALReceiver to read the pointer directly.
4434  */
4435 XLogRecPtr
4437 {
4438  XLogRecPtr recptr;
4439  TimeLineID tli;
4440 
  /* NOTE(review): the lines that assign recptr and tli (embedded lines 4441-4444) are missing from this listing */
4445 
  /* replayTLI is an optional out-parameter: written only when the caller passed a non-NULL pointer */
4446  if (replayTLI)
4447  *replayTLI = tli;
4448  return recptr;
4449 }
4450 
4451 
4452 /*
4453  * Get position of last applied, or the record being applied.
4454  *
4455  * This is different from GetXLogReplayRecPtr() in that if a WAL
4456  * record is currently being applied, this includes that record.
4457  */
4458 XLogRecPtr
4460 {
4461  XLogRecPtr recptr;
4462  TimeLineID tli;
4463 
  /* NOTE(review): the line that assigns tli, and the lines around this read (embedded lines 4464, 4466-4467), are missing from this listing */
4465  recptr = XLogRecoveryCtl->replayEndRecPtr;
4468 
  /* replayEndTLI is an optional out-parameter: written only when the caller passed a non-NULL pointer */
4469  if (replayEndTLI)
4470  *replayEndTLI = tli;
4471  return recptr;
4472 }
4473 
4474 /*
4475  * Save timestamp of latest processed commit/abort record.
4476  *
4477  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4478  * seen by processes other than the startup process. Note in particular
4479  * that CreateRestartPoint is executed in the checkpointer.
4480  */
4481 static void
4483 {
4487 }
4488 
4489 /*
4490  * Fetch timestamp of latest processed commit/abort record.
4491  */
4494 {
4495  TimestampTz xtime;
4496 
4500 
4501  return xtime;
4502 }
4503 
4504 /*
4505  * Save timestamp of the next chunk of WAL records to apply.
4506  *
4507  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4508  * seen by all backends.
4509  */
4510 static void
4512 {
4516 }
4517 
4518 /*
4519  * Fetch the saved timestamp of the current chunk of WAL records to apply.
4520  * Startup process maintains an accurate local copy in XLogReceiptTime
4521  */
4524 {
4525  TimestampTz xtime;
4526 
4530 
4531  return xtime;
4532 }
4533 
4534 /*
4535  * Returns time of receipt of current chunk of XLOG data, as well as
4536  * whether it was received from streaming replication or from archives.
4537  */
4538 void
4539 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4540 {
4541  /*
4542  * This must be executed in the startup process, since we don't export the
4543  * relevant state to shared memory.
4544  */
4545  Assert(InRecovery);
4546 
4547  *rtime = XLogReceiptTime;
4548  *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4549 }
4550 
4551 /*
4552  * Note that text field supplied is a parameter name and does not require
4553  * translation
4554  */
4555 void
4556 RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4557 {
4558  if (currValue < minValue)
4559  {
4561  {
4562  bool warned_for_promote = false;
4563 
4564  ereport(WARNING,
4565  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4566  errmsg("hot standby is not possible because of insufficient parameter settings"),
4567  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4568  param_name,
4569  currValue,
4570  minValue)));
4571 
4572  SetRecoveryPause(true);
4573 
4574  ereport(LOG,
4575  (errmsg("recovery has paused"),
4576  errdetail("If recovery is unpaused, the server will shut down."),
4577  errhint("You can then restart the server after making the necessary configuration changes.")));
4578 
4580  {
4582 
4583  if (CheckForStandbyTrigger())
4584  {
4585  if (!warned_for_promote)
4586  ereport(WARNING,
4587  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4588  errmsg("promotion is not possible because of insufficient parameter settings"),
4589 
4590  /*
4591  * Repeat the detail from above so it's easy to find
4592  * in the log.
4593  */
4594  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4595  param_name,
4596  currValue,
4597  minValue),
4598  errhint("Restart the server after making the necessary configuration changes.")));
4599  warned_for_promote = true;
4600  }
4601 
4602  /*
4603  * If recovery pause is requested then set it paused. While
4604  * we are in the loop, user might resume and pause again so
4605  * set this every time.
4606  */
4608 
4609  /*
4610  * We wait on a condition variable that will wake us as soon
4611  * as the pause ends, but we use a timeout so we can check the
4612  * above conditions periodically too.
4613  */
4616  }
4618  }
4619 
4620  ereport(FATAL,
4621  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4622  errmsg("recovery aborted because of insufficient parameter settings"),
4623  /* Repeat the detail from above so it's easy to find in the log. */
4624  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4625  param_name,
4626  currValue,
4627  minValue),
4628  errhint("You can restart the server after making the necessary configuration changes.")));
4629  }
4630 }
4631 
4632 
4633 /*
4634  * GUC check_hook for primary_slot_name
4635  */
4636 bool
4638 {
4639  if (*newval && strcmp(*newval, "") != 0 &&
4641  return false;
4642 
4643  return true;
4644 }
4645 
4646 /*
4647  * Recovery target settings: Only one of the several recovery_target* settings
4648  * may be set. Setting a second one results in an error. The global variable
4649  * recoveryTarget tracks which kind of recovery target was chosen. Other
4650  * variables store the actual target value (for example a string or a xid).
4651  * The assign functions of the parameters check whether a competing parameter
4652  * was already set. But we want to allow setting the same parameter multiple
4653  * times. We also want to allow unsetting a parameter and setting a different
4654  * one, so we unset recoveryTarget when the parameter is set to an empty
4655  * string.
4656  *
4657  * XXX this code is broken by design. Throwing an error from a GUC assign
4658  * hook breaks fundamental assumptions of guc.c. So long as all the variables
4659  * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4660  * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4661  * that we have odd behaviors such as unexpected GUC ordering dependencies.
4662  */
4663 
/*
 * Report that more than one recovery target setting was specified.
 *
 * Raises an ERROR with ERRCODE_INVALID_PARAMETER_VALUE whose detail lists the
 * recovery_target* settings of which at most one may be set.
 */
4664 static void
4666 error_multiple_recovery_targets(void)
4667 {
4668  ereport(ERROR,
4669  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4670  errmsg("multiple recovery targets specified"),
4671  errdetail("At most one of recovery_target, recovery_target_lsn, recovery_target_name, recovery_target_time, recovery_target_xid may be set.")));
4672 }
4673 
4674 /*
4675  * GUC check_hook for recovery_target
4676  */
4677 bool
4679 {
4680  if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4681  {
4682  GUC_check_errdetail("The only allowed value is \"immediate\".");
4683  return false;
4684  }
4685  return true;
4686 }
4687 
4688 /*
4689  * GUC assign_hook for recovery_target
4690  */
4691 void
4692 assign_recovery_target(const char *newval, void *extra)
4693 {
4696  error_multiple_recovery_targets();
4697 
4698  if (newval && strcmp(newval, "") != 0)
4700  else
4702 }
4703 
4704 /*
4705  * GUC check_hook for recovery_target_lsn
4706  */
4707 bool
4709 {
4710  if (strcmp(*newval, "") != 0)
4711  {
4712  XLogRecPtr lsn;
4713  XLogRecPtr *myextra;
4714  bool have_error = false;
4715 
4716  lsn = pg_lsn_in_internal(*newval, &have_error);
4717  if (have_error)
4718  return false;
4719 
4720  myextra = (XLogRecPtr *) guc_malloc(ERROR, sizeof(XLogRecPtr));
4721  *myextra = lsn;
4722  *extra = (void *) myextra;
4723  }
4724  return true;
4725 }
4726 
4727 /*
4728  * GUC assign_hook for recovery_target_lsn
4729  */
4730 void
4731 assign_recovery_target_lsn(const char *newval, void *extra)
4732 {
4735  error_multiple_recovery_targets();
4736 
4737  if (newval && strcmp(newval, "") != 0)
4738  {
4740  recoveryTargetLSN = *((XLogRecPtr *) extra);
4741  }
4742  else
4744 }
4745 
4746 /*
4747  * GUC check_hook for recovery_target_name
4748  */
4749 bool
4751 {
4752  /* Use the value of newval directly */
4753  if (strlen(*newval) >= MAXFNAMELEN)
4754  {
4755  GUC_check_errdetail("%s is too long (maximum %d characters).",
4756  "recovery_target_name", MAXFNAMELEN - 1);
4757  return false;
4758  }
4759  return true;
4760 }
4761 
4762 /*
4763  * GUC assign_hook for recovery_target_name
4764  */
4765 void
4766 assign_recovery_target_name(const char *newval, void *extra)
4767 {
4770  error_multiple_recovery_targets();
4771 
4772  if (newval && strcmp(newval, "") != 0)
4773  {
4776  }
4777  else
4779 }
4780 
4781 /*
4782  * GUC check_hook for recovery_target_time
4783  *
4784  * The interpretation of the recovery_target_time string can depend on the
4785  * time zone setting, so we need to wait until after all GUC processing is
4786  * done before we can do the final parsing of the string. This check function
4787  * only does a parsing pass to catch syntax errors, but we store the string
4788  * and parse it again when we need to use it.
4789  */
4790 bool
4792 {
4793  if (strcmp(*newval, "") != 0)
4794  {
4795  /* reject some special values */
4796  if (strcmp(*newval, "now") == 0 ||
4797  strcmp(*newval, "today") == 0 ||
4798  strcmp(*newval, "tomorrow") == 0 ||
4799  strcmp(*newval, "yesterday") == 0)
4800  {
4801  return false;
4802  }
4803 
4804  /*
4805  * parse timestamp value (see also timestamptz_in())
4806  */
4807  {
4808  char *str = *newval;
4809  fsec_t fsec;
4810  struct pg_tm tt,
4811  *tm = &tt;
4812  int tz;
4813  int dtype;
4814  int nf;
4815  int dterr;
4816  char *field[MAXDATEFIELDS];
4817  int ftype[MAXDATEFIELDS];
4818  char workbuf[MAXDATELEN + MAXDATEFIELDS];
4819  DateTimeErrorExtra dtextra;
4821 
4822  dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4823  field, ftype, MAXDATEFIELDS, &nf);
4824  if (dterr == 0)
4825  dterr = DecodeDateTime(field, ftype, nf,
4826  &dtype, tm, &fsec, &tz, &dtextra);
4827  if (dterr != 0)
4828  return false;
4829  if (dtype != DTK_DATE)
4830  return false;
4831 
4832  if (tm2timestamp(tm, fsec, &tz, &timestamp) != 0)
4833  {
4834  GUC_check_errdetail("timestamp out of range: \"%s\"", str);
4835  return false;
4836  }
4837  }
4838  }
4839  return true;
4840 }
4841 
4842 /*
4843  * GUC assign_hook for recovery_target_time
4844  */
4845 void
4846 assign_recovery_target_time(const char *newval, void *extra)
4847 {
4850  error_multiple_recovery_targets();
4851 
4852  if (newval && strcmp(newval, "") != 0)
4854  else
4856 }
4857 
4858 /*
4859  * GUC check_hook for recovery_target_timeline
4860  */
4861 bool
4863 {
4865  RecoveryTargetTimeLineGoal *myextra;
4866 
4867  if (strcmp(*newval, "current") == 0)
4869  else if (strcmp(*newval, "latest") == 0)
4871  else
4872  {
4874 
4875  errno = 0;
4876  strtoul(*newval, NULL, 0);
4877  if (errno == EINVAL || errno == ERANGE)
4878  {
4879  GUC_check_errdetail("recovery_target_timeline is not a valid number.");
4880  return false;
4881  }
4882  }
4883 
4885  *myextra = rttg;
4886  *extra = (void *) myextra;
4887 
4888  return true;
4889 }
4890 
4891 /*
4892  * GUC assign_hook for recovery_target_timeline
4893  */
4894 void
4895 assign_recovery_target_timeline(const char *newval, void *extra)
4896 {
4899  recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
4900  else
4902 }
4903 
4904 /*
4905  * GUC check_hook for recovery_target_xid
4906  */
4907 bool
4909 {
4910  if (strcmp(*newval, "") != 0)
4911  {
4912  TransactionId xid;
4913  TransactionId *myextra;
4914 
4915  errno = 0;
4916  xid = (TransactionId) strtou64(*newval, NULL, 0);
4917  if (errno == EINVAL || errno == ERANGE)
4918  return false;
4919 
4920  myextra = (TransactionId *) guc_malloc(ERROR, sizeof(TransactionId));
4921  *myextra = xid;
4922  *extra = (void *) myextra;
4923  }
4924  return true;
4925 }
4926 
4927 /*
4928  * GUC assign_hook for recovery_target_xid
4929  */
4930 void
4931 assign_recovery_target_xid(const char *newval, void *extra)
4932 {
4935  error_multiple_recovery_targets();
4936 
4937  if (newval && strcmp(newval, "") != 0)
4938  {
4940  recoveryTargetXid = *((TransactionId *) extra);
4941  }
4942  else
4944 }
TimeLineID findNewestTimeLine(TimeLineID startTLI)
Definition: timeline.c:264
TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history)
Definition: timeline.c:544
XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI)
Definition: timeline.c:572
bool existsTimeLineHistory(TimeLineID probeTLI)
Definition: timeline.c:222
void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end)
Definition: timeline.c:50
List * readTimeLineHistory(TimeLineID targetTLI)
Definition: timeline.c:76
bool tliInHistory(TimeLineID tli, List *expectedTLEs)
Definition: timeline.c:526
void remove_tablespace_symlink(const char *linkloc)
Definition: tablespace.c:889
bool allow_in_place_tablespaces
Definition: tablespace.c:91
void HandleStartupProcInterrupts(void)
Definition: startup.c:168
void disable_startup_progress_timeout(void)
Definition: startup.c:318
bool IsPromoteSignaled(void)
Definition: startup.c:297
void begin_startup_progress_phase(void)
Definition: startup.c:352
void ResetPromoteSignaled(void)
Definition: startup.c:303
int ParseDateTime(const char *timestr, char *workbuf, size_t buflen, char **field, int *ftype, int maxfields, int *numfields)
Definition: datetime.c:756
int DecodeDateTime(char **field, int *ftype, int nf, int *dtype, struct pg_tm *tm, fsec_t *fsec, int *tzp, DateTimeErrorExtra *extra)
Definition: datetime.c:980
long TimestampDifferenceMilliseconds(TimestampTz start_time, TimestampTz stop_time)
Definition: timestamp.c:1703
int tm2timestamp(struct pg_tm *tm, fsec_t fsec, int *tzp, Timestamp *result)
Definition: timestamp.c:1934
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1727
Datum timestamptz_in(PG_FUNCTION_ARGS)
Definition: timestamp.c:399
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1582
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1546
const char * timestamptz_to_str(TimestampTz t)
Definition: timestamp.c:1790
uint32 BlockNumber
Definition: block.h:31
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4008
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:4226
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:284
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:112
@ RBM_NORMAL_NO_LOG
Definition: bufmgr.h:50
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:232
Pointer Page
Definition: bufpage.h:78
static XLogRecPtr PageGetLSN(Page page)
Definition: bufpage.h:383
unsigned int uint32
Definition: c.h:490
signed int int32
Definition: c.h:478
#define PG_BINARY
Definition: c.h:1260
#define UINT64_FORMAT
Definition: c.h:533
#define strtou64(str, endptr, base)
Definition: c.h:1285
unsigned char uint8
Definition: c.h:488
uint32 TransactionId
Definition: c.h:636
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:166
size_t Size
Definition: c.h:589
void RequestCheckpoint(int flags)
Definition: checkpointer.c:931
bool ConditionVariableTimedSleep(ConditionVariable *cv, long timeout, uint32 wait_event_info)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
void ConditionVariableCancelSleep(void)
int64 TimestampTz
Definition: timestamp.h:39
int32 fsec_t
Definition: timestamp.h:41
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1156
int errcode_for_file_access(void)
Definition: elog.c:881
int errdetail(const char *fmt,...)
Definition: elog.c:1202
ErrorContextCallback * error_context_stack
Definition: elog.c:95
int errhint(const char *fmt,...)
Definition: elog.c:1316
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define LOG
Definition: elog.h:31
#define errcontext
Definition: elog.h:196
#define DEBUG3
Definition: elog.h:28
#define FATAL
Definition: elog.h:41
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2710
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2384
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1015
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:688
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:993
int FreeFile(FILE *file)
Definition: fd.c:2582
int pg_fsync(int fd)
Definition: fd.c:356
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2644
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:406
@ PGFILETYPE_LNK
Definition: file_utils.h:24
#define DirectFunctionCall3(func, arg1, arg2, arg3)
Definition: fmgr.h:646
bool IsUnderPostmaster
Definition: globals.c:113
char * DataDir
Definition: globals.c:66
bool IsPostmasterEnvironment
Definition: globals.c:112
void * guc_malloc(int elevel, size_t size)
Definition: guc.c:631
#define newval
#define GUC_check_errdetail
Definition: guc.h:437
GucSource
Definition: guc.h:108
int trace_recovery_messages
Definition: guc_tables.c:520
#define MAXDATEFIELDS
Definition: datetime.h:202
#define DTK_DATE
Definition: datetime.h:144
#define MAXDATELEN
Definition: datetime.h:200
#define close(a)
Definition: win32.h:12
void proc_exit(int code)
Definition: ipc.c:104
int i
Definition: isn.c:73
void OwnLatch(Latch *latch)
Definition: latch.c:438
void DisownLatch(Latch *latch)
Definition: latch.c:464
void InitSharedLatch(Latch *latch)
Definition: latch.c:405
void SetLatch(Latch *latch)
Definition: latch.c:607
void ResetLatch(Latch *latch)
Definition: latch.c:699
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:492
#define WL_TIMEOUT
Definition: latch.h:128
#define WL_EXIT_ON_PM_DEATH
Definition: latch.h:130
#define WL_LATCH_SET
Definition: latch.h:125
Assert(fmt[strlen(fmt) - 1] !='\n')
List * lappend(List *list, void *datum)
Definition: list.c:338
void list_free_deep(List *list)
Definition: list.c:1559
static struct pg_tm tm
Definition: localtime.c:104
char * pstrdup(const char *in)
Definition: mcxt.c:1624
void pfree(void *pointer)
Definition: mcxt.c:1436
void * palloc0(Size size)
Definition: mcxt.c:1241
void * palloc(Size size)
Definition: mcxt.c:1210
#define AmStartupProcess()
Definition: miscadmin.h:443
#define IsBootstrapProcessingMode()
Definition: miscadmin.h:405
void * arg
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
#define MAXPGPATH
#define XLOG_RESTORE_POINT
Definition: pg_control.h:74
#define XLOG_OVERWRITE_CONTRECORD
Definition: pg_control.h:80
DBState
Definition: pg_control.h:88
@ DB_IN_ARCHIVE_RECOVERY
Definition: pg_control.h:94
@ DB_SHUTDOWNED_IN_RECOVERY
Definition: pg_control.h:91
@ DB_SHUTDOWNED
Definition: pg_control.h:90
@ DB_IN_CRASH_RECOVERY
Definition: pg_control.h:93
#define XLOG_CHECKPOINT_SHUTDOWN
Definition: pg_control.h:67
#define XLOG_BACKUP_END
Definition: pg_control.h:72
#define XLOG_CHECKPOINT_ONLINE
Definition: pg_control.h:68
#define XLOG_END_OF_RECOVERY
Definition: pg_control.h:76
const void size_t len
#define lfirst(lc)
Definition: pg_list.h:172
#define NIL
Definition: pg_list.h:68
XLogRecPtr pg_lsn_in_internal(const char *str, bool *have_error)
Definition: pg_lsn.c:30
static rewind_source * source
Definition: pg_rewind.c:87
const char * pg_rusage_show(const PGRUsage *ru0)
Definition: pg_rusage.c:40
void pg_rusage_init(PGRUsage *ru0)
Definition: pg_rusage.c:27
static char * buf
Definition: pg_test_fsync.c:67
int64 timestamp
void SendPostmasterSignal(PMSignalReason reason)
Definition: pmsignal.c:181
@ PMSIGNAL_RECOVERY_STARTED
Definition: pmsignal.h:35
@ PMSIGNAL_BEGIN_HOT_STANDBY
Definition: pmsignal.h:36
#define pg_pread
Definition: port.h:225
#define snprintf
Definition: port.h:238
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:252
static Datum CStringGetDatum(const char *X)
Definition: postgres.h:350
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:212
#define InvalidOid
Definition: postgres_ext.h:36
static int fd(const char *x, int i)
Definition: preproc-init.c:105
void RecordKnownAssignedTransactionIds(TransactionId xid)
Definition: procarray.c:4455
void KnownAssignedTransactionIdsIdleMaintenance(void)
Definition: procarray.c:4592
static void set_ps_display(const char *activity)
Definition: ps_status.h:40
char * psprintf(const char *fmt,...)
Definition: psprintf.c:46
ForkNumber
Definition: relpath.h:48
@ MAIN_FORKNUM
Definition: relpath.h:50
void RmgrStartup(void)
Definition: rmgr.c:49
void RmgrCleanup(void)
Definition: rmgr.c:65
int slock_t
Definition: s_lock.h:754
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:396
bool ReplicationSlotValidateName(const char *name, int elevel)
Definition: slot.c:198
#define SpinLockInit(lock)
Definition: spin.h:60
#define SpinLockRelease(lock)
Definition: spin.h:64
#define SpinLockAcquire(lock)
Definition: spin.h:62
#define ereport_startup_progress(msg,...)
Definition: startup.h:18
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition: stringinfo.c:91
void appendStringInfoString(StringInfo str, const char *s)
Definition: stringinfo.c:176
void appendStringInfoChar(StringInfo str, char ch)
Definition: stringinfo.c:188
void initStringInfo(StringInfo str)
Definition: stringinfo.c:59
Oid oldestMultiDB
Definition: pg_control.h:50
MultiXactId oldestMulti
Definition: pg_control.h:49
MultiXactOffset nextMultiOffset
Definition: pg_control.h:46
TransactionId newestCommitTsXid
Definition: pg_control.h:54
TransactionId oldestXid
Definition: pg_control.h:47
TimeLineID PrevTimeLineID
Definition: pg_control.h:40
TimeLineID ThisTimeLineID
Definition: pg_control.h:39
Oid nextOid
Definition: pg_control.h:44
MultiXactId nextMulti
Definition: pg_control.h:45
FullTransactionId nextXid
Definition: pg_control.h:43
TransactionId oldestCommitTsXid
Definition: pg_control.h:52
XLogRecPtr redo
Definition: pg_control.h:37
Oid oldestXidDB
Definition: pg_control.h:48
XLogRecPtr backupStartPoint
Definition: pg_control.h:168
bool backupEndRequired
Definition: pg_control.h:170
CheckPoint checkPointCopy
Definition: pg_control.h:133
XLogRecPtr backupEndPoint
Definition: pg_control.h:169
XLogRecPtr minRecoveryPoint
Definition: pg_control.h:166
XLogRecPtr checkPoint
Definition: pg_control.h:131
uint64 system_identifier
Definition: pg_control.h:108
TimeLineID minRecoveryPointTLI
Definition: pg_control.h:167
Definition: dirent.c:26
XLogRecPtr lastPageBeginPtr
Definition: xlogrecovery.h:111
XLogRecPtr abortedRecPtr
Definition: xlogrecovery.h:120
XLogRecPtr missingContrecPtr
Definition: xlogrecovery.h:121
TimeLineID endOfLogTLI
Definition: xlogrecovery.h:109
struct ErrorContextCallback * previous
Definition: elog.h:295
void(* callback)(void *arg)
Definition: elog.h:296
Definition: latch.h:111
Definition: pg_list.h:54
RelFileNumber relNumber
void(* rm_mask)(char *pagedata, BlockNumber blkno)
void(* rm_redo)(XLogReaderState *record)
const char *(* rm_identify)(uint8 info)
const char * rm_name
void(* rm_desc)(StringInfo buf, XLogReaderState *record)
XLogRecPtr begin
Definition: timeline.h:28
TimeLineID tli
Definition: timeline.h:27
XLogRecPtr end
Definition: timeline.h:29
TimeLineID ws_tli
Definition: xlogreader.h:49
TimeLineID replayTLI
Definition: xlogrecovery.c:199
XLogRecPtr missingContrecPtr
Definition: xlogreader.h:215
char * errormsg_buf
Definition: xlogreader.h:311
XLogRecPtr EndRecPtr
Definition: xlogreader.h:207
uint64 system_identifier
Definition: xlogreader.h:191
XLogRecPtr ReadRecPtr
Definition: xlogreader.h:206
XLogRecPtr abortedRecPtr
Definition: xlogreader.h:214
TimeLineID latestPageTLI
Definition: xlogreader.h:280
XLogRecPtr overwrittenRecPtr
Definition: xlogreader.h:217
XLogRecPtr latestPagePtr
Definition: xlogreader.h:279
WALOpenSegment seg
Definition: xlogreader.h:272
void * private_data
Definition: xlogreader.h:196
uint8 xl_info
Definition: xlogrecord.h:46
uint32 xl_tot_len
Definition: xlogrecord.h:43
TransactionId xl_xid
Definition: xlogrecord.h:44
RmgrId xl_rmid
Definition: xlogrecord.h:47
ConditionVariable recoveryNotPausedCV
Definition: xlogrecovery.c:359
XLogRecPtr lastReplayedEndRecPtr
Definition: xlogrecovery.c:339
TimeLineID replayEndTLI
Definition: xlogrecovery.c:348
TimeLineID lastReplayedTLI
Definition: xlogrecovery.c:340
TimestampTz currentChunkStartTime
Definition: xlogrecovery.c:356
XLogRecPtr replayEndRecPtr
Definition: xlogrecovery.c:347
TimestampTz recoveryLastXTime
Definition: xlogrecovery.c:350
RecoveryPauseState recoveryPauseState
Definition: xlogrecovery.c:358
XLogRecPtr lastReplayedReadRecPtr
Definition: xlogrecovery.c:338
Definition: guc.h:168
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
Definition: pgtime.h:35
Definition: regguts.h:318
TimeLineID PrevTimeLineID
TimeLineID ThisTimeLineID