PostgreSQL Source Code  git master
xlogrecovery.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * xlogrecovery.c
4  * Functions for WAL recovery, standby mode
5  *
6  * This source file contains functions controlling WAL recovery.
7  * InitWalRecovery() initializes the system for crash or archive recovery,
8  * or standby mode, depending on configuration options and the state of
9  * the control file and possible backup label file. PerformWalRecovery()
10  * performs the actual WAL replay, calling the rmgr-specific redo routines.
11  * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12  * and prepares information needed to initialize the WAL for writes. In
13  * addition to these three main functions, there are a bunch of functions
14  * for interrogating recovery state and controlling the recovery process.
15  *
16  *
17  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
18  * Portions Copyright (c) 1994, Regents of the University of California
19  *
20  * src/backend/access/transam/xlogrecovery.c
21  *
22  *-------------------------------------------------------------------------
23  */
24 
25 #include "postgres.h"
26 
27 #include <ctype.h>
28 #include <math.h>
29 #include <time.h>
30 #include <sys/stat.h>
31 #include <sys/time.h>
32 #include <unistd.h>
33 
34 #include "access/timeline.h"
35 #include "access/transam.h"
36 #include "access/xact.h"
37 #include "access/xlog_internal.h"
38 #include "access/xlogarchive.h"
39 #include "access/xlogprefetcher.h"
40 #include "access/xlogreader.h"
41 #include "access/xlogrecovery.h"
42 #include "access/xlogutils.h"
43 #include "backup/basebackup.h"
44 #include "catalog/pg_control.h"
45 #include "commands/tablespace.h"
46 #include "common/file_utils.h"
47 #include "miscadmin.h"
48 #include "pgstat.h"
49 #include "postmaster/bgwriter.h"
50 #include "postmaster/startup.h"
51 #include "replication/slot.h"
52 #include "replication/slotsync.h"
54 #include "storage/fd.h"
55 #include "storage/ipc.h"
56 #include "storage/latch.h"
57 #include "storage/pmsignal.h"
58 #include "storage/procarray.h"
59 #include "storage/spin.h"
60 #include "utils/datetime.h"
61 #include "utils/fmgrprotos.h"
62 #include "utils/guc_hooks.h"
63 #include "utils/pg_lsn.h"
64 #include "utils/ps_status.h"
65 #include "utils/pg_rusage.h"
66 
67 /* Unsupported old recovery command file names (relative to $PGDATA) */
68 #define RECOVERY_COMMAND_FILE "recovery.conf"
69 #define RECOVERY_COMMAND_DONE "recovery.done"
70 
71 /*
72  * GUC support
73  */
75  {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
76  {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
77  {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
78  {NULL, 0, false}
79 };
80 
81 /* options formerly taken from recovery.conf for archive recovery */
83 char *recoveryEndCommand = NULL;
84 char *archiveCleanupCommand = NULL;
91 const char *recoveryTargetName;
94 
95 /* options formerly taken from recovery.conf for XLOG streaming */
96 char *PrimaryConnInfo = NULL;
97 char *PrimarySlotName = NULL;
99 
100 /*
101  * recoveryTargetTimeLineGoal: what the user requested, if any
102  *
103  * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
104  *
105  * recoveryTargetTLI: the currently understood target timeline; changes
106  *
107  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
108  * the timelines of its known parents, newest first (so recoveryTargetTLI is
109  * always the first list member). Only these TLIs are expected to be seen in
110  * the WAL segments we read, and indeed only these TLIs will be considered as
111  * candidate WAL files to open at all.
112  *
113  * curFileTLI: the TLI appearing in the name of the current input WAL file.
114  * (This is not necessarily the same as the timeline from which we are
115  * replaying WAL, which StartupXLOG calls replayTLI, because we could be
116  * scanning data that was copied from an ancestor timeline when the current
117  * file was created.) During a sequential scan we do not allow this value
118  * to decrease.
119  */
125 
126 /*
127  * When ArchiveRecoveryRequested is set, archive recovery was requested,
128  * i.e. signal files were present. When InArchiveRecovery is set, we are
129  * currently recovering using offline XLOG archives. These variables are only
130  * valid in the startup process.
131  *
132  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
133  * currently performing crash recovery using only XLOG files in pg_wal, but
134  * will switch to using offline XLOG archives as soon as we reach the end of
135  * WAL in pg_wal.
136  */
138 bool InArchiveRecovery = false;
139 
140 /*
141  * When StandbyModeRequested is set, standby mode was requested, i.e.
142  * standby.signal file was present. When StandbyMode is set, we are currently
143  * in standby mode. These variables are only valid in the startup process.
144  * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
145  */
146 static bool StandbyModeRequested = false;
147 bool StandbyMode = false;
148 
149 /* was a signal file present at startup? */
150 static bool standby_signal_file_found = false;
151 static bool recovery_signal_file_found = false;
152 
153 /*
154  * CheckPointLoc is the position of the checkpoint record that determines
155  * where to start the replay. It comes from the backup label file or the
156  * control file.
157  *
158  * RedoStartLSN is the checkpoint's REDO location, also from the backup label
159  * file or the control file. In standby mode, XLOG streaming usually starts
160  * from the position where an invalid record was found. But if we fail to
161  * read even the initial checkpoint record, we use the REDO location instead
162  * of the checkpoint location as the start position of XLOG streaming.
163  * Otherwise we would have to jump backwards to the REDO location after
164  * reading the checkpoint record, because the REDO record can precede the
165  * checkpoint record.
166  */
171 
172 /*
173  * Local copy of SharedHotStandbyActive variable. False actually means "not
174  * known, need to check the shared state".
175  */
176 static bool LocalHotStandbyActive = false;
177 
178 /*
179  * Local copy of SharedPromoteIsTriggered variable. False actually means "not
180  * known, need to check the shared state".
181  */
182 static bool LocalPromoteIsTriggered = false;
183 
184 /* Has the recovery code requested a walreceiver wakeup? */
186 
187 /* XLogReader object used to parse the WAL records */
189 
190 /* XLogPrefetcher object used to consume WAL records with read-ahead */
192 
193 /* Parameters passed down from ReadRecord to the XLogPageRead callback. */
194 typedef struct XLogPageReadPrivate
195 {
196  int emode;
197  bool fetching_ckpt; /* are we fetching a checkpoint record? */
201 
202 /* flag to tell XLogPageRead that we have started replaying */
203 static bool InRedo = false;
204 
205 /*
206  * Codes indicating where we got a WAL file from during recovery, or where
207  * to attempt to get one.
 *
 * The values are listed in the same order as the entries of
 * xlogSourceNames[] below; keep the two in sync when adding a source.
208  */
209 typedef enum
210 {
211  XLOG_FROM_ANY = 0, /* request to read WAL from any source */
212  XLOG_FROM_ARCHIVE, /* restored using restore_command */
213  XLOG_FROM_PG_WAL, /* existing file in pg_wal */
214  XLOG_FROM_STREAM, /* streamed from primary */
215 } XLogSource;
216 
217 /*
 * Human-readable names for XLogSources, for debugging output.  The entries
 * are listed in the same order as the XLogSource enum values above
 * ("any" for XLOG_FROM_ANY through "stream" for XLOG_FROM_STREAM).
 */
218 static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
219 
220 /*
221  * readFile is -1 or a kernel FD for the log file segment that's currently
222  * open for reading. readSegNo identifies the segment. readOff is the offset
223  * of the page just read, readLen indicates how much of it has been read into
224  * readBuf, and readSource indicates where we got the currently open file from.
225  *
226  * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
227  * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
228  * worthwhile, since the XLOG is not read by general-purpose sessions.
229  */
230 static int readFile = -1;
231 static XLogSegNo readSegNo = 0;
232 static uint32 readOff = 0;
233 static uint32 readLen = 0;
235 
236 /*
237  * Keeps track of which source we're currently reading from. This is
238  * different from readSource in that this is always set, even when we don't
239  * currently have a WAL file open. If lastSourceFailed is set, our last
240  * attempt to read from currentSource failed, and we should try another source
241  * next.
242  *
243  * pendingWalRcvRestart is set when a config change occurs that requires a
244  * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
245  */
247 static bool lastSourceFailed = false;
248 static bool pendingWalRcvRestart = false;
249 
250 /*
251  * These variables track when we last obtained some WAL data to process,
252  * and where we got it from. (XLogReceiptSource is initially the same as
253  * readSource, but readSource gets reset to zero when we don't have data
254  * to process right now. It is also different from currentSource, which
255  * also changes when we try to read from a source and fail, while
256  * XLogReceiptSource tracks where we last successfully read some WAL.)
257  */
260 
261 /* Local copy of WalRcv->flushedUpto */
264 
265 /*
266  * Copy of minRecoveryPoint and backupEndPoint from the control file.
267  *
268  * In order to reach consistency, we must replay the WAL up to
269  * minRecoveryPoint. If backupEndRequired is true, we must also reach
270  * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
271  * to backupStartPoint.
272  *
273  * Note: In archive recovery, after consistency has been reached, the
274  * functions in xlog.c will start updating minRecoveryPoint in the control
275  * file. But this copy of minRecoveryPoint variable reflects the value at the
276  * beginning of recovery, and is *not* updated after consistency is reached.
277  */
280 
283 static bool backupEndRequired = false;
284 
285 /*
286  * Have we reached a consistent database state? In crash recovery, we have
287  * to replay all the WAL, so reachedConsistency is never set. During archive
288  * recovery, the database is consistent once minRecoveryPoint is reached.
289  *
290  * Consistent state means that the system is internally consistent, all
291  * the WAL has been replayed up to a certain point, and importantly, there
292  * is no trace of later actions on disk.
293  */
294 bool reachedConsistency = false;
295 
296 /* Buffers dedicated to consistency checks of size BLCKSZ */
297 static char *replay_image_masked = NULL;
298 static char *primary_image_masked = NULL;
299 
300 
301 /*
302  * Shared-memory state for WAL recovery.
303  */
304 typedef struct XLogRecoveryCtlData
305 {
306  /*
307  * SharedHotStandbyActive indicates if we allow hot standby queries to be
308  * run. Protected by info_lck.
309  */
311 
312  /*
313  * SharedPromoteIsTriggered indicates if a standby promotion has been
314  * triggered. Protected by info_lck.
315  */
317 
318  /*
319  * recoveryWakeupLatch is used to wake up the startup process to continue
320  * WAL replay, if it is waiting for WAL to arrive or promotion to be
321  * requested.
322  *
323  * Note that the startup process also uses another latch, its procLatch,
324  * to wait for recovery conflict. We could get rid of recoveryWakeupLatch
325  * for signaling the startup process in favor of using its procLatch, which
326  * would comport better with possible generic signal handlers using that
327  * latch. But we should not do that, because the startup process doesn't
328  * expect to be woken up by the walreceiver process or the SIGHUP signal
329  * handler while it's waiting for recovery conflict. The separate latches,
330  * recoveryWakeupLatch and procLatch, should be used for inter-process
331  * communication for WAL replay and recovery conflict, respectively.
332  */
334 
335  /*
336  * Last record successfully replayed.
337  */
338  XLogRecPtr lastReplayedReadRecPtr; /* start position */
339  XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */
340  TimeLineID lastReplayedTLI; /* timeline */
341 
342  /*
343  * When we're currently replaying a record, i.e. in a redo function,
344  * replayEndRecPtr points to the end+1 of the record being replayed,
345  * otherwise it's equal to lastReplayedEndRecPtr.
346  */
349  /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
351 
352  /*
353  * timestamp of when we started replaying the current chunk of WAL data,
354  * only relevant for replication or archive recovery
355  */
357  /* Recovery pause state */
360 
361  slock_t info_lck; /* locks shared variables shown above */
363 
365 
366 /*
367  * abortedRecPtr is the start pointer of a broken record at end of WAL when
368  * recovery completes; missingContrecPtr is the location of the first
369  * contrecord that went missing. See CreateOverwriteContrecordRecord for
370  * details.
371  */
374 
375 /*
376  * if recoveryStopsBefore/After returns true, it saves information of the stop
377  * point here
378  */
383 static bool recoveryStopAfter;
384 
385 /* prototypes for local functions */
386 static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
387 
388 static void EnableStandbyMode(void);
389 static void readRecoverySignalFile(void);
390 static void validateRecoveryParameters(void);
391 static bool read_backup_label(XLogRecPtr *checkPointLoc,
392  TimeLineID *backupLabelTLI,
393  bool *backupEndRequired, bool *backupFromStandby);
394 static bool read_tablespace_map(List **tablespaces);
395 
396 static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
397 static void CheckRecoveryConsistency(void);
398 static void rm_redo_error_callback(void *arg);
399 #ifdef WAL_DEBUG
400 static void xlog_outrec(StringInfo buf, XLogReaderState *record);
401 #endif
402 static void xlog_block_info(StringInfo buf, XLogReaderState *record);
403 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
404  TimeLineID prevTLI, TimeLineID replayTLI);
405 static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
406 static void verifyBackupPageConsistency(XLogReaderState *record);
407 
408 static bool recoveryStopsBefore(XLogReaderState *record);
409 static bool recoveryStopsAfter(XLogReaderState *record);
410 static char *getRecoveryStopReason(void);
411 static void recoveryPausesHere(bool endOfRecovery);
412 static bool recoveryApplyDelay(XLogReaderState *record);
413 static void ConfirmRecoveryPaused(void);
414 
416  int emode, bool fetching_ckpt,
417  TimeLineID replayTLI);
418 
419 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
420  int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
422  bool randAccess,
423  bool fetching_ckpt,
424  XLogRecPtr tliRecPtr,
425  TimeLineID replayTLI,
426  XLogRecPtr replayLSN,
427  bool nonblocking);
428 static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
430  XLogRecPtr RecPtr, TimeLineID replayTLI);
431 static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
432 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
433  XLogSource source, bool notfoundOk);
434 static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source);
435 
436 static bool CheckForStandbyTrigger(void);
437 static void SetPromoteIsTriggered(void);
438 static bool HotStandbyActiveInReplay(void);
439 
440 static void SetCurrentChunkStartTime(TimestampTz xtime);
441 static void SetLatestXTime(TimestampTz xtime);
442 
443 /*
444  * Initialization of shared memory for WAL recovery
445  */
446 Size
448 {
449  Size size;
450 
451  /* XLogRecoveryCtl */
452  size = sizeof(XLogRecoveryCtlData);
453 
454  return size;
455 }
456 
457 void
459 {
460  bool found;
461 
463  ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
464  if (found)
465  return;
466  memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
467 
471 }
472 
473 /*
474  * A thin wrapper to enable StandbyMode and do other preparatory work as
475  * needed.
476  */
477 static void
479 {
480  StandbyMode = true;
481 
482  /*
483  * To avoid server log bloat, we don't report recovery progress in a
484  * standby as it will always be in recovery unless promoted. We disable
485  * startup progress timeout in standby mode to avoid calling
486  * startup_progress_timeout_handler() unnecessarily.
487  */
489 }
490 
491 /*
492  * Prepare the system for WAL recovery, if needed.
493  *
494  * This is called by StartupXLOG() which coordinates the server startup
495  * sequence. This function analyzes the control file and the backup label
496  * file, if any, and figures out whether we need to perform crash recovery or
497  * archive recovery, and how far we need to replay the WAL to reach a
498  * consistent state.
499  *
500  * This doesn't yet change the on-disk state, except for creating the symlinks
501  * from the tablespace map file, if any, and for fetching WAL files needed to
502  * find the checkpoint record. On entry, the caller has already read the
503  * control file into memory, and passes it as argument. This function updates
504  * it to reflect the recovery state, and the caller is expected to write it
505  * back to disk after initializing other subsystems, but before calling
506  * PerformWalRecovery().
507  *
508  * This initializes some global variables like ArchiveRecoveryRequested, and
509  * StandbyModeRequested and InRecovery.
510  */
511 void
513  bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
514 {
515  XLogPageReadPrivate *private;
516  struct stat st;
517  bool wasShutdown;
518  XLogRecord *record;
519  DBState dbstate_at_startup;
520  bool haveTblspcMap = false;
521  bool haveBackupLabel = false;
522  CheckPoint checkPoint;
523  bool backupFromStandby = false;
524 
525  dbstate_at_startup = ControlFile->state;
526 
527  /*
528  * Initialize on the assumption we want to recover to the latest timeline
529  * that's active according to pg_control.
530  */
534  else
536 
537  /*
538  * Check for signal files, and if so set up state for offline recovery
539  */
542 
543  /*
544  * Take ownership of the wakeup latch if we're going to sleep during
545  * recovery.
546  */
549 
550  /*
551  * Set the WAL reading processor now, as it will be needed when reading
552  * the checkpoint record required (backup_label or not).
553  */
554  private = palloc0(sizeof(XLogPageReadPrivate));
555  xlogreader =
557  XL_ROUTINE(.page_read = &XLogPageRead,
558  .segment_open = NULL,
559  .segment_close = wal_segment_close),
560  private);
561  if (!xlogreader)
562  ereport(ERROR,
563  (errcode(ERRCODE_OUT_OF_MEMORY),
564  errmsg("out of memory"),
565  errdetail("Failed while allocating a WAL reading processor.")));
567 
568  /*
569  * Set the WAL decode buffer size. This limits how far ahead we can read
570  * in the WAL.
571  */
573 
574  /* Create a WAL prefetcher. */
576 
577  /*
578  * Allocate two page buffers dedicated to WAL consistency checks. We do
579  * it this way, rather than just making static arrays, for two reasons:
580  * (1) no need to waste the storage in most instantiations of the backend;
581  * (2) a static char array isn't guaranteed to have any particular
582  * alignment, whereas palloc() will provide MAXALIGN'd storage.
583  */
584  replay_image_masked = (char *) palloc(BLCKSZ);
585  primary_image_masked = (char *) palloc(BLCKSZ);
586 
587  /*
588  * Read the backup_label file. We want to run this part of the recovery
589  * process after checking for signal files and after performing validation
590  * of the recovery parameters.
591  */
593  &backupFromStandby))
594  {
595  List *tablespaces = NIL;
596 
597  /*
598  * Archive recovery was requested, and thanks to the backup label
599  * file, we know how far we need to replay to reach consistency. Enter
600  * archive recovery directly.
601  */
602  InArchiveRecovery = true;
605 
606  /*
607  * Omitting backup_label when creating a new replica, PITR node etc.
608  * unfortunately is a common cause of corruption. Logging that
609  * backup_label was used makes it a bit easier to exclude that as the
610  * cause of observed corruption.
611  *
612  * Do so before we try to read the checkpoint record (which can fail),
613  * as otherwise it can be hard to understand why a checkpoint other
614  * than ControlFile->checkPoint is used.
615  */
616  ereport(LOG,
617  (errmsg("starting backup recovery with redo LSN %X/%X, checkpoint LSN %X/%X, on timeline ID %u",
620  CheckPointTLI)));
621 
622  /*
623  * When a backup_label file is present, we want to roll forward from
624  * the checkpoint it identifies, rather than using pg_control.
625  */
627  CheckPointTLI);
628  if (record != NULL)
629  {
630  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
631  wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
632  ereport(DEBUG1,
633  (errmsg_internal("checkpoint record is at %X/%X",
635  InRecovery = true; /* force recovery even if SHUTDOWNED */
636 
637  /*
638  * Make sure that REDO location exists. This may not be the case
639  * if there was a crash during an online backup, which left a
640  * backup_label around that references a WAL segment that's
641  * already been archived.
642  */
643  if (checkPoint.redo < CheckPointLoc)
644  {
646  if (!ReadRecord(xlogprefetcher, LOG, false,
647  checkPoint.ThisTimeLineID))
648  ereport(FATAL,
649  (errmsg("could not find redo location %X/%X referenced by checkpoint record at %X/%X",
651  errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
652  "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
653  "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
655  }
656  }
657  else
658  {
659  ereport(FATAL,
660  (errmsg("could not locate required checkpoint record at %X/%X",
662  errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
663  "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
664  "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
666  wasShutdown = false; /* keep compiler quiet */
667  }
668 
669  /* Read the tablespace_map file if present and create symlinks. */
670  if (read_tablespace_map(&tablespaces))
671  {
672  ListCell *lc;
673 
674  foreach(lc, tablespaces)
675  {
676  tablespaceinfo *ti = lfirst(lc);
677  char *linkloc;
678 
679  linkloc = psprintf("pg_tblspc/%u", ti->oid);
680 
681  /*
682  * Remove the existing symlink, if any, and create the symlink
683  * under PGDATA.
684  */
685  remove_tablespace_symlink(linkloc);
686 
687  if (symlink(ti->path, linkloc) < 0)
688  ereport(ERROR,
690  errmsg("could not create symbolic link \"%s\": %m",
691  linkloc)));
692 
693  pfree(ti->path);
694  pfree(ti);
695  }
696 
697  /* tell the caller to delete it later */
698  haveTblspcMap = true;
699  }
700 
701  /* tell the caller to delete it later */
702  haveBackupLabel = true;
703  }
704  else
705  {
706  /* No backup_label file has been found if we are here. */
707 
708  /*
709  * If a tablespace_map file is present without a backup_label file,
710  * the map file is of no use. There is no harm in retaining it, but it
711  * is better to get rid of it so that we don't have a redundant file in
712  * the data directory, which also avoids any confusion. It seems prudent,
713  * though, to just rename the file out of the way rather than delete it
714  * completely. We also ignore any error that occurs in the rename
715  * operation, because a map file without a backup_label file is
716  * harmless.
717  */
718  if (stat(TABLESPACE_MAP, &st) == 0)
719  {
720  unlink(TABLESPACE_MAP_OLD);
722  ereport(LOG,
723  (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
725  errdetail("File \"%s\" was renamed to \"%s\".",
727  else
728  ereport(LOG,
729  (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
731  errdetail("Could not rename file \"%s\" to \"%s\": %m.",
733  }
734 
735  /*
736  * It's possible that archive recovery was requested, but we don't
737  * know how far we need to replay the WAL before we reach consistency.
738  * This can happen for example if a base backup is taken from a
739  * running server using an atomic filesystem snapshot, without calling
740  * pg_backup_start/stop. Or if you just kill a running primary server
741  * and put it into archive recovery by creating a recovery signal
742  * file.
743  *
744  * Our strategy in that case is to perform crash recovery first,
745  * replaying all the WAL present in pg_wal, and only enter archive
746  * recovery after that.
747  *
748  * But usually we already know how far we need to replay the WAL (up
749  * to minRecoveryPoint, up to backupEndPoint, or until we see an
750  * end-of-backup record), and we can enter archive recovery directly.
751  */
757  {
758  InArchiveRecovery = true;
761  }
762 
763  /*
764  * For the same reason as when starting up with backup_label present,
765  * emit a log message when we continue initializing from a base
766  * backup.
767  */
769  ereport(LOG,
770  (errmsg("restarting backup recovery with redo LSN %X/%X",
772 
773  /* Get the last valid checkpoint record. */
779  CheckPointTLI);
780  if (record != NULL)
781  {
782  ereport(DEBUG1,
783  (errmsg_internal("checkpoint record is at %X/%X",
785  }
786  else
787  {
788  /*
789  * We used to attempt to go back to a secondary checkpoint record
790  * here, but only when not in standby mode. We now just fail if we
791  * can't read the last checkpoint because this allows us to
792  * simplify processing around checkpoints.
793  */
794  ereport(PANIC,
795  (errmsg("could not locate a valid checkpoint record at %X/%X",
797  }
798  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
799  wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
800  }
801 
803  {
805  ereport(LOG,
806  (errmsg("entering standby mode")));
808  ereport(LOG,
809  (errmsg("starting point-in-time recovery to XID %u",
812  ereport(LOG,
813  (errmsg("starting point-in-time recovery to %s",
816  ereport(LOG,
817  (errmsg("starting point-in-time recovery to \"%s\"",
820  ereport(LOG,
821  (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
824  ereport(LOG,
825  (errmsg("starting point-in-time recovery to earliest consistent point")));
826  else
827  ereport(LOG,
828  (errmsg("starting archive recovery")));
829  }
830 
831  /*
832  * If the location of the checkpoint record is not on the expected
833  * timeline in the history of the requested timeline, we cannot proceed:
834  * the backup is not part of the history of the requested timeline.
835  */
836  Assert(expectedTLEs); /* was initialized by reading checkpoint
837  * record */
840  {
841  XLogRecPtr switchpoint;
842 
843  /*
844  * tliSwitchPoint will throw an error if the checkpoint's timeline is
845  * not in expectedTLEs at all.
846  */
848  ereport(FATAL,
849  (errmsg("requested timeline %u is not a child of this server's history",
851  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
854  LSN_FORMAT_ARGS(switchpoint))));
855  }
856 
857  /*
858  * The min recovery point should be part of the requested timeline's
859  * history, too.
860  */
864  ereport(FATAL,
865  (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
869 
870  ereport(DEBUG1,
871  (errmsg_internal("redo record is at %X/%X; shutdown %s",
872  LSN_FORMAT_ARGS(checkPoint.redo),
873  wasShutdown ? "true" : "false")));
874  ereport(DEBUG1,
875  (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
876  U64FromFullTransactionId(checkPoint.nextXid),
877  checkPoint.nextOid)));
878  ereport(DEBUG1,
879  (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
880  checkPoint.nextMulti, checkPoint.nextMultiOffset)));
881  ereport(DEBUG1,
882  (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
883  checkPoint.oldestXid, checkPoint.oldestXidDB)));
884  ereport(DEBUG1,
885  (errmsg_internal("oldest MultiXactId: %u, in database %u",
886  checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
887  ereport(DEBUG1,
888  (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
889  checkPoint.oldestCommitTsXid,
890  checkPoint.newestCommitTsXid)));
892  ereport(PANIC,
893  (errmsg("invalid next transaction ID")));
894 
895  /* sanity check */
896  if (checkPoint.redo > CheckPointLoc)
897  ereport(PANIC,
898  (errmsg("invalid redo in checkpoint record")));
899 
900  /*
901  * Check whether we need to force recovery from WAL. If it appears to
902  * have been a clean shutdown and we did not have a recovery signal file,
903  * then assume no recovery needed.
904  */
905  if (checkPoint.redo < CheckPointLoc)
906  {
907  if (wasShutdown)
908  ereport(PANIC,
909  (errmsg("invalid redo record in shutdown checkpoint")));
910  InRecovery = true;
911  }
912  else if (ControlFile->state != DB_SHUTDOWNED)
913  InRecovery = true;
914  else if (ArchiveRecoveryRequested)
915  {
916  /* force recovery due to presence of recovery signal file */
917  InRecovery = true;
918  }
919 
920  /*
921  * If recovery is needed, update our in-memory copy of pg_control to show
922  * that we are recovering and to show the selected checkpoint as the place
923  * we are starting from. We also mark pg_control with any minimum recovery
924  * stop point obtained from a backup history file.
925  *
926  * We don't write the changes to disk yet, though. Only do that after
927  * initializing various subsystems.
928  */
929  if (InRecovery)
930  {
931  if (InArchiveRecovery)
932  {
934  }
935  else
936  {
937  ereport(LOG,
938  (errmsg("database system was not properly shut down; "
939  "automatic recovery in progress")));
941  ereport(LOG,
942  (errmsg("crash recovery starts in timeline %u "
943  "and has target timeline %u",
947  }
949  ControlFile->checkPointCopy = checkPoint;
950  if (InArchiveRecovery)
951  {
952  /* initialize minRecoveryPoint if not set yet */
953  if (ControlFile->minRecoveryPoint < checkPoint.redo)
954  {
955  ControlFile->minRecoveryPoint = checkPoint.redo;
957  }
958  }
959 
960  /*
961  * Set backupStartPoint if we're starting recovery from a base backup.
962  *
963  * Also set backupEndPoint and use minRecoveryPoint as the backup end
964  * location if we're starting recovery from a base backup which was
965  * taken from a standby. In this case, the database system status in
966  * pg_control must indicate that the database was already in recovery.
967  * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
968  * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
969  * before reaching this point; e.g. because restore_command or
970  * primary_conninfo were faulty.
971  *
972  * Any other state indicates that the backup somehow became corrupted
973  * and we can't sensibly continue with recovery.
974  */
975  if (haveBackupLabel)
976  {
977  ControlFile->backupStartPoint = checkPoint.redo;
979 
980  if (backupFromStandby)
981  {
982  if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
983  dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
984  ereport(FATAL,
985  (errmsg("backup_label contains data inconsistent with control file"),
986  errhint("This means that the backup is corrupted and you will "
987  "have to use another backup for recovery.")));
989  }
990  }
991  }
992 
993  /* remember these, so that we know when we have reached consistency */
997  if (InArchiveRecovery)
998  {
1001  }
1002  else
1003  {
1005  minRecoveryPointTLI = 0;
1006  }
1007 
1008  /*
1009  * Start recovery assuming that the final record isn't lost.
1010  */
1013 
1014  *wasShutdown_ptr = wasShutdown;
1015  *haveBackupLabel_ptr = haveBackupLabel;
1016  *haveTblspcMap_ptr = haveTblspcMap;
1017 }
1018 
1019 /*
1020  * See if there are any recovery signal files and if so, set state for
1021  * recovery.
1022  *
1023  * See if there is a recovery command file (recovery.conf), and if so
1024  * report a FATAL error, since as of PG12 we no longer recognize that.
      *
      * StandbyModeRequested and ArchiveRecoveryRequested are first reset to
      * false and then set by the branches at the end of the function; when
      * neither branch applies the function returns with both still false.
1025  */
1026 static void
1028 {
1029  struct stat stat_buf;
1030 
1032  return;
1033 
1034  /*
1035  * Check for old recovery API file: recovery.conf
1036  */
1037  if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
1038  ereport(FATAL,
1040  errmsg("using recovery command file \"%s\" is not supported",
1042 
1043  /*
1044  * Remove unused .done file, if present. Ignore if absent.
1045  */
1046  unlink(RECOVERY_COMMAND_DONE);
1047 
1048  /*
1049  * Check for recovery signal files and if found, fsync them since they
1050  * represent server state information. We don't sweat too much about the
1051  * possibility of fsync failure, however.
1052  *
1053  * If present, standby signal file takes precedence. If neither is present
1054  * then we won't enter archive recovery.
1055  */
1056  if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1057  {
1058  int fd;
1059 
1061  S_IRUSR | S_IWUSR);
      /* a failed open is tolerated: the fsync is simply skipped */
1062  if (fd >= 0)
1063  {
1064  (void) pg_fsync(fd);
1065  close(fd);
1066  }
1068  }
1069  else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1070  {
1071  int fd;
1072 
1074  S_IRUSR | S_IWUSR);
      /* same best-effort fsync as for the standby signal file */
1075  if (fd >= 0)
1076  {
1077  (void) pg_fsync(fd);
1078  close(fd);
1079  }
1081  }
1082 
      /* Start from "nothing requested"; the branches below set the flags. */
1083  StandbyModeRequested = false;
1084  ArchiveRecoveryRequested = false;
1086  {
1087  StandbyModeRequested = true;
1088  ArchiveRecoveryRequested = true;
1089  }
1090  else if (recovery_signal_file_found)
1091  {
1092  StandbyModeRequested = false;
1093  ArchiveRecoveryRequested = true;
1094  }
1095  else
1096  return;
1097 
1098  /*
1099  * We don't support standby mode in standalone backends; that requires
1100  * other processes such as the WAL receiver to be alive.
1101  */
1103  ereport(FATAL,
1104  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1105  errmsg("standby mode is not supported by single-user servers")));
1106 }
1107 
/*
 * Check recovery-related settings and finish computing values derived from
 * them.
 *
 * In the first branch below, a WARNING is emitted when neither
 * primary_conninfo nor restore_command is set; in the other branch (per its
 * message, when standby mode is not enabled) a missing restore_command is
 * FATAL. The function then finishes parsing recovery_target_time and settles
 * the recovery target timeline; a requested timeline other than 1 that has
 * no history file is FATAL.
 */
1108 static void
1110 {
1112  return;
1113 
1114  /*
1115  * Check for compulsory parameters
1116  */
1118  {
      /* NULL and the empty string are both treated as "not set" */
1119  if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1120  (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1121  ereport(WARNING,
1122  (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1123  errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1124  }
1125  else
1126  {
      /* here a missing restore_command is fatal rather than a warning */
1127  if (recoveryRestoreCommand == NULL ||
1128  strcmp(recoveryRestoreCommand, "") == 0)
1129  ereport(FATAL,
1130  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1131  errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1132  }
1133 
1134  /*
1135  * Override any inconsistent requests. Note that this is a change of
1136  * behaviour in 9.5; prior to this we simply ignored a request to pause if
1137  * hot_standby = off, which was surprising behaviour.
1138  */
1142 
1143  /*
1144  * Final parsing of recovery_target_time string; see also
1145  * check_recovery_target_time().
1146  */
1148  {
1152  Int32GetDatum(-1)));
1153  }
1154 
1155  /*
1156  * If user specified recovery_target_timeline, validate it or compute the
1157  * "latest" value. We can't do this until after we've gotten the restore
1158  * command and set InArchiveRecovery, because we need to fetch timeline
1159  * history files from the archive.
1160  */
1162  {
1164 
1165  /* Timeline 1 does not have a history file, all else should */
1166  if (rtli != 1 && !existsTimeLineHistory(rtli))
1167  ereport(FATAL,
1168  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1169  errmsg("recovery target timeline %u does not exist",
1170  rtli)));
1171  recoveryTargetTLI = rtli;
1172  }
1174  {
1175  /* We start the "latest" search from pg_control's timeline */
1177  }
1178  else
1179  {
1180  /*
1181  * else we just use the recoveryTargetTLI as already read from
1182  * ControlFile
1183  */
1185  }
1186 }
1187 
1188 /*
1189  * read_backup_label: check to see if a backup_label file is present
1190  *
1191  * If we see a backup_label during recovery, we assume that we are recovering
1192  * from a backup dump file, and we therefore roll forward from the checkpoint
1193  * identified by the label file, NOT what pg_control says. This avoids the
1194  * problem that pg_control might have been archived one or more checkpoints
1195  * later than the start of the dump, and so if we rely on it as the start
1196  * point, we will fail to restore a consistent database state.
1197  *
1198  * Returns true if a backup_label was found (and fills the checkpoint
1199  * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1200  * returns false if not. If this backup_label came from a streamed backup,
1201  * *backupEndRequired is set to true. If this backup_label was created during
1202  * recovery, *backupFromStandby is set to true.
1203  *
1204  * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1205  * and TLI read from the backup file.
      *
      * Only the START WAL LOCATION and CHECKPOINT LOCATION lines are mandatory;
      * failing to parse either one is FATAL. The remaining fields are optional,
      * but a START TIMELINE that disagrees with the TLI in the WAL file name, or
      * the presence of an INCREMENTAL FROM LSN line, is also FATAL. All output
      * parameters are given default values before the file is opened, so they
      * are defined even when false is returned.
1206  */
1207 static bool
1208 read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1209  bool *backupEndRequired, bool *backupFromStandby)
1210 {
1211  char startxlogfilename[MAXFNAMELEN];
1212  TimeLineID tli_from_walseg,
1213  tli_from_file;
1214  FILE *lfp;
1215  char ch;
1216  char backuptype[20];
1217  char backupfrom[20];
1218  char backuplabel[MAXPGPATH];
1219  char backuptime[128];
1220  uint32 hi,
1221  lo;
1222 
1223  /* suppress possible uninitialized-variable warnings */
1224  *checkPointLoc = InvalidXLogRecPtr;
1225  *backupLabelTLI = 0;
1226  *backupEndRequired = false;
1227  *backupFromStandby = false;
1228 
1229  /*
1230  * See if label file is present
1231  */
1232  lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1233  if (!lfp)
1234  {
      /* only a missing file is benign; any other open failure is FATAL */
1235  if (errno != ENOENT)
1236  ereport(FATAL,
1238  errmsg("could not read file \"%s\": %m",
1239  BACKUP_LABEL_FILE)));
1240  return false; /* it's not there, all is fine */
1241  }
1242 
1243  /*
1244  * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1245  * is pretty crude, but we are not expecting any variability in the file
1246  * format).
      *
      * The trailing %c captures the character after each line's payload so
      * that we can insist on it being a newline.
1247  */
1248  if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
1249  &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1250  ereport(FATAL,
1251  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1252  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1253  RedoStartLSN = ((uint64) hi) << 32 | lo;
1254  RedoStartTLI = tli_from_walseg;
1255  if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
1256  &hi, &lo, &ch) != 3 || ch != '\n')
1257  ereport(FATAL,
1258  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1259  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1260  *checkPointLoc = ((uint64) hi) << 32 | lo;
      /* the TLI is the one embedded in the WAL file name parsed above */
1261  *backupLabelTLI = tli_from_walseg;
1262 
1263  /*
1264  * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1265  * which could mean either pg_basebackup or the pg_backup_start/stop
1266  * method was used) or if this label came from somewhere else (the only
1267  * other option today being from pg_rewind). If this was a streamed
1268  * backup then we know that we need to play through until we get to the
1269  * end of the WAL which was generated during the backup (at which point we
1270  * will have reached consistency and backupEndRequired will be reset to be
1271  * false).
1272  */
1273  if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1274  {
1275  if (strcmp(backuptype, "streamed") == 0)
1276  *backupEndRequired = true;
1277  }
1278 
1279  /*
1280  * BACKUP FROM lets us know if this was from a primary or a standby. If
1281  * it was from a standby, we'll double-check that the control file state
1282  * matches that of a standby.
1283  */
1284  if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1285  {
1286  if (strcmp(backupfrom, "standby") == 0)
1287  *backupFromStandby = true;
1288  }
1289 
1290  /*
1291  * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1292  * but checking for their presence is useful for debugging and the next
1293  * sanity checks. Cope also with the fact that the result buffers have a
1294  * pre-allocated size, hence if the backup_label file has been generated
1295  * with strings longer than the maximum assumed here an incorrect parsing
1296  * happens. That's fine as only minor consistency checks are done
1297  * afterwards.
1298  */
1299  if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1300  ereport(DEBUG1,
1301  (errmsg_internal("backup time %s in file \"%s\"",
1302  backuptime, BACKUP_LABEL_FILE)));
1303 
1304  if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1305  ereport(DEBUG1,
1306  (errmsg_internal("backup label %s in file \"%s\"",
1307  backuplabel, BACKUP_LABEL_FILE)));
1308 
1309  /*
1310  * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1311  * it as a sanity check if present.
1312  */
1313  if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1314  {
1315  if (tli_from_walseg != tli_from_file)
1316  ereport(FATAL,
1317  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1318  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1319  errdetail("Timeline ID parsed is %u, but expected %u.",
1320  tli_from_file, tli_from_walseg)));
1321 
1322  ereport(DEBUG1,
1323  (errmsg_internal("backup timeline %u in file \"%s\"",
1324  tli_from_file, BACKUP_LABEL_FILE)));
1325  }
1326 
      /*
       * An INCREMENTAL FROM LSN line identifies an incremental backup. Even a
       * partial match (result > 0) is enough to refuse to start from it; the
       * parsed LSN itself is not used.
       */
1327  if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%X\n", &hi, &lo) > 0)
1328  ereport(FATAL,
1329  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1330  errmsg("this is an incremental backup, not a data directory"),
1331  errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1332 
      /* report a read error seen by any fscanf above, or a failure to close */
1333  if (ferror(lfp) || FreeFile(lfp))
1334  ereport(FATAL,
1336  errmsg("could not read file \"%s\": %m",
1337  BACKUP_LABEL_FILE)));
1338 
1339  return true;
1340 }
1341 
1342 /*
1343  * read_tablespace_map: check to see if a tablespace_map file is present
1344  *
1345  * If we see a tablespace_map file during recovery, we assume that we are
1346  * recovering from a backup dump file, and we therefore need to create symlinks
1347  * as per the information present in tablespace_map file.
1348  *
1349  * Returns true if a tablespace_map file was found (and fills *tablespaces
1350  * with a tablespaceinfo struct for each tablespace listed in the file);
1351  * returns false if not.
      *
      * Each entry is appended to *tablespaces with a newly allocated
      * tablespaceinfo and a pstrdup'd path. Malformed content, including a
      * final line that is not newline-terminated, is reported as FATAL.
1352  */
1353 static bool
1355 {
1356  tablespaceinfo *ti;
1357  FILE *lfp;
1358  char str[MAXPGPATH];
1359  int ch,
1360  i,
1361  n;
1362  bool was_backslash;
1363 
1364  /*
1365  * See if tablespace_map file is present
1366  */
1367  lfp = AllocateFile(TABLESPACE_MAP, "r");
1368  if (!lfp)
1369  {
      /* only a missing file is benign; any other open failure is FATAL */
1370  if (errno != ENOENT)
1371  ereport(FATAL,
1373  errmsg("could not read file \"%s\": %m",
1374  TABLESPACE_MAP)));
1375  return false; /* it's not there, all is fine */
1376  }
1377 
1378  /*
1379  * Read and parse the link name and path lines from tablespace_map file
1380  * (this code is pretty crude, but we are not expecting any variability in
1381  * the file format). De-escape any backslashes that were inserted.
      *
      * 'i' is the number of de-escaped characters collected in 'str' for the
      * current line; 'was_backslash' is true when the previous character was
      * an unescaped backslash, so the next character is taken literally.
1382  */
1383  i = 0;
1384  was_backslash = false;
1385  while ((ch = fgetc(lfp)) != EOF)
1386  {
      /* an unescaped CR or LF ends the line; an escaped one is data */
1387  if (!was_backslash && (ch == '\n' || ch == '\r'))
1388  {
1389  char *endp;
1390 
1391  if (i == 0)
1392  continue; /* \r immediately followed by \n */
1393 
1394  /*
1395  * The de-escaped line should contain an OID followed by exactly
1396  * one space followed by a path. The path might start with
1397  * spaces, so don't be too liberal about parsing.
1398  */
1399  str[i] = '\0';
1400  n = 0;
1401  while (str[n] && str[n] != ' ')
1402  n++;
      /*
       * n is now the length of the OID text. Reject an empty OID, a
       * line without a space, and a line with nothing after the space.
       */
1403  if (n < 1 || n >= i - 1)
1404  ereport(FATAL,
1405  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1406  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
      /* split in place: terminate the OID text, n then indexes the path */
1407  str[n++] = '\0';
1408 
1409  ti = palloc0(sizeof(tablespaceinfo));
1410  errno = 0;
1411  ti->oid = strtoul(str, &endp, 10);
      /* the whole OID field must be consumed and be in range */
1412  if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1413  ereport(FATAL,
1414  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1415  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1416  ti->path = pstrdup(str + n);
1417  *tablespaces = lappend(*tablespaces, ti);
1418 
1419  i = 0;
1420  continue;
1421  }
1422  else if (!was_backslash && ch == '\\')
1423  was_backslash = true;
1424  else
1425  {
      /* characters beyond the buffer's capacity are silently dropped */
1426  if (i < sizeof(str) - 1)
1427  str[i++] = ch;
1428  was_backslash = false;
1429  }
1430  }
1431 
1432  if (i != 0 || was_backslash) /* last line not terminated? */
1433  ereport(FATAL,
1434  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1435  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1436 
      /* distinguish a read error from a normal EOF, and check the close */
1437  if (ferror(lfp) || FreeFile(lfp))
1438  ereport(FATAL,
1440  errmsg("could not read file \"%s\": %m",
1441  TABLESPACE_MAP)));
1442 
1443  return true;
1444 }
1445 
1446 /*
1447  * Finish WAL recovery.
1448  *
1449  * This does not close the 'xlogreader' yet, because in some cases the caller
1450  * still wants to re-read the last checkpoint record by calling
1451  * ReadCheckpointRecord().
1452  *
1453  * Returns the position of the last valid or applied record, after which new
1454  * WAL should be appended, information about why recovery was ended, and some
1455  * other things. See the EndOfWalRecoveryInfo struct for details.
1456  */
1459 {
1461  XLogRecPtr lastRec;
1462  TimeLineID lastRecTLI;
1463  XLogRecPtr endOfLog;
1464 
1465  /*
1466  * Kill WAL receiver, if it's still running, before we continue to write
1467  * the startup checkpoint and aborted-contrecord records. It will trump
1468  * over these records and subsequent ones if it's still alive when we
1469  * start writing WAL.
1470  */
1472 
1473  /*
1474  * Shutdown the slot sync worker to drop any temporary slots acquired by
1475  * it and to prevent it from continuing to try to fetch the failover slots.
1476  *
1477  * We do not update the 'synced' column in 'pg_replication_slots' system
1478  * view from true to false here, as any failed update could leave 'synced'
1479  * column false for some slots. This could cause issues during slot sync
1480  * after restarting the server as a standby. While updating the 'synced'
1481  * column after switching to the new timeline is an option, it does not
1482  * simplify the handling for the 'synced' column. Therefore, we retain the
1483  * 'synced' column as true after promotion as it may provide useful
1484  * information about the slot origin.
1485  */
1486  ShutDownSlotSync();
1487 
1488  /*
1489  * We are now done reading the xlog from stream. Turn off streaming
1490  * recovery to force fetching the files (which would be required at end of
1491  * recovery, e.g., timeline history file) from archive or pg_wal.
1492  *
1493  * Note that standby mode must be turned off after killing WAL receiver,
1494  * i.e., calling XLogShutdownWalRcv().
1495  */
1496  Assert(!WalRcvStreaming());
1497  StandbyMode = false;
1498 
1499  /*
1500  * Determine where to start writing WAL next.
1501  *
1502  * Re-fetch the last valid or last applied record, so we can identify the
1503  * exact endpoint of what we consider the valid portion of WAL. There may
1504  * be an incomplete continuation record after that, in which case
1505  * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1506  * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1507  * it is intentionally missing. See CreateOverwriteContrecordRecord().
1508  *
1509  * An important side-effect of this is to load the last page into
1510  * xlogreader. The caller uses it to initialize the WAL for writing.
1511  */
1512  if (!InRecovery)
1513  {
      /* nothing was replayed: the checkpoint record is the last record */
1514  lastRec = CheckPointLoc;
1515  lastRecTLI = CheckPointTLI;
1516  }
1517  else
1518  {
1520  lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1521  }
      /* the record itself is not needed, only the reader state it leaves */
1523  (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1524  endOfLog = xlogreader->EndRecPtr;
1525 
1526  /*
1527  * Remember the TLI in the filename of the XLOG segment containing the
1528  * end-of-log. It could be different from the timeline that endOfLog
1529  * nominally belongs to, if there was a timeline switch in that segment,
1530  * and we were reading the old WAL from a segment belonging to a higher
1531  * timeline.
1532  */
1533  result->endOfLogTLI = xlogreader->seg.ws_tli;
1534 
1536  {
1537  /*
1538  * We are no longer in archive recovery state.
1539  *
1540  * We are now done reading the old WAL. Turn off archive fetching if
1541  * it was active.
1542  */
1544  InArchiveRecovery = false;
1545 
1546  /*
1547  * If the ending log segment is still open, close it (to avoid
1548  * problems on Windows with trying to rename or delete an open file).
1549  */
1550  if (readFile >= 0)
1551  {
1552  close(readFile);
1553  readFile = -1;
1554  }
1555  }
1556 
1557  /*
1558  * Copy the last partial block to the caller, for initializing the WAL
1559  * buffer for appending new WAL.
1560  */
1561  if (endOfLog % XLOG_BLCKSZ != 0)
1562  {
1563  char *page;
1564  int len;
1565  XLogRecPtr pageBeginPtr;
1566 
1567  pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1568  Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
1569 
1570  /* Copy the valid part of the last block */
1571  len = endOfLog % XLOG_BLCKSZ;
      /*
       * Only 'len' bytes are allocated and copied, not a whole block; the
       * copy is handed to the caller through result->lastPage.
       */
1572  page = palloc(len);
1573  memcpy(page, xlogreader->readBuf, len);
1574 
1575  result->lastPageBeginPtr = pageBeginPtr;
1576  result->lastPage = page;
1577  }
1578  else
1579  {
1580  /* There is no partial block to copy. */
1581  result->lastPageBeginPtr = endOfLog;
1582  result->lastPage = NULL;
1583  }
1584 
1585  /*
1586  * Create a comment for the history file to explain why and where timeline
1587  * changed.
1588  */
1590 
1591  result->lastRec = lastRec;
1592  result->lastRecTLI = lastRecTLI;
1593  result->endOfLog = endOfLog;
1594 
1595  result->abortedRecPtr = abortedRecPtr;
1597 
1600 
1601  return result;
1602 }
1603 
1604 /*
1605  * Clean up the WAL reader and leftovers from restoring WAL from archive
      *
      * Closes the currently open WAL segment file, if any, and, inside the
      * conditional block below, removes the RECOVERYXLOG and RECOVERYHISTORY
      * files from XLOGDIR. Failures of those unlink() calls are ignored.
1606  */
1607 void
1609 {
1610  char recoveryPath[MAXPGPATH];
1611 
1612  /* Final update of pg_stat_recovery_prefetch. */
1614 
1615  /* Shut down xlogreader */
1616  if (readFile >= 0)
1617  {
1618  close(readFile);
      /* mark the descriptor as closed so it is not closed twice */
1619  readFile = -1;
1620  }
1623 
1625  {
1626  /*
1627  * Since there might be a partial WAL segment named RECOVERYXLOG, get
1628  * rid of it.
1629  */
1630  snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1631  unlink(recoveryPath); /* ignore any error */
1632 
1633  /* Get rid of any remaining recovered timeline-history file, too */
1634  snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1635  unlink(recoveryPath); /* ignore any error */
1636  }
1637 
1638  /*
1639  * We don't need the latch anymore. It's not strictly necessary to disown
1640  * it, but let's do it for the sake of tidiness.
1641  */
1644 }
1645 
1646 /*
1647  * Perform WAL recovery.
1648  *
1649  * If the system was shut down cleanly, this is never called.
      *
      * Reads the first record to replay and, if there is one, applies records
      * in a loop until ReadRecord() returns NULL or a recovery target is
      * reached. Reaching a recovery target before reachedConsistency is set
      * is FATAL, as is (per the final check) ending recovery without having
      * reached a configured target.
1650  */
1651 void
1653 {
1654  XLogRecord *record;
1655  bool reachedRecoveryTarget = false;
1656  TimeLineID replayTLI;
1657 
1658  /*
1659  * Initialize shared variables for tracking progress of WAL replay, as if
1660  * we had just replayed the record before the REDO location (or the
1661  * checkpoint record itself, if it's a shutdown checkpoint).
1662  */
1665  {
1669  }
1670  else
1671  {
1675  }
1682 
1683  /* Also ensure XLogReceiptTime has a sane value */
1685 
1686  /*
1687  * Let postmaster know we've started redo now, so that it can launch the
1688  * archiver if necessary.
1689  */
1690  if (IsUnderPostmaster)
1692 
1693  /*
1694  * Allow read-only connections immediately if we're consistent already.
1695  */
1697 
1698  /*
1699  * Find the first record that logically follows the checkpoint --- it
1700  * might physically precede it, though.
1701  */
1703  {
1704  /* back up to find the record */
1705  replayTLI = RedoStartTLI;
      /* PANIC emode: failing to read the redo-point record is not tolerated */
1707  record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1708 
1709  /*
1710  * If a checkpoint record's redo pointer points back to an earlier
1711  * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1712  * record.
1713  */
1714  if (record->xl_rmid != RM_XLOG_ID ||
1715  (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
1716  ereport(FATAL,
1717  (errmsg("unexpected record type found at redo point %X/%X",
1719  }
1720  else
1721  {
1722  /* just have to read next record after CheckPoint */
1724  replayTLI = CheckPointTLI;
      /* LOG emode: a NULL result here just means there is nothing to redo */
1725  record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1726  }
1727 
1728  if (record != NULL)
1729  {
1730  TimestampTz xtime;
1731  PGRUsage ru0;
1732 
      /* start measuring resource usage, reported in "redo done" below */
1733  pg_rusage_init(&ru0);
1734 
1735  InRedo = true;
1736 
1737  RmgrStartup();
1738 
1739  ereport(LOG,
1740  (errmsg("redo starts at %X/%X",
1742 
1743  /* Prepare to report progress of the redo phase. */
1744  if (!StandbyMode)
1746 
1747  /*
1748  * main redo apply loop
1749  */
1750  do
1751  {
1752  if (!StandbyMode)
1753  ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
1755 
1756 #ifdef WAL_DEBUG
1757  if (XLOG_DEBUG)
1758  {
1760 
1761  initStringInfo(&buf);
1762  appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
1765  xlog_outrec(&buf, xlogreader);
1766  appendStringInfoString(&buf, " - ");
1768  elog(LOG, "%s", buf.data);
1769  pfree(buf.data);
1770  }
1771 #endif
1772 
1773  /* Handle interrupt signals of startup process */
1775 
1776  /*
1777  * Pause WAL replay, if requested by a hot-standby session via
1778  * SetRecoveryPause().
1779  *
1780  * Note that we intentionally don't take the info_lck spinlock
1781  * here. We might therefore read a slightly stale value of the
1782  * recoveryPause flag, but it can't be very stale (no worse than
1783  * the last spinlock we did acquire). Since a pause request is a
1784  * pretty asynchronous thing anyway, possibly responding to it one
1785  * WAL record later than we otherwise would is a minor issue, so
1786  * it doesn't seem worth adding another spinlock cycle to prevent
1787  * that.
1788  */
1789  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1791  recoveryPausesHere(false);
1792 
1793  /*
1794  * Have we reached our recovery target?
1795  */
1797  {
1798  reachedRecoveryTarget = true;
1799  break;
1800  }
1801 
1802  /*
1803  * If we've been asked to lag the primary, wait on latch until
1804  * enough time has passed.
1805  */
1807  {
1808  /*
1809  * We test for paused recovery again here. If user sets
1810  * delayed apply, it may be because they expect to pause
1811  * recovery in case of problems, so we must test again here
1812  * otherwise pausing during the delay-wait wouldn't work.
1813  */
1814  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1816  recoveryPausesHere(false);
1817  }
1818 
1819  /*
1820  * Apply the record
      *
      * replayTLI is passed by address because applying the record may
      * switch it to a new timeline.
1821  */
1822  ApplyWalRecord(xlogreader, record, &replayTLI);
1823 
1824  /* Exit loop if we reached inclusive recovery target */
1826  {
1827  reachedRecoveryTarget = true;
1828  break;
1829  }
1830 
1831  /* Else, try to fetch the next WAL record */
1832  record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1833  } while (record != NULL);
1834 
1835  /*
1836  * end of main redo apply loop
1837  */
1838 
1839  if (reachedRecoveryTarget)
1840  {
1841  if (!reachedConsistency)
1842  ereport(FATAL,
1843  (errmsg("requested recovery stop point is before consistent recovery point")));
1844 
1845  /*
1846  * This is the last point where we can restart recovery with a new
1847  * recovery target, if we shut down and begin again. After this,
1848  * Resource Managers may choose to do permanent corrective actions
1849  * at end of recovery.
1850  */
1851  switch (recoveryTargetAction)
1852  {
1854 
1855  /*
1856  * exit with special return code to request shutdown of
1857  * postmaster. Log messages issued from postmaster.
1858  */
1859  proc_exit(3);
1860 
1862  SetRecoveryPause(true);
1863  recoveryPausesHere(true);
1864 
1865  /* drop into promote */
1866 
1868  break;
1869  }
1870  }
1871 
1872  RmgrCleanup();
1873 
1874  ereport(LOG,
1875  (errmsg("redo done at %X/%X system usage: %s",
1877  pg_rusage_show(&ru0))));
1878  xtime = GetLatestXTime();
      /* a zero xtime is treated as "no timestamp available" */
1879  if (xtime)
1880  ereport(LOG,
1881  (errmsg("last completed transaction was at log time %s",
1882  timestamptz_to_str(xtime))));
1883 
1884  InRedo = false;
1885  }
1886  else
1887  {
1888  /* there are no WAL records following the checkpoint */
1889  ereport(LOG,
1890  (errmsg("redo is not required")));
1891  }
1892 
1893  /*
1894  * This check is intentionally after the above log messages that indicate
1895  * how far recovery went.
1896  */
1899  !reachedRecoveryTarget)
1900  ereport(FATAL,
1901  (errcode(ERRCODE_CONFIG_FILE_ERROR),
1902  errmsg("recovery ended before configured recovery target was reached")));
1903 }
1904 
1905 /*
1906  * Subroutine of PerformWalRecovery, to apply one WAL record.
      *
      * *replayTLI is updated in place when an XLOG_CHECKPOINT_SHUTDOWN or
      * XLOG_END_OF_RECOVERY record names a timeline different from the current
      * one. An error context callback is installed while the record is being
      * replayed and popped again afterwards.
1907  */
1908 static void
1910 {
1911  ErrorContextCallback errcallback;
1912  bool switchedTLI = false;
1913 
1914  /* Setup error traceback support for ereport() */
1915  errcallback.callback = rm_redo_error_callback;
1916  errcallback.arg = (void *) xlogreader;
1917  errcallback.previous = error_context_stack;
1918  error_context_stack = &errcallback;
1919 
1920  /*
1921  * TransamVariables->nextXid must be beyond record's xid.
1922  */
1924 
1925  /*
1926  * Before replaying this record, check if this record causes the current
1927  * timeline to change. The record is already considered to be part of the
1928  * new timeline, so we update replayTLI before replaying it. That's
1929  * important so that replayEndTLI, which is recorded as the minimum
1930  * recovery point's TLI if recovery stops after this record, is set
1931  * correctly.
1932  */
1933  if (record->xl_rmid == RM_XLOG_ID)
1934  {
      /* default to "no change" for record types other than the two below */
1935  TimeLineID newReplayTLI = *replayTLI;
1936  TimeLineID prevReplayTLI = *replayTLI;
1937  uint8 info = record->xl_info & ~XLR_INFO_MASK;
1938 
1939  if (info == XLOG_CHECKPOINT_SHUTDOWN)
1940  {
1941  CheckPoint checkPoint;
1942 
      /* copy the payload into a local struct before reading its fields */
1943  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1944  newReplayTLI = checkPoint.ThisTimeLineID;
1945  prevReplayTLI = checkPoint.PrevTimeLineID;
1946  }
1947  else if (info == XLOG_END_OF_RECOVERY)
1948  {
1949  xl_end_of_recovery xlrec;
1950 
1951  memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1952  newReplayTLI = xlrec.ThisTimeLineID;
1953  prevReplayTLI = xlrec.PrevTimeLineID;
1954  }
1955 
1956  if (newReplayTLI != *replayTLI)
1957  {
1958  /* Check that it's OK to switch to this TLI */
1960  newReplayTLI, prevReplayTLI, *replayTLI);
1961 
1962  /* Following WAL records should be run with new TLI */
1963  *replayTLI = newReplayTLI;
1964  switchedTLI = true;
1965  }
1966  }
1967 
1968  /*
1969  * Update shared replayEndRecPtr before replaying this record, so that
1970  * XLogFlush will update minRecoveryPoint correctly.
1971  */
1974  XLogRecoveryCtl->replayEndTLI = *replayTLI;
1976 
1977  /*
1978  * If we are attempting to enter Hot Standby mode, process XIDs we see
1979  */
1981  TransactionIdIsValid(record->xl_xid))
1983 
1984  /*
1985  * Some XLOG record types that are related to recovery are processed
1986  * directly here, rather than in xlog_redo()
      *
      * Note that this runs in addition to, not instead of, the resource
      * manager's own redo routine called just below.
1987  */
1988  if (record->xl_rmid == RM_XLOG_ID)
1989  xlogrecovery_redo(xlogreader, *replayTLI);
1990 
1991  /* Now apply the WAL record itself */
1992  GetRmgr(record->xl_rmid).rm_redo(xlogreader);
1993 
1994  /*
1995  * After redo, check whether the backup pages associated with the WAL
1996  * record are consistent with the existing pages. This check is done only
1997  * if consistency check is enabled for this record.
1998  */
1999  if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
2001 
2002  /* Pop the error context stack */
2003  error_context_stack = errcallback.previous;
2004 
2005  /*
2006  * Update lastReplayedEndRecPtr after this record has been successfully
2007  * replayed.
2008  */
2012  XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
2014 
2015  /* ------
2016  * Wakeup walsenders:
2017  *
2018  * On the standby, the WAL is flushed first (which will only wake up
2019  * physical walsenders) and then applied, which will only wake up logical
2020  * walsenders.
2021  *
2022  * Indeed, logical walsenders on standby can't decode and send data until
2023  * it's been applied.
2024  *
2025  * Physical walsenders don't need to be woken up during replay unless
2026  * cascading replication is allowed and a timeline change occurred (so that
2027  * they can notice that they are on a new timeline).
2028  *
2029  * That's why the wake up conditions are for:
2030  *
2031  * - physical walsenders in case of a new timeline and cascade
2032  * replication is allowed
2033  * - logical walsenders in case cascade replication is allowed (could not
2034  * be created otherwise)
2035  * ------
2036  */
2038  WalSndWakeup(switchedTLI, true);
2039 
2040  /*
2041  * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2042  * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2043  * a reply to the primary.
2044  */
2046  {
      /* clear the request before acting on it, so it fires only once */
2047  doRequestWalReceiverReply = false;
2048  WalRcvForceReply();
2049  }
2050 
2051  /* Allow read-only connections if we're consistent now */
2053 
2054  /* Is this a timeline switch? */
2055  if (switchedTLI)
2056  {
2057  /*
2058  * Before we continue on the new timeline, clean up any (possibly
2059  * bogus) future WAL segments on the old timeline.
2060  */
2062 
2063  /* Reset the prefetcher. */
2065  }
2066 }
2067 
2068 /*
2069  * Some XLOG RM record types that are directly related to WAL recovery are
2070  * handled here rather than in xlog_redo().
      *
      * Two record types are acted upon: XLOG_OVERWRITE_CONTRECORD, whose payload
      * is verified against the reader's overwrittenRecPtr, and XLOG_BACKUP_END,
      * which sets backupEndPoint when it matches the backup we are recovering
      * from. Any other record type falls through without action.
2071  */
2072 static void
2074 {
2075  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2076  XLogRecPtr lsn = record->EndRecPtr;
2077 
      /* callers must only pass records belonging to the XLOG resource manager */
2078  Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2079 
2080  if (info == XLOG_OVERWRITE_CONTRECORD)
2081  {
2082  /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2084 
2085  memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
      /* a mismatch is treated as unrecoverable */
2086  if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2087  elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
2090 
2091  /* We have safely skipped the aborted record */
2094 
2095  ereport(LOG,
2096  (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
2099 
2100  /* Verifying the record should only happen once */
2102  }
2103  else if (info == XLOG_BACKUP_END)
2104  {
2105  XLogRecPtr startpoint;
2106 
      /* the record's payload is the start LSN of the backup it terminates */
2107  memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2108 
2109  if (backupStartPoint == startpoint)
2110  {
2111  /*
2112  * We have reached the end of base backup, the point where
2113  * pg_backup_stop() was done. The data on disk is now consistent
2114  * (assuming we have also reached minRecoveryPoint). Set
2115  * backupEndPoint to the current LSN, so that the next call to
2116  * CheckRecoveryConsistency() will notice it and do the
2117  * end-of-backup processing.
2118  */
2119  elog(DEBUG1, "end of backup record reached");
2120 
2121  backupEndPoint = lsn;
2122  }
2123  else
      /* end of some other backup; keep waiting for ours */
2124  elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X",
2126  }
2127 }
2128 
2129 /*
2130  * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2131  * directories.
2132  *
2133  * Replay of database creation XLOG records for databases that were later
2134  * dropped can create fake directories in pg_tblspc. By the time consistency
2135  * is reached these directories should have been removed; here we verify
2136  * that this did indeed happen. This is to be called at the point where
2137  * consistent state is reached.
2138  *
2139  * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2140  * useful for testing purposes, and also allows for an escape hatch in case
2141  * things go south.
 *
 * Only entries whose names consist solely of decimal digits are examined.
 * Each such entry is reported unless get_dirent_type() classifies it as a
 * symbolic link (PGFILETYPE_LNK).
 *
 * NOTE(review): this copy of the function is incomplete -- the embedded
 * line numbers skip 2144 (presumably the function name and parameter list)
 * and 2161-2162 (the start of the report call, so the severity actually
 * used is not visible here).
2142  */
2143 static void
2145 {
2146  DIR *dir;
2147  struct dirent *de;
2148 
2149  dir = AllocateDir("pg_tblspc");
2150  while ((de = ReadDir(dir, "pg_tblspc")) != NULL)
2151  {
      /* The extra 10 bytes match the length of the "pg_tblspc/" prefix */
2152  char path[MAXPGPATH + 10];
2153 
2154  /* Skip entries of non-oid names */
2155  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2156  continue;
2157 
2158  snprintf(path, sizeof(path), "pg_tblspc/%s", de->d_name);
2159 
      /* Anything that is not a symbolic link is unexpected here */
2160  if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2163  errmsg("unexpected directory entry \"%s\" found in %s",
2164  de->d_name, "pg_tblspc/"),
2165  errdetail("All directory entries in pg_tblspc/ should be symbolic links."),
2166  errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2167  }
2168 }
2169 
2170 /*
2171  * Checks if recovery has reached a consistent state. When consistency is
2172  * reached and we have a valid starting standby snapshot, tell postmaster
2173  * that it can start accepting read-only connections.
 *
 * The function performs three steps in order, each guarded by its own
 * condition: end-of-backup processing, marking the consistent state
 * (reachedConsistency), and enabling hot standby (LocalHotStandbyActive).
 * All comparisons use the last replayed end pointer read from
 * XLogRecoveryCtl at entry.
 *
 * NOTE(review): this copy of the function is incomplete -- the embedded
 * line numbers skip 2176 (presumably the function name and parameter list),
 * 2185, 2188, 2200, 2213-2214, 2229, 2236, 2244, 2257-2260, 2262-2264 and
 * 2268.  Several of the guarding conditions are among the missing lines.
2174  */
2175 static void
2177 {
2178  XLogRecPtr lastReplayedEndRecPtr;
2179  TimeLineID lastReplayedTLI;
2180 
2181  /*
2182  * During crash recovery, we don't reach a consistent state until we've
2183  * replayed all the WAL.
2184  */
2186  return;
2187 
2189 
2190  /*
2191  * assume that we are called in the startup process, and hence don't need
2192  * a lock to read lastReplayedEndRecPtr
2193  */
2194  lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2195  lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2196 
2197  /*
2198  * Have we reached the point where our base backup was completed?
2199  */
2201  backupEndPoint <= lastReplayedEndRecPtr)
2202  {
      /* Keep copies of both points; they are used in the log message below */
2203  XLogRecPtr saveBackupStartPoint = backupStartPoint;
2204  XLogRecPtr saveBackupEndPoint = backupEndPoint;
2205 
2206  elog(DEBUG1, "end of backup reached");
2207 
2208  /*
2209  * We have reached the end of base backup, as indicated by pg_control.
2210  * Update the control file accordingly.
2211  */
2212  ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2215  backupEndRequired = false;
2216 
2217  ereport(LOG,
2218  (errmsg("completed backup recovery with redo LSN %X/%X and end LSN %X/%X",
2219  LSN_FORMAT_ARGS(saveBackupStartPoint),
2220  LSN_FORMAT_ARGS(saveBackupEndPoint))));
2221  }
2222 
2223  /*
2224  * Have we passed our safe starting point? Note that minRecoveryPoint is
2225  * known to be incorrectly set if recovering from a backup, until the
2226  * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2227  * All we know prior to that is that we're not consistent yet.
2228  */
2230  minRecoveryPoint <= lastReplayedEndRecPtr)
2231  {
2232  /*
2233  * Check to see if the XLOG sequence contained any unresolved
2234  * references to uninitialized pages.
2235  */
2237 
2238  /*
2239  * Check that pg_tblspc doesn't contain any real directories. Replay
2240  * of Database/CREATE_* records may have created fictitious tablespace
2241  * directories that should have been removed by the time consistency
2242  * was reached.
2243  */
2245 
2246  reachedConsistency = true;
2247  ereport(LOG,
2248  (errmsg("consistent recovery state reached at %X/%X",
2249  LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
2250  }
2251 
2252  /*
2253  * Have we got a valid starting snapshot that will allow queries to be
2254  * run? If so, we can tell postmaster that the database is consistent now,
2255  * enabling connections.
2256  */
2261  {
2265 
2266  LocalHotStandbyActive = true;
2267 
2269  }
2270 }
2271 
2272 /*
2273  * Error context callback for errors occurring during rm_redo().
 *
 * 'arg' is treated as the XLogReaderState of the record being replayed.
 * The record description and its block references are formatted into a
 * temporary string, which is attached to the error via errcontext() and
 * then freed.
 *
 * NOTE(review): this copy of the function is incomplete -- the embedded
 * line numbers skip 2276 (presumably the function name and parameter list)
 * and 2279 (presumably the declaration of 'buf').
2274  */
2275 static void
2277 {
2278  XLogReaderState *record = (XLogReaderState *) arg;
2280 
2281  initStringInfo(&buf);
2282  xlog_outdesc(&buf, record);
2283  xlog_block_info(&buf, record);
2284 
2285  /* translator: %s is a WAL record description */
2286  errcontext("WAL redo at %X/%X for %s",
2287  LSN_FORMAT_ARGS(record->ReadRecPtr),
2288  buf.data);
2289 
      /* errcontext() has been given the text; release the scratch buffer */
2290  pfree(buf.data);
2291 }
2292 
2293 /*
2294  * Appends to 'buf' a string describing an XLogRecord.  Nothing is returned;
2295  * the text is accumulated in the caller's buffer.
 *
 * The visible code appends a '/' separator, then the record's identity as
 * reported by the resource manager's rm_identify() followed by ": " (or
 * "UNKNOWN (<hex info>): " when rm_identify() returns NULL), and finally
 * whatever the resource manager's rm_desc() appends.
 *
 * NOTE(review): this copy of the function is incomplete -- the embedded
 * line numbers skip 2298 (presumably the function name and parameter list)
 * and 2304 (a statement preceding the '/' separator).
2296  */
2297 void
2299 {
2300  RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2301  uint8 info = XLogRecGetInfo(record);
2302  const char *id;
2303 
2305  appendStringInfoChar(buf, '/');
2306 
2307  id = rmgr.rm_identify(info);
2308  if (id == NULL)
      /* Unknown to the rmgr: print the info bits without XLR_INFO_MASK */
2309  appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2310  else
2311  appendStringInfo(buf, "%s: ", id);
2312 
2313  rmgr.rm_desc(buf, record);
2314 }
2315 
2316 #ifdef WAL_DEBUG
2317 
/*
 * Appends a short summary of a WAL record to 'buf': per the format strings,
 * the previous-record pointer, the xid, the data length, and then the block
 * references as formatted by xlog_block_info().  Compiled only when
 * WAL_DEBUG is defined.
 *
 * NOTE(review): the embedded line numbers skip 2322, which presumably holds
 * the argument supplying the "prev %X/%X" value.
 */
2318 static void
2319 xlog_outrec(StringInfo buf, XLogReaderState *record)
2320 {
2321  appendStringInfo(buf, "prev %X/%X; xid %u",
2323  XLogRecGetXid(record));
2324 
2325  appendStringInfo(buf, "; len %u",
2326  XLogRecGetDataLen(record));
2327 
2328  xlog_block_info(buf, record);
2329 }
2330 #endif /* WAL_DEBUG */
2331 
2332 /*
2333  * Appends to 'buf' information about all the blocks referenced by an
2334  * XLogRecord.  Nothing is returned.
 *
 * For every block id from 0 through XLogRecMaxBlockId(record) that has a
 * block reference, one "; blkref #N: rel spc/db/rel, ..., blk B" fragment is
 * appended.  The fork number is printed only when it is not MAIN_FORKNUM,
 * and " FPW" is added when the block carries an image.
 *
 * NOTE(review): the embedded line numbers skip 2337, presumably the
 * function name and parameter list.
2335  */
2336 static void
2338 {
2339  int block_id;
2340 
2341  /* decode block references */
2342  for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2343  {
2344  RelFileLocator rlocator;
2345  ForkNumber forknum;
2346  BlockNumber blk;
2347 
      /* Block ids without a reference in this record are skipped */
2348  if (!XLogRecGetBlockTagExtended(record, block_id,
2349  &rlocator, &forknum, &blk, NULL))
2350  continue;
2351 
2352  if (forknum != MAIN_FORKNUM)
2353  appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2354  block_id,
2355  rlocator.spcOid, rlocator.dbOid,
2356  rlocator.relNumber,
2357  forknum,
2358  blk);
2359  else
2360  appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2361  block_id,
2362  rlocator.spcOid, rlocator.dbOid,
2363  rlocator.relNumber,
2364  blk);
2365  if (XLogRecHasBlockImage(record, block_id))
2366  appendStringInfoString(buf, " FPW");
2367  }
2368 }
2369 
2370 
2371 /*
2372  * Check that it's OK to switch to new timeline during recovery.
2373  *
2374  * 'lsn' is the address of the shutdown checkpoint record we're about to
2375  * replay. (Currently, timeline can only change at a shutdown checkpoint).
 *
 * 'prevTLI' and 'newTLI' are compared against 'replayTLI', the timeline
 * currently being replayed.  Every failed check is reported at PANIC
 * level; if all checks pass the function simply returns.
 *
 * NOTE(review): this copy of the function is incomplete -- the embedded
 * line numbers skip 2378 (presumably the function name and the first
 * parameters), 2405 (the first part of the last condition) and 2411-2412
 * (the remaining message arguments).
2376  */
2377 static void
2379  TimeLineID replayTLI)
2380 {
2381  /* Check that the record agrees on what the current (old) timeline is */
2382  if (prevTLI != replayTLI)
2383  ereport(PANIC,
2384  (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2385  prevTLI, replayTLI)));
2386 
2387  /*
2388  * The new timeline better be in the list of timelines we expect to see,
2389  * according to the timeline history. It should also not decrease.
2390  */
2391  if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2392  ereport(PANIC,
2393  (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2394  newTLI, replayTLI)));
2395 
2396  /*
2397  * If we have not yet reached min recovery point, and we're about to
2398  * switch to a timeline greater than the timeline of the min recovery
2399  * point: trouble. After switching to the new timeline, we could not
2400  * possibly visit the min recovery point on the correct timeline anymore.
2401  * This can happen if there is a newer timeline in the archive that
2402  * branched before the timeline the min recovery point is on, and you
2403  * attempt to do PITR to the new timeline.
2404  */
2406  lsn < minRecoveryPoint &&
2407  newTLI > minRecoveryPointTLI)
2408  ereport(PANIC,
2409  (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
2410  newTLI,
2413 
2414  /* Looks good */
2415 }
2416 
2417 
2418 /*
2419  * Extract timestamp from WAL record.
2420  *
2421  * If the record contains a timestamp, returns true, and saves the timestamp
2422  * in *recordXtime. If the record type has no timestamp, returns false.
2423  * Currently, only transaction commit/abort records and restore points contain
2424  * timestamps.
 *
 * The recognized cases are: RM_XLOG_ID with XLOG_RESTORE_POINT (rp_time),
 * RM_XACT_ID with XLOG_XACT_COMMIT or XLOG_XACT_COMMIT_PREPARED
 * (xl_xact_commit.xact_time), and RM_XACT_ID with XLOG_XACT_ABORT or
 * XLOG_XACT_ABORT_PREPARED (xl_xact_abort.xact_time).  *recordXtime is left
 * untouched when false is returned.
 *
 * NOTE(review): the embedded line numbers skip 2427, presumably the
 * function name and parameter list.
2425  */
2426 static bool
2428 {
2429  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
      /* Computed for every record, but only consulted when rmid is RM_XACT_ID */
2430  uint8 xact_info = info & XLOG_XACT_OPMASK;
2431  uint8 rmid = XLogRecGetRmid(record);
2432 
2433  if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2434  {
2435  *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2436  return true;
2437  }
2438  if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2439  xact_info == XLOG_XACT_COMMIT_PREPARED))
2440  {
2441  *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2442  return true;
2443  }
2444  if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2445  xact_info == XLOG_XACT_ABORT_PREPARED))
2446  {
2447  *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2448  return true;
2449  }
2450  return false;
2451 }
2452 
2453 /*
2454  * Checks whether the current buffer page and backup page stored in the
2455  * WAL record are consistent or not. Before comparing the two pages, a
2456  * masking can be applied to the pages to ignore certain areas like hint bits,
2457  * unused space between pd_lower and pd_upper among other things. This
2458  * function should be called once WAL replay has been completed for a
2459  * given record.
 *
 * Nothing is returned.  A page whose masked contents differ from the image
 * in the record is reported at FATAL level; failure to restore the image
 * from the record is reported at ERROR level.  Blocks are skipped when the
 * record has no reference for the block id, when the image was already
 * applied during replay, when no valid buffer is obtained, or when the page
 * LSN is already beyond the record's end.
 *
 * replay_image_masked and primary_image_masked are not declared in this
 * function; presumably they are file-level scratch buffers of at least
 * BLCKSZ bytes -- TODO confirm.
 *
 * NOTE(review): this copy of the function is incomplete -- the embedded
 * line numbers skip 2462 (presumably the function name and parameter list),
 * 2508 (an argument of XLogReadBufferExtended), 2513 and 2523.
2460  */
2461 static void
2463 {
2464  RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2465  RelFileLocator rlocator;
2466  ForkNumber forknum;
2467  BlockNumber blkno;
2468  int block_id;
2469 
2470  /* Records with no block references have no need for consistency checks. */
2471  if (!XLogRecHasAnyBlockRefs(record))
2472  return;
2473 
      /* Callers are expected to pass only records flagged for checking */
2474  Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
2475 
2476  for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2477  {
2478  Buffer buf;
2479  Page page;
2480 
2481  if (!XLogRecGetBlockTagExtended(record, block_id,
2482  &rlocator, &forknum, &blkno, NULL))
2483  {
2484  /*
2485  * WAL record doesn't contain a block reference with the given id.
2486  * Do nothing.
2487  */
2488  continue;
2489  }
2490 
      /* Every referenced block is expected to carry a page image */
2491  Assert(XLogRecHasBlockImage(record, block_id));
2492 
2493  if (XLogRecBlockImageApply(record, block_id))
2494  {
2495  /*
2496  * WAL record has already applied the page, so bypass the
2497  * consistency check as that would result in comparing the full
2498  * page stored in the record with itself.
2499  */
2500  continue;
2501  }
2502 
2503  /*
2504  * Read the contents from the current buffer and store it in a
2505  * temporary page.
2506  */
2507  buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2509  InvalidBuffer);
2510  if (!BufferIsValid(buf))
2511  continue;
2512 
2514  page = BufferGetPage(buf);
2515 
2516  /*
2517  * Take a copy of the local page where WAL has been applied to have a
2518  * comparison base before masking it...
2519  */
2520  memcpy(replay_image_masked, page, BLCKSZ);
2521 
2522  /* No need for this page anymore now that a copy is in. */
2524 
2525  /*
2526  * If the block LSN is already ahead of this WAL record, we can't
2527  * expect contents to match. This can happen if recovery is
2528  * restarted.
2529  */
2530  if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
2531  continue;
2532 
2533  /*
2534  * Read the contents from the backup copy, stored in WAL record and
2535  * store it in a temporary page. There is no need to allocate a new
2536  * page here, a local buffer is fine to hold its contents and a mask
2537  * can be directly applied on it.
2538  */
2539  if (!RestoreBlockImage(record, block_id, primary_image_masked))
2540  ereport(ERROR,
2541  (errcode(ERRCODE_INTERNAL_ERROR),
2542  errmsg_internal("%s", record->errormsg_buf)));
2543 
2544  /*
2545  * If masking function is defined, mask both the primary and replay
2546  * images
2547  */
2548  if (rmgr.rm_mask != NULL)
2549  {
2550  rmgr.rm_mask(replay_image_masked, blkno);
2551  rmgr.rm_mask(primary_image_masked, blkno);
2552  }
2553 
2554  /* Time to compare the primary and replay images. */
2555  if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2556  {
2557  elog(FATAL,
2558  "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2559  rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2560  forknum, blkno);
2561  }
2562  }
2563 }
2564 
2565 /*
2566  * For point-in-time recovery, this function decides whether we want to
2567  * stop applying the XLOG before the current record.
2568  *
2569  * Returns true if we are stopping, false otherwise. If stopping, some
2570  * information is saved in recoveryStopXid et al for use in annotating the
2571  * new timeline's history file.
 *
 * Whenever true is returned, recoveryStopAfter has been set to false, which
 * distinguishes a stop decided here from one decided after applying the
 * record.  Apart from the first two checks, only RM_XACT_ID commit/abort
 * records (plain or prepared) can cause a stop; all other records return
 * false.
 *
 * NOTE(review): this copy of the function is incomplete -- the embedded
 * line numbers skip 2574 (presumably the function name and parameter list)
 * and many lines in the body (2586, 2590, 2596-2597, 2604-2605, 2609, 2615,
 * 2636, 2652, 2660, 2680, 2687, 2698, 2705-2706, 2712-2713), including most
 * of the conditions that test which recovery target is configured.
2572  */
2573 static bool
2575 {
2576  bool stopsHere = false;
2577  uint8 xact_info;
2578  bool isCommit;
2579  TimestampTz recordXtime = 0;
2580  TransactionId recordXid;
2581 
2582  /*
2583  * Ignore recovery target settings when not in archive recovery (meaning
2584  * we are in crash recovery).
2585  */
2587  return false;
2588 
2589  /* Check if we should stop as soon as reaching consistency */
2591  {
2592  ereport(LOG,
2593  (errmsg("recovery stopping after reaching consistency")));
2594 
2595  recoveryStopAfter = false;
2598  recoveryStopTime = 0;
2599  recoveryStopName[0] = '\0';
2600  return true;
2601  }
2602 
2603  /* Check if target LSN has been reached */
2606  record->ReadRecPtr >= recoveryTargetLSN)
2607  {
2608  recoveryStopAfter = false;
2610  recoveryStopLSN = record->ReadRecPtr;
2611  recoveryStopTime = 0;
2612  recoveryStopName[0] = '\0';
2613  ereport(LOG,
2614  (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
2616  return true;
2617  }
2618 
2619  /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2620  if (XLogRecGetRmid(record) != RM_XACT_ID)
2621  return false;
2622 
2623  xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2624 
      /*
       * Classify the record and find the xid it ends.  For the *_PREPARED
       * variants the xid is taken from the parsed record (twophase_xid)
       * rather than from the record header.
       */
2625  if (xact_info == XLOG_XACT_COMMIT)
2626  {
2627  isCommit = true;
2628  recordXid = XLogRecGetXid(record);
2629  }
2630  else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2631  {
2632  xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2633  xl_xact_parsed_commit parsed;
2634 
2635  isCommit = true;
2637  xlrec,
2638  &parsed);
2639  recordXid = parsed.twophase_xid;
2640  }
2641  else if (xact_info == XLOG_XACT_ABORT)
2642  {
2643  isCommit = false;
2644  recordXid = XLogRecGetXid(record);
2645  }
2646  else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2647  {
2648  xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2649  xl_xact_parsed_abort parsed;
2650 
2651  isCommit = false;
2653  xlrec,
2654  &parsed);
2655  recordXid = parsed.twophase_xid;
2656  }
2657  else
2658  return false;
2659 
2661  {
2662  /*
2663  * There can be only one transaction end record with this exact
2664  * transactionid
2665  *
2666  * When testing for an xid, we MUST test for equality only, since
2667  * transactions are numbered in the order they start, not the order
2668  * they complete. A higher-numbered xid will complete before a
2669  * lower-numbered one about 50% of the time...
2670  */
2671  stopsHere = (recordXid == recoveryTargetXid);
2672  }
2673 
2674  /*
2675  * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2676  * We don't expect getRecordTimestamp ever to fail, since we already know
2677  * this is a commit or abort record; but test its result anyway.
2678  */
2679  if (getRecordTimestamp(record, &recordXtime) &&
2681  {
2682  /*
2683  * There can be many transactions that share the same commit time, so
2684  * we stop after the last one, if we are inclusive, or stop at the
2685  * first one if we are exclusive
2686  */
2688  stopsHere = (recordXtime > recoveryTargetTime);
2689  else
2690  stopsHere = (recordXtime >= recoveryTargetTime);
2691  }
2692 
2693  if (stopsHere)
2694  {
      /* Record where and why we stopped, for use by later reporting */
2695  recoveryStopAfter = false;
2696  recoveryStopXid = recordXid;
2697  recoveryStopTime = recordXtime;
2699  recoveryStopName[0] = '\0';
2700 
2701  if (isCommit)
2702  {
2703  ereport(LOG,
2704  (errmsg("recovery stopping before commit of transaction %u, time %s",
2707  }
2708  else
2709  {
2710  ereport(LOG,
2711  (errmsg("recovery stopping before abort of transaction %u, time %s",
2714  }
2715  }
2716 
2717  return stopsHere;
2718 }
2719 
2720 /*
2721  * Same as recoveryStopsBefore, but called after applying the record.
2722  *
2723  * We also track the timestamp of the latest applied COMMIT/ABORT
2724  * record in XLogRecoveryCtl->recoveryLastXTime.
 *
 * Returns true if recovery should stop after this record, false otherwise.
 * Whenever true is returned, recoveryStopAfter has been set to true.  The
 * checks are made in this order: named restore point, target LSN, target
 * xid (commit/abort records only), and finally "stop at consistency".
 *
 * NOTE(review): this copy of the function is incomplete -- the embedded
 * line numbers skip 2727 (presumably the function name and parameter list)
 * and many lines in the body (2738, 2748, 2758-2759, 2765-2766, 2772-2773,
 * 2777, 2783, 2809, 2819, 2836, 2842, 2850-2851, 2858-2859, 2866, 2872,
 * 2874), including most of the conditions that test which recovery target
 * is configured.
2725  */
2726 static bool
2728 {
2729  uint8 info;
2730  uint8 xact_info;
2731  uint8 rmid;
2732  TimestampTz recordXtime = 0;
2733 
2734  /*
2735  * Ignore recovery target settings when not in archive recovery (meaning
2736  * we are in crash recovery).
2737  */
2739  return false;
2740 
2741  info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2742  rmid = XLogRecGetRmid(record);
2743 
2744  /*
2745  * There can be many restore points that share the same name; we stop at
2746  * the first one.
2747  */
2749  rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2750  {
2751  xl_restore_point *recordRestorePointData;
2752 
2753  recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2754 
2755  if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2756  {
2757  recoveryStopAfter = true;
      /* Result deliberately ignored: the record type was checked above */
2760  (void) getRecordTimestamp(record, &recoveryStopTime);
2761  strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2762 
2763  ereport(LOG,
2764  (errmsg("recovery stopping at restore point \"%s\", time %s",
2767  return true;
2768  }
2769  }
2770 
2771  /* Check if the target LSN has been reached */
2774  record->ReadRecPtr >= recoveryTargetLSN)
2775  {
2776  recoveryStopAfter = true;
2778  recoveryStopLSN = record->ReadRecPtr;
2779  recoveryStopTime = 0;
2780  recoveryStopName[0] = '\0';
2781  ereport(LOG,
2782  (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
2784  return true;
2785  }
2786 
      /* The remaining target check applies to transaction records only */
2787  if (rmid != RM_XACT_ID)
2788  return false;
2789 
2790  xact_info = info & XLOG_XACT_OPMASK;
2791 
2792  if (xact_info == XLOG_XACT_COMMIT ||
2793  xact_info == XLOG_XACT_COMMIT_PREPARED ||
2794  xact_info == XLOG_XACT_ABORT ||
2795  xact_info == XLOG_XACT_ABORT_PREPARED)
2796  {
2797  TransactionId recordXid;
2798 
2799  /* Update the last applied transaction timestamp */
2800  if (getRecordTimestamp(record, &recordXtime))
2801  SetLatestXTime(recordXtime);
2802 
2803  /* Extract the XID of the committed/aborted transaction */
2804  if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2805  {
2806  xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2807  xl_xact_parsed_commit parsed;
2808 
2810  xlrec,
2811  &parsed);
2812  recordXid = parsed.twophase_xid;
2813  }
2814  else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2815  {
2816  xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2817  xl_xact_parsed_abort parsed;
2818 
2820  xlrec,
2821  &parsed);
2822  recordXid = parsed.twophase_xid;
2823  }
2824  else
2825  recordXid = XLogRecGetXid(record);
2826 
2827  /*
2828  * There can be only one transaction end record with this exact
2829  * transactionid
2830  *
2831  * When testing for an xid, we MUST test for equality only, since
2832  * transactions are numbered in the order they start, not the order
2833  * they complete. A higher-numbered xid will complete before a
2834  * lower-numbered one about 50% of the time...
2835  */
2837  recordXid == recoveryTargetXid)
2838  {
2839  recoveryStopAfter = true;
2840  recoveryStopXid = recordXid;
2841  recoveryStopTime = recordXtime;
2843  recoveryStopName[0] = '\0';
2844 
2845  if (xact_info == XLOG_XACT_COMMIT ||
2846  xact_info == XLOG_XACT_COMMIT_PREPARED)
2847  {
2848  ereport(LOG,
2849  (errmsg("recovery stopping after commit of transaction %u, time %s",
2852  }
2853  else if (xact_info == XLOG_XACT_ABORT ||
2854  xact_info == XLOG_XACT_ABORT_PREPARED)
2855  {
2856  ereport(LOG,
2857  (errmsg("recovery stopping after abort of transaction %u, time %s",
2860  }
2861  return true;
2862  }
2863  }
2864 
2865  /* Check if we should stop as soon as reaching consistency */
2867  {
2868  ereport(LOG,
2869  (errmsg("recovery stopping after reaching consistency")));
2870 
2871  recoveryStopAfter = true;
2873  recoveryStopTime = 0;
2875  recoveryStopName[0] = '\0';
2876  return true;
2877  }
2878 
2879  return false;
2880 }
2881 
2882 /*
2883  * Create a comment for the history file to explain why and where
2884  * timeline changed.
 *
 * The text is formatted into a 200-byte local buffer and a pstrdup'd copy
 * is returned, so the result is separately allocated storage.  The wording
 * depends on the recovery target and on recoveryStopAfter ("after" or
 * "before").  Note that, of the visible format strings, only the second and
 * third end in a newline.
 *
 * NOTE(review): this copy of the function is incomplete -- the embedded
 * line numbers skip 2887 (presumably the function name and parameter list),
 * 2891, 2896, 2900, 2905-2906 and 2909-2910; most of the conditions that
 * select a branch, and some arguments, are among the missing lines.
2885  */
2886 static char *
2888 {
2889  char reason[200];
2890 
2892  snprintf(reason, sizeof(reason),
2893  "%s transaction %u",
2894  recoveryStopAfter ? "after" : "before",
2895  recoveryStopXid);
2897  snprintf(reason, sizeof(reason),
2898  "%s %s\n",
2899  recoveryStopAfter ? "after" : "before",
2901  else if (recoveryTarget == RECOVERY_TARGET_LSN)
2902  snprintf(reason, sizeof(reason),
2903  "%s LSN %X/%X\n",
2904  recoveryStopAfter ? "after" : "before",
2907  snprintf(reason, sizeof(reason),
2908  "at restore point \"%s\"",
2911  snprintf(reason, sizeof(reason), "reached consistency");
2912  else
2913  snprintf(reason, sizeof(reason), "no recovery target specified");
2914 
      /* Hand back a copy; 'reason' itself lives on this function's stack */
2915  return pstrdup(reason);
2916 }
2917 
2918 /*
2919  * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2920  *
2921  * endOfRecovery is true if the recovery target is reached and
2922  * the paused state starts at the end of recovery because of
2923  * recovery_target_action=pause, and false otherwise.
 *
 * The function returns immediately, without logging or waiting, when
 * LocalHotStandbyActive is false.  Otherwise a LOG message is emitted whose
 * text and hint depend on endOfRecovery, and the wait loop is entered; the
 * loop is also left early when CheckForStandbyTrigger() returns true.
 *
 * NOTE(review): this copy of the function is incomplete -- the embedded
 * line numbers skip 2933, 2946, 2948, 2956, 2963 and 2966, which include
 * the loop condition and the wait call itself.
2924  */
2925 static void
2926 recoveryPausesHere(bool endOfRecovery)
2927 {
2928  /* Don't pause unless users can connect! */
2929  if (!LocalHotStandbyActive)
2930  return;
2931 
2932  /* Don't pause after standby promotion has been triggered */
2934  return;
2935 
2936  if (endOfRecovery)
2937  ereport(LOG,
2938  (errmsg("pausing at the end of recovery"),
2939  errhint("Execute pg_wal_replay_resume() to promote.")));
2940  else
2941  ereport(LOG,
2942  (errmsg("recovery has paused"),
2943  errhint("Execute pg_wal_replay_resume() to continue.")));
2944 
2945  /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2947  {
2949  if (CheckForStandbyTrigger())
2950  return;
2951 
2952  /*
2953  * If recovery pause is requested then set it paused. While we are in
2954  * the loop, user might resume and pause again so set this every time.
2955  */
2957 
2958  /*
2959  * We wait on a condition variable that will wake us as soon as the
2960  * pause ends, but we use a timeout so we can check the above exit
2961  * condition periodically too.
2962  */
2964  WAIT_EVENT_RECOVERY_PAUSE);
2965  }
2967 }
2968 
2969 /*
2970  * When recovery_min_apply_delay is set, we wait long enough to make sure
2971  * certain record types are applied at least that interval behind the primary.
2972  *
2973  * Returns true if we waited.
 * More precisely, true is returned once the wait loop has been entered,
 * whichever way the loop is left; every early exit before the loop returns
 * false.
2974  *
2975  * Note that the delay is calculated between the WAL record log time and
2976  * the current time on standby. We would prefer to keep track of when this
2977  * standby received each WAL record, which would allow a more consistent
2978  * approach and one not affected by time synchronisation issues, but that
2979  * is significantly more effort and complexity for little actual gain in
2980  * usability.
 *
 * NOTE(review): this copy of the function is incomplete -- the embedded
 * line numbers skip 2983 (presumably the function name and parameter list),
 * 2999, 3022, 3028, 3034, 3037, 3046, 3051 and 3059-3060, which include the
 * computation of delayUntil and msecs and the wait call itself.
2981  */
2982 static bool
2984 {
2985  uint8 xact_info;
2986  TimestampTz xtime;
2987  TimestampTz delayUntil;
2988  long msecs;
2989 
2990  /* nothing to do if no delay configured */
2991  if (recovery_min_apply_delay <= 0)
2992  return false;
2993 
2994  /* no delay is applied on a database not yet consistent */
2995  if (!reachedConsistency)
2996  return false;
2997 
2998  /* nothing to do if crash recovery is requested */
3000  return false;
3001 
3002  /*
3003  * Is it a COMMIT record?
3004  *
3005  * We deliberately choose not to delay aborts since they have no effect on
3006  * MVCC. We already allow replay of records that don't have a timestamp,
3007  * so there is already opportunity for issues caused by early conflicts on
3008  * standbys.
3009  */
3010  if (XLogRecGetRmid(record) != RM_XACT_ID)
3011  return false;
3012 
3013  xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
3014 
3015  if (xact_info != XLOG_XACT_COMMIT &&
3016  xact_info != XLOG_XACT_COMMIT_PREPARED)
3017  return false;
3018 
      /* Without a timestamp there is nothing to measure the delay against */
3019  if (!getRecordTimestamp(record, &xtime))
3020  return false;
3021 
3023 
3024  /*
3025  * Exit without arming the latch if it's already past time to apply this
3026  * record
3027  */
3029  if (msecs <= 0)
3030  return false;
3031 
3032  while (true)
3033  {
3035 
3036  /* This might change recovery_min_apply_delay. */
3038 
      /* Stop delaying as soon as promotion has been triggered */
3039  if (CheckForStandbyTrigger())
3040  break;
3041 
3042  /*
3043  * Recalculate delayUntil as recovery_min_apply_delay could have
3044  * changed while waiting in this loop.
3045  */
3047 
3048  /*
3049  * Wait for difference between GetCurrentTimestamp() and delayUntil.
3050  */
3052  delayUntil);
3053 
3054  if (msecs <= 0)
3055  break;
3056 
3057  elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3058 
3061  msecs,
3062  WAIT_EVENT_RECOVERY_APPLY_DELAY);
3063  }
3064  return true;
3065 }
3066 
3067 /*
3068  * Get the current state of the recovery pause request.
 *
 * Returns the value held in the local variable 'state'.
 *
 * NOTE(review): this copy of the function is incomplete -- the embedded
 * line numbers skip 3070-3071 (presumably the return type, function name
 * and parameter list), 3073 and 3075-3077, so the declaration of 'state'
 * and the way its value is obtained are not shown here.
3069  */
3072 {
3074 
3078 
3079  return state;
3080 }
3081 
3082 /*
3083  * Set the recovery pause state.
3084  *
3085  * If recovery pause is requested then sets the recovery pause state to
3086  * 'pause requested' if it is not already 'paused'. Otherwise, sets it
3087  * to 'not paused' to resume the recovery. The recovery pause will be
3088  * confirmed by the ConfirmRecoveryPaused.
 *
 * 'recoveryPause' is true to request a pause and false to resume.
 *
 * NOTE(review): this copy of the function is incomplete -- the embedded
 * line numbers skip 3093, 3096-3098, 3100 and 3103, so the statements
 * controlled by the two visible "if (!recoveryPause)" tests are not shown
 * here.
3089  */
3090 void
3091 SetRecoveryPause(bool recoveryPause)
3092 {
3094 
3095  if (!recoveryPause)
3099 
3101 
3102  if (!recoveryPause)
3104 }
3105 
3106 /*
3107  * Confirm the recovery pause by setting the recovery pause state to
3108  * RECOVERY_PAUSED.
 *
 * NOTE(review): this copy of the function is incomplete -- the embedded
 * line numbers skip 3111 (presumably the function name and parameter list)
 * and 3114-3117, i.e. every statement of the body; only the comment below
 * survives.
3109  */
3110 static void
3112 {
3113  /* If recovery pause is requested then set it paused */
3118 }
3119 
3120 
3121 /*
3122  * Attempt to read the next XLOG record.
3123  *
3124  * Before first call, the reader needs to be positioned to the first record
3125  * by calling XLogPrefetcherBeginRead().
3126  *
3127  * If no valid record is available, returns NULL, or fails if emode is PANIC.
3128  * (emode must be either PANIC or LOG). In standby mode, retries until a valid
3129  * record is available.
 *
 * 'fetching_ckpt', 'emode' and 'replayTLI' are stored in the page-read
 * private state before reading, together with a randAccess flag that is
 * true when the reader's ReadRecPtr is still InvalidXLogRecPtr.
 * lastSourceFailed is cleared on entry and set whenever an attempt yields
 * no usable record.
 *
 * NOTE(review): this copy of the function is incomplete -- the embedded
 * line numbers skip 3132 (presumably the function name and first
 * parameters), 3136-3137, 3168, 3170-3171, 3187, 3194, 3200-3202,
 * 3204-3205, 3207, 3209, 3236, 3242-3243, 3245-3246, 3249, 3256 and 3262,
 * which include several conditions and the declaration of 'private'.
3130  */
3131 static XLogRecord *
3133  bool fetching_ckpt, TimeLineID replayTLI)
3134 {
3135  XLogRecord *record;
3138 
3139  /* Pass through parameters to XLogPageRead */
3140  private->fetching_ckpt = fetching_ckpt;
3141  private->emode = emode;
3142  private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
3143  private->replayTLI = replayTLI;
3144 
3145  /* This is the first attempt to read this page. */
3146  lastSourceFailed = false;
3147 
3148  for (;;)
3149  {
3150  char *errormsg;
3151 
3152  record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3153  if (record == NULL)
3154  {
3155  /*
3156  * When we find that WAL ends in an incomplete record, keep track
3157  * of that record. After recovery is done, we'll write a record
3158  * to indicate to downstream WAL readers that that portion is to
3159  * be ignored.
3160  *
3161  * However, when ArchiveRecoveryRequested = true, we're going to
3162  * switch to a new timeline at the end of recovery. We will only
3163  * copy WAL over to the new timeline up to the end of the last
3164  * complete record, so if we did this, we would later create an
3165  * overwrite contrecord in the wrong place, breaking everything.
3166  */
3167  if (!ArchiveRecoveryRequested &&
3169  {
3172  }
3173 
      /* Close the currently open WAL file, if any, before retrying */
3174  if (readFile >= 0)
3175  {
3176  close(readFile);
3177  readFile = -1;
3178  }
3179 
3180  /*
3181  * We only end up here without a message when XLogPageRead()
3182  * failed - in that case we already logged something. In
3183  * StandbyMode that only happens if we have been triggered, so we
3184  * shouldn't loop anymore in that case.
3185  */
3186  if (errormsg)
3188  (errmsg_internal("%s", errormsg) /* already translated */ ));
3189  }
3190 
3191  /*
3192  * Check page TLI is one of the expected values.
3193  */
3195  {
3196  char fname[MAXFNAMELEN];
3197  XLogSegNo segno;
3198  int32 offset;
3199 
3203  XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3206  (errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%X, offset %u",
3208  fname,
3210  offset)));
      /* Treat a record on an unexpected timeline as if none had been read */
3211  record = NULL;
3212  }
3213 
3214  if (record)
3215  {
3216  /* Great, got a record */
3217  return record;
3218  }
3219  else
3220  {
3221  /* No valid record available from this source */
3222  lastSourceFailed = true;
3223 
3224  /*
3225  * If archive recovery was requested, but we were still doing
3226  * crash recovery, switch to archive recovery and retry using the
3227  * offline archive. We have now replayed all the valid WAL in
3228  * pg_wal, so we are presumably now consistent.
3229  *
3230  * We require that there's at least some valid WAL present in
3231  * pg_wal, however (!fetching_ckpt). We could recover using the
3232  * WAL from the archive, even if pg_wal is completely empty, but
3233  * we'd have no idea how far we'd have to replay to reach
3234  * consistency. So err on the safe side and give up.
3235  */
3237  !fetching_ckpt)
3238  {
3239  ereport(DEBUG1,
3240  (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3241  InArchiveRecovery = true;
3244 
3247  minRecoveryPointTLI = replayTLI;
3248 
3250 
3251  /*
3252  * Before we retry, reset lastSourceFailed and currentSource
3253  * so that we will check the archive next.
3254  */
3255  lastSourceFailed = false;
3257 
3258  continue;
3259  }
3260 
3261  /* In standby mode, loop back to retry. Otherwise, give up. */
3263  continue;
3264  else
3265  return NULL;
3266  }
3267  }
3268 }
3269 
3270 /*
3271  * Read the XLOG page containing targetPagePtr into readBuf (if not read
3272  * already). Returns number of bytes read, if the page is read successfully,
3273  * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3274  * but only if they have not been previously reported.
3275  *
3276  * See XLogReaderRoutine.page_read for more details.
3277  *
3278  * While prefetching, xlogreader->nonblocking may be set. In that case,
3279  * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3280  *
3281  * This is responsible for restoring files from archive as needed, as well
3282  * as for waiting for the requested WAL record to arrive in standby mode.
3283  *
3284  * xlogreader->private_data->emode specifies the log level used for reporting
3285  * "file not found" or "end of WAL" situations in archive recovery, or in
3286  * standby mode when promotion is triggered. If set to WARNING or below,
3287  * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
3288  * levels the ereport() won't return.
3289  *
3290  * In standby mode, if after a successful return of XLogPageRead() the
3291  * caller finds the record it's interested in to be broken, it should
3292  * ereport the error with the level determined by
3293  * emode_for_corrupt_record(), and then set lastSourceFailed
3294  * and call XLogPageRead() again with the same arguments. This lets
3295  * XLogPageRead() try fetching the record from another source, or
3296  * sleep and retry.
3297  */
3298 static int
3300  XLogRecPtr targetRecPtr, char *readBuf)
3301 {
3302  XLogPageReadPrivate *private =
3304  int emode = private->emode;
3305  uint32 targetPageOff;
3306  XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
3307  int r;
3308 
3309  XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3310  targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3311 
3312  /*
3313  * See if we need to switch to a new segment because the requested record
3314  * is not in the currently open one.
3315  */
3316  if (readFile >= 0 &&
3317  !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3318  {
3319  /*
3320  * Request a restartpoint if we've replayed too much xlog since the
3321  * last one.
3322  */
3324  {
3326  {
3327  (void) GetRedoRecPtr();
3330  }
3331  }
3332 
3333  close(readFile);
3334  readFile = -1;
3336  }
3337 
3338  XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3339 
3340 retry:
3341  /* See if we need to retrieve more data */
3342  if (readFile < 0 ||
3344  flushedUpto < targetPagePtr + reqLen))
3345  {
3346  if (readFile >= 0 &&
3349  flushedUpto < targetPagePtr + reqLen)
3350  return XLREAD_WOULDBLOCK;
3351 
3352  switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3353  private->randAccess,
3354  private->fetching_ckpt,
3355  targetRecPtr,
3356  private->replayTLI,
3359  {
3360  case XLREAD_WOULDBLOCK:
3361  return XLREAD_WOULDBLOCK;
3362  case XLREAD_FAIL:
3363  if (readFile >= 0)
3364  close(readFile);
3365  readFile = -1;
3366  readLen = 0;
3368  return XLREAD_FAIL;
3369  case XLREAD_SUCCESS:
3370  break;
3371  }
3372  }
3373 
3374  /*
3375  * At this point, we have the right segment open and if we're streaming we
3376  * know the requested record is in it.
3377  */
3378  Assert(readFile != -1);
3379 
3380  /*
3381  * If the current segment is being streamed from the primary, calculate
3382  * how much of the current page we have received already. We know the
3383  * requested record has been received, but this is for the benefit of
3384  * future calls, to allow quick exit at the top of this function.
3385  */
3387  {
3388  if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3389  readLen = XLOG_BLCKSZ;
3390  else
3392  targetPageOff;
3393  }
3394  else
3395  readLen = XLOG_BLCKSZ;
3396 
3397  /* Read the requested page */
3398  readOff = targetPageOff;
3399 
3400  pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3401  r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
3402  if (r != XLOG_BLCKSZ)
3403  {
3404  char fname[MAXFNAMELEN];
3405  int save_errno = errno;
3406 
3409  if (r < 0)
3410  {
3411  errno = save_errno;
3412  ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3414  errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: %m",
3415  fname, LSN_FORMAT_ARGS(targetPagePtr),
3416  readOff)));
3417  }
3418  else
3419  ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3421  errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: read %d of %zu",
3422  fname, LSN_FORMAT_ARGS(targetPagePtr),
3423  readOff, r, (Size) XLOG_BLCKSZ)));
3424  goto next_record_is_invalid;
3425  }
3427 
3428  Assert(targetSegNo == readSegNo);
3429  Assert(targetPageOff == readOff);
3430  Assert(reqLen <= readLen);
3431 
3433 
3434  /*
3435  * Check the page header immediately, so that we can retry immediately if
3436  * it's not valid. This may seem unnecessary, because ReadPageInternal()
3437  * validates the page header anyway, and would propagate the failure up to
3438  * ReadRecord(), which would retry. However, there's a corner case with
3439  * continuation records, if a record is split across two pages such that
3440  * we would need to read the two pages from different sources. For
3441  * example, imagine a scenario where a streaming replica is started up,
3442  * and replay reaches a record that's split across two WAL segments. The
3443  * first page is only available locally, in pg_wal, because it's already
3444  * been recycled on the primary. The second page, however, is not present
3445  * in pg_wal, and we should stream it from the primary. There is a
3446  * recycled WAL segment present in pg_wal, with garbage contents, however.
3447  * We would read the first page from the local WAL segment, but when
3448  * reading the second page, we would read the bogus, recycled, WAL
3449  * segment. If we didn't catch that case here, we would never recover,
3450  * because ReadRecord() would retry reading the whole record from the
3451  * beginning.
3452  *
3453  * Of course, this only catches errors in the page header, which is what
3454  * happens in the case of a recycled WAL segment. Other kinds of errors or
3455  * corruption still have the same problem. But this at least fixes the
3456  * common case, which can happen as part of normal operation.
3457  *
3458  * Validating the page header is cheap enough that doing it twice
3459  * shouldn't be a big deal from a performance point of view.
3460  *
3461  * When not in standby mode, an invalid page header should cause recovery
3462  * to end, not retry reading the page, so we don't need to validate the
3463  * page header here for the retry. Instead, ReadPageInternal() is
3464  * responsible for the validation.
3465  */
3466  if (StandbyMode &&
3467  !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3468  {
3469  /*
3470  * Emit this error right now then retry this page immediately. Use
3471  * errmsg_internal() because the message was already translated.
3472  */
3473  if (xlogreader->errormsg_buf[0])
3476 
3477  /* reset any error XLogReaderValidatePageHeader() might have set */
3479  goto next_record_is_invalid;
3480  }
3481 
3482  return readLen;
3483 
3484 next_record_is_invalid:
3485 
3486  /*
3487  * If we're reading ahead, give up fast. Retries and error reporting will
3488  * be handled by a later read when recovery catches up to this point.
3489  */
3490  if (xlogreader->nonblocking)
3491  return XLREAD_WOULDBLOCK;
3492 
3493  lastSourceFailed = true;
3494 
3495  if (readFile >= 0)
3496  close(readFile);
3497  readFile = -1;
3498  readLen = 0;
3500 
3501  /* In standby mode, keep trying */
3502  if (StandbyMode)
3503  goto retry;
3504  else
3505  return XLREAD_FAIL;
3506 }
3507 
3508 /*
3509  * Open the WAL segment containing WAL location 'RecPtr'.
3510  *
3511  * The segment can be fetched via restore_command, or via walreceiver having
3512  * streamed the record, or it can already be present in pg_wal. Checking
3513  * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3514  * too, in case someone copies a new segment directly to pg_wal. That is not
3515  * documented or recommended, though.
3516  *
3517  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3518  * prepare to read WAL starting from RedoStartLSN after this.
3519  *
3520  * 'RecPtr' might not point to the beginning of the record we're interested
3521  * in, it might also point to the page or segment header. In that case,
3522  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3523  * used to decide which timeline to stream the requested WAL from.
3524  *
3525  * 'replayLSN' is the current replay LSN, so that if we scan for new
3526  * timelines, we can reject a switch to a timeline that branched off before
3527  * this point.
3528  *
3529  * If the record is not immediately available, the function returns
3530  * XLREAD_FAIL if we're not in standby mode. In standby mode, waits for it
3531  * to become available.
3532  *
3533  * When the requested record becomes available, the function opens the file
3534  * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3535  * of standby mode is triggered by the user, and there is no more WAL
3536  * available, returns XLREAD_FAIL.
3537  *
3538  * If nonblocking is true, then give up immediately if we can't satisfy the
3539  * request, returning XLREAD_WOULDBLOCK instead of waiting.
3540  */
3541 static XLogPageReadResult
3542 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
3543  bool fetching_ckpt, XLogRecPtr tliRecPtr,
3544  TimeLineID replayTLI, XLogRecPtr replayLSN,
3545  bool nonblocking)
3546 {
3547  static TimestampTz last_fail_time = 0;
3548  TimestampTz now;
3549  bool streaming_reply_sent = false;
3550 
3551  /*-------
3552  * Standby mode is implemented by a state machine:
3553  *
3554  * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3555  * pg_wal (XLOG_FROM_PG_WAL)
3556  * 2. Check for promotion trigger request
3557  * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3558  * 4. Rescan timelines
3559  * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3560  *
3561  * Failure to read from the current source advances the state machine to
3562  * the next state.
3563  *
3564  * 'currentSource' indicates the current state. There are no currentSource
3565  * values for "check trigger", "rescan timelines", and "sleep" states,
3566  * those actions are taken when reading from the previous source fails, as
3567  * part of advancing to the next state.
3568  *
3569  * If standby mode is turned off while reading WAL from stream, we move
3570  * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3571  * the files (which would be required at end of recovery, e.g., timeline
3572  * history file) from archive or pg_wal. We don't need to kill WAL receiver
3573  * here because it's already stopped when standby mode is turned off at
3574  * the end of recovery.
3575  *-------
3576  */
3577  if (!InArchiveRecovery)
3579  else if (currentSource == XLOG_FROM_ANY ||
3581  {
3582  lastSourceFailed = false;
3584  }
3585 
3586  for (;;)
3587  {
3588  XLogSource oldSource = currentSource;
3589  bool startWalReceiver = false;
3590 
3591  /*
3592  * First check if we failed to read from the current source, and
3593  * advance the state machine if so. The failure to read might've
3594  * happened outside this function, e.g. when a CRC check fails on a
3595  * record, or within this loop.
3596  */
3597  if (lastSourceFailed)
3598  {
3599  /*
3600  * Don't allow any retry loops to occur during nonblocking
3601  * readahead. Let the caller process everything that has been
3602  * decoded already first.
3603  */
3604  if (nonblocking)
3605  return XLREAD_WOULDBLOCK;
3606 
3607  switch (currentSource)
3608  {
3609  case XLOG_FROM_ARCHIVE:
3610  case XLOG_FROM_PG_WAL:
3611 
3612  /*
3613  * Check to see if promotion is requested. Note that we do
3614  * this only after failure, so when you promote, we still
3615  * finish replaying as much as we can from archive and
3616  * pg_wal before failover.
3617  */
3619  {
3621  return XLREAD_FAIL;
3622  }
3623 
3624  /*
3625  * Not in standby mode, and we've now tried the archive
3626  * and pg_wal.
3627  */
3628  if (!StandbyMode)
3629  return XLREAD_FAIL;
3630 
3631  /*
3632  * Move to XLOG_FROM_STREAM state, and set to start a
3633  * walreceiver if necessary.
3634  */
3636  startWalReceiver = true;
3637  break;
3638 
3639  case XLOG_FROM_STREAM:
3640 
3641  /*
3642  * Failure while streaming. Most likely, we got here
3643  * because streaming replication was terminated, or
3644  * promotion was triggered. But we also get here if we
3645  * find an invalid record in the WAL streamed from the
3646  * primary, in which case something is seriously wrong.
3647  * There's little chance that the problem will just go
3648  * away, but PANIC is not good for availability either,
3649  * especially in hot standby mode. So, we treat that the
3650  * same as disconnection, and retry from archive/pg_wal
3651  * again. The WAL in the archive should be identical to
3652  * what was streamed, so it's unlikely that it helps, but
3653  * one can hope...
3654  */
3655 
3656  /*
3657  * We should be able to move to XLOG_FROM_STREAM only in
3658  * standby mode.
3659  */
3661 
3662  /*
3663  * Before we leave XLOG_FROM_STREAM state, make sure that
3664  * walreceiver is not active, so that it won't overwrite
3665  * WAL that we restore from archive.
3666  */
3668 
3669  /*
3670  * Before we sleep, re-scan for possible new timelines if
3671  * we were requested to recover to the latest timeline.
3672  */
3674  {
3675  if (rescanLatestTimeLine(replayTLI, replayLSN))
3676  {
3678  break;
3679  }
3680  }
3681 
3682  /*
3683  * XLOG_FROM_STREAM is the last state in our state
3684  * machine, so we've exhausted all the options for
3685  * obtaining the requested WAL. We're going to loop back
3686  * and retry from the archive, but if it hasn't been long
3687  * since last attempt, sleep wal_retrieve_retry_interval
3688  * milliseconds to avoid busy-waiting.
3689  */
3691  if (!TimestampDifferenceExceeds(last_fail_time, now,
3693  {
3694  long wait_time;
3695 
3696  wait_time = wal_retrieve_retry_interval -
3697  TimestampDifferenceMilliseconds(last_fail_time, now);
3698 
3699  elog(LOG, "waiting for WAL to become available at %X/%X",
3700  LSN_FORMAT_ARGS(RecPtr));
3701 
3702  /* Do background tasks that might benefit us later. */
3704 
3708  wait_time,
3709  WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
3712 
3713  /* Handle interrupt signals of startup process */
3715  }
3716  last_fail_time = now;
3718  break;
3719 
3720  default:
3721  elog(ERROR, "unexpected WAL source %d", currentSource);
3722  }
3723  }
3724  else if (currentSource == XLOG_FROM_PG_WAL)
3725  {
3726  /*
3727  * We just successfully read a file in pg_wal. We prefer files in
3728  * the archive over ones in pg_wal, so try the next file again
3729  * from the archive first.
3730  */
3731  if (InArchiveRecovery)
3733  }
3734 
3735  if (currentSource != oldSource)
3736  elog(DEBUG2, "switched WAL source from %s to %s after %s",
3738  lastSourceFailed ? "failure" : "success");
3739 
3740  /*
3741  * We've now handled possible failure. Try to read from the chosen
3742  * source.
3743  */
3744  lastSourceFailed = false;
3745 
3746  switch (currentSource)
3747  {
3748  case XLOG_FROM_ARCHIVE:
3749  case XLOG_FROM_PG_WAL:
3750 
3751  /*
3752  * WAL receiver must not be running when reading WAL from
3753  * archive or pg_wal.
3754  */
3755  Assert(!WalRcvStreaming());
3756 
3757  /* Close any old file we might have open. */
3758  if (readFile >= 0)
3759  {
3760  close(readFile);
3761  readFile = -1;
3762  }
3763  /* Reset curFileTLI if random fetch. */
3764  if (randAccess)
3765  curFileTLI = 0;
3766 
3767  /*
3768  * Try to restore the file from archive, or read an existing
3769  * file from pg_wal.
3770  */
3773  currentSource);
3774  if (readFile >= 0)
3775  return XLREAD_SUCCESS; /* success! */
3776 
3777  /*
3778  * Nope, not found in archive or pg_wal.
3779  */
3780  lastSourceFailed = true;
3781  break;
3782 
3783  case XLOG_FROM_STREAM:
3784  {
3785  bool havedata;
3786 
3787  /*
3788  * We should be able to move to XLOG_FROM_STREAM only in
3789  * standby mode.
3790  */
3792 
3793  /*
3794  * First, shutdown walreceiver if its restart has been
3795  * requested -- but no point if we're already slated for
3796  * starting it.
3797  */
3798  if (pendingWalRcvRestart && !startWalReceiver)
3799  {
3801 
3802  /*
3803  * Re-scan for possible new timelines if we were
3804  * requested to recover to the latest timeline.
3805  */
3808  rescanLatestTimeLine(replayTLI, replayLSN);
3809 
3810  startWalReceiver = true;
3811  }
3812  pendingWalRcvRestart = false;
3813 
3814  /*
3815  * Launch walreceiver if needed.
3816  *
3817  * If fetching_ckpt is true, RecPtr points to the initial
3818  * checkpoint location. In that case, we use RedoStartLSN
3819  * as the streaming start position instead of RecPtr, so
3820  * that when we later jump backwards to start redo at
3821  * RedoStartLSN, we will have the logs streamed already.
3822  */
3823  if (startWalReceiver &&
3824  PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
3825  {
3826  XLogRecPtr ptr;
3827  TimeLineID tli;
3828 
3829  if (fetching_ckpt)
3830  {
3831  ptr = RedoStartLSN;
3832  tli = RedoStartTLI;
3833  }
3834  else
3835  {
3836  ptr = RecPtr;
3837 
3838  /*
3839  * Use the record begin position to determine the
3840  * TLI, rather than the position we're reading.
3841  */
3842  tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
3843 
3844  if (curFileTLI > 0 && tli < curFileTLI)
3845  elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3846  LSN_FORMAT_ARGS(tliRecPtr),
3847  tli, curFileTLI);
3848  }
3849  curFileTLI = tli;
3854  flushedUpto = 0;
3855  }
3856 
3857  /*
3858  * Check if WAL receiver is active or wait to start up.
3859  */
3860  if (!WalRcvStreaming())
3861  {
3862  lastSourceFailed = true;
3863  break;
3864  }
3865 
3866  /*
3867  * Walreceiver is active, so see if new data has arrived.
3868  *
3869  * We only advance XLogReceiptTime when we obtain fresh
3870  * WAL from walreceiver and observe that we had already
3871  * processed everything before the most recent "chunk"
3872  * that it flushed to disk. In steady state where we are
3873  * keeping up with the incoming data, XLogReceiptTime will
3874  * be updated on each cycle. When we are behind,
3875  * XLogReceiptTime will not advance, so the grace time
3876  * allotted to conflicting queries will decrease.
3877  */
3878  if (RecPtr < flushedUpto)
3879  havedata = true;
3880  else
3881  {
3882  XLogRecPtr latestChunkStart;
3883 
3884  flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3885  if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
3886  {
3887  havedata = true;
3888  if (latestChunkStart <= RecPtr)
3889  {
3892  }
3893  }
3894  else
3895  havedata = false;
3896  }
3897  if (havedata)
3898  {
3899  /*
3900  * Great, streamed far enough. Open the file if it's
3901  * not open already. Also read the timeline history
3902  * file if we haven't initialized timeline history
3903  * yet; it should be streamed over and present in
3904  * pg_wal by now. Use XLOG_FROM_STREAM so that source
3905  * info is set correctly and XLogReceiptTime isn't
3906  * changed.
3907  *
3908  * NB: We must set readTimeLineHistory based on
3909  * recoveryTargetTLI, not receiveTLI. Normally they'll
3910  * be the same, but if recovery_target_timeline is
3911  * 'latest' and archiving is configured, then it's
3912  * possible that we managed to retrieve one or more
3913  * new timeline history files from the archive,
3914  * updating recoveryTargetTLI.
3915  */
3916  if (readFile < 0)
3917  {
3918  if (!expectedTLEs)
3921  receiveTLI,
3922  XLOG_FROM_STREAM, false);
3923  Assert(readFile >= 0);
3924  }
3925  else
3926  {
3927  /* just make sure source info is correct... */
3930  return XLREAD_SUCCESS;
3931  }
3932  break;
3933  }
3934 
3935  /* In nonblocking mode, return rather than sleeping. */
3936  if (nonblocking)
3937  return XLREAD_WOULDBLOCK;
3938 
3939  /*
3940  * Data not here yet. Check for trigger, then wait for
3941  * walreceiver to wake us up when new WAL arrives.
3942  */
3943  if (CheckForStandbyTrigger())
3944  {
3945  /*
3946  * Note that we don't return XLREAD_FAIL immediately
3947  * here. After being triggered, we still want to
3948  * replay all the WAL that was already streamed. It's
3949  * in pg_wal now, so we just treat this as a failure,
3950  * and the state machine will move on to replay the
3951  * streamed WAL from pg_wal, and then recheck the
3952  * trigger and exit replay.
3953  */
3954  lastSourceFailed = true;
3955  break;
3956  }
3957 
3958  /*
3959  * Since we have replayed everything we have received so
3960  * far and are about to start waiting for more WAL, let's
3961  * tell the upstream server our replay location now so
3962  * that pg_stat_replication doesn't show stale
3963  * information.
3964  */
3965  if (!streaming_reply_sent)
3966  {
3967  WalRcvForceReply();
3968  streaming_reply_sent = true;
3969  }
3970 
3971  /* Do any background tasks that might benefit us later. */
3973 
3974  /* Update pg_stat_recovery_prefetch before sleeping. */
3976 
3977  /*
3978  * Wait for more WAL to arrive, when we will be woken
3979  * immediately by the WAL receiver.
3980  */
3983  -1L,
3984  WAIT_EVENT_RECOVERY_WAL_STREAM);
3986  break;
3987  }
3988 
3989  default:
3990  elog(ERROR, "unexpected WAL source %d", currentSource);
3991  }
3992 
3993  /*
3994  * Check for recovery pause here so that we can confirm more quickly
3995  * that a requested pause has actually taken effect.
3996  */
3997  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
3999  recoveryPausesHere(false);
4000 
4001  /*
4002  * This possibly-long loop needs to handle interrupts of startup
4003  * process.
4004  */
4006  }
4007 
4008  return XLREAD_FAIL; /* not reached */
4009 }
4010 
4011 
4012 /*
4013  * Determine what log level should be used to report a corrupt WAL record
4014  * in the current WAL page, previously read by XLogPageRead().
4015  *
4016  * 'emode' is the error mode that would be used to report a file-not-found
4017  * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4018  * we're retrying the exact same record that we've tried previously, only
4019  * complain the first time to keep the noise down. However, we only do this when
4020  * reading from pg_wal, because we don't expect any invalid records in archive
4021  * or in records streamed from the primary. Files in the archive should be complete,
4022  * and we should never hit the end of WAL because we stop and wait for more WAL
4023  * to arrive before replaying it.
4024  *
4025  * NOTE: This function remembers the RecPtr value it was last called with,
4026  * to suppress repeated messages about the same record. Only call this when
4027  * you are about to ereport(), or you might cause a later message to be
4028  * erroneously suppressed.
4029  */
4030 static int
4032 {
4033  static XLogRecPtr lastComplaint = 0;
4034 
4035  if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
4036  {
4037  if (RecPtr == lastComplaint)
4038  emode = DEBUG1;
4039  else
4040  lastComplaint = RecPtr;
4041  }
4042  return emode;
4043 }
4044 
4045 
4046 /*
4047  * Subroutine to try to fetch and validate a prior checkpoint record.
4048  */
4049 static XLogRecord *
4051  TimeLineID replayTLI)
4052 {
4053  XLogRecord *record;
4054  uint8 info;
4055 
4056  Assert(xlogreader != NULL);
4057 
4058  if (!XRecOffIsValid(RecPtr))
4059  {
4060  ereport(LOG,
4061  (errmsg("invalid checkpoint location")));
4062  return NULL;
4063  }
4064 
4066  record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4067 
4068  if (record == NULL)
4069  {
4070  ereport(LOG,
4071  (errmsg("invalid checkpoint record")));
4072  return NULL;
4073  }
4074  if (record->xl_rmid != RM_XLOG_ID)
4075  {
4076  ereport(LOG,
4077  (errmsg("invalid resource manager ID in checkpoint record")));
4078  return NULL;
4079  }
4080  info = record->xl_info & ~XLR_INFO_MASK;
4081  if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4082  info != XLOG_CHECKPOINT_ONLINE)
4083  {
4084  ereport(LOG,
4085  (errmsg("invalid xl_info in checkpoint record")));
4086  return NULL;
4087  }
4089  {
4090  ereport(LOG,
4091  (errmsg("invalid length of checkpoint record")));
4092  return NULL;
4093  }
4094  return record;
4095 }
4096 
4097 /*
4098  * Scan for new timelines that might have appeared in the archive since we
4099  * started recovery.
4100  *
4101  * If there are any, the function changes recovery target TLI to the latest
4102  * one and returns 'true'.
4103  */
4104 static bool
4106 {
4107  List *newExpectedTLEs;
4108  bool found;
4109  ListCell *cell;
4110  TimeLineID newtarget;
4111  TimeLineID oldtarget = recoveryTargetTLI;
4112  TimeLineHistoryEntry *currentTle = NULL;
4113 
4115  if (newtarget == recoveryTargetTLI)
4116  {
4117  /* No new timelines found */
4118  return false;
4119  }
4120 
4121  /*
4122  * Determine the list of expected TLIs for the new TLI
4123  */
4124 
4125  newExpectedTLEs = readTimeLineHistory(newtarget);
4126 
4127  /*
4128  * If the current timeline is not part of the history of the new timeline,
4129  * we cannot proceed to it.
4130  */
4131  found = false;
4132  foreach(cell, newExpectedTLEs)
4133  {
4134  currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4135 
4136  if (currentTle->tli == recoveryTargetTLI)
4137  {
4138  found = true;
4139  break;
4140  }
4141  }
4142  if (!found)
4143  {
4144  ereport(LOG,
4145  (errmsg("new timeline %u is not a child of database system timeline %u",
4146  newtarget,
4147  replayTLI)));
4148  return false;
4149  }
4150 
4151  /*
4152  * The current timeline was found in the history file, but check that the
4153  * next timeline was forked off from it *after* the current recovery
4154  * location.
4155  */
4156  if (currentTle->end < replayLSN)
4157  {
4158  ereport(LOG,
4159  (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4160  newtarget,
4161  replayTLI,
4162  LSN_FORMAT_ARGS(replayLSN))));
4163  return false;
4164  }
4165 
4166  /* The new timeline history seems valid. Switch target */
4167  recoveryTargetTLI = newtarget;
4169  expectedTLEs = newExpectedTLEs;
4170 
4171  /*
4172  * As in StartupXLOG(), try to ensure we have all the history files
4173  * between the old target and new target in pg_wal.
4174  */
4175  restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4176 
4177  ereport(LOG,
4178  (errmsg("new target timeline is %u",
4179  recoveryTargetTLI)));
4180 
4181  return true;
4182 }
4183 
4184 
4185 /*
4186  * Open a logfile segment for reading (during recovery).
4187  *
4188  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4189  * Otherwise, it's assumed to be already available in pg_wal.
4190  */
4191 static int
4192 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
4193  XLogSource source, bool notfoundOk)
4194 {
4195  char xlogfname[MAXFNAMELEN];
4196  char activitymsg[MAXFNAMELEN + 16];
4197  char path[MAXPGPATH];
4198  int fd;
4199 
4200  XLogFileName(xlogfname, tli, segno, wal_segment_size);
4201 
4202  switch (source)
4203  {
4204  case XLOG_FROM_ARCHIVE:
4205  /* Report recovery progress in PS display */
4206  snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4207  xlogfname);
4208  set_ps_display(activitymsg);
4209 
4210  if (!RestoreArchivedFile(path, xlogfname,
4211  "RECOVERYXLOG",
4213  InRedo))
4214  return -1;
4215  break;
4216 
4217  case XLOG_FROM_PG_WAL:
4218  case XLOG_FROM_STREAM:
4219  XLogFilePath(path, tli, segno, wal_segment_size);
4220  break;
4221 
4222  default:
4223  elog(ERROR, "invalid XLogFileRead source %d", source);
4224  }
4225 
4226  /*
4227  * If the segment was fetched from archival storage, replace the existing
4228  * xlog segment (if any) with the archival version.
4229  */
4230  if (source == XLOG_FROM_ARCHIVE)
4231  {
4233  KeepFileRestoredFromArchive(path, xlogfname);
4234 
4235  /*
4236  * Set path to point at the new file in pg_wal.
4237  */
4238  snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4239  }
4240 
4241  fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4242  if (fd >= 0)
4243  {
4244  /* Success! */
4245  curFileTLI = tli;
4246 
4247  /* Report recovery progress in PS display */
4248  snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4249  xlogfname);
4250  set_ps_display(activitymsg);
4251 
4252  /* Track source of data in assorted state variables */
4253  readSource = source;
4255  /* In FROM_STREAM case, caller tracks receipt time, not me */
4256  if (source != XLOG_FROM_STREAM)
4258 
4259  return fd;
4260  }
4261  if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4262  ereport(PANIC,
4264  errmsg("could not open file \"%s\": %m", path)));
4265  return -1;
4266 }
4267 
4268 /*
4269  * Open a logfile segment for reading (during recovery).
4270  *
4271  * This version searches for the segment with any TLI listed in expectedTLEs.
4272  */
4273 static int
4275 {
4276  char path[MAXPGPATH];
4277  ListCell *cell;
4278  int fd;
4279  List *tles;
4280 
4281  /*
4282  * Loop looking for a suitable timeline ID: we might need to read any of
4283  * the timelines listed in expectedTLEs.
4284  *
4285  * We expect curFileTLI on entry to be the TLI of the preceding file in
4286  * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4287  * to go backwards; this prevents us from picking up the wrong file when a
4288  * parent timeline extends to higher segment numbers than the child we
4289  * want to read.
4290  *
4291  * If we haven't read the timeline history file yet, read it now, so that
4292  * we know which TLIs to scan. We don't save the list in expectedTLEs,
4293  * however, unless we actually find a valid segment. That way if there is
4294  * neither a timeline history file nor a WAL segment in the archive, and
4295  * streaming replication is set up, we'll read the timeline history file
4296  * streamed from the primary when we start streaming, instead of
4297  * recovering with a dummy history generated here.
4298  */
4299  if (expectedTLEs)
4300  tles = expectedTLEs;
4301  else
4303 
4304  foreach(cell, tles)
4305  {
4307  TimeLineID tli = hent->tli;
4308 
4309  if (tli < curFileTLI)
4310  break; /* don't bother looking at too-old TLIs */
4311 
4312  /*
4313  * Skip scanning the timeline ID that the logfile segment to read
4314  * doesn't belong to
4315  */
4316  if (hent->begin != InvalidXLogRecPtr)
4317  {
4318  XLogSegNo beginseg = 0;
4319 
4320  XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4321 
4322  /*
4323  * The logfile segment that doesn't belong to the timeline is
4324  * older or newer than the segment that the timeline started or
4325  * ended at, respectively. It's sufficient to check only the
4326  * starting segment of the timeline here. Since the timelines are
4327  * scanned in descending order in this loop, any segments newer
4328  * than the ending segment should belong to a newer timeline and
4329  * have already been read before. So it's not necessary to check
4330  * the ending segment of the timeline here.
4331  */
4332  if (segno < beginseg)
4333  continue;
4334  }
4335 
4337  {
4338  fd = XLogFileRead(segno, emode, tli,
4339  XLOG_FROM_ARCHIVE, true);
4340  if (fd != -1)
4341  {
4342  elog(DEBUG1, "got WAL segment from archive");
4343  if (!expectedTLEs)
4344  expectedTLEs = tles;
4345  return fd;
4346  }
4347  }
4348 
4350  {
4351  fd = XLogFileRead(segno, emode, tli,
4352  XLOG_FROM_PG_WAL, true);
4353  if (fd != -1)
4354  {
4355  if (!expectedTLEs)
4356  expectedTLEs = tles;
4357  return fd;
4358  }
4359  }
4360  }
4361 
4362  /* Couldn't find it. For simplicity, complain about front timeline */
4364  errno = ENOENT;
4365  ereport(emode,
4367  errmsg("could not open file \"%s\": %m", path)));
4368  return -1;
4369 }
4370 
4371 /*
4372  * Set flag to signal the walreceiver to restart. (The startup process calls
4373  * this on noticing a relevant configuration change.)
4374  */
4375 void
4377 {
4379  {
4380  ereport(LOG,
4381  (errmsg("WAL receiver process shutdown requested")));
4382 
4383  pendingWalRcvRestart = true;
4384  }
4385 }
4386 
4387 
4388 /*
4389  * Has a standby promotion already been triggered?
4390  *
4391  * Unlike CheckForStandbyTrigger(), this works in any process
4392  * that's connected to shared memory.
4393  */
4394 bool
4396 {
4397  /*
4398  * We check shared state each time only until a standby promotion is
4399  * triggered. We can't trigger a promotion again, so there's no need to
4400  * keep checking after the shared variable has once been seen true.
4401  */
4403  return true;
4404 
4408 
4409  return LocalPromoteIsTriggered;
4410 }
4411 
4412 static void
4414 {
4418 
4419  /*
4420  * Mark the recovery pause state as 'not paused' because the paused state
4421  * ends and promotion continues if a promotion is triggered while recovery
4422  * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4423  * return 'paused' while a promotion is ongoing.
4424  */
4425  SetRecoveryPause(false);
4426 
4427  LocalPromoteIsTriggered = true;
4428 }
4429 
4430 /*
4431  * Check whether a promote request has arrived.
4432  */
4433 static bool
4435 {
4437  return true;
4438 
4440  {
4441  ereport(LOG, (errmsg("received promote request")));
4445  return true;
4446  }
4447 
4448  return false;
4449 }
4450 
4451 /*
4452  * Remove the files signaling a standby promotion request.
4453  */
4454 void
4456 {
4457  unlink(PROMOTE_SIGNAL_FILE);
4458 }
4459 
4460 /*
4461  * Check to see if a promote request has arrived.
4462  */
4463 bool
4465 {
4466  struct stat stat_buf;
4467 
4468  if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4469  return true;
4470 
4471  return false;
4472 }
4473 
4474 /*
4475  * Wake up startup process to replay newly arrived WAL, or to notice that
4476  * failover has been requested.
4477  */
4478 void
4480 {
4482 }
4483 
4484 /*
4485  * Schedule a walreceiver wakeup in the main recovery loop.
4486  */
4487 void
4489 {
4491 }
4492 
4493 /*
4494  * Is HotStandby active yet? This is only important in special backends
4495  * since normal backends won't ever be able to connect until this returns
4496  * true. Postmaster knows this by way of signal, not via shared memory.
4497  *
4498  * Unlike testing standbyState, this works in any process that's connected to
4499  * shared memory. (And note that standbyState alone doesn't tell the truth
4500  * anyway.)
4501  */
4502 bool
4504 {
4505  /*
4506  * We check shared state each time only until Hot Standby is active. We
4507  * can't de-activate Hot Standby, so there's no need to keep checking
4508  * after the shared variable has once been seen true.
4509  */
4511  return true;
4512  else
4513  {
4514  /* spinlock is essential on machines with weak memory ordering! */
4518 
4519  return LocalHotStandbyActive;
4520  }
4521 }
4522 
4523 /*
4524  * Like HotStandbyActive(), but to be used only in WAL replay code,
4525  * where we don't need to ask any other process what the state is.
4526  */
4527 static bool
4529 {
4531  return LocalHotStandbyActive;
4532 }
4533 
4534 /*
4535  * Get latest redo apply position.
4536  *
4537  * Exported to allow WALReceiver to read the pointer directly.
4538  */
4539 XLogRecPtr
4541 {
4542  XLogRecPtr recptr;
4543  TimeLineID tli;
4544 
4549 
4550  if (replayTLI)
4551  *replayTLI = tli;
4552  return recptr;
4553 }
4554 
4555 
4556 /*
4557  * Get position of last applied, or the record being applied.
4558  *
4559  * This is different from GetXLogReplayRecPtr() in that if a WAL
4560  * record is currently being applied, this includes that record.
4561  */
4562 XLogRecPtr
4564 {
4565  XLogRecPtr recptr;
4566  TimeLineID tli;
4567 
4569  recptr = XLogRecoveryCtl->replayEndRecPtr;
4572 
4573  if (replayEndTLI)
4574  *replayEndTLI = tli;
4575  return recptr;
4576 }
4577 
4578 /*
4579  * Save timestamp of latest processed commit/abort record.
4580  *
4581  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4582  * seen by processes other than the startup process. Note in particular
4583  * that CreateRestartPoint is executed in the checkpointer.
4584  */
4585 static void
4587 {
4591 }
4592 
4593 /*
4594  * Fetch timestamp of latest processed commit/abort record.
4595  */
4598 {
4599  TimestampTz xtime;
4600 
4604 
4605  return xtime;
4606 }
4607 
4608 /*
4609  * Save timestamp of the next chunk of WAL records to apply.
4610  *
4611  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4612  * seen by all backends.
4613  */
4614 static void
4616 {
4620 }
4621 
4622 /*
4623  * Fetch the saved timestamp of the current chunk of WAL records.
4624  * Startup process maintains an accurate local copy in XLogReceiptTime
4625  */
4628 {
4629  TimestampTz xtime;
4630 
4634 
4635  return xtime;
4636 }
4637 
4638 /*
4639  * Returns time of receipt of current chunk of XLOG data, as well as
4640  * whether it was received from streaming replication or from archives.
4641  */
4642 void
4643 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4644 {
4645  /*
4646  * This must be executed in the startup process, since we don't export the
4647  * relevant state to shared memory.
4648  */
4649  Assert(InRecovery);
4650 
4651  *rtime = XLogReceiptTime;
4652  *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4653 }
4654 
4655 /*
4656  * Note that text field supplied is a parameter name and does not require
4657  * translation
4658  */
4659 void
4660 RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4661 {
4662  if (currValue < minValue)
4663  {
4665  {
4666  bool warned_for_promote = false;
4667 
4668  ereport(WARNING,
4669  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4670  errmsg("hot standby is not possible because of insufficient parameter settings"),
4671  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4672  param_name,
4673  currValue,
4674  minValue)));
4675 
4676  SetRecoveryPause(true);
4677 
4678  ereport(LOG,
4679  (errmsg("recovery has paused"),
4680  errdetail("If recovery is unpaused, the server will shut down."),
4681  errhint("You can then restart the server after making the necessary configuration changes.")));
4682 
4684  {
4686 
4687  if (CheckForStandbyTrigger())
4688  {
4689  if (!warned_for_promote)
4690  ereport(WARNING,
4691  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4692  errmsg("promotion is not possible because of insufficient parameter settings"),
4693 
4694  /*
4695  * Repeat the detail from above so it's easy to find
4696  * in the log.
4697  */
4698  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4699  param_name,
4700  currValue,
4701  minValue),
4702  errhint("Restart the server after making the necessary configuration changes.")));
4703  warned_for_promote = true;
4704  }
4705 
4706  /*
4707  * If recovery pause is requested then set it paused. While
4708  * we are in the loop, user might resume and pause again so
4709  * set this every time.
4710  */
4712 
4713  /*
4714  * We wait on a condition variable that will wake us as soon
4715  * as the pause ends, but we use a timeout so we can check the
4716  * above conditions periodically too.
4717  */
4719  WAIT_EVENT_RECOVERY_PAUSE);
4720  }
4722  }
4723 
4724  ereport(FATAL,
4725  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4726  errmsg("recovery aborted because of insufficient parameter settings"),
4727  /* Repeat the detail from above so it's easy to find in the log. */
4728  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4729  param_name,
4730  currValue,
4731  minValue),
4732  errhint("You can restart the server after making the necessary configuration changes.")));
4733  }
4734 }
4735 
4736 
4737 /*
4738  * GUC check_hook for primary_slot_name
4739  */
4740 bool
4742 {
4743  if (*newval && strcmp(*newval, "") != 0 &&
4745  return false;
4746 
4747  return true;
4748 }
4749 
4750 /*
4751  * Recovery target settings: Only one of the several recovery_target* settings
4752  * may be set. Setting a second one results in an error. The global variable
4753  * recoveryTarget tracks which kind of recovery target was chosen. Other
4754  * variables store the actual target value (for example a string or a xid).
4755  * The assign functions of the parameters check whether a competing parameter
4756  * was already set. But we want to allow setting the same parameter multiple
4757  * times. We also want to allow unsetting a parameter and setting a different
4758  * one, so we unset recoveryTarget when the parameter is set to an empty
4759  * string.
4760  *
4761  * XXX this code is broken by design. Throwing an error from a GUC assign
4762  * hook breaks fundamental assumptions of guc.c. So long as all the variables
4763  * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4764  * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4765  * that we have odd behaviors such as unexpected GUC ordering dependencies.
4766  */
4767 
4768 static void
4770 error_multiple_recovery_targets(void)
4771 {
4772  ereport(ERROR,
4773  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4774  errmsg("multiple recovery targets specified"),
4775  errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4776 }
4777 
4778 /*
4779  * GUC check_hook for recovery_target
4780  */
4781 bool
4783 {
4784  if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4785  {
4786  GUC_check_errdetail("The only allowed value is \"immediate\".");
4787  return false;
4788  }
4789  return true;
4790 }
4791 
4792 /*
4793  * GUC assign_hook for recovery_target
4794  */
4795 void
4796 assign_recovery_target(const char *newval, void *extra)
4797 {
4800  error_multiple_recovery_targets();
4801 
4802  if (newval && strcmp(newval, "") != 0)
4804  else
4806 }
4807 
4808 /*
4809  * GUC check_hook for recovery_target_lsn
4810  */
4811 bool
4813 {
4814  if (strcmp(*newval, "") != 0)
4815  {
4816  XLogRecPtr lsn;
4817  XLogRecPtr *myextra;
4818  bool have_error = false;
4819 
4820  lsn = pg_lsn_in_internal(*newval, &have_error);
4821  if (have_error)
4822  return false;
4823 
4824  myextra = (XLogRecPtr *) guc_malloc(ERROR, sizeof(XLogRecPtr));
4825  *myextra = lsn;
4826  *extra = (void *) myextra;
4827  }
4828  return true;
4829 }
4830 
4831 /*
4832  * GUC assign_hook for recovery_target_lsn
4833  */
4834 void
4835 assign_recovery_target_lsn(const char *newval, void *extra)
4836 {
4839  error_multiple_recovery_targets();
4840 
4841  if (newval && strcmp(newval, "") != 0)
4842  {
4844  recoveryTargetLSN = *((XLogRecPtr *) extra);
4845  }
4846  else
4848 }
4849 
4850 /*
4851  * GUC check_hook for recovery_target_name
4852  */
4853 bool
4855 {
4856  /* Use the value of newval directly */
4857  if (strlen(*newval) >= MAXFNAMELEN)
4858  {
4859  GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4860  "recovery_target_name", MAXFNAMELEN - 1);
4861  return false;
4862  }
4863  return true;
4864 }
4865 
4866 /*
4867  * GUC assign_hook for recovery_target_name
4868  */
4869 void
4870 assign_recovery_target_name(const char *newval, void *extra)
4871 {
4874  error_multiple_recovery_targets();
4875 
4876  if (newval && strcmp(newval, "") != 0)
4877  {
4880  }
4881  else
4883 }
4884 
4885 /*
4886  * GUC check_hook for recovery_target_time
4887  *
4888  * The interpretation of the recovery_target_time string can depend on the
4889  * time zone setting, so we need to wait until after all GUC processing is
4890  * done before we can do the final parsing of the string. This check function
4891  * only does a parsing pass to catch syntax errors, but we store the string
4892  * and parse it again when we need to use it.
4893  */
4894 bool
4896 {
4897  if (strcmp(*newval, "") != 0)
4898  {
4899  /* reject some special values */
4900  if (strcmp(*newval, "now") == 0 ||
4901  strcmp(*newval, "today") == 0 ||
4902  strcmp(*newval, "tomorrow") == 0 ||
4903  strcmp(*newval, "yesterday") == 0)
4904  {
4905  return false;
4906  }
4907 
4908  /*
4909  * parse timestamp value (see also timestamptz_in())
4910  */
4911  {
4912  char *str = *newval;
4913  fsec_t fsec;
4914  struct pg_tm tt,
4915  *tm = &tt;
4916  int tz;
4917  int dtype;
4918  int nf;
4919  int dterr;
4920  char *field[MAXDATEFIELDS];
4921  int ftype[MAXDATEFIELDS];
4922  char workbuf[MAXDATELEN + MAXDATEFIELDS];
4923  DateTimeErrorExtra dtextra;
4925 
4926  dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4927  field, ftype, MAXDATEFIELDS, &nf);
4928  if (dterr == 0)
4929  dterr = DecodeDateTime(field, ftype, nf,
4930  &dtype, tm, &fsec, &tz, &dtextra);
4931  if (dterr != 0)
4932  return false;
4933  if (dtype != DTK_DATE)
4934  return false;
4935 
4936  if (tm2timestamp(tm, fsec, &tz, &timestamp) != 0)
4937  {
4938  GUC_check_errdetail("timestamp out of range: \"%s\"", str);
4939  return false;
4940  }
4941  }
4942  }
4943  return true;
4944 }
4945 
4946 /*
4947  * GUC assign_hook for recovery_target_time
4948  */
4949 void
4950 assign_recovery_target_time(const char *newval, void *extra)
4951 {
4954  error_multiple_recovery_targets();
4955 
4956  if (newval && strcmp(newval, "") != 0)
4958  else
4960 }
4961 
4962 /*
4963  * GUC check_hook for recovery_target_timeline
4964  */
4965 bool
4967 {
4969  RecoveryTargetTimeLineGoal *myextra;
4970 
4971  if (strcmp(*newval, "current") == 0)
4973  else if (strcmp(*newval, "latest") == 0)
4975  else
4976  {
4978 
4979  errno = 0;
4980  strtoul(*newval, NULL, 0);
4981  if (errno == EINVAL || errno == ERANGE)
4982  {
4983  GUC_check_errdetail("\"recovery_target_timeline\" is not a valid number.");
4984  return false;
4985  }
4986  }
4987 
4989  *myextra = rttg;
4990  *extra = (void *) myextra;
4991 
4992  return true;
4993 }
4994 
4995 /*
4996  * GUC assign_hook for recovery_target_timeline
4997  */
4998 void
4999 assign_recovery_target_timeline(const char *newval, void *extra)
5000 {
5003  recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
5004  else
5006 }
5007 
5008 /*
5009  * GUC check_hook for recovery_target_xid
5010  */
5011 bool
5013 {
5014  if (strcmp(*newval, "") != 0)
5015  {
5016  TransactionId xid;
5017  TransactionId *myextra;
5018 
5019  errno = 0;
5020  xid = (TransactionId) strtou64(*newval, NULL, 0);
5021  if (errno == EINVAL || errno == ERANGE)
5022  return false;
5023 
5024  myextra = (TransactionId *) guc_malloc(ERROR, sizeof(TransactionId));
5025  *myextra = xid;
5026  *extra = (void *) myextra;
5027  }
5028  return true;
5029 }
5030 
5031 /*
5032  * GUC assign_hook for recovery_target_xid
5033  */
5034 void
5035 assign_recovery_target_xid(const char *newval, void *extra)
5036 {
5039  error_multiple_recovery_targets();
5040 
5041  if (newval && strcmp(newval, "") != 0)
5042  {
5044  recoveryTargetXid = *((TransactionId *) extra);
5045  }
5046  else
5048 }
TimeLineID findNewestTimeLine(TimeLineID startTLI)
Definition: timeline.c:264
TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history)
Definition: timeline.c:544
XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI)
Definition: timeline.c:572
bool existsTimeLineHistory(TimeLineID probeTLI)
Definition: timeline.c:222
void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end)
Definition: timeline.c:50
List * readTimeLineHistory(TimeLineID targetTLI)
Definition: timeline.c:76
bool tliInHistory(TimeLineID tli, List *expectedTLEs)
Definition: timeline.c:526
void remove_tablespace_symlink(const char *linkloc)
Definition: tablespace.c:883
bool allow_in_place_tablespaces
Definition: tablespace.c:85
void HandleStartupProcInterrupts(void)
Definition: startup.c:154
void disable_startup_progress_timeout(void)
Definition: startup.c:309
bool IsPromoteSignaled(void)
Definition: startup.c:288
void begin_startup_progress_phase(void)
Definition: startup.c:343
void ResetPromoteSignaled(void)
Definition: startup.c:294
int ParseDateTime(const char *timestr, char *workbuf, size_t buflen, char **field, int *ftype, int maxfields, int *numfields)
Definition: datetime.c:754
int DecodeDateTime(char **field, int *ftype, int nf, int *dtype, struct pg_tm *tm, fsec_t *fsec, int *tzp, DateTimeErrorExtra *extra)
Definition: datetime.c:978
long TimestampDifferenceMilliseconds(TimestampTz start_time, TimestampTz stop_time)
Definition: timestamp.c:1767
int tm2timestamp(struct pg_tm *tm, fsec_t fsec, int *tzp, Timestamp *result)
Definition: timestamp.c:1998
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1791
Datum timestamptz_in(PG_FUNCTION_ARGS)
Definition: timestamp.c:417
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1655
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1619
const char * timestamptz_to_str(TimestampTz t)
Definition: timestamp.c:1854
uint32 BlockNumber
Definition: block.h:31
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4913
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5131
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:404
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:195
@ RBM_NORMAL_NO_LOG
Definition: bufmgr.h:51
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:355
Pointer Page
Definition: bufpage.h:78
static XLogRecPtr PageGetLSN(Page page)
Definition: bufpage.h:383
unsigned int uint32
Definition: c.h:506
signed int int32
Definition: c.h:494
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:182
#define Assert(condition)
Definition: c.h:858
#define PG_BINARY
Definition: c.h:1273
#define UINT64_FORMAT
Definition: c.h:549
#define strtou64(str, endptr, base)
Definition: c.h:1298
unsigned char uint8
Definition: c.h:504
uint32 TransactionId
Definition: c.h:652
size_t Size
Definition: c.h:605
void RequestCheckpoint(int flags)
Definition: checkpointer.c:941
bool ConditionVariableCancelSleep(void)
bool ConditionVariableTimedSleep(ConditionVariable *cv, long timeout, uint32 wait_event_info)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
int64 TimestampTz
Definition: timestamp.h:39
int32 fsec_t
Definition: timestamp.h:41
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1155
int errcode_for_file_access(void)
Definition: elog.c:878
int errdetail(const char *fmt,...)
Definition: elog.c:1201
ErrorContextCallback * error_context_stack
Definition: elog.c:94
int errhint(const char *fmt,...)
Definition: elog.c:1315
int errcode(int sqlerrcode)
Definition: elog.c:855
int errmsg(const char *fmt,...)
Definition: elog.c:1068
#define LOG
Definition: elog.h:31
#define errcontext
Definition: elog.h:196
#define FATAL
Definition: elog.h:41
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:224
#define ereport(elevel,...)
Definition: elog.h:149
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2909
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2583
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1109
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:782
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1087
int FreeFile(FILE *file)
Definition: fd.c:2781
int pg_fsync(int fd)
Definition: fd.c:386
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2843
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:525
@ PGFILETYPE_LNK
Definition: file_utils.h:24
#define DirectFunctionCall3(func, arg1, arg2, arg3)
Definition: fmgr.h:646
bool IsUnderPostmaster
Definition: globals.c:118
char * DataDir
Definition: globals.c:69
bool IsPostmasterEnvironment
Definition: globals.c:117
void * guc_malloc(int elevel, size_t size)
Definition: guc.c:640
#define newval
#define GUC_check_errdetail
Definition: guc.h:447
GucSource
Definition: guc.h:108
const char * str
#define MAXDATEFIELDS
Definition: datetime.h:202
#define DTK_DATE
Definition: datetime.h:144
#define MAXDATELEN
Definition: datetime.h:200
#define close(a)
Definition: win32.h:12
void proc_exit(int code)
Definition: ipc.c:104
int i
Definition: isn.c:73
void OwnLatch(Latch *latch)
Definition: latch.c:463
void DisownLatch(Latch *latch)
Definition: latch.c:489
void InitSharedLatch(Latch *latch)
Definition: latch.c:430
void SetLatch(Latch *latch)
Definition: latch.c:632
void ResetLatch(Latch *latch)
Definition: latch.c:724
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:517
#define WL_TIMEOUT
Definition: latch.h:130
#define WL_EXIT_ON_PM_DEATH
Definition: latch.h:132
#define WL_LATCH_SET
Definition: latch.h:127
List * lappend(List *list, void *datum)
Definition: list.c:339
void list_free_deep(List *list)
Definition: list.c:1560
static struct pg_tm tm
Definition: localtime.c:104
char * pstrdup(const char *in)
Definition: mcxt.c:1696
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc0(Size size)
Definition: mcxt.c:1347
void * palloc(Size size)
Definition: mcxt.c:1317
#define AmStartupProcess()
Definition: miscadmin.h:382
#define IsBootstrapProcessingMode()
Definition: miscadmin.h:454
void * arg
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
#define MAXPGPATH
#define XLOG_RESTORE_POINT
Definition: pg_control.h:74
#define XLOG_CHECKPOINT_REDO
Definition: pg_control.h:81
#define XLOG_OVERWRITE_CONTRECORD
Definition: pg_control.h:80
DBState
Definition: pg_control.h:89
@ DB_IN_ARCHIVE_RECOVERY
Definition: pg_control.h:95
@ DB_SHUTDOWNED_IN_RECOVERY
Definition: pg_control.h:92
@ DB_SHUTDOWNED
Definition: pg_control.h:91
@ DB_IN_CRASH_RECOVERY
Definition: pg_control.h:94
#define XLOG_CHECKPOINT_SHUTDOWN
Definition: pg_control.h:67
#define XLOG_BACKUP_END
Definition: pg_control.h:72
#define XLOG_CHECKPOINT_ONLINE
Definition: pg_control.h:68
#define XLOG_END_OF_RECOVERY
Definition: pg_control.h:76
const void size_t len
#define lfirst(lc)
Definition: pg_list.h:172
#define NIL
Definition: pg_list.h:68
XLogRecPtr pg_lsn_in_internal(const char *str, bool *have_error)
Definition: pg_lsn.c:29
static rewind_source * source
Definition: pg_rewind.c:89
const char * pg_rusage_show(const PGRUsage *ru0)
Definition: pg_rusage.c:40
void pg_rusage_init(PGRUsage *ru0)
Definition: pg_rusage.c:27
static char * buf
Definition: pg_test_fsync.c:73
int64 timestamp
void SendPostmasterSignal(PMSignalReason reason)
Definition: pmsignal.c:181
@ PMSIGNAL_RECOVERY_STARTED
Definition: pmsignal.h:35
@ PMSIGNAL_BEGIN_HOT_STANDBY
Definition: pmsignal.h:36
#define pg_pread
Definition: port.h:225
#define snprintf
Definition: port.h:238
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:252
static Datum CStringGetDatum(const char *X)
Definition: postgres.h:350
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:212
#define InvalidOid
Definition: postgres_ext.h:36
static int fd(const char *x, int i)
Definition: preproc-init.c:105
void RecordKnownAssignedTransactionIds(TransactionId xid)
Definition: procarray.c:4407
void KnownAssignedTransactionIdsIdleMaintenance(void)
Definition: procarray.c:4544
static void set_ps_display(const char *activity)
Definition: ps_status.h:40
char * psprintf(const char *fmt,...)
Definition: psprintf.c:46
ForkNumber
Definition: relpath.h:48
@ MAIN_FORKNUM
Definition: relpath.h:50
void RmgrStartup(void)
Definition: rmgr.c:58
void RmgrCleanup(void)
Definition: rmgr.c:74
int slock_t
Definition: s_lock.h:670
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:387
static pg_noinline void Size size
Definition: slab.c:607
bool ReplicationSlotValidateName(const char *name, int elevel)
Definition: slot.c:252
void ShutDownSlotSync(void)
Definition: slotsync.c:1563
#define SpinLockInit(lock)
Definition: spin.h:60
#define SpinLockRelease(lock)
Definition: spin.h:64
#define SpinLockAcquire(lock)
Definition: spin.h:62
#define ereport_startup_progress(msg,...)
Definition: startup.h:18
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition: stringinfo.c:97
void appendStringInfoString(StringInfo str, const char *s)
Definition: stringinfo.c:182
void appendStringInfoChar(StringInfo str, char ch)
Definition: stringinfo.c:194
void initStringInfo(StringInfo str)
Definition: stringinfo.c:59
Oid oldestMultiDB
Definition: pg_control.h:50
MultiXactId oldestMulti
Definition: pg_control.h:49
MultiXactOffset nextMultiOffset
Definition: pg_control.h:46
TransactionId newestCommitTsXid
Definition: pg_control.h:54
TransactionId oldestXid
Definition: pg_control.h:47
TimeLineID PrevTimeLineID
Definition: pg_control.h:40
TimeLineID ThisTimeLineID
Definition: pg_control.h:39
Oid nextOid
Definition: pg_control.h:44
MultiXactId nextMulti
Definition: pg_control.h:45
FullTransactionId nextXid
Definition: pg_control.h:43
TransactionId oldestCommitTsXid
Definition: pg_control.h:52
XLogRecPtr redo
Definition: pg_control.h:37
Oid oldestXidDB
Definition: pg_control.h:48
XLogRecPtr backupStartPoint
Definition: pg_control.h:169
bool backupEndRequired
Definition: pg_control.h:171
CheckPoint checkPointCopy
Definition: pg_control.h:134
XLogRecPtr backupEndPoint
Definition: pg_control.h:170
XLogRecPtr minRecoveryPoint
Definition: pg_control.h:167
XLogRecPtr checkPoint
Definition: pg_control.h:132
uint64 system_identifier
Definition: pg_control.h:109
TimeLineID minRecoveryPointTLI
Definition: pg_control.h:168
Definition: dirent.c:26
XLogRecPtr lastPageBeginPtr
Definition: xlogrecovery.h:111
XLogRecPtr abortedRecPtr
Definition: xlogrecovery.h:120
XLogRecPtr missingContrecPtr
Definition: xlogrecovery.h:121
TimeLineID endOfLogTLI
Definition: xlogrecovery.h:109
struct ErrorContextCallback * previous
Definition: elog.h:295
void(* callback)(void *arg)
Definition: elog.h:296
Definition: latch.h:113
Definition: pg_list.h:54
RelFileNumber relNumber
void(* rm_mask)(char *pagedata, BlockNumber blkno)
void(* rm_redo)(XLogReaderState *record)
const char *(* rm_identify)(uint8 info)
const char * rm_name
void(* rm_desc)(StringInfo buf, XLogReaderState *record)
XLogRecPtr begin
Definition: timeline.h:28
TimeLineID tli
Definition: timeline.h:27
XLogRecPtr end
Definition: timeline.h