PostgreSQL Source Code  git master
xlogrecovery.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * xlogrecovery.c
4  * Functions for WAL recovery, standby mode
5  *
6  * This source file contains functions controlling WAL recovery.
7  * InitWalRecovery() initializes the system for crash or archive recovery,
8  * or standby mode, depending on configuration options and the state of
9  * the control file and possible backup label file. PerformWalRecovery()
10  * performs the actual WAL replay, calling the rmgr-specific redo routines.
11  * EndWalRecovery() performs end-of-recovery checks and cleanup actions,
12  * and prepares information needed to initialize the WAL for writes. In
13  * addition to these three main functions, there are a bunch of functions
14  * for interrogating recovery state and controlling the recovery process.
15  *
16  *
17  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
18  * Portions Copyright (c) 1994, Regents of the University of California
19  *
20  * src/backend/access/transam/xlogrecovery.c
21  *
22  *-------------------------------------------------------------------------
23  */
24 
25 #include "postgres.h"
26 
27 #include <ctype.h>
28 #include <math.h>
29 #include <time.h>
30 #include <sys/stat.h>
31 #include <sys/time.h>
32 #include <unistd.h>
33 
34 #include "access/timeline.h"
35 #include "access/transam.h"
36 #include "access/xact.h"
37 #include "access/xlog_internal.h"
38 #include "access/xlogarchive.h"
39 #include "access/xlogprefetcher.h"
40 #include "access/xlogreader.h"
41 #include "access/xlogrecovery.h"
42 #include "access/xlogutils.h"
43 #include "backup/basebackup.h"
44 #include "catalog/pg_control.h"
45 #include "commands/tablespace.h"
46 #include "common/file_utils.h"
47 #include "miscadmin.h"
48 #include "pgstat.h"
49 #include "postmaster/bgwriter.h"
50 #include "postmaster/startup.h"
51 #include "replication/slot.h"
53 #include "storage/fd.h"
54 #include "storage/ipc.h"
55 #include "storage/latch.h"
56 #include "storage/pmsignal.h"
57 #include "storage/proc.h"
58 #include "storage/procarray.h"
59 #include "storage/spin.h"
60 #include "utils/builtins.h"
61 #include "utils/datetime.h"
62 #include "utils/guc_hooks.h"
63 #include "utils/pg_lsn.h"
64 #include "utils/ps_status.h"
65 #include "utils/pg_rusage.h"
66 
67 /* Unsupported old recovery command file names (relative to $PGDATA) */
68 #define RECOVERY_COMMAND_FILE "recovery.conf"
69 #define RECOVERY_COMMAND_DONE "recovery.done"
70 
71 /*
72  * GUC support
73  */
75  {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
76  {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
77  {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
78  {NULL, 0, false}
79 };
80 
81 /* options formerly taken from recovery.conf for archive recovery */
83 char *recoveryEndCommand = NULL;
84 char *archiveCleanupCommand = NULL;
91 const char *recoveryTargetName;
94 
95 /* options formerly taken from recovery.conf for XLOG streaming */
96 char *PrimaryConnInfo = NULL;
97 char *PrimarySlotName = NULL;
99 
100 /*
101  * recoveryTargetTimeLineGoal: what the user requested, if any
102  *
103  * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
104  *
105  * recoveryTargetTLI: the currently understood target timeline; it may change during recovery
106  *
107  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
108  * the timelines of its known parents, newest first (so recoveryTargetTLI is
109  * always the first list member). Only these TLIs are expected to be seen in
110  * the WAL segments we read, and indeed only these TLIs will be considered as
111  * candidate WAL files to open at all.
112  *
113  * curFileTLI: the TLI appearing in the name of the current input WAL file.
114  * (This is not necessarily the same as the timeline from which we are
115  * replaying WAL, which StartupXLOG calls replayTLI, because we could be
116  * scanning data that was copied from an ancestor timeline when the current
117  * file was created.) During a sequential scan we do not allow this value
118  * to decrease.
119  */
125 
126 /*
127  * When ArchiveRecoveryRequested is set, archive recovery was requested,
128  * ie. signal files were present. When InArchiveRecovery is set, we are
129  * currently recovering using offline XLOG archives. These variables are only
130  * valid in the startup process.
131  *
132  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
133  * currently performing crash recovery using only XLOG files in pg_wal, but
134  * will switch to using offline XLOG archives as soon as we reach the end of
135  * WAL in pg_wal.
136 */
138 bool InArchiveRecovery = false;
139 
140 /*
141  * When StandbyModeRequested is set, standby mode was requested, i.e.
142  * standby.signal file was present. When StandbyMode is set, we are currently
143  * in standby mode. These variables are only valid in the startup process.
144  * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
145  */
146 static bool StandbyModeRequested = false;
147 bool StandbyMode = false;
148 
149 /* was a signal file present at startup? */
150 static bool standby_signal_file_found = false;
151 static bool recovery_signal_file_found = false;
152 
153 /*
154  * CheckPointLoc is the position of the checkpoint record that determines
155  * where to start the replay. It comes from the backup label file or the
156  * control file.
157  *
158  * RedoStartLSN is the checkpoint's REDO location, also from the backup label
159  * file or the control file. In standby mode, XLOG streaming usually starts
160  * from the position where an invalid record was found. But if we fail to
161  * read even the initial checkpoint record, we use the REDO location instead
162  * of the checkpoint location as the start position of XLOG streaming.
163  * Otherwise we would have to jump backwards to the REDO location after
164  * reading the checkpoint record, because the REDO record can precede the
165  * checkpoint record.
166  */
171 
172 /*
173  * Local copy of SharedHotStandbyActive variable. False actually means "not
174  * known, need to check the shared state".
175  */
176 static bool LocalHotStandbyActive = false;
177 
178 /*
179  * Local copy of SharedPromoteIsTriggered variable. False actually means "not
180  * known, need to check the shared state".
181  */
182 static bool LocalPromoteIsTriggered = false;
183 
184 /* Has the recovery code requested a walreceiver wakeup? */
186 
187 /* XLogReader object used to parse the WAL records */
189 
190 /* XLogPrefetcher object used to consume WAL records with read-ahead */
192 
193 /* Parameters passed down from ReadRecord to the XLogPageRead callback. */
194 typedef struct XLogPageReadPrivate
195 {
196  int emode;
197  bool fetching_ckpt; /* are we fetching a checkpoint record? */
201 
202 /* flag to tell XLogPageRead that we have started replaying */
203 static bool InRedo = false;
204 
205 /*
206  * Codes indicating where we got a WAL file from during recovery, or where
207  * to attempt to get one.
208  */
209 typedef enum
210 {
211  XLOG_FROM_ANY = 0, /* request to read WAL from any source */
212  XLOG_FROM_ARCHIVE, /* restored using restore_command */
213  XLOG_FROM_PG_WAL, /* existing file in pg_wal */
214  XLOG_FROM_STREAM /* streamed from primary */
216 
217 /* human-readable names for XLogSources, for debugging output */
218 static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
219 
220 /*
221  * readFile is -1 or a kernel FD for the log file segment that's currently
222  * open for reading. readSegNo identifies the segment. readOff is the offset
223  * of the page just read, readLen indicates how much of it has been read into
224  * readBuf, and readSource indicates where we got the currently open file from.
225  *
226  * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
227  * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
228  * worthwhile, since the XLOG is not read by general-purpose sessions.
229  */
230 static int readFile = -1;
231 static XLogSegNo readSegNo = 0;
232 static uint32 readOff = 0;
233 static uint32 readLen = 0;
235 
236 /*
237  * Keeps track of which source we're currently reading from. This is
238  * different from readSource in that this is always set, even when we don't
239  * currently have a WAL file open. If lastSourceFailed is set, our last
240  * attempt to read from currentSource failed, and we should try another source
241  * next.
242  *
243  * pendingWalRcvRestart is set when a config change occurs that requires a
244  * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
245  */
247 static bool lastSourceFailed = false;
248 static bool pendingWalRcvRestart = false;
249 
250 /*
251  * These variables track when we last obtained some WAL data to process,
252  * and where we got it from. (XLogReceiptSource is initially the same as
253  * readSource, but readSource gets reset to zero when we don't have data
254  * to process right now. It is also different from currentSource, which
255  * also changes when we try to read from a source and fail, while
256  * XLogReceiptSource tracks where we last successfully read some WAL.)
257  */
260 
261 /* Local copy of WalRcv->flushedUpto */
264 
265 /*
266  * Copy of minRecoveryPoint and backupEndPoint from the control file.
267  *
268  * In order to reach consistency, we must replay the WAL up to
269  * minRecoveryPoint. If backupEndRequired is true, we must also reach
270  * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
271  * to backupStartPoint.
272  *
273  * Note: In archive recovery, after consistency has been reached, the
274  * functions in xlog.c will start updating minRecoveryPoint in the control
275  * file. But this copy of minRecoveryPoint variable reflects the value at the
276  * beginning of recovery, and is *not* updated after consistency is reached.
277  */
280 
283 static bool backupEndRequired = false;
284 
285 /*
286  * Have we reached a consistent database state? In crash recovery, we have
287  * to replay all the WAL, so reachedConsistency is never set. During archive
288  * recovery, the database is consistent once minRecoveryPoint is reached.
289  *
290  * Consistent state means that the system is internally consistent, all
291  * the WAL has been replayed up to a certain point, and importantly, there
292  * is no trace of later actions on disk.
293  */
294 bool reachedConsistency = false;
295 
296 /* Buffers dedicated to consistency checks of size BLCKSZ */
297 static char *replay_image_masked = NULL;
298 static char *primary_image_masked = NULL;
299 
300 
301 /*
302  * Shared-memory state for WAL recovery.
303  */
304 typedef struct XLogRecoveryCtlData
305 {
306  /*
307  * SharedHotStandbyActive indicates if we allow hot standby queries to be
308  * run. Protected by info_lck.
309  */
311 
312  /*
313  * SharedPromoteIsTriggered indicates if a standby promotion has been
314  * triggered. Protected by info_lck.
315  */
317 
318  /*
319  * recoveryWakeupLatch is used to wake up the startup process to continue
320  * WAL replay, if it is waiting for WAL to arrive or promotion to be
321  * requested.
322  *
323  * Note that the startup process also uses another latch, its procLatch,
324  * to wait for recovery conflict. We could get rid of recoveryWakeupLatch
325  * for signaling the startup process in favor of using its procLatch, which
326  * would comport better with possible generic signal handlers using that
327  * latch. But we should not do that because the startup process doesn't
328  * assume that it's woken up by the walreceiver process or SIGHUP signal
329  * handler while it's waiting for recovery conflict. The separate latches,
330  * recoveryWakeupLatch and procLatch, should be used for inter-process
331  * communication for WAL replay and recovery conflict, respectively.
332  */
334 
335  /*
336  * Last record successfully replayed.
337  */
338  XLogRecPtr lastReplayedReadRecPtr; /* start position */
339  XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */
340  TimeLineID lastReplayedTLI; /* timeline */
341 
342  /*
343  * When we're currently replaying a record, ie. in a redo function,
344  * replayEndRecPtr points to the end+1 of the record being replayed,
345  * otherwise it's equal to lastReplayedEndRecPtr.
346  */
349  /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
351 
352  /*
353  * timestamp of when we started replaying the current chunk of WAL data,
354  * only relevant for replication or archive recovery
355  */
357  /* Recovery pause state */
360 
361  slock_t info_lck; /* locks shared variables shown above */
363 
365 
366 /*
367  * abortedRecPtr is the start pointer of a broken record at end of WAL when
368  * recovery completes; missingContrecPtr is the location of the first
369  * contrecord that went missing. See CreateOverwriteContrecordRecord for
370  * details.
371  */
374 
375 /*
376  * if recoveryStopsBefore/After returns true, it saves information of the stop
377  * point here
378  */
383 static bool recoveryStopAfter;
384 
385 /* prototypes for local functions */
386 static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
387 
388 static void readRecoverySignalFile(void);
389 static void validateRecoveryParameters(void);
390 static bool read_backup_label(XLogRecPtr *checkPointLoc,
391  TimeLineID *backupLabelTLI,
392  bool *backupEndRequired, bool *backupFromStandby);
393 static bool read_tablespace_map(List **tablespaces);
394 
395 static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
396 static void CheckRecoveryConsistency(void);
397 static void rm_redo_error_callback(void *arg);
398 #ifdef WAL_DEBUG
399 static void xlog_outrec(StringInfo buf, XLogReaderState *record);
400 #endif
401 static void xlog_block_info(StringInfo buf, XLogReaderState *record);
402 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
403  TimeLineID prevTLI, TimeLineID replayTLI);
404 static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
405 static void verifyBackupPageConsistency(XLogReaderState *record);
406 
407 static bool recoveryStopsBefore(XLogReaderState *record);
408 static bool recoveryStopsAfter(XLogReaderState *record);
409 static char *getRecoveryStopReason(void);
410 static void recoveryPausesHere(bool endOfRecovery);
411 static bool recoveryApplyDelay(XLogReaderState *record);
412 static void ConfirmRecoveryPaused(void);
413 
415  int emode, bool fetching_ckpt,
416  TimeLineID replayTLI);
417 
418 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
419  int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
421  bool randAccess,
422  bool fetching_ckpt,
423  XLogRecPtr tliRecPtr,
424  TimeLineID replayTLI,
425  XLogRecPtr replayLSN,
426  bool nonblocking);
427 static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
429  XLogRecPtr RecPtr, TimeLineID replayTLI);
430 static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
431 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
432  XLogSource source, bool notfoundOk);
433 static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source);
434 
435 static bool CheckForStandbyTrigger(void);
436 static void SetPromoteIsTriggered(void);
437 static bool HotStandbyActiveInReplay(void);
438 
439 static void SetCurrentChunkStartTime(TimestampTz xtime);
440 static void SetLatestXTime(TimestampTz xtime);
441 
442 /*
443  * Initialization of shared memory for WAL recovery
444  */
445 Size
447 {
448  Size size;
449 
450  /* XLogRecoveryCtl */
451  size = sizeof(XLogRecoveryCtlData);
452 
453  return size;
454 }
455 
456 void
458 {
459  bool found;
460 
462  ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
463  if (found)
464  return;
465  memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
466 
470 }
471 
472 /*
473  * Prepare the system for WAL recovery, if needed.
474  *
475  * This is called by StartupXLOG() which coordinates the server startup
476  * sequence. This function analyzes the control file and the backup label
477  * file, if any, and figures out whether we need to perform crash recovery or
478  * archive recovery, and how far we need to replay the WAL to reach a
479  * consistent state.
480  *
481  * This doesn't yet change the on-disk state, except for creating the symlinks
482  * from table space map file if any, and for fetching WAL files needed to find
483  * the checkpoint record. On entry, the caller has already read the control
484  * file into memory, and passes it as argument. This function updates it to
485  * reflect the recovery state, and the caller is expected to write it back to
486  * disk after initializing other subsystems, but before calling
487  * PerformWalRecovery().
488  *
489  * This initializes some global variables like ArchiveRecoveryRequested,
490  * StandbyModeRequested and InRecovery.
491  */
492 void
494  bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
495 {
496  XLogPageReadPrivate *private;
497  struct stat st;
498  bool wasShutdown;
499  XLogRecord *record;
500  DBState dbstate_at_startup;
501  bool haveTblspcMap = false;
502  bool haveBackupLabel = false;
503  CheckPoint checkPoint;
504  bool backupFromStandby = false;
505 
506  dbstate_at_startup = ControlFile->state;
507 
508  /*
509  * Initialize on the assumption we want to recover to the latest timeline
510  * that's active according to pg_control.
511  */
515  else
517 
518  /*
519  * Check for signal files, and if so set up state for offline recovery
520  */
523 
525  {
527  ereport(LOG,
528  (errmsg("entering standby mode")));
530  ereport(LOG,
531  (errmsg("starting point-in-time recovery to XID %u",
534  ereport(LOG,
535  (errmsg("starting point-in-time recovery to %s",
538  ereport(LOG,
539  (errmsg("starting point-in-time recovery to \"%s\"",
542  ereport(LOG,
543  (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
546  ereport(LOG,
547  (errmsg("starting point-in-time recovery to earliest consistent point")));
548  else
549  ereport(LOG,
550  (errmsg("starting archive recovery")));
551  }
552 
553  /*
554  * Take ownership of the wakeup latch if we're going to sleep during
555  * recovery.
556  */
559 
560  private = palloc0(sizeof(XLogPageReadPrivate));
561  xlogreader =
563  XL_ROUTINE(.page_read = &XLogPageRead,
564  .segment_open = NULL,
565  .segment_close = wal_segment_close),
566  private);
567  if (!xlogreader)
568  ereport(ERROR,
569  (errcode(ERRCODE_OUT_OF_MEMORY),
570  errmsg("out of memory"),
571  errdetail("Failed while allocating a WAL reading processor.")));
573 
574  /*
575  * Set the WAL decode buffer size. This limits how far ahead we can read
576  * in the WAL.
577  */
579 
580  /* Create a WAL prefetcher. */
582 
583  /*
584  * Allocate two page buffers dedicated to WAL consistency checks. We do
585  * it this way, rather than just making static arrays, for two reasons:
586  * (1) no need to waste the storage in most instantiations of the backend;
587  * (2) a static char array isn't guaranteed to have any particular
588  * alignment, whereas palloc() will provide MAXALIGN'd storage.
589  */
590  replay_image_masked = (char *) palloc(BLCKSZ);
591  primary_image_masked = (char *) palloc(BLCKSZ);
592 
594  &backupFromStandby))
595  {
596  List *tablespaces = NIL;
597 
598  /*
599  * Archive recovery was requested, and thanks to the backup label
600  * file, we know how far we need to replay to reach consistency. Enter
601  * archive recovery directly.
602  */
603  InArchiveRecovery = true;
605  StandbyMode = true;
606 
607  /*
608  * When a backup_label file is present, we want to roll forward from
609  * the checkpoint it identifies, rather than using pg_control.
610  */
612  CheckPointTLI);
613  if (record != NULL)
614  {
615  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
616  wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
617  ereport(DEBUG1,
618  (errmsg_internal("checkpoint record is at %X/%X",
620  InRecovery = true; /* force recovery even if SHUTDOWNED */
621 
622  /*
623  * Make sure that REDO location exists. This may not be the case
624  * if there was a crash during an online backup, which left a
625  * backup_label around that references a WAL segment that's
626  * already been archived.
627  */
628  if (checkPoint.redo < CheckPointLoc)
629  {
631  if (!ReadRecord(xlogprefetcher, LOG, false,
632  checkPoint.ThisTimeLineID))
633  ereport(FATAL,
634  (errmsg("could not find redo location referenced by checkpoint record"),
635  errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
636  "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
637  "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
638  DataDir, DataDir, DataDir)));
639  }
640  }
641  else
642  {
643  ereport(FATAL,
644  (errmsg("could not locate required checkpoint record"),
645  errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
646  "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
647  "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
648  DataDir, DataDir, DataDir)));
649  wasShutdown = false; /* keep compiler quiet */
650  }
651 
652  /* Read the tablespace_map file if present and create symlinks. */
653  if (read_tablespace_map(&tablespaces))
654  {
655  ListCell *lc;
656 
657  foreach(lc, tablespaces)
658  {
659  tablespaceinfo *ti = lfirst(lc);
660  char *linkloc;
661 
662  linkloc = psprintf("pg_tblspc/%s", ti->oid);
663 
664  /*
665  * Remove the existing symlink, if any, and create the symlink
666  * under PGDATA.
667  */
668  remove_tablespace_symlink(linkloc);
669 
670  if (symlink(ti->path, linkloc) < 0)
671  ereport(ERROR,
673  errmsg("could not create symbolic link \"%s\": %m",
674  linkloc)));
675 
676  pfree(ti->oid);
677  pfree(ti->path);
678  pfree(ti);
679  }
680 
681  /* tell the caller to delete it later */
682  haveTblspcMap = true;
683  }
684 
685  /* tell the caller to delete it later */
686  haveBackupLabel = true;
687  }
688  else
689  {
690  /*
691  * If tablespace_map file is present without backup_label file, there
692  * is no use of such file. There is no harm in retaining it, but it
693  * is better to get rid of the map file so that we don't have any
694  * redundant file in data directory and it will avoid any sort of
695  * confusion. It seems prudent though to just rename the file out of
696  * the way rather than delete it completely, also we ignore any error
697  * that occurs in rename operation as even if map file is present
698  * without backup_label file, it is harmless.
699  */
700  if (stat(TABLESPACE_MAP, &st) == 0)
701  {
702  unlink(TABLESPACE_MAP_OLD);
704  ereport(LOG,
705  (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
707  errdetail("File \"%s\" was renamed to \"%s\".",
709  else
710  ereport(LOG,
711  (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
713  errdetail("Could not rename file \"%s\" to \"%s\": %m.",
715  }
716 
717  /*
718  * It's possible that archive recovery was requested, but we don't
719  * know how far we need to replay the WAL before we reach consistency.
720  * This can happen for example if a base backup is taken from a
721  * running server using an atomic filesystem snapshot, without calling
722  * pg_backup_start/stop. Or if you just kill a running primary server
723  * and put it into archive recovery by creating a recovery signal
724  * file.
725  *
726  * Our strategy in that case is to perform crash recovery first,
727  * replaying all the WAL present in pg_wal, and only enter archive
728  * recovery after that.
729  *
730  * But usually we already know how far we need to replay the WAL (up
731  * to minRecoveryPoint, up to backupEndPoint, or until we see an
732  * end-of-backup record), and we can enter archive recovery directly.
733  */
739  {
740  InArchiveRecovery = true;
742  StandbyMode = true;
743  }
744 
745  /* Get the last valid checkpoint record. */
751  CheckPointTLI);
752  if (record != NULL)
753  {
754  ereport(DEBUG1,
755  (errmsg_internal("checkpoint record is at %X/%X",
757  }
758  else
759  {
760  /*
761  * We used to attempt to go back to a secondary checkpoint record
762  * here, but only when not in standby mode. We now just fail if we
763  * can't read the last checkpoint because this allows us to
764  * simplify processing around checkpoints.
765  */
766  ereport(PANIC,
767  (errmsg("could not locate a valid checkpoint record")));
768  }
769  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
770  wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
771  }
772 
773  /*
774  * If the location of the checkpoint record is not on the expected
775  * timeline in the history of the requested timeline, we cannot proceed:
776  * the backup is not part of the history of the requested timeline.
777  */
778  Assert(expectedTLEs); /* was initialized by reading checkpoint
779  * record */
782  {
783  XLogRecPtr switchpoint;
784 
785  /*
786  * tliSwitchPoint will throw an error if the checkpoint's timeline is
787  * not in expectedTLEs at all.
788  */
790  ereport(FATAL,
791  (errmsg("requested timeline %u is not a child of this server's history",
793  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
796  LSN_FORMAT_ARGS(switchpoint))));
797  }
798 
799  /*
800  * The min recovery point should be part of the requested timeline's
801  * history, too.
802  */
806  ereport(FATAL,
807  (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
811 
812  ereport(DEBUG1,
813  (errmsg_internal("redo record is at %X/%X; shutdown %s",
814  LSN_FORMAT_ARGS(checkPoint.redo),
815  wasShutdown ? "true" : "false")));
816  ereport(DEBUG1,
817  (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
818  U64FromFullTransactionId(checkPoint.nextXid),
819  checkPoint.nextOid)));
820  ereport(DEBUG1,
821  (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
822  checkPoint.nextMulti, checkPoint.nextMultiOffset)));
823  ereport(DEBUG1,
824  (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
825  checkPoint.oldestXid, checkPoint.oldestXidDB)));
826  ereport(DEBUG1,
827  (errmsg_internal("oldest MultiXactId: %u, in database %u",
828  checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
829  ereport(DEBUG1,
830  (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
831  checkPoint.oldestCommitTsXid,
832  checkPoint.newestCommitTsXid)));
834  ereport(PANIC,
835  (errmsg("invalid next transaction ID")));
836 
837  /* sanity check */
838  if (checkPoint.redo > CheckPointLoc)
839  ereport(PANIC,
840  (errmsg("invalid redo in checkpoint record")));
841 
842  /*
843  * Check whether we need to force recovery from WAL. If it appears to
844  * have been a clean shutdown and we did not have a recovery signal file,
845  * then assume no recovery needed.
846  */
847  if (checkPoint.redo < CheckPointLoc)
848  {
849  if (wasShutdown)
850  ereport(PANIC,
851  (errmsg("invalid redo record in shutdown checkpoint")));
852  InRecovery = true;
853  }
854  else if (ControlFile->state != DB_SHUTDOWNED)
855  InRecovery = true;
856  else if (ArchiveRecoveryRequested)
857  {
858  /* force recovery due to presence of recovery signal file */
859  InRecovery = true;
860  }
861 
862  /*
863  * If recovery is needed, update our in-memory copy of pg_control to show
864  * that we are recovering and to show the selected checkpoint as the place
865  * we are starting from. We also mark pg_control with any minimum recovery
866  * stop point obtained from a backup history file.
867  *
868  * We don't write the changes to disk yet, though. Only do that after
869  * initializing various subsystems.
870  */
871  if (InRecovery)
872  {
873  if (InArchiveRecovery)
874  {
876  }
877  else
878  {
879  ereport(LOG,
880  (errmsg("database system was not properly shut down; "
881  "automatic recovery in progress")));
883  ereport(LOG,
884  (errmsg("crash recovery starts in timeline %u "
885  "and has target timeline %u",
889  }
891  ControlFile->checkPointCopy = checkPoint;
892  if (InArchiveRecovery)
893  {
894  /* initialize minRecoveryPoint if not set yet */
895  if (ControlFile->minRecoveryPoint < checkPoint.redo)
896  {
897  ControlFile->minRecoveryPoint = checkPoint.redo;
899  }
900  }
901 
902  /*
903  * Set backupStartPoint if we're starting recovery from a base backup.
904  *
905  * Also set backupEndPoint and use minRecoveryPoint as the backup end
906  * location if we're starting recovery from a base backup which was
907  * taken from a standby. In this case, the database system status in
908  * pg_control must indicate that the database was already in recovery.
909  * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
910  * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
911  * before reaching this point; e.g. because restore_command or
912  * primary_conninfo were faulty.
913  *
914  * Any other state indicates that the backup somehow became corrupted
915  * and we can't sensibly continue with recovery.
916  */
917  if (haveBackupLabel)
918  {
919  ControlFile->backupStartPoint = checkPoint.redo;
921 
922  if (backupFromStandby)
923  {
924  if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
925  dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
926  ereport(FATAL,
927  (errmsg("backup_label contains data inconsistent with control file"),
928  errhint("This means that the backup is corrupted and you will "
929  "have to use another backup for recovery.")));
931  }
932  }
933  }
934 
935  /* remember these, so that we know when we have reached consistency */
939  if (InArchiveRecovery)
940  {
943  }
944  else
945  {
948  }
949 
950  /*
951  * Start recovery assuming that the final record isn't lost.
952  */
955 
956  *wasShutdown_ptr = wasShutdown;
957  *haveBackupLabel_ptr = haveBackupLabel;
958  *haveTblspcMap_ptr = haveTblspcMap;
959 }
960 
961 /*
962  * See if there are any recovery signal files and if so, set state for
963  * recovery.
964  *
965  * See if there is a recovery command file (recovery.conf), and if so
966  * throw a FATAL error, since as of PG12 we no longer recognize that.
967  */
968 static void
970 {
971  struct stat stat_buf;
972 
974  return;
975 
976  /*
977  * Check for old recovery API file: recovery.conf
978  */
979  if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
980  ereport(FATAL,
982  errmsg("using recovery command file \"%s\" is not supported",
984 
985  /*
986  * Remove unused .done file, if present. Ignore if absent.
987  */
988  unlink(RECOVERY_COMMAND_DONE);
989 
990  /*
991  * Check for recovery signal files and if found, fsync them since they
992  * represent server state information. We don't sweat too much about the
993  * possibility of fsync failure, however.
994  *
995  * If present, standby signal file takes precedence. If neither is present
996  * then we won't enter archive recovery.
997  */
998  if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
999  {
1000  int fd;
1001 
1003  S_IRUSR | S_IWUSR);
1004  if (fd >= 0)
1005  {
1006  (void) pg_fsync(fd);
1007  close(fd);
1008  }
1010  }
1011  else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1012  {
1013  int fd;
1014 
1016  S_IRUSR | S_IWUSR);
1017  if (fd >= 0)
1018  {
1019  (void) pg_fsync(fd);
1020  close(fd);
1021  }
1023  }
1024 
1025  StandbyModeRequested = false;
1026  ArchiveRecoveryRequested = false;
1028  {
1029  StandbyModeRequested = true;
1030  ArchiveRecoveryRequested = true;
1031  }
1032  else if (recovery_signal_file_found)
1033  {
1034  StandbyModeRequested = false;
1035  ArchiveRecoveryRequested = true;
1036  }
1037  else
1038  return;
1039 
1040  /*
1041  * We don't support standby mode in standalone backends; that requires
1042  * other processes such as the WAL receiver to be alive.
1043  */
1045  ereport(FATAL,
1046  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1047  errmsg("standby mode is not supported by single-user servers")));
1048 }
1049 
/*
 * Sanity-check the recovery-related settings.
 *
 * Visible checks:
 *  - first branch: WARNING if neither primary_conninfo nor restore_command
 *    is set (NULL and the empty string both count as "not set");
 *  - other branch: FATAL if restore_command is not set (the message says
 *    this is the "standby mode is not enabled" case);
 *  - FATAL if a requested target timeline other than 1 has no timeline
 *    history file; otherwise recoveryTargetTLI is set to that timeline.
 *
 * NOTE(review): the listing omits several source lines here (gaps in the
 * embedded numbering: 1051, 1053, 1059, 1081-1083, 1089, 1091-1093, ...),
 * including the function-name line and most branch conditions. The
 * comments describe only the visible code.
 */
1050 static void
1052 {
      /* NOTE(review): the condition guarding this early return (listing line 1053) is not visible. */
1054  return;
1055 
1056  /*
1057  * Check for compulsory parameters
1058  */
1060  {
1061  if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1062  (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1063  ereport(WARNING,
1064  (errmsg("specified neither primary_conninfo nor restore_command"),
1065  errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1066  }
1067  else
1068  {
      /* Here a missing restore_command is fatal rather than a warning. */
1069  if (recoveryRestoreCommand == NULL ||
1070  strcmp(recoveryRestoreCommand, "") == 0)
1071  ereport(FATAL,
1072  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1073  errmsg("must specify restore_command when standby mode is not enabled")));
1074  }
1075 
1076  /*
1077  * Override any inconsistent requests. Note that this is a change of
1078  * behaviour in 9.5; prior to this we simply ignored a request to pause if
1079  * hot_standby = off, which was surprising behaviour.
1080  */
      /* NOTE(review): the statements implementing this override (listing lines 1081-1083) are not visible. */
1084 
1085  /*
1086  * Final parsing of recovery_target_time string; see also
1087  * check_recovery_target_time().
1088  */
1090  {
1094  Int32GetDatum(-1)));
1095  }
1096 
1097  /*
1098  * If user specified recovery_target_timeline, validate it or compute the
1099  * "latest" value. We can't do this until after we've gotten the restore
1100  * command and set InArchiveRecovery, because we need to fetch timeline
1101  * history files from the archive.
1102  */
1104  {
1106 
1107  /* Timeline 1 does not have a history file, all else should */
1108  if (rtli != 1 && !existsTimeLineHistory(rtli))
1109  ereport(FATAL,
1110  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1111  errmsg("recovery target timeline %u does not exist",
1112  rtli)));
1113  recoveryTargetTLI = rtli;
1114  }
1116  {
1117  /* We start the "latest" search from pg_control's timeline */
1119  }
1120  else
1121  {
1122  /*
1123  * else we just use the recoveryTargetTLI as already read from
1124  * ControlFile
1125  */
1127  }
1128 }
1129 
1130 /*
1131  * read_backup_label: check to see if a backup_label file is present
1132  *
1133  * If we see a backup_label during recovery, we assume that we are recovering
1134  * from a backup dump file, and we therefore roll forward from the checkpoint
1135  * identified by the label file, NOT what pg_control says. This avoids the
1136  * problem that pg_control might have been archived one or more checkpoints
1137  * later than the start of the dump, and so if we rely on it as the start
1138  * point, we will fail to restore a consistent database state.
1139  *
1140  * Returns true if a backup_label was found (and fills the checkpoint
1141  * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1142  * returns false if not. If this backup_label came from a streamed backup,
1143  * *backupEndRequired is set to true. If this backup_label was created during
1144  * recovery, *backupFromStandby is set to true.
1145  *
1146  * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1147  * and TLI read from the backup file.
 *
 * Error behavior: a file that exists but cannot be opened or read, a
 * malformed START WAL LOCATION or CHECKPOINT LOCATION line, and a START
 * TIMELINE value that disagrees with the TLI taken from the WAL file name
 * are all reported with ereport(FATAL). The remaining lines (BACKUP METHOD,
 * BACKUP FROM, START TIME, LABEL, START TIMELINE) are optional: if one
 * does not match, parsing simply moves on.
 *
 * All four output parameters are reset before the file is opened, so they
 * hold defined values even when false is returned.
1148  */
1149 static bool
1150 read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1151  bool *backupEndRequired, bool *backupFromStandby)
1152 {
1153  char startxlogfilename[MAXFNAMELEN];
1154  TimeLineID tli_from_walseg,
1155  tli_from_file;
1156  FILE *lfp;
1157  char ch;
1158  char backuptype[20];
1159  char backupfrom[20];
1160  char backuplabel[MAXPGPATH];
1161  char backuptime[128];
1162  uint32 hi,
1163  lo;
1164 
1165  /* suppress possible uninitialized-variable warnings */
1166  *checkPointLoc = InvalidXLogRecPtr;
1167  *backupLabelTLI = 0;
1168  *backupEndRequired = false;
1169  *backupFromStandby = false;
1170 
1171  /*
1172  * See if label file is present
1173  */
1174  lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1175  if (!lfp)
1176  {
      /* Only "file does not exist" is benign; any other open failure is fatal. */
      /* NOTE(review): listing line 1179 is missing below, leaving the ereport parentheses unbalanced as shown. */
1177  if (errno != ENOENT)
1178  ereport(FATAL,
1180  errmsg("could not read file \"%s\": %m",
1181  BACKUP_LABEL_FILE)));
1182  return false; /* it's not there, all is fine */
1183  }
1184 
1185  /*
1186  * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1187  * is pretty crude, but we are not expecting any variability in the file
1188  * format).
 *
 * The trailing %c captures the character after the expected text so that
 * we can insist the line ends exactly there with a newline. The first
 * eight hex digits of the file name are read as the timeline ID and the
 * following (at most 16) characters go into startxlogfilename.
1189  */
1190  if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
1191  &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1192  ereport(FATAL,
1193  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1194  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
      /* The LSN is written as two 32-bit hex halves; reassemble the 64-bit value. */
1195  RedoStartLSN = ((uint64) hi) << 32 | lo;
1196  RedoStartTLI = tli_from_walseg;
1197  if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
1198  &hi, &lo, &ch) != 3 || ch != '\n')
1199  ereport(FATAL,
1200  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1201  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1202  *checkPointLoc = ((uint64) hi) << 32 | lo;
      /* The CHECKPOINT line carries no TLI, so report the one from the WAL file name. */
1203  *backupLabelTLI = tli_from_walseg;
1204 
1205  /*
1206  * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1207  * which could mean either pg_basebackup or the pg_backup_start/stop
1208  * method was used) or if this label came from somewhere else (the only
1209  * other option today being from pg_rewind). If this was a streamed
1210  * backup then we know that we need to play through until we get to the
1211  * end of the WAL which was generated during the backup (at which point we
1212  * will have reached consistency and backupEndRequired will be reset to be
1213  * false).
1214  */
      /* %19s leaves room for the terminating NUL in the 20-byte buffer. */
1215  if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1216  {
1217  if (strcmp(backuptype, "streamed") == 0)
1218  *backupEndRequired = true;
1219  }
1220 
1221  /*
1222  * BACKUP FROM lets us know if this was from a primary or a standby. If
1223  * it was from a standby, we'll double-check that the control file state
1224  * matches that of a standby.
1225  */
1226  if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1227  {
1228  if (strcmp(backupfrom, "standby") == 0)
1229  *backupFromStandby = true;
1230  }
1231 
1232  /*
1233  * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1234  * but checking for their presence is useful for debugging and the next
1235  * sanity checks. Cope also with the fact that the result buffers have a
1236  * pre-allocated size, hence if the backup_label file has been generated
1237  * with strings longer than the maximum assumed here an incorrect parsing
1238  * happens. That's fine as only minor consistency checks are done
1239  * afterwards.
1240  */
      /* %127[^\n] reads up to 127 non-newline characters into the 128-byte buffer. */
1241  if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1242  ereport(DEBUG1,
1243  (errmsg_internal("backup time %s in file \"%s\"",
1244  backuptime, BACKUP_LABEL_FILE)));
1245 
      /* NOTE(review): the 1023 limit assumes MAXPGPATH is at least 1024 - confirm. */
1246  if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1247  ereport(DEBUG1,
1248  (errmsg_internal("backup label %s in file \"%s\"",
1249  backuplabel, BACKUP_LABEL_FILE)));
1250 
1251  /*
1252  * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1253  * it as a sanity check if present.
1254  */
1255  if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1256  {
1257  if (tli_from_walseg != tli_from_file)
1258  ereport(FATAL,
1259  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1260  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1261  errdetail("Timeline ID parsed is %u, but expected %u.",
1262  tli_from_file, tli_from_walseg)));
1263 
1264  ereport(DEBUG1,
1265  (errmsg_internal("backup timeline %u in file \"%s\"",
1266  tli_from_file, BACKUP_LABEL_FILE)));
1267  }
1268 
      /* A stream error seen while parsing, or a failure to close the file, is fatal. */
      /* NOTE(review): listing line 1271 is missing below, as with line 1179 above. */
1269  if (ferror(lfp) || FreeFile(lfp))
1270  ereport(FATAL,
1272  errmsg("could not read file \"%s\": %m",
1273  BACKUP_LABEL_FILE)));
1274 
1275  return true;
1276 }
1277 
1278 /*
1279  * read_tablespace_map: check to see if a tablespace_map file is present
1280  *
1281  * If we see a tablespace_map file during recovery, we assume that we are
1282  * recovering from a backup dump file, and we therefore need to create symlinks
1283  * as per the information present in tablespace_map file.
1284  *
1285  * Returns true if a tablespace_map file was found (and fills *tablespaces
1286  * with a tablespaceinfo struct for each tablespace listed in the file);
1287  * returns false if not.
 *
 * Each entry is appended to *tablespaces with lappend(); its oid and path
 * fields are pstrdup'd copies of the two halves of the line.
 *
 * Error behavior: a file that exists but cannot be opened or read, a line
 * that does not have the form "<oid> <path>", and a final line that is
 * not newline-terminated (or ends in a dangling backslash) are all
 * reported with ereport(FATAL).
 *
 * NOTE(review): the function-name/parameter line (listing line 1290) and
 * listing lines 1308 and 1368 are missing from this listing.
1288  */
1289 static bool
1291 {
1292  tablespaceinfo *ti;
1293  FILE *lfp;
1294  char str[MAXPGPATH];
      /* ch is an int so that it can hold EOF as returned by fgetc(). */
1295  int ch,
1296  i,
1297  n;
1298  bool was_backslash;
1299 
1300  /*
1301  * See if tablespace_map file is present
1302  */
1303  lfp = AllocateFile(TABLESPACE_MAP, "r");
1304  if (!lfp)
1305  {
      /* Only "file does not exist" is benign; any other open failure is fatal. */
1306  if (errno != ENOENT)
1307  ereport(FATAL,
1309  errmsg("could not read file \"%s\": %m",
1310  TABLESPACE_MAP)));
1311  return false; /* it's not there, all is fine */
1312  }
1313 
1314  /*
1315  * Read and parse the link name and path lines from tablespace_map file
1316  * (this code is pretty crude, but we are not expecting any variability in
1317  * the file format). De-escape any backslashes that were inserted.
 *
 * The file is consumed one character at a time. str accumulates the
 * de-escaped text of the current line and i is its length so far. A
 * backslash makes the following character literal, so an escaped newline
 * or carriage return becomes part of the path instead of ending the line.
1318  */
1319  i = 0;
1320  was_backslash = false;
1321  while ((ch = fgetc(lfp)) != EOF)
1322  {
1323  if (!was_backslash && (ch == '\n' || ch == '\r'))
1324  {
1325  if (i == 0)
1326  continue; /* \r immediately followed by \n */
1327 
1328  /*
1329  * The de-escaped line should contain an OID followed by exactly
1330  * one space followed by a path. The path might start with
1331  * spaces, so don't be too liberal about parsing.
1332  */
1333  str[i] = '\0';
1334  n = 0;
1335  while (str[n] && str[n] != ' ')
1336  n++;
       /*
        * n is now the index of the first space, or i if there is none.
        * Reject an empty OID (n < 1), and a missing space or an empty
        * path (n >= i - 1).
        */
1337  if (n < 1 || n >= i - 1)
1338  ereport(FATAL,
1339  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1340  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
       /* Split in place: terminate the OID and step n past the separator. */
1341  str[n++] = '\0';
1342 
1343  ti = palloc0(sizeof(tablespaceinfo));
1344  ti->oid = pstrdup(str);
1345  ti->path = pstrdup(str + n);
1346  *tablespaces = lappend(*tablespaces, ti);
1347 
       /* Start accumulating the next line. */
1348  i = 0;
1349  continue;
1350  }
1351  else if (!was_backslash && ch == '\\')
1352  was_backslash = true;
1353  else
1354  {
       /* Characters that do not fit in str are silently dropped. */
1355  if (i < sizeof(str) - 1)
1356  str[i++] = ch;
1357  was_backslash = false;
1358  }
1359  }
1360 
1361  if (i != 0 || was_backslash) /* last line not terminated? */
1362  ereport(FATAL,
1363  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1364  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1365 
      /* A stream error seen while reading, or a failure to close the file, is fatal. */
1366  if (ferror(lfp) || FreeFile(lfp))
1367  ereport(FATAL,
1369  errmsg("could not read file \"%s\": %m",
1370  TABLESPACE_MAP)));
1371 
1372  return true;
1373 }
1374 
1375 /*
1376  * Finish WAL recovery.
1377  *
1378  * This does not close the 'xlogreader' yet, because in some cases the caller
1379  * still wants to re-read the last checkpoint record by calling
1380  * ReadCheckPointRecord().
1381  *
1382  * Returns the position of the last valid or applied record, after which new
1383  * WAL should be appended, information about why recovery was ended, and some
1384  * other things. See the WalRecoveryResult struct for details.
 *
 * Fields of the result that are visibly filled in here: endOfLogTLI,
 * lastPageBeginPtr, lastPage, lastRec, lastRecTLI, endOfLog and
 * abortedRecPtr. lastPage is either NULL or a palloc'd copy of the used
 * part of the last WAL page.
 *
 * NOTE(review): this listing omits several source lines (gaps in the
 * embedded numbering: 1386-1387, 1389, 1400, 1433, 1436, 1449, 1457, 1503,
 * 1510, 1512-1513), including the return type, function name and the
 * declaration of 'result'. The comments describe only the visible code.
1385  */
1388 {
1390  XLogRecPtr lastRec;
1391  TimeLineID lastRecTLI;
1392  XLogRecPtr endOfLog;
1393 
1394  /*
1395  * Kill WAL receiver, if it's still running, before we continue to write
1396  * the startup checkpoint and aborted-contrecord records. It will trump
1397  * over these records and subsequent ones if it's still alive when we
1398  * start writing WAL.
1399  */
1401 
1402  /*
1403  * We are now done reading the xlog from stream. Turn off streaming
1404  * recovery to force fetching the files (which would be required at end of
1405  * recovery, e.g., timeline history file) from archive or pg_wal.
1406  *
1407  * Note that standby mode must be turned off after killing WAL receiver,
1408  * i.e., calling XLogShutdownWalRcv().
1409  */
1410  Assert(!WalRcvStreaming());
1411  StandbyMode = false;
1412 
1413  /*
1414  * Determine where to start writing WAL next.
1415  *
1416  * Re-fetch the last valid or last applied record, so we can identify the
1417  * exact endpoint of what we consider the valid portion of WAL. There may
1418  * be an incomplete continuation record after that, in which case
1419  * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1420  * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1421  * it is intentionally missing. See CreateOverwriteContrecordRecord().
1422  *
1423  * An important side-effect of this is to load the last page into
1424  * xlogreader. The caller uses it to initialize the WAL for writing.
1425  */
1426  if (!InRecovery)
1427  {
      /* No recovery was performed: the last record is the checkpoint itself. */
1428  lastRec = CheckPointLoc;
1429  lastRecTLI = CheckPointTLI;
1430  }
1431  else
1432  {
      /* NOTE(review): the assignment of lastRec in this branch (listing line 1433) is not visible. */
1434  lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1435  }
      /* The return value is discarded; the read is done for its effect on xlogreader. */
1437  (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1438  endOfLog = xlogreader->EndRecPtr;
1439 
1440  /*
1441  * Remember the TLI in the filename of the XLOG segment containing the
1442  * end-of-log. It could be different from the timeline that endOfLog
1443  * nominally belongs to, if there was a timeline switch in that segment,
1444  * and we were reading the old WAL from a segment belonging to a higher
1445  * timeline.
1446  */
1447  result->endOfLogTLI = xlogreader->seg.ws_tli;
1448 
1450  {
1451  /*
1452  * We are no longer in archive recovery state.
1453  *
1454  * We are now done reading the old WAL. Turn off archive fetching if
1455  * it was active.
1456  */
1458  InArchiveRecovery = false;
1459 
1460  /*
1461  * If the ending log segment is still open, close it (to avoid
1462  * problems on Windows with trying to rename or delete an open file).
1463  */
1464  if (readFile >= 0)
1465  {
1466  close(readFile);
1467  readFile = -1;
1468  }
1469  }
1470 
1471  /*
1472  * Copy the last partial block to the caller, for initializing the WAL
1473  * buffer for appending new WAL.
1474  */
1475  if (endOfLog % XLOG_BLCKSZ != 0)
1476  {
1477  char *page;
1478  int len;
1479  XLogRecPtr pageBeginPtr;
1480 
      /* Round endOfLog down to the start of the page that contains it. */
1481  pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1482  Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
1483 
1484  /* Copy the valid part of the last block */
      /* Only the bytes before endOfLog are copied, not the whole page. */
1485  len = endOfLog % XLOG_BLCKSZ;
1486  page = palloc(len);
1487  memcpy(page, xlogreader->readBuf, len);
1488 
1489  result->lastPageBeginPtr = pageBeginPtr;
1490  result->lastPage = page;
1491  }
1492  else
1493  {
1494  /* There is no partial block to copy. */
1495  result->lastPageBeginPtr = endOfLog;
1496  result->lastPage = NULL;
1497  }
1498 
1499  /*
1500  * Create a comment for the history file to explain why and where timeline
1501  * changed.
1502  */
1504 
1505  result->lastRec = lastRec;
1506  result->lastRecTLI = lastRecTLI;
1507  result->endOfLog = endOfLog;
1508 
1509  result->abortedRecPtr = abortedRecPtr;
1511 
1514 
1515  return result;
1516 }
1517 
1518 /*
1519  * Clean up the WAL reader and leftovers from restoring WAL from archive
 *
 * Visible actions: close the WAL segment file that is still open for
 * reading (if any) and, in a branch whose condition is not visible here,
 * remove the temporary files RECOVERYXLOG and RECOVERYHISTORY from the WAL
 * directory, ignoring unlink() failures.
 *
 * NOTE(review): this listing omits several source lines (gaps in the
 * embedded numbering: 1522, 1527, 1535-1536, 1538, 1556-1557), including
 * the function-name line.
1520  */
1521 void
1523 {
1524  char recoveryPath[MAXPGPATH];
1525 
1526  /* Final update of pg_stat_recovery_prefetch. */
1528 
1529  /* Shut down xlogreader */
1530  if (readFile >= 0)
1531  {
1532  close(readFile);
      /* Mark the descriptor as closed so that it is not closed twice. */
1533  readFile = -1;
1534  }
1537 
1539  {
1540  /*
1541  * Since there might be a partial WAL segment named RECOVERYXLOG, get
1542  * rid of it.
1543  */
1544  snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1545  unlink(recoveryPath); /* ignore any error */
1546 
1547  /* Get rid of any remaining recovered timeline-history file, too */
1548  snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1549  unlink(recoveryPath); /* ignore any error */
1550  }
1551 
1552  /*
1553  * We don't need the latch anymore. It's not strictly necessary to disown
1554  * it, but let's do it for the sake of tidiness.
1555  */
1558 }
1559 
1560 /*
1561  * Perform WAL recovery.
1562  *
1563  * If the system was shut down cleanly, this is never called.
 *
 * Outline of the visible code:
 *  1. Read the first record to replay. If there is none, log "redo is not
 *     required".
 *  2. Otherwise run the main redo loop: for each record, honor a pending
 *     pause request, stop if the recovery target has been reached, apply
 *     the record with ApplyWalRecord(), then read the next record. The
 *     loop ends when ReadRecord() returns NULL or a target is reached.
 *  3. If a target was reached before consistency, fail with FATAL;
 *     otherwise act on recoveryTargetAction.
 *  4. Log where redo finished and, if known, the time of the last
 *     completed transaction.
 *  5. Fail with FATAL if recovery ended without reaching the target (the
 *     first part of that condition is not visible here).
 *
 * NOTE(review): this listing omits many source lines (gaps in the embedded
 * numbering, e.g. 1566, 1577-1578, 1616, 1695, 1701, 1711, 1730, 1758,
 * 1766, 1772, 1802-1803), including the function-name line, several branch
 * conditions and the case labels of the switch. The comments describe only
 * the visible code.
1564  */
1565 void
1567 {
1568  XLogRecord *record;
1569  bool reachedRecoveryTarget = false;
1570  TimeLineID replayTLI;
1571 
1572  /*
1573  * Initialize shared variables for tracking progress of WAL replay, as if
1574  * we had just replayed the record before the REDO location (or the
1575  * checkpoint record itself, if it's a shutdown checkpoint).
1576  */
1579  {
1583  }
1584  else
1585  {
1589  }
1596 
1597  /* Also ensure XLogReceiptTime has a sane value */
1599 
1600  /*
1601  * Let postmaster know we've started redo now, so that it can launch the
1602  * archiver if necessary.
1603  */
1604  if (IsUnderPostmaster)
1606 
1607  /*
1608  * Allow read-only connections immediately if we're consistent already.
1609  */
1611 
1612  /*
1613  * Find the first record that logically follows the checkpoint --- it
1614  * might physically precede it, though.
1615  */
1617  {
1618  /* back up to find the record */
1619  replayTLI = RedoStartTLI;
      /* NOTE(review): PANIC is passed as the error level here, LOG in the other branch; presumably a read failure is only tolerated there - confirm against ReadRecord(). */
1621  record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1622  }
1623  else
1624  {
1625  /* just have to read next record after CheckPoint */
1627  replayTLI = CheckPointTLI;
1628  record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1629  }
1630 
1631  if (record != NULL)
1632  {
1633  TimestampTz xtime;
1634  PGRUsage ru0;
1635 
      /* ru0 is passed to pg_rusage_show() for the "redo done" message below. */
1636  pg_rusage_init(&ru0);
1637 
1638  InRedo = true;
1639 
1640  RmgrStartup();
1641 
1642  ereport(LOG,
1643  (errmsg("redo starts at %X/%X",
1645 
1646  /* Prepare to report progress of the redo phase. */
1647  if (!StandbyMode)
1649 
1650  /*
1651  * main redo apply loop
1652  */
1653  do
1654  {
1655  if (!StandbyMode)
1656  ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
1658 
1659 #ifdef WAL_DEBUG
1660  if (XLOG_DEBUG ||
1661  (record->xl_rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
1662  (record->xl_rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
1663  {
1665 
1666  initStringInfo(&buf);
1667  appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
1670  xlog_outrec(&buf, xlogreader);
1671  appendStringInfoString(&buf, " - ");
1673  elog(LOG, "%s", buf.data);
1674  pfree(buf.data);
1675  }
1676 #endif
1677 
1678  /* Handle interrupt signals of startup process */
1680 
1681  /*
1682  * Pause WAL replay, if requested by a hot-standby session via
1683  * SetRecoveryPause().
1684  *
1685  * Note that we intentionally don't take the info_lck spinlock
1686  * here. We might therefore read a slightly stale value of the
1687  * recoveryPause flag, but it can't be very stale (no worse than
1688  * the last spinlock we did acquire). Since a pause request is a
1689  * pretty asynchronous thing anyway, possibly responding to it one
1690  * WAL record later than we otherwise would is a minor issue, so
1691  * it doesn't seem worth adding another spinlock cycle to prevent
1692  * that.
1693  */
1694  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1696  recoveryPausesHere(false);
1697 
1698  /*
1699  * Have we reached our recovery target?
1700  */
       /* This check runs before the record is applied, so the record is not replayed when it triggers. */
1702  {
1703  reachedRecoveryTarget = true;
1704  break;
1705  }
1706 
1707  /*
1708  * If we've been asked to lag the primary, wait on latch until
1709  * enough time has passed.
1710  */
1712  {
1713  /*
1714  * We test for paused recovery again here. If user sets
1715  * delayed apply, it may be because they expect to pause
1716  * recovery in case of problems, so we must test again here
1717  * otherwise pausing during the delay-wait wouldn't work.
1718  */
1719  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1721  recoveryPausesHere(false);
1722  }
1723 
1724  /*
1725  * Apply the record
1726  */
       /* replayTLI is passed by address because ApplyWalRecord() may switch it to a new timeline. */
1727  ApplyWalRecord(xlogreader, record, &replayTLI);
1728 
1729  /* Exit loop if we reached inclusive recovery target */
1731  {
1732  reachedRecoveryTarget = true;
1733  break;
1734  }
1735 
1736  /* Else, try to fetch the next WAL record */
1737  record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1738  } while (record != NULL);
1739 
1740  /*
1741  * end of main redo apply loop
1742  */
1743 
1744  if (reachedRecoveryTarget)
1745  {
1746  if (!reachedConsistency)
1747  ereport(FATAL,
1748  (errmsg("requested recovery stop point is before consistent recovery point")));
1749 
1750  /*
1751  * This is the last point where we can restart recovery with a new
1752  * recovery target, if we shutdown and begin again. After this,
1753  * Resource Managers may choose to do permanent corrective actions
1754  * at end of recovery.
1755  */
       /* NOTE(review): the case labels of this switch (listing lines 1758, 1766, 1772) are not visible. */
1756  switch (recoveryTargetAction)
1757  {
1759 
1760  /*
1761  * exit with special return code to request shutdown of
1762  * postmaster. Log messages issued from postmaster.
1763  */
1764  proc_exit(3);
1765 
1767  SetRecoveryPause(true);
1768  recoveryPausesHere(true);
1769 
1770  /* drop into promote */
1771 
1773  break;
1774  }
1775  }
1776 
1777  RmgrCleanup();
1778 
1779  ereport(LOG,
1780  (errmsg("redo done at %X/%X system usage: %s",
1782  pg_rusage_show(&ru0))));
1783  xtime = GetLatestXTime();
       /* A zero xtime is treated as "no value" and nothing is logged. */
1784  if (xtime)
1785  ereport(LOG,
1786  (errmsg("last completed transaction was at log time %s",
1787  timestamptz_to_str(xtime))));
1788 
1789  InRedo = false;
1790  }
1791  else
1792  {
1793  /* there are no WAL records following the checkpoint */
1794  ereport(LOG,
1795  (errmsg("redo is not required")));
1796  }
1797 
1798  /*
1799  * This check is intentionally after the above log messages that indicate
1800  * how far recovery went.
1801  */
1804  !reachedRecoveryTarget)
1805  ereport(FATAL,
1806  (errmsg("recovery ended before configured recovery target was reached")));
1807 }
1808 
1809 /*
1810  * Subroutine of PerformWalRecovery, to apply one WAL record.
 *
 * Visible steps, in order:
 *  - install rm_redo_error_callback as error context for the duration of
 *    the redo call;
 *  - for XLOG records of type XLOG_CHECKPOINT_SHUTDOWN or
 *    XLOG_END_OF_RECOVERY, read the timeline IDs stored in the record and,
 *    if the timeline changes, update *replayTLI before replaying;
 *  - publish replayEndTLI in XLogRecoveryCtl;
 *  - call xlogrecovery_redo() for XLOG records, then the resource
 *    manager's rm_redo routine for every record;
 *  - pop the error context and publish lastReplayedTLI;
 *  - force a WAL receiver reply if one was requested during redo;
 *  - after a timeline switch, wake up walsenders.
 *
 * The caller invokes this as ApplyWalRecord(xlogreader, record,
 * &replayTLI); *replayTLI is both read and possibly updated.
 *
 * NOTE(review): this listing omits several source lines (gaps in the
 * embedded numbering: 1813, 1827, 1863, 1876-1877, 1879, 1884, 1886, 1904,
 * 1913-1915, 1917, 1924, 1931, 1940, 1945, 1949), including the
 * function-name/parameter line. The comments describe only the visible
 * code.
1811  */
1812 static void
1814 {
1815  ErrorContextCallback errcallback;
1816  bool switchedTLI = false;
1817 
1818  /* Setup error traceback support for ereport() */
1819  errcallback.callback = rm_redo_error_callback;
1820  errcallback.arg = (void *) xlogreader;
1821  errcallback.previous = error_context_stack;
1822  error_context_stack = &errcallback;
1823 
1824  /*
1825  * ShmemVariableCache->nextXid must be beyond record's xid.
1826  */
1828 
1829  /*
1830  * Before replaying this record, check if this record causes the current
1831  * timeline to change. The record is already considered to be part of the
1832  * new timeline, so we update replayTLI before replaying it. That's
1833  * important so that replayEndTLI, which is recorded as the minimum
1834  * recovery point's TLI if recovery stops after this record, is set
1835  * correctly.
1836  */
1837  if (record->xl_rmid == RM_XLOG_ID)
1838  {
      /* Default to "no change"; only the two record types below can override this. */
1839  TimeLineID newReplayTLI = *replayTLI;
1840  TimeLineID prevReplayTLI = *replayTLI;
1841  uint8 info = record->xl_info & ~XLR_INFO_MASK;
1842 
1843  if (info == XLOG_CHECKPOINT_SHUTDOWN)
1844  {
1845  CheckPoint checkPoint;
1846 
       /* The record payload is copied into a local struct before its fields are read. */
1847  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1848  newReplayTLI = checkPoint.ThisTimeLineID;
1849  prevReplayTLI = checkPoint.PrevTimeLineID;
1850  }
1851  else if (info == XLOG_END_OF_RECOVERY)
1852  {
1853  xl_end_of_recovery xlrec;
1854 
1855  memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1856  newReplayTLI = xlrec.ThisTimeLineID;
1857  prevReplayTLI = xlrec.PrevTimeLineID;
1858  }
1859 
1860  if (newReplayTLI != *replayTLI)
1861  {
1862  /* Check that it's OK to switch to this TLI */
1864  newReplayTLI, prevReplayTLI, *replayTLI);
1865 
1866  /* Following WAL records should be run with new TLI */
1867  *replayTLI = newReplayTLI;
       /* Remember the switch; the follow-up work is done after the record has been replayed. */
1868  switchedTLI = true;
1869  }
1870  }
1871 
1872  /*
1873  * Update shared replayEndRecPtr before replaying this record, so that
1874  * XLogFlush will update minRecoveryPoint correctly.
1875  */
1878  XLogRecoveryCtl->replayEndTLI = *replayTLI;
1880 
1881  /*
1882  * If we are attempting to enter Hot Standby mode, process XIDs we see
1883  */
1885  TransactionIdIsValid(record->xl_xid))
1887 
1888  /*
1889  * Some XLOG record types that are related to recovery are processed
1890  * directly here, rather than in xlog_redo()
1891  */
1892  if (record->xl_rmid == RM_XLOG_ID)
1893  xlogrecovery_redo(xlogreader, *replayTLI);
1894 
1895  /* Now apply the WAL record itself */
1896  GetRmgr(record->xl_rmid).rm_redo(xlogreader);
1897 
1898  /*
1899  * After redo, check whether the backup pages associated with the WAL
1900  * record are consistent with the existing pages. This check is done only
1901  * if consistency check is enabled for this record.
1902  */
1903  if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
1905 
1906  /* Pop the error context stack */
1907  error_context_stack = errcallback.previous;
1908 
1909  /*
1910  * Update lastReplayedEndRecPtr after this record has been successfully
1911  * replayed.
1912  */
1916  XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
1918 
1919  /*
1920  * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
1921  * receiver so that it notices the updated lastReplayedEndRecPtr and sends
1922  * a reply to the primary.
1923  */
1925  {
      /* Clear the request before acting on it. */
1926  doRequestWalReceiverReply = false;
1927  WalRcvForceReply();
1928  }
1929 
1930  /* Allow read-only connections if we're consistent now */
1932 
1933  /* Is this a timeline switch? */
1934  if (switchedTLI)
1935  {
1936  /*
1937  * Before we continue on the new timeline, clean up any (possibly
1938  * bogus) future WAL segments on the old timeline.
1939  */
1941 
1942  /*
1943  * Wake up any walsenders to notice that we are on a new timeline.
1944  */
1946  WalSndWakeup();
1947 
1948  /* Reset the prefetcher. */
1950  }
1951 }
1952 
1953 /*
1954  * Some XLOG RM record types that are directly related to WAL recovery are
1955  * handled here rather than in the xlog_redo()
 *
 * Two record types are handled in the visible code; all others are
 * ignored:
 *  - XLOG_OVERWRITE_CONTRECORD: check that the LSN stored in the record
 *    matches record->overwrittenRecPtr and raise FATAL on a mismatch;
 *  - XLOG_BACKUP_END: if the backup start point stored in the record
 *    equals backupStartPoint, set backupEndPoint to the end LSN of this
 *    record; otherwise only emit a DEBUG1 message.
 *
 * The caller invokes this as xlogrecovery_redo(xlogreader, *replayTLI) and
 * only for records whose resource manager is RM_XLOG_ID (asserted below).
 *
 * NOTE(review): this listing omits several source lines (gaps in the
 * embedded numbering: 1958, 1968, 1973-1974, 1977-1978, 1982-1983, 1986,
 * 2010), including the function-name/parameter line and the declaration
 * of 'xlrec'.
1956  */
1957 static void
1959 {
1960  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
      /* lsn is the end position of the record being replayed. */
1961  XLogRecPtr lsn = record->EndRecPtr;
1962 
1963  Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
1964 
1965  if (info == XLOG_OVERWRITE_CONTRECORD)
1966  {
1967  /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
1969 
1970  memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
1971  if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
1972  elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
1975 
1976  /* We have safely skipped the aborted record */
1979 
1980  ereport(LOG,
1981  (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
1984 
1985  /* Verifying the record should only happen once */
1987  }
1988  else if (info == XLOG_BACKUP_END)
1989  {
1990  XLogRecPtr startpoint;
1991 
      /* The record payload is the start LSN of the backup that this record ends. */
1992  memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
1993 
1994  if (backupStartPoint == startpoint)
1995  {
1996  /*
1997  * We have reached the end of base backup, the point where
1998  * pg_backup_stop() was done. The data on disk is now consistent
1999  * (assuming we have also reached minRecoveryPoint). Set
2000  * backupEndPoint to the current LSN, so that the next call to
2001  * CheckRecoveryConsistency() will notice it and do the
2002  * end-of-backup processing.
2003  */
2004  elog(DEBUG1, "end of backup record reached");
2005 
2006  backupEndPoint = lsn;
2007  }
2008  else
       /* The record ends a different backup than the one being recovered from; keep going. */
2009  elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X",
2011  }
2012 }
2013 
2014 /*
2015  * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2016  * directories.
2017  *
2018  * Replay of database creation XLOG records for databases that were later
2019  * dropped can create fake directories in pg_tblspc. By the time consistency
2020  * is reached these directories should have been removed; here we verify
2021  * that this did indeed happen. This is to be called at the point where
2022  * consistent state is reached.
2023  *
2024  * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2025  * useful for testing purposes, and also allows for an escape hatch in case
2026  * things go south.
 *
 * Only entries whose names consist solely of decimal digits are examined;
 * each of them must be a symbolic link, otherwise it is reported.
 *
 * NOTE(review): this listing omits several source lines (gaps in the
 * embedded numbering: 2029, 2046-2047), including the function-name line
 * and the start of the ereport call that selects the error level, so the
 * PANIC/WARNING behavior described above cannot be checked against the
 * visible code.
2027  */
2028 static void
2030 {
2031  DIR *dir;
2032  struct dirent *de;
2033 
2034  dir = AllocateDir("pg_tblspc");
2035  while ((de = ReadDir(dir, "pg_tblspc")) != NULL)
2036  {
2037  char path[MAXPGPATH + 10];
2038 
2039  /* Skip entries of non-oid names */
      /* strspn() equals the full length only when every character is a decimal digit. */
2040  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2041  continue;
2042 
2043  snprintf(path, sizeof(path), "pg_tblspc/%s", de->d_name);
2044 
2045  if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2048  errmsg("unexpected directory entry \"%s\" found in %s",
2049  de->d_name, "pg_tblspc/"),
2050  errdetail("All directory entries in pg_tblspc/ should be symbolic links."),
2051  errhint("Remove those directories, or set allow_in_place_tablespaces to ON transiently to let recovery complete.")));
2052  }
2053 }
2054 
2055 /*
2056  * Checks if recovery has reached a consistent state. When consistency is
2057  * reached and we have a valid starting standby snapshot, tell postmaster
2058  * that it can start accepting read-only connections.
 *
 * Visible state changes: backupEndRequired is cleared once the end of the
 * base backup has been replayed, reachedConsistency is set once
 * minRecoveryPoint has been passed, and LocalHotStandbyActive is set in
 * the final block.
 *
 * NOTE(review): this listing omits several source lines (gaps in the
 * embedded numbering: 2061, 2070, 2073, 2085, 2095-2096, 2106, 2113, 2121,
 * 2134-2137, 2139-2141, 2145), including the function-name line and the
 * first part of every branch condition. The comments describe only the
 * visible code.
2059  */
2060 static void
2062 {
2063  XLogRecPtr lastReplayedEndRecPtr;
2064  TimeLineID lastReplayedTLI;
2065 
2066  /*
2067  * During crash recovery, we don't reach a consistent state until we've
2068  * replayed all the WAL.
2069  */
2071  return;
2072 
2074 
2075  /*
2076  * assume that we are called in the startup process, and hence don't need
2077  * a lock to read lastReplayedEndRecPtr
2078  */
2079  lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2080  lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2081 
2082  /*
2083  * Have we reached the point where our base backup was completed?
2084  */
2086  backupEndPoint <= lastReplayedEndRecPtr)
2087  {
2088  elog(DEBUG1, "end of backup reached");
2089 
2090  /*
2091  * We have reached the end of base backup, as indicated by pg_control.
2092  * Update the control file accordingly.
2093  */
2094  ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2097  backupEndRequired = false;
2098  }
2099 
2100  /*
2101  * Have we passed our safe starting point? Note that minRecoveryPoint is
2102  * known to be incorrectly set if recovering from a backup, until the
2103  * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2104  * All we know prior to that is that we're not consistent yet.
2105  */
2107  minRecoveryPoint <= lastReplayedEndRecPtr)
2108  {
2109  /*
2110  * Check to see if the XLOG sequence contained any unresolved
2111  * references to uninitialized pages.
2112  */
2114 
2115  /*
2116  * Check that pg_tblspc doesn't contain any real directories. Replay
2117  * of Database/CREATE_* records may have created fictitious tablespace
2118  * directories that should have been removed by the time consistency
2119  * was reached.
2120  */
2122 
2123  reachedConsistency = true;
2124  ereport(LOG,
2125  (errmsg("consistent recovery state reached at %X/%X",
2126  LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
2127  }
2128 
2129  /*
2130  * Have we got a valid starting snapshot that will allow queries to be
2131  * run? If so, we can tell postmaster that the database is consistent now,
2132  * enabling connections.
2133  */
2138  {
2142 
2143  LocalHotStandbyActive = true;
2144 
2146  }
2147 }
2148 
2149 /*
2150  * Error context callback for errors occurring during rm_redo().
 *
 * 'arg' is the XLogReaderState of the record being replayed
 * (ApplyWalRecord() installs this callback with its xlogreader as the
 * argument). The callback adds one context line with the record's start
 * LSN, its description from xlog_outdesc() and its block references from
 * xlog_block_info(). The temporary string buffer is freed before
 * returning.
 *
 * NOTE(review): the function-name line (listing line 2153) and the
 * declaration of 'buf' (listing line 2156) are missing from this listing.
2151  */
2152 static void
2154 {
2155  XLogReaderState *record = (XLogReaderState *) arg;
2157 
2158  initStringInfo(&buf);
2159  xlog_outdesc(&buf, record);
2160  xlog_block_info(&buf, record);
2161 
2162  /* translator: %s is a WAL record description */
2163  errcontext("WAL redo at %X/%X for %s",
2164  LSN_FORMAT_ARGS(record->ReadRecPtr),
2165  buf.data);
2166 
2167  pfree(buf.data);
2168 }
2169 
2170 /*
2171  * Appends to buf a string describing an XLogRecord, consisting of its identity
2172  * optionally followed by a colon, a space, and a further description.
      *
      * The identity comes from the resource manager's rm_identify callback; if
      * that returns NULL, "UNKNOWN (<info bits>)" is printed instead, using the
      * info bits outside XLR_INFO_MASK. The further description is whatever
      * the resource manager's rm_desc callback appends.
      *
      * NOTE(review): the signature line and the statement just before the '/'
      * separator are not present in this listing.
2173  */
2174 void
2176 {
2177  RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2178  uint8 info = XLogRecGetInfo(record);
2179  const char *id;
2180 
2182  appendStringInfoChar(buf, '/');
2183 
2184  id = rmgr.rm_identify(info);
2185  if (id == NULL)
2186  appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2187  else
2188  appendStringInfo(buf, "%s: ", id);
2189 
2190  rmgr.rm_desc(buf, record);
2191 }
2192 
2193 #ifdef WAL_DEBUG
2194 
     /*
      * Debug-only helper (compiled only when WAL_DEBUG is defined): appends the
      * record's previous-record pointer, xid and data length to buf, followed
      * by the block information produced by xlog_block_info().
      *
      * NOTE(review): the argument supplying the "prev %X/%X" value is not
      * present in this listing.
      */
2195 static void
2196 xlog_outrec(StringInfo buf, XLogReaderState *record)
2197 {
2198  appendStringInfo(buf, "prev %X/%X; xid %u",
2200  XLogRecGetXid(record));
2201 
2202  appendStringInfo(buf, "; len %u",
2203  XLogRecGetDataLen(record));
2204 
2205  xlog_block_info(buf, record);
2206 }
2207 #endif /* WAL_DEBUG */
2208 
2209 /*
2210  * Appends to buf a string giving information about all the blocks in an
2211  * XLogRecord.
      *
      * One "; blkref #N: ..." entry is written per block reference that the
      * record actually contains. The fork number is printed only when it is
      * not MAIN_FORKNUM, and " FPW" is appended when the block has an image.
      *
      * NOTE(review): the signature line is not present in this listing.
2212  */
2213 static void
2215 {
2216  int block_id;
2217 
2218  /* decode block references */
2219  for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2220  {
2221  RelFileLocator rlocator;
2222  ForkNumber forknum;
2223  BlockNumber blk;
2224 
      /* Skip block ids for which the record carries no block tag. */
2225  if (!XLogRecGetBlockTagExtended(record, block_id,
2226  &rlocator, &forknum, &blk, NULL))
2227  continue;
2228 
2229  if (forknum != MAIN_FORKNUM)
2230  appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2231  block_id,
2232  rlocator.spcOid, rlocator.dbOid,
2233  rlocator.relNumber,
2234  forknum,
2235  blk);
2236  else
2237  appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2238  block_id,
2239  rlocator.spcOid, rlocator.dbOid,
2240  rlocator.relNumber,
2241  blk);
2242  if (XLogRecHasBlockImage(record, block_id))
2243  appendStringInfoString(buf, " FPW");
2244  }
2245 }
2246 
2247 
2248 /*
2249  * Check that it's OK to switch to new timeline during recovery.
2250  *
2251  * 'lsn' is the address of the shutdown checkpoint record we're about to
2252  * replay. (Currently, timeline can only change at a shutdown checkpoint).
      *
      * Every failed check is reported with ereport(PANIC); the function simply
      * returns when all checks pass.
      *
      * NOTE(review): the first line of the parameter list, the first condition
      * of the final check and the trailing arguments of its message are not
      * present in this listing.
2253  */
2254 static void
2256  TimeLineID replayTLI)
2257 {
2258  /* Check that the record agrees on what the current (old) timeline is */
2259  if (prevTLI != replayTLI)
2260  ereport(PANIC,
2261  (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2262  prevTLI, replayTLI)));
2263 
2264  /*
2265  * The new timeline better be in the list of timelines we expect to see,
2266  * according to the timeline history. It should also not decrease.
2267  */
2268  if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2269  ereport(PANIC,
2270  (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2271  newTLI, replayTLI)));
2272 
2273  /*
2274  * If we have not yet reached min recovery point, and we're about to
2275  * switch to a timeline greater than the timeline of the min recovery
2276  * point: trouble. After switching to the new timeline, we could not
2277  * possibly visit the min recovery point on the correct timeline anymore.
2278  * This can happen if there is a newer timeline in the archive that
2279  * branched before the timeline the min recovery point is on, and you
2280  * attempt to do PITR to the new timeline.
2281  */
2283  lsn < minRecoveryPoint &&
2284  newTLI > minRecoveryPointTLI)
2285  ereport(PANIC,
2286  (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
2287  newTLI,
2290 
2291  /* Looks good */
2292 }
2293 
2294 
2295 /*
2296  * Extract timestamp from WAL record.
2297  *
2298  * If the record contains a timestamp, returns true, and saves the timestamp
2299  * in *recordXtime. If the record type has no timestamp, returns false.
2300  * Currently, only transaction commit/abort records and restore points contain
2301  * timestamps.
      *
      * *recordXtime is left untouched when false is returned.
      *
      * NOTE(review): the signature line is not present in this listing.
2302  */
2303 static bool
2305 {
      /* Record-type bits only: the XLR_INFO_MASK bits are masked off. */
2306  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2307  uint8 xact_info = info & XLOG_XACT_OPMASK;
2308  uint8 rmid = XLogRecGetRmid(record);
2309 
2310  if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2311  {
2312  *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2313  return true;
2314  }
2315  if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2316  xact_info == XLOG_XACT_COMMIT_PREPARED))
2317  {
2318  *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2319  return true;
2320  }
2321  if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2322  xact_info == XLOG_XACT_ABORT_PREPARED))
2323  {
2324  *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2325  return true;
2326  }
2327  return false;
2328 }
2329 
2330 /*
2331  * Checks whether the current buffer page and backup page stored in the
2332  * WAL record are consistent or not. Before comparing the two pages, a
2333  * masking can be applied to the pages to ignore certain areas like hint bits,
2334  * unused space between pd_lower and pd_upper among other things. This
2335  * function should be called once WAL replay has been completed for a
2336  * given record.
      *
      * A mismatch between the two (masked) images is reported with elog(FATAL).
      *
      * NOTE(review): the signature line, one argument of the buffer read call,
      * and the statements that bracket the page copy (between the
      * BufferIsValid() test and BufferGetPage(), and after the "No need for
      * this page anymore" comment) are not present in this listing.
2337  */
2338 static void
2340 {
2341  RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2342  RelFileLocator rlocator;
2343  ForkNumber forknum;
2344  BlockNumber blkno;
2345  int block_id;
2346 
2347  /* Records with no backup blocks have no need for consistency checks. */
2348  if (!XLogRecHasAnyBlockRefs(record))
2349  return;
2350 
2351  Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
2352 
2353  for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2354  {
2355  Buffer buf;
2356  Page page;
2357 
2358  if (!XLogRecGetBlockTagExtended(record, block_id,
2359  &rlocator, &forknum, &blkno, NULL))
2360  {
2361  /*
2362  * WAL record doesn't contain a block reference with the given id.
2363  * Do nothing.
2364  */
2365  continue;
2366  }
2367 
2368  Assert(XLogRecHasBlockImage(record, block_id));
2369 
2370  if (XLogRecBlockImageApply(record, block_id))
2371  {
2372  /*
2373  * WAL record has already applied the page, so bypass the
2374  * consistency check as that would result in comparing the full
2375  * page stored in the record with itself.
2376  */
2377  continue;
2378  }
2379 
2380  /*
2381  * Read the contents from the current buffer and store it in a
2382  * temporary page.
2383  */
2384  buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2386  InvalidBuffer);
      /* No valid buffer came back for this block: nothing to compare. */
2387  if (!BufferIsValid(buf))
2388  continue;
2389 
2391  page = BufferGetPage(buf);
2392 
2393  /*
2394  * Take a copy of the local page where WAL has been applied to have a
2395  * comparison base before masking it...
2396  */
2397  memcpy(replay_image_masked, page, BLCKSZ);
2398 
2399  /* No need for this page anymore now that a copy is in. */
2401 
2402  /*
2403  * If the block LSN is already ahead of this WAL record, we can't
2404  * expect contents to match. This can happen if recovery is
2405  * restarted.
2406  */
2407  if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
2408  continue;
2409 
2410  /*
2411  * Read the contents from the backup copy, stored in WAL record and
2412  * store it in a temporary page. There is no need to allocate a new
2413  * page here, a local buffer is fine to hold its contents and a mask
2414  * can be directly applied on it.
2415  */
2416  if (!RestoreBlockImage(record, block_id, primary_image_masked))
2417  ereport(ERROR,
2418  (errcode(ERRCODE_INTERNAL_ERROR),
2419  errmsg_internal("%s", record->errormsg_buf)));
2420 
2421  /*
2422  * If masking function is defined, mask both the primary and replay
2423  * images
2424  */
2425  if (rmgr.rm_mask != NULL)
2426  {
2427  rmgr.rm_mask(replay_image_masked, blkno);
2428  rmgr.rm_mask(primary_image_masked, blkno);
2429  }
2430 
2431  /* Time to compare the primary and replay images. */
2432  if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2433  {
2434  elog(FATAL,
2435  "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2436  rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2437  forknum, blkno);
2438  }
2439  }
2440 }
2441 
2442 /*
2443  * For point-in-time recovery, this function decides whether we want to
2444  * stop applying the XLOG before the current record.
2445  *
2446  * Returns true if we are stopping, false otherwise. If stopping, some
2447  * information is saved in recoveryStopXid et al for use in annotating the
2448  * new timeline's history file.
      *
      * NOTE(review): the signature line, the conditions guarding several of
      * the branches below, and the argument lines of several log messages are
      * not present in this listing; the comments describe the visible lines
      * only.
2449  */
2450 static bool
2452 {
2453  bool stopsHere = false;
2454  uint8 xact_info;
2455  bool isCommit;
2456  TimestampTz recordXtime = 0;
2457  TransactionId recordXid;
2458 
2459  /*
2460  * Ignore recovery target settings when not in archive recovery (meaning
2461  * we are in crash recovery).
2462  */
2464  return false;
2465 
2466  /* Check if we should stop as soon as reaching consistency */
2468  {
2469  ereport(LOG,
2470  (errmsg("recovery stopping after reaching consistency")));
2471 
2472  recoveryStopAfter = false;
2475  recoveryStopTime = 0;
2476  recoveryStopName[0] = '\0';
2477  return true;
2478  }
2479 
2480  /* Check if target LSN has been reached */
2483  record->ReadRecPtr >= recoveryTargetLSN)
2484  {
2485  recoveryStopAfter = false;
2487  recoveryStopLSN = record->ReadRecPtr;
2488  recoveryStopTime = 0;
2489  recoveryStopName[0] = '\0';
2490  ereport(LOG,
2491  (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
2493  return true;
2494  }
2495 
2496  /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2497  if (XLogRecGetRmid(record) != RM_XACT_ID)
2498  return false;
2499 
2500  xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2501 
      /*
       * Work out whether this is a commit or an abort, and which xid it ends.
       * For the *_PREPARED variants the xid is taken from the parsed record's
       * twophase_xid rather than from the record header.
       */
2502  if (xact_info == XLOG_XACT_COMMIT)
2503  {
2504  isCommit = true;
2505  recordXid = XLogRecGetXid(record);
2506  }
2507  else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2508  {
2509  xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2510  xl_xact_parsed_commit parsed;
2511 
2512  isCommit = true;
2514  xlrec,
2515  &parsed);
2516  recordXid = parsed.twophase_xid;
2517  }
2518  else if (xact_info == XLOG_XACT_ABORT)
2519  {
2520  isCommit = false;
2521  recordXid = XLogRecGetXid(record);
2522  }
2523  else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2524  {
2525  xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2526  xl_xact_parsed_abort parsed;
2527 
2528  isCommit = false;
2530  xlrec,
2531  &parsed);
2532  recordXid = parsed.twophase_xid;
2533  }
2534  else
2535  return false;
2536 
2538  {
2539  /*
2540  * There can be only one transaction end record with this exact
2541  * transactionid
2542  *
2543  * when testing for an xid, we MUST test for equality only, since
2544  * transactions are numbered in the order they start, not the order
2545  * they complete. A higher numbered xid will complete before the target
2546  * xid about 50% of the time...
2547  */
2548  stopsHere = (recordXid == recoveryTargetXid);
2549  }
2550 
2552  getRecordTimestamp(record, &recordXtime))
2553  {
2554  /*
2555  * There can be many transactions that share the same commit time, so
2556  * we stop after the last one, if we are inclusive, or stop at the
2557  * first one if we are exclusive
2558  */
2560  stopsHere = (recordXtime > recoveryTargetTime);
2561  else
2562  stopsHere = (recordXtime >= recoveryTargetTime);
2563  }
2564 
2565  if (stopsHere)
2566  {
2567  recoveryStopAfter = false;
2568  recoveryStopXid = recordXid;
2569  recoveryStopTime = recordXtime;
2571  recoveryStopName[0] = '\0';
2572 
2573  if (isCommit)
2574  {
2575  ereport(LOG,
2576  (errmsg("recovery stopping before commit of transaction %u, time %s",
2579  }
2580  else
2581  {
2582  ereport(LOG,
2583  (errmsg("recovery stopping before abort of transaction %u, time %s",
2586  }
2587  }
2588 
2589  return stopsHere;
2590 }
2591 
2592 /*
2593  * Same as recoveryStopsBefore, but called after applying the record.
2594  *
2595  * We also track the timestamp of the latest applied COMMIT/ABORT
2596  * record in XLogRecoveryCtl->recoveryLastXTime.
      *
      * Returns true if recovery should stop after this record, false
      * otherwise. Every stopping branch sets recoveryStopAfter to true.
      *
      * NOTE(review): the signature line, the conditions guarding several of
      * the branches below, and the argument lines of several log messages are
      * not present in this listing; the comments describe the visible lines
      * only.
2597  */
2598 static bool
2600 {
2601  uint8 info;
2602  uint8 xact_info;
2603  uint8 rmid;
2604  TimestampTz recordXtime;
2605 
2606  /*
2607  * Ignore recovery target settings when not in archive recovery (meaning
2608  * we are in crash recovery).
2609  */
2611  return false;
2612 
2613  info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2614  rmid = XLogRecGetRmid(record);
2615 
2616  /*
2617  * There can be many restore points that share the same name; we stop at
2618  * the first one.
2619  */
2621  rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2622  {
2623  xl_restore_point *recordRestorePointData;
2624 
2625  recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2626 
2627  if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2628  {
2629  recoveryStopAfter = true;
2632  (void) getRecordTimestamp(record, &recoveryStopTime);
2633  strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2634 
2635  ereport(LOG,
2636  (errmsg("recovery stopping at restore point \"%s\", time %s",
2639  return true;
2640  }
2641  }
2642 
2643  /* Check if the target LSN has been reached */
2646  record->ReadRecPtr >= recoveryTargetLSN)
2647  {
2648  recoveryStopAfter = true;
2650  recoveryStopLSN = record->ReadRecPtr;
2651  recoveryStopTime = 0;
2652  recoveryStopName[0] = '\0';
2653  ereport(LOG,
2654  (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
2656  return true;
2657  }
2658 
2659  if (rmid != RM_XACT_ID)
2660  return false;
2661 
2662  xact_info = info & XLOG_XACT_OPMASK;
2663 
2664  if (xact_info == XLOG_XACT_COMMIT ||
2665  xact_info == XLOG_XACT_COMMIT_PREPARED ||
2666  xact_info == XLOG_XACT_ABORT ||
2667  xact_info == XLOG_XACT_ABORT_PREPARED)
2668  {
2669  TransactionId recordXid;
2670 
2671  /* Update the last applied transaction timestamp */
2672  if (getRecordTimestamp(record, &recordXtime))
2673  SetLatestXTime(recordXtime);
2674 
2675  /* Extract the XID of the committed/aborted transaction */
2676  if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2677  {
2678  xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2679  xl_xact_parsed_commit parsed;
2680 
2682  xlrec,
2683  &parsed);
2684  recordXid = parsed.twophase_xid;
2685  }
2686  else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2687  {
2688  xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2689  xl_xact_parsed_abort parsed;
2690 
2692  xlrec,
2693  &parsed);
2694  recordXid = parsed.twophase_xid;
2695  }
2696  else
2697  recordXid = XLogRecGetXid(record);
2698 
2699  /*
2700  * There can be only one transaction end record with this exact
2701  * transactionid
2702  *
2703  * when testing for an xid, we MUST test for equality only, since
2704  * transactions are numbered in the order they start, not the order
2705  * they complete. A higher numbered xid will complete before the target
2706  * xid about 50% of the time...
2707  */
2709  recordXid == recoveryTargetXid)
2710  {
2711  recoveryStopAfter = true;
2712  recoveryStopXid = recordXid;
2713  recoveryStopTime = recordXtime;
2715  recoveryStopName[0] = '\0';
2716 
2717  if (xact_info == XLOG_XACT_COMMIT ||
2718  xact_info == XLOG_XACT_COMMIT_PREPARED)
2719  {
2720  ereport(LOG,
2721  (errmsg("recovery stopping after commit of transaction %u, time %s",
2724  }
2725  else if (xact_info == XLOG_XACT_ABORT ||
2726  xact_info == XLOG_XACT_ABORT_PREPARED)
2727  {
2728  ereport(LOG,
2729  (errmsg("recovery stopping after abort of transaction %u, time %s",
2732  }
2733  return true;
2734  }
2735  }
2736 
2737  /* Check if we should stop as soon as reaching consistency */
2739  {
2740  ereport(LOG,
2741  (errmsg("recovery stopping after reaching consistency")));
2742 
2743  recoveryStopAfter = true;
2745  recoveryStopTime = 0;
2747  recoveryStopName[0] = '\0';
2748  return true;
2749  }
2750 
2751  return false;
2752 }
2753 
2754 /*
2755  * Create a comment for the history file to explain why and where
2756  * the timeline changed.
      *
      * The text is formatted into a fixed-size local buffer and returned as a
      * pstrdup'd copy.
      *
      * NOTE(review): the signature line, most of the branch conditions and
      * some snprintf arguments are not present in this listing, so which
      * recovery target selects which message cannot be read off here except
      * for the RECOVERY_TARGET_LSN branch.
2757  */
2758 static char *
2760 {
2761  char reason[200];
2762 
2764  snprintf(reason, sizeof(reason),
2765  "%s transaction %u",
2766  recoveryStopAfter ? "after" : "before",
2767  recoveryStopXid);
2769  snprintf(reason, sizeof(reason),
2770  "%s %s\n",
2771  recoveryStopAfter ? "after" : "before",
2773  else if (recoveryTarget == RECOVERY_TARGET_LSN)
2774  snprintf(reason, sizeof(reason),
2775  "%s LSN %X/%X\n",
2776  recoveryStopAfter ? "after" : "before",
2779  snprintf(reason, sizeof(reason),
2780  "at restore point \"%s\"",
2783  snprintf(reason, sizeof(reason), "reached consistency");
2784  else
2785  snprintf(reason, sizeof(reason), "no recovery target specified");
2786 
2787  return pstrdup(reason);
2788 }
2789 
2790 /*
2791  * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2792  *
2793  * endOfRecovery is true if the recovery target is reached and
2794  * the paused state starts at the end of recovery because of
2795  * recovery_target_action=pause, and false otherwise.
      *
      * endOfRecovery only selects which LOG message and hint are emitted
      * before waiting.
      *
      * NOTE(review): the promotion-trigger test before the messages, the loop
      * header, and the statements inside and after the loop (other than the
      * CheckForStandbyTrigger() early return) are not present in this
      * listing.
2796  */
2797 static void
2798 recoveryPausesHere(bool endOfRecovery)
2799 {
2800  /* Don't pause unless users can connect! */
2801  if (!LocalHotStandbyActive)
2802  return;
2803 
2804  /* Don't pause after standby promotion has been triggered */
2806  return;
2807 
2808  if (endOfRecovery)
2809  ereport(LOG,
2810  (errmsg("pausing at the end of recovery"),
2811  errhint("Execute pg_wal_replay_resume() to promote.")));
2812  else
2813  ereport(LOG,
2814  (errmsg("recovery has paused"),
2815  errhint("Execute pg_wal_replay_resume() to continue.")));
2816 
2817  /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2819  {
2821  if (CheckForStandbyTrigger())
2822  return;
2823 
2824  /*
2825  * If recovery pause is requested then set it paused. While we are in
2826  * the loop, user might resume and pause again so set this every time.
2827  */
2829 
2830  /*
2831  * We wait on a condition variable that will wake us as soon as the
2832  * pause ends, but we use a timeout so we can check the above exit
2833  * condition periodically too.
2834  */
2837  }
2839 }
2840 
2841 /*
2842  * When recovery_min_apply_delay is set, we wait long enough to make sure
2843  * certain record types are applied at least that interval behind the primary.
2844  *
2845  * Returns true if we waited.
2846  *
2847  * Note that the delay is calculated between the WAL record log time and
2848  * the current time on standby. We would prefer to keep track of when this
2849  * standby received each WAL record, which would allow a more consistent
2850  * approach and one not affected by time synchronisation issues, but that
2851  * is significantly more effort and complexity for little actual gain in
2852  * usability.
      *
      * NOTE(review): the signature line, the crash-recovery test, the
      * computations of delayUntil and msecs, and the wait call inside the loop
      * are not present in this listing.
2853  */
2854 static bool
2856 {
2857  uint8 xact_info;
2858  TimestampTz xtime;
2859  TimestampTz delayUntil;
2860  long msecs;
2861 
2862  /* nothing to do if no delay configured */
2863  if (recovery_min_apply_delay <= 0)
2864  return false;
2865 
2866  /* no delay is applied on a database not yet consistent */
2867  if (!reachedConsistency)
2868  return false;
2869 
2870  /* nothing to do if crash recovery is requested */
2872  return false;
2873 
2874  /*
2875  * Is it a COMMIT record?
2876  *
2877  * We deliberately choose not to delay aborts since they have no effect on
2878  * MVCC. We already allow replay of records that don't have a timestamp,
2879  * so there is already opportunity for issues caused by early conflicts on
2880  * standbys.
2881  */
2882  if (XLogRecGetRmid(record) != RM_XACT_ID)
2883  return false;
2884 
2885  xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2886 
2887  if (xact_info != XLOG_XACT_COMMIT &&
2888  xact_info != XLOG_XACT_COMMIT_PREPARED)
2889  return false;
2890 
2891  if (!getRecordTimestamp(record, &xtime))
2892  return false;
2893 
2895 
2896  /*
2897  * Exit without arming the latch if it's already past time to apply this
2898  * record
2899  */
2901  if (msecs <= 0)
2902  return false;
2903 
      /*
       * Wait loop: left when promotion is triggered or when the remaining
       * delay has dropped to zero or below.
       */
2904  while (true)
2905  {
2907 
2908  /* This might change recovery_min_apply_delay. */
2910 
2911  if (CheckForStandbyTrigger())
2912  break;
2913 
2914  /*
2915  * Recalculate delayUntil as recovery_min_apply_delay could have
2916  * changed while waiting in this loop.
2917  */
2919 
2920  /*
2921  * Wait for difference between GetCurrentTimestamp() and delayUntil.
2922  */
2924  delayUntil);
2925 
2926  if (msecs <= 0)
2927  break;
2928 
2929  elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
2930 
2933  msecs,
2935  }
2936  return true;
2937 }
2938 
2939 /*
2940  * Get the current state of the recovery pause request.
      *
      * NOTE(review): the return type, the signature line and every statement
      * except the final return are not present in this listing, so how the
      * value of state is obtained cannot be seen here.
2941  */
2944 {
2946 
2950 
2951  return state;
2952 }
2953 
2954 /*
2955  * Set the recovery pause state.
2956  *
2957  * If recovery pause is requested then sets the recovery pause state to
2958  * 'pause requested' if it is not already 'paused'. Otherwise, sets it
2959  * to 'not paused' to resume the recovery. The recovery pause will be
2960  * confirmed by ConfirmRecoveryPaused().
      *
      * NOTE(review): the statements that update the state, including the
      * bodies of both "if (!recoveryPause)" tests, are not present in this
      * listing.
2961  */
2962 void
2963 SetRecoveryPause(bool recoveryPause)
2964 {
2966 
2967  if (!recoveryPause)
2971 
2973 
2974  if (!recoveryPause)
2976 }
2977 
2978 /*
2979  * Confirm the recovery pause by setting the recovery pause state to
2980  * RECOVERY_PAUSED.
      *
      * NOTE(review): the signature line and the statements that perform the
      * update are not present in this listing.
2981  */
2982 static void
2984 {
2985  /* If recovery pause is requested then set it paused */
2990 }
2991 
2992 
2993 /*
2994  * Attempt to read the next XLOG record.
2995  *
2996  * Before first call, the reader needs to be positioned to the first record
2997  * by calling XLogPrefetcherBeginRead().
2998  *
2999  * If no valid record is available, returns NULL, or fails if emode is PANIC.
3000  * (emode must be either PANIC or LOG). In standby mode, retries until a valid
3001  * record is available.
      *
      * NOTE(review): the first line of the signature, the declarations of
      * xlogreader and private, and several conditions and statements in the
      * loop body are not present in this listing; the comments describe the
      * visible lines only.
3002  */
3003 static XLogRecord *
3005  bool fetching_ckpt, TimeLineID replayTLI)
3006 {
3007  XLogRecord *record;
3010 
3011  /* Pass through parameters to XLogPageRead */
3012  private->fetching_ckpt = fetching_ckpt;
3013  private->emode = emode;
3014  private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
3015  private->replayTLI = replayTLI;
3016 
3017  /* This is the first attempt to read this page. */
3018  lastSourceFailed = false;
3019 
3020  for (;;)
3021  {
3022  char *errormsg;
3023 
3024  record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3025  if (record == NULL)
3026  {
3027  /*
3028  * When we find that WAL ends in an incomplete record, keep track
3029  * of that record. After recovery is done, we'll write a record to
3030  * indicate to downstream WAL readers that that portion is to be
3031  * ignored.
3032  *
3033  * However, when ArchiveRecoveryRequested = true, we're going to
3034  * switch to a new timeline at the end of recovery. We will only
3035  * copy WAL over to the new timeline up to the end of the last
3036  * complete record, so if we did this, we would later create an
3037  * overwrite contrecord in the wrong place, breaking everything.
3038  */
3039  if (!ArchiveRecoveryRequested &&
3041  {
3044  }
3045 
      /* Close the currently open WAL segment file, if any. */
3046  if (readFile >= 0)
3047  {
3048  close(readFile);
3049  readFile = -1;
3050  }
3051 
3052  /*
3053  * We only end up here without a message when XLogPageRead()
3054  * failed - in that case we already logged something. In
3055  * StandbyMode that only happens if we have been triggered, so we
3056  * shouldn't loop anymore in that case.
3057  */
3058  if (errormsg)
3060  (errmsg_internal("%s", errormsg) /* already translated */ ));
3061  }
3062 
3063  /*
3064  * Check page TLI is one of the expected values.
3065  */
3067  {
3068  char fname[MAXFNAMELEN];
3069  XLogSegNo segno;
3070  int32 offset;
3071 
3075  XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3078  (errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%X, offset %u",
3080  fname,
3082  offset)));
      /* Treat the record as unusable after the timeline mismatch. */
3083  record = NULL;
3084  }
3085 
3086  if (record)
3087  {
3088  /* Great, got a record */
3089  return record;
3090  }
3091  else
3092  {
3093  /* No valid record available from this source */
3094  lastSourceFailed = true;
3095 
3096  /*
3097  * If archive recovery was requested, but we were still doing
3098  * crash recovery, switch to archive recovery and retry using the
3099  * offline archive. We have now replayed all the valid WAL in
3100  * pg_wal, so we are presumably now consistent.
3101  *
3102  * We require that there's at least some valid WAL present in
3103  * pg_wal, however (!fetching_ckpt). We could recover using the
3104  * WAL from the archive, even if pg_wal is completely empty, but
3105  * we'd have no idea how far we'd have to replay to reach
3106  * consistency. So err on the safe side and give up.
3107  */
3109  !fetching_ckpt)
3110  {
3111  ereport(DEBUG1,
3112  (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3113  InArchiveRecovery = true;
3115  StandbyMode = true;
3116 
3119  minRecoveryPointTLI = replayTLI;
3120 
3122 
3123  /*
3124  * Before we retry, reset lastSourceFailed and currentSource
3125  * so that we will check the archive next.
3126  */
3127  lastSourceFailed = false;
3129 
3130  continue;
3131  }
3132 
3133  /* In standby mode, loop back to retry. Otherwise, give up. */
3135  continue;
3136  else
3137  return NULL;
3138  }
3139  }
3140 }
3141 
3142 /*
3143  * Read the XLOG page containing RecPtr into readBuf (if not read already).
3144  * Returns number of bytes read, if the page is read successfully, or
3145  * XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed, but
3146  * only if they have not been previously reported.
3147  *
3148  * While prefetching, xlogreader->nonblocking may be set. In that case,
3149  * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3150  *
3151  * This is responsible for restoring files from archive as needed, as well
3152  * as for waiting for the requested WAL record to arrive in standby mode.
3153  *
3154  * 'emode' specifies the log level used for reporting "file not found" or
3155  * "end of WAL" situations in archive recovery, or in standby mode when
3156  * promotion is triggered. If set to WARNING or below, XLogPageRead() returns
3157  * XLREAD_FAIL in those situations, on higher log levels the ereport() won't
3158  * return.
3159  *
3160  * In standby mode, if after a successful return of XLogPageRead() the
3161  * caller finds the record it's interested in to be broken, it should
3162  * ereport the error with the level determined by
3163  * emode_for_corrupt_record(), and then set lastSourceFailed
3164  * and call XLogPageRead() again with the same arguments. This lets
3165  * XLogPageRead() try fetching the record from another source, or to
3166  * sleep and retry.
      *
      * NOTE(review): the first line of the signature, the initializer of
      * private, and several conditions and statements in the body are not
      * present in this listing; the comments describe the visible lines only.
3167  */
3168 static int
3170  XLogRecPtr targetRecPtr, char *readBuf)
3171 {
3172  XLogPageReadPrivate *private =
3174  int emode = private->emode;
3175  uint32 targetPageOff;
3176  XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
3177  int r;
3178 
3179  XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3180  targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3181 
3182  /*
3183  * See if we need to switch to a new segment because the requested record
3184  * is not in the currently open one.
3185  */
3186  if (readFile >= 0 &&
3187  !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3188  {
3189  /*
3190  * Request a restartpoint if we've replayed too much xlog since the
3191  * last one.
3192  */
3194  {
3196  {
3197  (void) GetRedoRecPtr();
3200  }
3201  }
3202 
3203  close(readFile);
3204  readFile = -1;
3206  }
3207 
3208  XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3209 
3210 retry:
3211  /* See if we need to retrieve more data */
3212  if (readFile < 0 ||
3214  flushedUpto < targetPagePtr + reqLen))
3215  {
3216  if (readFile >= 0 &&
3219  flushedUpto < targetPagePtr + reqLen)
3220  return XLREAD_WOULDBLOCK;
3221 
3222  switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3223  private->randAccess,
3224  private->fetching_ckpt,
3225  targetRecPtr,
3226  private->replayTLI,
3229  {
3230  case XLREAD_WOULDBLOCK:
3231  return XLREAD_WOULDBLOCK;
3232  case XLREAD_FAIL:
      /* Close the segment file and reset the read state before failing. */
3233  if (readFile >= 0)
3234  close(readFile);
3235  readFile = -1;
3236  readLen = 0;
3238  return XLREAD_FAIL;
3239  case XLREAD_SUCCESS:
3240  break;
3241  }
3242  }
3243 
3244  /*
3245  * At this point, we have the right segment open and if we're streaming we
3246  * know the requested record is in it.
3247  */
3248  Assert(readFile != -1);
3249 
3250  /*
3251  * If the current segment is being streamed from the primary, calculate
3252  * how much of the current page we have received already. We know the
3253  * requested record has been received, but this is for the benefit of
3254  * future calls, to allow quick exit at the top of this function.
3255  */
3257  {
3258  if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3259  readLen = XLOG_BLCKSZ;
3260  else
3262  targetPageOff;
3263  }
3264  else
3265  readLen = XLOG_BLCKSZ;
3266 
3267  /* Read the requested page */
3268  readOff = targetPageOff;
3269 
3271  r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
3272  if (r != XLOG_BLCKSZ)
3273  {
3274  char fname[MAXFNAMELEN];
      /* Save errno now; it is restored just before the %m message below. */
3275  int save_errno = errno;
3276 
3279  if (r < 0)
3280  {
3281  errno = save_errno;
3282  ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3284  errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: %m",
3285  fname, LSN_FORMAT_ARGS(targetPagePtr),
3286  readOff)));
3287  }
3288  else
3289  ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3291  errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: read %d of %zu",
3292  fname, LSN_FORMAT_ARGS(targetPagePtr),
3293  readOff, r, (Size) XLOG_BLCKSZ)));
3294  goto next_record_is_invalid;
3295  }
3297 
3298  Assert(targetSegNo == readSegNo);
3299  Assert(targetPageOff == readOff);
3300  Assert(reqLen <= readLen);
3301 
3303 
3304  /*
3305  * Check the page header immediately, so that we can retry immediately if
3306  * it's not valid. This may seem unnecessary, because ReadPageInternal()
3307  * validates the page header anyway, and would propagate the failure up to
3308  * ReadRecord(), which would retry. However, there's a corner case with
3309  * continuation records, if a record is split across two pages such that
3310  * we would need to read the two pages from different sources. For
3311  * example, imagine a scenario where a streaming replica is started up,
3312  * and replay reaches a record that's split across two WAL segments. The
3313  * first page is only available locally, in pg_wal, because it's already
3314  * been recycled on the primary. The second page, however, is not present
3315  * in pg_wal, and we should stream it from the primary. There is a
3316  * recycled WAL segment present in pg_wal, with garbage contents, however.
3317  * We would read the first page from the local WAL segment, but when
3318  * reading the second page, we would read the bogus, recycled, WAL
3319  * segment. If we didn't catch that case here, we would never recover,
3320  * because ReadRecord() would retry reading the whole record from the
3321  * beginning.
3322  *
3323  * Of course, this only catches errors in the page header, which is what
3324  * happens in the case of a recycled WAL segment. Other kinds of errors or
3325  * corruption still have the same problem. But this at least fixes the
3326  * common case, which can happen as part of normal operation.
3327  *
3328  * Validating the page header is cheap enough that doing it twice
3329  * shouldn't be a big deal from a performance point of view.
3330  *
3331  * When not in standby mode, an invalid page header should cause recovery
3332  * to end, not retry reading the page, so we don't need to validate the
3333  * page header here for the retry. Instead, ReadPageInternal() is
3334  * responsible for the validation.
3335  */
3336  if (StandbyMode &&
3337  !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3338  {
3339  /*
3340  * Emit this error right now then retry this page immediately. Use
3341  * errmsg_internal() because the message was already translated.
3342  */
3343  if (xlogreader->errormsg_buf[0])
3346 
3347  /* reset any error XLogReaderValidatePageHeader() might have set */
3349  goto next_record_is_invalid;
3350  }
3351 
3352  return readLen;
3353 
3354 next_record_is_invalid:
3355 
3356  /*
3357  * If we're reading ahead, give up fast. Retries and error reporting will
3358  * be handled by a later read when recovery catches up to this point.
3359  */
3360  if (xlogreader->nonblocking)
3361  return XLREAD_WOULDBLOCK;
3362 
3363  lastSourceFailed = true;
3364 
3365  if (readFile >= 0)
3366  close(readFile);
3367  readFile = -1;
3368  readLen = 0;
3370 
3371  /* In standby-mode, keep trying */
3372  if (StandbyMode)
3373  goto retry;
3374  else
3375  return XLREAD_FAIL;
3376 }
3377 
3378 /*
3379  * Open the WAL segment containing WAL location 'RecPtr'.
3380  *
3381  * The segment can be fetched via restore_command, or via walreceiver having
3382  * streamed the record, or it can already be present in pg_wal. Checking
3383  * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3384  * too, in case someone copies a new segment directly to pg_wal. That is not
3385  * documented or recommended, though.
3386  *
3387  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3388  * prepare to read WAL starting from RedoStartLSN after this.
3389  *
3390  * 'RecPtr' might not point to the beginning of the record we're interested
3391  * in, it might also point to the page or segment header. In that case,
3392  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3393  * used to decide which timeline to stream the requested WAL from.
3394  *
 * If 'randAccess' is true, curFileTLI is reset to 0 before each attempt to
 * read the segment from the archive or pg_wal.
 *
 * 'replayTLI' is passed, together with 'replayLSN', to the timeline rescan.
 *
3395  * 'replayLSN' is the current replay LSN, so that if we scan for new
3396  * timelines, we can reject a switch to a timeline that branched off before
3397  * this point.
3398  *
3399  * If the record is not immediately available, the function returns
3400  * XLREAD_FAIL if we're not in standby mode. In standby mode, waits for it
3401  * to become available.
3402  *
3403  * When the requested record becomes available, the function opens the file
3404  * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3405  * of standby mode is triggered by the user, and there is no more WAL
3406  * available, returns XLREAD_FAIL.
3407  *
3408  * If nonblocking is true, then give up immediately if we can't satisfy the
3409  * request, returning XLREAD_WOULDBLOCK instead of waiting.
3410  */
3411 static XLogPageReadResult
3412 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
3413  bool fetching_ckpt, XLogRecPtr tliRecPtr,
3414  TimeLineID replayTLI, XLogRecPtr replayLSN,
3415  bool nonblocking)
3416 {
3417  static TimestampTz last_fail_time = 0;
3418  TimestampTz now;
3419  bool streaming_reply_sent = false;
3420 
3421  /*-------
3422  * Standby mode is implemented by a state machine:
3423  *
3424  * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3425  * pg_wal (XLOG_FROM_PG_WAL)
3426  * 2. Check for promotion trigger request
3427  * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3428  * 4. Rescan timelines
3429  * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3430  *
3431  * Failure to read from the current source advances the state machine to
3432  * the next state.
3433  *
3434  * 'currentSource' indicates the current state. There are no currentSource
3435  * values for "check trigger", "rescan timelines", and "sleep" states,
3436  * those actions are taken when reading from the previous source fails, as
3437  * part of advancing to the next state.
3438  *
3439  * If standby mode is turned off while reading WAL from stream, we move
3440  * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3441  * the files (which would be required at end of recovery, e.g., timeline
3442  * history file) from archive or pg_wal. We don't need to kill WAL receiver
3443  * here because it's already stopped when standby mode is turned off at
3444  * the end of recovery.
3445  *-------
3446  */
3447  if (!InArchiveRecovery)
3449  else if (currentSource == XLOG_FROM_ANY ||
3451  {
3452  lastSourceFailed = false;
3454  }
3455 
3456  for (;;)
3457  {
3458  XLogSource oldSource = currentSource;
3459  bool startWalReceiver = false;
3460 
3461  /*
3462  * First check if we failed to read from the current source, and
3463  * advance the state machine if so. The failure to read might've
3464  * happened outside this function, e.g. when a CRC check fails on a
3465  * record, or within this loop.
3466  */
3467  if (lastSourceFailed)
3468  {
3469  /*
3470  * Don't allow any retry loops to occur during nonblocking
3471  * readahead. Let the caller process everything that has been
3472  * decoded already first.
3473  */
3474  if (nonblocking)
3475  return XLREAD_WOULDBLOCK;
3476 
3477  switch (currentSource)
3478  {
3479  case XLOG_FROM_ARCHIVE:
3480  case XLOG_FROM_PG_WAL:
3481 
3482  /*
3483  * Check to see if promotion is requested. Note that we do
3484  * this only after failure, so when you promote, we still
3485  * finish replaying as much as we can from archive and
3486  * pg_wal before failover.
3487  */
3489  {
3491  return XLREAD_FAIL;
3492  }
3493 
3494  /*
3495  * Not in standby mode, and we've now tried the archive
3496  * and pg_wal.
3497  */
3498  if (!StandbyMode)
3499  return XLREAD_FAIL;
3500 
3501  /*
3502  * Move to XLOG_FROM_STREAM state, and set to start a
3503  * walreceiver if necessary.
3504  */
3506  startWalReceiver = true;
3507  break;
3508 
3509  case XLOG_FROM_STREAM:
3510 
3511  /*
3512  * Failure while streaming. Most likely, we got here
3513  * because streaming replication was terminated, or
3514  * promotion was triggered. But we also get here if we
3515  * find an invalid record in the WAL streamed from the
3516  * primary, in which case something is seriously wrong.
3517  * There's little chance that the problem will just go
3518  * away, but PANIC is not good for availability either,
3519  * especially in hot standby mode. So, we treat that the
3520  * same as disconnection, and retry from archive/pg_wal
3521  * again. The WAL in the archive should be identical to
3522  * what was streamed, so it's unlikely that it helps, but
3523  * one can hope...
3524  */
3525 
3526  /*
3527  * We should be able to move to XLOG_FROM_STREAM only in
3528  * standby mode.
3529  */
3531 
3532  /*
3533  * Before we leave XLOG_FROM_STREAM state, make sure that
3534  * walreceiver is not active, so that it won't overwrite
3535  * WAL that we restore from archive.
3536  */
3538 
3539  /*
3540  * Before we sleep, re-scan for possible new timelines if
3541  * we were requested to recover to the latest timeline.
3542  */
3544  {
3545  if (rescanLatestTimeLine(replayTLI, replayLSN))
3546  {
3548  break;
3549  }
3550  }
3551 
3552  /*
3553  * XLOG_FROM_STREAM is the last state in our state
3554  * machine, so we've exhausted all the options for
3555  * obtaining the requested WAL. We're going to loop back
3556  * and retry from the archive, but if it hasn't been long
3557  * since last attempt, sleep wal_retrieve_retry_interval
3558  * milliseconds to avoid busy-waiting.
3559  */
3561  if (!TimestampDifferenceExceeds(last_fail_time, now,
3563  {
3564  long wait_time;
3565 
3566  wait_time = wal_retrieve_retry_interval -
3567  TimestampDifferenceMilliseconds(last_fail_time, now);
3568 
3569  elog(LOG, "waiting for WAL to become available at %X/%X",
3570  LSN_FORMAT_ARGS(RecPtr));
3571 
3572  /* Do background tasks that might benefit us later. */
3574 
3578  wait_time,
3582 
3583  /* Handle interrupt signals of startup process */
3585  }
3586  last_fail_time = now;
3588  break;
3589 
3590  default:
3591  elog(ERROR, "unexpected WAL source %d", currentSource);
3592  }
3593  }
3594  else if (currentSource == XLOG_FROM_PG_WAL)
3595  {
3596  /*
3597  * We just successfully read a file in pg_wal. We prefer files in
3598  * the archive over ones in pg_wal, so try the next file again
3599  * from the archive first.
3600  */
3601  if (InArchiveRecovery)
3603  }
3604 
3605  if (currentSource != oldSource)
3606  elog(DEBUG2, "switched WAL source from %s to %s after %s",
3608  lastSourceFailed ? "failure" : "success");
3609 
3610  /*
3611  * We've now handled possible failure. Try to read from the chosen
3612  * source.
3613  */
3614  lastSourceFailed = false;
3615 
3616  switch (currentSource)
3617  {
3618  case XLOG_FROM_ARCHIVE:
3619  case XLOG_FROM_PG_WAL:
3620 
3621  /*
3622  * WAL receiver must not be running when reading WAL from
3623  * archive or pg_wal.
3624  */
3625  Assert(!WalRcvStreaming());
3626 
3627  /* Close any old file we might have open. */
3628  if (readFile >= 0)
3629  {
3630  close(readFile);
3631  readFile = -1;
3632  }
3633  /* Reset curFileTLI if random fetch. */
3634  if (randAccess)
3635  curFileTLI = 0;
3636 
3637  /*
3638  * Try to restore the file from archive, or read an existing
3639  * file from pg_wal.
3640  */
3643  currentSource);
3644  if (readFile >= 0)
3645  return XLREAD_SUCCESS; /* success! */
3646 
3647  /*
3648  * Nope, not found in archive or pg_wal.
3649  */
3650  lastSourceFailed = true;
3651  break;
3652 
3653  case XLOG_FROM_STREAM:
3654  {
3655  bool havedata;
3656 
3657  /*
3658  * We should be able to move to XLOG_FROM_STREAM only in
3659  * standby mode.
3660  */
3662 
3663  /*
3664  * First, shutdown walreceiver if its restart has been
3665  * requested -- but no point if we're already slated for
3666  * starting it.
3667  */
3668  if (pendingWalRcvRestart && !startWalReceiver)
3669  {
3671 
3672  /*
3673  * Re-scan for possible new timelines if we were
3674  * requested to recover to the latest timeline.
3675  */
3678  rescanLatestTimeLine(replayTLI, replayLSN);
3679 
3680  startWalReceiver = true;
3681  }
3682  pendingWalRcvRestart = false;
3683 
3684  /*
3685  * Launch walreceiver if needed.
3686  *
3687  * If fetching_ckpt is true, RecPtr points to the initial
3688  * checkpoint location. In that case, we use RedoStartLSN
3689  * as the streaming start position instead of RecPtr, so
3690  * that when we later jump backwards to start redo at
3691  * RedoStartLSN, we will have the logs streamed already.
3692  */
3693  if (startWalReceiver &&
3694  PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
3695  {
3696  XLogRecPtr ptr;
3697  TimeLineID tli;
3698 
3699  if (fetching_ckpt)
3700  {
3701  ptr = RedoStartLSN;
3702  tli = RedoStartTLI;
3703  }
3704  else
3705  {
3706  ptr = RecPtr;
3707 
3708  /*
3709  * Use the record begin position to determine the
3710  * TLI, rather than the position we're reading.
3711  */
3712  tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
3713 
3714  if (curFileTLI > 0 && tli < curFileTLI)
3715  elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3716  LSN_FORMAT_ARGS(tliRecPtr),
3717  tli, curFileTLI);
3718  }
3719  curFileTLI = tli;
3724  flushedUpto = 0;
3725  }
3726 
3727  /*
3728  * Check if WAL receiver is active or wait to start up.
3729  */
3730  if (!WalRcvStreaming())
3731  {
3732  lastSourceFailed = true;
3733  break;
3734  }
3735 
3736  /*
3737  * Walreceiver is active, so see if new data has arrived.
3738  *
3739  * We only advance XLogReceiptTime when we obtain fresh
3740  * WAL from walreceiver and observe that we had already
3741  * processed everything before the most recent "chunk"
3742  * that it flushed to disk. In steady state where we are
3743  * keeping up with the incoming data, XLogReceiptTime will
3744  * be updated on each cycle. When we are behind,
3745  * XLogReceiptTime will not advance, so the grace time
3746  * allotted to conflicting queries will decrease.
3747  */
3748  if (RecPtr < flushedUpto)
3749  havedata = true;
3750  else
3751  {
3752  XLogRecPtr latestChunkStart;
3753 
3754  flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3755  if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
3756  {
3757  havedata = true;
3758  if (latestChunkStart <= RecPtr)
3759  {
3762  }
3763  }
3764  else
3765  havedata = false;
3766  }
3767  if (havedata)
3768  {
3769  /*
3770  * Great, streamed far enough. Open the file if it's
3771  * not open already. Also read the timeline history
3772  * file if we haven't initialized timeline history
3773  * yet; it should be streamed over and present in
3774  * pg_wal by now. Use XLOG_FROM_STREAM so that source
3775  * info is set correctly and XLogReceiptTime isn't
3776  * changed.
3777  *
3778  * NB: We must set readTimeLineHistory based on
3779  * recoveryTargetTLI, not receiveTLI. Normally they'll
3780  * be the same, but if recovery_target_timeline is
3781  * 'latest' and archiving is configured, then it's
3782  * possible that we managed to retrieve one or more
3783  * new timeline history files from the archive,
3784  * updating recoveryTargetTLI.
3785  */
3786  if (readFile < 0)
3787  {
3788  if (!expectedTLEs)
3791  receiveTLI,
3792  XLOG_FROM_STREAM, false);
3793  Assert(readFile >= 0);
3794  }
3795  else
3796  {
3797  /* just make sure source info is correct... */
3800  return XLREAD_SUCCESS;
3801  }
3802  break;
3803  }
3804 
3805  /* In nonblocking mode, return rather than sleeping. */
3806  if (nonblocking)
3807  return XLREAD_WOULDBLOCK;
3808 
3809  /*
3810  * Data not here yet. Check for trigger, then wait for
3811  * walreceiver to wake us up when new WAL arrives.
3812  */
3813  if (CheckForStandbyTrigger())
3814  {
3815  /*
3816  * Note that we don't return XLREAD_FAIL immediately
3817  * here. After being triggered, we still want to
3818  * replay all the WAL that was already streamed. It's
3819  * in pg_wal now, so we just treat this as a failure,
3820  * and the state machine will move on to replay the
3821  * streamed WAL from pg_wal, and then recheck the
3822  * trigger and exit replay.
3823  */
3824  lastSourceFailed = true;
3825  break;
3826  }
3827 
3828  /*
3829  * Since we have replayed everything we have received so
3830  * far and are about to start waiting for more WAL, let's
3831  * tell the upstream server our replay location now so
3832  * that pg_stat_replication doesn't show stale
3833  * information.
3834  */
3835  if (!streaming_reply_sent)
3836  {
3837  WalRcvForceReply();
3838  streaming_reply_sent = true;
3839  }
3840 
3841  /* Do any background tasks that might benefit us later. */
3843 
3844  /* Update pg_stat_recovery_prefetch before sleeping. */
3846 
3847  /*
3848  * Wait for more WAL to arrive, when we will be woken
3849  * immediately by the WAL receiver.
3850  */
3853  -1L,
3856  break;
3857  }
3858 
3859  default:
3860  elog(ERROR, "unexpected WAL source %d", currentSource);
3861  }
3862 
3863  /*
3864  * Check for recovery pause here so that we can confirm more quickly
3865  * that a requested pause has actually taken effect.
3866  */
3867  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
3869  recoveryPausesHere(false);
3870 
3871  /*
3872  * This possibly-long loop needs to handle interrupts of startup
3873  * process.
3874  */
3876  }
3877 
3878  return XLREAD_FAIL; /* not reached */
3879 }
3880 
3881 
3882 /*
3883  * Determine what log level should be used to report a corrupt WAL record
3884  * in the current WAL page, previously read by XLogPageRead().
3885  *
3886  * 'emode' is the error mode that would be used to report a file-not-found
3887  * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
3888  * we're retrying the exact same record that we've tried previously, only
3889  * complain the first time to keep the noise down. However, we only do this when
3890  * reading from pg_wal, because we don't expect any invalid records in archive
3891  * or in records streamed from the primary. Files in the archive should be complete,
3892  * and we should never hit the end of WAL because we stop and wait for more WAL
3893  * to arrive before replaying it.
3894  *
 * Returns DEBUG1 when 'emode' is LOG, the read source is pg_wal and 'RecPtr'
 * equals the last remembered value; otherwise returns 'emode' unchanged.
 *
3895  * NOTE: This function remembers the RecPtr value it was last called with,
3896  * to suppress repeated messages about the same record. Only call this when
3897  * you are about to ereport(), or you might cause a later message to be
3898  * erroneously suppressed.
3899  */
3900 static int
3902 {
3903  static XLogRecPtr lastComplaint = 0;
3904 
3905  if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
3906  {
3907  if (RecPtr == lastComplaint)
3908  emode = DEBUG1;
3909  else
3910  lastComplaint = RecPtr;
3911  }
3912  return emode;
3913 }
3914 
3915 
3916 /*
3917  * Subroutine to try to fetch and validate a prior checkpoint record.
 *
 * Returns the record on success. Returns NULL, after logging the reason at
 * LOG level, if 'RecPtr' is not a valid record offset, if the record cannot
 * be read, or if its resource manager ID, xl_info or length is not that of a
 * checkpoint record.
3918  */
3919 static XLogRecord *
3921  TimeLineID replayTLI)
3922 {
3923  XLogRecord *record;
3924  uint8 info;
3925 
3926  Assert(xlogreader != NULL);
3927 
3928  if (!XRecOffIsValid(RecPtr))
3929  {
3930  ereport(LOG,
3931  (errmsg("invalid checkpoint location")));
3932  return NULL;
3933  }
3934 
3936  record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
3937 
3938  if (record == NULL)
3939  {
3940  ereport(LOG,
3941  (errmsg("invalid checkpoint record")));
3942  return NULL;
3943  }
3944  if (record->xl_rmid != RM_XLOG_ID)
3945  {
3946  ereport(LOG,
3947  (errmsg("invalid resource manager ID in checkpoint record")));
3948  return NULL;
3949  }
 /* Only shutdown and online checkpoint records are accepted. */
3950  info = record->xl_info & ~XLR_INFO_MASK;
3951  if (info != XLOG_CHECKPOINT_SHUTDOWN &&
3952  info != XLOG_CHECKPOINT_ONLINE)
3953  {
3954  ereport(LOG,
3955  (errmsg("invalid xl_info in checkpoint record")));
3956  return NULL;
3957  }
3959  {
3960  ereport(LOG,
3961  (errmsg("invalid length of checkpoint record")));
3962  return NULL;
3963  }
3964  return record;
3965 }
3966 
3967 /*
3968  * Scan for new timelines that might have appeared in the archive since we
3969  * started recovery.
3970  *
3971  * If there are any, the function changes recovery target TLI to the latest
3972  * one and returns 'true'.
 *
 * Returns 'false', leaving the recovery target unchanged, if no newer
 * timeline is found, if the current recovery target timeline does not appear
 * in the new timeline's history, or if the new timeline forked off before
 * 'replayLSN'. The latter two cases are logged at LOG level.
3973  */
3974 static bool
3976 {
3977  List *newExpectedTLEs;
3978  bool found;
3979  ListCell *cell;
3980  TimeLineID newtarget;
3981  TimeLineID oldtarget = recoveryTargetTLI;
3982  TimeLineHistoryEntry *currentTle = NULL;
3983 
3985  if (newtarget == recoveryTargetTLI)
3986  {
3987  /* No new timelines found */
3988  return false;
3989  }
3990 
3991  /*
3992  * Determine the list of expected TLIs for the new TLI
3993  */
3994 
3995  newExpectedTLEs = readTimeLineHistory(newtarget);
3996 
3997  /*
3998  * If the current timeline is not part of the history of the new timeline,
3999  * we cannot proceed to it.
4000  */
4001  found = false;
4002  foreach(cell, newExpectedTLEs)
4003  {
4004  currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4005 
4006  if (currentTle->tli == recoveryTargetTLI)
4007  {
4008  found = true;
4009  break;
4010  }
4011  }
4012  if (!found)
4013  {
4014  ereport(LOG,
4015  (errmsg("new timeline %u is not a child of database system timeline %u",
4016  newtarget,
4017  replayTLI)));
4018  return false;
4019  }
4020 
4021  /*
4022  * The current timeline was found in the history file, but check that the
4023  * next timeline was forked off from it *after* the current recovery
4024  * location.
 *
 * currentTle is non-NULL here: 'found' is only set after it was assigned
 * in the loop above.
4025  */
4026  if (currentTle->end < replayLSN)
4027  {
4028  ereport(LOG,
4029  (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4030  newtarget,
4031  replayTLI,
4032  LSN_FORMAT_ARGS(replayLSN))));
4033  return false;
4034  }
4035 
4036  /* The new timeline history seems valid. Switch target */
4037  recoveryTargetTLI = newtarget;
4039  expectedTLEs = newExpectedTLEs;
4040 
4041  /*
4042  * As in StartupXLOG(), try to ensure we have all the history files
4043  * between the old target and new target in pg_wal.
4044  */
4045  restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4046 
4047  ereport(LOG,
4048  (errmsg("new target timeline is %u",
4049  recoveryTargetTLI)));
4050 
4051  return true;
4052 }
4053 
4054 
4055 /*
4056  * Open a logfile segment for reading (during recovery).
4057  *
4058  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4059  * Otherwise, it's assumed to be already available in pg_wal.
 *
 * Returns the open file descriptor on success. Returns -1 if the segment
 * could not be restored from the archive, or if the file could not be opened
 * because it does not exist (ENOENT) and 'notfoundOk' is true. Any other
 * failure to open the file is reported as PANIC, and a 'source' other than
 * XLOG_FROM_ARCHIVE, XLOG_FROM_PG_WAL or XLOG_FROM_STREAM raises an ERROR.
 *
 * On success, curFileTLI is set to 'tli', readSource is set to 'source', and
 * the PS display is updated to "recovering <segment file name>".
4060  */
4061 static int
4062 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
4063  XLogSource source, bool notfoundOk)
4064 {
4065  char xlogfname[MAXFNAMELEN];
4066  char activitymsg[MAXFNAMELEN + 16];
4067  char path[MAXPGPATH];
4068  int fd;
4069 
4070  XLogFileName(xlogfname, tli, segno, wal_segment_size);
4071 
4072  switch (source)
4073  {
4074  case XLOG_FROM_ARCHIVE:
4075  /* Report recovery progress in PS display */
4076  snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4077  xlogfname);
4078  set_ps_display(activitymsg);
4079 
4080  if (!RestoreArchivedFile(path, xlogfname,
4081  "RECOVERYXLOG",
4083  InRedo))
4084  return -1;
4085  break;
4086 
4087  case XLOG_FROM_PG_WAL:
4088  case XLOG_FROM_STREAM:
4089  XLogFilePath(path, tli, segno, wal_segment_size);
4090  break;
4091 
4092  default:
4093  elog(ERROR, "invalid XLogFileRead source %d", source);
4094  }
4095 
4096  /*
4097  * If the segment was fetched from archival storage, replace the existing
4098  * xlog segment (if any) with the archival version.
4099  */
4100  if (source == XLOG_FROM_ARCHIVE)
4101  {
4103  KeepFileRestoredFromArchive(path, xlogfname);
4104 
4105  /*
4106  * Set path to point at the new file in pg_wal.
4107  */
4108  snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4109  }
4110 
4111  fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4112  if (fd >= 0)
4113  {
4114  /* Success! */
4115  curFileTLI = tli;
4116 
4117  /* Report recovery progress in PS display */
4118  snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4119  xlogfname);
4120  set_ps_display(activitymsg);
4121 
4122  /* Track source of data in assorted state variables */
4123  readSource = source;
4125  /* In FROM_STREAM case, caller tracks receipt time, not me */
4126  if (source != XLOG_FROM_STREAM)
4128 
4129  return fd;
4130  }
 /* A missing file is tolerated only when the caller said so. */
4131  if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4132  ereport(PANIC,
4134  errmsg("could not open file \"%s\": %m", path)));
4135  return -1;
4136 }
4137 
4138 /*
4139  * Open a logfile segment for reading (during recovery).
4140  *
4141  * This version searches for the segment with any TLI listed in expectedTLEs.
 *
 * Returns the open file descriptor of the first segment found. If no
 * timeline yields the segment, sets errno to ENOENT, reports a
 * "could not open file" message at level 'emode', and returns -1.
 *
 * On the first successful read, the timeline list used for the search is
 * saved in expectedTLEs if that was not set yet.
4142  */
4143 static int
4145 {
4146  char path[MAXPGPATH];
4147  ListCell *cell;
4148  int fd;
4149  List *tles;
4150 
4151  /*
4152  * Loop looking for a suitable timeline ID: we might need to read any of
4153  * the timelines listed in expectedTLEs.
4154  *
4155  * We expect curFileTLI on entry to be the TLI of the preceding file in
4156  * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4157  * to go backwards; this prevents us from picking up the wrong file when a
4158  * parent timeline extends to higher segment numbers than the child we
4159  * want to read.
4160  *
4161  * If we haven't read the timeline history file yet, read it now, so that
4162  * we know which TLIs to scan. We don't save the list in expectedTLEs,
4163  * however, unless we actually find a valid segment. That way if there is
4164  * neither a timeline history file nor a WAL segment in the archive, and
4165  * streaming replication is set up, we'll read the timeline history file
4166  * streamed from the primary when we start streaming, instead of
4167  * recovering with a dummy history generated here.
4168  */
4169  if (expectedTLEs)
4170  tles = expectedTLEs;
4171  else
4173 
4174  foreach(cell, tles)
4175  {
4177  TimeLineID tli = hent->tli;
4178 
4179  if (tli < curFileTLI)
4180  break; /* don't bother looking at too-old TLIs */
4181 
4182  /*
4183  * Skip scanning the timeline ID that the logfile segment to read
4184  * doesn't belong to
4185  */
4186  if (hent->begin != InvalidXLogRecPtr)
4187  {
4188  XLogSegNo beginseg = 0;
4189 
4190  XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4191 
4192  /*
4193  * The logfile segment that doesn't belong to the timeline is
4194  * older or newer than the segment that the timeline started or
4195  * ended at, respectively. It's sufficient to check only the
4196  * starting segment of the timeline here. Since the timelines are
4197  * scanned in descending order in this loop, any segments newer
4198  * than the ending segment should belong to newer timeline and
4199  * have already been read before. So it's not necessary to check
4200  * the ending segment of the timeline here.
4201  */
4202  if (segno < beginseg)
4203  continue;
4204  }
4205 
4207  {
4208  fd = XLogFileRead(segno, emode, tli,
4209  XLOG_FROM_ARCHIVE, true);
4210  if (fd != -1)
4211  {
4212  elog(DEBUG1, "got WAL segment from archive");
4213  if (!expectedTLEs)
4214  expectedTLEs = tles;
4215  return fd;
4216  }
4217  }
4218 
4220  {
4221  fd = XLogFileRead(segno, emode, tli,
4222  XLOG_FROM_PG_WAL, true);
4223  if (fd != -1)
4224  {
4225  if (!expectedTLEs)
4226  expectedTLEs = tles;
4227  return fd;
4228  }
4229  }
4230  }
4231 
4232  /* Couldn't find it. For simplicity, complain about front timeline */
4234  errno = ENOENT;
4235  ereport(emode,
4237  errmsg("could not open file \"%s\": %m", path)));
4238  return -1;
4239 }
4240 
4241 /*
4242  * Set flag to signal the walreceiver to restart. (The startup process calls
4243  * this on noticing a relevant configuration change.)
 *
 * This only logs the request and sets pendingWalRcvRestart; the flag is
 * acted on later by the WAL-source state machine while it is reading from
 * XLOG_FROM_STREAM.
4244  */
4245 void
4247 {
4249  {
4250  ereport(LOG,
4251  (errmsg("WAL receiver process shutdown requested")));
4252 
4253  pendingWalRcvRestart = true;
4254  }
4255 }
4256 
4257 
4258 /*
4259  * Has a standby promotion already been triggered?
4260  *
4261  * Unlike CheckForStandbyTrigger(), this works in any process
4262  * that's connected to shared memory.
 *
 * Returns true once promotion has been observed; the answer is cached in the
 * process-local LocalPromoteIsTriggered.
4263  */
4264 bool
4266 {
4267  /*
4268  * We check shared state each time only until a standby promotion is
4269  * triggered. We can't trigger a promotion again, so there's no need to
4270  * keep checking after the shared variable has once been seen true.
4271  */
4273  return true;
4274 
4278 
4279  return LocalPromoteIsTriggered;
4280 }
4281 
/*
 * Record that a standby promotion has been triggered.
 *
 * Clears any recovery pause and sets the process-local
 * LocalPromoteIsTriggered flag.
 */
4282 static void
4284 {
4288 
4289  /*
4290  * Mark the recovery pause state as 'not paused' because the paused state
4291  * ends and promotion continues if a promotion is triggered while recovery
4292  * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4293  * return 'paused' while a promotion is ongoing.
4294  */
4295  SetRecoveryPause(false);
4296 
4297  LocalPromoteIsTriggered = true;
4298 }
4299 
4300 /*
4301  * Check whether a promote request has arrived.
 *
 * Returns true if so, false otherwise. One of the two paths that return true
 * first logs "received promote request" at LOG level.
4302  */
4303 static bool
4305 {
4307  return true;
4308 
4310  {
4311  ereport(LOG, (errmsg("received promote request")));
4315  return true;
4316  }
4317 
4318  return false;
4319 }
4320 
4321 /*
4322  * Remove the file signaling a standby promotion request
 * (PROMOTE_SIGNAL_FILE).
 *
 * The result of unlink() is not checked, so a file that does not exist is
 * silently ignored.
4323  */
4324 void
4326 {
4327  unlink(PROMOTE_SIGNAL_FILE);
4328 }
4329 
4330 /*
4331  * Check to see if a promote request has arrived.
 *
 * Returns true if stat() succeeds on PROMOTE_SIGNAL_FILE, false otherwise.
 * The file is only tested for, not removed.
4332  */
4333 bool
4335 {
4336  struct stat stat_buf;
4337 
4338  if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4339  return true;
4340 
4341  return false;
4342 }
4343 
4344 /*
4345  * Wake up startup process to replay newly arrived WAL, or to notice that
4346  * failover has been requested.
4347  */
4348 void
4350 {
4352 }
4353 
4354 /*
4355  * Schedule a walreceiver wakeup in the main recovery loop.
4356  */
4357 void
4359 {
4361 }
4362 
4363 /*
4364  * Is HotStandby active yet? This is only important in special backends
4365  * since normal backends won't ever be able to connect until this returns
4366  * true. Postmaster knows this by way of signal, not via shared memory.
4367  *
4368  * Unlike testing standbyState, this works in any process that's connected to
4369  * shared memory. (And note that standbyState alone doesn't tell the truth
4370  * anyway.)
 *
 * Returns true once Hot Standby has been seen active; the answer is cached
 * in the process-local LocalHotStandbyActive.
4371  */
4372 bool
4374 {
4375  /*
4376  * We check shared state each time only until Hot Standby is active. We
4377  * can't de-activate Hot Standby, so there's no need to keep checking
4378  * after the shared variable has once been seen true.
4379  */
4381  return true;
4382  else
4383  {
4384  /* spinlock is essential on machines with weak memory ordering! */
4388 
4389  return LocalHotStandbyActive;
4390  }
4391 }
4392 
4393 /*
4394  * Like HotStandbyActive(), but to be used only in WAL replay code,
4395  * where we don't need to ask any other process what the state is.
 *
 * Returns the process-local LocalHotStandbyActive flag.
4396  */
4397 static bool
4399 {
4401  return LocalHotStandbyActive;
4402 }
4403 
4404 /*
4405  * Get latest redo apply position.
4406  *
4407  * Exported to allow WALReceiver to read the pointer directly.
 *
 * If 'replayTLI' is not NULL, a timeline ID is also returned through it;
 * pass NULL when the timeline is not needed.
4408  */
4409 XLogRecPtr
4411 {
4412  XLogRecPtr recptr;
4413  TimeLineID tli;
4414 
4419 
4420  if (replayTLI)
4421  *replayTLI = tli;
4422  return recptr;
4423 }
4424 
4425 
4426 /*
4427  * Get position of last applied, or the record being applied.
4428  *
4429  * This is different from GetXLogReplayRecPtr() in that if a WAL
4430  * record is currently being applied, this includes that record.
 *
 * The position is read from XLogRecoveryCtl->replayEndRecPtr. If
 * 'replayEndTLI' is not NULL, a timeline ID is also returned through it.
4431  */
4432 XLogRecPtr
4434 {
4435  XLogRecPtr recptr;
4436  TimeLineID tli;
4437 
4439  recptr = XLogRecoveryCtl->replayEndRecPtr;
4442 
4443  if (replayEndTLI)
4444  *replayEndTLI = tli;
4445  return recptr;
4446 }
4447 
4448 /*
4449  * Save timestamp of latest processed commit/abort record.
4450  *
4451  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4452  * seen by processes other than the startup process. Note in particular
4453  * that CreateRestartPoint is executed in the checkpointer.
4454  */
4455 static void
4457 {
4461 }
4462 
4463 /*
4464  * Fetch timestamp of latest processed commit/abort record.
4465  */
4468 {
4469  TimestampTz xtime;
4470 
4474 
4475  return xtime;
4476 }
4477 
4478 /*
4479  * Save timestamp of the next chunk of WAL records to apply.
4480  *
4481  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4482  * seen by all backends.
4483  */
4484 static void
4486 {
4490 }
4491 
4492 /*
4493  * Fetch timestamp of latest processed commit/abort record.
4494  * Startup process maintains an accurate local copy in XLogReceiptTime
 *
 * NOTE(review): the first sentence is identical to the header of the
 * preceding getter and looks like a copy-paste. This function presumably
 * fetches the timestamp saved by the setter just above ("timestamp of the
 * next chunk of WAL records to apply") -- confirm and reword.
4495  */
4498 {
4499  TimestampTz xtime;
4500 
4504 
4505  return xtime;
4506 }
4507 
4508 /*
4509  * Returns time of receipt of current chunk of XLOG data, as well as
4510  * whether it was received from streaming replication or from archives.
 *
 * '*rtime' receives XLogReceiptTime. '*fromStream' is set to true if
 * XLogReceiptSource is XLOG_FROM_STREAM, false for any other source. Both
 * pointers are dereferenced unconditionally, so neither may be NULL.
4511  */
4512 void
4513 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4514 {
4515  /*
4516  * This must be executed in the startup process, since we don't export the
4517  * relevant state to shared memory.
4518  */
4519  Assert(InRecovery);
4520 
4521  *rtime = XLogReceiptTime;
4522  *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4523 }
4524 
4525 /*
 * Verify that an integer parameter on this server ('currValue') is not lower
 * than the value it had on the primary ('minValue').
 *
 * Does nothing when currValue >= minValue. Otherwise the function does not
 * return normally: it ends in ereport(FATAL). Before that, inside a guarded
 * block (presumably entered only while hot standby is active -- TODO
 * confirm), it emits a WARNING, requests a recovery pause with
 * SetRecoveryPause(true), and loops; if a promotion request is noticed
 * during that loop, one further WARNING is emitted saying that promotion is
 * not possible.
 *
 * 'param_name' is inserted verbatim into the message details.
 *
4526  * Note that text field supplied is a parameter name and does not require
4527  * translation
4528  */
4529 void
4530 RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4531 {
4532  if (currValue < minValue)
4533  {
4535  {
 /* Ensures the promotion warning below is emitted only once. */
4536  bool warned_for_promote = false;
4537 
4538  ereport(WARNING,
4539  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4540  errmsg("hot standby is not possible because of insufficient parameter settings"),
4541  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4542  param_name,
4543  currValue,
4544  minValue)));
4545 
4546  SetRecoveryPause(true);
4547 
4548  ereport(LOG,
4549  (errmsg("recovery has paused"),
4550  errdetail("If recovery is unpaused, the server will shut down."),
4551  errhint("You can then restart the server after making the necessary configuration changes.")));
4552 
4554  {
4556 
4557  if (CheckForStandbyTrigger())
4558  {
4559  if (!warned_for_promote)
4560  ereport(WARNING,
4561  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4562  errmsg("promotion is not possible because of insufficient parameter settings"),
4563 
4564  /*
4565  * Repeat the detail from above so it's easy to find
4566  * in the log.
4567  */
4568  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4569  param_name,
4570  currValue,
4571  minValue),
4572  errhint("Restart the server after making the necessary configuration changes.")));
4573  warned_for_promote = true;
4574  }
4575 
4576  /*
4577  * If recovery pause is requested then set it paused. While
4578  * we are in the loop, user might resume and pause again so
4579  * set this every time.
4580  */
4582 
4583  /*
4584  * We wait on a condition variable that will wake us as soon
4585  * as the pause ends, but we use a timeout so we can check the
4586  * above conditions periodically too.
4587  */
4590  }
4592  }
4593 
4594  ereport(FATAL,
4595  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4596  errmsg("recovery aborted because of insufficient parameter settings"),
4597  /* Repeat the detail from above so it's easy to find in the log. */
4598  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4599  param_name,
4600  currValue,
4601  minValue),
4602  errhint("You can restart the server after making the necessary configuration changes.")));
4603  }
4604 }
4605 
4606 
4607 /*
4608  * GUC check_hook for primary_slot_name
4609  */
4610 bool
4612 {
4613  if (*newval && strcmp(*newval, "") != 0 &&
4615  return false;
4616 
4617  return true;
4618 }
4619 
4620 /*
4621  * Recovery target settings: Only one of the several recovery_target* settings
4622  * may be set. Setting a second one results in an error. The global variable
4623  * recoveryTarget tracks which kind of recovery target was chosen. Other
4624  * variables store the actual target value (for example a string or a xid).
4625  * The assign functions of the parameters check whether a competing parameter
4626  * was already set. But we want to allow setting the same parameter multiple
4627  * times. We also want to allow unsetting a parameter and setting a different
4628  * one, so we unset recoveryTarget when the parameter is set to an empty
4629  * string.
4630  *
4631  * XXX this code is broken by design. Throwing an error from a GUC assign
4632  * hook breaks fundamental assumptions of guc.c. So long as all the variables
4633  * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4634  * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4635  * that we have odd behaviors such as unexpected GUC ordering dependencies.
4636  */
4637 
4638 static void
4640 error_multiple_recovery_targets(void)
4641 {
4642  ereport(ERROR,
4643  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4644  errmsg("multiple recovery targets specified"),
4645  errdetail("At most one of recovery_target, recovery_target_lsn, recovery_target_name, recovery_target_time, recovery_target_xid may be set.")));
4646 }
4647 
4648 /*
4649  * GUC check_hook for recovery_target
4650  */
4651 bool
4653 {
4654  if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4655  {
4656  GUC_check_errdetail("The only allowed value is \"immediate\".");
4657  return false;
4658  }
4659  return true;
4660 }
4661 
4662 /*
4663  * GUC assign_hook for recovery_target
4664  */
4665 void
4666 assign_recovery_target(const char *newval, void *extra)
4667 {
4670  error_multiple_recovery_targets();
4671 
4672  if (newval && strcmp(newval, "") != 0)
4674  else
4676 }
4677 
4678 /*
4679  * GUC check_hook for recovery_target_lsn
4680  */
4681 bool
4683 {
4684  if (strcmp(*newval, "") != 0)
4685  {
4686  XLogRecPtr lsn;
4687  XLogRecPtr *myextra;
4688  bool have_error = false;
4689 
4690  lsn = pg_lsn_in_internal(*newval, &have_error);
4691  if (have_error)
4692  return false;
4693 
4694  myextra = (XLogRecPtr *) guc_malloc(ERROR, sizeof(XLogRecPtr));
4695  *myextra = lsn;
4696  *extra = (void *) myextra;
4697  }
4698  return true;
4699 }
4700 
4701 /*
4702  * GUC assign_hook for recovery_target_lsn
4703  */
4704 void
4705 assign_recovery_target_lsn(const char *newval, void *extra)
4706 {
4709  error_multiple_recovery_targets();
4710 
4711  if (newval && strcmp(newval, "") != 0)
4712  {
4714  recoveryTargetLSN = *((XLogRecPtr *) extra);
4715  }
4716  else
4718 }
4719 
4720 /*
4721  * GUC check_hook for recovery_target_name
4722  */
4723 bool
4725 {
4726  /* Use the value of newval directly */
4727  if (strlen(*newval) >= MAXFNAMELEN)
4728  {
4729  GUC_check_errdetail("%s is too long (maximum %d characters).",
4730  "recovery_target_name", MAXFNAMELEN - 1);
4731  return false;
4732  }
4733  return true;
4734 }
4735 
4736 /*
4737  * GUC assign_hook for recovery_target_name
4738  */
4739 void
4740 assign_recovery_target_name(const char *newval, void *extra)
4741 {
4744  error_multiple_recovery_targets();
4745 
4746  if (newval && strcmp(newval, "") != 0)
4747  {
4750  }
4751  else
4753 }
4754 
4755 /*
4756  * GUC check_hook for recovery_target_time
4757  *
4758  * The interpretation of the recovery_target_time string can depend on the
4759  * time zone setting, so we need to wait until after all GUC processing is
4760  * done before we can do the final parsing of the string. This check function
4761  * only does a parsing pass to catch syntax errors, but we store the string
4762  * and parse it again when we need to use it.
4763  */
4764 bool
4766 {
4767  if (strcmp(*newval, "") != 0)
4768  {
4769  /* reject some special values */
4770  if (strcmp(*newval, "now") == 0 ||
4771  strcmp(*newval, "today") == 0 ||
4772  strcmp(*newval, "tomorrow") == 0 ||
4773  strcmp(*newval, "yesterday") == 0)
4774  {
4775  return false;
4776  }
4777 
4778  /*
4779  * parse timestamp value (see also timestamptz_in())
4780  */
4781  {
4782  char *str = *newval;
4783  fsec_t fsec;
4784  struct pg_tm tt,
4785  *tm = &tt;
4786  int tz;
4787  int dtype;
4788  int nf;
4789  int dterr;
4790  char *field[MAXDATEFIELDS];
4791  int ftype[MAXDATEFIELDS];
4792  char workbuf[MAXDATELEN + MAXDATEFIELDS];
4794 
4795  dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4796  field, ftype, MAXDATEFIELDS, &nf);
4797  if (dterr == 0)
4798  dterr = DecodeDateTime(field, ftype, nf, &dtype, tm, &fsec, &tz);
4799  if (dterr != 0)
4800  return false;
4801  if (dtype != DTK_DATE)
4802  return false;
4803 
4804  if (tm2timestamp(tm, fsec, &tz, &timestamp) != 0)
4805  {
4806  GUC_check_errdetail("timestamp out of range: \"%s\"", str);
4807  return false;
4808  }
4809  }
4810  }
4811  return true;
4812 }
4813 
4814 /*
4815  * GUC assign_hook for recovery_target_time
4816  */
4817 void
4818 assign_recovery_target_time(const char *newval, void *extra)
4819 {
4822  error_multiple_recovery_targets();
4823 
4824  if (newval && strcmp(newval, "") != 0)
4826  else
4828 }
4829 
4830 /*
4831  * GUC check_hook for recovery_target_timeline
4832  */
4833 bool
4835 {
4837  RecoveryTargetTimeLineGoal *myextra;
4838 
4839  if (strcmp(*newval, "current") == 0)
4841  else if (strcmp(*newval, "latest") == 0)
4843  else
4844  {
4846 
4847  errno = 0;
4848  strtoul(*newval, NULL, 0);
4849  if (errno == EINVAL || errno == ERANGE)
4850  {
4851  GUC_check_errdetail("recovery_target_timeline is not a valid number.");
4852  return false;
4853  }
4854  }
4855 
4857  *myextra = rttg;
4858  *extra = (void *) myextra;
4859 
4860  return true;
4861 }
4862 
4863 /*
4864  * GUC assign_hook for recovery_target_timeline
4865  */
4866 void
4867 assign_recovery_target_timeline(const char *newval, void *extra)
4868 {
4871  recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
4872  else
4874 }
4875 
4876 /*
4877  * GUC check_hook for recovery_target_xid
4878  */
4879 bool
4881 {
4882  if (strcmp(*newval, "") != 0)
4883  {
4884  TransactionId xid;
4885  TransactionId *myextra;
4886 
4887  errno = 0;
4888  xid = (TransactionId) strtou64(*newval, NULL, 0);
4889  if (errno == EINVAL || errno == ERANGE)
4890  return false;
4891 
4892  myextra = (TransactionId *) guc_malloc(ERROR, sizeof(TransactionId));
4893  *myextra = xid;
4894  *extra = (void *) myextra;
4895  }
4896  return true;
4897 }
4898 
4899 /*
4900  * GUC assign_hook for recovery_target_xid
4901  */
4902 void
4903 assign_recovery_target_xid(const char *newval, void *extra)
4904 {
4907  error_multiple_recovery_targets();
4908 
4909  if (newval && strcmp(newval, "") != 0)
4910  {
4912  recoveryTargetXid = *((TransactionId *) extra);
4913  }
4914  else
4916 }
TimeLineID findNewestTimeLine(TimeLineID startTLI)
Definition: timeline.c:264
TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history)
Definition: timeline.c:544
XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI)
Definition: timeline.c:572
bool existsTimeLineHistory(TimeLineID probeTLI)
Definition: timeline.c:222
void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end)
Definition: timeline.c:50
List * readTimeLineHistory(TimeLineID targetTLI)
Definition: timeline.c:76
bool tliInHistory(TimeLineID tli, List *expectedTLEs)
Definition: timeline.c:526
void remove_tablespace_symlink(const char *linkloc)
Definition: tablespace.c:889
bool allow_in_place_tablespaces
Definition: tablespace.c:91
void HandleStartupProcInterrupts(void)
Definition: startup.c:168
bool IsPromoteSignaled(void)
Definition: startup.c:297
void begin_startup_progress_phase(void)
Definition: startup.c:321
void ResetPromoteSignaled(void)
Definition: startup.c:303
int ParseDateTime(const char *timestr, char *workbuf, size_t buflen, char **field, int *ftype, int maxfields, int *numfields)
Definition: datetime.c:753
int DecodeDateTime(char **field, int *ftype, int nf, int *dtype, struct pg_tm *tm, fsec_t *fsec, int *tzp)
Definition: datetime.c:974
long TimestampDifferenceMilliseconds(TimestampTz start_time, TimestampTz stop_time)
Definition: timestamp.c:1701
int tm2timestamp(struct pg_tm *tm, fsec_t fsec, int *tzp, Timestamp *result)
Definition: timestamp.c:1926
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1719
Datum timestamptz_in(PG_FUNCTION_ARGS)
Definition: timestamp.c:403
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1573
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1537
const char * timestamptz_to_str(TimestampTz t)
Definition: timestamp.c:1782
uint32 BlockNumber
Definition: block.h:31
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3954
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:4172
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:280
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:107
@ RBM_NORMAL_NO_LOG
Definition: bufmgr.h:45
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:228
Pointer Page
Definition: bufpage.h:78
static XLogRecPtr PageGetLSN(Page page)
Definition: bufpage.h:383
unsigned int uint32
Definition: c.h:442
signed int int32
Definition: c.h:430
#define PG_BINARY
Definition: c.h:1209
#define UINT64_FORMAT
Definition: c.h:485
#define strtou64(str, endptr, base)
Definition: c.h:1234
unsigned char uint8
Definition: c.h:440
uint32 TransactionId
Definition: c.h:588
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:166
size_t Size
Definition: c.h:541
void RequestCheckpoint(int flags)
Definition: checkpointer.c:931
bool ConditionVariableTimedSleep(ConditionVariable *cv, long timeout, uint32 wait_event_info)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
void ConditionVariableCancelSleep(void)
int64 TimestampTz
Definition: timestamp.h:39
int32 fsec_t
Definition: timestamp.h:41
int errmsg_internal(const char *fmt,...)
Definition: elog.c:993
int errcode_for_file_access(void)
Definition: elog.c:718
int errdetail(const char *fmt,...)
Definition: elog.c:1039
ErrorContextCallback * error_context_stack
Definition: elog.c:94
int errhint(const char *fmt,...)
Definition: elog.c:1153
int errcode(int sqlerrcode)
Definition: elog.c:695
int errmsg(const char *fmt,...)
Definition: elog.c:906
#define LOG
Definition: elog.h:27
#define errcontext
Definition: elog.h:192
#define DEBUG3
Definition: elog.h:24
#define FATAL
Definition: elog.h:37
#define WARNING
Definition: elog.h:32
#define DEBUG2
Definition: elog.h:25
#define PANIC
Definition: elog.h:38
#define DEBUG1
Definition: elog.h:26
#define ERROR
Definition: elog.h:35
#define ereport(elevel,...)
Definition: elog.h:145
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2709
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2383
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1015
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:688
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:993
int FreeFile(FILE *file)
Definition: fd.c:2581
int pg_fsync(int fd)
Definition: fd.c:356
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2643
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:406
@ PGFILETYPE_LNK
Definition: file_utils.h:24
#define DirectFunctionCall3(func, arg1, arg2, arg3)
Definition: fmgr.h:646
bool IsUnderPostmaster
Definition: globals.c:113
char * DataDir
Definition: globals.c:66
bool IsPostmasterEnvironment
Definition: globals.c:112
void * guc_malloc(int elevel, size_t size)
Definition: guc.c:631
#define newval
#define GUC_check_errdetail
Definition: guc.h:434
GucSource
Definition: guc.h:108
int trace_recovery_messages
Definition: guc_tables.c:495
#define MAXDATEFIELDS
Definition: datetime.h:203
#define DTK_DATE
Definition: datetime.h:145
#define MAXDATELEN
Definition: datetime.h:201
#define close(a)
Definition: win32.h:12
void proc_exit(int code)
Definition: ipc.c:104
int i
Definition: isn.c:73
void OwnLatch(Latch *latch)
Definition: latch.c:422
void DisownLatch(Latch *latch)
Definition: latch.c:448
void InitSharedLatch(Latch *latch)
Definition: latch.c:389
void SetLatch(Latch *latch)
Definition: latch.c:591
void ResetLatch(Latch *latch)
Definition: latch.c:683
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:476
#define WL_TIMEOUT
Definition: latch.h:128
#define WL_EXIT_ON_PM_DEATH
Definition: latch.h:130
#define WL_LATCH_SET
Definition: latch.h:125
Assert(fmt[strlen(fmt) - 1] !='\n')
List * lappend(List *list, void *datum)
Definition: list.c:338
void list_free_deep(List *list)
Definition: list.c:1559
static struct pg_tm tm
Definition: localtime.c:104
char * pstrdup(const char *in)
Definition: mcxt.c:1483
void pfree(void *pointer)
Definition: mcxt.c:1306
void * palloc0(Size size)
Definition: mcxt.c:1230
void * palloc(Size size)
Definition: mcxt.c:1199
#define AmStartupProcess()
Definition: miscadmin.h:440
#define IsBootstrapProcessingMode()
Definition: miscadmin.h:402
void * arg
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
#define MAXPGPATH
#define XLOG_RESTORE_POINT
Definition: pg_control.h:74
#define XLOG_OVERWRITE_CONTRECORD
Definition: pg_control.h:80
DBState
Definition: pg_control.h:88
@ DB_IN_ARCHIVE_RECOVERY
Definition: pg_control.h:94
@ DB_SHUTDOWNED_IN_RECOVERY
Definition: pg_control.h:91
@ DB_SHUTDOWNED
Definition: pg_control.h:90
@ DB_IN_CRASH_RECOVERY
Definition: pg_control.h:93
#define XLOG_CHECKPOINT_SHUTDOWN
Definition: pg_control.h:67
#define XLOG_BACKUP_END
Definition: pg_control.h:72
#define XLOG_CHECKPOINT_ONLINE
Definition: pg_control.h:68
#define XLOG_END_OF_RECOVERY
Definition: pg_control.h:76
const void size_t len
#define lfirst(lc)
Definition: pg_list.h:170
#define NIL
Definition: pg_list.h:66
XLogRecPtr pg_lsn_in_internal(const char *str, bool *have_error)
Definition: pg_lsn.c:30
static rewind_source * source
Definition: pg_rewind.c:81
const char * pg_rusage_show(const PGRUsage *ru0)
Definition: pg_rusage.c:40
void pg_rusage_init(PGRUsage *ru0)
Definition: pg_rusage.c:27
static char * buf
Definition: pg_test_fsync.c:67
int64 timestamp
void SendPostmasterSignal(PMSignalReason reason)
Definition: pmsignal.c:181
@ PMSIGNAL_RECOVERY_STARTED
Definition: pmsignal.h:35
@ PMSIGNAL_BEGIN_HOT_STANDBY
Definition: pmsignal.h:36
#define pg_pread
Definition: port.h:225
#define snprintf
Definition: port.h:238
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:600
static Datum CStringGetDatum(const char *X)
Definition: postgres.h:698
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:560
#define InvalidOid
Definition: postgres_ext.h:36
static int fd(const char *x, int i)
Definition: preproc-init.c:105
void RecordKnownAssignedTransactionIds(TransactionId xid)
Definition: procarray.c:4392
void KnownAssignedTransactionIdsIdleMaintenance(void)
Definition: procarray.c:4529
void set_ps_display(const char *activity)
Definition: ps_status.c:342
char * psprintf(const char *fmt,...)
Definition: psprintf.c:46
ForkNumber
Definition: relpath.h:48
@ MAIN_FORKNUM
Definition: relpath.h:50
void RmgrStartup(void)
Definition: rmgr.c:49
void RmgrCleanup(void)
Definition: rmgr.c:65
int slock_t
Definition: s_lock.h:754
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:396
bool ReplicationSlotValidateName(const char *name, int elevel)
Definition: slot.c:198
#define SpinLockInit(lock)
Definition: spin.h:60
#define SpinLockRelease(lock)
Definition: spin.h:64
#define SpinLockAcquire(lock)
Definition: spin.h:62
#define ereport_startup_progress(msg,...)
Definition: startup.h:18
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition: stringinfo.c:91
void appendStringInfoString(StringInfo str, const char *s)
Definition: stringinfo.c:176
void appendStringInfoChar(StringInfo str, char ch)
Definition: stringinfo.c:188
void initStringInfo(StringInfo str)
Definition: stringinfo.c:59
Oid oldestMultiDB
Definition: pg_control.h:50
MultiXactId oldestMulti
Definition: pg_control.h:49
MultiXactOffset nextMultiOffset
Definition: pg_control.h:46
TransactionId newestCommitTsXid
Definition: pg_control.h:54
TransactionId oldestXid
Definition: pg_control.h:47
TimeLineID PrevTimeLineID
Definition: pg_control.h:40
TimeLineID ThisTimeLineID
Definition: pg_control.h:39
Oid nextOid
Definition: pg_control.h:44
MultiXactId nextMulti
Definition: pg_control.h:45
FullTransactionId nextXid
Definition: pg_control.h:43
TransactionId oldestCommitTsXid
Definition: pg_control.h:52
XLogRecPtr redo
Definition: pg_control.h:37
Oid oldestXidDB
Definition: pg_control.h:48
XLogRecPtr backupStartPoint
Definition: pg_control.h:168
bool backupEndRequired
Definition: pg_control.h:170
CheckPoint checkPointCopy
Definition: pg_control.h:133
XLogRecPtr backupEndPoint
Definition: pg_control.h:169
XLogRecPtr minRecoveryPoint
Definition: pg_control.h:166
XLogRecPtr checkPoint
Definition: pg_control.h:131
uint64 system_identifier
Definition: pg_control.h:108
TimeLineID minRecoveryPointTLI
Definition: pg_control.h:167
Definition: dirent.c:26
XLogRecPtr lastPageBeginPtr
Definition: xlogrecovery.h:111
XLogRecPtr abortedRecPtr
Definition: xlogrecovery.h:120
XLogRecPtr missingContrecPtr
Definition: xlogrecovery.h:121
TimeLineID endOfLogTLI
Definition: xlogrecovery.h:109
struct ErrorContextCallback * previous
Definition: elog.h:234
void(* callback)(void *arg)
Definition: elog.h:235
Definition: latch.h:111
Definition: pg_list.h:52
RelFileNumber relNumber
void(* rm_mask)(char *pagedata, BlockNumber blkno)
void(* rm_redo)(XLogReaderState *record)
const char *(* rm_identify)(uint8 info)
const char * rm_name
void(* rm_desc)(StringInfo buf, XLogReaderState *record)
XLogRecPtr begin
Definition: timeline.h:28
TimeLineID tli
Definition: timeline.h:27
XLogRecPtr end
Definition: timeline.h:29
TimeLineID ws_tli
Definition: xlogreader.h:49
TimeLineID replayTLI
Definition: xlogrecovery.c:199
XLogRecPtr missingContrecPtr
Definition: xlogreader.h:215
char * errormsg_buf
Definition: xlogreader.h:311
XLogRecPtr EndRecPtr
Definition: xlogreader.h:207
uint64 system_identifier
Definition: xlogreader.h:191
XLogRecPtr ReadRecPtr
Definition: xlogreader.h:206
XLogRecPtr abortedRecPtr
Definition: xlogreader.h:214
TimeLineID latestPageTLI
Definition: xlogreader.h:280
XLogRecPtr overwrittenRecPtr
Definition: xlogreader.h:217
XLogRecPtr latestPagePtr
Definition: xlogreader.h:279
WALOpenSegment seg
Definition: xlogreader.h:272
void * private_data
Definition: xlogreader.h:196
uint8 xl_info
Definition: xlogrecord.h:46
uint32 xl_tot_len
Definition: xlogrecord.h:43
TransactionId xl_xid
Definition: xlogrecord.h:44
RmgrId xl_rmid
Definition: xlogrecord.h:47
ConditionVariable recoveryNotPausedCV
Definition: xlogrecovery.c:359
XLogRecPtr lastReplayedEndRecPtr
Definition: xlogrecovery.c:339
TimeLineID replayEndTLI
Definition: xlogrecovery.c:348
TimeLineID lastReplayedTLI
Definition: xlogrecovery.c:340
TimestampTz currentChunkStartTime
Definition: xlogrecovery.c:356
XLogRecPtr replayEndRecPtr
Definition: xlogrecovery.c:347
TimestampTz recoveryLastXTime
Definition: xlogrecovery.c:350
RecoveryPauseState recoveryPauseState
Definition: xlogrecovery.c:358
XLogRecPtr lastReplayedReadRecPtr
Definition: xlogrecovery.c:338
Definition: guc.h:168
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
Definition: pgtime.h:35
Definition: regguts.h:318
TimeLineID PrevTimeLineID
TimeLineID ThisTimeLineID
char rp_name[MAXFNAMELEN]
TransactionId twophase_xid
Definition: xact.h:414
TransactionId twophase_xid
Definition: xact.h:384
#define InvalidTransactionId
Definition: transam.h:31
#define U64FromFullTransactionId(x)
Definition: transam.h:49
#define XidFromFullTransactionId(x)
Definition: transam.h:48
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define TransactionIdIsNormal(xid)
Definition: transam.h:42
#define TimestampTzPlusMilliseconds(tz, ms)
Definition: