PostgreSQL Source Code  git master
xlogrecovery.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * xlogrecovery.c
4  * Functions for WAL recovery, standby mode
5  *
6  * This source file contains functions controlling WAL recovery.
7  * InitWalRecovery() initializes the system for crash or archive recovery,
8  * or standby mode, depending on configuration options and the state of
9  * the control file and possible backup label file. PerformWalRecovery()
10  * performs the actual WAL replay, calling the rmgr-specific redo routines.
11  * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12  * and prepares information needed to initialize the WAL for writes. In
13  * addition to these three main functions, there are a bunch of functions
14  * for interrogating recovery state and controlling the recovery process.
15  *
16  *
17  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
18  * Portions Copyright (c) 1994, Regents of the University of California
19  *
20  * src/backend/access/transam/xlogrecovery.c
21  *
22  *-------------------------------------------------------------------------
23  */
24 
25 #include "postgres.h"
26 
27 #include <ctype.h>
28 #include <math.h>
29 #include <time.h>
30 #include <sys/stat.h>
31 #include <sys/time.h>
32 #include <unistd.h>
33 
34 #include "access/timeline.h"
35 #include "access/transam.h"
36 #include "access/xact.h"
37 #include "access/xlog_internal.h"
38 #include "access/xlogarchive.h"
39 #include "access/xlogprefetcher.h"
40 #include "access/xlogreader.h"
41 #include "access/xlogrecovery.h"
42 #include "access/xlogutils.h"
43 #include "backup/basebackup.h"
44 #include "catalog/pg_control.h"
45 #include "commands/tablespace.h"
46 #include "common/file_utils.h"
47 #include "miscadmin.h"
48 #include "pgstat.h"
49 #include "postmaster/bgwriter.h"
50 #include "postmaster/startup.h"
51 #include "replication/slot.h"
52 #include "replication/slotsync.h"
54 #include "storage/fd.h"
55 #include "storage/ipc.h"
56 #include "storage/latch.h"
57 #include "storage/pmsignal.h"
58 #include "storage/procarray.h"
59 #include "storage/spin.h"
60 #include "utils/datetime.h"
61 #include "utils/fmgrprotos.h"
62 #include "utils/guc_hooks.h"
63 #include "utils/pg_lsn.h"
64 #include "utils/ps_status.h"
65 #include "utils/pg_rusage.h"
66 
67 /* Unsupported old recovery command file names (relative to $PGDATA) */
68 #define RECOVERY_COMMAND_FILE "recovery.conf"
69 #define RECOVERY_COMMAND_DONE "recovery.done"
70 
71 /*
72  * GUC support
73  */
75  {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
76  {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
77  {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
78  {NULL, 0, false}
79 };
80 
81 /* options formerly taken from recovery.conf for archive recovery */
83 char *recoveryEndCommand = NULL;
84 char *archiveCleanupCommand = NULL;
91 const char *recoveryTargetName;
94 
95 /* options formerly taken from recovery.conf for XLOG streaming */
96 char *PrimaryConnInfo = NULL;
97 char *PrimarySlotName = NULL;
99 
100 /*
101  * recoveryTargetTimeLineGoal: what the user requested, if any
102  *
103  * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
104  *
105  * recoveryTargetTLI: the currently understood target timeline; can change during recovery
106  *
107  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
108  * the timelines of its known parents, newest first (so recoveryTargetTLI is
109  * always the first list member). Only these TLIs are expected to be seen in
110  * the WAL segments we read, and indeed only these TLIs will be considered as
111  * candidate WAL files to open at all.
112  *
113  * curFileTLI: the TLI appearing in the name of the current input WAL file.
114  * (This is not necessarily the same as the timeline from which we are
115  * replaying WAL, which StartupXLOG calls replayTLI, because we could be
116  * scanning data that was copied from an ancestor timeline when the current
117  * file was created.) During a sequential scan we do not allow this value
118  * to decrease.
119  */
125 
126 /*
127  * When ArchiveRecoveryRequested is set, archive recovery was requested,
128  * i.e. signal files were present. When InArchiveRecovery is set, we are
129  * currently recovering using offline XLOG archives. These variables are only
130  * valid in the startup process.
131  *
132  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
133  * currently performing crash recovery using only XLOG files in pg_wal, but
134  * will switch to using offline XLOG archives as soon as we reach the end of
135  * WAL in pg_wal.
136  */
138 bool InArchiveRecovery = false;
139 
140 /*
141  * When StandbyModeRequested is set, standby mode was requested, i.e.
142  * standby.signal file was present. When StandbyMode is set, we are currently
143  * in standby mode. These variables are only valid in the startup process.
144  * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
145  */
146 static bool StandbyModeRequested = false;
147 bool StandbyMode = false;
148 
149 /* was a signal file present at startup? */
150 static bool standby_signal_file_found = false;
151 static bool recovery_signal_file_found = false;
152 
153 /*
154  * CheckPointLoc is the position of the checkpoint record that determines
155  * where to start the replay. It comes from the backup label file or the
156  * control file.
157  *
158  * RedoStartLSN is the checkpoint's REDO location, also from the backup label
159  * file or the control file. In standby mode, XLOG streaming usually starts
160  * from the position where an invalid record was found. But if we fail to
161  * read even the initial checkpoint record, we use the REDO location instead
162  * of the checkpoint location as the start position of XLOG streaming.
163  * Otherwise we would have to jump backwards to the REDO location after
164  * reading the checkpoint record, because the REDO record can precede the
165  * checkpoint record.
166  */
171 
172 /*
173  * Local copy of SharedHotStandbyActive variable. False actually means "not
174  * known, need to check the shared state".
175  */
176 static bool LocalHotStandbyActive = false;
177 
178 /*
179  * Local copy of SharedPromoteIsTriggered variable. False actually means "not
180  * known, need to check the shared state".
181  */
182 static bool LocalPromoteIsTriggered = false;
183 
184 /* Has the recovery code requested a walreceiver wakeup? */
186 
187 /* XLogReader object used to parse the WAL records */
189 
190 /* XLogPrefetcher object used to consume WAL records with read-ahead */
192 
193 /* Parameters passed down from ReadRecord to the XLogPageRead callback. */
194 typedef struct XLogPageReadPrivate
195 {
196  int emode;
197  bool fetching_ckpt; /* are we fetching a checkpoint record? */
201 
202 /* flag to tell XLogPageRead that we have started replaying */
203 static bool InRedo = false;
204 
205 /*
206  * Codes indicating where we got a WAL file from during recovery, or where
207  * to attempt to get one. Listed in the same order as xlogSourceNames[].
208  */
209 typedef enum
210 {
211  XLOG_FROM_ANY = 0, /* request to read WAL from any source */
212  XLOG_FROM_ARCHIVE, /* restored using restore_command */
213  XLOG_FROM_PG_WAL, /* existing file in pg_wal */
214  XLOG_FROM_STREAM, /* streamed from primary */
215 } XLogSource;
216 
217 /* human-readable names for XLogSources, in enum order, for debugging output */
218 static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
219 
220 /*
221  * readFile is -1 or a kernel FD for the log file segment that's currently
222  * open for reading. readSegNo identifies the segment. readOff is the offset
223  * of the page just read, readLen indicates how much of it has been read into
224  * readBuf, and readSource indicates where we got the currently open file from.
225  *
226  * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
227  * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
228  * worthwhile, since the XLOG is not read by general-purpose sessions.
229  */
230 static int readFile = -1;
231 static XLogSegNo readSegNo = 0;
232 static uint32 readOff = 0;
233 static uint32 readLen = 0;
235 
236 /*
237  * Keeps track of which source we're currently reading from. This is
238  * different from readSource in that this is always set, even when we don't
239  * currently have a WAL file open. If lastSourceFailed is set, our last
240  * attempt to read from currentSource failed, and we should try another source
241  * next.
242  *
243  * pendingWalRcvRestart is set when a config change occurs that requires a
244  * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
245  */
247 static bool lastSourceFailed = false;
248 static bool pendingWalRcvRestart = false;
249 
250 /*
251  * These variables track when we last obtained some WAL data to process,
252  * and where we got it from. (XLogReceiptSource is initially the same as
253  * readSource, but readSource gets reset to zero when we don't have data
254  * to process right now. It is also different from currentSource, which
255  * also changes when we try to read from a source and fail, while
256  * XLogReceiptSource tracks where we last successfully read some WAL.)
257  */
260 
261 /* Local copy of WalRcv->flushedUpto */
264 
265 /*
266  * Copy of minRecoveryPoint and backupEndPoint from the control file.
267  *
268  * In order to reach consistency, we must replay the WAL up to
269  * minRecoveryPoint. If backupEndRequired is true, we must also reach
270  * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
271  * to backupStartPoint.
272  *
273  * Note: In archive recovery, after consistency has been reached, the
274  * functions in xlog.c will start updating minRecoveryPoint in the control
275  * file. But this copy of minRecoveryPoint variable reflects the value at the
276  * beginning of recovery, and is *not* updated after consistency is reached.
277  */
280 
283 static bool backupEndRequired = false;
284 
285 /*
286  * Have we reached a consistent database state? In crash recovery, we have
287  * to replay all the WAL, so reachedConsistency is never set. During archive
288  * recovery, the database is consistent once minRecoveryPoint is reached.
289  *
290  * Consistent state means that the system is internally consistent, all
291  * the WAL has been replayed up to a certain point, and importantly, there
292  * is no trace of later actions on disk.
293  */
294 bool reachedConsistency = false;
295 
296 /* Buffers dedicated to consistency checks of size BLCKSZ */
297 static char *replay_image_masked = NULL;
298 static char *primary_image_masked = NULL;
299 
300 
301 /*
302  * Shared-memory state for WAL recovery.
303  */
304 typedef struct XLogRecoveryCtlData
305 {
306  /*
307  * SharedHotStandbyActive indicates if we allow hot standby queries to be
308  * run. Protected by info_lck.
309  */
311 
312  /*
313  * SharedPromoteIsTriggered indicates if a standby promotion has been
314  * triggered. Protected by info_lck.
315  */
317 
318  /*
319  * recoveryWakeupLatch is used to wake up the startup process to continue
320  * WAL replay, if it is waiting for WAL to arrive or promotion to be
321  * requested.
322  *
323  * Note that the startup process also uses another latch, its procLatch,
324  * to wait for recovery conflict. We could get rid of recoveryWakeupLatch
325  * for signaling the startup process in favor of using its procLatch, which
326  * would comport better with possible generic signal handlers using that
327  * latch. But we should not do that, because the startup process doesn't
328  * assume that it's woken up by the walreceiver process or the SIGHUP signal
329  * handler while it's waiting for recovery conflict. The separate latches,
330  * recoveryWakeupLatch and procLatch, should be used for inter-process
331  * communication for WAL replay and recovery conflict, respectively.
332  */
334 
335  /*
336  * Last record successfully replayed.
337  */
338  XLogRecPtr lastReplayedReadRecPtr; /* start position */
339  XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */
340  TimeLineID lastReplayedTLI; /* timeline */
341 
342  /*
343  * When we're currently replaying a record, i.e. in a redo function,
344  * replayEndRecPtr points to the end+1 of the record being replayed,
345  * otherwise it's equal to lastReplayedEndRecPtr.
346  */
349  /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
351 
352  /*
353  * timestamp of when we started replaying the current chunk of WAL data,
354  * only relevant for replication or archive recovery
355  */
357  /* Recovery pause state */
360 
361  slock_t info_lck; /* locks shared variables shown above */
363 
365 
366 /*
367  * abortedRecPtr is the start pointer of a broken record at end of WAL when
368  * recovery completes; missingContrecPtr is the location of the first
369  * contrecord that went missing. See CreateOverwriteContrecordRecord for
370  * details.
371  */
374 
375 /*
376  * if recoveryStopsBefore/After returns true, it saves information of the stop
377  * point here
378  */
383 static bool recoveryStopAfter;
384 
385 /* prototypes for local functions */
386 static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
387 
388 static void EnableStandbyMode(void);
389 static void readRecoverySignalFile(void);
390 static void validateRecoveryParameters(void);
391 static bool read_backup_label(XLogRecPtr *checkPointLoc,
392  TimeLineID *backupLabelTLI,
393  bool *backupEndRequired, bool *backupFromStandby);
394 static bool read_tablespace_map(List **tablespaces);
395 
396 static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
397 static void CheckRecoveryConsistency(void);
398 static void rm_redo_error_callback(void *arg);
399 #ifdef WAL_DEBUG
400 static void xlog_outrec(StringInfo buf, XLogReaderState *record);
401 #endif
402 static void xlog_block_info(StringInfo buf, XLogReaderState *record);
403 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
404  TimeLineID prevTLI, TimeLineID replayTLI);
405 static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
406 static void verifyBackupPageConsistency(XLogReaderState *record);
407 
408 static bool recoveryStopsBefore(XLogReaderState *record);
409 static bool recoveryStopsAfter(XLogReaderState *record);
410 static char *getRecoveryStopReason(void);
411 static void recoveryPausesHere(bool endOfRecovery);
412 static bool recoveryApplyDelay(XLogReaderState *record);
413 static void ConfirmRecoveryPaused(void);
414 
416  int emode, bool fetching_ckpt,
417  TimeLineID replayTLI);
418 
419 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
420  int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
422  bool randAccess,
423  bool fetching_ckpt,
424  XLogRecPtr tliRecPtr,
425  TimeLineID replayTLI,
426  XLogRecPtr replayLSN,
427  bool nonblocking);
428 static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
430  XLogRecPtr RecPtr, TimeLineID replayTLI);
431 static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
432 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
433  XLogSource source, bool notfoundOk);
434 static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source);
435 
436 static bool CheckForStandbyTrigger(void);
437 static void SetPromoteIsTriggered(void);
438 static bool HotStandbyActiveInReplay(void);
439 
440 static void SetCurrentChunkStartTime(TimestampTz xtime);
441 static void SetLatestXTime(TimestampTz xtime);
442 
443 /*
444  * Initialization of shared memory for WAL recovery
445  */
446 Size
448 {
449  Size size;
450 
451  /* XLogRecoveryCtl */
452  size = sizeof(XLogRecoveryCtlData);
453 
454  return size;
455 }
456 
457 void
459 {
460  bool found;
461 
463  ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
464  if (found)
465  return;
466  memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
467 
471 }
472 
473 /*
474  * A thin wrapper to enable StandbyMode and do other preparatory work as
475  * needed.
476  */
477 static void
479 {
480  StandbyMode = true;
481 
482  /*
483  * To avoid server log bloat, we don't report recovery progress in a
484  * standby as it will always be in recovery unless promoted. We disable
485  * startup progress timeout in standby mode to avoid calling
486  * startup_progress_timeout_handler() unnecessarily.
487  */
489 }
490 
491 /*
492  * Prepare the system for WAL recovery, if needed.
493  *
494  * This is called by StartupXLOG() which coordinates the server startup
495  * sequence. This function analyzes the control file and the backup label
496  * file, if any, and figures out whether we need to perform crash recovery or
497  * archive recovery, and how far we need to replay the WAL to reach a
498  * consistent state.
499  *
500  * This doesn't yet change the on-disk state, except for creating the symlinks
501  * from table space map file if any, and for fetching WAL files needed to find
502  * the checkpoint record. On entry, the caller has already read the control
503  * file into memory, and passes it as argument. This function updates it to
504  * reflect the recovery state, and the caller is expected to write it back to
505  * disk after initializing other subsystems, but before calling
506  * PerformWalRecovery().
507  *
508  * This initializes some global variables, such as ArchiveRecoveryRequested,
509  * StandbyModeRequested and InRecovery.
510  */
511 void
513  bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
514 {
515  XLogPageReadPrivate *private;
516  struct stat st;
517  bool wasShutdown;
518  XLogRecord *record;
519  DBState dbstate_at_startup;
520  bool haveTblspcMap = false;
521  bool haveBackupLabel = false;
522  CheckPoint checkPoint;
523  bool backupFromStandby = false;
524 
525  dbstate_at_startup = ControlFile->state;
526 
527  /*
528  * Initialize on the assumption we want to recover to the latest timeline
529  * that's active according to pg_control.
530  */
534  else
536 
537  /*
538  * Check for signal files, and if so set up state for offline recovery
539  */
542 
543  /*
544  * Take ownership of the wakeup latch if we're going to sleep during
545  * recovery.
546  */
549 
550  /*
551  * Set the WAL reading processor now, as it will be needed when reading
552  * the checkpoint record required (backup_label or not).
553  */
554  private = palloc0(sizeof(XLogPageReadPrivate));
555  xlogreader =
557  XL_ROUTINE(.page_read = &XLogPageRead,
558  .segment_open = NULL,
559  .segment_close = wal_segment_close),
560  private);
561  if (!xlogreader)
562  ereport(ERROR,
563  (errcode(ERRCODE_OUT_OF_MEMORY),
564  errmsg("out of memory"),
565  errdetail("Failed while allocating a WAL reading processor.")));
567 
568  /*
569  * Set the WAL decode buffer size. This limits how far ahead we can read
570  * in the WAL.
571  */
573 
574  /* Create a WAL prefetcher. */
576 
577  /*
578  * Allocate two page buffers dedicated to WAL consistency checks. We do
579  * it this way, rather than just making static arrays, for two reasons:
580  * (1) no need to waste the storage in most instantiations of the backend;
581  * (2) a static char array isn't guaranteed to have any particular
582  * alignment, whereas palloc() will provide MAXALIGN'd storage.
583  */
584  replay_image_masked = (char *) palloc(BLCKSZ);
585  primary_image_masked = (char *) palloc(BLCKSZ);
586 
587  /*
588  * Read the backup_label file. We want to run this part of the recovery
589  * process after checking for signal files and after performing validation
590  * of the recovery parameters.
591  */
593  &backupFromStandby))
594  {
595  List *tablespaces = NIL;
596 
597  /*
598  * Archive recovery was requested, and thanks to the backup label
599  * file, we know how far we need to replay to reach consistency. Enter
600  * archive recovery directly.
601  */
602  InArchiveRecovery = true;
605 
606  /*
607  * Omitting backup_label when creating a new replica, PITR node etc.
608  * unfortunately is a common cause of corruption. Logging that
609  * backup_label was used makes it a bit easier to exclude that as the
610  * cause of observed corruption.
611  *
612  * Do so before we try to read the checkpoint record (which can fail),
613  * as otherwise it can be hard to understand why a checkpoint other
614  * than ControlFile->checkPoint is used.
615  */
616  ereport(LOG,
617  (errmsg("starting backup recovery with redo LSN %X/%X, checkpoint LSN %X/%X, on timeline ID %u",
620  CheckPointTLI)));
621 
622  /*
623  * When a backup_label file is present, we want to roll forward from
624  * the checkpoint it identifies, rather than using pg_control.
625  */
627  CheckPointTLI);
628  if (record != NULL)
629  {
630  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
631  wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
632  ereport(DEBUG1,
633  (errmsg_internal("checkpoint record is at %X/%X",
635  InRecovery = true; /* force recovery even if SHUTDOWNED */
636 
637  /*
638  * Make sure that REDO location exists. This may not be the case
639  * if there was a crash during an online backup, which left a
640  * backup_label around that references a WAL segment that's
641  * already been archived.
642  */
643  if (checkPoint.redo < CheckPointLoc)
644  {
646  if (!ReadRecord(xlogprefetcher, LOG, false,
647  checkPoint.ThisTimeLineID))
648  ereport(FATAL,
649  (errmsg("could not find redo location %X/%X referenced by checkpoint record at %X/%X",
651  errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
652  "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
653  "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
655  }
656  }
657  else
658  {
659  ereport(FATAL,
660  (errmsg("could not locate required checkpoint record at %X/%X",
662  errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
663  "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
664  "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
666  wasShutdown = false; /* keep compiler quiet */
667  }
668 
669  /* Read the tablespace_map file if present and create symlinks. */
670  if (read_tablespace_map(&tablespaces))
671  {
672  ListCell *lc;
673 
674  foreach(lc, tablespaces)
675  {
676  tablespaceinfo *ti = lfirst(lc);
677  char *linkloc;
678 
679  linkloc = psprintf("pg_tblspc/%u", ti->oid);
680 
681  /*
682  * Remove the existing symlink, if any, and create the symlink
683  * under PGDATA.
684  */
685  remove_tablespace_symlink(linkloc);
686 
687  if (symlink(ti->path, linkloc) < 0)
688  ereport(ERROR,
690  errmsg("could not create symbolic link \"%s\": %m",
691  linkloc)));
692 
693  pfree(ti->path);
694  pfree(ti);
695  }
696 
697  /* tell the caller to delete it later */
698  haveTblspcMap = true;
699  }
700 
701  /* tell the caller to delete it later */
702  haveBackupLabel = true;
703  }
704  else
705  {
706  /* No backup_label file has been found if we are here. */
707 
708  /*
709  * If tablespace_map file is present without backup_label file, there
710  * is no use of such file. There is no harm in retaining it, but it
711  * is better to get rid of the map file so that we don't have any
712  * redundant file in data directory and it will avoid any sort of
713  * confusion. It seems prudent though to just rename the file out of
714  * the way rather than delete it completely, also we ignore any error
715  * that occurs in rename operation as even if map file is present
716  * without backup_label file, it is harmless.
717  */
718  if (stat(TABLESPACE_MAP, &st) == 0)
719  {
720  unlink(TABLESPACE_MAP_OLD);
722  ereport(LOG,
723  (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
725  errdetail("File \"%s\" was renamed to \"%s\".",
727  else
728  ereport(LOG,
729  (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
731  errdetail("Could not rename file \"%s\" to \"%s\": %m.",
733  }
734 
735  /*
736  * It's possible that archive recovery was requested, but we don't
737  * know how far we need to replay the WAL before we reach consistency.
738  * This can happen for example if a base backup is taken from a
739  * running server using an atomic filesystem snapshot, without calling
740  * pg_backup_start/stop. Or if you just kill a running primary server
741  * and put it into archive recovery by creating a recovery signal
742  * file.
743  *
744  * Our strategy in that case is to perform crash recovery first,
745  * replaying all the WAL present in pg_wal, and only enter archive
746  * recovery after that.
747  *
748  * But usually we already know how far we need to replay the WAL (up
749  * to minRecoveryPoint, up to backupEndPoint, or until we see an
750  * end-of-backup record), and we can enter archive recovery directly.
751  */
757  {
758  InArchiveRecovery = true;
761  }
762 
763  /*
764  * For the same reason as when starting up with backup_label present,
765  * emit a log message when we continue initializing from a base
766  * backup.
767  */
769  ereport(LOG,
770  (errmsg("restarting backup recovery with redo LSN %X/%X",
772 
773  /* Get the last valid checkpoint record. */
779  CheckPointTLI);
780  if (record != NULL)
781  {
782  ereport(DEBUG1,
783  (errmsg_internal("checkpoint record is at %X/%X",
785  }
786  else
787  {
788  /*
789  * We used to attempt to go back to a secondary checkpoint record
790  * here, but only when not in standby mode. We now just fail if we
791  * can't read the last checkpoint because this allows us to
792  * simplify processing around checkpoints.
793  */
794  ereport(PANIC,
795  (errmsg("could not locate a valid checkpoint record at %X/%X",
797  }
798  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
799  wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
800  }
801 
803  {
805  ereport(LOG,
806  (errmsg("entering standby mode")));
808  ereport(LOG,
809  (errmsg("starting point-in-time recovery to XID %u",
812  ereport(LOG,
813  (errmsg("starting point-in-time recovery to %s",
816  ereport(LOG,
817  (errmsg("starting point-in-time recovery to \"%s\"",
820  ereport(LOG,
821  (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
824  ereport(LOG,
825  (errmsg("starting point-in-time recovery to earliest consistent point")));
826  else
827  ereport(LOG,
828  (errmsg("starting archive recovery")));
829  }
830 
831  /*
832  * If the location of the checkpoint record is not on the expected
833  * timeline in the history of the requested timeline, we cannot proceed:
834  * the backup is not part of the history of the requested timeline.
835  */
836  Assert(expectedTLEs); /* was initialized by reading checkpoint
837  * record */
840  {
841  XLogRecPtr switchpoint;
842 
843  /*
844  * tliSwitchPoint will throw an error if the checkpoint's timeline is
845  * not in expectedTLEs at all.
846  */
848  ereport(FATAL,
849  (errmsg("requested timeline %u is not a child of this server's history",
851  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
854  LSN_FORMAT_ARGS(switchpoint))));
855  }
856 
857  /*
858  * The min recovery point should be part of the requested timeline's
859  * history, too.
860  */
864  ereport(FATAL,
865  (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
869 
870  ereport(DEBUG1,
871  (errmsg_internal("redo record is at %X/%X; shutdown %s",
872  LSN_FORMAT_ARGS(checkPoint.redo),
873  wasShutdown ? "true" : "false")));
874  ereport(DEBUG1,
875  (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
876  U64FromFullTransactionId(checkPoint.nextXid),
877  checkPoint.nextOid)));
878  ereport(DEBUG1,
879  (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
880  checkPoint.nextMulti, checkPoint.nextMultiOffset)));
881  ereport(DEBUG1,
882  (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
883  checkPoint.oldestXid, checkPoint.oldestXidDB)));
884  ereport(DEBUG1,
885  (errmsg_internal("oldest MultiXactId: %u, in database %u",
886  checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
887  ereport(DEBUG1,
888  (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
889  checkPoint.oldestCommitTsXid,
890  checkPoint.newestCommitTsXid)));
892  ereport(PANIC,
893  (errmsg("invalid next transaction ID")));
894 
895  /* sanity check */
896  if (checkPoint.redo > CheckPointLoc)
897  ereport(PANIC,
898  (errmsg("invalid redo in checkpoint record")));
899 
900  /*
901  * Check whether we need to force recovery from WAL. If it appears to
902  * have been a clean shutdown and we did not have a recovery signal file,
903  * then assume no recovery needed.
904  */
905  if (checkPoint.redo < CheckPointLoc)
906  {
907  if (wasShutdown)
908  ereport(PANIC,
909  (errmsg("invalid redo record in shutdown checkpoint")));
910  InRecovery = true;
911  }
912  else if (ControlFile->state != DB_SHUTDOWNED)
913  InRecovery = true;
914  else if (ArchiveRecoveryRequested)
915  {
916  /* force recovery due to presence of recovery signal file */
917  InRecovery = true;
918  }
919 
920  /*
921  * If recovery is needed, update our in-memory copy of pg_control to show
922  * that we are recovering and to show the selected checkpoint as the place
923  * we are starting from. We also mark pg_control with any minimum recovery
924  * stop point obtained from a backup history file.
925  *
926  * We don't write the changes to disk yet, though. Only do that after
927  * initializing various subsystems.
928  */
929  if (InRecovery)
930  {
931  if (InArchiveRecovery)
932  {
934  }
935  else
936  {
937  ereport(LOG,
938  (errmsg("database system was not properly shut down; "
939  "automatic recovery in progress")));
941  ereport(LOG,
942  (errmsg("crash recovery starts in timeline %u "
943  "and has target timeline %u",
947  }
949  ControlFile->checkPointCopy = checkPoint;
950  if (InArchiveRecovery)
951  {
952  /* initialize minRecoveryPoint if not set yet */
953  if (ControlFile->minRecoveryPoint < checkPoint.redo)
954  {
955  ControlFile->minRecoveryPoint = checkPoint.redo;
957  }
958  }
959 
960  /*
961  * Set backupStartPoint if we're starting recovery from a base backup.
962  *
963  * Also set backupEndPoint and use minRecoveryPoint as the backup end
964  * location if we're starting recovery from a base backup which was
965  * taken from a standby. In this case, the database system status in
966  * pg_control must indicate that the database was already in recovery.
967  * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
968  * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
969  * before reaching this point; e.g. because restore_command or
970  * primary_conninfo were faulty.
971  *
972  * Any other state indicates that the backup somehow became corrupted
973  * and we can't sensibly continue with recovery.
974  */
975  if (haveBackupLabel)
976  {
977  ControlFile->backupStartPoint = checkPoint.redo;
979 
980  if (backupFromStandby)
981  {
982  if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
983  dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
984  ereport(FATAL,
985  (errmsg("backup_label contains data inconsistent with control file"),
986  errhint("This means that the backup is corrupted and you will "
987  "have to use another backup for recovery.")));
989  }
990  }
991  }
992 
993  /* remember these, so that we know when we have reached consistency */
997  if (InArchiveRecovery)
998  {
1001  }
1002  else
1003  {
1005  minRecoveryPointTLI = 0;
1006  }
1007 
1008  /*
1009  * Start recovery assuming that the final record isn't lost.
1010  */
1013 
1014  *wasShutdown_ptr = wasShutdown;
1015  *haveBackupLabel_ptr = haveBackupLabel;
1016  *haveTblspcMap_ptr = haveTblspcMap;
1017 }
1018 
1019 /*
1020  * See if there are any recovery signal files and if so, set state for
1021  * recovery.
1022  *
1023  * See if there is a recovery command file (recovery.conf), and if so
1024  * report a FATAL error, since as of PG12 we no longer recognize that.
      *
      * Past the early return at the top, StandbyModeRequested and
      * ArchiveRecoveryRequested are first cleared and then set according to
      * which signal file was found; with neither file present both stay false
      * and the function returns before the single-user check at the bottom.
      *
      * NOTE(review): numbered lines 1027, 1031, 1039, 1041, 1060, 1067, 1073,
      * 1080, 1085 and 1102 are absent from this listing, so the declarator, the
      * guard of the early return, the start of the calls whose result "fd" is
      * tested, and the conditions at 1085 and 1102 cannot be checked here.
1025  */
1026 static void
1028 {
1029  struct stat stat_buf;
1030 
1032  return;
1033 
1034  /*
1035  * Check for old recovery API file: recovery.conf
1036  */
1037  if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
1038  ereport(FATAL,
1040  errmsg("using recovery command file \"%s\" is not supported",
1042 
1043  /*
1044  * Remove unused .done file, if present. Ignore if absent.
1045  */
1046  unlink(RECOVERY_COMMAND_DONE);
1047 
1048  /*
1049  * Check for recovery signal files and if found, fsync them since they
1050  * represent server state information. We don't sweat too much about the
1051  * possibility of fsync failure, however.
1052  *
1053  * If present, standby signal file takes precedence. If neither is present
1054  * then we won't enter archive recovery.
1055  */
1056  if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1057  {
1058  int fd;
1059 
1061  S_IRUSR | S_IWUSR);
1062  if (fd >= 0)
1063  {
      /* Best effort only: the fsync result is deliberately discarded. */
1064  (void) pg_fsync(fd);
1065  close(fd);
1066  }
1068  }
1069  else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1070  {
1071  int fd;
1072 
1074  S_IRUSR | S_IWUSR);
1075  if (fd >= 0)
1076  {
      /* Best effort only: the fsync result is deliberately discarded. */
1077  (void) pg_fsync(fd);
1078  close(fd);
1079  }
1081  }
1082 
      /* Start from a clean slate, then derive both flags from what was found. */
1083  StandbyModeRequested = false;
1084  ArchiveRecoveryRequested = false;
1086  {
1087  StandbyModeRequested = true;
1088  ArchiveRecoveryRequested = true;
1089  }
1090  else if (recovery_signal_file_found)
1091  {
1092  StandbyModeRequested = false;
1093  ArchiveRecoveryRequested = true;
1094  }
1095  else
1096  return;
1097 
1098  /*
1099  * We don't support standby mode in standalone backends; that requires
1100  * other processes such as the WAL receiver to be alive.
1101  */
1103  ereport(FATAL,
1104  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1105  errmsg("standby mode is not supported by single-user servers")));
1106 }
1107 
/*
 * Sanity-check recovery-related settings and settle the recovery target
 * timeline.
 *
 * As far as this listing shows: one branch only WARNs when neither
 * primary_conninfo nor restore_command is set, the other branch reports
 * FATAL when restore_command is missing; a requested timeline "rtli" other
 * than 1 must have a timeline history file, otherwise FATAL is reported.
 *
 * NOTE(review): numbered lines 1109, 1111, 1117, 1139-1141, 1147, 1149-1151,
 * 1161, 1163, 1173, 1176 and 1184 are absent from this listing, so the
 * declarator and the conditions selecting each branch are not visible.
 * Presumably the first branch is the standby-mode case, since the FATAL
 * message in the else branch says "when standby mode is not enabled" --
 * confirm against the complete source.
 */
1108 static void
1110 {
1112  return;
1113 
1114  /*
1115  * Check for compulsory parameters
1116  */
1118  {
1119  if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1120  (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1121  ereport(WARNING,
1122  (errmsg("specified neither primary_conninfo nor restore_command"),
1123  errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1124  }
1125  else
1126  {
1127  if (recoveryRestoreCommand == NULL ||
1128  strcmp(recoveryRestoreCommand, "") == 0)
1129  ereport(FATAL,
1130  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1131  errmsg("must specify restore_command when standby mode is not enabled")));
1132  }
1133 
1134  /*
1135  * Override any inconsistent requests. Note that this is a change of
1136  * behaviour in 9.5; prior to this we simply ignored a request to pause if
1137  * hot_standby = off, which was surprising behaviour.
1138  */
1142 
1143  /*
1144  * Final parsing of recovery_target_time string; see also
1145  * check_recovery_target_time().
1146  */
1148  {
1152  Int32GetDatum(-1)));
1153  }
1154 
1155  /*
1156  * If user specified recovery_target_timeline, validate it or compute the
1157  * "latest" value. We can't do this until after we've gotten the restore
1158  * command and set InArchiveRecovery, because we need to fetch timeline
1159  * history files from the archive.
1160  */
1162  {
1164 
1165  /* Timeline 1 does not have a history file, all else should */
1166  if (rtli != 1 && !existsTimeLineHistory(rtli))
1167  ereport(FATAL,
1168  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1169  errmsg("recovery target timeline %u does not exist",
1170  rtli)));
1171  recoveryTargetTLI = rtli;
1172  }
1174  {
1175  /* We start the "latest" search from pg_control's timeline */
1177  }
1178  else
1179  {
1180  /*
1181  * else we just use the recoveryTargetTLI as already read from
1182  * ControlFile
1183  */
1185  }
1186 }
1187 
1188 /*
1189  * read_backup_label: check to see if a backup_label file is present
1190  *
1191  * If we see a backup_label during recovery, we assume that we are recovering
1192  * from a backup dump file, and we therefore roll forward from the checkpoint
1193  * identified by the label file, NOT what pg_control says. This avoids the
1194  * problem that pg_control might have been archived one or more checkpoints
1195  * later than the start of the dump, and so if we rely on it as the start
1196  * point, we will fail to restore a consistent database state.
1197  *
1198  * Returns true if a backup_label was found (and fills the checkpoint
1199  * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1200  * returns false if not. If this backup_label came from a streamed backup,
1201  * *backupEndRequired is set to true. If this backup_label was created during
1202  * recovery, *backupFromStandby is set to true.
1203  *
1204  * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1205  * and TLI read from the backup file.
1206  */
1207 static bool
1208 read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1209  bool *backupEndRequired, bool *backupFromStandby)
1210 {
1211  char startxlogfilename[MAXFNAMELEN];
1212  TimeLineID tli_from_walseg,
1213  tli_from_file;
1214  FILE *lfp;
1215  char ch;
1216  char backuptype[20];
1217  char backupfrom[20];
1218  char backuplabel[MAXPGPATH];
1219  char backuptime[128];
1220  uint32 hi,
1221  lo;
1222 
1223  /* suppress possible uninitialized-variable warnings */
1224  *checkPointLoc = InvalidXLogRecPtr;
1225  *backupLabelTLI = 0;
1226  *backupEndRequired = false;
1227  *backupFromStandby = false;
1228 
1229  /*
1230  * See if label file is present
1231  */
1232  lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1233  if (!lfp)
1234  {
1235  if (errno != ENOENT)
1236  ereport(FATAL,
1238  errmsg("could not read file \"%s\": %m",
1239  BACKUP_LABEL_FILE)));
1240  return false; /* it's not there, all is fine */
1241  }
1242 
1243  /*
1244  * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1245  * is pretty crude, but we are not expecting any variability in the file
1246  * format).
1247  */
1248  if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
1249  &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1250  ereport(FATAL,
1251  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1252  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1253  RedoStartLSN = ((uint64) hi) << 32 | lo;
1254  RedoStartTLI = tli_from_walseg;
1255  if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
1256  &hi, &lo, &ch) != 3 || ch != '\n')
1257  ereport(FATAL,
1258  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1259  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1260  *checkPointLoc = ((uint64) hi) << 32 | lo;
1261  *backupLabelTLI = tli_from_walseg;
1262 
1263  /*
1264  * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1265  * which could mean either pg_basebackup or the pg_backup_start/stop
1266  * method was used) or if this label came from somewhere else (the only
1267  * other option today being from pg_rewind). If this was a streamed
1268  * backup then we know that we need to play through until we get to the
1269  * end of the WAL which was generated during the backup (at which point we
1270  * will have reached consistency and backupEndRequired will be reset to be
1271  * false).
1272  */
1273  if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1274  {
1275  if (strcmp(backuptype, "streamed") == 0)
1276  *backupEndRequired = true;
1277  }
1278 
1279  /*
1280  * BACKUP FROM lets us know if this was from a primary or a standby. If
1281  * it was from a standby, we'll double-check that the control file state
1282  * matches that of a standby.
1283  */
1284  if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1285  {
1286  if (strcmp(backupfrom, "standby") == 0)
1287  *backupFromStandby = true;
1288  }
1289 
1290  /*
1291  * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1292  * but checking for their presence is useful for debugging and the next
1293  * sanity checks. Cope also with the fact that the result buffers have a
1294  * pre-allocated size, hence if the backup_label file has been generated
1295  * with strings longer than the maximum assumed here an incorrect parsing
1296  * happens. That's fine as only minor consistency checks are done
1297  * afterwards.
1298  */
1299  if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1300  ereport(DEBUG1,
1301  (errmsg_internal("backup time %s in file \"%s\"",
1302  backuptime, BACKUP_LABEL_FILE)));
1303 
1304  if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1305  ereport(DEBUG1,
1306  (errmsg_internal("backup label %s in file \"%s\"",
1307  backuplabel, BACKUP_LABEL_FILE)));
1308 
1309  /*
1310  * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1311  * it as a sanity check if present.
1312  */
1313  if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1314  {
1315  if (tli_from_walseg != tli_from_file)
1316  ereport(FATAL,
1317  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1318  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1319  errdetail("Timeline ID parsed is %u, but expected %u.",
1320  tli_from_file, tli_from_walseg)));
1321 
1322  ereport(DEBUG1,
1323  (errmsg_internal("backup timeline %u in file \"%s\"",
1324  tli_from_file, BACKUP_LABEL_FILE)));
1325  }
1326 
1327  if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%X\n", &hi, &lo) > 0)
1328  ereport(FATAL,
1329  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1330  errmsg("this is an incremental backup, not a data directory"),
1331  errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1332 
1333  if (ferror(lfp) || FreeFile(lfp))
1334  ereport(FATAL,
1336  errmsg("could not read file \"%s\": %m",
1337  BACKUP_LABEL_FILE)));
1338 
1339  return true;
1340 }
1341 
1342 /*
1343  * read_tablespace_map: check to see if a tablespace_map file is present
1344  *
1345  * If we see a tablespace_map file during recovery, we assume that we are
1346  * recovering from a backup dump file, and we therefore need to create symlinks
1347  * as per the information present in tablespace_map file.
1348  *
1349  * Returns true if a tablespace_map file was found (and fills *tablespaces
1350  * with a tablespaceinfo struct for each tablespace listed in the file);
1351  * returns false if not.
1352  */
1353 static bool
1355 {
1356  tablespaceinfo *ti;
1357  FILE *lfp;
1358  char str[MAXPGPATH];
1359  int ch,
1360  i,
1361  n;
1362  bool was_backslash;
1363 
1364  /*
1365  * See if tablespace_map file is present
1366  */
1367  lfp = AllocateFile(TABLESPACE_MAP, "r");
1368  if (!lfp)
1369  {
1370  if (errno != ENOENT)
1371  ereport(FATAL,
1373  errmsg("could not read file \"%s\": %m",
1374  TABLESPACE_MAP)));
1375  return false; /* it's not there, all is fine */
1376  }
1377 
1378  /*
1379  * Read and parse the link name and path lines from tablespace_map file
1380  * (this code is pretty crude, but we are not expecting any variability in
1381  * the file format). De-escape any backslashes that were inserted.
1382  */
1383  i = 0;
1384  was_backslash = false;
1385  while ((ch = fgetc(lfp)) != EOF)
1386  {
1387  if (!was_backslash && (ch == '\n' || ch == '\r'))
1388  {
1389  char *endp;
1390 
1391  if (i == 0)
1392  continue; /* \r immediately followed by \n */
1393 
1394  /*
1395  * The de-escaped line should contain an OID followed by exactly
1396  * one space followed by a path. The path might start with
1397  * spaces, so don't be too liberal about parsing.
1398  */
1399  str[i] = '\0';
1400  n = 0;
1401  while (str[n] && str[n] != ' ')
1402  n++;
1403  if (n < 1 || n >= i - 1)
1404  ereport(FATAL,
1405  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1406  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1407  str[n++] = '\0';
1408 
1409  ti = palloc0(sizeof(tablespaceinfo));
1410  errno = 0;
1411  ti->oid = strtoul(str, &endp, 10);
1412  if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1413  ereport(FATAL,
1414  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1415  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1416  ti->path = pstrdup(str + n);
1417  *tablespaces = lappend(*tablespaces, ti);
1418 
1419  i = 0;
1420  continue;
1421  }
1422  else if (!was_backslash && ch == '\\')
1423  was_backslash = true;
1424  else
1425  {
1426  if (i < sizeof(str) - 1)
1427  str[i++] = ch;
1428  was_backslash = false;
1429  }
1430  }
1431 
1432  if (i != 0 || was_backslash) /* last line not terminated? */
1433  ereport(FATAL,
1434  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1435  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1436 
1437  if (ferror(lfp) || FreeFile(lfp))
1438  ereport(FATAL,
1440  errmsg("could not read file \"%s\": %m",
1441  TABLESPACE_MAP)));
1442 
1443  return true;
1444 }
1445 
1446 /*
1447  * Finish WAL recovery.
1448  *
1449  * This does not close the 'xlogreader' yet, because in some cases the caller
1450  * still wants to re-read the last checkpoint record by calling
1451  * ReadCheckpointRecord().
1452  *
1453  * Returns the position of the last valid or applied record, after which new
1454  * WAL should be appended, information about why recovery was ended, and some
1455  * other things. See the EndOfWalRecoveryInfo struct for details.
      *
      * NOTE(review): numbered lines 1457, 1458, 1460, 1471, 1519, 1522, 1535,
      * 1543, 1589, 1596, 1598 and 1599 are absent from this listing, so the
      * declarator, the setup of "result", the assignment of lastRec in the
      * in-recovery branch and a few other statements cannot be checked here.
1456  */
1459 {
1461  XLogRecPtr lastRec;
1462  TimeLineID lastRecTLI;
1463  XLogRecPtr endOfLog;
1464 
1465  /*
1466  * Kill WAL receiver, if it's still running, before we continue to write
1467  * the startup checkpoint and aborted-contrecord records. It will trump
1468  * over these records and subsequent ones if it's still alive when we
1469  * start writing WAL.
1470  */
1472 
1473  /*
1474  * Shutdown the slot sync worker to drop any temporary slots acquired by
1475  * it and to prevent it from keep trying to fetch the failover slots.
1476  *
1477  * We do not update the 'synced' column in 'pg_replication_slots' system
1478  * view from true to false here, as any failed update could leave 'synced'
1479  * column false for some slots. This could cause issues during slot sync
1480  * after restarting the server as a standby. While updating the 'synced'
1481  * column after switching to the new timeline is an option, it does not
1482  * simplify the handling for the 'synced' column. Therefore, we retain the
1483  * 'synced' column as true after promotion as it may provide useful
1484  * information about the slot origin.
1485  */
1486  ShutDownSlotSync();
1487 
1488  /*
1489  * We are now done reading the xlog from stream. Turn off streaming
1490  * recovery to force fetching the files (which would be required at end of
1491  * recovery, e.g., timeline history file) from archive or pg_wal.
1492  *
1493  * Note that standby mode must be turned off after killing WAL receiver,
1494  * i.e., calling XLogShutdownWalRcv().
1495  */
1496  Assert(!WalRcvStreaming());
1497  StandbyMode = false;
1498 
1499  /*
1500  * Determine where to start writing WAL next.
1501  *
1502  * Re-fetch the last valid or last applied record, so we can identify the
1503  * exact endpoint of what we consider the valid portion of WAL. There may
1504  * be an incomplete continuation record after that, in which case
1505  * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1506  * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1507  * it is intentionally missing. See CreateOverwriteContrecordRecord().
1508  *
1509  * An important side-effect of this is to load the last page into
1510  * xlogreader. The caller uses it to initialize the WAL for writing.
1511  */
1512  if (!InRecovery)
1513  {
1514  lastRec = CheckPointLoc;
1515  lastRecTLI = CheckPointTLI;
1516  }
1517  else
1518  {
1520  lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1521  }
      /* The record itself is not needed, only the reader state it leaves behind. */
1523  (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1524  endOfLog = xlogreader->EndRecPtr;
1525 
1526  /*
1527  * Remember the TLI in the filename of the XLOG segment containing the
1528  * end-of-log. It could be different from the timeline that endOfLog
1529  * nominally belongs to, if there was a timeline switch in that segment,
1530  * and we were reading the old WAL from a segment belonging to a higher
1531  * timeline.
1532  */
1533  result->endOfLogTLI = xlogreader->seg.ws_tli;
1534 
1536  {
1537  /*
1538  * We are no longer in archive recovery state.
1539  *
1540  * We are now done reading the old WAL. Turn off archive fetching if
1541  * it was active.
1542  */
1544  InArchiveRecovery = false;
1545 
1546  /*
1547  * If the ending log segment is still open, close it (to avoid
1548  * problems on Windows with trying to rename or delete an open file).
1549  */
1550  if (readFile >= 0)
1551  {
1552  close(readFile);
1553  readFile = -1;
1554  }
1555  }
1556 
1557  /*
1558  * Copy the last partial block to the caller, for initializing the WAL
1559  * buffer for appending new WAL.
1560  */
1561  if (endOfLog % XLOG_BLCKSZ != 0)
1562  {
1563  char *page;
1564  int len;
1565  XLogRecPtr pageBeginPtr;
1566 
      /* Round endOfLog down to the start of its XLOG_BLCKSZ-sized page. */
1567  pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1568  Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
1569 
1570  /* Copy the valid part of the last block */
      /* Only the bytes up to endOfLog are copied, so the buffer is len bytes, not a full page. */
1571  len = endOfLog % XLOG_BLCKSZ;
1572  page = palloc(len);
1573  memcpy(page, xlogreader->readBuf, len);
1574 
1575  result->lastPageBeginPtr = pageBeginPtr;
1576  result->lastPage = page;
1577  }
1578  else
1579  {
1580  /* There is no partial block to copy. */
1581  result->lastPageBeginPtr = endOfLog;
1582  result->lastPage = NULL;
1583  }
1584 
1585  /*
1586  * Create a comment for the history file to explain why and where timeline
1587  * changed.
1588  */
1590 
1591  result->lastRec = lastRec;
1592  result->lastRecTLI = lastRecTLI;
1593  result->endOfLog = endOfLog;
1594 
1595  result->abortedRecPtr = abortedRecPtr;
1597 
1600 
1601  return result;
1602 }
1603 
1604 /*
1605  * Clean up the WAL reader and leftovers from restoring WAL from archive
      *
      * Closes the currently open WAL file descriptor (readFile), if any, and,
      * inside the guarded branch, removes XLOGDIR "/RECOVERYXLOG" and
      * XLOGDIR "/RECOVERYHISTORY"; unlink failures are ignored on purpose.
      *
      * NOTE(review): numbered lines 1608, 1613, 1621, 1622, 1624, 1642 and 1643
      * are absent from this listing, so the declarator, the prefetcher and
      * reader teardown calls, the condition guarding the unlink branch and the
      * latch handling announced by the last comment cannot be checked here.
1606  */
1607 void
1609 {
1610  char recoveryPath[MAXPGPATH];
1611 
1612  /* Final update of pg_stat_recovery_prefetch. */
1614 
1615  /* Shut down xlogreader */
1616  if (readFile >= 0)
1617  {
1618  close(readFile);
1619  readFile = -1;
1620  }
1623 
1625  {
1626  /*
1627  * Since there might be a partial WAL segment named RECOVERYXLOG, get
1628  * rid of it.
1629  */
1630  snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1631  unlink(recoveryPath); /* ignore any error */
1632 
1633  /* Get rid of any remaining recovered timeline-history file, too */
1634  snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1635  unlink(recoveryPath); /* ignore any error */
1636  }
1637 
1638  /*
1639  * We don't need the latch anymore. It's not strictly necessary to disown
1640  * it, but let's do it for the sake of tidiness.
1641  */
1644 }
1645 
1646 /*
1647  * Perform WAL recovery.
1648  *
1649  * If the system was shut down cleanly, this is never called.
      *
      * Reads the first record to replay, then applies records one by one with
      * ApplyWalRecord() until ReadRecord() returns NULL or a recovery target
      * is reached. Reports FATAL if the target is reached before consistency,
      * or (final check) if recovery ended without reaching a configured target.
      *
      * NOTE(review): numbered lines 1652, 1663, 1664, 1666-1668, 1672-1674,
      * 1676-1681, 1684, 1691, 1696, 1702, 1706, 1718, 1723, 1741, 1745, 1754,
      * 1759, 1763, 1764, 1767, 1774, 1790, 1796, 1806, 1815, 1825, 1853, 1861,
      * 1867, 1876, 1897 and 1898 are absent from this listing, so the
      * declarator, the shared-state initialization, several conditions, the
      * switch case labels and the arguments of some log calls cannot be
      * checked here.
1650  */
1651 void
1653 {
1654  XLogRecord *record;
1655  bool reachedRecoveryTarget = false;
1656  TimeLineID replayTLI;
1657 
1658  /*
1659  * Initialize shared variables for tracking progress of WAL replay, as if
1660  * we had just replayed the record before the REDO location (or the
1661  * checkpoint record itself, if it's a shutdown checkpoint).
1662  */
1665  {
1669  }
1670  else
1671  {
1675  }
1682 
1683  /* Also ensure XLogReceiptTime has a sane value */
1685 
1686  /*
1687  * Let postmaster know we've started redo now, so that it can launch the
1688  * archiver if necessary.
1689  */
1690  if (IsUnderPostmaster)
1692 
1693  /*
1694  * Allow read-only connections immediately if we're consistent already.
1695  */
1697 
1698  /*
1699  * Find the first record that logically follows the checkpoint --- it
1700  * might physically precede it, though.
1701  */
1703  {
1704  /* back up to find the record */
1705  replayTLI = RedoStartTLI;
      /* PANIC emode: failing to read the record at the redo point is not survivable. */
1707  record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1708 
1709  /*
1710  * If a checkpoint record's redo pointer points back to an earlier
1711  * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1712  * record.
1713  */
1714  if (record->xl_rmid != RM_XLOG_ID ||
1715  (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
1716  ereport(FATAL,
1717  (errmsg("unexpected record type found at redo point %X/%X",
1719  }
1720  else
1721  {
1722  /* just have to read next record after CheckPoint */
1724  replayTLI = CheckPointTLI;
      /* LOG emode: a NULL result here just means there is nothing to replay. */
1725  record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1726  }
1727 
1728  if (record != NULL)
1729  {
1730  TimestampTz xtime;
1731  PGRUsage ru0;
1732 
1733  pg_rusage_init(&ru0);
1734 
1735  InRedo = true;
1736 
1737  RmgrStartup();
1738 
1739  ereport(LOG,
1740  (errmsg("redo starts at %X/%X",
1742 
1743  /* Prepare to report progress of the redo phase. */
1744  if (!StandbyMode)
1746 
1747  /*
1748  * main redo apply loop
1749  */
1750  do
1751  {
1752  if (!StandbyMode)
1753  ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
1755 
1756 #ifdef WAL_DEBUG
1757  if (XLOG_DEBUG)
1758  {
1760 
1761  initStringInfo(&buf);
1762  appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
1765  xlog_outrec(&buf, xlogreader);
1766  appendStringInfoString(&buf, " - ");
1768  elog(LOG, "%s", buf.data);
1769  pfree(buf.data);
1770  }
1771 #endif
1772 
1773  /* Handle interrupt signals of startup process */
1775 
1776  /*
1777  * Pause WAL replay, if requested by a hot-standby session via
1778  * SetRecoveryPause().
1779  *
1780  * Note that we intentionally don't take the info_lck spinlock
1781  * here. We might therefore read a slightly stale value of the
1782  * recoveryPause flag, but it can't be very stale (no worse than
1783  * the last spinlock we did acquire). Since a pause request is a
1784  * pretty asynchronous thing anyway, possibly responding to it one
1785  * WAL record later than we otherwise would is a minor issue, so
1786  * it doesn't seem worth adding another spinlock cycle to prevent
1787  * that.
1788  */
1789  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1791  recoveryPausesHere(false);
1792 
1793  /*
1794  * Have we reached our recovery target?
1795  */
1797  {
1798  reachedRecoveryTarget = true;
1799  break;
1800  }
1801 
1802  /*
1803  * If we've been asked to lag the primary, wait on latch until
1804  * enough time has passed.
1805  */
1807  {
1808  /*
1809  * We test for paused recovery again here. If user sets
1810  * delayed apply, it may be because they expect to pause
1811  * recovery in case of problems, so we must test again here
1812  * otherwise pausing during the delay-wait wouldn't work.
1813  */
1814  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1816  recoveryPausesHere(false);
1817  }
1818 
1819  /*
1820  * Apply the record
1821  */
      /* replayTLI is passed by address and may be changed by the call. */
1822  ApplyWalRecord(xlogreader, record, &replayTLI);
1823 
1824  /* Exit loop if we reached inclusive recovery target */
1826  {
1827  reachedRecoveryTarget = true;
1828  break;
1829  }
1830 
1831  /* Else, try to fetch the next WAL record */
1832  record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1833  } while (record != NULL);
1834 
1835  /*
1836  * end of main redo apply loop
1837  */
1838 
1839  if (reachedRecoveryTarget)
1840  {
1841  if (!reachedConsistency)
1842  ereport(FATAL,
1843  (errmsg("requested recovery stop point is before consistent recovery point")));
1844 
1845  /*
1846  * This is the last point where we can restart recovery with a new
1847  * recovery target, if we shutdown and begin again. After this,
1848  * Resource Managers may choose to do permanent corrective actions
1849  * at end of recovery.
1850  */
      /*
       * NOTE(review): the three case labels of this switch (numbered lines
       * 1853, 1861 and 1867) are missing from the listing; only the bodies
       * remain. Judging by the "drop into promote" comment, the pause body
       * presumably falls through into the last case -- confirm.
       */
1851  switch (recoveryTargetAction)
1852  {
1854 
1855  /*
1856  * exit with special return code to request shutdown of
1857  * postmaster. Log messages issued from postmaster.
1858  */
1859  proc_exit(3);
1860 
1862  SetRecoveryPause(true);
1863  recoveryPausesHere(true);
1864 
1865  /* drop into promote */
1866 
1868  break;
1869  }
1870  }
1871 
1872  RmgrCleanup();
1873 
1874  ereport(LOG,
1875  (errmsg("redo done at %X/%X system usage: %s",
1877  pg_rusage_show(&ru0))));
1878  xtime = GetLatestXTime();
      /* A zero timestamp is treated as "no value" and suppresses the message. */
1879  if (xtime)
1880  ereport(LOG,
1881  (errmsg("last completed transaction was at log time %s",
1882  timestamptz_to_str(xtime))));
1883 
1884  InRedo = false;
1885  }
1886  else
1887  {
1888  /* there are no WAL records following the checkpoint */
1889  ereport(LOG,
1890  (errmsg("redo is not required")));
1891  }
1892 
1893  /*
1894  * This check is intentionally after the above log messages that indicate
1895  * how far recovery went.
1896  */
1899  !reachedRecoveryTarget)
1900  ereport(FATAL,
1901  (errmsg("recovery ended before configured recovery target was reached")));
1902 }
1903 
1904 /*
1905  * Subroutine of PerformWalRecovery, to apply one WAL record.
      *
      * *replayTLI is updated in place when the record is a shutdown checkpoint
      * or end-of-recovery record naming a different timeline; the new value is
      * what gets stored in the shared replayEndTLI / lastReplayedTLI fields.
      * An error-context callback is pushed before the redo calls and popped
      * right after them.
      *
      * NOTE(review): numbered lines 1908, 1922, 1958, 1971, 1972, 1974, 1979,
      * 1981, 1999, 2008-2010, 2012, 2036, 2044, 2051, 2060 and 2063 are absent
      * from this listing, so the declarator, the locking around the shared
      * updates and several calls and conditions cannot be checked here.
1906  */
1907 static void
1909 {
1910  ErrorContextCallback errcallback;
1911  bool switchedTLI = false;
1912 
1913  /* Setup error traceback support for ereport() */
1914  errcallback.callback = rm_redo_error_callback;
1915  errcallback.arg = (void *) xlogreader;
1916  errcallback.previous = error_context_stack;
1917  error_context_stack = &errcallback;
1918 
1919  /*
1920  * TransamVariables->nextXid must be beyond record's xid.
1921  */
1923 
1924  /*
1925  * Before replaying this record, check if this record causes the current
1926  * timeline to change. The record is already considered to be part of the
1927  * new timeline, so we update replayTLI before replaying it. That's
1928  * important so that replayEndTLI, which is recorded as the minimum
1929  * recovery point's TLI if recovery stops after this record, is set
1930  * correctly.
1931  */
1932  if (record->xl_rmid == RM_XLOG_ID)
1933  {
      /* Both default to the current TLI, i.e. "no switch" for other record types. */
1934  TimeLineID newReplayTLI = *replayTLI;
1935  TimeLineID prevReplayTLI = *replayTLI;
1936  uint8 info = record->xl_info & ~XLR_INFO_MASK;
1937 
1938  if (info == XLOG_CHECKPOINT_SHUTDOWN)
1939  {
1940  CheckPoint checkPoint;
1941 
      /* Copy the payload into a local struct rather than reading it in place. */
1942  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1943  newReplayTLI = checkPoint.ThisTimeLineID;
1944  prevReplayTLI = checkPoint.PrevTimeLineID;
1945  }
1946  else if (info == XLOG_END_OF_RECOVERY)
1947  {
1948  xl_end_of_recovery xlrec;
1949 
1950  memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1951  newReplayTLI = xlrec.ThisTimeLineID;
1952  prevReplayTLI = xlrec.PrevTimeLineID;
1953  }
1954 
1955  if (newReplayTLI != *replayTLI)
1956  {
1957  /* Check that it's OK to switch to this TLI */
1959  newReplayTLI, prevReplayTLI, *replayTLI);
1960 
1961  /* Following WAL records should be run with new TLI */
1962  *replayTLI = newReplayTLI;
1963  switchedTLI = true;
1964  }
1965  }
1966 
1967  /*
1968  * Update shared replayEndRecPtr before replaying this record, so that
1969  * XLogFlush will update minRecoveryPoint correctly.
1970  */
1973  XLogRecoveryCtl->replayEndTLI = *replayTLI;
1975 
1976  /*
1977  * If we are attempting to enter Hot Standby mode, process XIDs we see
1978  */
1980  TransactionIdIsValid(record->xl_xid))
1982 
1983  /*
1984  * Some XLOG record types that are related to recovery are processed
1985  * directly here, rather than in xlog_redo()
1986  */
1987  if (record->xl_rmid == RM_XLOG_ID)
1988  xlogrecovery_redo(xlogreader, *replayTLI);
1989 
1990  /* Now apply the WAL record itself */
      /* Runs for every record, including RM_XLOG_ID ones already seen by xlogrecovery_redo(). */
1991  GetRmgr(record->xl_rmid).rm_redo(xlogreader);
1992 
1993  /*
1994  * After redo, check whether the backup pages associated with the WAL
1995  * record are consistent with the existing pages. This check is done only
1996  * if consistency check is enabled for this record.
1997  */
1998  if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
2000 
2001  /* Pop the error context stack */
2002  error_context_stack = errcallback.previous;
2003 
2004  /*
2005  * Update lastReplayedEndRecPtr after this record has been successfully
2006  * replayed.
2007  */
2011  XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
2013 
2014  /* ------
2015  * Wakeup walsenders:
2016  *
2017  * On the standby, the WAL is flushed first (which will only wake up
2018  * physical walsenders) and then applied, which will only wake up logical
2019  * walsenders.
2020  *
2021  * Indeed, logical walsenders on standby can't decode and send data until
2022  * it's been applied.
2023  *
2024  * Physical walsenders don't need to be woken up during replay unless
2025  * cascading replication is allowed and time line change occurred (so that
2026  * they can notice that they are on a new time line).
2027  *
2028  * That's why the wake up conditions are for:
2029  *
2030  * - physical walsenders in case of new time line and cascade
2031  * replication is allowed
2032  * - logical walsenders in case cascade replication is allowed (could not
2033  * be created otherwise)
2034  * ------
2035  */
2037  WalSndWakeup(switchedTLI, true);
2038 
2039  /*
2040  * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2041  * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2042  * a reply to the primary.
2043  */
2045  {
      /* Clear the request before acting on it, so it is handled once. */
2046  doRequestWalReceiverReply = false;
2047  WalRcvForceReply();
2048  }
2049 
2050  /* Allow read-only connections if we're consistent now */
2052 
2053  /* Is this a timeline switch? */
2054  if (switchedTLI)
2055  {
2056  /*
2057  * Before we continue on the new timeline, clean up any (possibly
2058  * bogus) future WAL segments on the old timeline.
2059  */
2061 
2062  /* Reset the prefetcher. */
2064  }
2065 }
2066 
2067 /*
2068  * Some XLOG RM record types that are directly related to WAL recovery are
2069  * handled here rather than in the xlog_redo()
      *
      * Only XLOG_OVERWRITE_CONTRECORD and XLOG_BACKUP_END are acted upon; any
      * other info value falls through both branches and nothing is done. The
      * record must belong to RM_XLOG_ID (asserted).
      *
      * NOTE(review): numbered lines 2072, 2082, 2087, 2088, 2091, 2092, 2096,
      * 2097, 2100 and 2124 are absent from this listing, so the declarator,
      * the declaration of "xlrec", the statements announced by the "safely
      * skipped" and "only happen once" comments and the arguments of the log
      * calls cannot be checked here.
2070  */
2071 static void
2073 {
2074  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2075  XLogRecPtr lsn = record->EndRecPtr;
2076 
2077  Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2078 
2079  if (info == XLOG_OVERWRITE_CONTRECORD)
2080  {
2081  /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2083 
2084  memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
      /* The LSN stored in the payload must match the one the reader tracked. */
2085  if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2086  elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
2089 
2090  /* We have safely skipped the aborted record */
2093 
2094  ereport(LOG,
2095  (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
2098 
2099  /* Verifying the record should only happen once */
2101  }
2102  else if (info == XLOG_BACKUP_END)
2103  {
2104  XLogRecPtr startpoint;
2105 
      /* The payload is the start LSN of the backup this record ends. */
2106  memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2107 
2108  if (backupStartPoint == startpoint)
2109  {
2110  /*
2111  * We have reached the end of base backup, the point where
2112  * pg_backup_stop() was done. The data on disk is now consistent
2113  * (assuming we have also reached minRecoveryPoint). Set
2114  * backupEndPoint to the current LSN, so that the next call to
2115  * CheckRecoveryConsistency() will notice it and do the
2116  * end-of-backup processing.
2117  */
2118  elog(DEBUG1, "end of backup record reached");
2119 
      /* lsn is the record's EndRecPtr, captured at the top of the function. */
2120  backupEndPoint = lsn;
2121  }
2122  else
2123  elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X",
2125  }
2126 }
2127 
2128 /*
2129  * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2130  * directories.
2131  *
2132  * Replay of database creation XLOG records for databases that were later
2133  * dropped can create fake directories in pg_tblspc. By the time consistency
2134  * is reached these directories should have been removed; here we verify
2135  * that this did indeed happen. This is to be called at the point where
2136  * consistent state is reached.
2137  *
2138  * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2139  * useful for testing purposes, and also allows for an escape hatch in case
2140  * things go south.
2141  */
2142 static void
2144 {
2145  DIR *dir;
2146  struct dirent *de;
2147 
2148  dir = AllocateDir("pg_tblspc");
2149  while ((de = ReadDir(dir, "pg_tblspc")) != NULL)
2150  {
2151  char path[MAXPGPATH + 10];
2152 
2153  /* Skip entries of non-oid names */
2154  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2155  continue;
2156 
2157  snprintf(path, sizeof(path), "pg_tblspc/%s", de->d_name);
2158 
2159  if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2162  errmsg("unexpected directory entry \"%s\" found in %s",
2163  de->d_name, "pg_tblspc/"),
2164  errdetail("All directory entries in pg_tblspc/ should be symbolic links."),
2165  errhint("Remove those directories, or set allow_in_place_tablespaces to ON transiently to let recovery complete.")));
2166  }
2167 }
2168 
2169 /*
2170  * Checks if recovery has reached a consistent state. When consistency is
2171  * reached and we have a valid starting standby snapshot, tell postmaster
2172  * that it can start accepting read-only connections.
2173  */
2174 static void
2176 {
2177  XLogRecPtr lastReplayedEndRecPtr;
2178  TimeLineID lastReplayedTLI;
2179 
2180  /*
2181  * During crash recovery, we don't reach a consistent state until we've
2182  * replayed all the WAL.
2183  */
2185  return;
2186 
2188 
2189  /*
2190  * We assume that we are called in the startup process, and hence don't
2191  * need a lock to read lastReplayedEndRecPtr and lastReplayedTLI.
2192  */
2193  lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2194  lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2195 
2196  /*
2197  * Have we reached the point where our base backup was completed?
2198  */
2200  backupEndPoint <= lastReplayedEndRecPtr)
2201  {
2202  XLogRecPtr saveBackupStartPoint = backupStartPoint;
2203  XLogRecPtr saveBackupEndPoint = backupEndPoint;
2204 
2205  elog(DEBUG1, "end of backup reached");
2206 
2207  /*
2208  * We have reached the end of base backup, as indicated by pg_control.
2209  * Update the control file accordingly.
2210  */
2211  ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2214  backupEndRequired = false;
2215 
2216  ereport(LOG,
2217  (errmsg("completed backup recovery with redo LSN %X/%X and end LSN %X/%X",
2218  LSN_FORMAT_ARGS(saveBackupStartPoint),
2219  LSN_FORMAT_ARGS(saveBackupEndPoint))));
2220  }
2221 
2222  /*
2223  * Have we passed our safe starting point? Note that minRecoveryPoint is
2224  * known to be incorrectly set if recovering from a backup, until the
2225  * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2226  * All we know prior to that is that we're not consistent yet.
2227  */
2229  minRecoveryPoint <= lastReplayedEndRecPtr)
2230  {
2231  /*
2232  * Check to see if the XLOG sequence contained any unresolved
2233  * references to uninitialized pages.
2234  */
2236 
2237  /*
2238  * Check that pg_tblspc doesn't contain any real directories. Replay
2239  * of Database/CREATE_* records may have created fictitious tablespace
2240  * directories that should have been removed by the time consistency
2241  * was reached.
2242  */
2244 
2245  reachedConsistency = true;
2246  ereport(LOG,
2247  (errmsg("consistent recovery state reached at %X/%X",
2248  LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
2249  }
2250 
2251  /*
2252  * Have we got a valid starting snapshot that will allow queries to be
2253  * run? If so, we can tell postmaster that the database is consistent now,
2254  * enabling connections.
2255  */
2260  {
2264 
2265  LocalHotStandbyActive = true;
2266 
2268  }
2269 }
2270 
2271 /*
2272  * Error context callback for errors occurring during rm_redo().
2273  */
2274 static void
2276 {
2277  XLogReaderState *record = (XLogReaderState *) arg;
2279 
2280  initStringInfo(&buf);
2281  xlog_outdesc(&buf, record);
2282  xlog_block_info(&buf, record);
2283 
2284  /* translator: %s is a WAL record description */
2285  errcontext("WAL redo at %X/%X for %s",
2286  LSN_FORMAT_ARGS(record->ReadRecPtr),
2287  buf.data);
2288 
2289  pfree(buf.data);
2290 }
2291 
2292 /*
2293  * Appends to 'buf' a string describing an XLogRecord, consisting of its
2294  * identity optionally followed by a colon, a space, and a further description.
2295  */
2296 void
2298 {
2299  RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2300  uint8 info = XLogRecGetInfo(record);
2301  const char *id;
2302 
2304  appendStringInfoChar(buf, '/');
2305 
2306  id = rmgr.rm_identify(info);
2307  if (id == NULL)
2308  appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2309  else
2310  appendStringInfo(buf, "%s: ", id);
2311 
2312  rmgr.rm_desc(buf, record);
2313 }
2314 
2315 #ifdef WAL_DEBUG
2316 
2317 static void
2318 xlog_outrec(StringInfo buf, XLogReaderState *record)
2319 {
2320  appendStringInfo(buf, "prev %X/%X; xid %u",
2322  XLogRecGetXid(record));
2323 
2324  appendStringInfo(buf, "; len %u",
2325  XLogRecGetDataLen(record));
2326 
2327  xlog_block_info(buf, record);
2328 }
2329 #endif /* WAL_DEBUG */
2330 
2331 /*
2332  * Appends to 'buf' a string giving information about all the blocks in an
2333  * XLogRecord.
2334  */
2335 static void
2337 {
2338  int block_id;
2339 
2340  /* decode block references */
2341  for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2342  {
2343  RelFileLocator rlocator;
2344  ForkNumber forknum;
2345  BlockNumber blk;
2346 
2347  if (!XLogRecGetBlockTagExtended(record, block_id,
2348  &rlocator, &forknum, &blk, NULL))
2349  continue;
2350 
2351  if (forknum != MAIN_FORKNUM)
2352  appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2353  block_id,
2354  rlocator.spcOid, rlocator.dbOid,
2355  rlocator.relNumber,
2356  forknum,
2357  blk);
2358  else
2359  appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2360  block_id,
2361  rlocator.spcOid, rlocator.dbOid,
2362  rlocator.relNumber,
2363  blk);
2364  if (XLogRecHasBlockImage(record, block_id))
2365  appendStringInfoString(buf, " FPW");
2366  }
2367 }
2368 
2369 
2370 /*
2371  * Check that it's OK to switch to new timeline during recovery.
2372  *
2373  * 'lsn' is the address of the shutdown checkpoint record we're about to
2374  * replay. (Currently, timeline can only change at a shutdown checkpoint).
2375  */
2376 static void
2378  TimeLineID replayTLI)
2379 {
2380  /* Check that the record agrees on what the current (old) timeline is */
2381  if (prevTLI != replayTLI)
2382  ereport(PANIC,
2383  (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2384  prevTLI, replayTLI)));
2385 
2386  /*
2387  * The new timeline had better be in the list of timelines we expect to see,
2388  * according to the timeline history. It should also not decrease.
2389  */
2390  if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2391  ereport(PANIC,
2392  (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2393  newTLI, replayTLI)));
2394 
2395  /*
2396  * If we have not yet reached min recovery point, and we're about to
2397  * switch to a timeline greater than the timeline of the min recovery
2398  * point: trouble. After switching to the new timeline, we could not
2399  * possibly visit the min recovery point on the correct timeline anymore.
2400  * This can happen if there is a newer timeline in the archive that
2401  * branched before the timeline the min recovery point is on, and you
2402  * attempt to do PITR to the new timeline.
2403  */
2405  lsn < minRecoveryPoint &&
2406  newTLI > minRecoveryPointTLI)
2407  ereport(PANIC,
2408  (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
2409  newTLI,
2412 
2413  /* Looks good */
2414 }
2415 
2416 
2417 /*
2418  * Extract timestamp from WAL record.
2419  *
2420  * If the record contains a timestamp, returns true, and saves the timestamp
2421  * in *recordXtime. If the record type has no timestamp, returns false.
2422  * Currently, only transaction commit/abort records and restore points contain
2423  * timestamps.
2424  */
2425 static bool
2427 {
2428  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2429  uint8 xact_info = info & XLOG_XACT_OPMASK;
2430  uint8 rmid = XLogRecGetRmid(record);
2431 
2432  if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2433  {
2434  *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2435  return true;
2436  }
2437  if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2438  xact_info == XLOG_XACT_COMMIT_PREPARED))
2439  {
2440  *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2441  return true;
2442  }
2443  if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2444  xact_info == XLOG_XACT_ABORT_PREPARED))
2445  {
2446  *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2447  return true;
2448  }
2449  return false;
2450 }
2451 
2452 /*
2453  * Checks whether the current buffer page and backup page stored in the
2454  * WAL record are consistent or not. Before comparing the two pages,
2455  * masking can be applied to the pages to ignore certain areas such as hint
2456  * bits and unused space between pd_lower and pd_upper, among other things. This
2457  * function should be called once WAL replay has been completed for a
2458  * given record.
2459  */
2460 static void
2462 {
2463  RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2464  RelFileLocator rlocator;
2465  ForkNumber forknum;
2466  BlockNumber blkno;
2467  int block_id;
2468 
2469  /* Records with no backup blocks have no need for consistency checks. */
2470  if (!XLogRecHasAnyBlockRefs(record))
2471  return;
2472 
2473  Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
2474 
2475  for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2476  {
2477  Buffer buf;
2478  Page page;
2479 
2480  if (!XLogRecGetBlockTagExtended(record, block_id,
2481  &rlocator, &forknum, &blkno, NULL))
2482  {
2483  /*
2484  * WAL record doesn't contain a block reference with the given id.
2485  * Do nothing.
2486  */
2487  continue;
2488  }
2489 
2490  Assert(XLogRecHasBlockImage(record, block_id));
2491 
2492  if (XLogRecBlockImageApply(record, block_id))
2493  {
2494  /*
2495  * WAL record has already applied the page, so bypass the
2496  * consistency check as that would result in comparing the full
2497  * page stored in the record with itself.
2498  */
2499  continue;
2500  }
2501 
2502  /*
2503  * Read the contents from the current buffer and store it in a
2504  * temporary page.
2505  */
2506  buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2508  InvalidBuffer);
2509  if (!BufferIsValid(buf))
2510  continue;
2511 
2513  page = BufferGetPage(buf);
2514 
2515  /*
2516  * Take a copy of the local page where WAL has been applied to have a
2517  * comparison base before masking it...
2518  */
2519  memcpy(replay_image_masked, page, BLCKSZ);
2520 
2521  /* No need for this page anymore now that a copy is in. */
2523 
2524  /*
2525  * If the block LSN is already ahead of this WAL record, we can't
2526  * expect contents to match. This can happen if recovery is
2527  * restarted.
2528  */
2529  if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
2530  continue;
2531 
2532  /*
2533  * Read the contents from the backup copy, stored in WAL record and
2534  * store it in a temporary page. There is no need to allocate a new
2535  * page here, a local buffer is fine to hold its contents and a mask
2536  * can be directly applied on it.
2537  */
2538  if (!RestoreBlockImage(record, block_id, primary_image_masked))
2539  ereport(ERROR,
2540  (errcode(ERRCODE_INTERNAL_ERROR),
2541  errmsg_internal("%s", record->errormsg_buf)));
2542 
2543  /*
2544  * If masking function is defined, mask both the primary and replay
2545  * images
2546  */
2547  if (rmgr.rm_mask != NULL)
2548  {
2549  rmgr.rm_mask(replay_image_masked, blkno);
2550  rmgr.rm_mask(primary_image_masked, blkno);
2551  }
2552 
2553  /* Time to compare the primary and replay images. */
2554  if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2555  {
2556  elog(FATAL,
2557  "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2558  rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2559  forknum, blkno);
2560  }
2561  }
2562 }
2563 
2564 /*
2565  * For point-in-time recovery, this function decides whether we want to
2566  * stop applying the XLOG before the current record.
2567  *
2568  * Returns true if we are stopping, false otherwise. If stopping, some
2569  * information is saved in recoveryStopXid et al for use in annotating the
2570  * new timeline's history file.
2571  */
2572 static bool
2574 {
2575  bool stopsHere = false;
2576  uint8 xact_info;
2577  bool isCommit;
2578  TimestampTz recordXtime = 0;
2579  TransactionId recordXid;
2580 
2581  /*
2582  * Ignore recovery target settings when not in archive recovery (meaning
2583  * we are in crash recovery).
2584  */
2586  return false;
2587 
2588  /* Check if we should stop as soon as reaching consistency */
2590  {
2591  ereport(LOG,
2592  (errmsg("recovery stopping after reaching consistency")));
2593 
2594  recoveryStopAfter = false;
2597  recoveryStopTime = 0;
2598  recoveryStopName[0] = '\0';
2599  return true;
2600  }
2601 
2602  /* Check if target LSN has been reached */
2605  record->ReadRecPtr >= recoveryTargetLSN)
2606  {
2607  recoveryStopAfter = false;
2609  recoveryStopLSN = record->ReadRecPtr;
2610  recoveryStopTime = 0;
2611  recoveryStopName[0] = '\0';
2612  ereport(LOG,
2613  (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
2615  return true;
2616  }
2617 
2618  /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2619  if (XLogRecGetRmid(record) != RM_XACT_ID)
2620  return false;
2621 
2622  xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2623 
2624  if (xact_info == XLOG_XACT_COMMIT)
2625  {
2626  isCommit = true;
2627  recordXid = XLogRecGetXid(record);
2628  }
2629  else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2630  {
2631  xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2632  xl_xact_parsed_commit parsed;
2633 
2634  isCommit = true;
2636  xlrec,
2637  &parsed);
2638  recordXid = parsed.twophase_xid;
2639  }
2640  else if (xact_info == XLOG_XACT_ABORT)
2641  {
2642  isCommit = false;
2643  recordXid = XLogRecGetXid(record);
2644  }
2645  else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2646  {
2647  xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2648  xl_xact_parsed_abort parsed;
2649 
2650  isCommit = false;
2652  xlrec,
2653  &parsed);
2654  recordXid = parsed.twophase_xid;
2655  }
2656  else
2657  return false;
2658 
2660  {
2661  /*
2662  * There can be only one transaction end record with this exact
2663  * transactionid
2664  *
2665  * when testing for an xid, we MUST test for equality only, since
2666  * transactions are numbered in the order they start, not the order
2667  * they complete. A higher numbered xid will complete before a lower
2668  * numbered one about 50% of the time...
2669  */
2670  stopsHere = (recordXid == recoveryTargetXid);
2671  }
2672 
2673  /*
2674  * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2675  * We don't expect getRecordTimestamp ever to fail, since we already know
2676  * this is a commit or abort record; but test its result anyway.
2677  */
2678  if (getRecordTimestamp(record, &recordXtime) &&
2680  {
2681  /*
2682  * There can be many transactions that share the same commit time, so
2683  * we stop after the last one, if we are inclusive, or stop at the
2684  * first one if we are exclusive
2685  */
2687  stopsHere = (recordXtime > recoveryTargetTime);
2688  else
2689  stopsHere = (recordXtime >= recoveryTargetTime);
2690  }
2691 
2692  if (stopsHere)
2693  {
2694  recoveryStopAfter = false;
2695  recoveryStopXid = recordXid;
2696  recoveryStopTime = recordXtime;
2698  recoveryStopName[0] = '\0';
2699 
2700  if (isCommit)
2701  {
2702  ereport(LOG,
2703  (errmsg("recovery stopping before commit of transaction %u, time %s",
2706  }
2707  else
2708  {
2709  ereport(LOG,
2710  (errmsg("recovery stopping before abort of transaction %u, time %s",
2713  }
2714  }
2715 
2716  return stopsHere;
2717 }
2718 
2719 /*
2720  * Same as recoveryStopsBefore, but called after applying the record.
2721  *
2722  * We also track the timestamp of the latest applied COMMIT/ABORT
2723  * record in XLogRecoveryCtl->recoveryLastXTime.
2724  */
2725 static bool
2727 {
2728  uint8 info;
2729  uint8 xact_info;
2730  uint8 rmid;
2731  TimestampTz recordXtime = 0;
2732 
2733  /*
2734  * Ignore recovery target settings when not in archive recovery (meaning
2735  * we are in crash recovery).
2736  */
2738  return false;
2739 
2740  info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2741  rmid = XLogRecGetRmid(record);
2742 
2743  /*
2744  * There can be many restore points that share the same name; we stop at
2745  * the first one.
2746  */
2748  rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2749  {
2750  xl_restore_point *recordRestorePointData;
2751 
2752  recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2753 
2754  if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2755  {
2756  recoveryStopAfter = true;
2759  (void) getRecordTimestamp(record, &recoveryStopTime);
2760  strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2761 
2762  ereport(LOG,
2763  (errmsg("recovery stopping at restore point \"%s\", time %s",
2766  return true;
2767  }
2768  }
2769 
2770  /* Check if the target LSN has been reached */
2773  record->ReadRecPtr >= recoveryTargetLSN)
2774  {
2775  recoveryStopAfter = true;
2777  recoveryStopLSN = record->ReadRecPtr;
2778  recoveryStopTime = 0;
2779  recoveryStopName[0] = '\0';
2780  ereport(LOG,
2781  (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
2783  return true;
2784  }
2785 
2786  if (rmid != RM_XACT_ID)
2787  return false;
2788 
2789  xact_info = info & XLOG_XACT_OPMASK;
2790 
2791  if (xact_info == XLOG_XACT_COMMIT ||
2792  xact_info == XLOG_XACT_COMMIT_PREPARED ||
2793  xact_info == XLOG_XACT_ABORT ||
2794  xact_info == XLOG_XACT_ABORT_PREPARED)
2795  {
2796  TransactionId recordXid;
2797 
2798  /* Update the last applied transaction timestamp */
2799  if (getRecordTimestamp(record, &recordXtime))
2800  SetLatestXTime(recordXtime);
2801 
2802  /* Extract the XID of the committed/aborted transaction */
2803  if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2804  {
2805  xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2806  xl_xact_parsed_commit parsed;
2807 
2809  xlrec,
2810  &parsed);
2811  recordXid = parsed.twophase_xid;
2812  }
2813  else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2814  {
2815  xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2816  xl_xact_parsed_abort parsed;
2817 
2819  xlrec,
2820  &parsed);
2821  recordXid = parsed.twophase_xid;
2822  }
2823  else
2824  recordXid = XLogRecGetXid(record);
2825 
2826  /*
2827  * There can be only one transaction end record with this exact
2828  * transactionid
2829  *
2830  * when testing for an xid, we MUST test for equality only, since
2831  * transactions are numbered in the order they start, not the order
2832  * they complete. A higher numbered xid will complete before a lower
2833  * numbered one about 50% of the time...
2834  */
2836  recordXid == recoveryTargetXid)
2837  {
2838  recoveryStopAfter = true;
2839  recoveryStopXid = recordXid;
2840  recoveryStopTime = recordXtime;
2842  recoveryStopName[0] = '\0';
2843 
2844  if (xact_info == XLOG_XACT_COMMIT ||
2845  xact_info == XLOG_XACT_COMMIT_PREPARED)
2846  {
2847  ereport(LOG,
2848  (errmsg("recovery stopping after commit of transaction %u, time %s",
2851  }
2852  else if (xact_info == XLOG_XACT_ABORT ||
2853  xact_info == XLOG_XACT_ABORT_PREPARED)
2854  {
2855  ereport(LOG,
2856  (errmsg("recovery stopping after abort of transaction %u, time %s",
2859  }
2860  return true;
2861  }
2862  }
2863 
2864  /* Check if we should stop as soon as reaching consistency */
2866  {
2867  ereport(LOG,
2868  (errmsg("recovery stopping after reaching consistency")));
2869 
2870  recoveryStopAfter = true;
2872  recoveryStopTime = 0;
2874  recoveryStopName[0] = '\0';
2875  return true;
2876  }
2877 
2878  return false;
2879 }
2880 
2881 /*
2882  * Create a comment for the history file to explain why and where
2883  * timeline changed.
2884  */
2885 static char *
2887 {
2888  char reason[200];
2889 
2891  snprintf(reason, sizeof(reason),
2892  "%s transaction %u",
2893  recoveryStopAfter ? "after" : "before",
2894  recoveryStopXid);
2896  snprintf(reason, sizeof(reason),
2897  "%s %s\n",
2898  recoveryStopAfter ? "after" : "before",
2900  else if (recoveryTarget == RECOVERY_TARGET_LSN)
2901  snprintf(reason, sizeof(reason),
2902  "%s LSN %X/%X\n",
2903  recoveryStopAfter ? "after" : "before",
2906  snprintf(reason, sizeof(reason),
2907  "at restore point \"%s\"",
2910  snprintf(reason, sizeof(reason), "reached consistency");
2911  else
2912  snprintf(reason, sizeof(reason), "no recovery target specified");
2913 
2914  return pstrdup(reason);
2915 }
2916 
2917 /*
2918  * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2919  *
2920  * endOfRecovery is true if the recovery target is reached and
2921  * the paused state starts at the end of recovery because of
2922  * recovery_target_action=pause, and false otherwise.
2923  */
2924 static void
2925 recoveryPausesHere(bool endOfRecovery)
2926 {
2927  /* Don't pause unless users can connect! */
2928  if (!LocalHotStandbyActive)
2929  return;
2930 
2931  /* Don't pause after standby promotion has been triggered */
2933  return;
2934 
2935  if (endOfRecovery)
2936  ereport(LOG,
2937  (errmsg("pausing at the end of recovery"),
2938  errhint("Execute pg_wal_replay_resume() to promote.")));
2939  else
2940  ereport(LOG,
2941  (errmsg("recovery has paused"),
2942  errhint("Execute pg_wal_replay_resume() to continue.")));
2943 
2944  /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2946  {
2948  if (CheckForStandbyTrigger())
2949  return;
2950 
2951  /*
2952  * If recovery pause is requested then set it paused. While we are in
2953  * the loop, user might resume and pause again so set this every time.
2954  */
2956 
2957  /*
2958  * We wait on a condition variable that will wake us as soon as the
2959  * pause ends, but we use a timeout so we can check the above exit
2960  * condition periodically too.
2961  */
2963  WAIT_EVENT_RECOVERY_PAUSE);
2964  }
2966 }
2967 
2968 /*
2969  * When recovery_min_apply_delay is set, we wait long enough to make sure
2970  * certain record types are applied at least that interval behind the primary.
2971  *
2972  * Returns true if we waited.
2973  *
2974  * Note that the delay is calculated between the WAL record log time and
2975  * the current time on standby. We would prefer to keep track of when this
2976  * standby received each WAL record, which would allow a more consistent
2977  * approach and one not affected by time synchronisation issues, but that
2978  * is significantly more effort and complexity for little actual gain in
2979  * usability.
2980  */
2981 static bool
2983 {
2984  uint8 xact_info;
2985  TimestampTz xtime;
2986  TimestampTz delayUntil;
2987  long msecs;
2988 
2989  /* nothing to do if no delay configured */
2990  if (recovery_min_apply_delay <= 0)
2991  return false;
2992 
2993  /* no delay is applied on a database not yet consistent */
2994  if (!reachedConsistency)
2995  return false;
2996 
2997  /* nothing to do if crash recovery is requested */
2999  return false;
3000 
3001  /*
3002  * Is it a COMMIT record?
3003  *
3004  * We deliberately choose not to delay aborts since they have no effect on
3005  * MVCC. We already allow replay of records that don't have a timestamp,
3006  * so there is already opportunity for issues caused by early conflicts on
3007  * standbys.
3008  */
3009  if (XLogRecGetRmid(record) != RM_XACT_ID)
3010  return false;
3011 
3012  xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
3013 
3014  if (xact_info != XLOG_XACT_COMMIT &&
3015  xact_info != XLOG_XACT_COMMIT_PREPARED)
3016  return false;
3017 
3018  if (!getRecordTimestamp(record, &xtime))
3019  return false;
3020 
3022 
3023  /*
3024  * Exit without arming the latch if it's already past time to apply this
3025  * record
3026  */
3028  if (msecs <= 0)
3029  return false;
3030 
3031  while (true)
3032  {
3034 
3035  /* This might change recovery_min_apply_delay. */
3037 
3038  if (CheckForStandbyTrigger())
3039  break;
3040 
3041  /*
3042  * Recalculate delayUntil as recovery_min_apply_delay could have
3043  * changed while waiting in this loop.
3044  */
3046 
3047  /*
3048  * Wait for difference between GetCurrentTimestamp() and delayUntil.
3049  */
3051  delayUntil);
3052 
3053  if (msecs <= 0)
3054  break;
3055 
3056  elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3057 
3060  msecs,
3061  WAIT_EVENT_RECOVERY_APPLY_DELAY);
3062  }
3063  return true;
3064 }
3065 
3066 /*
3067  * Get the current state of the recovery pause request.
3068  */
3071 {
3073 
3077 
3078  return state;
3079 }
3080 
3081 /*
3082  * Set the recovery pause state.
3083  *
3084  * If recovery pause is requested then sets the recovery pause state to
3085  * 'pause requested' if it is not already 'paused'. Otherwise, sets it
3086  * to 'not paused' to resume the recovery. The recovery pause will be
3087  * confirmed by ConfirmRecoveryPaused().
3088  */
3089 void
3090 SetRecoveryPause(bool recoveryPause)
3091 {
3093 
3094  if (!recoveryPause)
3098 
3100 
3101  if (!recoveryPause)
3103 }
3104 
3105 /*
3106  * Confirm the recovery pause by setting the recovery pause state to
3107  * RECOVERY_PAUSED.
3108  */
3109 static void
3111 {
3112  /* If recovery pause is requested then set it paused */
3117 }
3118 
3119 
3120 /*
3121  * Attempt to read the next XLOG record.
3122  *
3123  * Before first call, the reader needs to be positioned to the first record
3124  * by calling XLogPrefetcherBeginRead().
3125  *
3126  * If no valid record is available, returns NULL, or fails if emode is PANIC.
3127  * (emode must be either PANIC or LOG.) In standby mode, retries until a valid
3128  * record is available.
3129  */
3130 static XLogRecord *
3132  bool fetching_ckpt, TimeLineID replayTLI)
3133 {
3134  XLogRecord *record;
3137 
3138  /* Pass through parameters to XLogPageRead */
3139  private->fetching_ckpt = fetching_ckpt;
3140  private->emode = emode;
3141  private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
3142  private->replayTLI = replayTLI;
3143 
3144  /* This is the first attempt to read this page. */
3145  lastSourceFailed = false;
3146 
3147  for (;;)
3148  {
3149  char *errormsg;
3150 
3151  record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3152  if (record == NULL)
3153  {
3154  /*
3155  * When we find that WAL ends in an incomplete record, keep track
3156  * of that record. After recovery is done, we'll write a record
3157  * to indicate to downstream WAL readers that that portion is to
3158  * be ignored.
3159  *
3160  * However, when ArchiveRecoveryRequested = true, we're going to
3161  * switch to a new timeline at the end of recovery. We will only
3162  * copy WAL over to the new timeline up to the end of the last
3163  * complete record, so if we did this, we would later create an
3164  * overwrite contrecord in the wrong place, breaking everything.
3165  */
3166  if (!ArchiveRecoveryRequested &&
3168  {
3171  }
3172 
3173  if (readFile >= 0)
3174  {
3175  close(readFile);
3176  readFile = -1;
3177  }
3178 
3179  /*
3180  * We only end up here without a message when XLogPageRead()
3181  * failed - in that case we already logged something. In
3182  * StandbyMode that only happens if we have been triggered, so we
3183  * shouldn't loop anymore in that case.
3184  */
3185  if (errormsg)
3187  (errmsg_internal("%s", errormsg) /* already translated */ ));
3188  }
3189 
3190  /*
3191  * Check page TLI is one of the expected values.
3192  */
3194  {
3195  char fname[MAXFNAMELEN];
3196  XLogSegNo segno;
3197  int32 offset;
3198 
3202  XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3205  (errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%X, offset %u",
3207  fname,
3209  offset)));
3210  record = NULL;
3211  }
3212 
3213  if (record)
3214  {
3215  /* Great, got a record */
3216  return record;
3217  }
3218  else
3219  {
3220  /* No valid record available from this source */
3221  lastSourceFailed = true;
3222 
3223  /*
3224  * If archive recovery was requested, but we were still doing
3225  * crash recovery, switch to archive recovery and retry using the
3226  * offline archive. We have now replayed all the valid WAL in
3227  * pg_wal, so we are presumably now consistent.
3228  *
3229  * We require that there's at least some valid WAL present in
3230  * pg_wal, however (!fetching_ckpt). We could recover using the
3231  * WAL from the archive, even if pg_wal is completely empty, but
3232  * we'd have no idea how far we'd have to replay to reach
3233  * consistency. So err on the safe side and give up.
3234  */
3236  !fetching_ckpt)
3237  {
3238  ereport(DEBUG1,
3239  (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3240  InArchiveRecovery = true;
3243 
3246  minRecoveryPointTLI = replayTLI;
3247 
3249 
3250  /*
3251  * Before we retry, reset lastSourceFailed and currentSource
3252  * so that we will check the archive next.
3253  */
3254  lastSourceFailed = false;
3256 
3257  continue;
3258  }
3259 
3260  /* In standby mode, loop back to retry. Otherwise, give up. */
3262  continue;
3263  else
3264  return NULL;
3265  }
3266  }
3267 }
3268 
3269 /*
3270  * Read the XLOG page containing targetPagePtr into readBuf (if not read
3271  * already). Returns number of bytes read, if the page is read successfully,
3272  * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3273  * but only if they have not been previously reported.
3274  *
3275  * See XLogReaderRoutine.page_read for more details.
3276  *
3277  * While prefetching, xlogreader->nonblocking may be set. In that case,
3278  * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3279  *
3280  * This is responsible for restoring files from archive as needed, as well
3281  * as for waiting for the requested WAL record to arrive in standby mode.
3282  *
3283  * xlogreader->private_data->emode specifies the log level used for reporting
3284  * "file not found" or "end of WAL" situations in archive recovery, or in
3285  * standby mode when promotion is triggered. If set to WARNING or below,
3286  * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
3287  * levels the ereport() won't return.
3288  *
3289  * In standby mode, if after a successful return of XLogPageRead() the
3290  * caller finds the record it's interested in to be broken, it should
3291  * ereport the error with the level determined by
3292  * emode_for_corrupt_record(), and then set lastSourceFailed
3293  * and call XLogPageRead() again with the same arguments. This lets
3294  * XLogPageRead() to try fetching the record from another source, or to
3295  * sleep and retry.
3296  */
3297 static int
3299  XLogRecPtr targetRecPtr, char *readBuf)
3300 {
3301  XLogPageReadPrivate *private =
3303  int emode = private->emode;
3304  uint32 targetPageOff;
3305  XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
3306  int r;
3307 
3308  XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3309  targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3310 
3311  /*
3312  * See if we need to switch to a new segment because the requested record
3313  * is not in the currently open one.
3314  */
3315  if (readFile >= 0 &&
3316  !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3317  {
3318  /*
3319  * Request a restartpoint if we've replayed too much xlog since the
3320  * last one.
3321  */
3323  {
3325  {
3326  (void) GetRedoRecPtr();
3329  }
3330  }
3331 
3332  close(readFile);
3333  readFile = -1;
3335  }
3336 
3337  XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3338 
3339 retry:
3340  /* See if we need to retrieve more data */
3341  if (readFile < 0 ||
3343  flushedUpto < targetPagePtr + reqLen))
3344  {
3345  if (readFile >= 0 &&
3348  flushedUpto < targetPagePtr + reqLen)
3349  return XLREAD_WOULDBLOCK;
3350 
3351  switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3352  private->randAccess,
3353  private->fetching_ckpt,
3354  targetRecPtr,
3355  private->replayTLI,
3358  {
3359  case XLREAD_WOULDBLOCK:
3360  return XLREAD_WOULDBLOCK;
3361  case XLREAD_FAIL:
3362  if (readFile >= 0)
3363  close(readFile);
3364  readFile = -1;
3365  readLen = 0;
3367  return XLREAD_FAIL;
3368  case XLREAD_SUCCESS:
3369  break;
3370  }
3371  }
3372 
3373  /*
3374  * At this point, we have the right segment open and if we're streaming we
3375  * know the requested record is in it.
3376  */
3377  Assert(readFile != -1);
3378 
3379  /*
3380  * If the current segment is being streamed from the primary, calculate
3381  * how much of the current page we have received already. We know the
3382  * requested record has been received, but this is for the benefit of
3383  * future calls, to allow quick exit at the top of this function.
3384  */
3386  {
3387  if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3388  readLen = XLOG_BLCKSZ;
3389  else
3391  targetPageOff;
3392  }
3393  else
3394  readLen = XLOG_BLCKSZ;
3395 
3396  /* Read the requested page */
3397  readOff = targetPageOff;
3398 
3399  pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3400  r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
3401  if (r != XLOG_BLCKSZ)
3402  {
3403  char fname[MAXFNAMELEN];
3404  int save_errno = errno;
3405 
3408  if (r < 0)
3409  {
3410  errno = save_errno;
3411  ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3413  errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: %m",
3414  fname, LSN_FORMAT_ARGS(targetPagePtr),
3415  readOff)));
3416  }
3417  else
3418  ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3420  errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: read %d of %zu",
3421  fname, LSN_FORMAT_ARGS(targetPagePtr),
3422  readOff, r, (Size) XLOG_BLCKSZ)));
3423  goto next_record_is_invalid;
3424  }
3426 
3427  Assert(targetSegNo == readSegNo);
3428  Assert(targetPageOff == readOff);
3429  Assert(reqLen <= readLen);
3430 
3432 
3433  /*
3434  * Check the page header immediately, so that we can retry immediately if
3435  * it's not valid. This may seem unnecessary, because ReadPageInternal()
3436  * validates the page header anyway, and would propagate the failure up to
3437  * ReadRecord(), which would retry. However, there's a corner case with
3438  * continuation records, if a record is split across two pages such that
3439  * we would need to read the two pages from different sources. For
3440  * example, imagine a scenario where a streaming replica is started up,
3441  * and replay reaches a record that's split across two WAL segments. The
3442  * first page is only available locally, in pg_wal, because it's already
3443  * been recycled on the primary. The second page, however, is not present
3444  * in pg_wal, and we should stream it from the primary. There is a
3445  * recycled WAL segment present in pg_wal, with garbage contents, however.
3446  * We would read the first page from the local WAL segment, but when
3447  * reading the second page, we would read the bogus, recycled, WAL
3448  * segment. If we didn't catch that case here, we would never recover,
3449  * because ReadRecord() would retry reading the whole record from the
3450  * beginning.
3451  *
3452  * Of course, this only catches errors in the page header, which is what
3453  * happens in the case of a recycled WAL segment. Other kinds of errors or
3454  * corruption still has the same problem. But this at least fixes the
3455  * common case, which can happen as part of normal operation.
3456  *
3457  * Validating the page header is cheap enough that doing it twice
3458  * shouldn't be a big deal from a performance point of view.
3459  *
3460  * When not in standby mode, an invalid page header should cause recovery
3461  * to end, not retry reading the page, so we don't need to validate the
3462  * page header here for the retry. Instead, ReadPageInternal() is
3463  * responsible for the validation.
3464  */
3465  if (StandbyMode &&
3466  !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3467  {
3468  /*
3469  * Emit this error right now then retry this page immediately. Use
3470  * errmsg_internal() because the message was already translated.
3471  */
3472  if (xlogreader->errormsg_buf[0])
3475 
3476  /* reset any error XLogReaderValidatePageHeader() might have set */
3478  goto next_record_is_invalid;
3479  }
3480 
3481  return readLen;
3482 
3483 next_record_is_invalid:
3484 
3485  /*
3486  * If we're reading ahead, give up fast. Retries and error reporting will
3487  * be handled by a later read when recovery catches up to this point.
3488  */
3489  if (xlogreader->nonblocking)
3490  return XLREAD_WOULDBLOCK;
3491 
3492  lastSourceFailed = true;
3493 
3494  if (readFile >= 0)
3495  close(readFile);
3496  readFile = -1;
3497  readLen = 0;
3499 
3500  /* In standby-mode, keep trying */
3501  if (StandbyMode)
3502  goto retry;
3503  else
3504  return XLREAD_FAIL;
3505 }
3506 
3507 /*
3508  * Open the WAL segment containing WAL location 'RecPtr'.
3509  *
3510  * The segment can be fetched via restore_command, or via walreceiver having
3511  * streamed the record, or it can already be present in pg_wal. Checking
3512  * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3513  * too, in case someone copies a new segment directly to pg_wal. That is not
3514  * documented or recommended, though.
3515  *
3516  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3517  * prepare to read WAL starting from RedoStartLSN after this.
3518  *
3519  * 'RecPtr' might not point to the beginning of the record we're interested
3520  * in, it might also point to the page or segment header. In that case,
3521  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3522  * used to decide which timeline to stream the requested WAL from.
3523  *
3524  * 'replayLSN' is the current replay LSN, so that if we scan for new
3525  * timelines, we can reject a switch to a timeline that branched off before
3526  * this point.
3527  *
3528  * If the record is not immediately available, the function returns false
3529  * if we're not in standby mode. In standby mode, waits for it to become
3530  * available.
3531  *
3532  * When the requested record becomes available, the function opens the file
3533  * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3534  * of standby mode is triggered by the user, and there is no more WAL
3535  * available, returns XLREAD_FAIL.
3536  *
3537  * If nonblocking is true, then give up immediately if we can't satisfy the
3538  * request, returning XLREAD_WOULDBLOCK instead of waiting.
3539  */
3540 static XLogPageReadResult
3541 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
3542  bool fetching_ckpt, XLogRecPtr tliRecPtr,
3543  TimeLineID replayTLI, XLogRecPtr replayLSN,
3544  bool nonblocking)
3545 {
3546  static TimestampTz last_fail_time = 0;
3547  TimestampTz now;
3548  bool streaming_reply_sent = false;
3549 
3550  /*-------
3551  * Standby mode is implemented by a state machine:
3552  *
3553  * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3554  * pg_wal (XLOG_FROM_PG_WAL)
3555  * 2. Check for promotion trigger request
3556  * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3557  * 4. Rescan timelines
3558  * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3559  *
3560  * Failure to read from the current source advances the state machine to
3561  * the next state.
3562  *
3563  * 'currentSource' indicates the current state. There are no currentSource
3564  * values for "check trigger", "rescan timelines", and "sleep" states,
3565  * those actions are taken when reading from the previous source fails, as
3566  * part of advancing to the next state.
3567  *
3568  * If standby mode is turned off while reading WAL from stream, we move
3569  * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3570  * the files (which would be required at end of recovery, e.g., timeline
3571  * history file) from archive or pg_wal. We don't need to kill WAL receiver
3572  * here because it's already stopped when standby mode is turned off at
3573  * the end of recovery.
3574  *-------
3575  */
3576  if (!InArchiveRecovery)
3578  else if (currentSource == XLOG_FROM_ANY ||
3580  {
3581  lastSourceFailed = false;
3583  }
3584 
3585  for (;;)
3586  {
3587  XLogSource oldSource = currentSource;
3588  bool startWalReceiver = false;
3589 
3590  /*
3591  * First check if we failed to read from the current source, and
3592  * advance the state machine if so. The failure to read might've
3593  * happened outside this function, e.g when a CRC check fails on a
3594  * record, or within this loop.
3595  */
3596  if (lastSourceFailed)
3597  {
3598  /*
3599  * Don't allow any retry loops to occur during nonblocking
3600  * readahead. Let the caller process everything that has been
3601  * decoded already first.
3602  */
3603  if (nonblocking)
3604  return XLREAD_WOULDBLOCK;
3605 
3606  switch (currentSource)
3607  {
3608  case XLOG_FROM_ARCHIVE:
3609  case XLOG_FROM_PG_WAL:
3610 
3611  /*
3612  * Check to see if promotion is requested. Note that we do
3613  * this only after failure, so when you promote, we still
3614  * finish replaying as much as we can from archive and
3615  * pg_wal before failover.
3616  */
3618  {
3620  return XLREAD_FAIL;
3621  }
3622 
3623  /*
3624  * Not in standby mode, and we've now tried the archive
3625  * and pg_wal.
3626  */
3627  if (!StandbyMode)
3628  return XLREAD_FAIL;
3629 
3630  /*
3631  * Move to XLOG_FROM_STREAM state, and set to start a
3632  * walreceiver if necessary.
3633  */
3635  startWalReceiver = true;
3636  break;
3637 
3638  case XLOG_FROM_STREAM:
3639 
3640  /*
3641  * Failure while streaming. Most likely, we got here
3642  * because streaming replication was terminated, or
3643  * promotion was triggered. But we also get here if we
3644  * find an invalid record in the WAL streamed from the
3645  * primary, in which case something is seriously wrong.
3646  * There's little chance that the problem will just go
3647  * away, but PANIC is not good for availability either,
3648  * especially in hot standby mode. So, we treat that the
3649  * same as disconnection, and retry from archive/pg_wal
3650  * again. The WAL in the archive should be identical to
3651  * what was streamed, so it's unlikely that it helps, but
3652  * one can hope...
3653  */
3654 
3655  /*
3656  * We should be able to move to XLOG_FROM_STREAM only in
3657  * standby mode.
3658  */
3660 
3661  /*
3662  * Before we leave XLOG_FROM_STREAM state, make sure that
3663  * walreceiver is not active, so that it won't overwrite
3664  * WAL that we restore from archive.
3665  */
3667 
3668  /*
3669  * Before we sleep, re-scan for possible new timelines if
3670  * we were requested to recover to the latest timeline.
3671  */
3673  {
3674  if (rescanLatestTimeLine(replayTLI, replayLSN))
3675  {
3677  break;
3678  }
3679  }
3680 
3681  /*
3682  * XLOG_FROM_STREAM is the last state in our state
3683  * machine, so we've exhausted all the options for
3684  * obtaining the requested WAL. We're going to loop back
3685  * and retry from the archive, but if it hasn't been long
3686  * since last attempt, sleep wal_retrieve_retry_interval
3687  * milliseconds to avoid busy-waiting.
3688  */
3690  if (!TimestampDifferenceExceeds(last_fail_time, now,
3692  {
3693  long wait_time;
3694 
3695  wait_time = wal_retrieve_retry_interval -
3696  TimestampDifferenceMilliseconds(last_fail_time, now);
3697 
3698  elog(LOG, "waiting for WAL to become available at %X/%X",
3699  LSN_FORMAT_ARGS(RecPtr));
3700 
3701  /* Do background tasks that might benefit us later. */
3703 
3707  wait_time,
3708  WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
3711 
3712  /* Handle interrupt signals of startup process */
3714  }
3715  last_fail_time = now;
3717  break;
3718 
3719  default:
3720  elog(ERROR, "unexpected WAL source %d", currentSource);
3721  }
3722  }
3723  else if (currentSource == XLOG_FROM_PG_WAL)
3724  {
3725  /*
3726  * We just successfully read a file in pg_wal. We prefer files in
3727  * the archive over ones in pg_wal, so try the next file again
3728  * from the archive first.
3729  */
3730  if (InArchiveRecovery)
3732  }
3733 
3734  if (currentSource != oldSource)
3735  elog(DEBUG2, "switched WAL source from %s to %s after %s",
3737  lastSourceFailed ? "failure" : "success");
3738 
3739  /*
3740  * We've now handled possible failure. Try to read from the chosen
3741  * source.
3742  */
3743  lastSourceFailed = false;
3744 
3745  switch (currentSource)
3746  {
3747  case XLOG_FROM_ARCHIVE:
3748  case XLOG_FROM_PG_WAL:
3749 
3750  /*
3751  * WAL receiver must not be running when reading WAL from
3752  * archive or pg_wal.
3753  */
3754  Assert(!WalRcvStreaming());
3755 
3756  /* Close any old file we might have open. */
3757  if (readFile >= 0)
3758  {
3759  close(readFile);
3760  readFile = -1;
3761  }
3762  /* Reset curFileTLI if random fetch. */
3763  if (randAccess)
3764  curFileTLI = 0;
3765 
3766  /*
3767  * Try to restore the file from archive, or read an existing
3768  * file from pg_wal.
3769  */
3772  currentSource);
3773  if (readFile >= 0)
3774  return XLREAD_SUCCESS; /* success! */
3775 
3776  /*
3777  * Nope, not found in archive or pg_wal.
3778  */
3779  lastSourceFailed = true;
3780  break;
3781 
3782  case XLOG_FROM_STREAM:
3783  {
3784  bool havedata;
3785 
3786  /*
3787  * We should be able to move to XLOG_FROM_STREAM only in
3788  * standby mode.
3789  */
3791 
3792  /*
3793  * First, shutdown walreceiver if its restart has been
3794  * requested -- but no point if we're already slated for
3795  * starting it.
3796  */
3797  if (pendingWalRcvRestart && !startWalReceiver)
3798  {
3800 
3801  /*
3802  * Re-scan for possible new timelines if we were
3803  * requested to recover to the latest timeline.
3804  */
3807  rescanLatestTimeLine(replayTLI, replayLSN);
3808 
3809  startWalReceiver = true;
3810  }
3811  pendingWalRcvRestart = false;
3812 
3813  /*
3814  * Launch walreceiver if needed.
3815  *
3816  * If fetching_ckpt is true, RecPtr points to the initial
3817  * checkpoint location. In that case, we use RedoStartLSN
3818  * as the streaming start position instead of RecPtr, so
3819  * that when we later jump backwards to start redo at
3820  * RedoStartLSN, we will have the logs streamed already.
3821  */
3822  if (startWalReceiver &&
3823  PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
3824  {
3825  XLogRecPtr ptr;
3826  TimeLineID tli;
3827 
3828  if (fetching_ckpt)
3829  {
3830  ptr = RedoStartLSN;
3831  tli = RedoStartTLI;
3832  }
3833  else
3834  {
3835  ptr = RecPtr;
3836 
3837  /*
3838  * Use the record begin position to determine the
3839  * TLI, rather than the position we're reading.
3840  */
3841  tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
3842 
3843  if (curFileTLI > 0 && tli < curFileTLI)
3844  elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3845  LSN_FORMAT_ARGS(tliRecPtr),
3846  tli, curFileTLI);
3847  }
3848  curFileTLI = tli;
3853  flushedUpto = 0;
3854  }
3855 
3856  /*
3857  * Check if WAL receiver is active or wait to start up.
3858  */
3859  if (!WalRcvStreaming())
3860  {
3861  lastSourceFailed = true;
3862  break;
3863  }
3864 
3865  /*
3866  * Walreceiver is active, so see if new data has arrived.
3867  *
3868  * We only advance XLogReceiptTime when we obtain fresh
3869  * WAL from walreceiver and observe that we had already
3870  * processed everything before the most recent "chunk"
3871  * that it flushed to disk. In steady state where we are
3872  * keeping up with the incoming data, XLogReceiptTime will
3873  * be updated on each cycle. When we are behind,
3874  * XLogReceiptTime will not advance, so the grace time
3875  * allotted to conflicting queries will decrease.
3876  */
3877  if (RecPtr < flushedUpto)
3878  havedata = true;
3879  else
3880  {
3881  XLogRecPtr latestChunkStart;
3882 
3883  flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3884  if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
3885  {
3886  havedata = true;
3887  if (latestChunkStart <= RecPtr)
3888  {
3891  }
3892  }
3893  else
3894  havedata = false;
3895  }
3896  if (havedata)
3897  {
3898  /*
3899  * Great, streamed far enough. Open the file if it's
3900  * not open already. Also read the timeline history
3901  * file if we haven't initialized timeline history
3902  * yet; it should be streamed over and present in
3903  * pg_wal by now. Use XLOG_FROM_STREAM so that source
3904  * info is set correctly and XLogReceiptTime isn't
3905  * changed.
3906  *
3907  * NB: We must set readTimeLineHistory based on
3908  * recoveryTargetTLI, not receiveTLI. Normally they'll
3909  * be the same, but if recovery_target_timeline is
3910  * 'latest' and archiving is configured, then it's
3911  * possible that we managed to retrieve one or more
3912  * new timeline history files from the archive,
3913  * updating recoveryTargetTLI.
3914  */
3915  if (readFile < 0)
3916  {
3917  if (!expectedTLEs)
3920  receiveTLI,
3921  XLOG_FROM_STREAM, false);
3922  Assert(readFile >= 0);
3923  }
3924  else
3925  {
3926  /* just make sure source info is correct... */
3929  return XLREAD_SUCCESS;
3930  }
3931  break;
3932  }
3933 
3934  /* In nonblocking mode, return rather than sleeping. */
3935  if (nonblocking)
3936  return XLREAD_WOULDBLOCK;
3937 
3938  /*
3939  * Data not here yet. Check for trigger, then wait for
3940  * walreceiver to wake us up when new WAL arrives.
3941  */
3942  if (CheckForStandbyTrigger())
3943  {
3944  /*
3945  * Note that we don't return XLREAD_FAIL immediately
3946  * here. After being triggered, we still want to
3947  * replay all the WAL that was already streamed. It's
3948  * in pg_wal now, so we just treat this as a failure,
3949  * and the state machine will move on to replay the
3950  * streamed WAL from pg_wal, and then recheck the
3951  * trigger and exit replay.
3952  */
3953  lastSourceFailed = true;
3954  break;
3955  }
3956 
3957  /*
3958  * Since we have replayed everything we have received so
3959  * far and are about to start waiting for more WAL, let's
3960  * tell the upstream server our replay location now so
3961  * that pg_stat_replication doesn't show stale
3962  * information.
3963  */
3964  if (!streaming_reply_sent)
3965  {
3966  WalRcvForceReply();
3967  streaming_reply_sent = true;
3968  }
3969 
3970  /* Do any background tasks that might benefit us later. */
3972 
3973  /* Update pg_stat_recovery_prefetch before sleeping. */
3975 
3976  /*
3977  * Wait for more WAL to arrive, when we will be woken
3978  * immediately by the WAL receiver.
3979  */
3982  -1L,
3983  WAIT_EVENT_RECOVERY_WAL_STREAM);
3985  break;
3986  }
3987 
3988  default:
3989  elog(ERROR, "unexpected WAL source %d", currentSource);
3990  }
3991 
3992  /*
3993  * Check for recovery pause here so that we can confirm more quickly
3994  * that a requested pause has actually taken effect.
3995  */
3996  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
3998  recoveryPausesHere(false);
3999 
4000  /*
4001  * This possibly-long loop needs to handle interrupts of startup
4002  * process.
4003  */
4005  }
4006 
4007  return XLREAD_FAIL; /* not reached */
4008 }
4009 
4010 
4011 /*
4012  * Determine what log level should be used to report a corrupt WAL record
4013  * in the current WAL page, previously read by XLogPageRead().
4014  *
4015  * 'emode' is the error mode that would be used to report a file-not-found
4016  * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4017  * we're retrying the exact same record that we've tried previously, only
4018  * complain the first time to keep the noise down. However, we only do when
4019  * reading from pg_wal, because we don't expect any invalid records in archive
4020  * or in records streamed from the primary. Files in the archive should be complete,
4021  * and we should never hit the end of WAL because we stop and wait for more WAL
4022  * to arrive before replaying it.
4023  *
4024  * NOTE: This function remembers the RecPtr value it was last called with,
4025  * to suppress repeated messages about the same record. Only call this when
4026  * you are about to ereport(), or you might cause a later message to be
4027  * erroneously suppressed.
4028  */
4029 static int
4031 {
4032  static XLogRecPtr lastComplaint = 0;
4033 
4034  if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
4035  {
4036  if (RecPtr == lastComplaint)
4037  emode = DEBUG1;
4038  else
4039  lastComplaint = RecPtr;
4040  }
4041  return emode;
4042 }
4043 
4044 
4045 /*
4046  * Subroutine to try to fetch and validate a prior checkpoint record.
4047  */
4048 static XLogRecord *
4050  TimeLineID replayTLI)
4051 {
4052  XLogRecord *record;
4053  uint8 info;
4054 
4055  Assert(xlogreader != NULL);
4056 
4057  if (!XRecOffIsValid(RecPtr))
4058  {
4059  ereport(LOG,
4060  (errmsg("invalid checkpoint location")));
4061  return NULL;
4062  }
4063 
4065  record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4066 
4067  if (record == NULL)
4068  {
4069  ereport(LOG,
4070  (errmsg("invalid checkpoint record")));
4071  return NULL;
4072  }
4073  if (record->xl_rmid != RM_XLOG_ID)
4074  {
4075  ereport(LOG,
4076  (errmsg("invalid resource manager ID in checkpoint record")));
4077  return NULL;
4078  }
4079  info = record->xl_info & ~XLR_INFO_MASK;
4080  if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4081  info != XLOG_CHECKPOINT_ONLINE)
4082  {
4083  ereport(LOG,
4084  (errmsg("invalid xl_info in checkpoint record")));
4085  return NULL;
4086  }
4088  {
4089  ereport(LOG,
4090  (errmsg("invalid length of checkpoint record")));
4091  return NULL;
4092  }
4093  return record;
4094 }
4095 
4096 /*
4097  * Scan for new timelines that might have appeared in the archive since we
4098  * started recovery.
4099  *
4100  * If there are any, the function changes recovery target TLI to the latest
4101  * one and returns 'true'.
4102  */
4103 static bool
4105 {
4106  List *newExpectedTLEs;
4107  bool found;
4108  ListCell *cell;
4109  TimeLineID newtarget;
4110  TimeLineID oldtarget = recoveryTargetTLI;
4111  TimeLineHistoryEntry *currentTle = NULL;
4112 
4114  if (newtarget == recoveryTargetTLI)
4115  {
4116  /* No new timelines found */
4117  return false;
4118  }
4119 
4120  /*
4121  * Determine the list of expected TLIs for the new TLI
4122  */
4123 
4124  newExpectedTLEs = readTimeLineHistory(newtarget);
4125 
4126  /*
4127  * If the current timeline is not part of the history of the new timeline,
4128  * we cannot proceed to it.
4129  */
4130  found = false;
4131  foreach(cell, newExpectedTLEs)
4132  {
4133  currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4134 
4135  if (currentTle->tli == recoveryTargetTLI)
4136  {
4137  found = true;
4138  break;
4139  }
4140  }
4141  if (!found)
4142  {
4143  ereport(LOG,
4144  (errmsg("new timeline %u is not a child of database system timeline %u",
4145  newtarget,
4146  replayTLI)));
4147  return false;
4148  }
4149 
4150  /*
4151  * The current timeline was found in the history file, but check that the
4152  * next timeline was forked off from it *after* the current recovery
4153  * location.
4154  */
4155  if (currentTle->end < replayLSN)
4156  {
4157  ereport(LOG,
4158  (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4159  newtarget,
4160  replayTLI,
4161  LSN_FORMAT_ARGS(replayLSN))));
4162  return false;
4163  }
4164 
4165  /* The new timeline history seems valid. Switch target */
4166  recoveryTargetTLI = newtarget;
4168  expectedTLEs = newExpectedTLEs;
4169 
4170  /*
4171  * As in StartupXLOG(), try to ensure we have all the history files
4172  * between the old target and new target in pg_wal.
4173  */
4174  restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4175 
4176  ereport(LOG,
4177  (errmsg("new target timeline is %u",
4178  recoveryTargetTLI)));
4179 
4180  return true;
4181 }
4182 
4183 
4184 /*
4185  * Open a logfile segment for reading (during recovery).
4186  *
4187  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4188  * Otherwise, it's assumed to be already available in pg_wal.
4189  */
4190 static int
4191 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
4192  XLogSource source, bool notfoundOk)
4193 {
4194  char xlogfname[MAXFNAMELEN];
4195  char activitymsg[MAXFNAMELEN + 16];
4196  char path[MAXPGPATH];
4197  int fd;
4198 
4199  XLogFileName(xlogfname, tli, segno, wal_segment_size);
4200 
4201  switch (source)
4202  {
4203  case XLOG_FROM_ARCHIVE:
4204  /* Report recovery progress in PS display */
4205  snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4206  xlogfname);
4207  set_ps_display(activitymsg);
4208 
4209  if (!RestoreArchivedFile(path, xlogfname,
4210  "RECOVERYXLOG",
4212  InRedo))
4213  return -1;
4214  break;
4215 
4216  case XLOG_FROM_PG_WAL:
4217  case XLOG_FROM_STREAM:
4218  XLogFilePath(path, tli, segno, wal_segment_size);
4219  break;
4220 
4221  default:
4222  elog(ERROR, "invalid XLogFileRead source %d", source);
4223  }
4224 
4225  /*
4226  * If the segment was fetched from archival storage, replace the existing
4227  * xlog segment (if any) with the archival version.
4228  */
4229  if (source == XLOG_FROM_ARCHIVE)
4230  {
4232  KeepFileRestoredFromArchive(path, xlogfname);
4233 
4234  /*
4235  * Set path to point at the new file in pg_wal.
4236  */
4237  snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4238  }
4239 
4240  fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4241  if (fd >= 0)
4242  {
4243  /* Success! */
4244  curFileTLI = tli;
4245 
4246  /* Report recovery progress in PS display */
4247  snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4248  xlogfname);
4249  set_ps_display(activitymsg);
4250 
4251  /* Track source of data in assorted state variables */
4252  readSource = source;
4254  /* In FROM_STREAM case, caller tracks receipt time, not me */
4255  if (source != XLOG_FROM_STREAM)
4257 
4258  return fd;
4259  }
4260  if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4261  ereport(PANIC,
4263  errmsg("could not open file \"%s\": %m", path)));
4264  return -1;
4265 }
4266 
4267 /*
4268  * Open a logfile segment for reading (during recovery).
4269  *
4270  * This version searches for the segment with any TLI listed in expectedTLEs.
4271  */
4272 static int
4274 {
4275  char path[MAXPGPATH];
4276  ListCell *cell;
4277  int fd;
4278  List *tles;
4279 
4280  /*
4281  * Loop looking for a suitable timeline ID: we might need to read any of
4282  * the timelines listed in expectedTLEs.
4283  *
4284  * We expect curFileTLI on entry to be the TLI of the preceding file in
4285  * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4286  * to go backwards; this prevents us from picking up the wrong file when a
4287  * parent timeline extends to higher segment numbers than the child we
4288  * want to read.
4289  *
4290  * If we haven't read the timeline history file yet, read it now, so that
4291  * we know which TLIs to scan. We don't save the list in expectedTLEs,
4292  * however, unless we actually find a valid segment. That way if there is
4293  * neither a timeline history file nor a WAL segment in the archive, and
4294  * streaming replication is set up, we'll read the timeline history file
4295  * streamed from the primary when we start streaming, instead of
4296  * recovering with a dummy history generated here.
4297  */
4298  if (expectedTLEs)
4299  tles = expectedTLEs;
4300  else
4302 
4303  foreach(cell, tles)
4304  {
4306  TimeLineID tli = hent->tli;
4307 
4308  if (tli < curFileTLI)
4309  break; /* don't bother looking at too-old TLIs */
4310 
4311  /*
4312  * Skip scanning the timeline ID that the logfile segment to read
4313  * doesn't belong to
4314  */
4315  if (hent->begin != InvalidXLogRecPtr)
4316  {
4317  XLogSegNo beginseg = 0;
4318 
4319  XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4320 
4321  /*
4322  * The logfile segment that doesn't belong to the timeline is
4323  * older or newer than the segment that the timeline started or
4324  * ended at, respectively. It's sufficient to check only the
4325  * starting segment of the timeline here. Since the timelines are
4326  * scanned in descending order in this loop, any segments newer
4327  * than the ending segment should belong to newer timeline and
4328  * have already been read before. So it's not necessary to check
4329  * the ending segment of the timeline here.
4330  */
4331  if (segno < beginseg)
4332  continue;
4333  }
4334 
4336  {
4337  fd = XLogFileRead(segno, emode, tli,
4338  XLOG_FROM_ARCHIVE, true);
4339  if (fd != -1)
4340  {
4341  elog(DEBUG1, "got WAL segment from archive");
4342  if (!expectedTLEs)
4343  expectedTLEs = tles;
4344  return fd;
4345  }
4346  }
4347 
4349  {
4350  fd = XLogFileRead(segno, emode, tli,
4351  XLOG_FROM_PG_WAL, true);
4352  if (fd != -1)
4353  {
4354  if (!expectedTLEs)
4355  expectedTLEs = tles;
4356  return fd;
4357  }
4358  }
4359  }
4360 
4361  /* Couldn't find it. For simplicity, complain about front timeline */
4363  errno = ENOENT;
4364  ereport(emode,
4366  errmsg("could not open file \"%s\": %m", path)));
4367  return -1;
4368 }
4369 
4370 /*
4371  * Set flag to signal the walreceiver to restart. (The startup process calls
4372  * this on noticing a relevant configuration change.)
4373  */
4374 void
4376 {
4378  {
4379  ereport(LOG,
4380  (errmsg("WAL receiver process shutdown requested")));
4381 
4382  pendingWalRcvRestart = true;
4383  }
4384 }
4385 
4386 
4387 /*
4388  * Has a standby promotion already been triggered?
4389  *
4390  * Unlike CheckForStandbyTrigger(), this works in any process
4391  * that's connected to shared memory.
4392  */
4393 bool
4395 {
4396  /*
4397  * We check shared state each time only until a standby promotion is
4398  * triggered. We can't trigger a promotion again, so there's no need to
4399  * keep checking after the shared variable has once been seen true.
4400  */
4402  return true;
4403 
4407 
4408  return LocalPromoteIsTriggered;
4409 }
4410 
4411 static void
4413 {
4417 
4418  /*
4419  * Mark the recovery pause state as 'not paused' because the paused state
4420  * ends and promotion continues if a promotion is triggered while recovery
4421  * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4422  * return 'paused' while a promotion is ongoing.
4423  */
4424  SetRecoveryPause(false);
4425 
4426  LocalPromoteIsTriggered = true;
4427 }
4428 
4429 /*
4430  * Check whether a promote request has arrived.
4431  */
4432 static bool
4434 {
4436  return true;
4437 
4439  {
4440  ereport(LOG, (errmsg("received promote request")));
4444  return true;
4445  }
4446 
4447  return false;
4448 }
4449 
4450 /*
4451  * Remove the files signaling a standby promotion request.
4452  */
4453 void
4455 {
4456  unlink(PROMOTE_SIGNAL_FILE);
4457 }
4458 
4459 /*
4460  * Check to see if a promote request has arrived.
4461  */
4462 bool
4464 {
4465  struct stat stat_buf;
4466 
4467  if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4468  return true;
4469 
4470  return false;
4471 }
4472 
4473 /*
4474  * Wake up startup process to replay newly arrived WAL, or to notice that
4475  * failover has been requested.
4476  */
4477 void
4479 {
4481 }
4482 
4483 /*
4484  * Schedule a walreceiver wakeup in the main recovery loop.
4485  */
4486 void
4488 {
4490 }
4491 
4492 /*
4493  * Is HotStandby active yet? This is only important in special backends
4494  * since normal backends won't ever be able to connect until this returns
4495  * true. Postmaster knows this by way of signal, not via shared memory.
4496  *
4497  * Unlike testing standbyState, this works in any process that's connected to
4498  * shared memory. (And note that standbyState alone doesn't tell the truth
4499  * anyway.)
4500  */
4501 bool
4503 {
4504  /*
4505  * We check shared state each time only until Hot Standby is active. We
4506  * can't de-activate Hot Standby, so there's no need to keep checking
4507  * after the shared variable has once been seen true.
4508  */
4510  return true;
4511  else
4512  {
4513  /* spinlock is essential on machines with weak memory ordering! */
4517 
4518  return LocalHotStandbyActive;
4519  }
4520 }
4521 
4522 /*
4523  * Like HotStandbyActive(), but to be used only in WAL replay code,
4524  * where we don't need to ask any other process what the state is.
4525  */
4526 static bool
4528 {
4530  return LocalHotStandbyActive;
4531 }
4532 
4533 /*
4534  * Get latest redo apply position.
4535  *
4536  * Exported to allow WALReceiver to read the pointer directly.
4537  */
4538 XLogRecPtr
4540 {
4541  XLogRecPtr recptr;
4542  TimeLineID tli;
4543 
4548 
4549  if (replayTLI)
4550  *replayTLI = tli;
4551  return recptr;
4552 }
4553 
4554 
4555 /*
4556  * Get position of last applied, or the record being applied.
4557  *
4558  * This is different from GetXLogReplayRecPtr() in that if a WAL
4559  * record is currently being applied, this includes that record.
4560  */
4561 XLogRecPtr
4563 {
4564  XLogRecPtr recptr;
4565  TimeLineID tli;
4566 
4568  recptr = XLogRecoveryCtl->replayEndRecPtr;
4571 
4572  if (replayEndTLI)
4573  *replayEndTLI = tli;
4574  return recptr;
4575 }
4576 
4577 /*
4578  * Save timestamp of latest processed commit/abort record.
4579  *
4580  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4581  * seen by processes other than the startup process. Note in particular
4582  * that CreateRestartPoint is executed in the checkpointer.
4583  */
4584 static void
4586 {
4590 }
4591 
4592 /*
4593  * Fetch timestamp of latest processed commit/abort record.
4594  */
4597 {
4598  TimestampTz xtime;
4599 
4603 
4604  return xtime;
4605 }
4606 
4607 /*
4608  * Save timestamp of the next chunk of WAL records to apply.
4609  *
4610  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4611  * seen by all backends.
4612  */
4613 static void
4615 {
4619 }
4620 
4621 /*
4622  * Fetch timestamp of the current chunk of WAL records being applied.
4623  * Startup process maintains an accurate local copy in XLogReceiptTime
4624  */
4627 {
4628  TimestampTz xtime;
4629 
4633 
4634  return xtime;
4635 }
4636 
4637 /*
4638  * Returns time of receipt of current chunk of XLOG data, as well as
4639  * whether it was received from streaming replication or from archives.
4640  */
4641 void
4642 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4643 {
4644  /*
4645  * This must be executed in the startup process, since we don't export the
4646  * relevant state to shared memory.
4647  */
4648  Assert(InRecovery);
4649 
4650  *rtime = XLogReceiptTime;
4651  *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4652 }
4653 
4654 /*
4655  * Complain if currValue is lower than minValue, the value on the primary server.
4656  * Note that param_name is a parameter name and so does not require translation.
4657  */
4658 void
4659 RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4660 {
4661  if (currValue < minValue)
4662  {
4664  {
4665  bool warned_for_promote = false;
4666 
4667  ereport(WARNING,
4668  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4669  errmsg("hot standby is not possible because of insufficient parameter settings"),
4670  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4671  param_name,
4672  currValue,
4673  minValue)));
4674 
4675  SetRecoveryPause(true);
4676 
4677  ereport(LOG,
4678  (errmsg("recovery has paused"),
4679  errdetail("If recovery is unpaused, the server will shut down."),
4680  errhint("You can then restart the server after making the necessary configuration changes.")));
4681 
4683  {
4685 
4686  if (CheckForStandbyTrigger())
4687  {
4688  if (!warned_for_promote)
4689  ereport(WARNING,
4690  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4691  errmsg("promotion is not possible because of insufficient parameter settings"),
4692 
4693  /*
4694  * Repeat the detail from above so it's easy to find
4695  * in the log.
4696  */
4697  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4698  param_name,
4699  currValue,
4700  minValue),
4701  errhint("Restart the server after making the necessary configuration changes.")));
4702  warned_for_promote = true;
4703  }
4704 
4705  /*
4706  * If recovery pause is requested then set it paused. While
4707  * we are in the loop, user might resume and pause again so
4708  * set this every time.
4709  */
4711 
4712  /*
4713  * We wait on a condition variable that will wake us as soon
4714  * as the pause ends, but we use a timeout so we can check the
4715  * above conditions periodically too.
4716  */
4718  WAIT_EVENT_RECOVERY_PAUSE);
4719  }
4721  }
4722 
4723  ereport(FATAL,
4724  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4725  errmsg("recovery aborted because of insufficient parameter settings"),
4726  /* Repeat the detail from above so it's easy to find in the log. */
4727  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4728  param_name,
4729  currValue,
4730  minValue),
4731  errhint("You can restart the server after making the necessary configuration changes.")));
4732  }
4733 }
4734 
4735 
4736 /*
4737  * GUC check_hook for primary_slot_name
4738  */
4739 bool
4741 {
4742  if (*newval && strcmp(*newval, "") != 0 &&
4744  return false;
4745 
4746  return true;
4747 }
4748 
4749 /*
4750  * Recovery target settings: Only one of the several recovery_target* settings
4751  * may be set. Setting a second one results in an error. The global variable
4752  * recoveryTarget tracks which kind of recovery target was chosen. Other
4753  * variables store the actual target value (for example a string or a xid).
4754  * The assign functions of the parameters check whether a competing parameter
4755  * was already set. But we want to allow setting the same parameter multiple
4756  * times. We also want to allow unsetting a parameter and setting a different
4757  * one, so we unset recoveryTarget when the parameter is set to an empty
4758  * string.
4759  *
4760  * XXX this code is broken by design. Throwing an error from a GUC assign
4761  * hook breaks fundamental assumptions of guc.c. So long as all the variables
4762  * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4763  * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4764  * that we have odd behaviors such as unexpected GUC ordering dependencies.
4765  */
4766 
4767 static void
4769 error_multiple_recovery_targets(void)
4770 {
4771  ereport(ERROR,
4772  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4773  errmsg("multiple recovery targets specified"),
4774  errdetail("At most one of recovery_target, recovery_target_lsn, recovery_target_name, recovery_target_time, recovery_target_xid may be set.")));
4775 }
4776 
4777 /*
4778  * GUC check_hook for recovery_target
4779  */
4780 bool
4782 {
4783  if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4784  {
4785  GUC_check_errdetail("The only allowed value is \"immediate\".");
4786  return false;
4787  }
4788  return true;
4789 }
4790 
4791 /*
4792  * GUC assign_hook for recovery_target
4793  */
4794 void
4795 assign_recovery_target(const char *newval, void *extra)
4796 {
4799  error_multiple_recovery_targets();
4800 
4801  if (newval && strcmp(newval, "") != 0)
4803  else
4805 }
4806 
4807 /*
4808  * GUC check_hook for recovery_target_lsn
4809  */
4810 bool
4812 {
4813  if (strcmp(*newval, "") != 0)
4814  {
4815  XLogRecPtr lsn;
4816  XLogRecPtr *myextra;
4817  bool have_error = false;
4818 
4819  lsn = pg_lsn_in_internal(*newval, &have_error);
4820  if (have_error)
4821  return false;
4822 
4823  myextra = (XLogRecPtr *) guc_malloc(ERROR, sizeof(XLogRecPtr));
4824  *myextra = lsn;
4825  *extra = (void *) myextra;
4826  }
4827  return true;
4828 }
4829 
4830 /*
4831  * GUC assign_hook for recovery_target_lsn
4832  */
4833 void
4834 assign_recovery_target_lsn(const char *newval, void *extra)
4835 {
4838  error_multiple_recovery_targets();
4839 
4840  if (newval && strcmp(newval, "") != 0)
4841  {
4843  recoveryTargetLSN = *((XLogRecPtr *) extra);
4844  }
4845  else
4847 }
4848 
4849 /*
4850  * GUC check_hook for recovery_target_name
4851  */
4852 bool
4854 {
4855  /* Use the value of newval directly */
4856  if (strlen(*newval) >= MAXFNAMELEN)
4857  {
4858  GUC_check_errdetail("%s is too long (maximum %d characters).",
4859  "recovery_target_name", MAXFNAMELEN - 1);
4860  return false;
4861  }
4862  return true;
4863 }
4864 
4865 /*
4866  * GUC assign_hook for recovery_target_name
4867  */
4868 void
4869 assign_recovery_target_name(const char *newval, void *extra)
4870 {
4873  error_multiple_recovery_targets();
4874 
4875  if (newval && strcmp(newval, "") != 0)
4876  {
4879  }
4880  else
4882 }
4883 
4884 /*
4885  * GUC check_hook for recovery_target_time
4886  *
4887  * The interpretation of the recovery_target_time string can depend on the
4888  * time zone setting, so we need to wait until after all GUC processing is
4889  * done before we can do the final parsing of the string. This check function
4890  * only does a parsing pass to catch syntax errors, but we store the string
4891  * and parse it again when we need to use it.
4892  */
4893 bool
4895 {
4896  if (strcmp(*newval, "") != 0)
4897  {
4898  /* reject some special values */
4899  if (strcmp(*newval, "now") == 0 ||
4900  strcmp(*newval, "today") == 0 ||
4901  strcmp(*newval, "tomorrow") == 0 ||
4902  strcmp(*newval, "yesterday") == 0)
4903  {
4904  return false;
4905  }
4906 
4907  /*
4908  * parse timestamp value (see also timestamptz_in())
4909  */
4910  {
4911  char *str = *newval;
4912  fsec_t fsec;
4913  struct pg_tm tt,
4914  *tm = &tt;
4915  int tz;
4916  int dtype;
4917  int nf;
4918  int dterr;
4919  char *field[MAXDATEFIELDS];
4920  int ftype[MAXDATEFIELDS];
4921  char workbuf[MAXDATELEN + MAXDATEFIELDS];
4922  DateTimeErrorExtra dtextra;
4924 
4925  dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4926  field, ftype, MAXDATEFIELDS, &nf);
4927  if (dterr == 0)
4928  dterr = DecodeDateTime(field, ftype, nf,
4929  &dtype, tm, &fsec, &tz, &dtextra);
4930  if (dterr != 0)
4931  return false;
4932  if (dtype != DTK_DATE)
4933  return false;
4934 
4935  if (tm2timestamp(tm, fsec, &tz, &timestamp) != 0)
4936  {
4937  GUC_check_errdetail("timestamp out of range: \"%s\"", str);
4938  return false;
4939  }
4940  }
4941  }
4942  return true;
4943 }
4944 
4945 /*
4946  * GUC assign_hook for recovery_target_time
4947  */
4948 void
4949 assign_recovery_target_time(const char *newval, void *extra)
4950 {
4953  error_multiple_recovery_targets();
4954 
4955  if (newval && strcmp(newval, "") != 0)
4957  else
4959 }
4960 
4961 /*
4962  * GUC check_hook for recovery_target_timeline
 *
 * The keywords "current" and "latest" are recognized explicitly; any other
 * value is run through strtoul() with base 0. If that call reports EINVAL
 * or ERANGE, a detail message is recorded and false is returned. Otherwise
 * rttg is stored through myextra, which is handed back via *extra, and the
 * hook returns true.
4963  */
4964 bool
4966 {
4968  RecoveryTargetTimeLineGoal *myextra;
4969 
4970  if (strcmp(*newval, "current") == 0)
4972  else if (strcmp(*newval, "latest") == 0)
4974  else
4975  {
4977 
 /*
  * Only errno is examined after strtoul(): the converted value is
  * discarded and no end pointer is requested, so input with trailing
  * non-numeric characters (e.g. "1abc") is not rejected by this check.
  * NOTE(review): consider passing an endptr and requiring that it
  * points at the terminating NUL.
  */
4978  errno = 0;
4979  strtoul(*newval, NULL, 0);
4980  if (errno == EINVAL || errno == ERANGE)
4981  {
4982  GUC_check_errdetail("recovery_target_timeline is not a valid number.");
4983  return false;
4984  }
4985  }
4986 
4988  *myextra = rttg;
4989  *extra = (void *) myextra;
4990 
4991  return true;
4992 }
4993 
4994 /*
4995  * GUC assign_hook for recovery_target_timeline
4996  */
4997 void
4998 assign_recovery_target_timeline(const char *newval, void *extra)
4999 {
5002  recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
5003  else
5005 }
5006 
5007 /*
5008  * GUC check_hook for recovery_target_xid
 *
 * An empty string is accepted as-is and leaves *extra untouched. Any other
 * value is parsed with strtou64() (base 0); the result, cast to
 * TransactionId, is stored in guc_malloc'd memory returned through *extra.
 * Returns false, without recording a detail message, when strtou64()
 * reports EINVAL or ERANGE.
5009  */
5010 bool
5012 {
5013  if (strcmp(*newval, "") != 0)
5014  {
5015  TransactionId xid;
5016  TransactionId *myextra;
5017 
 /*
  * No end pointer is requested, so trailing non-numeric characters are
  * not detected here; only errno is checked.
  * NOTE(review): the 64-bit result is narrowed by the cast below.
  * TransactionId appears to be a 32-bit type, so a value beyond that
  * range would presumably wrap instead of being rejected -- confirm.
  */
5018  errno = 0;
5019  xid = (TransactionId) strtou64(*newval, NULL, 0);
5020  if (errno == EINVAL || errno == ERANGE)
5021  return false;
5022 
 /* Pass the parsed xid on through *extra. */
5023  myextra = (TransactionId *) guc_malloc(ERROR, sizeof(TransactionId));
5024  *myextra = xid;
5025  *extra = (void *) myextra;
5026  }
5027  return true;
5028 }
5029 
5030 /*
5031  * GUC assign_hook for recovery_target_xid
5032  */
5033 void
5034 assign_recovery_target_xid(const char *newval, void *extra)
5035 {
5038  error_multiple_recovery_targets();
5039 
5040  if (newval && strcmp(newval, "") != 0)
5041  {
5043  recoveryTargetXid = *((TransactionId *) extra);
5044  }
5045  else
5047 }
TimeLineID findNewestTimeLine(TimeLineID startTLI)
Definition: timeline.c:264
TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history)
Definition: timeline.c:544
XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI)
Definition: timeline.c:572
bool existsTimeLineHistory(TimeLineID probeTLI)
Definition: timeline.c:222
void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end)
Definition: timeline.c:50
List * readTimeLineHistory(TimeLineID targetTLI)
Definition: timeline.c:76
bool tliInHistory(TimeLineID tli, List *expectedTLEs)
Definition: timeline.c:526
void remove_tablespace_symlink(const char *linkloc)
Definition: tablespace.c:883
bool allow_in_place_tablespaces
Definition: tablespace.c:85
void HandleStartupProcInterrupts(void)
Definition: startup.c:154
void disable_startup_progress_timeout(void)
Definition: startup.c:309
bool IsPromoteSignaled(void)
Definition: startup.c:288
void begin_startup_progress_phase(void)
Definition: startup.c:343
void ResetPromoteSignaled(void)
Definition: startup.c:294
int ParseDateTime(const char *timestr, char *workbuf, size_t buflen, char **field, int *ftype, int maxfields, int *numfields)
Definition: datetime.c:754
int DecodeDateTime(char **field, int *ftype, int nf, int *dtype, struct pg_tm *tm, fsec_t *fsec, int *tzp, DateTimeErrorExtra *extra)
Definition: datetime.c:978
long TimestampDifferenceMilliseconds(TimestampTz start_time, TimestampTz stop_time)
Definition: timestamp.c:1766
int tm2timestamp(struct pg_tm *tm, fsec_t fsec, int *tzp, Timestamp *result)
Definition: timestamp.c:1997
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1790
Datum timestamptz_in(PG_FUNCTION_ARGS)
Definition: timestamp.c:416
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1654
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1618
const char * timestamptz_to_str(TimestampTz t)
Definition: timestamp.c:1853
uint32 BlockNumber
Definition: block.h:31
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4867
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5085
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:408
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:199
@ RBM_NORMAL_NO_LOG
Definition: bufmgr.h:51
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:359
Pointer Page
Definition: bufpage.h:78
static XLogRecPtr PageGetLSN(Page page)
Definition: bufpage.h:383
unsigned int uint32
Definition: c.h:506
signed int int32
Definition: c.h:494
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:182
#define Assert(condition)
Definition: c.h:858
#define PG_BINARY
Definition: c.h:1273
#define UINT64_FORMAT
Definition: c.h:549
#define strtou64(str, endptr, base)
Definition: c.h:1298
unsigned char uint8
Definition: c.h:504
uint32 TransactionId
Definition: c.h:652
size_t Size
Definition: c.h:605
void RequestCheckpoint(int flags)
Definition: checkpointer.c:941
bool ConditionVariableCancelSleep(void)
bool ConditionVariableTimedSleep(ConditionVariable *cv, long timeout, uint32 wait_event_info)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
int64 TimestampTz
Definition: timestamp.h:39
int32 fsec_t
Definition: timestamp.h:41
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1159
int errcode_for_file_access(void)
Definition: elog.c:882
int errdetail(const char *fmt,...)
Definition: elog.c:1205
ErrorContextCallback * error_context_stack
Definition: elog.c:94
int errhint(const char *fmt,...)
Definition: elog.c:1319
int errcode(int sqlerrcode)
Definition: elog.c:859
int errmsg(const char *fmt,...)
Definition: elog.c:1072
#define LOG
Definition: elog.h:31
#define errcontext
Definition: elog.h:196
#define FATAL
Definition: elog.h:41
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:224
#define ereport(elevel,...)
Definition: elog.h:149
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2909
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2583
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1109
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:782
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1087
int FreeFile(FILE *file)
Definition: fd.c:2781
int pg_fsync(int fd)
Definition: fd.c:386
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2843
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:525
@ PGFILETYPE_LNK
Definition: file_utils.h:24
#define DirectFunctionCall3(func, arg1, arg2, arg3)
Definition: fmgr.h:646
bool IsUnderPostmaster
Definition: globals.c:117
char * DataDir
Definition: globals.c:68
bool IsPostmasterEnvironment
Definition: globals.c:116
void * guc_malloc(int elevel, size_t size)
Definition: guc.c:640
#define newval
#define GUC_check_errdetail
Definition: guc.h:448
GucSource
Definition: guc.h:108
const char * str
#define MAXDATEFIELDS
Definition: datetime.h:202
#define DTK_DATE
Definition: datetime.h:144
#define MAXDATELEN
Definition: datetime.h:200
#define close(a)
Definition: win32.h:12
void proc_exit(int code)
Definition: ipc.c:104
int i
Definition: isn.c:73
void OwnLatch(Latch *latch)
Definition: latch.c:463
void DisownLatch(Latch *latch)
Definition: latch.c:489
void InitSharedLatch(Latch *latch)
Definition: latch.c:430
void SetLatch(Latch *latch)
Definition: latch.c:632
void ResetLatch(Latch *latch)
Definition: latch.c:724
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:517
#define WL_TIMEOUT
Definition: latch.h:130
#define WL_EXIT_ON_PM_DEATH
Definition: latch.h:132
#define WL_LATCH_SET
Definition: latch.h:127
List * lappend(List *list, void *datum)
Definition: list.c:339
void list_free_deep(List *list)
Definition: list.c:1560
static struct pg_tm tm
Definition: localtime.c:104
char * pstrdup(const char *in)
Definition: mcxt.c:1695
void pfree(void *pointer)
Definition: mcxt.c:1520
void * palloc0(Size size)
Definition: mcxt.c:1346
void * palloc(Size size)
Definition: mcxt.c:1316
#define AmStartupProcess()
Definition: miscadmin.h:382
#define IsBootstrapProcessingMode()
Definition: miscadmin.h:454
void * arg
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
#define MAXPGPATH
#define XLOG_RESTORE_POINT
Definition: pg_control.h:74
#define XLOG_CHECKPOINT_REDO
Definition: pg_control.h:81
#define XLOG_OVERWRITE_CONTRECORD
Definition: pg_control.h:80
DBState
Definition: pg_control.h:89
@ DB_IN_ARCHIVE_RECOVERY
Definition: pg_control.h:95
@ DB_SHUTDOWNED_IN_RECOVERY
Definition: pg_control.h:92
@ DB_SHUTDOWNED
Definition: pg_control.h:91
@ DB_IN_CRASH_RECOVERY
Definition: pg_control.h:94
#define XLOG_CHECKPOINT_SHUTDOWN
Definition: pg_control.h:67
#define XLOG_BACKUP_END
Definition: pg_control.h:72
#define XLOG_CHECKPOINT_ONLINE
Definition: pg_control.h:68
#define XLOG_END_OF_RECOVERY
Definition: pg_control.h:76
const void size_t len
#define lfirst(lc)
Definition: pg_list.h:172
#define NIL
Definition: pg_list.h:68
XLogRecPtr pg_lsn_in_internal(const char *str, bool *have_error)
Definition: pg_lsn.c:29
static rewind_source * source
Definition: pg_rewind.c:89
const char * pg_rusage_show(const PGRUsage *ru0)
Definition: pg_rusage.c:40
void pg_rusage_init(PGRUsage *ru0)
Definition: pg_rusage.c:27
static char * buf
Definition: pg_test_fsync.c:73
int64 timestamp
void SendPostmasterSignal(PMSignalReason reason)
Definition: pmsignal.c:181
@ PMSIGNAL_RECOVERY_STARTED
Definition: pmsignal.h:35
@ PMSIGNAL_BEGIN_HOT_STANDBY
Definition: pmsignal.h:36
#define pg_pread
Definition: port.h:225
#define snprintf
Definition: port.h:238
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:252
static Datum CStringGetDatum(const char *X)
Definition: postgres.h:350
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:212
#define InvalidOid
Definition: postgres_ext.h:36
static int fd(const char *x, int i)
Definition: preproc-init.c:105
void RecordKnownAssignedTransactionIds(TransactionId xid)
Definition: procarray.c:4387
void KnownAssignedTransactionIdsIdleMaintenance(void)
Definition: procarray.c:4524
static void set_ps_display(const char *activity)
Definition: ps_status.h:40
char * psprintf(const char *fmt,...)
Definition: psprintf.c:46
ForkNumber
Definition: relpath.h:48
@ MAIN_FORKNUM
Definition: relpath.h:50
void RmgrStartup(void)
Definition: rmgr.c:58
void RmgrCleanup(void)
Definition: rmgr.c:74
int slock_t
Definition: s_lock.h:735
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:387
static pg_noinline void Size size
Definition: slab.c:607
bool ReplicationSlotValidateName(const char *name, int elevel)
Definition: slot.c:252
void ShutDownSlotSync(void)
Definition: slotsync.c:1562
#define SpinLockInit(lock)
Definition: spin.h:60
#define SpinLockRelease(lock)
Definition: spin.h:64
#define SpinLockAcquire(lock)
Definition: spin.h:62
#define ereport_startup_progress(msg,...)
Definition: startup.h:18
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition: stringinfo.c:97
void appendStringInfoString(StringInfo str, const char *s)
Definition: stringinfo.c:182
void appendStringInfoChar(StringInfo str, char ch)
Definition: stringinfo.c:194
void initStringInfo(StringInfo str)
Definition: stringinfo.c:59
Oid oldestMultiDB
Definition: pg_control.h:50
MultiXactId oldestMulti
Definition: pg_control.h:49
MultiXactOffset nextMultiOffset
Definition: pg_control.h:46
TransactionId newestCommitTsXid
Definition: pg_control.h:54
TransactionId oldestXid
Definition: pg_control.h:47
TimeLineID PrevTimeLineID
Definition: pg_control.h:40
TimeLineID ThisTimeLineID
Definition: pg_control.h:39
Oid nextOid
Definition: pg_control.h:44
MultiXactId nextMulti
Definition: pg_control.h:45
FullTransactionId nextXid
Definition: pg_control.h:43
TransactionId oldestCommitTsXid
Definition: pg_control.h:52
XLogRecPtr redo
Definition: pg_control.h:37
Oid oldestXidDB
Definition: pg_control.h:48
XLogRecPtr backupStartPoint
Definition: pg_control.h:169
bool backupEndRequired
Definition: pg_control.h:171
CheckPoint checkPointCopy
Definition: pg_control.h:134
XLogRecPtr backupEndPoint
Definition: pg_control.h:170
XLogRecPtr minRecoveryPoint
Definition: pg_control.h:167
XLogRecPtr checkPoint
Definition: pg_control.h:132
uint64 system_identifier
Definition: pg_control.h:109
TimeLineID minRecoveryPointTLI
Definition: pg_control.h:168
Definition: dirent.c:26
XLogRecPtr lastPageBeginPtr
Definition: xlogrecovery.h:111
XLogRecPtr abortedRecPtr
Definition: xlogrecovery.h:120
XLogRecPtr missingContrecPtr
Definition: xlogrecovery.h:121
TimeLineID endOfLogTLI
Definition: xlogrecovery.h:109
struct ErrorContextCallback * previous
Definition: elog.h:295
void(* callback)(void *arg)
Definition: elog.h:296
Definition: latch.h:113
Definition: pg_list.h:54
RelFileNumber relNumber
void(* rm_mask)(char *pagedata, BlockNumber blkno)
void(* rm_redo)(XLogReaderState *record)
const char *(* rm_identify)(uint8 info)
const char * rm_name
void(* rm_desc)(StringInfo buf, XLogReaderState *record)
XLogRecPtr begin
Definition: timeline.h:28
TimeLineID tli
Definition: timeline.h:27
XLogRecPtr end
Definition: timeline.h:29
TimeLineID ws_tli
Definition: xlogreader.h:49
TimeLineID replayTLI
Definition: xlogrecovery.c:199
XLogRecPtr missingContrecPtr
Definition: xlogreader.h:215
char * errormsg_buf
Definition: xlogreader.h:311
XLogRecPtr EndRecPtr
Definition: xlogreader.h:207
uint64 system_identifier
Definition: xlogreader.h:191
XLogRecPtr ReadRecPtr
Definition: xlogreader.h:206
XLogRecPtr abortedRecPtr
Definition: xlogreader.h:214
TimeLineID latestPageTLI
Definition: xlogreader.h:280
XLogRecPtr overwrittenRecPtr
Definition: xlogreader.h:217
XLogRecPtr latestPagePtr
Definition: xlogreader.h:279
WALOpenSegment seg
Definition: xlogreader.h:272
void * private_data
Definition: xlogreader.h:196
uint8 xl_info
Definition: xlogrecord.h:46
uint32 xl_tot_len
Definition: xlogrecord.h:43
TransactionId xl_xid
Definition: xlogrecord.h:44
RmgrId xl_rmid
Definition: xlogrecord.h:47
ConditionVariable recoveryNotPausedCV
Definition: xlogrecovery.c:359
XLogRecPtr lastReplayedEndRecPtr
Definition: xlogrecovery.c:339
TimeLineID replayEndTLI
Definition: xlogrecovery.c:348
TimeLineID lastReplayedTLI
Definition: xlogrecovery.c:340
TimestampTz currentChunkStartTime
Definition: xlogrecovery.c:356
XLogRecPtr replayEndRecPtr
Definition: xlogrecovery.c:347
TimestampTz recoveryLastXTime
Definition: xlogrecovery.c:350
RecoveryPauseState recoveryPauseState
Definition: xlogrecovery.c:358
XLogRecPtr lastReplayedReadRecPtr
Definition: xlogrecovery.c:338
Definition: guc.h:170
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
Definition: pgtime.h:35
Definition: regguts.h:323
TimeLineID PrevTimeLineID
TimeLineID ThisTimeLineID
char rp_name[MAXFNAMELEN]
TransactionId twophase_xid
Definition: xact.h:421
TransactionId twophase_xid
Definition: xact.h:391
#define InvalidTransactionId
Definition: transam.h:31
#define U64FromFullTransactionId(x)
Definition: transam.h:49
#define XidFromFullTransactionId(x)
Definition: transam.h:48
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define TransactionIdIsNormal(xid)
Definition: transam.h:42
#define TimestampTzPlusMilliseconds(tz, ms)
Definition: timestamp.h:85
static TimestampTz DatumGetTimestampTz(Datum X)
Definition: timestamp.h:34
void AdvanceNextFullTransactionIdPastXid(TransactionId xid)
Definition: varsup.c:304
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:88
static void pgstat_report_wait_end(void)
Definition: wait_event.h:104
void WalRcvForceReply(void)
Definition: walreceiver.c:1358
#define AllowCascadeReplication()
Definition: walreceiver.h:41
XLogRecPtr GetWalRcvFlushRecPtr(XLogRecPtr *latestChunkStart, TimeLineID *receiveTLI)
bool WalRcvStreaming(void)
void RequestXLogStreaming(TimeLineID tli, XLogRecPtr recptr, const char *conninfo, const char *slotname, bool create_temp_slot)
bool WalRcvRunning(void)
void WalSndWakeup(bool physical, bool logical)
Definition: walsender.c:3666
#define stat
Definition: win32_port.h:284
#define S_IRUSR
Definition: win32_port.h:289
#define symlink(oldpath, newpath)
Definition: win32_port.h:235
#define S_IWUSR
Definition: win32_port.h:292
#define XLOG_XACT_COMMIT_PREPARED
Definition: xact.h:172
#define XLOG_XACT_COMMIT
Definition: xact.h:169
#define XLOG_XACT_OPMASK
Definition: xact.h:179
#define XLOG_XACT_ABORT
Definition: xact.h:171
#define XLOG_XACT_ABORT_PREPARED
Definition: xact.h:173
void ParseCommitRecord(uint8 info, xl_xact_commit *xlrec, xl_xact_parsed_commit *parsed)
Definition: xactdesc.c:35
void ParseAbortRecord(uint8 info, xl_xact_abort *xlrec, xl_xact_parsed_abort *parsed)
Definition: xactdesc.c:141
int wal_decode_buffer_size
Definition: xlog.c:136
bool EnableHotStandby
Definition: xlog.c:121
XLogRecPtr GetRedoRecPtr(void)
Definition: xlog.c:6393
void SetInstallXLogFileSegmentActive(void)
Definition: xlog.c:9404
bool IsInstallXLogFileSegmentActive(void)
Definition: xlog.c:9412
int wal_segment_size
Definition: xlog.c:143
void SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI)
Definition: xlog.c:6165
void RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
Definition: xlog.c:3929
void ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
Definition: xlog.c:6203
int wal_retrieve_retry_interval
Definition: xlog.c:134
static ControlFileData * ControlFile
Definition: xlog.c:569
void XLogShutdownWalRcv(void)
Definition: xlog.c:9393
bool XLogCheckpointNeeded(XLogSegNo new_segno)
Definition: xlog.c:2285
#define TABLESPACE_MAP_OLD
Definition: xlog.h:302
#define TABLESPACE_MAP
Definition: xlog.h:301
#define STANDBY_SIGNAL_FILE
Definition: xlog.h:297
#define CHECKPOINT_CAUSE_XLOG
Definition: xlog.h:146
#define PROMOTE_SIGNAL_FILE
Definition: xlog.h:305
#define BACKUP_LABEL_FILE
Definition: xlog.h:298
#define RECOVERY_SIGNAL_FILE
Definition: xlog.h:296
static RmgrData GetRmgr(RmgrId rmid)
@ RECOVERY_TARGET_ACTION_PAUSE
@ RECOVERY_TARGET_ACTION_PROMOTE
@ RECOVERY_TARGET_ACTION_SHUTDOWN
#define XLogSegmentOffset(xlogptr, wal_segsz_bytes)
#define MAXFNAMELEN
#define XLOGDIR
#define XLByteToSeg(xlrp, logSegNo, wal_segsz_bytes)
static void XLogFilePath(char *path, TimeLineID tli, XLogSegNo logSegNo, int wal_segsz_bytes)
#define XRecOffIsValid(xlrp)
static void XLogFileName(char *fname, TimeLineID tli, XLogSegNo logSegNo, int wal_segsz_bytes)
#define XLByteInSeg(xlrp, logSegNo, wal_segsz_bytes)
bool RestoreArchivedFile(char *path, const char *xlogfname, const char *recovername, off_t expectedSize, bool cleanupEnabled)
Definition: xlogarchive.c:54
void KeepFileRestoredFromArchive(const char *path, const char *xlogfname)
Definition: xlogarchive.c:358
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:43
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
uint32 TimeLineID
Definition: xlogdefs.h:59
uint64 XLogSegNo
Definition: xlogdefs.h:48
void XLogPrefetcherComputeStats(XLogPrefetcher *prefetcher)
XLogRecord * XLogPrefetcherReadRecord(XLogPrefetcher *prefetcher, char **errmsg)
void XLogPrefetchReconfigure(void)
XLogPrefetcher * XLogPrefetcherAllocate(XLogReaderState *reader)
void XLogPrefetcherBeginRead(XLogPrefetcher *prefetcher, XLogRecPtr recPtr)
void XLogPrefetcherFree(XLogPrefetcher *prefetcher)
XLogReaderState * XLogPrefetcherGetReader(XLogPrefetcher *prefetcher)
bool XLogRecGetBlockTagExtended(XLogReaderState *record, uint8 block_id, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum, Buffer *prefetch_buffer)
Definition: xlogreader.c:1997
void XLogReaderSetDecodeBuffer(XLogReaderState *state, void *buffer, size_t size)
Definition: xlogreader.c:90
void XLogReaderResetError(XLogReaderState *state)
Definition: xlogreader.c:1365
bool XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, char *phdr)
Definition: xlogreader.c:1224
void XLogReaderFree(XLogReaderState *state)
Definition: xlogreader.c:161
XLogReaderState * XLogReaderAllocate(int wal_segment_size, const char *waldir, XLogReaderRoutine *routine, void *private_data)
Definition: xlogreader.c:106
bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
Definition: xlogreader.c:2056
#define XLogRecGetDataLen(decoder)
Definition: xlogreader.h:416
#define XLogRecGetInfo(decoder)
Definition: xlogreader.h:410
#define XLogRecBlockImageApply(decoder, block_id)
Definition: xlogreader.h:425
#define XLogRecGetRmid(decoder)
Definition: xlogreader.h:411
#define XLogRecGetData(decoder)
Definition: xlogreader.h:415
#define XLogRecGetXid(decoder)
Definition: xlogreader.h:412
#define XL_ROUTINE(...)
Definition: xlogreader.h:117
#define XLogRecMaxBlockId(decoder)
Definition: xlogreader.h:418
XLogPageReadResult
Definition: xlogreader.h:350
@ XLREAD_WOULDBLOCK
Definition: xlogreader.h:353
@ XLREAD_SUCCESS
Definition: xlogreader.h:351
@ XLREAD_FAIL
Definition: xlogreader.h:352
#define XLogRecHasBlockImage(decoder, block_id)
Definition: xlogreader.h:423
#define XLogRecGetPrev(decoder)
Definition: xlogreader.h:409
#define XLogRecHasAnyBlockRefs(decoder)
Definition: xlogreader.h:417
#define SizeOfXLogRecordDataHeaderShort
Definition: xlogrecord.h:217
#define XLR_INFO_MASK
Definition: xlogrecord.h:62
#define SizeOfXLogRecord
Definition: xlogrecord.h:55
#define XLR_CHECK_CONSISTENCY
Definition: xlogrecord.h:91
bool reachedConsistency
Definition: xlogrecovery.c:294
bool check_primary_slot_name(char **newval, void **extra, GucSource source)
static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
static XLogRecPtr recoveryStopLSN
Definition: xlogrecovery.c:381
static bool recoveryStopsBefore(XLogReaderState *record)
static TimestampTz recoveryStopTime
Definition: xlogrecovery.c:380
void assign_recovery_target_xid(const char *newval, void *extra)
static bool CheckForStandbyTrigger(void)
int recovery_min_apply_delay
Definition: xlogrecovery.c:93
bool check_recovery_target(char **newval, void **extra, GucSource source)
static bool backupEndRequired
Definition: xlogrecovery.c:283
bool HotStandbyActive(void)
static char * getRecoveryStopReason(void)
void ShutdownWalRecovery(void)
RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal
Definition: xlogrecovery.c:120
int recoveryTargetAction
Definition: xlogrecovery.c:87
static void rm_redo_error_callback(void *arg)
static bool recoveryApplyDelay(XLogReaderState *record)
bool ArchiveRecoveryRequested
Definition: xlogrecovery.c:137
const char * recoveryTargetName
Definition: xlogrecovery.c:91
static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
static void pg_attribute_noreturn() error_multiple_recovery_targets(void)
bool check_recovery_target_timeline(char **newval, void **extra, GucSource source)
static XLogRecPtr minRecoveryPoint
Definition: xlogrecovery.c:278
static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *readBuf)
static XLogRecPtr backupEndPoint
Definition: xlogrecovery.c:282
const struct config_enum_entry recovery_target_action_options[]
Definition: xlogrecovery.c:74
static void validateRecoveryParameters(void)
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI, TimeLineID replayTLI)
static XLogRecord * ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr, TimeLineID replayTLI)
void StartupRequestWalReceiverRestart(void)
bool InArchiveRecovery
Definition: xlogrecovery.c:138
static bool recoveryStopsAfter(XLogReaderState *record)
void RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
char * PrimarySlotName
Definition: xlogrecovery.c:97
static TimeLineID curFileTLI
Definition: xlogrecovery.c:124
static char recoveryStopName[MAXFNAMELEN]
Definition: xlogrecovery.c:382
static void CheckRecoveryConsistency(void)
static bool pendingWalRcvRestart
Definition: xlogrecovery.c:248
void PerformWalRecovery(void)
static XLogSource XLogReceiptSource
Definition: xlogrecovery.c:259
bool CheckPromoteSignal(void)
struct XLogPageReadPrivate XLogPageReadPrivate
static bool recoveryStopAfter
Definition: xlogrecovery.c:383
static const char *const xlogSourceNames[]
Definition: xlogrecovery.c:218
static TimeLineID RedoStartTLI
Definition: xlogrecovery.c:170
char * recoveryRestoreCommand
Definition: xlogrecovery.c:82
static void verifyBackupPageConsistency(XLogReaderState *record)
void assign_recovery_target(const char *newval, void *extra)
void SetRecoveryPause(bool recoveryPause)
EndOfWalRecoveryInfo * FinishWalRecovery(void)
static bool lastSourceFailed
Definition: xlogrecovery.c:247
char * archiveCleanupCommand
Definition: xlogrecovery.c:84
XLogRecPtr GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
static TimeLineID receiveTLI
Definition: xlogrecovery.c:263
void WakeupRecovery(void)
void xlog_outdesc(StringInfo buf, XLogReaderState *record)
static bool LocalPromoteIsTriggered
Definition: xlogrecovery.c:182
bool PromoteIsTriggered(void)
TimestampTz GetCurrentChunkReplayStartTime(void)
static void ConfirmRecoveryPaused(void)
static void readRecoverySignalFile(void)
static XLogRecPtr missingContrecPtr
Definition: xlogrecovery.c:373
static XLogRecoveryCtlData * XLogRecoveryCtl
Definition: xlogrecovery.c:364
static uint32 readOff
Definition: xlogrecovery.c:232
static bool standby_signal_file_found
Definition: xlogrecovery.c:150
char * recovery_target_time_string
Definition: xlogrecovery.c:89
bool StandbyMode
Definition: xlogrecovery.c:147
static int readFile
Definition: xlogrecovery.c:230
static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, bool fetching_ckpt, XLogRecPtr tliRecPtr, TimeLineID replayTLI, XLogRecPtr replayLSN, bool nonblocking)
XLogRecPtr recoveryTargetLSN
Definition: xlogrecovery.c:92
RecoveryTargetType recoveryTarget
Definition: xlogrecovery.c:85
static bool read_tablespace_map(List **tablespaces)
static bool doRequestWalReceiverReply
Definition: xlogrecovery.c:185
static bool read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, bool *backupEndRequired, bool *backupFromStandby)
static XLogSource currentSource
Definition: xlogrecovery.c:246
XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI)
void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
static List * expectedTLEs
Definition: xlogrecovery.c:123
static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source)
static XLogSegNo readSegNo
Definition: xlogrecovery.c:231
void assign_recovery_target_name(const char *newval, void *extra)
static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, XLogSource source, bool notfoundOk)
static XLogRecPtr abortedRecPtr
Definition: xlogrecovery.c:372
static char * primary_image_masked
Definition: xlogrecovery.c:298
static TimeLineID minRecoveryPointTLI
Definition: xlogrecovery.c:279
static XLogRecord * ReadRecord(XLogPrefetcher *xlogprefetcher, int emode, bool fetching_ckpt, TimeLineID replayTLI)
void assign_recovery_target_time(const char *newval, void *extra)
static void SetCurrentChunkStartTime(TimestampTz xtime)
static XLogRecPtr CheckPointLoc
Definition: xlogrecovery.c:167
bool check_recovery_target_xid(char **newval, void **extra, GucSource source)
static bool LocalHotStandbyActive
Definition: xlogrecovery.c:176
struct XLogRecoveryCtlData XLogRecoveryCtlData
static bool HotStandbyActiveInReplay(void)
static bool InRedo
Definition: xlogrecovery.c:203
static TransactionId recoveryStopXid
Definition: xlogrecovery.c:379
bool check_recovery_target_time(char **newval, void **extra, GucSource source)
static XLogSource readSource
Definition: xlogrecovery.c:234
static void SetPromoteIsTriggered(void)
#define RECOVERY_COMMAND_FILE
Definition: xlogrecovery.c:68
TransactionId recoveryTargetXid
Definition: xlogrecovery.c:88
XLogSource
Definition: xlogrecovery.c:210
@ XLOG_FROM_PG_WAL
Definition: xlogrecovery.c:213
@ XLOG_FROM_STREAM
Definition: xlogrecovery.c:214
@ XLOG_FROM_ARCHIVE
Definition: xlogrecovery.c:212
@ XLOG_FROM_ANY
Definition: xlogrecovery.c:211
TimeLineID recoveryTargetTLIRequested
Definition: xlogrecovery.c:121
void InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
Definition: xlogrecovery.c:512
static void xlog_block_info(StringInfo buf, XLogReaderState *record)
static TimestampTz XLogReceiptTime
Definition: xlogrecovery.c:258
static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
Size XLogRecoveryShmemSize(void)
Definition: xlogrecovery.c:447
static char * replay_image_masked
Definition: xlogrecovery.c:297
bool wal_receiver_create_temp_slot
Definition: xlogrecovery.c:98
static void CheckTablespaceDirectory(void)
char * recoveryEndCommand
Definition: xlogrecovery.c:83
RecoveryPauseState GetRecoveryPauseState(void)
TimeLineID recoveryTargetTLI
Definition: xlogrecovery.c:122
static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
void assign_recovery_target_lsn(const char *newval, void *extra)
bool check_recovery_target_lsn(char **newval, void **extra, GucSource source)
static XLogRecPtr RedoStartLSN
Definition: xlogrecovery.c:169
static XLogRecPtr flushedUpto
Definition: xlogrecovery.c:262
void XLogRecoveryShmemInit(void)
Definition: xlogrecovery.c:458
static void recoveryPausesHere(bool endOfRecovery)
static uint32 readLen
Definition: xlogrecovery.c:233
static void EnableStandbyMode(void)
Definition: xlogrecovery.c:478
#define RECOVERY_COMMAND_DONE
Definition: xlogrecovery.c:69
static bool recovery_signal_file_found
Definition: xlogrecovery.c:151
TimestampTz recoveryTargetTime
Definition: xlogrecovery.c:90
TimestampTz GetLatestXTime(void)
char * PrimaryConnInfo
Definition: xlogrecovery.c:96
void XLogRequestWalReceiverReply(void)
static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
static XLogPrefetcher * xlogprefetcher
Definition: xlogrecovery.c:191
static bool StandbyModeRequested
Definition: xlogrecovery.c:146
bool check_recovery_target_name(char **newval, void **extra, GucSource source)
bool recoveryTargetInclusive
Definition: xlogrecovery.c:86
static XLogReaderState * xlogreader
Definition: xlogrecovery.c:188
void RemovePromoteSignalFiles(void)
void assign_recovery_target_timeline(const char *newval, void *extra)
static XLogRecPtr backupStartPoint
Definition: xlogrecovery.c:281
static void SetLatestXTime(TimestampTz xtime)
static TimeLineID CheckPointTLI
Definition: xlogrecovery.c:168
RecoveryTargetType
Definition: xlogrecovery.h:24
@ RECOVERY_TARGET_IMMEDIATE
Definition: xlogrecovery.h:30
@ RECOVERY_TARGET_TIME
Definition: xlogrecovery.h:27
@ RECOVERY_TARGET_UNSET
Definition: xlogrecovery.h:25
@ RECOVERY_TARGET_XID
Definition: xlogrecovery.h:26
@ RECOVERY_TARGET_LSN
Definition: xlogrecovery.h:29
@ RECOVERY_TARGET_NAME
Definition: xlogrecovery.h:28
RecoveryTargetTimeLineGoal
Definition: xlogrecovery.h:37
@ RECOVERY_TARGET_TIMELINE_NUMERIC
Definition: xlogrecovery.h:40
@ RECOVERY_TARGET_TIMELINE_CONTROLFILE
Definition: xlogrecovery.h:38
@ RECOVERY_TARGET_TIMELINE_LATEST
Definition: xlogrecovery.h:39
RecoveryPauseState
Definition: xlogrecovery.h:45
@ RECOVERY_PAUSED
Definition: xlogrecovery.h:48
@ RECOVERY_NOT_PAUSED
Definition: xlogrecovery.h:46
@ RECOVERY_PAUSE_REQUESTED
Definition: xlogrecovery.h:47
void wal_segment_close(XLogReaderState *state)
Definition: xlogutils.c:842
Buffer XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum, BlockNumber blkno, ReadBufferMode mode, Buffer recent_buffer)
Definition: xlogutils.c:471
HotStandbyState standbyState
Definition: xlogutils.c:53
bool InRecovery
Definition: xlogutils.c:50
void XLogCheckInvalidPages(void)
Definition: xlogutils.c:245
@ STANDBY_SNAPSHOT_READY
Definition: xlogutils.h:52
@ STANDBY_INITIALIZED
Definition: xlogutils.h:50