PostgreSQL Source Code  git master
xlogrecovery.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * xlogrecovery.c
4  * Functions for WAL recovery, standby mode
5  *
6  * This source file contains functions controlling WAL recovery.
7  * InitWalRecovery() initializes the system for crash or archive recovery,
8  * or standby mode, depending on configuration options and the state of
9  * the control file and possible backup label file. PerformWalRecovery()
10  * performs the actual WAL replay, calling the rmgr-specific redo routines.
11  * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12  * and prepares information needed to initialize the WAL for writes. In
13  * addition to these three main functions, there are a bunch of functions
14  * for interrogating recovery state and controlling the recovery process.
15  *
16  *
17  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
18  * Portions Copyright (c) 1994, Regents of the University of California
19  *
20  * src/backend/access/transam/xlogrecovery.c
21  *
22  *-------------------------------------------------------------------------
23  */
24 
25 #include "postgres.h"
26 
27 #include <ctype.h>
28 #include <math.h>
29 #include <time.h>
30 #include <sys/stat.h>
31 #include <sys/time.h>
32 #include <unistd.h>
33 
34 #include "access/timeline.h"
35 #include "access/transam.h"
36 #include "access/xact.h"
37 #include "access/xlog_internal.h"
38 #include "access/xlogarchive.h"
39 #include "access/xlogprefetcher.h"
40 #include "access/xlogreader.h"
41 #include "access/xlogrecovery.h"
42 #include "access/xlogutils.h"
43 #include "backup/basebackup.h"
44 #include "catalog/pg_control.h"
45 #include "commands/tablespace.h"
46 #include "commands/waitlsn.h"
47 #include "common/file_utils.h"
48 #include "miscadmin.h"
49 #include "pgstat.h"
50 #include "postmaster/bgwriter.h"
51 #include "postmaster/startup.h"
52 #include "replication/slot.h"
53 #include "replication/slotsync.h"
55 #include "storage/fd.h"
56 #include "storage/ipc.h"
57 #include "storage/latch.h"
58 #include "storage/pmsignal.h"
59 #include "storage/procarray.h"
60 #include "storage/spin.h"
61 #include "utils/datetime.h"
62 #include "utils/fmgrprotos.h"
63 #include "utils/guc_hooks.h"
64 #include "utils/pg_lsn.h"
65 #include "utils/ps_status.h"
66 #include "utils/pg_rusage.h"
67 
68 /* Unsupported old recovery command file names (relative to $PGDATA) */
69 #define RECOVERY_COMMAND_FILE "recovery.conf"
70 #define RECOVERY_COMMAND_DONE "recovery.done"
71 
72 /*
73  * GUC support
74  */
76  {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
77  {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
78  {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
79  {NULL, 0, false}
80 };
81 
82 /* options formerly taken from recovery.conf for archive recovery */
84 char *recoveryEndCommand = NULL;
85 char *archiveCleanupCommand = NULL;
92 const char *recoveryTargetName;
95 
96 /* options formerly taken from recovery.conf for XLOG streaming */
97 char *PrimaryConnInfo = NULL;
98 char *PrimarySlotName = NULL;
100 
101 /*
102  * recoveryTargetTimeLineGoal: what the user requested, if any
103  *
104  * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
105  *
106  * recoveryTargetTLI: the currently understood target timeline; it may change during recovery
107  *
108  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
109  * the timelines of its known parents, newest first (so recoveryTargetTLI is
110  * always the first list member). Only these TLIs are expected to be seen in
111  * the WAL segments we read, and indeed only these TLIs will be considered as
112  * candidate WAL files to open at all.
113  *
114  * curFileTLI: the TLI appearing in the name of the current input WAL file.
115  * (This is not necessarily the same as the timeline from which we are
116  * replaying WAL, which StartupXLOG calls replayTLI, because we could be
117  * scanning data that was copied from an ancestor timeline when the current
118  * file was created.) During a sequential scan we do not allow this value
119  * to decrease.
120  */
126 
127 /*
128  * When ArchiveRecoveryRequested is set, archive recovery was requested,
129  * i.e. signal files were present. When InArchiveRecovery is set, we are
130  * currently recovering using offline XLOG archives. These variables are only
131  * valid in the startup process.
132  *
133  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
134  * currently performing crash recovery using only XLOG files in pg_wal, but
135  * will switch to using offline XLOG archives as soon as we reach the end of
136  * WAL in pg_wal.
137  */
139 bool InArchiveRecovery = false;
140 
141 /*
142  * When StandbyModeRequested is set, standby mode was requested, i.e.
143  * standby.signal file was present. When StandbyMode is set, we are currently
144  * in standby mode. These variables are only valid in the startup process.
145  * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
146  */
147 static bool StandbyModeRequested = false;
148 bool StandbyMode = false;
149 
150 /* was a signal file present at startup? */
151 static bool standby_signal_file_found = false;
152 static bool recovery_signal_file_found = false;
153 
154 /*
155  * CheckPointLoc is the position of the checkpoint record that determines
156  * where to start the replay. It comes from the backup label file or the
157  * control file.
158  *
159  * RedoStartLSN is the checkpoint's REDO location, also from the backup label
160  * file or the control file. In standby mode, XLOG streaming usually starts
161  * from the position where an invalid record was found. But if we fail to
162  * read even the initial checkpoint record, we use the REDO location instead
163  * of the checkpoint location as the start position of XLOG streaming.
164  * Otherwise we would have to jump backwards to the REDO location after
165  * reading the checkpoint record, because the REDO record can precede the
166  * checkpoint record.
167  */
172 
173 /*
174  * Local copy of SharedHotStandbyActive variable. False actually means "not
175  * known, need to check the shared state".
176  */
177 static bool LocalHotStandbyActive = false;
178 
179 /*
180  * Local copy of SharedPromoteIsTriggered variable. False actually means "not
181  * known, need to check the shared state".
182  */
183 static bool LocalPromoteIsTriggered = false;
184 
185 /* Has the recovery code requested a walreceiver wakeup? */
187 
188 /* XLogReader object used to parse the WAL records */
190 
191 /* XLogPrefetcher object used to consume WAL records with read-ahead */
193 
194 /* Parameters passed down from ReadRecord to the XLogPageRead callback. */
195 typedef struct XLogPageReadPrivate
196 {
197  int emode;
198  bool fetching_ckpt; /* are we fetching a checkpoint record? */
202 
203 /* flag to tell XLogPageRead that we have started replaying */
204 static bool InRedo = false;
205 
206 /*
207  * Codes indicating where we got a WAL file from during recovery, or where
208  * to attempt to get one.
209  */
210 typedef enum
211 {
212  XLOG_FROM_ANY = 0, /* request to read WAL from any source */
213  XLOG_FROM_ARCHIVE, /* restored using restore_command */
214  XLOG_FROM_PG_WAL, /* existing file in pg_wal */
215  XLOG_FROM_STREAM, /* streamed from primary */
216 } XLogSource;
217 
218 /* human-readable names for XLogSources, for debugging output */
219 static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
220 
221 /*
222  * readFile is -1 or a kernel FD for the log file segment that's currently
223  * open for reading. readSegNo identifies the segment. readOff is the offset
224  * of the page just read, readLen indicates how much of it has been read into
225  * readBuf, and readSource indicates where we got the currently open file from.
226  *
227  * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
228  * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
229  * worthwhile, since the XLOG is not read by general-purpose sessions.
230  */
231 static int readFile = -1;
232 static XLogSegNo readSegNo = 0;
233 static uint32 readOff = 0;
234 static uint32 readLen = 0;
236 
237 /*
238  * Keeps track of which source we're currently reading from. This is
239  * different from readSource in that this is always set, even when we don't
240  * currently have a WAL file open. If lastSourceFailed is set, our last
241  * attempt to read from currentSource failed, and we should try another source
242  * next.
243  *
244  * pendingWalRcvRestart is set when a config change occurs that requires a
245  * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
246  */
248 static bool lastSourceFailed = false;
249 static bool pendingWalRcvRestart = false;
250 
251 /*
252  * These variables track when we last obtained some WAL data to process,
253  * and where we got it from. (XLogReceiptSource is initially the same as
254  * readSource, but readSource gets reset to zero when we don't have data
255  * to process right now. It is also different from currentSource, which
256  * also changes when we try to read from a source and fail, while
257  * XLogReceiptSource tracks where we last successfully read some WAL.)
258  */
261 
262 /* Local copy of WalRcv->flushedUpto */
265 
266 /*
267  * Copy of minRecoveryPoint and backupEndPoint from the control file.
268  *
269  * In order to reach consistency, we must replay the WAL up to
270  * minRecoveryPoint. If backupEndRequired is true, we must also reach
271  * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
272  * to backupStartPoint.
273  *
274  * Note: In archive recovery, after consistency has been reached, the
275  * functions in xlog.c will start updating minRecoveryPoint in the control
276  * file. But this copy of minRecoveryPoint variable reflects the value at the
277  * beginning of recovery, and is *not* updated after consistency is reached.
278  */
281 
284 static bool backupEndRequired = false;
285 
286 /*
287  * Have we reached a consistent database state? In crash recovery, we have
288  * to replay all the WAL, so reachedConsistency is never set. During archive
289  * recovery, the database is consistent once minRecoveryPoint is reached.
290  *
291  * Consistent state means that the system is internally consistent, all
292  * the WAL has been replayed up to a certain point, and importantly, there
293  * is no trace of later actions on disk.
294  */
295 bool reachedConsistency = false;
296 
297 /* Buffers dedicated to consistency checks of size BLCKSZ */
298 static char *replay_image_masked = NULL;
299 static char *primary_image_masked = NULL;
300 
301 
302 /*
303  * Shared-memory state for WAL recovery.
304  */
305 typedef struct XLogRecoveryCtlData
306 {
307  /*
308  * SharedHotStandbyActive indicates if we allow hot standby queries to be
309  * run. Protected by info_lck.
310  */
312 
313  /*
314  * SharedPromoteIsTriggered indicates if a standby promotion has been
315  * triggered. Protected by info_lck.
316  */
318 
319  /*
320  * recoveryWakeupLatch is used to wake up the startup process to continue
321  * WAL replay, if it is waiting for WAL to arrive or promotion to be
322  * requested.
323  *
324  * Note that the startup process also uses another latch, its procLatch,
325  * to wait for recovery conflict. We could get rid of recoveryWakeupLatch
326  * for signaling the startup process in favor of using its procLatch,
327  * which comports better with possible generic signal handlers using that
328  * latch. But we should not do that because the startup process doesn't
329  * assume that it's woken up by walreceiver process or SIGHUP signal
330  * handler while it's waiting for recovery conflict. The separate latches,
331  * recoveryWakeupLatch and procLatch, should be used for inter-process
332  * communication for WAL replay and recovery conflict, respectively.
333  */
335 
336  /*
337  * Last record successfully replayed.
338  */
339  XLogRecPtr lastReplayedReadRecPtr; /* start position */
340  XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */
341  TimeLineID lastReplayedTLI; /* timeline */
342 
343  /*
344  * When we're currently replaying a record, i.e. in a redo function,
345  * replayEndRecPtr points to the end+1 of the record being replayed,
346  * otherwise it's equal to lastReplayedEndRecPtr.
347  */
350  /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
352 
353  /*
354  * timestamp of when we started replaying the current chunk of WAL data,
355  * only relevant for replication or archive recovery
356  */
358  /* Recovery pause state */
361 
362  slock_t info_lck; /* locks shared variables shown above */
364 
366 
367 /*
368  * abortedRecPtr is the start pointer of a broken record at end of WAL when
369  * recovery completes; missingContrecPtr is the location of the first
370  * contrecord that went missing. See CreateOverwriteContrecordRecord for
371  * details.
372  */
375 
376 /*
377  * if recoveryStopsBefore/After returns true, it saves information of the stop
378  * point here
379  */
384 static bool recoveryStopAfter;
385 
386 /* prototypes for local functions */
387 static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
388 
389 static void EnableStandbyMode(void);
390 static void readRecoverySignalFile(void);
391 static void validateRecoveryParameters(void);
392 static bool read_backup_label(XLogRecPtr *checkPointLoc,
393  TimeLineID *backupLabelTLI,
394  bool *backupEndRequired, bool *backupFromStandby);
395 static bool read_tablespace_map(List **tablespaces);
396 
397 static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
398 static void CheckRecoveryConsistency(void);
399 static void rm_redo_error_callback(void *arg);
400 #ifdef WAL_DEBUG
401 static void xlog_outrec(StringInfo buf, XLogReaderState *record);
402 #endif
403 static void xlog_block_info(StringInfo buf, XLogReaderState *record);
404 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
405  TimeLineID prevTLI, TimeLineID replayTLI);
406 static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
407 static void verifyBackupPageConsistency(XLogReaderState *record);
408 
409 static bool recoveryStopsBefore(XLogReaderState *record);
410 static bool recoveryStopsAfter(XLogReaderState *record);
411 static char *getRecoveryStopReason(void);
412 static void recoveryPausesHere(bool endOfRecovery);
413 static bool recoveryApplyDelay(XLogReaderState *record);
414 static void ConfirmRecoveryPaused(void);
415 
417  int emode, bool fetching_ckpt,
418  TimeLineID replayTLI);
419 
420 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
421  int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
423  bool randAccess,
424  bool fetching_ckpt,
425  XLogRecPtr tliRecPtr,
426  TimeLineID replayTLI,
427  XLogRecPtr replayLSN,
428  bool nonblocking);
429 static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
431  XLogRecPtr RecPtr, TimeLineID replayTLI);
432 static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
433 static int XLogFileRead(XLogSegNo segno, TimeLineID tli,
434  XLogSource source, bool notfoundOk);
435 static int XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source);
436 
437 static bool CheckForStandbyTrigger(void);
438 static void SetPromoteIsTriggered(void);
439 static bool HotStandbyActiveInReplay(void);
440 
441 static void SetCurrentChunkStartTime(TimestampTz xtime);
442 static void SetLatestXTime(TimestampTz xtime);
443 
444 /*
445  * Initialization of shared memory for WAL recovery
446  */
447 Size
449 {
450  Size size;
451 
452  /* XLogRecoveryCtl */
453  size = sizeof(XLogRecoveryCtlData);
454 
455  return size;
456 }
457 
458 void
460 {
461  bool found;
462 
464  ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
465  if (found)
466  return;
467  memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
468 
472 }
473 
474 /*
475  * A thin wrapper to enable StandbyMode and do other preparatory work as
476  * needed.
477  */
478 static void
480 {
481  StandbyMode = true;
482 
483  /*
484  * To avoid server log bloat, we don't report recovery progress in a
485  * standby as it will always be in recovery unless promoted. We disable
486  * startup progress timeout in standby mode to avoid calling
487  * startup_progress_timeout_handler() unnecessarily.
488  */
490 }
491 
492 /*
493  * Prepare the system for WAL recovery, if needed.
494  *
495  * This is called by StartupXLOG() which coordinates the server startup
496  * sequence. This function analyzes the control file and the backup label
497  * file, if any, and figures out whether we need to perform crash recovery or
498  * archive recovery, and how far we need to replay the WAL to reach a
499  * consistent state.
500  *
501  * This doesn't yet change the on-disk state, except for creating the symlinks
502  * from table space map file if any, and for fetching WAL files needed to find
503  * the checkpoint record. On entry, the caller has already read the control
504  * file into memory, and passes it as argument. This function updates it to
505  * reflect the recovery state, and the caller is expected to write it back to
506  * disk after initializing other subsystems, but before calling
507  * PerformWalRecovery().
508  *
509  * This initializes some global variables like ArchiveRecoveryRequested,
510  * StandbyModeRequested and InRecovery.
511  */
512 void
514  bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
515 {
516  XLogPageReadPrivate *private;
517  struct stat st;
518  bool wasShutdown;
519  XLogRecord *record;
520  DBState dbstate_at_startup;
521  bool haveTblspcMap = false;
522  bool haveBackupLabel = false;
523  CheckPoint checkPoint;
524  bool backupFromStandby = false;
525 
526  dbstate_at_startup = ControlFile->state;
527 
528  /*
529  * Initialize on the assumption we want to recover to the latest timeline
530  * that's active according to pg_control.
531  */
535  else
537 
538  /*
539  * Check for signal files, and if so set up state for offline recovery
540  */
543 
544  /*
545  * Take ownership of the wakeup latch if we're going to sleep during
546  * recovery.
547  */
550 
551  /*
552  * Set the WAL reading processor now, as it will be needed when reading
553  * the checkpoint record required (backup_label or not).
554  */
555  private = palloc0(sizeof(XLogPageReadPrivate));
556  xlogreader =
558  XL_ROUTINE(.page_read = &XLogPageRead,
559  .segment_open = NULL,
560  .segment_close = wal_segment_close),
561  private);
562  if (!xlogreader)
563  ereport(ERROR,
564  (errcode(ERRCODE_OUT_OF_MEMORY),
565  errmsg("out of memory"),
566  errdetail("Failed while allocating a WAL reading processor.")));
568 
569  /*
570  * Set the WAL decode buffer size. This limits how far ahead we can read
571  * in the WAL.
572  */
574 
575  /* Create a WAL prefetcher. */
577 
578  /*
579  * Allocate two page buffers dedicated to WAL consistency checks. We do
580  * it this way, rather than just making static arrays, for two reasons:
581  * (1) no need to waste the storage in most instantiations of the backend;
582  * (2) a static char array isn't guaranteed to have any particular
583  * alignment, whereas palloc() will provide MAXALIGN'd storage.
584  */
585  replay_image_masked = (char *) palloc(BLCKSZ);
586  primary_image_masked = (char *) palloc(BLCKSZ);
587 
588  /*
589  * Read the backup_label file. We want to run this part of the recovery
590  * process after checking for signal files and after performing validation
591  * of the recovery parameters.
592  */
594  &backupFromStandby))
595  {
596  List *tablespaces = NIL;
597 
598  /*
599  * Archive recovery was requested, and thanks to the backup label
600  * file, we know how far we need to replay to reach consistency. Enter
601  * archive recovery directly.
602  */
603  InArchiveRecovery = true;
606 
607  /*
608  * Omitting backup_label when creating a new replica, PITR node etc.
609  * unfortunately is a common cause of corruption. Logging that
610  * backup_label was used makes it a bit easier to exclude that as the
611  * cause of observed corruption.
612  *
613  * Do so before we try to read the checkpoint record (which can fail),
614  * as otherwise it can be hard to understand why a checkpoint other
615  * than ControlFile->checkPoint is used.
616  */
617  ereport(LOG,
618  (errmsg("starting backup recovery with redo LSN %X/%X, checkpoint LSN %X/%X, on timeline ID %u",
621  CheckPointTLI)));
622 
623  /*
624  * When a backup_label file is present, we want to roll forward from
625  * the checkpoint it identifies, rather than using pg_control.
626  */
628  CheckPointTLI);
629  if (record != NULL)
630  {
631  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
632  wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
633  ereport(DEBUG1,
634  (errmsg_internal("checkpoint record is at %X/%X",
636  InRecovery = true; /* force recovery even if SHUTDOWNED */
637 
638  /*
639  * Make sure that REDO location exists. This may not be the case
640  * if there was a crash during an online backup, which left a
641  * backup_label around that references a WAL segment that's
642  * already been archived.
643  */
644  if (checkPoint.redo < CheckPointLoc)
645  {
647  if (!ReadRecord(xlogprefetcher, LOG, false,
648  checkPoint.ThisTimeLineID))
649  ereport(FATAL,
650  (errmsg("could not find redo location %X/%X referenced by checkpoint record at %X/%X",
652  errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
653  "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
654  "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
656  }
657  }
658  else
659  {
660  ereport(FATAL,
661  (errmsg("could not locate required checkpoint record at %X/%X",
663  errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
664  "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
665  "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
667  wasShutdown = false; /* keep compiler quiet */
668  }
669 
670  /* Read the tablespace_map file if present and create symlinks. */
671  if (read_tablespace_map(&tablespaces))
672  {
673  ListCell *lc;
674 
675  foreach(lc, tablespaces)
676  {
677  tablespaceinfo *ti = lfirst(lc);
678  char *linkloc;
679 
680  linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
681 
682  /*
683  * Remove the existing symlink, if any, and create the symlink
684  * under PGDATA.
685  */
686  remove_tablespace_symlink(linkloc);
687 
688  if (symlink(ti->path, linkloc) < 0)
689  ereport(ERROR,
691  errmsg("could not create symbolic link \"%s\": %m",
692  linkloc)));
693 
694  pfree(ti->path);
695  pfree(ti);
696  }
697 
698  /* tell the caller to delete it later */
699  haveTblspcMap = true;
700  }
701 
702  /* tell the caller to delete it later */
703  haveBackupLabel = true;
704  }
705  else
706  {
707  /* No backup_label file has been found if we are here. */
708 
709  /*
710  * If tablespace_map file is present without backup_label file, there
711  * is no use for such a file. There is no harm in retaining it, but it
712  * is better to get rid of the map file so that we don't have any
713  * redundant file in data directory and it will avoid any sort of
714  * confusion. It seems prudent though to just rename the file out of
715  * the way rather than delete it completely, also we ignore any error
716  * that occurs in rename operation as even if map file is present
717  * without backup_label file, it is harmless.
718  */
719  if (stat(TABLESPACE_MAP, &st) == 0)
720  {
721  unlink(TABLESPACE_MAP_OLD);
723  ereport(LOG,
724  (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
726  errdetail("File \"%s\" was renamed to \"%s\".",
728  else
729  ereport(LOG,
730  (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
732  errdetail("Could not rename file \"%s\" to \"%s\": %m.",
734  }
735 
736  /*
737  * It's possible that archive recovery was requested, but we don't
738  * know how far we need to replay the WAL before we reach consistency.
739  * This can happen for example if a base backup is taken from a
740  * running server using an atomic filesystem snapshot, without calling
741  * pg_backup_start/stop. Or if you just kill a running primary server
742  * and put it into archive recovery by creating a recovery signal
743  * file.
744  *
745  * Our strategy in that case is to perform crash recovery first,
746  * replaying all the WAL present in pg_wal, and only enter archive
747  * recovery after that.
748  *
749  * But usually we already know how far we need to replay the WAL (up
750  * to minRecoveryPoint, up to backupEndPoint, or until we see an
751  * end-of-backup record), and we can enter archive recovery directly.
752  */
758  {
759  InArchiveRecovery = true;
762  }
763 
764  /*
765  * For the same reason as when starting up with backup_label present,
766  * emit a log message when we continue initializing from a base
767  * backup.
768  */
770  ereport(LOG,
771  (errmsg("restarting backup recovery with redo LSN %X/%X",
773 
774  /* Get the last valid checkpoint record. */
780  CheckPointTLI);
781  if (record != NULL)
782  {
783  ereport(DEBUG1,
784  (errmsg_internal("checkpoint record is at %X/%X",
786  }
787  else
788  {
789  /*
790  * We used to attempt to go back to a secondary checkpoint record
791  * here, but only when not in standby mode. We now just fail if we
792  * can't read the last checkpoint because this allows us to
793  * simplify processing around checkpoints.
794  */
795  ereport(PANIC,
796  (errmsg("could not locate a valid checkpoint record at %X/%X",
798  }
799  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
800  wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
801  }
802 
804  {
806  ereport(LOG,
807  (errmsg("entering standby mode")));
809  ereport(LOG,
810  (errmsg("starting point-in-time recovery to XID %u",
813  ereport(LOG,
814  (errmsg("starting point-in-time recovery to %s",
817  ereport(LOG,
818  (errmsg("starting point-in-time recovery to \"%s\"",
821  ereport(LOG,
822  (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
825  ereport(LOG,
826  (errmsg("starting point-in-time recovery to earliest consistent point")));
827  else
828  ereport(LOG,
829  (errmsg("starting archive recovery")));
830  }
831 
832  /*
833  * If the location of the checkpoint record is not on the expected
834  * timeline in the history of the requested timeline, we cannot proceed:
835  * the backup is not part of the history of the requested timeline.
836  */
837  Assert(expectedTLEs); /* was initialized by reading checkpoint
838  * record */
841  {
842  XLogRecPtr switchpoint;
843 
844  /*
845  * tliSwitchPoint will throw an error if the checkpoint's timeline is
846  * not in expectedTLEs at all.
847  */
849  ereport(FATAL,
850  (errmsg("requested timeline %u is not a child of this server's history",
852  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
855  LSN_FORMAT_ARGS(switchpoint))));
856  }
857 
858  /*
859  * The min recovery point should be part of the requested timeline's
860  * history, too.
861  */
865  ereport(FATAL,
866  (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
870 
871  ereport(DEBUG1,
872  (errmsg_internal("redo record is at %X/%X; shutdown %s",
873  LSN_FORMAT_ARGS(checkPoint.redo),
874  wasShutdown ? "true" : "false")));
875  ereport(DEBUG1,
876  (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
877  U64FromFullTransactionId(checkPoint.nextXid),
878  checkPoint.nextOid)));
879  ereport(DEBUG1,
880  (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
881  checkPoint.nextMulti, checkPoint.nextMultiOffset)));
882  ereport(DEBUG1,
883  (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
884  checkPoint.oldestXid, checkPoint.oldestXidDB)));
885  ereport(DEBUG1,
886  (errmsg_internal("oldest MultiXactId: %u, in database %u",
887  checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
888  ereport(DEBUG1,
889  (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
890  checkPoint.oldestCommitTsXid,
891  checkPoint.newestCommitTsXid)));
893  ereport(PANIC,
894  (errmsg("invalid next transaction ID")));
895 
896  /* sanity check */
897  if (checkPoint.redo > CheckPointLoc)
898  ereport(PANIC,
899  (errmsg("invalid redo in checkpoint record")));
900 
901  /*
902  * Check whether we need to force recovery from WAL. If it appears to
903  * have been a clean shutdown and we did not have a recovery signal file,
904  * then assume no recovery needed.
905  */
906  if (checkPoint.redo < CheckPointLoc)
907  {
908  if (wasShutdown)
909  ereport(PANIC,
910  (errmsg("invalid redo record in shutdown checkpoint")));
911  InRecovery = true;
912  }
913  else if (ControlFile->state != DB_SHUTDOWNED)
914  InRecovery = true;
915  else if (ArchiveRecoveryRequested)
916  {
917  /* force recovery due to presence of recovery signal file */
918  InRecovery = true;
919  }
920 
921  /*
922  * If recovery is needed, update our in-memory copy of pg_control to show
923  * that we are recovering and to show the selected checkpoint as the place
924  * we are starting from. We also mark pg_control with any minimum recovery
925  * stop point obtained from a backup history file.
926  *
927  * We don't write the changes to disk yet, though. Only do that after
928  * initializing various subsystems.
929  */
930  if (InRecovery)
931  {
932  if (InArchiveRecovery)
933  {
935  }
936  else
937  {
938  ereport(LOG,
939  (errmsg("database system was not properly shut down; "
940  "automatic recovery in progress")));
942  ereport(LOG,
943  (errmsg("crash recovery starts in timeline %u "
944  "and has target timeline %u",
948  }
950  ControlFile->checkPointCopy = checkPoint;
951  if (InArchiveRecovery)
952  {
953  /* initialize minRecoveryPoint if not set yet */
954  if (ControlFile->minRecoveryPoint < checkPoint.redo)
955  {
956  ControlFile->minRecoveryPoint = checkPoint.redo;
958  }
959  }
960 
961  /*
962  * Set backupStartPoint if we're starting recovery from a base backup.
963  *
964  * Also set backupEndPoint and use minRecoveryPoint as the backup end
965  * location if we're starting recovery from a base backup which was
966  * taken from a standby. In this case, the database system status in
967  * pg_control must indicate that the database was already in recovery.
968  * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
969  * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
970  * before reaching this point; e.g. because restore_command or
971  * primary_conninfo were faulty.
972  *
973  * Any other state indicates that the backup somehow became corrupted
974  * and we can't sensibly continue with recovery.
975  */
976  if (haveBackupLabel)
977  {
978  ControlFile->backupStartPoint = checkPoint.redo;
980 
981  if (backupFromStandby)
982  {
983  if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
984  dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
985  ereport(FATAL,
986  (errmsg("backup_label contains data inconsistent with control file"),
987  errhint("This means that the backup is corrupted and you will "
988  "have to use another backup for recovery.")));
990  }
991  }
992  }
993 
994  /* remember these, so that we know when we have reached consistency */
998  if (InArchiveRecovery)
999  {
1002  }
1003  else
1004  {
1006  minRecoveryPointTLI = 0;
1007  }
1008 
1009  /*
1010  * Start recovery assuming that the final record isn't lost.
1011  */
1014 
1015  *wasShutdown_ptr = wasShutdown;
1016  *haveBackupLabel_ptr = haveBackupLabel;
1017  *haveTblspcMap_ptr = haveTblspcMap;
1018 }
1019 
1020 /*
1021  * See if there are any recovery signal files and if so, set state for
1022  * recovery.
1023  *
1024  * See if there is a recovery command file (recovery.conf), and if so
1025  * report a FATAL error since as of PG12 we no longer recognize that.
1026  */
1027 static void
1029 {
 /*
  * NOTE(review): the embedded line numbers in this listing skip 1028, 1032,
  * 1040, 1042, 1061, 1068, 1074, 1081, 1086 and 1103. The function's
  * name/parameter line, the condition guarding the early return below and
  * several other statements are therefore not visible here; confirm against
  * the upstream file before relying on this text.
  */
1030  struct stat stat_buf;
1031 
1033  return;
1034 
1035  /*
1036  * Check for old recovery API file: recovery.conf
1037  */
1038  if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
1039  ereport(FATAL,
1041  errmsg("using recovery command file \"%s\" is not supported",
1043 
1044  /*
1045  * Remove unused .done file, if present. Ignore if absent.
1046  */
1047  unlink(RECOVERY_COMMAND_DONE);
1048 
1049  /*
1050  * Check for recovery signal files and if found, fsync them since they
1051  * represent server state information. We don't sweat too much about the
1052  * possibility of fsync failure, however.
1053  *
1054  * If present, standby signal file takes precedence. If neither is present
1055  * then we won't enter archive recovery.
1056  */
1057  if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1058  {
1059  int fd;
1060 
1062  S_IRUSR | S_IWUSR);
 /* A negative fd is tolerated: the fsync is simply skipped. */
1063  if (fd >= 0)
1064  {
1065  (void) pg_fsync(fd);
1066  close(fd);
1067  }
1069  }
1070  else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1071  {
1072  int fd;
1073 
1075  S_IRUSR | S_IWUSR);
 /* Same best-effort fsync as for the standby signal file above. */
1076  if (fd >= 0)
1077  {
1078  (void) pg_fsync(fd);
1079  close(fd);
1080  }
1082  }
1083 
 /*
  * Start from "nothing requested", then translate whichever signal file was
  * detected into the two request flags. With neither file present we return
  * with both flags still false.
  */
1084  StandbyModeRequested = false;
1085  ArchiveRecoveryRequested = false;
1087  {
1088  StandbyModeRequested = true;
1089  ArchiveRecoveryRequested = true;
1090  }
1091  else if (recovery_signal_file_found)
1092  {
1093  StandbyModeRequested = false;
1094  ArchiveRecoveryRequested = true;
1095  }
1096  else
1097  return;
1098 
1099  /*
1100  * We don't support standby mode in standalone backends; that requires
1101  * other processes such as the WAL receiver to be alive.
1102  */
1104  ereport(FATAL,
1105  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1106  errmsg("standby mode is not supported by single-user servers")));
1107 }
1108 
/*
 * Validate the recovery-related parameters and settle the recovery target
 * timeline.
 *
 * Visible checks: a WARNING is logged when neither "primary_conninfo" nor
 * "restore_command" is set in the first branch, and a FATAL error is raised
 * when "restore_command" is unset in the else branch, whose message says it
 * applies "when standby mode is not enabled". A timeline held in rtli other
 * than 1 must have a history file (FATAL otherwise) before it is stored in
 * recoveryTargetTLI.
 *
 * NOTE(review): the embedded line numbers in this listing skip 1110, 1112,
 * 1118, 1140-1142, 1148, 1150-1152, 1162, 1164, 1174, 1177 and 1185, so the
 * function's name line, several branch conditions and some assignments are
 * not visible here; confirm against the upstream file.
 */
1109 static void
1111 {
1113  return;
1114 
1115  /*
1116  * Check for compulsory parameters
1117  */
1119  {
 /* Only a WARNING here: see the hint text for the fallback behaviour. */
1120  if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1121  (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1122  ereport(WARNING,
1123  (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1124  errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1125  }
1126  else
1127  {
 /* A NULL or empty restore_command is fatal in this branch. */
1128  if (recoveryRestoreCommand == NULL ||
1129  strcmp(recoveryRestoreCommand, "") == 0)
1130  ereport(FATAL,
1131  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1132  errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1133  }
1134 
1135  /*
1136  * Override any inconsistent requests. Note that this is a change of
1137  * behaviour in 9.5; prior to this we simply ignored a request to pause if
1138  * hot_standby = off, which was surprising behaviour.
1139  */
1143 
1144  /*
1145  * Final parsing of recovery_target_time string; see also
1146  * check_recovery_target_time().
1147  */
1149  {
1153  Int32GetDatum(-1)));
1154  }
1155 
1156  /*
1157  * If user specified recovery_target_timeline, validate it or compute the
1158  * "latest" value. We can't do this until after we've gotten the restore
1159  * command and set InArchiveRecovery, because we need to fetch timeline
1160  * history files from the archive.
1161  */
1163  {
1165 
1166  /* Timeline 1 does not have a history file, all else should */
1167  if (rtli != 1 && !existsTimeLineHistory(rtli))
1168  ereport(FATAL,
1169  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1170  errmsg("recovery target timeline %u does not exist",
1171  rtli)));
1172  recoveryTargetTLI = rtli;
1173  }
1175  {
1176  /* We start the "latest" search from pg_control's timeline */
1178  }
1179  else
1180  {
1181  /*
1182  * else we just use the recoveryTargetTLI as already read from
1183  * ControlFile
1184  */
1186  }
1187 }
1188 
1189 /*
1190  * read_backup_label: check to see if a backup_label file is present
1191  *
1192  * If we see a backup_label during recovery, we assume that we are recovering
1193  * from a backup dump file, and we therefore roll forward from the checkpoint
1194  * identified by the label file, NOT what pg_control says. This avoids the
1195  * problem that pg_control might have been archived one or more checkpoints
1196  * later than the start of the dump, and so if we rely on it as the start
1197  * point, we will fail to restore a consistent database state.
1198  *
1199  * Returns true if a backup_label was found (and fills the checkpoint
1200  * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1201  * returns false if not. If this backup_label came from a streamed backup,
1202  * *backupEndRequired is set to true. If this backup_label was created during
1203  * recovery, *backupFromStandby is set to true.
1204  *
1205  * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1206  * and TLI read from the backup file.
1207  */
1208 static bool
1209 read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1210  bool *backupEndRequired, bool *backupFromStandby)
1211 {
 /*
  * NOTE(review): the embedded line numbers in this listing skip 1238 and
  * 1336, i.e. the line between "ereport(FATAL," and "errmsg(" in both
  * "could not read file" reports is not visible here; confirm against the
  * upstream file.
  */
1212  char startxlogfilename[MAXFNAMELEN];
1213  TimeLineID tli_from_walseg,
1214  tli_from_file;
1215  FILE *lfp;
 /* ch receives the character after each mandatory line; it must be '\n' */
1216  char ch;
1217  char backuptype[20];
1218  char backupfrom[20];
1219  char backuplabel[MAXPGPATH];
1220  char backuptime[128];
 /* hi/lo are the two 32-bit halves of an LSN as scanned with %X/%X */
1221  uint32 hi,
1222  lo;
1223 
1224  /* suppress possible uninitialized-variable warnings */
1225  *checkPointLoc = InvalidXLogRecPtr;
1226  *backupLabelTLI = 0;
1227  *backupEndRequired = false;
1228  *backupFromStandby = false;
1229 
1230  /*
1231  * See if label file is present
1232  */
1233  lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1234  if (!lfp)
1235  {
 /* Only a missing file (ENOENT) is acceptable; other failures are fatal. */
1236  if (errno != ENOENT)
1237  ereport(FATAL,
1239  errmsg("could not read file \"%s\": %m",
1240  BACKUP_LABEL_FILE)));
1241  return false; /* it's not there, all is fine */
1242  }
1243 
1244  /*
1245  * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1246  * is pretty crude, but we are not expecting any variability in the file
1247  * format).
1248  */
1249  if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
1250  &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1251  ereport(FATAL,
1252  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1253  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1254  RedoStartLSN = ((uint64) hi) << 32 | lo;
1255  RedoStartTLI = tli_from_walseg;
1256  if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
1257  &hi, &lo, &ch) != 3 || ch != '\n')
1258  ereport(FATAL,
1259  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1260  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1261  *checkPointLoc = ((uint64) hi) << 32 | lo;
 /* The TLI reported to the caller is the one parsed from the WAL file name. */
1262  *backupLabelTLI = tli_from_walseg;
1263 
1264  /*
1265  * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1266  * which could mean either pg_basebackup or the pg_backup_start/stop
1267  * method was used) or if this label came from somewhere else (the only
1268  * other option today being from pg_rewind). If this was a streamed
1269  * backup then we know that we need to play through until we get to the
1270  * end of the WAL which was generated during the backup (at which point we
1271  * will have reached consistency and backupEndRequired will be reset to be
1272  * false).
1273  */
 /* Optional line: without a match *backupEndRequired keeps its false default. */
1274  if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1275  {
1276  if (strcmp(backuptype, "streamed") == 0)
1277  *backupEndRequired = true;
1278  }
1279 
1280  /*
1281  * BACKUP FROM lets us know if this was from a primary or a standby. If
1282  * it was from a standby, we'll double-check that the control file state
1283  * matches that of a standby.
1284  */
 /* Optional line: without a match *backupFromStandby keeps its false default. */
1285  if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1286  {
1287  if (strcmp(backupfrom, "standby") == 0)
1288  *backupFromStandby = true;
1289  }
1290 
1291  /*
1292  * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1293  * but checking for their presence is useful for debugging and the next
1294  * sanity checks. Cope also with the fact that the result buffers have a
1295  * pre-allocated size, hence if the backup_label file has been generated
1296  * with strings longer than the maximum assumed here an incorrect parsing
1297  * happens. That's fine as only minor consistency checks are done
1298  * afterwards.
1299  */
1300  if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1301  ereport(DEBUG1,
1302  (errmsg_internal("backup time %s in file \"%s\"",
1303  backuptime, BACKUP_LABEL_FILE)));
1304 
1305  if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1306  ereport(DEBUG1,
1307  (errmsg_internal("backup label %s in file \"%s\"",
1308  backuplabel, BACKUP_LABEL_FILE)));
1309 
1310  /*
1311  * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1312  * it as a sanity check if present.
1313  */
1314  if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1315  {
1316  if (tli_from_walseg != tli_from_file)
1317  ereport(FATAL,
1318  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1319  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1320  errdetail("Timeline ID parsed is %u, but expected %u.",
1321  tli_from_file, tli_from_walseg)));
1322 
1323  ereport(DEBUG1,
1324  (errmsg_internal("backup timeline %u in file \"%s\"",
1325  tli_from_file, BACKUP_LABEL_FILE)));
1326  }
1327 
 /*
  * Any successful conversion on an INCREMENTAL FROM LSN line is enough to
  * reject the directory: it is an incremental backup, not a data directory.
  */
1328  if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%X\n", &hi, &lo) > 0)
1329  ereport(FATAL,
1330  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1331  errmsg("this is an incremental backup, not a data directory"),
1332  errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1333 
 /* Fail on a stream error from the reads above or a nonzero FreeFile() result. */
1334  if (ferror(lfp) || FreeFile(lfp))
1335  ereport(FATAL,
1337  errmsg("could not read file \"%s\": %m",
1338  BACKUP_LABEL_FILE)));
1339 
1340  return true;
1341 }
1342 
1343 /*
1344  * read_tablespace_map: check to see if a tablespace_map file is present
1345  *
1346  * If we see a tablespace_map file during recovery, we assume that we are
1347  * recovering from a backup dump file, and we therefore need to create symlinks
1348  * as per the information present in tablespace_map file.
1349  *
1350  * Returns true if a tablespace_map file was found (and fills *tablespaces
1351  * with a tablespaceinfo struct for each tablespace listed in the file);
1352  * returns false if not.
1353  */
1354 static bool
1356 {
 /*
  * NOTE(review): the embedded line numbers in this listing skip 1355, 1373
  * and 1440, so the function's name/parameter line and one line of each
  * "could not read file" report are not visible here; confirm against the
  * upstream file.
  */
1357  tablespaceinfo *ti;
1358  FILE *lfp;
 /* str accumulates the de-escaped characters of the current line */
1359  char str[MAXPGPATH];
 /* i = characters stored in str so far; n = index of the first space */
1360  int ch,
1361  i,
1362  n;
 /* true when the previous character was an unescaped backslash */
1363  bool was_backslash;
1364 
1365  /*
1366  * See if tablespace_map file is present
1367  */
1368  lfp = AllocateFile(TABLESPACE_MAP, "r");
1369  if (!lfp)
1370  {
 /* Only a missing file (ENOENT) is acceptable; other failures are fatal. */
1371  if (errno != ENOENT)
1372  ereport(FATAL,
1374  errmsg("could not read file \"%s\": %m",
1375  TABLESPACE_MAP)));
1376  return false; /* it's not there, all is fine */
1377  }
1378 
1379  /*
1380  * Read and parse the link name and path lines from tablespace_map file
1381  * (this code is pretty crude, but we are not expecting any variability in
1382  * the file format). De-escape any backslashes that were inserted.
1383  */
1384  i = 0;
1385  was_backslash = false;
1386  while ((ch = fgetc(lfp)) != EOF)
1387  {
 /* An unescaped \n or \r terminates the current entry. */
1388  if (!was_backslash && (ch == '\n' || ch == '\r'))
1389  {
1390  char *endp;
1391 
1392  if (i == 0)
1393  continue; /* \r immediately followed by \n */
1394 
1395  /*
1396  * The de-escaped line should contain an OID followed by exactly
1397  * one space followed by a path. The path might start with
1398  * spaces, so don't be too liberal about parsing.
1399  */
1400  str[i] = '\0';
1401  n = 0;
1402  while (str[n] && str[n] != ' ')
1403  n++;
 /*
  * Reject a line that starts with a space (n < 1), has no space at all
  * (n == i), or has nothing after the space (n == i - 1).
  */
1404  if (n < 1 || n >= i - 1)
1405  ereport(FATAL,
1406  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1407  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
 /* Split in place: str is now the OID text, str + n the path. */
1408  str[n++] = '\0';
1409 
1410  ti = palloc0(sizeof(tablespaceinfo));
1411  errno = 0;
1412  ti->oid = strtoul(str, &endp, 10);
 /* Trailing non-numeric characters or a conversion error are fatal. */
1413  if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1414  ereport(FATAL,
1415  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1416  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1417  ti->path = pstrdup(str + n);
1418  *tablespaces = lappend(*tablespaces, ti);
1419 
1420  i = 0;
1421  continue;
1422  }
1423  else if (!was_backslash && ch == '\\')
1424  was_backslash = true;
1425  else
1426  {
 /*
  * Ordinary or escaped character: store it literally. Characters
  * that do not fit in str are silently dropped.
  */
1427  if (i < sizeof(str) - 1)
1428  str[i++] = ch;
1429  was_backslash = false;
1430  }
1431  }
1432 
1433  if (i != 0 || was_backslash) /* last line not terminated? */
1434  ereport(FATAL,
1435  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1436  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1437 
 /* Fail on a stream error from the reads above or a nonzero FreeFile() result. */
1438  if (ferror(lfp) || FreeFile(lfp))
1439  ereport(FATAL,
1441  errmsg("could not read file \"%s\": %m",
1442  TABLESPACE_MAP)));
1443 
1444  return true;
1445 }
1446 
1447 /*
1448  * Finish WAL recovery.
1449  *
1450  * This does not close the 'xlogreader' yet, because in some cases the caller
1451  * still wants to re-read the last checkpoint record by calling
1452  * ReadCheckpointRecord().
1453  *
1454  * Returns the position of the last valid or applied record, after which new
1455  * WAL should be appended, information about why recovery was ended, and some
1456  * other things. See the EndOfWalRecoveryInfo struct for details.
1457  */
1460 {
 /*
  * NOTE(review): the embedded line numbers in this listing skip 1458-1459,
  * 1461, 1472, 1520, 1523, 1536, 1544, 1590, 1597 and 1599-1600. The
  * function's return-type/name lines, the declaration of 'result' and
  * several statements are therefore not visible here; confirm against the
  * upstream file.
  */
1462  XLogRecPtr lastRec;
1463  TimeLineID lastRecTLI;
1464  XLogRecPtr endOfLog;
1465 
1466  /*
1467  * Kill WAL receiver, if it's still running, before we continue to write
1468  * the startup checkpoint and aborted-contrecord records. It would
1469  * trample over these records and subsequent ones if it were still alive
1470  * when we start writing WAL.
1471  */
1473 
1474  /*
1475  * Shut down the slot sync worker to drop any temporary slots acquired by
1476  * it and to prevent it from continuing to fetch the failover slots.
1477  *
1478  * We do not update the 'synced' column in 'pg_replication_slots' system
1479  * view from true to false here, as any failed update could leave 'synced'
1480  * column false for some slots. This could cause issues during slot sync
1481  * after restarting the server as a standby. While updating the 'synced'
1482  * column after switching to the new timeline is an option, it does not
1483  * simplify the handling for the 'synced' column. Therefore, we retain the
1484  * 'synced' column as true after promotion as it may provide useful
1485  * information about the slot origin.
1486  */
1487  ShutDownSlotSync();
1488 
1489  /*
1490  * We are now done reading the xlog from stream. Turn off streaming
1491  * recovery to force fetching the files (which would be required at end of
1492  * recovery, e.g., timeline history file) from archive or pg_wal.
1493  *
1494  * Note that standby mode must be turned off after killing WAL receiver,
1495  * i.e., calling XLogShutdownWalRcv().
1496  */
1497  Assert(!WalRcvStreaming());
1498  StandbyMode = false;
1499 
1500  /*
1501  * Determine where to start writing WAL next.
1502  *
1503  * Re-fetch the last valid or last applied record, so we can identify the
1504  * exact endpoint of what we consider the valid portion of WAL. There may
1505  * be an incomplete continuation record after that, in which case
1506  * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1507  * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1508  * it is intentionally missing. See CreateOverwriteContrecordRecord().
1509  *
1510  * An important side-effect of this is to load the last page into
1511  * xlogreader. The caller uses it to initialize the WAL for writing.
1512  */
1513  if (!InRecovery)
1514  {
1515  lastRec = CheckPointLoc;
1516  lastRecTLI = CheckPointTLI;
1517  }
1518  else
1519  {
1521  lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1522  }
 /* The record itself is discarded; only the reader state it leaves is used. */
1524  (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1525  endOfLog = xlogreader->EndRecPtr;
1526 
1527  /*
1528  * Remember the TLI in the filename of the XLOG segment containing the
1529  * end-of-log. It could be different from the timeline that endOfLog
1530  * nominally belongs to, if there was a timeline switch in that segment,
1531  * and we were reading the old WAL from a segment belonging to a higher
1532  * timeline.
1533  */
1534  result->endOfLogTLI = xlogreader->seg.ws_tli;
1535 
1537  {
1538  /*
1539  * We are no longer in archive recovery state.
1540  *
1541  * We are now done reading the old WAL. Turn off archive fetching if
1542  * it was active.
1543  */
1545  InArchiveRecovery = false;
1546 
1547  /*
1548  * If the ending log segment is still open, close it (to avoid
1549  * problems on Windows with trying to rename or delete an open file).
1550  */
1551  if (readFile >= 0)
1552  {
1553  close(readFile);
1554  readFile = -1;
1555  }
1556  }
1557 
1558  /*
1559  * Copy the last partial block to the caller, for initializing the WAL
1560  * buffer for appending new WAL.
1561  */
1562  if (endOfLog % XLOG_BLCKSZ != 0)
1563  {
1564  char *page;
1565  int len;
1566  XLogRecPtr pageBeginPtr;
1567 
 /* Round endOfLog down to the start of the block that contains it. */
1568  pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1569  Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
1570 
1571  /* Copy the valid part of the last block */
1572  len = endOfLog % XLOG_BLCKSZ;
1573  page = palloc(len);
1574  memcpy(page, xlogreader->readBuf, len);
1575 
1576  result->lastPageBeginPtr = pageBeginPtr;
1577  result->lastPage = page;
1578  }
1579  else
1580  {
1581  /* There is no partial block to copy. */
1582  result->lastPageBeginPtr = endOfLog;
1583  result->lastPage = NULL;
1584  }
1585 
1586  /*
1587  * Create a comment for the history file to explain why and where timeline
1588  * changed.
1589  */
1591 
1592  result->lastRec = lastRec;
1593  result->lastRecTLI = lastRecTLI;
1594  result->endOfLog = endOfLog;
1595 
1596  result->abortedRecPtr = abortedRecPtr;
1598 
1601 
1602  return result;
1603 }
1604 
1605 /*
1606  * Clean up the WAL reader and leftovers from restoring WAL from archive
1607  */
1608 void
1610 {
 /*
  * NOTE(review): the embedded line numbers in this listing skip 1609, 1614,
  * 1622-1623, 1625 and 1643-1644. The function's name line, the condition
  * guarding the block that removes the leftover files, and the statements
  * that the "Final update" and latch comments describe are therefore not
  * visible here; confirm against the upstream file.
  */
 /* Scratch buffer for the two leftover-file paths built below. */
1611  char recoveryPath[MAXPGPATH];
1612 
1613  /* Final update of pg_stat_recovery_prefetch. */
1615 
1616  /* Shut down xlogreader */
1617  if (readFile >= 0)
1618  {
1619  close(readFile);
1620  readFile = -1;
1621  }
1624 
1626  {
1627  /*
1628  * Since there might be a partial WAL segment named RECOVERYXLOG, get
1629  * rid of it.
1630  */
1631  snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1632  unlink(recoveryPath); /* ignore any error */
1633 
1634  /* Get rid of any remaining recovered timeline-history file, too */
1635  snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1636  unlink(recoveryPath); /* ignore any error */
1637  }
1638 
1639  /*
1640  * We don't need the latch anymore. It's not strictly necessary to disown
1641  * it, but let's do it for the sake of tidiness.
1642  */
1645 }
1646 
1647 /*
1648  * Perform WAL recovery.
1649  *
1650  * If the system was shut down cleanly, this is never called.
1651  */
1652 void
1654 {
 /*
  * NOTE(review): the embedded line numbers in this listing skip many lines
  * (for example 1653, 1664-1665, 1667-1669, 1673-1675, 1677-1682, 1703,
  * 1797, 1807, 1826, 1864, 1872, 1878 and 1908-1909). The function's name
  * line, several branch conditions, the labels of the switch below and a
  * number of statements are therefore not visible here; confirm against the
  * upstream file.
  */
1655  XLogRecord *record;
 /* set when the redo loop is left because the recovery target was hit */
1656  bool reachedRecoveryTarget = false;
 /* timeline passed to ReadRecord(); ApplyWalRecord() receives its address */
1657  TimeLineID replayTLI;
1658 
1659  /*
1660  * Initialize shared variables for tracking progress of WAL replay, as if
1661  * we had just replayed the record before the REDO location (or the
1662  * checkpoint record itself, if it's a shutdown checkpoint).
1663  */
1666  {
1670  }
1671  else
1672  {
1676  }
1683 
1684  /* Also ensure XLogReceiptTime has a sane value */
1686 
1687  /*
1688  * Let postmaster know we've started redo now, so that it can launch the
1689  * archiver if necessary.
1690  */
1691  if (IsUnderPostmaster)
1693 
1694  /*
1695  * Allow read-only connections immediately if we're consistent already.
1696  */
1698 
1699  /*
1700  * Find the first record that logically follows the checkpoint --- it
1701  * might physically precede it, though.
1702  */
1704  {
1705  /* back up to find the record */
1706  replayTLI = RedoStartTLI;
1708  record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1709 
1710  /*
1711  * If a checkpoint record's redo pointer points back to an earlier
1712  * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1713  * record.
1714  */
1715  if (record->xl_rmid != RM_XLOG_ID ||
1716  (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
1717  ereport(FATAL,
1718  (errmsg("unexpected record type found at redo point %X/%X",
1720  }
1721  else
1722  {
1723  /* just have to read next record after CheckPoint */
1725  replayTLI = CheckPointTLI;
1726  record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1727  }
1728 
 /* A NULL record here means there is nothing to replay (see else branch). */
1729  if (record != NULL)
1730  {
1731  TimestampTz xtime;
1732  PGRUsage ru0;
1733 
 /* Snapshot resource usage; it is shown in the "redo done" message below. */
1734  pg_rusage_init(&ru0);
1735 
1736  InRedo = true;
1737 
1738  RmgrStartup();
1739 
1740  ereport(LOG,
1741  (errmsg("redo starts at %X/%X",
1743 
1744  /* Prepare to report progress of the redo phase. */
1745  if (!StandbyMode)
1747 
1748  /*
1749  * main redo apply loop
1750  */
1751  do
1752  {
1753  if (!StandbyMode)
1754  ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
1756 
1757 #ifdef WAL_DEBUG
1758  if (XLOG_DEBUG)
1759  {
1761 
1762  initStringInfo(&buf);
1763  appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
1766  xlog_outrec(&buf, xlogreader);
1767  appendStringInfoString(&buf, " - ");
1769  elog(LOG, "%s", buf.data);
1770  pfree(buf.data);
1771  }
1772 #endif
1773 
1774  /* Handle interrupt signals of startup process */
1776 
1777  /*
1778  * Pause WAL replay, if requested by a hot-standby session via
1779  * SetRecoveryPause().
1780  *
1781  * Note that we intentionally don't take the info_lck spinlock
1782  * here. We might therefore read a slightly stale value of the
1783  * recoveryPause flag, but it can't be very stale (no worse than
1784  * the last spinlock we did acquire). Since a pause request is a
1785  * pretty asynchronous thing anyway, possibly responding to it one
1786  * WAL record later than we otherwise would is a minor issue, so
1787  * it doesn't seem worth adding another spinlock cycle to prevent
1788  * that.
1789  */
1790  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1792  recoveryPausesHere(false);
1793 
1794  /*
1795  * Have we reached our recovery target?
1796  */
1798  {
 /* Leave the loop without applying the current record. */
1799  reachedRecoveryTarget = true;
1800  break;
1801  }
1802 
1803  /*
1804  * If we've been asked to lag the primary, wait on latch until
1805  * enough time has passed.
1806  */
1808  {
1809  /*
1810  * We test for paused recovery again here. If user sets
1811  * delayed apply, it may be because they expect to pause
1812  * recovery in case of problems, so we must test again here
1813  * otherwise pausing during the delay-wait wouldn't work.
1814  */
1815  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1817  recoveryPausesHere(false);
1818  }
1819 
1820  /*
1821  * Apply the record
1822  */
1823  ApplyWalRecord(xlogreader, record, &replayTLI);
1824 
1825  /* Exit loop if we reached inclusive recovery target */
1827  {
1828  reachedRecoveryTarget = true;
1829  break;
1830  }
1831 
1832  /*
1833  * If we replayed an LSN that someone was waiting for then walk
1834  * over the shared memory array and set latches to notify the
1835  * waiters.
1836  */
1837  if (waitLSNState &&
1841 
1842  /* Else, try to fetch the next WAL record */
1843  record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1844  } while (record != NULL);
1845 
1846  /*
1847  * end of main redo apply loop
1848  */
1849 
1850  if (reachedRecoveryTarget)
1851  {
 /* Stopping at a target that precedes consistency is fatal. */
1852  if (!reachedConsistency)
1853  ereport(FATAL,
1854  (errmsg("requested recovery stop point is before consistent recovery point")));
1855 
1856  /*
1857  * This is the last point where we can restart recovery with a new
1858  * recovery target, if we shutdown and begin again. After this,
1859  * Resource Managers may choose to do permanent corrective actions
1860  * at end of recovery.
1861  */
1862  switch (recoveryTargetAction)
1863  {
1865 
1866  /*
1867  * exit with special return code to request shutdown of
1868  * postmaster. Log messages issued from postmaster.
1869  */
1870  proc_exit(3);
1871 
1873  SetRecoveryPause(true);
1874  recoveryPausesHere(true);
1875 
1876  /* drop into promote */
1877 
1879  break;
1880  }
1881  }
1882 
1883  RmgrCleanup();
1884 
1885  ereport(LOG,
1886  (errmsg("redo done at %X/%X system usage: %s",
1888  pg_rusage_show(&ru0))));
1889  xtime = GetLatestXTime();
 /* Only logged when GetLatestXTime() returned a nonzero timestamp. */
1890  if (xtime)
1891  ereport(LOG,
1892  (errmsg("last completed transaction was at log time %s",
1893  timestamptz_to_str(xtime))));
1894 
1895  InRedo = false;
1896  }
1897  else
1898  {
1899  /* there are no WAL records following the checkpoint */
1900  ereport(LOG,
1901  (errmsg("redo is not required")));
1902  }
1903 
1904  /*
1905  * This check is intentionally after the above log messages that indicate
1906  * how far recovery went.
1907  */
1910  !reachedRecoveryTarget)
1911  ereport(FATAL,
1912  (errcode(ERRCODE_CONFIG_FILE_ERROR),
1913  errmsg("recovery ended before configured recovery target was reached")));
1914 }
1915 
1916 /*
1917  * Subroutine of PerformWalRecovery, to apply one WAL record.
1918  */
1919 static void
1921 {
 /*
  * NOTE(review): the embedded line numbers in this listing skip 1920, 1934,
  * 1970, 1983-1984, 1986, 1991, 1993, 2011, 2020-2022, 2024, 2048, 2056,
  * 2063, 2072 and 2075. The function's name/parameter line and a number of
  * statements and conditions are therefore not visible here; confirm
  * against the upstream file. The body uses 'xlogreader', 'record' and the
  * pointer 'replayTLI', which is both read and updated.
  */
1922  ErrorContextCallback errcallback;
 /* becomes true when this record moves replay onto a different timeline */
1923  bool switchedTLI = false;
1924 
1925  /* Setup error traceback support for ereport() */
1926  errcallback.callback = rm_redo_error_callback;
1927  errcallback.arg = (void *) xlogreader;
1928  errcallback.previous = error_context_stack;
1929  error_context_stack = &errcallback;
1930 
1931  /*
1932  * TransamVariables->nextXid must be beyond record's xid.
1933  */
1935 
1936  /*
1937  * Before replaying this record, check if this record causes the current
1938  * timeline to change. The record is already considered to be part of the
1939  * new timeline, so we update replayTLI before replaying it. That's
1940  * important so that replayEndTLI, which is recorded as the minimum
1941  * recovery point's TLI if recovery stops after this record, is set
1942  * correctly.
1943  */
1944  if (record->xl_rmid == RM_XLOG_ID)
1945  {
 /*
  * Both default to the current timeline, so only the two record types
  * handled below can make newReplayTLI differ from *replayTLI.
  */
1946  TimeLineID newReplayTLI = *replayTLI;
1947  TimeLineID prevReplayTLI = *replayTLI;
1948  uint8 info = record->xl_info & ~XLR_INFO_MASK;
1949 
1950  if (info == XLOG_CHECKPOINT_SHUTDOWN)
1951  {
1952  CheckPoint checkPoint;
1953 
1954  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1955  newReplayTLI = checkPoint.ThisTimeLineID;
1956  prevReplayTLI = checkPoint.PrevTimeLineID;
1957  }
1958  else if (info == XLOG_END_OF_RECOVERY)
1959  {
1960  xl_end_of_recovery xlrec;
1961 
1962  memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1963  newReplayTLI = xlrec.ThisTimeLineID;
1964  prevReplayTLI = xlrec.PrevTimeLineID;
1965  }
1966 
1967  if (newReplayTLI != *replayTLI)
1968  {
1969  /* Check that it's OK to switch to this TLI */
1971  newReplayTLI, prevReplayTLI, *replayTLI);
1972 
1973  /* Following WAL records should be run with new TLI */
1974  *replayTLI = newReplayTLI;
1975  switchedTLI = true;
1976  }
1977  }
1978 
1979  /*
1980  * Update shared replayEndRecPtr before replaying this record, so that
1981  * XLogFlush will update minRecoveryPoint correctly.
1982  */
1985  XLogRecoveryCtl->replayEndTLI = *replayTLI;
1987 
1988  /*
1989  * If we are attempting to enter Hot Standby mode, process XIDs we see
1990  */
1992  TransactionIdIsValid(record->xl_xid))
1994 
1995  /*
1996  * Some XLOG record types that are related to recovery are processed
1997  * directly here, rather than in xlog_redo()
1998  */
1999  if (record->xl_rmid == RM_XLOG_ID)
2000  xlogrecovery_redo(xlogreader, *replayTLI);
2001 
2002  /* Now apply the WAL record itself */
2003  GetRmgr(record->xl_rmid).rm_redo(xlogreader);
2004 
2005  /*
2006  * After redo, check whether the backup pages associated with the WAL
2007  * record are consistent with the existing pages. This check is done only
2008  * if consistency check is enabled for this record.
2009  */
2010  if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
2012 
2013  /* Pop the error context stack */
2014  error_context_stack = errcallback.previous;
2015 
2016  /*
2017  * Update lastReplayedEndRecPtr after this record has been successfully
2018  * replayed.
2019  */
2023  XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
2025 
2026  /* ------
2027  * Wakeup walsenders:
2028  *
2029  * On the standby, the WAL is flushed first (which will only wake up
2030  * physical walsenders) and then applied, which will only wake up logical
2031  * walsenders.
2032  *
2033  * Indeed, logical walsenders on standby can't decode and send data until
2034  * it's been applied.
2035  *
2036  * Physical walsenders don't need to be woken up during replay unless
2037  * cascading replication is allowed and time line change occurred (so that
2038  * they can notice that they are on a new time line).
2039  *
2040  * That's why the wake up conditions are for:
2041  *
2042  * - physical walsenders in case of new time line and cascade
2043  * replication is allowed
2044  * - logical walsenders in case cascade replication is allowed (could not
2045  * be created otherwise)
2046  * ------
2047  */
2049  WalSndWakeup(switchedTLI, true);
2050 
2051  /*
2052  * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2053  * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2054  * a reply to the primary.
2055  */
2057  {
 /* Clear the request before acting on it so it fires only once. */
2058  doRequestWalReceiverReply = false;
2059  WalRcvForceReply();
2060  }
2061 
2062  /* Allow read-only connections if we're consistent now */
2064 
2065  /* Is this a timeline switch? */
2066  if (switchedTLI)
2067  {
2068  /*
2069  * Before we continue on the new timeline, clean up any (possibly
2070  * bogus) future WAL segments on the old timeline.
2071  */
2073 
2074  /* Reset the prefetcher. */
2076  }
2077 }
2078 
2079 /*
2080  * Some XLOG RM record types that are directly related to WAL recovery are
2081  * handled here rather than in xlog_redo().
2082  */
2083 static void
2085 {
 /*
  * NOTE(review): the embedded line numbers in this listing skip 2084, 2094,
  * 2099-2100, 2103-2104, 2108-2109, 2112 and 2136. The function's
  * name/parameter line, the declaration of 'xlrec', the arguments of the
  * log calls and the statements under the "safely skipped" and "only happen
  * once" comments are therefore not visible here; confirm against the
  * upstream file.
  */
2086  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
 /* End position of this record; becomes backupEndPoint on a matching BACKUP_END. */
2087  XLogRecPtr lsn = record->EndRecPtr;
2088 
2089  Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2090 
2091  if (info == XLOG_OVERWRITE_CONTRECORD)
2092  {
2093  /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2095 
2096  memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
 /* The LSN stored in the record must match what the reader recorded. */
2097  if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2098  elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
2101 
2102  /* We have safely skipped the aborted record */
2105 
2106  ereport(LOG,
2107  (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
2110 
2111  /* Verifying the record should only happen once */
2113  }
2114  else if (info == XLOG_BACKUP_END)
2115  {
2116  XLogRecPtr startpoint;
2117 
 /* The record payload is the start LSN of the backup that has just ended. */
2118  memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2119 
2120  if (backupStartPoint == startpoint)
2121  {
2122  /*
2123  * We have reached the end of base backup, the point where
2124  * pg_backup_stop() was done. The data on disk is now consistent
2125  * (assuming we have also reached minRecoveryPoint). Set
2126  * backupEndPoint to the current LSN, so that the next call to
2127  * CheckRecoveryConsistency() will notice it and do the
2128  * end-of-backup processing.
2129  */
2130  elog(DEBUG1, "end of backup record reached");
2131 
2132  backupEndPoint = lsn;
2133  }
2134  else
 /* End record of a different backup: only logged, no state change. */
2135  elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X",
2137  }
2138 }
2139 
2140 /*
2141  * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2142  * directories.
2143  *
2144  * Replay of database creation XLOG records for databases that were later
2145  * dropped can create fake directories in pg_tblspc. By the time consistency
2146  * is reached these directories should have been removed; here we verify
2147  * that this did indeed happen. This is to be called at the point where
2148  * consistent state is reached.
2149  *
2150  * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2151  * useful for testing purposes, and also allows for an escape hatch in case
2152  * things go south.
      *
      * Only entries whose names consist solely of digits are examined; every
      * such entry is expected to be a symbolic link.
2153  */
2154 static void
2156 {
2157  DIR *dir;
2158  struct dirent *de;
2159 
2160  dir = AllocateDir(PG_TBLSPC_DIR);
2161  while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
2162  {
2163  char path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
2164 
2165  /* Skip entries of non-oid names */
2166  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2167  continue;
2168 
2169  snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
2170 
2171  if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2174  errmsg("unexpected directory entry \"%s\" found in %s",
2175  de->d_name, PG_TBLSPC_DIR),
2176  errdetail("All directory entries in %s/ should be symbolic links.",
2177  PG_TBLSPC_DIR),
2178  errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2179  }

       /*
        * Release the handle obtained from AllocateDir() explicitly, so that
        * it does not stay open after the scan has finished.
        */
       FreeDir(dir);
2180 }
2181 
2182 /*
2183  * Checks if recovery has reached a consistent state. When consistency is
2184  * reached and we have a valid starting standby snapshot, tell postmaster
2185  * that it can start accepting read-only connections.
      *
      * There is no return value; progress is recorded in reachedConsistency
      * and LocalHotStandbyActive.
2186  */
2187 static void
2189 {
2190  XLogRecPtr lastReplayedEndRecPtr;
2191  TimeLineID lastReplayedTLI;
2192 
2193  /*
2194  * During crash recovery, we don't reach a consistent state until we've
2195  * replayed all the WAL.
2196  */
2198  return;
2199 
2201 
2202  /*
2203  * Assume that we are called in the startup process, and hence don't need
2204  * a lock to read lastReplayedEndRecPtr and lastReplayedTLI.
2205  */
2206  lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2207  lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2208 
2209  /*
2210  * Have we reached the point where our base backup was completed?
2211  */
2213  backupEndPoint <= lastReplayedEndRecPtr)
2214  {
       /* Remember both points for the LOG message at the end of this branch. */
2215  XLogRecPtr saveBackupStartPoint = backupStartPoint;
2216  XLogRecPtr saveBackupEndPoint = backupEndPoint;
2217 
2218  elog(DEBUG1, "end of backup reached");
2219 
2220  /*
2221  * We have reached the end of base backup, as indicated by pg_control.
2222  * Update the control file accordingly.
2223  */
2224  ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2227  backupEndRequired = false;
2228 
2229  ereport(LOG,
2230  (errmsg("completed backup recovery with redo LSN %X/%X and end LSN %X/%X",
2231  LSN_FORMAT_ARGS(saveBackupStartPoint),
2232  LSN_FORMAT_ARGS(saveBackupEndPoint))));
2233  }
2234 
2235  /*
2236  * Have we passed our safe starting point? Note that minRecoveryPoint is
2237  * known to be incorrectly set if recovering from a backup, until the
2238  * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2239  * All we know prior to that is that we're not consistent yet.
2240  */
2242  minRecoveryPoint <= lastReplayedEndRecPtr)
2243  {
2244  /*
2245  * Check to see if the XLOG sequence contained any unresolved
2246  * references to uninitialized pages.
2247  */
2249 
2250  /*
2251  * Check that pg_tblspc doesn't contain any real directories. Replay
2252  * of Database/CREATE_* records may have created fictitious tablespace
2253  * directories that should have been removed by the time consistency
2254  * was reached.
2255  */
2257 
2258  reachedConsistency = true;
2259  ereport(LOG,
2260  (errmsg("consistent recovery state reached at %X/%X",
2261  LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
2262  }
2263 
2264  /*
2265  * Have we got a valid starting snapshot that will allow queries to be
2266  * run? If so, we can tell postmaster that the database is consistent now,
2267  * enabling connections.
2268  */
2273  {
2277 
2278  LocalHotStandbyActive = true;
2279 
2281  }
2282 }
2283 
2284 /*
2285  * Error context callback for errors occurring during rm_redo().
      *
      * 'arg' is the XLogReaderState of the record being replayed. The context
      * line is assembled from xlog_outdesc() and xlog_block_info() output in a
      * temporary buffer, which is freed again before returning.
2286  */
2287 static void
2289 {
2290  XLogReaderState *record = (XLogReaderState *) arg;
2292 
2293  initStringInfo(&buf);
2294  xlog_outdesc(&buf, record);
2295  xlog_block_info(&buf, record);
2296 
2297  /* translator: %s is a WAL record description */
2298  errcontext("WAL redo at %X/%X for %s",
2299  LSN_FORMAT_ARGS(record->ReadRecPtr),
2300  buf.data);
2301 
2302  pfree(buf.data);
2303 }
2304 
2305 /*
2306  * Appends to 'buf' a string describing an XLogRecord, consisting of its
2307  * identity optionally followed by a colon, a space, and a further description.
      *
      * If the resource manager's rm_identify() returns NULL for the record's
      * info bits, "UNKNOWN (%X)" (info masked with ~XLR_INFO_MASK) is emitted
      * in place of the identity. The trailing description comes from the
      * resource manager's rm_desc() callback.
2308  */
2309 void
2311 {
2312  RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2313  uint8 info = XLogRecGetInfo(record);
2314  const char *id;
2315 
2317  appendStringInfoChar(buf, '/');
2318 
2319  id = rmgr.rm_identify(info);
2320  if (id == NULL)
2321  appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2322  else
2323  appendStringInfo(buf, "%s: ", id);
2324 
2325  rmgr.rm_desc(buf, record);
2326 }
2327 
2328 #ifdef WAL_DEBUG
2329 
      /*
       * Appends to 'buf' the previous-record pointer, xid and data length of an
       * XLogRecord, followed by its block references (see xlog_block_info()).
       * Only compiled when WAL_DEBUG is defined.
       */
2330 static void
2331 xlog_outrec(StringInfo buf, XLogReaderState *record)
2332 {
2333  appendStringInfo(buf, "prev %X/%X; xid %u",
2335  XLogRecGetXid(record));
2336 
2337  appendStringInfo(buf, "; len %u",
2338  XLogRecGetDataLen(record));
2339 
2340  xlog_block_info(buf, record);
2341 }
2342 #endif /* WAL_DEBUG */
2343 
2344 /*
2345  * Appends to 'buf' information about all the blocks referenced by an
2346  * XLogRecord.
      *
      * One "; blkref #N: ..." entry is added per block id that the record
      * actually references; ids without a block reference are skipped. The
      * fork number is printed only for forks other than MAIN_FORKNUM, and
      * " FPW" is appended when the block reference carries a block image.
2347  */
2348 static void
2350 {
2351  int block_id;
2352 
2353  /* decode block references */
2354  for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2355  {
2356  RelFileLocator rlocator;
2357  ForkNumber forknum;
2358  BlockNumber blk;
2359 
2360  if (!XLogRecGetBlockTagExtended(record, block_id,
2361  &rlocator, &forknum, &blk, NULL))
2362  continue;
2363 
2364  if (forknum != MAIN_FORKNUM)
2365  appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2366  block_id,
2367  rlocator.spcOid, rlocator.dbOid,
2368  rlocator.relNumber,
2369  forknum,
2370  blk);
2371  else
2372  appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2373  block_id,
2374  rlocator.spcOid, rlocator.dbOid,
2375  rlocator.relNumber,
2376  blk);
2377  if (XLogRecHasBlockImage(record, block_id))
2378  appendStringInfoString(buf, " FPW");
2379  }
2380 }
2381 
2382 
2383 /*
2384  * Check that it's OK to switch to new timeline during recovery.
2385  *
2386  * 'lsn' is the address of the shutdown checkpoint record we're about to
2387  * replay. (Currently, timeline can only change at a shutdown checkpoint).
      *
      * 'newTLI' and 'prevTLI' are the timeline IDs reported for the checkpoint
      * record, and 'replayTLI' is the timeline currently being replayed. Every
      * failed check is reported at PANIC; the function returns normally only
      * if all checks pass.
2388  */
2389 static void
2391  TimeLineID replayTLI)
2392 {
2393  /* Check that the record agrees on what the current (old) timeline is */
2394  if (prevTLI != replayTLI)
2395  ereport(PANIC,
2396  (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2397  prevTLI, replayTLI)));
2398 
2399  /*
2400  * The new timeline better be in the list of timelines we expect to see,
2401  * according to the timeline history. It should also not decrease.
2402  */
2403  if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2404  ereport(PANIC,
2405  (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2406  newTLI, replayTLI)));
2407 
2408  /*
2409  * If we have not yet reached min recovery point, and we're about to
2410  * switch to a timeline greater than the timeline of the min recovery
2411  * point: trouble. After switching to the new timeline, we could not
2412  * possibly visit the min recovery point on the correct timeline anymore.
2413  * This can happen if there is a newer timeline in the archive that
2414  * branched before the timeline the min recovery point is on, and you
2415  * attempt to do PITR to the new timeline.
2416  */
2418  lsn < minRecoveryPoint &&
2419  newTLI > minRecoveryPointTLI)
2420  ereport(PANIC,
2421  (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
2422  newTLI,
2425 
2426  /* Looks good */
2427 }
2428 
2429 
2430 /*
2431  * Extract timestamp from WAL record.
2432  *
2433  * If the record contains a timestamp, returns true, and saves the timestamp
2434  * in *recordXtime. If the record type has no timestamp, returns false.
2435  * Currently, only transaction commit/abort records and restore points contain
2436  * timestamps.
      *
      * *recordXtime is left unmodified when false is returned.
2437  */
2438 static bool
2440 {
2441  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2442  uint8 xact_info = info & XLOG_XACT_OPMASK;
2443  uint8 rmid = XLogRecGetRmid(record);
2444 
       /* Restore point: timestamp is rp_time */
2445  if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2446  {
2447  *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2448  return true;
2449  }
       /* Commit, plain or prepared: timestamp is xact_time */
2450  if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2451  xact_info == XLOG_XACT_COMMIT_PREPARED))
2452  {
2453  *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2454  return true;
2455  }
       /* Abort, plain or prepared: timestamp is xact_time */
2456  if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2457  xact_info == XLOG_XACT_ABORT_PREPARED))
2458  {
2459  *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2460  return true;
2461  }
2462  return false;
2463 }
2464 
2465 /*
2466  * Checks whether the current buffer page and backup page stored in the
2467  * WAL record are consistent or not. Before comparing the two pages, a
2468  * masking can be applied to the pages to ignore certain areas like hint bits,
2469  * unused space between pd_lower and pd_upper among other things. This
2470  * function should be called once WAL replay has been completed for a
2471  * given record.
      *
      * A mismatch between the (masked) images is reported at FATAL; failure to
      * restore the block image from the record is reported at ERROR. A block
      * reference is skipped when its image was already applied by replay, when
      * no valid buffer is obtained for it, or when the page LSN is already
      * past the end of this record.
2472  */
2473 static void
2475 {
2476  RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2477  RelFileLocator rlocator;
2478  ForkNumber forknum;
2479  BlockNumber blkno;
2480  int block_id;
2481 
2482  /* Records with no backup blocks have no need for consistency checks. */
2483  if (!XLogRecHasAnyBlockRefs(record))
2484  return;
2485 
2486  Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
2487 
2488  for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2489  {
2490  Buffer buf;
2491  Page page;
2492 
2493  if (!XLogRecGetBlockTagExtended(record, block_id,
2494  &rlocator, &forknum, &blkno, NULL))
2495  {
2496  /*
2497  * WAL record doesn't contain a block reference with the given id.
2498  * Do nothing.
2499  */
2500  continue;
2501  }
2502 
2503  Assert(XLogRecHasBlockImage(record, block_id));
2504 
2505  if (XLogRecBlockImageApply(record, block_id))
2506  {
2507  /*
2508  * WAL record has already applied the page, so bypass the
2509  * consistency check as that would result in comparing the full
2510  * page stored in the record with itself.
2511  */
2512  continue;
2513  }
2514 
2515  /*
2516  * Read the contents from the current buffer and store it in a
2517  * temporary page.
2518  */
2519  buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2521  InvalidBuffer);
2522  if (!BufferIsValid(buf))
2523  continue;
2524 
2526  page = BufferGetPage(buf);
2527 
2528  /*
2529  * Take a copy of the local page where WAL has been applied to have a
2530  * comparison base before masking it...
2531  */
2532  memcpy(replay_image_masked, page, BLCKSZ);
2533 
2534  /* No need for this page anymore now that a copy is in. */
2536 
2537  /*
2538  * If the block LSN is already ahead of this WAL record, we can't
2539  * expect contents to match. This can happen if recovery is
2540  * restarted.
2541  */
2542  if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
2543  continue;
2544 
2545  /*
2546  * Read the contents from the backup copy, stored in WAL record and
2547  * store it in a temporary page. There is no need to allocate a new
2548  * page here, a local buffer is fine to hold its contents and a mask
2549  * can be directly applied on it.
2550  */
2551  if (!RestoreBlockImage(record, block_id, primary_image_masked))
2552  ereport(ERROR,
2553  (errcode(ERRCODE_INTERNAL_ERROR),
2554  errmsg_internal("%s", record->errormsg_buf)));
2555 
2556  /*
2557  * If masking function is defined, mask both the primary and replay
2558  * images
2559  */
2560  if (rmgr.rm_mask != NULL)
2561  {
2562  rmgr.rm_mask(replay_image_masked, blkno);
2563  rmgr.rm_mask(primary_image_masked, blkno);
2564  }
2565 
2566  /* Time to compare the primary and replay images. */
2567  if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2568  {
2569  elog(FATAL,
2570  "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2571  rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2572  forknum, blkno);
2573  }
2574  }
2575 }
2576 
2577 /*
2578  * For point-in-time recovery, this function decides whether we want to
2579  * stop applying the XLOG before the current record.
2580  *
2581  * Returns true if we are stopping, false otherwise. If stopping, some
2582  * information is saved in recoveryStopXid et al for use in annotating the
2583  * new timeline's history file.
      *
      * Whenever this function decides to stop, recoveryStopAfter is set to
      * false to record that the stop point lies before the record.
2584  */
2585 static bool
2587 {
2588  bool stopsHere = false;
2589  uint8 xact_info;
2590  bool isCommit;
2591  TimestampTz recordXtime = 0;
2592  TransactionId recordXid;
2593 
2594  /*
2595  * Ignore recovery target settings when not in archive recovery (meaning
2596  * we are in crash recovery).
2597  */
2599  return false;
2600 
2601  /* Check if we should stop as soon as reaching consistency */
2603  {
2604  ereport(LOG,
2605  (errmsg("recovery stopping after reaching consistency")));
2606 
2607  recoveryStopAfter = false;
2610  recoveryStopTime = 0;
2611  recoveryStopName[0] = '\0';
2612  return true;
2613  }
2614 
2615  /* Check if target LSN has been reached */
2618  record->ReadRecPtr >= recoveryTargetLSN)
2619  {
2620  recoveryStopAfter = false;
2622  recoveryStopLSN = record->ReadRecPtr;
2623  recoveryStopTime = 0;
2624  recoveryStopName[0] = '\0';
2625  ereport(LOG,
2626  (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
2628  return true;
2629  }
2630 
2631  /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2632  if (XLogRecGetRmid(record) != RM_XACT_ID)
2633  return false;
2634 
2635  xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2636 
       /*
        * Classify the record as commit or abort and find the XID it ends.
        * For the PREPARED variants the XID is taken from the parsed record
        * (twophase_xid) rather than from the record header.
        */
2637  if (xact_info == XLOG_XACT_COMMIT)
2638  {
2639  isCommit = true;
2640  recordXid = XLogRecGetXid(record);
2641  }
2642  else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2643  {
2644  xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2645  xl_xact_parsed_commit parsed;
2646 
2647  isCommit = true;
2649  xlrec,
2650  &parsed);
2651  recordXid = parsed.twophase_xid;
2652  }
2653  else if (xact_info == XLOG_XACT_ABORT)
2654  {
2655  isCommit = false;
2656  recordXid = XLogRecGetXid(record);
2657  }
2658  else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2659  {
2660  xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2661  xl_xact_parsed_abort parsed;
2662 
2663  isCommit = false;
2665  xlrec,
2666  &parsed);
2667  recordXid = parsed.twophase_xid;
2668  }
2669  else
2670  return false;
2671 
2673  {
2674  /*
2675  * There can be only one transaction end record with this exact
2676  * transaction ID.
2677  *
2678  * When testing for an xid, we MUST test for equality only, since
2679  * transactions are numbered in the order they start, not the order
2680  * they complete. A higher-numbered xid will complete before a
2681  * lower-numbered one about 50% of the time...
2682  */
2683  stopsHere = (recordXid == recoveryTargetXid);
2684  }
2685 
2686  /*
2687  * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2688  * We don't expect getRecordTimestamp ever to fail, since we already know
2689  * this is a commit or abort record; but test its result anyway.
2690  */
2691  if (getRecordTimestamp(record, &recordXtime) &&
2693  {
2694  /*
2695  * There can be many transactions that share the same commit time, so
2696  * we stop after the last one, if we are inclusive, or stop at the
2697  * first one if we are exclusive
2698  */
2700  stopsHere = (recordXtime > recoveryTargetTime);
2701  else
2702  stopsHere = (recordXtime >= recoveryTargetTime);
2703  }
2704 
2705  if (stopsHere)
2706  {
2707  recoveryStopAfter = false;
2708  recoveryStopXid = recordXid;
2709  recoveryStopTime = recordXtime;
2711  recoveryStopName[0] = '\0';
2712 
2713  if (isCommit)
2714  {
2715  ereport(LOG,
2716  (errmsg("recovery stopping before commit of transaction %u, time %s",
2719  }
2720  else
2721  {
2722  ereport(LOG,
2723  (errmsg("recovery stopping before abort of transaction %u, time %s",
2726  }
2727  }
2728 
2729  return stopsHere;
2730 }
2731 
2732 /*
2733  * Same as recoveryStopsBefore, but called after applying the record.
2734  *
2735  * We also track the timestamp of the latest applied COMMIT/ABORT
2736  * record in XLogRecoveryCtl->recoveryLastXTime.
      *
      * Whenever this function decides to stop, recoveryStopAfter is set to
      * true to record that the stop point lies after the record.
2737  */
2738 static bool
2740 {
2741  uint8 info;
2742  uint8 xact_info;
2743  uint8 rmid;
2744  TimestampTz recordXtime = 0;
2745 
2746  /*
2747  * Ignore recovery target settings when not in archive recovery (meaning
2748  * we are in crash recovery).
2749  */
2751  return false;
2752 
2753  info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2754  rmid = XLogRecGetRmid(record);
2755 
2756  /*
2757  * There can be many restore points that share the same name; we stop at
2758  * the first one.
2759  */
2761  rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2762  {
2763  xl_restore_point *recordRestorePointData;
2764 
2765  recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2766 
2767  if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2768  {
2769  recoveryStopAfter = true;
2772  (void) getRecordTimestamp(record, &recoveryStopTime);
2773  strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2774 
2775  ereport(LOG,
2776  (errmsg("recovery stopping at restore point \"%s\", time %s",
2779  return true;
2780  }
2781  }
2782 
2783  /* Check if the target LSN has been reached */
2786  record->ReadRecPtr >= recoveryTargetLSN)
2787  {
2788  recoveryStopAfter = true;
2790  recoveryStopLSN = record->ReadRecPtr;
2791  recoveryStopTime = 0;
2792  recoveryStopName[0] = '\0';
2793  ereport(LOG,
2794  (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
2796  return true;
2797  }
2798 
2799  if (rmid != RM_XACT_ID)
2800  return false;
2801 
2802  xact_info = info & XLOG_XACT_OPMASK;
2803 
2804  if (xact_info == XLOG_XACT_COMMIT ||
2805  xact_info == XLOG_XACT_COMMIT_PREPARED ||
2806  xact_info == XLOG_XACT_ABORT ||
2807  xact_info == XLOG_XACT_ABORT_PREPARED)
2808  {
2809  TransactionId recordXid;
2810 
2811  /* Update the last applied transaction timestamp */
2812  if (getRecordTimestamp(record, &recordXtime))
2813  SetLatestXTime(recordXtime);
2814 
2815  /* Extract the XID of the committed/aborted transaction */
2816  if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2817  {
2818  xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2819  xl_xact_parsed_commit parsed;
2820 
2822  xlrec,
2823  &parsed);
2824  recordXid = parsed.twophase_xid;
2825  }
2826  else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2827  {
2828  xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2829  xl_xact_parsed_abort parsed;
2830 
2832  xlrec,
2833  &parsed);
2834  recordXid = parsed.twophase_xid;
2835  }
2836  else
2837  recordXid = XLogRecGetXid(record);
2838 
2839  /*
2840  * There can be only one transaction end record with this exact
2841  * transaction ID.
2842  *
2843  * When testing for an xid, we MUST test for equality only, since
2844  * transactions are numbered in the order they start, not the order
2845  * they complete. A higher-numbered xid will complete before a
2846  * lower-numbered one about 50% of the time...
2847  */
2849  recordXid == recoveryTargetXid)
2850  {
2851  recoveryStopAfter = true;
2852  recoveryStopXid = recordXid;
2853  recoveryStopTime = recordXtime;
2855  recoveryStopName[0] = '\0';
2856 
2857  if (xact_info == XLOG_XACT_COMMIT ||
2858  xact_info == XLOG_XACT_COMMIT_PREPARED)
2859  {
2860  ereport(LOG,
2861  (errmsg("recovery stopping after commit of transaction %u, time %s",
2864  }
2865  else if (xact_info == XLOG_XACT_ABORT ||
2866  xact_info == XLOG_XACT_ABORT_PREPARED)
2867  {
2868  ereport(LOG,
2869  (errmsg("recovery stopping after abort of transaction %u, time %s",
2872  }
2873  return true;
2874  }
2875  }
2876 
2877  /* Check if we should stop as soon as reaching consistency */
2879  {
2880  ereport(LOG,
2881  (errmsg("recovery stopping after reaching consistency")));
2882 
2883  recoveryStopAfter = true;
2885  recoveryStopTime = 0;
2887  recoveryStopName[0] = '\0';
2888  return true;
2889  }
2890 
2891  return false;
2892 }
2893 
2894 /*
2895  * Create a comment for the history file to explain why and where
2896  * timeline changed.
      *
      * The text is formatted into a local buffer and a pstrdup'd copy of it is
      * returned.
      *
      * NOTE(review): the time and LSN variants end in a newline while the
      * other variants do not; confirm whether consumers rely on that.
2897  */
2898 static char *
2900 {
2901  char reason[200];
2902 
2904  snprintf(reason, sizeof(reason),
2905  "%s transaction %u",
2906  recoveryStopAfter ? "after" : "before",
2907  recoveryStopXid);
2909  snprintf(reason, sizeof(reason),
2910  "%s %s\n",
2911  recoveryStopAfter ? "after" : "before",
2913  else if (recoveryTarget == RECOVERY_TARGET_LSN)
2914  snprintf(reason, sizeof(reason),
2915  "%s LSN %X/%X\n",
2916  recoveryStopAfter ? "after" : "before",
2919  snprintf(reason, sizeof(reason),
2920  "at restore point \"%s\"",
2923  snprintf(reason, sizeof(reason), "reached consistency");
2924  else
2925  snprintf(reason, sizeof(reason), "no recovery target specified");
2926 
2927  return pstrdup(reason);
2928 }
2929 
2930 /*
2931  * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2932  *
2933  * endOfRecovery is true if the recovery target is reached and
2934  * the paused state starts at the end of recovery because of
2935  * recovery_target_action=pause, and false otherwise.
      *
      * Returns immediately, without pausing, if hot standby is not active.
      * The wait also ends early if CheckForStandbyTrigger() reports true.
      * endOfRecovery only selects which LOG message and hint are emitted.
2936  */
2937 static void
2938 recoveryPausesHere(bool endOfRecovery)
2939 {
2940  /* Don't pause unless users can connect! */
2941  if (!LocalHotStandbyActive)
2942  return;
2943 
2944  /* Don't pause after standby promotion has been triggered */
2946  return;
2947 
2948  if (endOfRecovery)
2949  ereport(LOG,
2950  (errmsg("pausing at the end of recovery"),
2951  errhint("Execute pg_wal_replay_resume() to promote.")));
2952  else
2953  ereport(LOG,
2954  (errmsg("recovery has paused"),
2955  errhint("Execute pg_wal_replay_resume() to continue.")));
2956 
2957  /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2959  {
2961  if (CheckForStandbyTrigger())
2962  return;
2963 
2964  /*
2965  * If recovery pause is requested then set it paused. While we are in
2966  * the loop, user might resume and pause again so set this every time.
2967  */
2969 
2970  /*
2971  * We wait on a condition variable that will wake us as soon as the
2972  * pause ends, but we use a timeout so we can check the above exit
2973  * condition periodically too.
2974  */
2976  WAIT_EVENT_RECOVERY_PAUSE);
2977  }
2979 }
2980 
2981 /*
2982  * When recovery_min_apply_delay is set, we wait long enough to make sure
2983  * certain record types are applied at least that interval behind the primary.
2984  *
2985  * Returns true if we waited.
      * More precisely, true is returned whenever the wait loop below was
      * entered, even if it was left again on its first iteration; false is
      * returned by every early exit before the loop.
2986  *
2987  * Note that the delay is calculated between the WAL record log time and
2988  * the current time on standby. We would prefer to keep track of when this
2989  * standby received each WAL record, which would allow a more consistent
2990  * approach and one not affected by time synchronisation issues, but that
2991  * is significantly more effort and complexity for little actual gain in
2992  * usability.
2993  */
2994 static bool
2996 {
2997  uint8 xact_info;
2998  TimestampTz xtime;
2999  TimestampTz delayUntil;
3000  long msecs;
3001 
3002  /* nothing to do if no delay configured */
3003  if (recovery_min_apply_delay <= 0)
3004  return false;
3005 
3006  /* no delay is applied on a database not yet consistent */
3007  if (!reachedConsistency)
3008  return false;
3009 
3010  /* nothing to do if crash recovery is requested */
3012  return false;
3013 
3014  /*
3015  * Is it a COMMIT record?
3016  *
3017  * We deliberately choose not to delay aborts since they have no effect on
3018  * MVCC. We already allow replay of records that don't have a timestamp,
3019  * so there is already opportunity for issues caused by early conflicts on
3020  * standbys.
3021  */
3022  if (XLogRecGetRmid(record) != RM_XACT_ID)
3023  return false;
3024 
3025  xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
3026 
3027  if (xact_info != XLOG_XACT_COMMIT &&
3028  xact_info != XLOG_XACT_COMMIT_PREPARED)
3029  return false;
3030 
3031  if (!getRecordTimestamp(record, &xtime))
3032  return false;
3033 
3035 
3036  /*
3037  * Exit without arming the latch if it's already past time to apply this
3038  * record
3039  */
3041  if (msecs <= 0)
3042  return false;
3043 
3044  while (true)
3045  {
3047 
3048  /* This might change recovery_min_apply_delay. */
3050 
       /* Stop delaying once CheckForStandbyTrigger() reports true. */
3051  if (CheckForStandbyTrigger())
3052  break;
3053 
3054  /*
3055  * Recalculate delayUntil as recovery_min_apply_delay could have
3056  * changed while waiting in this loop.
3057  */
3059 
3060  /*
3061  * Wait for difference between GetCurrentTimestamp() and delayUntil.
3062  */
3064  delayUntil);
3065 
3066  if (msecs <= 0)
3067  break;
3068 
3069  elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3070 
3073  msecs,
3074  WAIT_EVENT_RECOVERY_APPLY_DELAY);
3075  }
3076  return true;
3077 }
3078 
3079 /*
3080  * Get the current state of the recovery pause request.
3081  */
3084 {
3086 
3090 
3091  return state;
3092 }
3093 
3094 /*
3095  * Set the recovery pause state.
3096  *
3097  * If recovery pause is requested then sets the recovery pause state to
3098  * 'pause requested' if it is not already 'paused'. Otherwise, sets it
3099  * to 'not paused' to resume the recovery. The recovery pause will be
3100  * confirmed by ConfirmRecoveryPaused().
3101  */
3102 void
3103 SetRecoveryPause(bool recoveryPause)
3104 {
3106 
3107  if (!recoveryPause)
3111 
3113 
3114  if (!recoveryPause)
3116 }
3117 
3118 /*
3119  * Confirm the recovery pause by setting the recovery pause state to
3120  * RECOVERY_PAUSED.
3121  */
3122 static void
3124 {
3125  /* If recovery pause is requested then set it paused */
3130 }
3131 
3132 
3133 /*
3134  * Attempt to read the next XLOG record.
3135  *
3136  * Before first call, the reader needs to be positioned to the first record
3137  * by calling XLogPrefetcherBeginRead().
3138  *
3139  * If no valid record is available, returns NULL, or fails if emode is PANIC.
3140  * (emode must be either PANIC or LOG). In standby mode, retries until a valid
3141  * record is available.
      *
      * emode, fetching_ckpt and replayTLI are stored in the reader's private
      * data so that XLogPageRead() can see them.
3142  */
3143 static XLogRecord *
3145  bool fetching_ckpt, TimeLineID replayTLI)
3146 {
3147  XLogRecord *record;
3150 
3151  /* Pass through parameters to XLogPageRead */
3152  private->fetching_ckpt = fetching_ckpt;
3153  private->emode = emode;
3154  private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
3155  private->replayTLI = replayTLI;
3156 
3157  /* This is the first attempt to read this page. */
3158  lastSourceFailed = false;
3159 
3160  for (;;)
3161  {
3162  char *errormsg;
3163 
3164  record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3165  if (record == NULL)
3166  {
3167  /*
3168  * When we find that WAL ends in an incomplete record, keep track
3169  * of that record. After recovery is done, we'll write a record
3170  * to indicate to downstream WAL readers that that portion is to
3171  * be ignored.
3172  *
3173  * However, when ArchiveRecoveryRequested = true, we're going to
3174  * switch to a new timeline at the end of recovery. We will only
3175  * copy WAL over to the new timeline up to the end of the last
3176  * complete record, so if we did this, we would later create an
3177  * overwrite contrecord in the wrong place, breaking everything.
3178  */
3179  if (!ArchiveRecoveryRequested &&
3181  {
3184  }
3185 
3186  if (readFile >= 0)
3187  {
3188  close(readFile);
3189  readFile = -1;
3190  }
3191 
3192  /*
3193  * We only end up here without a message when XLogPageRead()
3194  * failed - in that case we already logged something. In
3195  * StandbyMode that only happens if we have been triggered, so we
3196  * shouldn't loop anymore in that case.
3197  */
3198  if (errormsg)
3200  (errmsg_internal("%s", errormsg) /* already translated */ ));
3201  }
3202 
3203  /*
3204  * Check page TLI is one of the expected values.
3205  */
3207  {
3208  char fname[MAXFNAMELEN];
3209  XLogSegNo segno;
3210  int32 offset;
3211 
3215  XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3218  (errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%X, offset %u",
3220  fname,
3222  offset)));
       /* Treat the record as unavailable from this source. */
3223  record = NULL;
3224  }
3225 
3226  if (record)
3227  {
3228  /* Great, got a record */
3229  return record;
3230  }
3231  else
3232  {
3233  /* No valid record available from this source */
3234  lastSourceFailed = true;
3235 
3236  /*
3237  * If archive recovery was requested, but we were still doing
3238  * crash recovery, switch to archive recovery and retry using the
3239  * offline archive. We have now replayed all the valid WAL in
3240  * pg_wal, so we are presumably now consistent.
3241  *
3242  * We require that there's at least some valid WAL present in
3243  * pg_wal, however (!fetching_ckpt). We could recover using the
3244  * WAL from the archive, even if pg_wal is completely empty, but
3245  * we'd have no idea how far we'd have to replay to reach
3246  * consistency. So err on the safe side and give up.
3247  */
3249  !fetching_ckpt)
3250  {
3251  ereport(DEBUG1,
3252  (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3253  InArchiveRecovery = true;
3256 
3259  minRecoveryPointTLI = replayTLI;
3260 
3262 
3263  /*
3264  * Before we retry, reset lastSourceFailed and currentSource
3265  * so that we will check the archive next.
3266  */
3267  lastSourceFailed = false;
3269 
3270  continue;
3271  }
3272 
3273  /* In standby mode, loop back to retry. Otherwise, give up. */
3275  continue;
3276  else
3277  return NULL;
3278  }
3279  }
3280 }
3281 
3282 /*
3283  * Read the XLOG page containing targetPagePtr into readBuf (if not read
3284  * already). Returns number of bytes read, if the page is read successfully,
3285  * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed,
3286  * but only if they have not been previously reported.
3287  *
3288  * See XLogReaderRoutine.page_read for more details.
3289  *
3290  * While prefetching, xlogreader->nonblocking may be set. In that case,
3291  * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3292  *
3293  * This is responsible for restoring files from archive as needed, as well
3294  * as for waiting for the requested WAL record to arrive in standby mode.
3295  *
3296  * xlogreader->private_data->emode specifies the log level used for reporting
3297  * "file not found" or "end of WAL" situations in archive recovery, or in
3298  * standby mode when promotion is triggered. If set to WARNING or below,
3299  * XLogPageRead() returns XLREAD_FAIL in those situations; at higher log
3300  * levels the ereport() won't return.
3301  *
3302  * In standby mode, if after a successful return of XLogPageRead() the
3303  * caller finds the record it's interested in to be broken, it should
3304  * ereport the error with the level determined by
3305  * emode_for_corrupt_record(), and then set lastSourceFailed
3306  * and call XLogPageRead() again with the same arguments. This lets
3307  * XLogPageRead() try fetching the record from another source, or
3308  * sleep and retry.
3309  */
3310 static int
3312  XLogRecPtr targetRecPtr, char *readBuf)
3313 {
3314  XLogPageReadPrivate *private =
3316  int emode = private->emode;
3317  uint32 targetPageOff;
3318  XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
3319  int r;
3320 
3321  XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3322  targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3323 
3324  /*
3325  * See if we need to switch to a new segment because the requested record
3326  * is not in the currently open one.
3327  */
3328  if (readFile >= 0 &&
3329  !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3330  {
3331  /*
3332  * Request a restartpoint if we've replayed too much xlog since the
3333  * last one.
3334  */
3336  {
3338  {
3339  (void) GetRedoRecPtr();
3342  }
3343  }
3344 
3345  close(readFile);
3346  readFile = -1;
3348  }
3349 
3350  XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3351 
3352 retry:
3353  /* See if we need to retrieve more data */
3354  if (readFile < 0 ||
3356  flushedUpto < targetPagePtr + reqLen))
3357  {
3358  if (readFile >= 0 &&
3361  flushedUpto < targetPagePtr + reqLen)
3362  return XLREAD_WOULDBLOCK;
3363 
3364  switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3365  private->randAccess,
3366  private->fetching_ckpt,
3367  targetRecPtr,
3368  private->replayTLI,
3371  {
3372  case XLREAD_WOULDBLOCK:
3373  return XLREAD_WOULDBLOCK;
3374  case XLREAD_FAIL:
3375  if (readFile >= 0)
3376  close(readFile);
3377  readFile = -1;
3378  readLen = 0;
3380  return XLREAD_FAIL;
3381  case XLREAD_SUCCESS:
3382  break;
3383  }
3384  }
3385 
3386  /*
3387  * At this point, we have the right segment open and if we're streaming we
3388  * know the requested record is in it.
3389  */
3390  Assert(readFile != -1);
3391 
3392  /*
3393  * If the current segment is being streamed from the primary, calculate
3394  * how much of the current page we have received already. We know the
3395  * requested record has been received, but this is for the benefit of
3396  * future calls, to allow quick exit at the top of this function.
3397  */
3399  {
3400  if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3401  readLen = XLOG_BLCKSZ;
3402  else
3404  targetPageOff;
3405  }
3406  else
3407  readLen = XLOG_BLCKSZ;
3408 
3409  /* Read the requested page */
3410  readOff = targetPageOff;
3411 
3412  pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3413  r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
3414  if (r != XLOG_BLCKSZ)
3415  {
3416  char fname[MAXFNAMELEN];
3417  int save_errno = errno;
3418 
3421  if (r < 0)
3422  {
3423  errno = save_errno;
3424  ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3426  errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: %m",
3427  fname, LSN_FORMAT_ARGS(targetPagePtr),
3428  readOff)));
3429  }
3430  else
3431  ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3433  errmsg("could not read from WAL segment %s, LSN %X/%X, offset %u: read %d of %zu",
3434  fname, LSN_FORMAT_ARGS(targetPagePtr),
3435  readOff, r, (Size) XLOG_BLCKSZ)));
3436  goto next_record_is_invalid;
3437  }
3439 
3440  Assert(targetSegNo == readSegNo);
3441  Assert(targetPageOff == readOff);
3442  Assert(reqLen <= readLen);
3443 
3445 
3446  /*
3447  * Check the page header immediately, so that we can retry immediately if
3448  * it's not valid. This may seem unnecessary, because ReadPageInternal()
3449  * validates the page header anyway, and would propagate the failure up to
3450  * ReadRecord(), which would retry. However, there's a corner case with
3451  * continuation records, if a record is split across two pages such that
3452  * we would need to read the two pages from different sources. For
3453  * example, imagine a scenario where a streaming replica is started up,
3454  * and replay reaches a record that's split across two WAL segments. The
3455  * first page is only available locally, in pg_wal, because it's already
3456  * been recycled on the primary. The second page, however, is not present
3457  * in pg_wal, and we should stream it from the primary. There is a
3458  * recycled WAL segment present in pg_wal, with garbage contents, however.
3459  * We would read the first page from the local WAL segment, but when
3460  * reading the second page, we would read the bogus, recycled, WAL
3461  * segment. If we didn't catch that case here, we would never recover,
3462  * because ReadRecord() would retry reading the whole record from the
3463  * beginning.
3464  *
3465  * Of course, this only catches errors in the page header, which is what
3466  * happens in the case of a recycled WAL segment. Other kinds of errors or
3467  * corruption still have the same problem. But this at least fixes the
3468  * common case, which can happen as part of normal operation.
3469  *
3470  * Validating the page header is cheap enough that doing it twice
3471  * shouldn't be a big deal from a performance point of view.
3472  *
3473  * When not in standby mode, an invalid page header should cause recovery
3474  * to end, not retry reading the page, so we don't need to validate the
3475  * page header here for the retry. Instead, ReadPageInternal() is
3476  * responsible for the validation.
3477  */
3478  if (StandbyMode &&
3479  !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3480  {
3481  /*
3482  * Emit this error right now then retry this page immediately. Use
3483  * errmsg_internal() because the message was already translated.
3484  */
3485  if (xlogreader->errormsg_buf[0])
3488 
3489  /* reset any error XLogReaderValidatePageHeader() might have set */
3491  goto next_record_is_invalid;
3492  }
3493 
3494  return readLen;
3495 
3496 next_record_is_invalid:
3497 
3498  /*
3499  * If we're reading ahead, give up fast. Retries and error reporting will
3500  * be handled by a later read when recovery catches up to this point.
3501  */
3502  if (xlogreader->nonblocking)
3503  return XLREAD_WOULDBLOCK;
3504 
3505  lastSourceFailed = true;
3506 
3507  if (readFile >= 0)
3508  close(readFile);
3509  readFile = -1;
3510  readLen = 0;
3512 
3513  /* In standby-mode, keep trying */
3514  if (StandbyMode)
3515  goto retry;
3516  else
3517  return XLREAD_FAIL;
3518 }
3519 
3520 /*
3521  * Open the WAL segment containing WAL location 'RecPtr'.
3522  *
3523  * The segment can be fetched via restore_command, or via walreceiver having
3524  * streamed the record, or it can already be present in pg_wal. Checking
3525  * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3526  * too, in case someone copies a new segment directly to pg_wal. That is not
3527  * documented or recommended, though.
3528  *
3529  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3530  * prepare to read WAL starting from RedoStartLSN after this.
3531  *
3532  * 'RecPtr' might not point to the beginning of the record we're interested
3533  * in, it might also point to the page or segment header. In that case,
3534  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3535  * used to decide which timeline to stream the requested WAL from.
3536  *
3537  * 'replayLSN' is the current replay LSN, so that if we scan for new
3538  * timelines, we can reject a switch to a timeline that branched off before
3539  * this point.
3540  *
3541  * If the record is not immediately available, the function returns
3542  * XLREAD_FAIL if we're not in standby mode. In standby mode, it waits for
3543  * the record to become available.
3544  *
3545  * When the requested record becomes available, the function opens the file
3546  * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3547  * of standby mode is triggered by the user, and there is no more WAL
3548  * available, returns XLREAD_FAIL.
3549  *
3550  * If nonblocking is true, then give up immediately if we can't satisfy the
3551  * request, returning XLREAD_WOULDBLOCK instead of waiting.
3552  */
3553 static XLogPageReadResult
3554 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
3555  bool fetching_ckpt, XLogRecPtr tliRecPtr,
3556  TimeLineID replayTLI, XLogRecPtr replayLSN,
3557  bool nonblocking)
3558 {
3559  static TimestampTz last_fail_time = 0;
3560  TimestampTz now;
3561  bool streaming_reply_sent = false;
3562 
3563  /*-------
3564  * Standby mode is implemented by a state machine:
3565  *
3566  * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3567  * pg_wal (XLOG_FROM_PG_WAL)
3568  * 2. Check for promotion trigger request
3569  * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3570  * 4. Rescan timelines
3571  * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3572  *
3573  * Failure to read from the current source advances the state machine to
3574  * the next state.
3575  *
3576  * 'currentSource' indicates the current state. There are no currentSource
3577  * values for "check trigger", "rescan timelines", and "sleep" states,
3578  * those actions are taken when reading from the previous source fails, as
3579  * part of advancing to the next state.
3580  *
3581  * If standby mode is turned off while reading WAL from stream, we move
3582  * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3583  * the files (which would be required at end of recovery, e.g., timeline
3584  * history file) from archive or pg_wal. We don't need to kill WAL receiver
3585  * here because it's already stopped when standby mode is turned off at
3586  * the end of recovery.
3587  *-------
3588  */
3589  if (!InArchiveRecovery)
3591  else if (currentSource == XLOG_FROM_ANY ||
3593  {
3594  lastSourceFailed = false;
3596  }
3597 
3598  for (;;)
3599  {
3600  XLogSource oldSource = currentSource;
3601  bool startWalReceiver = false;
3602 
3603  /*
3604  * First check if we failed to read from the current source, and
3605  * advance the state machine if so. The failure to read might've
3606  * happened outside this function, e.g when a CRC check fails on a
3607  * record, or within this loop.
3608  */
3609  if (lastSourceFailed)
3610  {
3611  /*
3612  * Don't allow any retry loops to occur during nonblocking
3613  * readahead. Let the caller process everything that has been
3614  * decoded already first.
3615  */
3616  if (nonblocking)
3617  return XLREAD_WOULDBLOCK;
3618 
3619  switch (currentSource)
3620  {
3621  case XLOG_FROM_ARCHIVE:
3622  case XLOG_FROM_PG_WAL:
3623 
3624  /*
3625  * Check to see if promotion is requested. Note that we do
3626  * this only after failure, so when you promote, we still
3627  * finish replaying as much as we can from archive and
3628  * pg_wal before failover.
3629  */
3631  {
3633  return XLREAD_FAIL;
3634  }
3635 
3636  /*
3637  * Not in standby mode, and we've now tried the archive
3638  * and pg_wal.
3639  */
3640  if (!StandbyMode)
3641  return XLREAD_FAIL;
3642 
3643  /*
3644  * Move to XLOG_FROM_STREAM state, and set to start a
3645  * walreceiver if necessary.
3646  */
3648  startWalReceiver = true;
3649  break;
3650 
3651  case XLOG_FROM_STREAM:
3652 
3653  /*
3654  * Failure while streaming. Most likely, we got here
3655  * because streaming replication was terminated, or
3656  * promotion was triggered. But we also get here if we
3657  * find an invalid record in the WAL streamed from the
3658  * primary, in which case something is seriously wrong.
3659  * There's little chance that the problem will just go
3660  * away, but PANIC is not good for availability either,
3661  * especially in hot standby mode. So, we treat that the
3662  * same as disconnection, and retry from archive/pg_wal
3663  * again. The WAL in the archive should be identical to
3664  * what was streamed, so it's unlikely that it helps, but
3665  * one can hope...
3666  */
3667 
3668  /*
3669  * We should be able to move to XLOG_FROM_STREAM only in
3670  * standby mode.
3671  */
3673 
3674  /*
3675  * Before we leave XLOG_FROM_STREAM state, make sure that
3676  * walreceiver is not active, so that it won't overwrite
3677  * WAL that we restore from archive.
3678  */
3680 
3681  /*
3682  * Before we sleep, re-scan for possible new timelines if
3683  * we were requested to recover to the latest timeline.
3684  */
3686  {
3687  if (rescanLatestTimeLine(replayTLI, replayLSN))
3688  {
3690  break;
3691  }
3692  }
3693 
3694  /*
3695  * XLOG_FROM_STREAM is the last state in our state
3696  * machine, so we've exhausted all the options for
3697  * obtaining the requested WAL. We're going to loop back
3698  * and retry from the archive, but if it hasn't been long
3699  * since last attempt, sleep wal_retrieve_retry_interval
3700  * milliseconds to avoid busy-waiting.
3701  */
3703  if (!TimestampDifferenceExceeds(last_fail_time, now,
3705  {
3706  long wait_time;
3707 
3708  wait_time = wal_retrieve_retry_interval -
3709  TimestampDifferenceMilliseconds(last_fail_time, now);
3710 
3711  elog(LOG, "waiting for WAL to become available at %X/%X",
3712  LSN_FORMAT_ARGS(RecPtr));
3713 
3714  /* Do background tasks that might benefit us later. */
3716 
3720  wait_time,
3721  WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
3724 
3725  /* Handle interrupt signals of startup process */
3727  }
3728  last_fail_time = now;
3730  break;
3731 
3732  default:
3733  elog(ERROR, "unexpected WAL source %d", currentSource);
3734  }
3735  }
3736  else if (currentSource == XLOG_FROM_PG_WAL)
3737  {
3738  /*
3739  * We just successfully read a file in pg_wal. We prefer files in
3740  * the archive over ones in pg_wal, so try the next file again
3741  * from the archive first.
3742  */
3743  if (InArchiveRecovery)
3745  }
3746 
3747  if (currentSource != oldSource)
3748  elog(DEBUG2, "switched WAL source from %s to %s after %s",
3750  lastSourceFailed ? "failure" : "success");
3751 
3752  /*
3753  * We've now handled possible failure. Try to read from the chosen
3754  * source.
3755  */
3756  lastSourceFailed = false;
3757 
3758  switch (currentSource)
3759  {
3760  case XLOG_FROM_ARCHIVE:
3761  case XLOG_FROM_PG_WAL:
3762 
3763  /*
3764  * WAL receiver must not be running when reading WAL from
3765  * archive or pg_wal.
3766  */
3767  Assert(!WalRcvStreaming());
3768 
3769  /* Close any old file we might have open. */
3770  if (readFile >= 0)
3771  {
3772  close(readFile);
3773  readFile = -1;
3774  }
3775  /* Reset curFileTLI if random fetch. */
3776  if (randAccess)
3777  curFileTLI = 0;
3778 
3779  /*
3780  * Try to restore the file from archive, or read an existing
3781  * file from pg_wal.
3782  */
3785  currentSource);
3786  if (readFile >= 0)
3787  return XLREAD_SUCCESS; /* success! */
3788 
3789  /*
3790  * Nope, not found in archive or pg_wal.
3791  */
3792  lastSourceFailed = true;
3793  break;
3794 
3795  case XLOG_FROM_STREAM:
3796  {
3797  bool havedata;
3798 
3799  /*
3800  * We should be able to move to XLOG_FROM_STREAM only in
3801  * standby mode.
3802  */
3804 
3805  /*
3806  * First, shutdown walreceiver if its restart has been
3807  * requested -- but no point if we're already slated for
3808  * starting it.
3809  */
3810  if (pendingWalRcvRestart && !startWalReceiver)
3811  {
3813 
3814  /*
3815  * Re-scan for possible new timelines if we were
3816  * requested to recover to the latest timeline.
3817  */
3820  rescanLatestTimeLine(replayTLI, replayLSN);
3821 
3822  startWalReceiver = true;
3823  }
3824  pendingWalRcvRestart = false;
3825 
3826  /*
3827  * Launch walreceiver if needed.
3828  *
3829  * If fetching_ckpt is true, RecPtr points to the initial
3830  * checkpoint location. In that case, we use RedoStartLSN
3831  * as the streaming start position instead of RecPtr, so
3832  * that when we later jump backwards to start redo at
3833  * RedoStartLSN, we will have the logs streamed already.
3834  */
3835  if (startWalReceiver &&
3836  PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
3837  {
3838  XLogRecPtr ptr;
3839  TimeLineID tli;
3840 
3841  if (fetching_ckpt)
3842  {
3843  ptr = RedoStartLSN;
3844  tli = RedoStartTLI;
3845  }
3846  else
3847  {
3848  ptr = RecPtr;
3849 
3850  /*
3851  * Use the record begin position to determine the
3852  * TLI, rather than the position we're reading.
3853  */
3854  tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
3855 
3856  if (curFileTLI > 0 && tli < curFileTLI)
3857  elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3858  LSN_FORMAT_ARGS(tliRecPtr),
3859  tli, curFileTLI);
3860  }
3861  curFileTLI = tli;
3866  flushedUpto = 0;
3867  }
3868 
3869  /*
3870  * Check if WAL receiver is active or wait to start up.
3871  */
3872  if (!WalRcvStreaming())
3873  {
3874  lastSourceFailed = true;
3875  break;
3876  }
3877 
3878  /*
3879  * Walreceiver is active, so see if new data has arrived.
3880  *
3881  * We only advance XLogReceiptTime when we obtain fresh
3882  * WAL from walreceiver and observe that we had already
3883  * processed everything before the most recent "chunk"
3884  * that it flushed to disk. In steady state where we are
3885  * keeping up with the incoming data, XLogReceiptTime will
3886  * be updated on each cycle. When we are behind,
3887  * XLogReceiptTime will not advance, so the grace time
3888  * allotted to conflicting queries will decrease.
3889  */
3890  if (RecPtr < flushedUpto)
3891  havedata = true;
3892  else
3893  {
3894  XLogRecPtr latestChunkStart;
3895 
3896  flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3897  if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
3898  {
3899  havedata = true;
3900  if (latestChunkStart <= RecPtr)
3901  {
3904  }
3905  }
3906  else
3907  havedata = false;
3908  }
3909  if (havedata)
3910  {
3911  /*
3912  * Great, streamed far enough. Open the file if it's
3913  * not open already. Also read the timeline history
3914  * file if we haven't initialized timeline history
3915  * yet; it should be streamed over and present in
3916  * pg_wal by now. Use XLOG_FROM_STREAM so that source
3917  * info is set correctly and XLogReceiptTime isn't
3918  * changed.
3919  *
3920  * NB: We must set readTimeLineHistory based on
3921  * recoveryTargetTLI, not receiveTLI. Normally they'll
3922  * be the same, but if recovery_target_timeline is
3923  * 'latest' and archiving is configured, then it's
3924  * possible that we managed to retrieve one or more
3925  * new timeline history files from the archive,
3926  * updating recoveryTargetTLI.
3927  */
3928  if (readFile < 0)
3929  {
3930  if (!expectedTLEs)
3933  XLOG_FROM_STREAM, false);
3934  Assert(readFile >= 0);
3935  }
3936  else
3937  {
3938  /* just make sure source info is correct... */
3941  return XLREAD_SUCCESS;
3942  }
3943  break;
3944  }
3945 
3946  /* In nonblocking mode, return rather than sleeping. */
3947  if (nonblocking)
3948  return XLREAD_WOULDBLOCK;
3949 
3950  /*
3951  * Data not here yet. Check for trigger, then wait for
3952  * walreceiver to wake us up when new WAL arrives.
3953  */
3954  if (CheckForStandbyTrigger())
3955  {
3956  /*
3957  * Note that we don't return XLREAD_FAIL immediately
3958  * here. After being triggered, we still want to
3959  * replay all the WAL that was already streamed. It's
3960  * in pg_wal now, so we just treat this as a failure,
3961  * and the state machine will move on to replay the
3962  * streamed WAL from pg_wal, and then recheck the
3963  * trigger and exit replay.
3964  */
3965  lastSourceFailed = true;
3966  break;
3967  }
3968 
3969  /*
3970  * Since we have replayed everything we have received so
3971  * far and are about to start waiting for more WAL, let's
3972  * tell the upstream server our replay location now so
3973  * that pg_stat_replication doesn't show stale
3974  * information.
3975  */
3976  if (!streaming_reply_sent)
3977  {
3978  WalRcvForceReply();
3979  streaming_reply_sent = true;
3980  }
3981 
3982  /* Do any background tasks that might benefit us later. */
3984 
3985  /* Update pg_stat_recovery_prefetch before sleeping. */
3987 
3988  /*
3989  * Wait for more WAL to arrive, when we will be woken
3990  * immediately by the WAL receiver.
3991  */
3994  -1L,
3995  WAIT_EVENT_RECOVERY_WAL_STREAM);
3997  break;
3998  }
3999 
4000  default:
4001  elog(ERROR, "unexpected WAL source %d", currentSource);
4002  }
4003 
4004  /*
4005  * Check for recovery pause here so that we can confirm more quickly
4006  * that a requested pause has actually taken effect.
4007  */
4008  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
4010  recoveryPausesHere(false);
4011 
4012  /*
4013  * This possibly-long loop needs to handle interrupts of startup
4014  * process.
4015  */
4017  }
4018 
4019  return XLREAD_FAIL; /* not reached */
4020 }
4021 
4022 
4023 /*
4024  * Determine what log level should be used to report a corrupt WAL record
4025  * in the current WAL page, previously read by XLogPageRead().
4026  *
4027  * 'emode' is the error mode that would be used to report a file-not-found
4028  * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
4029  * we're retrying the exact same record that we've tried previously, only
4030  * complain the first time to keep the noise down. However, we only do so when
4031  * reading from pg_wal, because we don't expect any invalid records in archive
4032  * or in records streamed from the primary. Files in the archive should be complete,
4033  * and we should never hit the end of WAL because we stop and wait for more WAL
4034  * to arrive before replaying it.
 *
 * Concretely, the level is lowered to DEBUG1 only when readSource is
 * XLOG_FROM_PG_WAL, 'emode' is LOG, and RecPtr equals the value remembered
 * from the previous call that met the first two conditions. In every other
 * case 'emode' is returned unchanged.
4035  *
4036  * NOTE: This function remembers the RecPtr value it was last called with,
4037  * to suppress repeated messages about the same record. Only call this when
4038  * you are about to ereport(), or you might cause a later message to be
4039  * erroneously suppressed.
4040  */
4041 static int
4043 {
 /* Persists across calls; updated only in the pg_wal + LOG case below. */
4044  static XLogRecPtr lastComplaint = 0;
4045 
4046  if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
4047  {
 /* Same record as last time: demote the repeat complaint. */
4048  if (RecPtr == lastComplaint)
4049  emode = DEBUG1;
4050  else
4051  lastComplaint = RecPtr;
4052  }
4053  return emode;
4054 }
4055 
4056 
4057 /*
4058  * Subroutine to try to fetch and validate a prior checkpoint record.
 *
 * Returns the record on success. Returns NULL, after reporting the reason
 * at LOG level, when RecPtr fails the XRecOffIsValid() check, when
 * ReadRecord() yields no record, when the record's resource manager ID is
 * not RM_XLOG_ID, when its info bits are neither XLOG_CHECKPOINT_SHUTDOWN
 * nor XLOG_CHECKPOINT_ONLINE, or when the record length check fails.
4059  */
4060 static XLogRecord *
4062  TimeLineID replayTLI)
4063 {
4064  XLogRecord *record;
4065  uint8 info;
4066 
4067  Assert(xlogreader != NULL);
4068 
 /* Reject a location that fails the XRecOffIsValid() check. */
4069  if (!XRecOffIsValid(RecPtr))
4070  {
4071  ereport(LOG,
4072  (errmsg("invalid checkpoint location")));
4073  return NULL;
4074  }
4075 
 /* NOTE(review): the 'true' argument is presumably fetching_ckpt -- confirm against ReadRecord(). */
4077  record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4078 
4079  if (record == NULL)
4080  {
4081  ereport(LOG,
4082  (errmsg("invalid checkpoint record")));
4083  return NULL;
4084  }
 /* A checkpoint record must belong to the XLOG resource manager. */
4085  if (record->xl_rmid != RM_XLOG_ID)
4086  {
4087  ereport(LOG,
4088  (errmsg("invalid resource manager ID in checkpoint record")));
4089  return NULL;
4090  }
 /* Clear the bits covered by XLR_INFO_MASK before comparing the info code. */
4091  info = record->xl_info & ~XLR_INFO_MASK;
4092  if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4093  info != XLOG_CHECKPOINT_ONLINE)
4094  {
4095  ereport(LOG,
4096  (errmsg("invalid xl_info in checkpoint record")));
4097  return NULL;
4098  }
4100  {
4101  ereport(LOG,
4102  (errmsg("invalid length of checkpoint record")));
4103  return NULL;
4104  }
4105  return record;
4106 }
4107 
4108 /*
4109  * Scan for new timelines that might have appeared in the archive since we
4110  * started recovery.
4111  *
4112  * If there are any, the function changes recovery target TLI to the latest
4113  * one and returns 'true'.
 *
 * Returns 'false' without switching the target when no newer timeline is
 * found, when the current recovery target timeline does not appear in the
 * new timeline's history, or when the history entry for the current target
 * ends before 'replayLSN'. The latter two cases are reported at LOG level;
 * the message text prints 'replayTLI' as the current timeline.
4114  */
4115 static bool
4117 {
4118  List *newExpectedTLEs;
4119  bool found;
4120  ListCell *cell;
4121  TimeLineID newtarget;
 /* Saved before recoveryTargetTLI is overwritten below. */
4122  TimeLineID oldtarget = recoveryTargetTLI;
4123  TimeLineHistoryEntry *currentTle = NULL;
4124 
4126  if (newtarget == recoveryTargetTLI)
4127  {
4128  /* No new timelines found */
4129  return false;
4130  }
4131 
4132  /*
4133  * Determine the list of expected TLIs for the new TLI
4134  */
4135 
4136  newExpectedTLEs = readTimeLineHistory(newtarget);
4137 
4138  /*
4139  * If the current timeline is not part of the history of the new timeline,
4140  * we cannot proceed to it.
 *
 * On a match the loop exits with currentTle pointing at the entry for
 * recoveryTargetTLI; that entry is used for the fork-point check below.
4141  */
4142  found = false;
4143  foreach(cell, newExpectedTLEs)
4144  {
4145  currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4146 
4147  if (currentTle->tli == recoveryTargetTLI)
4148  {
4149  found = true;
4150  break;
4151  }
4152  }
4153  if (!found)
4154  {
4155  ereport(LOG,
4156  (errmsg("new timeline %u is not a child of database system timeline %u",
4157  newtarget,
4158  replayTLI)));
4159  return false;
4160  }
4161 
4162  /*
4163  * The current timeline was found in the history file, but check that the
4164  * next timeline was forked off from it *after* the current recovery
4165  * location.
4166  */
4167  if (currentTle->end < replayLSN)
4168  {
4169  ereport(LOG,
4170  (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4171  newtarget,
4172  replayTLI,
4173  LSN_FORMAT_ARGS(replayLSN))));
4174  return false;
4175  }
4176 
4177  /* The new timeline history seems valid. Switch target */
4178  recoveryTargetTLI = newtarget;
4180  expectedTLEs = newExpectedTLEs;
4181 
4182  /*
4183  * As in StartupXLOG(), try to ensure we have all the history files
4184  * between the old target and new target in pg_wal.
4185  */
4186  restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4187 
4188  ereport(LOG,
4189  (errmsg("new target timeline is %u",
4190  recoveryTargetTLI)));
4191 
4192  return true;
4193 }
4194 
4195 
4196 /*
4197  * Open a logfile segment for reading (during recovery).
4198  *
4199  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4200  * Otherwise, it's assumed to be already available in pg_wal.
 *
 * Returns the open file descriptor on success. Returns -1 when
 * RestoreArchivedFile() reports failure, or when the open fails with ENOENT
 * and 'notfoundOk' is true. Any other open failure is reported at PANIC.
 * An unrecognized 'source' is reported at ERROR.
 *
 * On success this sets curFileTLI to 'tli', sets readSource to 'source',
 * and updates the PS display.
4201  */
4202 static int
4204  XLogSource source, bool notfoundOk)
4205 {
4206  char xlogfname[MAXFNAMELEN];
4207  char activitymsg[MAXFNAMELEN + 16];
4208  char path[MAXPGPATH];
4209  int fd;
4210 
4211  XLogFileName(xlogfname, tli, segno, wal_segment_size);
4212 
4213  switch (source)
4214  {
4215  case XLOG_FROM_ARCHIVE:
4216  /* Report recovery progress in PS display */
4217  snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4218  xlogfname);
4219  set_ps_display(activitymsg);
4220 
 /* A failed restore is not an error here; the caller sees -1. */
4221  if (!RestoreArchivedFile(path, xlogfname,
4222  "RECOVERYXLOG",
4224  InRedo))
4225  return -1;
4226  break;
4227 
4228  case XLOG_FROM_PG_WAL:
4229  case XLOG_FROM_STREAM:
 /* Both sources read the segment from its regular path. */
4230  XLogFilePath(path, tli, segno, wal_segment_size);
4231  break;
4232 
4233  default:
4234  elog(ERROR, "invalid XLogFileRead source %d", source);
4235  }
4236 
4237  /*
4238  * If the segment was fetched from archival storage, replace the existing
4239  * xlog segment (if any) with the archival version.
4240  */
4241  if (source == XLOG_FROM_ARCHIVE)
4242  {
4244  KeepFileRestoredFromArchive(path, xlogfname);
4245 
4246  /*
4247  * Set path to point at the new file in pg_wal.
4248  */
4249  snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4250  }
4251 
4252  fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4253  if (fd >= 0)
4254  {
4255  /* Success! */
4256  curFileTLI = tli;
4257 
4258  /* Report recovery progress in PS display */
4259  snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4260  xlogfname);
4261  set_ps_display(activitymsg);
4262 
4263  /* Track source of data in assorted state variables */
4264  readSource = source;
4266  /* In FROM_STREAM case, caller tracks receipt time, not me */
4267  if (source != XLOG_FROM_STREAM)
4269 
4270  return fd;
4271  }
 /* Only a missing file with notfoundOk set is tolerated; anything else is fatal. */
4272  if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4273  ereport(PANIC,
4275  errmsg("could not open file \"%s\": %m", path)));
4276  return -1;
4277 }
4278 
4279 /*
4280  * Open a logfile segment for reading (during recovery).
4281  *
4282  * This version searches for the segment with any TLI listed in expectedTLEs.
 *
 * Returns the file descriptor of the first segment that XLogFileRead()
 * manages to open; for each candidate timeline the archive attempt comes
 * before the pg_wal attempt. Returns -1, with errno set to ENOENT and a
 * DEBUG2 message emitted, when no candidate could be opened.
 *
 * If expectedTLEs was not yet set, it is set to the list scanned here, but
 * only once a segment has actually been opened.
4283  */
4284 static int
4286 {
4287  char path[MAXPGPATH];
4288  ListCell *cell;
4289  int fd;
4290  List *tles;
4291 
4292  /*
4293  * Loop looking for a suitable timeline ID: we might need to read any of
4294  * the timelines listed in expectedTLEs.
4295  *
4296  * We expect curFileTLI on entry to be the TLI of the preceding file in
4297  * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4298  * to go backwards; this prevents us from picking up the wrong file when a
4299  * parent timeline extends to higher segment numbers than the child we
4300  * want to read.
4301  *
4302  * If we haven't read the timeline history file yet, read it now, so that
4303  * we know which TLIs to scan. We don't save the list in expectedTLEs,
4304  * however, unless we actually find a valid segment. That way if there is
4305  * neither a timeline history file nor a WAL segment in the archive, and
4306  * streaming replication is set up, we'll read the timeline history file
4307  * streamed from the primary when we start streaming, instead of
4308  * recovering with a dummy history generated here.
4309  */
4310  if (expectedTLEs)
4311  tles = expectedTLEs;
4312  else
4314 
4315  foreach(cell, tles)
4316  {
4318  TimeLineID tli = hent->tli;
4319 
4320  if (tli < curFileTLI)
4321  break; /* don't bother looking at too-old TLIs */
4322 
4323  /*
4324  * Skip scanning the timeline ID that the logfile segment to read
4325  * doesn't belong to
4326  */
4327  if (hent->begin != InvalidXLogRecPtr)
4328  {
4329  XLogSegNo beginseg = 0;
4330 
4331  XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4332 
4333  /*
4334  * The logfile segment that doesn't belong to the timeline is
4335  * older or newer than the segment that the timeline started or
4336  * ended at, respectively. It's sufficient to check only the
4337  * starting segment of the timeline here. Since the timelines are
4338  * scanned in descending order in this loop, any segments newer
4339  * than the ending segment should belong to newer timeline and
4340  * have already been read before. So it's not necessary to check
4341  * the ending segment of the timeline here.
4342  */
4343  if (segno < beginseg)
4344  continue;
4345  }
4346 
4348  {
 /* notfoundOk is true: a missing segment just moves on to the next candidate. */
4349  fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
4350  if (fd != -1)
4351  {
4352  elog(DEBUG1, "got WAL segment from archive");
 /* Adopt the scanned history only now that a segment was found. */
4353  if (!expectedTLEs)
4354  expectedTLEs = tles;
4355  return fd;
4356  }
4357  }
4358 
4360  {
4361  fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
4362  if (fd != -1)
4363  {
4364  if (!expectedTLEs)
4365  expectedTLEs = tles;
4366  return fd;
4367  }
4368  }
4369  }
4370 
4371  /* Couldn't find it. For simplicity, complain about front timeline */
4373  errno = ENOENT;
4374  ereport(DEBUG2,
4376  errmsg("could not open file \"%s\": %m", path)));
4377  return -1;
4378 }
4379 
4380 /*
4381  * Set flag to signal the walreceiver to restart. (The startup process calls
4382  * this on noticing a relevant configuration change.)
 *
 * The flag set here, pendingWalRcvRestart, is examined and then cleared in
 * the XLOG_FROM_STREAM branch of WaitForWALToBecomeAvailable(). The request
 * is also announced at LOG level.
4383  */
4384 void
4386 {
4388  {
4389  ereport(LOG,
4390  (errmsg("WAL receiver process shutdown requested")));
4391 
4392  pendingWalRcvRestart = true;
4393  }
4394 }
4395 
4396 
4397 /*
4398  * Has a standby promotion already been triggered?
4399  *
4400  * Unlike CheckForStandbyTrigger(), this works in any process
4401  * that's connected to shared memory.
 *
 * Returns true if a promotion has been triggered, false otherwise. The
 * final result comes from the process-local LocalPromoteIsTriggered flag.
4402  */
4403 bool
4405 {
4406  /*
4407  * We check shared state each time only until a standby promotion is
4408  * triggered. We can't trigger a promotion again, so there's no need to
4409  * keep checking after the shared variable has once been seen true.
4410  */
4412  return true;
4413 
4417 
4418  return LocalPromoteIsTriggered;
4419 }
4420 
4421 static void
4423 {
/*
 * NOTE(review): embedded listing lines 4422 (function name and parameters)
 * and 4424-4426 (the statements preceding the comment below) are absent
 * from this extract.
 */
4427 
4428  /*
4429  * Mark the recovery pause state as 'not paused' because the paused state
4430  * ends and promotion continues if a promotion is triggered while recovery
4431  * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4432  * return 'paused' while a promotion is ongoing.
4433  */
4434  SetRecoveryPause(false);
4435 
/* Record the triggered state in this process's local flag as well. */
4436  LocalPromoteIsTriggered = true;
4437 }
4438 
4439 /*
4440  * Check whether a promote request has arrived.
4441  */
4442 static bool
4444 {
/*
 * NOTE(review): embedded listing lines 4443 (function name and parameters),
 * 4445 (the test guarding the first return), 4448 (the condition opening
 * the brace block) and 4451-4453 (three statements between the log message
 * and the return) are absent from this extract.
 */
4446  return true;
4447 
4449  {
4450  ereport(LOG, (errmsg("received promote request")));
4454  return true;
4455  }
4456 
/* Neither path above returned, so no promote request is reported. */
4457  return false;
4458 }
4459 
4460 /*
4461  * Remove the files signaling a standby promotion request.
4462  */
4463 void
4465 {
/*
 * NOTE(review): embedded listing line 4464 (function name and parameters)
 * is absent from this extract.
 *
 * The return value of unlink() is ignored, so a failure to remove the file
 * (including the file not existing) is not reported.
 */
4466  unlink(PROMOTE_SIGNAL_FILE);
4467 }
4468 
4469 /*
4470  * Check to see if a promote request has arrived.
4471  */
4472 bool
4474 {
/*
 * NOTE(review): embedded listing line 4473 (function name and parameters)
 * is absent from this extract.
 */
4475  struct stat stat_buf;
4476 
/*
 * A successful stat() of the signal file yields true; any stat() failure,
 * whatever its cause, yields false. stat_buf itself is not examined.
 */
4477  if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4478  return true;
4479 
4480  return false;
4481 }
4482 
4483 /*
4484  * Wake up startup process to replay newly arrived WAL, or to notice that
4485  * failover has been requested.
4486  */
4487 void
4489 {
/*
 * NOTE(review): embedded listing lines 4488 (function name and parameters)
 * and 4490 (the only body statement) are absent from this extract, so the
 * visible body is empty; restore both from the upstream file.
 */
4491 }
4492 
4493 /*
4494  * Schedule a walreceiver wakeup in the main recovery loop.
4495  */
4496 void
4498 {
/*
 * NOTE(review): embedded listing lines 4497 (function name and parameters)
 * and 4499 (the only body statement) are absent from this extract, so the
 * visible body is empty; restore both from the upstream file.
 */
4500 }
4501 
4502 /*
4503  * Is HotStandby active yet? This is only important in special backends
4504  * since normal backends won't ever be able to connect until this returns
4505  * true. Postmaster knows this by way of signal, not via shared memory.
4506  *
4507  * Unlike testing standbyState, this works in any process that's connected to
4508  * shared memory. (And note that standbyState alone doesn't tell the truth
4509  * anyway.)
4510  */
4511 bool
4513 {
/*
 * NOTE(review): embedded listing lines 4512 (function name and parameters),
 * 4519 (the test guarding the early "return true") and 4524-4526 (the
 * statements following the spinlock remark) are absent from this extract.
 */
4514  /*
4515  * We check shared state each time only until Hot Standby is active. We
4516  * can't de-activate Hot Standby, so there's no need to keep checking
4517  * after the shared variable has once been seen true.
4518  */
4520  return true;
4521  else
4522  {
4523  /* spinlock is essential on machines with weak memory ordering! */
4527 
/* On this path the answer is the process-local cached flag. */
4528  return LocalHotStandbyActive;
4529  }
4530 }
4531 
4532 /*
4533  * Like HotStandbyActive(), but to be used only in WAL replay code,
4534  * where we don't need to ask any other process what the state is.
4535  */
4536 static bool
4538 {
/*
 * NOTE(review): embedded listing lines 4537 (function name and parameters)
 * and 4539 (one statement before the return) are absent from this extract.
 * The visible body only returns the process-local flag.
 */
4540  return LocalHotStandbyActive;
4541 }
4542 
4543 /*
4544  * Get latest redo apply position.
4545  *
4546  * Exported to allow WALReceiver to read the pointer directly.
4547  */
4548 XLogRecPtr
4550 {
4551  XLogRecPtr recptr;
4552  TimeLineID tli;
4553 
/*
 * NOTE(review): embedded listing lines 4549 (function name and parameters)
 * and 4554-4557 are absent from this extract. In the visible lines recptr
 * and tli are never assigned, so the missing lines presumably fill them in;
 * confirm against the upstream file.
 */
4558 
/* replayTLI is an optional output: it is written only when non-NULL. */
4559  if (replayTLI)
4560  *replayTLI = tli;
4561  return recptr;
4562 }
4563 
4564 
4565 /*
4566  * Get position of last applied, or the record being applied.
4567  *
4568  * This is different from GetXLogReplayRecPtr() in that if a WAL
4569  * record is currently being applied, this includes that record.
4570  */
4571 XLogRecPtr
4573 {
4574  XLogRecPtr recptr;
4575  TimeLineID tli;
4576 
/*
 * NOTE(review): embedded listing lines 4572 (function name and parameters),
 * 4577, 4579 and 4580 are absent from this extract. tli is never assigned
 * in the visible lines, so one of the missing lines presumably sets it;
 * confirm against the upstream file.
 */
/* The returned position is copied from XLogRecoveryCtl->replayEndRecPtr. */
4578  recptr = XLogRecoveryCtl->replayEndRecPtr;
4581 
/* replayEndTLI is an optional output: it is written only when non-NULL. */
4582  if (replayEndTLI)
4583  *replayEndTLI = tli;
4584  return recptr;
4585 }
4586 
4587 /*
4588  * Save timestamp of latest processed commit/abort record.
4589  *
4590  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4591  * seen by processes other than the startup process. Note in particular
4592  * that CreateRestartPoint is executed in the checkpointer.
4593  */
4594 static void
4596 {
/*
 * NOTE(review): embedded listing lines 4595 (function name and parameters)
 * and 4597-4599 (the whole body) are absent from this extract, so the
 * visible body is empty; restore them from the upstream file.
 */
4600 }
4601 
4602 /*
4603  * Fetch timestamp of latest processed commit/abort record.
4604  */
4607 {
/*
 * NOTE(review): embedded listing lines 4605-4606 (return type, function
 * name and parameters) and 4610-4612 are absent from this extract. xtime is
 * never assigned in the visible lines, so the missing statements presumably
 * set it; confirm against the upstream file.
 */
4608  TimestampTz xtime;
4609 
4613 
4614  return xtime;
4615 }
4616 
4617 /*
4618  * Save timestamp of the next chunk of WAL records to apply.
4619  *
4620  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4621  * seen by all backends.
4622  */
4623 static void
4625 {
/*
 * NOTE(review): embedded listing lines 4624 (function name and parameters)
 * and 4626-4628 (the whole body) are absent from this extract, so the
 * visible body is empty; restore them from the upstream file.
 */
4629 }
4630 
4631 /*
4632  * Fetch timestamp of latest processed commit/abort record.
4633  * Startup process maintains an accurate local copy in XLogReceiptTime
4634  */
4637 {
/*
 * NOTE(review): embedded listing lines 4635-4636 (return type, function
 * name and parameters) and 4640-4642 are absent from this extract. xtime is
 * never assigned in the visible lines, so the missing statements presumably
 * set it; confirm against the upstream file.
 *
 * NOTE(review): the header comment preceding this function repeats the
 * "latest processed commit/abort record" wording used for another getter;
 * it looks copy-pasted and may need rewording once the body is confirmed.
 */
4638  TimestampTz xtime;
4639 
4643 
4644  return xtime;
4645 }
4646 
4647 /*
4648  * Returns time of receipt of current chunk of XLOG data, as well as
4649  * whether it was received from streaming replication or from archives.
4650  */
4651 void
4652 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4653 {
4654  /*
4655  * This must be executed in the startup process, since we don't export the
4656  * relevant state to shared memory.
4657  */
4658  Assert(InRecovery);
4659 
4660  *rtime = XLogReceiptTime;
4661  *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4662 }
4663 
4664 /*
4665  * Note that text field supplied is a parameter name and does not require
4666  * translation
4667  */
4668 void
4669 RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4670 {
/*
 * Nothing happens unless currValue is below minValue. In that case the
 * function always ends in ereport(FATAL) below; the inner brace block
 * (whose guarding condition is missing, see note) first warns, requests a
 * recovery pause and loops before falling through to that FATAL.
 *
 * NOTE(review): embedded listing lines 4673 (condition guarding the inner
 * brace block), 4692 (loop header), 4694 (first statement in the loop),
 * 4720 (statement after the "set it paused" comment), 4727 (start of the
 * call whose last argument is WAIT_EVENT_RECOVERY_PAUSE) and 4730 (a
 * statement after the loop) are absent from this extract.
 */
4671  if (currValue < minValue)
4672  {
4674  {
/* Ensures the promotion warning inside the loop is emitted only once. */
4675  bool warned_for_promote = false;
4676 
4677  ereport(WARNING,
4678  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4679  errmsg("hot standby is not possible because of insufficient parameter settings"),
4680  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4681  param_name,
4682  currValue,
4683  minValue)));
4684 
4685  SetRecoveryPause(true);
4686 
4687  ereport(LOG,
4688  (errmsg("recovery has paused"),
4689  errdetail("If recovery is unpaused, the server will shut down."),
4690  errhint("You can then restart the server after making the necessary configuration changes.")));
4691 
4693  {
4695 
4696  if (CheckForStandbyTrigger())
4697  {
4698  if (!warned_for_promote)
4699  ereport(WARNING,
4700  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4701  errmsg("promotion is not possible because of insufficient parameter settings"),
4702 
4703  /*
4704  * Repeat the detail from above so it's easy to find
4705  * in the log.
4706  */
4707  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4708  param_name,
4709  currValue,
4710  minValue),
4711  errhint("Restart the server after making the necessary configuration changes.")));
4712  warned_for_promote = true;
4713  }
4714 
4715  /*
4716  * If recovery pause is requested then set it paused. While
4717  * we are in the loop, user might resume and pause again so
4718  * set this every time.
4719  */
4721 
4722  /*
4723  * We wait on a condition variable that will wake us as soon
4724  * as the pause ends, but we use a timeout so we can check the
4725  * above conditions periodically too.
4726  */
4728  WAIT_EVENT_RECOVERY_PAUSE);
4729  }
4731  }
4732 
/* Reached for every currValue < minValue case once the block above ends. */
4733  ereport(FATAL,
4734  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4735  errmsg("recovery aborted because of insufficient parameter settings"),
4736  /* Repeat the detail from above so it's easy to find in the log. */
4737  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4738  param_name,
4739  currValue,
4740  minValue),
4741  errhint("You can restart the server after making the necessary configuration changes.")));
4742  }
4743 }
4744 
4745 
4746 /*
4747  * GUC check_hook for primary_slot_name
4748  */
4749 bool
4751 {
/*
 * NOTE(review): embedded listing lines 4750 (function name and parameters)
 * and 4753 (the final operand of the condition below) are absent from this
 * extract.
 *
 * From the visible operands: a NULL or empty value never takes the
 * "return false" path, because the first two operands short-circuit.
 */
4752  if (*newval && strcmp(*newval, "") != 0 &&
4754  return false;
4755 
4756  return true;
4757 }
4758 
4759 /*
4760  * Recovery target settings: Only one of the several recovery_target* settings
4761  * may be set. Setting a second one results in an error. The global variable
4762  * recoveryTarget tracks which kind of recovery target was chosen. Other
4763  * variables store the actual target value (for example a string or a xid).
4764  * The assign functions of the parameters check whether a competing parameter
4765  * was already set. But we want to allow setting the same parameter multiple
4766  * times. We also want to allow unsetting a parameter and setting a different
4767  * one, so we unset recoveryTarget when the parameter is set to an empty
4768  * string.
4769  *
4770  * XXX this code is broken by design. Throwing an error from a GUC assign
4771  * hook breaks fundamental assumptions of guc.c. So long as all the variables
4772  * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4773  * since we'd just abort postmaster startup anyway. Nonetheless it's likely
4774  * that we have odd behaviors such as unexpected GUC ordering dependencies.
4775  */
4776 
4777 static void
/*
 * NOTE(review): embedded listing line 4778, which sits between the storage
 * class above and the function name below, is absent from this extract.
 */
4779 error_multiple_recovery_targets(void)
4780 {
/*
 * Shared error report for the recovery_target* assign hooks in this file:
 * the body consists solely of this ereport(ERROR).
 */
4781  ereport(ERROR,
4782  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4783  errmsg("multiple recovery targets specified"),
4784  errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4785 }
4786 
4787 /*
4788  * GUC check_hook for recovery_target
4789  */
4790 bool
4792 {
/*
 * NOTE(review): embedded listing line 4791 (function name and parameters)
 * is absent from this extract.
 *
 * Accepts exactly two spellings: "immediate" and the empty string. Any
 * other value is rejected with an explanatory detail message.
 */
4793  if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4794  {
4795  GUC_check_errdetail("The only allowed value is \"immediate\".");
4796  return false;
4797  }
4798  return true;
4799 }
4800 
4801 /*
4802  * GUC assign_hook for recovery_target
4803  */
4804 void
4805 assign_recovery_target(const char *newval, void *extra)
4806 {
/*
 * NOTE(review): embedded listing lines 4807-4808 (the condition guarding
 * the error call), 4812 and 4814 (the statements of the two branches
 * below) are absent from this extract.
 */
4809  error_multiple_recovery_targets();
4810 
/* A non-empty value and a NULL or empty value take different branches. */
4811  if (newval && strcmp(newval, "") != 0)
4813  else
4815 }
4816 
4817 /*
4818  * GUC check_hook for recovery_target_lsn
4819  */
4820 bool
4822 {
/*
 * NOTE(review): embedded listing line 4821 (function name and parameters)
 * is absent from this extract.
 *
 * An empty string is accepted as-is and leaves *extra untouched.
 */
4823  if (strcmp(*newval, "") != 0)
4824  {
4825  XLogRecPtr lsn;
4826  XLogRecPtr *myextra;
4827  bool have_error = false;
4828 
/* Parse failures are reported by returning false, with no detail message. */
4829  lsn = pg_lsn_in_internal(*newval, &have_error);
4830  if (have_error)
4831  return false;
4832 
/* Hand the parsed LSN on through *extra in a guc_malloc'd cell. */
4833  myextra = (XLogRecPtr *) guc_malloc(ERROR, sizeof(XLogRecPtr));
4834  *myextra = lsn;
4835  *extra = (void *) myextra;
4836  }
4837  return true;
4838 }
4839 
4840 /*
4841  * GUC assign_hook for recovery_target_lsn
4842  */
4843 void
4844 assign_recovery_target_lsn(const char *newval, void *extra)
4845 {
/*
 * NOTE(review): embedded listing lines 4846-4847 (the condition guarding
 * the error call), 4852 (first statement of the non-empty branch) and 4856
 * (the else-branch statement) are absent from this extract.
 */
4848  error_multiple_recovery_targets();
4849 
4850  if (newval && strcmp(newval, "") != 0)
4851  {
/* extra is read as a pointer to an XLogRecPtr holding the target LSN. */
4853  recoveryTargetLSN = *((XLogRecPtr *) extra);
4854  }
4855  else
4857 }
4858 
4859 /*
4860  * GUC check_hook for recovery_target_name
4861  */
4862 bool
4864 {
/*
 * NOTE(review): embedded listing line 4863 (function name and parameters)
 * is absent from this extract.
 *
 * The only check is on length: a value of MAXFNAMELEN or more characters is
 * rejected, so the longest accepted value has MAXFNAMELEN - 1 characters.
 */
4865  /* Use the value of newval directly */
4866  if (strlen(*newval) >= MAXFNAMELEN)
4867  {
4868  GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4869  "recovery_target_name", MAXFNAMELEN - 1);
4870  return false;
4871  }
4872  return true;
4873 }
4874 
4875 /*
4876  * GUC assign_hook for recovery_target_name
4877  */
4878 void
4879 assign_recovery_target_name(const char *newval, void *extra)
4880 {
/*
 * NOTE(review): embedded listing lines 4881-4882 (the condition guarding
 * the error call), 4887-4888 (the statements of the non-empty branch) and
 * 4891 (the else-branch statement) are absent from this extract.
 */
4883  error_multiple_recovery_targets();
4884 
/* A non-empty value and a NULL or empty value take different branches. */
4885  if (newval && strcmp(newval, "") != 0)
4886  {
4889  }
4890  else
4892 }
4893 
4894 /*
4895  * GUC check_hook for recovery_target_time
4896  *
4897  * The interpretation of the recovery_target_time string can depend on the
4898  * time zone setting, so we need to wait until after all GUC processing is
4899  * done before we can do the final parsing of the string. This check function
4900  * only does a parsing pass to catch syntax errors, but we store the string
4901  * and parse it again when we need to use it.
4902  */
4903 bool
4905 {
/*
 * NOTE(review): embedded listing lines 4904 (function name and parameters)
 * and 4933 (a declaration; "timestamp" is used below but not declared in
 * the visible lines) are absent from this extract.
 *
 * An empty string is accepted without any parsing.
 */
4906  if (strcmp(*newval, "") != 0)
4907  {
4908  /* reject some special values */
4909  if (strcmp(*newval, "now") == 0 ||
4910  strcmp(*newval, "today") == 0 ||
4911  strcmp(*newval, "tomorrow") == 0 ||
4912  strcmp(*newval, "yesterday") == 0)
4913  {
4914  return false;
4915  }
4916 
4917  /*
4918  * parse timestamp value (see also timestamptz_in())
4919  */
4920  {
4921  char *str = *newval;
4922  fsec_t fsec;
4923  struct pg_tm tt,
4924  *tm = &tt;
4925  int tz;
4926  int dtype;
4927  int nf;
4928  int dterr;
4929  char *field[MAXDATEFIELDS];
4930  int ftype[MAXDATEFIELDS];
4931  char workbuf[MAXDATELEN + MAXDATEFIELDS];
4932  DateTimeErrorExtra dtextra;
4934 
/* Tokenize first; decode only if tokenizing succeeded. */
4935  dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4936  field, ftype, MAXDATEFIELDS, &nf);
4937  if (dterr == 0)
4938  dterr = DecodeDateTime(field, ftype, nf,
4939  &dtype, tm, &fsec, &tz, &dtextra);
/* Parse errors and non-DTK_DATE results are rejected without a detail message. */
4940  if (dterr != 0)
4941  return false;
4942  if (dtype != DTK_DATE)
4943  return false;
4944 
/*
 * The converted value is not used in the visible lines after this call;
 * only whether the conversion succeeds matters here.
 */
4945  if (tm2timestamp(tm, fsec, &tz, &timestamp) != 0)
4946  {
4947  GUC_check_errdetail("timestamp out of range: \"%s\"", str);
4948  return false;
4949  }
4950  }
4951  }
4952  return true;
4953 }
4954 
4955 /*
4956  * GUC assign_hook for recovery_target_time
4957  */
4958 void
4959 assign_recovery_target_time(const char *newval, void *extra)
4960 {
/*
 * NOTE(review): embedded listing lines 4961-4962 (the condition guarding
 * the error call), 4966 and 4968 (the statements of the two branches
 * below) are absent from this extract.
 */
4963  error_multiple_recovery_targets();
4964 
/* A non-empty value and a NULL or empty value take different branches. */
4965  if (newval && strcmp(newval, "") != 0)
4967  else
4969 }
4970 
4971 /*
4972  * GUC check_hook for recovery_target_timeline
4973  */
4974 bool
4976 {
/*
 * NOTE(review): embedded listing lines 4975 (function name and parameters),
 * 4977 (presumably the declaration of rttg, which is used below but not
 * declared in the visible lines), 4981, 4983 and 4986 (the statements of
 * the three branches) and 4997 (presumably the allocation of myextra,
 * which is dereferenced below) are absent from this extract.
 */
4978  RecoveryTargetTimeLineGoal *myextra;
4979 
/* Two keywords are recognized; anything else is treated as a number. */
4980  if (strcmp(*newval, "current") == 0)
4982  else if (strcmp(*newval, "latest") == 0)
4984  else
4985  {
4987 
/*
 * NOTE(review): the strtoul() result is discarded and no end pointer is
 * requested, so only errno is examined. Standard C does not require errno
 * to be set when no digits are converted, and trailing characters after a
 * valid prefix are never an error, so non-numeric input can pass this
 * check. Consider validating with an end pointer.
 */
4988  errno = 0;
4989  strtoul(*newval, NULL, 0);
4990  if (errno == EINVAL || errno == ERANGE)
4991  {
4992  GUC_check_errdetail("\"recovery_target_timeline\" is not a valid number.");
4993  return false;
4994  }
4995  }
4996 
/* The chosen goal is handed on through *extra. */
4998  *myextra = rttg;
4999  *extra = (void *) myextra;
5000 
5001  return true;
5002 }
5003 
5004 /*
5005  * GUC assign_hook for recovery_target_timeline
5006  */
5007 void
5008 assign_recovery_target_timeline(const char *newval, void *extra)
5009 {
/*
 * NOTE(review): embedded listing lines 5010-5011 (the statements before the
 * assignment below, including the "if" that the visible "else" belongs to)
 * and 5014 (the else-branch statement) are absent from this extract.
 */
/* The numeric timeline is re-parsed from the string; base 0 lets strtoul pick the radix. */
5012  recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
5013  else
5015 }
5016 
5017 /*
5018  * GUC check_hook for recovery_target_xid
5019  */
5020 bool
5022 {
5023  if (strcmp(*newval, "") != 0)
5024  {
5025  TransactionId xid;
5026  TransactionId *myextra;
5027 
5028  errno = 0;
5029  xid = (TransactionId) strtou64(*newval, NULL, 0);
5030  if (errno == EINVAL || errno == ERANGE)
5031  return false;
5032 
5033  myextra = (TransactionId *) guc_malloc(ERROR, sizeof(TransactionId));
5034  *myextra = xid;
5035  *extra = (void *) myextra;
5036  }
5037  return true;
5038 }
5039 
5040 /*
5041  * GUC assign_hook for recovery_target_xid
5042  */
5043 void
5044 assign_recovery_target_xid(const char *newval, void *extra)
5045 {
/*
 * NOTE(review): embedded listing lines 5046-5047 (the condition guarding
 * the error call), 5052 (first statement of the non-empty branch) and 5056
 * (the else-branch statement) are absent from this extract.
 */
5048  error_multiple_recovery_targets();
5049 
5050  if (newval && strcmp(newval, "") != 0)
5051  {
/* extra is read as a pointer to a TransactionId holding the target xid. */
5053  recoveryTargetXid = *((TransactionId *) extra);
5054  }
5055  else
5057 }
static uint64 pg_atomic_read_u64(volatile pg_atomic_uint64 *ptr)
Definition: atomics.h:467
TimeLineID findNewestTimeLine(TimeLineID startTLI)
Definition: timeline.c:264
TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history)
Definition: timeline.c:544
XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI)
Definition: timeline.c:572
bool existsTimeLineHistory(TimeLineID probeTLI)
Definition: timeline.c:222
void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end)
Definition: timeline.c:50
List * readTimeLineHistory(TimeLineID targetTLI)
Definition: timeline.c:76
bool tliInHistory(TimeLineID tli, List *expectedTLEs)
Definition: timeline.c:526
void remove_tablespace_symlink(const char *linkloc)
Definition: tablespace.c:883
bool allow_in_place_tablespaces
Definition: tablespace.c:85
void HandleStartupProcInterrupts(void)
Definition: startup.c:154
void disable_startup_progress_timeout(void)
Definition: startup.c:309
bool IsPromoteSignaled(void)
Definition: startup.c:288
void begin_startup_progress_phase(void)
Definition: startup.c:343
void ResetPromoteSignaled(void)
Definition: startup.c:294
int ParseDateTime(const char *timestr, char *workbuf, size_t buflen, char **field, int *ftype, int maxfields, int *numfields)
Definition: datetime.c:754
int DecodeDateTime(char **field, int *ftype, int nf, int *dtype, struct pg_tm *tm, fsec_t *fsec, int *tzp, DateTimeErrorExtra *extra)
Definition: datetime.c:978
long TimestampDifferenceMilliseconds(TimestampTz start_time, TimestampTz stop_time)
Definition: timestamp.c:1756
int tm2timestamp(struct pg_tm *tm, fsec_t fsec, int *tzp, Timestamp *result)
Definition: timestamp.c:1987
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1780
Datum timestamptz_in(PG_FUNCTION_ARGS)
Definition: timestamp.c:417
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1644
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1608
const char * timestamptz_to_str(TimestampTz t)
Definition: timestamp.c:1843
uint32 BlockNumber
Definition: block.h:31
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4941
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5158
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:400
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:191
@ RBM_NORMAL_NO_LOG
Definition: bufmgr.h:51
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:351
Pointer Page
Definition: bufpage.h:81
static XLogRecPtr PageGetLSN(const char *page)
Definition: bufpage.h:386
unsigned int uint32
Definition: c.h:506
signed int int32
Definition: c.h:496
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:185
#define Assert(condition)
Definition: c.h:849
#define PG_BINARY
Definition: c.h:1264
#define UINT64_FORMAT
Definition: c.h:540
#define strtou64(str, endptr, base)
Definition: c.h:1289
unsigned char uint8
Definition: c.h:504
uint32 TransactionId
Definition: c.h:643
size_t Size
Definition: c.h:596
void RequestCheckpoint(int flags)
Definition: checkpointer.c:952
bool ConditionVariableCancelSleep(void)
bool ConditionVariableTimedSleep(ConditionVariable *cv, long timeout, uint32 wait_event_info)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
int64 TimestampTz
Definition: timestamp.h:39
int32 fsec_t
Definition: timestamp.h:41
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1157
int errcode_for_file_access(void)
Definition: elog.c:876
int errdetail(const char *fmt,...)
Definition: elog.c:1203
ErrorContextCallback * error_context_stack
Definition: elog.c:94
int errhint(const char *fmt,...)
Definition: elog.c:1317
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define LOG
Definition: elog.h:31
#define errcontext
Definition: elog.h:196
#define FATAL
Definition: elog.h:41
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2932
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2606
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1109
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:782
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1087
int FreeFile(FILE *file)
Definition: fd.c:2804
int pg_fsync(int fd)
Definition: fd.c:386
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2866
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:526
@ PGFILETYPE_LNK
Definition: file_utils.h:24
#define DirectFunctionCall3(func, arg1, arg2, arg3)
Definition: fmgr.h:645
bool IsUnderPostmaster
Definition: globals.c:119
char * DataDir
Definition: globals.c:70
bool IsPostmasterEnvironment
Definition: globals.c:118
void * guc_malloc(int elevel, size_t size)
Definition: guc.c:637
#define newval
#define GUC_check_errdetail
Definition: guc.h:476
GucSource
Definition: guc.h:108
const char * str
#define MAXDATEFIELDS
Definition: datetime.h:202
#define DTK_DATE
Definition: datetime.h:144
#define MAXDATELEN
Definition: datetime.h:200
#define close(a)
Definition: win32.h:12
void proc_exit(int code)
Definition: ipc.c:104
int i
Definition: isn.c:73
void OwnLatch(Latch *latch)
Definition: latch.c:463
void DisownLatch(Latch *latch)
Definition: latch.c:489
void InitSharedLatch(Latch *latch)
Definition: latch.c:430
void SetLatch(Latch *latch)
Definition: latch.c:632
void ResetLatch(Latch *latch)
Definition: latch.c:724
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:517
#define WL_TIMEOUT
Definition: latch.h:130
#define WL_EXIT_ON_PM_DEATH
Definition: latch.h:132
#define WL_LATCH_SET
Definition: latch.h:127
List * lappend(List *list, void *datum)
Definition: list.c:339
void list_free_deep(List *list)
Definition: list.c:1560
static struct pg_tm tm
Definition: localtime.c:104
char * pstrdup(const char *in)
Definition: mcxt.c:1696
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc0(Size size)
Definition: mcxt.c:1347
void * palloc(Size size)
Definition: mcxt.c:1317
#define AmStartupProcess()
Definition: miscadmin.h:379
#define IsBootstrapProcessingMode()
Definition: miscadmin.h:451
void * arg
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
#define MAXPGPATH
#define XLOG_RESTORE_POINT
Definition: pg_control.h:75
#define XLOG_CHECKPOINT_REDO
Definition: pg_control.h:82
#define XLOG_OVERWRITE_CONTRECORD
Definition: pg_control.h:81
DBState
Definition: pg_control.h:90
@ DB_IN_ARCHIVE_RECOVERY
Definition: pg_control.h:96
@ DB_SHUTDOWNED_IN_RECOVERY
Definition: pg_control.h:93
@ DB_SHUTDOWNED
Definition: pg_control.h:92
@ DB_IN_CRASH_RECOVERY
Definition: pg_control.h:95
#define XLOG_CHECKPOINT_SHUTDOWN
Definition: pg_control.h:68
#define XLOG_BACKUP_END
Definition: pg_control.h:73
#define XLOG_CHECKPOINT_ONLINE
Definition: pg_control.h:69
#define XLOG_END_OF_RECOVERY
Definition: pg_control.h:77
const void size_t len
#define lfirst(lc)
Definition: pg_list.h:172
#define NIL
Definition: pg_list.h:68
XLogRecPtr pg_lsn_in_internal(const char *str, bool *have_error)
Definition: pg_lsn.c:29
static rewind_source * source
Definition: pg_rewind.c:89
const char * pg_rusage_show(const PGRUsage *ru0)
Definition: pg_rusage.c:40
void pg_rusage_init(PGRUsage *ru0)
Definition: pg_rusage.c:27
static char * buf
Definition: pg_test_fsync.c:73
int64 timestamp
void SendPostmasterSignal(PMSignalReason reason)
Definition: pmsignal.c:184
@ PMSIGNAL_RECOVERY_STARTED
Definition: pmsignal.h:35
@ PMSIGNAL_BEGIN_HOT_STANDBY
Definition: pmsignal.h:36
#define pg_pread
Definition: port.h:225
#define snprintf
Definition: port.h:238
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static Datum ObjectIdGetDatum(Oid X)
Definition: postgres.h:252
static Datum CStringGetDatum(const char *X)
Definition: postgres.h:350
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:212
#define InvalidOid
Definition: postgres_ext.h:36
static int fd(const char *x, int i)
Definition: preproc-init.c:105
void RecordKnownAssignedTransactionIds(TransactionId xid)
Definition: procarray.c:4407
void KnownAssignedTransactionIdsIdleMaintenance(void)
Definition: procarray.c:4544
static void set_ps_display(const char *activity)
Definition: ps_status.h:40
char * psprintf(const char *fmt,...)
Definition: psprintf.c:46
ForkNumber
Definition: relpath.h:56
@ MAIN_FORKNUM
Definition: relpath.h:58
#define PG_TBLSPC_DIR
Definition: relpath.h:41
void RmgrStartup(void)
Definition: rmgr.c:58
void RmgrCleanup(void)
Definition: rmgr.c:74
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:387
static pg_noinline void Size size
Definition: slab.c:607
bool ReplicationSlotValidateName(const char *name, int elevel)
Definition: slot.c:252
void ShutDownSlotSync(void)
Definition: slotsync.c:1563
#define SpinLockInit(lock)
Definition: spin.h:57
#define SpinLockRelease(lock)
Definition: spin.h:61
#define SpinLockAcquire(lock)
Definition: spin.h:59
#define ereport_startup_progress(msg,...)
Definition: startup.h:18
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition: stringinfo.c:97
void appendStringInfoString(StringInfo str, const char *s)
Definition: stringinfo.c:182
void appendStringInfoChar(StringInfo str, char ch)
Definition: stringinfo.c:194
void initStringInfo(StringInfo str)
Definition: stringinfo.c:59
Oid oldestMultiDB
Definition: pg_control.h:51
MultiXactId oldestMulti
Definition: pg_control.h:50
MultiXactOffset nextMultiOffset
Definition: pg_control.h:47
TransactionId newestCommitTsXid
Definition: pg_control.h:55
TransactionId oldestXid
Definition: pg_control.h:48
TimeLineID PrevTimeLineID
Definition: pg_control.h:40
TimeLineID ThisTimeLineID
Definition: pg_control.h:39
Oid nextOid
Definition: pg_control.h:45
MultiXactId nextMulti
Definition: pg_control.h:46
FullTransactionId nextXid
Definition: pg_control.h:44
TransactionId oldestCommitTsXid
Definition: pg_control.h:53
XLogRecPtr redo
Definition: pg_control.h:37
Oid oldestXidDB
Definition: pg_control.h:49
XLogRecPtr backupStartPoint
Definition: pg_control.h:170
bool backupEndRequired
Definition: pg_control.h:172
CheckPoint checkPointCopy
Definition: pg_control.h:135
XLogRecPtr backupEndPoint
Definition: pg_control.h:171
XLogRecPtr minRecoveryPoint
Definition: pg_control.h:168
XLogRecPtr checkPoint
Definition: pg_control.h:133
uint64 system_identifier
Definition: pg_control.h:110
TimeLineID minRecoveryPointTLI
Definition: pg_control.h:169
Definition: dirent.c:26
XLogRecPtr lastPageBeginPtr
Definition: xlogrecovery.h:111
XLogRecPtr abortedRecPtr
Definition: xlogrecovery.h:120
XLogRecPtr missingContrecPtr
Definition: xlogrecovery.h:121
TimeLineID endOfLogTLI
Definition: xlogrecovery.h:109
struct ErrorContextCallback * previous
Definition: elog.h:296
void(* callback)(void *arg)
Definition: elog.h:297
Definition: latch.h:113
Definition: pg_list.h:54
RelFileNumber relNumber
void(* rm_mask)(char *pagedata, BlockNumber blkno)
void(* rm_redo)(XLogReaderState *record)
const char *(* rm_identify)(uint8 info)
const char * rm_name
void(* rm_desc)(StringInfo buf, XLogReaderState *record)
XLogRecPtr begin
Definition: timeline.h:28
TimeLineID tli
Definition: timeline.h:27
XLogRecPtr end
Definition: timeline.h:29
TimeLineID ws_tli
Definition: xlogreader.h:49
pg_atomic_uint64 minWaitedLSN
Definition: waitlsn.h:58
TimeLineID replayTLI
Definition: xlogrecovery.c:200
XLogRecPtr missingContrecPtr
Definition: xlogreader.h:215
char * errormsg_buf
Definition: xlogreader.h:311
XLogRecPtr EndRecPtr
Definition: xlogreader.h:207
uint64 system_identifier
Definition: xlogreader.h:191
XLogRecPtr ReadRecPtr
Definition: xlogreader.h:206
XLogRecPtr abortedRecPtr
Definition: xlogreader.h:214
TimeLineID latestPageTLI
Definition: xlogreader.h:280
XLogRecPtr overwrittenRecPtr
Definition: xlogreader.h:217
XLogRecPtr latestPagePtr
Definition: xlogreader.h:279
WALOpenSegment seg
Definition: xlogreader.h:272
void * private_data
Definition: xlogreader.h:196
uint8 xl_info
Definition: xlogrecord.h:46
uint32 xl_tot_len
Definition: xlogrecord.h:43
TransactionId xl_xid
Definition: xlogrecord.h:44
RmgrId xl_rmid
Definition: xlogrecord.h:47
ConditionVariable recoveryNotPausedCV
Definition: xlogrecovery.c:360
XLogRecPtr lastReplayedEndRecPtr
Definition: xlogrecovery.c:340
TimeLineID replayEndTLI
Definition: xlogrecovery.c:349
TimeLineID lastReplayedTLI
Definition: xlogrecovery.c:341
TimestampTz currentChunkStartTime
Definition: xlogrecovery.c:357
XLogRecPtr replayEndRecPtr
Definition: xlogrecovery.c:348
TimestampTz recoveryLastXTime
Definition: xlogrecovery.c:351
RecoveryPauseState recoveryPauseState
Definition: xlogrecovery.c:359
XLogRecPtr lastReplayedReadRecPtr
Definition: xlogrecovery.c:339
Definition: guc.h:170
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
Definition: pgtime.h:35
Definition: regguts.h:323
TimeLineID PrevTimeLineID
TimeLineID ThisTimeLineID
char rp_name[MAXFNAMELEN]
TransactionId twophase_xid
Definition: xact.h:427
TransactionId twophase_xid
Definition: xact.h:397
#define InvalidTransactionId
Definition: transam.h:31
#define U64FromFullTransactionId(x)
Definition: transam.h:49
#define XidFromFullTransactionId(x)
Definition: transam.h:48
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define TransactionIdIsNormal(xid)
Definition: transam.h:42
#define TimestampTzPlusMilliseconds(tz, ms)
Definition: timestamp.h:85
static TimestampTz DatumGetTimestampTz(Datum X)
Definition: timestamp.h:34
void AdvanceNextFullTransactionIdPastXid(TransactionId xid)
Definition: varsup.c:304
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:85
static void pgstat_report_wait_end(void)
Definition: wait_event.h:101
struct WaitLSNState * waitLSNState
Definition: waitlsn.c:37
void WaitLSNSetLatches(XLogRecPtr currentLSN)
Definition: waitlsn.c:155
void WalRcvForceReply(void)
Definition: walreceiver.c:1359
#define AllowCascadeReplication()
Definition: walreceiver.h:41
XLogRecPtr GetWalRcvFlushRecPtr(XLogRecPtr *latestChunkStart, TimeLineID *receiveTLI)
bool WalRcvStreaming(void)
void RequestXLogStreaming(TimeLineID tli, XLogRecPtr recptr, const char *conninfo, const char *slotname, bool create_temp_slot)
bool WalRcvRunning(void)
void WalSndWakeup(bool physical, bool logical)
Definition: walsender.c:3638
#define stat
Definition: win32_port.h:284
#define S_IRUSR
Definition: win32_port.h:289
#define symlink(oldpath, newpath)
Definition: win32_port.h:235
#define S_IWUSR
Definition: win32_port.h:292
#define XLOG_XACT_COMMIT_PREPARED
Definition: xact.h:172
#define XLOG_XACT_COMMIT
Definition: xact.h:169
#define XLOG_XACT_OPMASK
Definition: xact.h:179
#define XLOG_XACT_ABORT
Definition: xact.h:171
#define XLOG_XACT_ABORT_PREPARED
Definition: xact.h:173
void ParseCommitRecord(uint8 info, xl_xact_commit *xlrec, xl_xact_parsed_commit *parsed)
Definition: xactdesc.c:35
void ParseAbortRecord(uint8 info, xl_xact_abort *xlrec, xl_xact_parsed_abort *parsed)
Definition: xactdesc.c:141
int wal_decode_buffer_size
Definition: xlog.c:135
bool EnableHotStandby
Definition: xlog.c:120
XLogRecPtr GetRedoRecPtr(void)
Definition: xlog.c:6436
void SetInstallXLogFileSegmentActive(void)
Definition: xlog.c:9483
bool IsInstallXLogFileSegmentActive(void)
Definition: xlog.c:9491
int wal_segment_size
Definition: xlog.c:142
void SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI)
Definition: xlog.c:6208
void RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
Definition: xlog.c:3933
void ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
Definition: xlog.c:6246
int wal_retrieve_retry_interval
Definition: xlog.c:133
static ControlFileData * ControlFile
Definition: xlog.c:573
void XLogShutdownWalRcv(void)
Definition: xlog.c:9472
bool XLogCheckpointNeeded(XLogSegNo new_segno)
Definition: xlog.c:2289
#define TABLESPACE_MAP_OLD
Definition: xlog.h:306
#define TABLESPACE_MAP
Definition: xlog.h:305
#define STANDBY_SIGNAL_FILE
Definition: xlog.h:301
#define CHECKPOINT_CAUSE_XLOG
Definition: xlog.h:148
#define PROMOTE_SIGNAL_FILE
Definition: xlog.h:309
#define BACKUP_LABEL_FILE
Definition: xlog.h:302
#define RECOVERY_SIGNAL_FILE
Definition: xlog.h:300
static RmgrData GetRmgr(RmgrId rmid)
@ RECOVERY_TARGET_ACTION_PAUSE
@ RECOVERY_TARGET_ACTION_PROMOTE
@ RECOVERY_TARGET_ACTION_SHUTDOWN
#define XLogSegmentOffset(xlogptr, wal_segsz_bytes)
#define MAXFNAMELEN
#define XLOGDIR
#define XLByteToSeg(xlrp, logSegNo, wal_segsz_bytes)
static void XLogFilePath(char *path, TimeLineID tli, XLogSegNo logSegNo, int wal_segsz_bytes)
#define XRecOffIsValid(xlrp)
static void XLogFileName(char *fname, TimeLineID tli, XLogSegNo logSegNo, int wal_segsz_bytes)
#define XLByteInSeg(xlrp, logSegNo, wal_segsz_bytes)
bool RestoreArchivedFile(char *path, const char *xlogfname, const char *recovername, off_t expectedSize, bool cleanupEnabled)
Definition: xlogarchive.c:54
void KeepFileRestoredFromArchive(const char *path, const char *xlogfname)
Definition: xlogarchive.c:358
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:43
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
uint32 TimeLineID
Definition: xlogdefs.h:59
uint64 XLogSegNo
Definition: xlogdefs.h:48
void XLogPrefetcherComputeStats(XLogPrefetcher *prefetcher)
XLogRecord * XLogPrefetcherReadRecord(XLogPrefetcher *prefetcher, char **errmsg)
void XLogPrefetchReconfigure(void)
XLogPrefetcher * XLogPrefetcherAllocate(XLogReaderState *reader)
void XLogPrefetcherBeginRead(XLogPrefetcher *prefetcher, XLogRecPtr recPtr)
void XLogPrefetcherFree(XLogPrefetcher *prefetcher)
XLogReaderState * XLogPrefetcherGetReader(XLogPrefetcher *prefetcher)
bool XLogRecGetBlockTagExtended(XLogReaderState *record, uint8 block_id, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum, Buffer *prefetch_buffer)
Definition: xlogreader.c:1997
void XLogReaderSetDecodeBuffer(XLogReaderState *state, void *buffer, size_t size)
Definition: xlogreader.c:90
void XLogReaderResetError(XLogReaderState *state)
Definition: xlogreader.c:1365
bool XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, char *phdr)
Definition: xlogreader.c:1224
void XLogReaderFree(XLogReaderState *state)
Definition: xlogreader.c:161
XLogReaderState * XLogReaderAllocate(int wal_segment_size, const char *waldir, XLogReaderRoutine *routine, void *private_data)
Definition: xlogreader.c:106
bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
Definition: xlogreader.c:2056
#define XLogRecGetDataLen(decoder)
Definition: xlogreader.h:416
#define XLogRecGetInfo(decoder)
Definition: xlogreader.h:410
#define XLogRecBlockImageApply(decoder, block_id)
Definition: xlogreader.h:425
#define XLogRecGetRmid(decoder)
Definition: xlogreader.h:411
#define XLogRecGetData(decoder)
Definition: xlogreader.h:415
#define XLogRecGetXid(decoder)
Definition: xlogreader.h:412
#define XL_ROUTINE(...)
Definition: xlogreader.h:117
#define XLogRecMaxBlockId(decoder)
Definition: xlogreader.h:418
XLogPageReadResult
Definition: xlogreader.h:350
@ XLREAD_WOULDBLOCK
Definition: xlogreader.h:353
@ XLREAD_SUCCESS
Definition: xlogreader.h:351
@ XLREAD_FAIL
Definition: xlogreader.h:352
#define XLogRecHasBlockImage(decoder, block_id)
Definition: xlogreader.h:423
#define XLogRecGetPrev(decoder)
Definition: xlogreader.h:409
#define XLogRecHasAnyBlockRefs(decoder)
Definition: xlogreader.h:417
#define SizeOfXLogRecordDataHeaderShort
Definition: xlogrecord.h:217
#define XLR_INFO_MASK
Definition: xlogrecord.h:62
#define SizeOfXLogRecord
Definition: xlogrecord.h:55
#define XLR_CHECK_CONSISTENCY
Definition: xlogrecord.h:91
bool reachedConsistency
Definition: xlogrecovery.c:295
bool check_primary_slot_name(char **newval, void **extra, GucSource source)
static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
static XLogRecPtr recoveryStopLSN
Definition: xlogrecovery.c:382
static bool recoveryStopsBefore(XLogReaderState *record)
static TimestampTz recoveryStopTime
Definition: xlogrecovery.c:381
void assign_recovery_target_xid(const char *newval, void *extra)
static bool CheckForStandbyTrigger(void)
int recovery_min_apply_delay
Definition: xlogrecovery.c:94
bool check_recovery_target(char **newval, void **extra, GucSource source)
static bool backupEndRequired
Definition: xlogrecovery.c:284
bool HotStandbyActive(void)
static char * getRecoveryStopReason(void)
void ShutdownWalRecovery(void)
RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal
Definition: xlogrecovery.c:121
int recoveryTargetAction
Definition: xlogrecovery.c:88
static void rm_redo_error_callback(void *arg)
static bool recoveryApplyDelay(XLogReaderState *record)
bool ArchiveRecoveryRequested
Definition: xlogrecovery.c:138
const char * recoveryTargetName
Definition: xlogrecovery.c:92
static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
static void pg_attribute_noreturn() error_multiple_recovery_targets(void)
bool check_recovery_target_timeline(char **newval, void **extra, GucSource source)
static XLogRecPtr minRecoveryPoint
Definition: xlogrecovery.c:279
static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *readBuf)
static XLogRecPtr backupEndPoint
Definition: xlogrecovery.c:283
const struct config_enum_entry recovery_target_action_options[]
Definition: xlogrecovery.c:75
static void validateRecoveryParameters(void)
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI, TimeLineID replayTLI)
static XLogRecord * ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr, TimeLineID replayTLI)
void StartupRequestWalReceiverRestart(void)
bool InArchiveRecovery
Definition: xlogrecovery.c:139
static bool recoveryStopsAfter(XLogReaderState *record)
void RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
char * PrimarySlotName
Definition: xlogrecovery.c:98
static TimeLineID curFileTLI
Definition: xlogrecovery.c:125
static char recoveryStopName[MAXFNAMELEN]
Definition: xlogrecovery.c:383
static void CheckRecoveryConsistency(void)
static bool pendingWalRcvRestart
Definition: xlogrecovery.c:249
void PerformWalRecovery(void)
static XLogSource XLogReceiptSource
Definition: xlogrecovery.c:260
bool CheckPromoteSignal(void)
struct XLogPageReadPrivate XLogPageReadPrivate
static bool recoveryStopAfter
Definition: xlogrecovery.c:384
static const char *const xlogSourceNames[]
Definition: xlogrecovery.c:219
static TimeLineID RedoStartTLI
Definition: xlogrecovery.c:171
char * recoveryRestoreCommand
Definition: xlogrecovery.c:83
static void verifyBackupPageConsistency(XLogReaderState *record)
static int XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
void assign_recovery_target(const char *newval, void *extra)
void SetRecoveryPause(bool recoveryPause)
EndOfWalRecoveryInfo * FinishWalRecovery(void)
static bool lastSourceFailed
Definition: xlogrecovery.c:248
char * archiveCleanupCommand
Definition: xlogrecovery.c:85
XLogRecPtr GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
static TimeLineID receiveTLI
Definition: xlogrecovery.c:264
void WakeupRecovery(void)
void xlog_outdesc(StringInfo buf, XLogReaderState *record)
static bool LocalPromoteIsTriggered
Definition: xlogrecovery.c:183
bool PromoteIsTriggered(void)
TimestampTz GetCurrentChunkReplayStartTime(void)
static void ConfirmRecoveryPaused(void)
static void readRecoverySignalFile(void)
static XLogRecPtr missingContrecPtr
Definition: xlogrecovery.c:374
static XLogRecoveryCtlData * XLogRecoveryCtl
Definition: xlogrecovery.c:365
static uint32 readOff
Definition: xlogrecovery.c:233
static bool standby_signal_file_found
Definition: xlogrecovery.c:151
char * recovery_target_time_string
Definition: xlogrecovery.c:90
bool StandbyMode
Definition: xlogrecovery.c:148
static int readFile
Definition: xlogrecovery.c:231
static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, bool fetching_ckpt, XLogRecPtr tliRecPtr, TimeLineID replayTLI, XLogRecPtr replayLSN, bool nonblocking)
XLogRecPtr recoveryTargetLSN
Definition: xlogrecovery.c:93
RecoveryTargetType recoveryTarget
Definition: xlogrecovery.c:86
static bool read_tablespace_map(List **tablespaces)
static bool doRequestWalReceiverReply
Definition: xlogrecovery.c:186
static bool read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, bool *backupEndRequired, bool *backupFromStandby)
static int XLogFileRead(XLogSegNo segno, TimeLineID tli, XLogSource source, bool notfoundOk)
static XLogSource currentSource
Definition: xlogrecovery.c:247
XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI)
void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
static List * expectedTLEs
Definition: xlogrecovery.c:124
static XLogSegNo readSegNo
Definition: xlogrecovery.c:232
void assign_recovery_target_name(const char *newval, void *extra)
static XLogRecPtr abortedRecPtr
Definition: xlogrecovery.c:373
static char * primary_image_masked
Definition: xlogrecovery.c:299
static TimeLineID minRecoveryPointTLI
Definition: xlogrecovery.c:280
static XLogRecord * ReadRecord(XLogPrefetcher *xlogprefetcher, int emode, bool fetching_ckpt, TimeLineID replayTLI)
void assign_recovery_target_time(const char *newval, void *extra)
static void SetCurrentChunkStartTime(TimestampTz xtime)
static XLogRecPtr CheckPointLoc
Definition: xlogrecovery.c:168
bool check_recovery_target_xid(char **newval, void **extra, GucSource source)
static bool LocalHotStandbyActive
Definition: xlogrecovery.c:177
struct XLogRecoveryCtlData XLogRecoveryCtlData
static bool HotStandbyActiveInReplay(void)
static bool InRedo
Definition: xlogrecovery.c:204
static TransactionId recoveryStopXid
Definition: xlogrecovery.c:380
bool check_recovery_target_time(char **newval, void **extra, GucSource source)
static XLogSource readSource
Definition: xlogrecovery.c:235
static void SetPromoteIsTriggered(void)
#define RECOVERY_COMMAND_FILE
Definition: xlogrecovery.c:69
TransactionId recoveryTargetXid
Definition: xlogrecovery.c:89
XLogSource
Definition: xlogrecovery.c:211
@ XLOG_FROM_PG_WAL
Definition: xlogrecovery.c:214
@ XLOG_FROM_STREAM
Definition: xlogrecovery.c:215
@ XLOG_FROM_ARCHIVE
Definition: xlogrecovery.c:213
@ XLOG_FROM_ANY
Definition: xlogrecovery.c:212
TimeLineID recoveryTargetTLIRequested
Definition: xlogrecovery.c:122
void InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
Definition: xlogrecovery.c:513
static void xlog_block_info(StringInfo buf, XLogReaderState *record)
static TimestampTz XLogReceiptTime
Definition: xlogrecovery.c:259
static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
Size XLogRecoveryShmemSize(void)
Definition: xlogrecovery.c:448
static char * replay_image_masked
Definition: xlogrecovery.c:298
bool wal_receiver_create_temp_slot
Definition: xlogrecovery.c:99
static void CheckTablespaceDirectory(void)
char * recoveryEndCommand
Definition: xlogrecovery.c:84
RecoveryPauseState GetRecoveryPauseState(void)
TimeLineID recoveryTargetTLI
Definition: xlogrecovery.c:123
static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
void assign_recovery_target_lsn(const char *newval, void *extra)
bool check_recovery_target_lsn(char **newval, void **extra, GucSource source)
static XLogRecPtr RedoStartLSN
Definition: xlogrecovery.c:170
static XLogRecPtr flushedUpto
Definition: xlogrecovery.c:263
void XLogRecoveryShmemInit(void)
Definition: xlogrecovery.c:459
static void recoveryPausesHere(bool endOfRecovery)
static uint32 readLen
Definition: xlogrecovery.c:234
static void EnableStandbyMode(void)
Definition: xlogrecovery.c:479
#define RECOVERY_COMMAND_DONE
Definition: xlogrecovery.c:70
static bool recovery_signal_file_found
Definition: xlogrecovery.c:152
TimestampTz recoveryTargetTime
Definition: xlogrecovery.c:91
TimestampTz GetLatestXTime(void)
char * PrimaryConnInfo
Definition: xlogrecovery.c:97
void XLogRequestWalReceiverReply(void)
static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
static XLogPrefetcher * xlogprefetcher
Definition: xlogrecovery.c:192
static bool StandbyModeRequested
Definition: xlogrecovery.c:147
bool check_recovery_target_name(char **newval, void **extra, GucSource source)
bool recoveryTargetInclusive
Definition: xlogrecovery.c:87
static XLogReaderState * xlogreader
Definition: xlogrecovery.c:189
void RemovePromoteSignalFiles(void)
void assign_recovery_target_timeline(const char *newval, void *extra)
static XLogRecPtr backupStartPoint
Definition: xlogrecovery.c:282
static void SetLatestXTime(TimestampTz xtime)
static TimeLineID CheckPointTLI
Definition: xlogrecovery.c:169
RecoveryTargetType
Definition: xlogrecovery.h:24
@ RECOVERY_TARGET_IMMEDIATE
Definition: xlogrecovery.h:30
@ RECOVERY_TARGET_TIME
Definition: xlogrecovery.h:27
@ RECOVERY_TARGET_UNSET
Definition: xlogrecovery.h:25
@ RECOVERY_TARGET_XID
Definition: xlogrecovery.h:26
@ RECOVERY_TARGET_LSN
Definition: xlogrecovery.h:29
@ RECOVERY_TARGET_NAME
Definition: xlogrecovery.h:28
RecoveryTargetTimeLineGoal
Definition: xlogrecovery.h:37
@ RECOVERY_TARGET_TIMELINE_NUMERIC
Definition: xlogrecovery.h:40
@ RECOVERY_TARGET_TIMELINE_CONTROLFILE
Definition: xlogrecovery.h:38
@ RECOVERY_TARGET_TIMELINE_LATEST
Definition: xlogrecovery.h:39
RecoveryPauseState
Definition: xlogrecovery.h:45
@ RECOVERY_PAUSED
Definition: xlogrecovery.h:48
@ RECOVERY_NOT_PAUSED
Definition: xlogrecovery.h:46
@ RECOVERY_PAUSE_REQUESTED
Definition: xlogrecovery.h:47
void wal_segment_close(XLogReaderState *state)
Definition: xlogutils.c:842
Buffer XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum, BlockNumber blkno, ReadBufferMode mode, Buffer recent_buffer)
Definition: xlogutils.c:471
HotStandbyState standbyState
Definition: xlogutils.c:53
bool InRecovery
Definition: xlogutils.c:50
void XLogCheckInvalidPages(void)
Definition: xlogutils.c:245
@ STANDBY_SNAPSHOT_READY
Definition: xlogutils.h:55
@ STANDBY_INITIALIZED
Definition: xlogutils.h:53