PostgreSQL Source Code  git master
xlogrecovery.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * xlogrecovery.c
4  * Functions for WAL recovery, standby mode
5  *
6  * This source file contains functions controlling WAL recovery.
7  * InitWalRecovery() initializes the system for crash or archive recovery,
8  * or standby mode, depending on configuration options and the state of
9  * the control file and possible backup label file. PerformWalRecovery()
10  * performs the actual WAL replay, calling the rmgr-specific redo routines.
11  * EndWalRecovery() performs end-of-recovery checks and cleanup actions,
12  * and prepares information needed to initialize the WAL for writes. In
13  * addition to these three main functions, there are a bunch of functions
14  * for interrogating recovery state and controlling the recovery process.
15  *
16  *
17  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
18  * Portions Copyright (c) 1994, Regents of the University of California
19  *
20  * src/backend/access/transam/xlogrecovery.c
21  *
22  *-------------------------------------------------------------------------
23  */
24 
25 #include "postgres.h"
26 
27 #include <ctype.h>
28 #include <math.h>
29 #include <time.h>
30 #include <sys/stat.h>
31 #include <sys/time.h>
32 #include <unistd.h>
33 
34 #include "access/timeline.h"
35 #include "access/transam.h"
36 #include "access/xact.h"
37 #include "access/xlog_internal.h"
38 #include "access/xlogarchive.h"
39 #include "access/xlogprefetcher.h"
40 #include "access/xlogreader.h"
41 #include "access/xlogrecovery.h"
42 #include "access/xlogutils.h"
43 #include "catalog/pg_control.h"
44 #include "commands/tablespace.h"
45 #include "miscadmin.h"
46 #include "pgstat.h"
47 #include "postmaster/bgwriter.h"
48 #include "postmaster/startup.h"
49 #include "replication/basebackup.h"
51 #include "storage/fd.h"
52 #include "storage/ipc.h"
53 #include "storage/latch.h"
54 #include "storage/pmsignal.h"
55 #include "storage/proc.h"
56 #include "storage/procarray.h"
57 #include "storage/spin.h"
58 #include "utils/builtins.h"
59 #include "utils/guc.h"
60 #include "utils/ps_status.h"
61 #include "utils/pg_rusage.h"
62 
63 /* Unsupported old recovery command file names (relative to $PGDATA) */
64 #define RECOVERY_COMMAND_FILE "recovery.conf"
65 #define RECOVERY_COMMAND_DONE "recovery.done"
66 
67 /*
68  * GUC support
69  */
71  {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
72  {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
73  {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
74  {NULL, 0, false}
75 };
76 
77 /* options formerly taken from recovery.conf for archive recovery */
79 char *recoveryEndCommand = NULL;
80 char *archiveCleanupCommand = NULL;
87 const char *recoveryTargetName;
90 
91 /* options formerly taken from recovery.conf for XLOG streaming */
92 char *PrimaryConnInfo = NULL;
93 char *PrimarySlotName = NULL;
94 char *PromoteTriggerFile = NULL;
96 
97 /*
98  * recoveryTargetTimeLineGoal: what the user requested, if any
99  *
100  * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
101  *
102  * recoveryTargetTLI: the currently understood target timeline; may change during recovery
103  *
104  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
105  * the timelines of its known parents, newest first (so recoveryTargetTLI is
106  * always the first list member). Only these TLIs are expected to be seen in
107  * the WAL segments we read, and indeed only these TLIs will be considered as
108  * candidate WAL files to open at all.
109  *
110  * curFileTLI: the TLI appearing in the name of the current input WAL file.
111  * (This is not necessarily the same as the timeline from which we are
112  * replaying WAL, which StartupXLOG calls replayTLI, because we could be
113  * scanning data that was copied from an ancestor timeline when the current
114  * file was created.) During a sequential scan we do not allow this value
115  * to decrease.
116  */
122 
123 /*
124  * When ArchiveRecoveryRequested is set, archive recovery was requested,
125  * ie. signal files were present. When InArchiveRecovery is set, we are
126  * currently recovering using offline XLOG archives. These variables are only
127  * valid in the startup process.
128  *
129  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
130  * currently performing crash recovery using only XLOG files in pg_wal, but
131  * will switch to using offline XLOG archives as soon as we reach the end of
132  * WAL in pg_wal.
133 */
135 bool InArchiveRecovery = false;
136 
137 /*
138  * When StandbyModeRequested is set, standby mode was requested, i.e.
139  * standby.signal file was present. When StandbyMode is set, we are currently
140  * in standby mode. These variables are only valid in the startup process.
141  * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
142  */
143 static bool StandbyModeRequested = false;
144 bool StandbyMode = false;
145 
146 /* was a signal file present at startup? */
147 static bool standby_signal_file_found = false;
148 static bool recovery_signal_file_found = false;
149 
150 /*
151  * CheckPointLoc is the position of the checkpoint record that determines
152  * where to start the replay. It comes from the backup label file or the
153  * control file.
154  *
155  * RedoStartLSN is the checkpoint's REDO location, also from the backup label
156  * file or the control file. In standby mode, XLOG streaming usually starts
157  * from the position where an invalid record was found. But if we fail to
158  * read even the initial checkpoint record, we use the REDO location instead
159  * of the checkpoint location as the start position of XLOG streaming.
160  * Otherwise we would have to jump backwards to the REDO location after
161  * reading the checkpoint record, because the REDO record can precede the
162  * checkpoint record.
163  */
168 
169 /*
170  * Local copy of SharedHotStandbyActive variable. False actually means "not
171  * known, need to check the shared state".
172  */
173 static bool LocalHotStandbyActive = false;
174 
175 /*
176  * Local copy of SharedPromoteIsTriggered variable. False actually means "not
177  * known, need to check the shared state".
178  */
179 static bool LocalPromoteIsTriggered = false;
180 
181 /* Has the recovery code requested a walreceiver wakeup? */
183 
184 /* XLogReader object used to parse the WAL records */
186 
187 /* XLogPrefetcher object used to consume WAL records with read-ahead */
189 
190 /* Parameters passed down from ReadRecord to the XLogPageRead callback. */
191 typedef struct XLogPageReadPrivate
192 {
193  int emode;
194  bool fetching_ckpt; /* are we fetching a checkpoint record? */
198 
199 /* flag to tell XLogPageRead that we have started replaying */
200 static bool InRedo = false;
201 
202 /*
203  * Codes indicating where we got a WAL file from during recovery, or where
204  * to attempt to get one.
205  */
206 typedef enum
207 {
208  XLOG_FROM_ANY = 0, /* request to read WAL from any source */
209  XLOG_FROM_ARCHIVE, /* restored using restore_command */
210  XLOG_FROM_PG_WAL, /* existing file in pg_wal */
211  XLOG_FROM_STREAM /* streamed from primary */
213 
214 /* human-readable names for XLogSources, for debugging output */
215 static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
216 
217 /*
218  * readFile is -1 or a kernel FD for the log file segment that's currently
219  * open for reading. readSegNo identifies the segment. readOff is the offset
220  * of the page just read, readLen indicates how much of it has been read into
221  * readBuf, and readSource indicates where we got the currently open file from.
222  *
223  * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
224  * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
225  * worthwhile, since the XLOG is not read by general-purpose sessions.
226  */
227 static int readFile = -1;
228 static XLogSegNo readSegNo = 0;
229 static uint32 readOff = 0;
230 static uint32 readLen = 0;
232 
233 /*
234  * Keeps track of which source we're currently reading from. This is
235  * different from readSource in that this is always set, even when we don't
236  * currently have a WAL file open. If lastSourceFailed is set, our last
237  * attempt to read from currentSource failed, and we should try another source
238  * next.
239  *
240  * pendingWalRcvRestart is set when a config change occurs that requires a
241  * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
242  */
244 static bool lastSourceFailed = false;
245 static bool pendingWalRcvRestart = false;
246 
247 /*
248  * These variables track when we last obtained some WAL data to process,
249  * and where we got it from. (XLogReceiptSource is initially the same as
250  * readSource, but readSource gets reset to zero when we don't have data
251  * to process right now. It is also different from currentSource, which
252  * also changes when we try to read from a source and fail, while
253  * XLogReceiptSource tracks where we last successfully read some WAL.)
254  */
257 
258 /* Local copy of WalRcv->flushedUpto */
261 
262 /*
263  * Copy of minRecoveryPoint and backupEndPoint from the control file.
264  *
265  * In order to reach consistency, we must replay the WAL up to
266  * minRecoveryPoint. If backupEndRequired is true, we must also reach
267  * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
268  * to backupStartPoint.
269  *
270  * Note: In archive recovery, after consistency has been reached, the
271  * functions in xlog.c will start updating minRecoveryPoint in the control
272  * file. But this copy of minRecoveryPoint variable reflects the value at the
273  * beginning of recovery, and is *not* updated after consistency is reached.
274  */
277 
280 static bool backupEndRequired = false;
281 
282 /*
283  * Have we reached a consistent database state? In crash recovery, we have
284  * to replay all the WAL, so reachedConsistency is never set. During archive
285  * recovery, the database is consistent once minRecoveryPoint is reached.
286  *
287  * Consistent state means that the system is internally consistent, all
288  * the WAL has been replayed up to a certain point, and importantly, there
289  * is no trace of later actions on disk.
290  */
291 bool reachedConsistency = false;
292 
293 /* Buffers dedicated to consistency checks of size BLCKSZ */
294 static char *replay_image_masked = NULL;
295 static char *primary_image_masked = NULL;
296 
297 
298 /*
299  * Shared-memory state for WAL recovery.
300  */
301 typedef struct XLogRecoveryCtlData
302 {
303  /*
304  * SharedHotStandbyActive indicates if we allow hot standby queries to be
305  * run. Protected by info_lck.
306  */
308 
309  /*
310  * SharedPromoteIsTriggered indicates if a standby promotion has been
311  * triggered. Protected by info_lck.
312  */
314 
315  /*
316  * recoveryWakeupLatch is used to wake up the startup process to continue
317  * WAL replay, if it is waiting for WAL to arrive or failover trigger file
318  * to appear.
319  *
320  * Note that the startup process also uses another latch, its procLatch,
321  * to wait for recovery conflict. We could get rid of recoveryWakeupLatch
322  * for signaling the startup process in favor of using its procLatch, which
323  * comports better with possible generic signal handlers using that latch.
324  * But we should not do that because the startup process doesn't assume
325  * that it's woken up by the walreceiver process or the SIGHUP signal handler
326  * while it's waiting for recovery conflict. The separate latches,
327  * recoveryWakeupLatch and procLatch, should be used for inter-process
328  * communication for WAL replay and recovery conflict, respectively.
329  */
331 
332  /*
333  * Last record successfully replayed.
334  */
335  XLogRecPtr lastReplayedReadRecPtr; /* start position */
336  XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */
337  TimeLineID lastReplayedTLI; /* timeline */
338 
339  /*
340  * When we're currently replaying a record, ie. in a redo function,
341  * replayEndRecPtr points to the end+1 of the record being replayed,
342  * otherwise it's equal to lastReplayedEndRecPtr.
343  */
346  /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
348 
349  /*
350  * timestamp of when we started replaying the current chunk of WAL data,
351  * only relevant for replication or archive recovery
352  */
354  /* Recovery pause state */
357 
358  slock_t info_lck; /* locks shared variables shown above */
360 
362 
363 /*
364  * abortedRecPtr is the start pointer of a broken record at end of WAL when
365  * recovery completes; missingContrecPtr is the location of the first
366  * contrecord that went missing. See CreateOverwriteContrecordRecord for
367  * details.
368  */
371 
372 /*
373  * if recoveryStopsBefore/After returns true, it saves information of the stop
374  * point here
375  */
380 static bool recoveryStopAfter;
381 
382 /* prototypes for local functions */
383 static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
384 
385 static void readRecoverySignalFile(void);
386 static void validateRecoveryParameters(void);
387 static bool read_backup_label(XLogRecPtr *checkPointLoc,
388  TimeLineID *backupLabelTLI,
389  bool *backupEndRequired, bool *backupFromStandby);
390 static bool read_tablespace_map(List **tablespaces);
391 
392 static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
393 static void CheckRecoveryConsistency(void);
394 static void rm_redo_error_callback(void *arg);
395 #ifdef WAL_DEBUG
396 static void xlog_outrec(StringInfo buf, XLogReaderState *record);
397 #endif
398 static void xlog_block_info(StringInfo buf, XLogReaderState *record);
399 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
400  TimeLineID prevTLI, TimeLineID replayTLI);
401 static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
402 static void verifyBackupPageConsistency(XLogReaderState *record);
403 
404 static bool recoveryStopsBefore(XLogReaderState *record);
405 static bool recoveryStopsAfter(XLogReaderState *record);
406 static char *getRecoveryStopReason(void);
407 static void recoveryPausesHere(bool endOfRecovery);
408 static bool recoveryApplyDelay(XLogReaderState *record);
409 static void ConfirmRecoveryPaused(void);
410 
412  int emode, bool fetching_ckpt,
413  TimeLineID replayTLI);
414 
415 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
416  int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
418  bool randAccess,
419  bool fetching_ckpt,
420  XLogRecPtr tliRecPtr,
421  TimeLineID replayTLI,
422  XLogRecPtr replayLSN,
423  bool nonblocking);
424 static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
426  int whichChkpt, bool report, TimeLineID replayTLI);
427 static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
428 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
429  XLogSource source, bool notfoundOk);
430 static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source);
431 
432 static bool CheckForStandbyTrigger(void);
433 static void SetPromoteIsTriggered(void);
434 static bool HotStandbyActiveInReplay(void);
435 
436 static void SetCurrentChunkStartTime(TimestampTz xtime);
437 static void SetLatestXTime(TimestampTz xtime);
438 
439 /*
440  * Initialization of shared memory for WAL recovery
441  */
442 Size
444 {
445  Size size;
446 
447  /* XLogRecoveryCtl */
448  size = sizeof(XLogRecoveryCtlData);
449 
450  return size;
451 }
452 
453 void
455 {
456  bool found;
457 
459  ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
460  if (found)
461  return;
462  memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
463 
467 }
468 
469 /*
470  * Prepare the system for WAL recovery, if needed.
471  *
472  * This is called by StartupXLOG() which coordinates the server startup
473  * sequence. This function analyzes the control file and the backup label
474  * file, if any, and figures out whether we need to perform crash recovery or
475  * archive recovery, and how far we need to replay the WAL to reach a
476  * consistent state.
477  *
478  * This doesn't yet change the on-disk state, except for creating the symlinks
479  * from table space map file if any, and for fetching WAL files needed to find
480  * the checkpoint record. On entry, the caller has already read the control
481  * file into memory, and passes it as argument. This function updates it to
482  * reflect the recovery state, and the caller is expected to write it back to
483  * disk after initializing other subsystems, but before calling
484  * PerformWalRecovery().
485  *
486  * This initializes some global variables like ArchiveRecoveryRequested,
487  * StandbyModeRequested and InRecovery.
488  */
489 void
491  bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
492 {
493  XLogPageReadPrivate *private;
494  struct stat st;
495  bool wasShutdown;
496  XLogRecord *record;
497  DBState dbstate_at_startup;
498  bool haveTblspcMap = false;
499  bool haveBackupLabel = false;
500  CheckPoint checkPoint;
501  bool backupFromStandby = false;
502 
503  dbstate_at_startup = ControlFile->state;
504 
505  /*
506  * Initialize on the assumption we want to recover to the latest timeline
507  * that's active according to pg_control.
508  */
512  else
514 
515  /*
516  * Check for signal files, and if so set up state for offline recovery
517  */
520 
522  {
524  ereport(LOG,
525  (errmsg("entering standby mode")));
527  ereport(LOG,
528  (errmsg("starting point-in-time recovery to XID %u",
531  ereport(LOG,
532  (errmsg("starting point-in-time recovery to %s",
535  ereport(LOG,
536  (errmsg("starting point-in-time recovery to \"%s\"",
539  ereport(LOG,
540  (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
543  ereport(LOG,
544  (errmsg("starting point-in-time recovery to earliest consistent point")));
545  else
546  ereport(LOG,
547  (errmsg("starting archive recovery")));
548  }
549 
550  /*
551  * Take ownership of the wakeup latch if we're going to sleep during
552  * recovery.
553  */
556 
557  private = palloc0(sizeof(XLogPageReadPrivate));
558  xlogreader =
560  XL_ROUTINE(.page_read = &XLogPageRead,
561  .segment_open = NULL,
562  .segment_close = wal_segment_close),
563  private);
564  if (!xlogreader)
565  ereport(ERROR,
566  (errcode(ERRCODE_OUT_OF_MEMORY),
567  errmsg("out of memory"),
568  errdetail("Failed while allocating a WAL reading processor.")));
570 
571  /*
572  * Set the WAL decode buffer size. This limits how far ahead we can read
573  * in the WAL.
574  */
576 
577  /* Create a WAL prefetcher. */
579 
580  /*
581  * Allocate two page buffers dedicated to WAL consistency checks. We do
582  * it this way, rather than just making static arrays, for two reasons:
583  * (1) no need to waste the storage in most instantiations of the backend;
584  * (2) a static char array isn't guaranteed to have any particular
585  * alignment, whereas palloc() will provide MAXALIGN'd storage.
586  */
587  replay_image_masked = (char *) palloc(BLCKSZ);
588  primary_image_masked = (char *) palloc(BLCKSZ);
589 
591  &backupFromStandby))
592  {
593  List *tablespaces = NIL;
594 
595  /*
596  * Archive recovery was requested, and thanks to the backup label
597  * file, we know how far we need to replay to reach consistency. Enter
598  * archive recovery directly.
599  */
600  InArchiveRecovery = true;
602  StandbyMode = true;
603 
604  /*
605  * When a backup_label file is present, we want to roll forward from
606  * the checkpoint it identifies, rather than using pg_control.
607  */
609  CheckPointTLI);
610  if (record != NULL)
611  {
612  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
613  wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
614  ereport(DEBUG1,
615  (errmsg_internal("checkpoint record is at %X/%X",
617  InRecovery = true; /* force recovery even if SHUTDOWNED */
618 
619  /*
620  * Make sure that REDO location exists. This may not be the case
621  * if there was a crash during an online backup, which left a
622  * backup_label around that references a WAL segment that's
623  * already been archived.
624  */
625  if (checkPoint.redo < CheckPointLoc)
626  {
628  if (!ReadRecord(xlogprefetcher, LOG, false,
629  checkPoint.ThisTimeLineID))
630  ereport(FATAL,
631  (errmsg("could not find redo location referenced by checkpoint record"),
632  errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
633  "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
634  "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
635  DataDir, DataDir, DataDir)));
636  }
637  }
638  else
639  {
640  ereport(FATAL,
641  (errmsg("could not locate required checkpoint record"),
642  errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
643  "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
644  "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
645  DataDir, DataDir, DataDir)));
646  wasShutdown = false; /* keep compiler quiet */
647  }
648 
649  /* Read the tablespace_map file if present and create symlinks. */
650  if (read_tablespace_map(&tablespaces))
651  {
652  ListCell *lc;
653 
654  foreach(lc, tablespaces)
655  {
656  tablespaceinfo *ti = lfirst(lc);
657  char *linkloc;
658 
659  linkloc = psprintf("pg_tblspc/%s", ti->oid);
660 
661  /*
662  * Remove the existing symlink, if any, and create the symlink
663  * under PGDATA.
664  */
665  remove_tablespace_symlink(linkloc);
666 
667  if (symlink(ti->path, linkloc) < 0)
668  ereport(ERROR,
670  errmsg("could not create symbolic link \"%s\": %m",
671  linkloc)));
672 
673  pfree(ti->oid);
674  pfree(ti->path);
675  pfree(ti);
676  }
677 
678  /* tell the caller to delete it later */
679  haveTblspcMap = true;
680  }
681 
682  /* tell the caller to delete it later */
683  haveBackupLabel = true;
684  }
685  else
686  {
687  /*
688  * If tablespace_map file is present without backup_label file, there
689  * is no use of such file. There is no harm in retaining it, but it
690  * is better to get rid of the map file so that we don't have any
691  * redundant file in data directory and it will avoid any sort of
692  * confusion. It seems prudent though to just rename the file out of
693  * the way rather than delete it completely, also we ignore any error
694  * that occurs in rename operation as even if map file is present
695  * without backup_label file, it is harmless.
696  */
697  if (stat(TABLESPACE_MAP, &st) == 0)
698  {
699  unlink(TABLESPACE_MAP_OLD);
701  ereport(LOG,
702  (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
704  errdetail("File \"%s\" was renamed to \"%s\".",
706  else
707  ereport(LOG,
708  (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
710  errdetail("Could not rename file \"%s\" to \"%s\": %m.",
712  }
713 
714  /*
715  * It's possible that archive recovery was requested, but we don't
716  * know how far we need to replay the WAL before we reach consistency.
717  * This can happen for example if a base backup is taken from a
718  * running server using an atomic filesystem snapshot, without calling
719  * pg_start/stop_backup. Or if you just kill a running primary server
720  * and put it into archive recovery by creating a recovery signal
721  * file.
722  *
723  * Our strategy in that case is to perform crash recovery first,
724  * replaying all the WAL present in pg_wal, and only enter archive
725  * recovery after that.
726  *
727  * But usually we already know how far we need to replay the WAL (up
728  * to minRecoveryPoint, up to backupEndPoint, or until we see an
729  * end-of-backup record), and we can enter archive recovery directly.
730  */
736  {
737  InArchiveRecovery = true;
739  StandbyMode = true;
740  }
741 
742  /* Get the last valid checkpoint record. */
748  CheckPointTLI);
749  if (record != NULL)
750  {
751  ereport(DEBUG1,
752  (errmsg_internal("checkpoint record is at %X/%X",
754  }
755  else
756  {
757  /*
758  * We used to attempt to go back to a secondary checkpoint record
759  * here, but only when not in standby mode. We now just fail if we
760  * can't read the last checkpoint because this allows us to
761  * simplify processing around checkpoints.
762  */
763  ereport(PANIC,
764  (errmsg("could not locate a valid checkpoint record")));
765  }
766  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
767  wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
768  }
769 
770  /*
771  * If the location of the checkpoint record is not on the expected
772  * timeline in the history of the requested timeline, we cannot proceed:
773  * the backup is not part of the history of the requested timeline.
774  */
775  Assert(expectedTLEs); /* was initialized by reading checkpoint
776  * record */
779  {
780  XLogRecPtr switchpoint;
781 
782  /*
783  * tliSwitchPoint will throw an error if the checkpoint's timeline is
784  * not in expectedTLEs at all.
785  */
787  ereport(FATAL,
788  (errmsg("requested timeline %u is not a child of this server's history",
790  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
793  LSN_FORMAT_ARGS(switchpoint))));
794  }
795 
796  /*
797  * The min recovery point should be part of the requested timeline's
798  * history, too.
799  */
803  ereport(FATAL,
804  (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
808 
809  ereport(DEBUG1,
810  (errmsg_internal("redo record is at %X/%X; shutdown %s",
811  LSN_FORMAT_ARGS(checkPoint.redo),
812  wasShutdown ? "true" : "false")));
813  ereport(DEBUG1,
814  (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
815  U64FromFullTransactionId(checkPoint.nextXid),
816  checkPoint.nextOid)));
817  ereport(DEBUG1,
818  (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
819  checkPoint.nextMulti, checkPoint.nextMultiOffset)));
820  ereport(DEBUG1,
821  (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
822  checkPoint.oldestXid, checkPoint.oldestXidDB)));
823  ereport(DEBUG1,
824  (errmsg_internal("oldest MultiXactId: %u, in database %u",
825  checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
826  ereport(DEBUG1,
827  (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
828  checkPoint.oldestCommitTsXid,
829  checkPoint.newestCommitTsXid)));
831  ereport(PANIC,
832  (errmsg("invalid next transaction ID")));
833 
834  /* sanity check */
835  if (checkPoint.redo > CheckPointLoc)
836  ereport(PANIC,
837  (errmsg("invalid redo in checkpoint record")));
838 
839  /*
840  * Check whether we need to force recovery from WAL. If it appears to
841  * have been a clean shutdown and we did not have a recovery signal file,
842  * then assume no recovery needed.
843  */
844  if (checkPoint.redo < CheckPointLoc)
845  {
846  if (wasShutdown)
847  ereport(PANIC,
848  (errmsg("invalid redo record in shutdown checkpoint")));
849  InRecovery = true;
850  }
851  else if (ControlFile->state != DB_SHUTDOWNED)
852  InRecovery = true;
853  else if (ArchiveRecoveryRequested)
854  {
855  /* force recovery due to presence of recovery signal file */
856  InRecovery = true;
857  }
858 
859  /*
860  * If recovery is needed, update our in-memory copy of pg_control to show
861  * that we are recovering and to show the selected checkpoint as the place
862  * we are starting from. We also mark pg_control with any minimum recovery
863  * stop point obtained from a backup history file.
864  *
865  * We don't write the changes to disk yet, though. Only do that after
866  * initializing various subsystems.
867  */
868  if (InRecovery)
869  {
870  if (InArchiveRecovery)
871  {
873  }
874  else
875  {
876  ereport(LOG,
877  (errmsg("database system was not properly shut down; "
878  "automatic recovery in progress")));
880  ereport(LOG,
881  (errmsg("crash recovery starts in timeline %u "
882  "and has target timeline %u",
886  }
888  ControlFile->checkPointCopy = checkPoint;
889  if (InArchiveRecovery)
890  {
891  /* initialize minRecoveryPoint if not set yet */
892  if (ControlFile->minRecoveryPoint < checkPoint.redo)
893  {
894  ControlFile->minRecoveryPoint = checkPoint.redo;
896  }
897  }
898 
899  /*
900  * Set backupStartPoint if we're starting recovery from a base backup.
901  *
902  * Also set backupEndPoint and use minRecoveryPoint as the backup end
903  * location if we're starting recovery from a base backup which was
904  * taken from a standby. In this case, the database system status in
905  * pg_control must indicate that the database was already in recovery.
906  * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
907  * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
908  * before reaching this point; e.g. because restore_command or
909  * primary_conninfo were faulty.
910  *
911  * Any other state indicates that the backup somehow became corrupted
912  * and we can't sensibly continue with recovery.
913  */
914  if (haveBackupLabel)
915  {
916  ControlFile->backupStartPoint = checkPoint.redo;
918 
919  if (backupFromStandby)
920  {
921  if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
922  dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
923  ereport(FATAL,
924  (errmsg("backup_label contains data inconsistent with control file"),
925  errhint("This means that the backup is corrupted and you will "
926  "have to use another backup for recovery.")));
928  }
929  }
930  }
931 
932  /* remember these, so that we know when we have reached consistency */
936  if (InArchiveRecovery)
937  {
940  }
941  else
942  {
945  }
946 
947  /*
948  * Start recovery assuming that the final record isn't lost.
949  */
952 
953  *wasShutdown_ptr = wasShutdown;
954  *haveBackupLabel_ptr = haveBackupLabel;
955  *haveTblspcMap_ptr = haveTblspcMap;
956 }
957 
958 /*
959  * See if there are any recovery signal files and if so, set state for
960  * recovery.
961  *
962  * See if there is a recovery command file (recovery.conf), and if so
963  * report a FATAL error, since as of PG12 we no longer recognize that.
964  */
965 static void
967 {
968  struct stat stat_buf;
969 
971  return;
972 
973  /*
974  * Check for old recovery API file: recovery.conf
975  */
976  if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
977  ereport(FATAL,
979  errmsg("using recovery command file \"%s\" is not supported",
981 
982  /*
983  * Remove unused .done file, if present. Ignore if absent.
984  */
985  unlink(RECOVERY_COMMAND_DONE);
986 
987  /*
988  * Check for recovery signal files and if found, fsync them since they
989  * represent server state information. We don't sweat too much about the
990  * possibility of fsync failure, however.
991  *
992  * If present, standby signal file takes precedence. If neither is present
993  * then we won't enter archive recovery.
994  */
995  if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
996  {
997  int fd;
998 
1000  S_IRUSR | S_IWUSR);
1001  if (fd >= 0)
1002  {
1003  (void) pg_fsync(fd);
1004  close(fd);
1005  }
1007  }
1008  else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1009  {
1010  int fd;
1011 
1013  S_IRUSR | S_IWUSR);
1014  if (fd >= 0)
1015  {
1016  (void) pg_fsync(fd);
1017  close(fd);
1018  }
1020  }
1021 
1022  StandbyModeRequested = false;
1023  ArchiveRecoveryRequested = false;
1025  {
1026  StandbyModeRequested = true;
1027  ArchiveRecoveryRequested = true;
1028  }
1029  else if (recovery_signal_file_found)
1030  {
1031  StandbyModeRequested = false;
1032  ArchiveRecoveryRequested = true;
1033  }
1034  else
1035  return;
1036 
1037  /*
1038  * We don't support standby mode in standalone backends; that requires
1039  * other processes such as the WAL receiver to be alive.
1040  */
1042  ereport(FATAL,
1043  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1044  errmsg("standby mode is not supported by single-user servers")));
1045 }
1046 
/*
 * Sanity-check the recovery-related settings and settle on the recovery
 * target timeline.
 *
 * As far as the visible code shows:
 *  - in the first branch (standby mode, judging by the message used in the
 *    other branch), having neither primary_conninfo nor restore_command set
 *    only draws a WARNING;
 *  - otherwise an unset or empty restore_command is FATAL;
 *  - an explicitly numbered target timeline other than 1 must have a
 *    timeline history file, else FATAL; on success it becomes
 *    recoveryTargetTLI.
 *
 * NOTE(review): this listing's own numbering skips lines inside this
 * function (e.g. 1047 -> 1049, where the function name belongs;
 * 1049 -> 1051, where the condition guarding the early "return" belongs;
 * and 1077 -> 1081, after the "Override any inconsistent requests"
 * comment).  Restore the omitted lines from the upstream file before
 * building.
 */
1047 static void
1049 {
1051  return;
1052 
1053  /*
1054  * Check for compulsory parameters
1055  */
1057  {
1058  if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1059  (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1060  ereport(WARNING,
1061  (errmsg("specified neither primary_conninfo nor restore_command"),
1062  errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1063  }
1064  else
1065  {
1066  if (recoveryRestoreCommand == NULL ||
1067  strcmp(recoveryRestoreCommand, "") == 0)
1068  ereport(FATAL,
1069  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1070  errmsg("must specify restore_command when standby mode is not enabled")));
1071  }
1072 
1073  /*
1074  * Override any inconsistent requests. Note that this is a change of
1075  * behaviour in 9.5; prior to this we simply ignored a request to pause if
1076  * hot_standby = off, which was surprising behaviour.
1077  */
1081 
1082  /*
1083  * Final parsing of recovery_target_time string; see also
1084  * check_recovery_target_time().
1085  */
1087  {
1091  Int32GetDatum(-1)));
1092  }
1093 
1094  /*
1095  * If user specified recovery_target_timeline, validate it or compute the
1096  * "latest" value. We can't do this until after we've gotten the restore
1097  * command and set InArchiveRecovery, because we need to fetch timeline
1098  * history files from the archive.
1099  */
1101  {
1103 
1104  /* Timeline 1 does not have a history file, all else should */
1105  if (rtli != 1 && !existsTimeLineHistory(rtli))
1106  ereport(FATAL,
1107  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1108  errmsg("recovery target timeline %u does not exist",
1109  rtli)));
1110  recoveryTargetTLI = rtli;
1111  }
1113  {
1114  /* We start the "latest" search from pg_control's timeline */
1116  }
1117  else
1118  {
1119  /*
1120  * else we just use the recoveryTargetTLI as already read from
1121  * ControlFile
1122  */
1124  }
1125 }
1126 
1127 /*
1128  * read_backup_label: check to see if a backup_label file is present
1129  *
1130  * If we see a backup_label during recovery, we assume that we are recovering
1131  * from a backup dump file, and we therefore roll forward from the checkpoint
1132  * identified by the label file, NOT what pg_control says. This avoids the
1133  * problem that pg_control might have been archived one or more checkpoints
1134  * later than the start of the dump, and so if we rely on it as the start
1135  * point, we will fail to restore a consistent database state.
1136  *
1137  * Returns true if a backup_label was found (and fills the checkpoint
1138  * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1139  * returns false if not. If this backup_label came from a streamed backup,
1140  * *backupEndRequired is set to true. If this backup_label was created during
1141  * recovery, *backupFromStandby is set to true.
1142  *
1143  * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1144  * and TLI read from the backup file.
 *
 * Only the START WAL LOCATION and CHECKPOINT LOCATION lines are mandatory;
 * a parse failure on either is reported as FATAL.  BACKUP METHOD,
 * BACKUP FROM, START TIME, LABEL and START TIMELINE are used when they
 * parse and silently skipped otherwise.  Note that *backupLabelTLI is taken
 * from the timeline ID embedded in the WAL file name on the
 * START WAL LOCATION line; START TIMELINE, when present, must agree with
 * that value or the file is rejected with FATAL.
 *
 * All four output arguments are reset before the file is opened, so they
 * hold defined values even when false is returned.  Failing to open the
 * file for any reason other than ENOENT is FATAL, as is a read error or a
 * failure to close the file.
 *
 * NOTE(review): this listing's own numbering skips lines 1176 and 1268,
 * each inside an ereport() call whose parentheses are consequently
 * unbalanced as shown.  Restore them from the upstream file before
 * building.
1145  */
1146 static bool
1147 read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1148  bool *backupEndRequired, bool *backupFromStandby)
1149 {
1150  char startxlogfilename[MAXFNAMELEN];
1151  TimeLineID tli_from_walseg,
1152  tli_from_file;
1153  FILE *lfp;
1154  char ch;
1155  char backuptype[20];
1156  char backupfrom[20];
1157  char backuplabel[MAXPGPATH];
1158  char backuptime[128];
1159  uint32 hi,
1160  lo;
1161 
1162  /* suppress possible uninitialized-variable warnings */
1163  *checkPointLoc = InvalidXLogRecPtr;
1164  *backupLabelTLI = 0;
1165  *backupEndRequired = false;
1166  *backupFromStandby = false;
1167 
1168  /*
1169  * See if label file is present
1170  */
1171  lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1172  if (!lfp)
1173  {
1174  if (errno != ENOENT)
1175  ereport(FATAL,
1177  errmsg("could not read file \"%s\": %m",
1178  BACKUP_LABEL_FILE)));
1179  return false; /* it's not there, all is fine */
1180  }
1181 
1182  /*
1183  * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1184  * is pretty crude, but we are not expecting any variability in the file
1185  * format).
 *
 * The trailing %c plus the ch != '\n' test insist that nothing follows
 * the expected fields on the line.
1186  */
1187  if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
1188  &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1189  ereport(FATAL,
1190  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1191  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
 /* Reassemble the 64-bit LSN from its two 32-bit hex halves. */
1192  RedoStartLSN = ((uint64) hi) << 32 | lo;
1193  RedoStartTLI = tli_from_walseg;
1194  if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
1195  &hi, &lo, &ch) != 3 || ch != '\n')
1196  ereport(FATAL,
1197  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1198  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1199  *checkPointLoc = ((uint64) hi) << 32 | lo;
1200  *backupLabelTLI = tli_from_walseg;
1201 
1202  /*
1203  * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1204  * which could mean either pg_basebackup or the pg_backup_start/stop
1205  * method was used) or if this label came from somewhere else (the only
1206  * other option today being from pg_rewind). If this was a streamed
1207  * backup then we know that we need to play through until we get to the
1208  * end of the WAL which was generated during the backup (at which point we
1209  * will have reached consistency and backupEndRequired will be reset to be
1210  * false).
1211  */
1212  if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1213  {
1214  if (strcmp(backuptype, "streamed") == 0)
1215  *backupEndRequired = true;
1216  }
1217 
1218  /*
1219  * BACKUP FROM lets us know if this was from a primary or a standby. If
1220  * it was from a standby, we'll double-check that the control file state
1221  * matches that of a standby.
1222  */
1223  if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1224  {
1225  if (strcmp(backupfrom, "standby") == 0)
1226  *backupFromStandby = true;
1227  }
1228 
1229  /*
1230  * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1231  * but checking for their presence is useful for debugging and the next
1232  * sanity checks. Cope also with the fact that the result buffers have a
1233  * pre-allocated size, hence if the backup_label file has been generated
1234  * with strings longer than the maximum assumed here an incorrect parsing
1235  * happens. That's fine as only minor consistency checks are done
1236  * afterwards.
1237  */
1238  if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1239  ereport(DEBUG1,
1240  (errmsg_internal("backup time %s in file \"%s\"",
1241  backuptime, BACKUP_LABEL_FILE)));
1242 
1243  if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1244  ereport(DEBUG1,
1245  (errmsg_internal("backup label %s in file \"%s\"",
1246  backuplabel, BACKUP_LABEL_FILE)));
1247 
1248  /*
1249  * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1250  * it as a sanity check if present.
1251  */
1252  if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1253  {
1254  if (tli_from_walseg != tli_from_file)
1255  ereport(FATAL,
1256  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1257  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1258  errdetail("Timeline ID parsed is %u, but expected %u.",
1259  tli_from_file, tli_from_walseg)));
1260 
1261  ereport(DEBUG1,
1262  (errmsg_internal("backup timeline %u in file \"%s\"",
1263  tli_from_file, BACKUP_LABEL_FILE)));
1264  }
1265 
1266  if (ferror(lfp) || FreeFile(lfp))
1267  ereport(FATAL,
1269  errmsg("could not read file \"%s\": %m",
1270  BACKUP_LABEL_FILE)));
1271 
1272  return true;
1273 }
1274 
1275 /*
1276  * read_tablespace_map: check to see if a tablespace_map file is present
1277  *
1278  * If we see a tablespace_map file during recovery, we assume that we are
1279  * recovering from a backup dump file, and we therefore need to create symlinks
1280  * as per the information present in tablespace_map file.
1281  *
1282  * Returns true if a tablespace_map file was found (and fills *tablespaces
1283  * with a tablespaceinfo struct for each tablespace listed in the file);
1284  * returns false if not.
 *
 * The file is consumed one character at a time.  An unescaped \n or \r
 * terminates a line; empty lines are skipped, which is also how a \r\n pair
 * is tolerated.  A backslash makes the next character literal, so a path
 * may contain newlines.  Characters that do not fit in the MAXPGPATH-sized
 * line buffer are dropped without complaint.
 *
 * Each line is split at its first space: the text before it becomes
 * ti->oid and everything after it becomes ti->path, both as palloc'd
 * copies.  A line is rejected with FATAL if it starts with a space, has no
 * space at all, or has nothing after the first space.  A final line without
 * a terminator, or a dangling backslash at end of file, is FATAL as well,
 * as is any failure to open (other than ENOENT), read, or close the file.
 *
 * NOTE(review): this listing's own numbering skips lines 1287 (where the
 * function name and parameter list belong), 1305 and 1365 (each inside an
 * ereport() call).  Restore them from the upstream file before building.
1285  */
1286 static bool
1288 {
1289  tablespaceinfo *ti;
1290  FILE *lfp;
1291  char str[MAXPGPATH];
1292  int ch,
1293  i,
1294  n;
1295  bool was_backslash;
1296 
1297  /*
1298  * See if tablespace_map file is present
1299  */
1300  lfp = AllocateFile(TABLESPACE_MAP, "r");
1301  if (!lfp)
1302  {
1303  if (errno != ENOENT)
1304  ereport(FATAL,
1306  errmsg("could not read file \"%s\": %m",
1307  TABLESPACE_MAP)));
1308  return false; /* it's not there, all is fine */
1309  }
1310 
1311  /*
1312  * Read and parse the link name and path lines from tablespace_map file
1313  * (this code is pretty crude, but we are not expecting any variability in
1314  * the file format). De-escape any backslashes that were inserted.
 *
 * 'i' counts the de-escaped characters collected for the current line.
1315  */
1316  i = 0;
1317  was_backslash = false;
1318  while ((ch = fgetc(lfp)) != EOF)
1319  {
1320  if (!was_backslash && (ch == '\n' || ch == '\r'))
1321  {
1322  if (i == 0)
1323  continue; /* \r immediately followed by \n */
1324 
1325  /*
1326  * The de-escaped line should contain an OID followed by exactly
1327  * one space followed by a path. The path might start with
1328  * spaces, so don't be too liberal about parsing.
1329  */
1330  str[i] = '\0';
1331  n = 0;
1332  while (str[n] && str[n] != ' ')
1333  n++;
 /* n is now the index of the first space, or i if there is none */
1334  if (n < 1 || n >= i - 1)
1335  ereport(FATAL,
1336  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1337  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
 /* Cut the line in two at the separator; n then indexes the path. */
1338  str[n++] = '\0';
1339 
1340  ti = palloc0(sizeof(tablespaceinfo));
1341  ti->oid = pstrdup(str);
1342  ti->path = pstrdup(str + n);
1343  *tablespaces = lappend(*tablespaces, ti);
1344 
1345  i = 0;
1346  continue;
1347  }
1348  else if (!was_backslash && ch == '\\')
1349  was_backslash = true;
1350  else
1351  {
 /* Ordinary or escaped character: keep it if there is room. */
1352  if (i < sizeof(str) - 1)
1353  str[i++] = ch;
1354  was_backslash = false;
1355  }
1356  }
1357 
1358  if (i != 0 || was_backslash) /* last line not terminated? */
1359  ereport(FATAL,
1360  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1361  errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1362 
1363  if (ferror(lfp) || FreeFile(lfp))
1364  ereport(FATAL,
1366  errmsg("could not read file \"%s\": %m",
1367  TABLESPACE_MAP)));
1368 
1369  return true;
1370 }
1371 
1372 /*
1373  * Finish WAL recovery.
1374  *
1375  * This does not close the 'xlogreader' yet, because in some cases the caller
1376  * still wants to re-read the last checkpoint record by calling
1377  * ReadCheckPointRecord().
1378  *
1379  * Returns the position of the last valid or applied record, after which new
1380  * WAL should be appended, information about why recovery was ended, and some
1381  * other things. See the WalRecoveryResult struct for details.
 *
 * Fields of the result that the visible code fills in: endOfLogTLI,
 * lastPageBeginPtr, lastPage, lastRec, lastRecTLI, endOfLog and
 * abortedRecPtr.  lastPage is either NULL (endOfLog falls exactly on a
 * block boundary) or a palloc'd copy of just the used leading part of the
 * last block.
 *
 * NOTE(review): this listing's own numbering skips lines inside this
 * function (e.g. 1382 -> 1385, where the return type and function name
 * belong; 1385 -> 1387, presumably the declaration of 'result'; and
 * 1396 -> 1398, after the "Kill WAL receiver" comment).  Restore the
 * omitted lines from the upstream file before building.
1382  */
1385 {
1387  XLogRecPtr lastRec;
1388  TimeLineID lastRecTLI;
1389  XLogRecPtr endOfLog;
1390 
1391  /*
1392  * Kill WAL receiver, if it's still running, before we continue to write
1393  * the startup checkpoint and aborted-contrecord records. It will trump
1394  * over these records and subsequent ones if it's still alive when we
1395  * start writing WAL.
1396  */
1398 
1399  /*
1400  * We are now done reading the xlog from stream. Turn off streaming
1401  * recovery to force fetching the files (which would be required at end of
1402  * recovery, e.g., timeline history file) from archive or pg_wal.
1403  *
1404  * Note that standby mode must be turned off after killing WAL receiver,
1405  * i.e., calling XLogShutdownWalRcv().
1406  */
1407  Assert(!WalRcvStreaming());
1408  StandbyMode = false;
1409 
1410  /*
1411  * Determine where to start writing WAL next.
1412  *
1413  * Re-fetch the last valid or last applied record, so we can identify the
1414  * exact endpoint of what we consider the valid portion of WAL. There may
1415  * be an incomplete continuation record after that, in which case
1416  * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1417  * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1418  * it is intentionally missing. See CreateOverwriteContrecordRecord().
1419  *
1420  * An important side-effect of this is to load the last page into
1421  * xlogreader. The caller uses it to initialize the WAL for writing.
1422  */
1423  if (!InRecovery)
1424  {
1425  lastRec = CheckPointLoc;
1426  lastRecTLI = CheckPointTLI;
1427  }
1428  else
1429  {
1431  lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1432  }
 /* Failure to re-read the record is treated as PANIC. */
1434  (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1435  endOfLog = xlogreader->EndRecPtr;
1436 
1437  /*
1438  * Remember the TLI in the filename of the XLOG segment containing the
1439  * end-of-log. It could be different from the timeline that endOfLog
1440  * nominally belongs to, if there was a timeline switch in that segment,
1441  * and we were reading the old WAL from a segment belonging to a higher
1442  * timeline.
1443  */
1444  result->endOfLogTLI = xlogreader->seg.ws_tli;
1445 
1447  {
1448  /*
1449  * We are no longer in archive recovery state.
1450  *
1451  * We are now done reading the old WAL. Turn off archive fetching if
1452  * it was active.
1453  */
1455  InArchiveRecovery = false;
1456 
1457  /*
1458  * If the ending log segment is still open, close it (to avoid
1459  * problems on Windows with trying to rename or delete an open file).
1460  */
1461  if (readFile >= 0)
1462  {
1463  close(readFile);
1464  readFile = -1;
1465  }
1466  }
1467 
1468  /*
1469  * Copy the last partial block to the caller, for initializing the WAL
1470  * buffer for appending new WAL.
1471  */
1472  if (endOfLog % XLOG_BLCKSZ != 0)
1473  {
1474  char *page;
1475  int len;
1476  XLogRecPtr pageBeginPtr;
1477 
 /* Round endOfLog down to the start of the block containing it. */
1478  pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1479  Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
1480 
1481  /* Copy the valid part of the last block */
1482  len = endOfLog % XLOG_BLCKSZ;
1483  page = palloc(len);
1484  memcpy(page, xlogreader->readBuf, len);
1485 
1486  result->lastPageBeginPtr = pageBeginPtr;
1487  result->lastPage = page;
1488  }
1489  else
1490  {
1491  /* There is no partial block to copy. */
1492  result->lastPageBeginPtr = endOfLog;
1493  result->lastPage = NULL;
1494  }
1495 
1496  /*
1497  * Create a comment for the history file to explain why and where timeline
1498  * changed.
1499  */
1501 
1502  result->lastRec = lastRec;
1503  result->lastRecTLI = lastRecTLI;
1504  result->endOfLog = endOfLog;
1505 
1506  result->abortedRecPtr = abortedRecPtr;
1508 
1511 
1512  return result;
1513 }
1514 
1515 /*
1516  * Clean up the WAL reader and leftovers from restoring WAL from archive
 *
 * Closes the WAL segment file if one is still open (readFile >= 0) and, in
 * the conditional block below, unlinks XLOGDIR "/RECOVERYXLOG" and
 * XLOGDIR "/RECOVERYHISTORY", ignoring any unlink error.
 *
 * NOTE(review): this listing's own numbering skips lines inside this
 * function (e.g. 1518 -> 1520, where the function name belongs;
 * 1534 -> 1536, where the condition guarding the unlink block belongs; and
 * 1552 -> 1555, after the "We don't need the latch anymore" comment).
 * Restore the omitted lines from the upstream file before building.
1517  */
1518 void
1520 {
1521  char recoveryPath[MAXPGPATH];
1522 
1523  /* Final update of pg_stat_recovery_prefetch. */
1525 
1526  /* Shut down xlogreader */
1527  if (readFile >= 0)
1528  {
1529  close(readFile);
1530  readFile = -1;
1531  }
1534 
1536  {
1537  /*
1538  * Since there might be a partial WAL segment named RECOVERYXLOG, get
1539  * rid of it.
1540  */
1541  snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1542  unlink(recoveryPath); /* ignore any error */
1543 
1544  /* Get rid of any remaining recovered timeline-history file, too */
1545  snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1546  unlink(recoveryPath); /* ignore any error */
1547  }
1548 
1549  /*
1550  * We don't need the latch anymore. It's not strictly necessary to disown
1551  * it, but let's do it for the sake of tidiness.
1552  */
1555 }
1556 
1557 /*
1558  * Perform WAL recovery.
1559  *
1560  * If the system was shut down cleanly, this is never called.
 *
 * Outline of the visible code: read the first record to replay; if there is
 * none, just log "redo is not required".  Otherwise run the main loop,
 * which for each record handles pause requests, tests the recovery target
 * before and after applying, applies the record via ApplyWalRecord(), and
 * fetches the next record; the loop ends when ReadRecord() returns NULL or
 * a target test succeeds.  If a target was reached before consistency, that
 * is FATAL.  After the loop, RmgrCleanup() runs and the end position is
 * logged.  The last statement reports FATAL if recovery ended without
 * reaching the recovery target, under a condition whose first part is not
 * visible in this listing.
 *
 * NOTE(review): this listing's own numbering skips many lines inside this
 * function (e.g. 1562 -> 1564, where the function name belongs;
 * 1573 -> 1576 and 1586 -> 1593 in the shared-state initialization; and
 * 1754 -> 1756, where the first "case" label of the switch belongs).  The
 * text shown here is syntactically incomplete; restore the omitted lines
 * from the upstream file before building.
1561  */
1562 void
1564 {
1565  XLogRecord *record;
1566  bool reachedRecoveryTarget = false;
1567  TimeLineID replayTLI;
1568 
1569  /*
1570  * Initialize shared variables for tracking progress of WAL replay, as if
1571  * we had just replayed the record before the REDO location (or the
1572  * checkpoint record itself, if it's a shutdown checkpoint).
1573  */
1576  {
1580  }
1581  else
1582  {
1586  }
1593 
1594  /* Also ensure XLogReceiptTime has a sane value */
1596 
1597  /*
1598  * Let postmaster know we've started redo now, so that it can launch the
1599  * archiver if necessary.
1600  */
1601  if (IsUnderPostmaster)
1603 
1604  /*
1605  * Allow read-only connections immediately if we're consistent already.
1606  */
1608 
1609  /*
1610  * Find the first record that logically follows the checkpoint --- it
1611  * might physically precede it, though.
1612  */
1614  {
1615  /* back up to find the record */
1616  replayTLI = RedoStartTLI;
1618  record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1619  }
1620  else
1621  {
1622  /* just have to read next record after CheckPoint */
1624  replayTLI = CheckPointTLI;
1625  record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1626  }
1627 
1628  if (record != NULL)
1629  {
1630  TimestampTz xtime;
1631  PGRUsage ru0;
1632 
1633  pg_rusage_init(&ru0);
1634 
1635  InRedo = true;
1636 
1637  RmgrStartup();
1638 
1639  ereport(LOG,
1640  (errmsg("redo starts at %X/%X",
1642 
1643  /* Prepare to report progress of the redo phase. */
1644  if (!StandbyMode)
1646 
1647  /*
1648  * main redo apply loop
1649  */
1650  do
1651  {
1652  if (!StandbyMode)
1653  ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
1655 
1656 #ifdef WAL_DEBUG
1657  if (XLOG_DEBUG ||
1658  (record->xl_rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
1659  (record->xl_rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
1660  {
1662 
1663  initStringInfo(&buf);
1664  appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
1667  xlog_outrec(&buf, xlogreader);
1668  appendStringInfoString(&buf, " - ");
1670  elog(LOG, "%s", buf.data);
1671  pfree(buf.data);
1672  }
1673 #endif
1674 
1675  /* Handle interrupt signals of startup process */
1677 
1678  /*
1679  * Pause WAL replay, if requested by a hot-standby session via
1680  * SetRecoveryPause().
1681  *
1682  * Note that we intentionally don't take the info_lck spinlock
1683  * here. We might therefore read a slightly stale value of the
1684  * recoveryPause flag, but it can't be very stale (no worse than
1685  * the last spinlock we did acquire). Since a pause request is a
1686  * pretty asynchronous thing anyway, possibly responding to it one
1687  * WAL record later than we otherwise would is a minor issue, so
1688  * it doesn't seem worth adding another spinlock cycle to prevent
1689  * that.
1690  */
1691  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1693  recoveryPausesHere(false);
1694 
1695  /*
1696  * Have we reached our recovery target?
1697  */
1699  {
1700  reachedRecoveryTarget = true;
1701  break;
1702  }
1703 
1704  /*
1705  * If we've been asked to lag the primary, wait on latch until
1706  * enough time has passed.
1707  */
1709  {
1710  /*
1711  * We test for paused recovery again here. If user sets
1712  * delayed apply, it may be because they expect to pause
1713  * recovery in case of problems, so we must test again here
1714  * otherwise pausing during the delay-wait wouldn't work.
1715  */
1716  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1718  recoveryPausesHere(false);
1719  }
1720 
1721  /*
1722  * Apply the record
 *
 * replayTLI is passed by address because applying a record that
 * switches timelines updates it for the following reads.
1723  */
1724  ApplyWalRecord(xlogreader, record, &replayTLI);
1725 
1726  /* Exit loop if we reached inclusive recovery target */
1728  {
1729  reachedRecoveryTarget = true;
1730  break;
1731  }
1732 
1733  /* Else, try to fetch the next WAL record */
1734  record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1735  } while (record != NULL);
1736 
1737  /*
1738  * end of main redo apply loop
1739  */
1740 
1741  if (reachedRecoveryTarget)
1742  {
1743  if (!reachedConsistency)
1744  ereport(FATAL,
1745  (errmsg("requested recovery stop point is before consistent recovery point")));
1746 
1747  /*
1748  * This is the last point where we can restart recovery with a new
1749  * recovery target, if we shutdown and begin again. After this,
1750  * Resource Managers may choose to do permanent corrective actions
1751  * at end of recovery.
1752  */
1753  switch (recoveryTargetAction)
1754  {
1756 
1757  /*
1758  * exit with special return code to request shutdown of
1759  * postmaster. Log messages issued from postmaster.
1760  */
1761  proc_exit(3);
1762 
1764  SetRecoveryPause(true);
1765  recoveryPausesHere(true);
1766 
1767  /* drop into promote */
1768 
1770  break;
1771  }
1772  }
1773 
1774  RmgrCleanup();
1775 
1776  ereport(LOG,
1777  (errmsg("redo done at %X/%X system usage: %s",
1779  pg_rusage_show(&ru0))));
1780  xtime = GetLatestXTime();
1781  if (xtime)
1782  ereport(LOG,
1783  (errmsg("last completed transaction was at log time %s",
1784  timestamptz_to_str(xtime))));
1785 
1786  InRedo = false;
1787  }
1788  else
1789  {
1790  /* there are no WAL records following the checkpoint */
1791  ereport(LOG,
1792  (errmsg("redo is not required")));
1793  }
1794 
1795  /*
1796  * This check is intentionally after the above log messages that indicate
1797  * how far recovery went.
1798  */
1801  !reachedRecoveryTarget)
1802  ereport(FATAL,
1803  (errmsg("recovery ended before configured recovery target was reached")));
1804 }
1805 
1806 /*
1807  * Subroutine of PerformWalRecovery, to apply one WAL record.
 *
 * Uses 'xlogreader' (reader state holding the decoded record), 'record'
 * (the record header) and 'replayTLI', which is in/out: if the record is an
 * XLOG shutdown checkpoint or end-of-recovery record naming a different
 * timeline, *replayTLI is switched to it before the record is replayed.
 *
 * Order of the visible steps: push an error-context callback; handle a
 * possible timeline switch; publish replayEndTLI; run xlogrecovery_redo()
 * for RM_XLOG_ID records; call the resource manager's rm_redo; pop the
 * error context; publish lastReplayedTLI; force a WAL receiver reply if one
 * was requested; and, after a timeline switch, wake up walsenders.
 *
 * NOTE(review): this listing's own numbering skips lines inside this
 * function (e.g. 1809 -> 1811, where the function name and parameter list
 * belong; 1823 -> 1825, after the nextXid comment; and 1872 -> 1875,
 * before the replayEndTLI assignment).  Restore the omitted lines from the
 * upstream file before building.
1808  */
1809 static void
1811 {
1812  ErrorContextCallback errcallback;
1813  bool switchedTLI = false;
1814 
1815  /* Setup error traceback support for ereport() */
1816  errcallback.callback = rm_redo_error_callback;
1817  errcallback.arg = (void *) xlogreader;
1818  errcallback.previous = error_context_stack;
1819  error_context_stack = &errcallback;
1820 
1821  /*
1822  * ShmemVariableCache->nextXid must be beyond record's xid.
1823  */
1825 
1826  /*
1827  * Before replaying this record, check if this record causes the current
1828  * timeline to change. The record is already considered to be part of the
1829  * new timeline, so we update replayTLI before replaying it. That's
1830  * important so that replayEndTLI, which is recorded as the minimum
1831  * recovery point's TLI if recovery stops after this record, is set
1832  * correctly.
1833  */
1834  if (record->xl_rmid == RM_XLOG_ID)
1835  {
 /* Default to "no change" for record types not handled below. */
1836  TimeLineID newReplayTLI = *replayTLI;
1837  TimeLineID prevReplayTLI = *replayTLI;
1838  uint8 info = record->xl_info & ~XLR_INFO_MASK;
1839 
1840  if (info == XLOG_CHECKPOINT_SHUTDOWN)
1841  {
1842  CheckPoint checkPoint;
1843 
1844  memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1845  newReplayTLI = checkPoint.ThisTimeLineID;
1846  prevReplayTLI = checkPoint.PrevTimeLineID;
1847  }
1848  else if (info == XLOG_END_OF_RECOVERY)
1849  {
1850  xl_end_of_recovery xlrec;
1851 
1852  memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1853  newReplayTLI = xlrec.ThisTimeLineID;
1854  prevReplayTLI = xlrec.PrevTimeLineID;
1855  }
1856 
1857  if (newReplayTLI != *replayTLI)
1858  {
1859  /* Check that it's OK to switch to this TLI */
1861  newReplayTLI, prevReplayTLI, *replayTLI);
1862 
1863  /* Following WAL records should be run with new TLI */
1864  *replayTLI = newReplayTLI;
1865  switchedTLI = true;
1866  }
1867  }
1868 
1869  /*
1870  * Update shared replayEndRecPtr before replaying this record, so that
1871  * XLogFlush will update minRecoveryPoint correctly.
1872  */
1875  XLogRecoveryCtl->replayEndTLI = *replayTLI;
1877 
1878  /*
1879  * If we are attempting to enter Hot Standby mode, process XIDs we see
1880  */
1882  TransactionIdIsValid(record->xl_xid))
1884 
1885  /*
1886  * Some XLOG record types that are related to recovery are processed
1887  * directly here, rather than in xlog_redo()
1888  */
1889  if (record->xl_rmid == RM_XLOG_ID)
1890  xlogrecovery_redo(xlogreader, *replayTLI);
1891 
1892  /* Now apply the WAL record itself */
1893  GetRmgr(record->xl_rmid).rm_redo(xlogreader);
1894 
1895  /*
1896  * After redo, check whether the backup pages associated with the WAL
1897  * record are consistent with the existing pages. This check is done only
1898  * if consistency check is enabled for this record.
1899  */
1900  if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
1902 
1903  /* Pop the error context stack */
1904  error_context_stack = errcallback.previous;
1905 
1906  /*
1907  * Update lastReplayedEndRecPtr after this record has been successfully
1908  * replayed.
1909  */
1913  XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
1915 
1916  /*
1917  * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
1918  * receiver so that it notices the updated lastReplayedEndRecPtr and sends
1919  * a reply to the primary.
1920  */
1922  {
1923  doRequestWalReceiverReply = false;
1924  WalRcvForceReply();
1925  }
1926 
1927  /* Allow read-only connections if we're consistent now */
1929 
1930  /* Is this a timeline switch? */
1931  if (switchedTLI)
1932  {
1933  /*
1934  * Before we continue on the new timeline, clean up any (possibly
1935  * bogus) future WAL segments on the old timeline.
1936  */
1938 
1939  /*
1940  * Wake up any walsenders to notice that we are on a new timeline.
1941  */
1943  WalSndWakeup();
1944 
1945  /* Reset the prefetcher. */
1947  }
1948 }
1949 
1950 /*
1951  * Some XLOG RM record types that are directly related to WAL recovery are
1952  * handled here rather than in xlog_redo().
 *
 * Two record types are acted on; all others fall through untouched:
 *  - XLOG_OVERWRITE_CONTRECORD: the LSN stored in the record payload must
 *    equal record->overwrittenRecPtr, otherwise FATAL; success is logged.
 *  - XLOG_BACKUP_END: if the start point stored in the payload equals
 *    backupStartPoint, backupEndPoint is set to this record's end LSN;
 *    otherwise only a DEBUG1 message is emitted.
 *
 * NOTE(review): this listing's own numbering skips lines inside this
 * function (e.g. 1954 -> 1956, where the function name and parameter list
 * belong; 1964 -> 1966, presumably the declaration of 'xlrec'; and the
 * argument lines of several elog/ereport calls).  Restore the omitted
 * lines from the upstream file before building.
1953  */
1954 static void
1956 {
1957  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
1958  XLogRecPtr lsn = record->EndRecPtr;
1959 
1960  Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
1961 
1962  if (info == XLOG_OVERWRITE_CONTRECORD)
1963  {
1964  /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
1966 
1967  memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
1968  if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
1969  elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
1972 
1973  /* We have safely skipped the aborted record */
1976 
1977  ereport(LOG,
1978  (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
1981 
1982  /* Verifying the record should only happen once */
1984  }
1985  else if (info == XLOG_BACKUP_END)
1986  {
1987  XLogRecPtr startpoint;
1988 
1989  memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
1990 
1991  if (backupStartPoint == startpoint)
1992  {
1993  /*
1994  * We have reached the end of base backup, the point where
1995  * pg_backup_stop() was done. The data on disk is now consistent
1996  * (assuming we have also reached minRecoveryPoint). Set
1997  * backupEndPoint to the current LSN, so that the next call to
1998  * CheckRecoveryConsistency() will notice it and do the
1999  * end-of-backup processing.
2000  */
2001  elog(DEBUG1, "end of backup record reached");
2002 
2003  backupEndPoint = lsn;
2004  }
2005  else
2006  elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X",
2008  }
2009 }
2010 
2011 /*
2012  * Checks if recovery has reached a consistent state. When consistency is
2013  * reached and we have a valid starting standby snapshot, tell postmaster
2014  * that it can start accepting read-only connections.
 *
 * Three checks are made in order, each acting at most once per state
 * change: (1) once replay has passed backupEndPoint, call
 * ReachedEndOfBackup() and clear backupEndRequired; (2) once replay has
 * passed minRecoveryPoint, set reachedConsistency and log the fact;
 * (3) when the conditions for hot standby hold, set LocalHotStandbyActive.
 * The first part of each condition is not visible in this listing.
 *
 * NOTE(review): this listing's own numbering skips lines inside this
 * function (e.g. 2016 -> 2018, where the function name belongs;
 * 2025 -> 2027, where the condition guarding the early "return" belongs;
 * and 2081 -> 2086, where the hot-standby condition belongs).  Restore the
 * omitted lines from the upstream file before building.
2015  */
2016 static void
2018 {
2019  XLogRecPtr lastReplayedEndRecPtr;
2020  TimeLineID lastReplayedTLI;
2021 
2022  /*
2023  * During crash recovery, we don't reach a consistent state until we've
2024  * replayed all the WAL.
2025  */
2027  return;
2028 
2030 
2031  /*
2032  * assume that we are called in the startup process, and hence don't need
2033  * a lock to read lastReplayedEndRecPtr
2034  */
2035  lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2036  lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2037 
2038  /*
2039  * Have we reached the point where our base backup was completed?
2040  */
2042  backupEndPoint <= lastReplayedEndRecPtr)
2043  {
2044  elog(DEBUG1, "end of backup reached");
2045 
2046  /*
2047  * We have reached the end of base backup, as indicated by pg_control.
2048  * Update the control file accordingly.
2049  */
2050  ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2053  backupEndRequired = false;
2054  }
2055 
2056  /*
2057  * Have we passed our safe starting point? Note that minRecoveryPoint is
2058  * known to be incorrectly set if recovering from a backup, until the
2059  * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2060  * All we know prior to that is that we're not consistent yet.
2061  */
2063  minRecoveryPoint <= lastReplayedEndRecPtr)
2064  {
2065  /*
2066  * Check to see if the XLOG sequence contained any unresolved
2067  * references to uninitialized pages.
2068  */
2070 
2071  reachedConsistency = true;
2072  ereport(LOG,
2073  (errmsg("consistent recovery state reached at %X/%X",
2074  LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
2075  }
2076 
2077  /*
2078  * Have we got a valid starting snapshot that will allow queries to be
2079  * run? If so, we can tell postmaster that the database is consistent now,
2080  * enabling connections.
2081  */
2086  {
2090 
2091  LocalHotStandbyActive = true;
2092 
2094  }
2095 }
2096 
2097 /*
2098  * Error context callback for errors occurring during rm_redo().
 *
 * 'arg' is the XLogReaderState for the record being replayed.  The
 * callback builds a description of that record with xlog_outdesc() and
 * xlog_block_info(), attaches it as an error-context line together with
 * the record's ReadRecPtr, and frees the temporary string.
 *
 * NOTE(review): this listing's own numbering skips lines 2101 (where the
 * function name and parameter list belong) and 2104 (presumably the
 * declaration of 'buf').  Restore them from the upstream file before
 * building.
2099  */
2100 static void
2102 {
2103  XLogReaderState *record = (XLogReaderState *) arg;
2105 
2106  initStringInfo(&buf);
2107  xlog_outdesc(&buf, record);
2108  xlog_block_info(&buf, record);
2109 
2110  /* translator: %s is a WAL record description */
2111  errcontext("WAL redo at %X/%X for %s",
2112  LSN_FORMAT_ARGS(record->ReadRecPtr),
2113  buf.data);
2114 
2115  pfree(buf.data);
2116 }
2117 
2118 /*
2119  * Appends to 'buf' a string describing an XLogRecord, consisting of its
2120  * identity followed by a colon, a space, and a further description.
 *
 * Nothing is returned; the text is appended to the caller's buffer.  What
 * the visible code appends, in order: a '/' character, then the name
 * reported by the resource manager's rm_identify() for the record's info
 * bits (or "UNKNOWN (<hex info bits>)" when it returns NULL) followed by
 * ": ", then whatever the resource manager's rm_desc() writes.
 *
 * NOTE(review): this listing's own numbering skips lines 2123 (where the
 * function name and parameter list belong) and 2129 (a statement just
 * before the '/' is appended).  Restore them from the upstream file before
 * building.
2121  */
2122 void
2124 {
2125  RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2126  uint8 info = XLogRecGetInfo(record);
2127  const char *id;
2128 
2130  appendStringInfoChar(buf, '/');
2131 
2132  id = rmgr.rm_identify(info);
2133  if (id == NULL)
2134  appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2135  else
2136  appendStringInfo(buf, "%s: ", id);
2137 
2138  rmgr.rm_desc(buf, record);
2139 }
2140 
2141 #ifdef WAL_DEBUG
2142 
/*
 * Append a short summary of the record to 'buf': a "prev" position, the
 * record's xid and its data length, followed by per-block details from
 * xlog_block_info().  Only compiled when WAL_DEBUG is defined.
 *
 * NOTE(review): this listing's own numbering skips line 2147, which should
 * supply the arguments for the "prev %X/%X" part of the format string.
 * Restore it from the upstream file before building.
 */
2143 static void
2144 xlog_outrec(StringInfo buf, XLogReaderState *record)
2145 {
2146  appendStringInfo(buf, "prev %X/%X; xid %u",
2148  XLogRecGetXid(record));
2149 
2150  appendStringInfo(buf, "; len %u",
2151  XLogRecGetDataLen(record));
2152 
2153  xlog_block_info(buf, record);
2154 }
2155 #endif /* WAL_DEBUG */
2156 
2157 /*
2158  * Appends to 'buf' information about all the blocks referenced by an
2159  * XLogRecord.
 *
 * Nothing is returned; the text is appended to the caller's buffer.  Block
 * ids from 0 through XLogRecMaxBlockId(record) are visited, and ids for
 * which XLogRecGetBlockTagExtended() reports no block reference are
 * skipped.  Each remaining block contributes
 * "; blkref #N: rel spc/db/rel, blk B", with a "fork F" field inserted only
 * for forks other than the main fork, and a trailing " FPW" when the
 * record carries an image of that block.
 *
 * NOTE(review): this listing's own numbering skips line 2162, where the
 * function name and parameter list belong.  Restore it from the upstream
 * file before building.
2160  */
2161 static void
2163 {
2164  int block_id;
2165 
2166  /* decode block references */
2167  for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2168  {
2169  RelFileNode rnode;
2170  ForkNumber forknum;
2171  BlockNumber blk;
2172 
2173  if (!XLogRecGetBlockTagExtended(record, block_id,
2174  &rnode, &forknum, &blk, NULL))
2175  continue;
2176 
2177  if (forknum != MAIN_FORKNUM)
2178  appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2179  block_id,
2180  rnode.spcNode, rnode.dbNode, rnode.relNode,
2181  forknum,
2182  blk);
2183  else
2184  appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2185  block_id,
2186  rnode.spcNode, rnode.dbNode, rnode.relNode,
2187  blk);
2188  if (XLogRecHasBlockImage(record, block_id))
2189  appendStringInfoString(buf, " FPW");
2190  }
2191 }
2192 
2193 
2194 /*
2195  * Check that it's OK to switch to new timeline during recovery.
2196  *
2197  * 'lsn' is the address of the shutdown checkpoint record we're about to
2198  * replay. (Currently, timeline can only change at a shutdown checkpoint).
2199  */
2200 static void
2202  TimeLineID replayTLI)
2203 {
2204  /* Check that the record agrees on what the current (old) timeline is */
2205  if (prevTLI != replayTLI)
2206  ereport(PANIC,
2207  (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2208  prevTLI, replayTLI)));
2209 
2210  /*
2211  * The new timeline better be in the list of timelines we expect to see,
2212  * according to the timeline history. It should also not decrease.
2213  */
2214  if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2215  ereport(PANIC,
2216  (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2217  newTLI, replayTLI)));
2218 
2219  /*
2220  * If we have not yet reached min recovery point, and we're about to
2221  * switch to a timeline greater than the timeline of the min recovery
2222  * point: trouble. After switching to the new timeline, we could not
2223  * possibly visit the min recovery point on the correct timeline anymore.
2224  * This can happen if there is a newer timeline in the archive that
2225  * branched before the timeline the min recovery point is on, and you
2226  * attempt to do PITR to the new timeline.
2227  */
2229  lsn < minRecoveryPoint &&
2230  newTLI > minRecoveryPointTLI)
2231  ereport(PANIC,
2232  (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
2233  newTLI,
2236 
2237  /* Looks good */
2238 }
2239 
2240 
2241 /*
2242  * Extract timestamp from WAL record.
2243  *
2244  * If the record contains a timestamp, returns true, and saves the timestamp
2245  * in *recordXtime. If the record type has no timestamp, returns false.
2246  * Currently, only transaction commit/abort records and restore points contain
2247  * timestamps.
2248  */
2249 static bool
2251 {
2252  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2253  uint8 xact_info = info & XLOG_XACT_OPMASK;
2254  uint8 rmid = XLogRecGetRmid(record);
2255 
2256  if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2257  {
2258  *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2259  return true;
2260  }
2261  if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2262  xact_info == XLOG_XACT_COMMIT_PREPARED))
2263  {
2264  *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2265  return true;
2266  }
2267  if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2268  xact_info == XLOG_XACT_ABORT_PREPARED))
2269  {
2270  *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2271  return true;
2272  }
2273  return false;
2274 }
2275 
2276 /*
2277  * Checks whether the current buffer page and backup page stored in the
2278  * WAL record are consistent or not. Before comparing the two pages, a
2279  * masking can be applied to the pages to ignore certain areas like hint bits,
2280  * unused space between pd_lower and pd_upper among other things. This
2281  * function should be called once WAL replay has been completed for a
2282  * given record.
2283  */
2284 static void
2286 {
2287  RmgrData rmgr = GetRmgr(XLogRecGetRmid(record));
2288  RelFileNode rnode;
2289  ForkNumber forknum;
2290  BlockNumber blkno;
2291  int block_id;
2292 
2293  /* Records with no backup blocks have no need for consistency checks. */
2294  if (!XLogRecHasAnyBlockRefs(record))
2295  return;
2296 
2297  Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
2298 
2299  for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2300  {
2301  Buffer buf;
2302  Page page;
2303 
2304  if (!XLogRecGetBlockTagExtended(record, block_id,
2305  &rnode, &forknum, &blkno, NULL))
2306  {
2307  /*
2308  * WAL record doesn't contain a block reference with the given id.
2309  * Do nothing.
2310  */
2311  continue;
2312  }
2313 
2314  Assert(XLogRecHasBlockImage(record, block_id));
2315 
2316  if (XLogRecBlockImageApply(record, block_id))
2317  {
2318  /*
2319  * WAL record has already applied the page, so bypass the
2320  * consistency check as that would result in comparing the full
2321  * page stored in the record with itself.
2322  */
2323  continue;
2324  }
2325 
2326  /*
2327  * Read the contents from the current buffer and store it in a
2328  * temporary page.
2329  */
2330  buf = XLogReadBufferExtended(rnode, forknum, blkno,
2332  InvalidBuffer);
2333  if (!BufferIsValid(buf))
2334  continue;
2335 
2337  page = BufferGetPage(buf);
2338 
2339  /*
2340  * Take a copy of the local page where WAL has been applied to have a
2341  * comparison base before masking it...
2342  */
2343  memcpy(replay_image_masked, page, BLCKSZ);
2344 
2345  /* No need for this page anymore now that a copy is in. */
2347 
2348  /*
2349  * If the block LSN is already ahead of this WAL record, we can't
2350  * expect contents to match. This can happen if recovery is
2351  * restarted.
2352  */
2353  if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
2354  continue;
2355 
2356  /*
2357  * Read the contents from the backup copy, stored in WAL record and
2358  * store it in a temporary page. There is no need to allocate a new
2359  * page here, a local buffer is fine to hold its contents and a mask
2360  * can be directly applied on it.
2361  */
2362  if (!RestoreBlockImage(record, block_id, primary_image_masked))
2363  elog(ERROR, "failed to restore block image");
2364 
2365  /*
2366  * If masking function is defined, mask both the primary and replay
2367  * images
2368  */
2369  if (rmgr.rm_mask != NULL)
2370  {
2371  rmgr.rm_mask(replay_image_masked, blkno);
2372  rmgr.rm_mask(primary_image_masked, blkno);
2373  }
2374 
2375  /* Time to compare the primary and replay images. */
2376  if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2377  {
2378  elog(FATAL,
2379  "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2380  rnode.spcNode, rnode.dbNode, rnode.relNode,
2381  forknum, blkno);
2382  }
2383  }
2384 }
2385 
2386 /*
2387  * For point-in-time recovery, this function decides whether we want to
2388  * stop applying the XLOG before the current record.
2389  *
2390  * Returns true if we are stopping, false otherwise. If stopping, some
2391  * information is saved in recoveryStopXid et al for use in annotating the
2392  * new timeline's history file.
2393  */
2394 static bool
2396 {
2397  bool stopsHere = false;
2398  uint8 xact_info;
2399  bool isCommit;
2400  TimestampTz recordXtime = 0;
2401  TransactionId recordXid;
2402 
2403  /*
2404  * Ignore recovery target settings when not in archive recovery (meaning
2405  * we are in crash recovery).
2406  */
2408  return false;
2409 
2410  /* Check if we should stop as soon as reaching consistency */
2412  {
2413  ereport(LOG,
2414  (errmsg("recovery stopping after reaching consistency")));
2415 
2416  recoveryStopAfter = false;
2419  recoveryStopTime = 0;
2420  recoveryStopName[0] = '\0';
2421  return true;
2422  }
2423 
2424  /* Check if target LSN has been reached */
2427  record->ReadRecPtr >= recoveryTargetLSN)
2428  {
2429  recoveryStopAfter = false;
2431  recoveryStopLSN = record->ReadRecPtr;
2432  recoveryStopTime = 0;
2433  recoveryStopName[0] = '\0';
2434  ereport(LOG,
2435  (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
2437  return true;
2438  }
2439 
2440  /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2441  if (XLogRecGetRmid(record) != RM_XACT_ID)
2442  return false;
2443 
2444  xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2445 
2446  if (xact_info == XLOG_XACT_COMMIT)
2447  {
2448  isCommit = true;
2449  recordXid = XLogRecGetXid(record);
2450  }
2451  else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2452  {
2453  xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2454  xl_xact_parsed_commit parsed;
2455 
2456  isCommit = true;
2458  xlrec,
2459  &parsed);
2460  recordXid = parsed.twophase_xid;
2461  }
2462  else if (xact_info == XLOG_XACT_ABORT)
2463  {
2464  isCommit = false;
2465  recordXid = XLogRecGetXid(record);
2466  }
2467  else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2468  {
2469  xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2470  xl_xact_parsed_abort parsed;
2471 
2472  isCommit = false;
2474  xlrec,
2475  &parsed);
2476  recordXid = parsed.twophase_xid;
2477  }
2478  else
2479  return false;
2480 
2482  {
2483  /*
2484  * There can be only one transaction end record with this exact
2485  * transactionid
2486  *
2487  * when testing for an xid, we MUST test for equality only, since
2488  * transactions are numbered in the order they start, not the order
2489  * they complete. A higher numbered xid will complete before you about
2490  * 50% of the time...
2491  */
2492  stopsHere = (recordXid == recoveryTargetXid);
2493  }
2494 
2496  getRecordTimestamp(record, &recordXtime))
2497  {
2498  /*
2499  * There can be many transactions that share the same commit time, so
2500  * we stop after the last one, if we are inclusive, or stop at the
2501  * first one if we are exclusive
2502  */
2504  stopsHere = (recordXtime > recoveryTargetTime);
2505  else
2506  stopsHere = (recordXtime >= recoveryTargetTime);
2507  }
2508 
2509  if (stopsHere)
2510  {
2511  recoveryStopAfter = false;
2512  recoveryStopXid = recordXid;
2513  recoveryStopTime = recordXtime;
2515  recoveryStopName[0] = '\0';
2516 
2517  if (isCommit)
2518  {
2519  ereport(LOG,
2520  (errmsg("recovery stopping before commit of transaction %u, time %s",
2523  }
2524  else
2525  {
2526  ereport(LOG,
2527  (errmsg("recovery stopping before abort of transaction %u, time %s",
2530  }
2531  }
2532 
2533  return stopsHere;
2534 }
2535 
2536 /*
2537  * Same as recoveryStopsBefore, but called after applying the record.
2538  *
2539  * We also track the timestamp of the latest applied COMMIT/ABORT
2540  * record in XLogRecoveryCtl->recoveryLastXTime.
2541  */
2542 static bool
2544 {
2545  uint8 info;
2546  uint8 xact_info;
2547  uint8 rmid;
2548  TimestampTz recordXtime;
2549 
2550  /*
2551  * Ignore recovery target settings when not in archive recovery (meaning
2552  * we are in crash recovery).
2553  */
2555  return false;
2556 
2557  info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2558  rmid = XLogRecGetRmid(record);
2559 
2560  /*
2561  * There can be many restore points that share the same name; we stop at
2562  * the first one.
2563  */
2565  rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2566  {
2567  xl_restore_point *recordRestorePointData;
2568 
2569  recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2570 
2571  if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2572  {
2573  recoveryStopAfter = true;
2576  (void) getRecordTimestamp(record, &recoveryStopTime);
2577  strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2578 
2579  ereport(LOG,
2580  (errmsg("recovery stopping at restore point \"%s\", time %s",
2583  return true;
2584  }
2585  }
2586 
2587  /* Check if the target LSN has been reached */
2590  record->ReadRecPtr >= recoveryTargetLSN)
2591  {
2592  recoveryStopAfter = true;
2594  recoveryStopLSN = record->ReadRecPtr;
2595  recoveryStopTime = 0;
2596  recoveryStopName[0] = '\0';
2597  ereport(LOG,
2598  (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
2600  return true;
2601  }
2602 
2603  if (rmid != RM_XACT_ID)
2604  return false;
2605 
2606  xact_info = info & XLOG_XACT_OPMASK;
2607 
2608  if (xact_info == XLOG_XACT_COMMIT ||
2609  xact_info == XLOG_XACT_COMMIT_PREPARED ||
2610  xact_info == XLOG_XACT_ABORT ||
2611  xact_info == XLOG_XACT_ABORT_PREPARED)
2612  {
2613  TransactionId recordXid;
2614 
2615  /* Update the last applied transaction timestamp */
2616  if (getRecordTimestamp(record, &recordXtime))
2617  SetLatestXTime(recordXtime);
2618 
2619  /* Extract the XID of the committed/aborted transaction */
2620  if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2621  {
2622  xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2623  xl_xact_parsed_commit parsed;
2624 
2626  xlrec,
2627  &parsed);
2628  recordXid = parsed.twophase_xid;
2629  }
2630  else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2631  {
2632  xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2633  xl_xact_parsed_abort parsed;
2634 
2636  xlrec,
2637  &parsed);
2638  recordXid = parsed.twophase_xid;
2639  }
2640  else
2641  recordXid = XLogRecGetXid(record);
2642 
2643  /*
2644  * There can be only one transaction end record with this exact
2645  * transactionid
2646  *
2647  * when testing for an xid, we MUST test for equality only, since
2648  * transactions are numbered in the order they start, not the order
2649  * they complete. A higher numbered xid will complete before you about
2650  * 50% of the time...
2651  */
2653  recordXid == recoveryTargetXid)
2654  {
2655  recoveryStopAfter = true;
2656  recoveryStopXid = recordXid;
2657  recoveryStopTime = recordXtime;
2659  recoveryStopName[0] = '\0';
2660 
2661  if (xact_info == XLOG_XACT_COMMIT ||
2662  xact_info == XLOG_XACT_COMMIT_PREPARED)
2663  {
2664  ereport(LOG,
2665  (errmsg("recovery stopping after commit of transaction %u, time %s",
2668  }
2669  else if (xact_info == XLOG_XACT_ABORT ||
2670  xact_info == XLOG_XACT_ABORT_PREPARED)
2671  {
2672  ereport(LOG,
2673  (errmsg("recovery stopping after abort of transaction %u, time %s",
2676  }
2677  return true;
2678  }
2679  }
2680 
2681  /* Check if we should stop as soon as reaching consistency */
2683  {
2684  ereport(LOG,
2685  (errmsg("recovery stopping after reaching consistency")));
2686 
2687  recoveryStopAfter = true;
2689  recoveryStopTime = 0;
2691  recoveryStopName[0] = '\0';
2692  return true;
2693  }
2694 
2695  return false;
2696 }
2697 
2698 /*
2699  * Create a comment for the history file to explain why and where
2700  * timeline changed.
2701  */
2702 static char *
2704 {
2705  char reason[200];
2706 
2708  snprintf(reason, sizeof(reason),
2709  "%s transaction %u",
2710  recoveryStopAfter ? "after" : "before",
2711  recoveryStopXid);
2713  snprintf(reason, sizeof(reason),
2714  "%s %s\n",
2715  recoveryStopAfter ? "after" : "before",
2717  else if (recoveryTarget == RECOVERY_TARGET_LSN)
2718  snprintf(reason, sizeof(reason),
2719  "%s LSN %X/%X\n",
2720  recoveryStopAfter ? "after" : "before",
2723  snprintf(reason, sizeof(reason),
2724  "at restore point \"%s\"",
2727  snprintf(reason, sizeof(reason), "reached consistency");
2728  else
2729  snprintf(reason, sizeof(reason), "no recovery target specified");
2730 
2731  return pstrdup(reason);
2732 }
2733 
2734 /*
2735  * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2736  *
2737  * endOfRecovery is true if the recovery target is reached and
2738  * the paused state starts at the end of recovery because of
2739  * recovery_target_action=pause, and false otherwise.
2740  */
2741 static void
2742 recoveryPausesHere(bool endOfRecovery)
2743 {
2744  /* Don't pause unless users can connect! */
2745  if (!LocalHotStandbyActive)
2746  return;
2747 
2748  /* Don't pause after standby promotion has been triggered */
2750  return;
2751 
2752  if (endOfRecovery)
2753  ereport(LOG,
2754  (errmsg("pausing at the end of recovery"),
2755  errhint("Execute pg_wal_replay_resume() to promote.")));
2756  else
2757  ereport(LOG,
2758  (errmsg("recovery has paused"),
2759  errhint("Execute pg_wal_replay_resume() to continue.")));
2760 
2761  /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2763  {
2765  if (CheckForStandbyTrigger())
2766  return;
2767 
2768  /*
2769  * If recovery pause is requested then set it paused. While we are in
2770  * the loop, user might resume and pause again so set this every time.
2771  */
2773 
2774  /*
2775  * We wait on a condition variable that will wake us as soon as the
2776  * pause ends, but we use a timeout so we can check the above exit
2777  * condition periodically too.
2778  */
2781  }
2783 }
2784 
2785 /*
2786  * When recovery_min_apply_delay is set, we wait long enough to make sure
2787  * certain record types are applied at least that interval behind the primary.
2788  *
2789  * Returns true if we waited.
2790  *
2791  * Note that the delay is calculated between the WAL record log time and
2792  * the current time on standby. We would prefer to keep track of when this
2793  * standby received each WAL record, which would allow a more consistent
2794  * approach and one not affected by time synchronisation issues, but that
2795  * is significantly more effort and complexity for little actual gain in
2796  * usability.
2797  */
2798 static bool
2800 {
2801  uint8 xact_info;
2802  TimestampTz xtime;
2803  TimestampTz delayUntil;
2804  long msecs;
2805 
2806  /* nothing to do if no delay configured */
2807  if (recovery_min_apply_delay <= 0)
2808  return false;
2809 
2810  /* no delay is applied on a database not yet consistent */
2811  if (!reachedConsistency)
2812  return false;
2813 
2814  /* nothing to do if crash recovery is requested */
2816  return false;
2817 
2818  /*
2819  * Is it a COMMIT record?
2820  *
2821  * We deliberately choose not to delay aborts since they have no effect on
2822  * MVCC. We already allow replay of records that don't have a timestamp,
2823  * so there is already opportunity for issues caused by early conflicts on
2824  * standbys.
2825  */
2826  if (XLogRecGetRmid(record) != RM_XACT_ID)
2827  return false;
2828 
2829  xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2830 
2831  if (xact_info != XLOG_XACT_COMMIT &&
2832  xact_info != XLOG_XACT_COMMIT_PREPARED)
2833  return false;
2834 
2835  if (!getRecordTimestamp(record, &xtime))
2836  return false;
2837 
2839 
2840  /*
2841  * Exit without arming the latch if it's already past time to apply this
2842  * record
2843  */
2845  if (msecs <= 0)
2846  return false;
2847 
2848  while (true)
2849  {
2851 
2852  /*
2853  * This might change recovery_min_apply_delay or the trigger file's
2854  * location.
2855  */
2857 
2858  if (CheckForStandbyTrigger())
2859  break;
2860 
2861  /*
2862  * Recalculate delayUntil as recovery_min_apply_delay could have
2863  * changed while waiting in this loop.
2864  */
2866 
2867  /*
2868  * Wait for difference between GetCurrentTimestamp() and delayUntil.
2869  */
2871  delayUntil);
2872 
2873  if (msecs <= 0)
2874  break;
2875 
2876  elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
2877 
2880  msecs,
2882  }
2883  return true;
2884 }
2885 
2886 /*
2887  * Get the current state of the recovery pause request.
2888  */
2891 {
2893 
2897 
2898  return state;
2899 }
2900 
2901 /*
2902  * Set the recovery pause state.
2903  *
2904  * If recovery pause is requested then sets the recovery pause state to
2905  * 'pause requested' if it is not already 'paused'. Otherwise, sets it
2906  * to 'not paused' to resume the recovery. The recovery pause will be
2907  * confirmed by the ConfirmRecoveryPaused.
2908  */
2909 void
2910 SetRecoveryPause(bool recoveryPause)
2911 {
2913 
2914  if (!recoveryPause)
2918 
2920 
2921  if (!recoveryPause)
2923 }
2924 
2925 /*
2926  * Confirm the recovery pause by setting the recovery pause state to
2927  * RECOVERY_PAUSED.
2928  */
2929 static void
2931 {
2932  /* If recovery pause is requested then set it paused */
2937 }
2938 
2939 
2940 /*
2941  * Attempt to read the next XLOG record.
2942  *
2943  * Before first call, the reader needs to be positioned to the first record
2944  * by calling XLogPrefetcherBeginRead().
2945  *
2946  * If no valid record is available, returns NULL, or fails if emode is PANIC.
2947  * (emode must be either PANIC or LOG). In standby mode, retries until a valid
2948  * record is available.
2949  */
2950 static XLogRecord *
2952  bool fetching_ckpt, TimeLineID replayTLI)
2953 {
2954  XLogRecord *record;
2957 
2958  /* Pass through parameters to XLogPageRead */
2959  private->fetching_ckpt = fetching_ckpt;
2960  private->emode = emode;
2961  private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
2962  private->replayTLI = replayTLI;
2963 
2964  /* This is the first attempt to read this page. */
2965  lastSourceFailed = false;
2966 
2967  for (;;)
2968  {
2969  char *errormsg;
2970 
2971  record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
2972  if (record == NULL)
2973  {
2974  /*
2975  * If, when not in standby mode, we find that WAL ends in an incomplete
2976  * record, keep track of that record. After recovery is done,
2977  * we'll write a record to indicate to downstream WAL readers that
2978  * that portion is to be ignored.
2979  */
2980  if (!StandbyMode &&
2982  {
2985  }
2986 
2987  if (readFile >= 0)
2988  {
2989  close(readFile);
2990  readFile = -1;
2991  }
2992 
2993  /*
2994  * We only end up here without a message when XLogPageRead()
2995  * failed - in that case we already logged something. In
2996  * StandbyMode that only happens if we have been triggered, so we
2997  * shouldn't loop anymore in that case.
2998  */
2999  if (errormsg)
3001  (errmsg_internal("%s", errormsg) /* already translated */ ));
3002  }
3003 
3004  /*
3005  * Check page TLI is one of the expected values.
3006  */
3008  {
3009  char fname[MAXFNAMELEN];
3010  XLogSegNo segno;
3011  int32 offset;
3012 
3016  XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3019  (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
3021  fname,
3022  offset)));
3023  record = NULL;
3024  }
3025 
3026  if (record)
3027  {
3028  /* Great, got a record */
3029  return record;
3030  }
3031  else
3032  {
3033  /* No valid record available from this source */
3034  lastSourceFailed = true;
3035 
3036  /*
3037  * If archive recovery was requested, but we were still doing
3038  * crash recovery, switch to archive recovery and retry using the
3039  * offline archive. We have now replayed all the valid WAL in
3040  * pg_wal, so we are presumably now consistent.
3041  *
3042  * We require that there's at least some valid WAL present in
3043  * pg_wal, however (!fetching_ckpt). We could recover using the
3044  * WAL from the archive, even if pg_wal is completely empty, but
3045  * we'd have no idea how far we'd have to replay to reach
3046  * consistency. So err on the safe side and give up.
3047  */
3049  !fetching_ckpt)
3050  {
3051  ereport(DEBUG1,
3052  (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3053  InArchiveRecovery = true;
3055  StandbyMode = true;
3056 
3059  minRecoveryPointTLI = replayTLI;
3060 
3062 
3063  /*
3064  * Before we retry, reset lastSourceFailed and currentSource
3065  * so that we will check the archive next.
3066  */
3067  lastSourceFailed = false;
3069 
3070  continue;
3071  }
3072 
3073  /* In standby mode, loop back to retry. Otherwise, give up. */
3075  continue;
3076  else
3077  return NULL;
3078  }
3079  }
3080 }
3081 
3082 /*
3083  * Read the XLOG page containing RecPtr into readBuf (if not read already).
3084  * Returns number of bytes read, if the page is read successfully, or
3085  * XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed, but
3086  * only if they have not been previously reported.
3087  *
3088  * While prefetching, xlogreader->nonblocking may be set. In that case,
3089  * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3090  *
3091  * This is responsible for restoring files from archive as needed, as well
3092  * as for waiting for the requested WAL record to arrive in standby mode.
3093  *
3094  * 'emode' specifies the log level used for reporting "file not found" or
3095  * "end of WAL" situations in archive recovery, or in standby mode when a
3096  * trigger file is found. If set to WARNING or below, XLogPageRead() returns
3097  * XLREAD_FAIL in those situations; on higher log levels the ereport() won't
3098  * return.
3099  *
3100  * In standby mode, if after a successful return of XLogPageRead() the
3101  * caller finds the record it's interested in to be broken, it should
3102  * ereport the error with the level determined by
3103  * emode_for_corrupt_record(), and then set lastSourceFailed
3104  * and call XLogPageRead() again with the same arguments. This lets
3105  * XLogPageRead() try fetching the record from another source, or
3106  * sleep and retry.
3107  */
3108 static int
3110  XLogRecPtr targetRecPtr, char *readBuf)
3111 {
3112  XLogPageReadPrivate *private =
3114  int emode = private->emode;
3115  uint32 targetPageOff;
3116  XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
3117  int r;
3118 
3119  XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3120  targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3121 
3122  /*
3123  * See if we need to switch to a new segment because the requested record
3124  * is not in the currently open one.
3125  */
3126  if (readFile >= 0 &&
3127  !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3128  {
3129  /*
3130  * Request a restartpoint if we've replayed too much xlog since the
3131  * last one.
3132  */
3134  {
3136  {
3137  (void) GetRedoRecPtr();
3140  }
3141  }
3142 
3143  close(readFile);
3144  readFile = -1;
3146  }
3147 
3148  XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3149 
3150 retry:
3151  /* See if we need to retrieve more data */
3152  if (readFile < 0 ||
3154  flushedUpto < targetPagePtr + reqLen))
3155  {
3156  if (readFile >= 0 &&
3159  flushedUpto < targetPagePtr + reqLen)
3160  return XLREAD_WOULDBLOCK;
3161 
3162  switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3163  private->randAccess,
3164  private->fetching_ckpt,
3165  targetRecPtr,
3166  private->replayTLI,
3169  {
3170  case XLREAD_WOULDBLOCK:
3171  return XLREAD_WOULDBLOCK;
3172  case XLREAD_FAIL:
3173  if (readFile >= 0)
3174  close(readFile);
3175  readFile = -1;
3176  readLen = 0;
3178  return XLREAD_FAIL;
3179  case XLREAD_SUCCESS:
3180  break;
3181  }
3182  }
3183 
3184  /*
3185  * At this point, we have the right segment open and if we're streaming we
3186  * know the requested record is in it.
3187  */
3188  Assert(readFile != -1);
3189 
3190  /*
3191  * If the current segment is being streamed from the primary, calculate
3192  * how much of the current page we have received already. We know the
3193  * requested record has been received, but this is for the benefit of
3194  * future calls, to allow quick exit at the top of this function.
3195  */
3197  {
3198  if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3199  readLen = XLOG_BLCKSZ;
3200  else
3202  targetPageOff;
3203  }
3204  else
3205  readLen = XLOG_BLCKSZ;
3206 
3207  /* Read the requested page */
3208  readOff = targetPageOff;
3209 
3211  r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
3212  if (r != XLOG_BLCKSZ)
3213  {
3214  char fname[MAXFNAMELEN];
3215  int save_errno = errno;
3216 
3219  if (r < 0)
3220  {
3221  errno = save_errno;
3222  ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3224  errmsg("could not read from log segment %s, offset %u: %m",
3225  fname, readOff)));
3226  }
3227  else
3228  ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3230  errmsg("could not read from log segment %s, offset %u: read %d of %zu",
3231  fname, readOff, r, (Size) XLOG_BLCKSZ)));
3232  goto next_record_is_invalid;
3233  }
3235 
3236  Assert(targetSegNo == readSegNo);
3237  Assert(targetPageOff == readOff);
3238  Assert(reqLen <= readLen);
3239 
3241 
3242  /*
3243  * Check the page header immediately, so that we can retry immediately if
3244  * it's not valid. This may seem unnecessary, because ReadPageInternal()
3245  * validates the page header anyway, and would propagate the failure up to
3246  * ReadRecord(), which would retry. However, there's a corner case with
3247  * continuation records, if a record is split across two pages such that
3248  * we would need to read the two pages from different sources. For
3249  * example, imagine a scenario where a streaming replica is started up,
3250  * and replay reaches a record that's split across two WAL segments. The
3251  * first page is only available locally, in pg_wal, because it's already
3252  * been recycled on the primary. The second page, however, is not present
3253  * in pg_wal, and we should stream it from the primary. There is a
3254  * recycled WAL segment present in pg_wal, with garbage contents, however.
3255  * We would read the first page from the local WAL segment, but when
3256  * reading the second page, we would read the bogus, recycled, WAL
3257  * segment. If we didn't catch that case here, we would never recover,
3258  * because ReadRecord() would retry reading the whole record from the
3259  * beginning.
3260  *
3261  * Of course, this only catches errors in the page header, which is what
3262  * happens in the case of a recycled WAL segment. Other kinds of errors or
3263  * corruption still have the same problem. But this at least fixes the
3264  * common case, which can happen as part of normal operation.
3265  *
3266  * Validating the page header is cheap enough that doing it twice
3267  * shouldn't be a big deal from a performance point of view.
3268  *
3269  * When not in standby mode, an invalid page header should cause recovery
3270  * to end, not retry reading the page, so we don't need to validate the
3271  * page header here for the retry. Instead, ReadPageInternal() is
3272  * responsible for the validation.
3273  */
3274  if (StandbyMode &&
3275  !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3276  {
3277  /*
3278  * Emit this error right now then retry this page immediately. Use
3279  * errmsg_internal() because the message was already translated.
3280  */
3281  if (xlogreader->errormsg_buf[0])
3284 
3285  /* reset any error XLogReaderValidatePageHeader() might have set */
3286  xlogreader->errormsg_buf[0] = '\0';
3287  goto next_record_is_invalid;
3288  }
3289 
3290  return readLen;
3291 
3292 next_record_is_invalid:
3293  lastSourceFailed = true;
3294 
3295  if (readFile >= 0)
3296  close(readFile);
3297  readFile = -1;
3298  readLen = 0;
3300 
3301  /* In standby-mode, keep trying */
3302  if (StandbyMode)
3303  goto retry;
3304  else
3305  return XLREAD_FAIL;
3306 }
3307 
3308 /*
3309  * Open the WAL segment containing WAL location 'RecPtr'.
3310  *
3311  * The segment can be fetched via restore_command, or via walreceiver having
3312  * streamed the record, or it can already be present in pg_wal. Checking
3313  * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3314  * too, in case someone copies a new segment directly to pg_wal. That is not
3315  * documented or recommended, though.
3316  *
3317  * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3318  * prepare to read WAL starting from RedoStartLSN after this.
3319  *
3320  * 'RecPtr' might not point to the beginning of the record we're interested
3321  * in, it might also point to the page or segment header. In that case,
3322  * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3323  * used to decide which timeline to stream the requested WAL from.
3324  *
3325  * 'replayLSN' is the current replay LSN, so that if we scan for new
3326  * timelines, we can reject a switch to a timeline that branched off before
3327  * this point.
3328  *
3329  * If the record is not immediately available, the function returns false
3330  * if we're not in standby mode. In standby mode, waits for it to become
3331  * available.
3332  *
3333  * When the requested record becomes available, the function opens the file
3334  * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3335  * of standby mode is triggered by the user, and there is no more WAL
3336  * available, returns XLREAD_FAIL.
3337  *
3338  * If nonblocking is true, then give up immediately if we can't satisfy the
3339  * request, returning XLREAD_WOULDBLOCK instead of waiting.
3340  */
3341 static XLogPageReadResult
3342 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
3343  bool fetching_ckpt, XLogRecPtr tliRecPtr,
3344  TimeLineID replayTLI, XLogRecPtr replayLSN,
3345  bool nonblocking)
3346 {
3347  static TimestampTz last_fail_time = 0;
3348  TimestampTz now;
3349  bool streaming_reply_sent = false;
3350 
3351  /*-------
3352  * Standby mode is implemented by a state machine:
3353  *
3354  * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3355  * pg_wal (XLOG_FROM_PG_WAL)
3356  * 2. Check trigger file
3357  * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3358  * 4. Rescan timelines
3359  * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3360  *
3361  * Failure to read from the current source advances the state machine to
3362  * the next state.
3363  *
3364  * 'currentSource' indicates the current state. There are no currentSource
3365  * values for "check trigger", "rescan timelines", and "sleep" states,
3366  * those actions are taken when reading from the previous source fails, as
3367  * part of advancing to the next state.
3368  *
3369  * If standby mode is turned off while reading WAL from stream, we move
3370  * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3371  * the files (which would be required at end of recovery, e.g., timeline
3372  * history file) from archive or pg_wal. We don't need to kill WAL receiver
3373  * here because it's already stopped when standby mode is turned off at
3374  * the end of recovery.
3375  *-------
3376  */
3377  if (!InArchiveRecovery)
3379  else if (currentSource == XLOG_FROM_ANY ||
3381  {
3382  lastSourceFailed = false;
3384  }
3385 
3386  for (;;)
3387  {
3388  XLogSource oldSource = currentSource;
3389  bool startWalReceiver = false;
3390 
3391  /*
3392  * First check if we failed to read from the current source, and
3393  * advance the state machine if so. The failure to read might've
3394  * happened outside this function, e.g. when a CRC check fails on a
3395  * record, or within this loop.
3396  */
3397  if (lastSourceFailed)
3398  {
3399  /*
3400  * Don't allow any retry loops to occur during nonblocking
3401  * readahead. Let the caller process everything that has been
3402  * decoded already first.
3403  */
3404  if (nonblocking)
3405  return XLREAD_WOULDBLOCK;
3406 
3407  switch (currentSource)
3408  {
3409  case XLOG_FROM_ARCHIVE:
3410  case XLOG_FROM_PG_WAL:
3411 
3412  /*
3413  * Check to see if the trigger file exists. Note that we
3414  * do this only after failure, so when you create the
3415  * trigger file, we still finish replaying as much as we
3416  * can from archive and pg_wal before failover.
3417  */
3419  {
3421  return XLREAD_FAIL;
3422  }
3423 
3424  /*
3425  * Not in standby mode, and we've now tried the archive
3426  * and pg_wal.
3427  */
3428  if (!StandbyMode)
3429  return XLREAD_FAIL;
3430 
3431  /*
3432  * Move to XLOG_FROM_STREAM state, and set to start a
3433  * walreceiver if necessary.
3434  */
3436  startWalReceiver = true;
3437  break;
3438 
3439  case XLOG_FROM_STREAM:
3440 
3441  /*
3442  * Failure while streaming. Most likely, we got here
3443  * because streaming replication was terminated, or
3444  * promotion was triggered. But we also get here if we
3445  * find an invalid record in the WAL streamed from the
3446  * primary, in which case something is seriously wrong.
3447  * There's little chance that the problem will just go
3448  * away, but PANIC is not good for availability either,
3449  * especially in hot standby mode. So, we treat that the
3450  * same as disconnection, and retry from archive/pg_wal
3451  * again. The WAL in the archive should be identical to
3452  * what was streamed, so it's unlikely that it helps, but
3453  * one can hope...
3454  */
3455 
3456  /*
3457  * We should be able to move to XLOG_FROM_STREAM only in
3458  * standby mode.
3459  */
3461 
3462  /*
3463  * Before we leave XLOG_FROM_STREAM state, make sure that
3464  * walreceiver is not active, so that it won't overwrite
3465  * WAL that we restore from archive.
3466  */
3467  if (WalRcvStreaming())
3469 
3470  /*
3471  * Before we sleep, re-scan for possible new timelines if
3472  * we were requested to recover to the latest timeline.
3473  */
3475  {
3476  if (rescanLatestTimeLine(replayTLI, replayLSN))
3477  {
3479  break;
3480  }
3481  }
3482 
3483  /*
3484  * XLOG_FROM_STREAM is the last state in our state
3485  * machine, so we've exhausted all the options for
3486  * obtaining the requested WAL. We're going to loop back
3487  * and retry from the archive, but if it hasn't been long
3488  * since last attempt, sleep wal_retrieve_retry_interval
3489  * milliseconds to avoid busy-waiting.
3490  */
3492  if (!TimestampDifferenceExceeds(last_fail_time, now,
3494  {
3495  long wait_time;
3496 
3497  wait_time = wal_retrieve_retry_interval -
3498  TimestampDifferenceMilliseconds(last_fail_time, now);
3499 
3500  elog(LOG, "waiting for WAL to become available at %X/%X",
3501  LSN_FORMAT_ARGS(RecPtr));
3502 
3506  wait_time,
3510 
3511  /* Handle interrupt signals of startup process */
3513  }
3514  last_fail_time = now;
3516  break;
3517 
3518  default:
3519  elog(ERROR, "unexpected WAL source %d", currentSource);
3520  }
3521  }
3522  else if (currentSource == XLOG_FROM_PG_WAL)
3523  {
3524  /*
3525  * We just successfully read a file in pg_wal. We prefer files in
3526  * the archive over ones in pg_wal, so try the next file again
3527  * from the archive first.
3528  */
3529  if (InArchiveRecovery)
3531  }
3532 
3533  if (currentSource != oldSource)
3534  elog(DEBUG2, "switched WAL source from %s to %s after %s",
3536  lastSourceFailed ? "failure" : "success");
3537 
3538  /*
3539  * We've now handled possible failure. Try to read from the chosen
3540  * source.
3541  */
3542  lastSourceFailed = false;
3543 
3544  switch (currentSource)
3545  {
3546  case XLOG_FROM_ARCHIVE:
3547  case XLOG_FROM_PG_WAL:
3548 
3549  /*
3550  * WAL receiver must not be running when reading WAL from
3551  * archive or pg_wal.
3552  */
3553  Assert(!WalRcvStreaming());
3554 
3555  /* Close any old file we might have open. */
3556  if (readFile >= 0)
3557  {
3558  close(readFile);
3559  readFile = -1;
3560  }
3561  /* Reset curFileTLI if random fetch. */
3562  if (randAccess)
3563  curFileTLI = 0;
3564 
3565  /*
3566  * Try to restore the file from archive, or read an existing
3567  * file from pg_wal.
3568  */
3571  currentSource);
3572  if (readFile >= 0)
3573  return XLREAD_SUCCESS; /* success! */
3574 
3575  /*
3576  * Nope, not found in archive or pg_wal.
3577  */
3578  lastSourceFailed = true;
3579  break;
3580 
3581  case XLOG_FROM_STREAM:
3582  {
3583  bool havedata;
3584 
3585  /*
3586  * We should be able to move to XLOG_FROM_STREAM only in
3587  * standby mode.
3588  */
3590 
3591  /*
3592  * First, shutdown walreceiver if its restart has been
3593  * requested -- but no point if we're already slated for
3594  * starting it.
3595  */
3596  if (pendingWalRcvRestart && !startWalReceiver)
3597  {
3599 
3600  /*
3601  * Re-scan for possible new timelines if we were
3602  * requested to recover to the latest timeline.
3603  */
3606  rescanLatestTimeLine(replayTLI, replayLSN);
3607 
3608  startWalReceiver = true;
3609  }
3610  pendingWalRcvRestart = false;
3611 
3612  /*
3613  * Launch walreceiver if needed.
3614  *
3615  * If fetching_ckpt is true, RecPtr points to the initial
3616  * checkpoint location. In that case, we use RedoStartLSN
3617  * as the streaming start position instead of RecPtr, so
3618  * that when we later jump backwards to start redo at
3619  * RedoStartLSN, we will have the logs streamed already.
3620  */
3621  if (startWalReceiver &&
3622  PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
3623  {
3624  XLogRecPtr ptr;
3625  TimeLineID tli;
3626 
3627  if (fetching_ckpt)
3628  {
3629  ptr = RedoStartLSN;
3630  tli = RedoStartTLI;
3631  }
3632  else
3633  {
3634  ptr = RecPtr;
3635 
3636  /*
3637  * Use the record begin position to determine the
3638  * TLI, rather than the position we're reading.
3639  */
3640  tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
3641 
3642  if (curFileTLI > 0 && tli < curFileTLI)
3643  elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3644  LSN_FORMAT_ARGS(tliRecPtr),
3645  tli, curFileTLI);
3646  }
3647  curFileTLI = tli;
3652  flushedUpto = 0;
3653  }
3654 
3655  /*
3656  * Check if WAL receiver is active or wait to start up.
3657  */
3658  if (!WalRcvStreaming())
3659  {
3660  lastSourceFailed = true;
3661  break;
3662  }
3663 
3664  /*
3665  * Walreceiver is active, so see if new data has arrived.
3666  *
3667  * We only advance XLogReceiptTime when we obtain fresh
3668  * WAL from walreceiver and observe that we had already
3669  * processed everything before the most recent "chunk"
3670  * that it flushed to disk. In steady state where we are
3671  * keeping up with the incoming data, XLogReceiptTime will
3672  * be updated on each cycle. When we are behind,
3673  * XLogReceiptTime will not advance, so the grace time
3674  * allotted to conflicting queries will decrease.
3675  */
3676  if (RecPtr < flushedUpto)
3677  havedata = true;
3678  else
3679  {
3680  XLogRecPtr latestChunkStart;
3681 
3682  flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3683  if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
3684  {
3685  havedata = true;
3686  if (latestChunkStart <= RecPtr)
3687  {
3690  }
3691  }
3692  else
3693  havedata = false;
3694  }
3695  if (havedata)
3696  {
3697  /*
3698  * Great, streamed far enough. Open the file if it's
3699  * not open already. Also read the timeline history
3700  * file if we haven't initialized timeline history
3701  * yet; it should be streamed over and present in
3702  * pg_wal by now. Use XLOG_FROM_STREAM so that source
3703  * info is set correctly and XLogReceiptTime isn't
3704  * changed.
3705  *
3706  * NB: We must set readTimeLineHistory based on
3707  * recoveryTargetTLI, not receiveTLI. Normally they'll
3708  * be the same, but if recovery_target_timeline is
3709  * 'latest' and archiving is configured, then it's
3710  * possible that we managed to retrieve one or more
3711  * new timeline history files from the archive,
3712  * updating recoveryTargetTLI.
3713  */
3714  if (readFile < 0)
3715  {
3716  if (!expectedTLEs)
3719  receiveTLI,
3720  XLOG_FROM_STREAM, false);
3721  Assert(readFile >= 0);
3722  }
3723  else
3724  {
3725  /* just make sure source info is correct... */
3728  return XLREAD_SUCCESS;
3729  }
3730  break;
3731  }
3732 
3733  /* In nonblocking mode, return rather than sleeping. */
3734  if (nonblocking)
3735  return XLREAD_WOULDBLOCK;
3736 
3737  /*
3738  * Data not here yet. Check for trigger, then wait for
3739  * walreceiver to wake us up when new WAL arrives.
3740  */
3741  if (CheckForStandbyTrigger())
3742  {
3743  /*
3744  * Note that we don't return XLREAD_FAIL immediately
3745  * here. After being triggered, we still want to
3746  * replay all the WAL that was already streamed. It's
3747  * in pg_wal now, so we just treat this as a failure,
3748  * and the state machine will move on to replay the
3749  * streamed WAL from pg_wal, and then recheck the
3750  * trigger and exit replay.
3751  */
3752  lastSourceFailed = true;
3753  break;
3754  }
3755 
3756  /*
3757  * Since we have replayed everything we have received so
3758  * far and are about to start waiting for more WAL, let's
3759  * tell the upstream server our replay location now so
3760  * that pg_stat_replication doesn't show stale
3761  * information.
3762  */
3763  if (!streaming_reply_sent)
3764  {
3765  WalRcvForceReply();
3766  streaming_reply_sent = true;
3767  }
3768 
3769  /* Update pg_stat_recovery_prefetch before sleeping. */
3771 
3772  /*
3773  * Wait for more WAL to arrive. Time out after 5 seconds
3774  * to react to a trigger file promptly and to check if the
3775  * WAL receiver is still active.
3776  */
3782  break;
3783  }
3784 
3785  default:
3786  elog(ERROR, "unexpected WAL source %d", currentSource);
3787  }
3788 
3789  /*
3790  * Check for recovery pause here so that we can confirm more quickly
3791  * that a requested pause has actually taken effect.
3792  */
3793  if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
3795  recoveryPausesHere(false);
3796 
3797  /*
3798  * This possibly-long loop needs to handle interrupts of startup
3799  * process.
3800  */
3802  }
3803 
3804  return XLREAD_FAIL; /* not reached */
3805 }
3806 
3807 
3808 /*
3809  * Determine what log level should be used to report a corrupt WAL record
3810  * in the current WAL page, previously read by XLogPageRead().
3811  *
3812  * 'emode' is the error mode that would be used to report a file-not-found
3813  * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
3814  * we're retrying the exact same record that we've tried previously, only
3815  * complain the first time to keep the noise down. However, we only do this when
3816  * reading from pg_wal, because we don't expect any invalid records in archive
3817  * or in records streamed from the primary. Files in the archive should be complete,
3818  * and we should never hit the end of WAL because we stop and wait for more WAL
3819  * to arrive before replaying it.
3820  *
3821  * NOTE: This function remembers the RecPtr value it was last called with,
3822  * to suppress repeated messages about the same record. Only call this when
3823  * you are about to ereport(), or you might cause a later message to be
3824  * erroneously suppressed.
3825  */
3826 static int
3828 {
3829  static XLogRecPtr lastComplaint = 0;
3830 
3831  if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
3832  {
3833  if (RecPtr == lastComplaint)
3834  emode = DEBUG1;
3835  else
3836  lastComplaint = RecPtr;
3837  }
3838  return emode;
3839 }
3840 
3841 
3842 /*
3843  * Subroutine to try to fetch and validate a prior checkpoint record.
3844  *
3845  * whichChkpt identifies the checkpoint (merely for reporting purposes).
3846  * 1 for "primary", 0 for "other" (backup_label)
3847  */
3848 static XLogRecord *
3850  int whichChkpt, bool report, TimeLineID replayTLI)
3851 {
3852  XLogRecord *record;
3853  uint8 info;
3854 
3855  Assert(xlogreader != NULL);
3856 
3857  if (!XRecOffIsValid(RecPtr))
3858  {
3859  if (!report)
3860  return NULL;
3861 
3862  switch (whichChkpt)
3863  {
3864  case 1:
3865  ereport(LOG,
3866  (errmsg("invalid primary checkpoint link in control file")));
3867  break;
3868  default:
3869  ereport(LOG,
3870  (errmsg("invalid checkpoint link in backup_label file")));
3871  break;
3872  }
3873  return NULL;
3874  }
3875 
3877  record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
3878 
3879  if (record == NULL)
3880  {
3881  if (!report)
3882  return NULL;
3883 
3884  switch (whichChkpt)
3885  {
3886  case 1:
3887  ereport(LOG,
3888  (errmsg("invalid primary checkpoint record")));
3889  break;
3890  default:
3891  ereport(LOG,
3892  (errmsg("invalid checkpoint record")));
3893  break;
3894  }
3895  return NULL;
3896  }
3897  if (record->xl_rmid != RM_XLOG_ID)
3898  {
3899  switch (whichChkpt)
3900  {
3901  case 1:
3902  ereport(LOG,
3903  (errmsg("invalid resource manager ID in primary checkpoint record")));
3904  break;
3905  default:
3906  ereport(LOG,
3907  (errmsg("invalid resource manager ID in checkpoint record")));
3908  break;
3909  }
3910  return NULL;
3911  }
3912  info = record->xl_info & ~XLR_INFO_MASK;
3913  if (info != XLOG_CHECKPOINT_SHUTDOWN &&
3914  info != XLOG_CHECKPOINT_ONLINE)
3915  {
3916  switch (whichChkpt)
3917  {
3918  case 1:
3919  ereport(LOG,
3920  (errmsg("invalid xl_info in primary checkpoint record")));
3921  break;
3922  default:
3923  ereport(LOG,
3924  (errmsg("invalid xl_info in checkpoint record")));
3925  break;
3926  }
3927  return NULL;
3928  }
3930  {
3931  switch (whichChkpt)
3932  {
3933  case 1:
3934  ereport(LOG,
3935  (errmsg("invalid length of primary checkpoint record")));
3936  break;
3937  default:
3938  ereport(LOG,
3939  (errmsg("invalid length of checkpoint record")));
3940  break;
3941  }
3942  return NULL;
3943  }
3944  return record;
3945 }
3946 
3947 /*
3948  * Scan for new timelines that might have appeared in the archive since we
3949  * started recovery.
3950  *
3951  * If there are any, the function changes recovery target TLI to the latest
3952  * one and returns 'true'.
3953  */
3954 static bool
3956 {
3957  List *newExpectedTLEs;
3958  bool found;
3959  ListCell *cell;
3960  TimeLineID newtarget;
3961  TimeLineID oldtarget = recoveryTargetTLI;
3962  TimeLineHistoryEntry *currentTle = NULL;
3963 
3965  if (newtarget == recoveryTargetTLI)
3966  {
3967  /* No new timelines found */
3968  return false;
3969  }
3970 
3971  /*
3972  * Determine the list of expected TLIs for the new TLI
3973  */
3974 
3975  newExpectedTLEs = readTimeLineHistory(newtarget);
3976 
3977  /*
3978  * If the current timeline is not part of the history of the new timeline,
3979  * we cannot proceed to it.
3980  */
3981  found = false;
3982  foreach(cell, newExpectedTLEs)
3983  {
3984  currentTle = (TimeLineHistoryEntry *) lfirst(cell);
3985 
3986  if (currentTle->tli == recoveryTargetTLI)
3987  {
3988  found = true;
3989  break;
3990  }
3991  }
3992  if (!found)
3993  {
3994  ereport(LOG,
3995  (errmsg("new timeline %u is not a child of database system timeline %u",
3996  newtarget,
3997  replayTLI)));
3998  return false;
3999  }
4000 
4001  /*
4002  * The current timeline was found in the history file, but check that the
4003  * next timeline was forked off from it *after* the current recovery
4004  * location.
4005  */
4006  if (currentTle->end < replayLSN)
4007  {
4008  ereport(LOG,
4009  (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4010  newtarget,
4011  replayTLI,
4012  LSN_FORMAT_ARGS(replayLSN))));
4013  return false;
4014  }
4015 
4016  /* The new timeline history seems valid. Switch target */
4017  recoveryTargetTLI = newtarget;
4019  expectedTLEs = newExpectedTLEs;
4020 
4021  /*
4022  * As in StartupXLOG(), try to ensure we have all the history files
4023  * between the old target and new target in pg_wal.
4024  */
4025  restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4026 
4027  ereport(LOG,
4028  (errmsg("new target timeline is %u",
4029  recoveryTargetTLI)));
4030 
4031  return true;
4032 }
4033 
4034 
4035 /*
4036  * Open a logfile segment for reading (during recovery).
4037  *
4038  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4039  * Otherwise, it's assumed to be already available in pg_wal.
4040  */
4041 static int
4042 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
4043  XLogSource source, bool notfoundOk)
4044 {
4045  char xlogfname[MAXFNAMELEN];
4046  char activitymsg[MAXFNAMELEN + 16];
4047  char path[MAXPGPATH];
4048  int fd;
4049 
4050  XLogFileName(xlogfname, tli, segno, wal_segment_size);
4051 
4052  switch (source)
4053  {
4054  case XLOG_FROM_ARCHIVE:
4055  /* Report recovery progress in PS display */
4056  snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4057  xlogfname);
4058  set_ps_display(activitymsg);
4059 
4060  if (!RestoreArchivedFile(path, xlogfname,
4061  "RECOVERYXLOG",
4063  InRedo))
4064  return -1;
4065  break;
4066 
4067  case XLOG_FROM_PG_WAL:
4068  case XLOG_FROM_STREAM:
4069  XLogFilePath(path, tli, segno, wal_segment_size);
4070  break;
4071 
4072  default:
4073  elog(ERROR, "invalid XLogFileRead source %d", source);
4074  }
4075 
4076  /*
4077  * If the segment was fetched from archival storage, replace the existing
4078  * xlog segment (if any) with the archival version.
4079  */
4080  if (source == XLOG_FROM_ARCHIVE)
4081  {
4083  KeepFileRestoredFromArchive(path, xlogfname);
4084 
4085  /*
4086  * Set path to point at the new file in pg_wal.
4087  */
4088  snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4089  }
4090 
4091  fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4092  if (fd >= 0)
4093  {
4094  /* Success! */
4095  curFileTLI = tli;
4096 
4097  /* Report recovery progress in PS display */
4098  snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4099  xlogfname);
4100  set_ps_display(activitymsg);
4101 
4102  /* Track source of data in assorted state variables */
4103  readSource = source;
4105  /* In FROM_STREAM case, caller tracks receipt time, not me */
4106  if (source != XLOG_FROM_STREAM)
4108 
4109  return fd;
4110  }
4111  if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4112  ereport(PANIC,
4114  errmsg("could not open file \"%s\": %m", path)));
4115  return -1;
4116 }
4117 
4118 /*
4119  * Open a logfile segment for reading (during recovery).
4120  *
4121  * This version searches for the segment with any TLI listed in expectedTLEs.
4122  */
4123 static int
4125 {
4126  char path[MAXPGPATH];
4127  ListCell *cell;
4128  int fd;
4129  List *tles;
4130 
4131  /*
4132  * Loop looking for a suitable timeline ID: we might need to read any of
4133  * the timelines listed in expectedTLEs.
4134  *
4135  * We expect curFileTLI on entry to be the TLI of the preceding file in
4136  * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
4137  * to go backwards; this prevents us from picking up the wrong file when a
4138  * parent timeline extends to higher segment numbers than the child we
4139  * want to read.
4140  *
4141  * If we haven't read the timeline history file yet, read it now, so that
4142  * we know which TLIs to scan. We don't save the list in expectedTLEs,
4143  * however, unless we actually find a valid segment. That way if there is
4144  * neither a timeline history file nor a WAL segment in the archive, and
4145  * streaming replication is set up, we'll read the timeline history file
4146  * streamed from the primary when we start streaming, instead of
4147  * recovering with a dummy history generated here.
4148  */
4149  if (expectedTLEs)
4150  tles = expectedTLEs;
4151  else
4153 
4154  foreach(cell, tles)
4155  {
4157  TimeLineID tli = hent->tli;
4158 
4159  if (tli < curFileTLI)
4160  break; /* don't bother looking at too-old TLIs */
4161 
4162  /*
4163  * Skip scanning the timeline ID that the logfile segment to read
4164  * doesn't belong to
4165  */
4166  if (hent->begin != InvalidXLogRecPtr)
4167  {
4168  XLogSegNo beginseg = 0;
4169 
4170  XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4171 
4172  /*
4173  * The logfile segment that doesn't belong to the timeline is
4174  * older or newer than the segment that the timeline started or
4175  * ended at, respectively. It's sufficient to check only the
4176  * starting segment of the timeline here. Since the timelines are
4177  * scanned in descending order in this loop, any segments newer
4178  * than the ending segment should belong to newer timeline and
4179  * have already been read before. So it's not necessary to check
4180  * the ending segment of the timeline here.
4181  */
4182  if (segno < beginseg)
4183  continue;
4184  }
4185 
4187  {
4188  fd = XLogFileRead(segno, emode, tli,
4189  XLOG_FROM_ARCHIVE, true);
4190  if (fd != -1)
4191  {
4192  elog(DEBUG1, "got WAL segment from archive");
4193  if (!expectedTLEs)
4194  expectedTLEs = tles;
4195  return fd;
4196  }
4197  }
4198 
4200  {
4201  fd = XLogFileRead(segno, emode, tli,
4202  XLOG_FROM_PG_WAL, true);
4203  if (fd != -1)
4204  {
4205  if (!expectedTLEs)
4206  expectedTLEs = tles;
4207  return fd;
4208  }
4209  }
4210  }
4211 
4212  /* Couldn't find it. For simplicity, complain about front timeline */
4214  errno = ENOENT;
4215  ereport(emode,
4217  errmsg("could not open file \"%s\": %m", path)));
4218  return -1;
4219 }
4220 
4221 /*
4222  * Set flag to signal the walreceiver to restart. (The startup process calls
4223  * this on noticing a relevant configuration change.)
4224  */
4225 void
4227 {
4229  {
4230  ereport(LOG,
4231  (errmsg("WAL receiver process shutdown requested")));
4232 
4233  pendingWalRcvRestart = true;
4234  }
4235 }
4236 
4237 
4238 /*
4239  * Has a standby promotion already been triggered?
4240  *
4241  * Unlike CheckForStandbyTrigger(), this works in any process
4242  * that's connected to shared memory.
4243  */
4244 bool
4246 {
4247  /*
4248  * We check shared state each time only until a standby promotion is
4249  * triggered. We can't trigger a promotion again, so there's no need to
4250  * keep checking after the shared variable has once been seen true.
4251  */
4253  return true;
4254 
4258 
4259  return LocalPromoteIsTriggered;
4260 }
4261 
4262 static void
4264 {
4268 
4269  /*
4270  * Mark the recovery pause state as 'not paused' because the paused state
4271  * ends and promotion continues if a promotion is triggered while recovery
4272  * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4273  * return 'paused' while a promotion is ongoing.
4274  */
4275  SetRecoveryPause(false);
4276 
4277  LocalPromoteIsTriggered = true;
4278 }
4279 
4280 /*
4281  * Check to see whether the user-specified trigger file exists and whether a
4282  * promote request has arrived. If either condition holds, return true.
4283  */
4284 static bool
4286 {
4287  struct stat stat_buf;
4288 
4290  return true;
4291 
4293  {
4294  ereport(LOG, (errmsg("received promote request")));
4298  return true;
4299  }
4300 
4301  if (PromoteTriggerFile == NULL || strcmp(PromoteTriggerFile, "") == 0)
4302  return false;
4303 
4304  if (stat(PromoteTriggerFile, &stat_buf) == 0)
4305  {
4306  ereport(LOG,
4307  (errmsg("promote trigger file found: %s", PromoteTriggerFile)));
4308  unlink(PromoteTriggerFile);
4310  return true;
4311  }
4312  else if (errno != ENOENT)
4313  ereport(ERROR,
4315  errmsg("could not stat promote trigger file \"%s\": %m",
4316  PromoteTriggerFile)));
4317 
4318  return false;
4319 }
4320 
4321 /*
4322  * Remove the file (PROMOTE_SIGNAL_FILE) signaling a standby promotion request.
4323  */
4324 void
4326 {
4327  unlink(PROMOTE_SIGNAL_FILE);
4328 }
4329 
4330 /*
4331  * Check to see if a promote request has arrived.
4332  */
4333 bool
4335 {
4336  struct stat stat_buf;
4337 
4338  if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4339  return true;
4340 
4341  return false;
4342 }
4343 
4344 /*
4345  * Wake up startup process to replay newly arrived WAL, or to notice that
4346  * failover has been requested.
4347  */
4348 void
4350 {
4352 }
4353 
4354 /*
4355  * Schedule a walreceiver wakeup in the main recovery loop.
4356  */
4357 void
4359 {
4361 }
4362 
4363 /*
4364  * Is HotStandby active yet? This is only important in special backends
4365  * since normal backends won't ever be able to connect until this returns
4366  * true. Postmaster knows this by way of signal, not via shared memory.
4367  *
4368  * Unlike testing standbyState, this works in any process that's connected to
4369  * shared memory. (And note that standbyState alone doesn't tell the truth
4370  * anyway.)
4371  */
4372 bool
4374 {
4375  /*
4376  * We check shared state each time only until Hot Standby is active. We
4377  * can't de-activate Hot Standby, so there's no need to keep checking
4378  * after the shared variable has once been seen true.
4379  */
4381  return true;
4382  else
4383  {
4384  /* spinlock is essential on machines with weak memory ordering! */
4388 
4389  return LocalHotStandbyActive;
4390  }
4391 }
4392 
4393 /*
4394  * Like HotStandbyActive(), but to be used only in WAL replay code,
4395  * where we don't need to ask any other process what the state is.
4396  */
4397 static bool
4399 {
4401  return LocalHotStandbyActive;
4402 }
4403 
4404 /*
4405  * Get latest redo apply position.
4406  *
4407  * Exported to allow WALReceiver to read the pointer directly.
4408  */
4409 XLogRecPtr
4411 {
4412  XLogRecPtr recptr;
4413  TimeLineID tli;
4414 
4419 
4420  if (replayTLI)
4421  *replayTLI = tli;
4422  return recptr;
4423 }
4424 
4425 
4426 /*
4427  * Get position of last applied, or the record being applied.
4428  *
4429  * This is different from GetXLogReplayRecPtr() in that if a WAL
4430  * record is currently being applied, this includes that record.
4431  */
4432 XLogRecPtr
4434 {
4435  XLogRecPtr recptr;
4436  TimeLineID tli;
4437 
4439  recptr = XLogRecoveryCtl->replayEndRecPtr;
4442 
4443  if (replayEndTLI)
4444  *replayEndTLI = tli;
4445  return recptr;
4446 }
4447 
4448 /*
4449  * Save timestamp of latest processed commit/abort record.
4450  *
4451  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4452  * seen by processes other than the startup process. Note in particular
4453  * that CreateRestartPoint is executed in the checkpointer.
4454  */
4455 static void
4457 {
4461 }
4462 
4463 /*
4464  * Fetch timestamp of latest processed commit/abort record.
4465  */
4468 {
4469  TimestampTz xtime;
4470 
4474 
4475  return xtime;
4476 }
4477 
4478 /*
4479  * Save timestamp of the next chunk of WAL records to apply.
4480  *
4481  * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4482  * seen by all backends.
4483  */
4484 static void
4486 {
4490 }
4491 
4492 /*
4493  * Fetch timestamp of latest processed commit/abort record.
4494  * Startup process maintains an accurate local copy in XLogReceiptTime
4495  */
4498 {
4499  TimestampTz xtime;
4500 
4504 
4505  return xtime;
4506 }
4507 
4508 /*
4509  * Returns time of receipt of current chunk of XLOG data, as well as
4510  * whether it was received from streaming replication or from archives.
      *
      * 'rtime' is an output parameter; it is set to the current value of
      * XLogReceiptTime.
      *
      * 'fromStream' is an output parameter; it is set to true if
      * XLogReceiptSource is XLOG_FROM_STREAM, and to false otherwise.
      *
      * Both pointers are written through unconditionally, so the caller must
      * pass valid (non-NULL) pointers for both.
4511  */
4512 void
4513 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4514 {
4515  /*
4516  * This must be executed in the startup process, since we don't export the
4517  * relevant state to shared memory.
4518  */
4519  Assert(InRecovery);
4520 
4521  *rtime = XLogReceiptTime;
4522  *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4523 }
4524 
4525 /*
4526  * Note that text field supplied is a parameter name and does not require
4527  * translation
4528  */
4529 void
4530 RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4531 {
4532  if (currValue < minValue)
4533  {
4535  {
4536  bool warned_for_promote = false;
4537 
4538  ereport(WARNING,
4539  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4540  errmsg("hot standby is not possible because of insufficient parameter settings"),
4541  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4542  param_name,
4543  currValue,
4544  minValue)));
4545 
4546  SetRecoveryPause(true);
4547 
4548  ereport(LOG,
4549  (errmsg("recovery has paused"),
4550  errdetail("If recovery is unpaused, the server will shut down."),
4551  errhint("You can then restart the server after making the necessary configuration changes.")));
4552 
4554  {
4556 
4557  if (CheckForStandbyTrigger())
4558  {
4559  if (!warned_for_promote)
4560  ereport(WARNING,
4561  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4562  errmsg("promotion is not possible because of insufficient parameter settings"),
4563 
4564  /*
4565  * Repeat the detail from above so it's easy to find
4566  * in the log.
4567  */
4568  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4569  param_name,
4570  currValue,
4571  minValue),
4572  errhint("Restart the server after making the necessary configuration changes.")));
4573  warned_for_promote = true;
4574  }
4575 
4576  /*
4577  * If recovery pause is requested then set it paused. While
4578  * we are in the loop, user might resume and pause again so
4579  * set this every time.
4580  */
4582 
4583  /*
4584  * We wait on a condition variable that will wake us as soon
4585  * as the pause ends, but we use a timeout so we can check the
4586  * above conditions periodically too.
4587  */
4590  }
4592  }
4593 
4594  ereport(FATAL,
4595  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4596  errmsg("recovery aborted because of insufficient parameter settings"),
4597  /* Repeat the detail from above so it's easy to find in the log. */
4598  errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4599  param_name,
4600  currValue,
4601  minValue),
4602  errhint("You can restart the server after making the necessary configuration changes.")));
4603  }
4604 }
TimeLineID findNewestTimeLine(TimeLineID startTLI)
Definition: timeline.c:264
TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history)
Definition: timeline.c:552
XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI)
Definition: timeline.c:580
bool existsTimeLineHistory(TimeLineID probeTLI)
Definition: timeline.c:222
void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end)
Definition: timeline.c:50
List * readTimeLineHistory(TimeLineID targetTLI)
Definition: timeline.c:76
bool tliInHistory(TimeLineID tli, List *expectedTLEs)
Definition: timeline.c:534
void remove_tablespace_symlink(const char *linkloc)
Definition: tablespace.c:929
void HandleStartupProcInterrupts(void)
Definition: startup.c:168
bool IsPromoteSignaled(void)
Definition: startup.c:297
void begin_startup_progress_phase(void)
Definition: startup.c:321
void ResetPromoteSignaled(void)
Definition: startup.c:303
long TimestampDifferenceMilliseconds(TimestampTz start_time, TimestampTz stop_time)
Definition: timestamp.c:1687
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1705
Datum timestamptz_in(PG_FUNCTION_ARGS)
Definition: timestamp.c:404
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1574
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1538
const char * timestamptz_to_str(TimestampTz t)
Definition: timestamp.c:1768
uint32 BlockNumber
Definition: block.h:31
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3938
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:4156
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:98
@ RBM_NORMAL_NO_LOG
Definition: bufmgr.h:45
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
Pointer Page
Definition: bufpage.h:78
#define PageGetLSN(page)
Definition: bufpage.h:365
unsigned int uint32
Definition: c.h:441
signed int int32
Definition: c.h:429
#define PG_BINARY
Definition: c.h:1268
#define UINT64_FORMAT
Definition: c.h:484
unsigned char uint8
Definition: c.h:439
uint32 TransactionId
Definition: c.h:587
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:155
size_t Size
Definition: c.h:540
void RequestCheckpoint(int flags)
Definition: checkpointer.c:931
bool ConditionVariableTimedSleep(ConditionVariable *cv, long timeout, uint32 wait_event_info)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariableInit(ConditionVariable *cv)
void ConditionVariableCancelSleep(void)
int64 TimestampTz
Definition: timestamp.h:39
int errmsg_internal(const char *fmt,...)
Definition: elog.c:991
int errcode_for_file_access(void)
Definition: elog.c:716
int errdetail(const char *fmt,...)
Definition: elog.c:1037
ErrorContextCallback * error_context_stack
Definition: elog.c:93
int errhint(const char *fmt,...)
Definition: elog.c:1151
int errcode(int sqlerrcode)
Definition: elog.c:693
int errmsg(const char *fmt,...)
Definition: elog.c:904
#define LOG
Definition: elog.h:25
#define errcontext
Definition: elog.h:190
#define DEBUG3
Definition: elog.h:22
#define FATAL
Definition: elog.h:35
#define WARNING
Definition: elog.h:30
#define DEBUG2
Definition: elog.h:23
#define PANIC
Definition: elog.h:36
#define DEBUG1
Definition: elog.h:24
#define ERROR
Definition: elog.h:33
#define elog(elevel,...)
Definition: elog.h:218
#define ereport(elevel,...)
Definition: elog.h:143
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2461
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1093
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:699
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1071
int FreeFile(FILE *file)
Definition: fd.c:2660
int pg_fsync(int fd)
Definition: fd.c:359
#define DirectFunctionCall3(func, arg1, arg2, arg3)
Definition: fmgr.h:635
bool IsUnderPostmaster
Definition: globals.c:113
char * DataDir
Definition: globals.c:66
bool IsPostmasterEnvironment
Definition: globals.c:112
int trace_recovery_messages
Definition: guc.c:645
#define close(a)
Definition: win32.h:12
void proc_exit(int code)
Definition: ipc.c:104
int i
Definition: isn.c:73
void OwnLatch(Latch *latch)
Definition: latch.c:422
void DisownLatch(Latch *latch)
Definition: latch.c:448
void InitSharedLatch(Latch *latch)
Definition: latch.c:389
void SetLatch(Latch *latch)
Definition: latch.c:591
void ResetLatch(Latch *latch)
Definition: latch.c:683
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition: latch.c:476
#define WL_TIMEOUT
Definition: latch.h:128
#define WL_EXIT_ON_PM_DEATH
Definition: latch.h:130
#define WL_LATCH_SET
Definition: latch.h:125
Assert(fmt[strlen(fmt) - 1] !='\n')
List * lappend(List *list, void *datum)
Definition: list.c:336
void list_free_deep(List *list)
Definition: list.c:1519
char * pstrdup(const char *in)
Definition: mcxt.c:1305
void pfree(void *pointer)
Definition: mcxt.c:1175
void * palloc0(Size size)
Definition: mcxt.c:1099
void * palloc(Size size)
Definition: mcxt.c:1068
#define AmStartupProcess()
Definition: miscadmin.h:444
#define IsBootstrapProcessingMode()
Definition: miscadmin.h:406
void * arg
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:43
#define MAXPGPATH
#define XLOG_RESTORE_POINT
Definition: pg_control.h:74
#define XLOG_OVERWRITE_CONTRECORD
Definition: pg_control.h:80
DBState
Definition: pg_control.h:88
@ DB_IN_ARCHIVE_RECOVERY
Definition: pg_control.h:94
@ DB_SHUTDOWNED_IN_RECOVERY
Definition: pg_control.h:91
@ DB_SHUTDOWNED
Definition: pg_control.h:90
@ DB_IN_CRASH_RECOVERY
Definition: pg_control.h:93
#define XLOG_CHECKPOINT_SHUTDOWN
Definition: pg_control.h:67
#define XLOG_BACKUP_END
Definition: pg_control.h:72
#define XLOG_CHECKPOINT_ONLINE
Definition: pg_control.h:68
#define XLOG_END_OF_RECOVERY
Definition: pg_control.h:76
const void size_t len
#define lfirst(lc)
Definition: pg_list.h:169
#define NIL
Definition: pg_list.h:65
static rewind_source * source
Definition: pg_rewind.c:81
const char * pg_rusage_show(const PGRUsage *ru0)
Definition: pg_rusage.c:40
void pg_rusage_init(PGRUsage *ru0)
Definition: pg_rusage.c:27
static char * buf
Definition: pg_test_fsync.c:67
void SendPostmasterSignal(PMSignalReason reason)
Definition: pmsignal.c:153
@ PMSIGNAL_RECOVERY_STARTED
Definition: pmsignal.h:35
@ PMSIGNAL_BEGIN_HOT_STANDBY
Definition: pmsignal.h:36
ssize_t pg_pread(int fd, void *buf, size_t nbyte, off_t offset)
Definition: pread.c:27
#define snprintf
Definition: port.h:225
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
#define CStringGetDatum(X)
Definition: postgres.h:622
#define ObjectIdGetDatum(X)
Definition: postgres.h:551
#define Int32GetDatum(X)
Definition: postgres.h:523
#define InvalidOid
Definition: postgres_ext.h:36
static int fd(const char *x, int i)
Definition: preproc-init.c:105
void RecordKnownAssignedTransactionIds(TransactionId xid)
Definition: procarray.c:4377
void set_ps_display(const char *activity)
Definition: ps_status.c:349
char * psprintf(const char *fmt,...)
Definition: psprintf.c:46
ForkNumber
Definition: relpath.h:41
@ MAIN_FORKNUM
Definition: relpath.h:43
void RmgrStartup(void)
Definition: rmgr.c:49
void RmgrCleanup(void)
Definition: rmgr.c:65
int slock_t
Definition: s_lock.h:975
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition: shmem.c:396
#define SpinLockInit(lock)
Definition: spin.h:60
#define SpinLockRelease(lock)
Definition: spin.h:64
#define SpinLockAcquire(lock)
Definition: spin.h:62
#define ereport_startup_progress(msg,...)
Definition: startup.h:18
void appendStringInfo(StringInfo str, const char *fmt,...)
Definition: stringinfo.c:91
void appendStringInfoString(StringInfo str, const char *s)
Definition: stringinfo.c:176
void appendStringInfoChar(StringInfo str, char ch)
Definition: stringinfo.c:188
void initStringInfo(StringInfo str)
Definition: stringinfo.c:59
Oid oldestMultiDB
Definition: pg_control.h:50
MultiXactId oldestMulti
Definition: pg_control.h:49
MultiXactOffset nextMultiOffset
Definition: pg_control.h:46
TransactionId newestCommitTsXid
Definition: pg_control.h:54
TransactionId oldestXid
Definition: pg_control.h:47
TimeLineID PrevTimeLineID
Definition: pg_control.h:40
TimeLineID ThisTimeLineID
Definition: pg_control.h:39
Oid nextOid
Definition: pg_control.h:44
MultiXactId nextMulti
Definition: pg_control.h:45
FullTransactionId nextXid
Definition: pg_control.h:43
TransactionId oldestCommitTsXid
Definition: pg_control.h:52
XLogRecPtr redo
Definition: pg_control.h:37
Oid oldestXidDB
Definition: pg_control.h:48
XLogRecPtr backupStartPoint
Definition: pg_control.h:168
bool backupEndRequired
Definition: pg_control.h:170
CheckPoint checkPointCopy
Definition: pg_control.h:133
XLogRecPtr backupEndPoint
Definition: pg_control.h:169
XLogRecPtr minRecoveryPoint
Definition: pg_control.h:166
XLogRecPtr checkPoint
Definition: pg_control.h:131
uint64 system_identifier
Definition: pg_control.h:108
TimeLineID minRecoveryPointTLI
Definition: pg_control.h:167
XLogRecPtr lastPageBeginPtr
Definition: xlogrecovery.h:110
XLogRecPtr abortedRecPtr
Definition: xlogrecovery.h:119
XLogRecPtr missingContrecPtr
Definition: xlogrecovery.h:120
TimeLineID endOfLogTLI
Definition: xlogrecovery.h:108
struct ErrorContextCallback * previous
Definition: elog.h:232
void(* callback)(void *arg)
Definition: elog.h:233
Definition: latch.h:111
Definition: pg_list.h:51
void(* rm_mask)(char *pagedata, BlockNumber blkno)
void(* rm_redo)(XLogReaderState *record)
const char *(* rm_identify)(uint8 info)
const char * rm_name
void(* rm_desc)(StringInfo buf, XLogReaderState *record)
XLogRecPtr begin
Definition: timeline.h:28
TimeLineID tli
Definition: timeline.h:27
XLogRecPtr end
Definition: timeline.h:29
TimeLineID ws_tli
Definition: xlogreader.h:49
TimeLineID replayTLI
Definition: xlogrecovery.c:196
XLogRecPtr missingContrecPtr
Definition: xlogreader.h:215
char * errormsg_buf
Definition: xlogreader.h:311
XLogRecPtr EndRecPtr
Definition: xlogreader.h:207
uint64 system_identifier
Definition: xlogreader.h:191
XLogRecPtr ReadRecPtr
Definition: xlogreader.h:206
XLogRecPtr abortedRecPtr
Definition: xlogreader.h:214
TimeLineID latestPageTLI
Definition: xlogreader.h:280
XLogRecPtr overwrittenRecPtr
Definition: xlogreader.h:217
XLogRecPtr latestPagePtr
Definition: xlogreader.h:279
WALOpenSegment seg
Definition: xlogreader.h:272
void * private_data
Definition: xlogreader.h:196
uint8 xl_info
Definition: xlogrecord.h:46
uint32 xl_tot_len
Definition: xlogrecord.h:43
TransactionId xl_xid
Definition: xlogrecord.h:44
RmgrId xl_rmid
Definition: xlogrecord.h:47
ConditionVariable recoveryNotPausedCV
Definition: xlogrecovery.c:356
XLogRecPtr lastReplayedEndRecPtr
Definition: xlogrecovery.c:336
TimeLineID replayEndTLI
Definition: xlogrecovery.c:345
TimeLineID lastReplayedTLI
Definition: xlogrecovery.c:337
TimestampTz currentChunkStartTime
Definition: xlogrecovery.c:353
XLogRecPtr replayEndRecPtr
Definition: xlogrecovery.c:344
TimestampTz recoveryLastXTime
Definition: xlogrecovery.c:347
RecoveryPauseState recoveryPauseState
Definition: xlogrecovery.c:355
XLogRecPtr lastReplayedReadRecPtr
Definition: xlogrecovery.c:335
Definition: guc.h:169
Definition: regguts.h:318
TimeLineID PrevTimeLineID
TimeLineID ThisTimeLineID
char rp_name[MAXFNAMELEN]
TransactionId twophase_xid
Definition: xact.h:408
TransactionId twophase_xid
Definition: xact.h:378
#define InvalidTransactionId
Definition: transam.h:31
#define U64FromFullTransactionId(x)
Definition: transam.h:49
#define XidFromFullTransactionId(x)
Definition: transam.h:48
#define TransactionIdIsValid(xid)
Definition: transam.h:41
#define TransactionIdIsNormal(xid)
Definition: transam.h:42
#define TimestampTzPlusMilliseconds(tz, ms)
Definition: timestamp.h:56
#define DatumGetTimestampTz(X)
Definition: timestamp.h:28
void AdvanceNextFullTransactionIdPastXid(TransactionId xid)
Definition: varsup.c:277
@ WAIT_EVENT_RECOVERY_WAL_STREAM
Definition: wait_event.h:45
@ WAIT_EVENT_WAL_READ
Definition: wait_event.h:229
@ WAIT_EVENT_RECOVERY_PAUSE
Definition: wait_event.h:123
@ WAIT_EVENT_RECOVERY_APPLY_DELAY
Definition: wait_event.h:145
@ WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL
Definition: wait_event.h:146
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:266
static void pgstat_report_wait_end(void)
Definition: wait_event.h:282
void WalRcvForceReply(void)
Definition: walreceiver.c:1297
#define AllowCascadeReplication()
Definition: walreceiver.h:40
XLogRecPtr GetWalRcvFlushRecPtr(XLogRecPtr *latestChunkStart, TimeLineID *receiveTLI)
bool WalRcvStreaming(void)
void RequestXLogStreaming(TimeLineID tli, XLogRecPtr recptr, const char *conninfo, const char *slotname, bool create_temp_slot)
bool WalRcvRunning(void)
void WalSndWakeup(void)
Definition: walsender.c:3309
#define stat
Definition: win32_port.h:283
#define S_IRUSR
Definition: win32_port.h:288
#define symlink(oldpath, newpath)
Definition: win32_port.h:235
#define S_IWUSR
Definition: win32_port.h:291
#define XLOG_XACT_COMMIT_PREPARED
Definition: xact.h:159
#define XLOG_XACT_COMMIT
Definition: xact.h:156
#define XLOG_XACT_OPMASK
Definition: xact.h:166
#define XLOG_XACT_ABORT
Definition: xact.h:158
#define XLOG_XACT_ABORT_PREPARED
Definition: xact.h:160
void ParseCommitRecord(uint8 info, xl_xact_commit *xlrec, xl_xact_parsed_commit *parsed)
Definition: xactdesc.c:35
void ParseAbortRecord(uint8 info, xl_xact_abort *xlrec, xl_xact_parsed_abort *parsed)
Definition: xactdesc.c:141
int wal_decode_buffer_size
Definition: xlog.c:137
bool EnableHotStandby
Definition: xlog.c:122
XLogRecPtr GetRedoRecPtr(void)
Definition: xlog.c:5865
void SetInstallXLogFileSegmentActive(void)
Definition: xlog.c:8863
bool IsInstallXLogFileSegmentActive(void)
Definition: xlog.c:8871
int wal_segment_size
Definition: xlog.c:144
void SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI)
Definition: xlog.c:5637
void RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
Definition: xlog.c:3630
void ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
Definition: xlog.c:5675
int wal_retrieve_retry_interval
Definition: xlog.c:135
static ControlFileData * ControlFile
Definition: xlog.c:566
void XLogShutdownWalRcv(void)
Definition: xlog.c:8852
bool XLogCheckpointNeeded(XLogSegNo new_segno)
Definition: xlog.c:2039
#define TABLESPACE_MAP_OLD
Definition: xlog.h:296
#define TABLESPACE_MAP
Definition: xlog.h:295
#define STANDBY_SIGNAL_FILE
Definition: xlog.h:291
#define CHECKPOINT_CAUSE_XLOG
Definition: xlog.h:143
#define PROMOTE_SIGNAL_FILE
Definition: xlog.h:299
#define BACKUP_LABEL_FILE
Definition: xlog.h:292
#define RECOVERY_SIGNAL_FILE
Definition: xlog.h:290
static RmgrData GetRmgr(RmgrId rmid)
@ RECOVERY_TARGET_ACTION_PAUSE
@ RECOVERY_TARGET_ACTION_PROMOTE
@ RECOVERY_TARGET_ACTION_SHUTDOWN
#define XLogSegmentOffset(xlogptr, wal_segsz_bytes)
#define XLogFileName(fname, tli, logSegNo, wal_segsz_bytes)
#define MAXFNAMELEN
#define XLogFilePath(path, tli, logSegNo, wal_segsz_bytes)
#define XLOGDIR
#define XLByteToSeg(xlrp, logSegNo, wal_segsz_bytes)
#define XRecOffIsValid(xlrp)
#define XLByteInSeg(xlrp, logSegNo, wal_segsz_bytes)
bool RestoreArchivedFile(char *path, const char *xlogfname, const char *recovername, off_t expectedSize, bool cleanupEnabled)
Definition: xlogarchive.c:54
void KeepFileRestoredFromArchive(const char *path, const char *xlogfname)
Definition: xlogarchive.c:388
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:43
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
uint32 TimeLineID
Definition: xlogdefs.h:59
uint64 XLogSegNo
Definition: xlogdefs.h:48
void XLogPrefetcherComputeStats(XLogPrefetcher *prefetcher)
XLogRecord * XLogPrefetcherReadRecord(XLogPrefetcher *prefetcher, char **errmsg)
void XLogPrefetchReconfigure(void)
XLogPrefetcher * XLogPrefetcherAllocate(XLogReaderState *reader)
void XLogPrefetcherBeginRead(XLogPrefetcher *prefetcher, XLogRecPtr recPtr)
void XLogPrefetcherFree(XLogPrefetcher *prefetcher)
XLogReaderState * XLogPrefetcherGetReader(XLogPrefetcher *prefetcher)
void XLogReaderSetDecodeBuffer(XLogReaderState *state, void *buffer, size_t size)
Definition: xlogreader.c:92
bool XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, char *phdr)
Definition: xlogreader.c:1190
void XLogReaderFree(XLogReaderState *state)
Definition: xlogreader.c:170
XLogReaderState * XLogReaderAllocate(int wal_segment_size, const char *waldir, XLogReaderRoutine *routine, void *private_data)
Definition: xlogreader.c:108
bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
Definition: xlogreader.c:2011
bool XLogRecGetBlockTagExtended(XLogReaderState *record, uint8 block_id, RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blknum, Buffer *prefetch_buffer)
Definition: xlogreader.c:1953
#define XLogRecGetDataLen(decoder)
Definition: xlogreader.h:414
#define XLogRecGetInfo(decoder)
Definition: xlogreader.h:408
#define XLogRecBlockImageApply(decoder, block_id)
Definition: xlogreader.h:423
#define XLogRecGetRmid(decoder)
Definition: xlogreader.h:409
#define XLogRecGetData(decoder)
Definition: xlogreader.h:413
#define XLogRecGetXid(decoder)
Definition: xlogreader.h:410
#define XL_ROUTINE(...)
Definition: xlogreader.h:117
#define XLogRecMaxBlockId(decoder)
Definition: xlogreader.h:416
XLogPageReadResult
Definition: xlogreader.h:351
@ XLREAD_WOULDBLOCK
Definition: xlogreader.h:354
@ XLREAD_SUCCESS
Definition: xlogreader.h:352
@ XLREAD_FAIL
Definition: xlogreader.h:353
#define XLogRecHasBlockImage(decoder, block_id)
Definition: xlogreader.h:421
#define XLogRecGetPrev(decoder)
Definition: xlogreader.h:407
#define XLogRecHasAnyBlockRefs(decoder)
Definition: xlogreader.h:415
#define SizeOfXLogRecordDataHeaderShort
Definition: xlogrecord.h:206
#define XLR_INFO_MASK
Definition: xlogrecord.h:62
#define SizeOfXLogRecord
Definition: xlogrecord.h:55
#define XLR_CHECK_CONSISTENCY
Definition: xlogrecord.h:80
bool reachedConsistency
Definition: xlogrecovery.c:291
static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
static XLogRecPtr recoveryStopLSN
Definition: xlogrecovery.c:378
static bool recoveryStopsBefore(XLogReaderState *record)
static TimestampTz recoveryStopTime
Definition: xlogrecovery.c:377
static bool CheckForStandbyTrigger(void)
int recovery_min_apply_delay
Definition: xlogrecovery.c:89
static bool backupEndRequired
Definition: xlogrecovery.c:280
bool HotStandbyActive(void)
static char * getRecoveryStopReason(void)
void ShutdownWalRecovery(void)
RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal
Definition: xlogrecovery.c:117
int recoveryTargetAction
Definition: xlogrecovery.c:83
static void rm_redo_error_callback(void *arg)
static bool recoveryApplyDelay(XLogReaderState *record)
bool ArchiveRecoveryRequested
Definition: xlogrecovery.c:134
const char * recoveryTargetName
Definition: xlogrecovery.c:87
static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
static XLogRecPtr minRecoveryPoint
Definition: xlogrecovery.c:275
static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *readBuf)
static XLogRecPtr backupEndPoint
Definition: xlogrecovery.c:279
const struct config_enum_entry recovery_target_action_options[]
Definition: xlogrecovery.c:70
static void validateRecoveryParameters(void)
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI, TimeLineID replayTLI)
void StartupRequestWalReceiverRestart(void)
bool InArchiveRecovery
Definition: xlogrecovery.c:135
static bool recoveryStopsAfter(XLogReaderState *record)
void RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
char * PrimarySlotName
Definition: xlogrecovery.c:93
char * PromoteTriggerFile
Definition: xlogrecovery.c:94
static TimeLineID curFileTLI
Definition: xlogrecovery.c:121
static char recoveryStopName[MAXFNAMELEN]
Definition: xlogrecovery.c:379
static void CheckRecoveryConsistency(void)
static bool pendingWalRcvRestart
Definition: xlogrecovery.c:245
void PerformWalRecovery(void)
static XLogSource XLogReceiptSource
Definition: xlogrecovery.c:256
bool CheckPromoteSignal(void)
struct XLogPageReadPrivate XLogPageReadPrivate
static bool recoveryStopAfter
Definition: xlogrecovery.c:380
static const char *const xlogSourceNames[]
Definition: xlogrecovery.c:215
static TimeLineID RedoStartTLI
Definition: xlogrecovery.c:167
char * recoveryRestoreCommand
Definition: xlogrecovery.c:78
static void verifyBackupPageConsistency(XLogReaderState *record)
void SetRecoveryPause(bool recoveryPause)
EndOfWalRecoveryInfo * FinishWalRecovery(void)
static bool lastSourceFailed
Definition: xlogrecovery.c:244
char * archiveCleanupCommand
Definition: xlogrecovery.c:80
XLogRecPtr GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
static TimeLineID receiveTLI
Definition: xlogrecovery.c:260
void WakeupRecovery(void)
void xlog_outdesc(StringInfo buf, XLogReaderState *record)
static bool LocalPromoteIsTriggered
Definition: xlogrecovery.c:179
bool PromoteIsTriggered(void)
TimestampTz GetCurrentChunkReplayStartTime(void)
static void ConfirmRecoveryPaused(void)
static void readRecoverySignalFile(void)
Definition: xlogrecovery.c:966
static XLogRecPtr missingContrecPtr
Definition: xlogrecovery.c:370
static XLogRecoveryCtlData * XLogRecoveryCtl
Definition: xlogrecovery.c:361
static uint32 readOff
Definition: xlogrecovery.c:229
static bool standby_signal_file_found
Definition: xlogrecovery.c:147
char * recovery_target_time_string
Definition: xlogrecovery.c:85
bool StandbyMode
Definition: xlogrecovery.c:144
static int readFile
Definition: xlogrecovery.c:227
static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, bool fetching_ckpt, XLogRecPtr tliRecPtr, TimeLineID replayTLI, XLogRecPtr replayLSN, bool nonblocking)
XLogRecPtr recoveryTargetLSN
Definition: xlogrecovery.c:88
RecoveryTargetType recoveryTarget
Definition: xlogrecovery.c:81
static bool read_tablespace_map(List **tablespaces)
static bool doRequestWalReceiverReply
Definition: xlogrecovery.c:182
static bool read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, bool *backupEndRequired, bool *backupFromStandby)
static XLogSource currentSource
Definition: xlogrecovery.c:243
XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI)
void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
static List * expectedTLEs
Definition: xlogrecovery.c:120
static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source)
static XLogSegNo readSegNo
Definition: xlogrecovery.c:228
static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, XLogSource source, bool notfoundOk)
static XLogRecPtr abortedRecPtr
Definition: xlogrecovery.c:369
static char * primary_image_masked
Definition: xlogrecovery.c:295
static TimeLineID minRecoveryPointTLI
Definition: xlogrecovery.c:276
static XLogRecord * ReadRecord(XLogPrefetcher *xlogprefetcher, int emode, bool fetching_ckpt, TimeLineID replayTLI)
static void SetCurrentChunkStartTime(TimestampTz xtime)
static XLogRecPtr CheckPointLoc
Definition: xlogrecovery.c:164
static bool LocalHotStandbyActive
Definition: xlogrecovery.c:173
struct XLogRecoveryCtlData XLogRecoveryCtlData