PostgreSQL Source Code  git master
pg_rewind.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * pg_rewind.c
4  * Synchronizes a PostgreSQL data directory to a new timeline
5  *
6  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
7  *
8  *-------------------------------------------------------------------------
9  */
10 #include "postgres_fe.h"
11 
12 #include <sys/stat.h>
13 #include <fcntl.h>
14 #include <time.h>
15 #include <unistd.h>
16 
17 #include "access/timeline.h"
18 #include "access/xlog_internal.h"
19 #include "catalog/catversion.h"
20 #include "catalog/pg_control.h"
22 #include "common/file_perm.h"
24 #include "common/string.h"
25 #include "fe_utils/recovery_gen.h"
26 #include "fe_utils/string_utils.h"
27 #include "file_ops.h"
28 #include "filemap.h"
29 #include "getopt_long.h"
30 #include "pg_rewind.h"
31 #include "rewind_source.h"
32 #include "storage/bufpage.h"
33 
/*
 * Forward declarations of the file-local functions, followed by the
 * file-scope state shared by the functions below.
 *
 * NOTE(review): the embedded listing numbers skip 44, 52-54, 57, 73-74 and
 * 81, so some declarations are not visible here.  Code further down refers
 * to ControlFile_target, ControlFile_source, ControlFile_source_after,
 * WalSegSz, targetHistory, targetNentries and source, none of which is
 * declared in the visible lines -- presumably they were on the missing
 * lines; confirm against a complete copy of the file.
 */
34 static void usage(const char *progname);
35 
36 static void perform_rewind(filemap_t *filemap, rewind_source *source,
37  XLogRecPtr chkptrec,
38  TimeLineID chkpttli,
39  XLogRecPtr chkptredo);
40 
41 static void createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli,
42  XLogRecPtr checkpointloc);
43 
/*
 * NOTE(review): the next line is only the tail of a prototype; its first
 * half (listing line 44) is not visible.  Judging by the call sites it
 * looks like the prototype of digestControlFile() -- confirm.
 */
45  const char *content, size_t size);
46 static void getRestoreCommand(const char *argv0);
47 static void sanityChecks(void);
48 static void findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex);
49 static void ensureCleanShutdown(const char *argv0);
50 static void disconnect_atexit(void);
51 
55 
/* Program name as returned by get_progname(); set at the start of main(). */
56 const char *progname;
58 
59 /* Configuration options */
/* Each pointer stays NULL until the matching command-line option is seen. */
60 char *datadir_target = NULL;
61 char *datadir_source = NULL;
62 char *connstr_source = NULL;
63 char *restore_command = NULL;
64 char *config_file = NULL;
65 
/* Boolean switches; do_sync is the only one that defaults to true. */
66 static bool debug = false;
67 bool showprogress = false;
68 bool dry_run = false;
69 bool do_sync = true;
70 bool restore_wal = false;
71 
72 /* Target history */
75 
76 /* Progress counters */
/* Read by progress_report(); initialised in main() when showprogress is set. */
77 uint64 fetch_size;
78 uint64 fetch_done;
79 
/* libpq connection; closed by main() or by disconnect_atexit(). */
80 static PGconn *conn;
82 
83 static void
84 usage(const char *progname)
85 {
86  printf(_("%s resynchronizes a PostgreSQL cluster with another copy of the cluster.\n\n"), progname);
87  printf(_("Usage:\n %s [OPTION]...\n\n"), progname);
88  printf(_("Options:\n"));
89  printf(_(" -c, --restore-target-wal use restore_command in target configuration to\n"
90  " retrieve WAL files from archives\n"));
91  printf(_(" -D, --target-pgdata=DIRECTORY existing data directory to modify\n"));
92  printf(_(" --source-pgdata=DIRECTORY source data directory to synchronize with\n"));
93  printf(_(" --source-server=CONNSTR source server to synchronize with\n"));
94  printf(_(" -n, --dry-run stop before modifying anything\n"));
95  printf(_(" -N, --no-sync do not wait for changes to be written\n"
96  " safely to disk\n"));
97  printf(_(" -P, --progress write progress messages\n"));
98  printf(_(" -R, --write-recovery-conf write configuration for replication\n"
99  " (requires --source-server)\n"));
100  printf(_(" --config-file=FILENAME use specified main server configuration\n"
101  " file when running target cluster\n"));
102  printf(_(" --debug write a lot of debug messages\n"));
103  printf(_(" --no-ensure-shutdown do not automatically fix unclean shutdown\n"));
104  printf(_(" -V, --version output version information, then exit\n"));
105  printf(_(" -?, --help show this help, then exit\n"));
106  printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
107  printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL);
108 }
109 
110 
/*
 * Program entry point.
 *
 * Parses and validates the command-line options, reads the control files of
 * the target and the source, runs sanityChecks(), decides whether a rewind
 * is needed and, if so, builds the file map and calls perform_rewind().
 * Returns 0 on success; option errors end the program with exit(1).
 *
 * NOTE(review): the embedded listing numbers skip in many places inside
 * this function (190, 194, 198, 202, 210, 272, 275, 277, 291, 299, 302,
 * 319-320, 341-342, 368, 370, 403-404, 421, 425, 477-478, 481), so several
 * statements are not visible.  The notes below mark the spots where the
 * visible code does not make sense on its own.
 */
111 int
112 main(int argc, char **argv)
113 {
114  static struct option long_options[] = {
115  {"help", no_argument, NULL, '?'},
116  {"target-pgdata", required_argument, NULL, 'D'},
117  {"write-recovery-conf", no_argument, NULL, 'R'},
118  {"source-pgdata", required_argument, NULL, 1},
119  {"source-server", required_argument, NULL, 2},
120  {"no-ensure-shutdown", no_argument, NULL, 4},
121  {"config-file", required_argument, NULL, 5},
122  {"version", no_argument, NULL, 'V'},
123  {"restore-target-wal", no_argument, NULL, 'c'},
124  {"dry-run", no_argument, NULL, 'n'},
125  {"no-sync", no_argument, NULL, 'N'},
126  {"progress", no_argument, NULL, 'P'},
127  {"debug", no_argument, NULL, 3},
128  {NULL, 0, NULL, 0}
129  };
130  int option_index;
131  int c;
132  XLogRecPtr divergerec;
133  int lastcommontliIndex;
134  XLogRecPtr chkptrec;
135  TimeLineID chkpttli;
136  XLogRecPtr chkptredo;
137  XLogRecPtr target_wal_endrec;
138  size_t size;
139  char *buffer;
140  bool no_ensure_shutdown = false;
141  bool rewind_needed;
142  bool writerecoveryconf = false;
143  filemap_t *filemap;
144 
145  pg_logging_init(argv[0]);
146  set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_rewind"));
147  progname = get_progname(argv[0]);
148 
149  /* Process command-line arguments */
/* --help and --version are only recognised as the very first argument. */
150  if (argc > 1)
151  {
152  if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
153  {
154  usage(progname);
155  exit(0);
156  }
157  if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
158  {
159  puts("pg_rewind (PostgreSQL) " PG_VERSION);
160  exit(0);
161  }
162  }
163 
164  while ((c = getopt_long(argc, argv, "cD:nNPR", long_options, &option_index)) != -1)
165  {
166  switch (c)
167  {
168  case 'c':
169  restore_wal = true;
170  break;
171 
172  case 'P':
173  showprogress = true;
174  break;
175 
176  case 'n':
177  dry_run = true;
178  break;
179 
180  case 'N':
181  do_sync = false;
182  break;
183 
184  case 'R':
185  writerecoveryconf = true;
186  break;
187 
188  case 3:
189  debug = true;
191  break;
192 
/*
 * NOTE(review): the four cases 'D', 1, 2 and 5 take an argument but no
 * statement storing it is visible, so datadir_target, datadir_source,
 * connstr_source and config_file would stay NULL as shown.  One listing
 * line is missing in each case (194, 198, 202, 210).
 */
193  case 'D': /* -D or --target-pgdata */
195  break;
196 
197  case 1: /* --source-pgdata */
199  break;
200 
201  case 2: /* --source-server */
203  break;
204 
205  case 4:
206  no_ensure_shutdown = true;
207  break;
208 
209  case 5:
211  break;
212 
213  default:
214  /* getopt_long already emitted a complaint */
215  pg_log_error_hint("Try \"%s --help\" for more information.", progname);
216  exit(1);
217  }
218  }
219 
/* Exactly one of the two source kinds must be given. */
220  if (datadir_source == NULL && connstr_source == NULL)
221  {
222  pg_log_error("no source specified (--source-pgdata or --source-server)");
223  pg_log_error_hint("Try \"%s --help\" for more information.", progname);
224  exit(1);
225  }
226 
227  if (datadir_source != NULL && connstr_source != NULL)
228  {
229  pg_log_error("only one of --source-pgdata or --source-server can be specified");
230  pg_log_error_hint("Try \"%s --help\" for more information.", progname);
231  exit(1);
232  }
233 
234  if (datadir_target == NULL)
235  {
236  pg_log_error("no target data directory specified (--target-pgdata)");
237  pg_log_error_hint("Try \"%s --help\" for more information.", progname);
238  exit(1);
239  }
240 
241  if (writerecoveryconf && connstr_source == NULL)
242  {
243  pg_log_error("no source server information (--source-server) specified for --write-recovery-conf");
244  pg_log_error_hint("Try \"%s --help\" for more information.", progname);
245  exit(1);
246  }
247 
/* Any leftover non-option argument is an error. */
248  if (optind < argc)
249  {
250  pg_log_error("too many command-line arguments (first is \"%s\")",
251  argv[optind]);
252  pg_log_error_hint("Try \"%s --help\" for more information.", progname);
253  exit(1);
254  }
255 
256  /*
257  * Don't allow pg_rewind to be run as root, to avoid overwriting the
258  * ownership of files in the data directory. We need only check for root
259  * -- any other user won't have sufficient permissions to modify files in
260  * the data directory.
261  */
262 #ifndef WIN32
263  if (geteuid() == 0)
264  {
265  pg_log_error("cannot be executed by \"root\"");
266  pg_log_error_hint("You must run %s as the PostgreSQL superuser.",
267  progname);
268  exit(1);
269  }
270 #endif
271 
273 
/*
 * NOTE(review): the pg_fatal() call below is incomplete as shown -- the
 * condition that guards it (listing line 275) and its last argument (277)
 * are not visible.
 */
274  /* Set mask based on PGDATA permissions */
276  pg_fatal("could not read permissions of directory \"%s\": %m",
278 
279  umask(pg_mode_mask);
280 
281  getRestoreCommand(argv[0]);
282 
/* Make sure a still-open connection is closed on every exit path. */
283  atexit(disconnect_atexit);
284 
285  /*
286  * Ok, we have all the options and we're ready to start. First, connect to
287  * remote server.
288  */
/*
 * NOTE(review): no assignment to conn or to source is visible in either
 * branch, and the else has no body (listing lines 291, 299 and 302 are
 * missing).
 */
289  if (connstr_source)
290  {
292 
293  if (PQstatus(conn) == CONNECTION_BAD)
294  pg_fatal("%s", PQerrorMessage(conn));
295 
296  if (showprogress)
297  pg_log_info("connected to server");
298 
300  }
301  else
303 
304  /*
305  * Check the status of the target instance.
306  *
307  * If the target instance was not cleanly shut down, start and stop the
308  * target cluster once in single-user mode to enforce recovery to finish,
309  * ensuring that the cluster can be used by pg_rewind. Note that if
310  * no_ensure_shutdown is specified, pg_rewind ignores this step, and users
311  * need to make sure by themselves that the target cluster is in a clean
312  * state.
313  */
314  buffer = slurpFile(datadir_target, "global/pg_control", &size);
315  digestControlFile(&ControlFile_target, buffer, size);
316  pg_free(buffer);
317 
/* NOTE(review): the rest of this condition (listing lines 319-320) is missing. */
318  if (!no_ensure_shutdown &&
321  {
322  ensureCleanShutdown(argv[0]);
323 
/* Re-read the target control file after ensureCleanShutdown() has run. */
324  buffer = slurpFile(datadir_target, "global/pg_control", &size);
325  digestControlFile(&ControlFile_target, buffer, size);
326  pg_free(buffer);
327  }
328 
329  buffer = source->fetch_file(source, "global/pg_control", &size);
330  digestControlFile(&ControlFile_source, buffer, size);
331  pg_free(buffer);
332 
333  sanityChecks();
334 
335  /*
336  * Find the common ancestor timeline between the clusters.
337  *
338  * If both clusters are already on the same timeline, there's nothing to
339  * do.
340  */
/* NOTE(review): the "if" that opens this block (listing lines 341-342) is missing. */
343  {
344  pg_log_info("source and target cluster are on the same timeline");
345  rewind_needed = false;
346  target_wal_endrec = 0;
347  }
348  else
349  {
350  XLogRecPtr chkptendrec;
351 
352  findCommonAncestorTimeline(&divergerec, &lastcommontliIndex);
353  pg_log_info("servers diverged at WAL location %X/%X on timeline %u",
354  LSN_FORMAT_ARGS(divergerec),
355  targetHistory[lastcommontliIndex].tli);
356 
357  /*
358  * Determine the end-of-WAL on the target.
359  *
360  * The WAL ends at the last shutdown checkpoint, or at
361  * minRecoveryPoint if it was a standby. (If we supported rewinding a
362  * server that was not shut down cleanly, we would need to replay
363  * until we reach the first invalid record, like crash recovery does.)
364  */
365 
366  /* read the checkpoint record on the target to see where it ends. */
/* NOTE(review): two arguments of this call (listing lines 368 and 370) are missing. */
367  chkptendrec = readOneRecord(datadir_target,
369  targetNentries - 1,
371 
/* The later of minRecoveryPoint and the checkpoint's end is the end of WAL. */
372  if (ControlFile_target.minRecoveryPoint > chkptendrec)
373  {
374  target_wal_endrec = ControlFile_target.minRecoveryPoint;
375  }
376  else
377  {
378  target_wal_endrec = chkptendrec;
379  }
380 
381  /*
382  * Check for the possibility that the target is in fact a direct
383  * ancestor of the source. In that case, there is no divergent history
384  * in the target that needs rewinding.
385  */
386  if (target_wal_endrec > divergerec)
387  {
388  rewind_needed = true;
389  }
390  else
391  {
392  /* the last common checkpoint record must be part of target WAL */
393  Assert(target_wal_endrec == divergerec);
394 
395  rewind_needed = false;
396  }
397  }
398 
399  if (!rewind_needed)
400  {
401  pg_log_info("no rewind required");
/*
 * NOTE(review): the statement controlled by this "if" (listing lines
 * 403-404) is missing; as shown, the "if" would govern exit(0), so the
 * program would continue when no rewind is needed unless -R is given.
 */
402  if (writerecoveryconf && !dry_run)
405  exit(0);
406  }
407 
408  findLastCheckpoint(datadir_target, divergerec, lastcommontliIndex,
409  &chkptrec, &chkpttli, &chkptredo, restore_command);
410  pg_log_info("rewinding from last common checkpoint at %X/%X on timeline %u",
411  LSN_FORMAT_ARGS(chkptrec), chkpttli);
412 
413  /* Initialize the hash table to track the status of each file */
414  filehash_init();
415 
416  /*
417  * Collect information about all files in both data directories.
418  */
/*
 * NOTE(review): the calls that actually read the two file lists (listing
 * lines 421 and 425) are missing; as shown, each "if (showprogress)" is
 * followed directly by the next one.
 */
419  if (showprogress)
420  pg_log_info("reading source file list");
422 
423  if (showprogress)
424  pg_log_info("reading target file list");
426 
427  /*
428  * Read the target WAL from last checkpoint before the point of fork, to
429  * extract all the pages that were modified on the target cluster after
430  * the fork.
431  */
432  if (showprogress)
433  pg_log_info("reading WAL in target");
434  extractPageMap(datadir_target, chkptrec, lastcommontliIndex,
435  target_wal_endrec, restore_command);
436 
437  /*
438  * We have collected all information we need from both systems. Decide
439  * what to do with each file.
440  */
441  filemap = decide_file_actions();
442  if (showprogress)
443  calculate_totals(filemap);
444 
445  /* this is too verbose even for verbose mode */
446  if (debug)
447  print_filemap(filemap);
448 
449  /*
450  * Ok, we're ready to start copying things over.
451  */
452  if (showprogress)
453  {
454  pg_log_info("need to copy %lu MB (total source directory size is %lu MB)",
455  (unsigned long) (filemap->fetch_size / (1024 * 1024)),
456  (unsigned long) (filemap->total_size / (1024 * 1024)));
457 
/* Seed the counters that progress_report() reads. */
458  fetch_size = filemap->fetch_size;
459  fetch_done = 0;
460  }
461 
462  /*
463  * We have now collected all the information we need from both systems,
464  * and we are ready to start modifying the target directory.
465  *
466  * This is the point of no return. Once we start copying things, there is
467  * no turning back!
468  */
469  perform_rewind(filemap, source, chkptrec, chkpttli, chkptredo);
470 
471  if (showprogress)
472  pg_log_info("syncing target data directory");
473  sync_target_dir();
474 
475  /* Also update the standby configuration, if requested. */
/*
 * NOTE(review): the statement controlled by this "if" (listing lines
 * 477-478) is missing; as shown, it would govern the "if (conn)" below.
 */
476  if (writerecoveryconf && !dry_run)
479 
480  /* don't need the source connection anymore */
/* conn is reset to NULL so that disconnect_atexit() does not close it again. */
482  if (conn)
483  {
484  PQfinish(conn);
485  conn = NULL;
486  }
487 
488  pg_log_info("Done!");
489 
490  return 0;
491 }
492 
493 /*
494  * Perform the rewind.
495  *
496  * We have already collected all the information we need from the
497  * target and the source.
498  */
/*
 * Applies the actions recorded in the file map to the target, re-reads the
 * source control file, writes a backup label via createBackupLabel() and
 * finally updates the target control file (skipped when dry_run is set).
 *
 * NOTE(review): the line carrying the function name and its first
 * parameters (listing line 500) is missing; per the prototype near the top
 * of the file it is perform_rewind(filemap_t *filemap, rewind_source
 * *source, ...).  Further lines are missing inside the body (526, 530, 549,
 * 553-554, 567, 574, 576, 585, 597, 621-622, 638, 644-645, 653, 656-657,
 * 666-667); the notes below mark the affected spots.
 */
499 static void
501  XLogRecPtr chkptrec,
502  TimeLineID chkpttli,
503  XLogRecPtr chkptredo)
504 {
505  XLogRecPtr endrec;
506  TimeLineID endtli;
507  ControlFileData ControlFile_new;
508  size_t size;
509  char *buffer;
510 
511  /*
512  * Execute the actions in the file map, fetching data from the source
513  * system as needed.
514  */
515  for (int i = 0; i < filemap->nentries; i++)
516  {
517  file_entry_t *entry = filemap->entries[i];
518 
519  /*
520  * If this is a relation file, copy the modified blocks.
521  *
522  * This is in addition to any other changes.
523  */
524  if (entry->target_pages_to_overwrite.bitmapsize > 0)
525  {
/* NOTE(review): the declaration and initialisation of iter (526, 530) are missing. */
527  BlockNumber blkno;
528  off_t offset;
529 
/* Queue one BLCKSZ-sized fetch per block number yielded by the iterator. */
531  while (datapagemap_next(iter, &blkno))
532  {
533  offset = blkno * BLCKSZ;
534  source->queue_fetch_range(source, entry->path, offset, BLCKSZ);
535  }
536  pg_free(iter);
537  }
538 
539  switch (entry->action)
540  {
541  case FILE_ACTION_NONE:
542  /* nothing else to do */
543  break;
544 
545  case FILE_ACTION_COPY:
546  source->queue_fetch_file(source, entry->path, entry->source_size);
547  break;
548 
/*
 * NOTE(review): three case labels and the start of one call are missing
 * (listing lines 549, 553-554, 567), so the statements after the next three
 * "break;" lines look unreachable as shown.
 */
550  truncate_target_file(entry->path, entry->source_size);
551  break;
552 
555  entry->target_size,
556  entry->source_size - entry->target_size);
557  break;
558 
559  case FILE_ACTION_REMOVE:
560  remove_target(entry);
561  break;
562 
563  case FILE_ACTION_CREATE:
564  create_target(entry);
565  break;
566 
568  pg_fatal("no action decided for file \"%s\"", entry->path);
569  break;
570  }
571  }
572 
573  /* Complete any remaining range-fetches that we queued up above. */
/* NOTE(review): the statements announced by this comment (574, 576) are missing. */
575 
577 
578  progress_report(true);
579 
580  /*
581  * Fetch the control file from the source last. This ensures that the
582  * minRecoveryPoint is up-to-date.
583  */
584  buffer = source->fetch_file(source, "global/pg_control", &size);
/*
 * NOTE(review): the fetched buffer is freed without being used as shown;
 * the statement that consumes it (listing line 585) is missing.  It
 * presumably fills ControlFile_source_after, which is read below -- confirm.
 */
586  pg_free(buffer);
587 
588  /*
589  * Sanity check: If the source is a local system, the control file should
590  * not have changed since we started.
591  *
592  * XXX: We assume it hasn't been modified, but actually, what could go
593  * wrong? The logic handles a libpq source that's modified concurrently,
594  * why not a local datadir?
595  */
/* NOTE(review): the start of the comparison (listing line 597) is missing. */
596  if (datadir_source &&
598  sizeof(ControlFileData)) != 0)
599  {
600  pg_fatal("source system was modified while pg_rewind was running");
601  }
602 
603  if (showprogress)
604  pg_log_info("creating backup label and updating control file");
605 
606  /*
607  * Create a backup label file, to tell the target where to begin the WAL
608  * replay. Normally, from the last common checkpoint between the source
609  * and the target. But if the source is a standby server, it's possible
610  * that the last common checkpoint is *after* the standby's restartpoint.
611  * That implies that the source server has applied the checkpoint record,
612  * but hasn't performed a corresponding restartpoint yet. Make sure we
613  * start at the restartpoint's redo point in that case.
614  *
615  * Use the old version of the source's control file for this. The server
616  * might have finished the restartpoint after we started copying files,
617  * but we must begin from the redo point at the time that started copying.
618  */
619  if (ControlFile_source.checkPointCopy.redo < chkptredo)
620  {
/* NOTE(review): only one of the assignments in this branch is visible (621-622 missing). */
623  chkptrec = ControlFile_source.checkPoint;
624  }
625  createBackupLabel(chkptredo, chkpttli, chkptrec);
626 
627  /*
628  * Update control file of target, to tell the target how far it must
629  * replay the WAL (minRecoveryPoint).
630  */
/*
 * NOTE(review): none of the assignments to endrec and endtli is visible in
 * the three branches below, and two conditions are missing (listing lines
 * 638, 644-645, 653, 656-657, 666-667); as shown, both variables would be
 * used uninitialised further down.
 */
631  if (connstr_source)
632  {
633  /*
634  * The source is a live server. Like in an online backup, it's
635  * important that we recover all the WAL that was generated while we
636  * were copying files.
637  */
639  {
640  /*
641  * Source is a standby server. We must replay to its
642  * minRecoveryPoint.
643  */
646  }
647  else
648  {
649  /*
650  * Source is a production, non-standby, server. We must replay to
651  * the last WAL insert location.
652  */
654  pg_fatal("source system was in unexpected state at end of rewind");
655 
658  }
659  }
660  else
661  {
662  /*
663  * Source is a local data directory. It should've shut down cleanly,
664  * and we must replay to the latest shutdown checkpoint.
665  */
668  }
669 
/*
 * The new target control file is a copy of the source's latest control
 * file with the recovery end point and state overridden.
 */
670  memcpy(&ControlFile_new, &ControlFile_source_after, sizeof(ControlFileData));
671  ControlFile_new.minRecoveryPoint = endrec;
672  ControlFile_new.minRecoveryPointTLI = endtli;
673  ControlFile_new.state = DB_IN_ARCHIVE_RECOVERY;
674  if (!dry_run)
675  update_controlfile(datadir_target, &ControlFile_new, do_sync);
676 }
677 
/*
 * Compatibility checks on the two control files; each failed check ends the
 * program through pg_fatal().
 *
 * NOTE(review): the line with the function name (listing line 679) is
 * missing; per the prototype and the call in main() this is
 * sanityChecks(void).  Every "if" condition is missing or truncated as well
 * (684, 688-691, 700-701, 712-713, 722-723), so as shown the pg_fatal()
 * calls look unconditional.  What each check tests can only be inferred
 * from the comments and the error messages.
 */
678 static void
680 {
681  /* TODO Check that there's no backup_label in either cluster */
682 
683  /* Check system_identifier match */
685  pg_fatal("source and target clusters are from different systems");
686 
687  /* check version */
692  {
693  pg_fatal("clusters are not compatible with this version of pg_rewind");
694  }
695 
696  /*
697  * The target cluster needs to use checksums or hint bit WAL-logging, to
698  * prevent data corruption that could occur because of hint bits.
699  */
702  {
703  pg_fatal("target server needs to use either data checksums or \"wal_log_hints = on\"");
704  }
705 
706  /*
707  * Target cluster better not be running. This doesn't guard against
708  * someone starting the cluster concurrently. Also, this is probably more
709  * strict than necessary; it's OK if the target node was not shut down
710  * cleanly, as long as it isn't running at the moment.
711  */
714  pg_fatal("target server must be shut down cleanly");
715 
716  /*
717  * When the source is a data directory, also require that the source
718  * server is shut down. There isn't any very strong reason for this
719  * limitation, but better safe than sorry.
720  */
/* Only applies when the source is a local data directory. */
721  if (datadir_source &&
724  pg_fatal("source data directory must be shut down cleanly");
725 }
726 
727 /*
728  * Print a progress report based on the fetch_size and fetch_done variables.
729  *
730  * Progress report is written at maximum once per second, except that the
731  * last progress report is always printed.
732  *
733  * If finished is set to true, this is the last progress report. The cursor
734  * is moved to the next line.
735  */
736 void
737 progress_report(bool finished)
738 {
739  static pg_time_t last_progress_report = 0;
740  int percent;
741  char fetch_done_str[32];
742  char fetch_size_str[32];
743  pg_time_t now;
744 
745  if (!showprogress)
746  return;
747 
748  now = time(NULL);
749  if (now == last_progress_report && !finished)
750  return; /* Max once per second */
751 
753  percent = fetch_size ? (int) ((fetch_done) * 100 / fetch_size) : 0;
754 
755  /*
756  * Avoid overflowing past 100% or the full size. This may make the total
757  * size number change as we approach the end of the backup (the estimate
758  * will always be wrong if WAL is included), but that's better than having
759  * the done column be bigger than the total.
760  */
761  if (percent > 100)
762  percent = 100;
763  if (fetch_done > fetch_size)
765 
766  snprintf(fetch_done_str, sizeof(fetch_done_str), UINT64_FORMAT,
767  fetch_done / 1024);
768  snprintf(fetch_size_str, sizeof(fetch_size_str), UINT64_FORMAT,
769  fetch_size / 1024);
770 
771  fprintf(stderr, _("%*s/%s kB (%d%%) copied"),
772  (int) strlen(fetch_size_str), fetch_done_str, fetch_size_str,
773  percent);
774 
775  /*
776  * Stay on the same line if reporting to a terminal and we're not done
777  * yet.
778  */
779  fputc((!finished && isatty(fileno(stderr))) ? '\r' : '\n', stderr);
780 }
781 
782 /*
783  * Find minimum from two WAL locations assuming InvalidXLogRecPtr means
784  * infinity as src/include/access/timeline.h states. This routine should
785  * be used only when comparing WAL locations related to history files.
786  */
787 static XLogRecPtr
789 {
790  if (XLogRecPtrIsInvalid(a))
791  return b;
792  else if (XLogRecPtrIsInvalid(b))
793  return a;
794  else
795  return Min(a, b);
796 }
797 
798 /*
799  * Retrieve timeline history for given control file which should behold
800  * either source or target.
801  */
802 static TimeLineHistoryEntry *
803 getTimelineHistory(ControlFileData *controlFile, int *nentries)
804 {
805  TimeLineHistoryEntry *history;
806  TimeLineID tli;
807 
808  tli = controlFile->checkPointCopy.ThisTimeLineID;
809 
810  /*
811  * Timeline 1 does not have a history file, so there is no need to check
812  * and fake an entry with infinite start and end positions.
813  */
814  if (tli == 1)
815  {
816  history = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry));
817  history->tli = tli;
818  history->begin = history->end = InvalidXLogRecPtr;
819  *nentries = 1;
820  }
821  else
822  {
823  char path[MAXPGPATH];
824  char *histfile;
825 
826  TLHistoryFilePath(path, tli);
827 
828  /* Get history file from appropriate source */
829  if (controlFile == &ControlFile_source)
830  histfile = source->fetch_file(source, path, NULL);
831  else if (controlFile == &ControlFile_target)
832  histfile = slurpFile(datadir_target, path, NULL);
833  else
834  pg_fatal("invalid control file");
835 
836  history = rewind_parseTimeLineHistory(histfile, tli, nentries);
837  pg_free(histfile);
838  }
839 
840  if (debug)
841  {
842  int i;
843 
844  if (controlFile == &ControlFile_source)
845  pg_log_debug("Source timeline history:");
846  else if (controlFile == &ControlFile_target)
847  pg_log_debug("Target timeline history:");
848  else
849  Assert(false);
850 
851  /*
852  * Print the target timeline history.
853  */
854  for (i = 0; i < targetNentries; i++)
855  {
856  TimeLineHistoryEntry *entry;
857 
858  entry = &history[i];
859  pg_log_debug("%u: %X/%X - %X/%X", entry->tli,
860  LSN_FORMAT_ARGS(entry->begin),
861  LSN_FORMAT_ARGS(entry->end));
862  }
863  }
864 
865  return history;
866 }
867 
868 /*
869  * Determine the TLI of the last common timeline in the timeline history of the
870  * two clusters. targetHistory is filled with target timeline history and
871  * targetNentries is number of items in targetHistory. *tliIndex is set to the
872  * index of last common timeline in targetHistory array, and *recptr is set to
873  * the position where the timeline history diverged (ie. the first WAL record
874  * that's not the same in both clusters).
875  *
876  * Control files of both clusters must be read into ControlFile_target/source
877  * before calling this routine.
878  */
879 static void
880 findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex)
881 {
882  TimeLineHistoryEntry *sourceHistory;
883  int sourceNentries;
884  int i,
885  n;
886 
887  /* Retrieve timelines for both source and target */
888  sourceHistory = getTimelineHistory(&ControlFile_source, &sourceNentries);
890 
891  /*
892  * Trace the history forward, until we hit the timeline diverge. It may
893  * still be possible that the source and target nodes used the same
894  * timeline number in their history but with different start position
895  * depending on the history files that each node has fetched in previous
896  * recovery processes. Hence check the start position of the new timeline
897  * as well and move down by one extra timeline entry if they do not match.
898  */
899  n = Min(sourceNentries, targetNentries);
900  for (i = 0; i < n; i++)
901  {
902  if (sourceHistory[i].tli != targetHistory[i].tli ||
903  sourceHistory[i].begin != targetHistory[i].begin)
904  break;
905  }
906 
907  if (i > 0)
908  {
909  i--;
910  *recptr = MinXLogRecPtr(sourceHistory[i].end, targetHistory[i].end);
911  *tliIndex = i;
912 
913  pg_free(sourceHistory);
914  return;
915  }
916  else
917  {
918  pg_fatal("could not find common ancestor of the source and target cluster's timelines");
919  }
920 }
921 
922 
923 /*
924  * Create a backup_label file that forces recovery to begin at the last common
925  * checkpoint.
926  */
927 static void
928 createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli, XLogRecPtr checkpointloc)
929 {
930  XLogSegNo startsegno;
931  time_t stamp_time;
932  char strfbuf[128];
933  char xlogfilename[MAXFNAMELEN];
934  struct tm *tmp;
935  char buf[1000];
936  int len;
937 
938  XLByteToSeg(startpoint, startsegno, WalSegSz);
939  XLogFileName(xlogfilename, starttli, startsegno, WalSegSz);
940 
941  /*
942  * Construct backup label file
943  */
944  stamp_time = time(NULL);
945  tmp = localtime(&stamp_time);
946  strftime(strfbuf, sizeof(strfbuf), "%Y-%m-%d %H:%M:%S %Z", tmp);
947 
948  len = snprintf(buf, sizeof(buf),
949  "START WAL LOCATION: %X/%X (file %s)\n"
950  "CHECKPOINT LOCATION: %X/%X\n"
951  "BACKUP METHOD: pg_rewind\n"
952  "BACKUP FROM: standby\n"
953  "START TIME: %s\n",
954  /* omit LABEL: line */
955  LSN_FORMAT_ARGS(startpoint), xlogfilename,
956  LSN_FORMAT_ARGS(checkpointloc),
957  strfbuf);
958  if (len >= sizeof(buf))
959  pg_fatal("backup label buffer too small"); /* shouldn't happen */
960 
961  /* TODO: move old file out of the way, if any. */
962  open_target_file("backup_label", true); /* BACKUP_LABEL_FILE */
965 }
966 
967 /*
968  * Check CRC of control file
969  */
/*
 * Recomputes the CRC-32C over the control file contents that precede the
 * crc field and ends the program through pg_fatal() if it does not match
 * the stored ControlFile->crc.
 *
 * NOTE(review): the line with the function name and parameter list (listing
 * line 971) is missing.  From the body, the parameter is a pointer named
 * ControlFile whose target has a crc member (presumably ControlFileData *);
 * the function name is not visible anywhere in this listing -- confirm
 * against a complete copy.
 */
970 static void
972 {
973  pg_crc32c crc;
974 
975  /* Calculate CRC */
/* The checksum covers every byte up to, but not including, the crc field. */
976  INIT_CRC32C(crc);
977  COMP_CRC32C(crc, (char *) ControlFile, offsetof(ControlFileData, crc));
978  FIN_CRC32C(crc);
979 
980  /* And simply compare it */
981  if (!EQ_CRC32C(crc, ControlFile->crc))
982  pg_fatal("unexpected control file CRC");
983 }
984 
985 /*
986  * Verify control file contents in the buffer 'content', and copy it to
987  * *ControlFile.
988  */
/*
 * Checks that the buffer holds exactly PG_CONTROL_FILE_SIZE bytes, then
 * copies the leading sizeof(ControlFileData) bytes into *ControlFile.
 *
 * NOTE(review): the first line of the signature (listing line 990) is
 * missing; judging by the callers, which pass (&ControlFile_xxx, buffer,
 * size), this is digestControlFile(ControlFileData *ControlFile,
 * const char *content, size_t size) -- confirm.  Lines 1000, 1002 and 1009
 * are missing too: the assignment to WalSegSz, the condition guarding the
 * pg_fatal() about the segment size, and the "additional checks" announced
 * by the last comment are not visible.
 */
989 static void
991  size_t size)
992 {
993  if (size != PG_CONTROL_FILE_SIZE)
994  pg_fatal("unexpected control file size %d, expected %d",
995  (int) size, PG_CONTROL_FILE_SIZE);
996 
997  memcpy(ControlFile, content, sizeof(ControlFileData));
998 
999  /* set and validate WalSegSz */
1001 
/* ngettext() picks the singular or plural message according to WalSegSz. */
1003  pg_fatal(ngettext("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte",
1004  "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes",
1005  WalSegSz),
1006  WalSegSz);
1007 
1008  /* Additional checks on control file */
1010 }
1011 
1012 /*
1013  * Get value of GUC parameter restore_command from the target cluster.
1014  *
1015  * This uses a logic based on "postgres -C" to get the value from the
1016  * cluster.
1017  */
/*
 * Does nothing unless restore_wal is set.  Otherwise locates the "postgres"
 * executable, runs it with -D <target data directory>, an optional
 * "-c config_file=..." and "-C restore_command", and stores the first line
 * of its output (line ending stripped) in the global restore_command.  An
 * empty result is a fatal error.
 *
 * NOTE(review): the line with the function name and parameter (listing line
 * 1019) is missing; per the prototype it is getRestoreCommand(const char
 * *argv0).  One argument of the find_other_exec() call (listing line 1031)
 * is missing as well.
 */
1018 static void
1020 {
1021  int rc;
1022  char postgres_exec_path[MAXPGPATH],
1023  cmd_output[MAXPGPATH];
1024  PQExpBuffer postgres_cmd;
1025 
1026  if (!restore_wal)
1027  return;
1028 
1029  /* find postgres executable */
1030  rc = find_other_exec(argv0, "postgres",
1032  postgres_exec_path);
1033 
1034  if (rc < 0)
1035  {
1036  char full_path[MAXPGPATH];
1037 
/* Fall back to the bare program name if our own path cannot be found. */
1038  if (find_my_exec(argv0, full_path) < 0)
1039  strlcpy(full_path, progname, sizeof(full_path));
1040 
/* -1 is reported as "not found", any other negative value as a version mismatch. */
1041  if (rc == -1)
1042  pg_fatal("program \"%s\" is needed by %s but was not found in the same directory as \"%s\"",
1043  "postgres", progname, full_path);
1044  else
1045  pg_fatal("program \"%s\" was found by \"%s\" but was not the same version as %s",
1046  "postgres", full_path, progname);
1047  }
1048 
1049  /*
1050  * Build a command able to retrieve the value of GUC parameter
1051  * restore_command, if set.
1052  */
1053  postgres_cmd = createPQExpBuffer();
1054 
1055  /* path to postgres, properly quoted */
1056  appendShellString(postgres_cmd, postgres_exec_path);
1057 
1058  /* add -D switch, with properly quoted data directory */
1059  appendPQExpBufferStr(postgres_cmd, " -D ");
1060  appendShellString(postgres_cmd, datadir_target);
1061 
1062  /* add custom configuration file only if requested */
1063  if (config_file != NULL)
1064  {
1065  appendPQExpBufferStr(postgres_cmd, " -c config_file=");
1066  appendShellString(postgres_cmd, config_file);
1067  }
1068 
1069  /* add -C switch, for restore_command */
1070  appendPQExpBufferStr(postgres_cmd, " -C restore_command");
1071 
/* Run the command; exit(1) if pipe_read_line() reports failure. */
1072  if (!pipe_read_line(postgres_cmd->data, cmd_output, sizeof(cmd_output)))
1073  exit(1);
1074 
1075  (void) pg_strip_crlf(cmd_output);
1076 
1077  if (strcmp(cmd_output, "") == 0)
1078  pg_fatal("restore_command is not set in the target cluster");
1079 
/* Keep a private copy; cmd_output is a local buffer. */
1080  restore_command = pg_strdup(cmd_output);
1081 
1082  pg_log_debug("using for rewind restore_command = \'%s\'",
1083  restore_command);
1084 
1085  destroyPQExpBuffer(postgres_cmd);
1086 }
1087 
1088 
1089 /*
1090  * Ensure clean shutdown of target instance by launching single-user mode
1091  * postgres to do crash recovery.
1092  */
/*
 * Locates the "postgres" executable and, unless dry_run is set, runs it
 * once as "postgres --single -F -D <target data directory> [-c
 * config_file=...] template1" with standard input redirected from DEVNULL.
 * A non-zero result from system() is reported and ends the program with
 * exit(1).
 *
 * NOTE(review): the line with the function name and parameter (listing line
 * 1094) is missing; per the prototype it is ensureCleanShutdown(const char
 * *argv0).  One argument of the find_other_exec() call (listing line 1103)
 * is missing as well.
 */
1093 static void
1095 {
1096  int ret;
/* NOTE(review): MAXCMDLEN is not used anywhere in the visible code. */
1097 #define MAXCMDLEN (2 * MAXPGPATH)
1098  char exec_path[MAXPGPATH];
1099  PQExpBuffer postgres_cmd;
1100 
1101  /* locate postgres binary */
1102  if ((ret = find_other_exec(argv0, "postgres",
1104  exec_path)) < 0)
1105  {
1106  char full_path[MAXPGPATH];
1107 
/* Fall back to the bare program name if our own path cannot be found. */
1108  if (find_my_exec(argv0, full_path) < 0)
1109  strlcpy(full_path, progname, sizeof(full_path));
1110 
/* -1 is reported as "not found", any other negative value as a version mismatch. */
1111  if (ret == -1)
1112  pg_fatal("program \"%s\" is needed by %s but was not found in the same directory as \"%s\"",
1113  "postgres", progname, full_path);
1114  else
1115  pg_fatal("program \"%s\" was found by \"%s\" but was not the same version as %s",
1116  "postgres", full_path, progname);
1117  }
1118 
1119  pg_log_info("executing \"%s\" for target server to complete crash recovery",
1120  exec_path);
1121 
1122  /*
1123  * Skip processing if requested, but only after ensuring presence of
1124  * postgres.
1125  */
1126  if (dry_run)
1127  return;
1128 
1129  /*
1130  * Finally run postgres in single-user mode. There is no need to use
1131  * fsync here. This makes the recovery faster, and the target data folder
1132  * is synced at the end anyway.
1133  */
1134  postgres_cmd = createPQExpBuffer();
1135 
1136  /* path to postgres, properly quoted */
1137  appendShellString(postgres_cmd, exec_path);
1138 
1139  /* add set of options with properly quoted data directory */
1140  appendPQExpBufferStr(postgres_cmd, " --single -F -D ");
1141  appendShellString(postgres_cmd, datadir_target);
1142 
1143  /* add custom configuration file only if requested */
1144  if (config_file != NULL)
1145  {
1146  appendPQExpBufferStr(postgres_cmd, " -c config_file=");
1147  appendShellString(postgres_cmd, config_file);
1148  }
1149 
1150  /* finish with the database name, and a properly quoted redirection */
1151  appendPQExpBufferStr(postgres_cmd, " template1 < ");
1152  appendShellString(postgres_cmd, DEVNULL);
1153 
/* Flush all of our own output streams before the child process runs. */
1154  fflush(NULL);
1155  if (system(postgres_cmd->data) != 0)
1156  {
1157  pg_log_error("postgres single-user mode in target cluster failed");
1158  pg_log_error_detail("Command was: %s", postgres_cmd->data);
1159  exit(1);
1160  }
1161 
1162  destroyPQExpBuffer(postgres_cmd);
1163 }
1164 
1165 static void
1167 {
1168  if (conn != NULL)
1169  PQfinish(conn);
1170 }
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1537
uint32 BlockNumber
Definition: block.h:31
#define PG_DATA_CHECKSUM_VERSION
Definition: bufpage.h:203
#define Min(x, y)
Definition: c.h:937
#define ngettext(s, p, n)
Definition: c.h:1120
#define PG_TEXTDOMAIN(domain)
Definition: c.h:1153
#define UINT64_FORMAT
Definition: c.h:485
#define CATALOG_VERSION_NO
Definition: catversion.h:60
int find_my_exec(const char *argv0, char *retpath)
Definition: exec.c:152
char * pipe_read_line(char *cmd, char *line, int maxsize)
Definition: exec.c:387
void set_pglocale_pgservice(const char *argv0, const char *app)
Definition: exec.c:460
int find_other_exec(const char *argv0, const char *target, const char *versionstr, char *retpath)
Definition: exec.c:351
void update_controlfile(const char *DataDir, ControlFileData *ControlFile, bool do_sync)
bool datapagemap_next(datapagemap_iterator_t *iter, BlockNumber *blkno)
Definition: datapagemap.c:87
datapagemap_iterator_t * datapagemap_iterate(datapagemap_t *map)
Definition: datapagemap.c:75
#define _(x)
Definition: elog.c:90
char * PQerrorMessage(const PGconn *conn)
Definition: fe-connect.c:6743
ConnStatusType PQstatus(const PGconn *conn)
Definition: fe-connect.c:6690
void PQfinish(PGconn *conn)
Definition: fe-connect.c:4130
PGconn * PQconnectdb(const char *conninfo)
Definition: fe-connect.c:707
char * pg_strdup(const char *in)
Definition: fe_memutils.c:85
void pg_free(void *ptr)
Definition: fe_memutils.c:105
void * pg_malloc(size_t size)
Definition: fe_memutils.c:47
void traverse_datadir(const char *datadir, process_file_callback_t callback)
Definition: file_ops.c:362
char * slurpFile(const char *datadir, const char *path, size_t *filesize)
Definition: file_ops.c:314
void close_target_file(void)
Definition: file_ops.c:75
void truncate_target_file(const char *path, off_t newsize)
Definition: file_ops.c:206
void remove_target(file_entry_t *entry)
Definition: file_ops.c:130
void sync_target_dir(void)
Definition: file_ops.c:294
void create_target(file_entry_t *entry)
Definition: file_ops.c:156
void open_target_file(const char *path, bool trunc)
Definition: file_ops.c:47
void write_target_range(char *buf, off_t begin, size_t size)
Definition: file_ops.c:88
int pg_mode_mask
Definition: file_perm.c:25
bool GetDataDirectoryCreatePerm(const char *dataDir)
void filehash_init(void)
Definition: filemap.c:168
void process_source_file(const char *path, file_type_t type, size_t size, const char *link_target)
Definition: filemap.c:218
void print_filemap(filemap_t *filemap)
Definition: filemap.c:479
void process_target_file(const char *path, file_type_t type, size_t size, const char *link_target)
Definition: filemap.c:254
filemap_t * decide_file_actions(void)
Definition: filemap.c:788
void calculate_totals(filemap_t *filemap)
Definition: filemap.c:438
@ FILE_ACTION_REMOVE
Definition: filemap.h:27
@ FILE_ACTION_COPY
Definition: filemap.h:21
@ FILE_ACTION_NONE
Definition: filemap.h:24
@ FILE_ACTION_COPY_TAIL
Definition: filemap.h:22
@ FILE_ACTION_UNDECIDED
Definition: filemap.h:18
@ FILE_ACTION_TRUNCATE
Definition: filemap.h:26
@ FILE_ACTION_CREATE
Definition: filemap.h:20
int getopt_long(int argc, char *const argv[], const char *optstring, const struct option *longopts, int *longindex)
Definition: getopt_long.c:57
#define no_argument
Definition: getopt_long.h:24
#define required_argument
Definition: getopt_long.h:25
int b
Definition: isn.c:70
int a
Definition: isn.c:69
int i
Definition: isn.c:73
@ CONNECTION_BAD
Definition: libpq-fe.h:61
static void const char fflush(stdout)
Assert(fmt[strlen(fmt) - 1] !='\n')
exit(1)
rewind_source * init_libpq_source(PGconn *conn)
Definition: libpq_source.c:82
rewind_source * init_local_source(const char *datadir)
Definition: local_source.c:40
static struct pg_tm tm
Definition: localtime.c:104
void pg_logging_increase_verbosity(void)
Definition: logging.c:182
void pg_logging_init(const char *argv0)
Definition: logging.c:83
#define pg_log_error(...)
Definition: logging.h:106
#define pg_log_error_hint(...)
Definition: logging.h:112
#define pg_log_info(...)
Definition: logging.h:124
#define pg_log_error_detail(...)
Definition: logging.h:109
#define pg_log_debug(...)
Definition: logging.h:133
void extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, XLogRecPtr endpoint, const char *restoreCommand)
Definition: parsexlog.c:66
void findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex, XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli, XLogRecPtr *lastchkptredo, const char *restoreCommand)
Definition: parsexlog.c:168
XLogRecPtr readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex, const char *restoreCommand)
Definition: parsexlog.c:124
static pg_time_t last_progress_report
Definition: pg_amcheck.c:144
#define pg_fatal(...)
static bool writerecoveryconf
#define MAXPGPATH
#define PG_CONTROL_VERSION
Definition: pg_control.h:25
@ DB_IN_PRODUCTION
Definition: pg_control.h:95
@ DB_IN_ARCHIVE_RECOVERY
Definition: pg_control.h:94
@ DB_SHUTDOWNED_IN_RECOVERY
Definition: pg_control.h:91
@ DB_SHUTDOWNED
Definition: pg_control.h:90
#define PG_CONTROL_FILE_SIZE
Definition: pg_control.h:248
uint32 pg_crc32c
Definition: pg_crc32c.h:38
#define COMP_CRC32C(crc, data, len)
Definition: pg_crc32c.h:89
#define EQ_CRC32C(c1, c2)
Definition: pg_crc32c.h:42
#define INIT_CRC32C(crc)
Definition: pg_crc32c.h:41
#define FIN_CRC32C(crc)
Definition: pg_crc32c.h:94
const void size_t len
return crc
static char * argv0
Definition: pg_ctl.c:92
static char * exec_path
Definition: pg_ctl.c:87
PGDLLIMPORT int optind
Definition: getopt.c:50
PGDLLIMPORT char * optarg
Definition: getopt.c:52
static ControlFileData ControlFile_source
Definition: pg_rewind.c:53
static void createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli, XLogRecPtr checkpointloc)
Definition: pg_rewind.c:928
static void usage(const char *progname)
Definition: pg_rewind.c:84
static void sanityChecks(void)
Definition: pg_rewind.c:679
char * datadir_source
Definition: pg_rewind.c:61
static ControlFileData ControlFile_source_after
Definition: pg_rewind.c:54
int WalSegSz
Definition: pg_rewind.c:57
char * restore_command
Definition: pg_rewind.c:63
static bool debug
Definition: pg_rewind.c:66
int main(int argc, char **argv)
Definition: pg_rewind.c:112
static XLogRecPtr MinXLogRecPtr(XLogRecPtr a, XLogRecPtr b)
Definition: pg_rewind.c:788
static void ensureCleanShutdown(const char *argv0)
Definition: pg_rewind.c:1094
TimeLineHistoryEntry * targetHistory
Definition: pg_rewind.c:73
static rewind_source * source
Definition: pg_rewind.c:81
static ControlFileData ControlFile_target
Definition: pg_rewind.c:52
static void digestControlFile(ControlFileData *ControlFile, const char *content, size_t size)
Definition: pg_rewind.c:990
char * connstr_source
Definition: pg_rewind.c:62
static TimeLineHistoryEntry * getTimelineHistory(ControlFileData *controlFile, int *nentries)
Definition: pg_rewind.c:803
static void checkControlFile(ControlFileData *ControlFile)
Definition: pg_rewind.c:971
static void getRestoreCommand(const char *argv0)
Definition: pg_rewind.c:1019
char * datadir_target
Definition: pg_rewind.c:60
bool do_sync
Definition: pg_rewind.c:69
bool restore_wal
Definition: pg_rewind.c:70
uint64 fetch_done
Definition: pg_rewind.c:78
int targetNentries
Definition: pg_rewind.c:74
void progress_report(bool finished)
Definition: pg_rewind.c:737
uint64 fetch_size
Definition: pg_rewind.c:77
char * config_file
Definition: pg_rewind.c:64
bool dry_run
Definition: pg_rewind.c:68
bool showprogress
Definition: pg_rewind.c:67
const char * progname
Definition: pg_rewind.c:56
static void findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex)
Definition: pg_rewind.c:880
static void perform_rewind(filemap_t *filemap, rewind_source *source, XLogRecPtr chkptrec, TimeLineID chkpttli, XLogRecPtr chkptredo)
Definition: pg_rewind.c:500
static void disconnect_atexit(void)
Definition: pg_rewind.c:1166
static PGconn * conn
Definition: pg_rewind.c:80
TimeLineHistoryEntry * rewind_parseTimeLineHistory(char *buffer, TimeLineID targetTLI, int *nentries)
Definition: timeline.c:29
static char * buf
Definition: pg_test_fsync.c:67
int64 pg_time_t
Definition: pgtime.h:23
const char * get_progname(const char *argv0)
Definition: path.c:574
#define snprintf
Definition: port.h:238
#define DEVNULL
Definition: port.h:160
#define PG_BACKEND_VERSIONSTR
Definition: port.h:143
#define fprintf
Definition: port.h:242
#define printf(...)
Definition: port.h:244
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
PQExpBuffer createPQExpBuffer(void)
Definition: pqexpbuffer.c:72
void destroyPQExpBuffer(PQExpBuffer str)
Definition: pqexpbuffer.c:114
void appendPQExpBufferStr(PQExpBuffer str, const char *data)
Definition: pqexpbuffer.c:367
char * c
void WriteRecoveryConfig(PGconn *pgconn, char *target_dir, PQExpBuffer contents)
Definition: recovery_gen.c:105
PQExpBuffer GenerateRecoveryConfig(PGconn *pgconn, char *replication_slot)
Definition: recovery_gen.c:23
void get_restricted_token(void)
int pg_strip_crlf(char *str)
Definition: string.c:155
void appendShellString(PQExpBuffer buf, const char *str)
Definition: string_utils.c:429
TimeLineID ThisTimeLineID
Definition: pg_control.h:39
XLogRecPtr redo
Definition: pg_control.h:37
uint32 pg_control_version
Definition: pg_control.h:123
uint32 xlog_seg_size
Definition: pg_control.h:209
CheckPoint checkPointCopy
Definition: pg_control.h:133
XLogRecPtr minRecoveryPoint
Definition: pg_control.h:166
uint32 data_checksum_version
Definition: pg_control.h:220
XLogRecPtr checkPoint
Definition: pg_control.h:131
uint64 system_identifier
Definition: pg_control.h:108
uint32 catalog_version_no
Definition: pg_control.h:124
TimeLineID minRecoveryPointTLI
Definition: pg_control.h:167
pg_crc32c crc
Definition: pg_control.h:230
XLogRecPtr begin
Definition: timeline.h:28
TimeLineID tli
Definition: timeline.h:27
XLogRecPtr end
Definition: timeline.h:29
int bitmapsize
Definition: datapagemap.h:18
Definition: filemap.h:50
datapagemap_t target_pages_to_overwrite
Definition: filemap.h:68
const char * path
Definition: filemap.h:53
size_t source_size
Definition: filemap.h:75
size_t target_size
Definition: filemap.h:61
file_action_t action
Definition: filemap.h:81
file_entry_t * entries[FLEXIBLE_ARRAY_MEMBER]
Definition: filemap.h:96
int nentries
Definition: filemap.h:95
uint64 total_size
Definition: filemap.h:92
uint64 fetch_size
Definition: filemap.h:93
void(* queue_fetch_file)(struct rewind_source *, const char *path, size_t len)
Definition: rewind_source.h:60
void(* traverse_files)(struct rewind_source *, process_file_callback_t callback)
Definition: rewind_source.h:29
void(* finish_fetch)(struct rewind_source *)
Definition: rewind_source.h:66
XLogRecPtr(* get_current_wal_insert_lsn)(struct rewind_source *)
Definition: rewind_source.h:71
void(* queue_fetch_range)(struct rewind_source *, const char *path, off_t offset, size_t len)
Definition: rewind_source.h:47
char *(* fetch_file)(struct rewind_source *, const char *path, size_t *filesize)
Definition: rewind_source.h:37
void(* destroy)(struct rewind_source *)
Definition: rewind_source.h:76
static ControlFileData * ControlFile
Definition: xlog.c:570
#define IsValidWalSegSize(size)
Definition: xlog_internal.h:96
#define MAXFNAMELEN
#define XLByteToSeg(xlrp, logSegNo, wal_segsz_bytes)
static void XLogFileName(char *fname, TimeLineID tli, XLogSegNo logSegNo, int wal_segsz_bytes)
static void TLHistoryFilePath(char *path, TimeLineID tli)
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:43
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
uint32 TimeLineID
Definition: xlogdefs.h:59
uint64 XLogSegNo
Definition: xlogdefs.h:48