PostgreSQL Source Code  git master
basebackup_incremental.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * basebackup_incremental.c
4  * code for incremental backup support
5  *
6  * This code isn't actually in charge of taking an incremental backup;
7  * the actual construction of the incremental backup happens in
8  * basebackup.c. Here, we're concerned with providing the necessary
9  * supports for that operation. In particular, we need to parse the
10  * backup manifest supplied by the user taking the incremental backup
11  * and extract the required information from it.
12  *
13  * Portions Copyright (c) 2010-2024, PostgreSQL Global Development Group
14  *
15  * IDENTIFICATION
16  * src/backend/backup/basebackup_incremental.c
17  *
18  *-------------------------------------------------------------------------
19  */
20 #include "postgres.h"
21 
22 #include "access/timeline.h"
23 #include "access/xlog.h"
25 #include "backup/walsummary.h"
26 #include "common/blkreftable.h"
27 #include "common/hashfn.h"
28 #include "common/int.h"
29 #include "common/parse_manifest.h"
30 #include "datatype/timestamp.h"
32 #include "utils/timestamp.h"
33 
34 #define BLOCKS_PER_READ 512
35 
36 /*
37  * We expect to find the last lines of the manifest, including the checksum,
38  * in the last MIN_CHUNK bytes of the manifest. We trigger an incremental
39  * parse step if we are about to overflow MAX_CHUNK bytes.
40  */
41 #define MIN_CHUNK 1024
42 #define MAX_CHUNK (128 * 1024)
43 
44 /*
45  * Details extracted from the WAL ranges present in the supplied backup manifest.
46  */
47 typedef struct
48 {
53 
54 /*
55  * Details extracted from the file list present in the supplied backup manifest.
56  */
57 typedef struct
58 {
60  const char *path;
61  size_t size;
63 
64 static uint32 hash_string_pointer(const char *s);
65 #define SH_PREFIX backup_file
66 #define SH_ELEMENT_TYPE backup_file_entry
67 #define SH_KEY_TYPE const char *
68 #define SH_KEY path
69 #define SH_HASH_KEY(tb, key) hash_string_pointer(key)
70 #define SH_EQUAL(tb, a, b) (strcmp(a, b) == 0)
71 #define SH_SCOPE static inline
72 #define SH_DECLARE
73 #define SH_DEFINE
74 #include "lib/simplehash.h"
75 
77 {
78  /* Memory context for this object and its subsidiary objects. */
80 
81  /* Temporary buffer for storing the manifest while parsing it. */
83 
84  /* WAL ranges extracted from the backup manifest. */
86 
87  /*
88  * Files extracted from the backup manifest.
89  *
90  * We don't really need this information, because we use WAL summaries to
91  * figure out what's changed. It would be unsafe to just rely on the list
92  * of files that existed before, because it's possible for a file to be
93  * removed and a new one created with the same name and different
94  * contents. In such cases, the whole file must still be sent. We can tell
95  * from the WAL summaries whether that happened, but not from the file
96  * list.
97  *
98  * Nonetheless, this data is useful for sanity checking. If a file that we
99  * think we shouldn't need to send is not present in the manifest for the
100  * prior backup, something has gone terribly wrong. We retain the file
101  * names and sizes, but not the checksums or last modified times, for
102  * which we have no use.
103  *
104  * One significant downside of storing this data is that it consumes
105  * memory. If that turns out to be a problem, we might have to decide not
106  * to retain this information, or to make it optional.
107  */
108  backup_file_hash *manifest_files;
109 
110  /*
111  * Block-reference table for the incremental backup.
112  *
113  * It's possible that storing the entire block-reference table in memory
114  * will be a problem for some users. The in-memory format that we're using
115  * here is pretty efficient, converging to little more than 1 bit per
116  * block for relation forks with large numbers of modified blocks. It's
117  * possible, however, that if you try to perform an incremental backup of
118  * a database with a sufficiently large number of relations on a
119  * sufficiently small machine, you could run out of memory here. If that
120  * turns out to be a problem in practice, we'll need to be more clever.
121  */
123 
124  /*
125  * State object for incremental JSON parsing
126  */
128 };
129 
131  int manifest_version);
133  uint64 manifest_system_identifier);
135  char *pathname,
136  size_t size,
137  pg_checksum_type checksum_type,
138  int checksum_length,
139  uint8 *checksum_payload);
141  TimeLineID tli,
142  XLogRecPtr start_lsn,
143  XLogRecPtr end_lsn);
145  const char *fmt,...)
147 static int compare_block_numbers(const void *a, const void *b);
148 
149 /*
150  * Create a new object for storing information extracted from the manifest
151  * supplied when creating an incremental backup.
152  */
155 {
157  MemoryContext oldcontext;
159 
160  oldcontext = MemoryContextSwitchTo(mcxt);
161 
162  ib = palloc0(sizeof(IncrementalBackupInfo));
163  ib->mcxt = mcxt;
164  initStringInfo(&ib->buf);
165 
166  /*
167  * It's hard to guess how many files a "typical" installation will have in
168  * the data directory, but a fresh initdb creates almost 1000 files as of
169  * this writing, so it seems to make sense for our estimate to be
170  * substantially higher.
171  */
172  ib->manifest_files = backup_file_create(mcxt, 10000, NULL);
173 
175  /* Parse the manifest. */
176  context->private_data = ib;
177  context->version_cb = manifest_process_version;
178  context->system_identifier_cb = manifest_process_system_identifier;
179  context->per_file_cb = manifest_process_file;
180  context->per_wal_range_cb = manifest_process_wal_range;
181  context->error_cb = manifest_report_error;
182 
184 
185  MemoryContextSwitchTo(oldcontext);
186 
187  return ib;
188 }
189 
190 /*
191  * Before taking an incremental backup, the caller must supply the backup
192  * manifest from a prior backup. Each chunk of manifest data received
193  * from the client should be passed to this function.
194  */
195 void
/*
 * NOTE(review): extraction dropped source line 196 here -- the signature line
 * naming this function (AppendIncrementalManifestData, per the comment above)
 * and its leading parameters (the IncrementalBackupInfo * and the data
 * pointer); only the trailing "int len" parameter survives below.  It also
 * dropped line 210 (the head of the call whose argument list continues on
 * line 211 -- presumably json_parse_manifest_incremental_chunk; TODO confirm
 * against upstream) and line 218 (the statement that appends the newly
 * received chunk to ib->buf).  Recover these lines from the upstream file
 * before treating this listing as compilable.
 */
197  int len)
198 {
199  MemoryContext oldcontext;
200 
201  /* Switch to our memory context. */
202  oldcontext = MemoryContextSwitchTo(ib->mcxt);
203 
204  if (ib->buf.len > MIN_CHUNK && ib->buf.len + len > MAX_CHUNK)
205  {
206  /*
207  * time for an incremental parse. We'll do all but the last MIN_CHUNK
208  * so that we have enough left for the final piece.
209  */
211  ib->inc_state, ib->buf.data, ib->buf.len - MIN_CHUNK, false);
212  /* now remove what we just parsed */
/* NB: MIN_CHUNK + 1 bytes are moved so the trailing NUL comes along too. */
213  memmove(ib->buf.data, ib->buf.data + (ib->buf.len - MIN_CHUNK),
214  MIN_CHUNK + 1);
215  ib->buf.len = MIN_CHUNK;
216  }
217 
219 
220  /* Switch back to previous memory context. */
221  MemoryContextSwitchTo(oldcontext);
222 }
223 
224 /*
225  * Finalize an IncrementalBackupInfo object after all manifest data has
226  * been supplied via calls to AppendIncrementalManifestData.
227  */
228 void
/*
 * NOTE(review): extraction dropped source line 229 here -- the signature line
 * naming this function (FinalizeIncrementalManifest, per the comment above)
 * and its IncrementalBackupInfo * parameter.  It also dropped line 237 (the
 * head of the call whose argument list continues on line 238 -- presumably
 * json_parse_manifest_incremental_chunk with a final "true" flag marking the
 * last chunk; TODO confirm against upstream) and line 245 (the statement
 * that releases ib->inc_state, matching the comment on line 244).  Recover
 * these lines from the upstream file before treating this listing as
 * compilable.
 */
230 {
231  MemoryContext oldcontext;
232 
233  /* Switch to our memory context. */
234  oldcontext = MemoryContextSwitchTo(ib->mcxt);
235 
236  /* Parse the last chunk of the manifest */
238  ib->inc_state, ib->buf.data, ib->buf.len, true);
239 
/*
 * Setting buf.data to NULL after freeing it lets later code (e.g. the
 * Assert(ib->buf.data == NULL) seen further down in this file) verify that
 * manifest ingestion is finished.
 */
240  /* Done with the buffer, so release memory. */
241  pfree(ib->buf.data);
242  ib->buf.data = NULL;
243 
244  /* Done with inc_state, so release that memory too */
246 
247  /* Switch back to previous memory context. */
248  MemoryContextSwitchTo(oldcontext);
249 }
250 
251 /*
252  * Prepare to take an incremental backup.
253  *
254  * Before this function is called, AppendIncrementalManifestData and
255  * FinalizeIncrementalManifest should have already been called to pass all
256  * the manifest data to this object.
257  *
258  * This function performs sanity checks on the data extracted from the
259  * manifest and figures out for which WAL ranges we need summaries, and
260  * whether those summaries are available. Then, it reads and combines the
261  * data from those summary files. It also updates the backup_state with the
262  * reference TLI and LSN for the prior backup.
263  */
264 void
267 {
268  MemoryContext oldcontext;
270  List *all_wslist,
271  *required_wslist = NIL;
272  ListCell *lc;
273  TimeLineHistoryEntry **tlep;
274  int num_wal_ranges;
275  int i;
276  bool found_backup_start_tli = false;
277  TimeLineID earliest_wal_range_tli = 0;
278  XLogRecPtr earliest_wal_range_start_lsn = InvalidXLogRecPtr;
279  TimeLineID latest_wal_range_tli = 0;
280  XLogRecPtr summarized_lsn;
281  XLogRecPtr pending_lsn;
282  XLogRecPtr prior_pending_lsn = InvalidXLogRecPtr;
283  int deadcycles = 0;
284  TimestampTz initial_time,
285  current_time;
286 
287  Assert(ib->buf.data == NULL);
288 
289  /* Switch to our memory context. */
290  oldcontext = MemoryContextSwitchTo(ib->mcxt);
291 
292  /*
293  * A valid backup manifest must always contain at least one WAL range
294  * (usually exactly one, unless the backup spanned a timeline switch).
295  */
296  num_wal_ranges = list_length(ib->manifest_wal_ranges);
297  if (num_wal_ranges == 0)
298  ereport(ERROR,
299  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
300  errmsg("manifest contains no required WAL ranges")));
301 
302  /*
303  * Match up the TLIs that appear in the WAL ranges of the backup manifest
304  * with those that appear in this server's timeline history. We expect
305  * every backup_wal_range to match to a TimeLineHistoryEntry; if it does
306  * not, that's an error.
307  *
308  * This loop also decides which of the WAL ranges in the manifest is most
309  * ancient and which one is the newest, according to the timeline history
310  * of this server, and stores TLIs of those WAL ranges into
311  * earliest_wal_range_tli and latest_wal_range_tli. It also updates
312  * earliest_wal_range_start_lsn to the start LSN of the WAL range for
313  * earliest_wal_range_tli.
314  *
315  * Note that the return value of readTimeLineHistory puts the latest
316  * timeline at the beginning of the list, not the end. Hence, the earliest
317  * TLI is the one that occurs nearest the end of the list returned by
318  * readTimeLineHistory, and the latest TLI is the one that occurs closest
319  * to the beginning.
320  */
322  tlep = palloc0(num_wal_ranges * sizeof(TimeLineHistoryEntry *));
323  for (i = 0; i < num_wal_ranges; ++i)
324  {
326  bool saw_earliest_wal_range_tli = false;
327  bool saw_latest_wal_range_tli = false;
328 
329  /* Search this server's history for this WAL range's TLI. */
330  foreach(lc, expectedTLEs)
331  {
332  TimeLineHistoryEntry *tle = lfirst(lc);
333 
334  if (tle->tli == range->tli)
335  {
336  tlep[i] = tle;
337  break;
338  }
339 
340  if (tle->tli == earliest_wal_range_tli)
341  saw_earliest_wal_range_tli = true;
342  if (tle->tli == latest_wal_range_tli)
343  saw_latest_wal_range_tli = true;
344  }
345 
346  /*
347  * An incremental backup can only be taken relative to a backup that
348  * represents a previous state of this server. If the backup requires
349  * WAL from a timeline that's not in our history, that definitely
350  * isn't the case.
351  */
352  if (tlep[i] == NULL)
353  ereport(ERROR,
354  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
355  errmsg("timeline %u found in manifest, but not in this server's history",
356  range->tli)));
357 
358  /*
359  * If we found this TLI in the server's history before encountering
360  * the latest TLI seen so far in the server's history, then this TLI
361  * is the latest one seen so far.
362  *
363  * If on the other hand we saw the earliest TLI seen so far before
364  * finding this TLI, this TLI is earlier than the earliest one seen so
365  * far. And if this is the first TLI for which we've searched, it's
366  * also the earliest one seen so far.
367  *
368  * On the first loop iteration, both things should necessarily be
369  * true.
370  */
371  if (!saw_latest_wal_range_tli)
372  latest_wal_range_tli = range->tli;
373  if (earliest_wal_range_tli == 0 || saw_earliest_wal_range_tli)
374  {
375  earliest_wal_range_tli = range->tli;
376  earliest_wal_range_start_lsn = range->start_lsn;
377  }
378  }
379 
380  /*
381  * Propagate information about the prior backup into the backup_label that
382  * will be generated for this backup.
383  */
384  backup_state->istartpoint = earliest_wal_range_start_lsn;
385  backup_state->istarttli = earliest_wal_range_tli;
386 
387  /*
388  * Sanity check start and end LSNs for the WAL ranges in the manifest.
389  *
390  * Commonly, there won't be any timeline switches during the prior backup
391  * at all, but if there are, they should happen at the same LSNs that this
392  * server switched timelines.
393  *
394  * Whether there are any timeline switches during the prior backup or not,
395  * the prior backup shouldn't require any WAL from a timeline prior to the
396  * start of that timeline. It also shouldn't require any WAL from later
397  * than the start of this backup.
398  *
399  * If any of these sanity checks fail, one possible explanation is that
400  * the user has generated WAL on the same timeline with the same LSNs more
401  * than once. For instance, if two standbys running on timeline 1 were
402  * both promoted and (due to a broken archiving setup) both selected new
403  * timeline ID 2, then it's possible that one of these checks might trip.
404  *
405  * Note that there are lots of ways for the user to do something very bad
406  * without tripping any of these checks, and they are not intended to be
407  * comprehensive. It's pretty hard to see how we could be certain of
408  * anything here. However, if there's a problem staring us right in the
409  * face, it's best to report it, so we do.
410  */
411  for (i = 0; i < num_wal_ranges; ++i)
412  {
414 
415  if (range->tli == earliest_wal_range_tli)
416  {
417  if (range->start_lsn < tlep[i]->begin)
418  ereport(ERROR,
419  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
420  errmsg("manifest requires WAL from initial timeline %u starting at %X/%X, but that timeline begins at %X/%X",
421  range->tli,
422  LSN_FORMAT_ARGS(range->start_lsn),
423  LSN_FORMAT_ARGS(tlep[i]->begin))));
424  }
425  else
426  {
427  if (range->start_lsn != tlep[i]->begin)
428  ereport(ERROR,
429  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
430  errmsg("manifest requires WAL from continuation timeline %u starting at %X/%X, but that timeline begins at %X/%X",
431  range->tli,
432  LSN_FORMAT_ARGS(range->start_lsn),
433  LSN_FORMAT_ARGS(tlep[i]->begin))));
434  }
435 
436  if (range->tli == latest_wal_range_tli)
437  {
438  if (range->end_lsn > backup_state->startpoint)
439  ereport(ERROR,
440  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
441  errmsg("manifest requires WAL from final timeline %u ending at %X/%X, but this backup starts at %X/%X",
442  range->tli,
443  LSN_FORMAT_ARGS(range->end_lsn),
445  }
446  else
447  {
448  if (range->end_lsn != tlep[i]->end)
449  ereport(ERROR,
450  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
451  errmsg("manifest requires WAL from non-final timeline %u ending at %X/%X, but this server switched timelines at %X/%X",
452  range->tli,
453  LSN_FORMAT_ARGS(range->end_lsn),
454  LSN_FORMAT_ARGS(tlep[i]->end))));
455  }
456 
457  }
458 
459  /*
460  * Wait for WAL summarization to catch up to the backup start LSN (but
461  * time out if it doesn't do so quickly enough).
462  */
463  initial_time = current_time = GetCurrentTimestamp();
464  while (1)
465  {
466  long timeout_in_ms = 10000;
467  long elapsed_seconds;
468 
469  /*
470  * Align the wait time to prevent drift. This doesn't really matter,
471  * but we'd like the warnings about how long we've been waiting to say
472  * 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever
473  * drifting to something that is not a multiple of ten.
474  */
475  timeout_in_ms -=
476  TimestampDifferenceMilliseconds(initial_time, current_time) %
477  timeout_in_ms;
478 
479  /* Wait for up to 10 seconds. */
481  timeout_in_ms, &pending_lsn);
482 
483  /* If WAL summarization has progressed sufficiently, stop waiting. */
484  if (summarized_lsn >= backup_state->startpoint)
485  break;
486 
487  /*
488  * Keep track of the number of cycles during which there has been no
489  * progression of pending_lsn. If pending_lsn is not advancing, that
490  * means that not only are no new files appearing on disk, but we're
491  * not even incorporating new records into the in-memory state.
492  */
493  if (pending_lsn > prior_pending_lsn)
494  {
495  prior_pending_lsn = pending_lsn;
496  deadcycles = 0;
497  }
498  else
499  ++deadcycles;
500 
501  /*
502  * If we've managed to wait for an entire minute without the WAL
503  * summarizer absorbing a single WAL record, error out; probably
504  * something is wrong.
505  *
506  * We could consider also erroring out if the summarizer is taking too
507  * long to catch up, but it's not clear what rate of progress would be
508  * acceptable and what would be too slow. So instead, we just try to
509  * error out in the case where there's no progress at all. That seems
510  * likely to catch a reasonable number of the things that can go wrong
511  * in practice (e.g. the summarizer process is completely hung, say
512  * because somebody hooked up a debugger to it or something) without
513  * giving up too quickly when the system is just slow.
514  */
515  if (deadcycles >= 6)
516  ereport(ERROR,
517  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
518  errmsg("WAL summarization is not progressing"),
519  errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.",
521  LSN_FORMAT_ARGS(summarized_lsn),
522  LSN_FORMAT_ARGS(pending_lsn))));
523 
524  /*
525  * Otherwise, just let the user know what's happening.
526  */
527  current_time = GetCurrentTimestamp();
528  elapsed_seconds =
529  TimestampDifferenceMilliseconds(initial_time, current_time) / 1000;
531  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
532  errmsg("still waiting for WAL summarization through %X/%X after %ld seconds",
534  elapsed_seconds),
535  errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.",
536  LSN_FORMAT_ARGS(summarized_lsn),
537  LSN_FORMAT_ARGS(pending_lsn))));
538  }
539 
540  /*
541  * Retrieve a list of all WAL summaries on any timeline that overlap with
542  * the LSN range of interest. We could instead call GetWalSummaries() once
543  * per timeline in the loop that follows, but that would involve reading
544  * the directory multiple times. It should be mildly faster - and perhaps
545  * a bit safer - to do it just once.
546  */
547  all_wslist = GetWalSummaries(0, earliest_wal_range_start_lsn,
549 
550  /*
551  * We need WAL summaries for everything that happened during the prior
552  * backup and everything that happened afterward up until the point where
553  * the current backup started.
554  */
555  foreach(lc, expectedTLEs)
556  {
557  TimeLineHistoryEntry *tle = lfirst(lc);
558  XLogRecPtr tli_start_lsn = tle->begin;
559  XLogRecPtr tli_end_lsn = tle->end;
560  XLogRecPtr tli_missing_lsn = InvalidXLogRecPtr;
561  List *tli_wslist;
562 
563  /*
564  * Working through the history of this server from the current
565  * timeline backwards, we skip everything until we find the timeline
566  * where this backup started. Most of the time, this means we won't
567  * skip anything at all, as it's unlikely that the timeline has
568  * changed since the beginning of the backup moments ago.
569  */
570  if (tle->tli == backup_state->starttli)
571  {
572  found_backup_start_tli = true;
573  tli_end_lsn = backup_state->startpoint;
574  }
575  else if (!found_backup_start_tli)
576  continue;
577 
578  /*
579  * Find the summaries that overlap the LSN range of interest for this
580  * timeline. If this is the earliest timeline involved, the range of
581  * interest begins with the start LSN of the prior backup; otherwise,
582  * it begins at the LSN at which this timeline came into existence. If
583  * this is the latest TLI involved, the range of interest ends at the
584  * start LSN of the current backup; otherwise, it ends at the point
585  * where we switched from this timeline to the next one.
586  */
587  if (tle->tli == earliest_wal_range_tli)
588  tli_start_lsn = earliest_wal_range_start_lsn;
589  tli_wslist = FilterWalSummaries(all_wslist, tle->tli,
590  tli_start_lsn, tli_end_lsn);
591 
592  /*
593  * There is no guarantee that the WAL summaries we found cover the
594  * entire range of LSNs for which summaries are required, or indeed
595  * that we found any WAL summaries at all. Check whether we have a
596  * problem of that sort.
597  */
598  if (!WalSummariesAreComplete(tli_wslist, tli_start_lsn, tli_end_lsn,
599  &tli_missing_lsn))
600  {
601  if (XLogRecPtrIsInvalid(tli_missing_lsn))
602  ereport(ERROR,
603  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
604  errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but no summaries for that timeline and LSN range exist",
605  tle->tli,
606  LSN_FORMAT_ARGS(tli_start_lsn),
607  LSN_FORMAT_ARGS(tli_end_lsn))));
608  else
609  ereport(ERROR,
610  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
611  errmsg("WAL summaries are required on timeline %u from %X/%X to %X/%X, but the summaries for that timeline and LSN range are incomplete",
612  tle->tli,
613  LSN_FORMAT_ARGS(tli_start_lsn),
614  LSN_FORMAT_ARGS(tli_end_lsn)),
615  errdetail("The first unsummarized LSN in this range is %X/%X.",
616  LSN_FORMAT_ARGS(tli_missing_lsn))));
617  }
618 
619  /*
620  * Remember that we need to read these summaries.
621  *
622  * Technically, it's possible that this could read more files than
623  * required, since tli_wslist in theory could contain redundant
624  * summaries. For instance, if we have a summary from 0/10000000 to
625  * 0/20000000 and also one from 0/00000000 to 0/30000000, then the
626  * latter subsumes the former and the former could be ignored.
627  *
628  * We ignore this possibility because the WAL summarizer only tries to
629  * generate summaries that do not overlap. If somehow they exist,
630  * we'll do a bit of extra work but the results should still be
631  * correct.
632  */
633  required_wslist = list_concat(required_wslist, tli_wslist);
634 
635  /*
636  * Timelines earlier than the one in which the prior backup began are
637  * not relevant.
638  */
639  if (tle->tli == earliest_wal_range_tli)
640  break;
641  }
642 
643  /*
644  * Read all of the required block reference table files and merge all of
645  * the data into a single in-memory block reference table.
646  *
647  * See the comments for struct IncrementalBackupInfo for some thoughts on
648  * memory usage.
649  */
651  foreach(lc, required_wslist)
652  {
653  WalSummaryFile *ws = lfirst(lc);
654  WalSummaryIO wsio;
655  BlockRefTableReader *reader;
656  RelFileLocator rlocator;
657  ForkNumber forknum;
658  BlockNumber limit_block;
660 
661  wsio.file = OpenWalSummaryFile(ws, false);
662  wsio.filepos = 0;
663  ereport(DEBUG1,
664  (errmsg_internal("reading WAL summary file \"%s\"",
665  FilePathName(wsio.file))));
667  FilePathName(wsio.file),
668  ReportWalSummaryError, NULL);
669  while (BlockRefTableReaderNextRelation(reader, &rlocator, &forknum,
670  &limit_block))
671  {
672  BlockRefTableSetLimitBlock(ib->brtab, &rlocator,
673  forknum, limit_block);
674 
675  while (1)
676  {
677  unsigned nblocks;
678  unsigned i;
679 
680  nblocks = BlockRefTableReaderGetBlocks(reader, blocks,
682  if (nblocks == 0)
683  break;
684 
685  for (i = 0; i < nblocks; ++i)
686  BlockRefTableMarkBlockModified(ib->brtab, &rlocator,
687  forknum, blocks[i]);
688  }
689  }
691  FileClose(wsio.file);
692  }
693 
694  /* Switch back to previous memory context. */
695  MemoryContextSwitchTo(oldcontext);
696 }
697 
698 /*
699  * Get the pathname that should be used when a file is sent incrementally.
700  *
701  * The result is a palloc'd string.
702  */
703 char *
704 GetIncrementalFilePath(Oid dboid, Oid spcoid, RelFileNumber relfilenumber,
705  ForkNumber forknum, unsigned segno)
706 {
707  char *path;
708  char *lastslash;
709  char *ipath;
710 
711  path = GetRelationPath(dboid, spcoid, relfilenumber, INVALID_PROC_NUMBER,
712  forknum);
713 
714  lastslash = strrchr(path, '/');
715  Assert(lastslash != NULL);
716  *lastslash = '\0';
717 
718  if (segno > 0)
719  ipath = psprintf("%s/INCREMENTAL.%s.%u", path, lastslash + 1, segno);
720  else
721  ipath = psprintf("%s/INCREMENTAL.%s", path, lastslash + 1);
722 
723  pfree(path);
724 
725  return ipath;
726 }
727 
728 /*
729  * How should we back up a particular file as part of an incremental backup?
730  *
731  * If the return value is BACK_UP_FILE_FULLY, caller should back up the whole
732  * file just as if this were not an incremental backup. The contents of the
733  * relative_block_numbers array are unspecified in this case.
734  *
735  * If the return value is BACK_UP_FILE_INCREMENTALLY, caller should include
736  * an incremental file in the backup instead of the entire file. On return,
737  * *num_blocks_required will be set to the number of blocks that need to be
738  * sent, and the actual block numbers will have been stored in
739  * relative_block_numbers, which should be an array of at least RELSEG_SIZE.
740  * In addition, *truncation_block_length will be set to the value that should
741  * be included in the incremental file.
742  */
745  Oid dboid, Oid spcoid,
746  RelFileNumber relfilenumber, ForkNumber forknum,
747  unsigned segno, size_t size,
748  unsigned *num_blocks_required,
749  BlockNumber *relative_block_numbers,
750  unsigned *truncation_block_length)
751 {
752  BlockNumber limit_block;
753  BlockNumber start_blkno;
754  BlockNumber stop_blkno;
755  RelFileLocator rlocator;
756  BlockRefTableEntry *brtentry;
757  unsigned i;
758  unsigned nblocks;
759 
760  /* Should only be called after PrepareForIncrementalBackup. */
761  Assert(ib->buf.data == NULL);
762 
763  /*
764  * dboid could be InvalidOid if shared rel, but spcoid and relfilenumber
765  * should have legal values.
766  */
767  Assert(OidIsValid(spcoid));
768  Assert(RelFileNumberIsValid(relfilenumber));
769 
770  /*
771  * If the file size is too large or not a multiple of BLCKSZ, then
772  * something weird is happening, so give up and send the whole file.
773  */
774  if ((size % BLCKSZ) != 0 || size / BLCKSZ > RELSEG_SIZE)
775  return BACK_UP_FILE_FULLY;
776 
777  /*
778  * The free-space map fork is not properly WAL-logged, so we need to
779  * backup the entire file every time.
780  */
781  if (forknum == FSM_FORKNUM)
782  return BACK_UP_FILE_FULLY;
783 
784  /*
785  * If this file was not part of the prior backup, back it up fully.
786  *
787  * If this file was created after the prior backup and before the start of
788  * the current backup, then the WAL summary information will tell us to
789  * back up the whole file. However, if this file was created after the
790  * start of the current backup, then the WAL summary won't know anything
791  * about it. Without this logic, we would erroneously conclude that it was
792  * OK to send it incrementally.
793  *
794  * Note that the file could have existed at the time of the prior backup,
795  * gotten deleted, and then a new file with the same name could have been
796  * created. In that case, this logic won't prevent the file from being
797  * backed up incrementally. But, if the deletion happened before the start
798  * of the current backup, the limit block will be 0, inducing a full
799  * backup. If the deletion happened after the start of the current backup,
800  * reconstruction will erroneously combine blocks from the current
801  * lifespan of the file with blocks from the previous lifespan -- but in
802  * this type of case, WAL replay to reach backup consistency should remove
803  * and recreate the file anyway, so the initial bogus contents should not
804  * matter.
805  */
806  if (backup_file_lookup(ib->manifest_files, path) == NULL)
807  {
808  char *ipath;
809 
810  ipath = GetIncrementalFilePath(dboid, spcoid, relfilenumber,
811  forknum, segno);
812  if (backup_file_lookup(ib->manifest_files, ipath) == NULL)
813  return BACK_UP_FILE_FULLY;
814  }
815 
816  /*
817  * Look up the special block reference table entry for the database as a
818  * whole.
819  */
820  rlocator.spcOid = spcoid;
821  rlocator.dbOid = dboid;
822  rlocator.relNumber = 0;
823  if (BlockRefTableGetEntry(ib->brtab, &rlocator, MAIN_FORKNUM,
824  &limit_block) != NULL)
825  {
826  /*
827  * According to the WAL summary, this database OID/tablespace OID
828  * pairing has been created since the previous backup. So, everything
829  * in it must be backed up fully.
830  */
831  return BACK_UP_FILE_FULLY;
832  }
833 
834  /* Look up the block reference table entry for this relfilenode. */
835  rlocator.relNumber = relfilenumber;
836  brtentry = BlockRefTableGetEntry(ib->brtab, &rlocator, forknum,
837  &limit_block);
838 
839  /*
840  * If there is no entry, then there have been no WAL-logged changes to the
841  * relation since the predecessor backup was taken, so we can back it up
842  * incrementally and need not include any modified blocks.
843  *
844  * However, if the file is zero-length, we should do a full backup,
845  * because an incremental file is always more than zero length, and it's
846  * silly to take an incremental backup when a full backup would be
847  * smaller.
848  */
849  if (brtentry == NULL)
850  {
851  if (size == 0)
852  return BACK_UP_FILE_FULLY;
853  *num_blocks_required = 0;
854  *truncation_block_length = size / BLCKSZ;
856  }
857 
858  /*
859  * If the limit_block is less than or equal to the point where this
860  * segment starts, send the whole file.
861  */
862  if (limit_block <= segno * RELSEG_SIZE)
863  return BACK_UP_FILE_FULLY;
864 
865  /*
866  * Get relevant entries from the block reference table entry.
867  *
868  * We shouldn't overflow computing the start or stop block numbers, but if
869  * it manages to happen somehow, detect it and throw an error.
870  */
871  start_blkno = segno * RELSEG_SIZE;
872  stop_blkno = start_blkno + (size / BLCKSZ);
873  if (start_blkno / RELSEG_SIZE != segno || stop_blkno < start_blkno)
874  ereport(ERROR,
875  errcode(ERRCODE_INTERNAL_ERROR),
876  errmsg_internal("overflow computing block number bounds for segment %u with size %zu",
877  segno, size));
878 
879  /*
880  * This will write *absolute* block numbers into the output array, but
881  * we'll transpose them below.
882  */
883  nblocks = BlockRefTableEntryGetBlocks(brtentry, start_blkno, stop_blkno,
884  relative_block_numbers, RELSEG_SIZE);
885  Assert(nblocks <= RELSEG_SIZE);
886 
887  /*
888  * If we're going to have to send nearly all of the blocks, then just send
889  * the whole file, because that won't require much extra storage or
890  * transfer and will speed up and simplify backup restoration. It's not
891  * clear what threshold is most appropriate here and perhaps it ought to
892  * be configurable, but for now we're just going to say that if we'd need
893  * to send 90% of the blocks anyway, give up and send the whole file.
894  *
895  * NB: If you change the threshold here, at least make sure to back up the
896  * file fully when every single block must be sent, because there's
897  * nothing good about sending an incremental file in that case.
898  */
899  if (nblocks * BLCKSZ > size * 0.9)
900  return BACK_UP_FILE_FULLY;
901 
902  /*
903  * Looks like we can send an incremental file, so sort the block numbers
904  * and then transpose them from absolute block numbers to relative block
905  * numbers if necessary.
906  *
907  * NB: If the block reference table was using the bitmap representation
908  * for a given chunk, the block numbers in that chunk will already be
909  * sorted, but when the array-of-offsets representation is used, we can
910  * receive block numbers here out of order.
911  */
912  qsort(relative_block_numbers, nblocks, sizeof(BlockNumber),
914  if (start_blkno != 0)
915  {
916  for (i = 0; i < nblocks; ++i)
917  relative_block_numbers[i] -= start_blkno;
918  }
919  *num_blocks_required = nblocks;
920 
921  /*
922  * The truncation block length is the minimum length of the reconstructed
923  * file. Any block numbers below this threshold that are not present in
924  * the backup need to be fetched from the prior backup. At or above this
925  * threshold, blocks should only be included in the result if they are
926  * present in the backup. (This may require inserting zero blocks if the
927  * blocks included in the backup are non-consecutive.)
928  */
929  *truncation_block_length = size / BLCKSZ;
930  if (BlockNumberIsValid(limit_block))
931  {
932  unsigned relative_limit = limit_block - segno * RELSEG_SIZE;
933 
934  if (*truncation_block_length < relative_limit)
935  *truncation_block_length = relative_limit;
936  }
937 
938  /* Send it incrementally. */
940 }
941 
942 /*
943  * Compute the size for a header of an incremental file containing a given
944  * number of blocks. The header is rounded to a multiple of BLCKSZ, but
945  * only if the file will store some block data.
946  */
947 extern size_t
948 GetIncrementalHeaderSize(unsigned num_blocks_required)
949 {
950  size_t result;
951 
952  /* Make sure we're not going to overflow. */
953  Assert(num_blocks_required <= RELSEG_SIZE);
954 
955  /*
956  * Three four byte quantities (magic number, truncation block length,
957  * block count) followed by block numbers.
958  */
959  result = 3 * sizeof(uint32) + (sizeof(BlockNumber) * num_blocks_required);
960 
961  /*
962  * Round the header size to a multiple of BLCKSZ - when not a multiple of
963  * BLCKSZ, add the missing fraction of a block. But do this only if the
964  * file will store data for some blocks, otherwise keep it small.
965  */
966  if ((num_blocks_required > 0) && (result % BLCKSZ != 0))
967  result += BLCKSZ - (result % BLCKSZ);
968 
969  return result;
970 }
971 
972 /*
973  * Compute the size for an incremental file containing a given number of blocks.
974  */
975 extern size_t
976 GetIncrementalFileSize(unsigned num_blocks_required)
977 {
978  size_t result;
979 
980  /* Make sure we're not going to overflow. */
981  Assert(num_blocks_required <= RELSEG_SIZE);
982 
983  /*
984  * Header with three four byte quantities (magic number, truncation block
985  * length, block count) followed by block numbers, rounded to a multiple
986  * of BLCKSZ (for files with block data), followed by block contents.
987  */
988  result = GetIncrementalHeaderSize(num_blocks_required);
989  result += BLCKSZ * num_blocks_required;
990 
991  return result;
992 }
993 
994 /*
995  * Helper function for filemap hash table.
996  */
997 static uint32
998 hash_string_pointer(const char *s)
999 {
1000  unsigned char *ss = (unsigned char *) s;
1001 
1002  return hash_bytes(ss, strlen(s));
1003 }
1004 
1005 /*
1006  * This callback to validate the manifest version for incremental backup.
1007  */
1008 static void
1010  int manifest_version)
1011 {
1012  /* Incremental backups don't work with manifest version 1 */
1013  if (manifest_version == 1)
1014  context->error_cb(context,
1015  "backup manifest version 1 does not support incremental backup");
1016 }
1017 
1018 /*
1019  * This callback to validate the manifest system identifier against the current
1020  * database server.
1021  */
1022 static void
1024  uint64 manifest_system_identifier)
1025 {
1026  uint64 system_identifier;
1027 
1028  /* Get system identifier of current system */
1029  system_identifier = GetSystemIdentifier();
1030 
1031  if (manifest_system_identifier != system_identifier)
1032  context->error_cb(context,
1033  "manifest system identifier is %llu, but database system identifier is %llu",
1034  (unsigned long long) manifest_system_identifier,
1035  (unsigned long long) system_identifier);
1036 }
1037 
1038 /*
1039  * This callback is invoked for each file mentioned in the backup manifest.
1040  *
1041  * We store the path to each file and the size of each file for sanity-checking
1042  * purposes. For further details, see comments for IncrementalBackupInfo.
1043  */
1044 static void
1046  char *pathname, size_t size,
1047  pg_checksum_type checksum_type,
1048  int checksum_length,
1049  uint8 *checksum_payload)
1050 {
1051  IncrementalBackupInfo *ib = context->private_data;
1052  backup_file_entry *entry;
1053  bool found;
1054 
1055  entry = backup_file_insert(ib->manifest_files, pathname, &found);
1056  if (!found)
1057  {
1058  entry->path = MemoryContextStrdup(ib->manifest_files->ctx,
1059  pathname);
1060  entry->size = size;
1061  }
1062 }
1063 
1064 /*
1065  * This callback is invoked for each WAL range mentioned in the backup
1066  * manifest.
1067  *
1068  * We're just interested in learning the oldest LSN and the corresponding TLI
1069  * that appear in any WAL range.
1070  */
1071 static void
1073  TimeLineID tli, XLogRecPtr start_lsn,
1074  XLogRecPtr end_lsn)
1075 {
1076  IncrementalBackupInfo *ib = context->private_data;
1078 
1079  range->tli = tli;
1080  range->start_lsn = start_lsn;
1081  range->end_lsn = end_lsn;
1083 }
1084 
1085 /*
1086  * This callback is invoked if an error occurs while parsing the backup
1087  * manifest.
1088  */
1089 static void
1091 {
1092  StringInfoData errbuf;
1093 
1094  initStringInfo(&errbuf);
1095 
1096  for (;;)
1097  {
1098  va_list ap;
1099  int needed;
1100 
1101  va_start(ap, fmt);
1102  needed = appendStringInfoVA(&errbuf, fmt, ap);
1103  va_end(ap);
1104  if (needed == 0)
1105  break;
1106  enlargeStringInfo(&errbuf, needed);
1107  }
1108 
1109  ereport(ERROR,
1110  errmsg_internal("%s", errbuf.data));
1111 }
1112 
1113 /*
1114  * Quicksort comparator for block numbers.
1115  */
1116 static int
1117 compare_block_numbers(const void *a, const void *b)
1118 {
1119  BlockNumber aa = *(BlockNumber *) a;
1120  BlockNumber bb = *(BlockNumber *) b;
1121 
1122  return pg_cmp_u32(aa, bb);
1123 }
List * readTimeLineHistory(TimeLineID targetTLI)
Definition: timeline.c:76
long TimestampDifferenceMilliseconds(TimestampTz start_time, TimestampTz stop_time)
Definition: timestamp.c:1766
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1654
#define BLOCKS_PER_READ
void AppendIncrementalManifestData(IncrementalBackupInfo *ib, const char *data, int len)
static void manifest_report_error(JsonManifestParseContext *ib, const char *fmt,...) pg_attribute_printf(2
static void manifest_process_version(JsonManifestParseContext *context, int manifest_version)
static int compare_block_numbers(const void *a, const void *b)
static uint32 hash_string_pointer(const char *s)
size_t GetIncrementalHeaderSize(unsigned num_blocks_required)
char * GetIncrementalFilePath(Oid dboid, Oid spcoid, RelFileNumber relfilenumber, ForkNumber forknum, unsigned segno)
#define MAX_CHUNK
static void manifest_process_system_identifier(JsonManifestParseContext *context, uint64 manifest_system_identifier)
#define MIN_CHUNK
size_t GetIncrementalFileSize(unsigned num_blocks_required)
static void manifest_process_file(JsonManifestParseContext *context, char *pathname, size_t size, pg_checksum_type checksum_type, int checksum_length, uint8 *checksum_payload)
FileBackupMethod GetFileBackupMethod(IncrementalBackupInfo *ib, const char *path, Oid dboid, Oid spcoid, RelFileNumber relfilenumber, ForkNumber forknum, unsigned segno, size_t size, unsigned *num_blocks_required, BlockNumber *relative_block_numbers, unsigned *truncation_block_length)
static void pg_attribute_noreturn()
static void manifest_process_wal_range(JsonManifestParseContext *context, TimeLineID tli, XLogRecPtr start_lsn, XLogRecPtr end_lsn)
IncrementalBackupInfo * CreateIncrementalBackupInfo(MemoryContext mcxt)
void PrepareForIncrementalBackup(IncrementalBackupInfo *ib, BackupState *backup_state)
void FinalizeIncrementalManifest(IncrementalBackupInfo *ib)
@ BACK_UP_FILE_INCREMENTALLY
@ BACK_UP_FILE_FULLY
BlockRefTableReader * CreateBlockRefTableReader(io_callback_fn read_callback, void *read_callback_arg, char *error_filename, report_error_fn error_callback, void *error_callback_arg)
Definition: blkreftable.c:577
bool BlockRefTableReaderNextRelation(BlockRefTableReader *reader, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *limit_block)
Definition: blkreftable.c:613
int BlockRefTableEntryGetBlocks(BlockRefTableEntry *entry, BlockNumber start_blkno, BlockNumber stop_blkno, BlockNumber *blocks, int nblocks)
Definition: blkreftable.c:369
void BlockRefTableMarkBlockModified(BlockRefTable *brtab, const RelFileLocator *rlocator, ForkNumber forknum, BlockNumber blknum)
Definition: blkreftable.c:297
BlockRefTableEntry * BlockRefTableGetEntry(BlockRefTable *brtab, const RelFileLocator *rlocator, ForkNumber forknum, BlockNumber *limit_block)
Definition: blkreftable.c:340
unsigned BlockRefTableReaderGetBlocks(BlockRefTableReader *reader, BlockNumber *blocks, int nblocks)
Definition: blkreftable.c:689
void BlockRefTableSetLimitBlock(BlockRefTable *brtab, const RelFileLocator *rlocator, ForkNumber forknum, BlockNumber limit_block)
Definition: blkreftable.c:262
void DestroyBlockRefTableReader(BlockRefTableReader *reader)
Definition: blkreftable.c:773
void(*) BlockRefTable CreateEmptyBlockRefTable)(void)
uint32 BlockNumber
Definition: block.h:31
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
unsigned int uint32
Definition: c.h:506
#define Assert(condition)
Definition: c.h:858
#define pg_attribute_printf(f, a)
Definition: c.h:191
unsigned char uint8
Definition: c.h:504
#define OidIsValid(objectId)
Definition: c.h:775
pg_checksum_type
int64 TimestampTz
Definition: timestamp.h:39
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1159
int errdetail(const char *fmt,...)
Definition: elog.c:1205
int errcode(int sqlerrcode)
Definition: elog.c:859
int errmsg(const char *fmt,...)
Definition: elog.c:1072
#define WARNING
Definition: elog.h:36
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
void FileClose(File file)
Definition: fd.c:1978
char * FilePathName(File file)
Definition: fd.c:2461
uint32 hash_bytes(const unsigned char *k, int keylen)
Definition: hashfn.c:146
static int pg_cmp_u32(uint32 a, uint32 b)
Definition: int.h:489
int b
Definition: isn.c:70
int a
Definition: isn.c:69
int i
Definition: isn.c:73
static void const char * fmt
va_end(args)
va_start(args, fmt)
List * lappend(List *list, void *datum)
Definition: list.c:339
List * list_concat(List *list1, const List *list2)
Definition: list.c:561
void pfree(void *pointer)
Definition: mcxt.c:1520
void * palloc0(Size size)
Definition: mcxt.c:1346
char * MemoryContextStrdup(MemoryContext context, const char *string)
Definition: mcxt.c:1682
void * palloc(Size size)
Definition: mcxt.c:1316
void json_parse_manifest_incremental_chunk(JsonManifestParseIncrementalState *incstate, char *chunk, int size, bool is_last)
JsonManifestParseIncrementalState * json_parse_manifest_incremental_init(JsonManifestParseContext *context)
void json_parse_manifest_incremental_shutdown(JsonManifestParseIncrementalState *incstate)
const void size_t len
const void * data
#define lfirst(lc)
Definition: pg_list.h:172
static int list_length(const List *l)
Definition: pg_list.h:152
#define NIL
Definition: pg_list.h:68
static void * list_nth(const List *list, int n)
Definition: pg_list.h:299
#define qsort(a, b, c, d)
Definition: port.h:449
unsigned int Oid
Definition: postgres_ext.h:31
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
char * psprintf(const char *fmt,...)
Definition: psprintf.c:46
tree context
Definition: radixtree.h:1833
MemoryContextSwitchTo(old_ctx)
static struct cvec * range(struct vars *v, chr a, chr b, int cases)
Definition: regc_locale.c:412
char * GetRelationPath(Oid dbOid, Oid spcOid, RelFileNumber relNumber, int procNumber, ForkNumber forkNumber)
Definition: relpath.c:141
Oid RelFileNumber
Definition: relpath.h:25
ForkNumber
Definition: relpath.h:48
@ FSM_FORKNUM
Definition: relpath.h:51
@ MAIN_FORKNUM
Definition: relpath.h:50
#define RelFileNumberIsValid(relnumber)
Definition: relpath.h:27
static pg_noinline void Size size
Definition: slab.c:607
int appendStringInfoVA(StringInfo str, const char *fmt, va_list args)
Definition: stringinfo.c:139
void enlargeStringInfo(StringInfo str, int needed)
Definition: stringinfo.c:289
void appendBinaryStringInfo(StringInfo str, const void *data, int datalen)
Definition: stringinfo.c:233
void initStringInfo(StringInfo str)
Definition: stringinfo.c:59
TimeLineID istarttli
Definition: xlogbackup.h:32
TimeLineID starttli
Definition: xlogbackup.h:27
XLogRecPtr startpoint
Definition: xlogbackup.h:26
XLogRecPtr istartpoint
Definition: xlogbackup.h:31
backup_file_hash * manifest_files
JsonManifestParseIncrementalState * inc_state
Definition: pg_list.h:54
RelFileNumber relNumber
XLogRecPtr begin
Definition: timeline.h:28
TimeLineID tli
Definition: timeline.h:27
XLogRecPtr end
Definition: timeline.h:29
off_t filepos
Definition: walsummary.h:24
size_t size
uint32 status
const char * path
XLogRecPtr WaitForWalSummarization(XLogRecPtr lsn, long timeout, XLogRecPtr *pending_lsn)
File OpenWalSummaryFile(WalSummaryFile *ws, bool missing_ok)
Definition: walsummary.c:205
List * GetWalSummaries(TimeLineID tli, XLogRecPtr start_lsn, XLogRecPtr end_lsn)
Definition: walsummary.c:43
bool WalSummariesAreComplete(List *wslist, XLogRecPtr start_lsn, XLogRecPtr end_lsn, XLogRecPtr *missing_lsn)
Definition: walsummary.c:138
int ReadWalSummary(void *wal_summary_io, void *data, int length)
Definition: walsummary.c:273
List * FilterWalSummaries(List *wslist, TimeLineID tli, XLogRecPtr start_lsn, XLogRecPtr end_lsn)
Definition: walsummary.c:100
void ReportWalSummaryError(void *callback_arg, char *fmt,...)
Definition: walsummary.c:322
uint64 GetSystemIdentifier(void)
Definition: xlog.c:4535
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:43
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
uint32 TimeLineID
Definition: xlogdefs.h:59
static BackupState * backup_state
Definition: xlogfuncs.c:40
static List * expectedTLEs
Definition: xlogrecovery.c:123