/*-------------------------------------------------------------------------
 *
 * xlogutils.c
 *
 * PostgreSQL write-ahead log manager utility routines
 *
 * This file contains support routines that are used by XLOG replay functions.
 * None of this code is used during normal system operation.
 *
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/backend/access/transam/xlogutils.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <unistd.h>

#include "access/timeline.h"
#include "access/xlog.h"
#include "access/xlog_internal.h"
#include "access/xlogutils.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/fd.h"
#include "storage/smgr.h"
#include "utils/guc.h"
#include "utils/hsearch.h"
#include "utils/rel.h"
34 
35 /* GUC variable */
36 bool ignore_invalid_pages = false;
37 
38 /*
39  * Are we doing recovery from XLOG?
40  *
41  * This is only ever true in the startup process; it should be read as meaning
42  * "this process is replaying WAL records", rather than "the system is in
43  * recovery mode". It should be examined primarily by functions that need
44  * to act differently when called from a WAL redo function (e.g., to skip WAL
45  * logging). To check whether the system is in recovery regardless of which
46  * process you're running in, use RecoveryInProgress() but only after shared
47  * memory startup and lock initialization.
48  *
49  * This is updated from xlog.c, but lives here because it's mostly read by
50  * WAL redo functions.
51  */
52 bool InRecovery = false;
53 
54 /* Are we in Hot Standby mode? Only valid in startup process, see xlogutils.h */
56 
57 /*
58  * During XLOG replay, we may see XLOG records for incremental updates of
59  * pages that no longer exist, because their relation was later dropped or
60  * truncated. (Note: this is only possible when full_page_writes = OFF,
61  * since when it's ON, the first reference we see to a page should always
62  * be a full-page rewrite not an incremental update.) Rather than simply
63  * ignoring such records, we make a note of the referenced page, and then
64  * complain if we don't actually see a drop or truncate covering the page
65  * later in replay.
66  */
67 typedef struct xl_invalid_page_key
68 {
69  RelFileNode node; /* the relation */
70  ForkNumber forkno; /* the fork number */
71  BlockNumber blkno; /* the page */
73 
74 typedef struct xl_invalid_page
75 {
76  xl_invalid_page_key key; /* hash key ... must be first */
77  bool present; /* page existed but contained zeroes */
79 
80 static HTAB *invalid_page_tab = NULL;
81 
82 
83 /* Report a reference to an invalid page */
84 static void
86  BlockNumber blkno, bool present)
87 {
88  char *path = relpathperm(node, forkno);
89 
90  if (present)
91  elog(elevel, "page %u of relation %s is uninitialized",
92  blkno, path);
93  else
94  elog(elevel, "page %u of relation %s does not exist",
95  blkno, path);
96  pfree(path);
97 }
98 
99 /* Log a reference to an invalid page */
100 static void
102  bool present)
103 {
105  xl_invalid_page *hentry;
106  bool found;
107 
108  /*
109  * Once recovery has reached a consistent state, the invalid-page table
110  * should be empty and remain so. If a reference to an invalid page is
111  * found after consistency is reached, PANIC immediately. This might seem
112  * aggressive, but it's better than letting the invalid reference linger
113  * in the hash table until the end of recovery and PANIC there, which
114  * might come only much later if this is a standby server.
115  */
116  if (reachedConsistency)
117  {
118  report_invalid_page(WARNING, node, forkno, blkno, present);
120  "WAL contains references to invalid pages");
121  }
122 
123  /*
124  * Log references to invalid pages at DEBUG1 level. This allows some
125  * tracing of the cause (note the elog context mechanism will tell us
126  * something about the XLOG record that generated the reference).
127  */
129  report_invalid_page(DEBUG1, node, forkno, blkno, present);
130 
131  if (invalid_page_tab == NULL)
132  {
133  /* create hash table when first needed */
134  HASHCTL ctl;
135 
136  ctl.keysize = sizeof(xl_invalid_page_key);
137  ctl.entrysize = sizeof(xl_invalid_page);
138 
139  invalid_page_tab = hash_create("XLOG invalid-page table",
140  100,
141  &ctl,
143  }
144 
145  /* we currently assume xl_invalid_page_key contains no padding */
146  key.node = node;
147  key.forkno = forkno;
148  key.blkno = blkno;
149  hentry = (xl_invalid_page *)
150  hash_search(invalid_page_tab, (void *) &key, HASH_ENTER, &found);
151 
152  if (!found)
153  {
154  /* hash_search already filled in the key */
155  hentry->present = present;
156  }
157  else
158  {
159  /* repeat reference ... leave "present" as it was */
160  }
161 }
162 
163 /* Forget any invalid pages >= minblkno, because they've been dropped */
164 static void
166 {
168  xl_invalid_page *hentry;
169 
170  if (invalid_page_tab == NULL)
171  return; /* nothing to do */
172 
173  hash_seq_init(&status, invalid_page_tab);
174 
175  while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
176  {
177  if (RelFileNodeEquals(hentry->key.node, node) &&
178  hentry->key.forkno == forkno &&
179  hentry->key.blkno >= minblkno)
180  {
182  {
183  char *path = relpathperm(hentry->key.node, forkno);
184 
185  elog(DEBUG2, "page %u of relation %s has been dropped",
186  hentry->key.blkno, path);
187  pfree(path);
188  }
189 
190  if (hash_search(invalid_page_tab,
191  (void *) &hentry->key,
192  HASH_REMOVE, NULL) == NULL)
193  elog(ERROR, "hash table corrupted");
194  }
195  }
196 }
197 
198 /* Forget any invalid pages in a whole database */
199 static void
201 {
203  xl_invalid_page *hentry;
204 
205  if (invalid_page_tab == NULL)
206  return; /* nothing to do */
207 
208  hash_seq_init(&status, invalid_page_tab);
209 
210  while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
211  {
212  if (hentry->key.node.dbNode == dbid)
213  {
215  {
216  char *path = relpathperm(hentry->key.node, hentry->key.forkno);
217 
218  elog(DEBUG2, "page %u of relation %s has been dropped",
219  hentry->key.blkno, path);
220  pfree(path);
221  }
222 
223  if (hash_search(invalid_page_tab,
224  (void *) &hentry->key,
225  HASH_REMOVE, NULL) == NULL)
226  elog(ERROR, "hash table corrupted");
227  }
228  }
229 }
230 
231 /* Are there any unresolved references to invalid pages? */
232 bool
234 {
235  if (invalid_page_tab != NULL &&
236  hash_get_num_entries(invalid_page_tab) > 0)
237  return true;
238  return false;
239 }
240 
241 /* Complain about any remaining invalid-page entries */
242 void
244 {
246  xl_invalid_page *hentry;
247  bool foundone = false;
248 
249  if (invalid_page_tab == NULL)
250  return; /* nothing to do */
251 
252  hash_seq_init(&status, invalid_page_tab);
253 
254  /*
255  * Our strategy is to emit WARNING messages for all remaining entries and
256  * only PANIC after we've dumped all the available info.
257  */
258  while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
259  {
260  report_invalid_page(WARNING, hentry->key.node, hentry->key.forkno,
261  hentry->key.blkno, hentry->present);
262  foundone = true;
263  }
264 
265  if (foundone)
267  "WAL contains references to invalid pages");
268 
269  hash_destroy(invalid_page_tab);
270  invalid_page_tab = NULL;
271 }
272 
273 
274 /*
275  * XLogReadBufferForRedo
276  * Read a page during XLOG replay
277  *
278  * Reads a block referenced by a WAL record into shared buffer cache, and
279  * determines what needs to be done to redo the changes to it. If the WAL
280  * record includes a full-page image of the page, it is restored.
281  *
282  * 'record.EndRecPtr' is compared to the page's LSN to determine if the record
283  * has already been replayed. 'block_id' is the ID number the block was
284  * registered with, when the WAL record was created.
285  *
286  * Returns one of the following:
287  *
288  * BLK_NEEDS_REDO - changes from the WAL record need to be applied
289  * BLK_DONE - block doesn't need replaying
290  * BLK_RESTORED - block was restored from a full-page image included in
291  * the record
292  * BLK_NOTFOUND - block was not found (because it was truncated away by
293  * an operation later in the WAL stream)
294  *
295  * On return, the buffer is locked in exclusive-mode, and returned in *buf.
296  * Note that the buffer is locked and returned even if it doesn't need
297  * replaying. (Getting the buffer lock is not really necessary during
298  * single-process crash recovery, but some subroutines such as MarkBufferDirty
299  * will complain if we don't have the lock. In hot standby mode it's
300  * definitely necessary.)
301  *
302  * Note: when a backup block is available in XLOG with the BKPIMAGE_APPLY flag
303  * set, we restore it, even if the page in the database appears newer. This
304  * is to protect ourselves against database pages that were partially or
305  * incorrectly written during a crash. We assume that the XLOG data must be
306  * good because it has passed a CRC check, while the database page might not
307  * be. This will force us to replay all subsequent modifications of the page
308  * that appear in XLOG, rather than possibly ignoring them as already
309  * applied, but that's not a huge drawback.
310  */
313  Buffer *buf)
314 {
315  return XLogReadBufferForRedoExtended(record, block_id, RBM_NORMAL,
316  false, buf);
317 }
318 
319 /*
320  * Pin and lock a buffer referenced by a WAL record, for the purpose of
321  * re-initializing it.
322  */
323 Buffer
325 {
326  Buffer buf;
327 
328  XLogReadBufferForRedoExtended(record, block_id, RBM_ZERO_AND_LOCK, false,
329  &buf);
330  return buf;
331 }
332 
333 /*
334  * XLogReadBufferForRedoExtended
335  * Like XLogReadBufferForRedo, but with extra options.
336  *
337  * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
338  * with all-zeroes pages up to the referenced block number. In
339  * RBM_ZERO_AND_LOCK and RBM_ZERO_AND_CLEANUP_LOCK modes, the return value
340  * is always BLK_NEEDS_REDO.
341  *
342  * (The RBM_ZERO_AND_CLEANUP_LOCK mode is redundant with the get_cleanup_lock
343  * parameter. Do not use an inconsistent combination!)
344  *
345  * If 'get_cleanup_lock' is true, a "cleanup lock" is acquired on the buffer
346  * using LockBufferForCleanup(), instead of a regular exclusive lock.
347  */
350  uint8 block_id,
351  ReadBufferMode mode, bool get_cleanup_lock,
352  Buffer *buf)
353 {
354  XLogRecPtr lsn = record->EndRecPtr;
355  RelFileNode rnode;
356  ForkNumber forknum;
358  Page page;
359  bool zeromode;
360  bool willinit;
361 
362  if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
363  {
364  /* Caller specified a bogus block_id */
365  elog(PANIC, "failed to locate backup block with ID %d", block_id);
366  }
367 
368  /*
369  * Make sure that if the block is marked with WILL_INIT, the caller is
370  * going to initialize it. And vice versa.
371  */
372  zeromode = (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
373  willinit = (record->blocks[block_id].flags & BKPBLOCK_WILL_INIT) != 0;
374  if (willinit && !zeromode)
375  elog(PANIC, "block with WILL_INIT flag in WAL record must be zeroed by redo routine");
376  if (!willinit && zeromode)
377  elog(PANIC, "block to be initialized in redo routine must be marked with WILL_INIT flag in the WAL record");
378 
379  /* If it has a full-page image and it should be restored, do it. */
380  if (XLogRecBlockImageApply(record, block_id))
381  {
382  Assert(XLogRecHasBlockImage(record, block_id));
383  *buf = XLogReadBufferExtended(rnode, forknum, blkno,
384  get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK);
385  page = BufferGetPage(*buf);
386  if (!RestoreBlockImage(record, block_id, page))
387  elog(ERROR, "failed to restore block image");
388 
389  /*
390  * The page may be uninitialized. If so, we can't set the LSN because
391  * that would corrupt the page.
392  */
393  if (!PageIsNew(page))
394  {
395  PageSetLSN(page, lsn);
396  }
397 
398  MarkBufferDirty(*buf);
399 
400  /*
401  * At the end of crash recovery the init forks of unlogged relations
402  * are copied, without going through shared buffers. So we need to
403  * force the on-disk state of init forks to always be in sync with the
404  * state in shared buffers.
405  */
406  if (forknum == INIT_FORKNUM)
407  FlushOneBuffer(*buf);
408 
409  return BLK_RESTORED;
410  }
411  else
412  {
413  *buf = XLogReadBufferExtended(rnode, forknum, blkno, mode);
414  if (BufferIsValid(*buf))
415  {
416  if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK)
417  {
418  if (get_cleanup_lock)
419  LockBufferForCleanup(*buf);
420  else
422  }
423  if (lsn <= PageGetLSN(BufferGetPage(*buf)))
424  return BLK_DONE;
425  else
426  return BLK_NEEDS_REDO;
427  }
428  else
429  return BLK_NOTFOUND;
430  }
431 }
432 
433 /*
434  * XLogReadBufferExtended
435  * Read a page during XLOG replay
436  *
437  * This is functionally comparable to ReadBufferExtended. There's some
438  * differences in the behavior wrt. the "mode" argument:
439  *
440  * In RBM_NORMAL mode, if the page doesn't exist, or contains all-zeroes, we
441  * return InvalidBuffer. In this case the caller should silently skip the
442  * update on this page. (In this situation, we expect that the page was later
443  * dropped or truncated. If we don't see evidence of that later in the WAL
444  * sequence, we'll complain at the end of WAL replay.)
445  *
446  * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
447  * with all-zeroes pages up to the given block number.
448  *
449  * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
450  * exist, and we don't check for all-zeroes. Thus, no log entry is made
451  * to imply that the page should be dropped or truncated later.
452  *
453  * NB: A redo function should normally not call this directly. To get a page
454  * to modify, use XLogReadBufferForRedoExtended instead. It is important that
455  * all pages modified by a WAL record are registered in the WAL records, or
456  * they will be invisible to tools that need to know which pages are modified.
457  */
458 Buffer
461 {
462  BlockNumber lastblock;
463  Buffer buffer;
464  SMgrRelation smgr;
465 
466  Assert(blkno != P_NEW);
467 
468  /* Open the relation at smgr level */
469  smgr = smgropen(rnode, InvalidBackendId);
470 
471  /*
472  * Create the target file if it doesn't already exist. This lets us cope
473  * if the replay sequence contains writes to a relation that is later
474  * deleted. (The original coding of this routine would instead suppress
475  * the writes, but that seems like it risks losing valuable data if the
476  * filesystem loses an inode during a crash. Better to write the data
477  * until we are actually told to delete the file.)
478  */
479  smgrcreate(smgr, forknum, true);
480 
481  lastblock = smgrnblocks(smgr, forknum);
482 
483  if (blkno < lastblock)
484  {
485  /* page exists in file */
486  buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
487  mode, NULL);
488  }
489  else
490  {
491  /* hm, page doesn't exist in file */
492  if (mode == RBM_NORMAL)
493  {
494  log_invalid_page(rnode, forknum, blkno, false);
495  return InvalidBuffer;
496  }
497  if (mode == RBM_NORMAL_NO_LOG)
498  return InvalidBuffer;
499  /* OK to extend the file */
500  /* we do this in recovery only - no rel-extension lock needed */
502  buffer = InvalidBuffer;
503  do
504  {
505  if (buffer != InvalidBuffer)
506  {
507  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
509  ReleaseBuffer(buffer);
510  }
511  buffer = ReadBufferWithoutRelcache(rnode, forknum,
512  P_NEW, mode, NULL);
513  }
514  while (BufferGetBlockNumber(buffer) < blkno);
515  /* Handle the corner case that P_NEW returns non-consecutive pages */
516  if (BufferGetBlockNumber(buffer) != blkno)
517  {
518  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
520  ReleaseBuffer(buffer);
521  buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
522  mode, NULL);
523  }
524  }
525 
526  if (mode == RBM_NORMAL)
527  {
528  /* check that page has been initialized */
529  Page page = (Page) BufferGetPage(buffer);
530 
531  /*
532  * We assume that PageIsNew is safe without a lock. During recovery,
533  * there should be no other backends that could modify the buffer at
534  * the same time.
535  */
536  if (PageIsNew(page))
537  {
538  ReleaseBuffer(buffer);
539  log_invalid_page(rnode, forknum, blkno, true);
540  return InvalidBuffer;
541  }
542  }
543 
544  return buffer;
545 }
546 
547 /*
548  * Struct actually returned by CreateFakeRelcacheEntry, though the declared
549  * return type is Relation.
550  */
551 typedef struct
552 {
553  RelationData reldata; /* Note: this must be first */
556 
558 
559 /*
560  * Create a fake relation cache entry for a physical relation
561  *
562  * It's often convenient to use the same functions in XLOG replay as in the
563  * main codepath, but those functions typically work with a relcache entry.
564  * We don't have a working relation cache during XLOG replay, but this
565  * function can be used to create a fake relcache entry instead. Only the
566  * fields related to physical storage, like rd_rel, are initialized, so the
567  * fake entry is only usable in low-level operations like ReadBuffer().
568  *
569  * This is also used for syncing WAL-skipped files.
570  *
571  * Caller must free the returned entry with FreeFakeRelcacheEntry().
572  */
573 Relation
575 {
576  FakeRelCacheEntry fakeentry;
577  Relation rel;
578 
579  /* Allocate the Relation struct and all related space in one block. */
580  fakeentry = palloc0(sizeof(FakeRelCacheEntryData));
581  rel = (Relation) fakeentry;
582 
583  rel->rd_rel = &fakeentry->pgc;
584  rel->rd_node = rnode;
585 
586  /*
587  * We will never be working with temp rels during recovery or while
588  * syncing WAL-skipped files.
589  */
591 
592  /* It must be a permanent table here */
593  rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT;
594 
595  /* We don't know the name of the relation; use relfilenode instead */
596  sprintf(RelationGetRelationName(rel), "%u", rnode.relNode);
597 
598  /*
599  * We set up the lockRelId in case anything tries to lock the dummy
600  * relation. Note that this is fairly bogus since relNode may be
601  * different from the relation's OID. It shouldn't really matter though.
602  * In recovery, we are running by ourselves and can't have any lock
603  * conflicts. While syncing, we already hold AccessExclusiveLock.
604  */
605  rel->rd_lockInfo.lockRelId.dbId = rnode.dbNode;
606  rel->rd_lockInfo.lockRelId.relId = rnode.relNode;
607 
608  rel->rd_smgr = NULL;
609 
610  return rel;
611 }
612 
613 /*
614  * Free a fake relation cache entry.
615  */
616 void
618 {
619  /* make sure the fakerel is not referenced by the SmgrRelation anymore */
620  if (fakerel->rd_smgr != NULL)
621  smgrclearowner(&fakerel->rd_smgr, fakerel->rd_smgr);
622  pfree(fakerel);
623 }
624 
625 /*
626  * Drop a relation during XLOG replay
627  *
628  * This is called when the relation is about to be deleted; we need to remove
629  * any open "invalid-page" records for the relation.
630  */
631 void
633 {
634  forget_invalid_pages(rnode, forknum, 0);
635 }
636 
637 /*
638  * Drop a whole database during XLOG replay
639  *
640  * As above, but for DROP DATABASE instead of dropping a single rel
641  */
642 void
644 {
645  /*
646  * This is unnecessarily heavy-handed, as it will close SMgrRelation
647  * objects for other databases as well. DROP DATABASE occurs seldom enough
648  * that it's not worth introducing a variant of smgrclose for just this
649  * purpose. XXX: Or should we rather leave the smgr entries dangling?
650  */
651  smgrcloseall();
652 
654 }
655 
656 /*
657  * Truncate a relation during XLOG replay
658  *
659  * We need to clean up any open "invalid-page" records for the dropped pages.
660  */
661 void
663  BlockNumber nblocks)
664 {
665  forget_invalid_pages(rnode, forkNum, nblocks);
666 }
667 
668 /*
669  * Determine which timeline to read an xlog page from and set the
670  * XLogReaderState's currTLI to that timeline ID.
671  *
672  * We care about timelines in xlogreader when we might be reading xlog
673  * generated prior to a promotion, either if we're currently a standby in
674  * recovery or if we're a promoted primary reading xlogs generated by the old
675  * primary before our promotion.
676  *
677  * wantPage must be set to the start address of the page to read and
678  * wantLength to the amount of the page that will be read, up to
679  * XLOG_BLCKSZ. If the amount to be read isn't known, pass XLOG_BLCKSZ.
680  *
681  * We switch to an xlog segment from the new timeline eagerly when on a
682  * historical timeline, as soon as we reach the start of the xlog segment
683  * containing the timeline switch. The server copied the segment to the new
684  * timeline so all the data up to the switch point is the same, but there's no
685  * guarantee the old segment will still exist. It may have been deleted or
686  * renamed with a .partial suffix so we can't necessarily keep reading from
687  * the old TLI even though tliSwitchPoint says it's OK.
688  *
689  * We can't just check the timeline when we read a page on a different segment
690  * to the last page. We could've received a timeline switch from a cascading
691  * upstream, so the current segment ends abruptly (possibly getting renamed to
692  * .partial) and we have to switch to a new one. Even in the middle of reading
693  * a page we could have to dump the cached page and switch to a new TLI.
694  *
695  * Because of this, callers MAY NOT assume that currTLI is the timeline that
696  * will be in a page's xlp_tli; the page may begin on an older timeline or we
697  * might be reading from historical timeline data on a segment that's been
698  * copied to a new timeline.
699  *
700  * The caller must also make sure it doesn't read past the current replay
701  * position (using GetXLogReplayRecPtr) if executing in recovery, so it
702  * doesn't fail to notice that the current timeline became historical. The
703  * caller must also update ThisTimeLineID with the result of
704  * GetXLogReplayRecPtr and must check RecoveryInProgress().
705  */
706 void
708 {
709  const XLogRecPtr lastReadPage = (state->seg.ws_segno *
710  state->segcxt.ws_segsize + state->segoff);
711 
712  Assert(wantPage != InvalidXLogRecPtr && wantPage % XLOG_BLCKSZ == 0);
713  Assert(wantLength <= XLOG_BLCKSZ);
714  Assert(state->readLen == 0 || state->readLen <= XLOG_BLCKSZ);
715 
716  /*
717  * If the desired page is currently read in and valid, we have nothing to
718  * do.
719  *
720  * The caller should've ensured that it didn't previously advance readOff
721  * past the valid limit of this timeline, so it doesn't matter if the
722  * current TLI has since become historical.
723  */
724  if (lastReadPage == wantPage &&
725  state->readLen != 0 &&
726  lastReadPage + state->readLen >= wantPage + Min(wantLength, XLOG_BLCKSZ - 1))
727  return;
728 
729  /*
730  * If we're reading from the current timeline, it hasn't become historical
731  * and the page we're reading is after the last page read, we can again
732  * just carry on. (Seeking backwards requires a check to make sure the
733  * older page isn't on a prior timeline).
734  *
735  * ThisTimeLineID might've become historical since we last looked, but the
736  * caller is required not to read past the flush limit it saw at the time
737  * it looked up the timeline. There's nothing we can do about it if
738  * StartupXLOG() renames it to .partial concurrently.
739  */
740  if (state->currTLI == ThisTimeLineID && wantPage >= lastReadPage)
741  {
743  return;
744  }
745 
746  /*
747  * If we're just reading pages from a previously validated historical
748  * timeline and the timeline we're reading from is valid until the end of
749  * the current segment we can just keep reading.
750  */
751  if (state->currTLIValidUntil != InvalidXLogRecPtr &&
752  state->currTLI != ThisTimeLineID &&
753  state->currTLI != 0 &&
754  ((wantPage + wantLength) / state->segcxt.ws_segsize) <
755  (state->currTLIValidUntil / state->segcxt.ws_segsize))
756  return;
757 
758  /*
759  * If we reach this point we're either looking up a page for random
760  * access, the current timeline just became historical, or we're reading
761  * from a new segment containing a timeline switch. In all cases we need
762  * to determine the newest timeline on the segment.
763  *
764  * If it's the current timeline we can just keep reading from here unless
765  * we detect a timeline switch that makes the current timeline historical.
766  * If it's a historical timeline we can read all the segment on the newest
767  * timeline because it contains all the old timelines' data too. So only
768  * one switch check is required.
769  */
770  {
771  /*
772  * We need to re-read the timeline history in case it's been changed
773  * by a promotion or replay from a cascaded replica.
774  */
775  List *timelineHistory = readTimeLineHistory(ThisTimeLineID);
776  XLogRecPtr endOfSegment;
777 
778  endOfSegment = ((wantPage / state->segcxt.ws_segsize) + 1) *
779  state->segcxt.ws_segsize - 1;
780  Assert(wantPage / state->segcxt.ws_segsize ==
781  endOfSegment / state->segcxt.ws_segsize);
782 
783  /*
784  * Find the timeline of the last LSN on the segment containing
785  * wantPage.
786  */
787  state->currTLI = tliOfPointInHistory(endOfSegment, timelineHistory);
788  state->currTLIValidUntil = tliSwitchPoint(state->currTLI, timelineHistory,
789  &state->nextTLI);
790 
792  wantPage + wantLength < state->currTLIValidUntil);
793 
794  list_free_deep(timelineHistory);
795 
796  elog(DEBUG3, "switched to timeline %u valid until %X/%X",
797  state->currTLI,
799  }
800 }
801 
802 /* XLogReaderRoutine->segment_open callback for local pg_wal files */
803 void
805  TimeLineID *tli_p)
806 {
807  TimeLineID tli = *tli_p;
808  char path[MAXPGPATH];
809 
810  XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize);
811  state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
812  if (state->seg.ws_file >= 0)
813  return;
814 
815  if (errno == ENOENT)
816  ereport(ERROR,
818  errmsg("requested WAL segment %s has already been removed",
819  path)));
820  else
821  ereport(ERROR,
823  errmsg("could not open file \"%s\": %m",
824  path)));
825 }
826 
827 /* stock XLogReaderRoutine->segment_close callback */
828 void
830 {
831  close(state->seg.ws_file);
832  /* need to check errno? */
833  state->seg.ws_file = -1;
834 }
835 
836 /*
837  * XLogReaderRoutine->page_read callback for reading local xlog files
838  *
839  * Public because it would likely be very helpful for someone writing another
840  * output method outside walsender, e.g. in a bgworker.
841  *
842  * TODO: The walsender has its own version of this, but it relies on the
843  * walsender's latch being set whenever WAL is flushed. No such infrastructure
844  * exists for normal backends, so we have to do a check/sleep/repeat style of
845  * loop for now.
846  */
847 int
849  int reqLen, XLogRecPtr targetRecPtr, char *cur_page)
850 {
851  XLogRecPtr read_upto,
852  loc;
853  TimeLineID tli;
854  int count;
855  WALReadError errinfo;
856 
857  loc = targetPagePtr + reqLen;
858 
859  /* Loop waiting for xlog to be available if necessary */
860  while (1)
861  {
862  /*
863  * Determine the limit of xlog we can currently read to, and what the
864  * most recent timeline is.
865  *
866  * RecoveryInProgress() will update ThisTimeLineID when it first
867  * notices recovery finishes, so we only have to maintain it for the
868  * local process until recovery ends.
869  */
870  if (!RecoveryInProgress())
871  read_upto = GetFlushRecPtr();
872  else
873  read_upto = GetXLogReplayRecPtr(&ThisTimeLineID);
874  tli = ThisTimeLineID;
875 
876  /*
877  * Check which timeline to get the record from.
878  *
879  * We have to do it each time through the loop because if we're in
880  * recovery as a cascading standby, the current timeline might've
881  * become historical. We can't rely on RecoveryInProgress() because in
882  * a standby configuration like
883  *
884  * A => B => C
885  *
886  * if we're a logical decoding session on C, and B gets promoted, our
887  * timeline will change while we remain in recovery.
888  *
889  * We can't just keep reading from the old timeline as the last WAL
890  * archive in the timeline will get renamed to .partial by
891  * StartupXLOG().
892  *
893  * If that happens after our caller updated ThisTimeLineID but before
894  * we actually read the xlog page, we might still try to read from the
895  * old (now renamed) segment and fail. There's not much we can do
896  * about this, but it can only happen when we're a leaf of a cascading
897  * standby whose primary gets promoted while we're decoding, so a
898  * one-off ERROR isn't too bad.
899  */
900  XLogReadDetermineTimeline(state, targetPagePtr, reqLen);
901 
902  if (state->currTLI == ThisTimeLineID)
903  {
904 
905  if (loc <= read_upto)
906  break;
907 
909  pg_usleep(1000L);
910  }
911  else
912  {
913  /*
914  * We're on a historical timeline, so limit reading to the switch
915  * point where we moved to the next timeline.
916  *
917  * We don't need to GetFlushRecPtr or GetXLogReplayRecPtr. We know
918  * about the new timeline, so we must've received past the end of
919  * it.
920  */
921  read_upto = state->currTLIValidUntil;
922 
923  /*
924  * Setting tli to our wanted record's TLI is slightly wrong; the
925  * page might begin on an older timeline if it contains a timeline
926  * switch, since its xlog segment will have been copied from the
927  * prior timeline. This is pretty harmless though, as nothing
928  * cares so long as the timeline doesn't go backwards. We should
929  * read the page header instead; FIXME someday.
930  */
931  tli = state->currTLI;
932 
933  /* No need to wait on a historical timeline */
934  break;
935  }
936  }
937 
938  if (targetPagePtr + XLOG_BLCKSZ <= read_upto)
939  {
940  /*
941  * more than one block available; read only that block, have caller
942  * come back if they need more.
943  */
944  count = XLOG_BLCKSZ;
945  }
946  else if (targetPagePtr + reqLen > read_upto)
947  {
948  /* not enough data there */
949  return -1;
950  }
951  else
952  {
953  /* enough bytes available to satisfy the request */
954  count = read_upto - targetPagePtr;
955  }
956 
957  /*
958  * Even though we just determined how much of the page can be validly read
959  * as 'count', read the whole page anyway. It's guaranteed to be
960  * zero-padded up to the page boundary if it's incomplete.
961  */
962  if (!WALRead(state, cur_page, targetPagePtr, XLOG_BLCKSZ, tli,
963  &errinfo))
964  WALReadRaiseError(&errinfo);
965 
966  /* number of valid bytes in the buffer */
967  return count;
968 }
969 
970 /*
971  * Backend-specific convenience code to handle read errors encountered by
972  * WALRead().
973  */
974 void
976 {
977  WALOpenSegment *seg = &errinfo->wre_seg;
978  char fname[MAXFNAMELEN];
979 
980  XLogFileName(fname, seg->ws_tli, seg->ws_segno, wal_segment_size);
981 
982  if (errinfo->wre_read < 0)
983  {
984  errno = errinfo->wre_errno;
985  ereport(ERROR,
987  errmsg("could not read from log segment %s, offset %u: %m",
988  fname, errinfo->wre_off)));
989  }
990  else if (errinfo->wre_read == 0)
991  {
992  ereport(ERROR,
994  errmsg("could not read from log segment %s, offset %u: read %d of %d",
995  fname, errinfo->wre_off, errinfo->wre_read,
996  errinfo->wre_req)));
997  }
998 }
/*
 * NOTE(review): the original paste ended with an auto-generated symbol
 * cross-reference index from the documentation site (definitions of
 * wre_seg, XLogHaveInvalidPages, hash_destroy, etc. with "Definition:"
 * file/line pointers). That residue was not part of xlogutils.c and is
 * not valid C; it has been neutralized into this comment.
 */
Definition: bufmgr.c:780
void wal_segment_open(XLogReaderState *state, XLogSegNo nextSegNo, TimeLineID *tli_p)
Definition: xlogutils.c:804
int errcode(int sqlerrcode)
Definition: elog.c:698
struct xl_invalid_page_key xl_invalid_page_key
long hash_get_num_entries(HTAB *hashp)
Definition: dynahash.c:1382
XLogRecPtr GetFlushRecPtr(void)
Definition: xlog.c:8680
uint32 BlockNumber
Definition: block.h:31
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3768
ForkNumber forkno
Definition: xlogutils.c:70
#define P_NEW
Definition: bufmgr.h:91
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:954
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:98
Form_pg_class rd_rel
Definition: rel.h:109
unsigned int Oid
Definition: postgres_ext.h:31
bool RecoveryInProgress(void)
Definition: xlog.c:8328
#define PANIC
Definition: elog.h:50
void list_free_deep(List *list)
Definition: list.c:1405
#define PG_BINARY
Definition: c.h:1271
void XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage, uint32 wantLength)
Definition: xlogutils.c:707
XLogRecPtr EndRecPtr
Definition: xlogreader.h:176
void smgrcloseall(void)
Definition: smgr.c:286
#define sprintf
Definition: port.h:219
WALOpenSegment seg
Definition: xlogreader.h:225
RelFileNode node
Definition: xlogutils.c:69
void pg_usleep(long microsec)
Definition: signal.c:53
Definition: dynahash.c:219
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:43
struct RelationData * Relation
Definition: relcache.h:26
void pfree(void *pointer)
Definition: mcxt.c:1169
static void report_invalid_page(int elevel, RelFileNode node, ForkNumber forkno, BlockNumber blkno, bool present)
Definition: xlogutils.c:85
#define ERROR
Definition: elog.h:46
Buffer XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id)
Definition: xlogutils.c:324
static void log_invalid_page(RelFileNode node, ForkNumber forkno, BlockNumber blkno, bool present)
Definition: xlogutils.c:101
#define MAXPGPATH
XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI)
Definition: xlog.c:11927
#define DEBUG2
Definition: elog.h:24
bool message_level_is_interesting(int elevel)
Definition: elog.c:270
Relation CreateFakeRelcacheEntry(RelFileNode rnode)
Definition: xlogutils.c:574
static void forget_invalid_pages(RelFileNode node, ForkNumber forkno, BlockNumber minblkno)
Definition: xlogutils.c:165
LockInfoData rd_lockInfo
Definition: rel.h:112
HotStandbyState standbyState
Definition: xlogutils.c:55
static char * buf
Definition: pg_test_fsync.c:68
uint64 XLogSegNo
Definition: xlogdefs.h:48
BlockNumber blkno
Definition: xlogutils.c:71
xl_invalid_page_key key
Definition: xlogutils.c:76
XLogSegNo ws_segno
Definition: xlogreader.h:47
int errcode_for_file_access(void)
Definition: elog.c:721
XLogRecPtr currTLIValidUntil
Definition: xlogreader.h:248
#define RelationGetRelationName(relation)
Definition: rel.h:511
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:349
unsigned int uint32
Definition: c.h:441
#define BufferGetPage(buffer)
Definition: bufmgr.h:169
#define BKPBLOCK_WILL_INIT
Definition: xlogrecord.h:186
bool ignore_invalid_pages
Definition: xlogutils.c:36
SMgrRelation smgropen(RelFileNode rnode, BackendId backend)
Definition: smgr.c:146
ForkNumber
Definition: relpath.h:40
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:47
#define WARNING
Definition: elog.h:40
ReadBufferMode
Definition: bufmgr.h:37
void FreeFakeRelcacheEntry(Relation fakerel)
Definition: xlogutils.c:617
#define MAXFNAMELEN
static int elevel
Definition: vacuumlazy.c:401
TimeLineID nextTLI
Definition: xlogreader.h:254
bool XLogRecGetBlockTag(XLogReaderState *record, uint8 block_id, RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blknum)
Definition: xlogreader.c:1531
#define HASH_BLOBS
Definition: hsearch.h:97
#define InvalidBackendId
Definition: backendid.h:23
void * palloc0(Size size)
Definition: mcxt.c:1093
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:4007
Size keysize
Definition: hsearch.h:75
XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history, TimeLineID *nextTLI)
Definition: timeline.c:580
TimeLineID ThisTimeLineID
Definition: xlog.c:194
#define ereport(elevel,...)
Definition: elog.h:157
bool InRecovery
Definition: xlogutils.c:52
TimeLineID currTLI
Definition: xlogreader.h:238
XLogRedoAction XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id, Buffer *buf)
Definition: xlogutils.c:312
RelFileNode rd_node
Definition: rel.h:56
bool reachedConsistency
Definition: xlog.c:877
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:548
FakeRelCacheEntryData * FakeRelCacheEntry
Definition: xlogutils.c:557
uint64 XLogRecPtr
Definition: xlogdefs.h:21
BackendId rd_backend
Definition: rel.h:59
#define Assert(condition)
Definition: c.h:804
Definition: regguts.h:317
SMgrRelation rd_smgr
Definition: rel.h:57
XLogRedoAction
Definition: xlogutils.h:69
#define XLogFileName(fname, tli, logSegNo, wal_segsz_bytes)
#define BufferIsValid(bufnum)
Definition: bufmgr.h:123
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1436
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1426
TimeLineID ws_tli
Definition: xlogreader.h:48
void FlushOneBuffer(Buffer buffer)
Definition: bufmgr.c:3748
bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page)
Definition: xlogreader.c:1584
#define PageGetLSN(page)
Definition: bufpage.h:366
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1069
FormData_pg_class
Definition: pg_class.h:142
#define XLogFilePath(path, tli, logSegNo, wal_segsz_bytes)
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2748
void XLogDropRelation(RelFileNode rnode, ForkNumber forknum)
Definition: xlogutils.c:632
#define PageIsNew(page)
Definition: bufpage.h:229
int errmsg(const char *fmt,...)
Definition: elog.c:909
XLogRedoAction XLogReadBufferForRedoExtended(XLogReaderState *record, uint8 block_id, ReadBufferMode mode, bool get_cleanup_lock, Buffer *buf)
Definition: xlogutils.c:349
void XLogDropDatabase(Oid dbid)
Definition: xlogutils.c:643
#define elog(elevel,...)
Definition: elog.h:232
static HTAB * invalid_page_tab
Definition: xlogutils.c:80
FormData_pg_class pgc
Definition: xlogutils.c:554
bool WALRead(XLogReaderState *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli, WALReadError *errinfo)
Definition: xlogreader.c:1100
WALSegmentContext segcxt
Definition: xlogreader.h:224
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:120
HotStandbyState
Definition: xlogutils.h:47
#define XLogRecBlockImageApply(decoder, block_id)
Definition: xlogreader.h:327
#define close(a)
Definition: win32.h:12
static void static void status(const char *fmt,...) pg_attribute_printf(1
Definition: pg_regress.c:229
int read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *cur_page)
Definition: xlogutils.c:848
#define PageSetLSN(page, lsn)
Definition: bufpage.h:368
Definition: pg_list.h:50
int Buffer
Definition: buf.h:23
Pointer Page
Definition: bufpage.h:78
#define RelFileNodeEquals(node1, node2)
Definition: relfilenode.h:88
DecodedBkpBlock blocks[XLR_MAX_BLOCK_ID+1]
Definition: xlogreader.h:207
Oid relId
Definition: rel.h:39