xlogprefetcher.c
1 /*-------------------------------------------------------------------------
2  *
3  * xlogprefetcher.c
4  * Prefetching support for recovery.
5  *
6  * Portions Copyright (c) 2022, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/transam/xlogprefetcher.c
12  *
13  * This module provides a drop-in replacement for an XLogReader that tries to
14  * minimize I/O stalls by looking ahead in the WAL. If blocks that will be
15  * accessed in the near future are not already in the buffer pool, it initiates
16  * I/Os that might complete before the caller eventually needs the data. When
17  * referenced blocks are found in the buffer pool already, the buffer is
18  * recorded in the decoded record so that XLogReadBufferForRedo() can try to
19  * avoid a second buffer mapping table lookup.
20  *
21  * Currently, only the main fork is considered for prefetching, and
22  * prefetching is only effective on systems where PrefetchSharedBuffer()
23  * does something useful (mainly Linux).
24  *
25  *-------------------------------------------------------------------------
26  */
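
/*
 * A minimal sketch of the intended call pattern, assuming the caller already
 * has an XLogReaderState ('reader') and a starting LSN ('start_lsn'); this is
 * an illustration of the functions defined below, not code copied from the
 * recovery loop:
 *
 *     XLogPrefetcher *prefetcher = XLogPrefetcherAllocate(reader);
 *
 *     XLogPrefetcherBeginRead(prefetcher, start_lsn);
 *     for (;;)
 *     {
 *         char       *errormsg;
 *         XLogRecord *record = XLogPrefetcherReadRecord(prefetcher, &errormsg);
 *
 *         if (record == NULL)
 *             break;          -- end of readable WAL, or error in errormsg
 *
 *         -- redo the record; XLogReadBufferForRedo() can reuse any buffer
 *         -- the prefetcher recorded in the decoded block references
 *     }
 *
 *     XLogPrefetcherFree(prefetcher);
 */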
27 
28 #include "postgres.h"
29 
30 #include "access/xlog.h"
31 #include "access/xlogprefetcher.h"
32 #include "access/xlogreader.h"
33 #include "access/xlogutils.h"
34 #include "catalog/pg_class.h"
35 #include "catalog/pg_control.h"
36 #include "catalog/storage_xlog.h"
38 #include "utils/fmgrprotos.h"
39 #include "utils/timestamp.h"
40 #include "funcapi.h"
41 #include "pgstat.h"
42 #include "miscadmin.h"
43 #include "port/atomics.h"
44 #include "storage/bufmgr.h"
45 #include "storage/shmem.h"
46 #include "storage/smgr.h"
47 #include "utils/guc.h"
48 #include "utils/hsearch.h"
49 
50 /*
51  * Every time we process this much WAL, we'll update the values in
52  * pg_stat_recovery_prefetch.
53  */
54 #define XLOGPREFETCHER_STATS_DISTANCE BLCKSZ
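/*
 * With the default BLCKSZ of 8192, that means the values shown in the view
 * are refreshed roughly once per 8kB of WAL processed.
 */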
55 
56 /*
57  * To detect repeated access to the same block and skip useless extra system
58  * calls, we remember a small window of recently prefetched blocks.
59  */
60 #define XLOGPREFETCHER_SEQ_WINDOW_SIZE 4
61 
62 /*
63  * When maintenance_io_concurrency is not saturated, we're prepared to look
64  * ahead up to N times that number of block references.
65  */
66 #define XLOGPREFETCHER_DISTANCE_MULTIPLIER 4
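/*
 * For example, with maintenance_io_concurrency = 10, XLogPrefetcherReadRecord
 * below allows up to 10 I/Os in flight and looks ahead over up to
 * 10 * 4 = 40 queued block references.
 */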
67 
68 /* Define to log internal debugging messages. */
69 /* #define XLOGPREFETCHER_DEBUG_LEVEL LOG */
70 
71 /* GUCs */
73 
74 #ifdef USE_PREFETCH
75 #define RecoveryPrefetchEnabled() (recovery_prefetch != RECOVERY_PREFETCH_OFF)
76 #else
77 #define RecoveryPrefetchEnabled() false
78 #endif
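/*
 * recovery_prefetch accepts the values "off", "on" and "try";
 * check_recovery_prefetch below rejects "on" on builds that lack
 * posix_fadvise(), while "try" is accepted everywhere and simply results in
 * no prefetching via the macro above.
 */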
79 
81 
82 /*
83  * Enum used to report whether an IO should be started.
84  */
85 typedef enum
86 {
91 
92 /*
93  * Type of callback that can decide which block to prefetch next. For now
94  * there is only one.
95  */
96 typedef LsnReadQueueNextStatus (*LsnReadQueueNextFun) (uintptr_t lrq_private,
97  XLogRecPtr *lsn);
98 
99 /*
100  * A simple circular queue of LSNs, used to control the number of
101  * (potentially) inflight IOs. This stands in for a later more general IO
102  * control mechanism, which is why it has the apparently unnecessary
103  * indirection through a function pointer.
104  */
105 typedef struct LsnReadQueue
106 {
108  uintptr_t lrq_private;
115  struct
116  {
117  bool io;
121 
122 /*
123  * A prefetcher. This is a mechanism that wraps an XLogReader, prefetching
124  * blocks that will soon be referenced, to try to avoid IO stalls.
125  */
127 {
128  /* WAL reader and current reading state. */
132 
133  /* When to publish stats. */
135 
136  /* Book-keeping to avoid accessing blocks that don't exist yet. */
139 
140  /* Book-keeping to avoid repeat prefetches. */
144 
145  /* Book-keeping to disable prefetching temporarily. */
147 
148  /* IO depth manager. */
150 
152 
154 };
155 
156 /*
157  * A temporary filter used to track block ranges that haven't been created
158  * yet, whole relations that haven't been created yet, and whole relations
159  * that (we assume) have already been dropped, or will be created by bulk WAL
160  * operators.
161  */
162 typedef struct XLogPrefetcherFilter
163 {
169 
170 /*
171  * Counters exposed in shared memory for pg_stat_recovery_prefetch.
172  */
173 typedef struct XLogPrefetchStats
174 {
175  pg_atomic_uint64 reset_time; /* Time of last reset. */
176  pg_atomic_uint64 prefetch; /* Prefetches initiated. */
177  pg_atomic_uint64 hit; /* Blocks already in cache. */
178  pg_atomic_uint64 skip_init; /* Zero-inited blocks skipped. */
179  pg_atomic_uint64 skip_new; /* New/missing blocks filtered. */
180  pg_atomic_uint64 skip_fpw; /* FPWs skipped. */
181  pg_atomic_uint64 skip_rep; /* Repeat accesses skipped. */
182 
183  /* Dynamic values */
184  int wal_distance; /* Number of WAL bytes ahead. */
185  int block_distance; /* Number of block references ahead. */
186  int io_depth; /* Number of I/Os in progress. */
188 
189 static inline void XLogPrefetcherAddFilter(XLogPrefetcher *prefetcher,
190  RelFileNode rnode,
191  BlockNumber blockno,
192  XLogRecPtr lsn);
193 static inline bool XLogPrefetcherIsFiltered(XLogPrefetcher *prefetcher,
194  RelFileNode rnode,
195  BlockNumber blockno);
196 static inline void XLogPrefetcherCompleteFilters(XLogPrefetcher *prefetcher,
197  XLogRecPtr replaying_lsn);
198 static LsnReadQueueNextStatus XLogPrefetcherNextBlock(uintptr_t pgsr_private,
199  XLogRecPtr *lsn);
200 
202 
203 static inline LsnReadQueue *
204 lrq_alloc(uint32 max_distance,
205  uint32 max_inflight,
206  uintptr_t lrq_private,
208 {
209  LsnReadQueue *lrq;
210  uint32 size;
211 
212  Assert(max_distance >= max_inflight);
213 
214  size = max_distance + 1; /* full ring buffer has a gap */
215  lrq = palloc(offsetof(LsnReadQueue, queue) + sizeof(lrq->queue[0]) * size);
216  lrq->lrq_private = lrq_private;
217  lrq->max_inflight = max_inflight;
218  lrq->size = size;
219  lrq->next = next;
220  lrq->head = 0;
221  lrq->tail = 0;
222  lrq->inflight = 0;
223  lrq->completed = 0;
224 
225  return lrq;
226 }
227 
228 static inline void
230 {
231  pfree(lrq);
232 }
233 
234 static inline uint32
236 {
237  return lrq->inflight;
238 }
239 
240 static inline uint32
242 {
243  return lrq->completed;
244 }
245 
246 static inline void
248 {
249  /* Try to start as many IOs as we can within our limits. */
250  while (lrq->inflight < lrq->max_inflight &&
251  lrq->inflight + lrq->completed < lrq->size - 1)
252  {
253  Assert(((lrq->head + 1) % lrq->size) != lrq->tail);
254  switch (lrq->next(lrq->lrq_private, &lrq->queue[lrq->head].lsn))
255  {
256  case LRQ_NEXT_AGAIN:
257  return;
258  case LRQ_NEXT_IO:
259  lrq->queue[lrq->head].io = true;
260  lrq->inflight++;
261  break;
262  case LRQ_NEXT_NO_IO:
263  lrq->queue[lrq->head].io = false;
264  lrq->completed++;
265  break;
266  }
267  lrq->head++;
268  if (lrq->head == lrq->size)
269  lrq->head = 0;
270  }
271 }
272 
273 static inline void
275 {
276  /*
277  * We know that LSNs before 'lsn' have been replayed, so we can now assume
278  * that any IOs that were started before then have finished.
279  */
280  while (lrq->tail != lrq->head &&
281  lrq->queue[lrq->tail].lsn < lsn)
282  {
283  if (lrq->queue[lrq->tail].io)
284  lrq->inflight--;
285  else
286  lrq->completed--;
287  lrq->tail++;
288  if (lrq->tail == lrq->size)
289  lrq->tail = 0;
290  }
292  lrq_prefetch(lrq);
293 }
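
/*
 * Putting the pieces above together, a minimal sketch of how an LsnReadQueue
 * is meant to be driven.  The state and callback names here are hypothetical;
 * in this file the real callback is XLogPrefetcherNextBlock, hooked up in
 * XLogPrefetcherReadRecord:
 *
 *     LsnReadQueue *lrq = lrq_alloc(max_distance, max_inflight,
 *                                   (uintptr_t) my_state, my_next_callback);
 *
 *     lrq_prefetch(lrq);                    -- look ahead, possibly start IOs
 *     ...
 *     lrq_complete_lsn(lrq, replayed_lsn);  -- retire IOs for older LSNs,
 *                                           -- then look further ahead
 *     ...
 *     lrq_free(lrq);
 */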
294 
295 size_t
297 {
298  return sizeof(XLogPrefetchStats);
299 }
300 
301 /*
302  * Reset all counters to zero.
303  */
304 void
306 {
314 }
315 
316 void
318 {
319  bool found;
320 
322  ShmemInitStruct("XLogPrefetchStats",
323  sizeof(XLogPrefetchStats),
324  &found);
325 
326  if (!found)
327  {
335  }
336 }
337 
338 /*
339  * Called when any GUC is changed that affects prefetching.
340  */
341 void
343 {
345 }
346 
347 /*
348  * Increment a counter in shared memory. This is equivalent to *counter++ on a
349  * plain uint64 without any memory barrier or locking, except on platforms
350  * where readers can't read uint64 without possibly observing a torn value.
351  */
352 static inline void
354 {
356  pg_atomic_write_u64(counter, pg_atomic_read_u64(counter) + 1);
357 }
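/*
 * The counters above are bumped with calls of the form
 *
 *     XLogPrefetchIncrement(&SharedStats->hit);
 *
 * at the corresponding decision points in XLogPrefetcherNextBlock.
 */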
358 
359 /*
360  * Create a prefetcher that is ready to begin prefetching blocks referenced by
361  * WAL records.
362  */
365 {
366  XLogPrefetcher *prefetcher;
367  static HASHCTL hash_table_ctl = {
368  .keysize = sizeof(RelFileNode),
369  .entrysize = sizeof(XLogPrefetcherFilter)
370  };
371 
372  prefetcher = palloc0(sizeof(XLogPrefetcher));
373 
374  prefetcher->reader = reader;
375  prefetcher->filter_table = hash_create("XLogPrefetcherFilterTable", 1024,
376  &hash_table_ctl,
378  dlist_init(&prefetcher->filter_queue);
379 
382  SharedStats->io_depth = 0;
383 
384  /* First usage will cause streaming_read to be allocated. */
386 
387  return prefetcher;
388 }
389 
390 /*
391  * Destroy a prefetcher and release all resources.
392  */
393 void
395 {
396  lrq_free(prefetcher->streaming_read);
397  hash_destroy(prefetcher->filter_table);
398  pfree(prefetcher);
399 }
400 
401 /*
402  * Provide access to the reader.
403  */
406 {
407  return prefetcher->reader;
408 }
409 
410 /*
411  * Update the statistics visible in the pg_stat_recovery_prefetch view.
412  */
413 void
415 {
416  uint32 io_depth;
417  uint32 completed;
418  int64 wal_distance;
419 
420 
421  /* How far ahead of replay are we now? */
422  if (prefetcher->reader->decode_queue_tail)
423  {
424  wal_distance =
425  prefetcher->reader->decode_queue_tail->lsn -
426  prefetcher->reader->decode_queue_head->lsn;
427  }
428  else
429  {
430  wal_distance = 0;
431  }
432 
433  /* How many IOs are currently in flight and completed? */
434  io_depth = lrq_inflight(prefetcher->streaming_read);
435  completed = lrq_completed(prefetcher->streaming_read);
436 
437  /* Update the instantaneous stats visible in pg_stat_recovery_prefetch. */
438  SharedStats->io_depth = io_depth;
439  SharedStats->block_distance = io_depth + completed;
440  SharedStats->wal_distance = wal_distance;
441 
442  prefetcher->next_stats_shm_lsn =
444 }
445 
446 /*
447  * A callback that examines the next block reference in the WAL, and possibly
448  * starts an IO so that a later read will be fast.
449  *
450  * Returns LRQ_NEXT_AGAIN if no more WAL data is available yet.
451  *
452  * Returns LRQ_NEXT_IO if the next block reference is for a main fork block
453  * that isn't in the buffer pool, and the kernel has been asked to start
454  * reading it to make a future read system call faster. An LSN is written to
455  * *lsn, and the I/O will be considered to have completed once that LSN is
456  * replayed.
457  *
458  * Returns LRQ_NEXT_NO_IO if we examined the next block reference and found that it
459  * was already in the buffer pool, or we decided for various reasons not to
460  * prefetch.
461  */
463 XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
464 {
465  XLogPrefetcher *prefetcher = (XLogPrefetcher *) pgsr_private;
466  XLogReaderState *reader = prefetcher->reader;
467  XLogRecPtr replaying_lsn = reader->ReadRecPtr;
468 
469  /*
470  * We keep track of the record and block we're up to between calls with
471  * prefetcher->record and prefetcher->next_block_id.
472  */
473  for (;;)
474  {
475  DecodedXLogRecord *record;
476 
477  /* Try to read a new future record, if we don't already have one. */
478  if (prefetcher->record == NULL)
479  {
480  bool nonblocking;
481 
482  /*
483  * If there are already records or an error queued up that could
484  * be replayed, we don't want to block here. Otherwise, it's OK
485  * to block waiting for more data: presumably the caller has
486  * nothing else to do.
487  */
488  nonblocking = XLogReaderHasQueuedRecordOrError(reader);
489 
490  /* Readahead is disabled until we replay past a certain point. */
491  if (nonblocking && replaying_lsn <= prefetcher->no_readahead_until)
492  return LRQ_NEXT_AGAIN;
493 
494  record = XLogReadAhead(prefetcher->reader, nonblocking);
495  if (record == NULL)
496  {
497  /*
498  * We can't read any more, due to an error or lack of data in
499  * nonblocking mode. Don't try to read ahead again until
500  * we've replayed everything already decoded.
501  */
502  if (nonblocking && prefetcher->reader->decode_queue_tail)
503  prefetcher->no_readahead_until =
504  prefetcher->reader->decode_queue_tail->lsn;
505 
506  return LRQ_NEXT_AGAIN;
507  }
508 
509  /*
510  * If prefetching is disabled, we don't need to analyze the record
511  * or issue any prefetches. We just need to cause one record to
512  * be decoded.
513  */
515  {
516  *lsn = InvalidXLogRecPtr;
517  return LRQ_NEXT_NO_IO;
518  }
519 
520  /* We have a new record to process. */
521  prefetcher->record = record;
522  prefetcher->next_block_id = 0;
523  }
524  else
525  {
526  /* Continue to process from last call, or last loop. */
527  record = prefetcher->record;
528  }
529 
530  /*
531  * Check for operations that require us to filter out block ranges, or
532  * pause readahead completely.
533  */
534  if (replaying_lsn < record->lsn)
535  {
536  uint8 rmid = record->header.xl_rmid;
537  uint8 record_type = record->header.xl_info & ~XLR_INFO_MASK;
538 
539  if (rmid == RM_XLOG_ID)
540  {
541  if (record_type == XLOG_CHECKPOINT_SHUTDOWN ||
542  record_type == XLOG_END_OF_RECOVERY)
543  {
544  /*
545  * These records might change the TLI. Avoid potential
546  * bugs if we were to allow "read TLI" and "replay TLI" to
547  * differ without more analysis.
548  */
549  prefetcher->no_readahead_until = record->lsn;
550 
551 #ifdef XLOGPREFETCHER_DEBUG_LEVEL
552  elog(XLOGPREFETCHER_DEBUG_LEVEL,
553  "suppressing all readahead until %X/%X is replayed due to possible TLI change",
554  LSN_FORMAT_ARGS(record->lsn));
555 #endif
556 
557  /* Fall through so we move past this record. */
558  }
559  }
560  else if (rmid == RM_DBASE_ID)
561  {
562  /*
563  * When databases are created with the file-copy strategy,
564  * there are no WAL records to tell us about the creation of
565  * individual relations.
566  */
567  if (record_type == XLOG_DBASE_CREATE_FILE_COPY)
568  {
571  RelFileNode rnode = {InvalidOid, xlrec->db_id, InvalidOid};
572 
573  /*
574  * Don't try to prefetch anything in this database until
575  * it has been created, or we might confuse the blocks of
576  * different generations, if a database OID or relfilenode
577  * is reused. It's also more efficient than discovering
578  * that relations don't exist on disk yet with ENOENT
579  * errors.
580  */
581  XLogPrefetcherAddFilter(prefetcher, rnode, 0, record->lsn);
582 
583 #ifdef XLOGPREFETCHER_DEBUG_LEVEL
584  elog(XLOGPREFETCHER_DEBUG_LEVEL,
585  "suppressing prefetch in database %u until %X/%X is replayed due to raw file copy",
586  rnode.dbNode,
587  LSN_FORMAT_ARGS(record->lsn));
588 #endif
589  }
590  }
591  else if (rmid == RM_SMGR_ID)
592  {
593  if (record_type == XLOG_SMGR_CREATE)
594  {
595  xl_smgr_create *xlrec = (xl_smgr_create *)
596  record->main_data;
597 
598  if (xlrec->forkNum == MAIN_FORKNUM)
599  {
600  /*
601  * Don't prefetch anything for this whole relation
602  * until it has been created. Otherwise we might
603  * confuse the blocks of different generations, if a
604  * relfilenode is reused. This also avoids the need
605  * to discover the problem via extra syscalls that
606  * report ENOENT.
607  */
608  XLogPrefetcherAddFilter(prefetcher, xlrec->rnode, 0,
609  record->lsn);
610 
611 #ifdef XLOGPREFETCHER_DEBUG_LEVEL
612  elog(XLOGPREFETCHER_DEBUG_LEVEL,
613  "suppressing prefetch in relation %u/%u/%u until %X/%X is replayed, which creates the relation",
614  xlrec->rnode.spcNode,
615  xlrec->rnode.dbNode,
616  xlrec->rnode.relNode,
617  LSN_FORMAT_ARGS(record->lsn));
618 #endif
619  }
620  }
621  else if (record_type == XLOG_SMGR_TRUNCATE)
622  {
624  record->main_data;
625 
626  /*
627  * Don't consider prefetching anything in the truncated
628  * range until the truncation has been performed.
629  */
630  XLogPrefetcherAddFilter(prefetcher, xlrec->rnode,
631  xlrec->blkno,
632  record->lsn);
633 
634 #ifdef XLOGPREFETCHER_DEBUG_LEVEL
635  elog(XLOGPREFETCHER_DEBUG_LEVEL,
636  "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, which truncates the relation",
637  xlrec->rnode.spcNode,
638  xlrec->rnode.dbNode,
639  xlrec->rnode.relNode,
640  xlrec->blkno,
641  LSN_FORMAT_ARGS(record->lsn));
642 #endif
643  }
644  }
645  }
646 
647  /* Scan the block references, starting where we left off last time. */
648  while (prefetcher->next_block_id <= record->max_block_id)
649  {
650  int block_id = prefetcher->next_block_id++;
651  DecodedBkpBlock *block = &record->blocks[block_id];
652  SMgrRelation reln;
653  PrefetchBufferResult result;
654 
655  if (!block->in_use)
656  continue;
657 
659 
660  /*
661  * Record the LSN of this record. When it's replayed,
662  * LsnReadQueue will consider any IOs submitted for earlier LSNs
663  * to be finished.
664  */
665  *lsn = record->lsn;
666 
667  /* We don't try to prefetch anything but the main fork for now. */
668  if (block->forknum != MAIN_FORKNUM)
669  {
670  return LRQ_NEXT_NO_IO;
671  }
672 
673  /*
674  * If there is a full page image attached, we won't be reading the
675  * page, so don't bother trying to prefetch.
676  */
677  if (block->has_image)
678  {
680  return LRQ_NEXT_NO_IO;
681  }
682 
683  /* There is no point in reading a page that will be zeroed. */
684  if (block->flags & BKPBLOCK_WILL_INIT)
685  {
687  return LRQ_NEXT_NO_IO;
688  }
689 
690  /* Should we skip prefetching this block due to a filter? */
691  if (XLogPrefetcherIsFiltered(prefetcher, block->rnode, block->blkno))
692  {
694  return LRQ_NEXT_NO_IO;
695  }
696 
697  /* There is no point in repeatedly prefetching the same block. */
698  for (int i = 0; i < XLOGPREFETCHER_SEQ_WINDOW_SIZE; ++i)
699  {
700  if (block->blkno == prefetcher->recent_block[i] &&
701  RelFileNodeEquals(block->rnode, prefetcher->recent_rnode[i]))
702  {
703  /*
704  * XXX If we also remembered where it was, we could set
705  * recent_buffer so that recovery could skip smgropen()
706  * and a buffer table lookup.
707  */
709  return LRQ_NEXT_NO_IO;
710  }
711  }
712  prefetcher->recent_rnode[prefetcher->recent_idx] = block->rnode;
713  prefetcher->recent_block[prefetcher->recent_idx] = block->blkno;
714  prefetcher->recent_idx =
715  (prefetcher->recent_idx + 1) % XLOGPREFETCHER_SEQ_WINDOW_SIZE;
716 
717  /*
718  * We could try to have a fast path for repeated references to the
719  * same relation (with some scheme to handle invalidations
720  * safely), but for now we'll call smgropen() every time.
721  */
722  reln = smgropen(block->rnode, InvalidBackendId);
723 
724  /*
725  * If the relation file doesn't exist on disk, for example because
726  * we're replaying after a crash and the file will be created and
727  * then unlinked by WAL that hasn't been replayed yet, suppress
728  * further prefetching in the relation until this record is
729  * replayed.
730  */
731  if (!smgrexists(reln, MAIN_FORKNUM))
732  {
733 #ifdef XLOGPREFETCHER_DEBUG_LEVEL
734  elog(XLOGPREFETCHER_DEBUG_LEVEL,
735  "suppressing all prefetch in relation %u/%u/%u until %X/%X is replayed, because the relation does not exist on disk",
736  reln->smgr_rnode.node.spcNode,
737  reln->smgr_rnode.node.dbNode,
738  reln->smgr_rnode.node.relNode,
739  LSN_FORMAT_ARGS(record->lsn));
740 #endif
741  XLogPrefetcherAddFilter(prefetcher, block->rnode, 0,
742  record->lsn);
744  return LRQ_NEXT_NO_IO;
745  }
746 
747  /*
748  * If the relation isn't big enough to contain the referenced
749  * block yet, suppress prefetching of this block and higher until
750  * this record is replayed.
751  */
752  if (block->blkno >= smgrnblocks(reln, block->forknum))
753  {
754 #ifdef XLOGPREFETCHER_DEBUG_LEVEL
755  elog(XLOGPREFETCHER_DEBUG_LEVEL,
756  "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, because the relation is too small",
757  reln->smgr_rnode.node.spcNode,
758  reln->smgr_rnode.node.dbNode,
759  reln->smgr_rnode.node.relNode,
760  block->blkno,
761  LSN_FORMAT_ARGS(record->lsn));
762 #endif
763  XLogPrefetcherAddFilter(prefetcher, block->rnode, block->blkno,
764  record->lsn);
766  return LRQ_NEXT_NO_IO;
767  }
768 
769  /* Try to initiate prefetching. */
770  result = PrefetchSharedBuffer(reln, block->forknum, block->blkno);
771  if (BufferIsValid(result.recent_buffer))
772  {
773  /* Cache hit, nothing to do. */
775  block->prefetch_buffer = result.recent_buffer;
776  return LRQ_NEXT_NO_IO;
777  }
778  else if (result.initiated_io)
779  {
780  /* Cache miss, I/O (presumably) started. */
783  return LRQ_NEXT_IO;
784  }
785  else
786  {
787  /*
788  * This shouldn't be possible, because we already determined
789  * that the relation exists on disk and is big enough.
790  * Something is wrong with the cache invalidation for
791  * smgrexists(), smgrnblocks(), or the file was unlinked or
792  * truncated beneath our feet?
793  */
794  elog(ERROR,
795  "could not prefetch relation %u/%u/%u block %u",
796  reln->smgr_rnode.node.spcNode,
797  reln->smgr_rnode.node.dbNode,
798  reln->smgr_rnode.node.relNode,
799  block->blkno);
800  }
801  }
802 
803  /*
804  * Several callsites need to be able to read exactly one record
805  * without any internal readahead. Examples: xlog.c reading
806  * checkpoint records with emode set to PANIC, which might otherwise
807  * cause XLogPageRead() to panic on some future page, and xlog.c
808  * determining where to start writing WAL next, which depends on the
809  * contents of the reader's internal buffer after reading one record.
810  * Therefore, don't even think about prefetching until the first
811  * record after XLogPrefetcherBeginRead() has been consumed.
812  */
813  if (prefetcher->reader->decode_queue_tail &&
814  prefetcher->reader->decode_queue_tail->lsn == prefetcher->begin_ptr)
815  return LRQ_NEXT_AGAIN;
816 
817  /* Advance to the next record. */
818  prefetcher->record = NULL;
819  }
820  pg_unreachable();
821 }
822 
823 /*
824  * Expose statistics about recovery prefetching.
825  */
826 Datum
828 {
829 #define PG_STAT_GET_RECOVERY_PREFETCH_COLS 10
830  ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
833 
834  SetSingleFuncCall(fcinfo, 0);
835 
836  for (int i = 0; i < PG_STAT_GET_RECOVERY_PREFETCH_COLS; ++i)
837  nulls[i] = false;
838 
849  tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
850 
851  return (Datum) 0;
852 }
853 
854 /*
855  * Don't prefetch any blocks >= 'blockno' from a given 'rnode', until 'lsn'
856  * has been replayed.
857  */
858 static inline void
860  BlockNumber blockno, XLogRecPtr lsn)
861 {
862  XLogPrefetcherFilter *filter;
863  bool found;
864 
865  filter = hash_search(prefetcher->filter_table, &rnode, HASH_ENTER, &found);
866  if (!found)
867  {
868  /*
869  * Don't allow any prefetching of this block or higher until replayed.
870  */
871  filter->filter_until_replayed = lsn;
872  filter->filter_from_block = blockno;
873  dlist_push_head(&prefetcher->filter_queue, &filter->link);
874  }
875  else
876  {
877  /*
878  * We were already filtering this rnode. Extend the filter's lifetime
879  * to cover this WAL record, but leave the lower of the block numbers
880  * there because we don't want to have to track individual blocks.
881  */
882  filter->filter_until_replayed = lsn;
883  dlist_delete(&filter->link);
884  dlist_push_head(&prefetcher->filter_queue, &filter->link);
885  filter->filter_from_block = Min(filter->filter_from_block, blockno);
886  }
887 }
888 
889 /*
890  * Have we replayed any records that caused us to begin filtering a block
891  * range? That means that relations should have been created, extended or
892  * dropped as required, so we can stop filtering out accesses to a given
893  * relfilenode.
894  */
895 static inline void
897 {
898  while (unlikely(!dlist_is_empty(&prefetcher->filter_queue)))
899  {
901  link,
902  &prefetcher->filter_queue);
903 
904  if (filter->filter_until_replayed >= replaying_lsn)
905  break;
906 
907  dlist_delete(&filter->link);
908  hash_search(prefetcher->filter_table, filter, HASH_REMOVE, NULL);
909  }
910 }
911 
912 /*
913  * Check if a given block should be skipped due to a filter.
914  */
915 static inline bool
917  BlockNumber blockno)
918 {
919  /*
920  * Test for empty queue first, because we expect it to be empty most of
921  * the time and we can avoid the hash table lookup in that case.
922  */
923  if (unlikely(!dlist_is_empty(&prefetcher->filter_queue)))
924  {
925  XLogPrefetcherFilter *filter;
926 
927  /* See if the block range is filtered. */
928  filter = hash_search(prefetcher->filter_table, &rnode, HASH_FIND, NULL);
929  if (filter && filter->filter_from_block <= blockno)
930  {
931 #ifdef XLOGPREFETCHER_DEBUG_LEVEL
932  elog(XLOGPREFETCHER_DEBUG_LEVEL,
933  "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (blocks >= %u filtered)",
934  rnode.spcNode, rnode.dbNode, rnode.relNode, blockno,
936  filter->filter_from_block);
937 #endif
938  return true;
939  }
940 
941  /* See if the whole database is filtered. */
942  rnode.relNode = InvalidOid;
943  rnode.spcNode = InvalidOid;
944  filter = hash_search(prefetcher->filter_table, &rnode, HASH_FIND, NULL);
945  if (filter)
946  {
947 #ifdef XLOGPREFETCHER_DEBUG_LEVEL
948  elog(XLOGPREFETCHER_DEBUG_LEVEL,
949  "prefetch of %u/%u/%u block %u suppressed; filtering until LSN %X/%X is replayed (whole database)",
950  rnode.spcNode, rnode.dbNode, rnode.relNode, blockno,
952 #endif
953  return true;
954  }
955  }
956 
957  return false;
958 }
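
/*
 * A sketch of how the filter machinery above fits together; the calls come
 * from XLogPrefetcherNextBlock and XLogPrefetcherReadRecord in this file:
 *
 *     -- e.g. on XLOG_SMGR_CREATE, or when smgrexists()/smgrnblocks() shows
 *     -- that a referenced block doesn't exist on disk yet:
 *     XLogPrefetcherAddFilter(prefetcher, rnode, blockno, record->lsn);
 *
 *     -- before prefetching each block reference:
 *     if (XLogPrefetcherIsFiltered(prefetcher, block->rnode, block->blkno))
 *         ... skip this block until the filter LSN has been replayed ...
 *
 *     -- once a record is handed back to the caller for replay:
 *     XLogPrefetcherCompleteFilters(prefetcher, record->lsn);
 */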
959 
960 /*
961  * A wrapper for XLogBeginRead() that also resets the prefetcher.
962  */
963 void
965 {
966  /* This will forget about any in-flight IO. */
967  prefetcher->reconfigure_count--;
968 
969  /* Book-keeping to avoid readahead on first read. */
970  prefetcher->begin_ptr = recPtr;
971 
972  prefetcher->no_readahead_until = 0;
973 
974  /* This will forget about any queued up records in the decoder. */
975  XLogBeginRead(prefetcher->reader, recPtr);
976 }
977 
978 /*
979  * A wrapper for XLogReadRecord() that provides the same interface, but also
980  * tries to initiate I/O for blocks referenced in future WAL records.
981  */
982 XLogRecord *
984 {
985  DecodedXLogRecord *record;
986 
987  /*
988  * See if it's time to reset the prefetching machinery, because a relevant
989  * GUC was changed.
990  */
992  {
993  uint32 max_distance;
994  uint32 max_inflight;
995 
996  if (prefetcher->streaming_read)
997  lrq_free(prefetcher->streaming_read);
998 
1000  {
1001  max_inflight = Max(maintenance_io_concurrency, 2);
1002  max_distance = max_inflight * XLOGPREFETCHER_DISTANCE_MULTIPLIER;
1003  }
1004  else
1005  {
1006  max_inflight = 1;
1007  max_distance = 1;
1008  }
1009 
1010  prefetcher->streaming_read = lrq_alloc(max_distance,
1011  max_inflight,
1012  (uintptr_t) prefetcher,
1014 
1015  prefetcher->reconfigure_count = XLogPrefetchReconfigureCount;
1016  }
1017 
1018  /*
1019  * Release last returned record, if there is one. We need to do this so
1020  * that we can check for empty decode queue accurately.
1021  */
1022  XLogReleasePreviousRecord(prefetcher->reader);
1023 
1024  /* If there's nothing queued yet, then start prefetching. */
1025  if (!XLogReaderHasQueuedRecordOrError(prefetcher->reader))
1026  lrq_prefetch(prefetcher->streaming_read);
1027 
1028  /* Read the next record. */
1029  record = XLogNextRecord(prefetcher->reader, errmsg);
1030  if (!record)
1031  return NULL;
1032 
1033  /*
1034  * The record we just got is the "current" one, for the benefit of the
1035  * XLogRecXXX() macros.
1036  */
1037  Assert(record == prefetcher->reader->record);
1038 
1039  /*
1040  * Can we drop any prefetch filters yet, given the record we're about to
1041  * return? This assumes that any records with earlier LSNs have been
1042  * replayed, so if we were waiting for a relation to be created or
1043  * extended, it is now OK to access blocks in the covered range.
1044  */
1045  XLogPrefetcherCompleteFilters(prefetcher, record->lsn);
1046 
1047  /*
1048  * See if it's time to compute some statistics, because enough WAL has
1049  * been processed.
1050  */
1051  if (unlikely(record->lsn >= prefetcher->next_stats_shm_lsn))
1052  XLogPrefetcherComputeStats(prefetcher);
1053 
1054  /*
1055  * The caller is about to replay this record, so we can now report that
1056  * all IO initiated because of earlier WAL must be finished. This may
1057  * trigger more readahead.
1058  */
1059  lrq_complete_lsn(prefetcher->streaming_read, record->lsn);
1060 
1061  Assert(record == prefetcher->reader->record);
1062 
1063  return &record->header;
1064 }
1065 
1066 bool
1067 check_recovery_prefetch(int *new_value, void **extra, GucSource source)
1068 {
1069 #ifndef USE_PREFETCH
1070  if (*new_value == RECOVERY_PREFETCH_ON)
1071  {
1072  GUC_check_errdetail("recovery_prefetch not supported on platforms that lack posix_fadvise().");
1073  return false;
1074  }
1075 #endif
1076 
1077  return true;
1078 }
1079 
1080 void
1081 assign_recovery_prefetch(int new_value, void *extra)
1082 {
1083  /* Reconfigure prefetching, because a setting it depends on changed. */
1084  recovery_prefetch = new_value;
1085  if (AmStartupProcess())
1087 }