PostgreSQL Source Code git master
Loading...
Searching...
No Matches
sync.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * sync.c
4 * File synchronization management code.
5 *
6 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/storage/sync/sync.c
12 *
13 *-------------------------------------------------------------------------
14 */
15#include "postgres.h"
16
17#include <unistd.h>
18#include <fcntl.h>
19#include <sys/file.h>
20
21#include "access/clog.h"
22#include "access/commit_ts.h"
23#include "access/multixact.h"
24#include "access/xlog.h"
25#include "miscadmin.h"
26#include "pgstat.h"
28#include "postmaster/bgwriter.h"
29#include "storage/fd.h"
30#include "storage/latch.h"
31#include "storage/md.h"
32#include "utils/hsearch.h"
33#include "utils/memutils.h"
34#include "utils/wait_event.h"
35
36/*
37 * In some contexts (currently, standalone backends and the checkpointer)
38 * we keep track of pending fsync operations: we need to remember all relation
39 * segments that have been written since the last checkpoint, so that we can
40 * fsync them down to disk before completing the next checkpoint. This hash
41 * table remembers the pending operations. We use a hash table mostly as
42 * a convenient way of merging duplicate requests.
43 *
44 * We use a similar mechanism to remember no-longer-needed files that can
45 * be deleted after the next checkpoint, but we use a linked list instead of
46 * a hash table, because we don't expect there to be any duplicate requests.
47 *
48 * These mechanisms are only used for non-temp relations; we never fsync
49 * temp rels, nor do we need to postpone their deletion (see comments in
50 * mdunlink).
51 *
52 * (Regular backends do not track pending operations locally, but forward
53 * them to the checkpointer.)
54 */
55typedef uint16 CycleCtr; /* can be any convenient integer size */
56
57typedef struct
58{
59 FileTag tag; /* identifies handler and file */
60 CycleCtr cycle_ctr; /* sync_cycle_ctr of oldest request */
61 bool canceled; /* canceled is true if we canceled "recently" */
63
64typedef struct
65{
66 FileTag tag; /* identifies handler and file */
67 CycleCtr cycle_ctr; /* checkpoint_cycle_ctr when request was made */
68 bool canceled; /* true if request has been canceled */
70
73static MemoryContext pendingOpsCxt; /* context for the above */
74
77
78/* Intervals for calling AbsorbSyncRequests */
79#define FSYNCS_PER_ABSORB 10
80#define UNLINKS_PER_ABSORB 10
81
82/*
83 * Function pointers for handling sync and unlink requests.
84 */
85typedef struct SyncOps
86{
87 int (*sync_syncfiletag) (const FileTag *ftag, char *path);
88 int (*sync_unlinkfiletag) (const FileTag *ftag, char *path);
90 const FileTag *candidate);
92
93/*
94 * These indexes must correspond to the values of the SyncRequestHandler enum.
95 */
96static const SyncOps syncsw[] = {
97 /* magnetic disk */
98 [SYNC_HANDLER_MD] = {
100 .sync_unlinkfiletag = mdunlinkfiletag,
101 .sync_filetagmatches = mdfiletagmatches
102 },
103 /* pg_xact */
105 .sync_syncfiletag = clogsyncfiletag
106 },
107 /* pg_commit_ts */
109 .sync_syncfiletag = committssyncfiletag
110 },
111 /* pg_multixact/offsets */
113 .sync_syncfiletag = multixactoffsetssyncfiletag
114 },
115 /* pg_multixact/members */
117 .sync_syncfiletag = multixactmemberssyncfiletag
118 }
119};
120
121/*
122 * Initialize data structures for the file sync tracking.
123 */
124void
126{
127 /*
128 * Create pending-operations hashtable if we need it. Currently, we need
129 * it if we are standalone (not under a postmaster) or if we are a
130 * checkpointer auxiliary process.
131 */
133 {
135
136 /*
137 * XXX: The checkpointer needs to add entries to the pending ops table
138 * when absorbing fsync requests. That is done within a critical
139 * section, which isn't usually allowed, but we make an exception. It
140 * means that there's a theoretical possibility that you run out of
141 * memory while absorbing fsync requests, which leads to a PANIC.
142 * Fortunately the hash table is small so that's unlikely to happen in
143 * practice.
144 */
146 "Pending ops context",
149
150 hash_ctl.keysize = sizeof(FileTag);
151 hash_ctl.entrysize = sizeof(PendingFsyncEntry);
152 hash_ctl.hcxt = pendingOpsCxt;
153 pendingOps = hash_create("Pending Ops Table",
154 100L,
155 &hash_ctl,
158 }
159}
160
161/*
162 * SyncPreCheckpoint() -- Do pre-checkpoint work
163 *
164 * To distinguish unlink requests that arrived before this checkpoint
165 * started from those that arrived during the checkpoint, we use a cycle
166 * counter similar to the one we use for fsync requests. That cycle
167 * counter is incremented here.
168 *
169 * This must be called *before* the checkpoint REDO point is determined.
170 * That ensures that we won't delete files too soon. Since this calls
171 * AbsorbSyncRequests(), which performs memory allocations, it cannot be
172 * called within a critical section.
173 *
174 * Note that we can't do anything here that depends on the assumption
175 * that the checkpoint will be completed.
176 */
177void
179{
180 /*
181 * Operations such as DROP TABLESPACE assume that the next checkpoint will
182 * process all recently forwarded unlink requests, but if they aren't
183 * absorbed prior to advancing the cycle counter, they won't be processed
184 * until a future checkpoint. The following absorb ensures that any
185 * unlink requests forwarded before the checkpoint began will be processed
186 * in the current checkpoint.
187 */
189
190 /*
191 * Any unlink requests arriving after this point will be assigned the next
192 * cycle counter, and won't be unlinked until next checkpoint.
193 */
195}
196
197/*
198 * SyncPostCheckpoint() -- Do post-checkpoint work
199 *
200 * Remove any lingering files that can now be safely removed.
201 */
202void
204{
205 int absorb_counter;
206 ListCell *lc;
207
209 foreach(lc, pendingUnlinks)
210 {
212 char path[MAXPGPATH];
213
214 /* Skip over any canceled entries */
215 if (entry->canceled)
216 continue;
217
218 /*
219 * New entries are appended to the end, so if the entry is new we've
220 * reached the end of old entries.
221 *
222 * Note: if just the right number of consecutive checkpoints fail, we
223 * could be fooled here by cycle_ctr wraparound. However, the only
224 * consequence is that we'd delay unlinking for one more checkpoint,
225 * which is perfectly tolerable.
226 */
227 if (entry->cycle_ctr == checkpoint_cycle_ctr)
228 break;
229
230 /* Unlink the file */
231 if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag,
232 path) < 0)
233 {
234 /*
235 * There's a race condition, when the database is dropped at the
236 * same time that we process the pending unlink requests. If the
237 * DROP DATABASE deletes the file before we do, we will get ENOENT
238 * here. rmtree() also has to ignore ENOENT errors, to deal with
239 * the possibility that we delete the file first.
240 */
241 if (errno != ENOENT)
244 errmsg("could not remove file \"%s\": %m", path)));
245 }
246
247 /* Mark the list entry as canceled, just in case */
248 entry->canceled = true;
249
250 /*
251 * As in ProcessSyncRequests, we don't want to stop absorbing fsync
252 * requests for a long time when there are many deletions to be done.
253 * We can safely call AbsorbSyncRequests() at this point in the loop.
254 */
255 if (--absorb_counter <= 0)
256 {
259 }
260 }
261
262 /*
263 * If we reached the end of the list, we can just remove the whole list
264 * (remembering to pfree all the PendingUnlinkEntry objects). Otherwise,
265 * we must keep the entries at or after "lc".
266 */
267 if (lc == NULL)
268 {
271 }
272 else
273 {
274 int ntodelete = list_cell_number(pendingUnlinks, lc);
275
276 for (int i = 0; i < ntodelete; i++)
278
280 }
281}
282
283/*
284 * ProcessSyncRequests() -- Process queued fsync requests.
285 */
286void
288{
289 static bool sync_in_progress = false;
290
292 PendingFsyncEntry *entry;
293 int absorb_counter;
294
295 /* Statistics on sync times */
296 int processed = 0;
298 sync_end,
299 sync_diff;
301 uint64 longest = 0;
303
304 /*
305 * This is only called during checkpoints, and checkpoints should only
306 * occur in processes that have created a pendingOps.
307 */
308 if (!pendingOps)
309 elog(ERROR, "cannot sync without a pendingOps table");
310
311 /*
312 * If we are in the checkpointer, the sync had better include all fsync
313 * requests that were queued by backends up to this point. The tightest
314 * race condition that could occur is that a buffer that must be written
315 * and fsync'd for the checkpoint could have been dumped by a backend just
316 * before it was visited by BufferSync(). We know the backend will have
317 * queued an fsync request before clearing the buffer's dirtybit, so we
318 * are safe as long as we do an Absorb after completing BufferSync().
319 */
321
322 /*
323 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
324 * checkpoint), we want to ignore fsync requests that are entered into the
325 * hashtable after this point --- they should be processed next time,
326 * instead. We use sync_cycle_ctr to tell old entries apart from new
327 * ones: new ones will have cycle_ctr equal to the incremented value of
328 * sync_cycle_ctr.
329 *
330 * In normal circumstances, all entries present in the table at this point
331 * will have cycle_ctr exactly equal to the current (about to be old)
332 * value of sync_cycle_ctr. However, if we fail partway through the
333 * fsync'ing loop, then older values of cycle_ctr might remain when we
334 * come back here to try again. Repeated checkpoint failures would
335 * eventually wrap the counter around to the point where an old entry
336 * might appear new, causing us to skip it, possibly allowing a checkpoint
337 * to succeed that should not have. To forestall wraparound, any time the
338 * previous ProcessSyncRequests() failed to complete, run through the
339 * table and forcibly set cycle_ctr = sync_cycle_ctr.
340 *
341 * Do not merge this loop with the main loop: the problem is exactly
342 * that the main loop may fail before having visited all the entries.
343 * From a performance point of view it doesn't matter anyway, as this path
344 * will never be taken in a system that's functioning normally.
345 */
347 {
348 /* prior try failed, so update any stale cycle_ctr values */
350 while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
351 {
352 entry->cycle_ctr = sync_cycle_ctr;
353 }
354 }
355
356 /* Advance counter so that new hashtable entries are distinguishable */
358
359 /* Set flag to detect failure if we don't reach the end of the loop */
360 sync_in_progress = true;
361
362 /* Now scan the hashtable for fsync requests to process */
365 while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
366 {
367 int failures;
368
369 /*
370 * If the entry is new then don't process it this time; it will be
371 * processed next time instead. Note "continue" bypasses the
372 * hash-remove call at the bottom of the loop.
373 */
374 if (entry->cycle_ctr == sync_cycle_ctr)
375 continue;
376
377 /* Else assert we haven't missed it */
378 Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr);
379
380 /*
381 * If fsync is off then we don't have to bother opening the file at
382 * all. (We delay checking until this point so that changing fsync on
383 * the fly behaves sensibly.)
384 */
385 if (enableFsync)
386 {
387 /*
388 * If in checkpointer, we want to absorb pending requests every so
389 * often to prevent overflow of the fsync request queue. It is
390 * unspecified whether newly-added entries will be visited by
391 * hash_seq_search, but we don't care since we don't need to
392 * process them anyway.
393 */
394 if (--absorb_counter <= 0)
395 {
398 }
399
400 /*
401 * The fsync table could contain requests to fsync segments that
402 * have been deleted (unlinked) by the time we get to them. Rather
403 * than just hoping an ENOENT (or EACCES on Windows) error can be
404 * ignored, what we do on error is absorb pending requests and
405 * then retry. Since mdunlink() queues a "cancel" message before
406 * actually unlinking, the fsync request is guaranteed to be
407 * marked canceled after the absorb if it really was this case.
408 * DROP DATABASE likewise has to tell us to forget fsync requests
409 * before it starts deletions.
410 */
411 for (failures = 0; !entry->canceled; failures++)
412 {
413 char path[MAXPGPATH];
414
416 if (syncsw[entry->tag.handler].sync_syncfiletag(&entry->tag,
417 path) == 0)
418 {
419 /* Success; update statistics about sync timing */
424 if (elapsed > longest)
427 processed++;
428
429 if (log_checkpoints)
430 elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f ms",
431 processed,
432 path,
433 (double) elapsed / 1000);
434
435 break; /* out of retry loop */
436 }
437
438 /*
439 * It is possible that the relation has been dropped or
440 * truncated since the fsync request was entered. Therefore,
441 * allow ENOENT, but only if we didn't fail already on this
442 * file.
443 */
444 if (!FILE_POSSIBLY_DELETED(errno) || failures > 0)
447 errmsg("could not fsync file \"%s\": %m",
448 path)));
449 else
452 errmsg_internal("could not fsync file \"%s\" but retrying: %m",
453 path)));
454
455 /*
456 * Absorb incoming requests and check to see if a cancel
457 * arrived for this relation fork.
458 */
460 absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
461 } /* end retry loop */
462 }
463
464 /* We are done with this entry, remove it */
465 if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL)
466 elog(ERROR, "pendingOps corrupted");
467 } /* end loop over hashtable entries */
468
469 /* Return sync performance metrics for report at checkpoint end */
473
474 /* Flag successful completion of ProcessSyncRequests */
475 sync_in_progress = false;
476}
477
478/*
479 * RememberSyncRequest() -- callback from checkpointer side of sync request
480 *
481 * We stuff fsync requests into the local hash table for execution
482 * during the checkpointer's next checkpoint. UNLINK requests go into a
483 * separate linked list, however, because they get processed separately.
484 *
485 * See sync.h for more information on the types of sync requests supported.
486 */
487void
489{
491
493 {
494 PendingFsyncEntry *entry;
495
496 /* Cancel previously entered request */
498 ftag,
499 HASH_FIND,
500 NULL);
501 if (entry != NULL)
502 entry->canceled = true;
503 }
504 else if (type == SYNC_FILTER_REQUEST)
505 {
508 ListCell *cell;
509
510 /* Cancel matching fsync requests */
512 while ((pfe = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
513 {
514 if (pfe->tag.handler == ftag->handler &&
515 syncsw[ftag->handler].sync_filetagmatches(ftag, &pfe->tag))
516 pfe->canceled = true;
517 }
518
519 /* Cancel matching unlink requests */
520 foreach(cell, pendingUnlinks)
521 {
523
524 if (pue->tag.handler == ftag->handler &&
525 syncsw[ftag->handler].sync_filetagmatches(ftag, &pue->tag))
526 pue->canceled = true;
527 }
528 }
529 else if (type == SYNC_UNLINK_REQUEST)
530 {
531 /* Unlink request: put it in the linked list */
533 PendingUnlinkEntry *entry;
534
536 entry->tag = *ftag;
538 entry->canceled = false;
539
541
543 }
544 else
545 {
546 /* Normal case: enter a request to fsync this segment */
548 PendingFsyncEntry *entry;
549 bool found;
550
552
554 ftag,
556 &found);
557 /* if new entry, or was previously canceled, initialize it */
558 if (!found || entry->canceled)
559 {
560 entry->cycle_ctr = sync_cycle_ctr;
561 entry->canceled = false;
562 }
563
564 /*
565 * NB: it's intentional that we don't change cycle_ctr if the entry
566 * already exists. The cycle_ctr must represent the oldest fsync
567 * request that could be in the entry.
568 */
569
571 }
572}
573
574/*
575 * Register the sync request locally, or forward it to the checkpointer.
576 *
577 * If retryOnError is true, keep retrying while the queue is full. Returns
578 * true on success, or false if the queue was full and we did not retry.
579 */
580bool
582 bool retryOnError)
583{
584 bool ret;
585
586 if (pendingOps != NULL)
587 {
588 /* standalone backend or checkpointer: fsync state is local */
590 return true;
591 }
592
593 for (;;)
594 {
595 /*
596 * Notify the checkpointer about it. If we fail to queue a message in
597 * retryOnError mode, we have to sleep and try again ... ugly, but
598 * hopefully won't happen often.
599 *
600 * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an
601 * error in the case of SYNC_UNLINK_REQUEST would leave the
602 * no-longer-used file still present on disk, which would be bad, so
603 * I'm inclined to assume that the checkpointer will always empty the
604 * queue soon.
605 */
606 ret = ForwardSyncRequest(ftag, type);
607
608 /*
609 * If we are successful in queueing the request, or we failed and were
610 * instructed not to retry on error, break.
611 */
612 if (ret || (!ret && !retryOnError))
613 break;
614
617 }
618
619 return ret;
620}
#define Assert(condition)
Definition c.h:945
uint64_t uint64
Definition c.h:619
uint16_t uint16
Definition c.h:617
bool ForwardSyncRequest(const FileTag *ftag, SyncRequestType type)
void AbsorbSyncRequests(void)
int clogsyncfiletag(const FileTag *ftag, char *path)
Definition clog.c:1105
int committssyncfiletag(const FileTag *ftag, char *path)
Definition commit_ts.c:1029
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition dynahash.c:952
HTAB * hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
Definition dynahash.c:358
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition dynahash.c:1415
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition dynahash.c:1380
int errcode_for_file_access(void)
Definition elog.c:897
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
#define WARNING
Definition elog.h:36
#define DEBUG1
Definition elog.h:30
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
int data_sync_elevel(int elevel)
Definition fd.c:3986
#define FILE_POSSIBLY_DELETED(err)
Definition fd.h:89
#define palloc_object(type)
Definition fe_memutils.h:74
bool enableFsync
Definition globals.c:129
bool IsUnderPostmaster
Definition globals.c:120
@ HASH_FIND
Definition hsearch.h:113
@ HASH_REMOVE
Definition hsearch.h:115
@ HASH_ENTER
Definition hsearch.h:114
#define HASH_CONTEXT
Definition hsearch.h:102
#define HASH_ELEM
Definition hsearch.h:95
#define HASH_BLOBS
Definition hsearch.h:97
#define INSTR_TIME_SET_CURRENT(t)
Definition instr_time.h:122
#define INSTR_TIME_SUBTRACT(x, y)
Definition instr_time.h:177
#define INSTR_TIME_GET_MICROSEC(t)
Definition instr_time.h:192
int i
Definition isn.c:77
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition latch.c:172
List * lappend(List *list, void *datum)
Definition list.c:339
List * list_delete_first_n(List *list, int n)
Definition list.c:983
void list_free_deep(List *list)
Definition list.c:1560
void pfree(void *pointer)
Definition mcxt.c:1616
MemoryContext TopMemoryContext
Definition mcxt.c:166
void MemoryContextAllowInCriticalSection(MemoryContext context, bool allow)
Definition mcxt.c:743
bool mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
Definition md.c:1975
int mdunlinkfiletag(const FileTag *ftag, char *path)
Definition md.c:1957
int mdsyncfiletag(const FileTag *ftag, char *path)
Definition md.c:1905
#define AllocSetContextCreate
Definition memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition memutils.h:160
#define AmCheckpointerProcess()
Definition miscadmin.h:389
int multixactoffsetssyncfiletag(const FileTag *ftag, char *path)
Definition multixact.c:2996
int multixactmemberssyncfiletag(const FileTag *ftag, char *path)
Definition multixact.c:3005
static char * errmsg
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition palloc.h:124
#define MAXPGPATH
#define lfirst(lc)
Definition pg_list.h:172
#define NIL
Definition pg_list.h:68
static void * list_nth(const List *list, int n)
Definition pg_list.h:299
static int list_cell_number(const List *l, const ListCell *c)
Definition pg_list.h:333
static int fb(int x)
static chr * longest(struct vars *v, struct dfa *d, chr *start, chr *stop, int *hitstopp)
Definition rege_dfa.c:42
uint64 ckpt_agg_sync_time
Definition xlog.h:187
uint64 ckpt_longest_sync
Definition xlog.h:186
Definition sync.h:51
int16 handler
Definition sync.h:52
Definition pg_list.h:54
FileTag tag
Definition sync.c:59
CycleCtr cycle_ctr
Definition sync.c:60
bool canceled
Definition sync.c:61
FileTag tag
Definition sync.c:66
CycleCtr cycle_ctr
Definition sync.c:67
Definition sync.c:86
int(* sync_syncfiletag)(const FileTag *ftag, char *path)
Definition sync.c:87
bool(* sync_filetagmatches)(const FileTag *ftag, const FileTag *candidate)
Definition sync.c:89
int(* sync_unlinkfiletag)(const FileTag *ftag, char *path)
Definition sync.c:88
void ProcessSyncRequests(void)
Definition sync.c:287
static CycleCtr checkpoint_cycle_ctr
Definition sync.c:76
void SyncPreCheckpoint(void)
Definition sync.c:178
static List * pendingUnlinks
Definition sync.c:72
static HTAB * pendingOps
Definition sync.c:71
#define UNLINKS_PER_ABSORB
Definition sync.c:80
void InitSync(void)
Definition sync.c:125
static const SyncOps syncsw[]
Definition sync.c:96
static MemoryContext pendingOpsCxt
Definition sync.c:73
void RememberSyncRequest(const FileTag *ftag, SyncRequestType type)
Definition sync.c:488
static CycleCtr sync_cycle_ctr
Definition sync.c:75
#define FSYNCS_PER_ABSORB
Definition sync.c:79
void SyncPostCheckpoint(void)
Definition sync.c:203
bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, bool retryOnError)
Definition sync.c:581
uint16 CycleCtr
Definition sync.c:55
@ SYNC_HANDLER_MD
Definition sync.h:37
@ SYNC_HANDLER_COMMIT_TS
Definition sync.h:39
@ SYNC_HANDLER_MULTIXACT_MEMBER
Definition sync.h:41
@ SYNC_HANDLER_CLOG
Definition sync.h:38
@ SYNC_HANDLER_MULTIXACT_OFFSET
Definition sync.h:40
SyncRequestType
Definition sync.h:24
@ SYNC_FILTER_REQUEST
Definition sync.h:28
@ SYNC_FORGET_REQUEST
Definition sync.h:27
@ SYNC_UNLINK_REQUEST
Definition sync.h:26
@ SYNC_REQUEST
Definition sync.h:25
const char * type
#define WL_TIMEOUT
#define WL_EXIT_ON_PM_DEATH
bool log_checkpoints
Definition xlog.c:133
CheckpointStatsData CheckpointStats
Definition xlog.c:213