#include "storage/relfilelocator.h"

Include dependency graph for sync.h:

This graph shows which files directly or indirectly include this file:

Data Structures
struct	FileTag

Typedefs
typedef enum SyncRequestType	SyncRequestType

typedef enum SyncRequestHandler	SyncRequestHandler

typedef struct FileTag	FileTag

Enumerations
enum	SyncRequestType { SYNC_REQUEST , SYNC_UNLINK_REQUEST , SYNC_FORGET_REQUEST , SYNC_FILTER_REQUEST }

enum	SyncRequestHandler { SYNC_HANDLER_MD = 0 , SYNC_HANDLER_CLOG , SYNC_HANDLER_COMMIT_TS , SYNC_HANDLER_MULTIXACT_OFFSET , SYNC_HANDLER_MULTIXACT_MEMBER , SYNC_HANDLER_NONE }

Functions
void	InitSync (void)

void	SyncPreCheckpoint (void)

void	SyncPostCheckpoint (void)

void	ProcessSyncRequests (void)

void	RememberSyncRequest (const FileTag *ftag, SyncRequestType type)

bool	RegisterSyncRequest (const FileTag *ftag, SyncRequestType type, bool retryOnError)

Typedef Documentation

◆ FileTag

typedef struct FileTag FileTag

◆ SyncRequestHandler

typedef enum SyncRequestHandler SyncRequestHandler

◆ SyncRequestType

typedef enum SyncRequestType SyncRequestType

Enumeration Type Documentation

◆ SyncRequestHandler

enum SyncRequestHandler

Enumerator
SYNC_HANDLER_MD
SYNC_HANDLER_CLOG
SYNC_HANDLER_COMMIT_TS
SYNC_HANDLER_MULTIXACT_OFFSET
SYNC_HANDLER_MULTIXACT_MEMBER
SYNC_HANDLER_NONE

Definition at line 35 of file sync.h.

{
    /*
     * Identifies the handler responsible for a sync request.
     *
     * The first member is pinned to 0.  The function bodies on this page
     * index the syncsw[] table with a tag's "handler" field (for example
     * syncsw[entry->tag.handler]).
     *
     * NOTE(review): presumably that field holds one of these values, so the
     * member order would have to match the order of syncsw[]'s entries --
     * confirm against sync.c before reordering or inserting members.
     */
    SYNC_HANDLER_MD = 0,
    SYNC_HANDLER_CLOG,
    SYNC_HANDLER_COMMIT_TS,
    SYNC_HANDLER_MULTIXACT_OFFSET,
    SYNC_HANDLER_MULTIXACT_MEMBER,
    SYNC_HANDLER_NONE,
} SyncRequestHandler;

◆ SyncRequestType

enum SyncRequestType

Enumerator
SYNC_REQUEST
SYNC_UNLINK_REQUEST
SYNC_FORGET_REQUEST
SYNC_FILTER_REQUEST

Definition at line 23 of file sync.h.

{
    /*
     * Kinds of request that can be passed to RegisterSyncRequest() and
     * RememberSyncRequest().
     */
    SYNC_REQUEST,               /* schedule a call of sync function */
    SYNC_UNLINK_REQUEST,        /* schedule a call of unlink function */
    SYNC_FORGET_REQUEST,        /* cancel the pending sync entry for a tag */
    SYNC_FILTER_REQUEST,        /* cancel all pending sync and unlink entries
                                 * satisfying the handler's match function */
} SyncRequestType;

Function Documentation

◆ InitSync()

void InitSync ( void )

Definition at line 124 of file sync.c.

{
    HASHCTL     ctl;

    /*
     * The pending-operations hashtable is only needed when running
     * standalone (not under a postmaster) or in the checkpointer auxiliary
     * process.  Any other process has nothing to set up here.
     */
    if (IsUnderPostmaster && !AmCheckpointerProcess())
        return;

    /*
     * XXX: The checkpointer adds entries to the pending ops table while
     * absorbing fsync requests, and it does so inside a critical section.
     * Allocating there isn't usually allowed, but we make an exception for
     * this context.  The price is a theoretical possibility of running out
     * of memory while absorbing fsync requests, which leads to a PANIC;
     * since the hash table is small, that's unlikely to happen in practice.
     */
    pendingOpsCxt = AllocSetContextCreate(TopMemoryContext,
                                          "Pending ops context",
                                          ALLOCSET_DEFAULT_SIZES);
    MemoryContextAllowInCriticalSection(pendingOpsCxt, true);

    /* Entries are keyed by FileTag and live in the dedicated context */
    ctl.hcxt = pendingOpsCxt;
    ctl.entrysize = sizeof(PendingFsyncEntry);
    ctl.keysize = sizeof(FileTag);

    pendingOps = hash_create("Pending Ops Table",
                             100L,
                             &ctl,
                             HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);

    /* The unlink queue starts out empty */
    pendingUnlinks = NIL;
}

References ALLOCSET_DEFAULT_SIZES, AllocSetContextCreate, AmCheckpointerProcess, HASHCTL::entrysize, HASH_BLOBS, HASH_CONTEXT, hash_create(), HASH_ELEM, HASHCTL::hcxt, IsUnderPostmaster, HASHCTL::keysize, MemoryContextAllowInCriticalSection(), NIL, pendingOps, pendingOpsCxt, pendingUnlinks, and TopMemoryContext.

Referenced by BaseInit().

◆ ProcessSyncRequests()

void ProcessSyncRequests ( void )

Definition at line 286 of file sync.c.

{
    /*
     * Process the fsync requests currently held in pendingOps.
     *
     * Outline: absorb any queued requests, advance sync_cycle_ctr, then for
     * each non-canceled entry that predates this cycle call its handler's
     * sync_syncfiletag function (skipped when enableFsync is off) and remove
     * the entry from the table.  Timing statistics for the successful syncs
     * are stored in CheckpointStats before returning.
     *
     * sync_in_progress persists across calls (it is static); it is still
     * true on entry only if a previous call did not reach the end of this
     * function.
     */
    static bool sync_in_progress = false;
 
    HASH_SEQ_STATUS hstat;
    PendingFsyncEntry *entry;
    int         absorb_counter;
 
    /* Statistics on sync times */
    int         processed = 0;
    instr_time  sync_start,
                sync_end,
                sync_diff;
    uint64      elapsed;
    uint64      longest = 0;
    uint64      total_elapsed = 0;
 
    /*
     * This is only called during checkpoints, and checkpoints should only
     * occur in processes that have created a pendingOps.
     */
    if (!pendingOps)
        elog(ERROR, "cannot sync without a pendingOps table");
 
    /*
     * If we are in the checkpointer, the sync had better include all fsync
     * requests that were queued by backends up to this point.  The tightest
     * race condition that could occur is that a buffer that must be written
     * and fsync'd for the checkpoint could have been dumped by a backend just
     * before it was visited by BufferSync().  We know the backend will have
     * queued an fsync request before clearing the buffer's dirtybit, so we
     * are safe as long as we do an Absorb after completing BufferSync().
     */
    AbsorbSyncRequests();
 
    /*
     * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
     * checkpoint), we want to ignore fsync requests that are entered into the
     * hashtable after this point --- they should be processed next time,
     * instead.  We use sync_cycle_ctr to tell old entries apart from new
     * ones: new ones will have cycle_ctr equal to the incremented value of
     * sync_cycle_ctr.
     *
     * In normal circumstances, all entries present in the table at this point
     * will have cycle_ctr exactly equal to the current (about to be old)
     * value of sync_cycle_ctr.  However, if we fail partway through the
     * fsync'ing loop, then older values of cycle_ctr might remain when we
     * come back here to try again.  Repeated checkpoint failures would
     * eventually wrap the counter around to the point where an old entry
     * might appear new, causing us to skip it, possibly allowing a checkpoint
     * to succeed that should not have.  To forestall wraparound, any time the
     * previous ProcessSyncRequests() failed to complete, run through the
     * table and forcibly set cycle_ctr = sync_cycle_ctr.
     *
     * Do not merge this loop with the main loop: the problem is exactly that
     * the main loop may fail before having visited all the entries.  From a
     * performance point of view it doesn't matter anyway, as this path will
     * never be taken in a system that's functioning normally.
     */
    if (sync_in_progress)
    {
        /* prior try failed, so update any stale cycle_ctr values */
        hash_seq_init(&hstat, pendingOps);
        while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
        {
            entry->cycle_ctr = sync_cycle_ctr;
        }
    }
 
    /* Advance counter so that new hashtable entries are distinguishable */
    sync_cycle_ctr++;
 
    /* Set flag to detect failure if we don't reach the end of the loop */
    sync_in_progress = true;
 
    /* Now scan the hashtable for fsync requests to process */
    absorb_counter = FSYNCS_PER_ABSORB;
    hash_seq_init(&hstat, pendingOps);
    while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
    {
        int         failures;
 
        /*
         * If the entry was added during the current cycle (its cycle_ctr
         * equals the just-incremented sync_cycle_ctr), leave it for the next
         * call.  Note "continue" bypasses the hash-remove call at the bottom
         * of the loop, so the entry stays in the table.
         */
        if (entry->cycle_ctr == sync_cycle_ctr)
            continue;
 
        /* Else assert it belongs to the immediately preceding cycle */
        Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr);
 
        /*
         * If fsync is off then we don't have to bother opening the file at
         * all.  (We delay checking until this point so that changing fsync on
         * the fly behaves sensibly.)
         */
        if (enableFsync)
        {
            /*
             * If in checkpointer, we want to absorb pending requests every so
             * often to prevent overflow of the fsync request queue.  It is
             * unspecified whether newly-added entries will be visited by
             * hash_seq_search, but we don't care since we don't need to
             * process them anyway.
             */
            if (--absorb_counter <= 0)
            {
                AbsorbSyncRequests();
                absorb_counter = FSYNCS_PER_ABSORB;
            }
 
            /*
             * The fsync table could contain requests to fsync segments that
             * have been deleted (unlinked) by the time we get to them. Rather
             * than just hoping an ENOENT (or EACCES on Windows) error can be
             * ignored, what we do on error is absorb pending requests and
             * then retry. Since mdunlink() queues a "cancel" message before
             * actually unlinking, the fsync request is guaranteed to be
             * marked canceled after the absorb if it really was this case.
             * DROP DATABASE likewise has to tell us to forget fsync requests
             * before it starts deletions.
             *
             * The loop therefore ends in one of three ways: the sync
             * succeeds (break), the entry is found canceled when the loop
             * condition is rechecked, or the ereport below reports the
             * failure at the level chosen by data_sync_elevel(ERROR).
             */
            for (failures = 0; !entry->canceled; failures++)
            {
                char        path[MAXPGPATH];
 
                INSTR_TIME_SET_CURRENT(sync_start);
                if (syncsw[entry->tag.handler].sync_syncfiletag(&entry->tag,
                                                                path) == 0)
                {
                    /* Success; update statistics about sync timing */
                    INSTR_TIME_SET_CURRENT(sync_end);
                    sync_diff = sync_end;
                    INSTR_TIME_SUBTRACT(sync_diff, sync_start);
                    elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
                    if (elapsed > longest)
                        longest = elapsed;
                    total_elapsed += elapsed;
                    processed++;
 
                    /* elapsed is in microseconds; report milliseconds */
                    if (log_checkpoints)
                        elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f ms",
                             processed,
                             path,
                             (double) elapsed / 1000);
 
                    break;      /* out of retry loop */
                }
 
                /*
                 * It is possible that the relation has been dropped or
                 * truncated since the fsync request was entered. Therefore,
                 * allow ENOENT, but only if we didn't fail already on this
                 * file.
                 */
                if (!FILE_POSSIBLY_DELETED(errno) || failures > 0)
                    ereport(data_sync_elevel(ERROR),
                            (errcode_for_file_access(),
                             errmsg("could not fsync file \"%s\": %m",
                                    path)));
                else
                    ereport(DEBUG1,
                            (errcode_for_file_access(),
                             errmsg_internal("could not fsync file \"%s\" but retrying: %m",
                                             path)));
 
                /*
                 * Absorb incoming requests and check to see if a cancel
                 * arrived for this relation fork.
                 */
                AbsorbSyncRequests();
                absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
            }                   /* end retry loop */
        }
 
        /* We are done with this entry, remove it */
        if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL)
            elog(ERROR, "pendingOps corrupted");
    }                           /* end loop over hashtable entries */
 
    /* Return sync performance metrics for report at checkpoint end */
    CheckpointStats.ckpt_sync_rels = processed;
    CheckpointStats.ckpt_longest_sync = longest;
    CheckpointStats.ckpt_agg_sync_time = total_elapsed;
 
    /* Flag successful completion of ProcessSyncRequests */
    sync_in_progress = false;
}

References AbsorbSyncRequests(), Assert(), PendingFsyncEntry::canceled, CheckpointStats, CheckpointStatsData::ckpt_agg_sync_time, CheckpointStatsData::ckpt_longest_sync, CheckpointStatsData::ckpt_sync_rels, PendingFsyncEntry::cycle_ctr, data_sync_elevel(), DEBUG1, elog, enableFsync, ereport, errcode_for_file_access(), errmsg(), errmsg_internal(), ERROR, FILE_POSSIBLY_DELETED, FSYNCS_PER_ABSORB, FileTag::handler, HASH_REMOVE, hash_search(), hash_seq_init(), hash_seq_search(), INSTR_TIME_GET_MICROSEC, INSTR_TIME_SET_CURRENT, INSTR_TIME_SUBTRACT, log_checkpoints, longest(), MAXPGPATH, pendingOps, sync_cycle_ctr, SyncOps::sync_syncfiletag, syncsw, and PendingFsyncEntry::tag.

Referenced by CheckPointGuts().

◆ RegisterSyncRequest()

bool RegisterSyncRequest	(	const FileTag *	ftag,
		SyncRequestType	type,
		bool	retryOnError
	)

Definition at line 580 of file sync.c.

{
    bool        queued;

    /*
     * A process that owns a pendingOps table keeps its fsync state locally,
     * so the request is recorded directly and always succeeds.
     */
    if (pendingOps != NULL)
    {
        RememberSyncRequest(ftag, type);
        return true;
    }

    /*
     * Otherwise hand the request to the checkpointer.  When the request
     * cannot be queued and the caller asked for retryOnError, sleep briefly
     * and try again ... ugly, but hopefully it won't happen often.  Without
     * retryOnError the first failure is returned to the caller as-is.
     *
     * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
     * error in the case of SYNC_UNLINK_REQUEST would leave the
     * no-longer-used file still present on disk, which would be bad, so
     * I'm inclined to assume that the checkpointer will always empty the
     * queue soon.
     */
    while (!(queued = ForwardSyncRequest(ftag, type)) && retryOnError)
        WaitLatch(NULL, WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, 10,
                  WAIT_EVENT_REGISTER_SYNC_REQUEST);

    return queued;
}

References ForwardSyncRequest(), pendingOps, RememberSyncRequest(), type, WaitLatch(), WL_EXIT_ON_PM_DEATH, and WL_TIMEOUT.

Referenced by ForgetDatabaseSyncRequests(), register_dirty_segment(), register_forget_request(), register_unlink_segment(), SlruInternalDeleteSegment(), and SlruPhysicalWritePage().

◆ RememberSyncRequest()

void RememberSyncRequest	(	const FileTag *	ftag,
		SyncRequestType	type
	)

Definition at line 487 of file sync.c.

{
    /* Callers must only get here in a process that owns a pendingOps table */
    Assert(pendingOps);

    switch (type)
    {
        case SYNC_FORGET_REQUEST:
            {
                /* Mark the previously entered request for this tag canceled */
                PendingFsyncEntry *pending;

                pending = (PendingFsyncEntry *) hash_search(pendingOps,
                                                            ftag,
                                                            HASH_FIND,
                                                            NULL);
                if (pending != NULL)
                    pending->canceled = true;
                break;
            }

        case SYNC_FILTER_REQUEST:
            {
                HASH_SEQ_STATUS scan;
                PendingFsyncEntry *fsync_entry;
                ListCell   *lc;

                /* First pass: cancel every matching fsync request */
                hash_seq_init(&scan, pendingOps);
                while ((fsync_entry = (PendingFsyncEntry *) hash_seq_search(&scan)) != NULL)
                {
                    if (fsync_entry->tag.handler == ftag->handler &&
                        syncsw[ftag->handler].sync_filetagmatches(ftag, &fsync_entry->tag))
                        fsync_entry->canceled = true;
                }

                /* Second pass: cancel every matching unlink request */
                foreach(lc, pendingUnlinks)
                {
                    PendingUnlinkEntry *unlink_entry = (PendingUnlinkEntry *) lfirst(lc);

                    if (unlink_entry->tag.handler == ftag->handler &&
                        syncsw[ftag->handler].sync_filetagmatches(ftag, &unlink_entry->tag))
                        unlink_entry->canceled = true;
                }
                break;
            }

        case SYNC_UNLINK_REQUEST:
            {
                /* Queue the unlink at the tail of the pending-unlinks list */
                MemoryContext saved_cxt = MemoryContextSwitchTo(pendingOpsCxt);
                PendingUnlinkEntry *unlink_entry;

                unlink_entry = palloc(sizeof(PendingUnlinkEntry));
                unlink_entry->tag = *ftag;
                unlink_entry->cycle_ctr = checkpoint_cycle_ctr;
                unlink_entry->canceled = false;

                pendingUnlinks = lappend(pendingUnlinks, unlink_entry);

                MemoryContextSwitchTo(saved_cxt);
                break;
            }

        default:
            {
                /* Ordinary request: note that this file needs an fsync */
                MemoryContext saved_cxt = MemoryContextSwitchTo(pendingOpsCxt);
                PendingFsyncEntry *pending;
                bool        existed;

                Assert(type == SYNC_REQUEST);

                pending = (PendingFsyncEntry *) hash_search(pendingOps,
                                                            ftag,
                                                            HASH_ENTER,
                                                            &existed);

                /*
                 * Only a brand-new entry, or one that had been canceled, gets
                 * (re)initialized.  An existing live entry deliberately keeps
                 * its cycle_ctr, because that value must represent the oldest
                 * fsync request that could be in the entry.
                 */
                if (!existed || pending->canceled)
                {
                    pending->cycle_ctr = sync_cycle_ctr;
                    pending->canceled = false;
                }

                MemoryContextSwitchTo(saved_cxt);
                break;
            }
    }
}

References Assert(), PendingFsyncEntry::canceled, PendingUnlinkEntry::canceled, checkpoint_cycle_ctr, PendingFsyncEntry::cycle_ctr, PendingUnlinkEntry::cycle_ctr, FileTag::handler, HASH_ENTER, HASH_FIND, hash_search(), hash_seq_init(), hash_seq_search(), lappend(), lfirst, MemoryContextSwitchTo(), palloc(), pendingOps, pendingOpsCxt, pendingUnlinks, sync_cycle_ctr, SyncOps::sync_filetagmatches, SYNC_FILTER_REQUEST, SYNC_FORGET_REQUEST, SYNC_REQUEST, SYNC_UNLINK_REQUEST, syncsw, PendingFsyncEntry::tag, PendingUnlinkEntry::tag, and type.

Referenced by AbsorbSyncRequests(), and RegisterSyncRequest().

◆ SyncPostCheckpoint()

void SyncPostCheckpoint ( void )

Definition at line 202 of file sync.c.

{
    ListCell   *cell;
    int         until_absorb = UNLINKS_PER_ABSORB;

    foreach(cell, pendingUnlinks)
    {
        PendingUnlinkEntry *pending = (PendingUnlinkEntry *) lfirst(cell);
        char        path[MAXPGPATH];

        /* Canceled entries need no work */
        if (pending->canceled)
            continue;

        /*
         * The list is append-only, so the first entry stamped with the
         * current checkpoint cycle marks the end of the old entries.
         *
         * Note: if just the right number of consecutive checkpoints fail,
         * cycle_ctr wraparound could fool this test.  The only consequence
         * is that unlinking is delayed for one more checkpoint, which is
         * perfectly tolerable.
         */
        if (pending->cycle_ctr == checkpoint_cycle_ctr)
            break;

        /*
         * Unlink the file.  ENOENT is tolerated silently: a concurrent DROP
         * DATABASE may have deleted the file before we got to it.  (rmtree()
         * likewise has to ignore ENOENT, in case we delete the file first.)
         * Any other failure is only worth a WARNING.
         */
        if (syncsw[pending->tag.handler].sync_unlinkfiletag(&pending->tag,
                                                            path) < 0 &&
            errno != ENOENT)
            ereport(WARNING,
                    (errcode_for_file_access(),
                     errmsg("could not remove file \"%s\": %m", path)));

        /* Mark the list entry as canceled, just in case */
        pending->canceled = true;

        /*
         * As in ProcessSyncRequests, don't go too long without absorbing
         * fsync requests when there are many deletions to be done.  It is
         * safe to call AbsorbSyncRequests() at this point in the loop.
         */
        until_absorb--;
        if (until_absorb <= 0)
        {
            AbsorbSyncRequests();
            until_absorb = UNLINKS_PER_ABSORB;
        }
    }

    if (cell != NULL)
    {
        /*
         * We stopped early at "cell": free and drop only the entries ahead
         * of it, keeping the entries at or after "cell" for a later
         * checkpoint.
         */
        int         nprocessed = list_cell_number(pendingUnlinks, cell);
        int         i = 0;

        while (i < nprocessed)
        {
            pfree(list_nth(pendingUnlinks, i));
            i++;
        }

        pendingUnlinks = list_delete_first_n(pendingUnlinks, nprocessed);
    }
    else
    {
        /*
         * The whole list was consumed, so discard it entirely, including the
         * PendingUnlinkEntry objects it points to.
         */
        list_free_deep(pendingUnlinks);
        pendingUnlinks = NIL;
    }
}

References AbsorbSyncRequests(), PendingUnlinkEntry::canceled, checkpoint_cycle_ctr, PendingUnlinkEntry::cycle_ctr, ereport, errcode_for_file_access(), errmsg(), FileTag::handler, i, lfirst, list_cell_number(), list_delete_first_n(), list_free_deep(), list_nth(), MAXPGPATH, NIL, pendingUnlinks, pfree(), SyncOps::sync_unlinkfiletag, syncsw, PendingUnlinkEntry::tag, UNLINKS_PER_ABSORB, and WARNING.

Referenced by CreateCheckPoint().

◆ SyncPreCheckpoint()

void SyncPreCheckpoint ( void )

Definition at line 177 of file sync.c.

{
    /*
     * Absorb first, bump the counter second -- the order matters.
     *
     * Operations such as DROP TABLESPACE rely on the next checkpoint
     * processing every unlink request forwarded so far.  A request that is
     * still unabsorbed when the cycle counter advances would be stamped with
     * the new cycle and put off until a future checkpoint, so pull in
     * everything forwarded before this checkpoint began.
     */
    AbsorbSyncRequests();

    /*
     * From here on, newly arriving unlink requests are assigned the next
     * cycle counter and won't be unlinked until the next checkpoint.
     */
    ++checkpoint_cycle_ctr;
}

References AbsorbSyncRequests(), and checkpoint_cycle_ctr.

Referenced by CreateCheckPoint().

Data Structures

Typedefs

Enumerations

Functions

Typedef Documentation

◆ FileTag

◆ SyncRequestHandler

◆ SyncRequestType

Enumeration Type Documentation

◆ SyncRequestHandler

◆ SyncRequestType

Function Documentation

◆ InitSync()

◆ ProcessSyncRequests()

◆ RegisterSyncRequest()

◆ RememberSyncRequest()

◆ SyncPostCheckpoint()

◆ SyncPreCheckpoint()