#include "postgres.h"
#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>
#include "access/xlogutils.h"
#include "commands/tablespace.h"
#include "common/file_utils.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "storage/aio.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/md.h"
#include "storage/relfilelocator.h"
#include "storage/smgr.h"
#include "storage/sync.h"
#include "utils/memutils.h"

Include dependency graph for md.c:

Data Structures
struct	_MdfdVec

struct	MdPathStr

Macros
#define	INIT_MD_FILETAG(a, xx_rlocator, xx_forknum, xx_segno)

#define	EXTENSION_FAIL (1 << 0)

#define	EXTENSION_RETURN_NULL (1 << 1)

#define	EXTENSION_CREATE (1 << 2)

#define	EXTENSION_CREATE_RECOVERY (1 << 3)

#define	EXTENSION_DONT_OPEN (1 << 5)

#define	SEGMENT_CHARS OIDCHARS

#define	MD_PATH_STR_MAXLEN

Typedefs
typedef struct _MdfdVec	MdfdVec

typedef struct MdPathStr	MdPathStr

Functions
static void	mdunlinkfork (RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)

static MdfdVec *	mdopenfork (SMgrRelation reln, ForkNumber forknum, int behavior)

static void	register_dirty_segment (SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)

static void	register_unlink_segment (RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)

static void	register_forget_request (RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)

static void	_fdvec_resize (SMgrRelation reln, ForkNumber forknum, int nseg)

static MdPathStr	_mdfd_segpath (SMgrRelation reln, ForkNumber forknum, BlockNumber segno)

static MdfdVec *	_mdfd_openseg (SMgrRelation reln, ForkNumber forknum, BlockNumber segno, int oflags)

static MdfdVec *	_mdfd_getseg (SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior)

static BlockNumber	_mdnblocks (SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)

static PgAioResult	md_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)

static void	md_readv_report (PgAioResult result, const PgAioTargetData *td, int elevel)

static int	_mdfd_open_flags (void)

void	mdinit (void)

bool	mdexists (SMgrRelation reln, ForkNumber forknum)

void	mdcreate (SMgrRelation reln, ForkNumber forknum, bool isRedo)

void	mdunlink (RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)

static int	do_truncate (const char *path)

void	mdextend (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)

void	mdzeroextend (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)

void	mdopen (SMgrRelation reln)

void	mdclose (SMgrRelation reln, ForkNumber forknum)

bool	mdprefetch (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)

static int	buffers_to_iovec (struct iovec *iov, void **buffers, int nblocks)

uint32	mdmaxcombine (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)

void	mdreadv (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)

void	mdstartreadv (PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)

void	mdwritev (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync)

void	mdwriteback (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)

BlockNumber	mdnblocks (SMgrRelation reln, ForkNumber forknum)

void	mdtruncate (SMgrRelation reln, ForkNumber forknum, BlockNumber curnblk, BlockNumber nblocks)

void	mdregistersync (SMgrRelation reln, ForkNumber forknum)

void	mdimmedsync (SMgrRelation reln, ForkNumber forknum)

int	mdfd (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)

void	ForgetDatabaseSyncRequests (Oid dbid)

void	DropRelationFiles (RelFileLocator *delrels, int ndelrels, bool isRedo)

int	mdsyncfiletag (const FileTag *ftag, char *path)

int	mdunlinkfiletag (const FileTag *ftag, char *path)

bool	mdfiletagmatches (const FileTag *ftag, const FileTag *candidate)

Variables
static MemoryContext	MdCxt

const PgAioHandleCallbacks	aio_md_readv_cb

Macro Definition Documentation

◆ EXTENSION_CREATE

#define EXTENSION_CREATE (1 << 2)

Definition at line 107 of file md.c.

◆ EXTENSION_CREATE_RECOVERY

#define EXTENSION_CREATE_RECOVERY (1 << 3)

Definition at line 109 of file md.c.

◆ EXTENSION_DONT_OPEN

#define EXTENSION_DONT_OPEN (1 << 5)

Definition at line 111 of file md.c.

◆ EXTENSION_FAIL

#define EXTENSION_FAIL (1 << 0)

Definition at line 103 of file md.c.

◆ EXTENSION_RETURN_NULL

#define EXTENSION_RETURN_NULL (1 << 1)

Definition at line 105 of file md.c.

◆ INIT_MD_FILETAG

#define INIT_MD_FILETAG	(	a,
		xx_rlocator,
		xx_forknum,
		xx_segno
	)

Value:

( \
    memset(&(a), 0, sizeof(FileTag)), \
    (a).handler = SYNC_HANDLER_MD, \
    (a).rlocator = (xx_rlocator), \
    (a).forknum = (xx_forknum), \
    (a).segno = (xx_segno) \
)

Definition at line 91 of file md.c.

◆ MD_PATH_STR_MAXLEN

#define MD_PATH_STR_MAXLEN

Value:

    (\
        REL_PATH_STR_MAXLEN \
        + sizeof((char)'.') \
        + SEGMENT_CHARS \
    )

Definition at line 122 of file md.c.

◆ SEGMENT_CHARS

#define SEGMENT_CHARS OIDCHARS

Definition at line 121 of file md.c.

Typedef Documentation

◆ MdfdVec

typedef struct _MdfdVec MdfdVec

◆ MdPathStr

typedef struct MdPathStr MdPathStr

Function Documentation

◆ _fdvec_resize()

static void _fdvec_resize	(	SMgrRelation	reln,
		ForkNumber	forknum,
		int	nseg
	)

static

Definition at line 1619 of file md.c.

{
    int         cur_nseg = reln->md_num_open_segs[forknum];

    /*
     * Make the per-fork array of open-segment entries able to hold 'nseg'
     * entries, and record 'nseg' as the new number of open segments.  No
     * file is opened or closed here; callers manage the entries themselves.
     */
    if (nseg == 0)
    {
        /* Nothing remains open: release the array, if one was allocated. */
        if (cur_nseg > 0)
        {
            pfree(reln->md_seg_fds[forknum]);
            reln->md_seg_fds[forknum] = NULL;
        }
    }
    else if (cur_nseg == 0)
    {
        /* First entries for this fork: allocate a fresh array in MdCxt. */
        reln->md_seg_fds[forknum] =
            MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
    }
    else if (cur_nseg < nseg)
    {
        /*
         * Growing.  Amortizing repalloc() calls is not considered worth the
         * extra code: they are far cheaper than PathNameOpenFile() or
         * FileClose(), and the memory context can sometimes satisfy the
         * request without really reallocating.
         */
        reln->md_seg_fds[forknum] =
            repalloc(reln->md_seg_fds[forknum],
                     sizeof(MdfdVec) * nseg);
    }

    /*
     * In the remaining case (shrinking to a nonzero size, or no change) the
     * array is deliberately left alone: mdtruncate() must be able to promise
     * that it allocates no memory so that it may run inside a critical
     * section.  The unused tail is wasted until the next time a segment is
     * added and the array is reallocated.
     */
    reln->md_num_open_segs[forknum] = nseg;
}

References SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, MdCxt, MemoryContextAlloc(), pfree(), and repalloc().

Referenced by _mdfd_openseg(), mdclose(), mdcreate(), mdimmedsync(), mdopenfork(), mdregistersync(), and mdtruncate().

◆ _mdfd_getseg()

static MdfdVec * _mdfd_getseg	(	SMgrRelation	reln,
		ForkNumber	forknum,
		BlockNumber	blkno,
		bool	skipFsync,
		int	behavior
	)

static

Definition at line 1730 of file md.c.

{
    /*
     * Find the open-segment entry that holds block 'blkno' of the given fork,
     * opening intermediate segments one at a time as needed.
     *
     * 'behavior' is a bitmask of EXTENSION_* flags that says what to do when
     * the wanted segment is not already open or does not exist:
     *   - EXTENSION_DONT_OPEN: return NULL unless the segment is already open.
     *   - EXTENSION_CREATE (or EXTENSION_CREATE_RECOVERY while InRecovery):
     *     create missing segments, zero-padding short predecessors.
     *   - EXTENSION_RETURN_NULL: return NULL instead of raising an error.
     *   - otherwise: report failures with ereport(ERROR).
     *
     * 'skipFsync' is used only when padding a short segment; it is passed
     * through to mdextend().
     *
     * Returns a pointer into reln->md_seg_fds[forknum], or NULL in the cases
     * listed above.
     */
    MdfdVec    *v;
    BlockNumber targetseg;
    BlockNumber nextsegno;
 
    /* some way to handle non-existent segments needs to be specified */
    Assert(behavior &
           (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL |
            EXTENSION_DONT_OPEN));
 
    /* number of the segment that contains the requested block */
    targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
 
    /* if an existing and opened segment, we're done */
    if (targetseg < reln->md_num_open_segs[forknum])
    {
        v = &reln->md_seg_fds[forknum][targetseg];
        return v;
    }
 
    /* The caller only wants the segment if we already had it open. */
    if (behavior & EXTENSION_DONT_OPEN)
        return NULL;
 
    /*
     * The target segment is not yet open. Iterate over all the segments
     * between the last opened and the target segment. This way missing
     * segments either raise an error, or get created (according to
     * 'behavior'). Start with either the last opened, or the first segment if
     * none was opened before.
     */
    if (reln->md_num_open_segs[forknum] > 0)
        v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
    else
    {
        v = mdopenfork(reln, forknum, behavior);
        if (!v)
            return NULL;        /* if behavior & EXTENSION_RETURN_NULL */
    }
 
    /*
     * Each iteration inspects the segment currently in 'v' (the predecessor)
     * and then opens segment 'nextsegno', which becomes the new 'v'.
     */
    for (nextsegno = reln->md_num_open_segs[forknum];
         nextsegno <= targetseg; nextsegno++)
    {
        BlockNumber nblocks = _mdnblocks(reln, forknum, v);
        int         flags = 0;
 
        /* 'v' must be the segment immediately before the one we open next */
        Assert(nextsegno == v->mdfd_segno + 1);
 
        if (nblocks > ((BlockNumber) RELSEG_SIZE))
            elog(FATAL, "segment too big");
 
        if ((behavior & EXTENSION_CREATE) ||
            (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
        {
            /*
             * Normally we will create new segments only if authorized by the
             * caller (i.e., we are doing mdextend()).  But when doing WAL
             * recovery, create segments anyway; this allows cases such as
             * replaying WAL data that has a write into a high-numbered
             * segment of a relation that was later deleted. We want to go
             * ahead and create the segments so we can finish out the replay.
             *
             * We have to maintain the invariant that segments before the last
             * active segment are of size RELSEG_SIZE; therefore, if
             * extending, pad them out with zeroes if needed.  (This only
             * matters if in recovery, or if the caller is extending the
             * relation discontiguously, but that can happen in hash indexes.)
             */
            if (nblocks < ((BlockNumber) RELSEG_SIZE))
            {
                char       *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
                                                     MCXT_ALLOC_ZERO);
 
                /*
                 * Write a zeroed block at the last block number of the
                 * predecessor segment (block nextsegno * RELSEG_SIZE - 1).
                 */
                mdextend(reln, forknum,
                         nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
                         zerobuf, skipFsync);
                pfree(zerobuf);
            }
            /* allow _mdfd_openseg() to create the next segment file */
            flags = O_CREAT;
        }
        else if (nblocks < ((BlockNumber) RELSEG_SIZE))
        {
            /*
             * When not extending, only open the next segment if the current
             * one is exactly RELSEG_SIZE.  If not (this branch), either
             * return NULL or fail.
             */
            if (behavior & EXTENSION_RETURN_NULL)
            {
                /*
                 * Some callers discern between reasons for _mdfd_getseg()
                 * returning NULL based on errno. As there's no failing
                 * syscall involved in this case, explicitly set errno to
                 * ENOENT, as that seems the closest interpretation.
                 */
                errno = ENOENT;
                return NULL;
            }
 
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
                            _mdfd_segpath(reln, forknum, nextsegno).str,
                            blkno, nblocks)));
        }
 
        v = _mdfd_openseg(reln, forknum, nextsegno, flags);
 
        if (v == NULL)
        {
            /*
             * Open failed.  Tolerate it only if the caller asked for NULL on
             * failure and errno indicates the file may have been deleted.
             */
            if ((behavior & EXTENSION_RETURN_NULL) &&
                FILE_POSSIBLY_DELETED(errno))
                return NULL;
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not open file \"%s\" (target block %u): %m",
                            _mdfd_segpath(reln, forknum, nextsegno).str,
                            blkno)));
        }
    }
 
    /* 'v' is now the entry for targetseg */
    return v;
}

References _mdfd_openseg(), _mdfd_segpath(), _mdnblocks(), Assert(), elog, ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_CREATE, EXTENSION_CREATE_RECOVERY, EXTENSION_DONT_OPEN, EXTENSION_FAIL, EXTENSION_RETURN_NULL, FATAL, FILE_POSSIBLY_DELETED, InRecovery, MCXT_ALLOC_ZERO, SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, mdextend(), _MdfdVec::mdfd_segno, mdopenfork(), palloc_aligned(), pfree(), PG_IO_ALIGN_SIZE, and str.

Referenced by mdextend(), mdfd(), mdprefetch(), mdreadv(), mdstartreadv(), mdwriteback(), mdwritev(), and mdzeroextend().

◆ _mdfd_open_flags()

static int _mdfd_open_flags ( void )

inline static

Definition at line 166 of file md.c.

{
    /*
     * Base open(2) flags for md data files: read/write, binary mode.  Add
     * PG_O_DIRECT when direct I/O is enabled for data files.
     */
    if (io_direct_flags & IO_DIRECT_DATA)
        return O_RDWR | PG_BINARY | PG_O_DIRECT;

    return O_RDWR | PG_BINARY;
}

References IO_DIRECT_DATA, io_direct_flags, PG_BINARY, and PG_O_DIRECT.

Referenced by _mdfd_openseg(), mdcreate(), mdopenfork(), and mdsyncfiletag().

◆ _mdfd_openseg()

static MdfdVec * _mdfd_openseg	(	SMgrRelation	reln,
		ForkNumber	forknum,
		BlockNumber	segno,
		int	oflags
	)

static

Definition at line 1687 of file md.c.

{
    /*
     * Open segment 'segno' of the fork, using the standard md open flags
     * OR'ed with 'oflags', and append it to the fork's array of open
     * segments.  Returns the new entry, or NULL if the file could not be
     * opened.
     */
    MdPathStr   segpath = _mdfd_segpath(reln, forknum, segno);
    File        vfd = PathNameOpenFile(segpath.str,
                                       _mdfd_open_flags() | oflags);
    MdfdVec    *seg;

    if (vfd < 0)
        return NULL;

    /*
     * Segments get opened strictly in ascending order, so the one being
     * added has to be the next slot past the currently open ones.
     */
    Assert(segno == reln->md_num_open_segs[forknum]);

    /* make room for the new entry, then fill it in */
    _fdvec_resize(reln, forknum, segno + 1);

    seg = &reln->md_seg_fds[forknum][segno];
    seg->mdfd_vfd = vfd;
    seg->mdfd_segno = segno;

    /* a segment file must never exceed RELSEG_SIZE blocks */
    Assert(_mdnblocks(reln, forknum, seg) <= ((BlockNumber) RELSEG_SIZE));

    return seg;
}

References _fdvec_resize(), _mdfd_open_flags(), _mdfd_segpath(), _mdnblocks(), Assert(), fd(), SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, _MdfdVec::mdfd_segno, _MdfdVec::mdfd_vfd, PathNameOpenFile(), and MdPathStr::str.

Referenced by _mdfd_getseg(), mdimmedsync(), mdnblocks(), and mdregistersync().

◆ _mdfd_segpath()

static MdPathStr _mdfd_segpath	(	SMgrRelation	reln,
		ForkNumber	forknum,
		BlockNumber	segno
	)

static

Definition at line 1667 of file md.c.

{
    /*
     * Build the path of segment 'segno' of the given fork.  Segment 0 uses
     * the plain relation path; later segments get a ".<segno>" suffix.
     */
    MdPathStr   result;
    RelPathStr  base = relpath(reln->smgr_rlocator, forknum);

    if (segno == 0)
        strcpy(result.str, base.str);
    else
        sprintf(result.str, "%s.%u", base.str, segno);

    return result;
}

References relpath, SMgrRelationData::smgr_rlocator, sprintf, MdPathStr::str, and RelPathStr::str.

Referenced by _mdfd_getseg(), _mdfd_openseg(), and mdsyncfiletag().

◆ _mdnblocks()

static BlockNumber _mdnblocks	(	SMgrRelation	reln,
		ForkNumber	forknum,
		MdfdVec *	seg
	)

static

Definition at line 1859 of file md.c.

{
    /*
     * Return the number of whole blocks currently in the given segment file.
     * Raises an ERROR if the file size cannot be determined.
     */
    off_t       nbytes = FileSize(seg->mdfd_vfd);

    if (nbytes < 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not seek to end of file \"%s\": %m",
                        FilePathName(seg->mdfd_vfd))));

    /* integer division: a trailing partial block is not counted */
    return (BlockNumber) (nbytes / BLCKSZ);
}

References ereport, errcode_for_file_access(), errmsg(), ERROR, FilePathName(), FileSize(), len, and _MdfdVec::mdfd_vfd.

Referenced by _mdfd_getseg(), _mdfd_openseg(), mdextend(), mdnblocks(), mdopenfork(), and mdzeroextend().

◆ buffers_to_iovec()

static int buffers_to_iovec	(	struct iovec *	iov,
		void **	buffers,
		int	nblocks
	)

static

Definition at line 774 of file md.c.

{
    /*
     * Describe 'nblocks' block-sized buffers as an iovec array, folding
     * buffers that are adjacent in memory into a single entry.  Returns the
     * number of iovec entries filled in.
     */
    int         niov;

    Assert(nblocks >= 1);

    /* If this build supports direct I/O, buffers must be I/O aligned. */
    if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
    {
        for (int blk = 0; blk < nblocks; ++blk)
            Assert((uintptr_t) buffers[blk] ==
                   TYPEALIGN(PG_IO_ALIGN_SIZE, buffers[blk]));
    }

    /* The first buffer always opens the first entry. */
    iov[0].iov_base = buffers[0];
    iov[0].iov_len = BLCKSZ;
    niov = 1;

    /* Each later buffer either extends the last entry or opens a new one. */
    for (int blk = 1; blk < nblocks; ++blk)
    {
        struct iovec *last = &iov[niov - 1];

        if (((char *) last->iov_base + last->iov_len) == buffers[blk])
        {
            /* starts right where the last entry ends: merge */
            last->iov_len += BLCKSZ;
            continue;
        }

        /* not contiguous: start another entry */
        iov[niov].iov_base = buffers[blk];
        iov[niov].iov_len = BLCKSZ;
        niov++;
    }

    return niov;
}

References Assert(), i, PG_IO_ALIGN_SIZE, PG_O_DIRECT, and TYPEALIGN.

Referenced by mdreadv(), mdstartreadv(), and mdwritev().

◆ do_truncate()

static int do_truncate ( const char * path )

static

Definition at line 343 of file md.c.

{
    /*
     * Truncate the file at 'path' to zero length and return pg_truncate()'s
     * result.  A failure other than ENOENT is reported as a WARNING here, so
     * that callers need not repeat the message; errno is preserved across
     * the report.
     */
    int         rc = pg_truncate(path, 0);

    if (rc < 0 && errno != ENOENT)
    {
        int         saved_errno = errno;

        ereport(WARNING,
                (errcode_for_file_access(),
                 errmsg("could not truncate file \"%s\": %m", path)));
        errno = saved_errno;
    }

    return rc;
}

References ereport, errcode_for_file_access(), errmsg(), pg_truncate(), and WARNING.

Referenced by mdunlinkfork().

◆ DropRelationFiles()

void DropRelationFiles	(	RelFileLocator *	delrels,
		int	ndelrels,
		bool	isRedo
	)

Definition at line 1587 of file md.c.

{
    /*
     * Unlink the files of all 'ndelrels' relations listed in 'delrels'.
     * Each relation is opened through smgr, the whole set is unlinked with
     * one smgrdounlinkall() call, and the smgr handles are closed again.
     * When 'isRedo' is set, XLogDropRelation() is additionally called for
     * every fork of every relation.
     */
    SMgrRelation *rels_to_drop = palloc(sizeof(SMgrRelation) * ndelrels);

    for (int idx = 0; idx < ndelrels; idx++)
    {
        rels_to_drop[idx] = smgropen(delrels[idx], INVALID_PROC_NUMBER);

        if (!isRedo)
            continue;

        for (ForkNumber forkno = 0; forkno <= MAX_FORKNUM; forkno++)
            XLogDropRelation(delrels[idx], forkno);
    }

    smgrdounlinkall(rels_to_drop, ndelrels, isRedo);

    for (int idx = 0; idx < ndelrels; idx++)
        smgrclose(rels_to_drop[idx]);

    pfree(rels_to_drop);
}

References i, INVALID_PROC_NUMBER, MAX_FORKNUM, palloc(), pfree(), smgrclose(), smgrdounlinkall(), smgropen(), and XLogDropRelation().

Referenced by FinishPreparedTransaction(), xact_redo_abort(), and xact_redo_commit().

◆ ForgetDatabaseSyncRequests()

void ForgetDatabaseSyncRequests ( Oid dbid )

Definition at line 1569 of file md.c.

{
    /*
     * Register a SYNC_FILTER_REQUEST whose tag carries only the database
     * OID; tablespace, relation number, fork and segment are left at
     * zero/invalid values.
     */
    RelFileLocator db_locator;
    FileTag     filter_tag;

    db_locator.spcOid = 0;
    db_locator.dbOid = dbid;
    db_locator.relNumber = 0;

    INIT_MD_FILETAG(filter_tag, db_locator, InvalidForkNumber, InvalidBlockNumber);

    RegisterSyncRequest(&filter_tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
}

References RelFileLocator::dbOid, INIT_MD_FILETAG, InvalidBlockNumber, InvalidForkNumber, RegisterSyncRequest(), RelFileLocator::relNumber, RelFileLocator::spcOid, and SYNC_FILTER_REQUEST.

Referenced by createdb_failure_callback(), dbase_redo(), and dropdb().

◆ md_readv_complete()

static PgAioResult md_readv_complete	(	PgAioHandle *	ioh,
		PgAioResult	prior_result,
		uint8	cb_data
	)

static

Definition at line 1965 of file md.c.

{
    /*
     * Completion callback for md reads.  Turns the raw byte-count / negative
     * errno in 'prior_result' into a block-level PgAioResult: hard errors
     * and zero-block reads become PGAIO_RS_ERROR, reads shorter than
     * requested become PGAIO_RS_PARTIAL, anything else passes through with
     * the result converted to blocks.
     */
    PgAioTargetData *target = pgaio_io_get_target_data(ioh);
    PgAioResult res = prior_result;

    if (prior_result.result < 0)
    {
        /* "Hard" error: keep the error number in error_data. */
        res.status = PGAIO_RS_ERROR;
        res.id = PGAIO_HCB_MD_READV;
        res.error_data = -prior_result.result;
        res.result = 0;

        /*
         * Log right away, to the server log only.  The issuer of the IO may
         * look at the result much later (it can be busy with other query
         * processing) or never (e.g. it was cancelled, or failed because
         * another IO errored out).  Whoever defined the IO raises the ERROR
         * when it processes the result.
         */
        pgaio_result_report(res, target, LOG_SERVER_ONLY);

        return res;
    }

    /*
     * The smgr API works in units of blocks rather than bytes (see the
     * discussion at smgrstartreadv()), so convert the byte count.
     */
    res.result /= BLCKSZ;

    Assert(res.result <= target->smgr.nblocks);

    if (res.result == 0)
    {
        /* Reading zero blocks counts as a failure, with no errno. */
        res.status = PGAIO_RS_ERROR;
        res.id = PGAIO_HCB_MD_READV;
        res.error_data = 0;

        /* logged immediately for the same reason as hard errors */
        pgaio_result_report(res, target, LOG_SERVER_ONLY);

        return res;
    }

    if (res.result < target->smgr.nblocks &&
        res.status != PGAIO_RS_ERROR)
    {
        /* Short read: flag it so that an upper level can retry. */
        res.status = PGAIO_RS_PARTIAL;
        res.id = PGAIO_HCB_MD_READV;
    }

    return res;
}

References Assert(), PgAioResult::error_data, PgAioResult::id, LOG_SERVER_ONLY, PgAioTargetData::nblocks, PGAIO_HCB_MD_READV, pgaio_io_get_target_data(), pgaio_result_report(), PGAIO_RS_ERROR, PGAIO_RS_PARTIAL, PgAioResult::result, PgAioTargetData::smgr, and PgAioResult::status.

◆ md_readv_report()

static void md_readv_report	(	PgAioResult	result,
		const PgAioTargetData *	td,
		int	elevel
	)

static

Definition at line 2032 of file md.c.

{
    /*
     * Emit a message at level 'elevel' for a failed or short md read.
     * A nonzero result.error_data is treated as an errno value; zero means
     * fewer bytes were read than requested.
     */
    RelPathStr  relfile = relpathbackend(td->smgr.rlocator,
                                         td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER,
                                         td->smgr.forkNum);

    if (result.error_data == 0)
    {
        /*
         * NB: typically this only shows up in debug output, while a partial
         * IO is being retried.
         */
        ereport(elevel,
                errcode(ERRCODE_DATA_CORRUPTED),
                errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
                       td->smgr.blockNum,
                       td->smgr.blockNum + td->smgr.nblocks - 1,
                       relfile.str,
                       result.result * (size_t) BLCKSZ,
                       td->smgr.nblocks * (size_t) BLCKSZ));
    }
    else
    {
        /* errcode_for_file_access() and %m both read errno */
        errno = result.error_data;

        ereport(elevel,
                errcode_for_file_access(),
                errmsg("could not read blocks %u..%u in file \"%s\": %m",
                       td->smgr.blockNum,
                       td->smgr.blockNum + td->smgr.nblocks - 1,
                       relfile.str));
    }
}

References PgAioTargetData::blockNum, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errcode_for_file_access(), errmsg(), PgAioResult::error_data, PgAioTargetData::forkNum, INVALID_PROC_NUMBER, PgAioTargetData::is_temp, MyProcNumber, PgAioTargetData::nblocks, relpathbackend, PgAioResult::result, PgAioTargetData::rlocator, PgAioTargetData::smgr, and RelPathStr::str.

◆ mdclose()

void mdclose	(	SMgrRelation	reln,
		ForkNumber	forknum
	)

Definition at line 703 of file md.c.

{
    /*
     * Close every open segment of the fork, highest-numbered first,
     * shrinking the open-segment array after each close.  With nothing open
     * the loop body never runs.
     */
    for (int remaining = reln->md_num_open_segs[forknum];
         remaining > 0;
         remaining--)
    {
        MdfdVec    *last = &reln->md_seg_fds[forknum][remaining - 1];

        FileClose(last->mdfd_vfd);
        _fdvec_resize(reln, forknum, remaining - 1);
    }
}

References _fdvec_resize(), FileClose(), SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, and _MdfdVec::mdfd_vfd.

Referenced by mdexists().

◆ mdcreate()

void mdcreate	(	SMgrRelation	reln,
		ForkNumber	forknum,
		bool	isRedo
	)

Definition at line 212 of file md.c.

{
    /*
     * Create the first segment file of the given fork and record it as open
     * segment 0 of 'reln'.
     *
     * The file is created with O_CREAT | O_EXCL.  If that fails and 'isRedo'
     * is true, a plain open of the same path is attempted instead; if the
     * file still cannot be opened, an ERROR is raised carrying the errno of
     * the original create attempt.
     */
    MdfdVec    *mdfd;
    RelPathStr  path;
    File        fd;
 
    if (isRedo && reln->md_num_open_segs[forknum] > 0)
        return;                 /* created and opened already... */
 
    /* outside of the redo shortcut above, the fork must not be open yet */
    Assert(reln->md_num_open_segs[forknum] == 0);
 
    /*
     * We may be using the target table space for the first time in this
     * database, so create a per-database subdirectory if needed.
     *
     * XXX this is a fairly ugly violation of module layering, but this seems
     * to be the best place to put the check.  Maybe TablespaceCreateDbspace
     * should be here and not in commands/tablespace.c?  But that would imply
     * importing a lot of stuff that smgr.c oughtn't know, either.
     */
    TablespaceCreateDbspace(reln->smgr_rlocator.locator.spcOid,
                            reln->smgr_rlocator.locator.dbOid,
                            isRedo);
 
    path = relpath(reln->smgr_rlocator, forknum);
 
    fd = PathNameOpenFile(path.str, _mdfd_open_flags() | O_CREAT | O_EXCL);
 
    if (fd < 0)
    {
        /* remember why the create failed before trying anything else */
        int         save_errno = errno;
 
        /* in redo, fall back to opening without O_CREAT | O_EXCL */
        if (isRedo)
            fd = PathNameOpenFile(path.str, _mdfd_open_flags());
        if (fd < 0)
        {
            /* be sure to report the error reported by create, not open */
            errno = save_errno;
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not create file \"%s\": %m", path.str)));
        }
    }
 
    /* record the new file as the fork's only open segment (segment 0) */
    _fdvec_resize(reln, forknum, 1);
    mdfd = &reln->md_seg_fds[forknum][0];
    mdfd->mdfd_vfd = fd;
    mdfd->mdfd_segno = 0;
 
    /* temporary relations are not registered as dirty segments */
    if (!SmgrIsTemp(reln))
        register_dirty_segment(reln, forknum, mdfd);
}

References _fdvec_resize(), _mdfd_open_flags(), Assert(), RelFileLocator::dbOid, ereport, errcode_for_file_access(), errmsg(), ERROR, fd(), RelFileLocatorBackend::locator, SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, mdfd(), PathNameOpenFile(), register_dirty_segment(), relpath, SMgrRelationData::smgr_rlocator, SmgrIsTemp, RelFileLocator::spcOid, RelPathStr::str, and TablespaceCreateDbspace().

◆ mdexists()

bool mdexists	(	SMgrRelation	reln,
		ForkNumber	forknum
	)

Definition at line 193 of file md.c.

{
    MdfdVec    *first_seg;

    /*
     * Outside recovery, drop any cached open segments before probing, so
     * that a fork unlinked since we opened it is noticed.  Recovery can skip
     * this as an optimization, since relations are already closed there when
     * they are dropped.
     */
    if (!InRecovery)
        mdclose(reln, forknum);

    /* The fork exists iff it can be opened. */
    first_seg = mdopenfork(reln, forknum, EXTENSION_RETURN_NULL);

    return first_seg != NULL;
}

References EXTENSION_RETURN_NULL, InRecovery, mdclose(), and mdopenfork().

◆ mdextend()

void mdextend	(	SMgrRelation	reln,
		ForkNumber	forknum,
		BlockNumber	blocknum,
		const void *	buffer,
		bool	skipFsync
	)

Definition at line 477 of file md.c.

{
    /*
     * Write one block (BLCKSZ bytes taken from 'buffer') at block number
     * 'blocknum' of the given fork.  The segment is looked up with
     * EXTENSION_CREATE, so missing segment files are created on the way.
     *
     * A failed or short write raises an ERROR.  Unless 'skipFsync' is set or
     * the relation is temporary, the written segment is registered as dirty.
     */
    off_t       seekpos;
    int         nbytes;
    MdfdVec    *v;
 
    /* If this build supports direct I/O, the buffer must be I/O aligned. */
    if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
        Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
 
    /* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
    Assert(blocknum >= mdnblocks(reln, forknum));
#endif
 
    /*
     * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
     * more --- we mustn't create a block whose number actually is
     * InvalidBlockNumber.  (Note that this failure should be unreachable
     * because of upstream checks in bufmgr.c.)
     */
    if (blocknum == InvalidBlockNumber)
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("cannot extend file \"%s\" beyond %u blocks",
                        relpath(reln->smgr_rlocator, forknum).str,
                        InvalidBlockNumber)));
 
    v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
 
    /* byte offset of the block within its segment file */
    seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
 
    Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 
    if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
    {
        /* negative return: the write itself failed, errno says why */
        if (nbytes < 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not extend file \"%s\": %m",
                            FilePathName(v->mdfd_vfd)),
                     errhint("Check free disk space.")));
        /* short write: complain appropriately */
        ereport(ERROR,
                (errcode(ERRCODE_DISK_FULL),
                 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
                        FilePathName(v->mdfd_vfd),
                        nbytes, BLCKSZ, blocknum),
                 errhint("Check free disk space.")));
    }
 
    if (!skipFsync && !SmgrIsTemp(reln))
        register_dirty_segment(reln, forknum, v);
 
    /* the segment must not have grown past RELSEG_SIZE blocks */
    Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
}

References _mdfd_getseg(), _mdnblocks(), Assert(), ereport, errcode(), errcode_for_file_access(), errhint(), errmsg(), ERROR, EXTENSION_CREATE, FilePathName(), FileWrite(), InvalidBlockNumber, _MdfdVec::mdfd_vfd, mdnblocks(), PG_IO_ALIGN_SIZE, PG_O_DIRECT, register_dirty_segment(), relpath, SMgrRelationData::smgr_rlocator, SmgrIsTemp, and TYPEALIGN.

Referenced by _mdfd_getseg().

◆ mdfd()

int mdfd	(	SMgrRelation	reln,
		ForkNumber	forknum,
		BlockNumber	blocknum,
		uint32 *	off
	)

Definition at line 1470 of file md.c.

{
    /*
     * Return the raw file descriptor of the segment containing 'blocknum',
     * and store the block's byte offset within that segment in *off.
     */
    MdfdVec    *seg;

    /* Open the fork first; only the call matters, its result is not used. */
    (void) mdopenfork(reln, forknum, EXTENSION_FAIL);

    seg = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);

    *off = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

    Assert(*off < (off_t) BLCKSZ * RELSEG_SIZE);

    return FileGetRawDesc(seg->mdfd_vfd);
}

References _mdfd_getseg(), Assert(), EXTENSION_FAIL, FileGetRawDesc(), _MdfdVec::mdfd_vfd, and mdopenfork().

Referenced by mdcreate(), and mdopenfork().

◆ mdfiletagmatches()

bool mdfiletagmatches	(	const FileTag *	ftag,
		const FileTag *	candidate
	)

Definition at line 1950 of file md.c.

{
    /*
     * Filter requests are currently used only to drop every scheduled
     * callback that belongs to one database, when that database is dropped.
     * 'ftag' comes from the SYNC_FILTER_REQUEST; any candidate with the same
     * database OID matches and is therefore forgotten.
     */
    Oid         wanted_db = ftag->rlocator.dbOid;

    return candidate->rlocator.dbOid == wanted_db;
}

References RelFileLocator::dbOid, and FileTag::rlocator.

◆ mdimmedsync()

void mdimmedsync	(	SMgrRelation	reln,
		ForkNumber	forknum
	)

Definition at line 1417 of file md.c.

{
    /*
     * Synchronously fsync every segment file of the given fork: the segments
     * that are open after mdnblocks(), plus any further segment files that
     * can be opened.  The additionally opened segments are closed again once
     * they have been synced.
     */
    int         segno;
    int         min_inactive_seg;
 
    /*
     * NOTE: mdnblocks makes sure we have opened all active segments, so that
     * the loop below will get them all!
     */
    mdnblocks(reln, forknum);
 
    /* number of segments open now; anything opened below is "inactive" */
    min_inactive_seg = segno = reln->md_num_open_segs[forknum];
 
    /*
     * Temporarily open inactive segments, then close them after sync.  There
     * may be some inactive segments left opened after fsync() error, but that
     * is harmless.  We don't bother to clean them up and take a risk of
     * further trouble.  The next mdclose() will soon close them.
     */
    while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
        segno++;
 
    /* segno is now the total count of open segments; walk them high to low */
    while (segno > 0)
    {
        MdfdVec    *v = &reln->md_seg_fds[forknum][segno - 1];
 
        /*
         * fsyncs done through mdimmedsync() should be tracked in a separate
         * IOContext than those done through mdsyncfiletag() to differentiate
         * between unavoidable client backend fsyncs (e.g. those done during
         * index build) and those which ideally would have been done by the
         * checkpointer. Since other IO operations bypassing the buffer
         * manager could also be tracked in such an IOContext, wait until
         * these are also tracked to track immediate fsyncs.
         */
        if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
            ereport(data_sync_elevel(ERROR),
                    (errcode_for_file_access(),
                     errmsg("could not fsync file \"%s\": %m",
                            FilePathName(v->mdfd_vfd))));
 
        /* Close inactive segments immediately */
        if (segno > min_inactive_seg)
        {
            FileClose(v->mdfd_vfd);
            _fdvec_resize(reln, forknum, segno - 1);
        }
 
        segno--;
    }
}

References _fdvec_resize(), _mdfd_openseg(), data_sync_elevel(), ereport, errcode_for_file_access(), errmsg(), ERROR, FileClose(), FilePathName(), FileSync(), SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, _MdfdVec::mdfd_vfd, and mdnblocks().

◆ mdinit()

void mdinit ( void )

Definition at line 180 of file md.c.

{
    /*
     * Create the "MdSmgr" allocation-set memory context as a child of
     * TopMemoryContext, using the default allocation sizes, and remember it
     * in MdCxt.
     */
    MdCxt = AllocSetContextCreate(TopMemoryContext, "MdSmgr",
                                  ALLOCSET_DEFAULT_SIZES);
}

References ALLOCSET_DEFAULT_SIZES, AllocSetContextCreate, MdCxt, and TopMemoryContext.

◆ mdmaxcombine()

uint32 mdmaxcombine	(	SMgrRelation	reln,
		ForkNumber	forknum,
		BlockNumber	blocknum
	)

Definition at line 823 of file md.c.

{
    /*
     * Report how many blocks lie between blocknum (inclusive) and the end of
     * the RELSEG_SIZE-block segment that contains it.  Neither reln nor
     * forknum influences the result.
     */
    return RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
}

◆ mdnblocks()

BlockNumber mdnblocks	(	SMgrRelation	reln,
		ForkNumber	forknum
	)

Definition at line 1213 of file md.c.

{
    BlockNumber cur_segno;
    MdfdVec    *cur_seg;

    /* Make sure segment 0 is open; mdopenfork errors out if it cannot be. */
    mdopenfork(reln, forknum, EXTENSION_FAIL);

    /* mdopen has opened the first segment */
    Assert(reln->md_num_open_segs[forknum] > 0);

    /*
     * Begin at the last segment we already have open rather than at segment
     * zero.  Every earlier segment was verified to be exactly RELSEG_SIZE
     * blocks long when we stepped past it, so re-measuring them would only
     * cost redundant seeks.
     *
     * NOTE: that shortcut is only invalid if another backend has truncated
     * the relation.  Higher code levels deal with that by closing and
     * re-opening the md fd as part of relcache flush.  (The checkpointer
     * does not take part in relcache flush and so may hold entries for
     * inactive segments; that is fine, because it never needs to compute a
     * relation's size.)
     */
    cur_segno = reln->md_num_open_segs[forknum] - 1;
    cur_seg = &reln->md_seg_fds[forknum][cur_segno];

    while (cur_seg != NULL)
    {
        BlockNumber seg_blocks = _mdnblocks(reln, forknum, cur_seg);

        if (seg_blocks > ((BlockNumber) RELSEG_SIZE))
            elog(FATAL, "segment too big");

        /* A partially filled segment is the last one: we have the total. */
        if (seg_blocks < ((BlockNumber) RELSEG_SIZE))
            return (cur_segno * ((BlockNumber) RELSEG_SIZE)) + seg_blocks;

        /* Segment is exactly full, so look at the following one. */
        cur_segno++;

        /*
         * Deliberately open without O_CREAT.  Creating the file here could
         * resurrect a segment that vanished through some operating system
         * misadventure, which would defeat _mdfd_getseg's attempts to notice
         * and report access to a missing segment.
         */
        cur_seg = _mdfd_openseg(reln, forknum, cur_segno, 0);
    }

    /* No further segment exists: the relation ends on a segment boundary. */
    return cur_segno * ((BlockNumber) RELSEG_SIZE);
}

References _mdfd_openseg(), _mdnblocks(), Assert(), elog, EXTENSION_FAIL, FATAL, SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, and mdopenfork().

Referenced by mdextend(), mdimmedsync(), mdregistersync(), mdwritev(), and mdzeroextend().

◆ mdopen()

void mdopen ( SMgrRelation reln )

Definition at line 692 of file md.c.

{
    int         forknum = 0;

    /*
     * Nothing is physically opened here; we only record that no segment of
     * any fork (0 .. MAX_FORKNUM) is currently open.
     */
    while (forknum <= MAX_FORKNUM)
    {
        reln->md_num_open_segs[forknum] = 0;
        forknum++;
    }
}

References MAX_FORKNUM, and SMgrRelationData::md_num_open_segs.

◆ mdopenfork()

static MdfdVec * mdopenfork	(	SMgrRelation	reln,
		ForkNumber	forknum,
		int	behavior
	)

static

Definition at line 654 of file md.c.

{
    RelPathStr  forkpath;
    File        vfd;
    MdfdVec    *first_seg;

    /*
     * If any segment of this fork is already open then segment 0 is too, so
     * just hand back its entry.
     */
    if (reln->md_num_open_segs[forknum] > 0)
        return &reln->md_seg_fds[forknum][0];

    forkpath = relpath(reln->smgr_rlocator, forknum);
    vfd = PathNameOpenFile(forkpath.str, _mdfd_open_flags());

    if (vfd < 0)
    {
        /*
         * A caller that passed EXTENSION_RETURN_NULL gets NULL when the
         * failure looks like the file having been deleted; every other
         * failure is reported as an error.
         */
        bool        missing_ok = (behavior & EXTENSION_RETURN_NULL) &&
            FILE_POSSIBLY_DELETED(errno);

        if (missing_ok)
            return NULL;

        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not open file \"%s\": %m", forkpath.str)));
    }

    /* Grow the fd array to one entry and fill it in for segment 0. */
    _fdvec_resize(reln, forknum, 1);
    first_seg = &reln->md_seg_fds[forknum][0];
    first_seg->mdfd_segno = 0;
    first_seg->mdfd_vfd = vfd;

    /* A single segment must never hold more than RELSEG_SIZE blocks. */
    Assert(_mdnblocks(reln, forknum, first_seg) <= ((BlockNumber) RELSEG_SIZE));

    return first_seg;
}

References _fdvec_resize(), _mdfd_open_flags(), _mdnblocks(), Assert(), ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_RETURN_NULL, fd(), FILE_POSSIBLY_DELETED, SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, mdfd(), PathNameOpenFile(), relpath, SMgrRelationData::smgr_rlocator, and RelPathStr::str.

Referenced by _mdfd_getseg(), mdexists(), mdfd(), and mdnblocks().

◆ mdprefetch()

bool mdprefetch	(	SMgrRelation	reln,
		ForkNumber	forknum,
		BlockNumber	blocknum,
		int	nblocks
	)

Definition at line 726 of file md.c.

{
    /*
     * Issue prefetch requests for nblocks blocks starting at blocknum, one
     * request per segment file touched.  Returns false if the range extends
     * past MaxBlockNumber or a needed segment could not be obtained, true
     * otherwise (including when built without USE_PREFETCH, in which case
     * nothing is done).
     */
#ifdef USE_PREFETCH

    Assert((io_direct_flags & IO_DIRECT_DATA) == 0);

    /* Reject ranges that would run past the highest valid block number. */
    if ((uint64) blocknum + nblocks > (uint64) MaxBlockNumber + 1)
        return false;

    while (nblocks > 0)
    {
        BlockNumber segoff = blocknum % ((BlockNumber) RELSEG_SIZE);
        off_t       offset = (off_t) BLCKSZ * segoff;
        MdfdVec    *seg;
        int         chunk;

        /*
         * In recovery a missing segment is tolerated (we get NULL back);
         * otherwise _mdfd_getseg is asked to fail.
         */
        seg = _mdfd_getseg(reln, forknum, blocknum, false,
                           InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL);
        if (seg == NULL)
            return false;

        Assert(offset < (off_t) BLCKSZ * RELSEG_SIZE);

        /* Do not run past the end of the current segment file. */
        chunk = Min(nblocks, RELSEG_SIZE - segoff);

        /* The prefetch result is intentionally ignored. */
        (void) FilePrefetch(seg->mdfd_vfd, offset, BLCKSZ * chunk,
                            WAIT_EVENT_DATA_FILE_PREFETCH);

        nblocks -= chunk;
        blocknum += chunk;
    }
#endif                          /* USE_PREFETCH */

    return true;
}

References _mdfd_getseg(), Assert(), EXTENSION_FAIL, EXTENSION_RETURN_NULL, FilePrefetch(), InRecovery, IO_DIRECT_DATA, io_direct_flags, MaxBlockNumber, _MdfdVec::mdfd_vfd, and Min.

◆ mdreadv()

void mdreadv	(	SMgrRelation	reln,
		ForkNumber	forknum,
		BlockNumber	blocknum,
		void **	buffers,
		BlockNumber	nblocks
	)

Definition at line 837 of file md.c.

{
    /*
     * Read nblocks blocks starting at blocknum into the BLCKSZ-sized buffers
     * pointed to by buffers[], using vectored reads and retrying after short
     * reads.  Read failures are reported with ereport(ERROR).
     *
     * Although written as a loop over segments, a request that does not fit
     * entirely within one segment and within PG_IOV_MAX vectors is rejected
     * below, so a successful call performs a single pass.
     */
    while (nblocks > 0)
    {
        struct iovec iov[PG_IOV_MAX];
        int         iovcnt;
        off_t       seekpos;
        int         nbytes;
        MdfdVec    *v;
        BlockNumber nblocks_this_segment;
        size_t      transferred_this_segment;
        size_t      size_this_segment;

        v = _mdfd_getseg(reln, forknum, blocknum, false,
                         EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);

        /* Byte offset of blocknum within its segment file. */
        seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

        Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

        /* Clamp to the end of this segment, then to the iovec array size. */
        nblocks_this_segment =
            Min(nblocks,
                RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
        nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));

        /* Any clamping at all means the caller asked for too much at once. */
        if (nblocks_this_segment != nblocks)
            elog(ERROR, "read crosses segment boundary");

        iovcnt = buffers_to_iovec(iov, buffers, nblocks_this_segment);
        size_this_segment = nblocks_this_segment * BLCKSZ;
        transferred_this_segment = 0;

        /*
         * Inner loop to continue after a short read.  We'll keep going until
         * we hit EOF rather than assuming that a short read means we hit the
         * end.
         */
        for (;;)
        {
            TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
                                                reln->smgr_rlocator.locator.spcOid,
                                                reln->smgr_rlocator.locator.dbOid,
                                                reln->smgr_rlocator.locator.relNumber,
                                                reln->smgr_rlocator.backend);
            nbytes = FileReadV(v->mdfd_vfd, iov, iovcnt, seekpos,
                               WAIT_EVENT_DATA_FILE_READ);
            TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
                                               reln->smgr_rlocator.locator.spcOid,
                                               reln->smgr_rlocator.locator.dbOid,
                                               reln->smgr_rlocator.locator.relNumber,
                                               reln->smgr_rlocator.backend,
                                               nbytes,
                                               size_this_segment - transferred_this_segment);

            /* Testing aid: pretend the read returned at most 4096 bytes. */
#ifdef SIMULATE_SHORT_READ
            nbytes = Min(nbytes, 4096);
#endif

            if (nbytes < 0)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not read blocks %u..%u in file \"%s\": %m",
                                blocknum,
                                blocknum + nblocks_this_segment - 1,
                                FilePathName(v->mdfd_vfd))));

            if (nbytes == 0)
            {
                /*
                 * We are at or past EOF, or we read a partial block at EOF.
                 * Normally this is an error; upper levels should never try to
                 * read a nonexistent block.  However, if zero_damaged_pages
                 * is ON or we are InRecovery, we should instead return zeroes
                 * without complaining.  This allows, for example, the case of
                 * trying to update a block that was later truncated away.
                 *
                 * NB: We think that this codepath is unreachable in recovery
                 * and incomplete with zero_damaged_pages, as missing segments
                 * are not created. Putting blocks into the buffer-pool that
                 * do not exist on disk is rather problematic, as it will not
                 * be found by scans that rely on smgrnblocks(), as they are
                 * beyond EOF. It also can cause weird problems with relation
                 * extension, as relation extension does not expect blocks
                 * beyond EOF to exist.
                 *
                 * Therefore we do not want to copy the logic into
                 * mdstartreadv(), where it would have to be more complicated
                 * due to potential differences in the zero_damaged_pages
                 * setting between the definer and completor of IO.
                 *
                 * For PG 18, we are putting an Assert(false) in mdreadv()
                 * (triggering failures in assertion-enabled builds, but
                 * continuing to work in production builds). Afterwards we
                 * plan to remove this code entirely.
                 */
                if (zero_damaged_pages || InRecovery)
                {
                    Assert(false);  /* see comment above */

                    /*
                     * Zero-fill starting at the block in which the data ran
                     * out (a partially read block is zeroed as a whole).
                     */
                    for (BlockNumber i = transferred_this_segment / BLCKSZ;
                         i < nblocks_this_segment;
                         ++i)
                        memset(buffers[i], 0, BLCKSZ);
                    break;
                }
                else
                    ereport(ERROR,
                            (errcode(ERRCODE_DATA_CORRUPTED),
                             errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
                                    blocknum,
                                    blocknum + nblocks_this_segment - 1,
                                    FilePathName(v->mdfd_vfd),
                                    transferred_this_segment,
                                    size_this_segment)));
            }

            /* One loop should usually be enough. */
            transferred_this_segment += nbytes;
            Assert(transferred_this_segment <= size_this_segment);
            if (transferred_this_segment == size_this_segment)
                break;

            /* Adjust position and vectors after a short read. */
            seekpos += nbytes;
            iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
        }

        /* Advance past what was just handled. */
        nblocks -= nblocks_this_segment;
        buffers += nblocks_this_segment;
        blocknum += nblocks_this_segment;
    }
}

References _mdfd_getseg(), Assert(), RelFileLocatorBackend::backend, buffers_to_iovec(), compute_remaining_iovec(), RelFileLocator::dbOid, elog, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_CREATE_RECOVERY, EXTENSION_FAIL, FilePathName(), FileReadV(), i, InRecovery, lengthof, RelFileLocatorBackend::locator, _MdfdVec::mdfd_vfd, Min, PG_IOV_MAX, RelFileLocator::relNumber, SMgrRelationData::smgr_rlocator, RelFileLocator::spcOid, and zero_damaged_pages.

◆ mdregistersync()

void mdregistersync	(	SMgrRelation	reln,
		ForkNumber	forknum
	)

Definition at line 1366 of file md.c.

{
    int         first_inactive;
    int         nsegs;

    /*
     * Register every segment of the fork, active or inactive, via
     * register_dirty_segment().
     *
     * NOTE: mdnblocks opens all active segments as a side effect, which is
     * what guarantees that the loop below covers every one of them.
     */
    mdnblocks(reln, forknum);

    first_inactive = reln->md_num_open_segs[forknum];
    nsegs = first_inactive;

    /*
     * Inactive segments are opened only temporarily and closed again once
     * registered.  Should an error intervene, some may be left open; that is
     * harmless, and trying to clean them up would just risk further trouble.
     * The next mdclose() will close them soon enough.
     */
    for (;;)
    {
        if (_mdfd_openseg(reln, forknum, nsegs, 0) == NULL)
            break;
        nsegs++;
    }

    /*
     * Work from the last segment down to the first.  The entry pointer is
     * looked up afresh on every pass because _fdvec_resize() is called in
     * between.
     */
    for (; nsegs > 0; nsegs--)
    {
        MdfdVec    *seg = &reln->md_seg_fds[forknum][nsegs - 1];

        register_dirty_segment(reln, forknum, seg);

        /* Segments we opened only for this purpose are closed right away. */
        if (nsegs > first_inactive)
        {
            FileClose(seg->mdfd_vfd);
            _fdvec_resize(reln, forknum, nsegs - 1);
        }
    }
}

References _fdvec_resize(), _mdfd_openseg(), FileClose(), SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, _MdfdVec::mdfd_vfd, mdnblocks(), and register_dirty_segment().

◆ mdstartreadv()

void mdstartreadv	(	PgAioHandle *	ioh,
		SMgrRelation	reln,
		ForkNumber	forknum,
		BlockNumber	blocknum,
		void **	buffers,
		BlockNumber	nblocks
	)

Definition at line 975 of file md.c.

{
    MdfdVec    *seg;
    BlockNumber segoff;
    BlockNumber blocks_in_seg;
    off_t       offset;
    struct iovec *vecs;
    int         nvecs;
    int         rc;

    /*
     * Start an asynchronous vectored read of nblocks blocks beginning at
     * blocknum, using the AIO handle ioh.  The whole request must lie within
     * a single segment file.
     */
    seg = _mdfd_getseg(reln, forknum, blocknum, false,
                       EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);

    /* Position of the first block inside its segment, in blocks and bytes. */
    segoff = blocknum % ((BlockNumber) RELSEG_SIZE);
    offset = (off_t) BLCKSZ * segoff;

    Assert(offset < (off_t) BLCKSZ * RELSEG_SIZE);

    blocks_in_seg = Min(nblocks, RELSEG_SIZE - segoff);
    if (blocks_in_seg != nblocks)
        elog(ERROR, "read crossing segment boundary");

    /* Borrow the handle's iovec array; it must have room for every block. */
    nvecs = pgaio_io_get_iovec(ioh, &vecs);
    Assert(nblocks <= nvecs);

    nvecs = buffers_to_iovec(vecs, buffers, blocks_in_seg);
    Assert(nvecs <= blocks_in_seg);

    /* Flag the IO as buffered unless direct IO is enabled for data files. */
    if ((io_direct_flags & IO_DIRECT_DATA) == 0)
        pgaio_io_set_flag(ioh, PGAIO_HF_BUFFERED);

    pgaio_io_set_target_smgr(ioh, reln, forknum, blocknum, nblocks, false);
    pgaio_io_register_callbacks(ioh, PGAIO_HCB_MD_READV, 0);

    rc = FileStartReadV(ioh, seg->mdfd_vfd, nvecs, offset,
                        WAIT_EVENT_DATA_FILE_READ);
    if (rc != 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not start reading blocks %u..%u in file \"%s\": %m",
                        blocknum,
                        blocknum + blocks_in_seg - 1,
                        FilePathName(seg->mdfd_vfd))));

    /*
     * The checks that mdreadv() performs after its read live in
     * md_readv_complete() for this asynchronous path.
     *
     * The zero_damaged_pages handling found in mdreadv() is intentionally
     * not reproduced here, at least for now.  As mdreadv() explains, that
     * logic is problematic and slated for removal; an equivalent here would
     * also have to be more complicated, because the zero_damaged_pages
     * setting may differ between the definer and the completor of the IO.
     */
}

References _mdfd_getseg(), Assert(), buffers_to_iovec(), elog, ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_CREATE_RECOVERY, EXTENSION_FAIL, FilePathName(), FileStartReadV(), IO_DIRECT_DATA, io_direct_flags, _MdfdVec::mdfd_vfd, Min, PGAIO_HCB_MD_READV, PGAIO_HF_BUFFERED, pgaio_io_get_iovec(), pgaio_io_register_callbacks(), pgaio_io_set_flag(), and pgaio_io_set_target_smgr().

◆ mdsyncfiletag()

int mdsyncfiletag	(	const FileTag *	ftag,
		char *	path
	)

Definition at line 1880 of file md.c.

{
    /*
     * Sync the segment file identified by ftag to disk.
     *
     * The file's path is copied into *path, which must point to a buffer of
     * at least MAXPGPATH bytes, so the caller can use it in messages.
     *
     * Returns the result of FileSync(), with errno as FileSync() left it, or
     * -1 if the segment was not already open and could not be opened
     * (presumably with errno set by PathNameOpenFile() -- confirm there).
     */
    SMgrRelation reln = smgropen(ftag->rlocator, INVALID_PROC_NUMBER);
    File        file;
    instr_time  io_start;
    bool        need_to_close;
    int         result,
                save_errno;

    /* See if we already have the file open, or need to open it. */
    if (ftag->segno < reln->md_num_open_segs[ftag->forknum])
    {
        file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd;
        strlcpy(path, FilePathName(file), MAXPGPATH);
        need_to_close = false;
    }
    else
    {
        MdPathStr   p;

        p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);

        /*
         * strlcpy's size argument is the size of the destination, so use
         * MAXPGPATH exactly as the branch above does.  Bounding the copy by
         * the source's length limit instead would cut the final character
         * off a maximum-length path.
         */
        strlcpy(path, p.str, MAXPGPATH);

        file = PathNameOpenFile(path, _mdfd_open_flags());
        if (file < 0)
            return -1;
        need_to_close = true;
    }

    io_start = pgstat_prepare_io_time(track_io_timing);

    /* Sync the file. */
    result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);

    /* Keep FileSync's errno safe from the cleanup and stats calls below. */
    save_errno = errno;

    /* Only close what we opened ourselves. */
    if (need_to_close)
        FileClose(file);

    pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
                            IOOP_FSYNC, io_start, 1, 0);

    errno = save_errno;
    return result;
}

References _mdfd_open_flags(), _mdfd_segpath(), FileClose(), FilePathName(), FileSync(), FileTag::forknum, INVALID_PROC_NUMBER, IOCONTEXT_NORMAL, IOOBJECT_RELATION, IOOP_FSYNC, MAXPGPATH, SMgrRelationData::md_num_open_segs, MD_PATH_STR_MAXLEN, SMgrRelationData::md_seg_fds, _MdfdVec::mdfd_vfd, PathNameOpenFile(), pgstat_count_io_op_time(), pgstat_prepare_io_time(), FileTag::rlocator, FileTag::segno, smgropen(), MdPathStr::str, strlcpy(), and track_io_timing.

◆ mdtruncate()

void mdtruncate	(	SMgrRelation	reln,
		ForkNumber	forknum,
		BlockNumber	curnblk,
		BlockNumber	nblocks
	)

Definition at line 1277 of file md.c.

{
    int         segcount;

    /*
     * Asking to "truncate" to more blocks than currently exist is bogus.  In
     * recovery such a request is silently ignored; otherwise it is an error.
     */
    if (nblocks > curnblk)
    {
        if (InRecovery)
            return;
        ereport(ERROR,
                (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
                        relpath(reln->smgr_rlocator, forknum).str,
                        nblocks, curnblk)));
    }

    /* Already at the requested length: no work. */
    if (nblocks == curnblk)
        return;

    /*
     * Process the open segments from the last one backwards.  Going in that
     * direction keeps management of the fd array simple should an error
     * occur part-way through.
     */
    for (segcount = reln->md_num_open_segs[forknum]; segcount > 0; segcount--)
    {
        /* number of blocks held by all segments before this one */
        BlockNumber segstart = (segcount - 1) * RELSEG_SIZE;
        MdfdVec    *seg = &reln->md_seg_fds[forknum][segcount - 1];

        /*
         * If this segment lies wholly below the new end of the relation, it
         * and every earlier segment are kept untouched, so we are done.
         */
        if (segstart <= nblocks &&
            segstart + ((BlockNumber) RELSEG_SIZE) <= nblocks)
            break;

        if (segstart > nblocks)
        {
            /*
             * The segment becomes inactive.  Its file is cut to zero length
             * but not deleted, for reasons explained in the header comments.
             */
            if (FileTruncate(seg->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not truncate file \"%s\": %m",
                                FilePathName(seg->mdfd_vfd))));

            if (!SmgrIsTemp(reln))
                register_dirty_segment(reln, forknum, seg);

            /* we never drop the 1st segment */
            Assert(seg != &reln->md_seg_fds[forknum][0]);

            FileClose(seg->mdfd_vfd);
            _fdvec_resize(reln, forknum, segcount - 1);
        }
        else
        {
            /*
             * This is the last segment that stays active; shorten it to the
             * required length.  NOTE: when nblocks is an exact multiple K of
             * RELSEG_SIZE, the K+1st segment ends up truncated to zero
             * length but is kept, which adheres to the invariant given in
             * the header comments.
             */
            BlockNumber keep_blocks = nblocks - segstart;

            if (FileTruncate(seg->mdfd_vfd, (off_t) keep_blocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not truncate file \"%s\" to %u blocks: %m",
                                FilePathName(seg->mdfd_vfd),
                                nblocks)));

            if (!SmgrIsTemp(reln))
                register_dirty_segment(reln, forknum, seg);
        }
    }
}

References _fdvec_resize(), Assert(), ereport, errcode_for_file_access(), errmsg(), ERROR, FileClose(), FilePathName(), FileTruncate(), InRecovery, SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, _MdfdVec::mdfd_vfd, register_dirty_segment(), relpath, SMgrRelationData::smgr_rlocator, and SmgrIsTemp.

◆ mdunlink()

void mdunlink	(	RelFileLocatorBackend	rlocator,
		ForkNumber	forknum,
		bool	isRedo
	)

Definition at line 327 of file md.c.

{
    /* A specific fork was requested: unlink just that one. */
    if (forknum != InvalidForkNumber)
    {
        mdunlinkfork(rlocator, forknum, isRedo);
        return;
    }

    /* InvalidForkNumber means "every fork", 0 through MAX_FORKNUM. */
    for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
        mdunlinkfork(rlocator, forknum, isRedo);
}

References InvalidForkNumber, MAX_FORKNUM, and mdunlinkfork().

◆ mdunlinkfiletag()

int mdunlinkfiletag	(	const FileTag *	ftag,
		char *	path
	)

Definition at line 1932 of file md.c.

{
    /*
     * Build the path of the relation's main fork from ftag->rlocator and
     * copy it into *path (at most MAXPGPATH bytes, terminator included).
     * Only the rlocator field of the tag is consulted.
     */
    RelPathStr  mainpath = relpathperm(ftag->rlocator, MAIN_FORKNUM);
    int         rc;

    strlcpy(path, mainpath.str, MAXPGPATH);

    /* Remove the file and hand unlink()'s result straight to the caller. */
    rc = unlink(path);

    return rc;
}

References MAIN_FORKNUM, MAXPGPATH, relpathperm, FileTag::rlocator, RelPathStr::str, and strlcpy().

◆ mdunlinkfork()

static void mdunlinkfork	(	RelFileLocatorBackend	rlocator,
		ForkNumber	forknum,
		bool	isRedo
	)

static

Definition at line 364 of file md.c.

{
    /*
     * Remove all segment files of one fork of a relation.
     *
     * The first segment is either truncated and unlinked right away, or
     * truncated and queued for unlinking later, depending on the test
     * below.  Any further segments ("<path>.1", "<path>.2", ...) are then
     * truncated and unlinked immediately.  Failure to remove a file is only
     * reported as a WARNING.
     *
     * errno is used as a side channel between the steps: it is saved and
     * restored around intervening calls so that the "errno != ENOENT" tests
     * still see the value left by the relevant truncate or unlink.
     */
    RelPathStr  path;
    int         ret;
    int         save_errno;

    path = relpath(rlocator, forknum);

    /*
     * Truncate and then unlink the first segment, or just register a request
     * to unlink it later, as described in the comments for mdunlink().
     */
    if (isRedo || IsBinaryUpgrade || forknum != MAIN_FORKNUM ||
        RelFileLocatorBackendIsTemp(rlocator))
    {
        if (!RelFileLocatorBackendIsTemp(rlocator))
        {
            /* Prevent other backends' fds from holding on to the disk space */
            ret = do_truncate(path.str);

            /* Forget any pending sync requests for the first segment */
            save_errno = errno;
            register_forget_request(rlocator, forknum, 0 /* first seg */ );
            errno = save_errno;
        }
        else
            ret = 0;            /* temp relation: no truncate, go on to unlink */

        /* Next unlink the file, unless it was already found to be missing */
        if (ret >= 0 || errno != ENOENT)
        {
            ret = unlink(path.str);
            if (ret < 0 && errno != ENOENT)
            {
                /* keep unlink's errno intact across the warning report */
                save_errno = errno;
                ereport(WARNING,
                        (errcode_for_file_access(),
                         errmsg("could not remove file \"%s\": %m", path.str)));
                errno = save_errno;
            }
        }
    }
    else
    {
        /* Prevent other backends' fds from holding on to the disk space */
        ret = do_truncate(path.str);

        /* Register request to unlink first segment later */
        save_errno = errno;
        register_unlink_segment(rlocator, forknum, 0 /* first seg */ );
        errno = save_errno;
    }

    /*
     * Delete any additional segments.
     *
     * Note that because we loop until getting ENOENT, we will correctly
     * remove all inactive segments as well as active ones.  Ideally we'd
     * continue the loop until getting exactly that errno, but that risks an
     * infinite loop if the problem is directory-wide (for instance, if we
     * suddenly can't read the data directory itself).  We compromise by
     * continuing after a non-ENOENT truncate error, but stopping after any
     * unlink error.  If there is indeed a directory-wide problem, additional
     * unlink attempts wouldn't work anyway.
     */
    if (ret >= 0 || errno != ENOENT)
    {
        MdPathStr   segpath;
        BlockNumber segno;

        /* The loop ends only through one of the breaks below. */
        for (segno = 1;; segno++)
        {
            /*
             * NOTE(review): unbounded sprintf; presumably MdPathStr is sized
             * to hold any relation path plus ".<segno>" -- confirm against
             * its definition.
             */
            sprintf(segpath.str, "%s.%u", path.str, segno);

            if (!RelFileLocatorBackendIsTemp(rlocator))
            {
                /*
                 * Prevent other backends' fds from holding on to the disk
                 * space.  We're done if we see ENOENT, though.
                 */
                if (do_truncate(segpath.str) < 0 && errno == ENOENT)
                    break;

                /*
                 * Forget any pending sync requests for this segment before we
                 * try to unlink.
                 */
                register_forget_request(rlocator, forknum, segno);
            }

            if (unlink(segpath.str) < 0)
            {
                /* ENOENT is expected after the last segment... */
                if (errno != ENOENT)
                    ereport(WARNING,
                            (errcode_for_file_access(),
                             errmsg("could not remove file \"%s\": %m", segpath.str)));
                break;
            }
        }
    }
}

References do_truncate(), ereport, errcode_for_file_access(), errmsg(), IsBinaryUpgrade, MAIN_FORKNUM, register_forget_request(), register_unlink_segment(), RelFileLocatorBackendIsTemp, relpath, sprintf, MdPathStr::str, RelPathStr::str, and WARNING.

Referenced by mdunlink().

◆ mdwriteback()

void mdwriteback	(	SMgrRelation	reln,
		ForkNumber	forknum,
		BlockNumber	blocknum,
		BlockNumber	nblocks
	)

Definition at line 1154 of file md.c.

{
    Assert((io_direct_flags & IO_DIRECT_DATA) == 0);

    /*
     * Ask for writeback of nblocks blocks starting at blocknum.  We use as
     * few requests as possible, but each segment is a separate file, so the
     * range has to be split at segment boundaries.
     */
    while (nblocks > 0)
    {
        MdfdVec    *seg;
        int         first_segno,
                    last_segno;
        BlockNumber chunk = nblocks;
        off_t       offset;

        seg = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
                           EXTENSION_DONT_OPEN);

        /*
         * The buffers being flushed may belong to a relation that has
         * already been removed; that is fine and simply ignored.  Also, if
         * the segment file is not already open (ie from a recent mdwrite()),
         * we must not re-open it: doing so could race with
         * PROCSIGNAL_BARRIER_SMGRRELEASE and leave us holding a descriptor
         * to a file that is about to be unlinked.
         */
        if (seg == NULL)
            return;

        /* segment numbers holding the first and the last requested block */
        first_segno = blocknum / RELSEG_SIZE;
        last_segno = (blocknum + nblocks - 1) / RELSEG_SIZE;

        /* if the range leaves this segment, flush only up to its end */
        if (first_segno != last_segno)
            chunk = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));

        Assert(chunk >= 1);
        Assert(chunk <= nblocks);

        /* byte offset of blocknum inside the segment file */
        offset = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

        FileWriteback(seg->mdfd_vfd, offset, (off_t) BLCKSZ * chunk, WAIT_EVENT_DATA_FILE_FLUSH);

        blocknum += chunk;
        nblocks -= chunk;
    }
}

References _mdfd_getseg(), Assert(), EXTENSION_DONT_OPEN, FileWriteback(), IO_DIRECT_DATA, io_direct_flags, and _MdfdVec::mdfd_vfd.

◆ mdwritev()

void mdwritev	(	SMgrRelation	reln,
		ForkNumber	forknum,
		BlockNumber	blocknum,
		const void **	buffers,
		BlockNumber	nblocks,
		bool	skipFsync
	)

Definition at line 1049 of file md.c.

{
    /*
     * Write nblocks blocks starting at blocknum from the BLCKSZ-sized
     * buffers pointed to by buffers[], using vectored writes and retrying
     * after short writes.  Write failures are reported with ereport(ERROR).
     * Unless skipFsync is set or the relation is temporary, the segment
     * written to is registered as dirty afterwards.
     *
     * Although written as a loop over segments, a request that does not fit
     * entirely within one segment and within PG_IOV_MAX vectors is rejected
     * below, so a successful call performs a single pass.
     */

    /* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
    Assert((uint64) blocknum + (uint64) nblocks <= (uint64) mdnblocks(reln, forknum));
#endif

    while (nblocks > 0)
    {
        struct iovec iov[PG_IOV_MAX];
        int         iovcnt;
        off_t       seekpos;
        int         nbytes;
        MdfdVec    *v;
        BlockNumber nblocks_this_segment;
        size_t      transferred_this_segment;
        size_t      size_this_segment;

        v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
                         EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);

        /* Byte offset of blocknum within its segment file. */
        seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

        Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

        /* Clamp to the end of this segment, then to the iovec array size. */
        nblocks_this_segment =
            Min(nblocks,
                RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
        nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));

        /* Any clamping at all means the caller asked for too much at once. */
        if (nblocks_this_segment != nblocks)
            elog(ERROR, "write crosses segment boundary");

        /* buffers_to_iovec takes non-const pointers, hence the cast. */
        iovcnt = buffers_to_iovec(iov, (void **) buffers, nblocks_this_segment);
        size_this_segment = nblocks_this_segment * BLCKSZ;
        transferred_this_segment = 0;

        /*
         * Inner loop to continue after a short write.  If the reason is that
         * we're out of disk space, a future attempt should get an ENOSPC
         * error from the kernel.
         */
        for (;;)
        {
            TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
                                                 reln->smgr_rlocator.locator.spcOid,
                                                 reln->smgr_rlocator.locator.dbOid,
                                                 reln->smgr_rlocator.locator.relNumber,
                                                 reln->smgr_rlocator.backend);
            nbytes = FileWriteV(v->mdfd_vfd, iov, iovcnt, seekpos,
                                WAIT_EVENT_DATA_FILE_WRITE);
            TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
                                                reln->smgr_rlocator.locator.spcOid,
                                                reln->smgr_rlocator.locator.dbOid,
                                                reln->smgr_rlocator.locator.relNumber,
                                                reln->smgr_rlocator.backend,
                                                nbytes,
                                                size_this_segment - transferred_this_segment);

            /* Testing aid: pretend the write accepted at most 4096 bytes. */
#ifdef SIMULATE_SHORT_WRITE
            nbytes = Min(nbytes, 4096);
#endif

            if (nbytes < 0)
            {
                /*
                 * Test for ENOSPC up front; presumably errno could be
                 * disturbed while the report below is being built.
                 */
                bool        enospc = errno == ENOSPC;

                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not write blocks %u..%u in file \"%s\": %m",
                                blocknum,
                                blocknum + nblocks_this_segment - 1,
                                FilePathName(v->mdfd_vfd)),
                         enospc ? errhint("Check free disk space.") : 0));
            }

            /* One loop should usually be enough. */
            transferred_this_segment += nbytes;
            Assert(transferred_this_segment <= size_this_segment);
            if (transferred_this_segment == size_this_segment)
                break;

            /* Adjust position and iovecs after a short write. */
            seekpos += nbytes;
            iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
        }

        /* Have the segment synced later, unless the caller opted out. */
        if (!skipFsync && !SmgrIsTemp(reln))
            register_dirty_segment(reln, forknum, v);

        /* Advance past what was just handled. */
        nblocks -= nblocks_this_segment;
        buffers += nblocks_this_segment;
        blocknum += nblocks_this_segment;
    }
}

References _mdfd_getseg(), Assert(), RelFileLocatorBackend::backend, buffers_to_iovec(), compute_remaining_iovec(), RelFileLocator::dbOid, elog, ereport, errcode_for_file_access(), errhint(), errmsg(), ERROR, EXTENSION_CREATE_RECOVERY, EXTENSION_FAIL, FilePathName(), FileWriteV(), lengthof, RelFileLocatorBackend::locator, _MdfdVec::mdfd_vfd, mdnblocks(), Min, PG_IOV_MAX, register_dirty_segment(), RelFileLocator::relNumber, SMgrRelationData::smgr_rlocator, SmgrIsTemp, and RelFileLocator::spcOid.

◆ mdzeroextend()

void mdzeroextend	(	SMgrRelation	reln,
		ForkNumber	forknum,
		BlockNumber	blocknum,
		int	nblocks,
		bool	skipFsync
	)

Definition at line 542 of file md.c.

{
    BlockNumber nextblock = blocknum;   /* first block not yet zero-filled */
    int         blocks_left = nblocks;  /* blocks still to be added */

    Assert(nblocks > 0);

    /* Too costly for ordinary assert-enabled builds, hence its own symbol. */
#ifdef CHECK_WRITE_VS_EXTEND
    Assert(blocknum >= mdnblocks(reln, forknum));
#endif

    /*
     * A block numbered InvalidBlockNumber or above must never come into
     * existence, so reject any request whose end would reach that value.
     * The sum is widened to 64 bits before the comparison.
     */
    if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("cannot extend file \"%s\" beyond %u blocks",
                        relpath(reln->smgr_rlocator, forknum).str,
                        InvalidBlockNumber)));

    /* Work through the request one segment-bounded chunk at a time. */
    while (blocks_left > 0)
    {
        BlockNumber seg_offset = nextblock % ((BlockNumber) RELSEG_SIZE);
        off_t       start_pos = (off_t) BLCKSZ * seg_offset;
        off_t       chunk_bytes;
        int         chunk = blocks_left;
        int         rc;
        MdfdVec    *seg;

        /* Clamp the chunk so that it stops at the end of this segment. */
        if (seg_offset + blocks_left > RELSEG_SIZE)
            chunk = RELSEG_SIZE - seg_offset;

        seg = _mdfd_getseg(reln, forknum, nextblock, skipFsync, EXTENSION_CREATE);

        Assert(seg_offset < RELSEG_SIZE);
        Assert(seg_offset + chunk <= RELSEG_SIZE);

        chunk_bytes = (off_t) BLCKSZ * chunk;

        if (chunk > 8)
        {
            /*
             * Larger extensions go through FileFallocate() (i.e.
             * posix_fallocate() where available), which is commonly cheaper
             * than write() since the kernel usually need not allocate page
             * cache for the new pages.
             *
             * Small extensions are excluded because fallocate defeats
             * delayed allocation on some filesystems.  The cutoff of 8 is
             * empirical: values between 4 and 8 behaved acceptably in local
             * testing, and it is an open question where this decision
             * really belongs.
             */
            rc = FileFallocate(seg->mdfd_vfd,
                               start_pos, chunk_bytes,
                               WAIT_EVENT_DATA_FILE_EXTEND);
            if (rc != 0)
                ereport(ERROR,
                        errcode_for_file_access(),
                        errmsg("could not extend file \"%s\" with FileFallocate(): %m",
                               FilePathName(seg->mdfd_vfd)),
                        errhint("Check free disk space."));
        }
        else
        {
            /*
             * Without fallocate we can still do better than writing every
             * 8kB block separately: FileZero() (pg_pwrite_zeros() underneath)
             * relies on pg_pwritev_with_retry(), avoiding both multiple
             * writes and a zeroed buffer as long as the whole extension.
             */
            rc = FileZero(seg->mdfd_vfd,
                          start_pos, chunk_bytes,
                          WAIT_EVENT_DATA_FILE_EXTEND);
            if (rc < 0)
                ereport(ERROR,
                        errcode_for_file_access(),
                        errmsg("could not extend file \"%s\": %m",
                               FilePathName(seg->mdfd_vfd)),
                        errhint("Check free disk space."));
        }

        /* Queue an fsync unless the caller opted out or the relation is temp. */
        if (!(skipFsync || SmgrIsTemp(reln)))
            register_dirty_segment(reln, forknum, seg);

        Assert(_mdnblocks(reln, forknum, seg) <= ((BlockNumber) RELSEG_SIZE));

        blocks_left -= chunk;
        nextblock += chunk;
    }
}

References _mdfd_getseg(), _mdnblocks(), Assert(), ereport, errcode(), errcode_for_file_access(), errhint(), errmsg(), ERROR, EXTENSION_CREATE, FileFallocate(), FilePathName(), FileZero(), InvalidBlockNumber, _MdfdVec::mdfd_vfd, mdnblocks(), register_dirty_segment(), relpath, SMgrRelationData::smgr_rlocator, and SmgrIsTemp.

◆ register_dirty_segment()

static void register_dirty_segment	(	SMgrRelation	reln,
		ForkNumber	forknum,
		MdfdVec *	seg
	)

static

Definition at line 1494 of file md.c.

{
    FileTag     tag;
    instr_time  io_start;

    /* Temp relations should never be fsync'd */
    Assert(!SmgrIsTemp(reln));

    INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno);

    /* Normal case: the sync request was accepted, nothing more to do here. */
    if (RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
        return;

    /*
     * The request could not be handed off, so fall back to fsync'ing the
     * segment file ourselves, timing the operation for the I/O statistics.
     */
    ereport(DEBUG1,
            (errmsg_internal("could not forward fsync request because request queue is full")));

    io_start = pgstat_prepare_io_time(track_io_timing);

    if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
        ereport(data_sync_elevel(ERROR),
                (errcode_for_file_access(),
                 errmsg("could not fsync file \"%s\": %m",
                        FilePathName(seg->mdfd_vfd))));

    /*
     * The IOContext in effect (IOCONTEXT_NORMAL versus one of
     * IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM]) cannot be determined at this
     * point, so the fsync is always attributed to IOCONTEXT_NORMAL.  That
     * should be harmless: the count of backend fsyncs says nothing about how
     * well a BufferAccessStrategy works, and keeping all of them under one
     * context is likely easier to interpret when investigating backend
     * fsyncs.
     */
    pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
                            IOOP_FSYNC, io_start, 1, 0);
}

References Assert(), data_sync_elevel(), DEBUG1, ereport, errcode_for_file_access(), errmsg(), errmsg_internal(), ERROR, FilePathName(), FileSync(), INIT_MD_FILETAG, IOCONTEXT_NORMAL, IOOBJECT_RELATION, IOOP_FSYNC, RelFileLocatorBackend::locator, _MdfdVec::mdfd_segno, _MdfdVec::mdfd_vfd, pgstat_count_io_op_time(), pgstat_prepare_io_time(), RegisterSyncRequest(), SMgrRelationData::smgr_rlocator, SmgrIsTemp, SYNC_REQUEST, and track_io_timing.

Referenced by mdcreate(), mdextend(), mdregistersync(), mdtruncate(), mdwritev(), and mdzeroextend().

◆ register_forget_request()

static void register_forget_request	(	RelFileLocatorBackend	rlocator,
		ForkNumber	forknum,
		BlockNumber	segno
	)

static

Definition at line 1555 of file md.c.

{
    FileTag     forget_tag;

    /* Build the tag naming the segment (relation locator, fork, segment number). */
    INIT_MD_FILETAG(forget_tag, rlocator.locator, forknum, segno);

    /* Submit a SYNC_FORGET_REQUEST for it; the result is not inspected. */
    (void) RegisterSyncRequest(&forget_tag, SYNC_FORGET_REQUEST,
                               true /* retryOnError */ );
}

References INIT_MD_FILETAG, RelFileLocatorBackend::locator, RegisterSyncRequest(), and SYNC_FORGET_REQUEST.

Referenced by mdunlinkfork().

◆ register_unlink_segment()

static void register_unlink_segment	(	RelFileLocatorBackend	rlocator,
		ForkNumber	forknum,
		BlockNumber	segno
	)

static

Definition at line 1538 of file md.c.

{
    FileTag     unlink_tag;

    /* Should never be used with temp relations */
    Assert(!RelFileLocatorBackendIsTemp(rlocator));

    /* Build the tag naming the segment (relation locator, fork, segment number). */
    INIT_MD_FILETAG(unlink_tag, rlocator.locator, forknum, segno);

    /* Submit a SYNC_UNLINK_REQUEST for it; the result is not inspected. */
    (void) RegisterSyncRequest(&unlink_tag, SYNC_UNLINK_REQUEST,
                               true /* retryOnError */ );
}

References Assert(), INIT_MD_FILETAG, RelFileLocatorBackend::locator, RegisterSyncRequest(), RelFileLocatorBackendIsTemp, and SYNC_UNLINK_REQUEST.

Referenced by mdunlinkfork().

Variable Documentation

◆ aio_md_readv_cb

const PgAioHandleCallbacks aio_md_readv_cb

Initial value:

= {
    .complete_shared = md_readv_complete,
    .report = md_readv_report,
}

Definition at line 159 of file md.c.

◆ MdCxt

MemoryContext MdCxt

static

Definition at line 87 of file md.c.

Referenced by _fdvec_resize(), and mdinit().

Data Structures

Macros

Typedefs

Functions

Variables

Macro Definition Documentation

◆ EXTENSION_CREATE

◆ EXTENSION_CREATE_RECOVERY

◆ EXTENSION_DONT_OPEN

◆ EXTENSION_FAIL

◆ EXTENSION_RETURN_NULL

◆ INIT_MD_FILETAG

◆ MD_PATH_STR_MAXLEN

◆ SEGMENT_CHARS

Typedef Documentation

◆ MdfdVec

◆ MdPathStr

Function Documentation

◆ _fdvec_resize()

◆ _mdfd_getseg()

◆ _mdfd_open_flags()

◆ _mdfd_openseg()

◆ _mdfd_segpath()

◆ _mdnblocks()

◆ buffers_to_iovec()

◆ do_truncate()

◆ DropRelationFiles()

◆ ForgetDatabaseSyncRequests()

◆ md_readv_complete()

◆ md_readv_report()

◆ mdclose()

◆ mdcreate()

◆ mdexists()

◆ mdextend()

◆ mdfd()

◆ mdfiletagmatches()

◆ mdimmedsync()

◆ mdinit()

◆ mdmaxcombine()

◆ mdnblocks()

◆ mdopen()

◆ mdopenfork()

◆ mdprefetch()

◆ mdreadv()

◆ mdregistersync()

◆ mdstartreadv()

◆ mdsyncfiletag()

◆ mdtruncate()

◆ mdunlink()

◆ mdunlinkfiletag()

◆ mdunlinkfork()

◆ mdwriteback()

◆ mdwritev()

◆ mdzeroextend()

◆ register_dirty_segment()

◆ register_forget_request()

◆ register_unlink_segment()

Variable Documentation

◆ aio_md_readv_cb

◆ MdCxt