PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
md.c File Reference
#include "postgres.h"
#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>
#include "access/xlogutils.h"
#include "commands/tablespace.h"
#include "common/file_utils.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "storage/aio.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/md.h"
#include "storage/relfilelocator.h"
#include "storage/smgr.h"
#include "storage/sync.h"
#include "utils/memutils.h"
Include dependency graph for md.c:

Go to the source code of this file.

Data Structures

struct  _MdfdVec
 
struct  MdPathStr
 

Macros

#define INIT_MD_FILETAG(a, xx_rlocator, xx_forknum, xx_segno)
 
#define EXTENSION_FAIL   (1 << 0)
 
#define EXTENSION_RETURN_NULL   (1 << 1)
 
#define EXTENSION_CREATE   (1 << 2)
 
#define EXTENSION_CREATE_RECOVERY   (1 << 3)
 
#define EXTENSION_DONT_OPEN   (1 << 5)
 
#define SEGMENT_CHARS   OIDCHARS
 
#define MD_PATH_STR_MAXLEN
 

Typedefs

typedef struct _MdfdVec MdfdVec
 
typedef struct MdPathStr MdPathStr
 

Functions

static void mdunlinkfork (RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
 
static MdfdVec * mdopenfork (SMgrRelation reln, ForkNumber forknum, int behavior)
 
static void register_dirty_segment (SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 
static void register_unlink_segment (RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
 
static void register_forget_request (RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
 
static void _fdvec_resize (SMgrRelation reln, ForkNumber forknum, int nseg)
 
static MdPathStr _mdfd_segpath (SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
 
static MdfdVec * _mdfd_openseg (SMgrRelation reln, ForkNumber forknum, BlockNumber segno, int oflags)
 
static MdfdVec * _mdfd_getseg (SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior)
 
static BlockNumber _mdnblocks (SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 
static PgAioResult md_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 
static void md_readv_report (PgAioResult result, const PgAioTargetData *td, int elevel)
 
static int _mdfd_open_flags (void)
 
void mdinit (void)
 
bool mdexists (SMgrRelation reln, ForkNumber forknum)
 
void mdcreate (SMgrRelation reln, ForkNumber forknum, bool isRedo)
 
void mdunlink (RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
 
static int do_truncate (const char *path)
 
void mdextend (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
 
void mdzeroextend (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
 
void mdopen (SMgrRelation reln)
 
void mdclose (SMgrRelation reln, ForkNumber forknum)
 
bool mdprefetch (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
 
static int buffers_to_iovec (struct iovec *iov, void **buffers, int nblocks)
 
uint32 mdmaxcombine (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 
void mdreadv (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
 
void mdstartreadv (PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
 
void mdwritev (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync)
 
void mdwriteback (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
 
BlockNumber mdnblocks (SMgrRelation reln, ForkNumber forknum)
 
void mdtruncate (SMgrRelation reln, ForkNumber forknum, BlockNumber curnblk, BlockNumber nblocks)
 
void mdregistersync (SMgrRelation reln, ForkNumber forknum)
 
void mdimmedsync (SMgrRelation reln, ForkNumber forknum)
 
int mdfd (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
 
void ForgetDatabaseSyncRequests (Oid dbid)
 
void DropRelationFiles (RelFileLocator *delrels, int ndelrels, bool isRedo)
 
int mdsyncfiletag (const FileTag *ftag, char *path)
 
int mdunlinkfiletag (const FileTag *ftag, char *path)
 
bool mdfiletagmatches (const FileTag *ftag, const FileTag *candidate)
 

Variables

static MemoryContext MdCxt
 
const PgAioHandleCallbacks aio_md_readv_cb
 

Macro Definition Documentation

◆ EXTENSION_CREATE

#define EXTENSION_CREATE   (1 << 2)

Definition at line 107 of file md.c.

◆ EXTENSION_CREATE_RECOVERY

#define EXTENSION_CREATE_RECOVERY   (1 << 3)

Definition at line 109 of file md.c.

◆ EXTENSION_DONT_OPEN

#define EXTENSION_DONT_OPEN   (1 << 5)

Definition at line 111 of file md.c.

◆ EXTENSION_FAIL

#define EXTENSION_FAIL   (1 << 0)

Definition at line 103 of file md.c.

◆ EXTENSION_RETURN_NULL

#define EXTENSION_RETURN_NULL   (1 << 1)

Definition at line 105 of file md.c.

◆ INIT_MD_FILETAG

#define INIT_MD_FILETAG (   a,
  xx_rlocator,
  xx_forknum,
  xx_segno 
)
Value:
( \
memset(&(a), 0, sizeof(FileTag)), \
(a).handler = SYNC_HANDLER_MD, \
(a).rlocator = (xx_rlocator), \
(a).forknum = (xx_forknum), \
(a).segno = (xx_segno) \
)
int a
Definition: isn.c:73
Definition: sync.h:51
@ SYNC_HANDLER_MD
Definition: sync.h:37

Definition at line 91 of file md.c.

◆ MD_PATH_STR_MAXLEN

#define MD_PATH_STR_MAXLEN
Value:
(\
REL_PATH_STR_MAXLEN \
+ sizeof((char)'.') \
+ SEGMENT_CHARS \
)
#define SEGMENT_CHARS
Definition: md.c:121

Definition at line 122 of file md.c.

◆ SEGMENT_CHARS

#define SEGMENT_CHARS   OIDCHARS

Definition at line 121 of file md.c.

Typedef Documentation

◆ MdfdVec

typedef struct _MdfdVec MdfdVec

◆ MdPathStr

typedef struct MdPathStr MdPathStr

Function Documentation

◆ _fdvec_resize()

static void _fdvec_resize ( SMgrRelation  reln,
ForkNumber  forknum,
int  nseg 
)
static

Definition at line 1619 of file md.c.

1622{
1623 if (nseg == 0)
1624 {
1625 if (reln->md_num_open_segs[forknum] > 0)
1626 {
1627 pfree(reln->md_seg_fds[forknum]);
1628 reln->md_seg_fds[forknum] = NULL;
1629 }
1630 }
1631 else if (reln->md_num_open_segs[forknum] == 0)
1632 {
1633 reln->md_seg_fds[forknum] =
1634 MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
1635 }
1636 else if (nseg > reln->md_num_open_segs[forknum])
1637 {
1638 /*
1639 * It doesn't seem worthwhile complicating the code to amortize
1640 * repalloc() calls. Those are far faster than PathNameOpenFile() or
1641 * FileClose(), and the memory context internally will sometimes avoid
1642 * doing an actual reallocation.
1643 */
1644 reln->md_seg_fds[forknum] =
1645 repalloc(reln->md_seg_fds[forknum],
1646 sizeof(MdfdVec) * nseg);
1647 }
1648 else
1649 {
1650 /*
1651 * We don't reallocate a smaller array, because we want mdtruncate()
1652 * to be able to promise that it won't allocate memory, so that it is
1653 * allowed in a critical section. This means that a bit of space in
1654 * the array is now wasted, until the next time we add a segment and
1655 * reallocate.
1656 */
1657 }
1658
1659 reln->md_num_open_segs[forknum] = nseg;
1660}
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1256
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:2167
void pfree(void *pointer)
Definition: mcxt.c:2147
static MemoryContext MdCxt
Definition: md.c:87
int md_num_open_segs[MAX_FORKNUM+1]
Definition: smgr.h:61
struct _MdfdVec * md_seg_fds[MAX_FORKNUM+1]
Definition: smgr.h:62
Definition: md.c:82

References SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, MdCxt, MemoryContextAlloc(), pfree(), and repalloc().

Referenced by _mdfd_openseg(), mdclose(), mdcreate(), mdimmedsync(), mdopenfork(), mdregistersync(), and mdtruncate().

◆ _mdfd_getseg()

static MdfdVec * _mdfd_getseg ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blkno,
bool  skipFsync,
int  behavior 
)
static

Definition at line 1730 of file md.c.

1732{
1733 MdfdVec *v;
1734 BlockNumber targetseg;
1735 BlockNumber nextsegno;
1736
1737 /* some way to handle non-existent segments needs to be specified */
1738 Assert(behavior &
1739 (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL |
1740 EXTENSION_DONT_OPEN));
1741
1742 targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1743
1744 /* if an existing and opened segment, we're done */
1745 if (targetseg < reln->md_num_open_segs[forknum])
1746 {
1747 v = &reln->md_seg_fds[forknum][targetseg];
1748 return v;
1749 }
1750
1751 /* The caller only wants the segment if we already had it open. */
1752 if (behavior & EXTENSION_DONT_OPEN)
1753 return NULL;
1754
1755 /*
1756 * The target segment is not yet open. Iterate over all the segments
1757 * between the last opened and the target segment. This way missing
1758 * segments either raise an error, or get created (according to
1759 * 'behavior'). Start with either the last opened, or the first segment if
1760 * none was opened before.
1761 */
1762 if (reln->md_num_open_segs[forknum] > 0)
1763 v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
1764 else
1765 {
1766 v = mdopenfork(reln, forknum, behavior);
1767 if (!v)
1768 return NULL; /* if behavior & EXTENSION_RETURN_NULL */
1769 }
1770
1771 for (nextsegno = reln->md_num_open_segs[forknum];
1772 nextsegno <= targetseg; nextsegno++)
1773 {
1774 BlockNumber nblocks = _mdnblocks(reln, forknum, v);
1775 int flags = 0;
1776
1777 Assert(nextsegno == v->mdfd_segno + 1);
1778
1779 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1780 elog(FATAL, "segment too big");
1781
1782 if ((behavior & EXTENSION_CREATE) ||
1783 (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
1784 {
1785 /*
1786 * Normally we will create new segments only if authorized by the
1787 * caller (i.e., we are doing mdextend()). But when doing WAL
1788 * recovery, create segments anyway; this allows cases such as
1789 * replaying WAL data that has a write into a high-numbered
1790 * segment of a relation that was later deleted. We want to go
1791 * ahead and create the segments so we can finish out the replay.
1792 *
1793 * We have to maintain the invariant that segments before the last
1794 * active segment are of size RELSEG_SIZE; therefore, if
1795 * extending, pad them out with zeroes if needed. (This only
1796 * matters if in recovery, or if the caller is extending the
1797 * relation discontiguously, but that can happen in hash indexes.)
1798 */
1799 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1800 {
1801 char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
1802 MCXT_ALLOC_ZERO);
1803
1804 mdextend(reln, forknum,
1805 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1806 zerobuf, skipFsync);
1807 pfree(zerobuf);
1808 }
1809 flags = O_CREAT;
1810 }
1811 else if (nblocks < ((BlockNumber) RELSEG_SIZE))
1812 {
1813 /*
1814 * When not extending, only open the next segment if the current
1815 * one is exactly RELSEG_SIZE. If not (this branch), either
1816 * return NULL or fail.
1817 */
1818 if (behavior & EXTENSION_RETURN_NULL)
1819 {
1820 /*
1821 * Some callers discern between reasons for _mdfd_getseg()
1822 * returning NULL based on errno. As there's no failing
1823 * syscall involved in this case, explicitly set errno to
1824 * ENOENT, as that seems the closest interpretation.
1825 */
1826 errno = ENOENT;
1827 return NULL;
1828 }
1829
1830 ereport(ERROR,
1831 (errcode_for_file_access(),
1832 errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
1833 _mdfd_segpath(reln, forknum, nextsegno).str,
1834 blkno, nblocks)));
1835 }
1836
1837 v = _mdfd_openseg(reln, forknum, nextsegno, flags);
1838
1839 if (v == NULL)
1840 {
1841 if ((behavior & EXTENSION_RETURN_NULL) &&
1842 FILE_POSSIBLY_DELETED(errno))
1843 return NULL;
1844 ereport(ERROR,
1845 (errcode_for_file_access(),
1846 errmsg("could not open file \"%s\" (target block %u): %m",
1847 _mdfd_segpath(reln, forknum, nextsegno).str,
1848 blkno)));
1849 }
1850 }
1851
1852 return v;
1853}
uint32 BlockNumber
Definition: block.h:31
int errcode_for_file_access(void)
Definition: elog.c:877
int errmsg(const char *fmt,...)
Definition: elog.c:1071
#define FATAL
Definition: elog.h:41
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:149
#define FILE_POSSIBLY_DELETED(err)
Definition: fd.h:78
#define MCXT_ALLOC_ZERO
Definition: fe_memutils.h:30
Assert(PointerIsAligned(start, uint64))
const char * str
void * palloc_aligned(Size size, Size alignto, int flags)
Definition: mcxt.c:2137
#define EXTENSION_CREATE_RECOVERY
Definition: md.c:109
static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
Definition: md.c:1859
static MdPathStr _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
Definition: md.c:1667
#define EXTENSION_DONT_OPEN
Definition: md.c:111
void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: md.c:477
static MdfdVec * _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, int oflags)
Definition: md.c:1687
#define EXTENSION_RETURN_NULL
Definition: md.c:105
#define EXTENSION_CREATE
Definition: md.c:107
#define EXTENSION_FAIL
Definition: md.c:103
static MdfdVec * mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
Definition: md.c:654
#define PG_IO_ALIGN_SIZE
BlockNumber mdfd_segno
Definition: md.c:84
bool InRecovery
Definition: xlogutils.c:50

References _mdfd_openseg(), _mdfd_segpath(), _mdnblocks(), Assert(), elog, ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_CREATE, EXTENSION_CREATE_RECOVERY, EXTENSION_DONT_OPEN, EXTENSION_FAIL, EXTENSION_RETURN_NULL, FATAL, FILE_POSSIBLY_DELETED, InRecovery, MCXT_ALLOC_ZERO, SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, mdextend(), _MdfdVec::mdfd_segno, mdopenfork(), palloc_aligned(), pfree(), PG_IO_ALIGN_SIZE, and str.

Referenced by mdextend(), mdfd(), mdprefetch(), mdreadv(), mdstartreadv(), mdwriteback(), mdwritev(), and mdzeroextend().

◆ _mdfd_open_flags()

static int _mdfd_open_flags ( void  )
inline static

Definition at line 166 of file md.c.

167{
168 int flags = O_RDWR | PG_BINARY;
169
170 if (io_direct_flags & IO_DIRECT_DATA)
171 flags |= PG_O_DIRECT;
172
173 return flags;
174}
#define PG_BINARY
Definition: c.h:1244
int io_direct_flags
Definition: fd.c:168
#define IO_DIRECT_DATA
Definition: fd.h:54
#define PG_O_DIRECT
Definition: fd.h:97

References IO_DIRECT_DATA, io_direct_flags, PG_BINARY, and PG_O_DIRECT.

Referenced by _mdfd_openseg(), mdcreate(), mdopenfork(), and mdsyncfiletag().

◆ _mdfd_openseg()

static MdfdVec * _mdfd_openseg ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  segno,
int  oflags 
)
static

Definition at line 1687 of file md.c.

1689{
1690 MdfdVec *v;
1691 File fd;
1692 MdPathStr fullpath;
1693
1694 fullpath = _mdfd_segpath(reln, forknum, segno);
1695
1696 /* open the file */
1697 fd = PathNameOpenFile(fullpath.str, _mdfd_open_flags() | oflags);
1698
1699 if (fd < 0)
1700 return NULL;
1701
1702 /*
1703 * Segments are always opened in order from lowest to highest, so we must
1704 * be adding a new one at the end.
1705 */
1706 Assert(segno == reln->md_num_open_segs[forknum]);
1707
1708 _fdvec_resize(reln, forknum, segno + 1);
1709
1710 /* fill the entry */
1711 v = &reln->md_seg_fds[forknum][segno];
1712 v->mdfd_vfd = fd;
1713 v->mdfd_segno = segno;
1714
1715 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1716
1717 /* all done */
1718 return v;
1719}
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1579
int File
Definition: fd.h:51
static int _mdfd_open_flags(void)
Definition: md.c:166
static void _fdvec_resize(SMgrRelation reln, ForkNumber forknum, int nseg)
Definition: md.c:1619
static int fd(const char *x, int i)
Definition: preproc-init.c:105
Definition: md.c:129
char str[MD_PATH_STR_MAXLEN+1]
Definition: md.c:130
File mdfd_vfd
Definition: md.c:83

References _fdvec_resize(), _mdfd_open_flags(), _mdfd_segpath(), _mdnblocks(), Assert(), fd(), SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, _MdfdVec::mdfd_segno, _MdfdVec::mdfd_vfd, PathNameOpenFile(), and MdPathStr::str.

Referenced by _mdfd_getseg(), mdimmedsync(), mdnblocks(), and mdregistersync().

◆ _mdfd_segpath()

static MdPathStr _mdfd_segpath ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  segno 
)
static

Definition at line 1667 of file md.c.

1668{
1669 RelPathStr path;
1670 MdPathStr fullpath;
1671
1672 path = relpath(reln->smgr_rlocator, forknum);
1673
1674 if (segno > 0)
1675 sprintf(fullpath.str, "%s.%u", path.str, segno);
1676 else
1677 strcpy(fullpath.str, path.str);
1678
1679 return fullpath;
1680}
#define sprintf
Definition: port.h:241
#define relpath(rlocator, forknum)
Definition: relpath.h:150
char str[REL_PATH_STR_MAXLEN+1]
Definition: relpath.h:123
RelFileLocatorBackend smgr_rlocator
Definition: smgr.h:38

References relpath, SMgrRelationData::smgr_rlocator, sprintf, MdPathStr::str, and RelPathStr::str.

Referenced by _mdfd_getseg(), _mdfd_openseg(), and mdsyncfiletag().

◆ _mdnblocks()

static BlockNumber _mdnblocks ( SMgrRelation  reln,
ForkNumber  forknum,
MdfdVec * seg 
)
static

Definition at line 1859 of file md.c.

1860{
1861 off_t len;
1862
1863 len = FileSize(seg->mdfd_vfd);
1864 if (len < 0)
1865 ereport(ERROR,
1866 (errcode_for_file_access(),
1867 errmsg("could not seek to end of file \"%s\": %m",
1868 FilePathName(seg->mdfd_vfd))));
1869 /* note that this calculation will ignore any partial block at EOF */
1870 return (BlockNumber) (len / BLCKSZ);
1871}
char * FilePathName(File file)
Definition: fd.c:2516
off_t FileSize(File file)
Definition: fd.c:2464
const void size_t len

References ereport, errcode_for_file_access(), errmsg(), ERROR, FilePathName(), FileSize(), len, and _MdfdVec::mdfd_vfd.

Referenced by _mdfd_getseg(), _mdfd_openseg(), mdextend(), mdnblocks(), mdopenfork(), and mdzeroextend().

◆ buffers_to_iovec()

static int buffers_to_iovec ( struct iovec *  iov,
void **  buffers,
int  nblocks 
)
static

Definition at line 774 of file md.c.

775{
776 struct iovec *iovp;
777 int iovcnt;
778
779 Assert(nblocks >= 1);
780
781 /* If this build supports direct I/O, buffers must be I/O aligned. */
782 for (int i = 0; i < nblocks; ++i)
783 {
784 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
785 Assert((uintptr_t) buffers[i] ==
786 TYPEALIGN(PG_IO_ALIGN_SIZE, buffers[i]));
787 }
788
789 /* Start the first iovec off with the first buffer. */
790 iovp = &iov[0];
791 iovp->iov_base = buffers[0];
792 iovp->iov_len = BLCKSZ;
793 iovcnt = 1;
794
795 /* Try to merge the rest. */
796 for (int i = 1; i < nblocks; ++i)
797 {
798 void *buffer = buffers[i];
799
800 if (((char *) iovp->iov_base + iovp->iov_len) == buffer)
801 {
802 /* Contiguous with the last iovec. */
803 iovp->iov_len += BLCKSZ;
804 }
805 else
806 {
807 /* Need a new iovec. */
808 iovp++;
809 iovp->iov_base = buffer;
810 iovp->iov_len = BLCKSZ;
811 iovcnt++;
812 }
813 }
814
815 return iovcnt;
816}
#define TYPEALIGN(ALIGNVAL, LEN)
Definition: c.h:775
int i
Definition: isn.c:77

References Assert(), i, PG_IO_ALIGN_SIZE, PG_O_DIRECT, and TYPEALIGN.

Referenced by mdreadv(), mdstartreadv(), and mdwritev().

◆ do_truncate()

static int do_truncate ( const char *  path)
static

Definition at line 343 of file md.c.

344{
345 int save_errno;
346 int ret;
347
348 ret = pg_truncate(path, 0);
349
350 /* Log a warning here to avoid repetition in callers. */
351 if (ret < 0 && errno != ENOENT)
352 {
353 save_errno = errno;
354 ereport(WARNING,
355 (errcode_for_file_access(),
356 errmsg("could not truncate file \"%s\": %m", path)));
357 errno = save_errno;
358 }
359
360 return ret;
361}
#define WARNING
Definition: elog.h:36
int pg_truncate(const char *path, off_t length)
Definition: fd.c:720

References ereport, errcode_for_file_access(), errmsg(), pg_truncate(), and WARNING.

Referenced by mdunlinkfork().

◆ DropRelationFiles()

void DropRelationFiles ( RelFileLocator * delrels,
int  ndelrels,
bool  isRedo 
)

Definition at line 1587 of file md.c.

1588{
1589 SMgrRelation *srels;
1590 int i;
1591
1592 srels = palloc(sizeof(SMgrRelation) * ndelrels);
1593 for (i = 0; i < ndelrels; i++)
1594 {
1595 SMgrRelation srel = smgropen(delrels[i], INVALID_PROC_NUMBER);
1596
1597 if (isRedo)
1598 {
1599 ForkNumber fork;
1600
1601 for (fork = 0; fork <= MAX_FORKNUM; fork++)
1602 XLogDropRelation(delrels[i], fork);
1603 }
1604 srels[i] = srel;
1605 }
1606
1607 smgrdounlinkall(srels, ndelrels, isRedo);
1608
1609 for (i = 0; i < ndelrels; i++)
1610 smgrclose(srels[i]);
1611 pfree(srels);
1612}
void * palloc(Size size)
Definition: mcxt.c:1940
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
ForkNumber
Definition: relpath.h:56
#define MAX_FORKNUM
Definition: relpath.h:70
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition: smgr.c:240
void smgrclose(SMgrRelation reln)
Definition: smgr.c:374
void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
Definition: smgr.c:538
void XLogDropRelation(RelFileLocator rlocator, ForkNumber forknum)
Definition: xlogutils.c:630

References i, INVALID_PROC_NUMBER, MAX_FORKNUM, palloc(), pfree(), smgrclose(), smgrdounlinkall(), smgropen(), and XLogDropRelation().

Referenced by FinishPreparedTransaction(), xact_redo_abort(), and xact_redo_commit().

◆ ForgetDatabaseSyncRequests()

void ForgetDatabaseSyncRequests ( Oid  dbid)

Definition at line 1569 of file md.c.

1570{
1571 FileTag tag;
1572 RelFileLocator rlocator;
1573
1574 rlocator.dbOid = dbid;
1575 rlocator.spcOid = 0;
1576 rlocator.relNumber = 0;
1577
1578 INIT_MD_FILETAG(tag, rlocator, InvalidForkNumber, InvalidBlockNumber);
1579
1580 RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
1581}
#define InvalidBlockNumber
Definition: block.h:33
#define INIT_MD_FILETAG(a, xx_rlocator, xx_forknum, xx_segno)
Definition: md.c:91
@ InvalidForkNumber
Definition: relpath.h:57
RelFileNumber relNumber
bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, bool retryOnError)
Definition: sync.c:580
@ SYNC_FILTER_REQUEST
Definition: sync.h:28

References RelFileLocator::dbOid, INIT_MD_FILETAG, InvalidBlockNumber, InvalidForkNumber, RegisterSyncRequest(), RelFileLocator::relNumber, RelFileLocator::spcOid, and SYNC_FILTER_REQUEST.

Referenced by createdb_failure_callback(), dbase_redo(), and dropdb().

◆ md_readv_complete()

static PgAioResult md_readv_complete ( PgAioHandle * ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

Definition at line 1965 of file md.c.

1966{
1967 PgAioTargetData *td = pgaio_io_get_target_data(ioh);
1968 PgAioResult result = prior_result;
1969
1970 if (prior_result.result < 0)
1971 {
1972 result.status = PGAIO_RS_ERROR;
1973 result.id = PGAIO_HCB_MD_READV;
1974 /* For "hard" errors, track the error number in error_data */
1975 result.error_data = -prior_result.result;
1976 result.result = 0;
1977
1978 /*
1979 * Immediately log a message about the IO error, but only to the
1980 * server log. The reason to do so immediately is that the originator
1981 * might not process the query result immediately (because it is busy
1982 * doing another part of query processing) or at all (e.g. if it was
1983 * cancelled or errored out due to another IO also failing). The
1984 * definer of the IO will emit an ERROR when processing the IO's
1985 * results
1986 */
1987 pgaio_result_report(result, td, LOG_SERVER_ONLY);
1988
1989 return result;
1990 }
1991
1992 /*
1993 * As explained above smgrstartreadv(), the smgr API operates on the level
1994 * of blocks, rather than bytes. Convert.
1995 */
1996 result.result /= BLCKSZ;
1997
1998 Assert(result.result <= td->smgr.nblocks);
1999
2000 if (result.result == 0)
2001 {
2002 /* consider 0 blocks read a failure */
2003 result.status = PGAIO_RS_ERROR;
2004 result.id = PGAIO_HCB_MD_READV;
2005 result.error_data = 0;
2006
2007 /* see comment above the "hard error" case */
2008 pgaio_result_report(result, td, LOG_SERVER_ONLY);
2009
2010 return result;
2011 }
2012
2013 if (result.status != PGAIO_RS_ERROR &&
2014 result.result < td->smgr.nblocks)
2015 {
2016 /* partial reads should be retried at upper level */
2017 result.status = PGAIO_RS_PARTIAL;
2018 result.id = PGAIO_HCB_MD_READV;
2019 }
2020
2021 return result;
2022}
@ PGAIO_HCB_MD_READV
Definition: aio.h:196
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
Definition: aio_callback.c:171
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition: aio_target.c:72
@ PGAIO_RS_PARTIAL
Definition: aio_types.h:82
@ PGAIO_RS_ERROR
Definition: aio_types.h:84
#define LOG_SERVER_ONLY
Definition: elog.h:32
uint32 status
Definition: aio_types.h:108
uint32 error_data
Definition: aio_types.h:111
int32 result
Definition: aio_types.h:113
uint32 id
Definition: aio_types.h:105
BlockNumber nblocks
Definition: aio_types.h:67
struct PgAioTargetData::@124 smgr

References Assert(), PgAioResult::error_data, PgAioResult::id, LOG_SERVER_ONLY, PgAioTargetData::nblocks, PGAIO_HCB_MD_READV, pgaio_io_get_target_data(), pgaio_result_report(), PGAIO_RS_ERROR, PGAIO_RS_PARTIAL, PgAioResult::result, PgAioTargetData::smgr, and PgAioResult::status.

◆ md_readv_report()

static void md_readv_report ( PgAioResult  result,
const PgAioTargetData * td,
int  elevel 
)
static

Definition at line 2032 of file md.c.

2033{
2034 RelPathStr path;
2035
2036 path = relpathbackend(td->smgr.rlocator,
2037 td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER,
2038 td->smgr.forkNum);
2039
2040 if (result.error_data != 0)
2041 {
2042 /* for errcode_for_file_access() and %m */
2043 errno = result.error_data;
2044
2045 ereport(elevel,
2046 errcode_for_file_access(),
2047 errmsg("could not read blocks %u..%u in file \"%s\": %m",
2048 td->smgr.blockNum,
2049 td->smgr.blockNum + td->smgr.nblocks - 1,
2050 path.str));
2051 }
2052 else
2053 {
2054 /*
2055 * NB: This will typically only be output in debug messages, while
2056 * retrying a partial IO.
2057 */
2058 ereport(elevel,
2059 errcode(ERRCODE_DATA_CORRUPTED),
2060 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
2061 td->smgr.blockNum,
2062 td->smgr.blockNum + td->smgr.nblocks - 1,
2063 path.str,
2064 result.result * (size_t) BLCKSZ,
2065 td->smgr.nblocks * (size_t) BLCKSZ));
2066 }
2067}
int errcode(int sqlerrcode)
Definition: elog.c:854
ProcNumber MyProcNumber
Definition: globals.c:91
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
#define relpathbackend(rlocator, backend, forknum)
Definition: relpath.h:141
BlockNumber blockNum
Definition: aio_types.h:66
RelFileLocator rlocator
Definition: aio_types.h:65
ForkNumber forkNum
Definition: aio_types.h:68

References PgAioTargetData::blockNum, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errcode_for_file_access(), errmsg(), PgAioResult::error_data, PgAioTargetData::forkNum, INVALID_PROC_NUMBER, PgAioTargetData::is_temp, MyProcNumber, PgAioTargetData::nblocks, relpathbackend, PgAioResult::result, PgAioTargetData::rlocator, PgAioTargetData::smgr, and RelPathStr::str.

◆ mdclose()

void mdclose ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 703 of file md.c.

704{
705 int nopensegs = reln->md_num_open_segs[forknum];
706
707 /* No work if already closed */
708 if (nopensegs == 0)
709 return;
710
711 /* close segments starting from the end */
712 while (nopensegs > 0)
713 {
714 MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1];
715
716 FileClose(v->mdfd_vfd);
717 _fdvec_resize(reln, forknum, nopensegs - 1);
718 nopensegs--;
719 }
720}
void FileClose(File file)
Definition: fd.c:1982

References _fdvec_resize(), FileClose(), SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, and _MdfdVec::mdfd_vfd.

Referenced by mdexists().

◆ mdcreate()

void mdcreate ( SMgrRelation  reln,
ForkNumber  forknum,
bool  isRedo 
)

Definition at line 212 of file md.c.

213{
214 MdfdVec *mdfd;
215 RelPathStr path;
216 File fd;
217
218 if (isRedo && reln->md_num_open_segs[forknum] > 0)
219 return; /* created and opened already... */
220
221 Assert(reln->md_num_open_segs[forknum] == 0);
222
223 /*
224 * We may be using the target table space for the first time in this
225 * database, so create a per-database subdirectory if needed.
226 *
227 * XXX this is a fairly ugly violation of module layering, but this seems
228 * to be the best place to put the check. Maybe TablespaceCreateDbspace
229 * should be here and not in commands/tablespace.c? But that would imply
230 * importing a lot of stuff that smgr.c oughtn't know, either.
231 */
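232 TablespaceCreateDbspace(reln->smgr_rlocator.locator.spcOid,
233 reln->smgr_rlocator.locator.dbOid,
234 isRedo);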
235
236 path = relpath(reln->smgr_rlocator, forknum);
237
238 fd = PathNameOpenFile(path.str, _mdfd_open_flags() | O_CREAT | O_EXCL);
239
240 if (fd < 0)
241 {
242 int save_errno = errno;
243
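244 if (isRedo)
245 fd = PathNameOpenFile(path.str, _mdfd_open_flags());
246 if (fd < 0)
247 {
248 /* be sure to report the error reported by create, not open */
249 errno = save_errno;
250 ereport(ERROR,
251 (errcode_for_file_access(),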
252 errmsg("could not create file \"%s\": %m", path.str)));
253 }
254 }
255
256 _fdvec_resize(reln, forknum, 1);
257 mdfd = &reln->md_seg_fds[forknum][0];
258 mdfd->mdfd_vfd = fd;
259 mdfd->mdfd_segno = 0;
260
261 if (!SmgrIsTemp(reln))
262 register_dirty_segment(reln, forknum, mdfd);
263}
void TablespaceCreateDbspace(Oid spcOid, Oid dbOid, bool isRedo)
Definition: tablespace.c:112
int mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
Definition: md.c:1470
static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
Definition: md.c:1494
#define SmgrIsTemp(smgr)
Definition: smgr.h:74
RelFileLocator locator

References _fdvec_resize(), _mdfd_open_flags(), Assert(), RelFileLocator::dbOid, ereport, errcode_for_file_access(), errmsg(), ERROR, fd(), RelFileLocatorBackend::locator, SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, mdfd(), PathNameOpenFile(), register_dirty_segment(), relpath, SMgrRelationData::smgr_rlocator, SmgrIsTemp, RelFileLocator::spcOid, RelPathStr::str, and TablespaceCreateDbspace().

◆ mdexists()

bool mdexists ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 193 of file md.c.

194{
195 /*
196 * Close it first, to ensure that we notice if the fork has been unlinked
197 * since we opened it. As an optimization, we can skip that in recovery,
198 * which already closes relations when dropping them.
199 */
200 if (!InRecovery)
201 mdclose(reln, forknum);
202
203 return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL);
204}
void mdclose(SMgrRelation reln, ForkNumber forknum)
Definition: md.c:703

References EXTENSION_RETURN_NULL, InRecovery, mdclose(), and mdopenfork().

◆ mdextend()

void mdextend ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
const void *  buffer,
bool  skipFsync 
)

Definition at line 477 of file md.c.

479{
480 off_t seekpos;
481 int nbytes;
482 MdfdVec *v;
483
484 /* If this build supports direct I/O, the buffer must be I/O aligned. */
485 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
486 Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
487
488 /* This assert is too expensive to have on normally ... */
489#ifdef CHECK_WRITE_VS_EXTEND
490 Assert(blocknum >= mdnblocks(reln, forknum));
491#endif
492
493 /*
494 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
495 * more --- we mustn't create a block whose number actually is
496 * InvalidBlockNumber. (Note that this failure should be unreachable
497 * because of upstream checks in bufmgr.c.)
498 */
499 if (blocknum == InvalidBlockNumber)
500 ereport(ERROR,
501 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
502 errmsg("cannot extend file \"%s\" beyond %u blocks",
503 relpath(reln->smgr_rlocator, forknum).str,
504 InvalidBlockNumber)));
505
506 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
507
508 seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
509
510 Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
511
512 if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
513 {
514 if (nbytes < 0)
515 ereport(ERROR,
516 (errcode_for_file_access(),
517 errmsg("could not extend file \"%s\": %m",
518 FilePathName(v->mdfd_vfd)),
519 errhint("Check free disk space.")));
520 /* short write: complain appropriately */
521 ereport(ERROR,
522 (errcode(ERRCODE_DISK_FULL),
523 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
524 FilePathName(v->mdfd_vfd),
525 nbytes, BLCKSZ, blocknum),
526 errhint("Check free disk space.")));
527 }
528
529 if (!skipFsync && !SmgrIsTemp(reln))
530 register_dirty_segment(reln, forknum, v);
531
532 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
533}
int errhint(const char *fmt,...)
Definition: elog.c:1318
static ssize_t FileWrite(File file, const void *buffer, size_t amount, off_t offset, uint32 wait_event_info)
Definition: fd.h:211
BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: md.c:1213
static MdfdVec * _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior)
Definition: md.c:1730

References _mdfd_getseg(), _mdnblocks(), Assert(), ereport, errcode(), errcode_for_file_access(), errhint(), errmsg(), ERROR, EXTENSION_CREATE, FilePathName(), FileWrite(), InvalidBlockNumber, _MdfdVec::mdfd_vfd, mdnblocks(), PG_IO_ALIGN_SIZE, PG_O_DIRECT, register_dirty_segment(), relpath, SMgrRelationData::smgr_rlocator, SmgrIsTemp, and TYPEALIGN.

Referenced by _mdfd_getseg().

◆ mdfd()

int mdfd ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
uint32 *  off 
)

Definition at line 1470 of file md.c.

1471{
1472 MdfdVec *v = mdopenfork(reln, forknum, EXTENSION_FAIL);
1473
1474 v = _mdfd_getseg(reln, forknum, blocknum, false,
1475 EXTENSION_FAIL);
1476
1477 *off = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1478
1479 Assert(*off < (off_t) BLCKSZ * RELSEG_SIZE);
1480
1481 return FileGetRawDesc(v->mdfd_vfd);
1482}
int FileGetRawDesc(File file)
Definition: fd.c:2532

References _mdfd_getseg(), Assert(), EXTENSION_FAIL, FileGetRawDesc(), _MdfdVec::mdfd_vfd, and mdopenfork().

Referenced by mdcreate(), and mdopenfork().

◆ mdfiletagmatches()

bool mdfiletagmatches ( const FileTag *  ftag,
const FileTag *  candidate 
)

Definition at line 1950 of file md.c.

1951{
1952 /*
1953 * For now we only use filter requests as a way to drop all scheduled
1954 * callbacks relating to a given database, when dropping the database.
1955 * We'll return true for all candidates that have the same database OID as
1956 * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
1957 */
1958 return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
1959}
RelFileLocator rlocator
Definition: sync.h:54

References RelFileLocator::dbOid, and FileTag::rlocator.

◆ mdimmedsync()

void mdimmedsync ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 1417 of file md.c.

1418{
1419 int segno;
1420 int min_inactive_seg;
1421
1422 /*
1423 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1424 * the loop below will get them all!
1425 */
1426 mdnblocks(reln, forknum);
1427
1428 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1429
1430 /*
1431 * Temporarily open inactive segments, then close them after sync. There
1432 * may be some inactive segments left opened after fsync() error, but that
1433 * is harmless. We don't bother to clean them up and take a risk of
1434 * further trouble. The next mdclose() will soon close them.
1435 */
1436 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1437 segno++;
1438
1439 while (segno > 0)
1440 {
1441 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1442
1443 /*
1444 * fsyncs done through mdimmedsync() should be tracked in a separate
1445 * IOContext than those done through mdsyncfiletag() to differentiate
1446 * between unavoidable client backend fsyncs (e.g. those done during
1447 * index build) and those which ideally would have been done by the
1448 * checkpointer. Since other IO operations bypassing the buffer
1449 * manager could also be tracked in such an IOContext, wait until
1450 * these are also tracked to track immediate fsyncs.
1451 */
1452 if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
1453 ereport(data_sync_elevel(ERROR),
1454 (errcode_for_file_access(),
1455 errmsg("could not fsync file \"%s\": %m",
1456 FilePathName(v->mdfd_vfd))));
1457
1458 /* Close inactive segments immediately */
1459 if (segno > min_inactive_seg)
1460 {
1461 FileClose(v->mdfd_vfd);
1462 _fdvec_resize(reln, forknum, segno - 1);
1463 }
1464
1465 segno--;
1466 }
1467}
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2352
int data_sync_elevel(int elevel)
Definition: fd.c:4001

References _fdvec_resize(), _mdfd_openseg(), data_sync_elevel(), ereport, errcode_for_file_access(), errmsg(), ERROR, FileClose(), FilePathName(), FileSync(), SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, _MdfdVec::mdfd_vfd, and mdnblocks().

◆ mdinit()

void mdinit ( void  )

Definition at line 180 of file md.c.

181{
182 MdCxt = AllocSetContextCreate(TopMemoryContext,
183 "MdSmgr",
184 ALLOCSET_DEFAULT_SIZES);
185}
MemoryContext TopMemoryContext
Definition: mcxt.c:165
#define AllocSetContextCreate
Definition: memutils.h:149
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:180

References ALLOCSET_DEFAULT_SIZES, AllocSetContextCreate, MdCxt, and TopMemoryContext.

◆ mdmaxcombine()

uint32 mdmaxcombine ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum 
)

Definition at line 823 of file md.c.

825{
826 BlockNumber segoff;
827
828 segoff = blocknum % ((BlockNumber) RELSEG_SIZE);
829
830 return RELSEG_SIZE - segoff;
831}

◆ mdnblocks()

BlockNumber mdnblocks ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 1213 of file md.c.

1214{
1215 MdfdVec *v;
1216 BlockNumber nblocks;
1217 BlockNumber segno;
1218
1219 mdopenfork(reln, forknum, EXTENSION_FAIL);
1220
1221 /* mdopen has opened the first segment */
1222 Assert(reln->md_num_open_segs[forknum] > 0);
1223
1224 /*
1225 * Start from the last open segments, to avoid redundant seeks. We have
1226 * previously verified that these segments are exactly RELSEG_SIZE long,
1227 * and it's useless to recheck that each time.
1228 *
1229 * NOTE: this assumption could only be wrong if another backend has
1230 * truncated the relation. We rely on higher code levels to handle that
1231 * scenario by closing and re-opening the md fd, which is handled via
1232 * relcache flush. (Since the checkpointer doesn't participate in
1233 * relcache flush, it could have segment entries for inactive segments;
1234 * that's OK because the checkpointer never needs to compute relation
1235 * size.)
1236 */
1237 segno = reln->md_num_open_segs[forknum] - 1;
1238 v = &reln->md_seg_fds[forknum][segno];
1239
1240 for (;;)
1241 {
1242 nblocks = _mdnblocks(reln, forknum, v);
1243 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1244 elog(FATAL, "segment too big");
1245 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1246 return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
1247
1248 /*
1249 * If segment is exactly RELSEG_SIZE, advance to next one.
1250 */
1251 segno++;
1252
1253 /*
1254 * We used to pass O_CREAT here, but that has the disadvantage that it
1255 * might create a segment which has vanished through some operating
1256 * system misadventure. In such a case, creating the segment here
1257 * undermines _mdfd_getseg's attempts to notice and report an error
1258 * upon access to a missing segment.
1259 */
1260 v = _mdfd_openseg(reln, forknum, segno, 0);
1261 if (v == NULL)
1262 return segno * ((BlockNumber) RELSEG_SIZE);
1263 }
1264}

References _mdfd_openseg(), _mdnblocks(), Assert(), elog, EXTENSION_FAIL, FATAL, SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, and mdopenfork().

Referenced by mdextend(), mdimmedsync(), mdregistersync(), mdwritev(), and mdzeroextend().

◆ mdopen()

void mdopen ( SMgrRelation  reln)

Definition at line 692 of file md.c.

693{
694 /* mark it not open */
695 for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
696 reln->md_num_open_segs[forknum] = 0;
697}

References MAX_FORKNUM, and SMgrRelationData::md_num_open_segs.

◆ mdopenfork()

static MdfdVec * mdopenfork ( SMgrRelation  reln,
ForkNumber  forknum,
int  behavior 
)
static

Definition at line 654 of file md.c.

655{
656 MdfdVec *mdfd;
657 RelPathStr path;
658 File fd;
659
660 /* No work if already open */
661 if (reln->md_num_open_segs[forknum] > 0)
662 return &reln->md_seg_fds[forknum][0];
663
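664 path = relpath(reln->smgr_rlocator, forknum);
665
666 fd = PathNameOpenFile(path.str, _mdfd_open_flags());
667
668 if (fd < 0)
669 {
670 if ((behavior & EXTENSION_RETURN_NULL) &&
671 FILE_POSSIBLY_DELETED(errno))
672 return NULL;
673 ereport(ERROR,
674 (errcode_for_file_access(),
675 errmsg("could not open file \"%s\": %m", path.str)));
676 }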
677
678 _fdvec_resize(reln, forknum, 1);
679 mdfd = &reln->md_seg_fds[forknum][0];
680 mdfd->mdfd_vfd = fd;
681 mdfd->mdfd_segno = 0;
682
683 Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
684
685 return mdfd;
686}

References _fdvec_resize(), _mdfd_open_flags(), _mdnblocks(), Assert(), ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_RETURN_NULL, fd(), FILE_POSSIBLY_DELETED, SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, mdfd(), PathNameOpenFile(), relpath, SMgrRelationData::smgr_rlocator, and RelPathStr::str.

Referenced by _mdfd_getseg(), mdexists(), mdfd(), and mdnblocks().

◆ mdprefetch()

bool mdprefetch ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
int  nblocks 
)

Definition at line 726 of file md.c.

728{
729#ifdef USE_PREFETCH
730
731 Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
732
733 if ((uint64) blocknum + nblocks > (uint64) MaxBlockNumber + 1)
734 return false;
735
736 while (nblocks > 0)
737 {
738 off_t seekpos;
739 MdfdVec *v;
740 int nblocks_this_segment;
741
742 v = _mdfd_getseg(reln, forknum, blocknum, false,
743 InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL);
744 if (v == NULL)
745 return false;
746
747 seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
748
749 Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
750
751 nblocks_this_segment =
752 Min(nblocks,
753 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
754
755 (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ * nblocks_this_segment,
756 WAIT_EVENT_DATA_FILE_PREFETCH);
757
758 blocknum += nblocks_this_segment;
759 nblocks -= nblocks_this_segment;
760 }
761#endif /* USE_PREFETCH */
762
763 return true;
764}
#define MaxBlockNumber
Definition: block.h:35
#define Min(x, y)
Definition: c.h:975
uint64_t uint64
Definition: c.h:503
int FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2083

References _mdfd_getseg(), Assert(), EXTENSION_FAIL, EXTENSION_RETURN_NULL, FilePrefetch(), InRecovery, IO_DIRECT_DATA, io_direct_flags, MaxBlockNumber, _MdfdVec::mdfd_vfd, and Min.

◆ mdreadv()

void mdreadv ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
void **  buffers,
BlockNumber  nblocks 
)

Definition at line 837 of file md.c.

839{
840 while (nblocks > 0)
841 {
842 struct iovec iov[PG_IOV_MAX];
843 int iovcnt;
844 off_t seekpos;
845 int nbytes;
846 MdfdVec *v;
847 BlockNumber nblocks_this_segment;
848 size_t transferred_this_segment;
849 size_t size_this_segment;
850
851 v = _mdfd_getseg(reln, forknum, blocknum, false,
852 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
853
854 seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
855
856 Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
857
858 nblocks_this_segment =
859 Min(nblocks,
860 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
861 nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));
862
863 if (nblocks_this_segment != nblocks)
864 elog(ERROR, "read crosses segment boundary");
865
866 iovcnt = buffers_to_iovec(iov, buffers, nblocks_this_segment);
867 size_this_segment = nblocks_this_segment * BLCKSZ;
868 transferred_this_segment = 0;
869
870 /*
871 * Inner loop to continue after a short read. We'll keep going until
872 * we hit EOF rather than assuming that a short read means we hit the
873 * end.
874 */
875 for (;;)
876 {
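877 TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
878 reln->smgr_rlocator.locator.spcOid,
879 reln->smgr_rlocator.locator.dbOid,
880 reln->smgr_rlocator.locator.relNumber,
881 reln->smgr_rlocator.backend);
882 nbytes = FileReadV(v->mdfd_vfd, iov, iovcnt, seekpos,
883 WAIT_EVENT_DATA_FILE_READ);
884 TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
885 reln->smgr_rlocator.locator.spcOid,
886 reln->smgr_rlocator.locator.dbOid,
887 reln->smgr_rlocator.locator.relNumber,
888 reln->smgr_rlocator.backend,
889 nbytes,
890 size_this_segment - transferred_this_segment);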
891
892#ifdef SIMULATE_SHORT_READ
893 nbytes = Min(nbytes, 4096);
894#endif
895
896 if (nbytes < 0)
897 ereport(ERROR,
898 (errcode_for_file_access(),
899 errmsg("could not read blocks %u..%u in file \"%s\": %m",
900 blocknum,
901 blocknum + nblocks_this_segment - 1,
902 FilePathName(v->mdfd_vfd))));
903
904 if (nbytes == 0)
905 {
906 /*
907 * We are at or past EOF, or we read a partial block at EOF.
908 * Normally this is an error; upper levels should never try to
909 * read a nonexistent block. However, if zero_damaged_pages
910 * is ON or we are InRecovery, we should instead return zeroes
911 * without complaining. This allows, for example, the case of
912 * trying to update a block that was later truncated away.
913 *
914 * NB: We think that this codepath is unreachable in recovery
915 * and incomplete with zero_damaged_pages, as missing segments
916 * are not created. Putting blocks into the buffer-pool that
917 * do not exist on disk is rather problematic, as it will not
918 * be found by scans that rely on smgrnblocks(), as they are
919 * beyond EOF. It also can cause weird problems with relation
920 * extension, as relation extension does not expect blocks
921 * beyond EOF to exist.
922 *
923 * Therefore we do not want to copy the logic into
924 * mdstartreadv(), where it would have to be more complicated
925 * due to potential differences in the zero_damaged_pages
926 * setting between the definer and completor of IO.
927 *
928 * For PG 18, we are putting an Assert(false) in mdreadv()
929 * (triggering failures in assertion-enabled builds, but
930 * continuing to work in production builds). Afterwards we
931 * plan to remove this code entirely.
932 */
933 if (zero_damaged_pages || InRecovery)
934 {
935 Assert(false); /* see comment above */
936
937 for (BlockNumber i = transferred_this_segment / BLCKSZ;
938 i < nblocks_this_segment;
939 ++i)
940 memset(buffers[i], 0, BLCKSZ);
941 break;
942 }
943 else
944 ereport(ERROR,
945 (errcode(ERRCODE_DATA_CORRUPTED),
946 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
947 blocknum,
948 blocknum + nblocks_this_segment - 1,
949 FilePathName(v->mdfd_vfd),
950 transferred_this_segment,
951 size_this_segment)));
952 }
953
954 /* One loop should usually be enough. */
955 transferred_this_segment += nbytes;
956 Assert(transferred_this_segment <= size_this_segment);
957 if (transferred_this_segment == size_this_segment)
958 break;
959
960 /* Adjust position and vectors after a short read. */
961 seekpos += nbytes;
962 iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
963 }
964
965 nblocks -= nblocks_this_segment;
966 buffers += nblocks_this_segment;
967 blocknum += nblocks_this_segment;
968 }
969}
bool zero_damaged_pages
Definition: bufmgr.c:141
#define lengthof(array)
Definition: c.h:759
ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info)
Definition: fd.c:2165
int compute_remaining_iovec(struct iovec *destination, const struct iovec *source, int iovcnt, size_t transferred)
Definition: file_utils.c:614
static int buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks)
Definition: md.c:774
#define PG_IOV_MAX
Definition: pg_iovec.h:41

References _mdfd_getseg(), Assert(), RelFileLocatorBackend::backend, buffers_to_iovec(), compute_remaining_iovec(), RelFileLocator::dbOid, elog, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_CREATE_RECOVERY, EXTENSION_FAIL, FilePathName(), FileReadV(), i, InRecovery, lengthof, RelFileLocatorBackend::locator, _MdfdVec::mdfd_vfd, Min, PG_IOV_MAX, RelFileLocator::relNumber, SMgrRelationData::smgr_rlocator, RelFileLocator::spcOid, and zero_damaged_pages.

◆ mdregistersync()

void mdregistersync ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 1366 of file md.c.

1367{
1368 int segno;
1369 int min_inactive_seg;
1370
1371 /*
1372 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1373 * the loop below will get them all!
1374 */
1375 mdnblocks(reln, forknum);
1376
1377 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1378
1379 /*
1380 * Temporarily open inactive segments, then close them after sync. There
1381 * may be some inactive segments left opened after error, but that is
1382 * harmless. We don't bother to clean them up and take a risk of further
1383 * trouble. The next mdclose() will soon close them.
1384 */
1385 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1386 segno++;
1387
1388 while (segno > 0)
1389 {
1390 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1391
1392 register_dirty_segment(reln, forknum, v);
1393
1394 /* Close inactive segments immediately */
1395 if (segno > min_inactive_seg)
1396 {
1397 FileClose(v->mdfd_vfd);
1398 _fdvec_resize(reln, forknum, segno - 1);
1399 }
1400
1401 segno--;
1402 }
1403}

References _fdvec_resize(), _mdfd_openseg(), FileClose(), SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, _MdfdVec::mdfd_vfd, mdnblocks(), and register_dirty_segment().

◆ mdstartreadv()

void mdstartreadv ( PgAioHandle *  ioh,
SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
void **  buffers,
BlockNumber  nblocks 
)

Definition at line 975 of file md.c.

978{
979 off_t seekpos;
980 MdfdVec *v;
981 BlockNumber nblocks_this_segment;
982 struct iovec *iov;
983 int iovcnt;
984 int ret;
985
986 v = _mdfd_getseg(reln, forknum, blocknum, false,
987 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
988
989 seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
990
991 Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
992
993 nblocks_this_segment =
994 Min(nblocks,
995 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
996
997 if (nblocks_this_segment != nblocks)
998 elog(ERROR, "read crossing segment boundary");
999
1000 iovcnt = pgaio_io_get_iovec(ioh, &iov);
1001
1002 Assert(nblocks <= iovcnt);
1003
1004 iovcnt = buffers_to_iovec(iov, buffers, nblocks_this_segment);
1005
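1006 Assert(iovcnt <= nblocks_this_segment);
1007
1008 if (!(io_direct_flags & IO_DIRECT_DATA))
1009 pgaio_io_set_flag(ioh, PGAIO_HF_BUFFERED);
1010
1011 pgaio_io_set_target_smgr(ioh,
1012 reln,
1013 forknum,
1014 blocknum,
1015 nblocks,
1016 false);
1017 pgaio_io_register_callbacks(ioh, PGAIO_HCB_MD_READV, 0);
1018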
1019 ret = FileStartReadV(ioh, v->mdfd_vfd, iovcnt, seekpos, WAIT_EVENT_DATA_FILE_READ);
1020 if (ret != 0)
1021 ereport(ERROR,
1022 (errcode_for_file_access(),
1023 errmsg("could not start reading blocks %u..%u in file \"%s\": %m",
1024 blocknum,
1025 blocknum + nblocks_this_segment - 1,
1026 FilePathName(v->mdfd_vfd))));
1027
1028 /*
1029 * The error checks corresponding to the post-read checks in mdreadv() are
1030 * in md_readv_complete().
1031 *
1032 * However we chose, at least for now, to not implement the
1033 * zero_damaged_pages logic present in mdreadv(). As outlined in mdreadv()
1034 * that logic is rather problematic, and we want to get rid of it. Here
1035 * equivalent logic would have to be more complicated due to potential
1036 * differences in the zero_damaged_pages setting between the definer and
1037 * completor of IO.
1038 */
1039}
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition: aio.c:318
@ PGAIO_HF_BUFFERED
Definition: aio.h:77
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
Definition: aio_callback.c:86
int pgaio_io_get_iovec(PgAioHandle *ioh, struct iovec **iov)
Definition: aio_io.c:42
int FileStartReadV(PgAioHandle *ioh, File file, int iovcnt, off_t offset, uint32 wait_event_info)
Definition: fd.c:2221
void pgaio_io_set_target_smgr(PgAioHandle *ioh, SMgrRelationData *smgr, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skip_fsync)
Definition: smgr.c:1029

References _mdfd_getseg(), Assert(), buffers_to_iovec(), elog, ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_CREATE_RECOVERY, EXTENSION_FAIL, FilePathName(), FileStartReadV(), IO_DIRECT_DATA, io_direct_flags, _MdfdVec::mdfd_vfd, Min, PGAIO_HCB_MD_READV, PGAIO_HF_BUFFERED, pgaio_io_get_iovec(), pgaio_io_register_callbacks(), pgaio_io_set_flag(), and pgaio_io_set_target_smgr().

◆ mdsyncfiletag()

int mdsyncfiletag ( const FileTag *  ftag,
char *  path 
)

Definition at line 1880 of file md.c.

1881{
1882 SMgrRelation reln = smgropen(ftag->rlocator, INVALID_PROC_NUMBER);
1883 File file;
1884 instr_time io_start;
1885 bool need_to_close;
1886 int result,
1887 save_errno;
1888
1889 /* See if we already have the file open, or need to open it. */
1890 if (ftag->segno < reln->md_num_open_segs[ftag->forknum])
1891 {
1892 file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd;
1893 strlcpy(path, FilePathName(file), MAXPGPATH);
1894 need_to_close = false;
1895 }
1896 else
1897 {
1898 MdPathStr p;
1899
1900 p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
1901 strlcpy(path, p.str, MD_PATH_STR_MAXLEN);
1902
1903 file = PathNameOpenFile(path, _mdfd_open_flags());
1904 if (file < 0)
1905 return -1;
1906 need_to_close = true;
1907 }
1908
1909 io_start = pgstat_prepare_io_time(track_io_timing);
1910
1911 /* Sync the file. */
1912 result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
1913 save_errno = errno;
1914
1915 if (need_to_close)
1916 FileClose(file);
1917
1918 pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
1919 IOOP_FSYNC, io_start, 1, 0);
1920
1921 errno = save_errno;
1922 return result;
1923}
bool track_io_timing
Definition: bufmgr.c:144
#define MD_PATH_STR_MAXLEN
Definition: md.c:122
#define MAXPGPATH
@ IOOBJECT_RELATION
Definition: pgstat.h:274
@ IOCONTEXT_NORMAL
Definition: pgstat.h:286
@ IOOP_FSYNC
Definition: pgstat.h:305
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition: pgstat_io.c:90
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:121
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
int16 forknum
Definition: sync.h:53
uint64 segno
Definition: sync.h:55

References _mdfd_open_flags(), _mdfd_segpath(), FileClose(), FilePathName(), FileSync(), FileTag::forknum, INVALID_PROC_NUMBER, IOCONTEXT_NORMAL, IOOBJECT_RELATION, IOOP_FSYNC, MAXPGPATH, SMgrRelationData::md_num_open_segs, MD_PATH_STR_MAXLEN, SMgrRelationData::md_seg_fds, _MdfdVec::mdfd_vfd, PathNameOpenFile(), pgstat_count_io_op_time(), pgstat_prepare_io_time(), FileTag::rlocator, FileTag::segno, smgropen(), MdPathStr::str, strlcpy(), and track_io_timing.

◆ mdtruncate()

void mdtruncate ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  curnblk,
BlockNumber  nblocks 
)

Definition at line 1277 of file md.c.

1279{
1280 BlockNumber priorblocks;
1281 int curopensegs;
1282
1283 if (nblocks > curnblk)
1284 {
1285 /* Bogus request ... but no complaint if InRecovery */
1286 if (InRecovery)
1287 return;
1288 ereport(ERROR,
1289 (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
1290 relpath(reln->smgr_rlocator, forknum).str,
1291 nblocks, curnblk)));
1292 }
1293 if (nblocks == curnblk)
1294 return; /* no work */
1295
1296 /*
1297 * Truncate segments, starting at the last one. Starting at the end makes
1298 * managing the memory for the fd array easier, should there be errors.
1299 */
1300 curopensegs = reln->md_num_open_segs[forknum];
1301 while (curopensegs > 0)
1302 {
1303 MdfdVec *v;
1304
1305 priorblocks = (curopensegs - 1) * RELSEG_SIZE;
1306
1307 v = &reln->md_seg_fds[forknum][curopensegs - 1];
1308
1309 if (priorblocks > nblocks)
1310 {
1311 /*
1312 * This segment is no longer active. We truncate the file, but do
1313 * not delete it, for reasons explained in the header comments.
1314 */
1315 if (FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
1316 ereport(ERROR,
1317 (errcode_for_file_access(),
1318 errmsg("could not truncate file \"%s\": %m",
1319 FilePathName(v->mdfd_vfd))));
1320
1321 if (!SmgrIsTemp(reln))
1322 register_dirty_segment(reln, forknum, v);
1323
1324 /* we never drop the 1st segment */
1325 Assert(v != &reln->md_seg_fds[forknum][0]);
1326
1327 FileClose(v->mdfd_vfd);
1328 _fdvec_resize(reln, forknum, curopensegs - 1);
1329 }
1330 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
1331 {
1332 /*
1333 * This is the last segment we want to keep. Truncate the file to
1334 * the right length. NOTE: if nblocks is exactly a multiple K of
1335 * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
1336 * keep it. This adheres to the invariant given in the header
1337 * comments.
1338 */
1339 BlockNumber lastsegblocks = nblocks - priorblocks;
1340
1341 if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
1342 ereport(ERROR,
1343 (errcode_for_file_access(),
1344 errmsg("could not truncate file \"%s\" to %u blocks: %m",
1345 FilePathName(v->mdfd_vfd),
1346 nblocks)));
1347 if (!SmgrIsTemp(reln))
1348 register_dirty_segment(reln, forknum, v);
1349 }
1350 else
1351 {
1352 /*
1353 * We still need this segment, so nothing to do for this and any
1354 * earlier segment.
1355 */
1356 break;
1357 }
1358 curopensegs--;
1359 }
1360}
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2481

References _fdvec_resize(), Assert(), ereport, errcode_for_file_access(), errmsg(), ERROR, FileClose(), FilePathName(), FileTruncate(), InRecovery, SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, _MdfdVec::mdfd_vfd, register_dirty_segment(), relpath, SMgrRelationData::smgr_rlocator, and SmgrIsTemp.

◆ mdunlink()

void mdunlink ( RelFileLocatorBackend  rlocator,
ForkNumber  forknum,
bool  isRedo 
)

Definition at line 327 of file md.c.

328{
329 /* Now do the per-fork work */
330 if (forknum == InvalidForkNumber)
331 {
332 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
333 mdunlinkfork(rlocator, forknum, isRedo);
334 }
335 else
336 mdunlinkfork(rlocator, forknum, isRedo);
337}
static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
Definition: md.c:364

References InvalidForkNumber, MAX_FORKNUM, and mdunlinkfork().

◆ mdunlinkfiletag()

int mdunlinkfiletag ( const FileTag *  ftag,
char *  path 
)

Definition at line 1932 of file md.c.

1933{
1934 RelPathStr p;
1935
1936 /* Compute the path. */
1937 p = relpathperm(ftag->rlocator, MAIN_FORKNUM);
1938 strlcpy(path, p.str, MAXPGPATH);
1939
1940 /* Try to unlink the file. */
1941 return unlink(path);
1942}
@ MAIN_FORKNUM
Definition: relpath.h:58
#define relpathperm(rlocator, forknum)
Definition: relpath.h:146

References MAIN_FORKNUM, MAXPGPATH, relpathperm, FileTag::rlocator, RelPathStr::str, and strlcpy().

◆ mdunlinkfork()

static void mdunlinkfork ( RelFileLocatorBackend  rlocator,
ForkNumber  forknum,
bool  isRedo 
)
static

Definition at line 364 of file md.c.

365{
366 RelPathStr path;
367 int ret;
368 int save_errno;
369
370 path = relpath(rlocator, forknum);
371
372 /*
373 * Truncate and then unlink the first segment, or just register a request
374 * to unlink it later, as described in the comments for mdunlink().
375 */
376 if (isRedo || IsBinaryUpgrade || forknum != MAIN_FORKNUM ||
377 RelFileLocatorBackendIsTemp(rlocator))
378 {
379 if (!RelFileLocatorBackendIsTemp(rlocator))
380 {
381 /* Prevent other backends' fds from holding on to the disk space */
382 ret = do_truncate(path.str);
383
384 /* Forget any pending sync requests for the first segment */
385 save_errno = errno;
386 register_forget_request(rlocator, forknum, 0 /* first seg */ );
387 errno = save_errno;
388 }
389 else
390 ret = 0;
391
392 /* Next unlink the file, unless it was already found to be missing */
393 if (ret >= 0 || errno != ENOENT)
394 {
395 ret = unlink(path.str);
396 if (ret < 0 && errno != ENOENT)
397 {
398 save_errno = errno;
399 ereport(WARNING,
400 (errcode_for_file_access(),
401 errmsg("could not remove file \"%s\": %m", path.str)));
402 errno = save_errno;
403 }
404 }
405 }
406 else
407 {
408 /* Prevent other backends' fds from holding on to the disk space */
409 ret = do_truncate(path.str);
410
411 /* Register request to unlink first segment later */
412 save_errno = errno;
413 register_unlink_segment(rlocator, forknum, 0 /* first seg */ );
414 errno = save_errno;
415 }
416
417 /*
418 * Delete any additional segments.
419 *
420 * Note that because we loop until getting ENOENT, we will correctly
421 * remove all inactive segments as well as active ones. Ideally we'd
422 * continue the loop until getting exactly that errno, but that risks an
423 * infinite loop if the problem is directory-wide (for instance, if we
424 * suddenly can't read the data directory itself). We compromise by
425 * continuing after a non-ENOENT truncate error, but stopping after any
426 * unlink error. If there is indeed a directory-wide problem, additional
427 * unlink attempts wouldn't work anyway.
428 */
429 if (ret >= 0 || errno != ENOENT)
430 {
431 MdPathStr segpath;
432 BlockNumber segno;
433
434 for (segno = 1;; segno++)
435 {
436 sprintf(segpath.str, "%s.%u", path.str, segno);
437
438 if (!RelFileLocatorBackendIsTemp(rlocator))
439 {
440 /*
441 * Prevent other backends' fds from holding on to the disk
442 * space. We're done if we see ENOENT, though.
443 */
444 if (do_truncate(segpath.str) < 0 && errno == ENOENT)
445 break;
446
447 /*
448 * Forget any pending sync requests for this segment before we
449 * try to unlink.
450 */
451 register_forget_request(rlocator, forknum, segno);
452 }
453
454 if (unlink(segpath.str) < 0)
455 {
456 /* ENOENT is expected after the last segment... */
457 if (errno != ENOENT)
458 ereport(WARNING,
459 (errcode_for_file_access(),
460 errmsg("could not remove file \"%s\": %m", segpath.str)));
461 break;
462 }
463 }
464 }
465}
bool IsBinaryUpgrade
Definition: globals.c:122
static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
Definition: md.c:1555
static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
Definition: md.c:1538
static int do_truncate(const char *path)
Definition: md.c:343
#define RelFileLocatorBackendIsTemp(rlocator)

References do_truncate(), ereport, errcode_for_file_access(), errmsg(), IsBinaryUpgrade, MAIN_FORKNUM, register_forget_request(), register_unlink_segment(), RelFileLocatorBackendIsTemp, relpath, sprintf, MdPathStr::str, RelPathStr::str, and WARNING.

Referenced by mdunlink().

◆ mdwriteback()

void mdwriteback ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
BlockNumber  nblocks 
)

Definition at line 1154 of file md.c.

1156{
1157 Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
1158
1159 /*
1160 * Issue flush requests in as few requests as possible; have to split at
1161 * segment boundaries though, since those are actually separate files.
1162 */
1163 while (nblocks > 0)
1164 {
1165 BlockNumber nflush = nblocks;
1166 off_t seekpos;
1167 MdfdVec *v;
1168 int segnum_start,
1169 segnum_end;
1170
1171 v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
1172 EXTENSION_DONT_OPEN);
1173
1174 /*
1175 * We might be flushing buffers of already removed relations, that's
1176 * ok, just ignore that case. If the segment file wasn't open already
1177 * (ie from a recent mdwrite()), then we don't want to re-open it, to
1178 * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave
1179 * us with a descriptor to a file that is about to be unlinked.
1180 */
1181 if (!v)
1182 return;
1183
1184 /* compute offset inside the current segment */
1185 segnum_start = blocknum / RELSEG_SIZE;
1186
1187 /* compute number of desired writes within the current segment */
1188 segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
1189 if (segnum_start != segnum_end)
1190 nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
1191
1192 Assert(nflush >= 1);
1193 Assert(nflush <= nblocks);
1194
1195 seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1196
1197 FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);
1198
1199 nblocks -= nflush;
1200 blocknum += nflush;
1201 }
1202}
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:2139

References _mdfd_getseg(), Assert(), EXTENSION_DONT_OPEN, FileWriteback(), IO_DIRECT_DATA, io_direct_flags, and _MdfdVec::mdfd_vfd.

◆ mdwritev()

void mdwritev ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
const void **  buffers,
BlockNumber  nblocks,
bool  skipFsync 
)

Definition at line 1049 of file md.c.

1051{
1052 /* This assert is too expensive to have on normally ... */
1053#ifdef CHECK_WRITE_VS_EXTEND
1054 Assert((uint64) blocknum + (uint64) nblocks <= (uint64) mdnblocks(reln, forknum));
1055#endif
1056
1057 while (nblocks > 0)
1058 {
1059 struct iovec iov[PG_IOV_MAX];
1060 int iovcnt;
1061 off_t seekpos;
1062 int nbytes;
1063 MdfdVec *v;
1064 BlockNumber nblocks_this_segment;
1065 size_t transferred_this_segment;
1066 size_t size_this_segment;
1067
1068 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
1069 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
1070
1071 seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1072
1073 Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
1074
1075 nblocks_this_segment =
1076 Min(nblocks,
1077 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
1078 nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));
1079
1080 if (nblocks_this_segment != nblocks)
1081 elog(ERROR, "write crosses segment boundary");
1082
1083 iovcnt = buffers_to_iovec(iov, (void **) buffers, nblocks_this_segment);
1084 size_this_segment = nblocks_this_segment * BLCKSZ;
1085 transferred_this_segment = 0;
1086
1087 /*
1088 * Inner loop to continue after a short write. If the reason is that
1089 * we're out of disk space, a future attempt should get an ENOSPC
1090 * error from the kernel.
1091 */
1092 for (;;)
1093 {
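1094 TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
1095 reln->smgr_rlocator.locator.spcOid,
1096 reln->smgr_rlocator.locator.dbOid,
1097 reln->smgr_rlocator.locator.relNumber,
1098 reln->smgr_rlocator.backend);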
1099 nbytes = FileWriteV(v->mdfd_vfd, iov, iovcnt, seekpos,
1100 WAIT_EVENT_DATA_FILE_WRITE);
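1101 TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
1102 reln->smgr_rlocator.locator.spcOid,
1103 reln->smgr_rlocator.locator.dbOid,
1104 reln->smgr_rlocator.locator.relNumber,
1105 reln->smgr_rlocator.backend,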
1106 nbytes,
1107 size_this_segment - transferred_this_segment);
1108
1109#ifdef SIMULATE_SHORT_WRITE
1110 nbytes = Min(nbytes, 4096);
1111#endif
1112
1113 if (nbytes < 0)
1114 {
1115 bool enospc = errno == ENOSPC;
1116
1117 ereport(ERROR,
1118 (errcode_for_file_access(),
1119 errmsg("could not write blocks %u..%u in file \"%s\": %m",
1120 blocknum,
1121 blocknum + nblocks_this_segment - 1,
1122 FilePathName(v->mdfd_vfd)),
1123 enospc ? errhint("Check free disk space.") : 0));
1124 }
1125
1126 /* One loop should usually be enough. */
1127 transferred_this_segment += nbytes;
1128 Assert(transferred_this_segment <= size_this_segment);
1129 if (transferred_this_segment == size_this_segment)
1130 break;
1131
1132 /* Adjust position and iovecs after a short write. */
1133 seekpos += nbytes;
1134 iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
1135 }
1136
1137 if (!skipFsync && !SmgrIsTemp(reln))
1138 register_dirty_segment(reln, forknum, v);
1139
1140 nblocks -= nblocks_this_segment;
1141 buffers += nblocks_this_segment;
1142 blocknum += nblocks_this_segment;
1143 }
1144}
ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info)
Definition: fd.c:2247

References _mdfd_getseg(), Assert(), RelFileLocatorBackend::backend, buffers_to_iovec(), compute_remaining_iovec(), RelFileLocator::dbOid, elog, ereport, errcode_for_file_access(), errhint(), errmsg(), ERROR, EXTENSION_CREATE_RECOVERY, EXTENSION_FAIL, FilePathName(), FileWriteV(), lengthof, RelFileLocatorBackend::locator, _MdfdVec::mdfd_vfd, mdnblocks(), Min, PG_IOV_MAX, register_dirty_segment(), RelFileLocator::relNumber, SMgrRelationData::smgr_rlocator, SmgrIsTemp, and RelFileLocator::spcOid.

◆ mdzeroextend()

void mdzeroextend ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
int  nblocks,
bool  skipFsync 
)

Definition at line 542 of file md.c.

544{
545 MdfdVec *v;
546 BlockNumber curblocknum = blocknum;
547 int remblocks = nblocks;
548
549 Assert(nblocks > 0);
550
551 /* This assert is too expensive to have on normally ... */
552#ifdef CHECK_WRITE_VS_EXTEND
553 Assert(blocknum >= mdnblocks(reln, forknum));
554#endif
555
556 /*
557 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
558 * more --- we mustn't create a block whose number actually is
559 * InvalidBlockNumber or larger.
560 */
561 if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
562 ereport(ERROR,
563 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
564 errmsg("cannot extend file \"%s\" beyond %u blocks",
565 relpath(reln->smgr_rlocator, forknum).str,
566 InvalidBlockNumber)));
567
568 while (remblocks > 0)
569 {
570 BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE);
571 off_t seekpos = (off_t) BLCKSZ * segstartblock;
572 int numblocks;
573
574 if (segstartblock + remblocks > RELSEG_SIZE)
575 numblocks = RELSEG_SIZE - segstartblock;
576 else
577 numblocks = remblocks;
578
579 v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
580
581 Assert(segstartblock < RELSEG_SIZE);
582 Assert(segstartblock + numblocks <= RELSEG_SIZE);
583
584 /*
585 * If available and useful, use posix_fallocate() (via
586 * FileFallocate()) to extend the relation. That's often more
587 * efficient than using write(), as it commonly won't cause the kernel
588 * to allocate page cache space for the extended pages.
589 *
590 * However, we don't use FileFallocate() for small extensions, as it
591 * defeats delayed allocation on some filesystems. Not clear where
592 * that decision should be made though? For now just use a cutoff of
593 * 8, anything between 4 and 8 worked OK in some local testing.
594 */
595 if (numblocks > 8)
596 {
597 int ret;
598
599 ret = FileFallocate(v->mdfd_vfd,
600 seekpos, (off_t) BLCKSZ * numblocks,
601 WAIT_EVENT_DATA_FILE_EXTEND);
602 if (ret != 0)
603 {
604 ereport(ERROR,
605 errcode_for_file_access(),
606 errmsg("could not extend file \"%s\" with FileFallocate(): %m",
607 FilePathName(v->mdfd_vfd)),
608 errhint("Check free disk space."));
609 }
610 }
611 else
612 {
613 int ret;
614
615 /*
616 * Even if we don't want to use fallocate, we can still extend a
617 * bit more efficiently than writing each 8kB block individually.
618 * pg_pwrite_zeros() (via FileZero()) uses pg_pwritev_with_retry()
619 * to avoid multiple writes or needing a zeroed buffer for the
620 * whole length of the extension.
621 */
622 ret = FileZero(v->mdfd_vfd,
623 seekpos, (off_t) BLCKSZ * numblocks,
624 WAIT_EVENT_DATA_FILE_EXTEND);
625 if (ret < 0)
626 ereport(ERROR,
627 errcode_for_file_access(),
628 errmsg("could not extend file \"%s\": %m",
629 FilePathName(v->mdfd_vfd)),
630 errhint("Check free disk space."));
631 }
632
633 if (!skipFsync && !SmgrIsTemp(reln))
634 register_dirty_segment(reln, forknum, v);
635
636 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
637
638 remblocks -= numblocks;
639 curblocknum += numblocks;
640 }
641}
int FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2424
int FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2379

References _mdfd_getseg(), _mdnblocks(), Assert(), ereport, errcode(), errcode_for_file_access(), errhint(), errmsg(), ERROR, EXTENSION_CREATE, FileFallocate(), FilePathName(), FileZero(), InvalidBlockNumber, _MdfdVec::mdfd_vfd, mdnblocks(), register_dirty_segment(), relpath, SMgrRelationData::smgr_rlocator, and SmgrIsTemp.

◆ register_dirty_segment()

static void register_dirty_segment ( SMgrRelation  reln,
ForkNumber  forknum,
MdfdVec seg 
)
static

Definition at line 1494 of file md.c.

1495{
1496 FileTag tag;
1497
1498 INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno);
1499
1500 /* Temp relations should never be fsync'd */
1501 Assert(!SmgrIsTemp(reln));
1502
1503 if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
1504 {
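1505 instr_time io_start;
1506
1507 ereport(DEBUG1,
1508 (errmsg_internal("could not forward fsync request because request queue is full")));
1509
1510 io_start = pgstat_prepare_io_time(track_io_timing);
1511
1512 if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
1513 ereport(data_sync_elevel(ERROR),
1514 (errcode_for_file_access(),
1515 errmsg("could not fsync file \"%s\": %m",
1516 FilePathName(seg->mdfd_vfd))));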
1517
1518 /*
1519 * We have no way of knowing if the current IOContext is
1520 * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
1521 * point, so count the fsync as being in the IOCONTEXT_NORMAL
1522 * IOContext. This is probably okay, because the number of backend
1523 * fsyncs doesn't say anything about the efficacy of the
1524 * BufferAccessStrategy. And counting both fsyncs done in
1525 * IOCONTEXT_NORMAL and IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] under
1526 * IOCONTEXT_NORMAL is likely clearer when investigating the number of
1527 * backend fsyncs.
1528 */
1529 pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
1530 IOOP_FSYNC, io_start, 1, 0);
1531 }
1532}
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1158
#define DEBUG1
Definition: elog.h:30
@ SYNC_REQUEST
Definition: sync.h:25

References Assert(), data_sync_elevel(), DEBUG1, ereport, errcode_for_file_access(), errmsg(), errmsg_internal(), ERROR, FilePathName(), FileSync(), INIT_MD_FILETAG, IOCONTEXT_NORMAL, IOOBJECT_RELATION, IOOP_FSYNC, RelFileLocatorBackend::locator, _MdfdVec::mdfd_segno, _MdfdVec::mdfd_vfd, pgstat_count_io_op_time(), pgstat_prepare_io_time(), RegisterSyncRequest(), SMgrRelationData::smgr_rlocator, SmgrIsTemp, SYNC_REQUEST, and track_io_timing.

Referenced by mdcreate(), mdextend(), mdregistersync(), mdtruncate(), mdwritev(), and mdzeroextend().

◆ register_forget_request()

static void register_forget_request ( RelFileLocatorBackend  rlocator,
ForkNumber  forknum,
BlockNumber  segno 
)
static

Definition at line 1555 of file md.c.

1557{
1558 FileTag tag;
1559
1560 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1561
1562 RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
1563}
@ SYNC_FORGET_REQUEST
Definition: sync.h:27

References INIT_MD_FILETAG, RelFileLocatorBackend::locator, RegisterSyncRequest(), and SYNC_FORGET_REQUEST.

Referenced by mdunlinkfork().

◆ register_unlink_segment()

static void register_unlink_segment ( RelFileLocatorBackend  rlocator,
ForkNumber  forknum,
BlockNumber  segno 
)
static

Definition at line 1538 of file md.c.

1540{
1541 FileTag tag;
1542
1543 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1544
1545 /* Should never be used with temp relations */
1546 Assert(!RelFileLocatorBackendIsTemp(rlocator));
1547
1548 RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
1549}
@ SYNC_UNLINK_REQUEST
Definition: sync.h:26

References Assert(), INIT_MD_FILETAG, RelFileLocatorBackend::locator, RegisterSyncRequest(), RelFileLocatorBackendIsTemp, and SYNC_UNLINK_REQUEST.

Referenced by mdunlinkfork().

Variable Documentation

◆ aio_md_readv_cb

const PgAioHandleCallbacks aio_md_readv_cb
Initial value:
= {
.complete_shared = md_readv_complete,
.report = md_readv_report,
}
static void md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
Definition: md.c:2032
static PgAioResult md_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: md.c:1965

Definition at line 159 of file md.c.

◆ MdCxt

MemoryContext MdCxt
static

Definition at line 87 of file md.c.

Referenced by _fdvec_resize(), and mdinit().