PostgreSQL Source Code git master
md.c File Reference
#include "postgres.h"
#include <limits.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>
#include "access/xlogutils.h"
#include "commands/tablespace.h"
#include "common/file_utils.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "storage/aio.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/md.h"
#include "storage/relfilelocator.h"
#include "storage/smgr.h"
#include "storage/sync.h"
#include "utils/memutils.h"
Include dependency graph for md.c:

Go to the source code of this file.

Data Structures

struct  _MdfdVec
 
struct  MdPathStr
 

Macros

#define INIT_MD_FILETAG(a, xx_rlocator, xx_forknum, xx_segno)
 
#define EXTENSION_FAIL   (1 << 0)
 
#define EXTENSION_RETURN_NULL   (1 << 1)
 
#define EXTENSION_CREATE   (1 << 2)
 
#define EXTENSION_CREATE_RECOVERY   (1 << 3)
 
#define EXTENSION_DONT_OPEN   (1 << 5)
 
#define SEGMENT_CHARS   OIDCHARS
 
#define MD_PATH_STR_MAXLEN
 

Typedefs

typedef struct _MdfdVec MdfdVec
 
typedef struct MdPathStr MdPathStr
 

Functions

 StaticAssertDecl (RELSEG_SIZE > 0 && RELSEG_SIZE <= INT_MAX, "RELSEG_SIZE must fit in an integer")
 
static void mdunlinkfork (RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
 
static MdfdVec * mdopenfork (SMgrRelation reln, ForkNumber forknum, int behavior)
 
static void register_dirty_segment (SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 
static void register_unlink_segment (RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
 
static void register_forget_request (RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
 
static void _fdvec_resize (SMgrRelation reln, ForkNumber forknum, int nseg)
 
static MdPathStr _mdfd_segpath (SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
 
static MdfdVec * _mdfd_openseg (SMgrRelation reln, ForkNumber forknum, BlockNumber segno, int oflags)
 
static MdfdVec * _mdfd_getseg (SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior)
 
static BlockNumber _mdnblocks (SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 
static PgAioResult md_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 
static void md_readv_report (PgAioResult result, const PgAioTargetData *td, int elevel)
 
static int _mdfd_open_flags (void)
 
void mdinit (void)
 
bool mdexists (SMgrRelation reln, ForkNumber forknum)
 
void mdcreate (SMgrRelation reln, ForkNumber forknum, bool isRedo)
 
void mdunlink (RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
 
static int do_truncate (const char *path)
 
void mdextend (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
 
void mdzeroextend (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
 
void mdopen (SMgrRelation reln)
 
void mdclose (SMgrRelation reln, ForkNumber forknum)
 
bool mdprefetch (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
 
static int buffers_to_iovec (struct iovec *iov, void **buffers, int nblocks)
 
uint32 mdmaxcombine (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 
void mdreadv (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
 
void mdstartreadv (PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
 
void mdwritev (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync)
 
void mdwriteback (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
 
BlockNumber mdnblocks (SMgrRelation reln, ForkNumber forknum)
 
void mdtruncate (SMgrRelation reln, ForkNumber forknum, BlockNumber curnblk, BlockNumber nblocks)
 
void mdregistersync (SMgrRelation reln, ForkNumber forknum)
 
void mdimmedsync (SMgrRelation reln, ForkNumber forknum)
 
int mdfd (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
 
void ForgetDatabaseSyncRequests (Oid dbid)
 
void DropRelationFiles (RelFileLocator *delrels, int ndelrels, bool isRedo)
 
int mdsyncfiletag (const FileTag *ftag, char *path)
 
int mdunlinkfiletag (const FileTag *ftag, char *path)
 
bool mdfiletagmatches (const FileTag *ftag, const FileTag *candidate)
 

Variables

static MemoryContext MdCxt
 
const PgAioHandleCallbacks aio_md_readv_cb
 

Macro Definition Documentation

◆ EXTENSION_CREATE

#define EXTENSION_CREATE   (1 << 2)

Definition at line 117 of file md.c.

◆ EXTENSION_CREATE_RECOVERY

#define EXTENSION_CREATE_RECOVERY   (1 << 3)

Definition at line 119 of file md.c.

◆ EXTENSION_DONT_OPEN

#define EXTENSION_DONT_OPEN   (1 << 5)

Definition at line 121 of file md.c.

◆ EXTENSION_FAIL

#define EXTENSION_FAIL   (1 << 0)

Definition at line 113 of file md.c.

◆ EXTENSION_RETURN_NULL

#define EXTENSION_RETURN_NULL   (1 << 1)

Definition at line 115 of file md.c.

◆ INIT_MD_FILETAG

#define INIT_MD_FILETAG (   a,
  xx_rlocator,
  xx_forknum,
  xx_segno 
)
Value:
( \
memset(&(a), 0, sizeof(FileTag)), \
(a).handler = SYNC_HANDLER_MD, \
(a).rlocator = (xx_rlocator), \
(a).forknum = (xx_forknum), \
(a).segno = (xx_segno) \
)
int a
Definition: isn.c:73
Definition: sync.h:51
@ SYNC_HANDLER_MD
Definition: sync.h:37

Definition at line 101 of file md.c.

◆ MD_PATH_STR_MAXLEN

#define MD_PATH_STR_MAXLEN
Value:
(\
REL_PATH_STR_MAXLEN \
+ sizeof((char)'.') \
+ SEGMENT_CHARS \
)
#define SEGMENT_CHARS
Definition: md.c:131

Definition at line 132 of file md.c.

◆ SEGMENT_CHARS

#define SEGMENT_CHARS   OIDCHARS

Definition at line 131 of file md.c.

Typedef Documentation

◆ MdfdVec

typedef struct _MdfdVec MdfdVec

◆ MdPathStr

typedef struct MdPathStr MdPathStr

Function Documentation

◆ _fdvec_resize()

static void _fdvec_resize ( SMgrRelation  reln,
ForkNumber  forknum,
int  nseg 
)
static

Definition at line 1629 of file md.c.

1632{
1633 if (nseg == 0)
1634 {
1635 if (reln->md_num_open_segs[forknum] > 0)
1636 {
1637 pfree(reln->md_seg_fds[forknum]);
1638 reln->md_seg_fds[forknum] = NULL;
1639 }
1640 }
1641 else if (reln->md_num_open_segs[forknum] == 0)
1642 {
1643 reln->md_seg_fds[forknum] =
1644 MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
1645 }
1646 else if (nseg > reln->md_num_open_segs[forknum])
1647 {
1648 /*
1649 * It doesn't seem worthwhile complicating the code to amortize
1650 * repalloc() calls. Those are far faster than PathNameOpenFile() or
1651 * FileClose(), and the memory context internally will sometimes avoid
1652 * doing an actual reallocation.
1653 */
1654 reln->md_seg_fds[forknum] =
1655 repalloc(reln->md_seg_fds[forknum],
1656 sizeof(MdfdVec) * nseg);
1657 }
1658 else
1659 {
1660 /*
1661 * We don't reallocate a smaller array, because we want mdtruncate()
1662 * to be able to promise that it won't allocate memory, so that it is
1663 * allowed in a critical section. This means that a bit of space in
1664 * the array is now wasted, until the next time we add a segment and
1665 * reallocate.
1666 */
1667 }
1668
1669 reln->md_num_open_segs[forknum] = nseg;
1670}
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1229
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1610
void pfree(void *pointer)
Definition: mcxt.c:1594
static MemoryContext MdCxt
Definition: md.c:97
int md_num_open_segs[MAX_FORKNUM+1]
Definition: smgr.h:61
struct _MdfdVec * md_seg_fds[MAX_FORKNUM+1]
Definition: smgr.h:62
Definition: md.c:92

References SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, MdCxt, MemoryContextAlloc(), pfree(), and repalloc().

Referenced by _mdfd_openseg(), mdclose(), mdcreate(), mdimmedsync(), mdopenfork(), mdregistersync(), and mdtruncate().

◆ _mdfd_getseg()

static MdfdVec * _mdfd_getseg ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blkno,
bool  skipFsync,
int  behavior 
)
static

Definition at line 1740 of file md.c.

1742{
1743 MdfdVec *v;
1744 BlockNumber targetseg;
1745 BlockNumber nextsegno;
1746
1747 /* some way to handle non-existent segments needs to be specified */
1748 Assert(behavior &
1751
1752 targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1753
1754 /* if an existing and opened segment, we're done */
1755 if (targetseg < reln->md_num_open_segs[forknum])
1756 {
1757 v = &reln->md_seg_fds[forknum][targetseg];
1758 return v;
1759 }
1760
1761 /* The caller only wants the segment if we already had it open. */
1762 if (behavior & EXTENSION_DONT_OPEN)
1763 return NULL;
1764
1765 /*
1766 * The target segment is not yet open. Iterate over all the segments
1767 * between the last opened and the target segment. This way missing
1768 * segments either raise an error, or get created (according to
1769 * 'behavior'). Start with either the last opened, or the first segment if
1770 * none was opened before.
1771 */
1772 if (reln->md_num_open_segs[forknum] > 0)
1773 v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
1774 else
1775 {
1776 v = mdopenfork(reln, forknum, behavior);
1777 if (!v)
1778 return NULL; /* if behavior & EXTENSION_RETURN_NULL */
1779 }
1780
1781 for (nextsegno = reln->md_num_open_segs[forknum];
1782 nextsegno <= targetseg; nextsegno++)
1783 {
1784 BlockNumber nblocks = _mdnblocks(reln, forknum, v);
1785 int flags = 0;
1786
1787 Assert(nextsegno == v->mdfd_segno + 1);
1788
1789 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1790 elog(FATAL, "segment too big");
1791
1792 if ((behavior & EXTENSION_CREATE) ||
1793 (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
1794 {
1795 /*
1796 * Normally we will create new segments only if authorized by the
1797 * caller (i.e., we are doing mdextend()). But when doing WAL
1798 * recovery, create segments anyway; this allows cases such as
1799 * replaying WAL data that has a write into a high-numbered
1800 * segment of a relation that was later deleted. We want to go
1801 * ahead and create the segments so we can finish out the replay.
1802 *
1803 * We have to maintain the invariant that segments before the last
1804 * active segment are of size RELSEG_SIZE; therefore, if
1805 * extending, pad them out with zeroes if needed. (This only
1806 * matters if in recovery, or if the caller is extending the
1807 * relation discontiguously, but that can happen in hash indexes.)
1808 */
1809 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1810 {
1811 char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
1813
1814 mdextend(reln, forknum,
1815 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1816 zerobuf, skipFsync);
1817 pfree(zerobuf);
1818 }
1819 flags = O_CREAT;
1820 }
1821 else if (nblocks < ((BlockNumber) RELSEG_SIZE))
1822 {
1823 /*
1824 * When not extending, only open the next segment if the current
1825 * one is exactly RELSEG_SIZE. If not (this branch), either
1826 * return NULL or fail.
1827 */
1828 if (behavior & EXTENSION_RETURN_NULL)
1829 {
1830 /*
1831 * Some callers discern between reasons for _mdfd_getseg()
1832 * returning NULL based on errno. As there's no failing
1833 * syscall involved in this case, explicitly set errno to
1834 * ENOENT, as that seems the closest interpretation.
1835 */
1836 errno = ENOENT;
1837 return NULL;
1838 }
1839
1840 ereport(ERROR,
1842 errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
1843 _mdfd_segpath(reln, forknum, nextsegno).str,
1844 blkno, nblocks)));
1845 }
1846
1847 v = _mdfd_openseg(reln, forknum, nextsegno, flags);
1848
1849 if (v == NULL)
1850 {
1851 if ((behavior & EXTENSION_RETURN_NULL) &&
1852 FILE_POSSIBLY_DELETED(errno))
1853 return NULL;
1854 ereport(ERROR,
1856 errmsg("could not open file \"%s\" (target block %u): %m",
1857 _mdfd_segpath(reln, forknum, nextsegno).str,
1858 blkno)));
1859 }
1860 }
1861
1862 return v;
1863}
uint32 BlockNumber
Definition: block.h:31
int errcode_for_file_access(void)
Definition: elog.c:886
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define FATAL
Definition: elog.h:41
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
#define FILE_POSSIBLY_DELETED(err)
Definition: fd.h:78
#define MCXT_ALLOC_ZERO
Definition: fe_memutils.h:30
Assert(PointerIsAligned(start, uint64))
const char * str
void * palloc_aligned(Size size, Size alignto, int flags)
Definition: mcxt.c:1584
#define EXTENSION_CREATE_RECOVERY
Definition: md.c:119
static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
Definition: md.c:1869
static MdPathStr _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
Definition: md.c:1677
#define EXTENSION_DONT_OPEN
Definition: md.c:121
void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: md.c:487
static MdfdVec * _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, int oflags)
Definition: md.c:1697
#define EXTENSION_RETURN_NULL
Definition: md.c:115
#define EXTENSION_CREATE
Definition: md.c:117
#define EXTENSION_FAIL
Definition: md.c:113
static MdfdVec * mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
Definition: md.c:664
#define PG_IO_ALIGN_SIZE
BlockNumber mdfd_segno
Definition: md.c:94
bool InRecovery
Definition: xlogutils.c:50

References _mdfd_openseg(), _mdfd_segpath(), _mdnblocks(), Assert(), elog, ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_CREATE, EXTENSION_CREATE_RECOVERY, EXTENSION_DONT_OPEN, EXTENSION_FAIL, EXTENSION_RETURN_NULL, FATAL, FILE_POSSIBLY_DELETED, InRecovery, MCXT_ALLOC_ZERO, SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, mdextend(), _MdfdVec::mdfd_segno, mdopenfork(), palloc_aligned(), pfree(), PG_IO_ALIGN_SIZE, and str.

Referenced by mdextend(), mdfd(), mdprefetch(), mdreadv(), mdstartreadv(), mdwriteback(), mdwritev(), and mdzeroextend().

◆ _mdfd_open_flags()

static int _mdfd_open_flags ( void  )
inlinestatic

Definition at line 176 of file md.c.

177{
178 int flags = O_RDWR | PG_BINARY;
179
181 flags |= PG_O_DIRECT;
182
183 return flags;
184}
#define PG_BINARY
Definition: c.h:1261
int io_direct_flags
Definition: fd.c:168
#define IO_DIRECT_DATA
Definition: fd.h:54
#define PG_O_DIRECT
Definition: fd.h:96

References IO_DIRECT_DATA, io_direct_flags, PG_BINARY, and PG_O_DIRECT.

Referenced by _mdfd_openseg(), mdcreate(), mdopenfork(), and mdsyncfiletag().

◆ _mdfd_openseg()

static MdfdVec * _mdfd_openseg ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  segno,
int  oflags 
)
static

Definition at line 1697 of file md.c.

1699{
1700 MdfdVec *v;
1701 File fd;
1702 MdPathStr fullpath;
1703
1704 fullpath = _mdfd_segpath(reln, forknum, segno);
1705
1706 /* open the file */
1707 fd = PathNameOpenFile(fullpath.str, _mdfd_open_flags() | oflags);
1708
1709 if (fd < 0)
1710 return NULL;
1711
1712 /*
1713 * Segments are always opened in order from lowest to highest, so we must
1714 * be adding a new one at the end.
1715 */
1716 Assert(segno == reln->md_num_open_segs[forknum]);
1717
1718 _fdvec_resize(reln, forknum, segno + 1);
1719
1720 /* fill the entry */
1721 v = &reln->md_seg_fds[forknum][segno];
1722 v->mdfd_vfd = fd;
1723 v->mdfd_segno = segno;
1724
1725 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1726
1727 /* all done */
1728 return v;
1729}
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1576
int File
Definition: fd.h:51
static int _mdfd_open_flags(void)
Definition: md.c:176
static void _fdvec_resize(SMgrRelation reln, ForkNumber forknum, int nseg)
Definition: md.c:1629
static int fd(const char *x, int i)
Definition: preproc-init.c:105
Definition: md.c:139
char str[MD_PATH_STR_MAXLEN+1]
Definition: md.c:140
File mdfd_vfd
Definition: md.c:93

References _fdvec_resize(), _mdfd_open_flags(), _mdfd_segpath(), _mdnblocks(), Assert(), fd(), SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, _MdfdVec::mdfd_segno, _MdfdVec::mdfd_vfd, PathNameOpenFile(), and MdPathStr::str.

Referenced by _mdfd_getseg(), mdimmedsync(), mdnblocks(), and mdregistersync().

◆ _mdfd_segpath()

static MdPathStr _mdfd_segpath ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  segno 
)
static

Definition at line 1677 of file md.c.

1678{
1679 RelPathStr path;
1680 MdPathStr fullpath;
1681
1682 path = relpath(reln->smgr_rlocator, forknum);
1683
1684 if (segno > 0)
1685 sprintf(fullpath.str, "%s.%u", path.str, segno);
1686 else
1687 strcpy(fullpath.str, path.str);
1688
1689 return fullpath;
1690}
#define sprintf
Definition: port.h:262
#define relpath(rlocator, forknum)
Definition: relpath.h:150
char str[REL_PATH_STR_MAXLEN+1]
Definition: relpath.h:123
RelFileLocatorBackend smgr_rlocator
Definition: smgr.h:38

References relpath, SMgrRelationData::smgr_rlocator, sprintf, MdPathStr::str, and RelPathStr::str.

Referenced by _mdfd_getseg(), _mdfd_openseg(), and mdsyncfiletag().

◆ _mdnblocks()

static BlockNumber _mdnblocks ( SMgrRelation  reln,
ForkNumber  forknum,
MdfdVec * seg 
)
static

Definition at line 1869 of file md.c.

1870{
1871 pgoff_t len;
1872
1873 len = FileSize(seg->mdfd_vfd);
1874 if (len < 0)
1875 ereport(ERROR,
1877 errmsg("could not seek to end of file \"%s\": %m",
1878 FilePathName(seg->mdfd_vfd))));
1879 /* note that this calculation will ignore any partial block at EOF */
1880 return (BlockNumber) (len / BLCKSZ);
1881}
char * FilePathName(File file)
Definition: fd.c:2513
pgoff_t FileSize(File file)
Definition: fd.c:2461
const void size_t len
#define pgoff_t
Definition: port.h:422

References ereport, errcode_for_file_access(), errmsg(), ERROR, FilePathName(), FileSize(), len, _MdfdVec::mdfd_vfd, and pgoff_t.

Referenced by _mdfd_getseg(), _mdfd_openseg(), mdextend(), mdnblocks(), mdopenfork(), and mdzeroextend().

◆ buffers_to_iovec()

static int buffers_to_iovec ( struct iovec *  iov,
void **  buffers,
int  nblocks 
)
static

Definition at line 784 of file md.c.

785{
786 struct iovec *iovp;
787 int iovcnt;
788
789 Assert(nblocks >= 1);
790
791 /* If this build supports direct I/O, buffers must be I/O aligned. */
792 for (int i = 0; i < nblocks; ++i)
793 {
794 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
795 Assert((uintptr_t) buffers[i] ==
796 TYPEALIGN(PG_IO_ALIGN_SIZE, buffers[i]));
797 }
798
799 /* Start the first iovec off with the first buffer. */
800 iovp = &iov[0];
801 iovp->iov_base = buffers[0];
802 iovp->iov_len = BLCKSZ;
803 iovcnt = 1;
804
805 /* Try to merge the rest. */
806 for (int i = 1; i < nblocks; ++i)
807 {
808 void *buffer = buffers[i];
809
810 if (((char *) iovp->iov_base + iovp->iov_len) == buffer)
811 {
812 /* Contiguous with the last iovec. */
813 iovp->iov_len += BLCKSZ;
814 }
815 else
816 {
817 /* Need a new iovec. */
818 iovp++;
819 iovp->iov_base = buffer;
820 iovp->iov_len = BLCKSZ;
821 iovcnt++;
822 }
823 }
824
825 return iovcnt;
826}
#define TYPEALIGN(ALIGNVAL, LEN)
Definition: c.h:806
int i
Definition: isn.c:77

References Assert(), i, PG_IO_ALIGN_SIZE, PG_O_DIRECT, and TYPEALIGN.

Referenced by mdreadv(), mdstartreadv(), and mdwritev().

◆ do_truncate()

static int do_truncate ( const char *  path)
static

Definition at line 353 of file md.c.

354{
355 int save_errno;
356 int ret;
357
358 ret = pg_truncate(path, 0);
359
360 /* Log a warning here to avoid repetition in callers. */
361 if (ret < 0 && errno != ENOENT)
362 {
363 save_errno = errno;
366 errmsg("could not truncate file \"%s\": %m", path)));
367 errno = save_errno;
368 }
369
370 return ret;
371}
#define WARNING
Definition: elog.h:36
int pg_truncate(const char *path, pgoff_t length)
Definition: fd.c:717

References ereport, errcode_for_file_access(), errmsg(), pg_truncate(), and WARNING.

Referenced by mdunlinkfork().

◆ DropRelationFiles()

void DropRelationFiles ( RelFileLocator *  delrels,
int  ndelrels,
bool  isRedo 
)

Definition at line 1597 of file md.c.

1598{
1599 SMgrRelation *srels;
1600 int i;
1601
1602 srels = palloc(sizeof(SMgrRelation) * ndelrels);
1603 for (i = 0; i < ndelrels; i++)
1604 {
1605 SMgrRelation srel = smgropen(delrels[i], INVALID_PROC_NUMBER);
1606
1607 if (isRedo)
1608 {
1609 ForkNumber fork;
1610
1611 for (fork = 0; fork <= MAX_FORKNUM; fork++)
1612 XLogDropRelation(delrels[i], fork);
1613 }
1614 srels[i] = srel;
1615 }
1616
1617 smgrdounlinkall(srels, ndelrels, isRedo);
1618
1619 for (i = 0; i < ndelrels; i++)
1620 smgrclose(srels[i]);
1621 pfree(srels);
1622}
void * palloc(Size size)
Definition: mcxt.c:1365
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
ForkNumber
Definition: relpath.h:56
#define MAX_FORKNUM
Definition: relpath.h:70
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition: smgr.c:240
void smgrclose(SMgrRelation reln)
Definition: smgr.c:374
void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
Definition: smgr.c:538
void XLogDropRelation(RelFileLocator rlocator, ForkNumber forknum)
Definition: xlogutils.c:630

References i, INVALID_PROC_NUMBER, MAX_FORKNUM, palloc(), pfree(), smgrclose(), smgrdounlinkall(), smgropen(), and XLogDropRelation().

Referenced by FinishPreparedTransaction(), xact_redo_abort(), and xact_redo_commit().

◆ ForgetDatabaseSyncRequests()

void ForgetDatabaseSyncRequests ( Oid  dbid)

Definition at line 1579 of file md.c.

1580{
1581 FileTag tag;
1582 RelFileLocator rlocator;
1583
1584 rlocator.dbOid = dbid;
1585 rlocator.spcOid = 0;
1586 rlocator.relNumber = 0;
1587
1589
1590 RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
1591}
#define InvalidBlockNumber
Definition: block.h:33
#define INIT_MD_FILETAG(a, xx_rlocator, xx_forknum, xx_segno)
Definition: md.c:101
@ InvalidForkNumber
Definition: relpath.h:57
RelFileNumber relNumber
bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, bool retryOnError)
Definition: sync.c:580
@ SYNC_FILTER_REQUEST
Definition: sync.h:28

References RelFileLocator::dbOid, INIT_MD_FILETAG, InvalidBlockNumber, InvalidForkNumber, RegisterSyncRequest(), RelFileLocator::relNumber, RelFileLocator::spcOid, and SYNC_FILTER_REQUEST.

Referenced by createdb_failure_callback(), dbase_redo(), and dropdb().

◆ md_readv_complete()

static PgAioResult md_readv_complete ( PgAioHandle *  ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

Definition at line 1975 of file md.c.

1976{
1978 PgAioResult result = prior_result;
1979
1980 if (prior_result.result < 0)
1981 {
1982 result.status = PGAIO_RS_ERROR;
1983 result.id = PGAIO_HCB_MD_READV;
1984 /* For "hard" errors, track the error number in error_data */
1985 result.error_data = -prior_result.result;
1986 result.result = 0;
1987
1988 /*
1989 * Immediately log a message about the IO error, but only to the
1990 * server log. The reason to do so immediately is that the originator
1991 * might not process the query result immediately (because it is busy
1992 * doing another part of query processing) or at all (e.g. if it was
1993 * cancelled or errored out due to another IO also failing). The
1994 * definer of the IO will emit an ERROR when processing the IO's
1995 * results
1996 */
1998
1999 return result;
2000 }
2001
2002 /*
2003 * As explained above smgrstartreadv(), the smgr API operates on the level
2004 * of blocks, rather than bytes. Convert.
2005 */
2006 result.result /= BLCKSZ;
2007
2008 Assert(result.result <= td->smgr.nblocks);
2009
2010 if (result.result == 0)
2011 {
2012 /* consider 0 blocks read a failure */
2013 result.status = PGAIO_RS_ERROR;
2014 result.id = PGAIO_HCB_MD_READV;
2015 result.error_data = 0;
2016
2017 /* see comment above the "hard error" case */
2019
2020 return result;
2021 }
2022
2023 if (result.status != PGAIO_RS_ERROR &&
2024 result.result < td->smgr.nblocks)
2025 {
2026 /* partial reads should be retried at upper level */
2027 result.status = PGAIO_RS_PARTIAL;
2028 result.id = PGAIO_HCB_MD_READV;
2029 }
2030
2031 return result;
2032}
@ PGAIO_HCB_MD_READV
Definition: aio.h:196
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
Definition: aio_callback.c:173
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition: aio_target.c:73
@ PGAIO_RS_PARTIAL
Definition: aio_types.h:82
@ PGAIO_RS_ERROR
Definition: aio_types.h:84
#define LOG_SERVER_ONLY
Definition: elog.h:32
uint32 status
Definition: aio_types.h:108
uint32 error_data
Definition: aio_types.h:111
int32 result
Definition: aio_types.h:113
uint32 id
Definition: aio_types.h:105
struct PgAioTargetData::@126 smgr
BlockNumber nblocks
Definition: aio_types.h:67

References Assert(), PgAioResult::error_data, PgAioResult::id, LOG_SERVER_ONLY, PgAioTargetData::nblocks, PGAIO_HCB_MD_READV, pgaio_io_get_target_data(), pgaio_result_report(), PGAIO_RS_ERROR, PGAIO_RS_PARTIAL, PgAioResult::result, PgAioTargetData::smgr, and PgAioResult::status.

◆ md_readv_report()

static void md_readv_report ( PgAioResult  result,
const PgAioTargetData *  td,
int  elevel 
)
static

Definition at line 2042 of file md.c.

2043{
2044 RelPathStr path;
2045
2046 path = relpathbackend(td->smgr.rlocator,
2048 td->smgr.forkNum);
2049
2050 if (result.error_data != 0)
2051 {
2052 /* for errcode_for_file_access() and %m */
2053 errno = result.error_data;
2054
2055 ereport(elevel,
2057 errmsg("could not read blocks %u..%u in file \"%s\": %m",
2058 td->smgr.blockNum,
2059 td->smgr.blockNum + td->smgr.nblocks - 1,
2060 path.str));
2061 }
2062 else
2063 {
2064 /*
2065 * NB: This will typically only be output in debug messages, while
2066 * retrying a partial IO.
2067 */
2068 ereport(elevel,
2070 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
2071 td->smgr.blockNum,
2072 td->smgr.blockNum + td->smgr.nblocks - 1,
2073 path.str,
2074 result.result * (size_t) BLCKSZ,
2075 td->smgr.nblocks * (size_t) BLCKSZ));
2076 }
2077}
int errcode(int sqlerrcode)
Definition: elog.c:863
ProcNumber MyProcNumber
Definition: globals.c:90
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:42
#define relpathbackend(rlocator, backend, forknum)
Definition: relpath.h:141
BlockNumber blockNum
Definition: aio_types.h:66
RelFileLocator rlocator
Definition: aio_types.h:65
ForkNumber forkNum
Definition: aio_types.h:68

References PgAioTargetData::blockNum, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errcode_for_file_access(), errmsg(), PgAioResult::error_data, PgAioTargetData::forkNum, INVALID_PROC_NUMBER, PgAioTargetData::is_temp, MyProcNumber, PgAioTargetData::nblocks, relpathbackend, PgAioResult::result, PgAioTargetData::rlocator, PgAioTargetData::smgr, and RelPathStr::str.

◆ mdclose()

void mdclose ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 713 of file md.c.

714{
715 int nopensegs = reln->md_num_open_segs[forknum];
716
717 /* No work if already closed */
718 if (nopensegs == 0)
719 return;
720
721 /* close segments starting from the end */
722 while (nopensegs > 0)
723 {
724 MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1];
725
727 _fdvec_resize(reln, forknum, nopensegs - 1);
728 nopensegs--;
729 }
730}
void FileClose(File file)
Definition: fd.c:1979

References _fdvec_resize(), FileClose(), SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, and _MdfdVec::mdfd_vfd.

Referenced by mdexists().

◆ mdcreate()

void mdcreate ( SMgrRelation  reln,
ForkNumber  forknum,
bool  isRedo 
)

Definition at line 222 of file md.c.

223{
224 MdfdVec *mdfd;
225 RelPathStr path;
226 File fd;
227
228 if (isRedo && reln->md_num_open_segs[forknum] > 0)
229 return; /* created and opened already... */
230
231 Assert(reln->md_num_open_segs[forknum] == 0);
232
233 /*
234 * We may be using the target table space for the first time in this
235 * database, so create a per-database subdirectory if needed.
236 *
237 * XXX this is a fairly ugly violation of module layering, but this seems
238 * to be the best place to put the check. Maybe TablespaceCreateDbspace
239 * should be here and not in commands/tablespace.c? But that would imply
240 * importing a lot of stuff that smgr.c oughtn't know, either.
241 */
244 isRedo);
245
246 path = relpath(reln->smgr_rlocator, forknum);
247
248 fd = PathNameOpenFile(path.str, _mdfd_open_flags() | O_CREAT | O_EXCL);
249
250 if (fd < 0)
251 {
252 int save_errno = errno;
253
254 if (isRedo)
256 if (fd < 0)
257 {
258 /* be sure to report the error reported by create, not open */
259 errno = save_errno;
262 errmsg("could not create file \"%s\": %m", path.str)));
263 }
264 }
265
266 _fdvec_resize(reln, forknum, 1);
267 mdfd = &reln->md_seg_fds[forknum][0];
268 mdfd->mdfd_vfd = fd;
269 mdfd->mdfd_segno = 0;
270
271 if (!SmgrIsTemp(reln))
272 register_dirty_segment(reln, forknum, mdfd);
273}
void TablespaceCreateDbspace(Oid spcOid, Oid dbOid, bool isRedo)
Definition: tablespace.c:112
int mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
Definition: md.c:1480
static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
Definition: md.c:1504
#define SmgrIsTemp(smgr)
Definition: smgr.h:74
RelFileLocator locator

References _fdvec_resize(), _mdfd_open_flags(), Assert(), RelFileLocator::dbOid, ereport, errcode_for_file_access(), errmsg(), ERROR, fd(), RelFileLocatorBackend::locator, SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, mdfd(), PathNameOpenFile(), register_dirty_segment(), relpath, SMgrRelationData::smgr_rlocator, SmgrIsTemp, RelFileLocator::spcOid, RelPathStr::str, and TablespaceCreateDbspace().

◆ mdexists()

bool mdexists ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 203 of file md.c.

204{
205 /*
206 * Close it first, to ensure that we notice if the fork has been unlinked
207 * since we opened it. As an optimization, we can skip that in recovery,
208 * which already closes relations when dropping them.
209 */
210 if (!InRecovery)
211 mdclose(reln, forknum);
212
213 return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL);
214}
void mdclose(SMgrRelation reln, ForkNumber forknum)
Definition: md.c:713

References EXTENSION_RETURN_NULL, InRecovery, mdclose(), and mdopenfork().

◆ mdextend()

void mdextend ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
const void *  buffer,
bool  skipFsync 
)

Definition at line 487 of file md.c.

489{
490 pgoff_t seekpos;
491 int nbytes;
492 MdfdVec *v;
493
494 /* If this build supports direct I/O, the buffer must be I/O aligned. */
495 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
496 Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
497
498 /* This assert is too expensive to have on normally ... */
499#ifdef CHECK_WRITE_VS_EXTEND
500 Assert(blocknum >= mdnblocks(reln, forknum));
501#endif
502
503 /*
504 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
505 * more --- we mustn't create a block whose number actually is
506 * InvalidBlockNumber. (Note that this failure should be unreachable
507 * because of upstream checks in bufmgr.c.)
508 */
509 if (blocknum == InvalidBlockNumber)
511 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
512 errmsg("cannot extend file \"%s\" beyond %u blocks",
513 relpath(reln->smgr_rlocator, forknum).str,
515
516 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
517
518 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
519
520 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
521
522 if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
523 {
524 if (nbytes < 0)
527 errmsg("could not extend file \"%s\": %m",
529 errhint("Check free disk space.")));
530 /* short write: complain appropriately */
532 (errcode(ERRCODE_DISK_FULL),
533 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
535 nbytes, BLCKSZ, blocknum),
536 errhint("Check free disk space.")));
537 }
538
539 if (!skipFsync && !SmgrIsTemp(reln))
540 register_dirty_segment(reln, forknum, v);
541
542 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
543}
int errhint(const char *fmt,...)
Definition: elog.c:1330
static ssize_t FileWrite(File file, const void *buffer, size_t amount, pgoff_t offset, uint32 wait_event_info)
Definition: fd.h:210
BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: md.c:1223
static MdfdVec * _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior)
Definition: md.c:1740

References _mdfd_getseg(), _mdnblocks(), Assert(), ereport, errcode(), errcode_for_file_access(), errhint(), errmsg(), ERROR, EXTENSION_CREATE, FilePathName(), FileWrite(), InvalidBlockNumber, _MdfdVec::mdfd_vfd, mdnblocks(), PG_IO_ALIGN_SIZE, PG_O_DIRECT, pgoff_t, register_dirty_segment(), relpath, SMgrRelationData::smgr_rlocator, SmgrIsTemp, and TYPEALIGN.

Referenced by _mdfd_getseg().

◆ mdfd()

int mdfd ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
uint32 *  off 
)

Definition at line 1480 of file md.c.

1481{
1482 MdfdVec *v = mdopenfork(reln, forknum, EXTENSION_FAIL);
1483
1484 v = _mdfd_getseg(reln, forknum, blocknum, false,
1485 EXTENSION_FAIL);
1486
1487 *off = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1488
1489 Assert(*off < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1490
1491 return FileGetRawDesc(v->mdfd_vfd);
1492}
int FileGetRawDesc(File file)
Definition: fd.c:2529

References _mdfd_getseg(), Assert(), EXTENSION_FAIL, FileGetRawDesc(), _MdfdVec::mdfd_vfd, mdopenfork(), and pgoff_t.

Referenced by mdcreate(), and mdopenfork().

◆ mdfiletagmatches()

bool mdfiletagmatches ( const FileTag *  ftag,
const FileTag *  candidate 
)

Definition at line 1960 of file md.c.

1961{
1962 /*
1963 * For now we only use filter requests as a way to drop all scheduled
1964 * callbacks relating to a given database, when dropping the database.
1965 * We'll return true for all candidates that have the same database OID as
1966 * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
1967 */
1968 return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
1969}
RelFileLocator rlocator
Definition: sync.h:54

References RelFileLocator::dbOid, and FileTag::rlocator.

◆ mdimmedsync()

void mdimmedsync ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 1427 of file md.c.

1428{
1429 int segno;
1430 int min_inactive_seg;
1431
1432 /*
1433 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1434 * the loop below will get them all!
1435 */
1436 mdnblocks(reln, forknum);
1437
1438 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1439
1440 /*
1441 * Temporarily open inactive segments, then close them after sync. There
1442 * may be some inactive segments left opened after fsync() error, but that
1443 * is harmless. We don't bother to clean them up and take a risk of
1444 * further trouble. The next mdclose() will soon close them.
1445 */
1446 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1447 segno++;
1448
1449 while (segno > 0)
1450 {
1451 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1452
1453 /*
1454 * fsyncs done through mdimmedsync() should be tracked in a separate
1455 * IOContext than those done through mdsyncfiletag() to differentiate
1456 * between unavoidable client backend fsyncs (e.g. those done during
1457 * index build) and those which ideally would have been done by the
1458 * checkpointer. Since other IO operations bypassing the buffer
1459 * manager could also be tracked in such an IOContext, wait until
1460 * these are also tracked to track immediate fsyncs.
1461 */
1462 if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
1463 ereport(data_sync_elevel(ERROR),
1464 (errcode_for_file_access(),
1465 errmsg("could not fsync file \"%s\": %m",
1466 FilePathName(v->mdfd_vfd))));
1467
1468 /* Close inactive segments immediately */
1469 if (segno > min_inactive_seg)
1470 {
1471 FileClose(v->mdfd_vfd);
1472 _fdvec_resize(reln, forknum, segno - 1);
1473 }
1474
1475 segno--;
1476 }
1477}
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2349
int data_sync_elevel(int elevel)
Definition: fd.c:3998

References _fdvec_resize(), _mdfd_openseg(), data_sync_elevel(), ereport, errcode_for_file_access(), errmsg(), ERROR, FileClose(), FilePathName(), FileSync(), SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, _MdfdVec::mdfd_vfd, and mdnblocks().

◆ mdinit()

void mdinit ( void  )

Definition at line 190 of file md.c.

191{
192 MdCxt = AllocSetContextCreate(TopMemoryContext,
193 "MdSmgr",
194 ALLOCSET_DEFAULT_SIZES);
195}
MemoryContext TopMemoryContext
Definition: mcxt.c:166
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:160

References ALLOCSET_DEFAULT_SIZES, AllocSetContextCreate, MdCxt, and TopMemoryContext.

◆ mdmaxcombine()

uint32 mdmaxcombine ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum 
)

Definition at line 833 of file md.c.

835{
836 BlockNumber segoff;
837
838 segoff = blocknum % ((BlockNumber) RELSEG_SIZE);
839
840 return RELSEG_SIZE - segoff;
841}

◆ mdnblocks()

BlockNumber mdnblocks ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 1223 of file md.c.

1224{
1225 MdfdVec *v;
1226 BlockNumber nblocks;
1227 BlockNumber segno;
1228
1229 mdopenfork(reln, forknum, EXTENSION_FAIL);
1230
1231 /* mdopen has opened the first segment */
1232 Assert(reln->md_num_open_segs[forknum] > 0);
1233
1234 /*
1235 * Start from the last open segments, to avoid redundant seeks. We have
1236 * previously verified that these segments are exactly RELSEG_SIZE long,
1237 * and it's useless to recheck that each time.
1238 *
1239 * NOTE: this assumption could only be wrong if another backend has
1240 * truncated the relation. We rely on higher code levels to handle that
1241 * scenario by closing and re-opening the md fd, which is handled via
1242 * relcache flush. (Since the checkpointer doesn't participate in
1243 * relcache flush, it could have segment entries for inactive segments;
1244 * that's OK because the checkpointer never needs to compute relation
1245 * size.)
1246 */
1247 segno = reln->md_num_open_segs[forknum] - 1;
1248 v = &reln->md_seg_fds[forknum][segno];
1249
1250 for (;;)
1251 {
1252 nblocks = _mdnblocks(reln, forknum, v);
1253 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1254 elog(FATAL, "segment too big");
1255 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1256 return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
1257
1258 /*
1259 * If segment is exactly RELSEG_SIZE, advance to next one.
1260 */
1261 segno++;
1262
1263 /*
1264 * We used to pass O_CREAT here, but that has the disadvantage that it
1265 * might create a segment which has vanished through some operating
1266 * system misadventure. In such a case, creating the segment here
1267 * undermines _mdfd_getseg's attempts to notice and report an error
1268 * upon access to a missing segment.
1269 */
1270 v = _mdfd_openseg(reln, forknum, segno, 0);
1271 if (v == NULL)
1272 return segno * ((BlockNumber) RELSEG_SIZE);
1273 }
1274}

References _mdfd_openseg(), _mdnblocks(), Assert(), elog, EXTENSION_FAIL, FATAL, SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, and mdopenfork().

Referenced by mdextend(), mdimmedsync(), mdregistersync(), mdwritev(), and mdzeroextend().

◆ mdopen()

void mdopen ( SMgrRelation  reln)

Definition at line 702 of file md.c.

703{
704 /* mark it not open */
705 for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
706 reln->md_num_open_segs[forknum] = 0;
707}

References MAX_FORKNUM, and SMgrRelationData::md_num_open_segs.

◆ mdopenfork()

static MdfdVec * mdopenfork ( SMgrRelation  reln,
ForkNumber  forknum,
int  behavior 
)
static

Definition at line 664 of file md.c.

665{
666 MdfdVec *mdfd;
667 RelPathStr path;
668 File fd;
669
670 /* No work if already open */
671 if (reln->md_num_open_segs[forknum] > 0)
672 return &reln->md_seg_fds[forknum][0];
673
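674 path = relpath(reln->smgr_rlocator, forknum);
675
676 fd = PathNameOpenFile(path.str, _mdfd_open_flags());
677
678 if (fd < 0)
679 {
680 if ((behavior & EXTENSION_RETURN_NULL) &&
681 FILE_POSSIBLY_DELETED(errno))
682 return NULL;
683 ereport(ERROR,
684 (errcode_for_file_access(),
685 errmsg("could not open file \"%s\": %m", path.str)));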
686 }
687
688 _fdvec_resize(reln, forknum, 1);
689 mdfd = &reln->md_seg_fds[forknum][0];
690 mdfd->mdfd_vfd = fd;
691 mdfd->mdfd_segno = 0;
692
693 Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
694
695 return mdfd;
696}

References _fdvec_resize(), _mdfd_open_flags(), _mdnblocks(), Assert(), ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_RETURN_NULL, fd(), FILE_POSSIBLY_DELETED, SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, mdfd(), PathNameOpenFile(), relpath, SMgrRelationData::smgr_rlocator, and RelPathStr::str.

Referenced by _mdfd_getseg(), mdexists(), mdfd(), and mdnblocks().

◆ mdprefetch()

bool mdprefetch ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
int  nblocks 
)

Definition at line 736 of file md.c.

738{
739#ifdef USE_PREFETCH
740
741 Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
742
743 if ((uint64) blocknum + nblocks > (uint64) MaxBlockNumber + 1)
744 return false;
745
746 while (nblocks > 0)
747 {
748 pgoff_t seekpos;
749 MdfdVec *v;
750 int nblocks_this_segment;
751
752 v = _mdfd_getseg(reln, forknum, blocknum, false,
753 InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL);
754 if (v == NULL)
755 return false;
756
757 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
758
759 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
760
761 nblocks_this_segment =
762 Min(nblocks,
763 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
764
765 (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ * nblocks_this_segment,
766 WAIT_EVENT_DATA_FILE_PREFETCH);
767
768 blocknum += nblocks_this_segment;
769 nblocks -= nblocks_this_segment;
770 }
771#endif /* USE_PREFETCH */
772
773 return true;
774}
#define MaxBlockNumber
Definition: block.h:35
#define Min(x, y)
Definition: c.h:1006
uint64_t uint64
Definition: c.h:542
int FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition: fd.c:2080

References _mdfd_getseg(), Assert(), EXTENSION_FAIL, EXTENSION_RETURN_NULL, FilePrefetch(), InRecovery, IO_DIRECT_DATA, io_direct_flags, MaxBlockNumber, _MdfdVec::mdfd_vfd, Min, and pgoff_t.

◆ mdreadv()

void mdreadv ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
void **  buffers,
BlockNumber  nblocks 
)

Definition at line 847 of file md.c.

849{
850 while (nblocks > 0)
851 {
852 struct iovec iov[PG_IOV_MAX];
853 int iovcnt;
854 pgoff_t seekpos;
855 int nbytes;
856 MdfdVec *v;
857 BlockNumber nblocks_this_segment;
858 size_t transferred_this_segment;
859 size_t size_this_segment;
860
861 v = _mdfd_getseg(reln, forknum, blocknum, false,
862 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
863
864 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
865
866 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
867
868 nblocks_this_segment =
869 Min(nblocks,
870 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
871 nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));
872
873 if (nblocks_this_segment != nblocks)
874 elog(ERROR, "read crosses segment boundary");
875
876 iovcnt = buffers_to_iovec(iov, buffers, nblocks_this_segment);
877 size_this_segment = nblocks_this_segment * BLCKSZ;
878 transferred_this_segment = 0;
879
880 /*
881 * Inner loop to continue after a short read. We'll keep going until
882 * we hit EOF rather than assuming that a short read means we hit the
883 * end.
884 */
885 for (;;)
886 {
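887 TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
888 reln->smgr_rlocator.locator.spcOid,
889 reln->smgr_rlocator.locator.dbOid,
890 reln->smgr_rlocator.locator.relNumber,
891 reln->smgr_rlocator.backend);
892 nbytes = FileReadV(v->mdfd_vfd, iov, iovcnt, seekpos,
893 WAIT_EVENT_DATA_FILE_READ);
894 TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
895 reln->smgr_rlocator.locator.spcOid,
896 reln->smgr_rlocator.locator.dbOid,
897 reln->smgr_rlocator.locator.relNumber,
898 reln->smgr_rlocator.backend,
899 nbytes,
900 size_this_segment - transferred_this_segment);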
901
902#ifdef SIMULATE_SHORT_READ
903 nbytes = Min(nbytes, 4096);
904#endif
905
906 if (nbytes < 0)
907 ereport(ERROR,
908 (errcode_for_file_access(),
909 errmsg("could not read blocks %u..%u in file \"%s\": %m",
910 blocknum,
911 blocknum + nblocks_this_segment - 1,
912 FilePathName(v->mdfd_vfd))));
913
914 if (nbytes == 0)
915 {
916 /*
917 * We are at or past EOF, or we read a partial block at EOF.
918 * Normally this is an error; upper levels should never try to
919 * read a nonexistent block. However, if zero_damaged_pages
920 * is ON or we are InRecovery, we should instead return zeroes
921 * without complaining. This allows, for example, the case of
922 * trying to update a block that was later truncated away.
923 *
924 * NB: We think that this codepath is unreachable in recovery
925 * and incomplete with zero_damaged_pages, as missing segments
926 * are not created. Putting blocks into the buffer-pool that
927 * do not exist on disk is rather problematic, as it will not
928 * be found by scans that rely on smgrnblocks(), as they are
929 * beyond EOF. It also can cause weird problems with relation
930 * extension, as relation extension does not expect blocks
931 * beyond EOF to exist.
932 *
933 * Therefore we do not want to copy the logic into
934 * mdstartreadv(), where it would have to be more complicated
935 * due to potential differences in the zero_damaged_pages
936 * setting between the definer and completor of IO.
937 *
938 * For PG 18, we are putting an Assert(false) in mdreadv()
939 * (triggering failures in assertion-enabled builds, but
940 * continuing to work in production builds). Afterwards we
941 * plan to remove this code entirely.
942 */
943 if (zero_damaged_pages || InRecovery)
944 {
945 Assert(false); /* see comment above */
946
947 for (BlockNumber i = transferred_this_segment / BLCKSZ;
948 i < nblocks_this_segment;
949 ++i)
950 memset(buffers[i], 0, BLCKSZ);
951 break;
952 }
953 else
954 ereport(ERROR,
955 (errcode(ERRCODE_DATA_CORRUPTED),
956 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
957 blocknum,
958 blocknum + nblocks_this_segment - 1,
959 FilePathName(v->mdfd_vfd),
960 transferred_this_segment,
961 size_this_segment)));
962 }
963
964 /* One loop should usually be enough. */
965 transferred_this_segment += nbytes;
966 Assert(transferred_this_segment <= size_this_segment);
967 if (transferred_this_segment == size_this_segment)
968 break;
969
970 /* Adjust position and vectors after a short read. */
971 seekpos += nbytes;
972 iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
973 }
974
975 nblocks -= nblocks_this_segment;
976 buffers += nblocks_this_segment;
977 blocknum += nblocks_this_segment;
978 }
979}
bool zero_damaged_pages
Definition: bufmgr.c:144
#define lengthof(array)
Definition: c.h:790
ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition: fd.c:2162
int compute_remaining_iovec(struct iovec *destination, const struct iovec *source, int iovcnt, size_t transferred)
Definition: file_utils.c:614
static int buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks)
Definition: md.c:784
#define PG_IOV_MAX
Definition: pg_iovec.h:47

References _mdfd_getseg(), Assert(), RelFileLocatorBackend::backend, buffers_to_iovec(), compute_remaining_iovec(), RelFileLocator::dbOid, elog, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_CREATE_RECOVERY, EXTENSION_FAIL, FilePathName(), FileReadV(), i, InRecovery, lengthof, RelFileLocatorBackend::locator, _MdfdVec::mdfd_vfd, Min, PG_IOV_MAX, pgoff_t, RelFileLocator::relNumber, SMgrRelationData::smgr_rlocator, RelFileLocator::spcOid, and zero_damaged_pages.

◆ mdregistersync()

void mdregistersync ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 1376 of file md.c.

1377{
1378 int segno;
1379 int min_inactive_seg;
1380
1381 /*
1382 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1383 * the loop below will get them all!
1384 */
1385 mdnblocks(reln, forknum);
1386
1387 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1388
1389 /*
1390 * Temporarily open inactive segments, then close them after sync. There
1391 * may be some inactive segments left opened after error, but that is
1392 * harmless. We don't bother to clean them up and take a risk of further
1393 * trouble. The next mdclose() will soon close them.
1394 */
1395 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1396 segno++;
1397
1398 while (segno > 0)
1399 {
1400 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1401
1402 register_dirty_segment(reln, forknum, v);
1403
1404 /* Close inactive segments immediately */
1405 if (segno > min_inactive_seg)
1406 {
1407 FileClose(v->mdfd_vfd);
1408 _fdvec_resize(reln, forknum, segno - 1);
1409 }
1410
1411 segno--;
1412 }
1413}

References _fdvec_resize(), _mdfd_openseg(), FileClose(), SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, _MdfdVec::mdfd_vfd, mdnblocks(), and register_dirty_segment().

◆ mdstartreadv()

void mdstartreadv ( PgAioHandle *  ioh,
SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
void **  buffers,
BlockNumber  nblocks 
)

Definition at line 985 of file md.c.

988{
989 pgoff_t seekpos;
990 MdfdVec *v;
991 BlockNumber nblocks_this_segment;
992 struct iovec *iov;
993 int iovcnt;
994 int ret;
995
996 v = _mdfd_getseg(reln, forknum, blocknum, false,
997 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
998
999 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1000
1001 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1002
1003 nblocks_this_segment =
1004 Min(nblocks,
1005 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
1006
1007 if (nblocks_this_segment != nblocks)
1008 elog(ERROR, "read crossing segment boundary");
1009
1010 iovcnt = pgaio_io_get_iovec(ioh, &iov);
1011
1012 Assert(nblocks <= iovcnt);
1013
1014 iovcnt = buffers_to_iovec(iov, buffers, nblocks_this_segment);
1015
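1016 Assert(iovcnt <= nblocks_this_segment);
1017
1018 if (!(io_direct_flags & IO_DIRECT_DATA))
1019 pgaio_io_set_flag(ioh, PGAIO_HF_BUFFERED);
1020
1021 pgaio_io_set_target_smgr(ioh,
1022 reln,
1023 forknum,
1024 blocknum,
1025 nblocks,
1026 false);
1027 pgaio_io_register_callbacks(ioh, PGAIO_HCB_MD_READV, 0);
1028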
1029 ret = FileStartReadV(ioh, v->mdfd_vfd, iovcnt, seekpos, WAIT_EVENT_DATA_FILE_READ);
1030 if (ret != 0)
1031 ereport(ERROR,
1032 (errcode_for_file_access(),
1033 errmsg("could not start reading blocks %u..%u in file \"%s\": %m",
1034 blocknum,
1035 blocknum + nblocks_this_segment - 1,
1036 FilePathName(v->mdfd_vfd))));
1037
1038 /*
1039 * The error checks corresponding to the post-read checks in mdreadv() are
1040 * in md_readv_complete().
1041 *
1042 * However we chose, at least for now, to not implement the
1043 * zero_damaged_pages logic present in mdreadv(). As outlined in mdreadv()
1044 * that logic is rather problematic, and we want to get rid of it. Here
1045 * equivalent logic would have to be more complicated due to potential
1046 * differences in the zero_damaged_pages setting between the definer and
1047 * completor of IO.
1048 */
1049}
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition: aio.c:330
@ PGAIO_HF_BUFFERED
Definition: aio.h:77
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
Definition: aio_callback.c:86
int pgaio_io_get_iovec(PgAioHandle *ioh, struct iovec **iov)
Definition: aio_io.c:42
int FileStartReadV(PgAioHandle *ioh, File file, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition: fd.c:2218
void pgaio_io_set_target_smgr(PgAioHandle *ioh, SMgrRelationData *smgr, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skip_fsync)
Definition: smgr.c:1029

References _mdfd_getseg(), Assert(), buffers_to_iovec(), elog, ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_CREATE_RECOVERY, EXTENSION_FAIL, FilePathName(), FileStartReadV(), IO_DIRECT_DATA, io_direct_flags, _MdfdVec::mdfd_vfd, Min, PGAIO_HCB_MD_READV, PGAIO_HF_BUFFERED, pgaio_io_get_iovec(), pgaio_io_register_callbacks(), pgaio_io_set_flag(), pgaio_io_set_target_smgr(), and pgoff_t.

◆ mdsyncfiletag()

int mdsyncfiletag ( const FileTag *  ftag,
char *  path 
)

Definition at line 1890 of file md.c.

1891{
1892 SMgrRelation reln = smgropen(ftag->rlocator, INVALID_PROC_NUMBER);
1893 File file;
1894 instr_time io_start;
1895 bool need_to_close;
1896 int result,
1897 save_errno;
1898
1899 /* See if we already have the file open, or need to open it. */
1900 if (ftag->segno < reln->md_num_open_segs[ftag->forknum])
1901 {
1902 file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd;
1903 strlcpy(path, FilePathName(file), MAXPGPATH);
1904 need_to_close = false;
1905 }
1906 else
1907 {
1908 MdPathStr p;
1909
1910 p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
1911 strlcpy(path, p.str, MD_PATH_STR_MAXLEN);
1912
1913 file = PathNameOpenFile(path, _mdfd_open_flags());
1914 if (file < 0)
1915 return -1;
1916 need_to_close = true;
1917 }
1918
1919 io_start = pgstat_prepare_io_time(track_io_timing);
1920
1921 /* Sync the file. */
1922 result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
1923 save_errno = errno;
1924
1925 if (need_to_close)
1926 FileClose(file);
1927
1928 pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
1929 IOOP_FSYNC, io_start, 1, 0);
1930
1931 errno = save_errno;
1932 return result;
1933}
bool track_io_timing
Definition: bufmgr.c:147
#define MD_PATH_STR_MAXLEN
Definition: md.c:132
#define MAXPGPATH
@ IOOBJECT_RELATION
Definition: pgstat.h:277
@ IOCONTEXT_NORMAL
Definition: pgstat.h:289
@ IOOP_FSYNC
Definition: pgstat.h:308
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition: pgstat_io.c:91
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:122
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
int16 forknum
Definition: sync.h:53
uint64 segno
Definition: sync.h:55

References _mdfd_open_flags(), _mdfd_segpath(), FileClose(), FilePathName(), FileSync(), FileTag::forknum, INVALID_PROC_NUMBER, IOCONTEXT_NORMAL, IOOBJECT_RELATION, IOOP_FSYNC, MAXPGPATH, SMgrRelationData::md_num_open_segs, MD_PATH_STR_MAXLEN, SMgrRelationData::md_seg_fds, _MdfdVec::mdfd_vfd, PathNameOpenFile(), pgstat_count_io_op_time(), pgstat_prepare_io_time(), FileTag::rlocator, FileTag::segno, smgropen(), MdPathStr::str, strlcpy(), and track_io_timing.

◆ mdtruncate()

void mdtruncate ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  curnblk,
BlockNumber  nblocks 
)

Definition at line 1287 of file md.c.

1289{
1290 BlockNumber priorblocks;
1291 int curopensegs;
1292
1293 if (nblocks > curnblk)
1294 {
1295 /* Bogus request ... but no complaint if InRecovery */
1296 if (InRecovery)
1297 return;
1298 ereport(ERROR,
1299 (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
1300 relpath(reln->smgr_rlocator, forknum).str,
1301 nblocks, curnblk)));
1302 }
1303 if (nblocks == curnblk)
1304 return; /* no work */
1305
1306 /*
1307 * Truncate segments, starting at the last one. Starting at the end makes
1308 * managing the memory for the fd array easier, should there be errors.
1309 */
1310 curopensegs = reln->md_num_open_segs[forknum];
1311 while (curopensegs > 0)
1312 {
1313 MdfdVec *v;
1314
1315 priorblocks = (curopensegs - 1) * RELSEG_SIZE;
1316
1317 v = &reln->md_seg_fds[forknum][curopensegs - 1];
1318
1319 if (priorblocks > nblocks)
1320 {
1321 /*
1322 * This segment is no longer active. We truncate the file, but do
1323 * not delete it, for reasons explained in the header comments.
1324 */
1325 if (FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
1326 ereport(ERROR,
1327 (errcode_for_file_access(),
1328 errmsg("could not truncate file \"%s\": %m",
1329 FilePathName(v->mdfd_vfd))));
1330
1331 if (!SmgrIsTemp(reln))
1332 register_dirty_segment(reln, forknum, v);
1333
1334 /* we never drop the 1st segment */
1335 Assert(v != &reln->md_seg_fds[forknum][0]);
1336
1337 FileClose(v->mdfd_vfd);
1338 _fdvec_resize(reln, forknum, curopensegs - 1);
1339 }
1340 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
1341 {
1342 /*
1343 * This is the last segment we want to keep. Truncate the file to
1344 * the right length. NOTE: if nblocks is exactly a multiple K of
1345 * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
1346 * keep it. This adheres to the invariant given in the header
1347 * comments.
1348 */
1349 BlockNumber lastsegblocks = nblocks - priorblocks;
1350
1351 if (FileTruncate(v->mdfd_vfd, (pgoff_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
1352 ereport(ERROR,
1353 (errcode_for_file_access(),
1354 errmsg("could not truncate file \"%s\" to %u blocks: %m",
1355 FilePathName(v->mdfd_vfd),
1356 nblocks)));
1357 if (!SmgrIsTemp(reln))
1358 register_dirty_segment(reln, forknum, v);
1359 }
1360 else
1361 {
1362 /*
1363 * We still need this segment, so nothing to do for this and any
1364 * earlier segment.
1365 */
1366 break;
1367 }
1368 curopensegs--;
1369 }
1370}
int FileTruncate(File file, pgoff_t offset, uint32 wait_event_info)
Definition: fd.c:2478

References _fdvec_resize(), Assert(), ereport, errcode_for_file_access(), errmsg(), ERROR, FileClose(), FilePathName(), FileTruncate(), InRecovery, SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, _MdfdVec::mdfd_vfd, pgoff_t, register_dirty_segment(), relpath, SMgrRelationData::smgr_rlocator, and SmgrIsTemp.

◆ mdunlink()

void mdunlink ( RelFileLocatorBackend  rlocator,
ForkNumber  forknum,
bool  isRedo 
)

Definition at line 337 of file md.c.

338{
339 /* Now do the per-fork work */
340 if (forknum == InvalidForkNumber)
341 {
342 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
343 mdunlinkfork(rlocator, forknum, isRedo);
344 }
345 else
346 mdunlinkfork(rlocator, forknum, isRedo);
347}
static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
Definition: md.c:374

References InvalidForkNumber, MAX_FORKNUM, and mdunlinkfork().

◆ mdunlinkfiletag()

int mdunlinkfiletag ( const FileTag *  ftag,
char *  path 
)

Definition at line 1942 of file md.c.

1943{
1944 RelPathStr p;
1945
1946 /* Compute the path. */
1947 p = relpathperm(ftag->rlocator, MAIN_FORKNUM);
1948 strlcpy(path, p.str, MAXPGPATH);
1949
1950 /* Try to unlink the file. */
1951 return unlink(path);
1952}
@ MAIN_FORKNUM
Definition: relpath.h:58
#define relpathperm(rlocator, forknum)
Definition: relpath.h:146

References MAIN_FORKNUM, MAXPGPATH, relpathperm, FileTag::rlocator, RelPathStr::str, and strlcpy().

◆ mdunlinkfork()

static void mdunlinkfork ( RelFileLocatorBackend  rlocator,
ForkNumber  forknum,
bool  isRedo 
)
static

Definition at line 374 of file md.c.

375{
376 RelPathStr path;
377 int ret;
378 int save_errno;
379
380 path = relpath(rlocator, forknum);
381
382 /*
383 * Truncate and then unlink the first segment, or just register a request
384 * to unlink it later, as described in the comments for mdunlink().
385 */
386 if (isRedo || IsBinaryUpgrade || forknum != MAIN_FORKNUM ||
387 RelFileLocatorBackendIsTemp(rlocator))
388 {
389 if (!RelFileLocatorBackendIsTemp(rlocator))
390 {
391 /* Prevent other backends' fds from holding on to the disk space */
392 ret = do_truncate(path.str);
393
394 /* Forget any pending sync requests for the first segment */
395 save_errno = errno;
396 register_forget_request(rlocator, forknum, 0 /* first seg */ );
397 errno = save_errno;
398 }
399 else
400 ret = 0;
401
402 /* Next unlink the file, unless it was already found to be missing */
403 if (ret >= 0 || errno != ENOENT)
404 {
405 ret = unlink(path.str);
406 if (ret < 0 && errno != ENOENT)
407 {
408 save_errno = errno;
409 ereport(WARNING,
410 (errcode_for_file_access(),
411 errmsg("could not remove file \"%s\": %m", path.str)));
412 errno = save_errno;
413 }
414 }
415 }
416 else
417 {
418 /* Prevent other backends' fds from holding on to the disk space */
419 ret = do_truncate(path.str);
420
421 /* Register request to unlink first segment later */
422 save_errno = errno;
423 register_unlink_segment(rlocator, forknum, 0 /* first seg */ );
424 errno = save_errno;
425 }
426
427 /*
428 * Delete any additional segments.
429 *
430 * Note that because we loop until getting ENOENT, we will correctly
431 * remove all inactive segments as well as active ones. Ideally we'd
432 * continue the loop until getting exactly that errno, but that risks an
433 * infinite loop if the problem is directory-wide (for instance, if we
434 * suddenly can't read the data directory itself). We compromise by
435 * continuing after a non-ENOENT truncate error, but stopping after any
436 * unlink error. If there is indeed a directory-wide problem, additional
437 * unlink attempts wouldn't work anyway.
438 */
439 if (ret >= 0 || errno != ENOENT)
440 {
441 MdPathStr segpath;
442 BlockNumber segno;
443
444 for (segno = 1;; segno++)
445 {
446 sprintf(segpath.str, "%s.%u", path.str, segno);
447
448 if (!RelFileLocatorBackendIsTemp(rlocator))
449 {
450 /*
451 * Prevent other backends' fds from holding on to the disk
452 * space. We're done if we see ENOENT, though.
453 */
454 if (do_truncate(segpath.str) < 0 && errno == ENOENT)
455 break;
456
457 /*
458 * Forget any pending sync requests for this segment before we
459 * try to unlink.
460 */
461 register_forget_request(rlocator, forknum, segno);
462 }
463
464 if (unlink(segpath.str) < 0)
465 {
466 /* ENOENT is expected after the last segment... */
467 if (errno != ENOENT)
468 ereport(WARNING,
469 (errcode_for_file_access(),
470 errmsg("could not remove file \"%s\": %m", segpath.str)));
471 break;
472 }
473 }
474 }
475}
bool IsBinaryUpgrade
Definition: globals.c:121
static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
Definition: md.c:1565
static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
Definition: md.c:1548
static int do_truncate(const char *path)
Definition: md.c:353
#define RelFileLocatorBackendIsTemp(rlocator)

References do_truncate(), ereport, errcode_for_file_access(), errmsg(), IsBinaryUpgrade, MAIN_FORKNUM, register_forget_request(), register_unlink_segment(), RelFileLocatorBackendIsTemp, relpath, sprintf, MdPathStr::str, RelPathStr::str, and WARNING.

Referenced by mdunlink().

◆ mdwriteback()

void mdwriteback ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
BlockNumber  nblocks 
)

Definition at line 1164 of file md.c.

1166{
1167 Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
1168
1169 /*
1170 * Issue flush requests in as few requests as possible; have to split at
1171 * segment boundaries though, since those are actually separate files.
1172 */
1173 while (nblocks > 0)
1174 {
1175 BlockNumber nflush = nblocks;
1176 pgoff_t seekpos;
1177 MdfdVec *v;
1178 int segnum_start,
1179 segnum_end;
1180
1181 v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
1182 EXTENSION_DONT_OPEN);
1183
1184 /*
1185 * We might be flushing buffers of already removed relations, that's
1186 * ok, just ignore that case. If the segment file wasn't open already
1187 * (ie from a recent mdwrite()), then we don't want to re-open it, to
1188 * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave
1189 * us with a descriptor to a file that is about to be unlinked.
1190 */
1191 if (!v)
1192 return;
1193
1194 /* compute offset inside the current segment */
1195 segnum_start = blocknum / RELSEG_SIZE;
1196
1197 /* compute number of desired writes within the current segment */
1198 segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
1199 if (segnum_start != segnum_end)
1200 nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
1201
1202 Assert(nflush >= 1);
1203 Assert(nflush <= nblocks);
1204
1205 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1206
1207 FileWriteback(v->mdfd_vfd, seekpos, (pgoff_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);
1208
1209 nblocks -= nflush;
1210 blocknum += nflush;
1211 }
1212}
void FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
Definition: fd.c:2136

References _mdfd_getseg(), Assert(), EXTENSION_DONT_OPEN, FileWriteback(), IO_DIRECT_DATA, io_direct_flags, _MdfdVec::mdfd_vfd, and pgoff_t.

◆ mdwritev()

void mdwritev ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
const void **  buffers,
BlockNumber  nblocks,
bool  skipFsync 
)

Definition at line 1059 of file md.c.

1061{
1062 /* This assert is too expensive to have on normally ... */
1063#ifdef CHECK_WRITE_VS_EXTEND
1064 Assert((uint64) blocknum + (uint64) nblocks <= (uint64) mdnblocks(reln, forknum));
1065#endif
1066
1067 while (nblocks > 0)
1068 {
1069 struct iovec iov[PG_IOV_MAX];
1070 int iovcnt;
1071 pgoff_t seekpos;
1072 int nbytes;
1073 MdfdVec *v;
1074 BlockNumber nblocks_this_segment;
1075 size_t transferred_this_segment;
1076 size_t size_this_segment;
1077
1078 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
1079 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
1080
1081 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1082
1083 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1084
1085 nblocks_this_segment =
1086 Min(nblocks,
1087 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
1088 nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));
1089
1090 if (nblocks_this_segment != nblocks)
1091 elog(ERROR, "write crosses segment boundary");
1092
1093 iovcnt = buffers_to_iovec(iov, (void **) buffers, nblocks_this_segment);
1094 size_this_segment = nblocks_this_segment * BLCKSZ;
1095 transferred_this_segment = 0;
1096
1097 /*
1098 * Inner loop to continue after a short write. If the reason is that
1099 * we're out of disk space, a future attempt should get an ENOSPC
1100 * error from the kernel.
1101 */
1102 for (;;)
1103 {
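1104 TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
1105 reln->smgr_rlocator.locator.spcOid,
1106 reln->smgr_rlocator.locator.dbOid,
1107 reln->smgr_rlocator.locator.relNumber,
1108 reln->smgr_rlocator.backend);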
1109 nbytes = FileWriteV(v->mdfd_vfd, iov, iovcnt, seekpos,
1110 WAIT_EVENT_DATA_FILE_WRITE);
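1111 TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
1112 reln->smgr_rlocator.locator.spcOid,
1113 reln->smgr_rlocator.locator.dbOid,
1114 reln->smgr_rlocator.locator.relNumber,
1115 reln->smgr_rlocator.backend,
1116 nbytes,
1117 size_this_segment - transferred_this_segment);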
1118
1119#ifdef SIMULATE_SHORT_WRITE
1120 nbytes = Min(nbytes, 4096);
1121#endif
1122
1123 if (nbytes < 0)
1124 {
1125 bool enospc = errno == ENOSPC;
1126
1127 ereport(ERROR,
1128 (errcode_for_file_access(),
1129 errmsg("could not write blocks %u..%u in file \"%s\": %m",
1130 blocknum,
1131 blocknum + nblocks_this_segment - 1,
1132 FilePathName(v->mdfd_vfd)),
1133 enospc ? errhint("Check free disk space.") : 0));
1134 }
1135
1136 /* One loop should usually be enough. */
1137 transferred_this_segment += nbytes;
1138 Assert(transferred_this_segment <= size_this_segment);
1139 if (transferred_this_segment == size_this_segment)
1140 break;
1141
1142 /* Adjust position and iovecs after a short write. */
1143 seekpos += nbytes;
1144 iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
1145 }
1146
1147 if (!skipFsync && !SmgrIsTemp(reln))
1148 register_dirty_segment(reln, forknum, v);
1149
1150 nblocks -= nblocks_this_segment;
1151 buffers += nblocks_this_segment;
1152 blocknum += nblocks_this_segment;
1153 }
1154}
ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition: fd.c:2244

References _mdfd_getseg(), Assert(), RelFileLocatorBackend::backend, buffers_to_iovec(), compute_remaining_iovec(), RelFileLocator::dbOid, elog, ereport, errcode_for_file_access(), errhint(), errmsg(), ERROR, EXTENSION_CREATE_RECOVERY, EXTENSION_FAIL, FilePathName(), FileWriteV(), lengthof, RelFileLocatorBackend::locator, _MdfdVec::mdfd_vfd, mdnblocks(), Min, PG_IOV_MAX, pgoff_t, register_dirty_segment(), RelFileLocator::relNumber, SMgrRelationData::smgr_rlocator, SmgrIsTemp, and RelFileLocator::spcOid.

◆ mdzeroextend()

void mdzeroextend ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
int  nblocks,
bool  skipFsync 
)

Definition at line 552 of file md.c.

554{
555 MdfdVec *v;
556 BlockNumber curblocknum = blocknum;
557 int remblocks = nblocks;
558
559 Assert(nblocks > 0);
560
561 /* This assert is too expensive to have on normally ... */
562#ifdef CHECK_WRITE_VS_EXTEND
563 Assert(blocknum >= mdnblocks(reln, forknum));
564#endif
565
566 /*
567 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
568 * more --- we mustn't create a block whose number actually is
569 * InvalidBlockNumber or larger.
570 */
571 if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
572 ereport(ERROR,
573 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
574 errmsg("cannot extend file \"%s\" beyond %u blocks",
575 relpath(reln->smgr_rlocator, forknum).str,
576 InvalidBlockNumber)));
577
578 while (remblocks > 0)
579 {
580 BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE);
581 pgoff_t seekpos = (pgoff_t) BLCKSZ * segstartblock;
582 int numblocks;
583
584 if (segstartblock + remblocks > RELSEG_SIZE)
585 numblocks = RELSEG_SIZE - segstartblock;
586 else
587 numblocks = remblocks;
588
589 v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
590
591 Assert(segstartblock < RELSEG_SIZE);
592 Assert(segstartblock + numblocks <= RELSEG_SIZE);
593
594 /*
595 * If available and useful, use posix_fallocate() (via
596 * FileFallocate()) to extend the relation. That's often more
597 * efficient than using write(), as it commonly won't cause the kernel
598 * to allocate page cache space for the extended pages.
599 *
600 * However, we don't use FileFallocate() for small extensions, as it
601 * defeats delayed allocation on some filesystems. Not clear where
602 * that decision should be made though? For now just use a cutoff of
603 * 8, anything between 4 and 8 worked OK in some local testing.
604 */
605 if (numblocks > 8)
606 {
607 int ret;
608
609 ret = FileFallocate(v->mdfd_vfd,
610 seekpos, (pgoff_t) BLCKSZ * numblocks,
611 WAIT_EVENT_DATA_FILE_EXTEND);
612 if (ret != 0)
613 {
614 ereport(ERROR,
615 errcode_for_file_access(),
616 errmsg("could not extend file \"%s\" with FileFallocate(): %m",
617 FilePathName(v->mdfd_vfd)),
618 errhint("Check free disk space."));
619 }
620 }
621 else
622 {
623 int ret;
624
625 /*
626 * Even if we don't want to use fallocate, we can still extend a
627 * bit more efficiently than writing each 8kB block individually.
628 * pg_pwrite_zeros() (via FileZero()) uses pg_pwritev_with_retry()
629 * to avoid multiple writes or needing a zeroed buffer for the
630 * whole length of the extension.
631 */
632 ret = FileZero(v->mdfd_vfd,
633 seekpos, (pgoff_t) BLCKSZ * numblocks,
634 WAIT_EVENT_DATA_FILE_EXTEND);
635 if (ret < 0)
636 ereport(ERROR,
637 errcode_for_file_access(),
638 errmsg("could not extend file \"%s\": %m",
639 FilePathName(v->mdfd_vfd)),
640 errhint("Check free disk space."));
641 }
642
643 if (!skipFsync && !SmgrIsTemp(reln))
644 register_dirty_segment(reln, forknum, v);
645
646 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
647
648 remblocks -= numblocks;
649 curblocknum += numblocks;
650 }
651}
int FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition: fd.c:2421
int FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition: fd.c:2376

References _mdfd_getseg(), _mdnblocks(), Assert(), ereport, errcode(), errcode_for_file_access(), errhint(), errmsg(), ERROR, EXTENSION_CREATE, FileFallocate(), FilePathName(), FileZero(), InvalidBlockNumber, _MdfdVec::mdfd_vfd, mdnblocks(), pgoff_t, register_dirty_segment(), relpath, SMgrRelationData::smgr_rlocator, and SmgrIsTemp.

◆ register_dirty_segment()

static void register_dirty_segment ( SMgrRelation  reln,
ForkNumber  forknum,
MdfdVec seg 
)
static

Definition at line 1504 of file md.c.

1505{
1506 FileTag tag;
1507
1508 INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno);
1509
1510 /* Temp relations should never be fsync'd */
1511 Assert(!SmgrIsTemp(reln));
1512
1513 if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
1514 {
1515 instr_time io_start;
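1516
1517 ereport(DEBUG1,
1518 (errmsg_internal("could not forward fsync request because request queue is full")));
1519
1520 io_start = pgstat_prepare_io_time(track_io_timing);
1521
1522 if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
1523 ereport(data_sync_elevel(ERROR),
1524 (errcode_for_file_access(),
1525 errmsg("could not fsync file \"%s\": %m",
1526 FilePathName(seg->mdfd_vfd))));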
1527
1528 /*
1529 * We have no way of knowing if the current IOContext is
1530 * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
1531 * point, so count the fsync as being in the IOCONTEXT_NORMAL
1532 * IOContext. This is probably okay, because the number of backend
1533 * fsyncs doesn't say anything about the efficacy of the
1534 * BufferAccessStrategy. And counting both fsyncs done in
1535 * IOCONTEXT_NORMAL and IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] under
1536 * IOCONTEXT_NORMAL is likely clearer when investigating the number of
1537 * backend fsyncs.
1538 */
1539 pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
1540 IOOP_FSYNC, io_start, 1, 0);
1541 }
1542}
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1170
#define DEBUG1
Definition: elog.h:30
@ SYNC_REQUEST
Definition: sync.h:25

References Assert(), data_sync_elevel(), DEBUG1, ereport, errcode_for_file_access(), errmsg(), errmsg_internal(), ERROR, FilePathName(), FileSync(), INIT_MD_FILETAG, IOCONTEXT_NORMAL, IOOBJECT_RELATION, IOOP_FSYNC, RelFileLocatorBackend::locator, _MdfdVec::mdfd_segno, _MdfdVec::mdfd_vfd, pgstat_count_io_op_time(), pgstat_prepare_io_time(), RegisterSyncRequest(), SMgrRelationData::smgr_rlocator, SmgrIsTemp, SYNC_REQUEST, and track_io_timing.

Referenced by mdcreate(), mdextend(), mdregistersync(), mdtruncate(), mdwritev(), and mdzeroextend().

◆ register_forget_request()

static void register_forget_request ( RelFileLocatorBackend  rlocator,
ForkNumber  forknum,
BlockNumber  segno 
)
static

Definition at line 1565 of file md.c.

1567{
1568 FileTag tag;
1569
1570 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1571
1572 RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
1573}
@ SYNC_FORGET_REQUEST
Definition: sync.h:27

References INIT_MD_FILETAG, RelFileLocatorBackend::locator, RegisterSyncRequest(), and SYNC_FORGET_REQUEST.

Referenced by mdunlinkfork().

◆ register_unlink_segment()

static void register_unlink_segment ( RelFileLocatorBackend  rlocator,
ForkNumber  forknum,
BlockNumber  segno 
)
static

Definition at line 1548 of file md.c.

1550{
1551 FileTag tag;
1552
1553 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1554
1555 /* Should never be used with temp relations */
1556 Assert(!RelFileLocatorBackendIsTemp(rlocator));
1557
1558 RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
1559}
@ SYNC_UNLINK_REQUEST
Definition: sync.h:26

References Assert(), INIT_MD_FILETAG, RelFileLocatorBackend::locator, RegisterSyncRequest(), RelFileLocatorBackendIsTemp, and SYNC_UNLINK_REQUEST.

Referenced by mdunlinkfork().

◆ StaticAssertDecl()

StaticAssertDecl ( RELSEG_SIZE > 0 && RELSEG_SIZE <= INT_MAX,
"RELSEG_SIZE must fit in an integer"
)

Variable Documentation

◆ aio_md_readv_cb

const PgAioHandleCallbacks aio_md_readv_cb
Initial value:
= {
.complete_shared = md_readv_complete,
.report = md_readv_report,
}
static void md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
Definition: md.c:2042
static PgAioResult md_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition: md.c:1975

Definition at line 169 of file md.c.

◆ MdCxt

MemoryContext MdCxt
static

Definition at line 97 of file md.c.

Referenced by _fdvec_resize(), and mdinit().