PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
md.h File Reference
#include "storage/aio_types.h"
#include "storage/block.h"
#include "storage/relfilelocator.h"
#include "storage/smgr.h"
#include "storage/sync.h"
Include dependency graph for md.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Functions

void mdinit (void)
 
void mdopen (SMgrRelation reln)
 
void mdclose (SMgrRelation reln, ForkNumber forknum)
 
void mdcreate (SMgrRelation reln, ForkNumber forknum, bool isRedo)
 
bool mdexists (SMgrRelation reln, ForkNumber forknum)
 
void mdunlink (RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
 
void mdextend (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
 
void mdzeroextend (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
 
bool mdprefetch (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
 
uint32 mdmaxcombine (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 
void mdreadv (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
 
void mdstartreadv (PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
 
void mdwritev (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync)
 
void mdwriteback (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
 
BlockNumber mdnblocks (SMgrRelation reln, ForkNumber forknum)
 
void mdtruncate (SMgrRelation reln, ForkNumber forknum, BlockNumber curnblk, BlockNumber nblocks)
 
void mdimmedsync (SMgrRelation reln, ForkNumber forknum)
 
void mdregistersync (SMgrRelation reln, ForkNumber forknum)
 
int mdfd (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
 
void ForgetDatabaseSyncRequests (Oid dbid)
 
void DropRelationFiles (RelFileLocator *delrels, int ndelrels, bool isRedo)
 
int mdsyncfiletag (const FileTag *ftag, char *path)
 
int mdunlinkfiletag (const FileTag *ftag, char *path)
 
bool mdfiletagmatches (const FileTag *ftag, const FileTag *candidate)
 

Variables

PGDLLIMPORT const PgAioHandleCallbacks aio_md_readv_cb
 

Function Documentation

◆ DropRelationFiles()

void DropRelationFiles ( RelFileLocator *  delrels,
int  ndelrels,
bool  isRedo 
)

Definition at line 1587 of file md.c.

1588{
1589 SMgrRelation *srels;
1590 int i;
1591
1592 srels = palloc(sizeof(SMgrRelation) * ndelrels);
1593 for (i = 0; i < ndelrels; i++)
1594 {
1595 SMgrRelation srel = smgropen(delrels[i], INVALID_PROC_NUMBER);
1596
1597 if (isRedo)
1598 {
1599 ForkNumber fork;
1600
1601 for (fork = 0; fork <= MAX_FORKNUM; fork++)
1602 XLogDropRelation(delrels[i], fork);
1603 }
1604 srels[i] = srel;
1605 }
1606
1607 smgrdounlinkall(srels, ndelrels, isRedo);
1608
1609 for (i = 0; i < ndelrels; i++)
1610 smgrclose(srels[i]);
1611 pfree(srels);
1612}
int i
Definition: isn.c:77
void pfree(void *pointer)
Definition: mcxt.c:2147
void * palloc(Size size)
Definition: mcxt.c:1940
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
ForkNumber
Definition: relpath.h:56
#define MAX_FORKNUM
Definition: relpath.h:70
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition: smgr.c:240
void smgrclose(SMgrRelation reln)
Definition: smgr.c:374
void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
Definition: smgr.c:538
void XLogDropRelation(RelFileLocator rlocator, ForkNumber forknum)
Definition: xlogutils.c:630

References i, INVALID_PROC_NUMBER, MAX_FORKNUM, palloc(), pfree(), smgrclose(), smgrdounlinkall(), smgropen(), and XLogDropRelation().

Referenced by FinishPreparedTransaction(), xact_redo_abort(), and xact_redo_commit().

◆ ForgetDatabaseSyncRequests()

void ForgetDatabaseSyncRequests ( Oid  dbid)

Definition at line 1569 of file md.c.

1570{
1571 FileTag tag;
1572 RelFileLocator rlocator;
1573
1574 rlocator.dbOid = dbid;
1575 rlocator.spcOid = 0;
1576 rlocator.relNumber = 0;
1577
1578 INIT_MD_FILETAG(tag, rlocator, InvalidForkNumber, InvalidBlockNumber);
1579
1580 RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
1581}
#define InvalidBlockNumber
Definition: block.h:33
#define INIT_MD_FILETAG(a, xx_rlocator, xx_forknum, xx_segno)
Definition: md.c:91
@ InvalidForkNumber
Definition: relpath.h:57
Definition: sync.h:51
RelFileNumber relNumber
bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, bool retryOnError)
Definition: sync.c:580
@ SYNC_FILTER_REQUEST
Definition: sync.h:28

References RelFileLocator::dbOid, INIT_MD_FILETAG, InvalidBlockNumber, InvalidForkNumber, RegisterSyncRequest(), RelFileLocator::relNumber, RelFileLocator::spcOid, and SYNC_FILTER_REQUEST.

Referenced by createdb_failure_callback(), dbase_redo(), and dropdb().

◆ mdclose()

void mdclose ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 703 of file md.c.

704{
705 int nopensegs = reln->md_num_open_segs[forknum];
706
707 /* No work if already closed */
708 if (nopensegs == 0)
709 return;
710
711 /* close segments starting from the end */
712 while (nopensegs > 0)
713 {
714 MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1];
715
716 FileClose(v->mdfd_vfd);
717 _fdvec_resize(reln, forknum, nopensegs - 1);
718 nopensegs--;
719 }
720}
void FileClose(File file)
Definition: fd.c:1982
static void _fdvec_resize(SMgrRelation reln, ForkNumber forknum, int nseg)
Definition: md.c:1619
int md_num_open_segs[MAX_FORKNUM+1]
Definition: smgr.h:61
struct _MdfdVec * md_seg_fds[MAX_FORKNUM+1]
Definition: smgr.h:62
Definition: md.c:82
File mdfd_vfd
Definition: md.c:83

References _fdvec_resize(), FileClose(), SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, and _MdfdVec::mdfd_vfd.

Referenced by mdexists().

◆ mdcreate()

void mdcreate ( SMgrRelation  reln,
ForkNumber  forknum,
bool  isRedo 
)

Definition at line 212 of file md.c.

213{
214 MdfdVec *mdfd;
215 RelPathStr path;
216 File fd;
217
218 if (isRedo && reln->md_num_open_segs[forknum] > 0)
219 return; /* created and opened already... */
220
221 Assert(reln->md_num_open_segs[forknum] == 0);
222
223 /*
224 * We may be using the target table space for the first time in this
225 * database, so create a per-database subdirectory if needed.
226 *
227 * XXX this is a fairly ugly violation of module layering, but this seems
228 * to be the best place to put the check. Maybe TablespaceCreateDbspace
229 * should be here and not in commands/tablespace.c? But that would imply
230 * importing a lot of stuff that smgr.c oughtn't know, either.
231 */
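232 TablespaceCreateDbspace(reln->smgr_rlocator.locator.spcOid,
233 reln->smgr_rlocator.locator.dbOid,
234 isRedo);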
235
236 path = relpath(reln->smgr_rlocator, forknum);
237
238 fd = PathNameOpenFile(path.str, _mdfd_open_flags() | O_CREAT | O_EXCL);
239
240 if (fd < 0)
241 {
242 int save_errno = errno;
243
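244 if (isRedo)
245 fd = PathNameOpenFile(path.str, _mdfd_open_flags());
246 if (fd < 0)
247 {
248 /* be sure to report the error reported by create, not open */
249 errno = save_errno;
250 ereport(ERROR,
251 (errcode_for_file_access(),
252 errmsg("could not create file \"%s\": %m", path.str)));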
253 }
254 }
255
256 _fdvec_resize(reln, forknum, 1);
257 mdfd = &reln->md_seg_fds[forknum][0];
258 mdfd->mdfd_vfd = fd;
259 mdfd->mdfd_segno = 0;
260
261 if (!SmgrIsTemp(reln))
262 register_dirty_segment(reln, forknum, mdfd);
263}
void TablespaceCreateDbspace(Oid spcOid, Oid dbOid, bool isRedo)
Definition: tablespace.c:112
int errcode_for_file_access(void)
Definition: elog.c:877
int errmsg(const char *fmt,...)
Definition: elog.c:1071
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1579
int File
Definition: fd.h:51
Assert(PointerIsAligned(start, uint64))
int mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
Definition: md.c:1470
static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
Definition: md.c:1494
static int _mdfd_open_flags(void)
Definition: md.c:166
static int fd(const char *x, int i)
Definition: preproc-init.c:105
#define relpath(rlocator, forknum)
Definition: relpath.h:150
#define SmgrIsTemp(smgr)
Definition: smgr.h:74
RelFileLocator locator
char str[REL_PATH_STR_MAXLEN+1]
Definition: relpath.h:123
RelFileLocatorBackend smgr_rlocator
Definition: smgr.h:38

References _fdvec_resize(), _mdfd_open_flags(), Assert(), RelFileLocator::dbOid, ereport, errcode_for_file_access(), errmsg(), ERROR, fd(), RelFileLocatorBackend::locator, SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, mdfd(), PathNameOpenFile(), register_dirty_segment(), relpath, SMgrRelationData::smgr_rlocator, SmgrIsTemp, RelFileLocator::spcOid, RelPathStr::str, and TablespaceCreateDbspace().

◆ mdexists()

bool mdexists ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 193 of file md.c.

194{
195 /*
196 * Close it first, to ensure that we notice if the fork has been unlinked
197 * since we opened it. As an optimization, we can skip that in recovery,
198 * which already closes relations when dropping them.
199 */
200 if (!InRecovery)
201 mdclose(reln, forknum);
202
203 return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL);
204}
void mdclose(SMgrRelation reln, ForkNumber forknum)
Definition: md.c:703
#define EXTENSION_RETURN_NULL
Definition: md.c:105
static MdfdVec * mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
Definition: md.c:654
bool InRecovery
Definition: xlogutils.c:50

References EXTENSION_RETURN_NULL, InRecovery, mdclose(), and mdopenfork().

◆ mdextend()

void mdextend ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
const void *  buffer,
bool  skipFsync 
)

Definition at line 477 of file md.c.

479{
480 off_t seekpos;
481 int nbytes;
482 MdfdVec *v;
483
484 /* If this build supports direct I/O, the buffer must be I/O aligned. */
485 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
486 Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
487
488 /* This assert is too expensive to have on normally ... */
489#ifdef CHECK_WRITE_VS_EXTEND
490 Assert(blocknum >= mdnblocks(reln, forknum));
491#endif
492
493 /*
494 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
495 * more --- we mustn't create a block whose number actually is
496 * InvalidBlockNumber. (Note that this failure should be unreachable
497 * because of upstream checks in bufmgr.c.)
498 */
499 if (blocknum == InvalidBlockNumber)
500 ereport(ERROR,
501 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
502 errmsg("cannot extend file \"%s\" beyond %u blocks",
503 relpath(reln->smgr_rlocator, forknum).str,
504 InvalidBlockNumber)));
505
506 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
507
508 seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
509
510 Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
511
512 if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
513 {
514 if (nbytes < 0)
515 ereport(ERROR,
516 (errcode_for_file_access(),
517 errmsg("could not extend file \"%s\": %m",
518 FilePathName(v->mdfd_vfd)),
519 errhint("Check free disk space.")));
520 /* short write: complain appropriately */
521 ereport(ERROR,
522 (errcode(ERRCODE_DISK_FULL),
523 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
524 FilePathName(v->mdfd_vfd),
525 nbytes, BLCKSZ, blocknum),
526 errhint("Check free disk space.")));
527 }
528
529 if (!skipFsync && !SmgrIsTemp(reln))
530 register_dirty_segment(reln, forknum, v);
531
532 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
533}
uint32 BlockNumber
Definition: block.h:31
#define TYPEALIGN(ALIGNVAL, LEN)
Definition: c.h:775
int errhint(const char *fmt,...)
Definition: elog.c:1318
int errcode(int sqlerrcode)
Definition: elog.c:854
char * FilePathName(File file)
Definition: fd.c:2516
static ssize_t FileWrite(File file, const void *buffer, size_t amount, off_t offset, uint32 wait_event_info)
Definition: fd.h:211
#define PG_O_DIRECT
Definition: fd.h:97
static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
Definition: md.c:1859
BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: md.c:1213
static MdfdVec * _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior)
Definition: md.c:1730
#define EXTENSION_CREATE
Definition: md.c:107
#define PG_IO_ALIGN_SIZE

References _mdfd_getseg(), _mdnblocks(), Assert(), ereport, errcode(), errcode_for_file_access(), errhint(), errmsg(), ERROR, EXTENSION_CREATE, FilePathName(), FileWrite(), InvalidBlockNumber, _MdfdVec::mdfd_vfd, mdnblocks(), PG_IO_ALIGN_SIZE, PG_O_DIRECT, register_dirty_segment(), relpath, SMgrRelationData::smgr_rlocator, SmgrIsTemp, and TYPEALIGN.

Referenced by _mdfd_getseg().

◆ mdfd()

int mdfd ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
uint32 *  off 
)

Definition at line 1470 of file md.c.

1471{
1472 MdfdVec *v = mdopenfork(reln, forknum, EXTENSION_FAIL);
1473
1474 v = _mdfd_getseg(reln, forknum, blocknum, false,
1475 EXTENSION_FAIL);
1476
1477 *off = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1478
1479 Assert(*off < (off_t) BLCKSZ * RELSEG_SIZE);
1480
1481 return FileGetRawDesc(v->mdfd_vfd);
1482}
int FileGetRawDesc(File file)
Definition: fd.c:2532
#define EXTENSION_FAIL
Definition: md.c:103

References _mdfd_getseg(), Assert(), EXTENSION_FAIL, FileGetRawDesc(), _MdfdVec::mdfd_vfd, and mdopenfork().

Referenced by mdcreate(), and mdopenfork().

◆ mdfiletagmatches()

bool mdfiletagmatches ( const FileTag *  ftag,
const FileTag *  candidate 
)

Definition at line 1950 of file md.c.

1951{
1952 /*
1953 * For now we only use filter requests as a way to drop all scheduled
1954 * callbacks relating to a given database, when dropping the database.
1955 * We'll return true for all candidates that have the same database OID as
1956 * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
1957 */
1958 return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
1959}
RelFileLocator rlocator
Definition: sync.h:54

References RelFileLocator::dbOid, and FileTag::rlocator.

◆ mdimmedsync()

void mdimmedsync ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 1417 of file md.c.

1418{
1419 int segno;
1420 int min_inactive_seg;
1421
1422 /*
1423 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1424 * the loop below will get them all!
1425 */
1426 mdnblocks(reln, forknum);
1427
1428 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1429
1430 /*
1431 * Temporarily open inactive segments, then close them after sync. There
1432 * may be some inactive segments left opened after fsync() error, but that
1433 * is harmless. We don't bother to clean them up and take a risk of
1434 * further trouble. The next mdclose() will soon close them.
1435 */
1436 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1437 segno++;
1438
1439 while (segno > 0)
1440 {
1441 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1442
1443 /*
1444 * fsyncs done through mdimmedsync() should be tracked in a separate
1445 * IOContext than those done through mdsyncfiletag() to differentiate
1446 * between unavoidable client backend fsyncs (e.g. those done during
1447 * index build) and those which ideally would have been done by the
1448 * checkpointer. Since other IO operations bypassing the buffer
1449 * manager could also be tracked in such an IOContext, wait until
1450 * these are also tracked to track immediate fsyncs.
1451 */
1452 if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
1453 ereport(data_sync_elevel(ERROR),
1454 (errcode_for_file_access(),
1455 errmsg("could not fsync file \"%s\": %m",
1456 FilePathName(v->mdfd_vfd))));
1457
1458 /* Close inactive segments immediately */
1459 if (segno > min_inactive_seg)
1460 {
1461 FileClose(v->mdfd_vfd);
1462 _fdvec_resize(reln, forknum, segno - 1);
1463 }
1464
1465 segno--;
1466 }
1467}
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2352
int data_sync_elevel(int elevel)
Definition: fd.c:4001
static MdfdVec * _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, int oflags)
Definition: md.c:1687

References _fdvec_resize(), _mdfd_openseg(), data_sync_elevel(), ereport, errcode_for_file_access(), errmsg(), ERROR, FileClose(), FilePathName(), FileSync(), SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, _MdfdVec::mdfd_vfd, and mdnblocks().

◆ mdinit()

void mdinit ( void  )

Definition at line 180 of file md.c.

181{
182 MdCxt = AllocSetContextCreate(TopMemoryContext,
183 "MdSmgr",
184 ALLOCSET_DEFAULT_SIZES);
185}
MemoryContext TopMemoryContext
Definition: mcxt.c:165
static MemoryContext MdCxt
Definition: md.c:87
#define AllocSetContextCreate
Definition: memutils.h:149
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:180

References ALLOCSET_DEFAULT_SIZES, AllocSetContextCreate, MdCxt, and TopMemoryContext.

◆ mdmaxcombine()

uint32 mdmaxcombine ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum 
)

Definition at line 823 of file md.c.

825{
826 BlockNumber segoff;
827
828 segoff = blocknum % ((BlockNumber) RELSEG_SIZE);
829
830 return RELSEG_SIZE - segoff;
831}

◆ mdnblocks()

BlockNumber mdnblocks ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 1213 of file md.c.

1214{
1215 MdfdVec *v;
1216 BlockNumber nblocks;
1217 BlockNumber segno;
1218
1219 mdopenfork(reln, forknum, EXTENSION_FAIL);
1220
1221 /* mdopen has opened the first segment */
1222 Assert(reln->md_num_open_segs[forknum] > 0);
1223
1224 /*
1225 * Start from the last open segments, to avoid redundant seeks. We have
1226 * previously verified that these segments are exactly RELSEG_SIZE long,
1227 * and it's useless to recheck that each time.
1228 *
1229 * NOTE: this assumption could only be wrong if another backend has
1230 * truncated the relation. We rely on higher code levels to handle that
1231 * scenario by closing and re-opening the md fd, which is handled via
1232 * relcache flush. (Since the checkpointer doesn't participate in
1233 * relcache flush, it could have segment entries for inactive segments;
1234 * that's OK because the checkpointer never needs to compute relation
1235 * size.)
1236 */
1237 segno = reln->md_num_open_segs[forknum] - 1;
1238 v = &reln->md_seg_fds[forknum][segno];
1239
1240 for (;;)
1241 {
1242 nblocks = _mdnblocks(reln, forknum, v);
1243 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1244 elog(FATAL, "segment too big");
1245 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1246 return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
1247
1248 /*
1249 * If segment is exactly RELSEG_SIZE, advance to next one.
1250 */
1251 segno++;
1252
1253 /*
1254 * We used to pass O_CREAT here, but that has the disadvantage that it
1255 * might create a segment which has vanished through some operating
1256 * system misadventure. In such a case, creating the segment here
1257 * undermines _mdfd_getseg's attempts to notice and report an error
1258 * upon access to a missing segment.
1259 */
1260 v = _mdfd_openseg(reln, forknum, segno, 0);
1261 if (v == NULL)
1262 return segno * ((BlockNumber) RELSEG_SIZE);
1263 }
1264}
#define FATAL
Definition: elog.h:41
#define elog(elevel,...)
Definition: elog.h:226

References _mdfd_openseg(), _mdnblocks(), Assert(), elog, EXTENSION_FAIL, FATAL, SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, and mdopenfork().

Referenced by mdextend(), mdimmedsync(), mdregistersync(), mdwritev(), and mdzeroextend().

◆ mdopen()

void mdopen ( SMgrRelation  reln)

Definition at line 692 of file md.c.

693{
694 /* mark it not open */
695 for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
696 reln->md_num_open_segs[forknum] = 0;
697}

References MAX_FORKNUM, and SMgrRelationData::md_num_open_segs.

◆ mdprefetch()

bool mdprefetch ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
int  nblocks 
)

Definition at line 726 of file md.c.

728{
729#ifdef USE_PREFETCH
730
731 Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
732
733 if ((uint64) blocknum + nblocks > (uint64) MaxBlockNumber + 1)
734 return false;
735
736 while (nblocks > 0)
737 {
738 off_t seekpos;
739 MdfdVec *v;
740 int nblocks_this_segment;
741
742 v = _mdfd_getseg(reln, forknum, blocknum, false,
743 InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL);
744 if (v == NULL)
745 return false;
746
747 seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
748
749 Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
750
751 nblocks_this_segment =
752 Min(nblocks,
753 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
754
755 (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ * nblocks_this_segment,
756 WAIT_EVENT_DATA_FILE_PREFETCH);
757
758 blocknum += nblocks_this_segment;
759 nblocks -= nblocks_this_segment;
760 }
761#endif /* USE_PREFETCH */
762
763 return true;
764}
#define MaxBlockNumber
Definition: block.h:35
#define Min(x, y)
Definition: c.h:975
uint64_t uint64
Definition: c.h:503
int io_direct_flags
Definition: fd.c:168
int FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2083
#define IO_DIRECT_DATA
Definition: fd.h:54

References _mdfd_getseg(), Assert(), EXTENSION_FAIL, EXTENSION_RETURN_NULL, FilePrefetch(), InRecovery, IO_DIRECT_DATA, io_direct_flags, MaxBlockNumber, _MdfdVec::mdfd_vfd, and Min.

◆ mdreadv()

void mdreadv ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
void **  buffers,
BlockNumber  nblocks 
)

Definition at line 837 of file md.c.

839{
840 while (nblocks > 0)
841 {
842 struct iovec iov[PG_IOV_MAX];
843 int iovcnt;
844 off_t seekpos;
845 int nbytes;
846 MdfdVec *v;
847 BlockNumber nblocks_this_segment;
848 size_t transferred_this_segment;
849 size_t size_this_segment;
850
851 v = _mdfd_getseg(reln, forknum, blocknum, false,
852 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
853
854 seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
855
856 Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
857
858 nblocks_this_segment =
859 Min(nblocks,
860 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
861 nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));
862
863 if (nblocks_this_segment != nblocks)
864 elog(ERROR, "read crosses segment boundary");
865
866 iovcnt = buffers_to_iovec(iov, buffers, nblocks_this_segment);
867 size_this_segment = nblocks_this_segment * BLCKSZ;
868 transferred_this_segment = 0;
869
870 /*
871 * Inner loop to continue after a short read. We'll keep going until
872 * we hit EOF rather than assuming that a short read means we hit the
873 * end.
874 */
875 for (;;)
876 {
877 TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
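878 reln->smgr_rlocator.locator.spcOid,
879 reln->smgr_rlocator.locator.dbOid,
880 reln->smgr_rlocator.locator.relNumber,
881 reln->smgr_rlocator.backend);
882 nbytes = FileReadV(v->mdfd_vfd, iov, iovcnt, seekpos,
883 WAIT_EVENT_DATA_FILE_READ);
884 TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
885 reln->smgr_rlocator.locator.spcOid,
886 reln->smgr_rlocator.locator.dbOid,
887 reln->smgr_rlocator.locator.relNumber,
888 reln->smgr_rlocator.backend,
889 nbytes,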
890 size_this_segment - transferred_this_segment);
891
892#ifdef SIMULATE_SHORT_READ
893 nbytes = Min(nbytes, 4096);
894#endif
895
896 if (nbytes < 0)
897 ereport(ERROR,
898 (errcode_for_file_access(),
899 errmsg("could not read blocks %u..%u in file \"%s\": %m",
900 blocknum,
901 blocknum + nblocks_this_segment - 1,
902 FilePathName(v->mdfd_vfd))));
903
904 if (nbytes == 0)
905 {
906 /*
907 * We are at or past EOF, or we read a partial block at EOF.
908 * Normally this is an error; upper levels should never try to
909 * read a nonexistent block. However, if zero_damaged_pages
910 * is ON or we are InRecovery, we should instead return zeroes
911 * without complaining. This allows, for example, the case of
912 * trying to update a block that was later truncated away.
913 *
914 * NB: We think that this codepath is unreachable in recovery
915 * and incomplete with zero_damaged_pages, as missing segments
916 * are not created. Putting blocks into the buffer-pool that
917 * do not exist on disk is rather problematic, as it will not
918 * be found by scans that rely on smgrnblocks(), as they are
919 * beyond EOF. It also can cause weird problems with relation
920 * extension, as relation extension does not expect blocks
921 * beyond EOF to exist.
922 *
923 * Therefore we do not want to copy the logic into
924 * mdstartreadv(), where it would have to be more complicated
925 * due to potential differences in the zero_damaged_pages
926 * setting between the definer and completor of IO.
927 *
928 * For PG 18, we are putting an Assert(false) in mdreadv()
929 * (triggering failures in assertion-enabled builds, but
930 * continuing to work in production builds). Afterwards we
931 * plan to remove this code entirely.
932 */
933 if (zero_damaged_pages || InRecovery)
934 {
935 Assert(false); /* see comment above */
936
937 for (BlockNumber i = transferred_this_segment / BLCKSZ;
938 i < nblocks_this_segment;
939 ++i)
940 memset(buffers[i], 0, BLCKSZ);
941 break;
942 }
943 else
944 ereport(ERROR,
945 (errcode(ERRCODE_DATA_CORRUPTED),
946 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
947 blocknum,
948 blocknum + nblocks_this_segment - 1,
949 FilePathName(v->mdfd_vfd),
950 transferred_this_segment,
951 size_this_segment)));
952 }
953
954 /* One loop should usually be enough. */
955 transferred_this_segment += nbytes;
956 Assert(transferred_this_segment <= size_this_segment);
957 if (transferred_this_segment == size_this_segment)
958 break;
959
960 /* Adjust position and vectors after a short read. */
961 seekpos += nbytes;
962 iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
963 }
964
965 nblocks -= nblocks_this_segment;
966 buffers += nblocks_this_segment;
967 blocknum += nblocks_this_segment;
968 }
969}
bool zero_damaged_pages
Definition: bufmgr.c:141
#define lengthof(array)
Definition: c.h:759
ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info)
Definition: fd.c:2165
int compute_remaining_iovec(struct iovec *destination, const struct iovec *source, int iovcnt, size_t transferred)
Definition: file_utils.c:614
#define EXTENSION_CREATE_RECOVERY
Definition: md.c:109
static int buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks)
Definition: md.c:774
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
#define PG_IOV_MAX
Definition: pg_iovec.h:41

References _mdfd_getseg(), Assert(), RelFileLocatorBackend::backend, buffers_to_iovec(), compute_remaining_iovec(), RelFileLocator::dbOid, elog, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_CREATE_RECOVERY, EXTENSION_FAIL, FilePathName(), FileReadV(), i, InRecovery, lengthof, RelFileLocatorBackend::locator, _MdfdVec::mdfd_vfd, Min, PG_IOV_MAX, RelFileLocator::relNumber, SMgrRelationData::smgr_rlocator, RelFileLocator::spcOid, and zero_damaged_pages.

◆ mdregistersync()

void mdregistersync ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 1366 of file md.c.

1367{
1368 int segno;
1369 int min_inactive_seg;
1370
1371 /*
1372 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1373 * the loop below will get them all!
1374 */
1375 mdnblocks(reln, forknum);
1376
1377 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1378
1379 /*
1380 * Temporarily open inactive segments, then close them after sync. There
1381 * may be some inactive segments left opened after error, but that is
1382 * harmless. We don't bother to clean them up and take a risk of further
1383 * trouble. The next mdclose() will soon close them.
1384 */
1385 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1386 segno++;
1387
1388 while (segno > 0)
1389 {
1390 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1391
1392 register_dirty_segment(reln, forknum, v);
1393
1394 /* Close inactive segments immediately */
1395 if (segno > min_inactive_seg)
1396 {
1397 FileClose(v->mdfd_vfd);
1398 _fdvec_resize(reln, forknum, segno - 1);
1399 }
1400
1401 segno--;
1402 }
1403}

References _fdvec_resize(), _mdfd_openseg(), FileClose(), SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, _MdfdVec::mdfd_vfd, mdnblocks(), and register_dirty_segment().

◆ mdstartreadv()

void mdstartreadv ( PgAioHandle *  ioh,
SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
void **  buffers,
BlockNumber  nblocks 
)

Definition at line 975 of file md.c.

978{
979 off_t seekpos;
980 MdfdVec *v;
981 BlockNumber nblocks_this_segment;
982 struct iovec *iov;
983 int iovcnt;
984 int ret;
985
986 v = _mdfd_getseg(reln, forknum, blocknum, false,
987 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
988
989 seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
990
991 Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
992
993 nblocks_this_segment =
994 Min(nblocks,
995 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
996
997 if (nblocks_this_segment != nblocks)
998 elog(ERROR, "read crossing segment boundary");
999
1000 iovcnt = pgaio_io_get_iovec(ioh, &iov);
1001
1002 Assert(nblocks <= iovcnt);
1003
1004 iovcnt = buffers_to_iovec(iov, buffers, nblocks_this_segment);
1005
1006 Assert(iovcnt <= nblocks_this_segment);
1007
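1008 if (!(io_direct_flags & IO_DIRECT_DATA))
1009 pgaio_io_set_flag(ioh, PGAIO_HF_BUFFERED);
1010
1011 pgaio_io_set_target_smgr(ioh,
1012 reln,
1013 forknum,
1014 blocknum,
1015 nblocks,
1016 false);
1017 pgaio_io_register_callbacks(ioh, PGAIO_HCB_MD_READV, 0);
1018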
1019 ret = FileStartReadV(ioh, v->mdfd_vfd, iovcnt, seekpos, WAIT_EVENT_DATA_FILE_READ);
1020 if (ret != 0)
1021 ereport(ERROR,
1022 (errcode_for_file_access(),
1023 errmsg("could not start reading blocks %u..%u in file \"%s\": %m",
1024 blocknum,
1025 blocknum + nblocks_this_segment - 1,
1026 FilePathName(v->mdfd_vfd))));
1027
1028 /*
1029 * The error checks corresponding to the post-read checks in mdreadv() are
1030 * in md_readv_complete().
1031 *
1032 * However we chose, at least for now, to not implement the
1033 * zero_damaged_pages logic present in mdreadv(). As outlined in mdreadv()
1034 * that logic is rather problematic, and we want to get rid of it. Here
1035 * equivalent logic would have to be more complicated due to potential
1036 * differences in the zero_damaged_pages setting between the definer and
1037 * completor of IO.
1038 */
1039}
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition: aio.c:318
@ PGAIO_HCB_MD_READV
Definition: aio.h:196
@ PGAIO_HF_BUFFERED
Definition: aio.h:77
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
Definition: aio_callback.c:86
int pgaio_io_get_iovec(PgAioHandle *ioh, struct iovec **iov)
Definition: aio_io.c:42
int FileStartReadV(PgAioHandle *ioh, File file, int iovcnt, off_t offset, uint32 wait_event_info)
Definition: fd.c:2221
void pgaio_io_set_target_smgr(PgAioHandle *ioh, SMgrRelationData *smgr, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skip_fsync)
Definition: smgr.c:1029

References _mdfd_getseg(), Assert(), buffers_to_iovec(), elog, ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_CREATE_RECOVERY, EXTENSION_FAIL, FilePathName(), FileStartReadV(), IO_DIRECT_DATA, io_direct_flags, _MdfdVec::mdfd_vfd, Min, PGAIO_HCB_MD_READV, PGAIO_HF_BUFFERED, pgaio_io_get_iovec(), pgaio_io_register_callbacks(), pgaio_io_set_flag(), and pgaio_io_set_target_smgr().

◆ mdsyncfiletag()

int mdsyncfiletag ( const FileTag *  ftag,
char *  path 
)

Definition at line 1880 of file md.c.

1881{
1882 SMgrRelation reln = smgropen(ftag->rlocator, INVALID_PROC_NUMBER);
1883 File file;
1884 instr_time io_start;
1885 bool need_to_close;
1886 int result,
1887 save_errno;
1888
1889 /* See if we already have the file open, or need to open it. */
1890 if (ftag->segno < reln->md_num_open_segs[ftag->forknum])
1891 {
1892 file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd;
1893 strlcpy(path, FilePathName(file), MAXPGPATH);
1894 need_to_close = false;
1895 }
1896 else
1897 {
1898 MdPathStr p;
1899
1900 p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
1901 strlcpy(path, p.str, MD_PATH_STR_MAXLEN);
1902
1903 file = PathNameOpenFile(path, _mdfd_open_flags());
1904 if (file < 0)
1905 return -1;
1906 need_to_close = true;
1907 }
1908
1909 io_start = pgstat_prepare_io_time(track_io_timing);
1910
1911 /* Sync the file. */
1912 result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
1913 save_errno = errno;
1914
1915 if (need_to_close)
1916 FileClose(file);
1917
1918 pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
1919 IOOP_FSYNC, io_start, 1, 0);
1920
1921 errno = save_errno;
1922 return result;
1923}
bool track_io_timing
Definition: bufmgr.c:144
static MdPathStr _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
Definition: md.c:1667
#define MD_PATH_STR_MAXLEN
Definition: md.c:122
#define MAXPGPATH
@ IOOBJECT_RELATION
Definition: pgstat.h:274
@ IOCONTEXT_NORMAL
Definition: pgstat.h:286
@ IOOP_FSYNC
Definition: pgstat.h:305
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition: pgstat_io.c:90
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:121
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
int16 forknum
Definition: sync.h:53
uint64 segno
Definition: sync.h:55
Definition: md.c:129
char str[MD_PATH_STR_MAXLEN+1]
Definition: md.c:130

References _mdfd_open_flags(), _mdfd_segpath(), FileClose(), FilePathName(), FileSync(), FileTag::forknum, INVALID_PROC_NUMBER, IOCONTEXT_NORMAL, IOOBJECT_RELATION, IOOP_FSYNC, MAXPGPATH, SMgrRelationData::md_num_open_segs, MD_PATH_STR_MAXLEN, SMgrRelationData::md_seg_fds, _MdfdVec::mdfd_vfd, PathNameOpenFile(), pgstat_count_io_op_time(), pgstat_prepare_io_time(), FileTag::rlocator, FileTag::segno, smgropen(), MdPathStr::str, strlcpy(), and track_io_timing.

◆ mdtruncate()

void mdtruncate ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  curnblk,
BlockNumber  nblocks 
)

Definition at line 1277 of file md.c.

1279{
1280 BlockNumber priorblocks;
1281 int curopensegs;
1282
1283 if (nblocks > curnblk)
1284 {
1285 /* Bogus request ... but no complaint if InRecovery */
1286 if (InRecovery)
1287 return;
1288 ereport(ERROR,
1289 (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
1290 relpath(reln->smgr_rlocator, forknum).str,
1291 nblocks, curnblk)));
1292 }
1293 if (nblocks == curnblk)
1294 return; /* no work */
1295
1296 /*
1297 * Truncate segments, starting at the last one. Starting at the end makes
1298 * managing the memory for the fd array easier, should there be errors.
1299 */
1300 curopensegs = reln->md_num_open_segs[forknum];
1301 while (curopensegs > 0)
1302 {
1303 MdfdVec *v;
1304
1305 priorblocks = (curopensegs - 1) * RELSEG_SIZE;
1306
1307 v = &reln->md_seg_fds[forknum][curopensegs - 1];
1308
1309 if (priorblocks > nblocks)
1310 {
1311 /*
1312 * This segment is no longer active. We truncate the file, but do
1313 * not delete it, for reasons explained in the header comments.
1314 */
1315 if (FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
1316 ereport(ERROR,
1317 (errcode_for_file_access(),
1318 errmsg("could not truncate file \"%s\": %m",
1319 FilePathName(v->mdfd_vfd))));
1320
1321 if (!SmgrIsTemp(reln))
1322 register_dirty_segment(reln, forknum, v);
1323
1324 /* we never drop the 1st segment */
1325 Assert(v != &reln->md_seg_fds[forknum][0]);
1326
1327 FileClose(v->mdfd_vfd);
1328 _fdvec_resize(reln, forknum, curopensegs - 1);
1329 }
1330 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
1331 {
1332 /*
1333 * This is the last segment we want to keep. Truncate the file to
1334 * the right length. NOTE: if nblocks is exactly a multiple K of
1335 * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
1336 * keep it. This adheres to the invariant given in the header
1337 * comments.
1338 */
1339 BlockNumber lastsegblocks = nblocks - priorblocks;
1340
1341 if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
1342 ereport(ERROR,
1343 (errcode_for_file_access(),
1344 errmsg("could not truncate file \"%s\" to %u blocks: %m",
1345 FilePathName(v->mdfd_vfd),
1346 nblocks)));
1347 if (!SmgrIsTemp(reln))
1348 register_dirty_segment(reln, forknum, v);
1349 }
1350 else
1351 {
1352 /*
1353 * We still need this segment, so nothing to do for this and any
1354 * earlier segment.
1355 */
1356 break;
1357 }
1358 curopensegs--;
1359 }
1360}
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2481

References _fdvec_resize(), Assert(), ereport, errcode_for_file_access(), errmsg(), ERROR, FileClose(), FilePathName(), FileTruncate(), InRecovery, SMgrRelationData::md_num_open_segs, SMgrRelationData::md_seg_fds, _MdfdVec::mdfd_vfd, register_dirty_segment(), relpath, SMgrRelationData::smgr_rlocator, and SmgrIsTemp.

◆ mdunlink()

void mdunlink ( RelFileLocatorBackend  rlocator,
ForkNumber  forknum,
bool  isRedo 
)

Definition at line 327 of file md.c.

328{
329 /* Now do the per-fork work */
330 if (forknum == InvalidForkNumber)
331 {
332 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
333 mdunlinkfork(rlocator, forknum, isRedo);
334 }
335 else
336 mdunlinkfork(rlocator, forknum, isRedo);
337}
static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
Definition: md.c:364

References InvalidForkNumber, MAX_FORKNUM, and mdunlinkfork().

◆ mdunlinkfiletag()

int mdunlinkfiletag ( const FileTag * ftag,
char *  path 
)

Definition at line 1932 of file md.c.

1933{
1934 RelPathStr p;
1935
1936 /* Compute the path. */
1937 p = relpathperm(ftag->rlocator, MAIN_FORKNUM);
1938 strlcpy(path, p.str, MAXPGPATH);
1939
1940 /* Try to unlink the file. */
1941 return unlink(path);
1942}
@ MAIN_FORKNUM
Definition: relpath.h:58
#define relpathperm(rlocator, forknum)
Definition: relpath.h:146

References MAIN_FORKNUM, MAXPGPATH, relpathperm, FileTag::rlocator, RelPathStr::str, and strlcpy().

◆ mdwriteback()

void mdwriteback ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
BlockNumber  nblocks 
)

Definition at line 1154 of file md.c.

1156{
1157 Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
1158
1159 /*
1160 * Issue flush requests in as few requests as possible; have to split at
1161 * segment boundaries though, since those are actually separate files.
1162 */
1163 while (nblocks > 0)
1164 {
1165 BlockNumber nflush = nblocks;
1166 off_t seekpos;
1167 MdfdVec *v;
1168 int segnum_start,
1169 segnum_end;
1170
1171 v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
1172 EXTENSION_DONT_OPEN);
1173
1174 /*
1175 * We might be flushing buffers of already removed relations, that's
1176 * ok, just ignore that case. If the segment file wasn't open already
1177 * (ie from a recent mdwrite()), then we don't want to re-open it, to
1178 * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave
1179 * us with a descriptor to a file that is about to be unlinked.
1180 */
1181 if (!v)
1182 return;
1183
1184 /* compute offset inside the current segment */
1185 segnum_start = blocknum / RELSEG_SIZE;
1186
1187 /* compute number of desired writes within the current segment */
1188 segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
1189 if (segnum_start != segnum_end)
1190 nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
1191
1192 Assert(nflush >= 1);
1193 Assert(nflush <= nblocks);
1194
1195 seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1196
1197 FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);
1198
1199 nblocks -= nflush;
1200 blocknum += nflush;
1201 }
1202}
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:2139
#define EXTENSION_DONT_OPEN
Definition: md.c:111

References _mdfd_getseg(), Assert(), EXTENSION_DONT_OPEN, FileWriteback(), IO_DIRECT_DATA, io_direct_flags, and _MdfdVec::mdfd_vfd.

◆ mdwritev()

void mdwritev ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
const void **  buffers,
BlockNumber  nblocks,
bool  skipFsync 
)

Definition at line 1049 of file md.c.

1051{
1052 /* This assert is too expensive to have on normally ... */
1053#ifdef CHECK_WRITE_VS_EXTEND
1054 Assert((uint64) blocknum + (uint64) nblocks <= (uint64) mdnblocks(reln, forknum));
1055#endif
1056
1057 while (nblocks > 0)
1058 {
1059 struct iovec iov[PG_IOV_MAX];
1060 int iovcnt;
1061 off_t seekpos;
1062 int nbytes;
1063 MdfdVec *v;
1064 BlockNumber nblocks_this_segment;
1065 size_t transferred_this_segment;
1066 size_t size_this_segment;
1067
1068 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
1069 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
1070
1071 seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1072
1073 Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
1074
1075 nblocks_this_segment =
1076 Min(nblocks,
1077 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
1078 nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));
1079
1080 if (nblocks_this_segment != nblocks)
1081 elog(ERROR, "write crosses segment boundary");
1082
1083 iovcnt = buffers_to_iovec(iov, (void **) buffers, nblocks_this_segment);
1084 size_this_segment = nblocks_this_segment * BLCKSZ;
1085 transferred_this_segment = 0;
1086
1087 /*
1088 * Inner loop to continue after a short write. If the reason is that
1089 * we're out of disk space, a future attempt should get an ENOSPC
1090 * error from the kernel.
1091 */
1092 for (;;)
1093 {
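1094 TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
1095 reln->smgr_rlocator.locator.spcOid,
1096 reln->smgr_rlocator.locator.dbOid,
1097 reln->smgr_rlocator.locator.relNumber,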
1098 reln->smgr_rlocator.backend);
1099 nbytes = FileWriteV(v->mdfd_vfd, iov, iovcnt, seekpos,
1100 WAIT_EVENT_DATA_FILE_WRITE);
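1101 TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
1102 reln->smgr_rlocator.locator.spcOid,
1103 reln->smgr_rlocator.locator.dbOid,
1104 reln->smgr_rlocator.locator.relNumber,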
1105 reln->smgr_rlocator.backend,
1106 nbytes,
1107 size_this_segment - transferred_this_segment);
1108
1109#ifdef SIMULATE_SHORT_WRITE
1110 nbytes = Min(nbytes, 4096);
1111#endif
1112
1113 if (nbytes < 0)
1114 {
1115 bool enospc = errno == ENOSPC;
1116
1117 ereport(ERROR,
1118 (errcode_for_file_access(),
1119 errmsg("could not write blocks %u..%u in file \"%s\": %m",
1120 blocknum,
1121 blocknum + nblocks_this_segment - 1,
1122 FilePathName(v->mdfd_vfd)),
1123 enospc ? errhint("Check free disk space.") : 0));
1124 }
1125
1126 /* One loop should usually be enough. */
1127 transferred_this_segment += nbytes;
1128 Assert(transferred_this_segment <= size_this_segment);
1129 if (transferred_this_segment == size_this_segment)
1130 break;
1131
1132 /* Adjust position and iovecs after a short write. */
1133 seekpos += nbytes;
1134 iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
1135 }
1136
1137 if (!skipFsync && !SmgrIsTemp(reln))
1138 register_dirty_segment(reln, forknum, v);
1139
1140 nblocks -= nblocks_this_segment;
1141 buffers += nblocks_this_segment;
1142 blocknum += nblocks_this_segment;
1143 }
1144}
ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info)
Definition: fd.c:2247

References _mdfd_getseg(), Assert(), RelFileLocatorBackend::backend, buffers_to_iovec(), compute_remaining_iovec(), RelFileLocator::dbOid, elog, ereport, errcode_for_file_access(), errhint(), errmsg(), ERROR, EXTENSION_CREATE_RECOVERY, EXTENSION_FAIL, FilePathName(), FileWriteV(), lengthof, RelFileLocatorBackend::locator, _MdfdVec::mdfd_vfd, mdnblocks(), Min, PG_IOV_MAX, register_dirty_segment(), RelFileLocator::relNumber, SMgrRelationData::smgr_rlocator, SmgrIsTemp, and RelFileLocator::spcOid.

◆ mdzeroextend()

void mdzeroextend ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
int  nblocks,
bool  skipFsync 
)

Definition at line 542 of file md.c.

544{
545 MdfdVec *v;
546 BlockNumber curblocknum = blocknum;
547 int remblocks = nblocks;
548
549 Assert(nblocks > 0);
550
551 /* This assert is too expensive to have on normally ... */
552#ifdef CHECK_WRITE_VS_EXTEND
553 Assert(blocknum >= mdnblocks(reln, forknum));
554#endif
555
556 /*
557 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
558 * more --- we mustn't create a block whose number actually is
559 * InvalidBlockNumber or larger.
560 */
561 if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
562 ereport(ERROR,
563 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
564 errmsg("cannot extend file \"%s\" beyond %u blocks",
565 relpath(reln->smgr_rlocator, forknum).str,
566 InvalidBlockNumber)));
567
568 while (remblocks > 0)
569 {
570 BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE);
571 off_t seekpos = (off_t) BLCKSZ * segstartblock;
572 int numblocks;
573
574 if (segstartblock + remblocks > RELSEG_SIZE)
575 numblocks = RELSEG_SIZE - segstartblock;
576 else
577 numblocks = remblocks;
578
579 v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
580
581 Assert(segstartblock < RELSEG_SIZE);
582 Assert(segstartblock + numblocks <= RELSEG_SIZE);
583
584 /*
585 * If available and useful, use posix_fallocate() (via
586 * FileFallocate()) to extend the relation. That's often more
587 * efficient than using write(), as it commonly won't cause the kernel
588 * to allocate page cache space for the extended pages.
589 *
590 * However, we don't use FileFallocate() for small extensions, as it
591 * defeats delayed allocation on some filesystems. Not clear where
592 * that decision should be made though? For now just use a cutoff of
593 * 8, anything between 4 and 8 worked OK in some local testing.
594 */
595 if (numblocks > 8)
596 {
597 int ret;
598
599 ret = FileFallocate(v->mdfd_vfd,
600 seekpos, (off_t) BLCKSZ * numblocks,
601 WAIT_EVENT_DATA_FILE_EXTEND);
602 if (ret != 0)
603 {
604 ereport(ERROR,
605 errcode_for_file_access(),
606 errmsg("could not extend file \"%s\" with FileFallocate(): %m",
607 FilePathName(v->mdfd_vfd)),
608 errhint("Check free disk space."));
609 }
610 }
611 else
612 {
613 int ret;
614
615 /*
616 * Even if we don't want to use fallocate, we can still extend a
617 * bit more efficiently than writing each 8kB block individually.
618 * pg_pwrite_zeros() (via FileZero()) uses pg_pwritev_with_retry()
619 * to avoid multiple writes or needing a zeroed buffer for the
620 * whole length of the extension.
621 */
622 ret = FileZero(v->mdfd_vfd,
623 seekpos, (off_t) BLCKSZ * numblocks,
624 WAIT_EVENT_DATA_FILE_EXTEND);
625 if (ret < 0)
626 ereport(ERROR,
627 errcode_for_file_access(),
628 errmsg("could not extend file \"%s\": %m",
629 FilePathName(v->mdfd_vfd)),
630 errhint("Check free disk space."));
631 }
632
633 if (!skipFsync && !SmgrIsTemp(reln))
634 register_dirty_segment(reln, forknum, v);
635
636 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
637
638 remblocks -= numblocks;
639 curblocknum += numblocks;
640 }
641}
int FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2424
int FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2379

References _mdfd_getseg(), _mdnblocks(), Assert(), ereport, errcode(), errcode_for_file_access(), errhint(), errmsg(), ERROR, EXTENSION_CREATE, FileFallocate(), FilePathName(), FileZero(), InvalidBlockNumber, _MdfdVec::mdfd_vfd, mdnblocks(), register_dirty_segment(), relpath, SMgrRelationData::smgr_rlocator, and SmgrIsTemp.

Variable Documentation

◆ aio_md_readv_cb

PGDLLIMPORT const PgAioHandleCallbacks aio_md_readv_cb
extern

Definition at line 159 of file md.c.