PostgreSQL Source Code git master
Loading...
Searching...
No Matches
md.c File Reference
#include "postgres.h"
#include <limits.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>
#include "access/xlogutils.h"
#include "commands/tablespace.h"
#include "common/file_utils.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "storage/aio.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/md.h"
#include "storage/relfilelocator.h"
#include "storage/smgr.h"
#include "storage/sync.h"
#include "utils/memutils.h"
Include dependency graph for md.c:

Go to the source code of this file.

Data Structures

struct  _MdfdVec
 
struct  MdPathStr
 

Macros

#define INIT_MD_FILETAG(a, xx_rlocator, xx_forknum, xx_segno)
 
#define EXTENSION_FAIL   (1 << 0)
 
#define EXTENSION_RETURN_NULL   (1 << 1)
 
#define EXTENSION_CREATE   (1 << 2)
 
#define EXTENSION_CREATE_RECOVERY   (1 << 3)
 
#define EXTENSION_DONT_OPEN   (1 << 5)
 
#define SEGMENT_CHARS   OIDCHARS
 
#define MD_PATH_STR_MAXLEN
 

Typedefs

typedef struct _MdfdVec MdfdVec
 
typedef struct MdPathStr MdPathStr
 

Functions

 StaticAssertDecl (RELSEG_SIZE > 0 && RELSEG_SIZE <= INT_MAX, "RELSEG_SIZE must fit in an integer")
 
static void mdunlinkfork (RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
 
static MdfdVec * mdopenfork (SMgrRelation reln, ForkNumber forknum, int behavior)
 
static void register_dirty_segment (SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 
static void register_unlink_segment (RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
 
static void register_forget_request (RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
 
static void _fdvec_resize (SMgrRelation reln, ForkNumber forknum, int nseg)
 
static MdPathStr _mdfd_segpath (SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
 
static MdfdVec * _mdfd_openseg (SMgrRelation reln, ForkNumber forknum, BlockNumber segno, int oflags)
 
static MdfdVec * _mdfd_getseg (SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior)
 
static BlockNumber _mdnblocks (SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 
static PgAioResult md_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 
static void md_readv_report (PgAioResult result, const PgAioTargetData *td, int elevel)
 
static int _mdfd_open_flags (void)
 
void mdinit (void)
 
bool mdexists (SMgrRelation reln, ForkNumber forknum)
 
void mdcreate (SMgrRelation reln, ForkNumber forknum, bool isRedo)
 
void mdunlink (RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
 
static int do_truncate (const char *path)
 
void mdextend (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
 
void mdzeroextend (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
 
void mdopen (SMgrRelation reln)
 
void mdclose (SMgrRelation reln, ForkNumber forknum)
 
bool mdprefetch (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
 
static int buffers_to_iovec (struct iovec *iov, void **buffers, int nblocks)
 
uint32 mdmaxcombine (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 
void mdreadv (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
 
void mdstartreadv (PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
 
void mdwritev (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync)
 
void mdwriteback (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
 
BlockNumber mdnblocks (SMgrRelation reln, ForkNumber forknum)
 
void mdtruncate (SMgrRelation reln, ForkNumber forknum, BlockNumber curnblk, BlockNumber nblocks)
 
void mdregistersync (SMgrRelation reln, ForkNumber forknum)
 
void mdimmedsync (SMgrRelation reln, ForkNumber forknum)
 
int mdfd (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
 
void ForgetDatabaseSyncRequests (Oid dbid)
 
void DropRelationFiles (RelFileLocator *delrels, int ndelrels, bool isRedo)
 
int mdsyncfiletag (const FileTag *ftag, char *path)
 
int mdunlinkfiletag (const FileTag *ftag, char *path)
 
bool mdfiletagmatches (const FileTag *ftag, const FileTag *candidate)
 

Variables

static MemoryContext MdCxt
 
const PgAioHandleCallbacks aio_md_readv_cb
 

Macro Definition Documentation

◆ EXTENSION_CREATE

#define EXTENSION_CREATE   (1 << 2)

Definition at line 117 of file md.c.

◆ EXTENSION_CREATE_RECOVERY

#define EXTENSION_CREATE_RECOVERY   (1 << 3)

Definition at line 119 of file md.c.

◆ EXTENSION_DONT_OPEN

#define EXTENSION_DONT_OPEN   (1 << 5)

Definition at line 121 of file md.c.

◆ EXTENSION_FAIL

#define EXTENSION_FAIL   (1 << 0)

Definition at line 113 of file md.c.

◆ EXTENSION_RETURN_NULL

#define EXTENSION_RETURN_NULL   (1 << 1)

Definition at line 115 of file md.c.

◆ INIT_MD_FILETAG

#define INIT_MD_FILETAG (   a,
  xx_rlocator,
  xx_forknum,
  xx_segno 
)
Value:
( \
memset(&(a), 0, sizeof(FileTag)), \
(a).handler = SYNC_HANDLER_MD, \
(a).rlocator = (xx_rlocator), \
(a).forknum = (xx_forknum), \
(a).segno = (xx_segno) \
)
int a
Definition isn.c:73
static int fb(int x)
Definition sync.h:51
@ SYNC_HANDLER_MD
Definition sync.h:37

Definition at line 101 of file md.c.

138{
139 char str[MD_PATH_STR_MAXLEN + 1];
140} MdPathStr;
141
142
143/* local routines */
144static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum,
145 bool isRedo);
146static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior);
148 MdfdVec *seg);
149static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum,
150 BlockNumber segno);
151static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum,
152 BlockNumber segno);
154 ForkNumber forknum,
155 int nseg);
157 BlockNumber segno);
159 BlockNumber segno, int oflags);
161 BlockNumber blkno, bool skipFsync, int behavior);
163 MdfdVec *seg);
164
166static void md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel);
167
170 .report = md_readv_report,
171};
172
173
174static inline int
176{
177 int flags = O_RDWR | PG_BINARY;
178
180 flags |= PG_O_DIRECT;
181
182 return flags;
183}
184
185/*
186 * mdinit() -- Initialize private state for magnetic disk storage manager.
187 */
188void
189mdinit(void)
190{
192 "MdSmgr",
194}
195
196/*
197 * mdexists() -- Does the physical file exist?
198 *
199 * Note: this will return true for lingering files, with pending deletions
200 */
201bool
203{
204 /*
205 * Close it first, to ensure that we notice if the fork has been unlinked
206 * since we opened it. As an optimization, we can skip that in recovery,
207 * which already closes relations when dropping them.
208 */
209 if (!InRecovery)
210 mdclose(reln, forknum);
211
212 return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL);
213}
214
215/*
216 * mdcreate() -- Create a new relation on magnetic disk.
217 *
218 * If isRedo is true, it's okay for the relation to exist already.
219 */
220void
222{
223 MdfdVec *mdfd;
224 RelPathStr path;
225 File fd;
226
227 if (isRedo && reln->md_num_open_segs[forknum] > 0)
228 return; /* created and opened already... */
229
230 Assert(reln->md_num_open_segs[forknum] == 0);
231
232 /*
233 * We may be using the target table space for the first time in this
234 * database, so create a per-database subdirectory if needed.
235 *
236 * XXX this is a fairly ugly violation of module layering, but this seems
237 * to be the best place to put the check. Maybe TablespaceCreateDbspace
238 * should be here and not in commands/tablespace.c? But that would imply
239 * importing a lot of stuff that smgr.c oughtn't know, either.
240 */
241 TablespaceCreateDbspace(reln->smgr_rlocator.locator.spcOid,
242 reln->smgr_rlocator.locator.dbOid,
243 isRedo);
244
245 path = relpath(reln->smgr_rlocator, forknum);
246
248
249 if (fd < 0)
250 {
251 int save_errno = errno;
252
253 if (isRedo)
255 if (fd < 0)
256 {
257 /* be sure to report the error reported by create, not open */
261 errmsg("could not create file \"%s\": %m", path.str)));
262 }
263 }
264
265 _fdvec_resize(reln, forknum, 1);
266 mdfd = &reln->md_seg_fds[forknum][0];
267 mdfd->mdfd_vfd = fd;
268 mdfd->mdfd_segno = 0;
269
270 if (!SmgrIsTemp(reln))
272}
273
274/*
275 * mdunlink() -- Unlink a relation.
276 *
277 * Note that we're passed a RelFileLocatorBackend --- by the time this is called,
278 * there won't be an SMgrRelation hashtable entry anymore.
279 *
280 * forknum can be a fork number to delete a specific fork, or InvalidForkNumber
281 * to delete all forks.
282 *
283 * For regular relations, we don't unlink the first segment file of the rel,
284 * but just truncate it to zero length, and record a request to unlink it after
285 * the next checkpoint. Additional segments can be unlinked immediately,
286 * however. Leaving the empty file in place prevents that relfilenumber
287 * from being reused. The scenario this protects us from is:
288 * 1. We delete a relation (and commit, and actually remove its file).
289 * 2. We create a new relation, which by chance gets the same relfilenumber as
290 * the just-deleted one (OIDs must've wrapped around for that to happen).
291 * 3. We crash before another checkpoint occurs.
292 * During replay, we would delete the file and then recreate it, which is fine
293 * if the contents of the file were repopulated by subsequent WAL entries.
294 * But if we didn't WAL-log insertions, but instead relied on fsyncing the
295 * file after populating it (as we do at wal_level=minimal), the contents of
296 * the file would be lost forever. By leaving the empty file until after the
297 * next checkpoint, we prevent reassignment of the relfilenumber until it's
298 * safe, because relfilenumber assignment skips over any existing file.
299 *
300 * Additional segments, if any, are truncated and then unlinked. The reason
301 * for truncating is that other backends may still hold open FDs for these at
302 * the smgr level, so that the kernel can't remove the file yet. We want to
303 * reclaim the disk space right away despite that.
304 *
305 * We do not need to go through this dance for temp relations, though, because
306 * we never make WAL entries for temp rels, and so a temp rel poses no threat
307 * to the health of a regular rel that has taken over its relfilenumber.
308 * The fact that temp rels and regular rels have different file naming
309 * patterns provides additional safety. Other backends shouldn't have open
310 * FDs for them, either.
311 *
312 * We also don't do it while performing a binary upgrade. There is no reuse
313 * hazard in that case, since after a crash or even a simple ERROR, the
314 * upgrade fails and the whole cluster must be recreated from scratch.
315 * Furthermore, it is important to remove the files from disk immediately,
316 * because we may be about to reuse the same relfilenumber.
317 *
318 * All the above applies only to the relation's main fork; other forks can
319 * just be removed immediately, since they are not needed to prevent the
320 * relfilenumber from being recycled. Also, we do not carefully
321 * track whether other forks have been created or not, but just attempt to
322 * unlink them unconditionally; so we should never complain about ENOENT.
323 *
324 * If isRedo is true, it's unsurprising for the relation to be already gone.
325 * Also, we should remove the file immediately instead of queuing a request
326 * for later, since during redo there's no possibility of creating a
327 * conflicting relation.
328 *
329 * Note: we currently just never warn about ENOENT at all. We could warn in
330 * the main-fork, non-isRedo case, but it doesn't seem worth the trouble.
331 *
332 * Note: any failure should be reported as WARNING not ERROR, because
333 * we are usually not in a transaction anymore when this is called.
334 */
335void
336mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
337{
338 /* Now do the per-fork work */
339 if (forknum == InvalidForkNumber)
340 {
341 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
342 mdunlinkfork(rlocator, forknum, isRedo);
343 }
344 else
345 mdunlinkfork(rlocator, forknum, isRedo);
346}
347
348/*
349 * Truncate a file to release disk space.
350 */
351static int
352do_truncate(const char *path)
353{
354 int save_errno;
355 int ret;
356
357 ret = pg_truncate(path, 0);
358
359 /* Log a warning here to avoid repetition in callers. */
360 if (ret < 0 && errno != ENOENT)
361 {
365 errmsg("could not truncate file \"%s\": %m", path)));
367 }
368
369 return ret;
370}
371
372static void
374{
375 RelPathStr path;
376 int ret;
377 int save_errno;
378
379 path = relpath(rlocator, forknum);
380
381 /*
382 * Truncate and then unlink the first segment, or just register a request
383 * to unlink it later, as described in the comments for mdunlink().
384 */
385 if (isRedo || IsBinaryUpgrade || forknum != MAIN_FORKNUM ||
387 {
388 if (!RelFileLocatorBackendIsTemp(rlocator))
389 {
390 /* Prevent other backends' fds from holding on to the disk space */
391 ret = do_truncate(path.str);
392
393 /* Forget any pending sync requests for the first segment */
395 register_forget_request(rlocator, forknum, 0 /* first seg */ );
397 }
398 else
399 ret = 0;
400
401 /* Next unlink the file, unless it was already found to be missing */
402 if (ret >= 0 || errno != ENOENT)
403 {
404 ret = unlink(path.str);
405 if (ret < 0 && errno != ENOENT)
406 {
410 errmsg("could not remove file \"%s\": %m", path.str)));
412 }
413 }
414 }
415 else
416 {
417 /* Prevent other backends' fds from holding on to the disk space */
418 ret = do_truncate(path.str);
419
420 /* Register request to unlink first segment later */
422 register_unlink_segment(rlocator, forknum, 0 /* first seg */ );
424 }
425
426 /*
427 * Delete any additional segments.
428 *
429 * Note that because we loop until getting ENOENT, we will correctly
430 * remove all inactive segments as well as active ones. Ideally we'd
431 * continue the loop until getting exactly that errno, but that risks an
432 * infinite loop if the problem is directory-wide (for instance, if we
433 * suddenly can't read the data directory itself). We compromise by
434 * continuing after a non-ENOENT truncate error, but stopping after any
435 * unlink error. If there is indeed a directory-wide problem, additional
436 * unlink attempts wouldn't work anyway.
437 */
438 if (ret >= 0 || errno != ENOENT)
439 {
441 BlockNumber segno;
442
443 for (segno = 1;; segno++)
444 {
445 sprintf(segpath.str, "%s.%u", path.str, segno);
446
447 if (!RelFileLocatorBackendIsTemp(rlocator))
448 {
449 /*
450 * Prevent other backends' fds from holding on to the disk
451 * space. We're done if we see ENOENT, though.
452 */
453 if (do_truncate(segpath.str) < 0 && errno == ENOENT)
454 break;
455
456 /*
457 * Forget any pending sync requests for this segment before we
458 * try to unlink.
459 */
460 register_forget_request(rlocator, forknum, segno);
461 }
462
463 if (unlink(segpath.str) < 0)
464 {
465 /* ENOENT is expected after the last segment... */
466 if (errno != ENOENT)
469 errmsg("could not remove file \"%s\": %m", segpath.str)));
470 break;
471 }
472 }
473 }
474}
475
476/*
477 * mdextend() -- Add a block to the specified relation.
478 *
479 * The semantics are nearly the same as mdwritev(): write at the
480 * specified position. However, this is to be used for the case of
481 * extending a relation (i.e., blocknum is at or beyond the current
482 * EOF). Note that we assume writing a block beyond current EOF
483 * causes intervening file space to become filled with zeroes.
484 */
485void
487 const void *buffer, bool skipFsync)
488{
489 pgoff_t seekpos;
490 int nbytes;
491 MdfdVec *v;
492
493 /* If this build supports direct I/O, the buffer must be I/O aligned. */
494 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
495 Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
496
497 /* This assert is too expensive to have on normally ... */
498#ifdef CHECK_WRITE_VS_EXTEND
499 Assert(blocknum >= mdnblocks(reln, forknum));
500#endif
501
502 /*
503 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
504 * more --- we mustn't create a block whose number actually is
505 * InvalidBlockNumber. (Note that this failure should be unreachable
506 * because of upstream checks in bufmgr.c.)
507 */
508 if (blocknum == InvalidBlockNumber)
511 errmsg("cannot extend file \"%s\" beyond %u blocks",
512 relpath(reln->smgr_rlocator, forknum).str,
514
515 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
516
517 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
518
519 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
520
521 if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
522 {
523 if (nbytes < 0)
526 errmsg("could not extend file \"%s\": %m",
528 errhint("Check free disk space.")));
529 /* short write: complain appropriately */
532 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
534 nbytes, BLCKSZ, blocknum),
535 errhint("Check free disk space.")));
536 }
537
538 if (!skipFsync && !SmgrIsTemp(reln))
539 register_dirty_segment(reln, forknum, v);
540
541 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
542}
543
544/*
545 * mdzeroextend() -- Add new zeroed out blocks to the specified relation.
546 *
547 * Similar to mdextend(), except the relation can be extended by multiple
548 * blocks at once and the added blocks will be filled with zeroes.
549 */
550void
552 BlockNumber blocknum, int nblocks, bool skipFsync)
553{
554 MdfdVec *v;
555 BlockNumber curblocknum = blocknum;
556 int remblocks = nblocks;
557
558 Assert(nblocks > 0);
559
560 /* This assert is too expensive to have on normally ... */
561#ifdef CHECK_WRITE_VS_EXTEND
562 Assert(blocknum >= mdnblocks(reln, forknum));
563#endif
564
565 /*
566 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
567 * more --- we mustn't create a block whose number actually is
568 * InvalidBlockNumber or larger.
569 */
570 if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
573 errmsg("cannot extend file \"%s\" beyond %u blocks",
574 relpath(reln->smgr_rlocator, forknum).str,
576
577 while (remblocks > 0)
578 {
580 pgoff_t seekpos = (pgoff_t) BLCKSZ * segstartblock;
581 int numblocks;
582
585 else
587
589
592
593 /*
594 * If available and useful, use posix_fallocate() (via
595 * FileFallocate()) to extend the relation. That's often more
596 * efficient than using write(), as it commonly won't cause the kernel
597 * to allocate page cache space for the extended pages.
598 *
599 * However, we don't use FileFallocate() for small extensions, as it
600 * defeats delayed allocation on some filesystems. Not clear where
601 * that decision should be made though? For now just use a cutoff of
602 * 8, anything between 4 and 8 worked OK in some local testing.
603 */
604 if (numblocks > 8)
605 {
606 int ret;
607
608 ret = FileFallocate(v->mdfd_vfd,
609 seekpos, (pgoff_t) BLCKSZ * numblocks,
611 if (ret != 0)
612 {
615 errmsg("could not extend file \"%s\" with FileFallocate(): %m",
617 errhint("Check free disk space."));
618 }
619 }
620 else
621 {
622 int ret;
623
624 /*
625 * Even if we don't want to use fallocate, we can still extend a
626 * bit more efficiently than writing each 8kB block individually.
627 * pg_pwrite_zeros() (via FileZero()) uses pg_pwritev_with_retry()
628 * to avoid multiple writes or needing a zeroed buffer for the
629 * whole length of the extension.
630 */
631 ret = FileZero(v->mdfd_vfd,
632 seekpos, (pgoff_t) BLCKSZ * numblocks,
634 if (ret < 0)
637 errmsg("could not extend file \"%s\": %m",
639 errhint("Check free disk space."));
640 }
641
642 if (!skipFsync && !SmgrIsTemp(reln))
643 register_dirty_segment(reln, forknum, v);
644
645 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
646
649 }
650}
651
652/*
653 * mdopenfork() -- Open one fork of the specified relation.
654 *
655 * Note we only open the first segment, when there are multiple segments.
656 *
657 * If first segment is not present, either ereport or return NULL according
658 * to "behavior". We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
659 * EXTENSION_CREATE means it's OK to extend an existing relation, not to
660 * invent one out of whole cloth.
661 */
662static MdfdVec *
663mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
664{
665 MdfdVec *mdfd;
666 RelPathStr path;
667 File fd;
668
669 /* No work if already open */
670 if (reln->md_num_open_segs[forknum] > 0)
671 return &reln->md_seg_fds[forknum][0];
672
673 path = relpath(reln->smgr_rlocator, forknum);
674
676
677 if (fd < 0)
678 {
679 if ((behavior & EXTENSION_RETURN_NULL) &&
681 return NULL;
684 errmsg("could not open file \"%s\": %m", path.str)));
685 }
686
687 _fdvec_resize(reln, forknum, 1);
688 mdfd = &reln->md_seg_fds[forknum][0];
689 mdfd->mdfd_vfd = fd;
690 mdfd->mdfd_segno = 0;
691
693
694 return mdfd;
695}
696
697/*
698 * mdopen() -- Initialize newly-opened relation.
699 */
700void
702{
703 /* mark it not open */
704 for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
705 reln->md_num_open_segs[forknum] = 0;
706}
707
708/*
709 * mdclose() -- Close the specified relation, if it isn't closed already.
710 */
711void
713{
714 int nopensegs = reln->md_num_open_segs[forknum];
715
716 /* No work if already closed */
717 if (nopensegs == 0)
718 return;
719
720 /* close segments starting from the end */
721 while (nopensegs > 0)
722 {
723 MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1];
724
726 _fdvec_resize(reln, forknum, nopensegs - 1);
727 nopensegs--;
728 }
729}
730
731/*
732 * mdprefetch() -- Initiate asynchronous read of the specified blocks of a relation
733 */
734bool
736 int nblocks)
737{
738#ifdef USE_PREFETCH
739
741
742 if ((uint64) blocknum + nblocks > (uint64) MaxBlockNumber + 1)
743 return false;
744
745 while (nblocks > 0)
746 {
747 pgoff_t seekpos;
748 MdfdVec *v;
750
751 v = _mdfd_getseg(reln, forknum, blocknum, false,
753 if (v == NULL)
754 return false;
755
756 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
757
758 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
759
761 Min(nblocks,
762 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
763
766
767 blocknum += nblocks_this_segment;
768 nblocks -= nblocks_this_segment;
769 }
770#endif /* USE_PREFETCH */
771
772 return true;
773}
774
775/*
776 * Convert an array of buffer addresses into an array of iovec objects, and
777 * return the number that were required. 'iov' must have enough space for up
778 * to 'nblocks' elements, but the number used may be less depending on
779 * merging. In the case of a run of fully contiguous buffers, a single iovec
780 * will be populated that can be handled as a plain non-vectored I/O.
781 */
782static int
783buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks)
784{
785 struct iovec *iovp;
786 int iovcnt;
787
788 Assert(nblocks >= 1);
789
790 /* If this build supports direct I/O, buffers must be I/O aligned. */
791 for (int i = 0; i < nblocks; ++i)
792 {
793 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
794 Assert((uintptr_t) buffers[i] ==
795 TYPEALIGN(PG_IO_ALIGN_SIZE, buffers[i]));
796 }
797
798 /* Start the first iovec off with the first buffer. */
799 iovp = &iov[0];
800 iovp->iov_base = buffers[0];
801 iovp->iov_len = BLCKSZ;
802 iovcnt = 1;
803
804 /* Try to merge the rest. */
805 for (int i = 1; i < nblocks; ++i)
806 {
807 void *buffer = buffers[i];
808
809 if (((char *) iovp->iov_base + iovp->iov_len) == buffer)
810 {
811 /* Contiguous with the last iovec. */
812 iovp->iov_len += BLCKSZ;
813 }
814 else
815 {
816 /* Need a new iovec. */
817 iovp++;
818 iovp->iov_base = buffer;
819 iovp->iov_len = BLCKSZ;
820 iovcnt++;
821 }
822 }
823
824 return iovcnt;
825}
826
827/*
828 * mdmaxcombine() -- Return the maximum number of total blocks that can be
829 * combined with an IO starting at blocknum.
830 */
831uint32
833 BlockNumber blocknum)
834{
835 BlockNumber segoff;
836
837 segoff = blocknum % ((BlockNumber) RELSEG_SIZE);
838
839 return RELSEG_SIZE - segoff;
840}
841
842/*
843 * mdreadv() -- Read the specified blocks from a relation.
844 */
845void
847 void **buffers, BlockNumber nblocks)
848{
849 while (nblocks > 0)
850 {
851 struct iovec iov[PG_IOV_MAX];
852 int iovcnt;
853 pgoff_t seekpos;
854 int nbytes;
855 MdfdVec *v;
858 size_t size_this_segment;
859
860 v = _mdfd_getseg(reln, forknum, blocknum, false,
862
863 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
864
865 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
866
868 Min(nblocks,
869 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
871
872 if (nblocks_this_segment != nblocks)
873 elog(ERROR, "read crosses segment boundary");
874
878
879 /*
880 * Inner loop to continue after a short read. We'll keep going until
881 * we hit EOF rather than assuming that a short read means we hit the
882 * end.
883 */
884 for (;;)
885 {
886 TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
887 reln->smgr_rlocator.locator.spcOid,
888 reln->smgr_rlocator.locator.dbOid,
889 reln->smgr_rlocator.locator.relNumber,
890 reln->smgr_rlocator.backend);
891 nbytes = FileReadV(v->mdfd_vfd, iov, iovcnt, seekpos,
893 TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
894 reln->smgr_rlocator.locator.spcOid,
895 reln->smgr_rlocator.locator.dbOid,
896 reln->smgr_rlocator.locator.relNumber,
897 reln->smgr_rlocator.backend,
898 nbytes,
900
901#ifdef SIMULATE_SHORT_READ
902 nbytes = Min(nbytes, 4096);
903#endif
904
905 if (nbytes < 0)
908 errmsg("could not read blocks %u..%u in file \"%s\": %m",
909 blocknum,
910 blocknum + nblocks_this_segment - 1,
911 FilePathName(v->mdfd_vfd))));
912
913 if (nbytes == 0)
914 {
915 /*
916 * We are at or past EOF, or we read a partial block at EOF.
917 * Normally this is an error; upper levels should never try to
918 * read a nonexistent block. However, if zero_damaged_pages
919 * is ON or we are InRecovery, we should instead return zeroes
920 * without complaining. This allows, for example, the case of
921 * trying to update a block that was later truncated away.
922 *
923 * NB: We think that this codepath is unreachable in recovery
924 * and incomplete with zero_damaged_pages, as missing segments
925 * are not created. Putting blocks into the buffer-pool that
926 * do not exist on disk is rather problematic, as it will not
927 * be found by scans that rely on smgrnblocks(), as they are
928 * beyond EOF. It also can cause weird problems with relation
929 * extension, as relation extension does not expect blocks
930 * beyond EOF to exist.
931 *
932 * Therefore we do not want to copy the logic into
933 * mdstartreadv(), where it would have to be more complicated
934 * due to potential differences in the zero_damaged_pages
935 * setting between the definer and completor of IO.
936 *
937 * For PG 18, we are putting an Assert(false) in mdreadv()
938 * (triggering failures in assertion-enabled builds, but
939 * continuing to work in production builds). Afterwards we
940 * plan to remove this code entirely.
941 */
943 {
944 Assert(false); /* see comment above */
945
948 ++i)
949 memset(buffers[i], 0, BLCKSZ);
950 break;
951 }
952 else
955 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
956 blocknum,
957 blocknum + nblocks_this_segment - 1,
961 }
962
963 /* One loop should usually be enough. */
964 transferred_this_segment += nbytes;
967 break;
968
969 /* Adjust position and vectors after a short read. */
970 seekpos += nbytes;
972 }
973
974 nblocks -= nblocks_this_segment;
975 buffers += nblocks_this_segment;
976 blocknum += nblocks_this_segment;
977 }
978}
979
980/*
981 * mdstartreadv() -- Asynchronous version of mdreadv().
982 */
983void
985 SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
986 void **buffers, BlockNumber nblocks)
987{
988 pgoff_t seekpos;
989 MdfdVec *v;
991 struct iovec *iov;
992 int iovcnt;
993 int ret;
994
995 v = _mdfd_getseg(reln, forknum, blocknum, false,
997
998 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
999
1000 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1001
1003 Min(nblocks,
1004 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
1005
1006 if (nblocks_this_segment != nblocks)
1007 elog(ERROR, "read crossing segment boundary");
1008
1010
1011 Assert(nblocks <= iovcnt);
1012
1014
1016
1019
1021 reln,
1022 forknum,
1023 blocknum,
1024 nblocks,
1025 false);
1027
1029 if (ret != 0)
1030 ereport(ERROR,
1032 errmsg("could not start reading blocks %u..%u in file \"%s\": %m",
1033 blocknum,
1034 blocknum + nblocks_this_segment - 1,
1035 FilePathName(v->mdfd_vfd))));
1036
1037 /*
1038 * The error checks corresponding to the post-read checks in mdreadv() are
1039 * in md_readv_complete().
1040 *
1041 * However we chose, at least for now, to not implement the
1042 * zero_damaged_pages logic present in mdreadv(). As outlined in mdreadv()
1043 * that logic is rather problematic, and we want to get rid of it. Here
1044 * equivalent logic would have to be more complicated due to potential
1045 * differences in the zero_damaged_pages setting between the definer and
1046 * completor of IO.
1047 */
1048}
1049
1050/*
1051 * mdwritev() -- Write the supplied blocks at the appropriate location.
1052 *
1053 * This is to be used only for updating already-existing blocks of a
1054 * relation (ie, those before the current EOF). To extend a relation,
1055 * use mdextend().
1056 */
1057void
1059 const void **buffers, BlockNumber nblocks, bool skipFsync)
1060{
1061 /* This assert is too expensive to have on normally ... */
1062#ifdef CHECK_WRITE_VS_EXTEND
1063 Assert((uint64) blocknum + (uint64) nblocks <= (uint64) mdnblocks(reln, forknum));
1064#endif
1065
1066 while (nblocks > 0)
1067 {
1068 struct iovec iov[PG_IOV_MAX];
1069 int iovcnt;
1070 pgoff_t seekpos;
1071 int nbytes;
1072 MdfdVec *v;
1075 size_t size_this_segment;
1076
1077 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
1079
1080 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1081
1082 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1083
1085 Min(nblocks,
1086 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
1088
1089 if (nblocks_this_segment != nblocks)
1090 elog(ERROR, "write crosses segment boundary");
1091
1092 iovcnt = buffers_to_iovec(iov, (void **) buffers, nblocks_this_segment);
1095
1096 /*
1097 * Inner loop to continue after a short write. If the reason is that
1098 * we're out of disk space, a future attempt should get an ENOSPC
1099 * error from the kernel.
1100 */
1101 for (;;)
1102 {
1103 TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
1104 reln->smgr_rlocator.locator.spcOid,
1105 reln->smgr_rlocator.locator.dbOid,
1106 reln->smgr_rlocator.locator.relNumber,
1107 reln->smgr_rlocator.backend);
1108 nbytes = FileWriteV(v->mdfd_vfd, iov, iovcnt, seekpos,
1110 TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
1111 reln->smgr_rlocator.locator.spcOid,
1112 reln->smgr_rlocator.locator.dbOid,
1113 reln->smgr_rlocator.locator.relNumber,
1114 reln->smgr_rlocator.backend,
1115 nbytes,
1117
1118#ifdef SIMULATE_SHORT_WRITE
1119 nbytes = Min(nbytes, 4096);
1120#endif
1121
1122 if (nbytes < 0)
1123 {
1124 bool enospc = errno == ENOSPC;
1125
1126 ereport(ERROR,
1128 errmsg("could not write blocks %u..%u in file \"%s\": %m",
1129 blocknum,
1130 blocknum + nblocks_this_segment - 1,
1132 enospc ? errhint("Check free disk space.") : 0));
1133 }
1134
1135 /* One loop should usually be enough. */
1136 transferred_this_segment += nbytes;
1139 break;
1140
1141 /* Adjust position and iovecs after a short write. */
1142 seekpos += nbytes;
1144 }
1145
1146 if (!skipFsync && !SmgrIsTemp(reln))
1147 register_dirty_segment(reln, forknum, v);
1148
1149 nblocks -= nblocks_this_segment;
1150 buffers += nblocks_this_segment;
1151 blocknum += nblocks_this_segment;
1152 }
1153}
1154
1155
1156/*
1157 * mdwriteback() -- Tell the kernel to write pages back to storage.
1158 *
1159 * This accepts a range of blocks because flushing several pages at once is
1160 * considerably more efficient than doing so individually.
1161 */
1162void
1164 BlockNumber blocknum, BlockNumber nblocks)
1165{
1167
1168 /*
1169 * Issue flush requests in as few requests as possible; have to split at
1170 * segment boundaries though, since those are actually separate files.
1171 */
1172 while (nblocks > 0)
1173 {
1174 BlockNumber nflush = nblocks;
1175 pgoff_t seekpos;
1176 MdfdVec *v;
1177 int segnum_start,
1178 segnum_end;
1179
1180 v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
1182
1183 /*
1184 * We might be flushing buffers of already removed relations, that's
1185 * ok, just ignore that case. If the segment file wasn't open already
1186 * (ie from a recent mdwrite()), then we don't want to re-open it, to
1187 * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave
1188 * us with a descriptor to a file that is about to be unlinked.
1189 */
1190 if (!v)
1191 return;
1192
1193 /* compute offset inside the current segment */
1194 segnum_start = blocknum / RELSEG_SIZE;
1195
1196 /* compute number of desired writes within the current segment */
1197 segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
1198 if (segnum_start != segnum_end)
1199 nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
1200
1201 Assert(nflush >= 1);
1202 Assert(nflush <= nblocks);
1203
1204 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1205
1207
1208 nblocks -= nflush;
1209 blocknum += nflush;
1210 }
1211}
1212
1213/*
1214 * mdnblocks() -- Get the number of blocks stored in a relation.
1215 *
1216 * Important side effect: all active segments of the relation are opened
1217 * and added to the md_seg_fds array. If this routine has not been
1218 * called, then only segments up to the last one actually touched
1219 * are present in the array.
1220 */
1223{
1224 MdfdVec *v;
1225 BlockNumber nblocks;
1226 BlockNumber segno;
1227
1228 mdopenfork(reln, forknum, EXTENSION_FAIL);
1229
1230 /* mdopen has opened the first segment */
1231 Assert(reln->md_num_open_segs[forknum] > 0);
1232
1233 /*
1234 * Start from the last open segments, to avoid redundant seeks. We have
1235 * previously verified that these segments are exactly RELSEG_SIZE long,
1236 * and it's useless to recheck that each time.
1237 *
1238 * NOTE: this assumption could only be wrong if another backend has
1239 * truncated the relation. We rely on higher code levels to handle that
1240 * scenario by closing and re-opening the md fd, which is handled via
1241 * relcache flush. (Since the checkpointer doesn't participate in
1242 * relcache flush, it could have segment entries for inactive segments;
1243 * that's OK because the checkpointer never needs to compute relation
1244 * size.)
1245 */
1246 segno = reln->md_num_open_segs[forknum] - 1;
1247 v = &reln->md_seg_fds[forknum][segno];
1248
1249 for (;;)
1250 {
1251 nblocks = _mdnblocks(reln, forknum, v);
1252 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1253 elog(FATAL, "segment too big");
1254 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1255 return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
1256
1257 /*
1258 * If segment is exactly RELSEG_SIZE, advance to next one.
1259 */
1260 segno++;
1261
1262 /*
1263 * We used to pass O_CREAT here, but that has the disadvantage that it
1264 * might create a segment which has vanished through some operating
1265 * system misadventure. In such a case, creating the segment here
1266 * undermines _mdfd_getseg's attempts to notice and report an error
1267 * upon access to a missing segment.
1268 */
1269 v = _mdfd_openseg(reln, forknum, segno, 0);
1270 if (v == NULL)
1271 return segno * ((BlockNumber) RELSEG_SIZE);
1272 }
1273}
1274
1275/*
1276 * mdtruncate() -- Truncate relation to specified number of blocks.
1277 *
1278 * Guaranteed not to allocate memory, so it can be used in a critical section.
1279 * Caller must have called smgrnblocks() to obtain curnblk while holding a
1280 * sufficient lock to prevent a change in relation size, and not used any smgr
1281 * functions for this relation or handled interrupts in between. This makes
1282 * sure we have opened all active segments, so that truncate loop will get
1283 * them all!
1284 *
1285 * If nblocks > curnblk, the request is ignored when we are InRecovery,
1286 * otherwise, an error is raised.
1287 */
1288void
1291{
1293 int curopensegs;
1294
1295 if (nblocks > curnblk)
1296 {
1297 /* Bogus request ... but no complaint if InRecovery */
1298 if (InRecovery)
1299 return;
1300 ereport(ERROR,
1301 (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
1302 relpath(reln->smgr_rlocator, forknum).str,
1303 nblocks, curnblk)));
1304 }
1305 if (nblocks == curnblk)
1306 return; /* no work */
1307
1308 /*
1309 * Truncate segments, starting at the last one. Starting at the end makes
1310 * managing the memory for the fd array easier, should there be errors.
1311 */
1312 curopensegs = reln->md_num_open_segs[forknum];
1313 while (curopensegs > 0)
1314 {
1315 MdfdVec *v;
1316
1318
1319 v = &reln->md_seg_fds[forknum][curopensegs - 1];
1320
1321 if (priorblocks > nblocks)
1322 {
1323 /*
1324 * This segment is no longer active. We truncate the file, but do
1325 * not delete it, for reasons explained in the header comments.
1326 */
1328 ereport(ERROR,
1330 errmsg("could not truncate file \"%s\": %m",
1331 FilePathName(v->mdfd_vfd))));
1332
1333 if (!SmgrIsTemp(reln))
1334 register_dirty_segment(reln, forknum, v);
1335
1336 /* we never drop the 1st segment */
1337 Assert(v != &reln->md_seg_fds[forknum][0]);
1338
1339 FileClose(v->mdfd_vfd);
1340 _fdvec_resize(reln, forknum, curopensegs - 1);
1341 }
1342 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
1343 {
1344 /*
1345 * This is the last segment we want to keep. Truncate the file to
1346 * the right length. NOTE: if nblocks is exactly a multiple K of
1347 * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
1348 * keep it. This adheres to the invariant given in the header
1349 * comments.
1350 */
1352
1354 ereport(ERROR,
1356 errmsg("could not truncate file \"%s\" to %u blocks: %m",
1358 nblocks)));
1359 if (!SmgrIsTemp(reln))
1360 register_dirty_segment(reln, forknum, v);
1361 }
1362 else
1363 {
1364 /*
1365 * We still need this segment, so nothing to do for this and any
1366 * earlier segment.
1367 */
1368 break;
1369 }
1370 curopensegs--;
1371 }
1372}
1373
1374/*
1375 * mdregistersync() -- Mark whole relation as needing fsync
1376 */
1377void
1379{
1380 int segno;
1381 int min_inactive_seg;
1382
1383 /*
1384 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1385 * the loop below will get them all!
1386 */
1387 mdnblocks(reln, forknum);
1388
1389 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1390
1391 /*
1392 * Temporarily open inactive segments, then close them after sync. There
1393 * may be some inactive segments left opened after error, but that is
1394 * harmless. We don't bother to clean them up and take a risk of further
1395 * trouble. The next mdclose() will soon close them.
1396 */
1397 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1398 segno++;
1399
1400 while (segno > 0)
1401 {
1402 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1403
1404 register_dirty_segment(reln, forknum, v);
1405
1406 /* Close inactive segments immediately */
1407 if (segno > min_inactive_seg)
1408 {
1409 FileClose(v->mdfd_vfd);
1410 _fdvec_resize(reln, forknum, segno - 1);
1411 }
1412
1413 segno--;
1414 }
1415}
1416
1417/*
1418 * mdimmedsync() -- Immediately sync a relation to stable storage.
1419 *
1420 * Note that only writes already issued are synced; this routine knows
1421 * nothing of dirty buffers that may exist inside the buffer manager. We
1422 * sync active and inactive segments; smgrDoPendingSyncs() relies on this.
1423 * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of
1424 * some segment, then mdtruncate() renders that segment inactive. If we
1425 * crash before the next checkpoint syncs the newly-inactive segment, that
1426 * segment may survive recovery, reintroducing unwanted data into the table.
1427 */
1428void
1430{
1431 int segno;
1432 int min_inactive_seg;
1433
1434 /*
1435 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1436 * the loop below will get them all!
1437 */
1438 mdnblocks(reln, forknum);
1439
1440 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1441
1442 /*
1443 * Temporarily open inactive segments, then close them after sync. There
1444 * may be some inactive segments left opened after fsync() error, but that
1445 * is harmless. We don't bother to clean them up and take a risk of
1446 * further trouble. The next mdclose() will soon close them.
1447 */
1448 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1449 segno++;
1450
1451 while (segno > 0)
1452 {
1453 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1454
1455 /*
1456 * fsyncs done through mdimmedsync() should be tracked in a separate
1457 * IOContext than those done through mdsyncfiletag() to differentiate
1458 * between unavoidable client backend fsyncs (e.g. those done during
1459 * index build) and those which ideally would have been done by the
1460 * checkpointer. Since other IO operations bypassing the buffer
1461 * manager could also be tracked in such an IOContext, wait until
1462 * these are also tracked to track immediate fsyncs.
1463 */
1467 errmsg("could not fsync file \"%s\": %m",
1468 FilePathName(v->mdfd_vfd))));
1469
1470 /* Close inactive segments immediately */
1471 if (segno > min_inactive_seg)
1472 {
1473 FileClose(v->mdfd_vfd);
1474 _fdvec_resize(reln, forknum, segno - 1);
1475 }
1476
1477 segno--;
1478 }
1479}
1480
1481int
1482mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
1483{
1484 MdfdVec *v = mdopenfork(reln, forknum, EXTENSION_FAIL);
1485
1486 v = _mdfd_getseg(reln, forknum, blocknum, false,
1488
1489 *off = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1490
1491 Assert(*off < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1492
1493 return FileGetRawDesc(v->mdfd_vfd);
1494}
1495
1496/*
1497 * register_dirty_segment() -- Mark a relation segment as needing fsync
1498 *
1499 * If there is a local pending-ops table, just make an entry in it for
1500 * ProcessSyncRequests to process later. Otherwise, try to pass off the
1501 * fsync request to the checkpointer process. If that fails, just do the
1502 * fsync locally before returning (we hope this will not happen often
1503 * enough to be a performance problem).
1504 */
1505static void
1507{
1508 FileTag tag;
1509
1510 INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno);
1511
1512 /* Temp relations should never be fsync'd */
1514
1515 if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
1516 {
1518
1520 (errmsg_internal("could not forward fsync request because request queue is full")));
1521
1523
1527 errmsg("could not fsync file \"%s\": %m",
1528 FilePathName(seg->mdfd_vfd))));
1529
1530 /*
1531 * We have no way of knowing if the current IOContext is
1532 * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
1533 * point, so count the fsync as being in the IOCONTEXT_NORMAL
1534 * IOContext. This is probably okay, because the number of backend
1535 * fsyncs doesn't say anything about the efficacy of the
1536 * BufferAccessStrategy. And counting both fsyncs done in
1537 * IOCONTEXT_NORMAL and IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] under
1538 * IOCONTEXT_NORMAL is likely clearer when investigating the number of
1539 * backend fsyncs.
1540 */
1542 IOOP_FSYNC, io_start, 1, 0);
1543 }
1544}
1545
1546/*
1547 * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint
1548 */
1549static void
1551 BlockNumber segno)
1552{
1553 FileTag tag;
1554
1555 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1556
1557 /* Should never be used with temp relations */
1559
1560 RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
1561}
1562
1563/*
1564 * register_forget_request() -- forget any fsyncs for a relation fork's segment
1565 */
1566static void
1568 BlockNumber segno)
1569{
1570 FileTag tag;
1571
1572 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1573
1574 RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
1575}
1576
1577/*
1578 * ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB
1579 */
1580void
1582{
1583 FileTag tag;
1584 RelFileLocator rlocator;
1585
1586 rlocator.dbOid = dbid;
1587 rlocator.spcOid = 0;
1588 rlocator.relNumber = 0;
1589
1591
1592 RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
1593}
1594
1595/*
1596 * DropRelationFiles -- drop files of all given relations
1597 */
1598void
1600{
1602 int i;
1603
1605 for (i = 0; i < ndelrels; i++)
1606 {
1608
1609 if (isRedo)
1610 {
1612
1613 for (fork = 0; fork <= MAX_FORKNUM; fork++)
1615 }
1616 srels[i] = srel;
1617 }
1618
1620
1621 for (i = 0; i < ndelrels; i++)
1622 smgrclose(srels[i]);
1623 pfree(srels);
1624}
1625
1626
1627/*
1628 * _fdvec_resize() -- Resize the fork's open segments array
1629 */
1630static void
1632 ForkNumber forknum,
1633 int nseg)
1634{
1635 if (nseg == 0)
1636 {
1637 if (reln->md_num_open_segs[forknum] > 0)
1638 {
1639 pfree(reln->md_seg_fds[forknum]);
1640 reln->md_seg_fds[forknum] = NULL;
1641 }
1642 }
1643 else if (reln->md_num_open_segs[forknum] == 0)
1644 {
1645 reln->md_seg_fds[forknum] =
1647 }
1648 else if (nseg > reln->md_num_open_segs[forknum])
1649 {
1650 /*
1651 * It doesn't seem worthwhile complicating the code to amortize
1652 * repalloc() calls. Those are far faster than PathNameOpenFile() or
1653 * FileClose(), and the memory context internally will sometimes avoid
1654 * doing an actual reallocation.
1655 */
1656 reln->md_seg_fds[forknum] =
1657 repalloc(reln->md_seg_fds[forknum],
1658 sizeof(MdfdVec) * nseg);
1659 }
1660 else
1661 {
1662 /*
1663 * We don't reallocate a smaller array, because we want mdtruncate()
1664 * to be able to promise that it won't allocate memory, so that it is
1665 * allowed in a critical section. This means that a bit of space in
1666 * the array is now wasted, until the next time we add a segment and
1667 * reallocate.
1668 */
1669 }
1670
1671 reln->md_num_open_segs[forknum] = nseg;
1672}
1673
1674/*
1675 * Return the filename for the specified segment of the relation. The
1676 * returned string is palloc'd.
1677 */
1678static MdPathStr
1680{
1681 RelPathStr path;
1682 MdPathStr fullpath;
1683
1684 path = relpath(reln->smgr_rlocator, forknum);
1685
1686 if (segno > 0)
1687 sprintf(fullpath.str, "%s.%u", path.str, segno);
1688 else
1689 strcpy(fullpath.str, path.str);
1690
1691 return fullpath;
1692}
1693
1694/*
1695 * Open the specified segment of the relation,
1696 * and make a MdfdVec object for it. Returns NULL on failure.
1697 */
1698static MdfdVec *
1700 int oflags)
1701{
1702 MdfdVec *v;
1703 File fd;
1704 MdPathStr fullpath;
1705
1706 fullpath = _mdfd_segpath(reln, forknum, segno);
1707
1708 /* open the file */
1710
1711 if (fd < 0)
1712 return NULL;
1713
1714 /*
1715 * Segments are always opened in order from lowest to highest, so we must
1716 * be adding a new one at the end.
1717 */
1718 Assert(segno == reln->md_num_open_segs[forknum]);
1719
1720 _fdvec_resize(reln, forknum, segno + 1);
1721
1722 /* fill the entry */
1723 v = &reln->md_seg_fds[forknum][segno];
1724 v->mdfd_vfd = fd;
1725 v->mdfd_segno = segno;
1726
1727 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1728
1729 /* all done */
1730 return v;
1731}
1732
1733/*
1734 * _mdfd_getseg() -- Find the segment of the relation holding the
1735 * specified block.
1736 *
1737 * If the segment doesn't exist, we ereport, return NULL, or create the
1738 * segment, according to "behavior". Note: skipFsync is only used in the
1739 * EXTENSION_CREATE case.
1740 */
1741static MdfdVec *
1743 bool skipFsync, int behavior)
1744{
1745 MdfdVec *v;
1748
1749 /* some way to handle non-existent segments needs to be specified */
1750 Assert(behavior &
1753
1754 targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1755
1756 /* if an existing and opened segment, we're done */
1757 if (targetseg < reln->md_num_open_segs[forknum])
1758 {
1759 v = &reln->md_seg_fds[forknum][targetseg];
1760 return v;
1761 }
1762
1763 /* The caller only wants the segment if we already had it open. */
1764 if (behavior & EXTENSION_DONT_OPEN)
1765 return NULL;
1766
1767 /*
1768 * The target segment is not yet open. Iterate over all the segments
1769 * between the last opened and the target segment. This way missing
1770 * segments either raise an error, or get created (according to
1771 * 'behavior'). Start with either the last opened, or the first segment if
1772 * none was opened before.
1773 */
1774 if (reln->md_num_open_segs[forknum] > 0)
1775 v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
1776 else
1777 {
1778 v = mdopenfork(reln, forknum, behavior);
1779 if (!v)
1780 return NULL; /* if behavior & EXTENSION_RETURN_NULL */
1781 }
1782
1783 for (nextsegno = reln->md_num_open_segs[forknum];
1785 {
1786 BlockNumber nblocks = _mdnblocks(reln, forknum, v);
1787 int flags = 0;
1788
1789 Assert(nextsegno == v->mdfd_segno + 1);
1790
1791 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1792 elog(FATAL, "segment too big");
1793
1794 if ((behavior & EXTENSION_CREATE) ||
1795 (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
1796 {
1797 /*
1798 * Normally we will create new segments only if authorized by the
1799 * caller (i.e., we are doing mdextend()). But when doing WAL
1800 * recovery, create segments anyway; this allows cases such as
1801 * replaying WAL data that has a write into a high-numbered
1802 * segment of a relation that was later deleted. We want to go
1803 * ahead and create the segments so we can finish out the replay.
1804 *
1805 * We have to maintain the invariant that segments before the last
1806 * active segment are of size RELSEG_SIZE; therefore, if
1807 * extending, pad them out with zeroes if needed. (This only
1808 * matters if in recovery, or if the caller is extending the
1809 * relation discontiguously, but that can happen in hash indexes.)
1810 */
1811 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1812 {
1815
1816 mdextend(reln, forknum,
1819 pfree(zerobuf);
1820 }
1821 flags = O_CREAT;
1822 }
1823 else if (nblocks < ((BlockNumber) RELSEG_SIZE))
1824 {
1825 /*
1826 * When not extending, only open the next segment if the current
1827 * one is exactly RELSEG_SIZE. If not (this branch), either
1828 * return NULL or fail.
1829 */
1830 if (behavior & EXTENSION_RETURN_NULL)
1831 {
1832 /*
1833 * Some callers discern between reasons for _mdfd_getseg()
1834 * returning NULL based on errno. As there's no failing
1835 * syscall involved in this case, explicitly set errno to
1836 * ENOENT, as that seems the closest interpretation.
1837 */
1838 errno = ENOENT;
1839 return NULL;
1840 }
1841
1842 ereport(ERROR,
1844 errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
1845 _mdfd_segpath(reln, forknum, nextsegno).str,
1846 blkno, nblocks)));
1847 }
1848
1849 v = _mdfd_openseg(reln, forknum, nextsegno, flags);
1850
1851 if (v == NULL)
1852 {
1853 if ((behavior & EXTENSION_RETURN_NULL) &&
1855 return NULL;
1856 ereport(ERROR,
1858 errmsg("could not open file \"%s\" (target block %u): %m",
1859 _mdfd_segpath(reln, forknum, nextsegno).str,
1860 blkno)));
1861 }
1862 }
1863
1864 return v;
1865}
1866
1867/*
1868 * Get number of blocks present in a single disk file
1869 */
1870static BlockNumber
1872{
1873 pgoff_t len;
1874
1875 len = FileSize(seg->mdfd_vfd);
1876 if (len < 0)
1877 ereport(ERROR,
1879 errmsg("could not seek to end of file \"%s\": %m",
1880 FilePathName(seg->mdfd_vfd))));
1881 /* note that this calculation will ignore any partial block at EOF */
1882 return (BlockNumber) (len / BLCKSZ);
1883}
1884
1885/*
1886 * Sync a file to disk, given a file tag. Write the path into an output
1887 * buffer so the caller can use it in error messages.
1888 *
1889 * Return 0 on success, -1 on failure, with errno set.
1890 */
1891int
1892mdsyncfiletag(const FileTag *ftag, char *path)
1893{
1895 File file;
1897 bool need_to_close;
1898 int result,
1899 save_errno;
1900
1901 /* See if we already have the file open, or need to open it. */
1902 if (ftag->segno < reln->md_num_open_segs[ftag->forknum])
1903 {
1904 file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd;
1905 strlcpy(path, FilePathName(file), MAXPGPATH);
1906 need_to_close = false;
1907 }
1908 else
1909 {
1910 MdPathStr p;
1911
1912 p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
1913 strlcpy(path, p.str, MD_PATH_STR_MAXLEN);
1914
1915 file = PathNameOpenFile(path, _mdfd_open_flags());
1916 if (file < 0)
1917 return -1;
1918 need_to_close = true;
1919 }
1920
1922
1923 /* Sync the file. */
1924 result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
1925 save_errno = errno;
1926
1927 if (need_to_close)
1928 FileClose(file);
1929
1931 IOOP_FSYNC, io_start, 1, 0);
1932
1933 errno = save_errno;
1934 return result;
1935}
1936
1937/*
1938 * Unlink a file, given a file tag. Write the path into an output
1939 * buffer so the caller can use it in error messages.
1940 *
1941 * Return 0 on success, -1 on failure, with errno set.
1942 */
1943int
1944mdunlinkfiletag(const FileTag *ftag, char *path)
1945{
1946 RelPathStr p;
1947
1948 /* Compute the path. */
1949 p = relpathperm(ftag->rlocator, MAIN_FORKNUM);
1950 strlcpy(path, p.str, MAXPGPATH);
1951
1952 /* Try to unlink the file. */
1953 return unlink(path);
1954}
1955
1956/*
1957 * Check if a given candidate request matches a given tag, when processing
1958 * a SYNC_FILTER_REQUEST request. This will be called for all pending
1959 * requests to find out whether to forget them.
1960 */
1961bool
1962mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
1963{
1964 /*
1965 * For now we only use filter requests as a way to drop all scheduled
1966 * callbacks relating to a given database, when dropping the database.
1967 * We'll return true for all candidates that have the same database OID as
1968 * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
1969 */
1970 return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
1971}
1972
1973/*
1974 * AIO completion callback for mdstartreadv().
1975 */
1976static PgAioResult
1978{
1980 PgAioResult result = prior_result;
1981
1982 if (prior_result.result < 0)
1983 {
1984 result.status = PGAIO_RS_ERROR;
1985 result.id = PGAIO_HCB_MD_READV;
1986 /* For "hard" errors, track the error number in error_data */
1987 result.error_data = -prior_result.result;
1988 result.result = 0;
1989
1990 /*
1991 * Immediately log a message about the IO error, but only to the
1992 * server log. The reason to do so immediately is that the originator
1993 * might not process the query result immediately (because it is busy
1994 * doing another part of query processing) or at all (e.g. if it was
1995 * cancelled or errored out due to another IO also failing). The
1996 * definer of the IO will emit an ERROR when processing the IO's
1997 * results
1998 */
2000
2001 return result;
2002 }
2003
2004 /*
2005 * As explained above smgrstartreadv(), the smgr API operates on the level
2006 * of blocks, rather than bytes. Convert.
2007 */
2008 result.result /= BLCKSZ;
2009
2010 Assert(result.result <= td->smgr.nblocks);
2011
2012 if (result.result == 0)
2013 {
2014 /* consider 0 blocks read a failure */
2015 result.status = PGAIO_RS_ERROR;
2016 result.id = PGAIO_HCB_MD_READV;
2017 result.error_data = 0;
2018
2019 /* see comment above the "hard error" case */
2021
2022 return result;
2023 }
2024
2025 if (result.status != PGAIO_RS_ERROR &&
2026 result.result < td->smgr.nblocks)
2027 {
2028 /* partial reads should be retried at upper level */
2029 result.status = PGAIO_RS_PARTIAL;
2030 result.id = PGAIO_HCB_MD_READV;
2031 }
2032
2033 return result;
2034}
2035
2036/*
2037 * AIO error reporting callback for mdstartreadv().
2038 *
2039 * Errors are encoded as follows:
2040 * - PgAioResult.error_data != 0 encodes IO that failed with that errno
2041 * - PgAioResult.error_data == 0 encodes IO that didn't read all data
2042 */
2043static void
2044md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
2045{
2046 RelPathStr path;
2047
2048 path = relpathbackend(td->smgr.rlocator,
2050 td->smgr.forkNum);
2051
2052 if (result.error_data != 0)
2053 {
2054 /* for errcode_for_file_access() and %m */
2055 errno = result.error_data;
2056
2057 ereport(elevel,
2059 errmsg("could not read blocks %u..%u in file \"%s\": %m",
2060 td->smgr.blockNum,
2061 td->smgr.blockNum + td->smgr.nblocks - 1,
2062 path.str));
2063 }
2064 else
2065 {
2066 /*
2067 * NB: This will typically only be output in debug messages, while
2068 * retrying a partial IO.
2069 */
2070 ereport(elevel,
2072 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
2073 td->smgr.blockNum,
2074 td->smgr.blockNum + td->smgr.nblocks - 1,
2075 path.str,
2076 result.result * (size_t) BLCKSZ,
2077 td->smgr.nblocks * (size_t) BLCKSZ));
2078 }
2079}
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition aio.c:330
@ PGAIO_HCB_MD_READV
Definition aio.h:196
@ PGAIO_HF_BUFFERED
Definition aio.h:77
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
int pgaio_io_get_iovec(PgAioHandle *ioh, struct iovec **iov)
Definition aio_io.c:42
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition aio_target.c:73
@ PGAIO_RS_PARTIAL
Definition aio_types.h:82
@ PGAIO_RS_ERROR
Definition aio_types.h:84
void TablespaceCreateDbspace(Oid spcOid, Oid dbOid, bool isRedo)
Definition tablespace.c:112
uint32 BlockNumber
Definition block.h:31
#define InvalidBlockNumber
Definition block.h:33
#define MaxBlockNumber
Definition block.h:35
bool track_io_timing
Definition bufmgr.c:176
bool zero_damaged_pages
Definition bufmgr.c:173
#define Min(x, y)
Definition c.h:997
#define TYPEALIGN(ALIGNVAL, LEN)
Definition c.h:819
uint8_t uint8
Definition c.h:544
#define Assert(condition)
Definition c.h:873
#define PG_BINARY
Definition c.h:1281
uint64_t uint64
Definition c.h:547
uint32_t uint32
Definition c.h:546
#define lengthof(array)
Definition c.h:803
int errmsg_internal(const char *fmt,...)
Definition elog.c:1170
int errcode_for_file_access(void)
Definition elog.c:886
int errhint(const char *fmt,...)
Definition elog.c:1330
int errcode(int sqlerrcode)
Definition elog.c:863
int errmsg(const char *fmt,...)
Definition elog.c:1080
#define LOG_SERVER_ONLY
Definition elog.h:32
#define FATAL
Definition elog.h:41
#define WARNING
Definition elog.h:36
#define DEBUG1
Definition elog.h:30
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
int pg_truncate(const char *path, pgoff_t length)
Definition fd.c:717
int FileGetRawDesc(File file)
Definition fd.c:2512
void FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
Definition fd.c:2119
int io_direct_flags
Definition fd.c:168
char * FilePathName(File file)
Definition fd.c:2496
int FileSync(File file, uint32 wait_event_info)
Definition fd.c:2332
int FileStartReadV(PgAioHandle *ioh, File file, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2201
ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2145
int FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2404
pgoff_t FileSize(File file)
Definition fd.c:2444
void FileClose(File file)
Definition fd.c:1962
int data_sync_elevel(int elevel)
Definition fd.c:3982
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition fd.c:1559
int FileTruncate(File file, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2461
int FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2359
int FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2063
ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2227
#define IO_DIRECT_DATA
Definition fd.h:54
static ssize_t FileWrite(File file, const void *buffer, size_t amount, pgoff_t offset, uint32 wait_event_info)
Definition fd.h:226
#define FILE_POSSIBLY_DELETED(err)
Definition fd.h:78
int File
Definition fd.h:51
#define PG_O_DIRECT
Definition fd.h:112
#define MCXT_ALLOC_ZERO
Definition fe_memutils.h:30
#define palloc_array(type, count)
Definition fe_memutils.h:76
int compute_remaining_iovec(struct iovec *destination, const struct iovec *source, int iovcnt, size_t transferred)
Definition file_utils.c:614
bool IsBinaryUpgrade
Definition globals.c:121
ProcNumber MyProcNumber
Definition globals.c:90
const char * str
int i
Definition isn.c:77
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition mcxt.c:1232
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
MemoryContext TopMemoryContext
Definition mcxt.c:166
void * palloc_aligned(Size size, Size alignto, int flags)
Definition mcxt.c:1606
void mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
Definition md.c:337
static void md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
Definition md.c:2045
static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
Definition md.c:1568
#define EXTENSION_CREATE_RECOVERY
Definition md.c:119
void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber curnblk, BlockNumber nblocks)
Definition md.c:1290
static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
Definition md.c:1872
static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
Definition md.c:374
void mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync)
Definition md.c:1059
bool mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
Definition md.c:1963
bool mdexists(SMgrRelation reln, ForkNumber forknum)
Definition md.c:203
void mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition md.c:847
static MdPathStr _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
Definition md.c:1680
static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
Definition md.c:1551
#define EXTENSION_DONT_OPEN
Definition md.c:121
BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum)
Definition md.c:1223
int mdunlinkfiletag(const FileTag *ftag, char *path)
Definition md.c:1945
static MemoryContext MdCxt
Definition md.c:97
void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition md.c:222
int mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
Definition md.c:1483
void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition md.c:487
static PgAioResult md_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition md.c:1978
static int do_truncate(const char *path)
Definition md.c:353
void mdinit(void)
Definition md.c:190
void mdclose(SMgrRelation reln, ForkNumber forknum)
Definition md.c:713
void mdzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition md.c:552
static MdfdVec * _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, int oflags)
Definition md.c:1700
static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
Definition md.c:1507
int mdsyncfiletag(const FileTag *ftag, char *path)
Definition md.c:1893
void mdwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition md.c:1164
uint32 mdmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition md.c:833
static MdfdVec * _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior)
Definition md.c:1743
#define EXTENSION_RETURN_NULL
Definition md.c:115
void mdstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition md.c:985
bool mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition md.c:736
void mdregistersync(SMgrRelation reln, ForkNumber forknum)
Definition md.c:1379
void mdopen(SMgrRelation reln)
Definition md.c:702
#define EXTENSION_CREATE
Definition md.c:117
const PgAioHandleCallbacks aio_md_readv_cb
Definition md.c:169
static int _mdfd_open_flags(void)
Definition md.c:176
#define INIT_MD_FILETAG(a, xx_rlocator, xx_forknum, xx_segno)
Definition md.c:101
#define EXTENSION_FAIL
Definition md.c:113
static MdfdVec * mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
Definition md.c:664
void DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo)
Definition md.c:1600
static int buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks)
Definition md.c:784
#define MD_PATH_STR_MAXLEN
Definition md.c:132
static void _fdvec_resize(SMgrRelation reln, ForkNumber forknum, int nseg)
Definition md.c:1632
void ForgetDatabaseSyncRequests(Oid dbid)
Definition md.c:1582
void mdimmedsync(SMgrRelation reln, ForkNumber forknum)
Definition md.c:1430
#define AllocSetContextCreate
Definition memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition memutils.h:160
#define ERRCODE_DATA_CORRUPTED
#define MAXPGPATH
#define PG_IO_ALIGN_SIZE
const void size_t len
#define PG_IOV_MAX
Definition pg_iovec.h:47
@ IOOBJECT_RELATION
Definition pgstat.h:277
@ IOCONTEXT_NORMAL
Definition pgstat.h:289
@ IOOP_FSYNC
Definition pgstat.h:308
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition pgstat_io.c:91
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:122
#define sprintf
Definition port.h:262
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition strlcpy.c:45
off_t pgoff_t
Definition port.h:421
unsigned int Oid
static int fd(const char *x, int i)
#define INVALID_PROC_NUMBER
Definition procnumber.h:26
#define RelFileLocatorBackendIsTemp(rlocator)
ForkNumber
Definition relpath.h:56
@ MAIN_FORKNUM
Definition relpath.h:58
@ InvalidForkNumber
Definition relpath.h:57
#define MAX_FORKNUM
Definition relpath.h:70
#define relpath(rlocator, forknum)
Definition relpath.h:150
#define relpathbackend(rlocator, backend, forknum)
Definition relpath.h:141
#define relpathperm(rlocator, forknum)
Definition relpath.h:146
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition smgr.c:240
void smgrclose(SMgrRelation reln)
Definition smgr.c:374
void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
Definition smgr.c:538
void pgaio_io_set_target_smgr(PgAioHandle *ioh, SMgrRelationData *smgr, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skip_fsync)
Definition smgr.c:1038
#define SmgrIsTemp(smgr)
Definition smgr.h:74
RelFileLocator rlocator
Definition sync.h:54
int16 forknum
Definition sync.h:53
uint64 segno
Definition sync.h:55
char str[MD_PATH_STR_MAXLEN+1]
Definition md.c:140
PgAioHandleCallbackComplete complete_shared
Definition aio.h:239
uint32 status
Definition aio_types.h:108
uint32 error_data
Definition aio_types.h:111
int32 result
Definition aio_types.h:113
uint32 id
Definition aio_types.h:105
RelFileLocator locator
RelFileNumber relNumber
char str[REL_PATH_STR_MAXLEN+1]
Definition relpath.h:123
Definition md.c:92
File mdfd_vfd
Definition md.c:93
BlockNumber mdfd_segno
Definition md.c:94
bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, bool retryOnError)
Definition sync.c:580
@ SYNC_FILTER_REQUEST
Definition sync.h:28
@ SYNC_FORGET_REQUEST
Definition sync.h:27
@ SYNC_UNLINK_REQUEST
Definition sync.h:26
@ SYNC_REQUEST
Definition sync.h:25
BlockNumber blockNum
Definition aio_types.h:66
RelFileLocator rlocator
Definition aio_types.h:65
struct PgAioTargetData::@126 smgr
BlockNumber nblocks
Definition aio_types.h:67
bool InRecovery
Definition xlogutils.c:50
void XLogDropRelation(RelFileLocator rlocator, ForkNumber forknum)
Definition xlogutils.c:630

◆ MD_PATH_STR_MAXLEN

#define MD_PATH_STR_MAXLEN
Value:
(\
REL_PATH_STR_MAXLEN \
+ sizeof((char)'.') \
+ SEGMENT_CHARS \
)
#define SEGMENT_CHARS
Definition md.c:131
#define REL_PATH_STR_MAXLEN
Definition relpath.h:96

Definition at line 132 of file md.c.

◆ SEGMENT_CHARS

#define SEGMENT_CHARS   OIDCHARS

Definition at line 131 of file md.c.

Typedef Documentation

◆ MdfdVec

◆ MdPathStr

Function Documentation

◆ _fdvec_resize()

static void _fdvec_resize ( SMgrRelation  reln,
ForkNumber  forknum,
int  nseg 
)
static

Definition at line 1632 of file md.c.

1635{
1636 if (nseg == 0)
1637 {
1638 if (reln->md_num_open_segs[forknum] > 0)
1639 {
1640 pfree(reln->md_seg_fds[forknum]);
1641 reln->md_seg_fds[forknum] = NULL;
1642 }
1643 }
1644 else if (reln->md_num_open_segs[forknum] == 0)
1645 {
1646 reln->md_seg_fds[forknum] =
1647 MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
1648 }
1649 else if (nseg > reln->md_num_open_segs[forknum])
1650 {
1651 /*
1652 * It doesn't seem worthwhile complicating the code to amortize
1653 * repalloc() calls. Those are far faster than PathNameOpenFile() or
1654 * FileClose(), and the memory context internally will sometimes avoid
1655 * doing an actual reallocation.
1656 */
1657 reln->md_seg_fds[forknum] =
1658 repalloc(reln->md_seg_fds[forknum],
1659 sizeof(MdfdVec) * nseg);
1660 }
1661 else
1662 {
1663 /*
1664 * We don't reallocate a smaller array, because we want mdtruncate()
1665 * to be able to promise that it won't allocate memory, so that it is
1666 * allowed in a critical section. This means that a bit of space in
1667 * the array is now wasted, until the next time we add a segment and
1668 * reallocate.
1669 */
1670 }
1671
1672 reln->md_num_open_segs[forknum] = nseg;
1673}

References fb(), MdCxt, MemoryContextAlloc(), pfree(), and repalloc().

Referenced by _mdfd_openseg(), mdclose(), mdcreate(), mdimmedsync(), mdopenfork(), mdregistersync(), and mdtruncate().

◆ _mdfd_getseg()

static MdfdVec * _mdfd_getseg ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blkno,
bool  skipFsync,
int  behavior 
)
static

Definition at line 1743 of file md.c.

1745{
1746 MdfdVec *v;
1747 BlockNumber targetseg;
1748 BlockNumber nextsegno;
1749
1750 /* some way to handle non-existent segments needs to be specified */
1751 Assert(behavior &
1752 (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL |
1753 EXTENSION_DONT_OPEN));
1754
1755 targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1756
1757 /* if an existing and opened segment, we're done */
1758 if (targetseg < reln->md_num_open_segs[forknum])
1759 {
1760 v = &reln->md_seg_fds[forknum][targetseg];
1761 return v;
1762 }
1763
1764 /* The caller only wants the segment if we already had it open. */
1765 if (behavior & EXTENSION_DONT_OPEN)
1766 return NULL;
1767
1768 /*
1769 * The target segment is not yet open. Iterate over all the segments
1770 * between the last opened and the target segment. This way missing
1771 * segments either raise an error, or get created (according to
1772 * 'behavior'). Start with either the last opened, or the first segment if
1773 * none was opened before.
1774 */
1775 if (reln->md_num_open_segs[forknum] > 0)
1776 v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
1777 else
1778 {
1779 v = mdopenfork(reln, forknum, behavior);
1780 if (!v)
1781 return NULL; /* if behavior & EXTENSION_RETURN_NULL */
1782 }
1783
1784 for (nextsegno = reln->md_num_open_segs[forknum];
1785 nextsegno <= targetseg; nextsegno++)
1786 {
1787 BlockNumber nblocks = _mdnblocks(reln, forknum, v);
1788 int flags = 0;
1789
1790 Assert(nextsegno == v->mdfd_segno + 1);
1791
1792 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1793 elog(FATAL, "segment too big");
1794
1795 if ((behavior & EXTENSION_CREATE) ||
1796 (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
1797 {
1798 /*
1799 * Normally we will create new segments only if authorized by the
1800 * caller (i.e., we are doing mdextend()). But when doing WAL
1801 * recovery, create segments anyway; this allows cases such as
1802 * replaying WAL data that has a write into a high-numbered
1803 * segment of a relation that was later deleted. We want to go
1804 * ahead and create the segments so we can finish out the replay.
1805 *
1806 * We have to maintain the invariant that segments before the last
1807 * active segment are of size RELSEG_SIZE; therefore, if
1808 * extending, pad them out with zeroes if needed. (This only
1809 * matters if in recovery, or if the caller is extending the
1810 * relation discontiguously, but that can happen in hash indexes.)
1811 */
1812 if (nblocks < ((BlockNumber) RELSEG_SIZE))
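1813 {
1814 char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
1815 MCXT_ALLOC_ZERO);
1816
1817 mdextend(reln, forknum,
1818 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1819 zerobuf, skipFsync);
1820 pfree(zerobuf);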
1821 }
1822 flags = O_CREAT;
1823 }
1824 else if (nblocks < ((BlockNumber) RELSEG_SIZE))
1825 {
1826 /*
1827 * When not extending, only open the next segment if the current
1828 * one is exactly RELSEG_SIZE. If not (this branch), either
1829 * return NULL or fail.
1830 */
1831 if (behavior & EXTENSION_RETURN_NULL)
1832 {
1833 /*
1834 * Some callers discern between reasons for _mdfd_getseg()
1835 * returning NULL based on errno. As there's no failing
1836 * syscall involved in this case, explicitly set errno to
1837 * ENOENT, as that seems the closest interpretation.
1838 */
1839 errno = ENOENT;
1840 return NULL;
1841 }
1842
1843 ereport(ERROR,
1844 (errcode_for_file_access(),
1845 errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
1846 _mdfd_segpath(reln, forknum, nextsegno).str,
1847 blkno, nblocks)));
1848 }
1849
1850 v = _mdfd_openseg(reln, forknum, nextsegno, flags);
1851
1852 if (v == NULL)
1853 {
1854 if ((behavior & EXTENSION_RETURN_NULL) &&
1855 FILE_POSSIBLY_DELETED(errno))
1856 return NULL;
1857 ereport(ERROR,
1858 (errcode_for_file_access(),
1859 errmsg("could not open file \"%s\" (target block %u): %m",
1860 _mdfd_segpath(reln, forknum, nextsegno).str,
1861 blkno)));
1862 }
1863 }
1864
1865 return v;
1866}

References _mdfd_openseg(), _mdfd_segpath(), _mdnblocks(), Assert, elog, ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_CREATE, EXTENSION_CREATE_RECOVERY, EXTENSION_DONT_OPEN, EXTENSION_FAIL, EXTENSION_RETURN_NULL, FATAL, fb(), FILE_POSSIBLY_DELETED, InRecovery, MCXT_ALLOC_ZERO, mdextend(), _MdfdVec::mdfd_segno, mdopenfork(), palloc_aligned(), pfree(), PG_IO_ALIGN_SIZE, and str.

Referenced by mdextend(), mdfd(), mdprefetch(), mdreadv(), mdstartreadv(), mdwriteback(), mdwritev(), and mdzeroextend().

◆ _mdfd_open_flags()

static int _mdfd_open_flags ( void  )
inlinestatic

Definition at line 176 of file md.c.

177{
178 int flags = O_RDWR | PG_BINARY;
179
180 if (io_direct_flags & IO_DIRECT_DATA)
181 flags |= PG_O_DIRECT;
182
183 return flags;
184}

References fb(), IO_DIRECT_DATA, io_direct_flags, PG_BINARY, and PG_O_DIRECT.

Referenced by _mdfd_openseg(), mdcreate(), mdopenfork(), and mdsyncfiletag().

◆ _mdfd_openseg()

static MdfdVec * _mdfd_openseg ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  segno,
int  oflags 
)
static

Definition at line 1700 of file md.c.

1702{
1703 MdfdVec *v;
1704 File fd;
1705 MdPathStr fullpath;
1706
1707 fullpath = _mdfd_segpath(reln, forknum, segno);
1708
1709 /* open the file */
1710 fd = PathNameOpenFile(fullpath.str, _mdfd_open_flags() | oflags);
1711
1712 if (fd < 0)
1713 return NULL;
1714
1715 /*
1716 * Segments are always opened in order from lowest to highest, so we must
1717 * be adding a new one at the end.
1718 */
1719 Assert(segno == reln->md_num_open_segs[forknum]);
1720
1721 _fdvec_resize(reln, forknum, segno + 1);
1722
1723 /* fill the entry */
1724 v = &reln->md_seg_fds[forknum][segno];
1725 v->mdfd_vfd = fd;
1726 v->mdfd_segno = segno;
1727
1728 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1729
1730 /* all done */
1731 return v;
1732}

References _fdvec_resize(), _mdfd_open_flags(), _mdfd_segpath(), _mdnblocks(), Assert, fb(), fd(), _MdfdVec::mdfd_segno, _MdfdVec::mdfd_vfd, PathNameOpenFile(), and MdPathStr::str.

Referenced by _mdfd_getseg(), mdimmedsync(), mdnblocks(), and mdregistersync().

◆ _mdfd_segpath()

static MdPathStr _mdfd_segpath ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  segno 
)
static

Definition at line 1680 of file md.c.

1681{
1682 RelPathStr path;
1683 MdPathStr fullpath;
1684
1685 path = relpath(reln->smgr_rlocator, forknum);
1686
1687 if (segno > 0)
1688 sprintf(fullpath.str, "%s.%u", path.str, segno);
1689 else
1690 strcpy(fullpath.str, path.str);
1691
1692 return fullpath;
1693}

References fb(), relpath, sprintf, MdPathStr::str, and RelPathStr::str.

Referenced by _mdfd_getseg(), _mdfd_openseg(), and mdsyncfiletag().

◆ _mdnblocks()

static BlockNumber _mdnblocks ( SMgrRelation  reln,
ForkNumber  forknum,
MdfdVec seg 
)
static

Definition at line 1872 of file md.c.

1873{
1874 pgoff_t len;
1875
1876 len = FileSize(seg->mdfd_vfd);
1877 if (len < 0)
1878 ereport(ERROR,
1879 (errcode_for_file_access(),
1880 errmsg("could not seek to end of file \"%s\": %m",
1881 FilePathName(seg->mdfd_vfd))));
1882 /* note that this calculation will ignore any partial block at EOF */
1883 return (BlockNumber) (len / BLCKSZ);
1884}

References ereport, errcode_for_file_access(), errmsg(), ERROR, fb(), FilePathName(), FileSize(), len, and _MdfdVec::mdfd_vfd.

Referenced by _mdfd_getseg(), _mdfd_openseg(), mdextend(), mdnblocks(), mdopenfork(), and mdzeroextend().

◆ buffers_to_iovec()

static int buffers_to_iovec ( struct iovec iov,
void **  buffers,
int  nblocks 
)
static

Definition at line 784 of file md.c.

785{
786 struct iovec *iovp;
787 int iovcnt;
788
789 Assert(nblocks >= 1);
790
791 /* If this build supports direct I/O, buffers must be I/O aligned. */
792 for (int i = 0; i < nblocks; ++i)
793 {
794 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
795 Assert((uintptr_t) buffers[i] ==
796 TYPEALIGN(PG_IO_ALIGN_SIZE, buffers[i]));
797 }
798
799 /* Start the first iovec off with the first buffer. */
800 iovp = &iov[0];
801 iovp->iov_base = buffers[0];
802 iovp->iov_len = BLCKSZ;
803 iovcnt = 1;
804
805 /* Try to merge the rest. */
806 for (int i = 1; i < nblocks; ++i)
807 {
808 void *buffer = buffers[i];
809
810 if (((char *) iovp->iov_base + iovp->iov_len) == buffer)
811 {
812 /* Contiguous with the last iovec. */
813 iovp->iov_len += BLCKSZ;
814 }
815 else
816 {
817 /* Need a new iovec. */
818 iovp++;
819 iovp->iov_base = buffer;
820 iovp->iov_len = BLCKSZ;
821 iovcnt++;
822 }
823 }
824
825 return iovcnt;
826}

References Assert, fb(), i, PG_IO_ALIGN_SIZE, PG_O_DIRECT, and TYPEALIGN.

Referenced by mdreadv(), mdstartreadv(), and mdwritev().

◆ do_truncate()

static int do_truncate ( const char path)
static

Definition at line 353 of file md.c.

354{
355 int save_errno;
356 int ret;
357
358 ret = pg_truncate(path, 0);
359
360 /* Log a warning here to avoid repetition in callers. */
361 if (ret < 0 && errno != ENOENT)
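362 {
363 save_errno = errno;
364 ereport(WARNING,
365 (errcode_for_file_access(),
366 errmsg("could not truncate file \"%s\": %m", path)));
367 errno = save_errno;
368 }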
369
370 return ret;
371}

References ereport, errcode_for_file_access(), errmsg(), fb(), pg_truncate(), and WARNING.

Referenced by mdunlinkfork().

◆ DropRelationFiles()

void DropRelationFiles ( RelFileLocator delrels,
int  ndelrels,
bool  isRedo 
)

Definition at line 1600 of file md.c.

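1601{
1602 SMgrRelation *srels;
1603 int i;
1604
1605 srels = palloc_array(SMgrRelation, ndelrels);
1606 for (i = 0; i < ndelrels; i++)
1607 {
1608 SMgrRelation srel = smgropen(delrels[i], INVALID_PROC_NUMBER);
1609
1610 if (isRedo)
1611 {
1612 ForkNumber fork;
1613
1614 for (fork = 0; fork <= MAX_FORKNUM; fork++)
1615 XLogDropRelation(delrels[i], fork);
1616 }
1617 srels[i] = srel;
1618 }
1619
1620 smgrdounlinkall(srels, ndelrels, isRedo);
1621
1622 for (i = 0; i < ndelrels; i++)
1623 smgrclose(srels[i]);
1624 pfree(srels);
1625}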

References fb(), i, INVALID_PROC_NUMBER, MAX_FORKNUM, palloc_array, pfree(), smgrclose(), smgrdounlinkall(), smgropen(), and XLogDropRelation().

Referenced by FinishPreparedTransaction(), xact_redo_abort(), and xact_redo_commit().

◆ ForgetDatabaseSyncRequests()

void ForgetDatabaseSyncRequests ( Oid  dbid)

Definition at line 1582 of file md.c.

1583{
1584 FileTag tag;
1585 RelFileLocator rlocator;
1586
1587 rlocator.dbOid = dbid;
1588 rlocator.spcOid = 0;
1589 rlocator.relNumber = 0;
1590
1591 INIT_MD_FILETAG(tag, rlocator, InvalidForkNumber, InvalidBlockNumber);
1592
1593 RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
1594}

References RelFileLocator::dbOid, INIT_MD_FILETAG, InvalidBlockNumber, InvalidForkNumber, RegisterSyncRequest(), RelFileLocator::relNumber, RelFileLocator::spcOid, and SYNC_FILTER_REQUEST.

Referenced by createdb_failure_callback(), dbase_redo(), and dropdb().

◆ md_readv_complete()

static PgAioResult md_readv_complete ( PgAioHandle ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

Definition at line 1978 of file md.c.

1979{
1980 PgAioTargetData *td = pgaio_io_get_target_data(ioh);
1981 PgAioResult result = prior_result;
1982
1983 if (prior_result.result < 0)
1984 {
1985 result.status = PGAIO_RS_ERROR;
1986 result.id = PGAIO_HCB_MD_READV;
1987 /* For "hard" errors, track the error number in error_data */
1988 result.error_data = -prior_result.result;
1989 result.result = 0;
1990
1991 /*
1992 * Immediately log a message about the IO error, but only to the
1993 * server log. The reason to do so immediately is that the originator
1994 * might not process the query result immediately (because it is busy
1995 * doing another part of query processing) or at all (e.g. if it was
1996 * cancelled or errored out due to another IO also failing). The
1997 * definer of the IO will emit an ERROR when processing the IO's
1998 * results
1999 */
2000 pgaio_result_report(result, td, LOG_SERVER_ONLY);
2001
2002 return result;
2003 }
2004
2005 /*
2006 * As explained above smgrstartreadv(), the smgr API operates on the level
2007 * of blocks, rather than bytes. Convert.
2008 */
2009 result.result /= BLCKSZ;
2010
2011 Assert(result.result <= td->smgr.nblocks);
2012
2013 if (result.result == 0)
2014 {
2015 /* consider 0 blocks read a failure */
2016 result.status = PGAIO_RS_ERROR;
2017 result.id = PGAIO_HCB_MD_READV;
2018 result.error_data = 0;
2019
2020 /* see comment above the "hard error" case */
2021 pgaio_result_report(result, td, LOG_SERVER_ONLY);
2022
2023 return result;
2024 }
2025
2026 if (result.status != PGAIO_RS_ERROR &&
2027 result.result < td->smgr.nblocks)
2028 {
2029 /* partial reads should be retried at upper level */
2030 result.status = PGAIO_RS_PARTIAL;
2031 result.id = PGAIO_HCB_MD_READV;
2032 }
2033
2034 return result;
2035}

References Assert, PgAioResult::error_data, fb(), PgAioResult::id, LOG_SERVER_ONLY, PgAioTargetData::nblocks, PGAIO_HCB_MD_READV, pgaio_io_get_target_data(), pgaio_result_report(), PGAIO_RS_ERROR, PGAIO_RS_PARTIAL, PgAioResult::result, PgAioTargetData::smgr, and PgAioResult::status.

◆ md_readv_report()

static void md_readv_report ( PgAioResult  result,
const PgAioTargetData td,
int  elevel 
)
static

Definition at line 2045 of file md.c.

2046{
2047 RelPathStr path;
2048
2049 path = relpathbackend(td->smgr.rlocator,
2050 td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER,
2051 td->smgr.forkNum);
2052
2053 if (result.error_data != 0)
2054 {
2055 /* for errcode_for_file_access() and %m */
2056 errno = result.error_data;
2057
2058 ereport(elevel,
2059 errcode_for_file_access(),
2060 errmsg("could not read blocks %u..%u in file \"%s\": %m",
2061 td->smgr.blockNum,
2062 td->smgr.blockNum + td->smgr.nblocks - 1,
2063 path.str));
2064 }
2065 else
2066 {
2067 /*
2068 * NB: This will typically only be output in debug messages, while
2069 * retrying a partial IO.
2070 */
2071 ereport(elevel,
2072 errcode(ERRCODE_DATA_CORRUPTED),
2073 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
2074 td->smgr.blockNum,
2075 td->smgr.blockNum + td->smgr.nblocks - 1,
2076 path.str,
2077 result.result * (size_t) BLCKSZ,
2078 td->smgr.nblocks * (size_t) BLCKSZ));
2079 }
2080}

References PgAioTargetData::blockNum, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errcode_for_file_access(), errmsg(), PgAioResult::error_data, fb(), PgAioTargetData::forkNum, INVALID_PROC_NUMBER, PgAioTargetData::is_temp, MyProcNumber, PgAioTargetData::nblocks, relpathbackend, PgAioResult::result, PgAioTargetData::rlocator, PgAioTargetData::smgr, and RelPathStr::str.

◆ mdclose()

void mdclose ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 713 of file md.c.

714{
715 int nopensegs = reln->md_num_open_segs[forknum];
716
717 /* No work if already closed */
718 if (nopensegs == 0)
719 return;
720
721 /* close segments starting from the end */
722 while (nopensegs > 0)
723 {
724 MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1];
725
726 FileClose(v->mdfd_vfd);
727 _fdvec_resize(reln, forknum, nopensegs - 1);
728 nopensegs--;
729 }
730}

References _fdvec_resize(), fb(), FileClose(), and _MdfdVec::mdfd_vfd.

Referenced by mdexists().

◆ mdcreate()

void mdcreate ( SMgrRelation  reln,
ForkNumber  forknum,
bool  isRedo 
)

Definition at line 222 of file md.c.

223{
224 MdfdVec *mdfd;
225 RelPathStr path;
226 File fd;
227
228 if (isRedo && reln->md_num_open_segs[forknum] > 0)
229 return; /* created and opened already... */
230
231 Assert(reln->md_num_open_segs[forknum] == 0);
232
233 /*
234 * We may be using the target table space for the first time in this
235 * database, so create a per-database subdirectory if needed.
236 *
237 * XXX this is a fairly ugly violation of module layering, but this seems
238 * to be the best place to put the check. Maybe TablespaceCreateDbspace
239 * should be here and not in commands/tablespace.c? But that would imply
240 * importing a lot of stuff that smgr.c oughtn't know, either.
241 */
242 TablespaceCreateDbspace(reln->smgr_rlocator.locator.spcOid,
243 reln->smgr_rlocator.locator.dbOid,
244 isRedo);
245
246 path = relpath(reln->smgr_rlocator, forknum);
247
248 fd = PathNameOpenFile(path.str, _mdfd_open_flags() | O_CREAT | O_EXCL);
249
250 if (fd < 0)
251 {
252 int save_errno = errno;
253
254 if (isRedo)
255 fd = PathNameOpenFile(path.str, _mdfd_open_flags());
256 if (fd < 0)
257 {
258 /* be sure to report the error reported by create, not open */
259 errno = save_errno;
260 ereport(ERROR,
261 (errcode_for_file_access(),
262 errmsg("could not create file \"%s\": %m", path.str)));
263 }
264 }
265
266 _fdvec_resize(reln, forknum, 1);
267 mdfd = &reln->md_seg_fds[forknum][0];
268 mdfd->mdfd_vfd = fd;
269 mdfd->mdfd_segno = 0;
270
271 if (!SmgrIsTemp(reln))
272 register_dirty_segment(reln, forknum, mdfd);
273}

References _fdvec_resize(), _mdfd_open_flags(), Assert, ereport, errcode_for_file_access(), errmsg(), ERROR, fb(), fd(), mdfd(), PathNameOpenFile(), register_dirty_segment(), relpath, SmgrIsTemp, RelPathStr::str, and TablespaceCreateDbspace().

◆ mdexists()

bool mdexists ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 203 of file md.c.

204{
205 /*
206 * Close it first, to ensure that we notice if the fork has been unlinked
207 * since we opened it. As an optimization, we can skip that in recovery,
208 * which already closes relations when dropping them.
209 */
210 if (!InRecovery)
211 mdclose(reln, forknum);
212
213 return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL);
214}

References EXTENSION_RETURN_NULL, fb(), InRecovery, mdclose(), and mdopenfork().

◆ mdextend()

void mdextend ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
const void buffer,
bool  skipFsync 
)

Definition at line 487 of file md.c.

489{
490 pgoff_t seekpos;
491 int nbytes;
492 MdfdVec *v;
493
494 /* If this build supports direct I/O, the buffer must be I/O aligned. */
495 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
496 Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
497
498 /* This assert is too expensive to have on normally ... */
499#ifdef CHECK_WRITE_VS_EXTEND
500 Assert(blocknum >= mdnblocks(reln, forknum));
501#endif
502
503 /*
504 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
505 * more --- we mustn't create a block whose number actually is
506 * InvalidBlockNumber. (Note that this failure should be unreachable
507 * because of upstream checks in bufmgr.c.)
508 */
509 if (blocknum == InvalidBlockNumber)
510 ereport(ERROR,
511 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
512 errmsg("cannot extend file \"%s\" beyond %u blocks",
513 relpath(reln->smgr_rlocator, forknum).str,
514 InvalidBlockNumber)));
515
516 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
517
518 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
519
520 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
521
522 if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
523 {
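524 if (nbytes < 0)
525 ereport(ERROR,
526 (errcode_for_file_access(),
527 errmsg("could not extend file \"%s\": %m",
528 FilePathName(v->mdfd_vfd)),
529 errhint("Check free disk space.")));
530 /* short write: complain appropriately */
531 ereport(ERROR,
532 (errcode(ERRCODE_DISK_FULL),
533 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
534 FilePathName(v->mdfd_vfd),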
535 nbytes, BLCKSZ, blocknum),
536 errhint("Check free disk space.")));
537 }
538
539 if (!skipFsync && !SmgrIsTemp(reln))
540 register_dirty_segment(reln, forknum, v);
541
542 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
543}

References _mdfd_getseg(), _mdnblocks(), Assert, ereport, errcode(), errcode_for_file_access(), errhint(), errmsg(), ERROR, EXTENSION_CREATE, fb(), FilePathName(), FileWrite(), InvalidBlockNumber, _MdfdVec::mdfd_vfd, mdnblocks(), PG_IO_ALIGN_SIZE, PG_O_DIRECT, register_dirty_segment(), relpath, SmgrIsTemp, and TYPEALIGN.

Referenced by _mdfd_getseg().

◆ mdfd()

int mdfd ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
uint32 off 
)

Definition at line 1483 of file md.c.

1484{
1485 MdfdVec *v = mdopenfork(reln, forknum, EXTENSION_FAIL);
1486
1487 v = _mdfd_getseg(reln, forknum, blocknum, false,
1488 EXTENSION_FAIL);
1489
1490 *off = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1491
1492 Assert(*off < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1493
1494 return FileGetRawDesc(v->mdfd_vfd);
1495}

References _mdfd_getseg(), Assert, EXTENSION_FAIL, fb(), FileGetRawDesc(), _MdfdVec::mdfd_vfd, and mdopenfork().

Referenced by mdcreate(), and mdopenfork().

◆ mdfiletagmatches()

bool mdfiletagmatches ( const FileTag ftag,
const FileTag candidate 
)

Definition at line 1963 of file md.c.

1964{
1965 /*
1966 * For now we only use filter requests as a way to drop all scheduled
1967 * callbacks relating to a given database, when dropping the database.
1968 * We'll return true for all candidates that have the same database OID as
1969 * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
1970 */
1971 return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
1972}

References RelFileLocator::dbOid, fb(), and FileTag::rlocator.

◆ mdimmedsync()

void mdimmedsync ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 1430 of file md.c.

1431{
1432 int segno;
1433 int min_inactive_seg;
1434
1435 /*
1436 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1437 * the loop below will get them all!
1438 */
1439 mdnblocks(reln, forknum);
1440
1441 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1442
1443 /*
1444 * Temporarily open inactive segments, then close them after sync. There
1445 * may be some inactive segments left opened after fsync() error, but that
1446 * is harmless. We don't bother to clean them up and take a risk of
1447 * further trouble. The next mdclose() will soon close them.
1448 */
1449 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1450 segno++;
1451
1452 while (segno > 0)
1453 {
1454 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1455
1456 /*
1457 * fsyncs done through mdimmedsync() should be tracked in a separate
1458 * IOContext than those done through mdsyncfiletag() to differentiate
1459 * between unavoidable client backend fsyncs (e.g. those done during
1460 * index build) and those which ideally would have been done by the
1461 * checkpointer. Since other IO operations bypassing the buffer
1462 * manager could also be tracked in such an IOContext, wait until
1463 * these are also tracked to track immediate fsyncs.
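1464 */
1465 if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
1466 ereport(data_sync_elevel(ERROR),
1467 (errcode_for_file_access(),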
1468 errmsg("could not fsync file \"%s\": %m",
1469 FilePathName(v->mdfd_vfd))));
1470
1471 /* Close inactive segments immediately */
1472 if (segno > min_inactive_seg)
1473 {
1474 FileClose(v->mdfd_vfd);
1475 _fdvec_resize(reln, forknum, segno - 1);
1476 }
1477
1478 segno--;
1479 }
1480}

References _fdvec_resize(), _mdfd_openseg(), data_sync_elevel(), ereport, errcode_for_file_access(), errmsg(), ERROR, fb(), FileClose(), FilePathName(), FileSync(), _MdfdVec::mdfd_vfd, and mdnblocks().

◆ mdinit()

void mdinit ( void  )

Definition at line 190 of file md.c.

References ALLOCSET_DEFAULT_SIZES, AllocSetContextCreate, MdCxt, and TopMemoryContext.

◆ mdmaxcombine()

uint32 mdmaxcombine ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum 
)

Definition at line 833 of file md.c.

835{
836 BlockNumber segoff;
837
838 segoff = blocknum % ((BlockNumber) RELSEG_SIZE);
839
840 return RELSEG_SIZE - segoff;
841}

References fb().

◆ mdnblocks()

BlockNumber mdnblocks ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 1223 of file md.c.

1224{
1225 MdfdVec *v;
1226 BlockNumber nblocks;
1227 BlockNumber segno;
1228
1229 mdopenfork(reln, forknum, EXTENSION_FAIL);
1230
1231 /* mdopen has opened the first segment */
1232 Assert(reln->md_num_open_segs[forknum] > 0);
1233
1234 /*
1235 * Start from the last open segments, to avoid redundant seeks. We have
1236 * previously verified that these segments are exactly RELSEG_SIZE long,
1237 * and it's useless to recheck that each time.
1238 *
1239 * NOTE: this assumption could only be wrong if another backend has
1240 * truncated the relation. We rely on higher code levels to handle that
1241 * scenario by closing and re-opening the md fd, which is handled via
1242 * relcache flush. (Since the checkpointer doesn't participate in
1243 * relcache flush, it could have segment entries for inactive segments;
1244 * that's OK because the checkpointer never needs to compute relation
1245 * size.)
1246 */
1247 segno = reln->md_num_open_segs[forknum] - 1;
1248 v = &reln->md_seg_fds[forknum][segno];
1249
1250 for (;;)
1251 {
1252 nblocks = _mdnblocks(reln, forknum, v);
1253 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1254 elog(FATAL, "segment too big");
1255 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1256 return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
1257
1258 /*
1259 * If segment is exactly RELSEG_SIZE, advance to next one.
1260 */
1261 segno++;
1262
1263 /*
1264 * We used to pass O_CREAT here, but that has the disadvantage that it
1265 * might create a segment which has vanished through some operating
1266 * system misadventure. In such a case, creating the segment here
1267 * undermines _mdfd_getseg's attempts to notice and report an error
1268 * upon access to a missing segment.
1269 */
1270 v = _mdfd_openseg(reln, forknum, segno, 0);
1271 if (v == NULL)
1272 return segno * ((BlockNumber) RELSEG_SIZE);
1273 }
1274}

References _mdfd_openseg(), _mdnblocks(), Assert, elog, EXTENSION_FAIL, FATAL, fb(), and mdopenfork().

Referenced by mdextend(), mdimmedsync(), mdregistersync(), mdwritev(), and mdzeroextend().

◆ mdopen()

void mdopen ( SMgrRelation  reln)

Definition at line 702 of file md.c.

703{
704 /* mark it not open */
705 for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
706 reln->md_num_open_segs[forknum] = 0;
707}

References fb(), and MAX_FORKNUM.

◆ mdopenfork()

static MdfdVec * mdopenfork ( SMgrRelation  reln,
ForkNumber  forknum,
int  behavior 
)
static

Definition at line 664 of file md.c.

665{
666 MdfdVec *mdfd;
667 RelPathStr path;
668 File fd;
669
670 /* No work if already open */
671 if (reln->md_num_open_segs[forknum] > 0)
672 return &reln->md_seg_fds[forknum][0];
673
674 path = relpath(reln->smgr_rlocator, forknum);
675
676 fd = PathNameOpenFile(path.str, _mdfd_open_flags());
677
678 if (fd < 0)
679 {
680 if ((behavior & EXTENSION_RETURN_NULL) &&
681 FILE_POSSIBLY_DELETED(errno))
682 return NULL;
683 ereport(ERROR,
684 (errcode_for_file_access(),
685 errmsg("could not open file \"%s\": %m", path.str)));
686 }
687
688 _fdvec_resize(reln, forknum, 1);
689 mdfd = &reln->md_seg_fds[forknum][0];
690 mdfd->mdfd_vfd = fd;
691 mdfd->mdfd_segno = 0;
692
693 Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
694
695 return mdfd;
696}

References _fdvec_resize(), _mdfd_open_flags(), _mdnblocks(), Assert, ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_RETURN_NULL, fb(), fd(), FILE_POSSIBLY_DELETED, mdfd(), PathNameOpenFile(), relpath, and RelPathStr::str.

Referenced by _mdfd_getseg(), mdexists(), mdfd(), and mdnblocks().

◆ mdprefetch()

bool mdprefetch ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
int  nblocks 
)

Definition at line 736 of file md.c.

738{
739#ifdef USE_PREFETCH
740
742
743 if ((uint64) blocknum + nblocks > (uint64) MaxBlockNumber + 1)
744 return false;
745
746 while (nblocks > 0)
747 {
748 pgoff_t seekpos;
749 MdfdVec *v;
751
752 v = _mdfd_getseg(reln, forknum, blocknum, false,
754 if (v == NULL)
755 return false;
756
757 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
758
759 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
760
762 Min(nblocks,
763 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
764
767
768 blocknum += nblocks_this_segment;
769 nblocks -= nblocks_this_segment;
770 }
771#endif /* USE_PREFETCH */
772
773 return true;
774}

References _mdfd_getseg(), Assert, EXTENSION_FAIL, EXTENSION_RETURN_NULL, fb(), FilePrefetch(), InRecovery, IO_DIRECT_DATA, io_direct_flags, MaxBlockNumber, _MdfdVec::mdfd_vfd, and Min.

◆ mdreadv()

void mdreadv ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
void **  buffers,
BlockNumber  nblocks 
)

Definition at line 847 of file md.c.

849{
850 while (nblocks > 0)
851 {
852 struct iovec iov[PG_IOV_MAX];
853 int iovcnt;
854 pgoff_t seekpos;
855 int nbytes;
856 MdfdVec *v;
859 size_t size_this_segment;
860
861 v = _mdfd_getseg(reln, forknum, blocknum, false,
863
864 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
865
866 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
867
869 Min(nblocks,
870 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
872
873 if (nblocks_this_segment != nblocks)
874 elog(ERROR, "read crosses segment boundary");
875
879
880 /*
881 * Inner loop to continue after a short read. We'll keep going until
882 * we hit EOF rather than assuming that a short read means we hit the
883 * end.
884 */
885 for (;;)
886 {
887 TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
888 reln->smgr_rlocator.locator.spcOid,
889 reln->smgr_rlocator.locator.dbOid,
890 reln->smgr_rlocator.locator.relNumber,
891 reln->smgr_rlocator.backend);
892 nbytes = FileReadV(v->mdfd_vfd, iov, iovcnt, seekpos,
894 TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
895 reln->smgr_rlocator.locator.spcOid,
896 reln->smgr_rlocator.locator.dbOid,
897 reln->smgr_rlocator.locator.relNumber,
898 reln->smgr_rlocator.backend,
899 nbytes,
901
902#ifdef SIMULATE_SHORT_READ
903 nbytes = Min(nbytes, 4096);
904#endif
905
906 if (nbytes < 0)
909 errmsg("could not read blocks %u..%u in file \"%s\": %m",
910 blocknum,
911 blocknum + nblocks_this_segment - 1,
912 FilePathName(v->mdfd_vfd))));
913
914 if (nbytes == 0)
915 {
916 /*
917 * We are at or past EOF, or we read a partial block at EOF.
918 * Normally this is an error; upper levels should never try to
919 * read a nonexistent block. However, if zero_damaged_pages
920 * is ON or we are InRecovery, we should instead return zeroes
921 * without complaining. This allows, for example, the case of
922 * trying to update a block that was later truncated away.
923 *
924 * NB: We think that this codepath is unreachable in recovery
925 * and incomplete with zero_damaged_pages, as missing segments
926 * are not created. Putting blocks into the buffer-pool that
927 * do not exist on disk is rather problematic, as it will not
928 * be found by scans that rely on smgrnblocks(), as they are
929 * beyond EOF. It also can cause weird problems with relation
930 * extension, as relation extension does not expect blocks
931 * beyond EOF to exist.
932 *
933 * Therefore we do not want to copy the logic into
934 * mdstartreadv(), where it would have to be more complicated
935 * due to potential differences in the zero_damaged_pages
936 * setting between the definer and completor of IO.
937 *
938 * For PG 18, we are putting an Assert(false) in mdreadv()
939 * (triggering failures in assertion-enabled builds, but
940 * continuing to work in production builds). Afterwards we
941 * plan to remove this code entirely.
942 */
944 {
945 Assert(false); /* see comment above */
946
949 ++i)
950 memset(buffers[i], 0, BLCKSZ);
951 break;
952 }
953 else
956 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
957 blocknum,
958 blocknum + nblocks_this_segment - 1,
962 }
963
964 /* One loop should usually be enough. */
965 transferred_this_segment += nbytes;
968 break;
969
970 /* Adjust position and vectors after a short read. */
971 seekpos += nbytes;
973 }
974
975 nblocks -= nblocks_this_segment;
976 buffers += nblocks_this_segment;
977 blocknum += nblocks_this_segment;
978 }
979}

References _mdfd_getseg(), Assert, buffers_to_iovec(), compute_remaining_iovec(), elog, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_CREATE_RECOVERY, EXTENSION_FAIL, fb(), FilePathName(), FileReadV(), i, InRecovery, lengthof, _MdfdVec::mdfd_vfd, Min, PG_IOV_MAX, and zero_damaged_pages.

◆ mdregistersync()

void mdregistersync ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 1379 of file md.c.

1380{
1381 int segno;
1382 int min_inactive_seg;
1383
1384 /*
1385 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1386 * the loop below will get them all!
1387 */
1388 mdnblocks(reln, forknum);
1389
1390 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1391
1392 /*
1393 * Temporarily open inactive segments, then close them after sync. There
1394 * may be some inactive segments left opened after error, but that is
1395 * harmless. We don't bother to clean them up and take a risk of further
1396 * trouble. The next mdclose() will soon close them.
1397 */
1398 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1399 segno++;
1400
1401 while (segno > 0)
1402 {
1403 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1404
1405 register_dirty_segment(reln, forknum, v);
1406
1407 /* Close inactive segments immediately */
1408 if (segno > min_inactive_seg)
1409 {
1410 FileClose(v->mdfd_vfd);
1411 _fdvec_resize(reln, forknum, segno - 1);
1412 }
1413
1414 segno--;
1415 }
1416}

References _fdvec_resize(), _mdfd_openseg(), fb(), FileClose(), _MdfdVec::mdfd_vfd, mdnblocks(), and register_dirty_segment().

◆ mdstartreadv()

void mdstartreadv ( PgAioHandle *  ioh,
SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
void **  buffers,
BlockNumber  nblocks 
)

Definition at line 985 of file md.c.

988{
989 pgoff_t seekpos;
990 MdfdVec *v;
992 struct iovec *iov;
993 int iovcnt;
994 int ret;
995
996 v = _mdfd_getseg(reln, forknum, blocknum, false,
998
999 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1000
1001 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1002
1004 Min(nblocks,
1005 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
1006
1007 if (nblocks_this_segment != nblocks)
1008 elog(ERROR, "read crossing segment boundary");
1009
1011
1012 Assert(nblocks <= iovcnt);
1013
1015
1017
1020
1022 reln,
1023 forknum,
1024 blocknum,
1025 nblocks,
1026 false);
1028
1030 if (ret != 0)
1031 ereport(ERROR,
1033 errmsg("could not start reading blocks %u..%u in file \"%s\": %m",
1034 blocknum,
1035 blocknum + nblocks_this_segment - 1,
1036 FilePathName(v->mdfd_vfd))));
1037
1038 /*
1039 * The error checks corresponding to the post-read checks in mdreadv() are
1040 * in md_readv_complete().
1041 *
1042 * However we chose, at least for now, to not implement the
1043 * zero_damaged_pages logic present in mdreadv(). As outlined in mdreadv()
1044 * that logic is rather problematic, and we want to get rid of it. Here
1045 * equivalent logic would have to be more complicated due to potential
1046 * differences in the zero_damaged_pages setting between the definer and
1047 * completor of IO.
1048 */
1049}

References _mdfd_getseg(), Assert, buffers_to_iovec(), elog, ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_CREATE_RECOVERY, EXTENSION_FAIL, fb(), FilePathName(), FileStartReadV(), IO_DIRECT_DATA, io_direct_flags, _MdfdVec::mdfd_vfd, Min, PGAIO_HCB_MD_READV, PGAIO_HF_BUFFERED, pgaio_io_get_iovec(), pgaio_io_register_callbacks(), pgaio_io_set_flag(), and pgaio_io_set_target_smgr().

◆ mdsyncfiletag()

int mdsyncfiletag ( const FileTag *  ftag,
char *  path 
)

Definition at line 1893 of file md.c.

1894{
1896 File file;
1898 bool need_to_close;
1899 int result,
1900 save_errno;
1901
1902 /* See if we already have the file open, or need to open it. */
1903 if (ftag->segno < reln->md_num_open_segs[ftag->forknum])
1904 {
1905 file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd;
1906 strlcpy(path, FilePathName(file), MAXPGPATH);
1907 need_to_close = false;
1908 }
1909 else
1910 {
1911 MdPathStr p;
1912
1913 p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
1914 strlcpy(path, p.str, MD_PATH_STR_MAXLEN);
1915
1916 file = PathNameOpenFile(path, _mdfd_open_flags());
1917 if (file < 0)
1918 return -1;
1919 need_to_close = true;
1920 }
1921
1923
1924 /* Sync the file. */
1925 result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
1926 save_errno = errno;
1927
1928 if (need_to_close)
1929 FileClose(file);
1930
1932 IOOP_FSYNC, io_start, 1, 0);
1933
1934 errno = save_errno;
1935 return result;
1936}

References _mdfd_open_flags(), _mdfd_segpath(), fb(), FileClose(), FilePathName(), FileSync(), FileTag::forknum, INVALID_PROC_NUMBER, IOCONTEXT_NORMAL, IOOBJECT_RELATION, IOOP_FSYNC, MAXPGPATH, MD_PATH_STR_MAXLEN, PathNameOpenFile(), pgstat_count_io_op_time(), pgstat_prepare_io_time(), FileTag::rlocator, FileTag::segno, smgropen(), MdPathStr::str, strlcpy(), and track_io_timing.

◆ mdtruncate()

void mdtruncate ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  curnblk,
BlockNumber  nblocks 
)

Definition at line 1290 of file md.c.

1292{
1294 int curopensegs;
1295
1296 if (nblocks > curnblk)
1297 {
1298 /* Bogus request ... but no complaint if InRecovery */
1299 if (InRecovery)
1300 return;
1301 ereport(ERROR,
1302 (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
1303 relpath(reln->smgr_rlocator, forknum).str,
1304 nblocks, curnblk)));
1305 }
1306 if (nblocks == curnblk)
1307 return; /* no work */
1308
1309 /*
1310 * Truncate segments, starting at the last one. Starting at the end makes
1311 * managing the memory for the fd array easier, should there be errors.
1312 */
1313 curopensegs = reln->md_num_open_segs[forknum];
1314 while (curopensegs > 0)
1315 {
1316 MdfdVec *v;
1317
1319
1320 v = &reln->md_seg_fds[forknum][curopensegs - 1];
1321
1322 if (priorblocks > nblocks)
1323 {
1324 /*
1325 * This segment is no longer active. We truncate the file, but do
1326 * not delete it, for reasons explained in the header comments.
1327 */
1329 ereport(ERROR,
1331 errmsg("could not truncate file \"%s\": %m",
1332 FilePathName(v->mdfd_vfd))));
1333
1334 if (!SmgrIsTemp(reln))
1335 register_dirty_segment(reln, forknum, v);
1336
1337 /* we never drop the 1st segment */
1338 Assert(v != &reln->md_seg_fds[forknum][0]);
1339
1340 FileClose(v->mdfd_vfd);
1341 _fdvec_resize(reln, forknum, curopensegs - 1);
1342 }
1343 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
1344 {
1345 /*
1346 * This is the last segment we want to keep. Truncate the file to
1347 * the right length. NOTE: if nblocks is exactly a multiple K of
1348 * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
1349 * keep it. This adheres to the invariant given in the header
1350 * comments.
1351 */
1353
1355 ereport(ERROR,
1357 errmsg("could not truncate file \"%s\" to %u blocks: %m",
1359 nblocks)));
1360 if (!SmgrIsTemp(reln))
1361 register_dirty_segment(reln, forknum, v);
1362 }
1363 else
1364 {
1365 /*
1366 * We still need this segment, so nothing to do for this and any
1367 * earlier segment.
1368 */
1369 break;
1370 }
1371 curopensegs--;
1372 }
1373}

References _fdvec_resize(), Assert, ereport, errcode_for_file_access(), errmsg(), ERROR, fb(), FileClose(), FilePathName(), FileTruncate(), InRecovery, _MdfdVec::mdfd_vfd, register_dirty_segment(), relpath, and SmgrIsTemp.

◆ mdunlink()

void mdunlink ( RelFileLocatorBackend  rlocator,
ForkNumber  forknum,
bool  isRedo 
)

Definition at line 337 of file md.c.

338{
339 /* Now do the per-fork work */
340 if (forknum == InvalidForkNumber)
341 {
342 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
343 mdunlinkfork(rlocator, forknum, isRedo);
344 }
345 else
346 mdunlinkfork(rlocator, forknum, isRedo);
347}

References fb(), InvalidForkNumber, MAX_FORKNUM, and mdunlinkfork().

◆ mdunlinkfiletag()

int mdunlinkfiletag ( const FileTag *  ftag,
char *  path 
)

Definition at line 1945 of file md.c.

1946{
1947 RelPathStr p;
1948
1949 /* Compute the path. */
1950 p = relpathperm(ftag->rlocator, MAIN_FORKNUM);
1951 strlcpy(path, p.str, MAXPGPATH);
1952
1953 /* Try to unlink the file. */
1954 return unlink(path);
1955}

References fb(), MAIN_FORKNUM, MAXPGPATH, relpathperm, FileTag::rlocator, RelPathStr::str, and strlcpy().

◆ mdunlinkfork()

static void mdunlinkfork ( RelFileLocatorBackend  rlocator,
ForkNumber  forknum,
bool  isRedo 
)
static

Definition at line 374 of file md.c.

375{
376 RelPathStr path;
377 int ret;
378 int save_errno;
379
380 path = relpath(rlocator, forknum);
381
382 /*
383 * Truncate and then unlink the first segment, or just register a request
384 * to unlink it later, as described in the comments for mdunlink().
385 */
386 if (isRedo || IsBinaryUpgrade || forknum != MAIN_FORKNUM ||
388 {
389 if (!RelFileLocatorBackendIsTemp(rlocator))
390 {
391 /* Prevent other backends' fds from holding on to the disk space */
392 ret = do_truncate(path.str);
393
394 /* Forget any pending sync requests for the first segment */
396 register_forget_request(rlocator, forknum, 0 /* first seg */ );
398 }
399 else
400 ret = 0;
401
402 /* Next unlink the file, unless it was already found to be missing */
403 if (ret >= 0 || errno != ENOENT)
404 {
405 ret = unlink(path.str);
406 if (ret < 0 && errno != ENOENT)
407 {
411 errmsg("could not remove file \"%s\": %m", path.str)));
413 }
414 }
415 }
416 else
417 {
418 /* Prevent other backends' fds from holding on to the disk space */
419 ret = do_truncate(path.str);
420
421 /* Register request to unlink first segment later */
423 register_unlink_segment(rlocator, forknum, 0 /* first seg */ );
425 }
426
427 /*
428 * Delete any additional segments.
429 *
430 * Note that because we loop until getting ENOENT, we will correctly
431 * remove all inactive segments as well as active ones. Ideally we'd
432 * continue the loop until getting exactly that errno, but that risks an
433 * infinite loop if the problem is directory-wide (for instance, if we
434 * suddenly can't read the data directory itself). We compromise by
435 * continuing after a non-ENOENT truncate error, but stopping after any
436 * unlink error. If there is indeed a directory-wide problem, additional
437 * unlink attempts wouldn't work anyway.
438 */
439 if (ret >= 0 || errno != ENOENT)
440 {
442 BlockNumber segno;
443
444 for (segno = 1;; segno++)
445 {
446 sprintf(segpath.str, "%s.%u", path.str, segno);
447
448 if (!RelFileLocatorBackendIsTemp(rlocator))
449 {
450 /*
451 * Prevent other backends' fds from holding on to the disk
452 * space. We're done if we see ENOENT, though.
453 */
454 if (do_truncate(segpath.str) < 0 && errno == ENOENT)
455 break;
456
457 /*
458 * Forget any pending sync requests for this segment before we
459 * try to unlink.
460 */
461 register_forget_request(rlocator, forknum, segno);
462 }
463
464 if (unlink(segpath.str) < 0)
465 {
466 /* ENOENT is expected after the last segment... */
467 if (errno != ENOENT)
470 errmsg("could not remove file \"%s\": %m", segpath.str)));
471 break;
472 }
473 }
474 }
475}

References do_truncate(), ereport, errcode_for_file_access(), errmsg(), fb(), IsBinaryUpgrade, MAIN_FORKNUM, register_forget_request(), register_unlink_segment(), RelFileLocatorBackendIsTemp, relpath, sprintf, RelPathStr::str, and WARNING.

Referenced by mdunlink().

◆ mdwriteback()

void mdwriteback ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
BlockNumber  nblocks 
)

Definition at line 1164 of file md.c.

1166{
1168
1169 /*
1170 * Issue flush requests in as few requests as possible; have to split at
1171 * segment boundaries though, since those are actually separate files.
1172 */
1173 while (nblocks > 0)
1174 {
1175 BlockNumber nflush = nblocks;
1176 pgoff_t seekpos;
1177 MdfdVec *v;
1178 int segnum_start,
1179 segnum_end;
1180
1181 v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
1183
1184 /*
1185 * We might be flushing buffers of already removed relations, that's
1186 * ok, just ignore that case. If the segment file wasn't open already
1187 * (ie from a recent mdwrite()), then we don't want to re-open it, to
1188 * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave
1189 * us with a descriptor to a file that is about to be unlinked.
1190 */
1191 if (!v)
1192 return;
1193
1194 /* compute offset inside the current segment */
1195 segnum_start = blocknum / RELSEG_SIZE;
1196
1197 /* compute number of desired writes within the current segment */
1198 segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
1199 if (segnum_start != segnum_end)
1200 nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
1201
1202 Assert(nflush >= 1);
1203 Assert(nflush <= nblocks);
1204
1205 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1206
1208
1209 nblocks -= nflush;
1210 blocknum += nflush;
1211 }
1212}

References _mdfd_getseg(), Assert, EXTENSION_DONT_OPEN, fb(), FileWriteback(), IO_DIRECT_DATA, io_direct_flags, and _MdfdVec::mdfd_vfd.

◆ mdwritev()

void mdwritev ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
const void **  buffers,
BlockNumber  nblocks,
bool  skipFsync 
)

Definition at line 1059 of file md.c.

1061{
1062 /* This assert is too expensive to have on normally ... */
1063#ifdef CHECK_WRITE_VS_EXTEND
1064 Assert((uint64) blocknum + (uint64) nblocks <= (uint64) mdnblocks(reln, forknum));
1065#endif
1066
1067 while (nblocks > 0)
1068 {
1069 struct iovec iov[PG_IOV_MAX];
1070 int iovcnt;
1071 pgoff_t seekpos;
1072 int nbytes;
1073 MdfdVec *v;
1076 size_t size_this_segment;
1077
1078 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
1080
1081 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1082
1083 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1084
1086 Min(nblocks,
1087 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
1089
1090 if (nblocks_this_segment != nblocks)
1091 elog(ERROR, "write crosses segment boundary");
1092
1093 iovcnt = buffers_to_iovec(iov, (void **) buffers, nblocks_this_segment);
1096
1097 /*
1098 * Inner loop to continue after a short write. If the reason is that
1099 * we're out of disk space, a future attempt should get an ENOSPC
1100 * error from the kernel.
1101 */
1102 for (;;)
1103 {
1104 TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
1105 reln->smgr_rlocator.locator.spcOid,
1106 reln->smgr_rlocator.locator.dbOid,
1107 reln->smgr_rlocator.locator.relNumber,
1108 reln->smgr_rlocator.backend);
1109 nbytes = FileWriteV(v->mdfd_vfd, iov, iovcnt, seekpos,
1111 TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
1112 reln->smgr_rlocator.locator.spcOid,
1113 reln->smgr_rlocator.locator.dbOid,
1114 reln->smgr_rlocator.locator.relNumber,
1115 reln->smgr_rlocator.backend,
1116 nbytes,
1118
1119#ifdef SIMULATE_SHORT_WRITE
1120 nbytes = Min(nbytes, 4096);
1121#endif
1122
1123 if (nbytes < 0)
1124 {
1125 bool enospc = errno == ENOSPC;
1126
1127 ereport(ERROR,
1129 errmsg("could not write blocks %u..%u in file \"%s\": %m",
1130 blocknum,
1131 blocknum + nblocks_this_segment - 1,
1133 enospc ? errhint("Check free disk space.") : 0));
1134 }
1135
1136 /* One loop should usually be enough. */
1137 transferred_this_segment += nbytes;
1140 break;
1141
1142 /* Adjust position and iovecs after a short write. */
1143 seekpos += nbytes;
1145 }
1146
1147 if (!skipFsync && !SmgrIsTemp(reln))
1148 register_dirty_segment(reln, forknum, v);
1149
1150 nblocks -= nblocks_this_segment;
1151 buffers += nblocks_this_segment;
1152 blocknum += nblocks_this_segment;
1153 }
1154}

References _mdfd_getseg(), Assert, buffers_to_iovec(), compute_remaining_iovec(), elog, ereport, errcode_for_file_access(), errhint(), errmsg(), ERROR, EXTENSION_CREATE_RECOVERY, EXTENSION_FAIL, fb(), FilePathName(), FileWriteV(), lengthof, _MdfdVec::mdfd_vfd, mdnblocks(), Min, PG_IOV_MAX, register_dirty_segment(), and SmgrIsTemp.

◆ mdzeroextend()

void mdzeroextend ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
int  nblocks,
bool  skipFsync 
)

Definition at line 552 of file md.c.

554{
555 MdfdVec *v;
556 BlockNumber curblocknum = blocknum;
557 int remblocks = nblocks;
558
559 Assert(nblocks > 0);
560
561 /* This assert is too expensive to have on normally ... */
562#ifdef CHECK_WRITE_VS_EXTEND
563 Assert(blocknum >= mdnblocks(reln, forknum));
564#endif
565
566 /*
567 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
568 * more --- we mustn't create a block whose number actually is
569 * InvalidBlockNumber or larger.
570 */
571 if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
574 errmsg("cannot extend file \"%s\" beyond %u blocks",
575 relpath(reln->smgr_rlocator, forknum).str,
577
578 while (remblocks > 0)
579 {
581 pgoff_t seekpos = (pgoff_t) BLCKSZ * segstartblock;
582 int numblocks;
583
586 else
588
590
593
594 /*
595 * If available and useful, use posix_fallocate() (via
596 * FileFallocate()) to extend the relation. That's often more
597 * efficient than using write(), as it commonly won't cause the kernel
598 * to allocate page cache space for the extended pages.
599 *
600 * However, we don't use FileFallocate() for small extensions, as it
601 * defeats delayed allocation on some filesystems. Not clear where
602 * that decision should be made though? For now just use a cutoff of
603 * 8, anything between 4 and 8 worked OK in some local testing.
604 */
605 if (numblocks > 8)
606 {
607 int ret;
608
609 ret = FileFallocate(v->mdfd_vfd,
610 seekpos, (pgoff_t) BLCKSZ * numblocks,
612 if (ret != 0)
613 {
616 errmsg("could not extend file \"%s\" with FileFallocate(): %m",
618 errhint("Check free disk space."));
619 }
620 }
621 else
622 {
623 int ret;
624
625 /*
626 * Even if we don't want to use fallocate, we can still extend a
627 * bit more efficiently than writing each 8kB block individually.
628 * pg_pwrite_zeros() (via FileZero()) uses pg_pwritev_with_retry()
629 * to avoid multiple writes or needing a zeroed buffer for the
630 * whole length of the extension.
631 */
632 ret = FileZero(v->mdfd_vfd,
633 seekpos, (pgoff_t) BLCKSZ * numblocks,
635 if (ret < 0)
638 errmsg("could not extend file \"%s\": %m",
640 errhint("Check free disk space."));
641 }
642
643 if (!skipFsync && !SmgrIsTemp(reln))
644 register_dirty_segment(reln, forknum, v);
645
646 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
647
650 }
651}

References _mdfd_getseg(), _mdnblocks(), Assert, ereport, errcode(), errcode_for_file_access(), errhint(), errmsg(), ERROR, EXTENSION_CREATE, fb(), FileFallocate(), FilePathName(), FileZero(), InvalidBlockNumber, _MdfdVec::mdfd_vfd, mdnblocks(), register_dirty_segment(), relpath, and SmgrIsTemp.

◆ register_dirty_segment()

static void register_dirty_segment ( SMgrRelation  reln,
ForkNumber  forknum,
MdfdVec *  seg 
)
static

Definition at line 1507 of file md.c.

1508{
1509 FileTag tag;
1510
1511 INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno);
1512
1513 /* Temp relations should never be fsync'd */
1515
1516 if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
1517 {
1519
1521 (errmsg_internal("could not forward fsync request because request queue is full")));
1522
1524
1528 errmsg("could not fsync file \"%s\": %m",
1529 FilePathName(seg->mdfd_vfd))));
1530
1531 /*
1532 * We have no way of knowing if the current IOContext is
1533 * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
1534 * point, so count the fsync as being in the IOCONTEXT_NORMAL
1535 * IOContext. This is probably okay, because the number of backend
1536 * fsyncs doesn't say anything about the efficacy of the
1537 * BufferAccessStrategy. And counting both fsyncs done in
1538 * IOCONTEXT_NORMAL and IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] under
1539 * IOCONTEXT_NORMAL is likely clearer when investigating the number of
1540 * backend fsyncs.
1541 */
1543 IOOP_FSYNC, io_start, 1, 0);
1544 }
1545}

References Assert, data_sync_elevel(), DEBUG1, ereport, errcode_for_file_access(), errmsg(), errmsg_internal(), ERROR, fb(), FilePathName(), FileSync(), INIT_MD_FILETAG, IOCONTEXT_NORMAL, IOOBJECT_RELATION, IOOP_FSYNC, _MdfdVec::mdfd_segno, _MdfdVec::mdfd_vfd, pgstat_count_io_op_time(), pgstat_prepare_io_time(), RegisterSyncRequest(), SmgrIsTemp, SYNC_REQUEST, and track_io_timing.

Referenced by mdcreate(), mdextend(), mdregistersync(), mdtruncate(), mdwritev(), and mdzeroextend().

◆ register_forget_request()

static void register_forget_request ( RelFileLocatorBackend  rlocator,
ForkNumber  forknum,
BlockNumber  segno 
)
static

Definition at line 1568 of file md.c.

1570{
1571 FileTag tag;
1572
1573 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1574
1575 RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
1576}

References INIT_MD_FILETAG, RelFileLocatorBackend::locator, RegisterSyncRequest(), and SYNC_FORGET_REQUEST.

Referenced by mdunlinkfork().

◆ register_unlink_segment()

static void register_unlink_segment ( RelFileLocatorBackend  rlocator,
ForkNumber  forknum,
BlockNumber  segno 
)
static

Definition at line 1551 of file md.c.

1553{
1554 FileTag tag;
1555
1556 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1557
1558 /* Should never be used with temp relations */
1560
1561 RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
1562}

References Assert, INIT_MD_FILETAG, RelFileLocatorBackend::locator, RegisterSyncRequest(), RelFileLocatorBackendIsTemp, and SYNC_UNLINK_REQUEST.

Referenced by mdunlinkfork().

◆ StaticAssertDecl()

StaticAssertDecl ( RELSEG_SIZE > 0 && RELSEG_SIZE <= INT_MAX,
"RELSEG_SIZE must fit in an integer" 
)

Variable Documentation

◆ aio_md_readv_cb

const PgAioHandleCallbacks aio_md_readv_cb
Initial value:
= {
.complete_shared = md_readv_complete,
.report = md_readv_report,
}

Definition at line 169 of file md.c.

169 {
170 .complete_shared = md_readv_complete,
171 .report = md_readv_report,
172};

◆ MdCxt

MemoryContext MdCxt
static

Definition at line 97 of file md.c.

Referenced by _fdvec_resize(), and mdinit().