PostgreSQL Source Code git master
Loading...
Searching...
No Matches
md.c File Reference
#include "postgres.h"
#include <limits.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>
#include "access/xlogutils.h"
#include "commands/tablespace.h"
#include "common/file_utils.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "storage/aio.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/md.h"
#include "storage/relfilelocator.h"
#include "storage/smgr.h"
#include "storage/sync.h"
#include "utils/memutils.h"
Include dependency graph for md.c:

Go to the source code of this file.

Data Structures

struct  _MdfdVec
 
struct  MdPathStr
 

Macros

#define INIT_MD_FILETAG(a, xx_rlocator, xx_forknum, xx_segno)
 
#define EXTENSION_FAIL   (1 << 0)
 
#define EXTENSION_RETURN_NULL   (1 << 1)
 
#define EXTENSION_CREATE   (1 << 2)
 
#define EXTENSION_CREATE_RECOVERY   (1 << 3)
 
#define EXTENSION_DONT_OPEN   (1 << 5)
 
#define SEGMENT_CHARS   OIDCHARS
 
#define MD_PATH_STR_MAXLEN
 

Typedefs

typedef struct _MdfdVec MdfdVec
 
typedef struct MdPathStr MdPathStr
 

Functions

 StaticAssertDecl (RELSEG_SIZE > 0 && RELSEG_SIZE <= INT_MAX, "RELSEG_SIZE must fit in an integer")
 
static void mdunlinkfork (RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
 
static MdfdVec * mdopenfork (SMgrRelation reln, ForkNumber forknum, int behavior)
 
static void register_dirty_segment (SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 
static void register_unlink_segment (RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
 
static void register_forget_request (RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
 
static void _fdvec_resize (SMgrRelation reln, ForkNumber forknum, int nseg)
 
static MdPathStr _mdfd_segpath (SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
 
static MdfdVec * _mdfd_openseg (SMgrRelation reln, ForkNumber forknum, BlockNumber segno, int oflags)
 
static MdfdVec * _mdfd_getseg (SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior)
 
static BlockNumber _mdnblocks (SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 
static PgAioResult md_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 
static void md_readv_report (PgAioResult result, const PgAioTargetData *td, int elevel)
 
static int _mdfd_open_flags (void)
 
void mdinit (void)
 
bool mdexists (SMgrRelation reln, ForkNumber forknum)
 
void mdcreate (SMgrRelation reln, ForkNumber forknum, bool isRedo)
 
void mdunlink (RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
 
static int do_truncate (const char *path)
 
void mdextend (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
 
void mdzeroextend (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
 
void mdopen (SMgrRelation reln)
 
void mdclose (SMgrRelation reln, ForkNumber forknum)
 
bool mdprefetch (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
 
static int buffers_to_iovec (struct iovec *iov, void **buffers, int nblocks)
 
uint32 mdmaxcombine (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 
void mdreadv (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
 
void mdstartreadv (PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
 
void mdwritev (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync)
 
void mdwriteback (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
 
BlockNumber mdnblocks (SMgrRelation reln, ForkNumber forknum)
 
void mdtruncate (SMgrRelation reln, ForkNumber forknum, BlockNumber curnblk, BlockNumber nblocks)
 
void mdregistersync (SMgrRelation reln, ForkNumber forknum)
 
void mdimmedsync (SMgrRelation reln, ForkNumber forknum)
 
int mdfd (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
 
void ForgetDatabaseSyncRequests (Oid dbid)
 
void DropRelationFiles (RelFileLocator *delrels, int ndelrels, bool isRedo)
 
int mdsyncfiletag (const FileTag *ftag, char *path)
 
int mdunlinkfiletag (const FileTag *ftag, char *path)
 
bool mdfiletagmatches (const FileTag *ftag, const FileTag *candidate)
 

Variables

static MemoryContext MdCxt
 
const PgAioHandleCallbacks aio_md_readv_cb
 

Macro Definition Documentation

◆ EXTENSION_CREATE

#define EXTENSION_CREATE   (1 << 2)

Definition at line 117 of file md.c.

◆ EXTENSION_CREATE_RECOVERY

#define EXTENSION_CREATE_RECOVERY   (1 << 3)

Definition at line 119 of file md.c.

◆ EXTENSION_DONT_OPEN

#define EXTENSION_DONT_OPEN   (1 << 5)

Definition at line 121 of file md.c.

◆ EXTENSION_FAIL

#define EXTENSION_FAIL   (1 << 0)

Definition at line 113 of file md.c.

◆ EXTENSION_RETURN_NULL

#define EXTENSION_RETURN_NULL   (1 << 1)

Definition at line 115 of file md.c.

◆ INIT_MD_FILETAG

#define INIT_MD_FILETAG (   a,
  xx_rlocator,
  xx_forknum,
  xx_segno 
)
Value:
( \
memset(&(a), 0, sizeof(FileTag)), \
(a).handler = SYNC_HANDLER_MD, \
(a).rlocator = (xx_rlocator), \
(a).forknum = (xx_forknum), \
(a).segno = (xx_segno) \
)
int a
Definition isn.c:73
static int fb(int x)
Definition sync.h:51
@ SYNC_HANDLER_MD
Definition sync.h:37

Definition at line 101 of file md.c.

138{
139 char str[MD_PATH_STR_MAXLEN + 1];
140} MdPathStr;
141
142
143/* local routines */
144static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum,
145 bool isRedo);
146static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior);
148 MdfdVec *seg);
149static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum,
150 BlockNumber segno);
151static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum,
152 BlockNumber segno);
154 ForkNumber forknum,
155 int nseg);
157 BlockNumber segno);
159 BlockNumber segno, int oflags);
161 BlockNumber blkno, bool skipFsync, int behavior);
163 MdfdVec *seg);
164
166static void md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel);
167
170 .report = md_readv_report,
171};
172
173
174static inline int
176{
177 int flags = O_RDWR | PG_BINARY;
178
180 flags |= PG_O_DIRECT;
181
182 return flags;
183}
184
185/*
186 * mdinit() -- Initialize private state for magnetic disk storage manager.
187 */
188void
189mdinit(void)
190{
192 "MdSmgr",
194}
195
196/*
197 * mdexists() -- Does the physical file exist?
198 *
199 * Note: this will return true for lingering files, with pending deletions
200 */
201bool
203{
204 /*
205 * Close it first, to ensure that we notice if the fork has been unlinked
206 * since we opened it. As an optimization, we can skip that in recovery,
207 * which already closes relations when dropping them.
208 */
209 if (!InRecovery)
210 mdclose(reln, forknum);
211
212 return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL);
213}
214
215/*
216 * mdcreate() -- Create a new relation on magnetic disk.
217 *
218 * If isRedo is true, it's okay for the relation to exist already.
219 */
220void
222{
223 MdfdVec *mdfd;
224 RelPathStr path;
225 File fd;
226
227 if (isRedo && reln->md_num_open_segs[forknum] > 0)
228 return; /* created and opened already... */
229
230 Assert(reln->md_num_open_segs[forknum] == 0);
231
232 /*
233 * We may be using the target table space for the first time in this
234 * database, so create a per-database subdirectory if needed.
235 *
236 * XXX this is a fairly ugly violation of module layering, but this seems
237 * to be the best place to put the check. Maybe TablespaceCreateDbspace
238 * should be here and not in commands/tablespace.c? But that would imply
239 * importing a lot of stuff that smgr.c oughtn't know, either.
240 */
241 TablespaceCreateDbspace(reln->smgr_rlocator.locator.spcOid,
242 reln->smgr_rlocator.locator.dbOid,
243 isRedo);
244
245 path = relpath(reln->smgr_rlocator, forknum);
246
248
249 if (fd < 0)
250 {
251 int save_errno = errno;
252
253 if (isRedo)
255 if (fd < 0)
256 {
257 /* be sure to report the error reported by create, not open */
261 errmsg("could not create file \"%s\": %m", path.str)));
262 }
263 }
264
265 _fdvec_resize(reln, forknum, 1);
266 mdfd = &reln->md_seg_fds[forknum][0];
267 mdfd->mdfd_vfd = fd;
268 mdfd->mdfd_segno = 0;
269
270 if (!SmgrIsTemp(reln))
272}
273
274/*
275 * mdunlink() -- Unlink a relation.
276 *
277 * Note that we're passed a RelFileLocatorBackend --- by the time this is called,
278 * there won't be an SMgrRelation hashtable entry anymore.
279 *
280 * forknum can be a fork number to delete a specific fork, or InvalidForkNumber
281 * to delete all forks.
282 *
283 * For regular relations, we don't unlink the first segment file of the rel,
284 * but just truncate it to zero length, and record a request to unlink it after
285 * the next checkpoint. Additional segments can be unlinked immediately,
286 * however. Leaving the empty file in place prevents that relfilenumber
287 * from being reused. The scenario this protects us from is:
288 * 1. We delete a relation (and commit, and actually remove its file).
289 * 2. We create a new relation, which by chance gets the same relfilenumber as
290 * the just-deleted one (OIDs must've wrapped around for that to happen).
291 * 3. We crash before another checkpoint occurs.
292 * During replay, we would delete the file and then recreate it, which is fine
293 * if the contents of the file were repopulated by subsequent WAL entries.
294 * But if we didn't WAL-log insertions, but instead relied on fsyncing the
295 * file after populating it (as we do at wal_level=minimal), the contents of
296 * the file would be lost forever. By leaving the empty file until after the
297 * next checkpoint, we prevent reassignment of the relfilenumber until it's
298 * safe, because relfilenumber assignment skips over any existing file.
299 *
300 * Additional segments, if any, are truncated and then unlinked. The reason
301 * for truncating is that other backends may still hold open FDs for these at
302 * the smgr level, so that the kernel can't remove the file yet. We want to
303 * reclaim the disk space right away despite that.
304 *
305 * We do not need to go through this dance for temp relations, though, because
306 * we never make WAL entries for temp rels, and so a temp rel poses no threat
307 * to the health of a regular rel that has taken over its relfilenumber.
308 * The fact that temp rels and regular rels have different file naming
309 * patterns provides additional safety. Other backends shouldn't have open
310 * FDs for them, either.
311 *
312 * We also don't do it while performing a binary upgrade. There is no reuse
313 * hazard in that case, since after a crash or even a simple ERROR, the
314 * upgrade fails and the whole cluster must be recreated from scratch.
315 * Furthermore, it is important to remove the files from disk immediately,
316 * because we may be about to reuse the same relfilenumber.
317 *
318 * All the above applies only to the relation's main fork; other forks can
319 * just be removed immediately, since they are not needed to prevent the
320 * relfilenumber from being recycled. Also, we do not carefully
321 * track whether other forks have been created or not, but just attempt to
322 * unlink them unconditionally; so we should never complain about ENOENT.
323 *
324 * If isRedo is true, it's unsurprising for the relation to be already gone.
325 * Also, we should remove the file immediately instead of queuing a request
326 * for later, since during redo there's no possibility of creating a
327 * conflicting relation.
328 *
329 * Note: we currently just never warn about ENOENT at all. We could warn in
330 * the main-fork, non-isRedo case, but it doesn't seem worth the trouble.
331 *
332 * Note: any failure should be reported as WARNING not ERROR, because
333 * we are usually not in a transaction anymore when this is called.
334 */
335void
336mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
337{
338 /* Now do the per-fork work */
339 if (forknum == InvalidForkNumber)
340 {
341 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
342 mdunlinkfork(rlocator, forknum, isRedo);
343 }
344 else
345 mdunlinkfork(rlocator, forknum, isRedo);
346}
347
348/*
349 * Truncate a file to release disk space.
350 */
351static int
352do_truncate(const char *path)
353{
354 int save_errno;
355 int ret;
356
357 ret = pg_truncate(path, 0);
358
359 /* Log a warning here to avoid repetition in callers. */
360 if (ret < 0 && errno != ENOENT)
361 {
365 errmsg("could not truncate file \"%s\": %m", path)));
367 }
368
369 return ret;
370}
371
372static void
374{
375 RelPathStr path;
376 int ret;
377 int save_errno;
378
379 path = relpath(rlocator, forknum);
380
381 /*
382 * Truncate and then unlink the first segment, or just register a request
383 * to unlink it later, as described in the comments for mdunlink().
384 */
385 if (isRedo || IsBinaryUpgrade || forknum != MAIN_FORKNUM ||
387 {
388 if (!RelFileLocatorBackendIsTemp(rlocator))
389 {
390 /* Prevent other backends' fds from holding on to the disk space */
391 ret = do_truncate(path.str);
392
393 /* Forget any pending sync requests for the first segment */
395 register_forget_request(rlocator, forknum, 0 /* first seg */ );
397 }
398 else
399 ret = 0;
400
401 /* Next unlink the file, unless it was already found to be missing */
402 if (ret >= 0 || errno != ENOENT)
403 {
404 ret = unlink(path.str);
405 if (ret < 0 && errno != ENOENT)
406 {
410 errmsg("could not remove file \"%s\": %m", path.str)));
412 }
413 }
414 }
415 else
416 {
417 /* Prevent other backends' fds from holding on to the disk space */
418 ret = do_truncate(path.str);
419
420 /* Register request to unlink first segment later */
422 register_unlink_segment(rlocator, forknum, 0 /* first seg */ );
424 }
425
426 /*
427 * Delete any additional segments.
428 *
429 * Note that because we loop until getting ENOENT, we will correctly
430 * remove all inactive segments as well as active ones. Ideally we'd
431 * continue the loop until getting exactly that errno, but that risks an
432 * infinite loop if the problem is directory-wide (for instance, if we
433 * suddenly can't read the data directory itself). We compromise by
434 * continuing after a non-ENOENT truncate error, but stopping after any
435 * unlink error. If there is indeed a directory-wide problem, additional
436 * unlink attempts wouldn't work anyway.
437 */
438 if (ret >= 0 || errno != ENOENT)
439 {
441 BlockNumber segno;
442
443 for (segno = 1;; segno++)
444 {
445 sprintf(segpath.str, "%s.%u", path.str, segno);
446
447 if (!RelFileLocatorBackendIsTemp(rlocator))
448 {
449 /*
450 * Prevent other backends' fds from holding on to the disk
451 * space. We're done if we see ENOENT, though.
452 */
453 if (do_truncate(segpath.str) < 0 && errno == ENOENT)
454 break;
455
456 /*
457 * Forget any pending sync requests for this segment before we
458 * try to unlink.
459 */
460 register_forget_request(rlocator, forknum, segno);
461 }
462
463 if (unlink(segpath.str) < 0)
464 {
465 /* ENOENT is expected after the last segment... */
466 if (errno != ENOENT)
469 errmsg("could not remove file \"%s\": %m", segpath.str)));
470 break;
471 }
472 }
473 }
474}
475
476/*
477 * mdextend() -- Add a block to the specified relation.
478 *
479 * The semantics are nearly the same as mdwrite(): write at the
480 * specified position. However, this is to be used for the case of
481 * extending a relation (i.e., blocknum is at or beyond the current
482 * EOF). Note that we assume writing a block beyond current EOF
483 * causes intervening file space to become filled with zeroes.
484 */
485void
487 const void *buffer, bool skipFsync)
488{
489 pgoff_t seekpos;
490 int nbytes;
491 MdfdVec *v;
492
493 /* If this build supports direct I/O, the buffer must be I/O aligned. */
494 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
495 Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
496
497 /* This assert is too expensive to have on normally ... */
498#ifdef CHECK_WRITE_VS_EXTEND
499 Assert(blocknum >= mdnblocks(reln, forknum));
500#endif
501
502 /*
503 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
504 * more --- we mustn't create a block whose number actually is
505 * InvalidBlockNumber. (Note that this failure should be unreachable
506 * because of upstream checks in bufmgr.c.)
507 */
508 if (blocknum == InvalidBlockNumber)
511 errmsg("cannot extend file \"%s\" beyond %u blocks",
512 relpath(reln->smgr_rlocator, forknum).str,
514
515 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
516
517 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
518
519 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
520
521 if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
522 {
523 if (nbytes < 0)
526 errmsg("could not extend file \"%s\": %m",
528 errhint("Check free disk space.")));
529 /* short write: complain appropriately */
532 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
534 nbytes, BLCKSZ, blocknum),
535 errhint("Check free disk space.")));
536 }
537
538 if (!skipFsync && !SmgrIsTemp(reln))
539 register_dirty_segment(reln, forknum, v);
540
541 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
542}
543
544/*
545 * mdzeroextend() -- Add new zeroed out blocks to the specified relation.
546 *
547 * Similar to mdextend(), except the relation can be extended by multiple
548 * blocks at once and the added blocks will be filled with zeroes.
549 */
550void
552 BlockNumber blocknum, int nblocks, bool skipFsync)
553{
554 MdfdVec *v;
555 BlockNumber curblocknum = blocknum;
556 int remblocks = nblocks;
557
558 Assert(nblocks > 0);
559
560 /* This assert is too expensive to have on normally ... */
561#ifdef CHECK_WRITE_VS_EXTEND
562 Assert(blocknum >= mdnblocks(reln, forknum));
563#endif
564
565 /*
566 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
567 * more --- we mustn't create a block whose number actually is
568 * InvalidBlockNumber or larger.
569 */
570 if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
573 errmsg("cannot extend file \"%s\" beyond %u blocks",
574 relpath(reln->smgr_rlocator, forknum).str,
576
577 while (remblocks > 0)
578 {
580 pgoff_t seekpos = (pgoff_t) BLCKSZ * segstartblock;
581 int numblocks;
582
585 else
587
589
592
593 /*
594 * If available and useful, use posix_fallocate() (via
595 * FileFallocate()) to extend the relation. That's often more
596 * efficient than using write(), as it commonly won't cause the kernel
597 * to allocate page cache space for the extended pages.
598 *
599 * However, we don't use FileFallocate() for small extensions, as it
600 * defeats delayed allocation on some filesystems. Not clear where
601 * that decision should be made though? For now just use a cutoff of
602 * 8, anything between 4 and 8 worked OK in some local testing.
603 */
604 if (numblocks > 8 &&
606 {
607 int ret = 0;
608
609#ifdef HAVE_POSIX_FALLOCATE
611 {
612 ret = FileFallocate(v->mdfd_vfd,
613 seekpos, (pgoff_t) BLCKSZ * numblocks,
615 }
616 else
617#endif
618 {
619 elog(ERROR, "unsupported file_extend_method: %d",
621 }
622 if (ret != 0)
623 {
626 errmsg("could not extend file \"%s\" with FileFallocate(): %m",
628 errhint("Check free disk space."));
629 }
630 }
631 else
632 {
633 int ret;
634
635 /*
636 * Even if we don't want to use fallocate, we can still extend a
637 * bit more efficiently than writing each 8kB block individually.
638 * pg_pwrite_zeros() (via FileZero()) uses pg_pwritev_with_retry()
639 * to avoid multiple writes or needing a zeroed buffer for the
640 * whole length of the extension.
641 */
642 ret = FileZero(v->mdfd_vfd,
643 seekpos, (pgoff_t) BLCKSZ * numblocks,
645 if (ret < 0)
648 errmsg("could not extend file \"%s\": %m",
650 errhint("Check free disk space."));
651 }
652
653 if (!skipFsync && !SmgrIsTemp(reln))
654 register_dirty_segment(reln, forknum, v);
655
656 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
657
660 }
661}
662
663/*
664 * mdopenfork() -- Open one fork of the specified relation.
665 *
666 * Note we only open the first segment, when there are multiple segments.
667 *
668 * If first segment is not present, either ereport or return NULL according
669 * to "behavior". We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
670 * EXTENSION_CREATE means it's OK to extend an existing relation, not to
671 * invent one out of whole cloth.
672 */
673static MdfdVec *
674mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
675{
676 MdfdVec *mdfd;
677 RelPathStr path;
678 File fd;
679
680 /* No work if already open */
681 if (reln->md_num_open_segs[forknum] > 0)
682 return &reln->md_seg_fds[forknum][0];
683
684 path = relpath(reln->smgr_rlocator, forknum);
685
687
688 if (fd < 0)
689 {
690 if ((behavior & EXTENSION_RETURN_NULL) &&
692 return NULL;
695 errmsg("could not open file \"%s\": %m", path.str)));
696 }
697
698 _fdvec_resize(reln, forknum, 1);
699 mdfd = &reln->md_seg_fds[forknum][0];
700 mdfd->mdfd_vfd = fd;
701 mdfd->mdfd_segno = 0;
702
704
705 return mdfd;
706}
707
708/*
709 * mdopen() -- Initialize newly-opened relation.
710 */
711void
713{
714 /* mark it not open */
715 for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
716 reln->md_num_open_segs[forknum] = 0;
717}
718
719/*
720 * mdclose() -- Close the specified relation, if it isn't closed already.
721 */
722void
724{
725 int nopensegs = reln->md_num_open_segs[forknum];
726
727 /* No work if already closed */
728 if (nopensegs == 0)
729 return;
730
731 /* close segments starting from the end */
732 while (nopensegs > 0)
733 {
734 MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1];
735
737 _fdvec_resize(reln, forknum, nopensegs - 1);
738 nopensegs--;
739 }
740}
741
742/*
743 * mdprefetch() -- Initiate asynchronous read of the specified blocks of a relation
744 */
745bool
747 int nblocks)
748{
749#ifdef USE_PREFETCH
750
752
753 if ((uint64) blocknum + nblocks > (uint64) MaxBlockNumber + 1)
754 return false;
755
756 while (nblocks > 0)
757 {
758 pgoff_t seekpos;
759 MdfdVec *v;
761
762 v = _mdfd_getseg(reln, forknum, blocknum, false,
764 if (v == NULL)
765 return false;
766
767 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
768
769 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
770
772 Min(nblocks,
773 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
774
777
778 blocknum += nblocks_this_segment;
779 nblocks -= nblocks_this_segment;
780 }
781#endif /* USE_PREFETCH */
782
783 return true;
784}
785
786/*
787 * Convert an array of buffer address into an array of iovec objects, and
788 * return the number that were required. 'iov' must have enough space for up
789 * to 'nblocks' elements, but the number used may be less depending on
790 * merging. In the case of a run of fully contiguous buffers, a single iovec
791 * will be populated that can be handled as a plain non-vectored I/O.
792 */
793static int
794buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks)
795{
796 struct iovec *iovp;
797 int iovcnt;
798
799 Assert(nblocks >= 1);
800
801 /* If this build supports direct I/O, buffers must be I/O aligned. */
802 for (int i = 0; i < nblocks; ++i)
803 {
804 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
805 Assert((uintptr_t) buffers[i] ==
806 TYPEALIGN(PG_IO_ALIGN_SIZE, buffers[i]));
807 }
808
809 /* Start the first iovec off with the first buffer. */
810 iovp = &iov[0];
811 iovp->iov_base = buffers[0];
812 iovp->iov_len = BLCKSZ;
813 iovcnt = 1;
814
815 /* Try to merge the rest. */
816 for (int i = 1; i < nblocks; ++i)
817 {
818 void *buffer = buffers[i];
819
820 if (((char *) iovp->iov_base + iovp->iov_len) == buffer)
821 {
822 /* Contiguous with the last iovec. */
823 iovp->iov_len += BLCKSZ;
824 }
825 else
826 {
827 /* Need a new iovec. */
828 iovp++;
829 iovp->iov_base = buffer;
830 iovp->iov_len = BLCKSZ;
831 iovcnt++;
832 }
833 }
834
835 return iovcnt;
836}
837
838/*
839 * mdmaxcombine() -- Return the maximum number of total blocks that can be
840 * combined with an IO starting at blocknum.
841 */
842uint32
844 BlockNumber blocknum)
845{
846 BlockNumber segoff;
847
848 segoff = blocknum % ((BlockNumber) RELSEG_SIZE);
849
850 return RELSEG_SIZE - segoff;
851}
852
853/*
854 * mdreadv() -- Read the specified blocks from a relation.
855 */
856void
858 void **buffers, BlockNumber nblocks)
859{
860 while (nblocks > 0)
861 {
862 struct iovec iov[PG_IOV_MAX];
863 int iovcnt;
864 pgoff_t seekpos;
865 int nbytes;
866 MdfdVec *v;
869 size_t size_this_segment;
870
871 v = _mdfd_getseg(reln, forknum, blocknum, false,
873
874 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
875
876 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
877
879 Min(nblocks,
880 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
882
883 if (nblocks_this_segment != nblocks)
884 elog(ERROR, "read crosses segment boundary");
885
889
890 /*
891 * Inner loop to continue after a short read. We'll keep going until
892 * we hit EOF rather than assuming that a short read means we hit the
893 * end.
894 */
895 for (;;)
896 {
897 TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
898 reln->smgr_rlocator.locator.spcOid,
899 reln->smgr_rlocator.locator.dbOid,
900 reln->smgr_rlocator.locator.relNumber,
901 reln->smgr_rlocator.backend);
902 nbytes = FileReadV(v->mdfd_vfd, iov, iovcnt, seekpos,
904 TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
905 reln->smgr_rlocator.locator.spcOid,
906 reln->smgr_rlocator.locator.dbOid,
907 reln->smgr_rlocator.locator.relNumber,
908 reln->smgr_rlocator.backend,
909 nbytes,
911
912#ifdef SIMULATE_SHORT_READ
913 nbytes = Min(nbytes, 4096);
914#endif
915
916 if (nbytes < 0)
919 errmsg("could not read blocks %u..%u in file \"%s\": %m",
920 blocknum,
921 blocknum + nblocks_this_segment - 1,
922 FilePathName(v->mdfd_vfd))));
923
924 if (nbytes == 0)
925 {
926 /*
927 * We are at or past EOF, or we read a partial block at EOF.
928 * Normally this is an error; upper levels should never try to
929 * read a nonexistent block. However, if zero_damaged_pages
930 * is ON or we are InRecovery, we should instead return zeroes
931 * without complaining. This allows, for example, the case of
932 * trying to update a block that was later truncated away.
933 *
934 * NB: We think that this codepath is unreachable in recovery
935 * and incomplete with zero_damaged_pages, as missing segments
936 * are not created. Putting blocks into the buffer-pool that
937 * do not exist on disk is rather problematic, as it will not
938 * be found by scans that rely on smgrnblocks(), as they are
939 * beyond EOF. It also can cause weird problems with relation
940 * extension, as relation extension does not expect blocks
941 * beyond EOF to exist.
942 *
943 * Therefore we do not want to copy the logic into
944 * mdstartreadv(), where it would have to be more complicated
945 * due to potential differences in the zero_damaged_pages
946 * setting between the definer and completor of IO.
947 *
948 * For PG 18, we are putting an Assert(false) in mdreadv()
949 * (triggering failures in assertion-enabled builds, but
950 * continuing to work in production builds). Afterwards we
951 * plan to remove this code entirely.
952 */
954 {
955 Assert(false); /* see comment above */
956
959 ++i)
960 memset(buffers[i], 0, BLCKSZ);
961 break;
962 }
963 else
966 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
967 blocknum,
968 blocknum + nblocks_this_segment - 1,
972 }
973
974 /* One loop should usually be enough. */
975 transferred_this_segment += nbytes;
978 break;
979
980 /* Adjust position and vectors after a short read. */
981 seekpos += nbytes;
983 }
984
985 nblocks -= nblocks_this_segment;
986 buffers += nblocks_this_segment;
987 blocknum += nblocks_this_segment;
988 }
989}
990
991/*
992 * mdstartreadv() -- Asynchronous version of mdreadv().
993 */
994void
996 SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
997 void **buffers, BlockNumber nblocks)
998{
999 pgoff_t seekpos;
1000 MdfdVec *v;
1002 struct iovec *iov;
1003 int iovcnt;
1004 int ret;
1005
1006 v = _mdfd_getseg(reln, forknum, blocknum, false,
1008
1009 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1010
1011 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1012
1014 Min(nblocks,
1015 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
1016
1017 if (nblocks_this_segment != nblocks)
1018 elog(ERROR, "read crossing segment boundary");
1019
1021
1022 Assert(nblocks <= iovcnt);
1023
1025
1027
1030
1032 reln,
1033 forknum,
1034 blocknum,
1035 nblocks,
1036 false);
1038
1040 if (ret != 0)
1041 ereport(ERROR,
1043 errmsg("could not start reading blocks %u..%u in file \"%s\": %m",
1044 blocknum,
1045 blocknum + nblocks_this_segment - 1,
1046 FilePathName(v->mdfd_vfd))));
1047
1048 /*
1049 * The error checks corresponding to the post-read checks in mdreadv() are
1050 * in md_readv_complete().
1051 *
1052 * However we chose, at least for now, to not implement the
1053 * zero_damaged_pages logic present in mdreadv(). As outlined in mdreadv()
1054 * that logic is rather problematic, and we want to get rid of it. Here
1055 * equivalent logic would have to be more complicated due to potential
1056 * differences in the zero_damaged_pages setting between the definer and
1057 * completor of IO.
1058 */
1059}
1060
1061/*
1062 * mdwritev() -- Write the supplied blocks at the appropriate location.
1063 *
1064 * This is to be used only for updating already-existing blocks of a
1065 * relation (ie, those before the current EOF). To extend a relation,
1066 * use mdextend().
1067 */
1068void
1070 const void **buffers, BlockNumber nblocks, bool skipFsync)
1071{
1072 /* This assert is too expensive to have on normally ... */
1073#ifdef CHECK_WRITE_VS_EXTEND
1074 Assert((uint64) blocknum + (uint64) nblocks <= (uint64) mdnblocks(reln, forknum));
1075#endif
1076
1077 while (nblocks > 0)
1078 {
1079 struct iovec iov[PG_IOV_MAX];
1080 int iovcnt;
1081 pgoff_t seekpos;
1082 int nbytes;
1083 MdfdVec *v;
1086 size_t size_this_segment;
1087
1088 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
1090
1091 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1092
1093 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1094
1096 Min(nblocks,
1097 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
1099
1100 if (nblocks_this_segment != nblocks)
1101 elog(ERROR, "write crosses segment boundary");
1102
1103 iovcnt = buffers_to_iovec(iov, (void **) buffers, nblocks_this_segment);
1106
1107 /*
1108 * Inner loop to continue after a short write. If the reason is that
1109 * we're out of disk space, a future attempt should get an ENOSPC
1110 * error from the kernel.
1111 */
1112 for (;;)
1113 {
1114 TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
1115 reln->smgr_rlocator.locator.spcOid,
1116 reln->smgr_rlocator.locator.dbOid,
1117 reln->smgr_rlocator.locator.relNumber,
1118 reln->smgr_rlocator.backend);
1119 nbytes = FileWriteV(v->mdfd_vfd, iov, iovcnt, seekpos,
1121 TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
1122 reln->smgr_rlocator.locator.spcOid,
1123 reln->smgr_rlocator.locator.dbOid,
1124 reln->smgr_rlocator.locator.relNumber,
1125 reln->smgr_rlocator.backend,
1126 nbytes,
1128
1129#ifdef SIMULATE_SHORT_WRITE
1130 nbytes = Min(nbytes, 4096);
1131#endif
1132
1133 if (nbytes < 0)
1134 {
1135 bool enospc = errno == ENOSPC;
1136
1137 ereport(ERROR,
1139 errmsg("could not write blocks %u..%u in file \"%s\": %m",
1140 blocknum,
1141 blocknum + nblocks_this_segment - 1,
1143 enospc ? errhint("Check free disk space.") : 0));
1144 }
1145
1146 /* One loop should usually be enough. */
1147 transferred_this_segment += nbytes;
1150 break;
1151
1152 /* Adjust position and iovecs after a short write. */
1153 seekpos += nbytes;
1155 }
1156
1157 if (!skipFsync && !SmgrIsTemp(reln))
1158 register_dirty_segment(reln, forknum, v);
1159
1160 nblocks -= nblocks_this_segment;
1161 buffers += nblocks_this_segment;
1162 blocknum += nblocks_this_segment;
1163 }
1164}
1165
1166
1167/*
1168 * mdwriteback() -- Tell the kernel to write pages back to storage.
1169 *
1170 * This accepts a range of blocks because flushing several pages at once is
1171 * considerably more efficient than doing so individually.
1172 */
1173void
1175 BlockNumber blocknum, BlockNumber nblocks)
1176{
1178
1179 /*
1180 * Issue flush requests in as few requests as possible; have to split at
1181 * segment boundaries though, since those are actually separate files.
1182 */
1183 while (nblocks > 0)
1184 {
1185 BlockNumber nflush = nblocks;
1186 pgoff_t seekpos;
1187 MdfdVec *v;
1188 int segnum_start,
1189 segnum_end;
1190
1191 v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
1193
1194 /*
1195 * We might be flushing buffers of already removed relations, that's
1196 * ok, just ignore that case. If the segment file wasn't open already
1197 * (ie from a recent mdwrite()), then we don't want to re-open it, to
1198 * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave
1199 * us with a descriptor to a file that is about to be unlinked.
1200 */
1201 if (!v)
1202 return;
1203
1204 /* compute the segment number holding the first block to flush */
1205 segnum_start = blocknum / RELSEG_SIZE;
1206
1207 /* compute number of desired writes within the current segment */
1208 segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
1209 if (segnum_start != segnum_end)
1210 nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
1211
1212 Assert(nflush >= 1);
1213 Assert(nflush <= nblocks);
1214
1215 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1216
1218
1219 nblocks -= nflush;
1220 blocknum += nflush;
1221 }
1222}
1223
1224/*
1225 * mdnblocks() -- Get the number of blocks stored in a relation.
1226 *
1227 * Important side effect: all active segments of the relation are opened
1228 * and added to the md_seg_fds array. If this routine has not been
1229 * called, then only segments up to the last one actually touched
1230 * are present in the array.
1231 */
1234{
1235 MdfdVec *v;
1236 BlockNumber nblocks;
1237 BlockNumber segno;
1238
1239 mdopenfork(reln, forknum, EXTENSION_FAIL);
1240
1241 /* mdopenfork() has opened the first segment */
1242 Assert(reln->md_num_open_segs[forknum] > 0);
1243
1244 /*
1245 * Start from the last open segment, to avoid redundant seeks. We have
1246 * previously verified that these segments are exactly RELSEG_SIZE long,
1247 * and it's useless to recheck that each time.
1248 *
1249 * NOTE: this assumption could only be wrong if another backend has
1250 * truncated the relation. We rely on higher code levels to handle that
1251 * scenario by closing and re-opening the md fd, which is handled via
1252 * relcache flush. (Since the checkpointer doesn't participate in
1253 * relcache flush, it could have segment entries for inactive segments;
1254 * that's OK because the checkpointer never needs to compute relation
1255 * size.)
1256 */
1257 segno = reln->md_num_open_segs[forknum] - 1;
1258 v = &reln->md_seg_fds[forknum][segno];
1259
1260 for (;;)
1261 {
1262 nblocks = _mdnblocks(reln, forknum, v);
1263 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1264 elog(FATAL, "segment too big");
1265 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1266 return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
1267
1268 /*
1269 * If segment is exactly RELSEG_SIZE, advance to next one.
1270 */
1271 segno++;
1272
1273 /*
1274 * We used to pass O_CREAT here, but that has the disadvantage that it
1275 * might create a segment which has vanished through some operating
1276 * system misadventure. In such a case, creating the segment here
1277 * undermines _mdfd_getseg's attempts to notice and report an error
1278 * upon access to a missing segment.
1279 */
1280 v = _mdfd_openseg(reln, forknum, segno, 0);
1281 if (v == NULL)
1282 return segno * ((BlockNumber) RELSEG_SIZE);
1283 }
1284}
1285
1286/*
1287 * mdtruncate() -- Truncate relation to specified number of blocks.
1288 *
1289 * Guaranteed not to allocate memory, so it can be used in a critical section.
1290 * Caller must have called smgrnblocks() to obtain curnblk while holding a
1291 * sufficient lock to prevent a change in relation size, and not used any smgr
1292 * functions for this relation or handled interrupts in between. This makes
1293 * sure we have opened all active segments, so that truncate loop will get
1294 * them all!
1295 *
1296 * If nblocks > curnblk, the request is ignored when we are InRecovery,
1297 * otherwise, an error is raised.
1298 */
1299void
1302{
1304 int curopensegs;
1305
1306 if (nblocks > curnblk)
1307 {
1308 /* Bogus request ... but no complaint if InRecovery */
1309 if (InRecovery)
1310 return;
1311 ereport(ERROR,
1312 (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
1313 relpath(reln->smgr_rlocator, forknum).str,
1314 nblocks, curnblk)));
1315 }
1316 if (nblocks == curnblk)
1317 return; /* no work */
1318
1319 /*
1320 * Truncate segments, starting at the last one. Starting at the end makes
1321 * managing the memory for the fd array easier, should there be errors.
1322 */
1323 curopensegs = reln->md_num_open_segs[forknum];
1324 while (curopensegs > 0)
1325 {
1326 MdfdVec *v;
1327
1329
1330 v = &reln->md_seg_fds[forknum][curopensegs - 1];
1331
1332 if (priorblocks > nblocks)
1333 {
1334 /*
1335 * This segment is no longer active. We truncate the file, but do
1336 * not delete it, for reasons explained in the header comments.
1337 */
1339 ereport(ERROR,
1341 errmsg("could not truncate file \"%s\": %m",
1342 FilePathName(v->mdfd_vfd))));
1343
1344 if (!SmgrIsTemp(reln))
1345 register_dirty_segment(reln, forknum, v);
1346
1347 /* we never drop the 1st segment */
1348 Assert(v != &reln->md_seg_fds[forknum][0]);
1349
1350 FileClose(v->mdfd_vfd);
1351 _fdvec_resize(reln, forknum, curopensegs - 1);
1352 }
1353 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
1354 {
1355 /*
1356 * This is the last segment we want to keep. Truncate the file to
1357 * the right length. NOTE: if nblocks is exactly a multiple K of
1358 * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
1359 * keep it. This adheres to the invariant given in the header
1360 * comments.
1361 */
1363
1365 ereport(ERROR,
1367 errmsg("could not truncate file \"%s\" to %u blocks: %m",
1369 nblocks)));
1370 if (!SmgrIsTemp(reln))
1371 register_dirty_segment(reln, forknum, v);
1372 }
1373 else
1374 {
1375 /*
1376 * We still need this segment, so nothing to do for this and any
1377 * earlier segment.
1378 */
1379 break;
1380 }
1381 curopensegs--;
1382 }
1383}
1384
1385/*
1386 * mdregistersync() -- Mark whole relation as needing fsync
1387 */
1388void
1390{
1391 int segno;
1392 int min_inactive_seg;
1393
1394 /*
1395 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1396 * the loop below will get them all!
1397 */
1398 mdnblocks(reln, forknum);
1399
1400 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1401
1402 /*
1403 * Temporarily open inactive segments, then close them after sync. Some
1404 * inactive segments may be left open after an error, but that is
1405 * harmless. We don't bother to clean them up, since doing so would risk
1406 * further trouble; the next mdclose() will soon close them.
1407 */
1408 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1409 segno++;
1410
1411 while (segno > 0)
1412 {
1413 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1414
1415 register_dirty_segment(reln, forknum, v);
1416
1417 /* Close inactive segments immediately */
1418 if (segno > min_inactive_seg)
1419 {
1420 FileClose(v->mdfd_vfd);
1421 _fdvec_resize(reln, forknum, segno - 1);
1422 }
1423
1424 segno--;
1425 }
1426}
1427
1428/*
1429 * mdimmedsync() -- Immediately sync a relation to stable storage.
1430 *
1431 * Note that only writes already issued are synced; this routine knows
1432 * nothing of dirty buffers that may exist inside the buffer manager. We
1433 * sync active and inactive segments; smgrDoPendingSyncs() relies on this.
1434 * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of
1435 * some segment, then mdtruncate() renders that segment inactive. If we
1436 * crash before the next checkpoint syncs the newly-inactive segment, that
1437 * segment may survive recovery, reintroducing unwanted data into the table.
1438 */
1439void
1441{
1442 int segno;
1443 int min_inactive_seg;
1444
1445 /*
1446 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1447 * the loop below will get them all!
1448 */
1449 mdnblocks(reln, forknum);
1450
1451 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1452
1453 /*
1454 * Temporarily open inactive segments, then close them after sync. Some
1455 * inactive segments may be left open after an fsync() error, but that
1456 * is harmless. We don't bother to clean them up, since doing so would
1457 * risk further trouble; the next mdclose() will soon close them.
1458 */
1459 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1460 segno++;
1461
1462 while (segno > 0)
1463 {
1464 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1465
1466 /*
1467 * fsyncs done through mdimmedsync() should be tracked in a different
1468 * IOContext from those done through mdsyncfiletag(), to differentiate
1469 * between unavoidable client backend fsyncs (e.g. those done during
1470 * index build) and those which ideally would have been done by the
1471 * checkpointer. Since other IO operations bypassing the buffer
1472 * manager could also be tracked in such an IOContext, hold off on
1473 * tracking immediate fsyncs until those operations are tracked too.
1474 */
1478 errmsg("could not fsync file \"%s\": %m",
1479 FilePathName(v->mdfd_vfd))));
1480
1481 /* Close inactive segments immediately */
1482 if (segno > min_inactive_seg)
1483 {
1484 FileClose(v->mdfd_vfd);
1485 _fdvec_resize(reln, forknum, segno - 1);
1486 }
1487
1488 segno--;
1489 }
1490}
1491
1492int
1493mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
1494{
1495 MdfdVec *v = mdopenfork(reln, forknum, EXTENSION_FAIL);
1496
1497 v = _mdfd_getseg(reln, forknum, blocknum, false,
1499
1500 *off = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1501
1502 Assert(*off < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1503
1504 return FileGetRawDesc(v->mdfd_vfd);
1505}
1506
1507/*
1508 * register_dirty_segment() -- Mark a relation segment as needing fsync
1509 *
1510 * If there is a local pending-ops table, just make an entry in it for
1511 * ProcessSyncRequests to process later. Otherwise, try to pass off the
1512 * fsync request to the checkpointer process. If that fails, just do the
1513 * fsync locally before returning (we hope this will not happen often
1514 * enough to be a performance problem).
1515 */
1516static void
1518{
1519 FileTag tag;
1520
1521 INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno);
1522
1523 /* Temp relations should never be fsync'd */
1525
1526 if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
1527 {
1529
1531 (errmsg_internal("could not forward fsync request because request queue is full")));
1532
1534
1538 errmsg("could not fsync file \"%s\": %m",
1539 FilePathName(seg->mdfd_vfd))));
1540
1541 /*
1542 * We have no way of knowing if the current IOContext is
1543 * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
1544 * point, so count the fsync as being in the IOCONTEXT_NORMAL
1545 * IOContext. This is probably okay, because the number of backend
1546 * fsyncs doesn't say anything about the efficacy of the
1547 * BufferAccessStrategy. And counting both fsyncs done in
1548 * IOCONTEXT_NORMAL and IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] under
1549 * IOCONTEXT_NORMAL is likely clearer when investigating the number of
1550 * backend fsyncs.
1551 */
1553 IOOP_FSYNC, io_start, 1, 0);
1554 }
1555}
1556
1557/*
1558 * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint
1559 */
1560static void
1562 BlockNumber segno)
1563{
1564 FileTag tag;
1565
1566 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1567
1568 /* Should never be used with temp relations */
1570
1571 RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
1572}
1573
1574/*
1575 * register_forget_request() -- forget any fsyncs for a relation fork's segment
1576 */
1577static void
1579 BlockNumber segno)
1580{
1581 FileTag tag;
1582
1583 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1584
1585 RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
1586}
1587
1588/*
1589 * ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB
1590 */
1591void
1593{
1594 FileTag tag;
1595 RelFileLocator rlocator;
1596
1597 rlocator.dbOid = dbid;
1598 rlocator.spcOid = 0;
1599 rlocator.relNumber = 0;
1600
1602
1603 RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
1604}
1605
1606/*
1607 * DropRelationFiles -- drop files of all given relations
1608 */
1609void
1611{
1613 int i;
1614
1616 for (i = 0; i < ndelrels; i++)
1617 {
1619
1620 if (isRedo)
1621 {
1623
1624 for (fork = 0; fork <= MAX_FORKNUM; fork++)
1626 }
1627 srels[i] = srel;
1628 }
1629
1631
1632 for (i = 0; i < ndelrels; i++)
1633 smgrclose(srels[i]);
1634 pfree(srels);
1635}
1636
1637
1638/*
1639 * _fdvec_resize() -- Resize the fork's open segments array
1640 */
1641static void
1643 ForkNumber forknum,
1644 int nseg)
1645{
1646 if (nseg == 0)
1647 {
1648 if (reln->md_num_open_segs[forknum] > 0)
1649 {
1650 pfree(reln->md_seg_fds[forknum]);
1651 reln->md_seg_fds[forknum] = NULL;
1652 }
1653 }
1654 else if (reln->md_num_open_segs[forknum] == 0)
1655 {
1656 reln->md_seg_fds[forknum] =
1658 }
1659 else if (nseg > reln->md_num_open_segs[forknum])
1660 {
1661 /*
1662 * It doesn't seem worthwhile complicating the code to amortize
1663 * repalloc() calls. Those are far faster than PathNameOpenFile() or
1664 * FileClose(), and the memory context internally will sometimes avoid
1665 * doing an actual reallocation.
1666 */
1667 reln->md_seg_fds[forknum] =
1668 repalloc(reln->md_seg_fds[forknum],
1669 sizeof(MdfdVec) * nseg);
1670 }
1671 else
1672 {
1673 /*
1674 * We don't reallocate a smaller array, because we want mdtruncate()
1675 * to be able to promise that it won't allocate memory, so that it is
1676 * allowed in a critical section. This means that a bit of space in
1677 * the array is now wasted, until the next time we add a segment and
1678 * reallocate.
1679 */
1680 }
1681
1682 reln->md_num_open_segs[forknum] = nseg;
1683}
1684
1685/*
1686 * Return the filename for the specified segment of the relation. The
1687 * path is returned by value inside an MdPathStr, so there is nothing to free.
1688 */
1689static MdPathStr
1691{
1692 RelPathStr path;
1693 MdPathStr fullpath;
1694
1695 path = relpath(reln->smgr_rlocator, forknum);
1696
1697 if (segno > 0)
1698 sprintf(fullpath.str, "%s.%u", path.str, segno);
1699 else
1700 strcpy(fullpath.str, path.str);
1701
1702 return fullpath;
1703}
1704
1705/*
1706 * Open the specified segment of the relation,
1707 * and make a MdfdVec object for it. Returns NULL on failure.
1708 */
1709static MdfdVec *
1711 int oflags)
1712{
1713 MdfdVec *v;
1714 File fd;
1715 MdPathStr fullpath;
1716
1717 fullpath = _mdfd_segpath(reln, forknum, segno);
1718
1719 /* open the file */
1721
1722 if (fd < 0)
1723 return NULL;
1724
1725 /*
1726 * Segments are always opened in order from lowest to highest, so we must
1727 * be adding a new one at the end.
1728 */
1729 Assert(segno == reln->md_num_open_segs[forknum]);
1730
1731 _fdvec_resize(reln, forknum, segno + 1);
1732
1733 /* fill the entry */
1734 v = &reln->md_seg_fds[forknum][segno];
1735 v->mdfd_vfd = fd;
1736 v->mdfd_segno = segno;
1737
1738 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1739
1740 /* all done */
1741 return v;
1742}
1743
1744/*
1745 * _mdfd_getseg() -- Find the segment of the relation holding the
1746 * specified block.
1747 *
1748 * If the segment doesn't exist, we ereport, return NULL, or create the
1749 * segment, according to "behavior". Note: skipFsync is only used in the
1750 * EXTENSION_CREATE case.
1751 */
1752static MdfdVec *
1754 bool skipFsync, int behavior)
1755{
1756 MdfdVec *v;
1759
1760 /* some way to handle non-existent segments needs to be specified */
1761 Assert(behavior &
1764
1765 targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1766
1767 /* if an existing and opened segment, we're done */
1768 if (targetseg < reln->md_num_open_segs[forknum])
1769 {
1770 v = &reln->md_seg_fds[forknum][targetseg];
1771 return v;
1772 }
1773
1774 /* The caller only wants the segment if we already had it open. */
1775 if (behavior & EXTENSION_DONT_OPEN)
1776 return NULL;
1777
1778 /*
1779 * The target segment is not yet open. Iterate over all the segments
1780 * between the last opened and the target segment. This way missing
1781 * segments either raise an error, or get created (according to
1782 * 'behavior'). Start with either the last opened, or the first segment if
1783 * none was opened before.
1784 */
1785 if (reln->md_num_open_segs[forknum] > 0)
1786 v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
1787 else
1788 {
1789 v = mdopenfork(reln, forknum, behavior);
1790 if (!v)
1791 return NULL; /* if behavior & EXTENSION_RETURN_NULL */
1792 }
1793
1794 for (nextsegno = reln->md_num_open_segs[forknum];
1796 {
1797 BlockNumber nblocks = _mdnblocks(reln, forknum, v);
1798 int flags = 0;
1799
1800 Assert(nextsegno == v->mdfd_segno + 1);
1801
1802 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1803 elog(FATAL, "segment too big");
1804
1805 if ((behavior & EXTENSION_CREATE) ||
1806 (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
1807 {
1808 /*
1809 * Normally we will create new segments only if authorized by the
1810 * caller (i.e., we are doing mdextend()). But when doing WAL
1811 * recovery, create segments anyway; this allows cases such as
1812 * replaying WAL data that has a write into a high-numbered
1813 * segment of a relation that was later deleted. We want to go
1814 * ahead and create the segments so we can finish out the replay.
1815 *
1816 * We have to maintain the invariant that segments before the last
1817 * active segment are of size RELSEG_SIZE; therefore, if
1818 * extending, pad them out with zeroes if needed. (This only
1819 * matters if in recovery, or if the caller is extending the
1820 * relation discontiguously, but that can happen in hash indexes.)
1821 */
1822 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1823 {
1826
1827 mdextend(reln, forknum,
1830 pfree(zerobuf);
1831 }
1832 flags = O_CREAT;
1833 }
1834 else if (nblocks < ((BlockNumber) RELSEG_SIZE))
1835 {
1836 /*
1837 * When not extending, only open the next segment if the current
1838 * one is exactly RELSEG_SIZE. If not (this branch), either
1839 * return NULL or fail.
1840 */
1841 if (behavior & EXTENSION_RETURN_NULL)
1842 {
1843 /*
1844 * Some callers discern between reasons for _mdfd_getseg()
1845 * returning NULL based on errno. As there's no failing
1846 * syscall involved in this case, explicitly set errno to
1847 * ENOENT, as that seems the closest interpretation.
1848 */
1849 errno = ENOENT;
1850 return NULL;
1851 }
1852
1853 ereport(ERROR,
1855 errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
1856 _mdfd_segpath(reln, forknum, nextsegno).str,
1857 blkno, nblocks)));
1858 }
1859
1860 v = _mdfd_openseg(reln, forknum, nextsegno, flags);
1861
1862 if (v == NULL)
1863 {
1864 if ((behavior & EXTENSION_RETURN_NULL) &&
1866 return NULL;
1867 ereport(ERROR,
1869 errmsg("could not open file \"%s\" (target block %u): %m",
1870 _mdfd_segpath(reln, forknum, nextsegno).str,
1871 blkno)));
1872 }
1873 }
1874
1875 return v;
1876}
1877
1878/*
1879 * Get number of blocks present in a single disk file
1880 */
1881static BlockNumber
1883{
1884 pgoff_t len;
1885
1886 len = FileSize(seg->mdfd_vfd);
1887 if (len < 0)
1888 ereport(ERROR,
1890 errmsg("could not seek to end of file \"%s\": %m",
1891 FilePathName(seg->mdfd_vfd))));
1892 /* note that this calculation will ignore any partial block at EOF */
1893 return (BlockNumber) (len / BLCKSZ);
1894}
1895
1896/*
1897 * Sync a file to disk, given a file tag. Write the path into an output
1898 * buffer so the caller can use it in error messages.
1899 *
1900 * Return 0 on success, -1 on failure, with errno set.
1901 */
1902int
1903mdsyncfiletag(const FileTag *ftag, char *path)
1904{
1906 File file;
1908 bool need_to_close;
1909 int result,
1910 save_errno;
1911
1912 /* See if we already have the file open, or need to open it. */
1913 if (ftag->segno < reln->md_num_open_segs[ftag->forknum])
1914 {
1915 file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd;
1916 strlcpy(path, FilePathName(file), MAXPGPATH);
1917 need_to_close = false;
1918 }
1919 else
1920 {
1921 MdPathStr p;
1922
1923 p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
1924 strlcpy(path, p.str, MD_PATH_STR_MAXLEN);
1925
1926 file = PathNameOpenFile(path, _mdfd_open_flags());
1927 if (file < 0)
1928 return -1;
1929 need_to_close = true;
1930 }
1931
1933
1934 /* Sync the file. */
1935 result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
1936 save_errno = errno;
1937
1938 if (need_to_close)
1939 FileClose(file);
1940
1942 IOOP_FSYNC, io_start, 1, 0);
1943
1944 errno = save_errno;
1945 return result;
1946}
1947
1948/*
1949 * Unlink a file, given a file tag. Write the path into an output
1950 * buffer so the caller can use it in error messages.
 *
 * Only ftag->rlocator is read here: the path is always built for
 * MAIN_FORKNUM via relpathperm(), and the tag's fork and segment numbers
 * are not consulted.
 *
 * 'path' is written with a size limit of MAXPGPATH, so the caller's buffer
 * must be at least that large.
1951 *
1952 * Return 0 on success, -1 on failure, with errno set.
1953 */
1954int
1955mdunlinkfiletag(const FileTag *ftag, char *path)
1956{
1957 RelPathStr p;
1958
1959 /* Compute the path. */
1960 p = relpathperm(ftag->rlocator, MAIN_FORKNUM);
 /* Hand the path back to the caller before attempting the unlink. */
1961 strlcpy(path, p.str, MAXPGPATH);
1962
1963 /* Try to unlink the file; unlink()'s result is returned unchanged. */
1964 return unlink(path);
1965}
1966
1967/*
1968 * Check if a given candidate request matches a given tag, when processing
1969 * a SYNC_FILTER_REQUEST request. This will be called for all pending
1970 * requests to find out whether to forget them.
 *
 * ftag is the tag carried by the filter request; candidate is the tag of
 * one pending request. Returns true when the two tags name the same
 * database. Only rlocator.dbOid is compared; every other field of either
 * tag is ignored.
1971 */
1972bool
1973mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
1974{
1975 /*
1976 * For now we only use filter requests as a way to drop all scheduled
1977 * callbacks relating to a given database, when dropping the database.
1978 * We'll return true for all candidates that have the same database OID as
1979 * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
1980 */
1981 return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
1982}
1983
1984/*
1985 * AIO completion callback for mdstartreadv().
1986 */
1987static PgAioResult
1989{
1991 PgAioResult result = prior_result;
1992
1993 if (prior_result.result < 0)
1994 {
1995 result.status = PGAIO_RS_ERROR;
1996 result.id = PGAIO_HCB_MD_READV;
1997 /* For "hard" errors, track the error number in error_data */
1998 result.error_data = -prior_result.result;
1999 result.result = 0;
2000
2001 /*
2002 * Immediately log a message about the IO error, but only to the
2003 * server log. The reason to do so immediately is that the originator
2004 * might not process the query result immediately (because it is busy
2005 * doing another part of query processing) or at all (e.g. if it was
2006 * cancelled or errored out due to another IO also failing). The
2007 * definer of the IO will emit an ERROR when processing the IO's
2008 * results.
2009 */
2011
2012 return result;
2013 }
2014
2015 /*
2016 * As explained above smgrstartreadv(), the smgr API operates on the level
2017 * of blocks, rather than bytes. Convert.
2018 */
2019 result.result /= BLCKSZ;
2020
2021 Assert(result.result <= td->smgr.nblocks);
2022
2023 if (result.result == 0)
2024 {
2025 /* consider 0 blocks read a failure */
2026 result.status = PGAIO_RS_ERROR;
2027 result.id = PGAIO_HCB_MD_READV;
2028 result.error_data = 0;
2029
2030 /* see comment above the "hard error" case */
2032
2033 return result;
2034 }
2035
2036 if (result.status != PGAIO_RS_ERROR &&
2037 result.result < td->smgr.nblocks)
2038 {
2039 /* partial reads should be retried at upper level */
2040 result.status = PGAIO_RS_PARTIAL;
2041 result.id = PGAIO_HCB_MD_READV;
2042 }
2043
2044 return result;
2045}
2046
2047/*
2048 * AIO error reporting callback for mdstartreadv().
2049 *
2050 * Errors are encoded as follows:
2051 * - PgAioResult.error_data != 0 encodes IO that failed with that errno
2052 * - PgAioResult.error_data == 0 encodes IO that didn't read all data
2053 */
2054static void
2055md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
2056{
2057 RelPathStr path;
2058
2059 path = relpathbackend(td->smgr.rlocator,
2061 td->smgr.forkNum);
2062
2063 if (result.error_data != 0)
2064 {
2065 /* for errcode_for_file_access() and %m */
2066 errno = result.error_data;
2067
2068 ereport(elevel,
2070 errmsg("could not read blocks %u..%u in file \"%s\": %m",
2071 td->smgr.blockNum,
2072 td->smgr.blockNum + td->smgr.nblocks - 1,
2073 path.str));
2074 }
2075 else
2076 {
2077 /*
2078 * NB: This will typically only be output in debug messages, while
2079 * retrying a partial IO.
2080 */
2081 ereport(elevel,
2083 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
2084 td->smgr.blockNum,
2085 td->smgr.blockNum + td->smgr.nblocks - 1,
2086 path.str,
2087 result.result * (size_t) BLCKSZ,
2088 td->smgr.nblocks * (size_t) BLCKSZ));
2089 }
2090}
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition aio.c:330
@ PGAIO_HCB_MD_READV
Definition aio.h:196
@ PGAIO_HF_BUFFERED
Definition aio.h:77
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
int pgaio_io_get_iovec(PgAioHandle *ioh, struct iovec **iov)
Definition aio_io.c:42
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition aio_target.c:73
@ PGAIO_RS_PARTIAL
Definition aio_types.h:82
@ PGAIO_RS_ERROR
Definition aio_types.h:84
void TablespaceCreateDbspace(Oid spcOid, Oid dbOid, bool isRedo)
Definition tablespace.c:112
uint32 BlockNumber
Definition block.h:31
#define InvalidBlockNumber
Definition block.h:33
#define MaxBlockNumber
Definition block.h:35
bool track_io_timing
Definition bufmgr.c:176
bool zero_damaged_pages
Definition bufmgr.c:173
#define Min(x, y)
Definition c.h:997
#define TYPEALIGN(ALIGNVAL, LEN)
Definition c.h:819
uint8_t uint8
Definition c.h:544
#define Assert(condition)
Definition c.h:873
#define PG_BINARY
Definition c.h:1287
uint64_t uint64
Definition c.h:547
uint32_t uint32
Definition c.h:546
#define lengthof(array)
Definition c.h:803
int errmsg_internal(const char *fmt,...)
Definition elog.c:1170
int errcode_for_file_access(void)
Definition elog.c:886
int errhint(const char *fmt,...)
Definition elog.c:1330
int errcode(int sqlerrcode)
Definition elog.c:863
int errmsg(const char *fmt,...)
Definition elog.c:1080
#define LOG_SERVER_ONLY
Definition elog.h:32
#define FATAL
Definition elog.h:41
#define WARNING
Definition elog.h:36
#define DEBUG1
Definition elog.h:30
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
int pg_truncate(const char *path, pgoff_t length)
Definition fd.c:720
int FileGetRawDesc(File file)
Definition fd.c:2515
void FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
Definition fd.c:2122
int io_direct_flags
Definition fd.c:171
int file_extend_method
Definition fd.c:168
char * FilePathName(File file)
Definition fd.c:2499
int FileSync(File file, uint32 wait_event_info)
Definition fd.c:2335
int FileStartReadV(PgAioHandle *ioh, File file, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2204
ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2148
int FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2407
pgoff_t FileSize(File file)
Definition fd.c:2447
void FileClose(File file)
Definition fd.c:1965
int data_sync_elevel(int elevel)
Definition fd.c:3985
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition fd.c:1562
int FileTruncate(File file, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2464
int FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2362
int FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2066
ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2230
#define IO_DIRECT_DATA
Definition fd.h:54
static ssize_t FileWrite(File file, const void *buffer, size_t amount, pgoff_t offset, uint32 wait_event_info)
Definition fd.h:237
@ FILE_EXTEND_METHOD_WRITE_ZEROS
Definition fd.h:63
#define FILE_POSSIBLY_DELETED(err)
Definition fd.h:89
int File
Definition fd.h:51
#define PG_O_DIRECT
Definition fd.h:123
#define MCXT_ALLOC_ZERO
Definition fe_memutils.h:30
#define palloc_array(type, count)
Definition fe_memutils.h:76
int compute_remaining_iovec(struct iovec *destination, const struct iovec *source, int iovcnt, size_t transferred)
Definition file_utils.c:614
bool IsBinaryUpgrade
Definition globals.c:121
ProcNumber MyProcNumber
Definition globals.c:90
const char * str
int i
Definition isn.c:77
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition mcxt.c:1232
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
MemoryContext TopMemoryContext
Definition mcxt.c:166
void * palloc_aligned(Size size, Size alignto, int flags)
Definition mcxt.c:1606
void mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
Definition md.c:337
static void md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
Definition md.c:2056
static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
Definition md.c:1579
#define EXTENSION_CREATE_RECOVERY
Definition md.c:119
void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber curnblk, BlockNumber nblocks)
Definition md.c:1301
static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
Definition md.c:1883
static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
Definition md.c:374
void mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync)
Definition md.c:1070
bool mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
Definition md.c:1974
bool mdexists(SMgrRelation reln, ForkNumber forknum)
Definition md.c:203
void mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition md.c:858
static MdPathStr _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
Definition md.c:1691
static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
Definition md.c:1562
#define EXTENSION_DONT_OPEN
Definition md.c:121
BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum)
Definition md.c:1234
int mdunlinkfiletag(const FileTag *ftag, char *path)
Definition md.c:1956
static MemoryContext MdCxt
Definition md.c:97
void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition md.c:222
int mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
Definition md.c:1494
void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition md.c:487
static PgAioResult md_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition md.c:1989
static int do_truncate(const char *path)
Definition md.c:353
void mdinit(void)
Definition md.c:190
void mdclose(SMgrRelation reln, ForkNumber forknum)
Definition md.c:724
void mdzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition md.c:552
static MdfdVec * _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, int oflags)
Definition md.c:1711
static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
Definition md.c:1518
int mdsyncfiletag(const FileTag *ftag, char *path)
Definition md.c:1904
void mdwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition md.c:1175
uint32 mdmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition md.c:844
static MdfdVec * _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior)
Definition md.c:1754
#define EXTENSION_RETURN_NULL
Definition md.c:115
void mdstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition md.c:996
bool mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition md.c:747
void mdregistersync(SMgrRelation reln, ForkNumber forknum)
Definition md.c:1390
void mdopen(SMgrRelation reln)
Definition md.c:713
#define EXTENSION_CREATE
Definition md.c:117
const PgAioHandleCallbacks aio_md_readv_cb
Definition md.c:169
static int _mdfd_open_flags(void)
Definition md.c:176
#define INIT_MD_FILETAG(a, xx_rlocator, xx_forknum, xx_segno)
Definition md.c:101
#define EXTENSION_FAIL
Definition md.c:113
static MdfdVec * mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
Definition md.c:675
void DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo)
Definition md.c:1611
static int buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks)
Definition md.c:795
#define MD_PATH_STR_MAXLEN
Definition md.c:132
static void _fdvec_resize(SMgrRelation reln, ForkNumber forknum, int nseg)
Definition md.c:1643
void ForgetDatabaseSyncRequests(Oid dbid)
Definition md.c:1593
void mdimmedsync(SMgrRelation reln, ForkNumber forknum)
Definition md.c:1441
#define AllocSetContextCreate
Definition memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition memutils.h:160
#define ERRCODE_DATA_CORRUPTED
#define MAXPGPATH
#define PG_IO_ALIGN_SIZE
const void size_t len
#define PG_IOV_MAX
Definition pg_iovec.h:47
@ IOOBJECT_RELATION
Definition pgstat.h:277
@ IOCONTEXT_NORMAL
Definition pgstat.h:289
@ IOOP_FSYNC
Definition pgstat.h:308
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition pgstat_io.c:91
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:122
#define sprintf
Definition port.h:262
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition strlcpy.c:45
off_t pgoff_t
Definition port.h:421
unsigned int Oid
static int fd(const char *x, int i)
#define INVALID_PROC_NUMBER
Definition procnumber.h:26
#define RelFileLocatorBackendIsTemp(rlocator)
ForkNumber
Definition relpath.h:56
@ MAIN_FORKNUM
Definition relpath.h:58
@ InvalidForkNumber
Definition relpath.h:57
#define MAX_FORKNUM
Definition relpath.h:70
#define relpath(rlocator, forknum)
Definition relpath.h:150
#define relpathbackend(rlocator, backend, forknum)
Definition relpath.h:141
#define relpathperm(rlocator, forknum)
Definition relpath.h:146
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition smgr.c:240
void smgrclose(SMgrRelation reln)
Definition smgr.c:374
void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
Definition smgr.c:538
void pgaio_io_set_target_smgr(PgAioHandle *ioh, SMgrRelationData *smgr, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skip_fsync)
Definition smgr.c:1038
#define SmgrIsTemp(smgr)
Definition smgr.h:74
RelFileLocator rlocator
Definition sync.h:54
int16 forknum
Definition sync.h:53
uint64 segno
Definition sync.h:55
char str[MD_PATH_STR_MAXLEN+1]
Definition md.c:140
PgAioHandleCallbackComplete complete_shared
Definition aio.h:239
uint32 status
Definition aio_types.h:108
uint32 error_data
Definition aio_types.h:111
int32 result
Definition aio_types.h:113
uint32 id
Definition aio_types.h:105
RelFileLocator locator
RelFileNumber relNumber
char str[REL_PATH_STR_MAXLEN+1]
Definition relpath.h:123
Definition md.c:92
File mdfd_vfd
Definition md.c:93
BlockNumber mdfd_segno
Definition md.c:94
bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, bool retryOnError)
Definition sync.c:580
@ SYNC_FILTER_REQUEST
Definition sync.h:28
@ SYNC_FORGET_REQUEST
Definition sync.h:27
@ SYNC_UNLINK_REQUEST
Definition sync.h:26
@ SYNC_REQUEST
Definition sync.h:25
BlockNumber blockNum
Definition aio_types.h:66
RelFileLocator rlocator
Definition aio_types.h:65
struct PgAioTargetData::@126 smgr
BlockNumber nblocks
Definition aio_types.h:67
bool InRecovery
Definition xlogutils.c:50
void XLogDropRelation(RelFileLocator rlocator, ForkNumber forknum)
Definition xlogutils.c:630

◆ MD_PATH_STR_MAXLEN

#define MD_PATH_STR_MAXLEN
Value:
(\
REL_PATH_STR_MAXLEN \
+ sizeof((char)'.') \
+ SEGMENT_CHARS \
)
#define SEGMENT_CHARS
Definition md.c:131
#define REL_PATH_STR_MAXLEN
Definition relpath.h:96

Definition at line 132 of file md.c.

◆ SEGMENT_CHARS

#define SEGMENT_CHARS   OIDCHARS

Definition at line 131 of file md.c.

Typedef Documentation

◆ MdfdVec

◆ MdPathStr

Function Documentation

◆ _fdvec_resize()

static void _fdvec_resize ( SMgrRelation  reln,
ForkNumber  forknum,
int  nseg 
)
static

Definition at line 1643 of file md.c.

1646{
1647 if (nseg == 0)
1648 {
1649 if (reln->md_num_open_segs[forknum] > 0)
1650 {
1651 pfree(reln->md_seg_fds[forknum]);
1652 reln->md_seg_fds[forknum] = NULL;
1653 }
1654 }
1655 else if (reln->md_num_open_segs[forknum] == 0)
1656 {
1657 reln->md_seg_fds[forknum] =
1658 MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
1659 }
1660 else if (nseg > reln->md_num_open_segs[forknum])
1661 {
1662 /*
1663 * It doesn't seem worthwhile complicating the code to amortize
1664 * repalloc() calls. Those are far faster than PathNameOpenFile() or
1665 * FileClose(), and the memory context internally will sometimes avoid
1666 * doing an actual reallocation.
1667 */
1668 reln->md_seg_fds[forknum] =
1669 repalloc(reln->md_seg_fds[forknum],
1670 sizeof(MdfdVec) * nseg);
1671 }
1672 else
1673 {
1674 /*
1675 * We don't reallocate a smaller array, because we want mdtruncate()
1676 * to be able to promise that it won't allocate memory, so that it is
1677 * allowed in a critical section. This means that a bit of space in
1678 * the array is now wasted, until the next time we add a segment and
1679 * reallocate.
1680 */
1681 }
1682
1683 reln->md_num_open_segs[forknum] = nseg;
1684}

References fb(), MdCxt, MemoryContextAlloc(), pfree(), and repalloc().

Referenced by _mdfd_openseg(), mdclose(), mdcreate(), mdimmedsync(), mdopenfork(), mdregistersync(), and mdtruncate().

◆ _mdfd_getseg()

static MdfdVec * _mdfd_getseg ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blkno,
bool  skipFsync,
int  behavior 
)
static

Definition at line 1754 of file md.c.

1756{
1757 MdfdVec *v;
1760
1761 /* some way to handle non-existent segments needs to be specified */
1762 Assert(behavior &
1765
1766 targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1767
1768 /* if an existing and opened segment, we're done */
1769 if (targetseg < reln->md_num_open_segs[forknum])
1770 {
1771 v = &reln->md_seg_fds[forknum][targetseg];
1772 return v;
1773 }
1774
1775 /* The caller only wants the segment if we already had it open. */
1776 if (behavior & EXTENSION_DONT_OPEN)
1777 return NULL;
1778
1779 /*
1780 * The target segment is not yet open. Iterate over all the segments
1781 * between the last opened and the target segment. This way missing
1782 * segments either raise an error, or get created (according to
1783 * 'behavior'). Start with either the last opened, or the first segment if
1784 * none was opened before.
1785 */
1786 if (reln->md_num_open_segs[forknum] > 0)
1787 v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
1788 else
1789 {
1790 v = mdopenfork(reln, forknum, behavior);
1791 if (!v)
1792 return NULL; /* if behavior & EXTENSION_RETURN_NULL */
1793 }
1794
1795 for (nextsegno = reln->md_num_open_segs[forknum];
1797 {
1798 BlockNumber nblocks = _mdnblocks(reln, forknum, v);
1799 int flags = 0;
1800
1801 Assert(nextsegno == v->mdfd_segno + 1);
1802
1803 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1804 elog(FATAL, "segment too big");
1805
1806 if ((behavior & EXTENSION_CREATE) ||
1807 (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
1808 {
1809 /*
1810 * Normally we will create new segments only if authorized by the
1811 * caller (i.e., we are doing mdextend()). But when doing WAL
1812 * recovery, create segments anyway; this allows cases such as
1813 * replaying WAL data that has a write into a high-numbered
1814 * segment of a relation that was later deleted. We want to go
1815 * ahead and create the segments so we can finish out the replay.
1816 *
1817 * We have to maintain the invariant that segments before the last
1818 * active segment are of size RELSEG_SIZE; therefore, if
1819 * extending, pad them out with zeroes if needed. (This only
1820 * matters if in recovery, or if the caller is extending the
1821 * relation discontiguously, but that can happen in hash indexes.)
1822 */
1823 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1824 {
1827
1828 mdextend(reln, forknum,
1831 pfree(zerobuf);
1832 }
1833 flags = O_CREAT;
1834 }
1835 else if (nblocks < ((BlockNumber) RELSEG_SIZE))
1836 {
1837 /*
1838 * When not extending, only open the next segment if the current
1839 * one is exactly RELSEG_SIZE. If not (this branch), either
1840 * return NULL or fail.
1841 */
1842 if (behavior & EXTENSION_RETURN_NULL)
1843 {
1844 /*
1845 * Some callers discern between reasons for _mdfd_getseg()
1846 * returning NULL based on errno. As there's no failing
1847 * syscall involved in this case, explicitly set errno to
1848 * ENOENT, as that seems the closest interpretation.
1849 */
1850 errno = ENOENT;
1851 return NULL;
1852 }
1853
1854 ereport(ERROR,
1856 errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
1857 _mdfd_segpath(reln, forknum, nextsegno).str,
1858 blkno, nblocks)));
1859 }
1860
1861 v = _mdfd_openseg(reln, forknum, nextsegno, flags);
1862
1863 if (v == NULL)
1864 {
1865 if ((behavior & EXTENSION_RETURN_NULL) &&
1867 return NULL;
1868 ereport(ERROR,
1870 errmsg("could not open file \"%s\" (target block %u): %m",
1871 _mdfd_segpath(reln, forknum, nextsegno).str,
1872 blkno)));
1873 }
1874 }
1875
1876 return v;
1877}

References _mdfd_openseg(), _mdfd_segpath(), _mdnblocks(), Assert, elog, ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_CREATE, EXTENSION_CREATE_RECOVERY, EXTENSION_DONT_OPEN, EXTENSION_FAIL, EXTENSION_RETURN_NULL, FATAL, fb(), FILE_POSSIBLY_DELETED, InRecovery, MCXT_ALLOC_ZERO, mdextend(), _MdfdVec::mdfd_segno, mdopenfork(), palloc_aligned(), pfree(), PG_IO_ALIGN_SIZE, and str.

Referenced by mdextend(), mdfd(), mdprefetch(), mdreadv(), mdstartreadv(), mdwriteback(), mdwritev(), and mdzeroextend().

◆ _mdfd_open_flags()

static int _mdfd_open_flags ( void  )
inline static

Definition at line 176 of file md.c.

177{
178 int flags = O_RDWR | PG_BINARY;
179
180 if (io_direct_flags & IO_DIRECT_DATA)
181 flags |= PG_O_DIRECT;
182
183 return flags;
184}

References fb(), IO_DIRECT_DATA, io_direct_flags, PG_BINARY, and PG_O_DIRECT.

Referenced by _mdfd_openseg(), mdcreate(), mdopenfork(), and mdsyncfiletag().

◆ _mdfd_openseg()

static MdfdVec * _mdfd_openseg ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  segno,
int  oflags 
)
static

Definition at line 1711 of file md.c.

1713{
1714 MdfdVec *v;
1715 File fd;
1716 MdPathStr fullpath;
1717
1718 fullpath = _mdfd_segpath(reln, forknum, segno);
1719
1720 /* open the file */
1721 fd = PathNameOpenFile(fullpath.str, _mdfd_open_flags() | oflags);
1722
1723 if (fd < 0)
1724 return NULL;
1725
1726 /*
1727 * Segments are always opened in order from lowest to highest, so we must
1728 * be adding a new one at the end.
1729 */
1730 Assert(segno == reln->md_num_open_segs[forknum]);
1731
1732 _fdvec_resize(reln, forknum, segno + 1);
1733
1734 /* fill the entry */
1735 v = &reln->md_seg_fds[forknum][segno];
1736 v->mdfd_vfd = fd;
1737 v->mdfd_segno = segno;
1738
1739 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1740
1741 /* all done */
1742 return v;
1743}

References _fdvec_resize(), _mdfd_open_flags(), _mdfd_segpath(), _mdnblocks(), Assert, fb(), fd(), _MdfdVec::mdfd_segno, _MdfdVec::mdfd_vfd, PathNameOpenFile(), and MdPathStr::str.

Referenced by _mdfd_getseg(), mdimmedsync(), mdnblocks(), and mdregistersync().

◆ _mdfd_segpath()

static MdPathStr _mdfd_segpath ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  segno 
)
static

Definition at line 1691 of file md.c.

1692{
1693 RelPathStr path;
1694 MdPathStr fullpath;
1695
1696 path = relpath(reln->smgr_rlocator, forknum);
1697
1698 if (segno > 0)
1699 sprintf(fullpath.str, "%s.%u", path.str, segno);
1700 else
1701 strcpy(fullpath.str, path.str);
1702
1703 return fullpath;
1704}

References fb(), relpath, sprintf, MdPathStr::str, and RelPathStr::str.

Referenced by _mdfd_getseg(), _mdfd_openseg(), and mdsyncfiletag().

◆ _mdnblocks()

static BlockNumber _mdnblocks ( SMgrRelation  reln,
ForkNumber  forknum,
MdfdVec * seg 
)
static

Definition at line 1883 of file md.c.

1884{
1885 pgoff_t len;
1886
1887 len = FileSize(seg->mdfd_vfd);
1888 if (len < 0)
1889 ereport(ERROR,
1890 (errcode_for_file_access(),
1891 errmsg("could not seek to end of file \"%s\": %m",
1892 FilePathName(seg->mdfd_vfd))));
1893 /* note that this calculation will ignore any partial block at EOF */
1894 return (BlockNumber) (len / BLCKSZ);
1895}

References ereport, errcode_for_file_access(), errmsg(), ERROR, fb(), FilePathName(), FileSize(), len, and _MdfdVec::mdfd_vfd.

Referenced by _mdfd_getseg(), _mdfd_openseg(), mdextend(), mdnblocks(), mdopenfork(), and mdzeroextend().

◆ buffers_to_iovec()

static int buffers_to_iovec ( struct iovec * iov,
void **  buffers,
int  nblocks 
)
static

Definition at line 795 of file md.c.

796{
797 struct iovec *iovp;
798 int iovcnt;
799
800 Assert(nblocks >= 1);
801
802 /* If this build supports direct I/O, buffers must be I/O aligned. */
803 for (int i = 0; i < nblocks; ++i)
804 {
805 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
806 Assert((uintptr_t) buffers[i] ==
807 TYPEALIGN(PG_IO_ALIGN_SIZE, buffers[i]));
808 }
809
810 /* Start the first iovec off with the first buffer. */
811 iovp = &iov[0];
812 iovp->iov_base = buffers[0];
813 iovp->iov_len = BLCKSZ;
814 iovcnt = 1;
815
816 /* Try to merge the rest. */
817 for (int i = 1; i < nblocks; ++i)
818 {
819 void *buffer = buffers[i];
820
821 if (((char *) iovp->iov_base + iovp->iov_len) == buffer)
822 {
823 /* Contiguous with the last iovec. */
824 iovp->iov_len += BLCKSZ;
825 }
826 else
827 {
828 /* Need a new iovec. */
829 iovp++;
830 iovp->iov_base = buffer;
831 iovp->iov_len = BLCKSZ;
832 iovcnt++;
833 }
834 }
835
836 return iovcnt;
837}

References Assert, fb(), i, PG_IO_ALIGN_SIZE, PG_O_DIRECT, and TYPEALIGN.

Referenced by mdreadv(), mdstartreadv(), and mdwritev().

◆ do_truncate()

static int do_truncate ( const char * path)
static

Definition at line 353 of file md.c.

354{
355 int save_errno;
356 int ret;
357
358 ret = pg_truncate(path, 0);
359
360 /* Log a warning here to avoid repetition in callers. */
361 if (ret < 0 && errno != ENOENT)
362 {
366 errmsg("could not truncate file \"%s\": %m", path)));
368 }
369
370 return ret;
371}

References ereport, errcode_for_file_access(), errmsg(), fb(), pg_truncate(), and WARNING.

Referenced by mdunlinkfork().

◆ DropRelationFiles()

void DropRelationFiles ( RelFileLocator * delrels,
int  ndelrels,
bool  isRedo 
)

Definition at line 1611 of file md.c.

1612{
1614 int i;
1615
1617 for (i = 0; i < ndelrels; i++)
1618 {
1620
1621 if (isRedo)
1622 {
1624
1625 for (fork = 0; fork <= MAX_FORKNUM; fork++)
1627 }
1628 srels[i] = srel;
1629 }
1630
1632
1633 for (i = 0; i < ndelrels; i++)
1634 smgrclose(srels[i]);
1635 pfree(srels);
1636}

References fb(), i, INVALID_PROC_NUMBER, MAX_FORKNUM, palloc_array, pfree(), smgrclose(), smgrdounlinkall(), smgropen(), and XLogDropRelation().

Referenced by FinishPreparedTransaction(), xact_redo_abort(), and xact_redo_commit().

◆ ForgetDatabaseSyncRequests()

void ForgetDatabaseSyncRequests ( Oid  dbid)

Definition at line 1593 of file md.c.

1594{
1595 FileTag tag;
1596 RelFileLocator rlocator;
1597
1598 rlocator.dbOid = dbid;
1599 rlocator.spcOid = 0;
1600 rlocator.relNumber = 0;
1601
1602 INIT_MD_FILETAG(tag, rlocator, InvalidForkNumber, InvalidBlockNumber);
1603
1604 RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
1605}

References RelFileLocator::dbOid, INIT_MD_FILETAG, InvalidBlockNumber, InvalidForkNumber, RegisterSyncRequest(), RelFileLocator::relNumber, RelFileLocator::spcOid, and SYNC_FILTER_REQUEST.

Referenced by createdb_failure_callback(), dbase_redo(), and dropdb().

◆ md_readv_complete()

static PgAioResult md_readv_complete ( PgAioHandle * ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

Definition at line 1989 of file md.c.

1990{
1991 PgAioTargetData *td = pgaio_io_get_target_data(ioh);
1992 PgAioResult result = prior_result;
1993
1994 if (prior_result.result < 0)
1995 {
1996 result.status = PGAIO_RS_ERROR;
1997 result.id = PGAIO_HCB_MD_READV;
1998 /* For "hard" errors, track the error number in error_data */
1999 result.error_data = -prior_result.result;
2000 result.result = 0;
2001
2002 /*
2003 * Immediately log a message about the IO error, but only to the
2004 * server log. The reason to do so immediately is that the originator
2005 * might not process the query result immediately (because it is busy
2006 * doing another part of query processing) or at all (e.g. if it was
2007 * cancelled or errored out due to another IO also failing). The
2008 * definer of the IO will emit an ERROR when processing the IO's
2009 * results
2010 */
2012
2013 return result;
2014 }
2015
2016 /*
2017 * As explained above smgrstartreadv(), the smgr API operates on the level
2018 * of blocks, rather than bytes. Convert.
2019 */
2020 result.result /= BLCKSZ;
2021
2022 Assert(result.result <= td->smgr.nblocks);
2023
2024 if (result.result == 0)
2025 {
2026 /* consider 0 blocks read a failure */
2027 result.status = PGAIO_RS_ERROR;
2028 result.id = PGAIO_HCB_MD_READV;
2029 result.error_data = 0;
2030
2031 /* see comment above the "hard error" case */
2033
2034 return result;
2035 }
2036
2037 if (result.status != PGAIO_RS_ERROR &&
2038 result.result < td->smgr.nblocks)
2039 {
2040 /* partial reads should be retried at upper level */
2041 result.status = PGAIO_RS_PARTIAL;
2042 result.id = PGAIO_HCB_MD_READV;
2043 }
2044
2045 return result;
2046}

References Assert, PgAioResult::error_data, fb(), PgAioResult::id, LOG_SERVER_ONLY, PgAioTargetData::nblocks, PGAIO_HCB_MD_READV, pgaio_io_get_target_data(), pgaio_result_report(), PGAIO_RS_ERROR, PGAIO_RS_PARTIAL, PgAioResult::result, PgAioTargetData::smgr, and PgAioResult::status.

◆ md_readv_report()

static void md_readv_report ( PgAioResult  result,
const PgAioTargetData * td,
int  elevel 
)
static

Definition at line 2056 of file md.c.

2057{
2058 RelPathStr path;
2059
2060 path = relpathbackend(td->smgr.rlocator,
2061 td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER,
2062 td->smgr.forkNum);
2063
2064 if (result.error_data != 0)
2065 {
2066 /* for errcode_for_file_access() and %m */
2067 errno = result.error_data;
2068
2069 ereport(elevel,
2071 errmsg("could not read blocks %u..%u in file \"%s\": %m",
2072 td->smgr.blockNum,
2073 td->smgr.blockNum + td->smgr.nblocks - 1,
2074 path.str));
2075 }
2076 else
2077 {
2078 /*
2079 * NB: This will typically only be output in debug messages, while
2080 * retrying a partial IO.
2081 */
2082 ereport(elevel,
2084 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
2085 td->smgr.blockNum,
2086 td->smgr.blockNum + td->smgr.nblocks - 1,
2087 path.str,
2088 result.result * (size_t) BLCKSZ,
2089 td->smgr.nblocks * (size_t) BLCKSZ));
2090 }
2091}

References PgAioTargetData::blockNum, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errcode_for_file_access(), errmsg(), PgAioResult::error_data, fb(), PgAioTargetData::forkNum, INVALID_PROC_NUMBER, PgAioTargetData::is_temp, MyProcNumber, PgAioTargetData::nblocks, relpathbackend, PgAioResult::result, PgAioTargetData::rlocator, PgAioTargetData::smgr, and RelPathStr::str.

◆ mdclose()

void mdclose ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 724 of file md.c.

725{
726 int nopensegs = reln->md_num_open_segs[forknum];
727
728 /* No work if already closed */
729 if (nopensegs == 0)
730 return;
731
732 /* close segments starting from the end */
733 while (nopensegs > 0)
734 {
735 MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1];
736
737 FileClose(v->mdfd_vfd);
738 _fdvec_resize(reln, forknum, nopensegs - 1);
739 nopensegs--;
740 }
741}

References _fdvec_resize(), fb(), FileClose(), and _MdfdVec::mdfd_vfd.

Referenced by mdexists().

◆ mdcreate()

void mdcreate ( SMgrRelation  reln,
ForkNumber  forknum,
bool  isRedo 
)

Definition at line 222 of file md.c.

223{
224 MdfdVec *mdfd;
225 RelPathStr path;
226 File fd;
227
228 if (isRedo && reln->md_num_open_segs[forknum] > 0)
229 return; /* created and opened already... */
230
231 Assert(reln->md_num_open_segs[forknum] == 0);
232
233 /*
234 * We may be using the target table space for the first time in this
235 * database, so create a per-database subdirectory if needed.
236 *
237 * XXX this is a fairly ugly violation of module layering, but this seems
238 * to be the best place to put the check. Maybe TablespaceCreateDbspace
239 * should be here and not in commands/tablespace.c? But that would imply
240 * importing a lot of stuff that smgr.c oughtn't know, either.
241 */
242 TablespaceCreateDbspace(reln->smgr_rlocator.locator.spcOid,
243 reln->smgr_rlocator.locator.dbOid,
244 isRedo);
245
246 path = relpath(reln->smgr_rlocator, forknum);
247
249
250 if (fd < 0)
251 {
252 int save_errno = errno;
253
254 if (isRedo)
256 if (fd < 0)
257 {
258 /* be sure to report the error reported by create, not open */
262 errmsg("could not create file \"%s\": %m", path.str)));
263 }
264 }
265
266 _fdvec_resize(reln, forknum, 1);
267 mdfd = &reln->md_seg_fds[forknum][0];
268 mdfd->mdfd_vfd = fd;
269 mdfd->mdfd_segno = 0;
270
271 if (!SmgrIsTemp(reln))
273}

References _fdvec_resize(), _mdfd_open_flags(), Assert, ereport, errcode_for_file_access(), errmsg(), ERROR, fb(), fd(), mdfd(), PathNameOpenFile(), register_dirty_segment(), relpath, SmgrIsTemp, RelPathStr::str, and TablespaceCreateDbspace().

◆ mdexists()

bool mdexists ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 203 of file md.c.

204{
205 /*
206 * Close it first, to ensure that we notice if the fork has been unlinked
207 * since we opened it. As an optimization, we can skip that in recovery,
208 * which already closes relations when dropping them.
209 */
210 if (!InRecovery)
211 mdclose(reln, forknum);
212
213 return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL);
214}

References EXTENSION_RETURN_NULL, fb(), InRecovery, mdclose(), and mdopenfork().

◆ mdextend()

void mdextend ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
const void * buffer,
bool  skipFsync 
)

Definition at line 487 of file md.c.

489{
490 pgoff_t seekpos;
491 int nbytes;
492 MdfdVec *v;
493
494 /* If this build supports direct I/O, the buffer must be I/O aligned. */
495 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
496 Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
497
498 /* This assert is too expensive to have on normally ... */
499#ifdef CHECK_WRITE_VS_EXTEND
500 Assert(blocknum >= mdnblocks(reln, forknum));
501#endif
502
503 /*
504 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
505 * more --- we mustn't create a block whose number actually is
506 * InvalidBlockNumber. (Note that this failure should be unreachable
507 * because of upstream checks in bufmgr.c.)
508 */
509 if (blocknum == InvalidBlockNumber)
512 errmsg("cannot extend file \"%s\" beyond %u blocks",
513 relpath(reln->smgr_rlocator, forknum).str,
515
516 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
517
518 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
519
520 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
521
522 if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
523 {
524 if (nbytes < 0)
527 errmsg("could not extend file \"%s\": %m",
529 errhint("Check free disk space.")));
530 /* short write: complain appropriately */
533 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
535 nbytes, BLCKSZ, blocknum),
536 errhint("Check free disk space.")));
537 }
538
539 if (!skipFsync && !SmgrIsTemp(reln))
540 register_dirty_segment(reln, forknum, v);
541
542 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
543}

References _mdfd_getseg(), _mdnblocks(), Assert, ereport, errcode(), errcode_for_file_access(), errhint(), errmsg(), ERROR, EXTENSION_CREATE, fb(), FilePathName(), FileWrite(), InvalidBlockNumber, _MdfdVec::mdfd_vfd, mdnblocks(), PG_IO_ALIGN_SIZE, PG_O_DIRECT, register_dirty_segment(), relpath, SmgrIsTemp, and TYPEALIGN.

Referenced by _mdfd_getseg().

◆ mdfd()

int mdfd ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
uint32 * off 
)

Definition at line 1494 of file md.c.

1495{
1496 MdfdVec *v = mdopenfork(reln, forknum, EXTENSION_FAIL);
1497
1498 v = _mdfd_getseg(reln, forknum, blocknum, false,
1499 EXTENSION_FAIL);
1500
1501 *off = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1502
1503 Assert(*off < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1504
1505 return FileGetRawDesc(v->mdfd_vfd);
1506}

References _mdfd_getseg(), Assert, EXTENSION_FAIL, fb(), FileGetRawDesc(), _MdfdVec::mdfd_vfd, and mdopenfork().

Referenced by mdcreate(), and mdopenfork().

◆ mdfiletagmatches()

bool mdfiletagmatches ( const FileTag * ftag,
const FileTag * candidate 
)

Definition at line 1974 of file md.c.

1975{
1976 /*
1977 * For now we only use filter requests as a way to drop all scheduled
1978 * callbacks relating to a given database, when dropping the database.
1979 * We'll return true for all candidates that have the same database OID as
1980 * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
1981 */
1982 return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
1983}

References RelFileLocator::dbOid, fb(), and FileTag::rlocator.

◆ mdimmedsync()

void mdimmedsync ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 1441 of file md.c.

1442{
1443 int segno;
1444 int min_inactive_seg;
1445
1446 /*
1447 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1448 * the loop below will get them all!
1449 */
1450 mdnblocks(reln, forknum);
1451
1452 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1453
1454 /*
1455 * Temporarily open inactive segments, then close them after sync. There
1456 * may be some inactive segments left opened after fsync() error, but that
1457 * is harmless. We don't bother to clean them up and take a risk of
1458 * further trouble. The next mdclose() will soon close them.
1459 */
1460 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1461 segno++;
1462
1463 while (segno > 0)
1464 {
1465 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1466
1467 /*
1468 * fsyncs done through mdimmedsync() should be tracked in a separate
1469 * IOContext than those done through mdsyncfiletag() to differentiate
1470 * between unavoidable client backend fsyncs (e.g. those done during
1471 * index build) and those which ideally would have been done by the
1472 * checkpointer. Since other IO operations bypassing the buffer
1473 * manager could also be tracked in such an IOContext, wait until
1474 * these are also tracked to track immediate fsyncs.
1475 */
1479 errmsg("could not fsync file \"%s\": %m",
1480 FilePathName(v->mdfd_vfd))));
1481
1482 /* Close inactive segments immediately */
1483 if (segno > min_inactive_seg)
1484 {
1485 FileClose(v->mdfd_vfd);
1486 _fdvec_resize(reln, forknum, segno - 1);
1487 }
1488
1489 segno--;
1490 }
1491}

References _fdvec_resize(), _mdfd_openseg(), data_sync_elevel(), ereport, errcode_for_file_access(), errmsg(), ERROR, fb(), FileClose(), FilePathName(), FileSync(), _MdfdVec::mdfd_vfd, and mdnblocks().

◆ mdinit()

void mdinit ( void  )

Definition at line 190 of file md.c.

References ALLOCSET_DEFAULT_SIZES, AllocSetContextCreate, MdCxt, and TopMemoryContext.

◆ mdmaxcombine()

uint32 mdmaxcombine ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum 
)

Definition at line 844 of file md.c.

846{
847 BlockNumber segoff;
848
849 segoff = blocknum % ((BlockNumber) RELSEG_SIZE);
850
851 return RELSEG_SIZE - segoff;
852}

References fb().

◆ mdnblocks()

BlockNumber mdnblocks ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 1234 of file md.c.

1235{
1236 MdfdVec *v;
1237 BlockNumber nblocks;
1238 BlockNumber segno;
1239
1240 mdopenfork(reln, forknum, EXTENSION_FAIL);
1241
1242 /* mdopen has opened the first segment */
1243 Assert(reln->md_num_open_segs[forknum] > 0);
1244
1245 /*
1246 * Start from the last open segments, to avoid redundant seeks. We have
1247 * previously verified that these segments are exactly RELSEG_SIZE long,
1248 * and it's useless to recheck that each time.
1249 *
1250 * NOTE: this assumption could only be wrong if another backend has
1251 * truncated the relation. We rely on higher code levels to handle that
1252 * scenario by closing and re-opening the md fd, which is handled via
1253 * relcache flush. (Since the checkpointer doesn't participate in
1254 * relcache flush, it could have segment entries for inactive segments;
1255 * that's OK because the checkpointer never needs to compute relation
1256 * size.)
1257 */
1258 segno = reln->md_num_open_segs[forknum] - 1;
1259 v = &reln->md_seg_fds[forknum][segno];
1260
1261 for (;;)
1262 {
1263 nblocks = _mdnblocks(reln, forknum, v);
1264 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1265 elog(FATAL, "segment too big");
1266 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1267 return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
1268
1269 /*
1270 * If segment is exactly RELSEG_SIZE, advance to next one.
1271 */
1272 segno++;
1273
1274 /*
1275 * We used to pass O_CREAT here, but that has the disadvantage that it
1276 * might create a segment which has vanished through some operating
1277 * system misadventure. In such a case, creating the segment here
1278 * undermines _mdfd_getseg's attempts to notice and report an error
1279 * upon access to a missing segment.
1280 */
1281 v = _mdfd_openseg(reln, forknum, segno, 0);
1282 if (v == NULL)
1283 return segno * ((BlockNumber) RELSEG_SIZE);
1284 }
1285}

References _mdfd_openseg(), _mdnblocks(), Assert, elog, EXTENSION_FAIL, FATAL, fb(), and mdopenfork().

Referenced by mdextend(), mdimmedsync(), mdregistersync(), mdwritev(), and mdzeroextend().

◆ mdopen()

void mdopen ( SMgrRelation  reln)

Definition at line 713 of file md.c.

714{
715 /* mark it not open */
716 for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
717 reln->md_num_open_segs[forknum] = 0;
718}

References fb(), and MAX_FORKNUM.

◆ mdopenfork()

static MdfdVec * mdopenfork ( SMgrRelation  reln,
ForkNumber  forknum,
int  behavior 
)
static

Definition at line 675 of file md.c.

676{
677 MdfdVec *mdfd;
678 RelPathStr path;
679 File fd;
680
681 /* No work if already open */
682 if (reln->md_num_open_segs[forknum] > 0)
683 return &reln->md_seg_fds[forknum][0];
684
685 path = relpath(reln->smgr_rlocator, forknum);
686
688
689 if (fd < 0)
690 {
691 if ((behavior & EXTENSION_RETURN_NULL) &&
693 return NULL;
696 errmsg("could not open file \"%s\": %m", path.str)));
697 }
698
699 _fdvec_resize(reln, forknum, 1);
700 mdfd = &reln->md_seg_fds[forknum][0];
701 mdfd->mdfd_vfd = fd;
702 mdfd->mdfd_segno = 0;
703
705
706 return mdfd;
707}

References _fdvec_resize(), _mdfd_open_flags(), _mdnblocks(), Assert, ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_RETURN_NULL, fb(), fd(), FILE_POSSIBLY_DELETED, mdfd(), PathNameOpenFile(), relpath, and RelPathStr::str.

Referenced by _mdfd_getseg(), mdexists(), mdfd(), and mdnblocks().

◆ mdprefetch()

bool mdprefetch ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
int  nblocks 
)

Definition at line 747 of file md.c.

749{
750#ifdef USE_PREFETCH
751
753
754 if ((uint64) blocknum + nblocks > (uint64) MaxBlockNumber + 1)
755 return false;
756
757 while (nblocks > 0)
758 {
759 pgoff_t seekpos;
760 MdfdVec *v;
762
763 v = _mdfd_getseg(reln, forknum, blocknum, false,
765 if (v == NULL)
766 return false;
767
768 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
769
770 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
771
773 Min(nblocks,
774 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
775
778
779 blocknum += nblocks_this_segment;
780 nblocks -= nblocks_this_segment;
781 }
782#endif /* USE_PREFETCH */
783
784 return true;
785}

References _mdfd_getseg(), Assert, EXTENSION_FAIL, EXTENSION_RETURN_NULL, fb(), FilePrefetch(), InRecovery, IO_DIRECT_DATA, io_direct_flags, MaxBlockNumber, _MdfdVec::mdfd_vfd, and Min.

◆ mdreadv()

void mdreadv ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
void **  buffers,
BlockNumber  nblocks 
)

Definition at line 858 of file md.c.

860{
861 while (nblocks > 0)
862 {
863 struct iovec iov[PG_IOV_MAX];
864 int iovcnt;
865 pgoff_t seekpos;
866 int nbytes;
867 MdfdVec *v;
870 size_t size_this_segment;
871
872 v = _mdfd_getseg(reln, forknum, blocknum, false,
874
875 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
876
877 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
878
880 Min(nblocks,
881 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
883
884 if (nblocks_this_segment != nblocks)
885 elog(ERROR, "read crosses segment boundary");
886
890
891 /*
892 * Inner loop to continue after a short read. We'll keep going until
893 * we hit EOF rather than assuming that a short read means we hit the
894 * end.
895 */
896 for (;;)
897 {
898 TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
899 reln->smgr_rlocator.locator.spcOid,
900 reln->smgr_rlocator.locator.dbOid,
901 reln->smgr_rlocator.locator.relNumber,
902 reln->smgr_rlocator.backend);
903 nbytes = FileReadV(v->mdfd_vfd, iov, iovcnt, seekpos,
905 TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
906 reln->smgr_rlocator.locator.spcOid,
907 reln->smgr_rlocator.locator.dbOid,
908 reln->smgr_rlocator.locator.relNumber,
909 reln->smgr_rlocator.backend,
910 nbytes,
912
913#ifdef SIMULATE_SHORT_READ
914 nbytes = Min(nbytes, 4096);
915#endif
916
917 if (nbytes < 0)
920 errmsg("could not read blocks %u..%u in file \"%s\": %m",
921 blocknum,
922 blocknum + nblocks_this_segment - 1,
923 FilePathName(v->mdfd_vfd))));
924
925 if (nbytes == 0)
926 {
927 /*
928 * We are at or past EOF, or we read a partial block at EOF.
929 * Normally this is an error; upper levels should never try to
930 * read a nonexistent block. However, if zero_damaged_pages
931 * is ON or we are InRecovery, we should instead return zeroes
932 * without complaining. This allows, for example, the case of
933 * trying to update a block that was later truncated away.
934 *
935 * NB: We think that this codepath is unreachable in recovery
936 * and incomplete with zero_damaged_pages, as missing segments
937 * are not created. Putting blocks into the buffer-pool that
938 * do not exist on disk is rather problematic, as it will not
939 * be found by scans that rely on smgrnblocks(), as they are
940 * beyond EOF. It also can cause weird problems with relation
941 * extension, as relation extension does not expect blocks
942 * beyond EOF to exist.
943 *
944 * Therefore we do not want to copy the logic into
945 * mdstartreadv(), where it would have to be more complicated
946 * due to potential differences in the zero_damaged_pages
947 * setting between the definer and completor of IO.
948 *
949 * For PG 18, we are putting an Assert(false) in mdreadv()
950 * (triggering failures in assertion-enabled builds, but
951 * continuing to work in production builds). Afterwards we
952 * plan to remove this code entirely.
953 */
955 {
956 Assert(false); /* see comment above */
957
960 ++i)
961 memset(buffers[i], 0, BLCKSZ);
962 break;
963 }
964 else
967 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
968 blocknum,
969 blocknum + nblocks_this_segment - 1,
973 }
974
975 /* One loop should usually be enough. */
976 transferred_this_segment += nbytes;
979 break;
980
981 /* Adjust position and vectors after a short read. */
982 seekpos += nbytes;
984 }
985
986 nblocks -= nblocks_this_segment;
987 buffers += nblocks_this_segment;
988 blocknum += nblocks_this_segment;
989 }
990}

References _mdfd_getseg(), Assert, buffers_to_iovec(), compute_remaining_iovec(), elog, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_CREATE_RECOVERY, EXTENSION_FAIL, fb(), FilePathName(), FileReadV(), i, InRecovery, lengthof, _MdfdVec::mdfd_vfd, Min, PG_IOV_MAX, and zero_damaged_pages.

◆ mdregistersync()

void mdregistersync ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 1390 of file md.c.

1391{
1392 int segno;
1393 int min_inactive_seg;
1394
1395 /*
1396 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1397 * the loop below will get them all!
1398 */
1399 mdnblocks(reln, forknum);
1400
1401 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1402
1403 /*
1404 * Temporarily open inactive segments, then close them after sync. There
1405 * may be some inactive segments left opened after error, but that is
1406 * harmless. We don't bother to clean them up and take a risk of further
1407 * trouble. The next mdclose() will soon close them.
1408 */
1409 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1410 segno++;
1411
1412 while (segno > 0)
1413 {
1414 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1415
1416 register_dirty_segment(reln, forknum, v);
1417
1418 /* Close inactive segments immediately */
1419 if (segno > min_inactive_seg)
1420 {
1421 FileClose(v->mdfd_vfd);
1422 _fdvec_resize(reln, forknum, segno - 1);
1423 }
1424
1425 segno--;
1426 }
1427}

References _fdvec_resize(), _mdfd_openseg(), fb(), FileClose(), _MdfdVec::mdfd_vfd, mdnblocks(), and register_dirty_segment().

◆ mdstartreadv()

void mdstartreadv ( PgAioHandle *  ioh,
SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
void **  buffers,
BlockNumber  nblocks 
)

Definition at line 996 of file md.c.

999{
1000 pgoff_t seekpos;
1001 MdfdVec *v;
1003 struct iovec *iov;
1004 int iovcnt;
1005 int ret;
1006
1007 v = _mdfd_getseg(reln, forknum, blocknum, false,
1009
1010 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1011
1012 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1013
1015 Min(nblocks,
1016 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
1017
1018 if (nblocks_this_segment != nblocks)
1019 elog(ERROR, "read crossing segment boundary");
1020
1022
1023 Assert(nblocks <= iovcnt);
1024
1026
1028
1031
1033 reln,
1034 forknum,
1035 blocknum,
1036 nblocks,
1037 false);
1039
1041 if (ret != 0)
1042 ereport(ERROR,
1044 errmsg("could not start reading blocks %u..%u in file \"%s\": %m",
1045 blocknum,
1046 blocknum + nblocks_this_segment - 1,
1047 FilePathName(v->mdfd_vfd))));
1048
1049 /*
1050 * The error checks corresponding to the post-read checks in mdreadv() are
1051 * in md_readv_complete().
1052 *
1053 * However we chose, at least for now, to not implement the
1054 * zero_damaged_pages logic present in mdreadv(). As outlined in mdreadv()
1055 * that logic is rather problematic, and we want to get rid of it. Here
1056 * equivalent logic would have to be more complicated due to potential
1057 * differences in the zero_damaged_pages setting between the definer and
1058 * completor of IO.
1059 */
1060}

References _mdfd_getseg(), Assert, buffers_to_iovec(), elog, ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_CREATE_RECOVERY, EXTENSION_FAIL, fb(), FilePathName(), FileStartReadV(), IO_DIRECT_DATA, io_direct_flags, _MdfdVec::mdfd_vfd, Min, PGAIO_HCB_MD_READV, PGAIO_HF_BUFFERED, pgaio_io_get_iovec(), pgaio_io_register_callbacks(), pgaio_io_set_flag(), and pgaio_io_set_target_smgr().

◆ mdsyncfiletag()

int mdsyncfiletag ( const FileTag *  ftag,
char *  path 
)

Definition at line 1904 of file md.c.

1905{
1907 File file;
1909 bool need_to_close;
1910 int result,
1911 save_errno;
1912
1913 /* See if we already have the file open, or need to open it. */
1914 if (ftag->segno < reln->md_num_open_segs[ftag->forknum])
1915 {
1916 file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd;
1917 strlcpy(path, FilePathName(file), MAXPGPATH);
1918 need_to_close = false;
1919 }
1920 else
1921 {
1922 MdPathStr p;
1923
1924 p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
1925 strlcpy(path, p.str, MD_PATH_STR_MAXLEN);
1926
1927 file = PathNameOpenFile(path, _mdfd_open_flags());
1928 if (file < 0)
1929 return -1;
1930 need_to_close = true;
1931 }
1932
1934
1935 /* Sync the file. */
1936 result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
1937 save_errno = errno;
1938
1939 if (need_to_close)
1940 FileClose(file);
1941
1943 IOOP_FSYNC, io_start, 1, 0);
1944
1945 errno = save_errno;
1946 return result;
1947}

References _mdfd_open_flags(), _mdfd_segpath(), fb(), FileClose(), FilePathName(), FileSync(), FileTag::forknum, INVALID_PROC_NUMBER, IOCONTEXT_NORMAL, IOOBJECT_RELATION, IOOP_FSYNC, MAXPGPATH, MD_PATH_STR_MAXLEN, PathNameOpenFile(), pgstat_count_io_op_time(), pgstat_prepare_io_time(), FileTag::rlocator, FileTag::segno, smgropen(), MdPathStr::str, strlcpy(), and track_io_timing.

◆ mdtruncate()

void mdtruncate ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  curnblk,
BlockNumber  nblocks 
)

Definition at line 1301 of file md.c.

1303{
1305 int curopensegs;
1306
1307 if (nblocks > curnblk)
1308 {
1309 /* Bogus request ... but no complaint if InRecovery */
1310 if (InRecovery)
1311 return;
1312 ereport(ERROR,
1313 (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
1314 relpath(reln->smgr_rlocator, forknum).str,
1315 nblocks, curnblk)));
1316 }
1317 if (nblocks == curnblk)
1318 return; /* no work */
1319
1320 /*
1321 * Truncate segments, starting at the last one. Starting at the end makes
1322 * managing the memory for the fd array easier, should there be errors.
1323 */
1324 curopensegs = reln->md_num_open_segs[forknum];
1325 while (curopensegs > 0)
1326 {
1327 MdfdVec *v;
1328
1330
1331 v = &reln->md_seg_fds[forknum][curopensegs - 1];
1332
1333 if (priorblocks > nblocks)
1334 {
1335 /*
1336 * This segment is no longer active. We truncate the file, but do
1337 * not delete it, for reasons explained in the header comments.
1338 */
1340 ereport(ERROR,
1342 errmsg("could not truncate file \"%s\": %m",
1343 FilePathName(v->mdfd_vfd))));
1344
1345 if (!SmgrIsTemp(reln))
1346 register_dirty_segment(reln, forknum, v);
1347
1348 /* we never drop the 1st segment */
1349 Assert(v != &reln->md_seg_fds[forknum][0]);
1350
1351 FileClose(v->mdfd_vfd);
1352 _fdvec_resize(reln, forknum, curopensegs - 1);
1353 }
1354 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
1355 {
1356 /*
1357 * This is the last segment we want to keep. Truncate the file to
1358 * the right length. NOTE: if nblocks is exactly a multiple K of
1359 * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
1360 * keep it. This adheres to the invariant given in the header
1361 * comments.
1362 */
1364
1366 ereport(ERROR,
1368 errmsg("could not truncate file \"%s\" to %u blocks: %m",
1370 nblocks)));
1371 if (!SmgrIsTemp(reln))
1372 register_dirty_segment(reln, forknum, v);
1373 }
1374 else
1375 {
1376 /*
1377 * We still need this segment, so nothing to do for this and any
1378 * earlier segment.
1379 */
1380 break;
1381 }
1382 curopensegs--;
1383 }
1384}

References _fdvec_resize(), Assert, ereport, errcode_for_file_access(), errmsg(), ERROR, fb(), FileClose(), FilePathName(), FileTruncate(), InRecovery, _MdfdVec::mdfd_vfd, register_dirty_segment(), relpath, and SmgrIsTemp.

◆ mdunlink()

void mdunlink ( RelFileLocatorBackend  rlocator,
ForkNumber  forknum,
bool  isRedo 
)

Definition at line 337 of file md.c.

338{
339 /* Now do the per-fork work */
340 if (forknum == InvalidForkNumber)
341 {
342 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
343 mdunlinkfork(rlocator, forknum, isRedo);
344 }
345 else
346 mdunlinkfork(rlocator, forknum, isRedo);
347}

References fb(), InvalidForkNumber, MAX_FORKNUM, and mdunlinkfork().

◆ mdunlinkfiletag()

int mdunlinkfiletag ( const FileTag *  ftag,
char *  path 
)

Definition at line 1956 of file md.c.

1957{
1958 RelPathStr p;
1959
1960 /* Compute the path. */
1961 p = relpathperm(ftag->rlocator, MAIN_FORKNUM);
1962 strlcpy(path, p.str, MAXPGPATH);
1963
1964 /* Try to unlink the file. */
1965 return unlink(path);
1966}

References fb(), MAIN_FORKNUM, MAXPGPATH, relpathperm, FileTag::rlocator, RelPathStr::str, and strlcpy().

◆ mdunlinkfork()

static void mdunlinkfork ( RelFileLocatorBackend  rlocator,
ForkNumber  forknum,
bool  isRedo 
)
static

Definition at line 374 of file md.c.

375{
376 RelPathStr path;
377 int ret;
378 int save_errno;
379
380 path = relpath(rlocator, forknum);
381
382 /*
383 * Truncate and then unlink the first segment, or just register a request
384 * to unlink it later, as described in the comments for mdunlink().
385 */
386 if (isRedo || IsBinaryUpgrade || forknum != MAIN_FORKNUM ||
388 {
389 if (!RelFileLocatorBackendIsTemp(rlocator))
390 {
391 /* Prevent other backends' fds from holding on to the disk space */
392 ret = do_truncate(path.str);
393
394 /* Forget any pending sync requests for the first segment */
396 register_forget_request(rlocator, forknum, 0 /* first seg */ );
398 }
399 else
400 ret = 0;
401
402 /* Next unlink the file, unless it was already found to be missing */
403 if (ret >= 0 || errno != ENOENT)
404 {
405 ret = unlink(path.str);
406 if (ret < 0 && errno != ENOENT)
407 {
411 errmsg("could not remove file \"%s\": %m", path.str)));
413 }
414 }
415 }
416 else
417 {
418 /* Prevent other backends' fds from holding on to the disk space */
419 ret = do_truncate(path.str);
420
421 /* Register request to unlink first segment later */
423 register_unlink_segment(rlocator, forknum, 0 /* first seg */ );
425 }
426
427 /*
428 * Delete any additional segments.
429 *
430 * Note that because we loop until getting ENOENT, we will correctly
431 * remove all inactive segments as well as active ones. Ideally we'd
432 * continue the loop until getting exactly that errno, but that risks an
433 * infinite loop if the problem is directory-wide (for instance, if we
434 * suddenly can't read the data directory itself). We compromise by
435 * continuing after a non-ENOENT truncate error, but stopping after any
436 * unlink error. If there is indeed a directory-wide problem, additional
437 * unlink attempts wouldn't work anyway.
438 */
439 if (ret >= 0 || errno != ENOENT)
440 {
442 BlockNumber segno;
443
444 for (segno = 1;; segno++)
445 {
446 sprintf(segpath.str, "%s.%u", path.str, segno);
447
448 if (!RelFileLocatorBackendIsTemp(rlocator))
449 {
450 /*
451 * Prevent other backends' fds from holding on to the disk
452 * space. We're done if we see ENOENT, though.
453 */
454 if (do_truncate(segpath.str) < 0 && errno == ENOENT)
455 break;
456
457 /*
458 * Forget any pending sync requests for this segment before we
459 * try to unlink.
460 */
461 register_forget_request(rlocator, forknum, segno);
462 }
463
464 if (unlink(segpath.str) < 0)
465 {
466 /* ENOENT is expected after the last segment... */
467 if (errno != ENOENT)
470 errmsg("could not remove file \"%s\": %m", segpath.str)));
471 break;
472 }
473 }
474 }
475}

References do_truncate(), ereport, errcode_for_file_access(), errmsg(), fb(), IsBinaryUpgrade, MAIN_FORKNUM, register_forget_request(), register_unlink_segment(), RelFileLocatorBackendIsTemp, relpath, sprintf, RelPathStr::str, and WARNING.

Referenced by mdunlink().

◆ mdwriteback()

void mdwriteback ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
BlockNumber  nblocks 
)

Definition at line 1175 of file md.c.

1177{
1179
1180 /*
1181 * Issue flush requests in as few requests as possible; have to split at
1182 * segment boundaries though, since those are actually separate files.
1183 */
1184 while (nblocks > 0)
1185 {
1186 BlockNumber nflush = nblocks;
1187 pgoff_t seekpos;
1188 MdfdVec *v;
1189 int segnum_start,
1190 segnum_end;
1191
1192 v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
1194
1195 /*
1196 * We might be flushing buffers of already removed relations, that's
1197 * ok, just ignore that case. If the segment file wasn't open already
1198 * (ie from a recent mdwrite()), then we don't want to re-open it, to
1199 * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave
1200 * us with a descriptor to a file that is about to be unlinked.
1201 */
1202 if (!v)
1203 return;
1204
1205 /* compute offset inside the current segment */
1206 segnum_start = blocknum / RELSEG_SIZE;
1207
1208 /* compute number of desired writes within the current segment */
1209 segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
1210 if (segnum_start != segnum_end)
1211 nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
1212
1213 Assert(nflush >= 1);
1214 Assert(nflush <= nblocks);
1215
1216 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1217
1219
1220 nblocks -= nflush;
1221 blocknum += nflush;
1222 }
1223}

References _mdfd_getseg(), Assert, EXTENSION_DONT_OPEN, fb(), FileWriteback(), IO_DIRECT_DATA, io_direct_flags, and _MdfdVec::mdfd_vfd.

◆ mdwritev()

void mdwritev ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
const void **  buffers,
BlockNumber  nblocks,
bool  skipFsync 
)

Definition at line 1070 of file md.c.

1072{
1073 /* This assert is too expensive to have on normally ... */
1074#ifdef CHECK_WRITE_VS_EXTEND
1075 Assert((uint64) blocknum + (uint64) nblocks <= (uint64) mdnblocks(reln, forknum));
1076#endif
1077
1078 while (nblocks > 0)
1079 {
1080 struct iovec iov[PG_IOV_MAX];
1081 int iovcnt;
1082 pgoff_t seekpos;
1083 int nbytes;
1084 MdfdVec *v;
1087 size_t size_this_segment;
1088
1089 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
1091
1092 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1093
1094 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1095
1097 Min(nblocks,
1098 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
1100
1101 if (nblocks_this_segment != nblocks)
1102 elog(ERROR, "write crosses segment boundary");
1103
1104 iovcnt = buffers_to_iovec(iov, (void **) buffers, nblocks_this_segment);
1107
1108 /*
1109 * Inner loop to continue after a short write. If the reason is that
1110 * we're out of disk space, a future attempt should get an ENOSPC
1111 * error from the kernel.
1112 */
1113 for (;;)
1114 {
1115 TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
1116 reln->smgr_rlocator.locator.spcOid,
1117 reln->smgr_rlocator.locator.dbOid,
1118 reln->smgr_rlocator.locator.relNumber,
1119 reln->smgr_rlocator.backend);
1120 nbytes = FileWriteV(v->mdfd_vfd, iov, iovcnt, seekpos,
1122 TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
1123 reln->smgr_rlocator.locator.spcOid,
1124 reln->smgr_rlocator.locator.dbOid,
1125 reln->smgr_rlocator.locator.relNumber,
1126 reln->smgr_rlocator.backend,
1127 nbytes,
1129
1130#ifdef SIMULATE_SHORT_WRITE
1131 nbytes = Min(nbytes, 4096);
1132#endif
1133
1134 if (nbytes < 0)
1135 {
1136 bool enospc = errno == ENOSPC;
1137
1138 ereport(ERROR,
1140 errmsg("could not write blocks %u..%u in file \"%s\": %m",
1141 blocknum,
1142 blocknum + nblocks_this_segment - 1,
1144 enospc ? errhint("Check free disk space.") : 0));
1145 }
1146
1147 /* One loop should usually be enough. */
1148 transferred_this_segment += nbytes;
1151 break;
1152
1153 /* Adjust position and iovecs after a short write. */
1154 seekpos += nbytes;
1156 }
1157
1158 if (!skipFsync && !SmgrIsTemp(reln))
1159 register_dirty_segment(reln, forknum, v);
1160
1161 nblocks -= nblocks_this_segment;
1162 buffers += nblocks_this_segment;
1163 blocknum += nblocks_this_segment;
1164 }
1165}

References _mdfd_getseg(), Assert, buffers_to_iovec(), compute_remaining_iovec(), elog, ereport, errcode_for_file_access(), errhint(), errmsg(), ERROR, EXTENSION_CREATE_RECOVERY, EXTENSION_FAIL, fb(), FilePathName(), FileWriteV(), lengthof, _MdfdVec::mdfd_vfd, mdnblocks(), Min, PG_IOV_MAX, register_dirty_segment(), and SmgrIsTemp.

◆ mdzeroextend()

void mdzeroextend ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
int  nblocks,
bool  skipFsync 
)

Definition at line 552 of file md.c.

554{
555 MdfdVec *v;
556 BlockNumber curblocknum = blocknum;
557 int remblocks = nblocks;
558
559 Assert(nblocks > 0);
560
561 /* This assert is too expensive to have on normally ... */
562#ifdef CHECK_WRITE_VS_EXTEND
563 Assert(blocknum >= mdnblocks(reln, forknum));
564#endif
565
566 /*
567 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
568 * more --- we mustn't create a block whose number actually is
569 * InvalidBlockNumber or larger.
570 */
571 if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
574 errmsg("cannot extend file \"%s\" beyond %u blocks",
575 relpath(reln->smgr_rlocator, forknum).str,
577
578 while (remblocks > 0)
579 {
581 pgoff_t seekpos = (pgoff_t) BLCKSZ * segstartblock;
582 int numblocks;
583
586 else
588
590
593
594 /*
595 * If available and useful, use posix_fallocate() (via
596 * FileFallocate()) to extend the relation. That's often more
597 * efficient than using write(), as it commonly won't cause the kernel
598 * to allocate page cache space for the extended pages.
599 *
600 * However, we don't use FileFallocate() for small extensions, as it
601 * defeats delayed allocation on some filesystems. Not clear where
602 * that decision should be made though? For now just use a cutoff of
603 * 8, anything between 4 and 8 worked OK in some local testing.
604 */
605 if (numblocks > 8 &&
607 {
608 int ret = 0;
609
610#ifdef HAVE_POSIX_FALLOCATE
612 {
613 ret = FileFallocate(v->mdfd_vfd,
614 seekpos, (pgoff_t) BLCKSZ * numblocks,
616 }
617 else
618#endif
619 {
620 elog(ERROR, "unsupported file_extend_method: %d",
622 }
623 if (ret != 0)
624 {
627 errmsg("could not extend file \"%s\" with FileFallocate(): %m",
629 errhint("Check free disk space."));
630 }
631 }
632 else
633 {
634 int ret;
635
636 /*
637 * Even if we don't want to use fallocate, we can still extend a
638 * bit more efficiently than writing each 8kB block individually.
639 * pg_pwrite_zeros() (via FileZero()) uses pg_pwritev_with_retry()
640 * to avoid multiple writes or needing a zeroed buffer for the
641 * whole length of the extension.
642 */
643 ret = FileZero(v->mdfd_vfd,
644 seekpos, (pgoff_t) BLCKSZ * numblocks,
646 if (ret < 0)
649 errmsg("could not extend file \"%s\": %m",
651 errhint("Check free disk space."));
652 }
653
654 if (!skipFsync && !SmgrIsTemp(reln))
655 register_dirty_segment(reln, forknum, v);
656
657 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
658
661 }
662}

References _mdfd_getseg(), _mdnblocks(), Assert, elog, ereport, errcode(), errcode_for_file_access(), errhint(), errmsg(), ERROR, EXTENSION_CREATE, fb(), file_extend_method, FILE_EXTEND_METHOD_WRITE_ZEROS, FileFallocate(), FilePathName(), FileZero(), InvalidBlockNumber, _MdfdVec::mdfd_vfd, mdnblocks(), register_dirty_segment(), relpath, and SmgrIsTemp.

◆ register_dirty_segment()

static void register_dirty_segment ( SMgrRelation  reln,
ForkNumber  forknum,
MdfdVec *  seg 
)
static

Definition at line 1518 of file md.c.

1519{
1520 FileTag tag;
1521
1522 INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno);
1523
1524 /* Temp relations should never be fsync'd */
1526
1527 if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
1528 {
1530
1532 (errmsg_internal("could not forward fsync request because request queue is full")));
1533
1535
1539 errmsg("could not fsync file \"%s\": %m",
1540 FilePathName(seg->mdfd_vfd))));
1541
1542 /*
1543 * We have no way of knowing if the current IOContext is
1544 * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
1545 * point, so count the fsync as being in the IOCONTEXT_NORMAL
1546 * IOContext. This is probably okay, because the number of backend
1547 * fsyncs doesn't say anything about the efficacy of the
1548 * BufferAccessStrategy. And counting both fsyncs done in
1549 * IOCONTEXT_NORMAL and IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] under
1550 * IOCONTEXT_NORMAL is likely clearer when investigating the number of
1551 * backend fsyncs.
1552 */
1554 IOOP_FSYNC, io_start, 1, 0);
1555 }
1556}

References Assert, data_sync_elevel(), DEBUG1, ereport, errcode_for_file_access(), errmsg(), errmsg_internal(), ERROR, fb(), FilePathName(), FileSync(), INIT_MD_FILETAG, IOCONTEXT_NORMAL, IOOBJECT_RELATION, IOOP_FSYNC, _MdfdVec::mdfd_segno, _MdfdVec::mdfd_vfd, pgstat_count_io_op_time(), pgstat_prepare_io_time(), RegisterSyncRequest(), SmgrIsTemp, SYNC_REQUEST, and track_io_timing.

Referenced by mdcreate(), mdextend(), mdregistersync(), mdtruncate(), mdwritev(), and mdzeroextend().

◆ register_forget_request()

static void register_forget_request ( RelFileLocatorBackend  rlocator,
ForkNumber  forknum,
BlockNumber  segno 
)
static

Definition at line 1579 of file md.c.

1581{
1582 FileTag tag;
1583
1584 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1585
1586 RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
1587}

References INIT_MD_FILETAG, RelFileLocatorBackend::locator, RegisterSyncRequest(), and SYNC_FORGET_REQUEST.

Referenced by mdunlinkfork().

◆ register_unlink_segment()

static void register_unlink_segment ( RelFileLocatorBackend  rlocator,
ForkNumber  forknum,
BlockNumber  segno 
)
static

Definition at line 1562 of file md.c.

1564{
1565 FileTag tag;
1566
1567 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1568
1569 /* Should never be used with temp relations */
1571
1572 RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
1573}

References Assert, INIT_MD_FILETAG, RelFileLocatorBackend::locator, RegisterSyncRequest(), RelFileLocatorBackendIsTemp, and SYNC_UNLINK_REQUEST.

Referenced by mdunlinkfork().

◆ StaticAssertDecl()

StaticAssertDecl ( RELSEG_SIZE > 0 && RELSEG_SIZE <= INT_MAX,
"RELSEG_SIZE must fit in an integer" 
)

Variable Documentation

◆ aio_md_readv_cb

const PgAioHandleCallbacks aio_md_readv_cb
Initial value:
= {
.complete_shared = md_readv_complete,
.report = md_readv_report,
}

Definition at line 169 of file md.c.

169 {
170 .complete_shared = md_readv_complete,
171 .report = md_readv_report,
172};

◆ MdCxt

MemoryContext MdCxt
static

Definition at line 97 of file md.c.

Referenced by _fdvec_resize(), and mdinit().