PostgreSQL Source Code git master
Loading...
Searching...
No Matches
md.c File Reference
#include "postgres.h"
#include <limits.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>
#include "access/xlogutils.h"
#include "commands/tablespace.h"
#include "common/file_utils.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "storage/aio.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/md.h"
#include "storage/relfilelocator.h"
#include "storage/smgr.h"
#include "storage/sync.h"
#include "utils/memutils.h"
#include "utils/wait_event.h"
Include dependency graph for md.c:

Go to the source code of this file.

Data Structures

struct  _MdfdVec
 
struct  MdPathStr
 

Macros

#define INIT_MD_FILETAG(a, xx_rlocator, xx_forknum, xx_segno)
 
#define EXTENSION_FAIL   (1 << 0)
 
#define EXTENSION_RETURN_NULL   (1 << 1)
 
#define EXTENSION_CREATE   (1 << 2)
 
#define EXTENSION_CREATE_RECOVERY   (1 << 3)
 
#define EXTENSION_DONT_OPEN   (1 << 5)
 
#define SEGMENT_CHARS   OIDCHARS
 
#define MD_PATH_STR_MAXLEN
 

Typedefs

typedef struct _MdfdVec MdfdVec
 
typedef struct MdPathStr MdPathStr
 

Functions

 StaticAssertDecl (RELSEG_SIZE > 0 && RELSEG_SIZE <= INT_MAX, "RELSEG_SIZE must fit in an integer")
 
static void mdunlinkfork (RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
 
static MdfdVec * mdopenfork (SMgrRelation reln, ForkNumber forknum, int behavior)
 
static void register_dirty_segment (SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 
static void register_unlink_segment (RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
 
static void register_forget_request (RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
 
static void _fdvec_resize (SMgrRelation reln, ForkNumber forknum, int nseg)
 
static MdPathStr _mdfd_segpath (SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
 
static MdfdVec * _mdfd_openseg (SMgrRelation reln, ForkNumber forknum, BlockNumber segno, int oflags)
 
static MdfdVec * _mdfd_getseg (SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior)
 
static BlockNumber _mdnblocks (SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 
static PgAioResult md_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 
static void md_readv_report (PgAioResult result, const PgAioTargetData *td, int elevel)
 
static int _mdfd_open_flags (void)
 
void mdinit (void)
 
bool mdexists (SMgrRelation reln, ForkNumber forknum)
 
void mdcreate (SMgrRelation reln, ForkNumber forknum, bool isRedo)
 
void mdunlink (RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
 
static int do_truncate (const char *path)
 
void mdextend (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
 
void mdzeroextend (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
 
void mdopen (SMgrRelation reln)
 
void mdclose (SMgrRelation reln, ForkNumber forknum)
 
bool mdprefetch (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
 
static int buffers_to_iovec (struct iovec *iov, void **buffers, int nblocks)
 
uint32 mdmaxcombine (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 
void mdreadv (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
 
void mdstartreadv (PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
 
void mdwritev (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync)
 
void mdwriteback (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
 
BlockNumber mdnblocks (SMgrRelation reln, ForkNumber forknum)
 
void mdtruncate (SMgrRelation reln, ForkNumber forknum, BlockNumber curnblk, BlockNumber nblocks)
 
void mdregistersync (SMgrRelation reln, ForkNumber forknum)
 
void mdimmedsync (SMgrRelation reln, ForkNumber forknum)
 
int mdfd (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
 
void ForgetDatabaseSyncRequests (Oid dbid)
 
void DropRelationFiles (RelFileLocator *delrels, int ndelrels, bool isRedo)
 
int mdsyncfiletag (const FileTag *ftag, char *path)
 
int mdunlinkfiletag (const FileTag *ftag, char *path)
 
bool mdfiletagmatches (const FileTag *ftag, const FileTag *candidate)
 

Variables

static MemoryContext MdCxt
 
const PgAioHandleCallbacks aio_md_readv_cb
 

Macro Definition Documentation

◆ EXTENSION_CREATE

#define EXTENSION_CREATE   (1 << 2)

Definition at line 118 of file md.c.

◆ EXTENSION_CREATE_RECOVERY

#define EXTENSION_CREATE_RECOVERY   (1 << 3)

Definition at line 120 of file md.c.

◆ EXTENSION_DONT_OPEN

#define EXTENSION_DONT_OPEN   (1 << 5)

Definition at line 122 of file md.c.

◆ EXTENSION_FAIL

#define EXTENSION_FAIL   (1 << 0)

Definition at line 114 of file md.c.

◆ EXTENSION_RETURN_NULL

#define EXTENSION_RETURN_NULL   (1 << 1)

Definition at line 116 of file md.c.

◆ INIT_MD_FILETAG

#define INIT_MD_FILETAG (   a,
  xx_rlocator,
  xx_forknum,
  xx_segno 
)
Value:
( \
memset(&(a), 0, sizeof(FileTag)), \
(a).handler = SYNC_HANDLER_MD, \
(a).rlocator = (xx_rlocator), \
(a).forknum = (xx_forknum), \
(a).segno = (xx_segno) \
)
int a
Definition isn.c:73
static int fb(int x)
Definition sync.h:51
@ SYNC_HANDLER_MD
Definition sync.h:37

Definition at line 102 of file md.c.

139{
140 char str[MD_PATH_STR_MAXLEN + 1];
141} MdPathStr;
142
143
144/* local routines */
145static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum,
146 bool isRedo);
147static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior);
149 MdfdVec *seg);
150static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum,
151 BlockNumber segno);
152static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum,
153 BlockNumber segno);
155 ForkNumber forknum,
156 int nseg);
158 BlockNumber segno);
160 BlockNumber segno, int oflags);
162 BlockNumber blkno, bool skipFsync, int behavior);
164 MdfdVec *seg);
165
167static void md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel);
168
171 .report = md_readv_report,
172};
173
174
175static inline int
177{
178 int flags = O_RDWR | PG_BINARY;
179
181 flags |= PG_O_DIRECT;
182
183 return flags;
184}
185
186/*
187 * mdinit() -- Initialize private state for magnetic disk storage manager.
188 */
189void
190mdinit(void)
191{
193 "MdSmgr",
195}
196
197/*
198 * mdexists() -- Does the physical file exist?
199 *
200 * Note: this will return true for lingering files, with pending deletions
201 */
202bool
204{
205 /*
206 * Close it first, to ensure that we notice if the fork has been unlinked
207 * since we opened it. As an optimization, we can skip that in recovery,
208 * which already closes relations when dropping them.
209 */
210 if (!InRecovery)
211 mdclose(reln, forknum);
212
213 return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL);
214}
215
216/*
217 * mdcreate() -- Create a new relation on magnetic disk.
218 *
219 * If isRedo is true, it's okay for the relation to exist already.
220 */
221void
223{
224 MdfdVec *mdfd;
225 RelPathStr path;
226 File fd;
227
228 if (isRedo && reln->md_num_open_segs[forknum] > 0)
229 return; /* created and opened already... */
230
231 Assert(reln->md_num_open_segs[forknum] == 0);
232
233 /*
234 * We may be using the target table space for the first time in this
235 * database, so create a per-database subdirectory if needed.
236 *
237 * XXX this is a fairly ugly violation of module layering, but this seems
238 * to be the best place to put the check. Maybe TablespaceCreateDbspace
239 * should be here and not in commands/tablespace.c? But that would imply
240 * importing a lot of stuff that smgr.c oughtn't know, either.
241 */
242 TablespaceCreateDbspace(reln->smgr_rlocator.locator.spcOid,
243 reln->smgr_rlocator.locator.dbOid,
244 isRedo);
245
246 path = relpath(reln->smgr_rlocator, forknum);
247
249
250 if (fd < 0)
251 {
252 int save_errno = errno;
253
254 if (isRedo)
256 if (fd < 0)
257 {
258 /* be sure to report the error reported by create, not open */
262 errmsg("could not create file \"%s\": %m", path.str)));
263 }
264 }
265
266 _fdvec_resize(reln, forknum, 1);
267 mdfd = &reln->md_seg_fds[forknum][0];
268 mdfd->mdfd_vfd = fd;
269 mdfd->mdfd_segno = 0;
270
271 if (!SmgrIsTemp(reln))
273}
274
275/*
276 * mdunlink() -- Unlink a relation.
277 *
278 * Note that we're passed a RelFileLocatorBackend --- by the time this is called,
279 * there won't be an SMgrRelation hashtable entry anymore.
280 *
281 * forknum can be a fork number to delete a specific fork, or InvalidForkNumber
282 * to delete all forks.
283 *
284 * For regular relations, we don't unlink the first segment file of the rel,
285 * but just truncate it to zero length, and record a request to unlink it after
286 * the next checkpoint. Additional segments can be unlinked immediately,
287 * however. Leaving the empty file in place prevents that relfilenumber
288 * from being reused. The scenario this protects us from is:
289 * 1. We delete a relation (and commit, and actually remove its file).
290 * 2. We create a new relation, which by chance gets the same relfilenumber as
291 * the just-deleted one (OIDs must've wrapped around for that to happen).
292 * 3. We crash before another checkpoint occurs.
293 * During replay, we would delete the file and then recreate it, which is fine
294 * if the contents of the file were repopulated by subsequent WAL entries.
295 * But if we didn't WAL-log insertions, but instead relied on fsyncing the
296 * file after populating it (as we do at wal_level=minimal), the contents of
297 * the file would be lost forever. By leaving the empty file until after the
298 * next checkpoint, we prevent reassignment of the relfilenumber until it's
299 * safe, because relfilenumber assignment skips over any existing file.
300 *
301 * Additional segments, if any, are truncated and then unlinked. The reason
302 * for truncating is that other backends may still hold open FDs for these at
303 * the smgr level, so that the kernel can't remove the file yet. We want to
304 * reclaim the disk space right away despite that.
305 *
306 * We do not need to go through this dance for temp relations, though, because
307 * we never make WAL entries for temp rels, and so a temp rel poses no threat
308 * to the health of a regular rel that has taken over its relfilenumber.
309 * The fact that temp rels and regular rels have different file naming
310 * patterns provides additional safety. Other backends shouldn't have open
311 * FDs for them, either.
312 *
313 * We also don't do it while performing a binary upgrade. There is no reuse
314 * hazard in that case, since after a crash or even a simple ERROR, the
315 * upgrade fails and the whole cluster must be recreated from scratch.
316 * Furthermore, it is important to remove the files from disk immediately,
317 * because we may be about to reuse the same relfilenumber.
318 *
319 * All the above applies only to the relation's main fork; other forks can
320 * just be removed immediately, since they are not needed to prevent the
321 * relfilenumber from being recycled. Also, we do not carefully
322 * track whether other forks have been created or not, but just attempt to
323 * unlink them unconditionally; so we should never complain about ENOENT.
324 *
325 * If isRedo is true, it's unsurprising for the relation to be already gone.
326 * Also, we should remove the file immediately instead of queuing a request
327 * for later, since during redo there's no possibility of creating a
328 * conflicting relation.
329 *
330 * Note: we currently just never warn about ENOENT at all. We could warn in
331 * the main-fork, non-isRedo case, but it doesn't seem worth the trouble.
332 *
333 * Note: any failure should be reported as WARNING not ERROR, because
334 * we are usually not in a transaction anymore when this is called.
335 */
336void
337mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
338{
339 /* InvalidForkNumber means "every fork" (0..MAX_FORKNUM); any other value selects just that fork. */
340 if (forknum == InvalidForkNumber)
341 {
342 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
343 mdunlinkfork(rlocator, forknum, isRedo);
344 }
345 else
346 mdunlinkfork(rlocator, forknum, isRedo);
347}
348
349/*
350 * Truncate a file to release disk space.
351 */
352static int
353do_truncate(const char *path)
354{
355 int save_errno;
356 int ret;
357
358 ret = pg_truncate(path, 0);
359
360 /* Log a warning here to avoid repetition in callers. */
361 if (ret < 0 && errno != ENOENT)
362 {
366 errmsg("could not truncate file \"%s\": %m", path)));
368 }
369
370 return ret;
371}
372
373static void
375{
376 RelPathStr path;
377 int ret;
378 int save_errno;
379
380 path = relpath(rlocator, forknum);
381
382 /*
383 * Truncate and then unlink the first segment, or just register a request
384 * to unlink it later, as described in the comments for mdunlink().
385 */
386 if (isRedo || IsBinaryUpgrade || forknum != MAIN_FORKNUM ||
388 {
389 if (!RelFileLocatorBackendIsTemp(rlocator))
390 {
391 /* Prevent other backends' fds from holding on to the disk space */
392 ret = do_truncate(path.str);
393
394 /* Forget any pending sync requests for the first segment */
396 register_forget_request(rlocator, forknum, 0 /* first seg */ );
398 }
399 else
400 ret = 0;
401
402 /* Next unlink the file, unless it was already found to be missing */
403 if (ret >= 0 || errno != ENOENT)
404 {
405 ret = unlink(path.str);
406 if (ret < 0 && errno != ENOENT)
407 {
411 errmsg("could not remove file \"%s\": %m", path.str)));
413 }
414 }
415 }
416 else
417 {
418 /* Prevent other backends' fds from holding on to the disk space */
419 ret = do_truncate(path.str);
420
421 /* Register request to unlink first segment later */
423 register_unlink_segment(rlocator, forknum, 0 /* first seg */ );
425 }
426
427 /*
428 * Delete any additional segments.
429 *
430 * Note that because we loop until getting ENOENT, we will correctly
431 * remove all inactive segments as well as active ones. Ideally we'd
432 * continue the loop until getting exactly that errno, but that risks an
433 * infinite loop if the problem is directory-wide (for instance, if we
434 * suddenly can't read the data directory itself). We compromise by
435 * continuing after a non-ENOENT truncate error, but stopping after any
436 * unlink error. If there is indeed a directory-wide problem, additional
437 * unlink attempts wouldn't work anyway.
438 */
439 if (ret >= 0 || errno != ENOENT)
440 {
442 BlockNumber segno;
443
444 for (segno = 1;; segno++)
445 {
446 sprintf(segpath.str, "%s.%u", path.str, segno);
447
448 if (!RelFileLocatorBackendIsTemp(rlocator))
449 {
450 /*
451 * Prevent other backends' fds from holding on to the disk
452 * space. We're done if we see ENOENT, though.
453 */
454 if (do_truncate(segpath.str) < 0 && errno == ENOENT)
455 break;
456
457 /*
458 * Forget any pending sync requests for this segment before we
459 * try to unlink.
460 */
461 register_forget_request(rlocator, forknum, segno);
462 }
463
464 if (unlink(segpath.str) < 0)
465 {
466 /* ENOENT is expected after the last segment... */
467 if (errno != ENOENT)
470 errmsg("could not remove file \"%s\": %m", segpath.str)));
471 break;
472 }
473 }
474 }
475}
476
477/*
478 * mdextend() -- Add a block to the specified relation.
479 *
480 * The semantics are nearly the same as mdwritev(): write at the
481 * specified position. However, this is to be used for the case of
482 * extending a relation (i.e., blocknum is at or beyond the current
483 * EOF). Note that we assume writing a block beyond current EOF
484 * causes intervening file space to become filled with zeroes.
485 */
486void
488 const void *buffer, bool skipFsync)
489{
490 pgoff_t seekpos;
491 int nbytes;
492 MdfdVec *v;
493
494 /* If this build supports direct I/O, the buffer must be I/O aligned. */
495 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
496 Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
497
498 /* This assert is too expensive to have on normally ... */
499#ifdef CHECK_WRITE_VS_EXTEND
500 Assert(blocknum >= mdnblocks(reln, forknum));
501#endif
502
503 /*
504 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
505 * more --- we mustn't create a block whose number actually is
506 * InvalidBlockNumber. (Note that this failure should be unreachable
507 * because of upstream checks in bufmgr.c.)
508 */
509 if (blocknum == InvalidBlockNumber)
512 errmsg("cannot extend file \"%s\" beyond %u blocks",
513 relpath(reln->smgr_rlocator, forknum).str,
515
516 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
517
518 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
519
520 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
521
522 if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
523 {
524 if (nbytes < 0)
527 errmsg("could not extend file \"%s\": %m",
529 errhint("Check free disk space.")));
530 /* short write: complain appropriately */
533 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
535 nbytes, BLCKSZ, blocknum),
536 errhint("Check free disk space.")));
537 }
538
539 if (!skipFsync && !SmgrIsTemp(reln))
540 register_dirty_segment(reln, forknum, v);
541
542 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
543}
544
545/*
546 * mdzeroextend() -- Add new zeroed out blocks to the specified relation.
547 *
548 * Similar to mdextend(), except the relation can be extended by multiple
549 * blocks at once and the added blocks will be filled with zeroes.
550 */
551void
553 BlockNumber blocknum, int nblocks, bool skipFsync)
554{
555 MdfdVec *v;
556 BlockNumber curblocknum = blocknum;
557 int remblocks = nblocks;
558
559 Assert(nblocks > 0);
560
561 /* This assert is too expensive to have on normally ... */
562#ifdef CHECK_WRITE_VS_EXTEND
563 Assert(blocknum >= mdnblocks(reln, forknum));
564#endif
565
566 /*
567 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
568 * more --- we mustn't create a block whose number actually is
569 * InvalidBlockNumber or larger.
570 */
571 if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
574 errmsg("cannot extend file \"%s\" beyond %u blocks",
575 relpath(reln->smgr_rlocator, forknum).str,
577
578 while (remblocks > 0)
579 {
581 pgoff_t seekpos = (pgoff_t) BLCKSZ * segstartblock;
582 int numblocks;
583
586 else
588
590
593
594 /*
595 * If available and useful, use posix_fallocate() (via
596 * FileFallocate()) to extend the relation. That's often more
597 * efficient than using write(), as it commonly won't cause the kernel
598 * to allocate page cache space for the extended pages.
599 *
600 * However, we don't use FileFallocate() for small extensions, as it
601 * defeats delayed allocation on some filesystems. Not clear where
602 * that decision should be made though? For now just use a cutoff of
603 * 8, anything between 4 and 8 worked OK in some local testing.
604 */
605 if (numblocks > 8 &&
607 {
608 int ret = 0;
609
610#ifdef HAVE_POSIX_FALLOCATE
612 {
613 ret = FileFallocate(v->mdfd_vfd,
614 seekpos, (pgoff_t) BLCKSZ * numblocks,
616 }
617 else
618#endif
619 {
620 elog(ERROR, "unsupported file_extend_method: %d",
622 }
623 if (ret != 0)
624 {
627 errmsg("could not extend file \"%s\" with FileFallocate(): %m",
629 errhint("Check free disk space."));
630 }
631 }
632 else
633 {
634 int ret;
635
636 /*
637 * Even if we don't want to use fallocate, we can still extend a
638 * bit more efficiently than writing each 8kB block individually.
639 * pg_pwrite_zeros() (via FileZero()) uses pg_pwritev_with_retry()
640 * to avoid multiple writes or needing a zeroed buffer for the
641 * whole length of the extension.
642 */
643 ret = FileZero(v->mdfd_vfd,
644 seekpos, (pgoff_t) BLCKSZ * numblocks,
646 if (ret < 0)
649 errmsg("could not extend file \"%s\": %m",
651 errhint("Check free disk space."));
652 }
653
654 if (!skipFsync && !SmgrIsTemp(reln))
655 register_dirty_segment(reln, forknum, v);
656
657 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
658
661 }
662}
663
664/*
665 * mdopenfork() -- Open one fork of the specified relation.
666 *
667 * Note we only open the first segment, when there are multiple segments.
668 *
669 * If first segment is not present, either ereport or return NULL according
670 * to "behavior". We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
671 * EXTENSION_CREATE means it's OK to extend an existing relation, not to
672 * invent one out of whole cloth.
673 */
674static MdfdVec *
675mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
676{
677 MdfdVec *mdfd;
678 RelPathStr path;
679 File fd;
680
681 /* No work if already open */
682 if (reln->md_num_open_segs[forknum] > 0)
683 return &reln->md_seg_fds[forknum][0];
684
685 path = relpath(reln->smgr_rlocator, forknum);
686
688
689 if (fd < 0)
690 {
691 if ((behavior & EXTENSION_RETURN_NULL) &&
693 return NULL;
696 errmsg("could not open file \"%s\": %m", path.str)));
697 }
698
699 _fdvec_resize(reln, forknum, 1);
700 mdfd = &reln->md_seg_fds[forknum][0];
701 mdfd->mdfd_vfd = fd;
702 mdfd->mdfd_segno = 0;
703
705
706 return mdfd;
707}
708
709/*
710 * mdopen() -- Initialize newly-opened relation.
711 */
712void
714{
715 /* mark it not open */
716 for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
717 reln->md_num_open_segs[forknum] = 0;
718}
719
720/*
721 * mdclose() -- Close the specified relation, if it isn't closed already.
722 */
723void
725{
726 int nopensegs = reln->md_num_open_segs[forknum];
727
728 /* No work if already closed */
729 if (nopensegs == 0)
730 return;
731
732 /* close segments starting from the end */
733 while (nopensegs > 0)
734 {
735 MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1];
736
738 _fdvec_resize(reln, forknum, nopensegs - 1);
739 nopensegs--;
740 }
741}
742
743/*
744 * mdprefetch() -- Initiate asynchronous read of the specified blocks of a relation
745 */
746bool
748 int nblocks)
749{
750#ifdef USE_PREFETCH
751
753
754 if ((uint64) blocknum + nblocks > (uint64) MaxBlockNumber + 1)
755 return false;
756
757 while (nblocks > 0)
758 {
759 pgoff_t seekpos;
760 MdfdVec *v;
762
763 v = _mdfd_getseg(reln, forknum, blocknum, false,
765 if (v == NULL)
766 return false;
767
768 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
769
770 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
771
773 Min(nblocks,
774 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
775
778
779 blocknum += nblocks_this_segment;
780 nblocks -= nblocks_this_segment;
781 }
782#endif /* USE_PREFETCH */
783
784 return true;
785}
786
787/*
788 * Convert an array of buffer addresses into an array of iovec objects, and
789 * return the number that were required. 'iov' must have enough space for up
790 * to 'nblocks' elements, but the number used may be less depending on
791 * merging. In the case of a run of fully contiguous buffers, a single iovec
792 * will be populated that can be handled as a plain non-vectored I/O.
793 */
794static int
795buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks)
796{
797 struct iovec *iovp;
798 int iovcnt;
799
800 Assert(nblocks >= 1);
801
802 /* Assert-only check: if this build supports direct I/O, each buffer must be PG_IO_ALIGN_SIZE-aligned. */
803 for (int i = 0; i < nblocks; ++i)
804 {
805 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
806 Assert((uintptr_t) buffers[i] ==
807 TYPEALIGN(PG_IO_ALIGN_SIZE, buffers[i]));
808 }
809
810 /* Start the first iovec off with the first buffer, covering one BLCKSZ-sized block. */
811 iovp = &iov[0];
812 iovp->iov_base = buffers[0];
813 iovp->iov_len = BLCKSZ;
814 iovcnt = 1;
815
816 /* Fold each remaining buffer into the current iovec when it is adjacent in memory. */
817 for (int i = 1; i < nblocks; ++i)
818 {
819 void *buffer = buffers[i];
820
821 if (((char *) iovp->iov_base + iovp->iov_len) == buffer)
822 {
823 /* Begins exactly where the current iovec ends: grow it by one block. */
824 iovp->iov_len += BLCKSZ;
825 }
826 else
827 {
828 /* Not adjacent: advance to a fresh iovec that starts at this buffer. */
829 iovp++;
830 iovp->iov_base = buffer;
831 iovp->iov_len = BLCKSZ;
832 iovcnt++;
833 }
834 }
835
836 return iovcnt;
837}
838
839/*
840 * mdmaxcombine() -- Return the maximum number of total blocks that can be
841 * combined with an IO starting at blocknum.
842 */
843uint32
845 BlockNumber blocknum)
846{
847 BlockNumber segoff;
848
849 segoff = blocknum % ((BlockNumber) RELSEG_SIZE);
850
851 return RELSEG_SIZE - segoff;
852}
853
854/*
855 * mdreadv() -- Read the specified blocks from a relation.
856 */
857void
859 void **buffers, BlockNumber nblocks)
860{
861 while (nblocks > 0)
862 {
863 struct iovec iov[PG_IOV_MAX];
864 int iovcnt;
865 pgoff_t seekpos;
866 int nbytes;
867 MdfdVec *v;
870 size_t size_this_segment;
871
872 v = _mdfd_getseg(reln, forknum, blocknum, false,
874
875 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
876
877 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
878
880 Min(nblocks,
881 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
883
884 if (nblocks_this_segment != nblocks)
885 elog(ERROR, "read crosses segment boundary");
886
890
891 /*
892 * Inner loop to continue after a short read. We'll keep going until
893 * we hit EOF rather than assuming that a short read means we hit the
894 * end.
895 */
896 for (;;)
897 {
898 TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
899 reln->smgr_rlocator.locator.spcOid,
900 reln->smgr_rlocator.locator.dbOid,
901 reln->smgr_rlocator.locator.relNumber,
902 reln->smgr_rlocator.backend);
903 nbytes = FileReadV(v->mdfd_vfd, iov, iovcnt, seekpos,
905 TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
906 reln->smgr_rlocator.locator.spcOid,
907 reln->smgr_rlocator.locator.dbOid,
908 reln->smgr_rlocator.locator.relNumber,
909 reln->smgr_rlocator.backend,
910 nbytes,
912
913#ifdef SIMULATE_SHORT_READ
914 nbytes = Min(nbytes, 4096);
915#endif
916
917 if (nbytes < 0)
920 errmsg("could not read blocks %u..%u in file \"%s\": %m",
921 blocknum,
922 blocknum + nblocks_this_segment - 1,
923 FilePathName(v->mdfd_vfd))));
924
925 if (nbytes == 0)
926 {
927 /*
928 * We are at or past EOF, or we read a partial block at EOF.
929 * Normally this is an error; upper levels should never try to
930 * read a nonexistent block. However, if zero_damaged_pages
931 * is ON or we are InRecovery, we should instead return zeroes
932 * without complaining. This allows, for example, the case of
933 * trying to update a block that was later truncated away.
934 *
935 * NB: We think that this codepath is unreachable in recovery
936 * and incomplete with zero_damaged_pages, as missing segments
937 * are not created. Putting blocks into the buffer-pool that
938 * do not exist on disk is rather problematic, as it will not
939 * be found by scans that rely on smgrnblocks(), as they are
940 * beyond EOF. It also can cause weird problems with relation
941 * extension, as relation extension does not expect blocks
942 * beyond EOF to exist.
943 *
944 * Therefore we do not want to copy the logic into
945 * mdstartreadv(), where it would have to be more complicated
946 * due to potential differences in the zero_damaged_pages
947 * setting between the definer and completor of IO.
948 *
949 * For PG 18, we are putting an Assert(false) in mdreadv()
950 * (triggering failures in assertion-enabled builds, but
951 * continuing to work in production builds). Afterwards we
952 * plan to remove this code entirely.
953 */
955 {
956 Assert(false); /* see comment above */
957
960 ++i)
961 memset(buffers[i], 0, BLCKSZ);
962 break;
963 }
964 else
967 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
968 blocknum,
969 blocknum + nblocks_this_segment - 1,
973 }
974
975 /* One loop should usually be enough. */
976 transferred_this_segment += nbytes;
979 break;
980
981 /* Adjust position and vectors after a short read. */
982 seekpos += nbytes;
984 }
985
986 nblocks -= nblocks_this_segment;
987 buffers += nblocks_this_segment;
988 blocknum += nblocks_this_segment;
989 }
990}
991
992/*
993 * mdstartreadv() -- Asynchronous version of mdreadv().
994 */
995void
997 SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
998 void **buffers, BlockNumber nblocks)
999{
1000 pgoff_t seekpos;
1001 MdfdVec *v;
1003 struct iovec *iov;
1004 int iovcnt;
1005 int ret;
1006
1007 v = _mdfd_getseg(reln, forknum, blocknum, false,
1009
1010 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1011
1012 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1013
1015 Min(nblocks,
1016 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
1017
1018 if (nblocks_this_segment != nblocks)
1019 elog(ERROR, "read crossing segment boundary");
1020
1022
1023 Assert(nblocks <= iovcnt);
1024
1026
1028
1031
1033 reln,
1034 forknum,
1035 blocknum,
1036 nblocks,
1037 false);
1039
1041 if (ret != 0)
1042 ereport(ERROR,
1044 errmsg("could not start reading blocks %u..%u in file \"%s\": %m",
1045 blocknum,
1046 blocknum + nblocks_this_segment - 1,
1047 FilePathName(v->mdfd_vfd))));
1048
1049 /*
1050 * The error checks corresponding to the post-read checks in mdreadv() are
1051 * in md_readv_complete().
1052 *
1053 * However we chose, at least for now, to not implement the
1054 * zero_damaged_pages logic present in mdreadv(). As outlined in mdreadv()
1055 * that logic is rather problematic, and we want to get rid of it. Here
1056 * equivalent logic would have to be more complicated due to potential
1057 * differences in the zero_damaged_pages setting between the definer and
1058 * completor of IO.
1059 */
1060}
1061
1062/*
1063 * mdwritev() -- Write the supplied blocks at the appropriate location.
1064 *
1065 * This is to be used only for updating already-existing blocks of a
1066 * relation (ie, those before the current EOF). To extend a relation,
1067 * use mdextend().
1068 */
1069void
1071 const void **buffers, BlockNumber nblocks, bool skipFsync)
1072{
1073 /* This assert is too expensive to have on normally ... */
1074#ifdef CHECK_WRITE_VS_EXTEND
1075 Assert((uint64) blocknum + (uint64) nblocks <= (uint64) mdnblocks(reln, forknum));
1076#endif
1077
1078 while (nblocks > 0)
1079 {
1080 struct iovec iov[PG_IOV_MAX];
1081 int iovcnt;
1082 pgoff_t seekpos;
1083 int nbytes;
1084 MdfdVec *v;
1087 size_t size_this_segment;
1088
1089 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
1091
1092 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1093
1094 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1095
1097 Min(nblocks,
1098 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
1100
1101 if (nblocks_this_segment != nblocks)
1102 elog(ERROR, "write crosses segment boundary");
1103
1104 iovcnt = buffers_to_iovec(iov, (void **) buffers, nblocks_this_segment);
1107
1108 /*
1109 * Inner loop to continue after a short write. If the reason is that
1110 * we're out of disk space, a future attempt should get an ENOSPC
1111 * error from the kernel.
1112 */
1113 for (;;)
1114 {
1115 TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
1116 reln->smgr_rlocator.locator.spcOid,
1117 reln->smgr_rlocator.locator.dbOid,
1118 reln->smgr_rlocator.locator.relNumber,
1119 reln->smgr_rlocator.backend);
1120 nbytes = FileWriteV(v->mdfd_vfd, iov, iovcnt, seekpos,
1122 TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
1123 reln->smgr_rlocator.locator.spcOid,
1124 reln->smgr_rlocator.locator.dbOid,
1125 reln->smgr_rlocator.locator.relNumber,
1126 reln->smgr_rlocator.backend,
1127 nbytes,
1129
1130#ifdef SIMULATE_SHORT_WRITE
1131 nbytes = Min(nbytes, 4096);
1132#endif
1133
1134 if (nbytes < 0)
1135 {
1136 bool enospc = errno == ENOSPC;
1137
1138 ereport(ERROR,
1140 errmsg("could not write blocks %u..%u in file \"%s\": %m",
1141 blocknum,
1142 blocknum + nblocks_this_segment - 1,
1144 enospc ? errhint("Check free disk space.") : 0));
1145 }
1146
1147 /* One loop should usually be enough. */
1148 transferred_this_segment += nbytes;
1151 break;
1152
1153 /* Adjust position and iovecs after a short write. */
1154 seekpos += nbytes;
1156 }
1157
1158 if (!skipFsync && !SmgrIsTemp(reln))
1159 register_dirty_segment(reln, forknum, v);
1160
1161 nblocks -= nblocks_this_segment;
1162 buffers += nblocks_this_segment;
1163 blocknum += nblocks_this_segment;
1164 }
1165}
1166
1167
1168/*
1169 * mdwriteback() -- Tell the kernel to write pages back to storage.
1170 *
1171 * This accepts a range of blocks because flushing several pages at once is
1172 * considerably more efficient than doing so individually.
1173 */
1174void
1176 BlockNumber blocknum, BlockNumber nblocks)
1177{
1179
1180 /*
1181 * Issue flush requests in as few requests as possible; have to split at
1182 * segment boundaries though, since those are actually separate files.
1183 */
1184 while (nblocks > 0)
1185 {
1186 BlockNumber nflush = nblocks;
1187 pgoff_t seekpos;
1188 MdfdVec *v;
1189 int segnum_start,
1190 segnum_end;
1191
1192 v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
1194
1195 /*
1196 * We might be flushing buffers of already removed relations, that's
1197 * ok, just ignore that case. If the segment file wasn't open already
1198 * (ie from a recent mdwrite()), then we don't want to re-open it, to
1199 * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave
1200 * us with a descriptor to a file that is about to be unlinked.
1201 */
1202 if (!v)
1203 return;
1204
1205 /* compute the number of the segment containing the first block */
1206 segnum_start = blocknum / RELSEG_SIZE;
1207
1208 /* compute number of desired writes within the current segment */
1209 segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
1210 if (segnum_start != segnum_end)
1211 nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
1212
1213 Assert(nflush >= 1);
1214 Assert(nflush <= nblocks);
1215
1216 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1217
1219
1220 nblocks -= nflush;
1221 blocknum += nflush;
1222 }
1223}
1224
1225/*
1226 * mdnblocks() -- Get the number of blocks stored in a relation.
1227 *
1228 * Important side effect: all active segments of the relation are opened
1229 * and added to the md_seg_fds array. If this routine has not been
1230 * called, then only segments up to the last one actually touched
1231 * are present in the array.
1232 */
1235{
1236 MdfdVec *v;
1237 BlockNumber nblocks;
1238 BlockNumber segno;
1239
1240 mdopenfork(reln, forknum, EXTENSION_FAIL);
1241
1242 /* mdopen has opened the first segment */
1243 Assert(reln->md_num_open_segs[forknum] > 0);
1244
1245 /*
1246 * Start from the last open segment, to avoid redundant seeks. We have
1247 * previously verified that the segments before it are exactly RELSEG_SIZE
1248 * long, and it's useless to recheck that each time.
1249 *
1250 * NOTE: this assumption could only be wrong if another backend has
1251 * truncated the relation. We rely on higher code levels to handle that
1252 * scenario by closing and re-opening the md fd, which is handled via
1253 * relcache flush. (Since the checkpointer doesn't participate in
1254 * relcache flush, it could have segment entries for inactive segments;
1255 * that's OK because the checkpointer never needs to compute relation
1256 * size.)
1257 */
1258 segno = reln->md_num_open_segs[forknum] - 1;
1259 v = &reln->md_seg_fds[forknum][segno];
1260
1261 for (;;)
1262 {
1263 nblocks = _mdnblocks(reln, forknum, v);
1264 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1265 elog(FATAL, "segment too big");
1266 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1267 return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
1268
1269 /*
1270 * If segment is exactly RELSEG_SIZE, advance to next one.
1271 */
1272 segno++;
1273
1274 /*
1275 * We used to pass O_CREAT here, but that has the disadvantage that it
1276 * might create a segment which has vanished through some operating
1277 * system misadventure. In such a case, creating the segment here
1278 * undermines _mdfd_getseg's attempts to notice and report an error
1279 * upon access to a missing segment.
1280 */
1281 v = _mdfd_openseg(reln, forknum, segno, 0);
1282 if (v == NULL)
1283 return segno * ((BlockNumber) RELSEG_SIZE);
1284 }
1285}
1286
1287/*
1288 * mdtruncate() -- Truncate relation to specified number of blocks.
1289 *
1290 * Guaranteed not to allocate memory, so it can be used in a critical section.
1291 * Caller must have called smgrnblocks() to obtain curnblk while holding a
1292 * sufficient lock to prevent a change in relation size, and not used any smgr
1293 * functions for this relation or handled interrupts in between. This makes
1294 * sure we have opened all active segments, so that truncate loop will get
1295 * them all!
1296 *
1297 * If nblocks > curnblk, the request is ignored when we are InRecovery,
1298 * otherwise, an error is raised.
1299 */
1300void
1303{
1305 int curopensegs;
1306
1307 if (nblocks > curnblk)
1308 {
1309 /* Bogus request ... but no complaint if InRecovery */
1310 if (InRecovery)
1311 return;
1312 ereport(ERROR,
1313 (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
1314 relpath(reln->smgr_rlocator, forknum).str,
1315 nblocks, curnblk)));
1316 }
1317 if (nblocks == curnblk)
1318 return; /* no work */
1319
1320 /*
1321 * Truncate segments, starting at the last one. Starting at the end makes
1322 * managing the memory for the fd array easier, should there be errors.
1323 */
1324 curopensegs = reln->md_num_open_segs[forknum];
1325 while (curopensegs > 0)
1326 {
1327 MdfdVec *v;
1328
1330
1331 v = &reln->md_seg_fds[forknum][curopensegs - 1];
1332
1333 if (priorblocks > nblocks)
1334 {
1335 /*
1336 * This segment is no longer active. We truncate the file, but do
1337 * not delete it, for reasons explained in the header comments.
1338 */
1340 ereport(ERROR,
1342 errmsg("could not truncate file \"%s\": %m",
1343 FilePathName(v->mdfd_vfd))));
1344
1345 if (!SmgrIsTemp(reln))
1346 register_dirty_segment(reln, forknum, v);
1347
1348 /* we never drop the 1st segment */
1349 Assert(v != &reln->md_seg_fds[forknum][0]);
1350
1351 FileClose(v->mdfd_vfd);
1352 _fdvec_resize(reln, forknum, curopensegs - 1);
1353 }
1354 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
1355 {
1356 /*
1357 * This is the last segment we want to keep. Truncate the file to
1358 * the right length. NOTE: if nblocks is exactly a multiple K of
1359 * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
1360 * keep it. This adheres to the invariant given in the header
1361 * comments.
1362 */
1364
1366 ereport(ERROR,
1368 errmsg("could not truncate file \"%s\" to %u blocks: %m",
1370 nblocks)));
1371 if (!SmgrIsTemp(reln))
1372 register_dirty_segment(reln, forknum, v);
1373 }
1374 else
1375 {
1376 /*
1377 * We still need this segment, so nothing to do for this and any
1378 * earlier segment.
1379 */
1380 break;
1381 }
1382 curopensegs--;
1383 }
1384}
1385
1386/*
1387 * mdregistersync() -- Mark whole relation as needing fsync
1388 */
1389void
1391{
1392 int segno;
1393 int min_inactive_seg;
1394
1395 /*
1396 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1397 * the loop below will get them all!
1398 */
1399 mdnblocks(reln, forknum);
1400
1401 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1402
1403 /*
1404 * Temporarily open inactive segments, then close them after sync. There
1405 * may be some inactive segments left open after an error, but that is
1406 * harmless. We don't bother to clean them up, since doing so would risk
1407 * further trouble. The next mdclose() will soon close them.
1408 */
1409 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1410 segno++;
1411
1412 while (segno > 0)
1413 {
1414 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1415
1416 register_dirty_segment(reln, forknum, v);
1417
1418 /* Close inactive segments immediately */
1419 if (segno > min_inactive_seg)
1420 {
1421 FileClose(v->mdfd_vfd);
1422 _fdvec_resize(reln, forknum, segno - 1);
1423 }
1424
1425 segno--;
1426 }
1427}
1428
1429/*
1430 * mdimmedsync() -- Immediately sync a relation to stable storage.
1431 *
1432 * Note that only writes already issued are synced; this routine knows
1433 * nothing of dirty buffers that may exist inside the buffer manager. We
1434 * sync active and inactive segments; smgrDoPendingSyncs() relies on this.
1435 * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of
1436 * some segment, then mdtruncate() renders that segment inactive. If we
1437 * crash before the next checkpoint syncs the newly-inactive segment, that
1438 * segment may survive recovery, reintroducing unwanted data into the table.
1439 */
1440void
1442{
1443 int segno;
1444 int min_inactive_seg;
1445
1446 /*
1447 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1448 * the loop below will get them all!
1449 */
1450 mdnblocks(reln, forknum);
1451
1452 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1453
1454 /*
1455 * Temporarily open inactive segments, then close them after sync. There
1456 * may be some inactive segments left open after an fsync() error, but that
1457 * is harmless. We don't bother to clean them up, since doing so would risk
1458 * further trouble. The next mdclose() will soon close them.
1459 */
1460 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1461 segno++;
1462
1463 while (segno > 0)
1464 {
1465 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1466
1467 /*
1468 * fsyncs done through mdimmedsync() should be tracked in a different
1469 * IOContext than those done through mdsyncfiletag() to differentiate
1470 * between unavoidable client backend fsyncs (e.g. those done during
1471 * index build) and those which ideally would have been done by the
1472 * checkpointer. Since other IO operations bypassing the buffer
1473 * manager could also be tracked in such an IOContext, wait until
1474 * these are also tracked to track immediate fsyncs.
1475 */
1479 errmsg("could not fsync file \"%s\": %m",
1480 FilePathName(v->mdfd_vfd))));
1481
1482 /* Close inactive segments immediately */
1483 if (segno > min_inactive_seg)
1484 {
1485 FileClose(v->mdfd_vfd);
1486 _fdvec_resize(reln, forknum, segno - 1);
1487 }
1488
1489 segno--;
1490 }
1491}
1492
1493int
1494mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
1495{
1496 MdfdVec *v = mdopenfork(reln, forknum, EXTENSION_FAIL);
1497
1498 v = _mdfd_getseg(reln, forknum, blocknum, false,
1500
1501 *off = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1502
1503 Assert(*off < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1504
1505 return FileGetRawDesc(v->mdfd_vfd);
1506}
1507
1508/*
1509 * register_dirty_segment() -- Mark a relation segment as needing fsync
1510 *
1511 * If there is a local pending-ops table, just make an entry in it for
1512 * ProcessSyncRequests to process later. Otherwise, try to pass off the
1513 * fsync request to the checkpointer process. If that fails, just do the
1514 * fsync locally before returning (we hope this will not happen often
1515 * enough to be a performance problem).
1516 */
1517static void
1519{
1520 FileTag tag;
1521
1522 INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno);
1523
1524 /* Temp relations should never be fsync'd */
1526
1527 if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
1528 {
1530
1532 (errmsg_internal("could not forward fsync request because request queue is full")));
1533
1535
1539 errmsg("could not fsync file \"%s\": %m",
1540 FilePathName(seg->mdfd_vfd))));
1541
1542 /*
1543 * We have no way of knowing if the current IOContext is
1544 * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
1545 * point, so count the fsync as being in the IOCONTEXT_NORMAL
1546 * IOContext. This is probably okay, because the number of backend
1547 * fsyncs doesn't say anything about the efficacy of the
1548 * BufferAccessStrategy. And counting both fsyncs done in
1549 * IOCONTEXT_NORMAL and IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] under
1550 * IOCONTEXT_NORMAL is likely clearer when investigating the number of
1551 * backend fsyncs.
1552 */
1554 IOOP_FSYNC, io_start, 1, 0);
1555 }
1556}
1557
1558/*
1559 * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint
1560 */
1561static void
1563 BlockNumber segno)
1564{
1565 FileTag tag;
1566
1567 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1568
1569 /* Should never be used with temp relations */
1571
1572 RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
1573}
1574
1575/*
1576 * register_forget_request() -- forget any fsyncs for a relation fork's segment
1577 */
1578static void
1580 BlockNumber segno)
1581{
1582 FileTag tag;
1583
1584 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1585
1586 RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
1587}
1588
1589/*
1590 * ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB
1591 */
1592void
1594{
1595 FileTag tag;
1596 RelFileLocator rlocator;
1597
1598 rlocator.dbOid = dbid;
1599 rlocator.spcOid = 0;
1600 rlocator.relNumber = 0;
1601
1603
1604 RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
1605}
1606
1607/*
1608 * DropRelationFiles -- drop files of all given relations
1609 */
1610void
1612{
1614 int i;
1615
1617 for (i = 0; i < ndelrels; i++)
1618 {
1620
1621 if (isRedo)
1622 {
1624
1625 for (fork = 0; fork <= MAX_FORKNUM; fork++)
1627 }
1628 srels[i] = srel;
1629 }
1630
1632
1633 for (i = 0; i < ndelrels; i++)
1634 smgrclose(srels[i]);
1635 pfree(srels);
1636}
1637
1638
1639/*
1640 * _fdvec_resize() -- Resize the fork's open segments array
1641 */
1642static void
1644 ForkNumber forknum,
1645 int nseg)
1646{
1647 if (nseg == 0)
1648 {
1649 if (reln->md_num_open_segs[forknum] > 0)
1650 {
1651 pfree(reln->md_seg_fds[forknum]);
1652 reln->md_seg_fds[forknum] = NULL;
1653 }
1654 }
1655 else if (reln->md_num_open_segs[forknum] == 0)
1656 {
1657 reln->md_seg_fds[forknum] =
1659 }
1660 else if (nseg > reln->md_num_open_segs[forknum])
1661 {
1662 /*
1663 * It doesn't seem worthwhile complicating the code to amortize
1664 * repalloc() calls. Those are far faster than PathNameOpenFile() or
1665 * FileClose(), and the memory context internally will sometimes avoid
1666 * doing an actual reallocation.
1667 */
1668 reln->md_seg_fds[forknum] =
1669 repalloc(reln->md_seg_fds[forknum],
1670 sizeof(MdfdVec) * nseg);
1671 }
1672 else
1673 {
1674 /*
1675 * We don't reallocate a smaller array, because we want mdtruncate()
1676 * to be able to promise that it won't allocate memory, so that it is
1677 * allowed in a critical section. This means that a bit of space in
1678 * the array is now wasted, until the next time we add a segment and
1679 * reallocate.
1680 */
1681 }
1682
1683 reln->md_num_open_segs[forknum] = nseg;
1684}
1685
1686/*
1687 * Return the filename for the specified segment of the relation. The
1688 * path is returned by value inside an MdPathStr; nothing is palloc'd.
1689 */
1690static MdPathStr
1692{
1693 RelPathStr path;
1694 MdPathStr fullpath;
1695
1696 path = relpath(reln->smgr_rlocator, forknum);
1697
1698 if (segno > 0)
1699 sprintf(fullpath.str, "%s.%u", path.str, segno);
1700 else
1701 strcpy(fullpath.str, path.str);
1702
1703 return fullpath;
1704}
1705
1706/*
1707 * Open the specified segment of the relation,
1708 * and make a MdfdVec object for it. Returns NULL on failure.
1709 */
1710static MdfdVec *
1712 int oflags)
1713{
1714 MdfdVec *v;
1715 File fd;
1716 MdPathStr fullpath;
1717
1718 fullpath = _mdfd_segpath(reln, forknum, segno);
1719
1720 /* open the file */
1722
1723 if (fd < 0)
1724 return NULL;
1725
1726 /*
1727 * Segments are always opened in order from lowest to highest, so we must
1728 * be adding a new one at the end.
1729 */
1730 Assert(segno == reln->md_num_open_segs[forknum]);
1731
1732 _fdvec_resize(reln, forknum, segno + 1);
1733
1734 /* fill the entry */
1735 v = &reln->md_seg_fds[forknum][segno];
1736 v->mdfd_vfd = fd;
1737 v->mdfd_segno = segno;
1738
1739 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1740
1741 /* all done */
1742 return v;
1743}
1744
1745/*
1746 * _mdfd_getseg() -- Find the segment of the relation holding the
1747 * specified block.
1748 *
1749 * If the segment doesn't exist, we ereport, return NULL, or create the
1750 * segment, according to "behavior". Note: skipFsync is only used in the
1751 * EXTENSION_CREATE case.
1752 */
1753static MdfdVec *
1755 bool skipFsync, int behavior)
1756{
1757 MdfdVec *v;
1760
1761 /* some way to handle non-existent segments needs to be specified */
1762 Assert(behavior &
1765
1766 targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1767
1768 /* if an existing and opened segment, we're done */
1769 if (targetseg < reln->md_num_open_segs[forknum])
1770 {
1771 v = &reln->md_seg_fds[forknum][targetseg];
1772 return v;
1773 }
1774
1775 /* The caller only wants the segment if we already had it open. */
1776 if (behavior & EXTENSION_DONT_OPEN)
1777 return NULL;
1778
1779 /*
1780 * The target segment is not yet open. Iterate over all the segments
1781 * between the last opened and the target segment. This way missing
1782 * segments either raise an error, or get created (according to
1783 * 'behavior'). Start with either the last opened, or the first segment if
1784 * none was opened before.
1785 */
1786 if (reln->md_num_open_segs[forknum] > 0)
1787 v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
1788 else
1789 {
1790 v = mdopenfork(reln, forknum, behavior);
1791 if (!v)
1792 return NULL; /* if behavior & EXTENSION_RETURN_NULL */
1793 }
1794
1795 for (nextsegno = reln->md_num_open_segs[forknum];
1797 {
1798 BlockNumber nblocks = _mdnblocks(reln, forknum, v);
1799 int flags = 0;
1800
1801 Assert(nextsegno == v->mdfd_segno + 1);
1802
1803 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1804 elog(FATAL, "segment too big");
1805
1806 if ((behavior & EXTENSION_CREATE) ||
1807 (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
1808 {
1809 /*
1810 * Normally we will create new segments only if authorized by the
1811 * caller (i.e., we are doing mdextend()). But when doing WAL
1812 * recovery, create segments anyway; this allows cases such as
1813 * replaying WAL data that has a write into a high-numbered
1814 * segment of a relation that was later deleted. We want to go
1815 * ahead and create the segments so we can finish out the replay.
1816 *
1817 * We have to maintain the invariant that segments before the last
1818 * active segment are of size RELSEG_SIZE; therefore, if
1819 * extending, pad them out with zeroes if needed. (This only
1820 * matters if in recovery, or if the caller is extending the
1821 * relation discontiguously, but that can happen in hash indexes.)
1822 */
1823 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1824 {
1827
1828 mdextend(reln, forknum,
1831 pfree(zerobuf);
1832 }
1833 flags = O_CREAT;
1834 }
1835 else if (nblocks < ((BlockNumber) RELSEG_SIZE))
1836 {
1837 /*
1838 * When not extending, only open the next segment if the current
1839 * one is exactly RELSEG_SIZE. If not (this branch), either
1840 * return NULL or fail.
1841 */
1842 if (behavior & EXTENSION_RETURN_NULL)
1843 {
1844 /*
1845 * Some callers discern between reasons for _mdfd_getseg()
1846 * returning NULL based on errno. As there's no failing
1847 * syscall involved in this case, explicitly set errno to
1848 * ENOENT, as that seems the closest interpretation.
1849 */
1850 errno = ENOENT;
1851 return NULL;
1852 }
1853
1854 ereport(ERROR,
1856 errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
1857 _mdfd_segpath(reln, forknum, nextsegno).str,
1858 blkno, nblocks)));
1859 }
1860
1861 v = _mdfd_openseg(reln, forknum, nextsegno, flags);
1862
1863 if (v == NULL)
1864 {
1865 if ((behavior & EXTENSION_RETURN_NULL) &&
1867 return NULL;
1868 ereport(ERROR,
1870 errmsg("could not open file \"%s\" (target block %u): %m",
1871 _mdfd_segpath(reln, forknum, nextsegno).str,
1872 blkno)));
1873 }
1874 }
1875
1876 return v;
1877}
1878
1879/*
1880 * Get number of blocks present in a single disk file
1881 */
1882static BlockNumber
1884{
1885 pgoff_t len;
1886
1887 len = FileSize(seg->mdfd_vfd);
1888 if (len < 0)
1889 ereport(ERROR,
1891 errmsg("could not seek to end of file \"%s\": %m",
1892 FilePathName(seg->mdfd_vfd))));
1893 /* note that this calculation will ignore any partial block at EOF */
1894 return (BlockNumber) (len / BLCKSZ);
1895}
1896
1897/*
1898 * Sync a file to disk, given a file tag. Write the path into an output
1899 * buffer so the caller can use it in error messages.
1900 *
1901 * Return 0 on success, -1 on failure, with errno set.
1902 */
1903int
1904mdsyncfiletag(const FileTag *ftag, char *path)
1905{
1907 File file;
1909 bool need_to_close;
1910 int result,
1911 save_errno;
1912
1913 /* See if we already have the file open, or need to open it. */
1914 if (ftag->segno < reln->md_num_open_segs[ftag->forknum])
1915 {
1916 file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd;
1917 strlcpy(path, FilePathName(file), MAXPGPATH);
1918 need_to_close = false;
1919 }
1920 else
1921 {
1922 MdPathStr p;
1923
1924 p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
1925 strlcpy(path, p.str, MD_PATH_STR_MAXLEN);
1926
1927 file = PathNameOpenFile(path, _mdfd_open_flags());
1928 if (file < 0)
1929 return -1;
1930 need_to_close = true;
1931 }
1932
1934
1935 /* Sync the file. */
1936 result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
1937 save_errno = errno;
1938
1939 if (need_to_close)
1940 FileClose(file);
1941
1943 IOOP_FSYNC, io_start, 1, 0);
1944
1945 errno = save_errno;
1946 return result;
1947}
1948
1949/*
1950 * Unlink a file, given a file tag. Write the path into an output
1951 * buffer so the caller can use it in error messages.
 *
 * The path is always built for MAIN_FORKNUM of ftag->rlocator; the tag's
 * fork and segment numbers are not consulted here. The copy into "path"
 * is bounded by MAXPGPATH.
1952 *
1953 * Return 0 on success, -1 on failure, with errno set.
1954 */
1955int
1956mdunlinkfiletag(const FileTag *ftag, char *path)
1957{
1958 RelPathStr p;
1959
1960 /* Compute the path (main fork only) and hand a copy back to the caller. */
1961 p = relpathperm(ftag->rlocator, MAIN_FORKNUM);
1962 strlcpy(path, p.str, MAXPGPATH);
1963
1964 /* Try to unlink the file; unlink()'s result is returned unchanged. */
1965 return unlink(path);
1966}
1967
1968/*
1969 * Check if a given candidate request matches a given tag, when processing
1970 * a SYNC_FILTER_REQUEST request. This will be called for all pending
1971 * requests to find out whether to forget them.
 *
 * ftag is the tag from the filter request; candidate is the tag of one
 * pending request. Returns true when both carry the same database OID;
 * no other field of either tag is compared.
1972 */
1973bool
1974mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
1975{
1976 /*
1977 * For now we only use filter requests as a way to drop all scheduled
1978 * callbacks relating to a given database, when dropping the database.
1979 * We'll return true for all candidates that have the same database OID as
1980 * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
1981 */
1982 return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
1983}
1984
1985/*
1986 * AIO completion callback for mdstartreadv().
1987 */
1988static PgAioResult
1990{
1992 PgAioResult result = prior_result;
1993
1994 if (prior_result.result < 0)
1995 {
1996 result.status = PGAIO_RS_ERROR;
1997 result.id = PGAIO_HCB_MD_READV;
1998 /* For "hard" errors, track the error number in error_data */
1999 result.error_data = -prior_result.result;
2000 result.result = 0;
2001
2002 /*
2003 * Immediately log a message about the IO error, but only to the
2004 * server log. The reason to do so immediately is that the originator
2005 * might not process the query result immediately (because it is busy
2006 * doing another part of query processing) or at all (e.g. if it was
2007 * cancelled or errored out due to another IO also failing). The
2008 * definer of the IO will emit an ERROR when processing the IO's
2009 * results.
2010 */
2012
2013 return result;
2014 }
2015
2016 /*
2017 * As explained above smgrstartreadv(), the smgr API operates on the level
2018 * of blocks, rather than bytes. Convert.
2019 */
2020 result.result /= BLCKSZ;
2021
2022 Assert(result.result <= td->smgr.nblocks);
2023
2024 if (result.result == 0)
2025 {
2026 /* consider 0 blocks read a failure */
2027 result.status = PGAIO_RS_ERROR;
2028 result.id = PGAIO_HCB_MD_READV;
2029 result.error_data = 0;
2030
2031 /* see comment above the "hard error" case */
2033
2034 return result;
2035 }
2036
2037 if (result.status != PGAIO_RS_ERROR &&
2038 result.result < td->smgr.nblocks)
2039 {
2040 /* partial reads should be retried at upper level */
2041 result.status = PGAIO_RS_PARTIAL;
2042 result.id = PGAIO_HCB_MD_READV;
2043 }
2044
2045 return result;
2046}
2047
2048/*
2049 * AIO error reporting callback for mdstartreadv().
2050 *
2051 * Errors are encoded as follows:
2052 * - PgAioResult.error_data != 0 encodes IO that failed with that errno
2053 * - PgAioResult.error_data == 0 encodes IO that didn't read all data
2054 */
2055static void
2056md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
2057{
2058 RelPathStr path;
2059
2060 path = relpathbackend(td->smgr.rlocator,
2062 td->smgr.forkNum);
2063
2064 if (result.error_data != 0)
2065 {
2066 /* for errcode_for_file_access() and %m */
2067 errno = result.error_data;
2068
2069 ereport(elevel,
2071 errmsg("could not read blocks %u..%u in file \"%s\": %m",
2072 td->smgr.blockNum,
2073 td->smgr.blockNum + td->smgr.nblocks - 1,
2074 path.str));
2075 }
2076 else
2077 {
2078 /*
2079 * NB: This will typically only be output in debug messages, while
2080 * retrying a partial IO.
2081 */
2082 ereport(elevel,
2084 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
2085 td->smgr.blockNum,
2086 td->smgr.blockNum + td->smgr.nblocks - 1,
2087 path.str,
2088 result.result * (size_t) BLCKSZ,
2089 td->smgr.nblocks * (size_t) BLCKSZ));
2090 }
2091}
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition aio.c:330
@ PGAIO_HCB_MD_READV
Definition aio.h:196
@ PGAIO_HF_BUFFERED
Definition aio.h:77
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
int pgaio_io_get_iovec(PgAioHandle *ioh, struct iovec **iov)
Definition aio_io.c:42
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition aio_target.c:73
@ PGAIO_RS_PARTIAL
Definition aio_types.h:82
@ PGAIO_RS_ERROR
Definition aio_types.h:84
void TablespaceCreateDbspace(Oid spcOid, Oid dbOid, bool isRedo)
Definition tablespace.c:114
uint32 BlockNumber
Definition block.h:31
#define InvalidBlockNumber
Definition block.h:33
#define MaxBlockNumber
Definition block.h:35
bool track_io_timing
Definition bufmgr.c:192
bool zero_damaged_pages
Definition bufmgr.c:189
#define Min(x, y)
Definition c.h:1093
#define TYPEALIGN(ALIGNVAL, LEN)
Definition c.h:891
uint8_t uint8
Definition c.h:616
#define Assert(condition)
Definition c.h:945
#define PG_BINARY
Definition c.h:1376
uint64_t uint64
Definition c.h:619
uint32_t uint32
Definition c.h:618
#define lengthof(array)
Definition c.h:875
int errcode_for_file_access(void)
Definition elog.c:897
int errcode(int sqlerrcode)
Definition elog.c:874
int errhint(const char *fmt,...) pg_attribute_printf(1
#define LOG_SERVER_ONLY
Definition elog.h:32
#define FATAL
Definition elog.h:41
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
#define WARNING
Definition elog.h:36
#define DEBUG1
Definition elog.h:30
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
int pg_truncate(const char *path, pgoff_t length)
Definition fd.c:721
int FileGetRawDesc(File file)
Definition fd.c:2516
void FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
Definition fd.c:2123
int io_direct_flags
Definition fd.c:172
int file_extend_method
Definition fd.c:169
char * FilePathName(File file)
Definition fd.c:2500
int FileSync(File file, uint32 wait_event_info)
Definition fd.c:2336
int FileStartReadV(PgAioHandle *ioh, File file, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2205
ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2149
int FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2408
pgoff_t FileSize(File file)
Definition fd.c:2448
void FileClose(File file)
Definition fd.c:1966
int data_sync_elevel(int elevel)
Definition fd.c:3986
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition fd.c:1563
int FileTruncate(File file, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2465
int FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2363
int FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2067
ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2231
#define IO_DIRECT_DATA
Definition fd.h:54
static ssize_t FileWrite(File file, const void *buffer, size_t amount, pgoff_t offset, uint32 wait_event_info)
Definition fd.h:237
@ FILE_EXTEND_METHOD_WRITE_ZEROS
Definition fd.h:63
#define FILE_POSSIBLY_DELETED(err)
Definition fd.h:89
int File
Definition fd.h:51
#define PG_O_DIRECT
Definition fd.h:123
#define MCXT_ALLOC_ZERO
Definition fe_memutils.h:30
#define palloc_array(type, count)
Definition fe_memutils.h:76
int compute_remaining_iovec(struct iovec *destination, const struct iovec *source, int iovcnt, size_t transferred)
Definition file_utils.c:614
bool IsBinaryUpgrade
Definition globals.c:121
ProcNumber MyProcNumber
Definition globals.c:90
const char * str
int i
Definition isn.c:77
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition mcxt.c:1232
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
MemoryContext TopMemoryContext
Definition mcxt.c:166
void * palloc_aligned(Size size, Size alignto, int flags)
Definition mcxt.c:1606
void mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
Definition md.c:338
static void md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
Definition md.c:2057
static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
Definition md.c:1580
#define EXTENSION_CREATE_RECOVERY
Definition md.c:120
void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber curnblk, BlockNumber nblocks)
Definition md.c:1302
static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
Definition md.c:1884
static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
Definition md.c:375
void mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync)
Definition md.c:1071
bool mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
Definition md.c:1975
bool mdexists(SMgrRelation reln, ForkNumber forknum)
Definition md.c:204
void mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition md.c:859
static MdPathStr _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
Definition md.c:1692
static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
Definition md.c:1563
#define EXTENSION_DONT_OPEN
Definition md.c:122
BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum)
Definition md.c:1235
int mdunlinkfiletag(const FileTag *ftag, char *path)
Definition md.c:1957
static MemoryContext MdCxt
Definition md.c:98
void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition md.c:223
int mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
Definition md.c:1495
void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition md.c:488
static PgAioResult md_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition md.c:1990
static int do_truncate(const char *path)
Definition md.c:354
void mdinit(void)
Definition md.c:191
void mdclose(SMgrRelation reln, ForkNumber forknum)
Definition md.c:725
void mdzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition md.c:553
static MdfdVec * _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, int oflags)
Definition md.c:1712
static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
Definition md.c:1519
int mdsyncfiletag(const FileTag *ftag, char *path)
Definition md.c:1905
void mdwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition md.c:1176
uint32 mdmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition md.c:845
static MdfdVec * _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior)
Definition md.c:1755
#define EXTENSION_RETURN_NULL
Definition md.c:116
void mdstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition md.c:997
bool mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition md.c:748
void mdregistersync(SMgrRelation reln, ForkNumber forknum)
Definition md.c:1391
void mdopen(SMgrRelation reln)
Definition md.c:714
#define EXTENSION_CREATE
Definition md.c:118
const PgAioHandleCallbacks aio_md_readv_cb
Definition md.c:170
static int _mdfd_open_flags(void)
Definition md.c:177
#define INIT_MD_FILETAG(a, xx_rlocator, xx_forknum, xx_segno)
Definition md.c:102
#define EXTENSION_FAIL
Definition md.c:114
static MdfdVec * mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
Definition md.c:676
void DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo)
Definition md.c:1612
static int buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks)
Definition md.c:796
#define MD_PATH_STR_MAXLEN
Definition md.c:133
static void _fdvec_resize(SMgrRelation reln, ForkNumber forknum, int nseg)
Definition md.c:1644
void ForgetDatabaseSyncRequests(Oid dbid)
Definition md.c:1594
void mdimmedsync(SMgrRelation reln, ForkNumber forknum)
Definition md.c:1442
#define AllocSetContextCreate
Definition memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition memutils.h:160
static char * errmsg
#define ERRCODE_DATA_CORRUPTED
#define MAXPGPATH
#define PG_IO_ALIGN_SIZE
const void size_t len
#define PG_IOV_MAX
Definition pg_iovec.h:47
@ IOOBJECT_RELATION
Definition pgstat.h:281
@ IOCONTEXT_NORMAL
Definition pgstat.h:293
@ IOOP_FSYNC
Definition pgstat.h:312
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition pgstat_io.c:91
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:122
#define sprintf
Definition port.h:262
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition strlcpy.c:45
off_t pgoff_t
Definition port.h:421
unsigned int Oid
static int fd(const char *x, int i)
#define INVALID_PROC_NUMBER
Definition procnumber.h:26
#define RelFileLocatorBackendIsTemp(rlocator)
ForkNumber
Definition relpath.h:56
@ MAIN_FORKNUM
Definition relpath.h:58
@ InvalidForkNumber
Definition relpath.h:57
#define MAX_FORKNUM
Definition relpath.h:70
#define relpath(rlocator, forknum)
Definition relpath.h:150
#define relpathbackend(rlocator, backend, forknum)
Definition relpath.h:141
#define relpathperm(rlocator, forknum)
Definition relpath.h:146
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition smgr.c:240
void smgrclose(SMgrRelation reln)
Definition smgr.c:374
void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
Definition smgr.c:538
void pgaio_io_set_target_smgr(PgAioHandle *ioh, SMgrRelationData *smgr, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skip_fsync)
Definition smgr.c:1038
#define SmgrIsTemp(smgr)
Definition smgr.h:74
RelFileLocator rlocator
Definition sync.h:54
int16 forknum
Definition sync.h:53
uint64 segno
Definition sync.h:55
char str[MD_PATH_STR_MAXLEN+1]
Definition md.c:141
PgAioHandleCallbackComplete complete_shared
Definition aio.h:239
uint32 status
Definition aio_types.h:108
uint32 error_data
Definition aio_types.h:111
int32 result
Definition aio_types.h:113
uint32 id
Definition aio_types.h:105
RelFileLocator locator
RelFileNumber relNumber
char str[REL_PATH_STR_MAXLEN+1]
Definition relpath.h:123
Definition md.c:93
File mdfd_vfd
Definition md.c:94
BlockNumber mdfd_segno
Definition md.c:95
bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, bool retryOnError)
Definition sync.c:581
@ SYNC_FILTER_REQUEST
Definition sync.h:28
@ SYNC_FORGET_REQUEST
Definition sync.h:27
@ SYNC_UNLINK_REQUEST
Definition sync.h:26
@ SYNC_REQUEST
Definition sync.h:25
BlockNumber blockNum
Definition aio_types.h:66
RelFileLocator rlocator
Definition aio_types.h:65
struct PgAioTargetData::@128 smgr
BlockNumber nblocks
Definition aio_types.h:67
bool InRecovery
Definition xlogutils.c:50
void XLogDropRelation(RelFileLocator rlocator, ForkNumber forknum)
Definition xlogutils.c:630

◆ MD_PATH_STR_MAXLEN

#define MD_PATH_STR_MAXLEN
Value:
(\
REL_PATH_STR_MAXLEN \
+ sizeof((char)'.') \
+ SEGMENT_CHARS \
)
#define SEGMENT_CHARS
Definition md.c:132
#define REL_PATH_STR_MAXLEN
Definition relpath.h:96

Definition at line 133 of file md.c.

◆ SEGMENT_CHARS

#define SEGMENT_CHARS   OIDCHARS

Definition at line 132 of file md.c.

Typedef Documentation

◆ MdfdVec

◆ MdPathStr

Function Documentation

◆ _fdvec_resize()

static void _fdvec_resize ( SMgrRelation  reln,
ForkNumber  forknum,
int  nseg 
)
static

Definition at line 1644 of file md.c.

1647{
1648 if (nseg == 0)
1649 {
1650 if (reln->md_num_open_segs[forknum] > 0)
1651 {
1652 pfree(reln->md_seg_fds[forknum]);
1653 reln->md_seg_fds[forknum] = NULL;
1654 }
1655 }
1656 else if (reln->md_num_open_segs[forknum] == 0)
1657 {
1658 reln->md_seg_fds[forknum] =
1659 MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
1660 }
1661 else if (nseg > reln->md_num_open_segs[forknum])
1662 {
1663 /*
1664 * It doesn't seem worthwhile complicating the code to amortize
1665 * repalloc() calls. Those are far faster than PathNameOpenFile() or
1666 * FileClose(), and the memory context internally will sometimes avoid
1667 * doing an actual reallocation.
1668 */
1669 reln->md_seg_fds[forknum] =
1670 repalloc(reln->md_seg_fds[forknum],
1671 sizeof(MdfdVec) * nseg);
1672 }
1673 else
1674 {
1675 /*
1676 * We don't reallocate a smaller array, because we want mdtruncate()
1677 * to be able to promise that it won't allocate memory, so that it is
1678 * allowed in a critical section. This means that a bit of space in
1679 * the array is now wasted, until the next time we add a segment and
1680 * reallocate.
1681 */
1682 }
1683
1684 reln->md_num_open_segs[forknum] = nseg;
1685}

References fb(), MdCxt, MemoryContextAlloc(), pfree(), and repalloc().

Referenced by _mdfd_openseg(), mdclose(), mdcreate(), mdimmedsync(), mdopenfork(), mdregistersync(), and mdtruncate().

◆ _mdfd_getseg()

static MdfdVec * _mdfd_getseg ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blkno,
bool  skipFsync,
int  behavior 
)
static

Definition at line 1755 of file md.c.

1757{
1758 MdfdVec *v;
1759 BlockNumber targetseg;
1760 BlockNumber nextsegno;
1761
1762 /* some way to handle non-existent segments needs to be specified */
1763 Assert(behavior &
1764 (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL |
1765 EXTENSION_DONT_OPEN));
1766
1767 targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1768
1769 /* if an existing and opened segment, we're done */
1770 if (targetseg < reln->md_num_open_segs[forknum])
1771 {
1772 v = &reln->md_seg_fds[forknum][targetseg];
1773 return v;
1774 }
1775
1776 /* The caller only wants the segment if we already had it open. */
1777 if (behavior & EXTENSION_DONT_OPEN)
1778 return NULL;
1779
1780 /*
1781 * The target segment is not yet open. Iterate over all the segments
1782 * between the last opened and the target segment. This way missing
1783 * segments either raise an error, or get created (according to
1784 * 'behavior'). Start with either the last opened, or the first segment if
1785 * none was opened before.
1786 */
1787 if (reln->md_num_open_segs[forknum] > 0)
1788 v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
1789 else
1790 {
1791 v = mdopenfork(reln, forknum, behavior);
1792 if (!v)
1793 return NULL; /* if behavior & EXTENSION_RETURN_NULL */
1794 }
1795
1796 for (nextsegno = reln->md_num_open_segs[forknum];
1797 nextsegno <= targetseg; nextsegno++)
1798 {
1799 BlockNumber nblocks = _mdnblocks(reln, forknum, v);
1800 int flags = 0;
1801
1802 Assert(nextsegno == v->mdfd_segno + 1);
1803
1804 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1805 elog(FATAL, "segment too big");
1806
1807 if ((behavior & EXTENSION_CREATE) ||
1808 (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
1809 {
1810 /*
1811 * Normally we will create new segments only if authorized by the
1812 * caller (i.e., we are doing mdextend()). But when doing WAL
1813 * recovery, create segments anyway; this allows cases such as
1814 * replaying WAL data that has a write into a high-numbered
1815 * segment of a relation that was later deleted. We want to go
1816 * ahead and create the segments so we can finish out the replay.
1817 *
1818 * We have to maintain the invariant that segments before the last
1819 * active segment are of size RELSEG_SIZE; therefore, if
1820 * extending, pad them out with zeroes if needed. (This only
1821 * matters if in recovery, or if the caller is extending the
1822 * relation discontiguously, but that can happen in hash indexes.)
1823 */
1824 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1825 {
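1826 char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
1827 MCXT_ALLOC_ZERO);
1828
1829 mdextend(reln, forknum,
1830 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1831 zerobuf, skipFsync);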
1832 pfree(zerobuf);
1833 }
1834 flags = O_CREAT;
1835 }
1836 else if (nblocks < ((BlockNumber) RELSEG_SIZE))
1837 {
1838 /*
1839 * When not extending, only open the next segment if the current
1840 * one is exactly RELSEG_SIZE. If not (this branch), either
1841 * return NULL or fail.
1842 */
1843 if (behavior & EXTENSION_RETURN_NULL)
1844 {
1845 /*
1846 * Some callers discern between reasons for _mdfd_getseg()
1847 * returning NULL based on errno. As there's no failing
1848 * syscall involved in this case, explicitly set errno to
1849 * ENOENT, as that seems the closest interpretation.
1850 */
1851 errno = ENOENT;
1852 return NULL;
1853 }
1854
1855 ereport(ERROR,
1856 (errcode_for_file_access(),
1857 errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
1858 _mdfd_segpath(reln, forknum, nextsegno).str,
1859 blkno, nblocks)));
1860 }
1861
1862 v = _mdfd_openseg(reln, forknum, nextsegno, flags);
1863
1864 if (v == NULL)
1865 {
1866 if ((behavior & EXTENSION_RETURN_NULL) &&
1867 FILE_POSSIBLY_DELETED(errno))
1868 return NULL;
1869 ereport(ERROR,
1870 (errcode_for_file_access(),
1871 errmsg("could not open file \"%s\" (target block %u): %m",
1872 _mdfd_segpath(reln, forknum, nextsegno).str,
1873 blkno)));
1874 }
1875 }
1876
1877 return v;
1878}

References _mdfd_openseg(), _mdfd_segpath(), _mdnblocks(), Assert, elog, ereport, errcode_for_file_access(), errmsg, ERROR, EXTENSION_CREATE, EXTENSION_CREATE_RECOVERY, EXTENSION_DONT_OPEN, EXTENSION_FAIL, EXTENSION_RETURN_NULL, FATAL, fb(), FILE_POSSIBLY_DELETED, InRecovery, MCXT_ALLOC_ZERO, mdextend(), _MdfdVec::mdfd_segno, mdopenfork(), palloc_aligned(), pfree(), PG_IO_ALIGN_SIZE, and str.

Referenced by mdextend(), mdfd(), mdprefetch(), mdreadv(), mdstartreadv(), mdwriteback(), mdwritev(), and mdzeroextend().

◆ _mdfd_open_flags()

static int _mdfd_open_flags ( void  )
inline static

Definition at line 177 of file md.c.

178{
179 int flags = O_RDWR | PG_BINARY;
180
181 if (io_direct_flags & IO_DIRECT_DATA)
182 flags |= PG_O_DIRECT;
183
184 return flags;
185}

References fb(), IO_DIRECT_DATA, io_direct_flags, PG_BINARY, and PG_O_DIRECT.

Referenced by _mdfd_openseg(), mdcreate(), mdopenfork(), and mdsyncfiletag().

◆ _mdfd_openseg()

static MdfdVec * _mdfd_openseg ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  segno,
int  oflags 
)
static

Definition at line 1712 of file md.c.

1714{
1715 MdfdVec *v;
1716 File fd;
1717 MdPathStr fullpath;
1718
1719 fullpath = _mdfd_segpath(reln, forknum, segno);
1720
1721 /* open the file */
1722 fd = PathNameOpenFile(fullpath.str, _mdfd_open_flags() | oflags);
1723
1724 if (fd < 0)
1725 return NULL;
1726
1727 /*
1728 * Segments are always opened in order from lowest to highest, so we must
1729 * be adding a new one at the end.
1730 */
1731 Assert(segno == reln->md_num_open_segs[forknum]);
1732
1733 _fdvec_resize(reln, forknum, segno + 1);
1734
1735 /* fill the entry */
1736 v = &reln->md_seg_fds[forknum][segno];
1737 v->mdfd_vfd = fd;
1738 v->mdfd_segno = segno;
1739
1740 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1741
1742 /* all done */
1743 return v;
1744}

References _fdvec_resize(), _mdfd_open_flags(), _mdfd_segpath(), _mdnblocks(), Assert, fb(), fd(), _MdfdVec::mdfd_segno, _MdfdVec::mdfd_vfd, PathNameOpenFile(), and MdPathStr::str.

Referenced by _mdfd_getseg(), mdimmedsync(), mdnblocks(), and mdregistersync().

◆ _mdfd_segpath()

static MdPathStr _mdfd_segpath ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  segno 
)
static

Definition at line 1692 of file md.c.

1693{
1694 RelPathStr path;
1695 MdPathStr fullpath;
1696
1697 path = relpath(reln->smgr_rlocator, forknum);
1698
1699 if (segno > 0)
1700 sprintf(fullpath.str, "%s.%u", path.str, segno);
1701 else
1702 strcpy(fullpath.str, path.str);
1703
1704 return fullpath;
1705}

References fb(), relpath, sprintf, MdPathStr::str, and RelPathStr::str.

Referenced by _mdfd_getseg(), _mdfd_openseg(), and mdsyncfiletag().

◆ _mdnblocks()

static BlockNumber _mdnblocks ( SMgrRelation  reln,
ForkNumber  forknum,
MdfdVec *seg
)
static

Definition at line 1884 of file md.c.

1885{
1886 pgoff_t len;
1887
1888 len = FileSize(seg->mdfd_vfd);
1889 if (len < 0)
1890 ereport(ERROR,
1891 (errcode_for_file_access(),
1892 errmsg("could not seek to end of file \"%s\": %m",
1893 FilePathName(seg->mdfd_vfd))));
1894 /* note that this calculation will ignore any partial block at EOF */
1895 return (BlockNumber) (len / BLCKSZ);
1896}

References ereport, errcode_for_file_access(), errmsg, ERROR, fb(), FilePathName(), FileSize(), len, and _MdfdVec::mdfd_vfd.

Referenced by _mdfd_getseg(), _mdfd_openseg(), mdextend(), mdnblocks(), mdopenfork(), and mdzeroextend().

◆ buffers_to_iovec()

static int buffers_to_iovec ( struct iovec *iov,
void **  buffers,
int  nblocks 
)
static

Definition at line 796 of file md.c.

797{
798 struct iovec *iovp;
799 int iovcnt;
800
801 Assert(nblocks >= 1);
802
803 /* If this build supports direct I/O, buffers must be I/O aligned. */
804 for (int i = 0; i < nblocks; ++i)
805 {
806 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
807 Assert((uintptr_t) buffers[i] ==
808 TYPEALIGN(PG_IO_ALIGN_SIZE, buffers[i]));
809 }
810
811 /* Start the first iovec off with the first buffer. */
812 iovp = &iov[0];
813 iovp->iov_base = buffers[0];
814 iovp->iov_len = BLCKSZ;
815 iovcnt = 1;
816
817 /* Try to merge the rest. */
818 for (int i = 1; i < nblocks; ++i)
819 {
820 void *buffer = buffers[i];
821
822 if (((char *) iovp->iov_base + iovp->iov_len) == buffer)
823 {
824 /* Contiguous with the last iovec. */
825 iovp->iov_len += BLCKSZ;
826 }
827 else
828 {
829 /* Need a new iovec. */
830 iovp++;
831 iovp->iov_base = buffer;
832 iovp->iov_len = BLCKSZ;
833 iovcnt++;
834 }
835 }
836
837 return iovcnt;
838}

References Assert, fb(), i, PG_IO_ALIGN_SIZE, PG_O_DIRECT, and TYPEALIGN.

Referenced by mdreadv(), mdstartreadv(), and mdwritev().

◆ do_truncate()

static int do_truncate ( const char *path)
static

Definition at line 354 of file md.c.

355{
356 int save_errno;
357 int ret;
358
359 ret = pg_truncate(path, 0);
360
361 /* Log a warning here to avoid repetition in callers. */
362 if (ret < 0 && errno != ENOENT)
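363 {
364 save_errno = errno;
365 ereport(WARNING,
366 (errcode_for_file_access(),
367 errmsg("could not truncate file \"%s\": %m", path)));
368 errno = save_errno;
369 }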
370
371 return ret;
372}

References ereport, errcode_for_file_access(), errmsg, fb(), pg_truncate(), and WARNING.

Referenced by mdunlinkfork().

◆ DropRelationFiles()

void DropRelationFiles ( RelFileLocator *delrels,
int  ndelrels,
bool  isRedo 
)

Definition at line 1612 of file md.c.

1613{
1615 int i;
1616
1618 for (i = 0; i < ndelrels; i++)
1619 {
1621
1622 if (isRedo)
1623 {
1625
1626 for (fork = 0; fork <= MAX_FORKNUM; fork++)
1628 }
1629 srels[i] = srel;
1630 }
1631
1633
1634 for (i = 0; i < ndelrels; i++)
1635 smgrclose(srels[i]);
1636 pfree(srels);
1637}

References fb(), i, INVALID_PROC_NUMBER, MAX_FORKNUM, palloc_array, pfree(), smgrclose(), smgrdounlinkall(), smgropen(), and XLogDropRelation().

Referenced by FinishPreparedTransaction(), xact_redo_abort(), and xact_redo_commit().

◆ ForgetDatabaseSyncRequests()

void ForgetDatabaseSyncRequests ( Oid  dbid)

Definition at line 1594 of file md.c.

1595{
1596 FileTag tag;
1597 RelFileLocator rlocator;
1598
1599 rlocator.dbOid = dbid;
1600 rlocator.spcOid = 0;
1601 rlocator.relNumber = 0;
1602
1604
1605 RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
1606}

References RelFileLocator::dbOid, INIT_MD_FILETAG, InvalidBlockNumber, InvalidForkNumber, RegisterSyncRequest(), RelFileLocator::relNumber, RelFileLocator::spcOid, and SYNC_FILTER_REQUEST.

Referenced by createdb_failure_callback(), dbase_redo(), and dropdb().

◆ md_readv_complete()

static PgAioResult md_readv_complete ( PgAioHandle *ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

Definition at line 1990 of file md.c.

1991{
1993 PgAioResult result = prior_result;
1994
1995 if (prior_result.result < 0)
1996 {
1997 result.status = PGAIO_RS_ERROR;
1998 result.id = PGAIO_HCB_MD_READV;
1999 /* For "hard" errors, track the error number in error_data */
2000 result.error_data = -prior_result.result;
2001 result.result = 0;
2002
2003 /*
2004 * Immediately log a message about the IO error, but only to the
2005 * server log. The reason to do so immediately is that the originator
2006 * might not process the query result immediately (because it is busy
2007 * doing another part of query processing) or at all (e.g. if it was
2008 * cancelled or errored out due to another IO also failing). The
2009 * definer of the IO will emit an ERROR when processing the IO's
2010 * results
2011 */
2013
2014 return result;
2015 }
2016
2017 /*
2018 * As explained above smgrstartreadv(), the smgr API operates on the level
2019 * of blocks, rather than bytes. Convert.
2020 */
2021 result.result /= BLCKSZ;
2022
2023 Assert(result.result <= td->smgr.nblocks);
2024
2025 if (result.result == 0)
2026 {
2027 /* consider 0 blocks read a failure */
2028 result.status = PGAIO_RS_ERROR;
2029 result.id = PGAIO_HCB_MD_READV;
2030 result.error_data = 0;
2031
2032 /* see comment above the "hard error" case */
2034
2035 return result;
2036 }
2037
2038 if (result.status != PGAIO_RS_ERROR &&
2039 result.result < td->smgr.nblocks)
2040 {
2041 /* partial reads should be retried at upper level */
2042 result.status = PGAIO_RS_PARTIAL;
2043 result.id = PGAIO_HCB_MD_READV;
2044 }
2045
2046 return result;
2047}

References Assert, PgAioResult::error_data, fb(), PgAioResult::id, LOG_SERVER_ONLY, PgAioTargetData::nblocks, PGAIO_HCB_MD_READV, pgaio_io_get_target_data(), pgaio_result_report(), PGAIO_RS_ERROR, PGAIO_RS_PARTIAL, PgAioResult::result, PgAioTargetData::smgr, and PgAioResult::status.

◆ md_readv_report()

static void md_readv_report ( PgAioResult  result,
const PgAioTargetData *td,
int  elevel 
)
static

Definition at line 2057 of file md.c.

2058{
2059 RelPathStr path;
2060
2061 path = relpathbackend(td->smgr.rlocator,
2063 td->smgr.forkNum);
2064
2065 if (result.error_data != 0)
2066 {
2067 /* for errcode_for_file_access() and %m */
2068 errno = result.error_data;
2069
2070 ereport(elevel,
2072 errmsg("could not read blocks %u..%u in file \"%s\": %m",
2073 td->smgr.blockNum,
2074 td->smgr.blockNum + td->smgr.nblocks - 1,
2075 path.str));
2076 }
2077 else
2078 {
2079 /*
2080 * NB: This will typically only be output in debug messages, while
2081 * retrying a partial IO.
2082 */
2083 ereport(elevel,
2085 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
2086 td->smgr.blockNum,
2087 td->smgr.blockNum + td->smgr.nblocks - 1,
2088 path.str,
2089 result.result * (size_t) BLCKSZ,
2090 td->smgr.nblocks * (size_t) BLCKSZ));
2091 }
2092}

References PgAioTargetData::blockNum, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errcode_for_file_access(), errmsg, PgAioResult::error_data, fb(), PgAioTargetData::forkNum, INVALID_PROC_NUMBER, PgAioTargetData::is_temp, MyProcNumber, PgAioTargetData::nblocks, relpathbackend, PgAioResult::result, PgAioTargetData::rlocator, PgAioTargetData::smgr, and RelPathStr::str.

◆ mdclose()

void mdclose ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 725 of file md.c.

726{
727 int nopensegs = reln->md_num_open_segs[forknum];
728
729 /* No work if already closed */
730 if (nopensegs == 0)
731 return;
732
733 /* close segments starting from the end */
734 while (nopensegs > 0)
735 {
736 MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1];
737
739 _fdvec_resize(reln, forknum, nopensegs - 1);
740 nopensegs--;
741 }
742}

References _fdvec_resize(), fb(), FileClose(), and _MdfdVec::mdfd_vfd.

Referenced by mdexists().

◆ mdcreate()

void mdcreate ( SMgrRelation  reln,
ForkNumber  forknum,
bool  isRedo 
)

Definition at line 223 of file md.c.

224{
225 MdfdVec *mdfd;
226 RelPathStr path;
227 File fd;
228
229 if (isRedo && reln->md_num_open_segs[forknum] > 0)
230 return; /* created and opened already... */
231
232 Assert(reln->md_num_open_segs[forknum] == 0);
233
234 /*
235 * We may be using the target table space for the first time in this
236 * database, so create a per-database subdirectory if needed.
237 *
238 * XXX this is a fairly ugly violation of module layering, but this seems
239 * to be the best place to put the check. Maybe TablespaceCreateDbspace
240 * should be here and not in commands/tablespace.c? But that would imply
241 * importing a lot of stuff that smgr.c oughtn't know, either.
242 */
243 TablespaceCreateDbspace(reln->smgr_rlocator.locator.spcOid,
244 reln->smgr_rlocator.locator.dbOid,
245 isRedo);
246
247 path = relpath(reln->smgr_rlocator, forknum);
248
250
251 if (fd < 0)
252 {
253 int save_errno = errno;
254
255 if (isRedo)
257 if (fd < 0)
258 {
259 /* be sure to report the error reported by create, not open */
263 errmsg("could not create file \"%s\": %m", path.str)));
264 }
265 }
266
267 _fdvec_resize(reln, forknum, 1);
268 mdfd = &reln->md_seg_fds[forknum][0];
269 mdfd->mdfd_vfd = fd;
270 mdfd->mdfd_segno = 0;
271
272 if (!SmgrIsTemp(reln))
274}

References _fdvec_resize(), _mdfd_open_flags(), Assert, ereport, errcode_for_file_access(), errmsg, ERROR, fb(), fd(), mdfd(), PathNameOpenFile(), register_dirty_segment(), relpath, SmgrIsTemp, RelPathStr::str, and TablespaceCreateDbspace().

◆ mdexists()

bool mdexists ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 204 of file md.c.

205{
206 /*
207 * Close it first, to ensure that we notice if the fork has been unlinked
208 * since we opened it. As an optimization, we can skip that in recovery,
209 * which already closes relations when dropping them.
210 */
211 if (!InRecovery)
212 mdclose(reln, forknum);
213
214 return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL);
215}

References EXTENSION_RETURN_NULL, fb(), InRecovery, mdclose(), and mdopenfork().

◆ mdextend()

void mdextend ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
const void *buffer,
bool  skipFsync 
)

Definition at line 488 of file md.c.

490{
491 pgoff_t seekpos;
492 int nbytes;
493 MdfdVec *v;
494
495 /* If this build supports direct I/O, the buffer must be I/O aligned. */
496 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
497 Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
498
499 /* This assert is too expensive to have on normally ... */
500#ifdef CHECK_WRITE_VS_EXTEND
501 Assert(blocknum >= mdnblocks(reln, forknum));
502#endif
503
504 /*
505 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
506 * more --- we mustn't create a block whose number actually is
507 * InvalidBlockNumber. (Note that this failure should be unreachable
508 * because of upstream checks in bufmgr.c.)
509 */
510 if (blocknum == InvalidBlockNumber)
513 errmsg("cannot extend file \"%s\" beyond %u blocks",
514 relpath(reln->smgr_rlocator, forknum).str,
516
517 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
518
519 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
520
521 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
522
523 if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
524 {
525 if (nbytes < 0)
528 errmsg("could not extend file \"%s\": %m",
530 errhint("Check free disk space.")));
531 /* short write: complain appropriately */
534 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
536 nbytes, BLCKSZ, blocknum),
537 errhint("Check free disk space.")));
538 }
539
540 if (!skipFsync && !SmgrIsTemp(reln))
541 register_dirty_segment(reln, forknum, v);
542
543 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
544}

References _mdfd_getseg(), _mdnblocks(), Assert, ereport, errcode(), errcode_for_file_access(), errhint(), errmsg, ERROR, EXTENSION_CREATE, fb(), FilePathName(), FileWrite(), InvalidBlockNumber, _MdfdVec::mdfd_vfd, mdnblocks(), PG_IO_ALIGN_SIZE, PG_O_DIRECT, register_dirty_segment(), relpath, SmgrIsTemp, and TYPEALIGN.

Referenced by _mdfd_getseg().

◆ mdfd()

int mdfd ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
uint32 *off
)

Definition at line 1495 of file md.c.

1496{
1497 MdfdVec *v = mdopenfork(reln, forknum, EXTENSION_FAIL);
1498
1499 v = _mdfd_getseg(reln, forknum, blocknum, false,
1501
1502 *off = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1503
1504 Assert(*off < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1505
1506 return FileGetRawDesc(v->mdfd_vfd);
1507}

References _mdfd_getseg(), Assert, EXTENSION_FAIL, fb(), FileGetRawDesc(), _MdfdVec::mdfd_vfd, and mdopenfork().

Referenced by mdcreate(), and mdopenfork().

◆ mdfiletagmatches()

bool mdfiletagmatches ( const FileTag *ftag,
const FileTag *candidate
)

Definition at line 1975 of file md.c.

1976{
1977 /*
1978 * For now we only use filter requests as a way to drop all scheduled
1979 * callbacks relating to a given database, when dropping the database.
1980 * We'll return true for all candidates that have the same database OID as
1981 * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
1982 */
1983 return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
1984}

References RelFileLocator::dbOid, fb(), and FileTag::rlocator.

◆ mdimmedsync()

void mdimmedsync ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 1442 of file md.c.

1443{
1444 int segno;
1445 int min_inactive_seg;
1446
1447 /*
1448 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1449 * the loop below will get them all!
1450 */
1451 mdnblocks(reln, forknum);
1452
1453 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1454
1455 /*
1456 * Temporarily open inactive segments, then close them after sync. There
1457 * may be some inactive segments left opened after fsync() error, but that
1458 * is harmless. We don't bother to clean them up and take a risk of
1459 * further trouble. The next mdclose() will soon close them.
1460 */
1461 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1462 segno++;
1463
1464 while (segno > 0)
1465 {
1466 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1467
1468 /*
1469 * fsyncs done through mdimmedsync() should be tracked in a separate
1470 * IOContext than those done through mdsyncfiletag() to differentiate
1471 * between unavoidable client backend fsyncs (e.g. those done during
1472 * index build) and those which ideally would have been done by the
1473 * checkpointer. Since other IO operations bypassing the buffer
1474 * manager could also be tracked in such an IOContext, wait until
1475 * these are also tracked to track immediate fsyncs.
1476 */
1480 errmsg("could not fsync file \"%s\": %m",
1481 FilePathName(v->mdfd_vfd))));
1482
1483 /* Close inactive segments immediately */
1484 if (segno > min_inactive_seg)
1485 {
1486 FileClose(v->mdfd_vfd);
1487 _fdvec_resize(reln, forknum, segno - 1);
1488 }
1489
1490 segno--;
1491 }
1492}

References _fdvec_resize(), _mdfd_openseg(), data_sync_elevel(), ereport, errcode_for_file_access(), errmsg, ERROR, fb(), FileClose(), FilePathName(), FileSync(), _MdfdVec::mdfd_vfd, and mdnblocks().

◆ mdinit()

void mdinit ( void  )

Definition at line 191 of file md.c.

References ALLOCSET_DEFAULT_SIZES, AllocSetContextCreate, MdCxt, and TopMemoryContext.

◆ mdmaxcombine()

uint32 mdmaxcombine ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum 
)

Definition at line 845 of file md.c.

847{
848 BlockNumber segoff;
849
850 segoff = blocknum % ((BlockNumber) RELSEG_SIZE);
851
852 return RELSEG_SIZE - segoff;
853}

References fb().

◆ mdnblocks()

BlockNumber mdnblocks ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 1235 of file md.c.

1236{
1237 MdfdVec *v;
1238 BlockNumber nblocks;
1239 BlockNumber segno;
1240
1241 mdopenfork(reln, forknum, EXTENSION_FAIL);
1242
1243 /* mdopen has opened the first segment */
1244 Assert(reln->md_num_open_segs[forknum] > 0);
1245
1246 /*
1247 * Start from the last open segments, to avoid redundant seeks. We have
1248 * previously verified that these segments are exactly RELSEG_SIZE long,
1249 * and it's useless to recheck that each time.
1250 *
1251 * NOTE: this assumption could only be wrong if another backend has
1252 * truncated the relation. We rely on higher code levels to handle that
1253 * scenario by closing and re-opening the md fd, which is handled via
1254 * relcache flush. (Since the checkpointer doesn't participate in
1255 * relcache flush, it could have segment entries for inactive segments;
1256 * that's OK because the checkpointer never needs to compute relation
1257 * size.)
1258 */
1259 segno = reln->md_num_open_segs[forknum] - 1;
1260 v = &reln->md_seg_fds[forknum][segno];
1261
1262 for (;;)
1263 {
1264 nblocks = _mdnblocks(reln, forknum, v);
1265 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1266 elog(FATAL, "segment too big");
1267 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1268 return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
1269
1270 /*
1271 * If segment is exactly RELSEG_SIZE, advance to next one.
1272 */
1273 segno++;
1274
1275 /*
1276 * We used to pass O_CREAT here, but that has the disadvantage that it
1277 * might create a segment which has vanished through some operating
1278 * system misadventure. In such a case, creating the segment here
1279 * undermines _mdfd_getseg's attempts to notice and report an error
1280 * upon access to a missing segment.
1281 */
1282 v = _mdfd_openseg(reln, forknum, segno, 0);
1283 if (v == NULL)
1284 return segno * ((BlockNumber) RELSEG_SIZE);
1285 }
1286}

References _mdfd_openseg(), _mdnblocks(), Assert, elog, EXTENSION_FAIL, FATAL, fb(), and mdopenfork().

Referenced by mdextend(), mdimmedsync(), mdregistersync(), mdwritev(), and mdzeroextend().

◆ mdopen()

void mdopen ( SMgrRelation  reln)

Definition at line 714 of file md.c.

715{
716 /* mark it not open */
717 for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
718 reln->md_num_open_segs[forknum] = 0;
719}

References fb(), and MAX_FORKNUM.

◆ mdopenfork()

static MdfdVec * mdopenfork ( SMgrRelation  reln,
ForkNumber  forknum,
int  behavior 
)
static

Definition at line 676 of file md.c.

677{
678 MdfdVec *mdfd;
679 RelPathStr path;
680 File fd;
681
682 /* No work if already open */
683 if (reln->md_num_open_segs[forknum] > 0)
684 return &reln->md_seg_fds[forknum][0];
685
686 path = relpath(reln->smgr_rlocator, forknum);
687
689
690 if (fd < 0)
691 {
692 if ((behavior & EXTENSION_RETURN_NULL) &&
694 return NULL;
697 errmsg("could not open file \"%s\": %m", path.str)));
698 }
699
700 _fdvec_resize(reln, forknum, 1);
701 mdfd = &reln->md_seg_fds[forknum][0];
702 mdfd->mdfd_vfd = fd;
703 mdfd->mdfd_segno = 0;
704
706
707 return mdfd;
708}

References _fdvec_resize(), _mdfd_open_flags(), _mdnblocks(), Assert, ereport, errcode_for_file_access(), errmsg, ERROR, EXTENSION_RETURN_NULL, fb(), fd(), FILE_POSSIBLY_DELETED, mdfd(), PathNameOpenFile(), relpath, and RelPathStr::str.

Referenced by _mdfd_getseg(), mdexists(), mdfd(), and mdnblocks().

◆ mdprefetch()

bool mdprefetch ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
int  nblocks 
)

Definition at line 748 of file md.c.

750{
751#ifdef USE_PREFETCH
752
754
755 if ((uint64) blocknum + nblocks > (uint64) MaxBlockNumber + 1)
756 return false;
757
758 while (nblocks > 0)
759 {
760 pgoff_t seekpos;
761 MdfdVec *v;
763
764 v = _mdfd_getseg(reln, forknum, blocknum, false,
766 if (v == NULL)
767 return false;
768
769 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
770
771 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
772
774 Min(nblocks,
775 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
776
779
780 blocknum += nblocks_this_segment;
781 nblocks -= nblocks_this_segment;
782 }
783#endif /* USE_PREFETCH */
784
785 return true;
786}

References _mdfd_getseg(), Assert, EXTENSION_FAIL, EXTENSION_RETURN_NULL, fb(), FilePrefetch(), InRecovery, IO_DIRECT_DATA, io_direct_flags, MaxBlockNumber, _MdfdVec::mdfd_vfd, and Min.

◆ mdreadv()

void mdreadv ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
void **  buffers,
BlockNumber  nblocks 
)

Definition at line 859 of file md.c.

861{
862 while (nblocks > 0)
863 {
864 struct iovec iov[PG_IOV_MAX];
865 int iovcnt;
866 pgoff_t seekpos;
867 int nbytes;
868 MdfdVec *v;
871 size_t size_this_segment;
872
873 v = _mdfd_getseg(reln, forknum, blocknum, false,
875
876 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
877
878 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
879
881 Min(nblocks,
882 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
884
885 if (nblocks_this_segment != nblocks)
886 elog(ERROR, "read crosses segment boundary");
887
891
892 /*
893 * Inner loop to continue after a short read. We'll keep going until
894 * we hit EOF rather than assuming that a short read means we hit the
895 * end.
896 */
897 for (;;)
898 {
899 TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
900 reln->smgr_rlocator.locator.spcOid,
901 reln->smgr_rlocator.locator.dbOid,
902 reln->smgr_rlocator.locator.relNumber,
903 reln->smgr_rlocator.backend);
904 nbytes = FileReadV(v->mdfd_vfd, iov, iovcnt, seekpos,
906 TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
907 reln->smgr_rlocator.locator.spcOid,
908 reln->smgr_rlocator.locator.dbOid,
909 reln->smgr_rlocator.locator.relNumber,
910 reln->smgr_rlocator.backend,
911 nbytes,
913
914#ifdef SIMULATE_SHORT_READ
915 nbytes = Min(nbytes, 4096);
916#endif
917
918 if (nbytes < 0)
921 errmsg("could not read blocks %u..%u in file \"%s\": %m",
922 blocknum,
923 blocknum + nblocks_this_segment - 1,
924 FilePathName(v->mdfd_vfd))));
925
926 if (nbytes == 0)
927 {
928 /*
929 * We are at or past EOF, or we read a partial block at EOF.
930 * Normally this is an error; upper levels should never try to
931 * read a nonexistent block. However, if zero_damaged_pages
932 * is ON or we are InRecovery, we should instead return zeroes
933 * without complaining. This allows, for example, the case of
934 * trying to update a block that was later truncated away.
935 *
936 * NB: We think that this codepath is unreachable in recovery
937 * and incomplete with zero_damaged_pages, as missing segments
938 * are not created. Putting blocks into the buffer-pool that
939 * do not exist on disk is rather problematic, as it will not
940 * be found by scans that rely on smgrnblocks(), as they are
941 * beyond EOF. It also can cause weird problems with relation
942 * extension, as relation extension does not expect blocks
943 * beyond EOF to exist.
944 *
945 * Therefore we do not want to copy the logic into
946 * mdstartreadv(), where it would have to be more complicated
947 * due to potential differences in the zero_damaged_pages
948 * setting between the definer and completor of IO.
949 *
950 * For PG 18, we are putting an Assert(false) in mdreadv()
951 * (triggering failures in assertion-enabled builds, but
952 * continuing to work in production builds). Afterwards we
953 * plan to remove this code entirely.
954 */
956 {
957 Assert(false); /* see comment above */
958
961 ++i)
962 memset(buffers[i], 0, BLCKSZ);
963 break;
964 }
965 else
968 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
969 blocknum,
970 blocknum + nblocks_this_segment - 1,
974 }
975
976 /* One loop should usually be enough. */
977 transferred_this_segment += nbytes;
980 break;
981
982 /* Adjust position and vectors after a short read. */
983 seekpos += nbytes;
985 }
986
987 nblocks -= nblocks_this_segment;
988 buffers += nblocks_this_segment;
989 blocknum += nblocks_this_segment;
990 }
991}

References _mdfd_getseg(), Assert, buffers_to_iovec(), compute_remaining_iovec(), elog, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errcode_for_file_access(), errmsg, ERROR, EXTENSION_CREATE_RECOVERY, EXTENSION_FAIL, fb(), FilePathName(), FileReadV(), i, InRecovery, lengthof, _MdfdVec::mdfd_vfd, Min, PG_IOV_MAX, and zero_damaged_pages.

◆ mdregistersync()

void mdregistersync ( SMgrRelation  reln,
ForkNumber  forknum 
)

Definition at line 1391 of file md.c.

1392{
1393 int segno;
1394 int min_inactive_seg;
1395
1396 /*
1397 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1398 * the loop below will get them all!
1399 */
1400 mdnblocks(reln, forknum);
1401
1402 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1403
1404 /*
1405 * Temporarily open inactive segments, then close them after sync. There
1406 * may be some inactive segments left opened after error, but that is
1407 * harmless. We don't bother to clean them up and take a risk of further
1408 * trouble. The next mdclose() will soon close them.
1409 */
1410 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1411 segno++;
1412
1413 while (segno > 0)
1414 {
1415 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1416
1417 register_dirty_segment(reln, forknum, v);
1418
1419 /* Close inactive segments immediately */
1420 if (segno > min_inactive_seg)
1421 {
1422 FileClose(v->mdfd_vfd);
1423 _fdvec_resize(reln, forknum, segno - 1);
1424 }
1425
1426 segno--;
1427 }
1428}

References _fdvec_resize(), _mdfd_openseg(), fb(), FileClose(), _MdfdVec::mdfd_vfd, mdnblocks(), and register_dirty_segment().

◆ mdstartreadv()

void mdstartreadv ( PgAioHandle *  ioh,
SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
void **  buffers,
BlockNumber  nblocks 
)

Definition at line 997 of file md.c.

1000{
1001 pgoff_t seekpos;
1002 MdfdVec *v;
1004 struct iovec *iov;
1005 int iovcnt;
1006 int ret;
1007
1008 v = _mdfd_getseg(reln, forknum, blocknum, false,
1010
1011 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1012
1013 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1014
1016 Min(nblocks,
1017 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
1018
1019 if (nblocks_this_segment != nblocks)
1020 elog(ERROR, "read crossing segment boundary");
1021
1023
1024 Assert(nblocks <= iovcnt);
1025
1027
1029
1032
1034 reln,
1035 forknum,
1036 blocknum,
1037 nblocks,
1038 false);
1040
1042 if (ret != 0)
1043 ereport(ERROR,
1045 errmsg("could not start reading blocks %u..%u in file \"%s\": %m",
1046 blocknum,
1047 blocknum + nblocks_this_segment - 1,
1048 FilePathName(v->mdfd_vfd))));
1049
1050 /*
1051 * The error checks corresponding to the post-read checks in mdreadv() are
1052 * in md_readv_complete().
1053 *
1054 * However we chose, at least for now, to not implement the
1055 * zero_damaged_pages logic present in mdreadv(). As outlined in mdreadv()
1056 * that logic is rather problematic, and we want to get rid of it. Here
1057 * equivalent logic would have to be more complicated due to potential
1058 * differences in the zero_damaged_pages setting between the definer and
1059 * completor of IO.
1060 */
1061}

References _mdfd_getseg(), Assert, buffers_to_iovec(), elog, ereport, errcode_for_file_access(), errmsg, ERROR, EXTENSION_CREATE_RECOVERY, EXTENSION_FAIL, fb(), FilePathName(), FileStartReadV(), IO_DIRECT_DATA, io_direct_flags, _MdfdVec::mdfd_vfd, Min, PGAIO_HCB_MD_READV, PGAIO_HF_BUFFERED, pgaio_io_get_iovec(), pgaio_io_register_callbacks(), pgaio_io_set_flag(), and pgaio_io_set_target_smgr().

◆ mdsyncfiletag()

int mdsyncfiletag ( const FileTag *  ftag,
char *  path 
)

Definition at line 1905 of file md.c.

1906{
1908 File file;
1910 bool need_to_close;
1911 int result,
1912 save_errno;
1913
1914 /* See if we already have the file open, or need to open it. */
1915 if (ftag->segno < reln->md_num_open_segs[ftag->forknum])
1916 {
1917 file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd;
1918 strlcpy(path, FilePathName(file), MAXPGPATH);
1919 need_to_close = false;
1920 }
1921 else
1922 {
1923 MdPathStr p;
1924
1925 p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
1926 strlcpy(path, p.str, MD_PATH_STR_MAXLEN);
1927
1928 file = PathNameOpenFile(path, _mdfd_open_flags());
1929 if (file < 0)
1930 return -1;
1931 need_to_close = true;
1932 }
1933
1935
1936 /* Sync the file. */
1937 result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
1938 save_errno = errno;
1939
1940 if (need_to_close)
1941 FileClose(file);
1942
1944 IOOP_FSYNC, io_start, 1, 0);
1945
1946 errno = save_errno;
1947 return result;
1948}

References _mdfd_open_flags(), _mdfd_segpath(), fb(), FileClose(), FilePathName(), FileSync(), FileTag::forknum, INVALID_PROC_NUMBER, IOCONTEXT_NORMAL, IOOBJECT_RELATION, IOOP_FSYNC, MAXPGPATH, MD_PATH_STR_MAXLEN, PathNameOpenFile(), pgstat_count_io_op_time(), pgstat_prepare_io_time(), FileTag::rlocator, FileTag::segno, smgropen(), MdPathStr::str, strlcpy(), and track_io_timing.

◆ mdtruncate()

void mdtruncate ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  curnblk,
BlockNumber  nblocks 
)

Definition at line 1302 of file md.c.

1304{
1306 int curopensegs;
1307
1308 if (nblocks > curnblk)
1309 {
1310 /* Bogus request ... but no complaint if InRecovery */
1311 if (InRecovery)
1312 return;
1313 ereport(ERROR,
1314 (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
1315 relpath(reln->smgr_rlocator, forknum).str,
1316 nblocks, curnblk)));
1317 }
1318 if (nblocks == curnblk)
1319 return; /* no work */
1320
1321 /*
1322 * Truncate segments, starting at the last one. Starting at the end makes
1323 * managing the memory for the fd array easier, should there be errors.
1324 */
1325 curopensegs = reln->md_num_open_segs[forknum];
1326 while (curopensegs > 0)
1327 {
1328 MdfdVec *v;
1329
1331
1332 v = &reln->md_seg_fds[forknum][curopensegs - 1];
1333
1334 if (priorblocks > nblocks)
1335 {
1336 /*
1337 * This segment is no longer active. We truncate the file, but do
1338 * not delete it, for reasons explained in the header comments.
1339 */
1341 ereport(ERROR,
1343 errmsg("could not truncate file \"%s\": %m",
1344 FilePathName(v->mdfd_vfd))));
1345
1346 if (!SmgrIsTemp(reln))
1347 register_dirty_segment(reln, forknum, v);
1348
1349 /* we never drop the 1st segment */
1350 Assert(v != &reln->md_seg_fds[forknum][0]);
1351
1352 FileClose(v->mdfd_vfd);
1353 _fdvec_resize(reln, forknum, curopensegs - 1);
1354 }
1355 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
1356 {
1357 /*
1358 * This is the last segment we want to keep. Truncate the file to
1359 * the right length. NOTE: if nblocks is exactly a multiple K of
1360 * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
1361 * keep it. This adheres to the invariant given in the header
1362 * comments.
1363 */
1365
1367 ereport(ERROR,
1369 errmsg("could not truncate file \"%s\" to %u blocks: %m",
1371 nblocks)));
1372 if (!SmgrIsTemp(reln))
1373 register_dirty_segment(reln, forknum, v);
1374 }
1375 else
1376 {
1377 /*
1378 * We still need this segment, so nothing to do for this and any
1379 * earlier segment.
1380 */
1381 break;
1382 }
1383 curopensegs--;
1384 }
1385}

References _fdvec_resize(), Assert, ereport, errcode_for_file_access(), errmsg, ERROR, fb(), FileClose(), FilePathName(), FileTruncate(), InRecovery, _MdfdVec::mdfd_vfd, register_dirty_segment(), relpath, and SmgrIsTemp.

◆ mdunlink()

void mdunlink ( RelFileLocatorBackend  rlocator,
ForkNumber  forknum,
bool  isRedo 
)

Definition at line 338 of file md.c.

339{
340 /* Now do the per-fork work */
341 if (forknum == InvalidForkNumber)
342 {
343 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
344 mdunlinkfork(rlocator, forknum, isRedo);
345 }
346 else
347 mdunlinkfork(rlocator, forknum, isRedo);
348}

References fb(), InvalidForkNumber, MAX_FORKNUM, and mdunlinkfork().

◆ mdunlinkfiletag()

int mdunlinkfiletag ( const FileTag *  ftag,
char *  path 
)

Definition at line 1957 of file md.c.

1958{
1959 RelPathStr p;
1960
1961 /* Compute the path. */
1962 p = relpathperm(ftag->rlocator, MAIN_FORKNUM);
1963 strlcpy(path, p.str, MAXPGPATH);
1964
1965 /* Try to unlink the file. */
1966 return unlink(path);
1967}

References fb(), MAIN_FORKNUM, MAXPGPATH, relpathperm, FileTag::rlocator, RelPathStr::str, and strlcpy().

◆ mdunlinkfork()

static void mdunlinkfork ( RelFileLocatorBackend  rlocator,
ForkNumber  forknum,
bool  isRedo 
)
static

Definition at line 375 of file md.c.

376{
377 RelPathStr path;
378 int ret;
379 int save_errno;
380
381 path = relpath(rlocator, forknum);
382
383 /*
384 * Truncate and then unlink the first segment, or just register a request
385 * to unlink it later, as described in the comments for mdunlink().
386 */
387 if (isRedo || IsBinaryUpgrade || forknum != MAIN_FORKNUM ||
389 {
390 if (!RelFileLocatorBackendIsTemp(rlocator))
391 {
392 /* Prevent other backends' fds from holding on to the disk space */
393 ret = do_truncate(path.str);
394
395 /* Forget any pending sync requests for the first segment */
397 register_forget_request(rlocator, forknum, 0 /* first seg */ );
399 }
400 else
401 ret = 0;
402
403 /* Next unlink the file, unless it was already found to be missing */
404 if (ret >= 0 || errno != ENOENT)
405 {
406 ret = unlink(path.str);
407 if (ret < 0 && errno != ENOENT)
408 {
412 errmsg("could not remove file \"%s\": %m", path.str)));
414 }
415 }
416 }
417 else
418 {
419 /* Prevent other backends' fds from holding on to the disk space */
420 ret = do_truncate(path.str);
421
422 /* Register request to unlink first segment later */
424 register_unlink_segment(rlocator, forknum, 0 /* first seg */ );
426 }
427
428 /*
429 * Delete any additional segments.
430 *
431 * Note that because we loop until getting ENOENT, we will correctly
432 * remove all inactive segments as well as active ones. Ideally we'd
433 * continue the loop until getting exactly that errno, but that risks an
434 * infinite loop if the problem is directory-wide (for instance, if we
435 * suddenly can't read the data directory itself). We compromise by
436 * continuing after a non-ENOENT truncate error, but stopping after any
437 * unlink error. If there is indeed a directory-wide problem, additional
438 * unlink attempts wouldn't work anyway.
439 */
440 if (ret >= 0 || errno != ENOENT)
441 {
443 BlockNumber segno;
444
445 for (segno = 1;; segno++)
446 {
447 sprintf(segpath.str, "%s.%u", path.str, segno);
448
449 if (!RelFileLocatorBackendIsTemp(rlocator))
450 {
451 /*
452 * Prevent other backends' fds from holding on to the disk
453 * space. We're done if we see ENOENT, though.
454 */
455 if (do_truncate(segpath.str) < 0 && errno == ENOENT)
456 break;
457
458 /*
459 * Forget any pending sync requests for this segment before we
460 * try to unlink.
461 */
462 register_forget_request(rlocator, forknum, segno);
463 }
464
465 if (unlink(segpath.str) < 0)
466 {
467 /* ENOENT is expected after the last segment... */
468 if (errno != ENOENT)
471 errmsg("could not remove file \"%s\": %m", segpath.str)));
472 break;
473 }
474 }
475 }
476}

References do_truncate(), ereport, errcode_for_file_access(), errmsg, fb(), IsBinaryUpgrade, MAIN_FORKNUM, register_forget_request(), register_unlink_segment(), RelFileLocatorBackendIsTemp, relpath, sprintf, RelPathStr::str, and WARNING.

Referenced by mdunlink().

◆ mdwriteback()

void mdwriteback ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
BlockNumber  nblocks 
)

Definition at line 1176 of file md.c.

1178{
1180
1181 /*
1182 * Issue flush requests in as few requests as possible; have to split at
1183 * segment boundaries though, since those are actually separate files.
1184 */
1185 while (nblocks > 0)
1186 {
1187 BlockNumber nflush = nblocks;
1188 pgoff_t seekpos;
1189 MdfdVec *v;
1190 int segnum_start,
1191 segnum_end;
1192
1193 v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
1195
1196 /*
1197 * We might be flushing buffers of already removed relations, that's
1198 * ok, just ignore that case. If the segment file wasn't open already
1199 * (ie from a recent mdwrite()), then we don't want to re-open it, to
1200 * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave
1201 * us with a descriptor to a file that is about to be unlinked.
1202 */
1203 if (!v)
1204 return;
1205
1206 /* compute offset inside the current segment */
1207 segnum_start = blocknum / RELSEG_SIZE;
1208
1209 /* compute number of desired writes within the current segment */
1210 segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
1211 if (segnum_start != segnum_end)
1212 nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
1213
1214 Assert(nflush >= 1);
1215 Assert(nflush <= nblocks);
1216
1217 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1218
1220
1221 nblocks -= nflush;
1222 blocknum += nflush;
1223 }
1224}

References _mdfd_getseg(), Assert, EXTENSION_DONT_OPEN, fb(), FileWriteback(), IO_DIRECT_DATA, io_direct_flags, and _MdfdVec::mdfd_vfd.

◆ mdwritev()

void mdwritev ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
const void **  buffers,
BlockNumber  nblocks,
bool  skipFsync 
)

Definition at line 1071 of file md.c.

1073{
1074 /* This assert is too expensive to have on normally ... */
1075#ifdef CHECK_WRITE_VS_EXTEND
1076 Assert((uint64) blocknum + (uint64) nblocks <= (uint64) mdnblocks(reln, forknum));
1077#endif
1078
1079 while (nblocks > 0)
1080 {
1081 struct iovec iov[PG_IOV_MAX];
1082 int iovcnt;
1083 pgoff_t seekpos;
1084 int nbytes;
1085 MdfdVec *v;
1088 size_t size_this_segment;
1089
1090 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
1092
1093 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1094
1095 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1096
1098 Min(nblocks,
1099 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
1101
1102 if (nblocks_this_segment != nblocks)
1103 elog(ERROR, "write crosses segment boundary");
1104
1105 iovcnt = buffers_to_iovec(iov, (void **) buffers, nblocks_this_segment);
1108
1109 /*
1110 * Inner loop to continue after a short write. If the reason is that
1111 * we're out of disk space, a future attempt should get an ENOSPC
1112 * error from the kernel.
1113 */
1114 for (;;)
1115 {
1116 TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
1117 reln->smgr_rlocator.locator.spcOid,
1118 reln->smgr_rlocator.locator.dbOid,
1119 reln->smgr_rlocator.locator.relNumber,
1120 reln->smgr_rlocator.backend);
1121 nbytes = FileWriteV(v->mdfd_vfd, iov, iovcnt, seekpos,
1123 TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
1124 reln->smgr_rlocator.locator.spcOid,
1125 reln->smgr_rlocator.locator.dbOid,
1126 reln->smgr_rlocator.locator.relNumber,
1127 reln->smgr_rlocator.backend,
1128 nbytes,
1130
1131#ifdef SIMULATE_SHORT_WRITE
1132 nbytes = Min(nbytes, 4096);
1133#endif
1134
1135 if (nbytes < 0)
1136 {
1137 bool enospc = errno == ENOSPC;
1138
1139 ereport(ERROR,
1141 errmsg("could not write blocks %u..%u in file \"%s\": %m",
1142 blocknum,
1143 blocknum + nblocks_this_segment - 1,
1145 enospc ? errhint("Check free disk space.") : 0));
1146 }
1147
1148 /* One loop should usually be enough. */
1149 transferred_this_segment += nbytes;
1152 break;
1153
1154 /* Adjust position and iovecs after a short write. */
1155 seekpos += nbytes;
1157 }
1158
1159 if (!skipFsync && !SmgrIsTemp(reln))
1160 register_dirty_segment(reln, forknum, v);
1161
1162 nblocks -= nblocks_this_segment;
1163 buffers += nblocks_this_segment;
1164 blocknum += nblocks_this_segment;
1165 }
1166}

References _mdfd_getseg(), Assert, buffers_to_iovec(), compute_remaining_iovec(), elog, ereport, errcode_for_file_access(), errhint(), errmsg, ERROR, EXTENSION_CREATE_RECOVERY, EXTENSION_FAIL, fb(), FilePathName(), FileWriteV(), lengthof, _MdfdVec::mdfd_vfd, mdnblocks(), Min, PG_IOV_MAX, register_dirty_segment(), and SmgrIsTemp.

◆ mdzeroextend()

void mdzeroextend ( SMgrRelation  reln,
ForkNumber  forknum,
BlockNumber  blocknum,
int  nblocks,
bool  skipFsync 
)

Definition at line 553 of file md.c.

555{
556 MdfdVec *v;
557 BlockNumber curblocknum = blocknum;
558 int remblocks = nblocks;
559
560 Assert(nblocks > 0);
561
562 /* This assert is too expensive to have on normally ... */
563#ifdef CHECK_WRITE_VS_EXTEND
564 Assert(blocknum >= mdnblocks(reln, forknum));
565#endif
566
567 /*
568 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
569 * more --- we mustn't create a block whose number actually is
570 * InvalidBlockNumber or larger.
571 */
572 if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
575 errmsg("cannot extend file \"%s\" beyond %u blocks",
576 relpath(reln->smgr_rlocator, forknum).str,
578
579 while (remblocks > 0)
580 {
582 pgoff_t seekpos = (pgoff_t) BLCKSZ * segstartblock;
583 int numblocks;
584
587 else
589
591
594
595 /*
596 * If available and useful, use posix_fallocate() (via
597 * FileFallocate()) to extend the relation. That's often more
598 * efficient than using write(), as it commonly won't cause the kernel
599 * to allocate page cache space for the extended pages.
600 *
601 * However, we don't use FileFallocate() for small extensions, as it
602 * defeats delayed allocation on some filesystems. Not clear where
603 * that decision should be made though? For now just use a cutoff of
604 * 8, anything between 4 and 8 worked OK in some local testing.
605 */
606 if (numblocks > 8 &&
608 {
609 int ret = 0;
610
611#ifdef HAVE_POSIX_FALLOCATE
613 {
614 ret = FileFallocate(v->mdfd_vfd,
615 seekpos, (pgoff_t) BLCKSZ * numblocks,
617 }
618 else
619#endif
620 {
621 elog(ERROR, "unsupported file_extend_method: %d",
623 }
624 if (ret != 0)
625 {
628 errmsg("could not extend file \"%s\" with FileFallocate(): %m",
630 errhint("Check free disk space."));
631 }
632 }
633 else
634 {
635 int ret;
636
637 /*
638 * Even if we don't want to use fallocate, we can still extend a
639 * bit more efficiently than writing each 8kB block individually.
640 * pg_pwrite_zeros() (via FileZero()) uses pg_pwritev_with_retry()
641 * to avoid multiple writes or needing a zeroed buffer for the
642 * whole length of the extension.
643 */
644 ret = FileZero(v->mdfd_vfd,
645 seekpos, (pgoff_t) BLCKSZ * numblocks,
647 if (ret < 0)
650 errmsg("could not extend file \"%s\": %m",
652 errhint("Check free disk space."));
653 }
654
655 if (!skipFsync && !SmgrIsTemp(reln))
656 register_dirty_segment(reln, forknum, v);
657
658 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
659
662 }
663}

References _mdfd_getseg(), _mdnblocks(), Assert, elog, ereport, errcode(), errcode_for_file_access(), errhint(), errmsg, ERROR, EXTENSION_CREATE, fb(), file_extend_method, FILE_EXTEND_METHOD_WRITE_ZEROS, FileFallocate(), FilePathName(), FileZero(), InvalidBlockNumber, _MdfdVec::mdfd_vfd, mdnblocks(), register_dirty_segment(), relpath, and SmgrIsTemp.

◆ register_dirty_segment()

static void register_dirty_segment ( SMgrRelation  reln,
ForkNumber  forknum,
MdfdVec *  seg 
)
static

Definition at line 1519 of file md.c.

1520{
1521 FileTag tag;
1522
1523 INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno);
1524
1525 /* Temp relations should never be fsync'd */
1527
1528 if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
1529 {
1531
1533 (errmsg_internal("could not forward fsync request because request queue is full")));
1534
1536
1540 errmsg("could not fsync file \"%s\": %m",
1541 FilePathName(seg->mdfd_vfd))));
1542
1543 /*
1544 * We have no way of knowing if the current IOContext is
1545 * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
1546 * point, so count the fsync as being in the IOCONTEXT_NORMAL
1547 * IOContext. This is probably okay, because the number of backend
1548 * fsyncs doesn't say anything about the efficacy of the
1549 * BufferAccessStrategy. And counting both fsyncs done in
1550 * IOCONTEXT_NORMAL and IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] under
1551 * IOCONTEXT_NORMAL is likely clearer when investigating the number of
1552 * backend fsyncs.
1553 */
1555 IOOP_FSYNC, io_start, 1, 0);
1556 }
1557}

References Assert, data_sync_elevel(), DEBUG1, ereport, errcode_for_file_access(), errmsg, errmsg_internal(), ERROR, fb(), FilePathName(), FileSync(), INIT_MD_FILETAG, IOCONTEXT_NORMAL, IOOBJECT_RELATION, IOOP_FSYNC, _MdfdVec::mdfd_segno, _MdfdVec::mdfd_vfd, pgstat_count_io_op_time(), pgstat_prepare_io_time(), RegisterSyncRequest(), SmgrIsTemp, SYNC_REQUEST, and track_io_timing.

Referenced by mdcreate(), mdextend(), mdregistersync(), mdtruncate(), mdwritev(), and mdzeroextend().

◆ register_forget_request()

static void register_forget_request ( RelFileLocatorBackend  rlocator,
ForkNumber  forknum,
BlockNumber  segno 
)
static

Definition at line 1580 of file md.c.

1582{
1583 FileTag tag;
1584
1585 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1586
1587 RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
1588}

References INIT_MD_FILETAG, RelFileLocatorBackend::locator, RegisterSyncRequest(), and SYNC_FORGET_REQUEST.

Referenced by mdunlinkfork().

◆ register_unlink_segment()

static void register_unlink_segment ( RelFileLocatorBackend  rlocator,
ForkNumber  forknum,
BlockNumber  segno 
)
static

Definition at line 1563 of file md.c.

1565{
1566 FileTag tag;
1567
1568 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1569
1570 /* Should never be used with temp relations */
1572
1573 RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
1574}

References Assert, INIT_MD_FILETAG, RelFileLocatorBackend::locator, RegisterSyncRequest(), RelFileLocatorBackendIsTemp, and SYNC_UNLINK_REQUEST.

Referenced by mdunlinkfork().

◆ StaticAssertDecl()

StaticAssertDecl ( RELSEG_SIZE > 0 && RELSEG_SIZE <= INT_MAX,
"RELSEG_SIZE must fit in an integer" 
)

Variable Documentation

◆ aio_md_readv_cb

const PgAioHandleCallbacks aio_md_readv_cb
Initial value:
= {
.complete_shared = md_readv_complete,
.report = md_readv_report,
}

Definition at line 170 of file md.c.

170 {
171 .complete_shared = md_readv_complete,
172 .report = md_readv_report,
173};

◆ MdCxt

MemoryContext MdCxt
static

Definition at line 98 of file md.c.

Referenced by _fdvec_resize(), and mdinit().