PostgreSQL Source Code git master
Loading...
Searching...
No Matches
md.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * md.c
4 * This code manages relations that reside on magnetic disk.
5 *
6 * Or at least, that was what the Berkeley folk had in mind when they named
7 * this file. In reality, what this code provides is an interface from
8 * the smgr API to Unix-like filesystem APIs, so it will work with any type
9 * of device for which the operating system provides filesystem support.
10 * It doesn't matter whether the bits are on spinning rust or some other
11 * storage technology.
12 *
13 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
14 * Portions Copyright (c) 1994, Regents of the University of California
15 *
16 *
17 * IDENTIFICATION
18 * src/backend/storage/smgr/md.c
19 *
20 *-------------------------------------------------------------------------
21 */
22#include "postgres.h"
23
24#include <limits.h>
25#include <unistd.h>
26#include <fcntl.h>
27#include <sys/file.h>
28
29#include "access/xlogutils.h"
30#include "commands/tablespace.h"
31#include "common/file_utils.h"
32#include "miscadmin.h"
33#include "pg_trace.h"
34#include "pgstat.h"
35#include "storage/aio.h"
36#include "storage/bufmgr.h"
37#include "storage/fd.h"
38#include "storage/md.h"
40#include "storage/smgr.h"
41#include "storage/sync.h"
42#include "utils/memutils.h"
43
44/*
45 * The magnetic disk storage manager keeps track of open file
46 * descriptors in its own descriptor pool. This is done to make it
47 * easier to support relations that are larger than the operating
48 * system's file size limit (often 2GBytes). In order to do that,
49 * we break relations up into "segment" files that are each shorter than
50 * the OS file size limit. The segment size is set by the RELSEG_SIZE
51 * configuration constant in pg_config.h.
52 *
53 * On disk, a relation must consist of consecutively numbered segment
54 * files in the pattern
55 * -- Zero or more full segments of exactly RELSEG_SIZE blocks each
56 * -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
57 * -- Optionally, any number of inactive segments of size 0 blocks.
58 * The full and partial segments are collectively the "active" segments.
59 * Inactive segments are those that once contained data but are currently
60 * not needed because of an mdtruncate() operation. The reason for leaving
61 * them present at size zero, rather than unlinking them, is that other
62 * backends and/or the checkpointer might be holding open file references to
63 * such segments. If the relation expands again after mdtruncate(), such
64 * that a deactivated segment becomes active again, it is important that
65 * such file references still be valid --- else data might get written
66 * out to an unlinked old copy of a segment file that will eventually
67 * disappear.
68 *
69 * RELSEG_SIZE must fit into BlockNumber; but since we expose its value
70 * as an integer GUC, it actually needs to fit in signed int. It's worth
71 * having a cross-check for this since configure's --with-segsize options
72 * could let people select insane values.
73 */
75 "RELSEG_SIZE must fit in an integer");
76
77/*
78 * File descriptors are stored in the per-fork md_seg_fds arrays inside
79 * SMgrRelation. The length of these arrays is stored in md_num_open_segs.
80 * Note that a fork's md_num_open_segs having a specific value does not
81 * necessarily mean the relation doesn't have additional segments; we may
82 * just not have opened the next segment yet. (We could not have "all
83 * segments are in the array" as an invariant anyway, since another backend
84 * could extend the relation while we aren't looking.) We do not have
85 * entries for inactive segments, however; as soon as we find a partial
86 * segment, we assume that any subsequent segments are inactive.
87 *
88 * The entire MdfdVec array is palloc'd in the MdCxt memory context.
89 */
90
91typedef struct _MdfdVec
92{
93 File mdfd_vfd; /* fd number in fd.c's pool */
94 BlockNumber mdfd_segno; /* segment number, from 0 */
96
97static MemoryContext MdCxt; /* context for all MdfdVec objects */
98
99
/*
 * INIT_MD_FILETAG -- fill in a FileTag that identifies one md.c segment file.
 *
 * The whole tag is zeroed first; after that the handler is set to
 * SYNC_HANDLER_MD and the relation locator, fork number and segment number
 * are stored.  The expansion is a single parenthesized comma expression.
 * The tag argument is expanded several times, so it should be an lvalue
 * without side effects.
 */
#define INIT_MD_FILETAG(tag_, rloc_, fork_, seg_) \
( \
	memset(&(tag_), 0, sizeof(FileTag)), \
	(tag_).handler = SYNC_HANDLER_MD, \
	(tag_).rlocator = (rloc_), \
	(tag_).forknum = (fork_), \
	(tag_).segno = (seg_) \
)
109
110
111/*** behavior for mdopen & _mdfd_getseg ***/
112/* ereport if segment not present */
113#define EXTENSION_FAIL (1 << 0)
114/* return NULL if segment not present */
115#define EXTENSION_RETURN_NULL (1 << 1)
116/* create new segments as needed */
117#define EXTENSION_CREATE (1 << 2)
118/* create new segments if needed during recovery */
119#define EXTENSION_CREATE_RECOVERY (1 << 3)
120/* don't try to open a segment, if not already open */
121#define EXTENSION_DONT_OPEN (1 << 5)
122
123
124/*
125 * Fixed-length string to represent paths to files that need to be built by
126 * md.c.
127 *
128 * The maximum number of segments is MaxBlockNumber / RELSEG_SIZE, where
129 * RELSEG_SIZE can be set to 1 (for testing only).
130 */
131#define SEGMENT_CHARS OIDCHARS
132#define MD_PATH_STR_MAXLEN \
133 (\
134 REL_PATH_STR_MAXLEN \
135 + sizeof((char)'.') \
136 + SEGMENT_CHARS \
137 )
138typedef struct MdPathStr
139{
142
143
144/* local routines */
145static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum,
146 bool isRedo);
147static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior);
149 MdfdVec *seg);
150static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum,
151 BlockNumber segno);
152static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum,
153 BlockNumber segno);
155 ForkNumber forknum,
156 int nseg);
158 BlockNumber segno);
160 BlockNumber segno, int oflags);
162 BlockNumber blkno, bool skipFsync, int behavior);
164 MdfdVec *seg);
165
167static void md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel);
168
173
174
175static inline int
177{
178 int flags = O_RDWR | PG_BINARY;
179
181 flags |= PG_O_DIRECT;
182
183 return flags;
184}
185
186/*
187 * mdinit() -- Initialize private state for magnetic disk storage manager.
188 */
189void
196
197/*
198 * mdexists() -- Does the physical file exist?
199 *
200 * Note: this will return true for lingering files, with pending deletions
201 */
202bool
204{
205 /*
206 * Close it first, to ensure that we notice if the fork has been unlinked
207 * since we opened it. As an optimization, we can skip that in recovery,
208 * which already closes relations when dropping them.
209 */
210 if (!InRecovery)
211 mdclose(reln, forknum);
212
213 return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL);
214}
215
216/*
217 * mdcreate() -- Create a new relation on magnetic disk.
218 *
219 * If isRedo is true, it's okay for the relation to exist already.
220 */
221void
223{
224 MdfdVec *mdfd;
225 RelPathStr path;
226 File fd;
227
228 if (isRedo && reln->md_num_open_segs[forknum] > 0)
229 return; /* created and opened already... */
230
231 Assert(reln->md_num_open_segs[forknum] == 0);
232
233 /*
234 * We may be using the target table space for the first time in this
235 * database, so create a per-database subdirectory if needed.
236 *
237 * XXX this is a fairly ugly violation of module layering, but this seems
238 * to be the best place to put the check. Maybe TablespaceCreateDbspace
239 * should be here and not in commands/tablespace.c? But that would imply
240 * importing a lot of stuff that smgr.c oughtn't know, either.
241 */
242 TablespaceCreateDbspace(reln->smgr_rlocator.locator.spcOid,
243 reln->smgr_rlocator.locator.dbOid,
244 isRedo);
245
246 path = relpath(reln->smgr_rlocator, forknum);
247
249
250 if (fd < 0)
251 {
252 int save_errno = errno;
253
254 if (isRedo)
256 if (fd < 0)
257 {
258 /* be sure to report the error reported by create, not open */
262 errmsg("could not create file \"%s\": %m", path.str)));
263 }
264 }
265
266 _fdvec_resize(reln, forknum, 1);
267 mdfd = &reln->md_seg_fds[forknum][0];
268 mdfd->mdfd_vfd = fd;
269 mdfd->mdfd_segno = 0;
270
271 if (!SmgrIsTemp(reln))
273}
274
275/*
276 * mdunlink() -- Unlink a relation.
277 *
278 * Note that we're passed a RelFileLocatorBackend --- by the time this is called,
279 * there won't be an SMgrRelation hashtable entry anymore.
280 *
281 * forknum can be a fork number to delete a specific fork, or InvalidForkNumber
282 * to delete all forks.
283 *
284 * For regular relations, we don't unlink the first segment file of the rel,
285 * but just truncate it to zero length, and record a request to unlink it after
286 * the next checkpoint. Additional segments can be unlinked immediately,
287 * however. Leaving the empty file in place prevents that relfilenumber
288 * from being reused. The scenario this protects us from is:
289 * 1. We delete a relation (and commit, and actually remove its file).
290 * 2. We create a new relation, which by chance gets the same relfilenumber as
291 * the just-deleted one (OIDs must've wrapped around for that to happen).
292 * 3. We crash before another checkpoint occurs.
293 * During replay, we would delete the file and then recreate it, which is fine
294 * if the contents of the file were repopulated by subsequent WAL entries.
295 * But if we didn't WAL-log insertions, but instead relied on fsyncing the
296 * file after populating it (as we do at wal_level=minimal), the contents of
297 * the file would be lost forever. By leaving the empty file until after the
298 * next checkpoint, we prevent reassignment of the relfilenumber until it's
299 * safe, because relfilenumber assignment skips over any existing file.
300 *
301 * Additional segments, if any, are truncated and then unlinked. The reason
302 * for truncating is that other backends may still hold open FDs for these at
303 * the smgr level, so that the kernel can't remove the file yet. We want to
304 * reclaim the disk space right away despite that.
305 *
306 * We do not need to go through this dance for temp relations, though, because
307 * we never make WAL entries for temp rels, and so a temp rel poses no threat
308 * to the health of a regular rel that has taken over its relfilenumber.
309 * The fact that temp rels and regular rels have different file naming
310 * patterns provides additional safety. Other backends shouldn't have open
311 * FDs for them, either.
312 *
313 * We also don't do it while performing a binary upgrade. There is no reuse
314 * hazard in that case, since after a crash or even a simple ERROR, the
315 * upgrade fails and the whole cluster must be recreated from scratch.
316 * Furthermore, it is important to remove the files from disk immediately,
317 * because we may be about to reuse the same relfilenumber.
318 *
319 * All the above applies only to the relation's main fork; other forks can
320 * just be removed immediately, since they are not needed to prevent the
321 * relfilenumber from being recycled. Also, we do not carefully
322 * track whether other forks have been created or not, but just attempt to
323 * unlink them unconditionally; so we should never complain about ENOENT.
324 *
325 * If isRedo is true, it's unsurprising for the relation to be already gone.
326 * Also, we should remove the file immediately instead of queuing a request
327 * for later, since during redo there's no possibility of creating a
328 * conflicting relation.
329 *
330 * Note: we currently just never warn about ENOENT at all. We could warn in
331 * the main-fork, non-isRedo case, but it doesn't seem worth the trouble.
332 *
333 * Note: any failure should be reported as WARNING not ERROR, because
334 * we are usually not in a transaction anymore when this is called.
335 */
336void
338{
339 /* Now do the per-fork work */
340 if (forknum == InvalidForkNumber)
341 {
342 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
343 mdunlinkfork(rlocator, forknum, isRedo);
344 }
345 else
346 mdunlinkfork(rlocator, forknum, isRedo);
347}
348
349/*
350 * Truncate a file to release disk space.
351 */
352static int
353do_truncate(const char *path)
354{
355 int save_errno;
356 int ret;
357
358 ret = pg_truncate(path, 0);
359
360 /* Log a warning here to avoid repetition in callers. */
361 if (ret < 0 && errno != ENOENT)
362 {
366 errmsg("could not truncate file \"%s\": %m", path)));
368 }
369
370 return ret;
371}
372
373static void
375{
376 RelPathStr path;
377 int ret;
378 int save_errno;
379
380 path = relpath(rlocator, forknum);
381
382 /*
383 * Truncate and then unlink the first segment, or just register a request
384 * to unlink it later, as described in the comments for mdunlink().
385 */
386 if (isRedo || IsBinaryUpgrade || forknum != MAIN_FORKNUM ||
388 {
389 if (!RelFileLocatorBackendIsTemp(rlocator))
390 {
391 /* Prevent other backends' fds from holding on to the disk space */
392 ret = do_truncate(path.str);
393
394 /* Forget any pending sync requests for the first segment */
396 register_forget_request(rlocator, forknum, 0 /* first seg */ );
398 }
399 else
400 ret = 0;
401
402 /* Next unlink the file, unless it was already found to be missing */
403 if (ret >= 0 || errno != ENOENT)
404 {
405 ret = unlink(path.str);
406 if (ret < 0 && errno != ENOENT)
407 {
411 errmsg("could not remove file \"%s\": %m", path.str)));
413 }
414 }
415 }
416 else
417 {
418 /* Prevent other backends' fds from holding on to the disk space */
419 ret = do_truncate(path.str);
420
421 /* Register request to unlink first segment later */
423 register_unlink_segment(rlocator, forknum, 0 /* first seg */ );
425 }
426
427 /*
428 * Delete any additional segments.
429 *
430 * Note that because we loop until getting ENOENT, we will correctly
431 * remove all inactive segments as well as active ones. Ideally we'd
432 * continue the loop until getting exactly that errno, but that risks an
433 * infinite loop if the problem is directory-wide (for instance, if we
434 * suddenly can't read the data directory itself). We compromise by
435 * continuing after a non-ENOENT truncate error, but stopping after any
436 * unlink error. If there is indeed a directory-wide problem, additional
437 * unlink attempts wouldn't work anyway.
438 */
439 if (ret >= 0 || errno != ENOENT)
440 {
442 BlockNumber segno;
443
444 for (segno = 1;; segno++)
445 {
446 sprintf(segpath.str, "%s.%u", path.str, segno);
447
448 if (!RelFileLocatorBackendIsTemp(rlocator))
449 {
450 /*
451 * Prevent other backends' fds from holding on to the disk
452 * space. We're done if we see ENOENT, though.
453 */
454 if (do_truncate(segpath.str) < 0 && errno == ENOENT)
455 break;
456
457 /*
458 * Forget any pending sync requests for this segment before we
459 * try to unlink.
460 */
461 register_forget_request(rlocator, forknum, segno);
462 }
463
464 if (unlink(segpath.str) < 0)
465 {
466 /* ENOENT is expected after the last segment... */
467 if (errno != ENOENT)
470 errmsg("could not remove file \"%s\": %m", segpath.str)));
471 break;
472 }
473 }
474 }
475}
476
477/*
478 * mdextend() -- Add a block to the specified relation.
479 *
480 * The semantics are nearly the same as mdwrite(): write at the
481 * specified position. However, this is to be used for the case of
482 * extending a relation (i.e., blocknum is at or beyond the current
483 * EOF). Note that we assume writing a block beyond current EOF
484 * causes intervening file space to become filled with zeroes.
485 */
486void
488 const void *buffer, bool skipFsync)
489{
490 pgoff_t seekpos;
491 int nbytes;
492 MdfdVec *v;
493
494 /* If this build supports direct I/O, the buffer must be I/O aligned. */
495 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
496 Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
497
498 /* This assert is too expensive to have on normally ... */
499#ifdef CHECK_WRITE_VS_EXTEND
500 Assert(blocknum >= mdnblocks(reln, forknum));
501#endif
502
503 /*
504 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
505 * more --- we mustn't create a block whose number actually is
506 * InvalidBlockNumber. (Note that this failure should be unreachable
507 * because of upstream checks in bufmgr.c.)
508 */
509 if (blocknum == InvalidBlockNumber)
512 errmsg("cannot extend file \"%s\" beyond %u blocks",
513 relpath(reln->smgr_rlocator, forknum).str,
515
516 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
517
518 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
519
520 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
521
522 if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
523 {
524 if (nbytes < 0)
527 errmsg("could not extend file \"%s\": %m",
529 errhint("Check free disk space.")));
530 /* short write: complain appropriately */
533 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
535 nbytes, BLCKSZ, blocknum),
536 errhint("Check free disk space.")));
537 }
538
539 if (!skipFsync && !SmgrIsTemp(reln))
540 register_dirty_segment(reln, forknum, v);
541
542 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
543}
544
545/*
546 * mdzeroextend() -- Add new zeroed out blocks to the specified relation.
547 *
548 * Similar to mdextend(), except the relation can be extended by multiple
549 * blocks at once and the added blocks will be filled with zeroes.
550 */
551void
553 BlockNumber blocknum, int nblocks, bool skipFsync)
554{
555 MdfdVec *v;
556 BlockNumber curblocknum = blocknum;
557 int remblocks = nblocks;
558
559 Assert(nblocks > 0);
560
561 /* This assert is too expensive to have on normally ... */
562#ifdef CHECK_WRITE_VS_EXTEND
563 Assert(blocknum >= mdnblocks(reln, forknum));
564#endif
565
566 /*
567 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
568 * more --- we mustn't create a block whose number actually is
569 * InvalidBlockNumber or larger.
570 */
571 if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
574 errmsg("cannot extend file \"%s\" beyond %u blocks",
575 relpath(reln->smgr_rlocator, forknum).str,
577
578 while (remblocks > 0)
579 {
581 pgoff_t seekpos = (pgoff_t) BLCKSZ * segstartblock;
582 int numblocks;
583
586 else
588
590
593
594 /*
595 * If available and useful, use posix_fallocate() (via
596 * FileFallocate()) to extend the relation. That's often more
597 * efficient than using write(), as it commonly won't cause the kernel
598 * to allocate page cache space for the extended pages.
599 *
600 * However, we don't use FileFallocate() for small extensions, as it
601 * defeats delayed allocation on some filesystems. Not clear where
602 * that decision should be made though? For now just use a cutoff of
603 * 8, anything between 4 and 8 worked OK in some local testing.
604 */
605 if (numblocks > 8 &&
607 {
608 int ret = 0;
609
610#ifdef HAVE_POSIX_FALLOCATE
612 {
613 ret = FileFallocate(v->mdfd_vfd,
614 seekpos, (pgoff_t) BLCKSZ * numblocks,
616 }
617 else
618#endif
619 {
620 elog(ERROR, "unsupported file_extend_method: %d",
622 }
623 if (ret != 0)
624 {
627 errmsg("could not extend file \"%s\" with FileFallocate(): %m",
629 errhint("Check free disk space."));
630 }
631 }
632 else
633 {
634 int ret;
635
636 /*
637 * Even if we don't want to use fallocate, we can still extend a
638 * bit more efficiently than writing each 8kB block individually.
639 * pg_pwrite_zeros() (via FileZero()) uses pg_pwritev_with_retry()
640 * to avoid multiple writes or needing a zeroed buffer for the
641 * whole length of the extension.
642 */
643 ret = FileZero(v->mdfd_vfd,
644 seekpos, (pgoff_t) BLCKSZ * numblocks,
646 if (ret < 0)
649 errmsg("could not extend file \"%s\": %m",
651 errhint("Check free disk space."));
652 }
653
654 if (!skipFsync && !SmgrIsTemp(reln))
655 register_dirty_segment(reln, forknum, v);
656
657 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
658
661 }
662}
663
664/*
665 * mdopenfork() -- Open one fork of the specified relation.
666 *
667 * Note we only open the first segment, when there are multiple segments.
668 *
669 * If first segment is not present, either ereport or return NULL according
670 * to "behavior". We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
671 * EXTENSION_CREATE means it's OK to extend an existing relation, not to
672 * invent one out of whole cloth.
673 */
674static MdfdVec *
675mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
676{
677 MdfdVec *mdfd;
678 RelPathStr path;
679 File fd;
680
681 /* No work if already open */
682 if (reln->md_num_open_segs[forknum] > 0)
683 return &reln->md_seg_fds[forknum][0];
684
685 path = relpath(reln->smgr_rlocator, forknum);
686
688
689 if (fd < 0)
690 {
691 if ((behavior & EXTENSION_RETURN_NULL) &&
693 return NULL;
696 errmsg("could not open file \"%s\": %m", path.str)));
697 }
698
699 _fdvec_resize(reln, forknum, 1);
700 mdfd = &reln->md_seg_fds[forknum][0];
701 mdfd->mdfd_vfd = fd;
702 mdfd->mdfd_segno = 0;
703
705
706 return mdfd;
707}
708
709/*
710 * mdopen() -- Initialize newly-opened relation.
711 */
712void
714{
715 /* mark it not open */
716 for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
717 reln->md_num_open_segs[forknum] = 0;
718}
719
720/*
721 * mdclose() -- Close the specified relation, if it isn't closed already.
722 */
723void
725{
726 int nopensegs = reln->md_num_open_segs[forknum];
727
728 /* No work if already closed */
729 if (nopensegs == 0)
730 return;
731
732 /* close segments starting from the end */
733 while (nopensegs > 0)
734 {
735 MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1];
736
738 _fdvec_resize(reln, forknum, nopensegs - 1);
739 nopensegs--;
740 }
741}
742
743/*
744 * mdprefetch() -- Initiate asynchronous read of the specified blocks of a relation
745 */
746bool
748 int nblocks)
749{
750#ifdef USE_PREFETCH
751
753
754 if ((uint64) blocknum + nblocks > (uint64) MaxBlockNumber + 1)
755 return false;
756
757 while (nblocks > 0)
758 {
759 pgoff_t seekpos;
760 MdfdVec *v;
762
763 v = _mdfd_getseg(reln, forknum, blocknum, false,
765 if (v == NULL)
766 return false;
767
768 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
769
770 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
771
773 Min(nblocks,
774 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
775
778
779 blocknum += nblocks_this_segment;
780 nblocks -= nblocks_this_segment;
781 }
782#endif /* USE_PREFETCH */
783
784 return true;
785}
786
787/*
788 * Convert an array of buffer address into an array of iovec objects, and
789 * return the number that were required. 'iov' must have enough space for up
790 * to 'nblocks' elements, but the number used may be less depending on
791 * merging. In the case of a run of fully contiguous buffers, a single iovec
792 * will be populated that can be handled as a plain non-vectored I/O.
793 */
794static int
795buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks)
796{
797 struct iovec *iovp;
798 int iovcnt;
799
800 Assert(nblocks >= 1);
801
802 /* If this build supports direct I/O, buffers must be I/O aligned. */
803 for (int i = 0; i < nblocks; ++i)
804 {
805 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
806 Assert((uintptr_t) buffers[i] ==
807 TYPEALIGN(PG_IO_ALIGN_SIZE, buffers[i]));
808 }
809
810 /* Start the first iovec off with the first buffer. */
811 iovp = &iov[0];
812 iovp->iov_base = buffers[0];
813 iovp->iov_len = BLCKSZ;
814 iovcnt = 1;
815
816 /* Try to merge the rest. */
817 for (int i = 1; i < nblocks; ++i)
818 {
819 void *buffer = buffers[i];
820
821 if (((char *) iovp->iov_base + iovp->iov_len) == buffer)
822 {
823 /* Contiguous with the last iovec. */
824 iovp->iov_len += BLCKSZ;
825 }
826 else
827 {
828 /* Need a new iovec. */
829 iovp++;
830 iovp->iov_base = buffer;
831 iovp->iov_len = BLCKSZ;
832 iovcnt++;
833 }
834 }
835
836 return iovcnt;
837}
838
839/*
840 * mdmaxcombine() -- Return the maximum number of total blocks that can be
841 * combined with an IO starting at blocknum.
842 */
843uint32
845 BlockNumber blocknum)
846{
847 BlockNumber segoff;
848
849 segoff = blocknum % ((BlockNumber) RELSEG_SIZE);
850
851 return RELSEG_SIZE - segoff;
852}
853
854/*
855 * mdreadv() -- Read the specified blocks from a relation.
856 */
857void
859 void **buffers, BlockNumber nblocks)
860{
861 while (nblocks > 0)
862 {
863 struct iovec iov[PG_IOV_MAX];
864 int iovcnt;
865 pgoff_t seekpos;
866 int nbytes;
867 MdfdVec *v;
870 size_t size_this_segment;
871
872 v = _mdfd_getseg(reln, forknum, blocknum, false,
874
875 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
876
877 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
878
880 Min(nblocks,
881 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
883
884 if (nblocks_this_segment != nblocks)
885 elog(ERROR, "read crosses segment boundary");
886
890
891 /*
892 * Inner loop to continue after a short read. We'll keep going until
893 * we hit EOF rather than assuming that a short read means we hit the
894 * end.
895 */
896 for (;;)
897 {
898 TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
899 reln->smgr_rlocator.locator.spcOid,
900 reln->smgr_rlocator.locator.dbOid,
901 reln->smgr_rlocator.locator.relNumber,
902 reln->smgr_rlocator.backend);
903 nbytes = FileReadV(v->mdfd_vfd, iov, iovcnt, seekpos,
905 TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
906 reln->smgr_rlocator.locator.spcOid,
907 reln->smgr_rlocator.locator.dbOid,
908 reln->smgr_rlocator.locator.relNumber,
909 reln->smgr_rlocator.backend,
910 nbytes,
912
913#ifdef SIMULATE_SHORT_READ
914 nbytes = Min(nbytes, 4096);
915#endif
916
917 if (nbytes < 0)
920 errmsg("could not read blocks %u..%u in file \"%s\": %m",
921 blocknum,
922 blocknum + nblocks_this_segment - 1,
923 FilePathName(v->mdfd_vfd))));
924
925 if (nbytes == 0)
926 {
927 /*
928 * We are at or past EOF, or we read a partial block at EOF.
929 * Normally this is an error; upper levels should never try to
930 * read a nonexistent block. However, if zero_damaged_pages
931 * is ON or we are InRecovery, we should instead return zeroes
932 * without complaining. This allows, for example, the case of
933 * trying to update a block that was later truncated away.
934 *
935 * NB: We think that this codepath is unreachable in recovery
936 * and incomplete with zero_damaged_pages, as missing segments
937 * are not created. Putting blocks into the buffer-pool that
938 * do not exist on disk is rather problematic, as it will not
939 * be found by scans that rely on smgrnblocks(), as they are
940 * beyond EOF. It also can cause weird problems with relation
941 * extension, as relation extension does not expect blocks
942 * beyond EOF to exist.
943 *
944 * Therefore we do not want to copy the logic into
945 * mdstartreadv(), where it would have to be more complicated
946 * due to potential differences in the zero_damaged_pages
947 * setting between the definer and completor of IO.
948 *
949 * For PG 18, we are putting an Assert(false) in mdreadv()
950 * (triggering failures in assertion-enabled builds, but
951 * continuing to work in production builds). Afterwards we
952 * plan to remove this code entirely.
953 */
955 {
956 Assert(false); /* see comment above */
957
960 ++i)
961 memset(buffers[i], 0, BLCKSZ);
962 break;
963 }
964 else
967 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
968 blocknum,
969 blocknum + nblocks_this_segment - 1,
973 }
974
975 /* One loop should usually be enough. */
976 transferred_this_segment += nbytes;
979 break;
980
981 /* Adjust position and vectors after a short read. */
982 seekpos += nbytes;
984 }
985
986 nblocks -= nblocks_this_segment;
987 buffers += nblocks_this_segment;
988 blocknum += nblocks_this_segment;
989 }
990}
991
992/*
993 * mdstartreadv() -- Asynchronous version of mdreadv().
994 */
995void
997 SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
998 void **buffers, BlockNumber nblocks)
999{
1000 pgoff_t seekpos;
1001 MdfdVec *v;
1003 struct iovec *iov;
1004 int iovcnt;
1005 int ret;
1006
1007 v = _mdfd_getseg(reln, forknum, blocknum, false,
1009
1010 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1011
1012 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1013
1015 Min(nblocks,
1016 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
1017
1018 if (nblocks_this_segment != nblocks)
1019 elog(ERROR, "read crossing segment boundary");
1020
1022
1023 Assert(nblocks <= iovcnt);
1024
1026
1028
1031
1033 reln,
1034 forknum,
1035 blocknum,
1036 nblocks,
1037 false);
1039
1041 if (ret != 0)
1042 ereport(ERROR,
1044 errmsg("could not start reading blocks %u..%u in file \"%s\": %m",
1045 blocknum,
1046 blocknum + nblocks_this_segment - 1,
1047 FilePathName(v->mdfd_vfd))));
1048
1049 /*
1050 * The error checks corresponding to the post-read checks in mdreadv() are
1051 * in md_readv_complete().
1052 *
1053 * However we chose, at least for now, to not implement the
1054 * zero_damaged_pages logic present in mdreadv(). As outlined in mdreadv()
1055 * that logic is rather problematic, and we want to get rid of it. Here
1056 * equivalent logic would have to be more complicated due to potential
1057 * differences in the zero_damaged_pages setting between the definer and
1058 * completor of IO.
1059 */
1060}
1061
1062/*
1063 * mdwritev() -- Write the supplied blocks at the appropriate location.
1064 *
1065 * This is to be used only for updating already-existing blocks of a
1066 * relation (ie, those before the current EOF). To extend a relation,
1067 * use mdextend().
1068 */
1069void
1071 const void **buffers, BlockNumber nblocks, bool skipFsync)
1072{
1073 /* This assert is too expensive to have on normally ... */
1074#ifdef CHECK_WRITE_VS_EXTEND
1075 Assert((uint64) blocknum + (uint64) nblocks <= (uint64) mdnblocks(reln, forknum));
1076#endif
1077
1078 while (nblocks > 0)
1079 {
1080 struct iovec iov[PG_IOV_MAX];
1081 int iovcnt;
1082 pgoff_t seekpos;
1083 int nbytes;
1084 MdfdVec *v;
1087 size_t size_this_segment;
1088
1089 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
1091
1092 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1093
1094 Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1095
1097 Min(nblocks,
1098 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
1100
1101 if (nblocks_this_segment != nblocks)
1102 elog(ERROR, "write crosses segment boundary");
1103
1104 iovcnt = buffers_to_iovec(iov, (void **) buffers, nblocks_this_segment);
1107
1108 /*
1109 * Inner loop to continue after a short write. If the reason is that
1110 * we're out of disk space, a future attempt should get an ENOSPC
1111 * error from the kernel.
1112 */
1113 for (;;)
1114 {
1115 TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
1116 reln->smgr_rlocator.locator.spcOid,
1117 reln->smgr_rlocator.locator.dbOid,
1118 reln->smgr_rlocator.locator.relNumber,
1119 reln->smgr_rlocator.backend);
1120 nbytes = FileWriteV(v->mdfd_vfd, iov, iovcnt, seekpos,
1122 TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
1123 reln->smgr_rlocator.locator.spcOid,
1124 reln->smgr_rlocator.locator.dbOid,
1125 reln->smgr_rlocator.locator.relNumber,
1126 reln->smgr_rlocator.backend,
1127 nbytes,
1129
1130#ifdef SIMULATE_SHORT_WRITE
1131 nbytes = Min(nbytes, 4096);
1132#endif
1133
1134 if (nbytes < 0)
1135 {
1136 bool enospc = errno == ENOSPC;
1137
1138 ereport(ERROR,
1140 errmsg("could not write blocks %u..%u in file \"%s\": %m",
1141 blocknum,
1142 blocknum + nblocks_this_segment - 1,
1144 enospc ? errhint("Check free disk space.") : 0));
1145 }
1146
1147 /* One loop should usually be enough. */
1148 transferred_this_segment += nbytes;
1151 break;
1152
1153 /* Adjust position and iovecs after a short write. */
1154 seekpos += nbytes;
1156 }
1157
1158 if (!skipFsync && !SmgrIsTemp(reln))
1159 register_dirty_segment(reln, forknum, v);
1160
1161 nblocks -= nblocks_this_segment;
1162 buffers += nblocks_this_segment;
1163 blocknum += nblocks_this_segment;
1164 }
1165}
1166
1167
1168/*
1169 * mdwriteback() -- Tell the kernel to write pages back to storage.
1170 *
1171 * This accepts a range of blocks because flushing several pages at once is
1172 * considerably more efficient than doing so individually.
1173 */
1174void
1176 BlockNumber blocknum, BlockNumber nblocks)
1177{
1179
1180 /*
1181 * Issue flush requests in as few requests as possible; have to split at
1182 * segment boundaries though, since those are actually separate files.
1183 */
1184 while (nblocks > 0)
1185 {
1186 BlockNumber nflush = nblocks;
1187 pgoff_t seekpos;
1188 MdfdVec *v;
1189 int segnum_start,
1190 segnum_end;
1191
1192 v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
1194
1195 /*
1196 * We might be flushing buffers of already removed relations, that's
1197 * ok, just ignore that case. If the segment file wasn't open already
1198 * (ie from a recent mdwrite()), then we don't want to re-open it, to
1199 * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave
1200 * us with a descriptor to a file that is about to be unlinked.
1201 */
1202 if (!v)
1203 return;
1204
1205 /* compute offset inside the current segment */
1206 segnum_start = blocknum / RELSEG_SIZE;
1207
1208 /* compute number of desired writes within the current segment */
1209 segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
1210 if (segnum_start != segnum_end)
1211 nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
1212
1213 Assert(nflush >= 1);
1214 Assert(nflush <= nblocks);
1215
1216 seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1217
1219
1220 nblocks -= nflush;
1221 blocknum += nflush;
1222 }
1223}
1224
1225/*
1226 * mdnblocks() -- Get the number of blocks stored in a relation.
1227 *
1228 * Important side effect: all active segments of the relation are opened
1229 * and added to the md_seg_fds array. If this routine has not been
1230 * called, then only segments up to the last one actually touched
1231 * are present in the array.
1232 */
1235{
1236 MdfdVec *v;
1237 BlockNumber nblocks;
1238 BlockNumber segno;
1239
1240 mdopenfork(reln, forknum, EXTENSION_FAIL);
1241
1242 /* mdopen has opened the first segment */
1243 Assert(reln->md_num_open_segs[forknum] > 0);
1244
1245 /*
1246 * Start from the last open segments, to avoid redundant seeks. We have
1247 * previously verified that these segments are exactly RELSEG_SIZE long,
1248 * and it's useless to recheck that each time.
1249 *
1250 * NOTE: this assumption could only be wrong if another backend has
1251 * truncated the relation. We rely on higher code levels to handle that
1252 * scenario by closing and re-opening the md fd, which is handled via
1253 * relcache flush. (Since the checkpointer doesn't participate in
1254 * relcache flush, it could have segment entries for inactive segments;
1255 * that's OK because the checkpointer never needs to compute relation
1256 * size.)
1257 */
1258 segno = reln->md_num_open_segs[forknum] - 1;
1259 v = &reln->md_seg_fds[forknum][segno];
1260
1261 for (;;)
1262 {
1263 nblocks = _mdnblocks(reln, forknum, v);
1264 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1265 elog(FATAL, "segment too big");
1266 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1267 return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
1268
1269 /*
1270 * If segment is exactly RELSEG_SIZE, advance to next one.
1271 */
1272 segno++;
1273
1274 /*
1275 * We used to pass O_CREAT here, but that has the disadvantage that it
1276 * might create a segment which has vanished through some operating
1277 * system misadventure. In such a case, creating the segment here
1278 * undermines _mdfd_getseg's attempts to notice and report an error
1279 * upon access to a missing segment.
1280 */
1281 v = _mdfd_openseg(reln, forknum, segno, 0);
1282 if (v == NULL)
1283 return segno * ((BlockNumber) RELSEG_SIZE);
1284 }
1285}
1286
1287/*
1288 * mdtruncate() -- Truncate relation to specified number of blocks.
1289 *
1290 * Guaranteed not to allocate memory, so it can be used in a critical section.
1291 * Caller must have called smgrnblocks() to obtain curnblk while holding a
1292 * sufficient lock to prevent a change in relation size, and not used any smgr
1293 * functions for this relation or handled interrupts in between. This makes
1294 * sure we have opened all active segments, so that truncate loop will get
1295 * them all!
1296 *
1297 * If nblocks > curnblk, the request is ignored when we are InRecovery,
1298 * otherwise, an error is raised.
1299 */
1300void
1303{
1305 int curopensegs;
1306
1307 if (nblocks > curnblk)
1308 {
1309 /* Bogus request ... but no complaint if InRecovery */
1310 if (InRecovery)
1311 return;
1312 ereport(ERROR,
1313 (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
1314 relpath(reln->smgr_rlocator, forknum).str,
1315 nblocks, curnblk)));
1316 }
1317 if (nblocks == curnblk)
1318 return; /* no work */
1319
1320 /*
1321 * Truncate segments, starting at the last one. Starting at the end makes
1322 * managing the memory for the fd array easier, should there be errors.
1323 */
1324 curopensegs = reln->md_num_open_segs[forknum];
1325 while (curopensegs > 0)
1326 {
1327 MdfdVec *v;
1328
1330
1331 v = &reln->md_seg_fds[forknum][curopensegs - 1];
1332
1333 if (priorblocks > nblocks)
1334 {
1335 /*
1336 * This segment is no longer active. We truncate the file, but do
1337 * not delete it, for reasons explained in the header comments.
1338 */
1340 ereport(ERROR,
1342 errmsg("could not truncate file \"%s\": %m",
1343 FilePathName(v->mdfd_vfd))));
1344
1345 if (!SmgrIsTemp(reln))
1346 register_dirty_segment(reln, forknum, v);
1347
1348 /* we never drop the 1st segment */
1349 Assert(v != &reln->md_seg_fds[forknum][0]);
1350
1351 FileClose(v->mdfd_vfd);
1352 _fdvec_resize(reln, forknum, curopensegs - 1);
1353 }
1354 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
1355 {
1356 /*
1357 * This is the last segment we want to keep. Truncate the file to
1358 * the right length. NOTE: if nblocks is exactly a multiple K of
1359 * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
1360 * keep it. This adheres to the invariant given in the header
1361 * comments.
1362 */
1364
1366 ereport(ERROR,
1368 errmsg("could not truncate file \"%s\" to %u blocks: %m",
1370 nblocks)));
1371 if (!SmgrIsTemp(reln))
1372 register_dirty_segment(reln, forknum, v);
1373 }
1374 else
1375 {
1376 /*
1377 * We still need this segment, so nothing to do for this and any
1378 * earlier segment.
1379 */
1380 break;
1381 }
1382 curopensegs--;
1383 }
1384}
1385
1386/*
1387 * mdregistersync() -- Mark whole relation as needing fsync
1388 */
1389void
1391{
1392 int segno;
1393 int min_inactive_seg;
1394
1395 /*
1396 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1397 * the loop below will get them all!
1398 */
1399 mdnblocks(reln, forknum);
1400
1401 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1402
1403 /*
1404 * Temporarily open inactive segments, then close them after sync. There
1405 * may be some inactive segments left opened after error, but that is
1406 * harmless. We don't bother to clean them up and take a risk of further
1407 * trouble. The next mdclose() will soon close them.
1408 */
1409 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1410 segno++;
1411
1412 while (segno > 0)
1413 {
1414 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1415
1416 register_dirty_segment(reln, forknum, v);
1417
1418 /* Close inactive segments immediately */
1419 if (segno > min_inactive_seg)
1420 {
1421 FileClose(v->mdfd_vfd);
1422 _fdvec_resize(reln, forknum, segno - 1);
1423 }
1424
1425 segno--;
1426 }
1427}
1428
1429/*
1430 * mdimmedsync() -- Immediately sync a relation to stable storage.
1431 *
1432 * Note that only writes already issued are synced; this routine knows
1433 * nothing of dirty buffers that may exist inside the buffer manager. We
1434 * sync active and inactive segments; smgrDoPendingSyncs() relies on this.
1435 * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of
1436 * some segment, then mdtruncate() renders that segment inactive. If we
1437 * crash before the next checkpoint syncs the newly-inactive segment, that
1438 * segment may survive recovery, reintroducing unwanted data into the table.
1439 */
1440void
1442{
1443 int segno;
1444 int min_inactive_seg;
1445
1446 /*
1447 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1448 * the loop below will get them all!
1449 */
1450 mdnblocks(reln, forknum);
1451
1452 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1453
1454 /*
1455 * Temporarily open inactive segments, then close them after sync. There
1456 * may be some inactive segments left opened after fsync() error, but that
1457 * is harmless. We don't bother to clean them up and take a risk of
1458 * further trouble. The next mdclose() will soon close them.
1459 */
1460 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1461 segno++;
1462
1463 while (segno > 0)
1464 {
1465 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1466
1467 /*
1468 * fsyncs done through mdimmedsync() should be tracked in a separate
1469 * IOContext than those done through mdsyncfiletag() to differentiate
1470 * between unavoidable client backend fsyncs (e.g. those done during
1471 * index build) and those which ideally would have been done by the
1472 * checkpointer. Since other IO operations bypassing the buffer
1473 * manager could also be tracked in such an IOContext, wait until
1474 * these are also tracked to track immediate fsyncs.
1475 */
1479 errmsg("could not fsync file \"%s\": %m",
1480 FilePathName(v->mdfd_vfd))));
1481
1482 /* Close inactive segments immediately */
1483 if (segno > min_inactive_seg)
1484 {
1485 FileClose(v->mdfd_vfd);
1486 _fdvec_resize(reln, forknum, segno - 1);
1487 }
1488
1489 segno--;
1490 }
1491}
1492
1493int
1495{
1496 MdfdVec *v = mdopenfork(reln, forknum, EXTENSION_FAIL);
1497
1498 v = _mdfd_getseg(reln, forknum, blocknum, false,
1500
1501 *off = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1502
1503 Assert(*off < (pgoff_t) BLCKSZ * RELSEG_SIZE);
1504
1505 return FileGetRawDesc(v->mdfd_vfd);
1506}
1507
1508/*
1509 * register_dirty_segment() -- Mark a relation segment as needing fsync
1510 *
1511 * If there is a local pending-ops table, just make an entry in it for
1512 * ProcessSyncRequests to process later. Otherwise, try to pass off the
1513 * fsync request to the checkpointer process. If that fails, just do the
1514 * fsync locally before returning (we hope this will not happen often
1515 * enough to be a performance problem).
1516 */
1517static void
1519{
1520 FileTag tag;
1521
1522 INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno);
1523
1524 /* Temp relations should never be fsync'd */
1526
1527 if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
1528 {
1530
1532 (errmsg_internal("could not forward fsync request because request queue is full")));
1533
1535
1539 errmsg("could not fsync file \"%s\": %m",
1540 FilePathName(seg->mdfd_vfd))));
1541
1542 /*
1543 * We have no way of knowing if the current IOContext is
1544 * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
1545 * point, so count the fsync as being in the IOCONTEXT_NORMAL
1546 * IOContext. This is probably okay, because the number of backend
1547 * fsyncs doesn't say anything about the efficacy of the
1548 * BufferAccessStrategy. And counting both fsyncs done in
1549 * IOCONTEXT_NORMAL and IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] under
1550 * IOCONTEXT_NORMAL is likely clearer when investigating the number of
1551 * backend fsyncs.
1552 */
1554 IOOP_FSYNC, io_start, 1, 0);
1555 }
1556}
1557
1558/*
1559 * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint
1560 */
1561static void
1563 BlockNumber segno)
1564{
1565 FileTag tag;
1566
1567 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1568
1569 /* Should never be used with temp relations */
1571
1572 RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
1573}
1574
1575/*
1576 * register_forget_request() -- forget any fsyncs for a relation fork's segment
1577 */
1578static void
1580 BlockNumber segno)
1581{
1582 FileTag tag;
1583
1584 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1585
1586 RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
1587}
1588
1589/*
1590 * ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB
1591 */
1592void
1594{
1595 FileTag tag;
1596 RelFileLocator rlocator;
1597
1598 rlocator.dbOid = dbid;
1599 rlocator.spcOid = 0;
1600 rlocator.relNumber = 0;
1601
1603
1604 RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
1605}
1606
1607/*
1608 * DropRelationFiles -- drop files of all given relations
1609 */
1610void
1612{
1614 int i;
1615
1617 for (i = 0; i < ndelrels; i++)
1618 {
1620
1621 if (isRedo)
1622 {
1624
1625 for (fork = 0; fork <= MAX_FORKNUM; fork++)
1627 }
1628 srels[i] = srel;
1629 }
1630
1632
1633 for (i = 0; i < ndelrels; i++)
1634 smgrclose(srels[i]);
1635 pfree(srels);
1636}
1637
1638
1639/*
1640 * _fdvec_resize() -- Resize the fork's open segments array
1641 */
1642static void
1644 ForkNumber forknum,
1645 int nseg)
1646{
1647 if (nseg == 0)
1648 {
1649 if (reln->md_num_open_segs[forknum] > 0)
1650 {
1651 pfree(reln->md_seg_fds[forknum]);
1652 reln->md_seg_fds[forknum] = NULL;
1653 }
1654 }
1655 else if (reln->md_num_open_segs[forknum] == 0)
1656 {
1657 reln->md_seg_fds[forknum] =
1659 }
1660 else if (nseg > reln->md_num_open_segs[forknum])
1661 {
1662 /*
1663 * It doesn't seem worthwhile complicating the code to amortize
1664 * repalloc() calls. Those are far faster than PathNameOpenFile() or
1665 * FileClose(), and the memory context internally will sometimes avoid
1666 * doing an actual reallocation.
1667 */
1668 reln->md_seg_fds[forknum] =
1669 repalloc(reln->md_seg_fds[forknum],
1670 sizeof(MdfdVec) * nseg);
1671 }
1672 else
1673 {
1674 /*
1675 * We don't reallocate a smaller array, because we want mdtruncate()
1676 * to be able to promise that it won't allocate memory, so that it is
1677 * allowed in a critical section. This means that a bit of space in
1678 * the array is now wasted, until the next time we add a segment and
1679 * reallocate.
1680 */
1681 }
1682
1683 reln->md_num_open_segs[forknum] = nseg;
1684}
1685
1686/*
1687 * Return the filename for the specified segment of the relation. The
1688 * returned string is palloc'd.
1689 */
1690static MdPathStr
1692{
1693 RelPathStr path;
1694 MdPathStr fullpath;
1695
1696 path = relpath(reln->smgr_rlocator, forknum);
1697
1698 if (segno > 0)
1699 sprintf(fullpath.str, "%s.%u", path.str, segno);
1700 else
1701 strcpy(fullpath.str, path.str);
1702
1703 return fullpath;
1704}
1705
1706/*
1707 * Open the specified segment of the relation,
1708 * and make a MdfdVec object for it. Returns NULL on failure.
1709 */
1710static MdfdVec *
1712 int oflags)
1713{
1714 MdfdVec *v;
1715 File fd;
1716 MdPathStr fullpath;
1717
1718 fullpath = _mdfd_segpath(reln, forknum, segno);
1719
1720 /* open the file */
1722
1723 if (fd < 0)
1724 return NULL;
1725
1726 /*
1727 * Segments are always opened in order from lowest to highest, so we must
1728 * be adding a new one at the end.
1729 */
1730 Assert(segno == reln->md_num_open_segs[forknum]);
1731
1732 _fdvec_resize(reln, forknum, segno + 1);
1733
1734 /* fill the entry */
1735 v = &reln->md_seg_fds[forknum][segno];
1736 v->mdfd_vfd = fd;
1737 v->mdfd_segno = segno;
1738
1739 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1740
1741 /* all done */
1742 return v;
1743}
1744
1745/*
1746 * _mdfd_getseg() -- Find the segment of the relation holding the
1747 * specified block.
1748 *
1749 * If the segment doesn't exist, we ereport, return NULL, or create the
1750 * segment, according to "behavior". Note: skipFsync is only used in the
1751 * EXTENSION_CREATE case.
1752 */
1753static MdfdVec *
1755 bool skipFsync, int behavior)
1756{
1757 MdfdVec *v;
1760
1761 /* some way to handle non-existent segments needs to be specified */
1762 Assert(behavior &
1765
1766 targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1767
1768 /* if an existing and opened segment, we're done */
1769 if (targetseg < reln->md_num_open_segs[forknum])
1770 {
1771 v = &reln->md_seg_fds[forknum][targetseg];
1772 return v;
1773 }
1774
1775 /* The caller only wants the segment if we already had it open. */
1776 if (behavior & EXTENSION_DONT_OPEN)
1777 return NULL;
1778
1779 /*
1780 * The target segment is not yet open. Iterate over all the segments
1781 * between the last opened and the target segment. This way missing
1782 * segments either raise an error, or get created (according to
1783 * 'behavior'). Start with either the last opened, or the first segment if
1784 * none was opened before.
1785 */
1786 if (reln->md_num_open_segs[forknum] > 0)
1787 v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
1788 else
1789 {
1790 v = mdopenfork(reln, forknum, behavior);
1791 if (!v)
1792 return NULL; /* if behavior & EXTENSION_RETURN_NULL */
1793 }
1794
1795 for (nextsegno = reln->md_num_open_segs[forknum];
1797 {
1798 BlockNumber nblocks = _mdnblocks(reln, forknum, v);
1799 int flags = 0;
1800
1801 Assert(nextsegno == v->mdfd_segno + 1);
1802
1803 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1804 elog(FATAL, "segment too big");
1805
1806 if ((behavior & EXTENSION_CREATE) ||
1807 (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
1808 {
1809 /*
1810 * Normally we will create new segments only if authorized by the
1811 * caller (i.e., we are doing mdextend()). But when doing WAL
1812 * recovery, create segments anyway; this allows cases such as
1813 * replaying WAL data that has a write into a high-numbered
1814 * segment of a relation that was later deleted. We want to go
1815 * ahead and create the segments so we can finish out the replay.
1816 *
1817 * We have to maintain the invariant that segments before the last
1818 * active segment are of size RELSEG_SIZE; therefore, if
1819 * extending, pad them out with zeroes if needed. (This only
1820 * matters if in recovery, or if the caller is extending the
1821 * relation discontiguously, but that can happen in hash indexes.)
1822 */
1823 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1824 {
1827
1828 mdextend(reln, forknum,
1831 pfree(zerobuf);
1832 }
1833 flags = O_CREAT;
1834 }
1835 else if (nblocks < ((BlockNumber) RELSEG_SIZE))
1836 {
1837 /*
1838 * When not extending, only open the next segment if the current
1839 * one is exactly RELSEG_SIZE. If not (this branch), either
1840 * return NULL or fail.
1841 */
1842 if (behavior & EXTENSION_RETURN_NULL)
1843 {
1844 /*
1845 * Some callers discern between reasons for _mdfd_getseg()
1846 * returning NULL based on errno. As there's no failing
1847 * syscall involved in this case, explicitly set errno to
1848 * ENOENT, as that seems the closest interpretation.
1849 */
1850 errno = ENOENT;
1851 return NULL;
1852 }
1853
1854 ereport(ERROR,
1856 errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
1857 _mdfd_segpath(reln, forknum, nextsegno).str,
1858 blkno, nblocks)));
1859 }
1860
1861 v = _mdfd_openseg(reln, forknum, nextsegno, flags);
1862
1863 if (v == NULL)
1864 {
1865 if ((behavior & EXTENSION_RETURN_NULL) &&
1867 return NULL;
1868 ereport(ERROR,
1870 errmsg("could not open file \"%s\" (target block %u): %m",
1871 _mdfd_segpath(reln, forknum, nextsegno).str,
1872 blkno)));
1873 }
1874 }
1875
1876 return v;
1877}
1878
1879/*
1880 * Get number of blocks present in a single disk file
1881 */
1882static BlockNumber
1884{
1885 pgoff_t len;
1886
1887 len = FileSize(seg->mdfd_vfd);
1888 if (len < 0)
1889 ereport(ERROR,
1891 errmsg("could not seek to end of file \"%s\": %m",
1892 FilePathName(seg->mdfd_vfd))));
1893 /* note that this calculation will ignore any partial block at EOF */
1894 return (BlockNumber) (len / BLCKSZ);
1895}
1896
1897/*
1898 * Sync a file to disk, given a file tag. Write the path into an output
1899 * buffer so the caller can use it in error messages.
1900 *
1901 * Return 0 on success, -1 on failure, with errno set.
1902 */
1903int
1904mdsyncfiletag(const FileTag *ftag, char *path)
1905{
1907 File file;
1909 bool need_to_close;
1910 int result,
1911 save_errno;
1912
1913 /* See if we already have the file open, or need to open it. */
1914 if (ftag->segno < reln->md_num_open_segs[ftag->forknum])
1915 {
1916 file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd;
1917 strlcpy(path, FilePathName(file), MAXPGPATH);
1918 need_to_close = false;
1919 }
1920 else
1921 {
1922 MdPathStr p;
1923
1924 p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
1925 strlcpy(path, p.str, MD_PATH_STR_MAXLEN);
1926
1927 file = PathNameOpenFile(path, _mdfd_open_flags());
1928 if (file < 0)
1929 return -1;
1930 need_to_close = true;
1931 }
1932
1934
1935 /* Sync the file. */
1936 result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
1937 save_errno = errno;
1938
1939 if (need_to_close)
1940 FileClose(file);
1941
1943 IOOP_FSYNC, io_start, 1, 0);
1944
1945 errno = save_errno;
1946 return result;
1947}
1948
1949/*
1950 * Unlink a file, given a file tag. Write the path into an output
1951 * buffer so the caller can use it in error messages.
1952 *
1953 * Return 0 on success, -1 on failure, with errno set.
1954 */
1955int
1956mdunlinkfiletag(const FileTag *ftag, char *path)
1957{
1958 RelPathStr p;
1959
1960 /* Compute the path. */
1961 p = relpathperm(ftag->rlocator, MAIN_FORKNUM);
1962 strlcpy(path, p.str, MAXPGPATH);
1963
1964 /* Try to unlink the file. */
1965 return unlink(path);
1966}
1967
1968/*
1969 * Check if a given candidate request matches a given tag, when processing
1970 * a SYNC_FILTER_REQUEST request. This will be called for all pending
1971 * requests to find out whether to forget them.
1972 */
1973bool
1975{
1976 /*
1977 * For now we only use filter requests as a way to drop all scheduled
1978 * callbacks relating to a given database, when dropping the database.
1979 * We'll return true for all candidates that have the same database OID as
1980 * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
1981 */
1982 return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
1983}
1984
1985/*
1986 * AIO completion callback for mdstartreadv().
1987 */
1988static PgAioResult
1990{
1992 PgAioResult result = prior_result;
1993
1994 if (prior_result.result < 0)
1995 {
1996 result.status = PGAIO_RS_ERROR;
1997 result.id = PGAIO_HCB_MD_READV;
1998 /* For "hard" errors, track the error number in error_data */
1999 result.error_data = -prior_result.result;
2000 result.result = 0;
2001
2002 /*
2003 * Immediately log a message about the IO error, but only to the
2004 * server log. The reason to do so immediately is that the originator
2005 * might not process the query result immediately (because it is busy
2006 * doing another part of query processing) or at all (e.g. if it was
2007 * cancelled or errored out due to another IO also failing). The
2008 * definer of the IO will emit an ERROR when processing the IO's
2009 * results
2010 */
2012
2013 return result;
2014 }
2015
2016 /*
2017 * As explained above smgrstartreadv(), the smgr API operates on the level
2018 * of blocks, rather than bytes. Convert.
2019 */
2020 result.result /= BLCKSZ;
2021
2022 Assert(result.result <= td->smgr.nblocks);
2023
2024 if (result.result == 0)
2025 {
2026 /* consider 0 blocks read a failure */
2027 result.status = PGAIO_RS_ERROR;
2028 result.id = PGAIO_HCB_MD_READV;
2029 result.error_data = 0;
2030
2031 /* see comment above the "hard error" case */
2033
2034 return result;
2035 }
2036
2037 if (result.status != PGAIO_RS_ERROR &&
2038 result.result < td->smgr.nblocks)
2039 {
2040 /* partial reads should be retried at upper level */
2041 result.status = PGAIO_RS_PARTIAL;
2042 result.id = PGAIO_HCB_MD_READV;
2043 }
2044
2045 return result;
2046}
2047
2048/*
2049 * AIO error reporting callback for mdstartreadv().
2050 *
2051 * Errors are encoded as follows:
2052 * - PgAioResult.error_data != 0 encodes IO that failed with that errno
2053 * - PgAioResult.error_data == 0 encodes IO that didn't read all data
2054 */
2055static void
2056md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
2057{
2058 RelPathStr path;
2059
2060 path = relpathbackend(td->smgr.rlocator,
2062 td->smgr.forkNum);
2063
2064 if (result.error_data != 0)
2065 {
2066 /* for errcode_for_file_access() and %m */
2067 errno = result.error_data;
2068
2069 ereport(elevel,
2071 errmsg("could not read blocks %u..%u in file \"%s\": %m",
2072 td->smgr.blockNum,
2073 td->smgr.blockNum + td->smgr.nblocks - 1,
2074 path.str));
2075 }
2076 else
2077 {
2078 /*
2079 * NB: This will typically only be output in debug messages, while
2080 * retrying a partial IO.
2081 */
2082 ereport(elevel,
2084 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
2085 td->smgr.blockNum,
2086 td->smgr.blockNum + td->smgr.nblocks - 1,
2087 path.str,
2088 result.result * (size_t) BLCKSZ,
2089 td->smgr.nblocks * (size_t) BLCKSZ));
2090 }
2091}
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition aio.c:330
@ PGAIO_HCB_MD_READV
Definition aio.h:196
@ PGAIO_HF_BUFFERED
Definition aio.h:77
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
int pgaio_io_get_iovec(PgAioHandle *ioh, struct iovec **iov)
Definition aio_io.c:42
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition aio_target.c:73
@ PGAIO_RS_PARTIAL
Definition aio_types.h:82
@ PGAIO_RS_ERROR
Definition aio_types.h:84
void TablespaceCreateDbspace(Oid spcOid, Oid dbOid, bool isRedo)
Definition tablespace.c:112
uint32 BlockNumber
Definition block.h:31
#define InvalidBlockNumber
Definition block.h:33
#define MaxBlockNumber
Definition block.h:35
bool track_io_timing
Definition bufmgr.c:176
bool zero_damaged_pages
Definition bufmgr.c:173
#define Min(x, y)
Definition c.h:997
#define TYPEALIGN(ALIGNVAL, LEN)
Definition c.h:819
uint8_t uint8
Definition c.h:544
#define Assert(condition)
Definition c.h:873
#define PG_BINARY
Definition c.h:1287
uint64_t uint64
Definition c.h:547
uint32_t uint32
Definition c.h:546
#define lengthof(array)
Definition c.h:803
#define StaticAssertDecl(condition, errmessage)
Definition c.h:942
int errmsg_internal(const char *fmt,...)
Definition elog.c:1170
int errcode_for_file_access(void)
Definition elog.c:886
int errhint(const char *fmt,...)
Definition elog.c:1330
int errcode(int sqlerrcode)
Definition elog.c:863
int errmsg(const char *fmt,...)
Definition elog.c:1080
#define LOG_SERVER_ONLY
Definition elog.h:32
#define FATAL
Definition elog.h:41
#define WARNING
Definition elog.h:36
#define DEBUG1
Definition elog.h:30
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
int pg_truncate(const char *path, pgoff_t length)
Definition fd.c:720
int FileGetRawDesc(File file)
Definition fd.c:2515
void FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
Definition fd.c:2122
int io_direct_flags
Definition fd.c:171
int file_extend_method
Definition fd.c:168
char * FilePathName(File file)
Definition fd.c:2499
int FileSync(File file, uint32 wait_event_info)
Definition fd.c:2335
int FileStartReadV(PgAioHandle *ioh, File file, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2204
ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2148
int FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2407
pgoff_t FileSize(File file)
Definition fd.c:2447
void FileClose(File file)
Definition fd.c:1965
int data_sync_elevel(int elevel)
Definition fd.c:3985
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition fd.c:1562
int FileTruncate(File file, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2464
int FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2362
int FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2066
ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2230
#define IO_DIRECT_DATA
Definition fd.h:54
static ssize_t FileWrite(File file, const void *buffer, size_t amount, pgoff_t offset, uint32 wait_event_info)
Definition fd.h:237
@ FILE_EXTEND_METHOD_WRITE_ZEROS
Definition fd.h:63
#define FILE_POSSIBLY_DELETED(err)
Definition fd.h:89
int File
Definition fd.h:51
#define PG_O_DIRECT
Definition fd.h:123
#define MCXT_ALLOC_ZERO
Definition fe_memutils.h:30
#define palloc_array(type, count)
Definition fe_memutils.h:76
int compute_remaining_iovec(struct iovec *destination, const struct iovec *source, int iovcnt, size_t transferred)
Definition file_utils.c:614
bool IsBinaryUpgrade
Definition globals.c:121
ProcNumber MyProcNumber
Definition globals.c:90
const char * str
int i
Definition isn.c:77
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition mcxt.c:1232
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
MemoryContext TopMemoryContext
Definition mcxt.c:166
void * palloc_aligned(Size size, Size alignto, int flags)
Definition mcxt.c:1606
void mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
Definition md.c:337
static void md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
Definition md.c:2056
static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
Definition md.c:1579
#define EXTENSION_CREATE_RECOVERY
Definition md.c:119
void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber curnblk, BlockNumber nblocks)
Definition md.c:1301
static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
Definition md.c:1883
static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
Definition md.c:374
void mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync)
Definition md.c:1070
bool mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
Definition md.c:1974
bool mdexists(SMgrRelation reln, ForkNumber forknum)
Definition md.c:203
void mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition md.c:858
static MdPathStr _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
Definition md.c:1691
static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
Definition md.c:1562
#define EXTENSION_DONT_OPEN
Definition md.c:121
BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum)
Definition md.c:1234
int mdunlinkfiletag(const FileTag *ftag, char *path)
Definition md.c:1956
static MemoryContext MdCxt
Definition md.c:97
void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition md.c:222
int mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
Definition md.c:1494
void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition md.c:487
static PgAioResult md_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition md.c:1989
static int do_truncate(const char *path)
Definition md.c:353
void mdinit(void)
Definition md.c:190
void mdclose(SMgrRelation reln, ForkNumber forknum)
Definition md.c:724
void mdzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition md.c:552
static MdfdVec * _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, int oflags)
Definition md.c:1711
static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
Definition md.c:1518
int mdsyncfiletag(const FileTag *ftag, char *path)
Definition md.c:1904
void mdwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition md.c:1175
uint32 mdmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition md.c:844
static MdfdVec * _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior)
Definition md.c:1754
#define EXTENSION_RETURN_NULL
Definition md.c:115
void mdstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition md.c:996
bool mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition md.c:747
void mdregistersync(SMgrRelation reln, ForkNumber forknum)
Definition md.c:1390
void mdopen(SMgrRelation reln)
Definition md.c:713
#define EXTENSION_CREATE
Definition md.c:117
const PgAioHandleCallbacks aio_md_readv_cb
Definition md.c:169
static int _mdfd_open_flags(void)
Definition md.c:176
#define INIT_MD_FILETAG(a, xx_rlocator, xx_forknum, xx_segno)
Definition md.c:101
#define EXTENSION_FAIL
Definition md.c:113
static MdfdVec * mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
Definition md.c:675
void DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo)
Definition md.c:1611
static int buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks)
Definition md.c:795
#define MD_PATH_STR_MAXLEN
Definition md.c:132
static void _fdvec_resize(SMgrRelation reln, ForkNumber forknum, int nseg)
Definition md.c:1643
void ForgetDatabaseSyncRequests(Oid dbid)
Definition md.c:1593
void mdimmedsync(SMgrRelation reln, ForkNumber forknum)
Definition md.c:1441
struct _MdfdVec MdfdVec
#define AllocSetContextCreate
Definition memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition memutils.h:160
#define ERRCODE_DATA_CORRUPTED
#define MAXPGPATH
#define PG_IO_ALIGN_SIZE
const void size_t len
#define PG_IOV_MAX
Definition pg_iovec.h:47
@ IOOBJECT_RELATION
Definition pgstat.h:277
@ IOCONTEXT_NORMAL
Definition pgstat.h:289
@ IOOP_FSYNC
Definition pgstat.h:308
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition pgstat_io.c:91
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:122
#define sprintf
Definition port.h:262
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition strlcpy.c:45
off_t pgoff_t
Definition port.h:421
unsigned int Oid
static int fd(const char *x, int i)
static int fb(int x)
#define INVALID_PROC_NUMBER
Definition procnumber.h:26
#define RelFileLocatorBackendIsTemp(rlocator)
ForkNumber
Definition relpath.h:56
@ MAIN_FORKNUM
Definition relpath.h:58
@ InvalidForkNumber
Definition relpath.h:57
#define MAX_FORKNUM
Definition relpath.h:70
#define relpath(rlocator, forknum)
Definition relpath.h:150
#define relpathbackend(rlocator, backend, forknum)
Definition relpath.h:141
#define relpathperm(rlocator, forknum)
Definition relpath.h:146
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition smgr.c:240
void smgrclose(SMgrRelation reln)
Definition smgr.c:374
void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
Definition smgr.c:538
void pgaio_io_set_target_smgr(PgAioHandle *ioh, SMgrRelationData *smgr, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skip_fsync)
Definition smgr.c:1038
#define SmgrIsTemp(smgr)
Definition smgr.h:74
Definition sync.h:51
RelFileLocator rlocator
Definition sync.h:54
int16 forknum
Definition sync.h:53
uint64 segno
Definition sync.h:55
char str[MD_PATH_STR_MAXLEN+1]
Definition md.c:140
PgAioHandleCallbackComplete complete_shared
Definition aio.h:239
uint32 status
Definition aio_types.h:108
uint32 error_data
Definition aio_types.h:111
int32 result
Definition aio_types.h:113
uint32 id
Definition aio_types.h:105
RelFileLocator locator
RelFileNumber relNumber
char str[REL_PATH_STR_MAXLEN+1]
Definition relpath.h:123
Definition md.c:92
File mdfd_vfd
Definition md.c:93
BlockNumber mdfd_segno
Definition md.c:94
bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, bool retryOnError)
Definition sync.c:580
@ SYNC_FILTER_REQUEST
Definition sync.h:28
@ SYNC_FORGET_REQUEST
Definition sync.h:27
@ SYNC_UNLINK_REQUEST
Definition sync.h:26
@ SYNC_REQUEST
Definition sync.h:25
BlockNumber blockNum
Definition aio_types.h:66
RelFileLocator rlocator
Definition aio_types.h:65
struct PgAioTargetData::@126 smgr
BlockNumber nblocks
Definition aio_types.h:67
ForkNumber forkNum
Definition aio_types.h:68
bool InRecovery
Definition xlogutils.c:50
void XLogDropRelation(RelFileLocator rlocator, ForkNumber forknum)
Definition xlogutils.c:630