PostgreSQL Source Code git master
md.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * md.c
4 * This code manages relations that reside on magnetic disk.
5 *
6 * Or at least, that was what the Berkeley folk had in mind when they named
7 * this file. In reality, what this code provides is an interface from
8 * the smgr API to Unix-like filesystem APIs, so it will work with any type
9 * of device for which the operating system provides filesystem support.
10 * It doesn't matter whether the bits are on spinning rust or some other
11 * storage technology.
12 *
13 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
14 * Portions Copyright (c) 1994, Regents of the University of California
15 *
16 *
17 * IDENTIFICATION
18 * src/backend/storage/smgr/md.c
19 *
20 *-------------------------------------------------------------------------
21 */
22#include "postgres.h"
23
24#include <unistd.h>
25#include <fcntl.h>
26#include <sys/file.h>
27
28#include "access/xlogutils.h"
29#include "commands/tablespace.h"
30#include "common/file_utils.h"
31#include "miscadmin.h"
32#include "pg_trace.h"
33#include "pgstat.h"
34#include "storage/bufmgr.h"
35#include "storage/fd.h"
36#include "storage/md.h"
38#include "storage/smgr.h"
39#include "storage/sync.h"
40#include "utils/memutils.h"
41
42/*
43 * The magnetic disk storage manager keeps track of open file
44 * descriptors in its own descriptor pool. This is done to make it
45 * easier to support relations that are larger than the operating
46 * system's file size limit (often 2GBytes). In order to do that,
47 * we break relations up into "segment" files that are each shorter than
48 * the OS file size limit. The segment size is set by the RELSEG_SIZE
49 * configuration constant in pg_config.h.
50 *
51 * On disk, a relation must consist of consecutively numbered segment
52 * files in the pattern
53 * -- Zero or more full segments of exactly RELSEG_SIZE blocks each
54 * -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
55 * -- Optionally, any number of inactive segments of size 0 blocks.
56 * The full and partial segments are collectively the "active" segments.
57 * Inactive segments are those that once contained data but are currently
58 * not needed because of an mdtruncate() operation. The reason for leaving
59 * them present at size zero, rather than unlinking them, is that other
60 * backends and/or the checkpointer might be holding open file references to
61 * such segments. If the relation expands again after mdtruncate(), such
62 * that a deactivated segment becomes active again, it is important that
63 * such file references still be valid --- else data might get written
64 * out to an unlinked old copy of a segment file that will eventually
65 * disappear.
66 *
67 * File descriptors are stored in the per-fork md_seg_fds arrays inside
68 * SMgrRelation. The length of these arrays is stored in md_num_open_segs.
69 * Note that a fork's md_num_open_segs having a specific value does not
70 * necessarily mean the relation doesn't have additional segments; we may
71 * just not have opened the next segment yet. (We could not have "all
72 * segments are in the array" as an invariant anyway, since another backend
73 * could extend the relation while we aren't looking.) We do not have
74 * entries for inactive segments, however; as soon as we find a partial
75 * segment, we assume that any subsequent segments are inactive.
76 *
77 * The entire MdfdVec array is palloc'd in the MdCxt memory context.
78 */
79
80typedef struct _MdfdVec
81{
82 File mdfd_vfd; /* fd number in fd.c's pool */
83 BlockNumber mdfd_segno; /* segment number, from 0 */
85
86static MemoryContext MdCxt; /* context for all MdfdVec objects */
87
88
89/* Populate a file tag describing an md.c segment file. */
90#define INIT_MD_FILETAG(a,xx_rlocator,xx_forknum,xx_segno) \
91( \
92 memset(&(a), 0, sizeof(FileTag)), \
93 (a).handler = SYNC_HANDLER_MD, \
94 (a).rlocator = (xx_rlocator), \
95 (a).forknum = (xx_forknum), \
96 (a).segno = (xx_segno) \
97)
98
99
100/*** behavior for mdopen & _mdfd_getseg ***/
101/* ereport if segment not present */
102#define EXTENSION_FAIL (1 << 0)
103/* return NULL if segment not present */
104#define EXTENSION_RETURN_NULL (1 << 1)
105/* create new segments as needed */
106#define EXTENSION_CREATE (1 << 2)
107/* create new segments if needed during recovery */
108#define EXTENSION_CREATE_RECOVERY (1 << 3)
109/* don't try to open a segment, if not already open */
110#define EXTENSION_DONT_OPEN (1 << 5)
111
112
113/* local routines */
114static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum,
115 bool isRedo);
116static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior);
117static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
118 MdfdVec *seg);
119static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum,
120 BlockNumber segno);
121static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum,
122 BlockNumber segno);
123static void _fdvec_resize(SMgrRelation reln,
124 ForkNumber forknum,
125 int nseg);
126static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
127 BlockNumber segno);
128static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forknum,
129 BlockNumber segno, int oflags);
130static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum,
131 BlockNumber blkno, bool skipFsync, int behavior);
132static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
133 MdfdVec *seg);
134
135static inline int
137{
138 int flags = O_RDWR | PG_BINARY;
139
141 flags |= PG_O_DIRECT;
142
143 return flags;
144}
145
146/*
147 * mdinit() -- Initialize private state for magnetic disk storage manager.
148 */
149void
151{
153 "MdSmgr",
155}
156
157/*
158 * mdexists() -- Does the physical file exist?
159 *
160 * Note: this will return true for lingering files, with pending deletions
161 */
162bool
164{
165 /*
166 * Close it first, to ensure that we notice if the fork has been unlinked
167 * since we opened it. As an optimization, we can skip that in recovery,
168 * which already closes relations when dropping them.
169 */
170 if (!InRecovery)
171 mdclose(reln, forknum);
172
173 return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL);
174}
175
176/*
177 * mdcreate() -- Create a new relation on magnetic disk.
178 *
179 * If isRedo is true, it's okay for the relation to exist already.
180 */
181void
182mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
183{
184 MdfdVec *mdfd;
185 char *path;
186 File fd;
187
188 if (isRedo && reln->md_num_open_segs[forknum] > 0)
189 return; /* created and opened already... */
190
191 Assert(reln->md_num_open_segs[forknum] == 0);
192
193 /*
194 * We may be using the target table space for the first time in this
195 * database, so create a per-database subdirectory if needed.
196 *
197 * XXX this is a fairly ugly violation of module layering, but this seems
198 * to be the best place to put the check. Maybe TablespaceCreateDbspace
199 * should be here and not in commands/tablespace.c? But that would imply
200 * importing a lot of stuff that smgr.c oughtn't know, either.
201 */
204 isRedo);
205
206 path = relpath(reln->smgr_rlocator, forknum);
207
208 fd = PathNameOpenFile(path, _mdfd_open_flags() | O_CREAT | O_EXCL);
209
210 if (fd < 0)
211 {
212 int save_errno = errno;
213
214 if (isRedo)
216 if (fd < 0)
217 {
218 /* be sure to report the error reported by create, not open */
219 errno = save_errno;
222 errmsg("could not create file \"%s\": %m", path)));
223 }
224 }
225
226 pfree(path);
227
228 _fdvec_resize(reln, forknum, 1);
229 mdfd = &reln->md_seg_fds[forknum][0];
230 mdfd->mdfd_vfd = fd;
231 mdfd->mdfd_segno = 0;
232
233 if (!SmgrIsTemp(reln))
234 register_dirty_segment(reln, forknum, mdfd);
235}
236
237/*
238 * mdunlink() -- Unlink a relation.
239 *
240 * Note that we're passed a RelFileLocatorBackend --- by the time this is called,
241 * there won't be an SMgrRelation hashtable entry anymore.
242 *
243 * forknum can be a fork number to delete a specific fork, or InvalidForkNumber
244 * to delete all forks.
245 *
246 * For regular relations, we don't unlink the first segment file of the rel,
247 * but just truncate it to zero length, and record a request to unlink it after
248 * the next checkpoint. Additional segments can be unlinked immediately,
249 * however. Leaving the empty file in place prevents that relfilenumber
250 * from being reused. The scenario this protects us from is:
251 * 1. We delete a relation (and commit, and actually remove its file).
252 * 2. We create a new relation, which by chance gets the same relfilenumber as
253 * the just-deleted one (OIDs must've wrapped around for that to happen).
254 * 3. We crash before another checkpoint occurs.
255 * During replay, we would delete the file and then recreate it, which is fine
256 * if the contents of the file were repopulated by subsequent WAL entries.
257 * But if we didn't WAL-log insertions, but instead relied on fsyncing the
258 * file after populating it (as we do at wal_level=minimal), the contents of
259 * the file would be lost forever. By leaving the empty file until after the
260 * next checkpoint, we prevent reassignment of the relfilenumber until it's
261 * safe, because relfilenumber assignment skips over any existing file.
262 *
263 * Additional segments, if any, are truncated and then unlinked. The reason
264 * for truncating is that other backends may still hold open FDs for these at
265 * the smgr level, so that the kernel can't remove the file yet. We want to
266 * reclaim the disk space right away despite that.
267 *
268 * We do not need to go through this dance for temp relations, though, because
269 * we never make WAL entries for temp rels, and so a temp rel poses no threat
270 * to the health of a regular rel that has taken over its relfilenumber.
271 * The fact that temp rels and regular rels have different file naming
272 * patterns provides additional safety. Other backends shouldn't have open
273 * FDs for them, either.
274 *
275 * We also don't do it while performing a binary upgrade. There is no reuse
276 * hazard in that case, since after a crash or even a simple ERROR, the
277 * upgrade fails and the whole cluster must be recreated from scratch.
278 * Furthermore, it is important to remove the files from disk immediately,
279 * because we may be about to reuse the same relfilenumber.
280 *
281 * All the above applies only to the relation's main fork; other forks can
282 * just be removed immediately, since they are not needed to prevent the
283 * relfilenumber from being recycled. Also, we do not carefully
284 * track whether other forks have been created or not, but just attempt to
285 * unlink them unconditionally; so we should never complain about ENOENT.
286 *
287 * If isRedo is true, it's unsurprising for the relation to be already gone.
288 * Also, we should remove the file immediately instead of queuing a request
289 * for later, since during redo there's no possibility of creating a
290 * conflicting relation.
291 *
292 * Note: we currently just never warn about ENOENT at all. We could warn in
293 * the main-fork, non-isRedo case, but it doesn't seem worth the trouble.
294 *
295 * Note: any failure should be reported as WARNING not ERROR, because
296 * we are usually not in a transaction anymore when this is called.
297 */
298void
299mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
300{
301 /* Now do the per-fork work */
302 if (forknum == InvalidForkNumber)
303 {
304 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
305 mdunlinkfork(rlocator, forknum, isRedo);
306 }
307 else
308 mdunlinkfork(rlocator, forknum, isRedo);
309}
310
311/*
312 * Truncate a file to release disk space.
313 */
314static int
315do_truncate(const char *path)
316{
317 int save_errno;
318 int ret;
319
320 ret = pg_truncate(path, 0);
321
322 /* Log a warning here to avoid repetition in callers. */
323 if (ret < 0 && errno != ENOENT)
324 {
325 save_errno = errno;
328 errmsg("could not truncate file \"%s\": %m", path)));
329 errno = save_errno;
330 }
331
332 return ret;
333}
334
335static void
336mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
337{
338 char *path;
339 int ret;
340 int save_errno;
341
342 path = relpath(rlocator, forknum);
343
344 /*
345 * Truncate and then unlink the first segment, or just register a request
346 * to unlink it later, as described in the comments for mdunlink().
347 */
348 if (isRedo || IsBinaryUpgrade || forknum != MAIN_FORKNUM ||
350 {
351 if (!RelFileLocatorBackendIsTemp(rlocator))
352 {
353 /* Prevent other backends' fds from holding on to the disk space */
354 ret = do_truncate(path);
355
356 /* Forget any pending sync requests for the first segment */
357 save_errno = errno;
358 register_forget_request(rlocator, forknum, 0 /* first seg */ );
359 errno = save_errno;
360 }
361 else
362 ret = 0;
363
364 /* Next unlink the file, unless it was already found to be missing */
365 if (ret >= 0 || errno != ENOENT)
366 {
367 ret = unlink(path);
368 if (ret < 0 && errno != ENOENT)
369 {
370 save_errno = errno;
373 errmsg("could not remove file \"%s\": %m", path)));
374 errno = save_errno;
375 }
376 }
377 }
378 else
379 {
380 /* Prevent other backends' fds from holding on to the disk space */
381 ret = do_truncate(path);
382
383 /* Register request to unlink first segment later */
384 save_errno = errno;
385 register_unlink_segment(rlocator, forknum, 0 /* first seg */ );
386 errno = save_errno;
387 }
388
389 /*
390 * Delete any additional segments.
391 *
392 * Note that because we loop until getting ENOENT, we will correctly
393 * remove all inactive segments as well as active ones. Ideally we'd
394 * continue the loop until getting exactly that errno, but that risks an
395 * infinite loop if the problem is directory-wide (for instance, if we
396 * suddenly can't read the data directory itself). We compromise by
397 * continuing after a non-ENOENT truncate error, but stopping after any
398 * unlink error. If there is indeed a directory-wide problem, additional
399 * unlink attempts wouldn't work anyway.
400 */
401 if (ret >= 0 || errno != ENOENT)
402 {
403 char *segpath = (char *) palloc(strlen(path) + 12);
404 BlockNumber segno;
405
406 for (segno = 1;; segno++)
407 {
408 sprintf(segpath, "%s.%u", path, segno);
409
410 if (!RelFileLocatorBackendIsTemp(rlocator))
411 {
412 /*
413 * Prevent other backends' fds from holding on to the disk
414 * space. We're done if we see ENOENT, though.
415 */
416 if (do_truncate(segpath) < 0 && errno == ENOENT)
417 break;
418
419 /*
420 * Forget any pending sync requests for this segment before we
421 * try to unlink.
422 */
423 register_forget_request(rlocator, forknum, segno);
424 }
425
426 if (unlink(segpath) < 0)
427 {
428 /* ENOENT is expected after the last segment... */
429 if (errno != ENOENT)
432 errmsg("could not remove file \"%s\": %m", segpath)));
433 break;
434 }
435 }
436 pfree(segpath);
437 }
438
439 pfree(path);
440}
441
442/*
443 * mdextend() -- Add a block to the specified relation.
444 *
445 * The semantics are nearly the same as mdwrite(): write at the
446 * specified position. However, this is to be used for the case of
447 * extending a relation (i.e., blocknum is at or beyond the current
448 * EOF). Note that we assume writing a block beyond current EOF
449 * causes intervening file space to become filled with zeroes.
450 */
451void
453 const void *buffer, bool skipFsync)
454{
455 off_t seekpos;
456 int nbytes;
457 MdfdVec *v;
458
459 /* If this build supports direct I/O, the buffer must be I/O aligned. */
460 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
461 Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
462
463 /* This assert is too expensive to have on normally ... */
464#ifdef CHECK_WRITE_VS_EXTEND
465 Assert(blocknum >= mdnblocks(reln, forknum));
466#endif
467
468 /*
469 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
470 * more --- we mustn't create a block whose number actually is
471 * InvalidBlockNumber. (Note that this failure should be unreachable
472 * because of upstream checks in bufmgr.c.)
473 */
474 if (blocknum == InvalidBlockNumber)
476 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
477 errmsg("cannot extend file \"%s\" beyond %u blocks",
478 relpath(reln->smgr_rlocator, forknum),
480
481 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
482
483 seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
484
485 Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
486
487 if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
488 {
489 if (nbytes < 0)
492 errmsg("could not extend file \"%s\": %m",
494 errhint("Check free disk space.")));
495 /* short write: complain appropriately */
497 (errcode(ERRCODE_DISK_FULL),
498 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
500 nbytes, BLCKSZ, blocknum),
501 errhint("Check free disk space.")));
502 }
503
504 if (!skipFsync && !SmgrIsTemp(reln))
505 register_dirty_segment(reln, forknum, v);
506
507 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
508}
509
510/*
511 * mdzeroextend() -- Add new zeroed out blocks to the specified relation.
512 *
513 * Similar to mdextend(), except the relation can be extended by multiple
514 * blocks at once and the added blocks will be filled with zeroes.
515 */
516void
518 BlockNumber blocknum, int nblocks, bool skipFsync)
519{
520 MdfdVec *v;
521 BlockNumber curblocknum = blocknum;
522 int remblocks = nblocks;
523
524 Assert(nblocks > 0);
525
526 /* This assert is too expensive to have on normally ... */
527#ifdef CHECK_WRITE_VS_EXTEND
528 Assert(blocknum >= mdnblocks(reln, forknum));
529#endif
530
531 /*
532 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
533 * more --- we mustn't create a block whose number actually is
534 * InvalidBlockNumber or larger.
535 */
536 if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
538 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
539 errmsg("cannot extend file \"%s\" beyond %u blocks",
540 relpath(reln->smgr_rlocator, forknum),
542
543 while (remblocks > 0)
544 {
545 BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE);
546 off_t seekpos = (off_t) BLCKSZ * segstartblock;
547 int numblocks;
548
549 if (segstartblock + remblocks > RELSEG_SIZE)
550 numblocks = RELSEG_SIZE - segstartblock;
551 else
552 numblocks = remblocks;
553
554 v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
555
556 Assert(segstartblock < RELSEG_SIZE);
557 Assert(segstartblock + numblocks <= RELSEG_SIZE);
558
559 /*
560 * If available and useful, use posix_fallocate() (via
561 * FileFallocate()) to extend the relation. That's often more
562 * efficient than using write(), as it commonly won't cause the kernel
563 * to allocate page cache space for the extended pages.
564 *
565 * However, we don't use FileFallocate() for small extensions, as it
566 * defeats delayed allocation on some filesystems. Not clear where
567 * that decision should be made though? For now just use a cutoff of
568 * 8, anything between 4 and 8 worked OK in some local testing.
569 */
570 if (numblocks > 8)
571 {
572 int ret;
573
574 ret = FileFallocate(v->mdfd_vfd,
575 seekpos, (off_t) BLCKSZ * numblocks,
576 WAIT_EVENT_DATA_FILE_EXTEND);
577 if (ret != 0)
578 {
581 errmsg("could not extend file \"%s\" with FileFallocate(): %m",
583 errhint("Check free disk space."));
584 }
585 }
586 else
587 {
588 int ret;
589
590 /*
591 * Even if we don't want to use fallocate, we can still extend a
592 * bit more efficiently than writing each 8kB block individually.
593 * pg_pwrite_zeros() (via FileZero()) uses pg_pwritev_with_retry()
594 * to avoid multiple writes or needing a zeroed buffer for the
595 * whole length of the extension.
596 */
597 ret = FileZero(v->mdfd_vfd,
598 seekpos, (off_t) BLCKSZ * numblocks,
599 WAIT_EVENT_DATA_FILE_EXTEND);
600 if (ret < 0)
603 errmsg("could not extend file \"%s\": %m",
605 errhint("Check free disk space."));
606 }
607
608 if (!skipFsync && !SmgrIsTemp(reln))
609 register_dirty_segment(reln, forknum, v);
610
611 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
612
613 remblocks -= numblocks;
614 curblocknum += numblocks;
615 }
616}
617
618/*
619 * mdopenfork() -- Open one fork of the specified relation.
620 *
621 * Note we only open the first segment, when there are multiple segments.
622 *
623 * If first segment is not present, either ereport or return NULL according
624 * to "behavior". We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
625 * EXTENSION_CREATE means it's OK to extend an existing relation, not to
626 * invent one out of whole cloth.
627 */
628static MdfdVec *
629mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
630{
631 MdfdVec *mdfd;
632 char *path;
633 File fd;
634
635 /* No work if already open */
636 if (reln->md_num_open_segs[forknum] > 0)
637 return &reln->md_seg_fds[forknum][0];
638
639 path = relpath(reln->smgr_rlocator, forknum);
640
642
643 if (fd < 0)
644 {
645 if ((behavior & EXTENSION_RETURN_NULL) &&
647 {
648 pfree(path);
649 return NULL;
650 }
653 errmsg("could not open file \"%s\": %m", path)));
654 }
655
656 pfree(path);
657
658 _fdvec_resize(reln, forknum, 1);
659 mdfd = &reln->md_seg_fds[forknum][0];
660 mdfd->mdfd_vfd = fd;
661 mdfd->mdfd_segno = 0;
662
663 Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
664
665 return mdfd;
666}
667
668/*
669 * mdopen() -- Initialize newly-opened relation.
670 */
671void
673{
674 /* mark it not open */
675 for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
676 reln->md_num_open_segs[forknum] = 0;
677}
678
679/*
680 * mdclose() -- Close the specified relation, if it isn't closed already.
681 */
682void
684{
685 int nopensegs = reln->md_num_open_segs[forknum];
686
687 /* No work if already closed */
688 if (nopensegs == 0)
689 return;
690
691 /* close segments starting from the end */
692 while (nopensegs > 0)
693 {
694 MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1];
695
697 _fdvec_resize(reln, forknum, nopensegs - 1);
698 nopensegs--;
699 }
700}
701
702/*
703 * mdprefetch() -- Initiate asynchronous read of the specified blocks of a relation
704 */
705bool
707 int nblocks)
708{
709#ifdef USE_PREFETCH
710
712
713 if ((uint64) blocknum + nblocks > (uint64) MaxBlockNumber + 1)
714 return false;
715
716 while (nblocks > 0)
717 {
718 off_t seekpos;
719 MdfdVec *v;
720 int nblocks_this_segment;
721
722 v = _mdfd_getseg(reln, forknum, blocknum, false,
724 if (v == NULL)
725 return false;
726
727 seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
728
729 Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
730
731 nblocks_this_segment =
732 Min(nblocks,
733 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
734
735 (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ * nblocks_this_segment,
736 WAIT_EVENT_DATA_FILE_PREFETCH);
737
738 blocknum += nblocks_this_segment;
739 nblocks -= nblocks_this_segment;
740 }
741#endif /* USE_PREFETCH */
742
743 return true;
744}
745
746/*
747 * Convert an array of buffer address into an array of iovec objects, and
748 * return the number that were required. 'iov' must have enough space for up
749 * to 'nblocks' elements, but the number used may be less depending on
750 * merging. In the case of a run of fully contiguous buffers, a single iovec
751 * will be populated that can be handled as a plain non-vectored I/O.
752 */
753static int
754buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks)
755{
756 struct iovec *iovp;
757 int iovcnt;
758
759 Assert(nblocks >= 1);
760
761 /* If this build supports direct I/O, buffers must be I/O aligned. */
762 for (int i = 0; i < nblocks; ++i)
763 {
764 if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
765 Assert((uintptr_t) buffers[i] ==
766 TYPEALIGN(PG_IO_ALIGN_SIZE, buffers[i]));
767 }
768
769 /* Start the first iovec off with the first buffer. */
770 iovp = &iov[0];
771 iovp->iov_base = buffers[0];
772 iovp->iov_len = BLCKSZ;
773 iovcnt = 1;
774
775 /* Try to merge the rest. */
776 for (int i = 1; i < nblocks; ++i)
777 {
778 void *buffer = buffers[i];
779
780 if (((char *) iovp->iov_base + iovp->iov_len) == buffer)
781 {
782 /* Contiguous with the last iovec. */
783 iovp->iov_len += BLCKSZ;
784 }
785 else
786 {
787 /* Need a new iovec. */
788 iovp++;
789 iovp->iov_base = buffer;
790 iovp->iov_len = BLCKSZ;
791 iovcnt++;
792 }
793 }
794
795 return iovcnt;
796}
797
798/*
799 * mdmaxcombine() -- Return the maximum number of total blocks that can be
800 * combined with an IO starting at blocknum.
801 */
802uint32
804 BlockNumber blocknum)
805{
806 BlockNumber segoff;
807
808 segoff = blocknum % ((BlockNumber) RELSEG_SIZE);
809
810 return RELSEG_SIZE - segoff;
811}
812
813/*
814 * mdreadv() -- Read the specified blocks from a relation.
815 */
816void
818 void **buffers, BlockNumber nblocks)
819{
820 while (nblocks > 0)
821 {
822 struct iovec iov[PG_IOV_MAX];
823 int iovcnt;
824 off_t seekpos;
825 int nbytes;
826 MdfdVec *v;
827 BlockNumber nblocks_this_segment;
828 size_t transferred_this_segment;
829 size_t size_this_segment;
830
831 v = _mdfd_getseg(reln, forknum, blocknum, false,
833
834 seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
835
836 Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
837
838 nblocks_this_segment =
839 Min(nblocks,
840 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
841 nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));
842
843 if (nblocks_this_segment != nblocks)
844 elog(ERROR, "read crosses segment boundary");
845
846 iovcnt = buffers_to_iovec(iov, buffers, nblocks_this_segment);
847 size_this_segment = nblocks_this_segment * BLCKSZ;
848 transferred_this_segment = 0;
849
850 /*
851 * Inner loop to continue after a short read. We'll keep going until
852 * we hit EOF rather than assuming that a short read means we hit the
853 * end.
854 */
855 for (;;)
856 {
857 TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
861 reln->smgr_rlocator.backend);
862 nbytes = FileReadV(v->mdfd_vfd, iov, iovcnt, seekpos,
863 WAIT_EVENT_DATA_FILE_READ);
864 TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
869 nbytes,
870 size_this_segment - transferred_this_segment);
871
872#ifdef SIMULATE_SHORT_READ
873 nbytes = Min(nbytes, 4096);
874#endif
875
876 if (nbytes < 0)
879 errmsg("could not read blocks %u..%u in file \"%s\": %m",
880 blocknum,
881 blocknum + nblocks_this_segment - 1,
882 FilePathName(v->mdfd_vfd))));
883
884 if (nbytes == 0)
885 {
886 /*
887 * We are at or past EOF, or we read a partial block at EOF.
888 * Normally this is an error; upper levels should never try to
889 * read a nonexistent block. However, if zero_damaged_pages
890 * is ON or we are InRecovery, we should instead return zeroes
891 * without complaining. This allows, for example, the case of
892 * trying to update a block that was later truncated away.
893 */
895 {
896 for (BlockNumber i = transferred_this_segment / BLCKSZ;
897 i < nblocks_this_segment;
898 ++i)
899 memset(buffers[i], 0, BLCKSZ);
900 break;
901 }
902 else
905 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
906 blocknum,
907 blocknum + nblocks_this_segment - 1,
909 transferred_this_segment,
910 size_this_segment)));
911 }
912
913 /* One loop should usually be enough. */
914 transferred_this_segment += nbytes;
915 Assert(transferred_this_segment <= size_this_segment);
916 if (transferred_this_segment == size_this_segment)
917 break;
918
919 /* Adjust position and vectors after a short read. */
920 seekpos += nbytes;
921 iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
922 }
923
924 nblocks -= nblocks_this_segment;
925 buffers += nblocks_this_segment;
926 blocknum += nblocks_this_segment;
927 }
928}
929
930/*
931 * mdwritev() -- Write the supplied blocks at the appropriate location.
932 *
933 * This is to be used only for updating already-existing blocks of a
934 * relation (ie, those before the current EOF). To extend a relation,
935 * use mdextend().
936 */
937void
939 const void **buffers, BlockNumber nblocks, bool skipFsync)
940{
941 /* This assert is too expensive to have on normally ... */
942#ifdef CHECK_WRITE_VS_EXTEND
943 Assert((uint64) blocknum + (uint64) nblocks <= (uint64) mdnblocks(reln, forknum));
944#endif
945
946 while (nblocks > 0)
947 {
948 struct iovec iov[PG_IOV_MAX];
949 int iovcnt;
950 off_t seekpos;
951 int nbytes;
952 MdfdVec *v;
953 BlockNumber nblocks_this_segment;
954 size_t transferred_this_segment;
955 size_t size_this_segment;
956
957 v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
959
960 seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
961
962 Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
963
964 nblocks_this_segment =
965 Min(nblocks,
966 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
967 nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));
968
969 if (nblocks_this_segment != nblocks)
970 elog(ERROR, "write crosses segment boundary");
971
972 iovcnt = buffers_to_iovec(iov, (void **) buffers, nblocks_this_segment);
973 size_this_segment = nblocks_this_segment * BLCKSZ;
974 transferred_this_segment = 0;
975
976 /*
977 * Inner loop to continue after a short write. If the reason is that
978 * we're out of disk space, a future attempt should get an ENOSPC
979 * error from the kernel.
980 */
981 for (;;)
982 {
983 TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
987 reln->smgr_rlocator.backend);
988 nbytes = FileWriteV(v->mdfd_vfd, iov, iovcnt, seekpos,
989 WAIT_EVENT_DATA_FILE_WRITE);
990 TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
995 nbytes,
996 size_this_segment - transferred_this_segment);
997
998#ifdef SIMULATE_SHORT_WRITE
999 nbytes = Min(nbytes, 4096);
1000#endif
1001
1002 if (nbytes < 0)
1003 {
1004 bool enospc = errno == ENOSPC;
1005
1006 ereport(ERROR,
1008 errmsg("could not write blocks %u..%u in file \"%s\": %m",
1009 blocknum,
1010 blocknum + nblocks_this_segment - 1,
1012 enospc ? errhint("Check free disk space.") : 0));
1013 }
1014
1015 /* One loop should usually be enough. */
1016 transferred_this_segment += nbytes;
1017 Assert(transferred_this_segment <= size_this_segment);
1018 if (transferred_this_segment == size_this_segment)
1019 break;
1020
1021 /* Adjust position and iovecs after a short write. */
1022 seekpos += nbytes;
1023 iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
1024 }
1025
1026 if (!skipFsync && !SmgrIsTemp(reln))
1027 register_dirty_segment(reln, forknum, v);
1028
1029 nblocks -= nblocks_this_segment;
1030 buffers += nblocks_this_segment;
1031 blocknum += nblocks_this_segment;
1032 }
1033}
1034
1035
1036/*
 * mdwriteback() -- Tell the kernel to write pages back to storage.
 *
 * This accepts a range of blocks because flushing several pages at once is
 * considerably more efficient than doing so individually.
 *
 * The range starting at blocknum and spanning nblocks blocks is cut at
 * segment boundaries, and one FileWriteback() request is issued for each
 * piece.  If _mdfd_getseg() yields no segment for a piece, the function
 * returns at once and the rest of the range is not flushed.
 *
 * NOTE(review): this copy of the function is incomplete.  The line that
 * names the function and its leading parameters, the first statement of the
 * body, and the tail of the _mdfd_getseg() call are not present here.
 * Recover them from the real source file before building.
 */
1042void
1044 BlockNumber blocknum, BlockNumber nblocks)
1045{
1047
1048 /*
 * Issue flush requests in as few requests as possible; have to split at
 * segment boundaries though, since those are actually separate files.
 */
1052 while (nblocks > 0)
1053 {
/* Default: flush everything that is left in one request. */
1054 BlockNumber nflush = nblocks;
1055 off_t seekpos;
1056 MdfdVec *v;
1057 int segnum_start,
1058 segnum_end;
1059
1060 v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
1062
1063 /*
 * We might be flushing buffers of already removed relations, that's
 * ok, just ignore that case. If the segment file wasn't open already
 * (ie from a recent mdwrite()), then we don't want to re-open it, to
 * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave
 * us with a descriptor to a file that is about to be unlinked.
 */
1070 if (!v)
1071 return;
1072
1073 /* number of the segment that holds the first block to flush */
1074 segnum_start = blocknum / RELSEG_SIZE;
1075
1076 /*
 * Number of the segment that holds the last block to flush.  If it
 * differs from segnum_start the range crosses a segment boundary, so
 * this pass only flushes up to the end of the current segment.
 */
1077 segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
1078 if (segnum_start != segnum_end)
1079 nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
1080
1081 Assert(nflush >= 1);
1082 Assert(nflush <= nblocks);
1083
/* byte offset of the first block within its segment file */
1084 seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
1085
1086 FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);
1087
/* advance past the blocks just handled */
1088 nblocks -= nflush;
1089 blocknum += nflush;
1090 }
1091}
1092
1093/*
1094 * mdnblocks() -- Get the number of blocks stored in a relation.
1095 *
1096 * Important side effect: all active segments of the relation are opened
1097 * and added to the md_seg_fds array. If this routine has not been
1098 * called, then only segments up to the last one actually touched
1099 * are present in the array.
1100 */
1103{
1104 MdfdVec *v;
1105 BlockNumber nblocks;
1106 BlockNumber segno;
1107
1108 mdopenfork(reln, forknum, EXTENSION_FAIL);
1109
1110 /* mdopen has opened the first segment */
1111 Assert(reln->md_num_open_segs[forknum] > 0);
1112
1113 /*
1114 * Start from the last open segments, to avoid redundant seeks. We have
1115 * previously verified that these segments are exactly RELSEG_SIZE long,
1116 * and it's useless to recheck that each time.
1117 *
1118 * NOTE: this assumption could only be wrong if another backend has
1119 * truncated the relation. We rely on higher code levels to handle that
1120 * scenario by closing and re-opening the md fd, which is handled via
1121 * relcache flush. (Since the checkpointer doesn't participate in
1122 * relcache flush, it could have segment entries for inactive segments;
1123 * that's OK because the checkpointer never needs to compute relation
1124 * size.)
1125 */
1126 segno = reln->md_num_open_segs[forknum] - 1;
1127 v = &reln->md_seg_fds[forknum][segno];
1128
1129 for (;;)
1130 {
1131 nblocks = _mdnblocks(reln, forknum, v);
1132 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1133 elog(FATAL, "segment too big");
1134 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1135 return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
1136
1137 /*
1138 * If segment is exactly RELSEG_SIZE, advance to next one.
1139 */
1140 segno++;
1141
1142 /*
1143 * We used to pass O_CREAT here, but that has the disadvantage that it
1144 * might create a segment which has vanished through some operating
1145 * system misadventure. In such a case, creating the segment here
1146 * undermines _mdfd_getseg's attempts to notice and report an error
1147 * upon access to a missing segment.
1148 */
1149 v = _mdfd_openseg(reln, forknum, segno, 0);
1150 if (v == NULL)
1151 return segno * ((BlockNumber) RELSEG_SIZE);
1152 }
1153}
1154
1155/*
 * mdtruncate() -- Truncate relation to specified number of blocks.
 *
 * curnblk is the caller's view of the current length and nblocks the
 * requested new length, both in blocks.  A request to grow the relation is
 * silently ignored while InRecovery is set and raises an ERROR otherwise; a
 * request for the current length does nothing.
 *
 * Guaranteed not to allocate memory, so it can be used in a critical section.
 * Caller must have called smgrnblocks() to obtain curnblk while holding a
 * sufficient lock to prevent a change in relation size, and not used any smgr
 * functions for this relation or handled interrupts in between. This makes
 * sure we have opened all active segments, so that truncate loop will get
 * them all!
 *
 * NOTE(review): this copy of the function is incomplete.  The line that
 * names the function and its leading parameters is absent, and both
 * FileTruncate() failure reports have lost the line that opens their
 * ereport() argument list (the second has also lost the argument matching
 * its %s).  Recover them from the real source file before building.
 */
1165void
1167 BlockNumber curnblk, BlockNumber nblocks)
1168{
1169 BlockNumber priorblocks;
1170 int curopensegs;
1171
1172 if (nblocks > curnblk)
1173 {
1174 /* Bogus request ... but no complaint if InRecovery */
1175 if (InRecovery)
1176 return;
1177 ereport(ERROR,
1178 (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
1179 relpath(reln->smgr_rlocator, forknum),
1180 nblocks, curnblk)));
1181 }
1182 if (nblocks == curnblk)
1183 return; /* no work */
1184
1185 /*
 * Truncate segments, starting at the last one. Starting at the end makes
 * managing the memory for the fd array easier, should there be errors.
 */
1189 curopensegs = reln->md_num_open_segs[forknum];
1190 while (curopensegs > 0)
1191 {
1192 MdfdVec *v;
1193
/*
 * Number of blocks that precede the segment under inspection.
 *
 * NOTE(review): every other segment-size product in this file casts
 * RELSEG_SIZE to BlockNumber first; this one does not.  If RELSEG_SIZE is
 * a plain int constant the multiplication is carried out in int before
 * the assignment -- confirm it cannot overflow for very large relations.
 */
1194 priorblocks = (curopensegs - 1) * RELSEG_SIZE;
1195
1196 v = &reln->md_seg_fds[forknum][curopensegs - 1];
1197
1198 if (priorblocks > nblocks)
1199 {
1200 /*
 * This segment is no longer active. We truncate the file, but do
 * not delete it, for reasons explained in the header comments.
 */
1204 if (FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
1205 ereport(ERROR,
1207 errmsg("could not truncate file \"%s\": %m",
1208 FilePathName(v->mdfd_vfd))));
1209
1210 if (!SmgrIsTemp(reln))
1211 register_dirty_segment(reln, forknum, v);
1212
1213 /* we never drop the 1st segment */
1214 Assert(v != &reln->md_seg_fds[forknum][0]);
1215
/* forget the segment: close it and shrink the open-segment count */
1216 FileClose(v->mdfd_vfd);
1217 _fdvec_resize(reln, forknum, curopensegs - 1);
1218 }
1219 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
1220 {
1221 /*
 * This is the last segment we want to keep. Truncate the file to
 * the right length. NOTE: if nblocks is exactly a multiple K of
 * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
 * keep it. This adheres to the invariant given in the header
 * comments.
 */
1228 BlockNumber lastsegblocks = nblocks - priorblocks;
1229
1230 if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
1231 ereport(ERROR,
1233 errmsg("could not truncate file \"%s\" to %u blocks: %m",
1235 nblocks)));
1236 if (!SmgrIsTemp(reln))
1237 register_dirty_segment(reln, forknum, v);
1238 }
1239 else
1240 {
1241 /*
 * We still need this segment, so nothing to do for this and any
 * earlier segment.
 */
1245 break;
1246 }
1247 curopensegs--;
1248 }
1249}
1250
1251/*
1252 * mdregistersync() -- Mark whole relation as needing fsync
1253 */
1254void
1256{
1257 int segno;
1258 int min_inactive_seg;
1259
1260 /*
1261 * NOTE: mdnblocks makes sure we have opened all active segments, so that
1262 * the loop below will get them all!
1263 */
1264 mdnblocks(reln, forknum);
1265
1266 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1267
1268 /*
1269 * Temporarily open inactive segments, then close them after sync. There
1270 * may be some inactive segments left opened after error, but that is
1271 * harmless. We don't bother to clean them up and take a risk of further
1272 * trouble. The next mdclose() will soon close them.
1273 */
1274 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1275 segno++;
1276
1277 while (segno > 0)
1278 {
1279 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1280
1281 register_dirty_segment(reln, forknum, v);
1282
1283 /* Close inactive segments immediately */
1284 if (segno > min_inactive_seg)
1285 {
1286 FileClose(v->mdfd_vfd);
1287 _fdvec_resize(reln, forknum, segno - 1);
1288 }
1289
1290 segno--;
1291 }
1292}
1293
1294/*
 * mdimmedsync() -- Immediately sync a relation to stable storage.
 *
 * Note that only writes already issued are synced; this routine knows
 * nothing of dirty buffers that may exist inside the buffer manager. We
 * sync active and inactive segments; smgrDoPendingSyncs() relies on this.
 * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of
 * some segment, then mdtruncate() renders that segment inactive. If we
 * crash before the next checkpoint syncs the newly-inactive segment, that
 * segment may survive recovery, reintroducing unwanted data into the table.
 *
 * Segments are synced with FileSync() from the highest-numbered one down to
 * segment 0.  Segments that were opened only for the duration of this call
 * are closed again as soon as they have been synced.
 *
 * NOTE(review): this copy of the function is incomplete.  The line that
 * names the function and its parameters is absent, and so are the lines
 * that open the error report for a failed FileSync().  Recover them from
 * the real source file before building.
 */
1305void
1307{
1308 int segno;
1309 int min_inactive_seg;
1310
1311 /*
 * NOTE: mdnblocks makes sure we have opened all active segments, so that
 * the loop below will get them all!
 */
1315 mdnblocks(reln, forknum);
1316
/* segments numbered from here upward were not open before this call */
1317 min_inactive_seg = segno = reln->md_num_open_segs[forknum];
1318
1319 /*
 * Temporarily open inactive segments, then close them after sync. There
 * may be some inactive segments left opened after fsync() error, but that
 * is harmless. We don't bother to clean them up and take a risk of
 * further trouble. The next mdclose() will soon close them.
 */
1325 while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
1326 segno++;
1327
1328 while (segno > 0)
1329 {
1330 MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
1331
1332 /*
 * fsyncs done through mdimmedsync() should be tracked in a separate
 * IOContext than those done through mdsyncfiletag() to differentiate
 * between unavoidable client backend fsyncs (e.g. those done during
 * index build) and those which ideally would have been done by the
 * checkpointer. Since other IO operations bypassing the buffer
 * manager could also be tracked in such an IOContext, wait until
 * these are also tracked to track immediate fsyncs.
 */
1341 if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
1344 errmsg("could not fsync file \"%s\": %m",
1345 FilePathName(v->mdfd_vfd))));
1346
1347 /* Close inactive segments immediately */
1348 if (segno > min_inactive_seg)
1349 {
1350 FileClose(v->mdfd_vfd);
1351 _fdvec_resize(reln, forknum, segno - 1);
1352 }
1353
1354 segno--;
1355 }
1356}
1357
1358/*
 * register_dirty_segment() -- Mark a relation segment as needing fsync
 *
 * If there is a local pending-ops table, just make an entry in it for
 * ProcessSyncRequests to process later. Otherwise, try to pass off the
 * fsync request to the checkpointer process. If that fails, just do the
 * fsync locally before returning (we hope this will not happen often
 * enough to be a performance problem).
 *
 * Must not be called for a temporary relation; this is asserted.
 *
 * NOTE(review): this copy of the function is incomplete.  The line that
 * names the function and its parameters is absent, as are the openers of
 * both report calls, the statement that sets io_start, and the first line
 * of the closing statistics call.  Recover them from the real source file
 * before building.
 */
1367static void
1369{
1370 FileTag tag;
1371
1372 INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno);
1373
1374 /* Temp relations should never be fsync'd */
1375 Assert(!SmgrIsTemp(reln));
1376
/* a false result means the request was not accepted; fall back to a local fsync */
1377 if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
1378 {
1379 instr_time io_start;
1380
1382 (errmsg_internal("could not forward fsync request because request queue is full")));
1383
1385
1386 if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
1389 errmsg("could not fsync file \"%s\": %m",
1390 FilePathName(seg->mdfd_vfd))));
1391
1392 /*
 * We have no way of knowing if the current IOContext is
 * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
 * point, so count the fsync as being in the IOCONTEXT_NORMAL
 * IOContext. This is probably okay, because the number of backend
 * fsyncs doesn't say anything about the efficacy of the
 * BufferAccessStrategy. And counting both fsyncs done in
 * IOCONTEXT_NORMAL and IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] under
 * IOCONTEXT_NORMAL is likely clearer when investigating the number of
 * backend fsyncs.
 */
1404 IOOP_FSYNC, io_start, 1, 0);
1405 }
1406}
1407
1408/*
 * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint
 *
 * Builds a FileTag from rlocator.locator, forknum and segno and submits it
 * as a SYNC_UNLINK_REQUEST, with retryOnError set to true.  The result of
 * RegisterSyncRequest() is not examined.
 *
 * NOTE(review): this copy of the function is incomplete.  The line that
 * names the function and its leading parameters is absent, and so is the
 * statement that belonged under the "temp relations" comment.  Recover them
 * from the real source file before building.
 */
1411static void
1413 BlockNumber segno)
1414{
1415 FileTag tag;
1416
1417 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1418
1419 /* Should never be used with temp relations */
1421
1422 RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
1423}
1424
1425/*
1426 * register_forget_request() -- forget any fsyncs for a relation fork's segment
1427 */
1428static void
1430 BlockNumber segno)
1431{
1432 FileTag tag;
1433
1434 INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
1435
1436 RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
1437}
1438
1439/*
 * ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB
 *
 * Fills a RelFileLocator with the given database OID, leaving the
 * tablespace and relation number fields zero, and submits a
 * SYNC_FILTER_REQUEST with retryOnError set to true.
 *
 * NOTE(review): this copy of the function is incomplete.  The line that
 * names the function and its parameter is absent, and so is the statement
 * that initializes "tag" before it is passed to RegisterSyncRequest().
 * Recover them from the real source file before building.
 */
1442void
1444{
1445 FileTag tag;
1446 RelFileLocator rlocator;
1447
/* only the database OID is meaningful; the other fields are zeroed */
1448 rlocator.dbOid = dbid;
1449 rlocator.spcOid = 0;
1450 rlocator.relNumber = 0;
1451
1453
1454 RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
1455}
1456
1457/*
1458 * DropRelationFiles -- drop files of all given relations
1459 */
1460void
1461DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo)
1462{
1463 SMgrRelation *srels;
1464 int i;
1465
1466 srels = palloc(sizeof(SMgrRelation) * ndelrels);
1467 for (i = 0; i < ndelrels; i++)
1468 {
1469 SMgrRelation srel = smgropen(delrels[i], INVALID_PROC_NUMBER);
1470
1471 if (isRedo)
1472 {
1473 ForkNumber fork;
1474
1475 for (fork = 0; fork <= MAX_FORKNUM; fork++)
1476 XLogDropRelation(delrels[i], fork);
1477 }
1478 srels[i] = srel;
1479 }
1480
1481 smgrdounlinkall(srels, ndelrels, isRedo);
1482
1483 for (i = 0; i < ndelrels; i++)
1484 smgrclose(srels[i]);
1485 pfree(srels);
1486}
1487
1488
1489/*
1490 * _fdvec_resize() -- Resize the fork's open segments array
1491 */
1492static void
1494 ForkNumber forknum,
1495 int nseg)
1496{
1497 if (nseg == 0)
1498 {
1499 if (reln->md_num_open_segs[forknum] > 0)
1500 {
1501 pfree(reln->md_seg_fds[forknum]);
1502 reln->md_seg_fds[forknum] = NULL;
1503 }
1504 }
1505 else if (reln->md_num_open_segs[forknum] == 0)
1506 {
1507 reln->md_seg_fds[forknum] =
1508 MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
1509 }
1510 else if (nseg > reln->md_num_open_segs[forknum])
1511 {
1512 /*
1513 * It doesn't seem worthwhile complicating the code to amortize
1514 * repalloc() calls. Those are far faster than PathNameOpenFile() or
1515 * FileClose(), and the memory context internally will sometimes avoid
1516 * doing an actual reallocation.
1517 */
1518 reln->md_seg_fds[forknum] =
1519 repalloc(reln->md_seg_fds[forknum],
1520 sizeof(MdfdVec) * nseg);
1521 }
1522 else
1523 {
1524 /*
1525 * We don't reallocate a smaller array, because we want mdtruncate()
1526 * to be able to promise that it won't allocate memory, so that it is
1527 * allowed in a critical section. This means that a bit of space in
1528 * the array is now wasted, until the next time we add a segment and
1529 * reallocate.
1530 */
1531 }
1532
1533 reln->md_num_open_segs[forknum] = nseg;
1534}
1535
1536/*
1537 * Return the filename for the specified segment of the relation. The
1538 * returned string is palloc'd.
1539 */
1540static char *
1542{
1543 char *path,
1544 *fullpath;
1545
1546 path = relpath(reln->smgr_rlocator, forknum);
1547
1548 if (segno > 0)
1549 {
1550 fullpath = psprintf("%s.%u", path, segno);
1551 pfree(path);
1552 }
1553 else
1554 fullpath = path;
1555
1556 return fullpath;
1557}
1558
1559/*
1560 * Open the specified segment of the relation,
1561 * and make a MdfdVec object for it. Returns NULL on failure.
1562 */
1563static MdfdVec *
1565 int oflags)
1566{
1567 MdfdVec *v;
1568 File fd;
1569 char *fullpath;
1570
1571 fullpath = _mdfd_segpath(reln, forknum, segno);
1572
1573 /* open the file */
1574 fd = PathNameOpenFile(fullpath, _mdfd_open_flags() | oflags);
1575
1576 pfree(fullpath);
1577
1578 if (fd < 0)
1579 return NULL;
1580
1581 /*
1582 * Segments are always opened in order from lowest to highest, so we must
1583 * be adding a new one at the end.
1584 */
1585 Assert(segno == reln->md_num_open_segs[forknum]);
1586
1587 _fdvec_resize(reln, forknum, segno + 1);
1588
1589 /* fill the entry */
1590 v = &reln->md_seg_fds[forknum][segno];
1591 v->mdfd_vfd = fd;
1592 v->mdfd_segno = segno;
1593
1594 Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1595
1596 /* all done */
1597 return v;
1598}
1599
1600/*
 * _mdfd_getseg() -- Find the segment of the relation holding the
 * specified block.
 *
 * If the segment doesn't exist, we ereport, return NULL, or create the
 * segment, according to "behavior". Note: skipFsync is only used in the
 * EXTENSION_CREATE case.
 *
 * The target segment number is blkno divided by RELSEG_SIZE.  A segment
 * that is already open is returned straight away.  Otherwise each segment
 * between the last open one and the target is opened in turn, and the
 * entry for the target segment is returned.
 *
 * NOTE(review): this copy of the function is incomplete.  The line that
 * names the function and its leading parameters is absent, as are the
 * operand of the opening Assert, the final argument of palloc_aligned(),
 * and the line that opens the argument list of both ereport() calls.
 * Recover them from the real source file before building.
 */
1608static MdfdVec *
1610 bool skipFsync, int behavior)
1611{
1612 MdfdVec *v;
1613 BlockNumber targetseg;
1614 BlockNumber nextsegno;
1615
1616 /* some way to handle non-existent segments needs to be specified */
1617 Assert(behavior &
1620
1621 targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1622
1623 /* if an existing and opened segment, we're done */
1624 if (targetseg < reln->md_num_open_segs[forknum])
1625 {
1626 v = &reln->md_seg_fds[forknum][targetseg];
1627 return v;
1628 }
1629
1630 /* The caller only wants the segment if we already had it open. */
1631 if (behavior & EXTENSION_DONT_OPEN)
1632 return NULL;
1633
1634 /*
 * The target segment is not yet open. Iterate over all the segments
 * between the last opened and the target segment. This way missing
 * segments either raise an error, or get created (according to
 * 'behavior'). Start with either the last opened, or the first segment if
 * none was opened before.
 */
1641 if (reln->md_num_open_segs[forknum] > 0)
1642 v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
1643 else
1644 {
1645 v = mdopenfork(reln, forknum, behavior);
1646 if (!v)
1647 return NULL; /* if behavior & EXTENSION_RETURN_NULL */
1648 }
1649
/* on each pass v is the segment just before nextsegno */
1650 for (nextsegno = reln->md_num_open_segs[forknum];
1651 nextsegno <= targetseg; nextsegno++)
1652 {
1653 BlockNumber nblocks = _mdnblocks(reln, forknum, v);
1654 int flags = 0;
1655
1656 Assert(nextsegno == v->mdfd_segno + 1);
1657
1658 if (nblocks > ((BlockNumber) RELSEG_SIZE))
1659 elog(FATAL, "segment too big");
1660
1661 if ((behavior & EXTENSION_CREATE) ||
1662 (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
1663 {
1664 /*
 * Normally we will create new segments only if authorized by the
 * caller (i.e., we are doing mdextend()). But when doing WAL
 * recovery, create segments anyway; this allows cases such as
 * replaying WAL data that has a write into a high-numbered
 * segment of a relation that was later deleted. We want to go
 * ahead and create the segments so we can finish out the replay.
 *
 * We have to maintain the invariant that segments before the last
 * active segment are of size RELSEG_SIZE; therefore, if
 * extending, pad them out with zeroes if needed. (This only
 * matters if in recovery, or if the caller is extending the
 * relation discontiguously, but that can happen in hash indexes.)
 */
1678 if (nblocks < ((BlockNumber) RELSEG_SIZE))
1679 {
1680 char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
1682
/* write the last block of the previous segment, which pads it to full size */
1683 mdextend(reln, forknum,
1684 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1685 zerobuf, skipFsync);
1686 pfree(zerobuf);
1687 }
1688 flags = O_CREAT;
1689 }
1690 else if (nblocks < ((BlockNumber) RELSEG_SIZE))
1691 {
1692 /*
 * When not extending, only open the next segment if the current
 * one is exactly RELSEG_SIZE. If not (this branch), either
 * return NULL or fail.
 */
1697 if (behavior & EXTENSION_RETURN_NULL)
1698 {
1699 /*
 * Some callers discern between reasons for _mdfd_getseg()
 * returning NULL based on errno. As there's no failing
 * syscall involved in this case, explicitly set errno to
 * ENOENT, as that seems the closest interpretation.
 */
1705 errno = ENOENT;
1706 return NULL;
1707 }
1708
1709 ereport(ERROR,
1711 errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
1712 _mdfd_segpath(reln, forknum, nextsegno),
1713 blkno, nblocks)));
1714 }
1715
1716 v = _mdfd_openseg(reln, forknum, nextsegno, flags);
1717
1718 if (v == NULL)
1719 {
/* a possibly-deleted file is tolerated only when the caller accepts NULL */
1720 if ((behavior & EXTENSION_RETURN_NULL) &&
1721 FILE_POSSIBLY_DELETED(errno))
1722 return NULL;
1723 ereport(ERROR,
1725 errmsg("could not open file \"%s\" (target block %u): %m",
1726 _mdfd_segpath(reln, forknum, nextsegno),
1727 blkno)));
1728 }
1729 }
1730
1731 return v;
1732}
1733
1734/*
 * Get number of blocks present in a single disk file
 *
 * The size reported by FileSize() for seg's file is divided by BLCKSZ, so a
 * partial block at the end of the file is not counted.  A negative size is
 * reported with ereport(ERROR).
 *
 * NOTE(review): this copy of the function is incomplete.  The line that
 * names the function and its parameters is absent, and so is the line that
 * opens the ereport() argument list.  Recover them from the real source
 * file before building.
 */
1737static BlockNumber
1739{
1740 off_t len;
1741
1742 len = FileSize(seg->mdfd_vfd);
1743 if (len < 0)
1744 ereport(ERROR,
1746 errmsg("could not seek to end of file \"%s\": %m",
1747 FilePathName(seg->mdfd_vfd))));
1748 /* note that this calculation will ignore any partial block at EOF */
1749 return (BlockNumber) (len / BLCKSZ);
1750}
1751
1752/*
 * Sync a file to disk, given a file tag. Write the path into an output
 * buffer so the caller can use it in error messages.
 *
 * path receives at most MAXPGPATH bytes.  If the segment named by the tag
 * is already open its descriptor is reused; otherwise the file is opened
 * for the duration of the call and closed again afterwards.
 *
 * Return 0 on success, -1 on failure, with errno set.
 *
 * NOTE(review): this copy of the function is incomplete.  The declaration
 * of "reln", the statement that sets io_start, and the first line of the
 * closing statistics call are absent.  Recover them from the real source
 * file before building.
 */
1758int
1759mdsyncfiletag(const FileTag *ftag, char *path)
1760{
1762 File file;
1763 instr_time io_start;
1764 bool need_to_close;
1765 int result,
1766 save_errno;
1767
1768 /* See if we already have the file open, or need to open it. */
1769 if (ftag->segno < reln->md_num_open_segs[ftag->forknum])
1770 {
1771 file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd;
1772 strlcpy(path, FilePathName(file), MAXPGPATH);
1773 need_to_close = false;
1774 }
1775 else
1776 {
1777 char *p;
1778
1779 p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
1780 strlcpy(path, p, MAXPGPATH);
1781 pfree(p);
1782
1783 file = PathNameOpenFile(path, _mdfd_open_flags());
1784 if (file < 0)
1785 return -1;
1786 need_to_close = true;
1787 }
1788
1790
1791 /* Sync the file. */
1792 result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
/* remember errno from the sync; it is put back just before returning */
1793 save_errno = errno;
1794
1795 if (need_to_close)
1796 FileClose(file);
1797
1799 IOOP_FSYNC, io_start, 1, 0);
1800
1801 errno = save_errno;
1802 return result;
1803}
1804
1805/*
1806 * Unlink a file, given a file tag. Write the path into an output
1807 * buffer so the caller can use it in error messages.
1808 *
1809 * Return 0 on success, -1 on failure, with errno set.
1810 */
1811int
1812mdunlinkfiletag(const FileTag *ftag, char *path)
1813{
1814 char *p;
1815
1816 /* Compute the path. */
1817 p = relpathperm(ftag->rlocator, MAIN_FORKNUM);
1818 strlcpy(path, p, MAXPGPATH);
1819 pfree(p);
1820
1821 /* Try to unlink the file. */
1822 return unlink(path);
1823}
1824
1825/*
1826 * Check if a given candidate request matches a given tag, when processing
1827 * a SYNC_FILTER_REQUEST request. This will be called for all pending
1828 * requests to find out whether to forget them.
1829 */
1830bool
1831mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
1832{
1833 /*
1834 * For now we only use filter requests as a way to drop all scheduled
1835 * callbacks relating to a given database, when dropping the database.
1836 * We'll return true for all candidates that have the same database OID as
1837 * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
1838 */
1839 return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
1840}
void TablespaceCreateDbspace(Oid spcOid, Oid dbOid, bool isRedo)
Definition: tablespace.c:112
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
#define MaxBlockNumber
Definition: block.h:35
bool track_io_timing
Definition: bufmgr.c:143
bool zero_damaged_pages
Definition: bufmgr.c:140
#define Min(x, y)
Definition: c.h:961
#define TYPEALIGN(ALIGNVAL, LEN)
Definition: c.h:761
#define Assert(condition)
Definition: c.h:815
#define PG_BINARY
Definition: c.h:1230
uint64_t uint64
Definition: c.h:489
uint32_t uint32
Definition: c.h:488
#define lengthof(array)
Definition: c.h:745
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1157
int errcode_for_file_access(void)
Definition: elog.c:876
int errhint(const char *fmt,...)
Definition: elog.c:1317
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define FATAL
Definition: elog.h:41
#define WARNING
Definition: elog.h:36
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:2132
int io_direct_flags
Definition: fd.c:167
char * FilePathName(File file)
Definition: fd.c:2483
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2319
void FileClose(File file)
Definition: fd.c:1977
int FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2391
int FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2076
int data_sync_elevel(int elevel)
Definition: fd.c:3959
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1574
ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info)
Definition: fd.c:2214
int FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2346
off_t FileSize(File file)
Definition: fd.c:2431
ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info)
Definition: fd.c:2158
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2448
int pg_truncate(const char *path, off_t length)
Definition: fd.c:719
#define IO_DIRECT_DATA
Definition: fd.h:54
#define FILE_POSSIBLY_DELETED(err)
Definition: fd.h:78
int File
Definition: fd.h:51
static ssize_t FileWrite(File file, const void *buffer, size_t amount, off_t offset, uint32 wait_event_info)
Definition: fd.h:208
#define PG_O_DIRECT
Definition: fd.h:97
#define MCXT_ALLOC_ZERO
Definition: fe_memutils.h:30
int compute_remaining_iovec(struct iovec *destination, const struct iovec *source, int iovcnt, size_t transferred)
Definition: file_utils.c:593
bool IsBinaryUpgrade
Definition: globals.c:120
int i
Definition: isn.c:72
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1181
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1541
void pfree(void *pointer)
Definition: mcxt.c:1521
MemoryContext TopMemoryContext
Definition: mcxt.c:149
void * palloc(Size size)
Definition: mcxt.c:1317
void * palloc_aligned(Size size, Size alignto, int flags)
Definition: mcxt.c:1511
void mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
Definition: md.c:299
static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
Definition: md.c:1429
#define EXTENSION_CREATE_RECOVERY
Definition: md.c:108
void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber curnblk, BlockNumber nblocks)
Definition: md.c:1166
static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
Definition: md.c:1738
static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
Definition: md.c:336
void mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void **buffers, BlockNumber nblocks, bool skipFsync)
Definition: md.c:938
bool mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
Definition: md.c:1831
bool mdexists(SMgrRelation reln, ForkNumber forknum)
Definition: md.c:163
void mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition: md.c:817
static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
Definition: md.c:1412
#define EXTENSION_DONT_OPEN
Definition: md.c:110
BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: md.c:1102
int mdunlinkfiletag(const FileTag *ftag, char *path)
Definition: md.c:1812
static MemoryContext MdCxt
Definition: md.c:86
void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition: md.c:182
void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: md.c:452
static int do_truncate(const char *path)
Definition: md.c:315
void mdinit(void)
Definition: md.c:150
void mdclose(SMgrRelation reln, ForkNumber forknum)
Definition: md.c:683
void mdzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition: md.c:517
static MdfdVec * _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, int oflags)
Definition: md.c:1564
static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
Definition: md.c:1368
int mdsyncfiletag(const FileTag *ftag, char *path)
Definition: md.c:1759
void mdwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition: md.c:1043
uint32 mdmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition: md.c:803
static MdfdVec * _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior)
Definition: md.c:1609
#define EXTENSION_RETURN_NULL
Definition: md.c:104
static char * _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
Definition: md.c:1541
bool mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition: md.c:706
void mdregistersync(SMgrRelation reln, ForkNumber forknum)
Definition: md.c:1255
void mdopen(SMgrRelation reln)
Definition: md.c:672
#define EXTENSION_CREATE
Definition: md.c:106
static int _mdfd_open_flags(void)
Definition: md.c:136
#define INIT_MD_FILETAG(a, xx_rlocator, xx_forknum, xx_segno)
Definition: md.c:90
#define EXTENSION_FAIL
Definition: md.c:102
static MdfdVec * mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
Definition: md.c:629
void DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo)
Definition: md.c:1461
static int buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks)
Definition: md.c:754
static void _fdvec_resize(SMgrRelation reln, ForkNumber forknum, int nseg)
Definition: md.c:1493
void ForgetDatabaseSyncRequests(Oid dbid)
Definition: md.c:1443
void mdimmedsync(SMgrRelation reln, ForkNumber forknum)
Definition: md.c:1306
struct _MdfdVec MdfdVec
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:160
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
#define MAXPGPATH
#define PG_IO_ALIGN_SIZE
const void size_t len
#define PG_IOV_MAX
Definition: pg_iovec.h:37
@ IOOBJECT_RELATION
Definition: pgstat.h:275
@ IOCONTEXT_NORMAL
Definition: pgstat.h:287
@ IOOP_FSYNC
Definition: pgstat.h:306
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition: pgstat_io.c:90
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:120
#define sprintf
Definition: port.h:241
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
unsigned int Oid
Definition: postgres_ext.h:32
static int fd(const char *x, int i)
Definition: preproc-init.c:105
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
#define RelFileLocatorBackendIsTemp(rlocator)
ForkNumber
Definition: relpath.h:56
@ MAIN_FORKNUM
Definition: relpath.h:58
@ InvalidForkNumber
Definition: relpath.h:57
#define MAX_FORKNUM
Definition: relpath.h:70
#define relpath(rlocator, forknum)
Definition: relpath.h:102
#define relpathperm(rlocator, forknum)
Definition: relpath.h:98
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition: smgr.c:201
void smgrclose(SMgrRelation reln)
Definition: smgr.c:323
void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
Definition: smgr.c:465
#define SmgrIsTemp(smgr)
Definition: smgr.h:73
Definition: sync.h:51
RelFileLocator rlocator
Definition: sync.h:54
int16 forknum
Definition: sync.h:53
uint64 segno
Definition: sync.h:55
RelFileLocator locator
RelFileNumber relNumber
int md_num_open_segs[MAX_FORKNUM+1]
Definition: smgr.h:60
struct _MdfdVec * md_seg_fds[MAX_FORKNUM+1]
Definition: smgr.h:61
RelFileLocatorBackend smgr_rlocator
Definition: smgr.h:37
Definition: md.c:81
File mdfd_vfd
Definition: md.c:82
BlockNumber mdfd_segno
Definition: md.c:83
bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, bool retryOnError)
Definition: sync.c:580
@ SYNC_FILTER_REQUEST
Definition: sync.h:28
@ SYNC_FORGET_REQUEST
Definition: sync.h:27
@ SYNC_UNLINK_REQUEST
Definition: sync.h:26
@ SYNC_REQUEST
Definition: sync.h:25
bool InRecovery
Definition: xlogutils.c:50
void XLogDropRelation(RelFileLocator rlocator, ForkNumber forknum)
Definition: xlogutils.c:641