PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
fd.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * fd.c
4 * Virtual file descriptor code.
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * IDENTIFICATION
10 * src/backend/storage/file/fd.c
11 *
12 * NOTES:
13 *
14 * This code manages a cache of 'virtual' file descriptors (VFDs).
15 * The server opens many file descriptors for a variety of reasons,
16 * including base tables, scratch files (e.g., sort and hash spool
17 * files), and random calls to C library routines like system(3); it
18 * is quite easy to exceed system limits on the number of open files a
19 * single process can have. (This is around 1024 on many modern
20 * operating systems, but may be lower on others.)
21 *
22 * VFDs are managed as an LRU pool, with actual OS file descriptors
23 * being opened and closed as needed. Obviously, if a routine is
24 * opened using these interfaces, all subsequent operations must also
25 * be through these interfaces (the File type is not a real file
26 * descriptor).
27 *
28 * For this scheme to work, most (if not all) routines throughout the
29 * server should use these interfaces instead of calling the C library
30 * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 * may find ourselves short of real file descriptors anyway.
32 *
33 * INTERFACE ROUTINES
34 *
35 * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 * A File opened with OpenTemporaryFile is automatically deleted when the
37 * File is closed, either explicitly or implicitly at end of transaction or
38 * process exit. PathNameOpenFile is intended for files that are held open
39 * for a long time, like relation files. It is the caller's responsibility
40 * to close them, there is no automatic mechanism in fd.c for that.
41 *
42 * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43 * temporary files that have names so that they can be shared between
44 * backends. Such files are automatically closed and count against the
45 * temporary file limit of the backend that creates them, but unlike anonymous
46 * files they are not automatically deleted. See sharedfileset.c for a shared
47 * ownership mechanism that provides automatic cleanup for shared files when
48 * the last of a group of backends detaches.
49 *
50 * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51 * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52 * They behave like the corresponding native functions, except that the handle
53 * is registered with the current subtransaction, and will be automatically
54 * closed at abort. These are intended mainly for short operations like
55 * reading a configuration file; there is a limit on the number of files that
56 * can be opened using these functions at any one time.
57 *
58 * Finally, BasicOpenFile is just a thin wrapper around open() that can
59 * release file descriptors in use by the virtual file descriptors if
60 * necessary. There is no automatic cleanup of file descriptors returned by
61 * BasicOpenFile, it is solely the caller's responsibility to close the file
62 * descriptor by calling close(2).
63 *
64 * If a non-virtual file descriptor needs to be held open for any length of
65 * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66 * (and eventually ReleaseExternalFD), so that we can take it into account
67 * while deciding how many VFDs can be open. This applies to FDs obtained
68 * with BasicOpenFile as well as those obtained without use of any fd.c API.
69 *
70 *-------------------------------------------------------------------------
71 */
72
73#include "postgres.h"
74
75#include <dirent.h>
76#include <sys/file.h>
77#include <sys/param.h>
78#include <sys/resource.h> /* for getrlimit */
79#include <sys/stat.h>
80#include <sys/types.h>
81#ifndef WIN32
82#include <sys/mman.h>
83#endif
84#include <limits.h>
85#include <unistd.h>
86#include <fcntl.h>
87
88#include "access/xact.h"
89#include "access/xlog.h"
91#include "common/file_perm.h"
92#include "common/file_utils.h"
93#include "common/pg_prng.h"
94#include "miscadmin.h"
95#include "pgstat.h"
96#include "postmaster/startup.h"
97#include "storage/aio.h"
98#include "storage/fd.h"
99#include "storage/ipc.h"
100#include "utils/guc.h"
101#include "utils/guc_hooks.h"
102#include "utils/resowner.h"
103#include "utils/varlena.h"
104
105/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
106#if defined(HAVE_SYNC_FILE_RANGE)
107#define PG_FLUSH_DATA_WORKS 1
108#elif !defined(WIN32) && defined(MS_ASYNC)
109#define PG_FLUSH_DATA_WORKS 1
110#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
111#define PG_FLUSH_DATA_WORKS 1
112#endif
113
114/*
115 * We must leave some file descriptors free for system(), the dynamic loader,
116 * and other code that tries to open files without consulting fd.c. This
117 * is the number left free. (While we try fairly hard to prevent EMFILE
118 * errors, there's never any guarantee that we won't get ENFILE due to
119 * other processes chewing up FDs. So it's a bad idea to try to open files
120 * without consulting fd.c. Nonetheless we cannot control all code.)
121 *
122 * Because this is just a fixed setting, we are effectively assuming that
123 * no such code will leave FDs open over the long term; otherwise the slop
124 * is likely to be insufficient. Note in particular that we expect that
125 * loading a shared library does not result in any permanent increase in
126 * the number of open files. (This appears to be true on most if not
127 * all platforms as of Feb 2004.)
128 */
129#define NUM_RESERVED_FDS 10
130
131/*
132 * If we have fewer than this many usable FDs after allowing for the reserved
133 * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
134 * much less than that. Note that this value ensures numExternalFDs can be
135 * at least 16; as of this writing, the contrib/postgres_fdw regression tests
136 * will not pass unless that can grow to at least 14.)
137 */
138#define FD_MINFREE 48
139
140/*
141 * A number of platforms allow individual processes to open many more files
142 * than they can really support when *many* processes do the same thing.
143 * This GUC parameter lets the DBA limit max_safe_fds to something less than
144 * what the postmaster's initial probe suggests will work.
145 */
147
148/*
149 * Maximum number of file descriptors to open for operations that fd.c knows
150 * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
151 * to a conservative value, and remains that way indefinitely in bootstrap or
152 * standalone-backend cases. In normal postmaster operation, the postmaster
153 * calls set_max_safe_fds() late in initialization to update the value, and
154 * that value is then inherited by forked subprocesses.
155 *
156 * Note: the value of max_files_per_process is taken into account while
157 * setting this variable, and so need not be tested separately.
158 */
159int max_safe_fds = FD_MINFREE; /* default if not changed */
160
161/* Whether it is safe to continue running after fsync() fails. */
162bool data_sync_retry = false;
163
164/* How SyncDataDirectory() should do its job. */
166
167/* Which kinds of files should be opened with PG_O_DIRECT. */
169
170/* Debugging.... */
171
172#ifdef FDDEBUG
173#define DO_DB(A) \
174 do { \
175 int _do_db_save_errno = errno; \
176 A; \
177 errno = _do_db_save_errno; \
178 } while (0)
179#else
180#define DO_DB(A) \
181 ((void) 0)
182#endif
183
184#define VFD_CLOSED (-1)
185
186#define FileIsValid(file) \
187 ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
188
189#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
190
191/* these are the assigned bits in fdstate below: */
192#define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
193#define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
194#define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
195
196typedef struct vfd
197{
198 int fd; /* current FD, or VFD_CLOSED if none */
199 unsigned short fdstate; /* bitflags for VFD's state */
200 ResourceOwner resowner; /* owner, for automatic cleanup */
201 File nextFree; /* link to next free VFD, if in freelist */
202 File lruMoreRecently; /* doubly linked recency-of-use list */
204 off_t fileSize; /* current size of file (0 if not temporary) */
205 char *fileName; /* name of file, or NULL for unused VFD */
206 /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
207 int fileFlags; /* open(2) flags for (re)opening the file */
208 mode_t fileMode; /* mode to pass to open(2) */
210
211/*
212 * Virtual File Descriptor array pointer and size. This grows as
213 * needed. 'File' values are indexes into this array.
214 * Note that VfdCache[0] is not a usable VFD, just a list header.
215 */
216static Vfd *VfdCache;
218
219/*
220 * Number of file descriptors known to be in use by VFD entries.
221 */
222static int nfile = 0;
223
224/*
225 * Flag to tell whether it's worth scanning VfdCache looking for temp files
226 * to close
227 */
228static bool have_xact_temporary_files = false;
229
230/*
231 * Tracks the total size of all temporary files. Note: when temp_file_limit
232 * is being enforced, this cannot overflow since the limit cannot be more
233 * than INT_MAX kilobytes. When not enforcing, it could theoretically
234 * overflow, but we don't care.
235 */
237
238/* Temporary file access initialized and not yet shut down? */
239#ifdef USE_ASSERT_CHECKING
240static bool temporary_files_allowed = false;
241#endif
242
243/*
244 * List of OS handles opened with AllocateFile, AllocateDir and
245 * OpenTransientFile.
246 */
247typedef enum
248{
254
255typedef struct
256{
259 union
260 {
261 FILE *file;
263 int fd;
264 } desc;
266
267static int numAllocatedDescs = 0;
268static int maxAllocatedDescs = 0;
270
271/*
272 * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
273 */
274static int numExternalFDs = 0;
275
276/*
277 * Number of temporary files opened during the current session;
278 * this is used in generation of tempfile names.
279 */
280static long tempFileCounter = 0;
281
282/*
283 * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
284 * indicating that the current database's default tablespace should be used.)
285 * When numTempTableSpaces is -1, this has not been set in the current
286 * transaction.
287 */
288static Oid *tempTableSpaces = NULL;
289static int numTempTableSpaces = -1;
290static int nextTempTableSpace = 0;
291
292
293/*--------------------
294 *
295 * Private Routines
296 *
297 * Delete - delete a file from the Lru ring
298 * LruDelete - remove a file from the Lru ring and close its FD
299 * Insert - put a file at the front of the Lru ring
300 * LruInsert - put a file at the front of the Lru ring and open it
301 * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
302 * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
303 * AllocateVfd - grab a free (or new) file record (from VfdCache)
304 * FreeVfd - free a file record
305 *
306 * The Least Recently Used ring is a doubly linked list that begins and
307 * ends on element zero. Element zero is special -- it doesn't represent
308 * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
309 * anchor that shows us the beginning/end of the ring.
310 * Only VFD elements that are currently really open (have an FD assigned) are
311 * in the Lru ring. Elements that are "virtually" open can be recognized
312 * by having a non-null fileName field.
313 *
314 * example:
315 *
316 * /--less----\ /---------\
317 * v \ v \
318 * #0 --more---> LeastRecentlyUsed --more-\ \
319 * ^\ | |
320 * \\less--> MostRecentlyUsedFile <---/ |
321 * \more---/ \--less--/
322 *
323 *--------------------
324 */
325static void Delete(File file);
326static void LruDelete(File file);
327static void Insert(File file);
328static int LruInsert(File file);
329static bool ReleaseLruFile(void);
330static void ReleaseLruFiles(void);
331static File AllocateVfd(void);
332static void FreeVfd(File file);
333
334static int FileAccess(File file);
335static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
336static bool reserveAllocatedDesc(void);
337static int FreeDesc(AllocateDesc *desc);
338
339static void BeforeShmemExit_Files(int code, Datum arg);
340static void CleanupTempFiles(bool isCommit, bool isProcExit);
341static void RemovePgTempRelationFiles(const char *tsdirname);
342static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
343
344static void walkdir(const char *path,
345 void (*action) (const char *fname, bool isdir, int elevel),
346 bool process_symlinks,
347 int elevel);
348#ifdef PG_FLUSH_DATA_WORKS
349static void pre_sync_fname(const char *fname, bool isdir, int elevel);
350#endif
351static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
352static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
353
354static int fsync_parent_path(const char *fname, int elevel);
355
356
357/* ResourceOwner callbacks to hold virtual file descriptors */
358static void ResOwnerReleaseFile(Datum res);
359static char *ResOwnerPrintFile(Datum res);
360
362{
363 .name = "File",
364 .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
365 .release_priority = RELEASE_PRIO_FILES,
366 .ReleaseResource = ResOwnerReleaseFile,
367 .DebugPrint = ResOwnerPrintFile
368};
369
370/* Convenience wrappers over ResourceOwnerRemember/Forget */
371static inline void
373{
375}
376static inline void
378{
380}
381
382/*
383 * pg_fsync --- do fsync with or without writethrough
384 */
385int
387{
388#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
389 struct stat st;
390
391 /*
392 * Some operating system implementations of fsync() have requirements
393 * about the file access modes that were used when their file descriptor
394 * argument was opened, and these requirements differ depending on whether
395 * the file descriptor is for a directory.
396 *
397 * For any file descriptor that may eventually be handed to fsync(), we
398 * should have opened it with access modes that are compatible with
399 * fsync() on all supported systems, otherwise the code may not be
400 * portable, even if it runs ok on the current system.
401 *
402 * We assert here that a descriptor for a file was opened with write
403 * permissions (either O_RDWR or O_WRONLY) and for a directory without
404 * write permissions (O_RDONLY).
405 *
406 * Ignore any fstat errors and let the follow-up fsync() do its work.
407 * Doing this sanity check here counts for the case where fsync() is
408 * disabled.
409 */
410 if (fstat(fd, &st) == 0)
411 {
412 int desc_flags = fcntl(fd, F_GETFL);
413
414 /*
415 * O_RDONLY is historically 0, so just make sure that for directories
416 * no write flags are used.
417 */
418 if (S_ISDIR(st.st_mode))
419 Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
420 else
421 Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
422 }
423 errno = 0;
424#endif
425
426 /* #if is to skip the wal_sync_method test if there's no need for it */
427#if defined(HAVE_FSYNC_WRITETHROUGH)
430 else
431#endif
433}
434
435
436/*
437 * pg_fsync_no_writethrough --- same as fsync except does nothing if
438 * enableFsync is off
439 */
440int
442{
443 int rc;
444
445 if (!enableFsync)
446 return 0;
447
448retry:
449 rc = fsync(fd);
450
451 if (rc == -1 && errno == EINTR)
452 goto retry;
453
454 return rc;
455}
456
457/*
458 * pg_fsync_writethrough
459 */
460int
462{
463 if (enableFsync)
464 {
465#if defined(F_FULLFSYNC)
466 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
467#else
468 errno = ENOSYS;
469 return -1;
470#endif
471 }
472 else
473 return 0;
474}
475
476/*
477 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
478 */
479int
481{
482 int rc;
483
484 if (!enableFsync)
485 return 0;
486
487retry:
488 rc = fdatasync(fd);
489
490 if (rc == -1 && errno == EINTR)
491 goto retry;
492
493 return rc;
494}
495
496/*
497 * pg_file_exists -- check that a file exists.
498 *
499 * This requires an absolute path to the file. Returns true if the file is
500 * not a directory, false otherwise.
501 */
502bool
504{
505 struct stat st;
506
507 Assert(name != NULL);
508
509 if (stat(name, &st) == 0)
510 return !S_ISDIR(st.st_mode);
511 else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES))
514 errmsg("could not access file \"%s\": %m", name)));
515
516 return false;
517}
518
519/*
520 * pg_flush_data --- advise OS that the described dirty data should be flushed
521 *
522 * offset of 0 with nbytes 0 means that the entire file should be flushed
523 */
524void
525pg_flush_data(int fd, off_t offset, off_t nbytes)
526{
527 /*
528 * Right now file flushing is primarily used to avoid making later
529 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
530 * if fsyncs are disabled - that's a decision we might want to make
531 * configurable at some point.
532 */
533 if (!enableFsync)
534 return;
535
536 /*
537 * We compile all alternatives that are supported on the current platform,
538 * to find portability problems more easily.
539 */
540#if defined(HAVE_SYNC_FILE_RANGE)
541 {
542 int rc;
543 static bool not_implemented_by_kernel = false;
544
545 if (not_implemented_by_kernel)
546 return;
547
548retry:
549
550 /*
551 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
552 * tells the OS that writeback for the specified blocks should be
553 * started, but that we don't want to wait for completion. Note that
554 * this call might block if too much dirty data exists in the range.
555 * This is the preferable method on OSs supporting it, as it works
556 * reliably when available (contrast to msync()) and doesn't flush out
557 * clean data (like FADV_DONTNEED).
558 */
559 rc = sync_file_range(fd, offset, nbytes,
560 SYNC_FILE_RANGE_WRITE);
561 if (rc != 0)
562 {
563 int elevel;
564
565 if (rc == EINTR)
566 goto retry;
567
568 /*
569 * For systems that don't have an implementation of
570 * sync_file_range() such as Windows WSL, generate only one
571 * warning and then suppress all further attempts by this process.
572 */
573 if (errno == ENOSYS)
574 {
575 elevel = WARNING;
576 not_implemented_by_kernel = true;
577 }
578 else
579 elevel = data_sync_elevel(WARNING);
580
581 ereport(elevel,
583 errmsg("could not flush dirty data: %m")));
584 }
585
586 return;
587 }
588#endif
589#if !defined(WIN32) && defined(MS_ASYNC)
590 {
591 void *p;
592 static int pagesize = 0;
593
594 /*
595 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
596 * writeback. On linux it only does so if MS_SYNC is specified, but
597 * then it does the writeback synchronously. Luckily all common linux
598 * systems have sync_file_range(). This is preferable over
599 * FADV_DONTNEED because it doesn't flush out clean data.
600 *
601 * We map the file (mmap()), tell the kernel to sync back the contents
602 * (msync()), and then remove the mapping again (munmap()).
603 */
604
605 /* mmap() needs actual length if we want to map whole file */
606 if (offset == 0 && nbytes == 0)
607 {
608 nbytes = lseek(fd, 0, SEEK_END);
609 if (nbytes < 0)
610 {
613 errmsg("could not determine dirty data size: %m")));
614 return;
615 }
616 }
617
618 /*
619 * Some platforms reject partial-page mmap() attempts. To deal with
620 * that, just truncate the request to a page boundary. If any extra
621 * bytes don't get flushed, well, it's only a hint anyway.
622 */
623
624 /* fetch pagesize only once */
625 if (pagesize == 0)
626 pagesize = sysconf(_SC_PAGESIZE);
627
628 /* align length to pagesize, dropping any fractional page */
629 if (pagesize > 0)
630 nbytes = (nbytes / pagesize) * pagesize;
631
632 /* fractional-page request is a no-op */
633 if (nbytes <= 0)
634 return;
635
636 /*
637 * mmap could well fail, particularly on 32-bit platforms where there
638 * may simply not be enough address space. If so, silently fall
639 * through to the next implementation.
640 */
641 if (nbytes <= (off_t) SSIZE_MAX)
642 p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
643 else
644 p = MAP_FAILED;
645
646 if (p != MAP_FAILED)
647 {
648 int rc;
649
650 rc = msync(p, (size_t) nbytes, MS_ASYNC);
651 if (rc != 0)
652 {
655 errmsg("could not flush dirty data: %m")));
656 /* NB: need to fall through to munmap()! */
657 }
658
659 rc = munmap(p, (size_t) nbytes);
660 if (rc != 0)
661 {
662 /* FATAL error because mapping would remain */
665 errmsg("could not munmap() while flushing data: %m")));
666 }
667
668 return;
669 }
670 }
671#endif
672#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
673 {
674 int rc;
675
676 /*
677 * Signal the kernel that the passed in range should not be cached
678 * anymore. This has the, desired, side effect of writing out dirty
679 * data, and the, undesired, side effect of likely discarding useful
680 * clean cached blocks. For the latter reason this is the least
681 * preferable method.
682 */
683
684 rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
685
686 if (rc != 0)
687 {
688 /* don't error out, this is just a performance optimization */
691 errmsg("could not flush dirty data: %m")));
692 }
693
694 return;
695 }
696#endif
697}
698
/*
 * pg_ftruncate --- set the length of an already-open file
 *
 * Thin wrapper around ftruncate(2) that simply repeats the call for as long
 * as it fails with EINTR.  The result of the final ftruncate() call is
 * returned unchanged (0 on success, -1 with errno set on failure).
 */
static int
pg_ftruncate(int fd, off_t length)
{
	int			rc;

	/* keep trying while the call is merely being interrupted */
	do
	{
		rc = ftruncate(fd, length);
	} while (rc == -1 && errno == EINTR);

	return rc;
}
715
716/*
717 * Truncate a file to a given length by name.
718 */
719int
720pg_truncate(const char *path, off_t length)
721{
722 int ret;
723#ifdef WIN32
724 int save_errno;
725 int fd;
726
727 fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
728 if (fd >= 0)
729 {
730 ret = pg_ftruncate(fd, length);
731 save_errno = errno;
733 errno = save_errno;
734 }
735 else
736 ret = -1;
737#else
738
739retry:
740 ret = truncate(path, length);
741
742 if (ret == -1 && errno == EINTR)
743 goto retry;
744#endif
745
746 return ret;
747}
748
749/*
750 * fsync_fname -- fsync a file or directory, handling errors properly
751 *
752 * Try to fsync a file or directory. When doing the latter, ignore errors that
753 * indicate the OS just doesn't allow/require fsyncing directories.
754 */
755void
756fsync_fname(const char *fname, bool isdir)
757{
758 fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
759}
760
761/*
762 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
763 *
764 * This routine ensures that, after returning, the effect of renaming file
765 * persists in case of a crash. A crash while this routine is running will
766 * leave you with either the pre-existing or the moved file in place of the
767 * new file; no mixed state or truncated files are possible.
768 *
769 * It does so by using fsync on the old filename and the possibly existing
770 * target filename before the rename, and the target file and directory after.
771 *
772 * Note that rename() cannot be used across arbitrary directories, as they
773 * might not be on the same filesystem. Therefore this routine does not
774 * support renaming across directories.
775 *
776 * Log errors with the caller specified severity.
777 *
778 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
779 * valid upon return.
780 */
781int
782durable_rename(const char *oldfile, const char *newfile, int elevel)
783{
784 int fd;
785
786 /*
787 * First fsync the old and target path (if it exists), to ensure that they
788 * are properly persistent on disk. Syncing the target file is not
789 * strictly necessary, but it makes it easier to reason about crashes;
790 * because it's then guaranteed that either source or target file exists
791 * after a crash.
792 */
793 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
794 return -1;
795
796 fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
797 if (fd < 0)
798 {
799 if (errno != ENOENT)
800 {
801 ereport(elevel,
803 errmsg("could not open file \"%s\": %m", newfile)));
804 return -1;
805 }
806 }
807 else
808 {
809 if (pg_fsync(fd) != 0)
810 {
811 int save_errno;
812
813 /* close file upon error, might not be in transaction context */
814 save_errno = errno;
816 errno = save_errno;
817
818 ereport(elevel,
820 errmsg("could not fsync file \"%s\": %m", newfile)));
821 return -1;
822 }
823
824 if (CloseTransientFile(fd) != 0)
825 {
826 ereport(elevel,
828 errmsg("could not close file \"%s\": %m", newfile)));
829 return -1;
830 }
831 }
832
833 /* Time to do the real deal... */
834 if (rename(oldfile, newfile) < 0)
835 {
836 ereport(elevel,
838 errmsg("could not rename file \"%s\" to \"%s\": %m",
839 oldfile, newfile)));
840 return -1;
841 }
842
843 /*
844 * To guarantee renaming the file is persistent, fsync the file with its
845 * new name, and its containing directory.
846 */
847 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
848 return -1;
849
850 if (fsync_parent_path(newfile, elevel) != 0)
851 return -1;
852
853 return 0;
854}
855
856/*
857 * durable_unlink -- remove a file in a durable manner
858 *
859 * This routine ensures that, after returning, the effect of removing file
860 * persists in case of a crash. A crash while this routine is running will
861 * leave the system in no mixed state.
862 *
863 * It does so by using fsync on the parent directory of the file after the
864 * actual removal is done.
865 *
866 * Log errors with the severity specified by caller.
867 *
868 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
869 * valid upon return.
870 */
871int
872durable_unlink(const char *fname, int elevel)
873{
874 if (unlink(fname) < 0)
875 {
876 ereport(elevel,
878 errmsg("could not remove file \"%s\": %m",
879 fname)));
880 return -1;
881 }
882
883 /*
884 * To guarantee that the removal of the file is persistent, fsync its
885 * parent directory.
886 */
887 if (fsync_parent_path(fname, elevel) != 0)
888 return -1;
889
890 return 0;
891}
892
893/*
894 * InitFileAccess --- initialize this module during backend startup
895 *
896 * This is called during either normal or standalone backend start.
897 * It is *not* called in the postmaster.
898 *
899 * Note that this does not initialize temporary file access, that is
900 * separately initialized via InitTemporaryFileAccess().
901 */
902void
904{
905 Assert(SizeVfdCache == 0); /* call me only once */
906
907 /* initialize cache header entry */
908 VfdCache = (Vfd *) malloc(sizeof(Vfd));
909 if (VfdCache == NULL)
911 (errcode(ERRCODE_OUT_OF_MEMORY),
912 errmsg("out of memory")));
913
914 MemSet(&(VfdCache[0]), 0, sizeof(Vfd));
916
917 SizeVfdCache = 1;
918}
919
920/*
921 * InitTemporaryFileAccess --- initialize temporary file access during startup
922 *
923 * This is called during either normal or standalone backend start.
924 * It is *not* called in the postmaster.
925 *
926 * This is separate from InitFileAccess() because temporary file cleanup can
927 * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
928 * our reporting has to happen before that. Low level file access should be
929 * available for longer, hence the separate initialization / shutdown of
930 * temporary file handling.
931 */
932void
934{
935 Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
936 Assert(!temporary_files_allowed); /* call me only once */
937
938 /*
939 * Register before-shmem-exit hook to ensure temp files are dropped while
940 * we can still report stats.
941 */
943
944#ifdef USE_ASSERT_CHECKING
945 temporary_files_allowed = true;
946#endif
947}
948
949/*
950 * count_usable_fds --- count how many FDs the system will let us open,
951 * and estimate how many are already open.
952 *
953 * We stop counting if usable_fds reaches max_to_probe. Note: a small
954 * value of max_to_probe might result in an underestimate of already_open;
955 * we must fill in any "gaps" in the set of used FDs before the calculation
956 * of already_open will give the right answer. In practice, max_to_probe
957 * of a couple of dozen should be enough to ensure good results.
958 *
959 * We assume stderr (FD 2) is available for dup'ing. While the calling
960 * script could theoretically close that, it would be a really bad idea,
961 * since then one risks loss of error messages from, e.g., libc.
962 */
963static void
964count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
965{
966 int *fd;
967 int size;
968 int used = 0;
969 int highestfd = 0;
970 int j;
971
972#ifdef HAVE_GETRLIMIT
973 struct rlimit rlim;
974 int getrlimit_status;
975#endif
976
977 size = 1024;
978 fd = (int *) palloc(size * sizeof(int));
979
980#ifdef HAVE_GETRLIMIT
981 getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
982 if (getrlimit_status != 0)
983 ereport(WARNING, (errmsg("getrlimit failed: %m")));
984#endif /* HAVE_GETRLIMIT */
985
986 /* dup until failure or probe limit reached */
987 for (;;)
988 {
989 int thisfd;
990
991#ifdef HAVE_GETRLIMIT
992
993 /*
994 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
995 * some platforms
996 */
997 if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
998 break;
999#endif
1000
1001 thisfd = dup(2);
1002 if (thisfd < 0)
1003 {
1004 /* Expect EMFILE or ENFILE, else it's fishy */
1005 if (errno != EMFILE && errno != ENFILE)
1006 elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
1007 break;
1008 }
1009
1010 if (used >= size)
1011 {
1012 size *= 2;
1013 fd = (int *) repalloc(fd, size * sizeof(int));
1014 }
1015 fd[used++] = thisfd;
1016
1017 if (highestfd < thisfd)
1018 highestfd = thisfd;
1019
1020 if (used >= max_to_probe)
1021 break;
1022 }
1023
1024 /* release the files we opened */
1025 for (j = 0; j < used; j++)
1026 close(fd[j]);
1027
1028 pfree(fd);
1029
1030 /*
1031 * Return results. usable_fds is just the number of successful dups. We
1032 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
1033 * number) and so already_open is highestfd+1 - usable_fds.
1034 */
1035 *usable_fds = used;
1036 *already_open = highestfd + 1 - used;
1037}
1038
1039/*
1040 * set_max_safe_fds
1041 * Determine number of file descriptors that fd.c is allowed to use
1042 */
1043void
1045{
1046 int usable_fds;
1047 int already_open;
1048
1049 /*----------
1050 * We want to set max_safe_fds to
1051 * MIN(usable_fds, max_files_per_process)
1052 * less the slop factor for files that are opened without consulting
1053 * fd.c. This ensures that the number of additional files we allow to be
1054 * opened never exceeds max_files_per_process or the experimentally
1055 * determined EMFILE limit.
1056 *----------
1057 */
1059 &usable_fds, &already_open);
1060
1061 max_safe_fds = Min(usable_fds, max_files_per_process);
1062
1063 /*
1064 * Take off the FDs reserved for system() etc.
1065 */
1067
1068 /*
1069 * Make sure we still have enough to get by.
1070 */
1072 ereport(FATAL,
1073 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1074 errmsg("insufficient file descriptors available to start server process"),
1075 errdetail("System allows %d, server needs at least %d, %d files are already open.",
1078 already_open)));
1079
1080 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1081 max_safe_fds, usable_fds, already_open);
1082}
1083
1084/*
1085 * Open a file with BasicOpenFilePerm() and pass default file mode for the
1086 * fileMode parameter.
1087 */
1088int
1089BasicOpenFile(const char *fileName, int fileFlags)
1090{
1091 return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1092}
1093
1094/*
1095 * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1096 *
1097 * This is exported for use by places that really want a plain kernel FD,
1098 * but need to be proof against running out of FDs. Once an FD has been
1099 * successfully returned, it is the caller's responsibility to ensure that
1100 * it will not be leaked on ereport()! Most users should *not* call this
1101 * routine directly, but instead use the VFD abstraction level, which
1102 * provides protection against descriptor leaks as well as management of
1103 * files that need to be open for more than a short period of time.
1104 *
1105 * Ideally this should be the *only* direct call of open() in the backend.
1106 * In practice, the postmaster calls open() directly, and there are some
1107 * direct open() calls done early in backend startup. Those are OK since
1108 * this module wouldn't have any open files to close at that point anyway.
1109 */
/*
 * fileName, fileFlags and fileMode are handed to open(2).  When
 * PG_O_DIRECT_USE_F_NOCACHE is defined, PG_O_DIRECT is masked out of the
 * flags and applied afterwards with fcntl(F_NOCACHE).
 *
 * Returns the new descriptor, or -1 with errno set.  On EMFILE/ENFILE the
 * open is retried for as long as ReleaseLruFile() reports it freed a file.
 *
 * NOTE(review): the line that opens the flag-collision assertion below
 * appears to be missing from this listing.  TODO: confirm against upstream.
 */
1110int
1111BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1112{
1113 int fd;
1114
1115tryAgain:
1116#ifdef PG_O_DIRECT_USE_F_NOCACHE
1117
1118 /*
1119 * The value we defined to stand in for O_DIRECT when simulating it with
1120 * F_NOCACHE had better not collide with any of the standard flags.
1121 */
1123 (O_APPEND |
1124 O_CLOEXEC |
1125 O_CREAT |
1126 O_DSYNC |
1127 O_EXCL |
1128 O_RDWR |
1129 O_RDONLY |
1130 O_SYNC |
1131 O_TRUNC |
1132 O_WRONLY)) == 0,
1133 "PG_O_DIRECT value collides with standard flag");
1134 fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1135#else
1136 fd = open(fileName, fileFlags, fileMode);
1137#endif
1138
1139 if (fd >= 0)
1140 {
1141#ifdef PG_O_DIRECT_USE_F_NOCACHE
1142 if (fileFlags & PG_O_DIRECT)
1143 {
1144 if (fcntl(fd, F_NOCACHE, 1) < 0)
1145 {
/* Keep fcntl's errno: close() below could overwrite it. */
1146 int save_errno = errno;
1147
1148 close(fd);
1149 errno = save_errno;
1150 return -1;
1151 }
1152 }
1153#endif
1154
1155 return fd; /* success! */
1156 }
1157
1158 if (errno == EMFILE || errno == ENFILE)
1159 {
/* Remember the open() failure code across the log call and the release. */
1160 int save_errno = errno;
1161
1162 ereport(LOG,
1163 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1164 errmsg("out of file descriptors: %m; release and retry")));
1165 errno = 0;
1166 if (ReleaseLruFile())
1167 goto tryAgain;
/* Nothing could be released: report the original failure to the caller. */
1168 errno = save_errno;
1169 }
1170
1171 return -1; /* failure */
1172}
1173
1174/*
1175 * AcquireExternalFD - attempt to reserve an external file descriptor
1176 *
1177 * This should be used by callers that need to hold a file descriptor open
1178 * over more than a short interval, but cannot use any of the other facilities
1179 * provided by this module.
1180 *
1181 * The difference between this and the underlying ReserveExternalFD function
1182 * is that this will report failure (by setting errno and returning false)
1183 * if "too many" external FDs are already reserved. This should be used in
1184 * any code where the total number of FDs to be reserved is not predictable
1185 * and small.
1186 */
/*
 * Returns true while numExternalFDs is below max_safe_fds / 3; otherwise
 * sets errno to EMFILE and returns false.
 *
 * NOTE(review): the function-name line and the statement inside the
 * success branch (presumably the actual reservation) appear to be missing
 * from this listing.  TODO: confirm against upstream.
 */
1187bool
1189{
1190 /*
1191 * We don't want more than max_safe_fds / 3 FDs to be consumed for
1192 * "external" FDs.
1193 */
1194 if (numExternalFDs < max_safe_fds / 3)
1195 {
1197 return true;
1198 }
1199 errno = EMFILE;
1200 return false;
1201}
1202
1203/*
1204 * ReserveExternalFD - report external consumption of a file descriptor
1205 *
1206 * This should be used by callers that need to hold a file descriptor open
1207 * over more than a short interval, but cannot use any of the other facilities
1208 * provided by this module. This just tracks the use of the FD and closes
1209 * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1210 *
1211 * Call this directly only in code where failure to reserve the FD would be
1212 * fatal; for example, the WAL-writing code does so, since the alternative is
1213 * session failure. Also, it's very unwise to do so in code that could
1214 * consume more than one FD per process.
1215 *
1216 * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1217 * available, it doesn't matter too much whether this is called before or
1218 * after actually opening the FD; but doing so beforehand reduces the risk of
1219 * an EMFILE failure if not everybody played nice. In any case, it's solely
1220 * caller's responsibility to keep the external-FD count in sync with reality.
1221 */
/*
 * Returns nothing.
 *
 * NOTE(review): the function-name line and both statements of the body
 * (presumably the VFD release followed by the numExternalFDs increment that
 * the comment below describes) appear to be missing from this listing.
 * TODO: confirm against upstream.
 */
1222void
1224{
1225 /*
1226 * Release VFDs if needed to stay safe. Because we do this before
1227 * incrementing numExternalFDs, the final state will be as desired, i.e.,
1228 * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1229 */
1231
1233}
1234
1235/*
1236 * ReleaseExternalFD - report release of an external file descriptor
1237 *
1238 * This is guaranteed not to change errno, so it can be used in failure paths.
1239 */
/*
 * Returns nothing.
 *
 * NOTE(review): the function-name line and the whole body appear to be
 * missing from this listing, so the errno guarantee above cannot be checked
 * here.  TODO: confirm against upstream.
 */
1240void
1242{
1245}
1246
1247
#if defined(FDDEBUG)

/*
 * _dump_lru --- debugging aid: log the LRU ring as a single line
 *
 * Starting from VfdCache[0].lruLessRecently, follows the lruLessRecently
 * links until index 0 is reached again, appending each index visited to a
 * fixed-size buffer, and finally emits the buffer at LOG level.  Every
 * append is bounded by snprintf, so an overlong ring is truncated rather
 * than overrunning the buffer.
 */
static void
_dump_lru(void)
{
	char		line[2048];
	int			idx = VfdCache[0].lruLessRecently;
	size_t		len;

	snprintf(line, sizeof(line), "LRU: MOST %d ", idx);

	while (idx != 0)
	{
		/* step to the next entry and append its index */
		idx = VfdCache[idx].lruLessRecently;
		len = strlen(line);
		snprintf(line + len, sizeof(line) - len, "%d ", idx);
	}

	len = strlen(line);
	snprintf(line + len, sizeof(line) - len, "LEAST");

	elog(LOG, "%s", line);
}
#endif							/* FDDEBUG */
1268
/*
 * Judging by its debug message and by the "delete the vfd record from the
 * LRU ring" call site, this is Delete(): it takes VFD index 'file' (which
 * must not be 0, the ring header) out of the LRU ring.
 *
 * NOTE(review): the function-name line and the statements that actually
 * unlink vfdP from its neighbours appear to be missing from this listing.
 * TODO: confirm against upstream.
 */
1269static void
1271{
1272 Vfd *vfdP;
1273
1274 Assert(file != 0);
1275
1276 DO_DB(elog(LOG, "Delete %d (%s)",
1277 file, VfdCache[file].fileName));
1278 DO_DB(_dump_lru());
1279
1280 vfdP = &VfdCache[file];
1281
1284
1285 DO_DB(_dump_lru());
1286}
1287
/*
 * LruDelete (name taken from the debug message): close the kernel FD behind
 * VFD 'file' and remove the VFD from the LRU ring.
 *
 * pgaio_closing_fd() is called with the descriptor before close().  After
 * the close attempt the VFD is marked VFD_CLOSED and nfile is decremented,
 * whether or not close() succeeded.
 *
 * NOTE(review): the function-name line and the first line of the
 * close-failure report appear to be missing from this listing.
 * TODO: confirm against upstream.
 */
1288static void
1290{
1291 Vfd *vfdP;
1292
1293 Assert(file != 0);
1294
1295 DO_DB(elog(LOG, "LruDelete %d (%s)",
1296 file, VfdCache[file].fileName));
1297
1298 vfdP = &VfdCache[file];
1299
1300 pgaio_closing_fd(vfdP->fd);
1301
1302 /*
1303 * Close the file. We aren't expecting this to fail; if it does, better
1304 * to leak the FD than to mess up our internal state.
1305 */
1306 if (close(vfdP->fd) != 0)
1308 "could not close file \"%s\": %m", vfdP->fileName);
1309 vfdP->fd = VFD_CLOSED;
1310 --nfile;
1311
1312 /* delete the vfd record from the LRU ring */
1313 Delete(file);
1314}
1315
/*
 * Insert (name taken from the debug message): link VFD 'file' into the LRU
 * ring next to the header entry VfdCache[0], so that it becomes
 * VfdCache[0].lruLessRecently.  'file' must not be 0.
 *
 * NOTE(review): the function-name line and two of the four link
 * assignments appear to be missing from this listing.
 * TODO: confirm against upstream.
 */
1316static void
1318{
1319 Vfd *vfdP;
1320
1321 Assert(file != 0);
1322
1323 DO_DB(elog(LOG, "Insert %d (%s)",
1324 file, VfdCache[file].fileName));
1325 DO_DB(_dump_lru());
1326
1327 vfdP = &VfdCache[file];
1328
1329 vfdP->lruMoreRecently = 0;
1331 VfdCache[0].lruLessRecently = file;
1333
1334 DO_DB(_dump_lru());
1335}
1336
1337/* returns 0 on success, -1 on re-open failure (with errno set) */
/*
 * LruInsert (name taken from the debug message): make sure VFD 'file' has
 * an open kernel FD, then put it at the head of the LRU ring.
 *
 * If the VFD is not open, it is re-opened with BasicOpenFilePerm() using
 * the fileName, fileFlags and fileMode saved in the VFD; nfile is
 * incremented on success.  On re-open failure the ring is left untouched.
 *
 * NOTE(review): the function-name line and the statement under "Close
 * excess kernel FDs" appear to be missing from this listing.
 * TODO: confirm against upstream.
 */
1338static int
1340{
1341 Vfd *vfdP;
1342
1343 Assert(file != 0);
1344
1345 DO_DB(elog(LOG, "LruInsert %d (%s)",
1346 file, VfdCache[file].fileName));
1347
1348 vfdP = &VfdCache[file];
1349
1350 if (FileIsNotOpen(file))
1351 {
1352 /* Close excess kernel FDs. */
1354
1355 /*
1356 * The open could still fail for lack of file descriptors, eg due to
1357 * overall system file table being full. So, be prepared to release
1358 * another FD if necessary...
1359 */
1360 vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1361 vfdP->fileMode);
1362 if (vfdP->fd < 0)
1363 {
1364 DO_DB(elog(LOG, "re-open failed: %m"));
1365 return -1;
1366 }
1367 else
1368 {
1369 ++nfile;
1370 }
1371 }
1372
1373 /*
1374 * put it at the head of the Lru ring
1375 */
1376
1377 Insert(file);
1378
1379 return 0;
1380}
1381
1382/*
1383 * Release one kernel FD by closing the least-recently-used VFD.
1384 */
/*
 * Returns true if a file was closed, false if nfile is 0 and there was
 * nothing to close.  The victim is VfdCache[0].lruMoreRecently.
 *
 * NOTE(review): the function-name line appears to be missing from this
 * listing; the debug message and the call in BasicOpenFilePerm() suggest
 * ReleaseLruFile.  TODO: confirm against upstream.
 */
1385static bool
1387{
1388 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1389
1390 if (nfile > 0)
1391 {
1392 /*
1393 * There are opened files and so there should be at least one used vfd
1394 * in the ring.
1395 */
1396 Assert(VfdCache[0].lruMoreRecently != 0);
1397 LruDelete(VfdCache[0].lruMoreRecently);
1398 return true; /* freed a file */
1399 }
1400 return false; /* no files available to free */
1401}
1402
1403/*
1404 * Release kernel FDs as needed to get under the max_safe_fds limit.
1405 * After calling this, it's OK to try to open another file.
1406 */
/*
 * Calls ReleaseLruFile() repeatedly and stops early once it reports that
 * nothing more can be freed.  Returns nothing.
 *
 * NOTE(review): the function-name line and the loop header (the condition
 * that compares current FD usage with max_safe_fds) appear to be missing
 * from this listing.  TODO: confirm against upstream.
 */
1407static void
1409{
1411 {
1412 if (!ReleaseLruFile())
1413 break;
1414 }
1415}
1416
/*
 * AllocateVfd (name taken from the debug message): obtain a free slot in
 * VfdCache and return its index.
 *
 * The free list is chained through nextFree starting at VfdCache[0]; a
 * value of 0 means "empty".  When it is empty the array is doubled (to at
 * least 32 entries) with realloc(), the new entries are zeroed and chained
 * together, and SizeVfdCache is updated.  An out-of-memory ERROR is raised
 * if realloc() fails, leaving the old array in place.
 *
 * NOTE(review): the function-name line, one statement inside the
 * initialization loop, one statement after it, and the statement that
 * follows "file = VfdCache[0].nextFree" (presumably the one that advances
 * the free-list head) appear to be missing from this listing.
 * TODO: confirm against upstream.
 */
1417static File
1419{
1420 Index i;
1421 File file;
1422
1423 DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1424
1425 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1426
1427 if (VfdCache[0].nextFree == 0)
1428 {
1429 /*
1430 * The free list is empty so it is time to increase the size of the
1431 * array. We choose to double it each time this happens. However,
1432 * there's not much point in starting *real* small.
1433 */
1434 Size newCacheSize = SizeVfdCache * 2;
1435 Vfd *newVfdCache;
1436
1437 if (newCacheSize < 32)
1438 newCacheSize = 32;
1439
1440 /*
1441 * Be careful not to clobber VfdCache ptr if realloc fails.
1442 */
1443 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1444 if (newVfdCache == NULL)
1445 ereport(ERROR,
1446 (errcode(ERRCODE_OUT_OF_MEMORY),
1447 errmsg("out of memory")));
1448 VfdCache = newVfdCache;
1449
1450 /*
1451 * Initialize the new entries and link them into the free list.
1452 */
1453 for (i = SizeVfdCache; i < newCacheSize; i++)
1454 {
1455 MemSet(&(VfdCache[i]), 0, sizeof(Vfd));
1456 VfdCache[i].nextFree = i + 1;
1458 }
/* The last new entry terminates the chain instead of pointing past the array. */
1459 VfdCache[newCacheSize - 1].nextFree = 0;
1461
1462 /*
1463 * Record the new size
1464 */
1465 SizeVfdCache = newCacheSize;
1466 }
1467
1468 file = VfdCache[0].nextFree;
1469
1471
1472 return file;
1473}
1474
/*
 * FreeVfd (name taken from the debug message): return VFD slot 'file' to
 * the free list.
 *
 * Frees the malloc'd fileName (if any), clears fdstate, and pushes the slot
 * onto the front of the free list headed by VfdCache[0].nextFree.  The fd
 * field is not touched here.
 *
 * NOTE(review): the function-name line appears to be missing from this
 * listing.  TODO: confirm against upstream.
 */
1475static void
1477{
1478 Vfd *vfdP = &VfdCache[file];
1479
1480 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1481 file, vfdP->fileName ? vfdP->fileName : ""));
1482
1483 if (vfdP->fileName != NULL)
1484 {
1485 free(vfdP->fileName);
1486 vfdP->fileName = NULL;
1487 }
1488 vfdP->fdstate = 0x0;
1489
1490 vfdP->nextFree = VfdCache[0].nextFree;
1491 VfdCache[0].nextFree = file;
1492}
1493
1494/* returns 0 on success, -1 on re-open failure (with errno set) */
/*
 * FileAccess (name taken from the debug message): make VFD 'file' usable
 * and mark it as the most recently used.
 *
 * A closed VFD is re-opened through LruInsert(), whose nonzero result is
 * passed straight back.  An open VFD that is not already at the head of the
 * ring is moved there with Delete() + Insert().
 *
 * NOTE(review): the function-name line appears to be missing from this
 * listing.  TODO: confirm against upstream.
 */
1495static int
1497{
1498 int returnValue;
1499
1500 DO_DB(elog(LOG, "FileAccess %d (%s)",
1501 file, VfdCache[file].fileName));
1502
1503 /*
1504 * Is the file open? If not, open it and put it at the head of the LRU
1505 * ring (possibly closing the least recently used file to get an FD).
1506 */
1507
1508 if (FileIsNotOpen(file))
1509 {
1510 returnValue = LruInsert(file);
1511 if (returnValue != 0)
1512 return returnValue;
1513 }
1514 else if (VfdCache[0].lruLessRecently != file)
1515 {
1516 /*
1517 * We now know that the file is open and that it is not the last one
1518 * accessed, so we need to move it to the head of the Lru ring.
1519 */
1520
1521 Delete(file);
1522 Insert(file);
1523 }
1524
1525 return 0;
1526}
1527
1528/*
1529 * Called whenever a temporary file is deleted to report its size.
1530 */
/*
 * path: name of the file, used only in the log message.
 * size: final size of the file in bytes.
 *
 * When log_temp_files is non-negative and size / 1024 is at least
 * log_temp_files, a LOG message naming the file and its size is emitted.
 *
 * NOTE(review): the first statement of the body appears to be missing from
 * this listing.  Also, the size is printed via a cast to unsigned long;
 * where that type is narrower than off_t the logged value would be
 * truncated.  TODO: confirm both against upstream.
 */
1531static void
1532ReportTemporaryFileUsage(const char *path, off_t size)
1533{
1535
1536 if (log_temp_files >= 0)
1537 {
1538 if ((size / 1024) >= log_temp_files)
1539 ereport(LOG,
1540 (errmsg("temporary file: path \"%s\", size %lu",
1541 path, (unsigned long) size)));
1542 }
1543}
1544
1545/*
1546 * Called to register a temporary file for automatic close.
1547 * ResourceOwnerEnlarge(CurrentResourceOwner) must have been called
1548 * before the file was opened.
1549 */
/*
 * Returns nothing.
 *
 * NOTE(review): the function-name line and every statement of the body
 * appear to be missing from this listing; only the "backup mechanism"
 * comment survives, so the registration steps cannot be described from
 * here.  TODO: confirm against upstream.
 */
1550static void
1552{
1555
1556 /* Backup mechanism for closing at end of xact. */
1559}
1560
/*
 * FileInvalidate --- give up the kernel FD behind a VFD, if it has one
 *
 * Meant to be called when a shared invalidation message arrives for some
 * relation.  Only compiled when NOT_USED is defined.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	/* Nothing to do for a VFD that holds no kernel descriptor. */
	if (FileIsNotOpen(file))
		return;

	LruDelete(file);
}
#endif
1573
1574/*
1575 * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1576 * fileMode parameter.
1577 */
1578File
1579PathNameOpenFile(const char *fileName, int fileFlags)
1580{
1581 return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1582}
1583
1584/*
1585 * open a file in an arbitrary directory
1586 *
1587 * NB: if the passed pathname is relative (which it usually is),
1588 * it will be interpreted relative to the process' working directory
1589 * (which should always be $PGDATA when this code is running).
1590 */
/*
 * fileName, fileFlags and fileMode are handed to BasicOpenFilePerm(), with
 * O_CLOEXEC always added to the flags.
 *
 * Returns the new VFD index, or -1 with errno set if the open fails.  A
 * failure to copy the file name raises an out-of-memory ERROR instead.
 *
 * NOTE(review): the statement under "Close excess kernel FDs" appears to
 * be missing from this listing.  Separately, fnamecopy is strdup'd before
 * AllocateVfd() is called; if AllocateVfd() raises its out-of-memory ERROR
 * the copy does not appear to be freed.  TODO: confirm both.
 */
1591File
1592PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1593{
1594 char *fnamecopy;
1595 File file;
1596 Vfd *vfdP;
1597
1598 DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1599 fileName, fileFlags, fileMode));
1600
1601 /*
1602 * We need a malloc'd copy of the file name; fail cleanly if no room.
1603 */
1604 fnamecopy = strdup(fileName);
1605 if (fnamecopy == NULL)
1606 ereport(ERROR,
1607 (errcode(ERRCODE_OUT_OF_MEMORY),
1608 errmsg("out of memory")));
1609
1610 file = AllocateVfd();
1611 vfdP = &VfdCache[file];
1612
1613 /* Close excess kernel FDs. */
1615
1616 /*
1617 * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1618 * client shouldn't be expected to know which kernel descriptors are
1619 * currently open, so it wouldn't make sense for them to be inherited by
1620 * executed subprograms.
1621 */
1622 fileFlags |= O_CLOEXEC;
1623
1624 vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1625
1626 if (vfdP->fd < 0)
1627 {
/* FreeVfd() and free() may disturb errno; hand back the open failure code. */
1628 int save_errno = errno;
1629
1630 FreeVfd(file);
1631 free(fnamecopy);
1632 errno = save_errno;
1633 return -1;
1634 }
1635 ++nfile;
1636 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1637 vfdP->fd));
1638
/* From here on the VFD owns fnamecopy; FreeVfd() releases it. */
1639 vfdP->fileName = fnamecopy;
1640 /* Saved flags are adjusted to be OK for re-opening file */
1641 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1642 vfdP->fileMode = fileMode;
1643 vfdP->fileSize = 0;
1644 vfdP->fdstate = 0x0;
1645 vfdP->resowner = NULL;
1646
1647 Insert(file);
1648
1649 return file;
1650}
1651
1652/*
1653 * Create directory 'directory'. If necessary, create 'basedir', which must
1654 * be the directory above it. This is designed for creating the top-level
1655 * temporary directory on demand before creating a directory underneath it.
1656 * Do nothing if the directory already exists.
1657 *
1658 * Directories created within the top-level temporary directory should begin
1659 * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1660 * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1661 * that do not need any particular prefix.
1662*/
/*
 * Returns nothing; raises ERROR if either directory cannot be created for
 * a reason other than EEXIST.
 *
 * NOTE(review): the function-name/parameter line and one argument line in
 * each ereport() call appear to be missing from this listing.
 * TODO: confirm against upstream.
 */
1663void
1665{
1666 if (MakePGDirectory(directory) < 0)
1667 {
1668 if (errno == EEXIST)
1669 return;
1670
1671 /*
1672 * Failed. Try to create basedir first in case it's missing. Tolerate
1673 * EEXIST to close a race against another process following the same
1674 * algorithm.
1675 */
1676 if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1677 ereport(ERROR,
1679 errmsg("cannot create temporary directory \"%s\": %m",
1680 basedir)));
1681
1682 /* Try again. */
1683 if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1684 ereport(ERROR,
1686 errmsg("cannot create temporary subdirectory \"%s\": %m",
1687 directory)));
1688 }
1689}
1690
1691/*
1692 * Delete a directory and everything in it, if it exists.
1693 */
1694void
1695PathNameDeleteTemporaryDir(const char *dirname)
1696{
1697 struct stat statbuf;
1698
1699 /* Silently ignore missing directory. */
1700 if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1701 return;
1702
1703 /*
1704 * Currently, walkdir doesn't offer a way for our passed in function to
1705 * maintain state. Perhaps it should, so that we could tell the caller
1706 * whether this operation succeeded or failed. Since this operation is
1707 * used in a cleanup path, we wouldn't actually behave differently: we'll
1708 * just log failures.
1709 */
1710 walkdir(dirname, unlink_if_exists_fname, false, LOG);
1711}
1712
1713/*
1714 * Open a temporary file that will disappear when we close it.
1715 *
1716 * This routine takes care of generating an appropriate tempfile name.
1717 * There's no need to pass in fileFlags or fileMode either, since only
1718 * one setting makes any sense for a temp file.
1719 *
1720 * Unless interXact is true, the file is remembered by CurrentResourceOwner
1721 * to ensure it's closed and deleted when it's no longer needed, typically at
1722 * the end-of-transaction. In most cases, you don't want temporary files to
1723 * outlive the transaction that created them, so this should be false -- but
1724 * if you need "somewhat" temporary storage, this might be useful. In either
1725 * case, the file is removed when the File is explicitly closed.
1726 */
/*
 * Returns the VFD index of the new temporary file.
 *
 * NOTE(review): four statements appear to be missing from this listing:
 * the body of the first "if (!interXact)", the start of the fallback open
 * under "if (file <= 0)", the statement that marks the file for deletion
 * and size accounting, and the body of the final "if (!interXact)".  As
 * listed, each dangling "if" would wrongly govern the next statement.
 * TODO: confirm against upstream.
 */
1727File
1728OpenTemporaryFile(bool interXact)
1729{
1730 File file = 0;
1731
1732 Assert(temporary_files_allowed); /* check temp file access is up */
1733
1734 /*
1735 * Make sure the current resource owner has space for this File before we
1736 * open it, if we'll be registering it below.
1737 */
1738 if (!interXact)
1740
1741 /*
1742 * If some temp tablespace(s) have been given to us, try to use the next
1743 * one. If a given tablespace can't be found, we silently fall back to
1744 * the database's default tablespace.
1745 *
1746 * BUT: if the temp file is slated to outlive the current transaction,
1747 * force it into the database's default tablespace, so that it will not
1748 * pose a threat to possible tablespace drop attempts.
1749 */
1750 if (numTempTableSpaces > 0 && !interXact)
1751 {
1752 Oid tblspcOid = GetNextTempTableSpace();
1753
/* rejectError = false: a failure here falls through to the default tablespace. */
1754 if (OidIsValid(tblspcOid))
1755 file = OpenTemporaryFileInTablespace(tblspcOid, false);
1756 }
1757
1758 /*
1759 * If not, or if tablespace is bad, create in database's default
1760 * tablespace. MyDatabaseTableSpace should normally be set before we get
1761 * here, but just in case it isn't, fall back to pg_default tablespace.
1762 */
1763 if (file <= 0)
1766 DEFAULTTABLESPACE_OID,
1767 true);
1768
1769 /* Mark it for deletion at close and temporary file size limit */
1771
1772 /* Register it with the current resource owner */
1773 if (!interXact)
1775
1776 return file;
1777}
1778
1779/*
1780 * Return the path of the temp directory in a given tablespace.
1781 */
/*
 * The result is written into 'path', formatted with a limit of MAXPGPATH
 * bytes.  For InvalidOid, DEFAULTTABLESPACE_OID and GLOBALTABLESPACE_OID
 * the path is "base/" followed by PG_TEMP_FILES_DIR.
 *
 * NOTE(review): the function-name/parameter line and the argument lines of
 * the second snprintf() appear to be missing from this listing.
 * TODO: confirm against upstream.
 */
1782void
1784{
1785 /*
1786 * Identify the tempfile directory for this tablespace.
1787 *
1788 * If someone tries to specify pg_global, use pg_default instead.
1789 */
1790 if (tablespace == InvalidOid ||
1791 tablespace == DEFAULTTABLESPACE_OID ||
1792 tablespace == GLOBALTABLESPACE_OID)
1793 snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1794 else
1795 {
1796 /* All other tablespaces are accessed via symlinks */
1797 snprintf(path, MAXPGPATH, "%s/%u/%s/%s",
1800 }
1801}
1802
1803/*
1804 * Open a temporary file in a specific tablespace.
1805 * Subroutine for OpenTemporaryFile, which see for details.
1806 */
/*
 * tblspcOid: tablespace whose temp directory (per TempTablespacePath) is
 *		used.
 * rejectError: if true, raise ERROR when the file cannot be created even
 *		after attempting to create the temp directory; if false, the
 *		failing (<= 0) File value is returned instead.
 *
 * NOTE(review): the argument line of the snprintf() that builds
 * tempfilepath appears to be missing from this listing.
 * TODO: confirm against upstream.
 */
1807static File
1808OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1809{
1810 char tempdirpath[MAXPGPATH];
1811 char tempfilepath[MAXPGPATH];
1812 File file;
1813
1814 TempTablespacePath(tempdirpath, tblspcOid);
1815
1816 /*
1817 * Generate a tempfile name that should be unique within the current
1818 * database instance.
1819 */
1820 snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1822
1823 /*
1824 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1825 * temp file that can be reused.
1826 */
1827 file = PathNameOpenFile(tempfilepath,
1828 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1829 if (file <= 0)
1830 {
1831 /*
1832 * We might need to create the tablespace's tempfile directory, if no
1833 * one has yet done so.
1834 *
1835 * Don't check for an error from MakePGDirectory; it could fail if
1836 * someone else just did the same thing. If it doesn't work then
1837 * we'll bomb out on the second create attempt, instead.
1838 */
1839 (void) MakePGDirectory(tempdirpath);
1840
1841 file = PathNameOpenFile(tempfilepath,
1842 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1843 if (file <= 0 && rejectError)
1844 elog(ERROR, "could not create temporary file \"%s\": %m",
1845 tempfilepath);
1846 }
1847
1848 return file;
1849}
1850
1851
1852/*
1853 * Create a new file. The directory containing it must already exist. Files
1854 * created this way are subject to temp_file_limit and are automatically
1855 * closed at end of transaction, but are not automatically deleted on close
1856 * because they are intended to be shared between cooperating backends.
1857 *
1858 * If the file is inside the top-level temporary directory, its name should
1859 * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1860 * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1861 * inside a directory created with PathNameCreateTemporaryDir(), in which case
1862 * the prefix isn't needed.
1863 */
/*
 * path: name of the file to create (opened O_RDWR | O_CREAT | O_TRUNC).
 * error_on_failure: if true, raise ERROR when the open fails; if false,
 *		return the failing (<= 0) File value.
 *
 * NOTE(review): several statements appear to be missing from this listing:
 * one before the open, one argument line of the ereport(), and the
 * statements under the "Mark it" and "Register it" comments.
 * TODO: confirm against upstream.
 */
1864File
1865PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1866{
1867 File file;
1868
1869 Assert(temporary_files_allowed); /* check temp file access is up */
1870
1872
1873 /*
1874 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1875 * temp file that can be reused.
1876 */
1877 file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1878 if (file <= 0)
1879 {
1880 if (error_on_failure)
1881 ereport(ERROR,
1883 errmsg("could not create temporary file \"%s\": %m",
1884 path)));
1885 else
1886 return file;
1887 }
1888
1889 /* Mark it for temp_file_limit accounting. */
1891
1892 /* Register it for automatic close. */
1894
1895 return file;
1896}
1897
1898/*
1899 * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1900 * another backend. Files opened this way don't count against the
1901 * temp_file_limit of the caller, are automatically closed at the end of the
1902 * transaction but are not deleted on close.
1903 */
/*
 * path: name of the file to open.
 * mode: open(2) flags; PG_BINARY is OR'ed in.
 *
 * Returns the VFD index, or the failing (<= 0) value if the file does not
 * exist (ENOENT).  Any other open failure raises ERROR.
 *
 * NOTE(review): one statement before the open, one argument line of the
 * ereport(), and the statement under "Register it for automatic close"
 * appear to be missing from this listing.  TODO: confirm against upstream.
 */
1904File
1905PathNameOpenTemporaryFile(const char *path, int mode)
1906{
1907 File file;
1908
1909 Assert(temporary_files_allowed); /* check temp file access is up */
1910
1912
1913 file = PathNameOpenFile(path, mode | PG_BINARY);
1914
1915 /* If no such file, then we don't raise an error. */
1916 if (file <= 0 && errno != ENOENT)
1917 ereport(ERROR,
1919 errmsg("could not open temporary file \"%s\": %m",
1920 path)));
1921
1922 if (file > 0)
1923 {
1924 /* Register it for automatic close. */
1926 }
1927
1928 return file;
1929}
1930
1931/*
1932 * Delete a file by pathname. Return true if the file existed, false if it
1933 * didn't.
1934 */
/*
 * path: name of the file to remove.
 * error_on_failure: selects ERROR (true) or LOG (false) as the level used
 *		when unlink() fails for a reason other than ENOENT.
 *
 * false is also returned when unlink() fails.  The size obtained from
 * stat() beforehand is passed to ReportTemporaryFileUsage() after a
 * successful unlink; if stat() failed for a reason other than ENOENT, that
 * failure is logged instead.
 *
 * NOTE(review): one argument line in each of the two ereport() calls
 * appears to be missing from this listing.  TODO: confirm against upstream.
 */
1935bool
1936PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1937{
1938 struct stat filestats;
1939 int stat_errno;
1940
1941 /* Get the final size for pgstat reporting. */
1942 if (stat(path, &filestats) != 0)
1943 stat_errno = errno;
1944 else
1945 stat_errno = 0;
1946
1947 /*
1948 * Unlike FileClose's automatic file deletion code, we tolerate
1949 * non-existence to support BufFileDeleteFileSet which doesn't know how
1950 * many segments it has to delete until it runs out.
1951 */
1952 if (stat_errno == ENOENT)
1953 return false;
1954
1955 if (unlink(path) < 0)
1956 {
1957 if (errno != ENOENT)
1958 ereport(error_on_failure ? ERROR : LOG,
1960 errmsg("could not unlink temporary file \"%s\": %m",
1961 path)));
1962 return false;
1963 }
1964
1965 if (stat_errno == 0)
1966 ReportTemporaryFileUsage(path, filestats.st_size);
1967 else
1968 {
/* Put the saved stat() error back so that %m reports it. */
1969 errno = stat_errno;
1970 ereport(LOG,
1972 errmsg("could not stat file \"%s\": %m", path)));
1973 }
1974
1975 return true;
1976}
1977
1978/*
1979 * close a file when done with it
1980 */
/*
 * Steps, in order: close the kernel FD if the VFD has one (removing it from
 * the LRU ring), settle temp_file_limit accounting, unlink the file if it
 * is marked FD_DELETE_AT_CLOSE, detach it from its resource owner, and
 * return the VFD slot to the free list.  Returns nothing.
 *
 * NOTE(review): the function-name/parameter line, the first line of the
 * close-failure report, the statement that subtracts fileSize from the
 * running total, and one argument line in each ereport() call appear to be
 * missing from this listing.  TODO: confirm against upstream.
 */
1981void
1983{
1984 Vfd *vfdP;
1985
1986 Assert(FileIsValid(file));
1987
1988 DO_DB(elog(LOG, "FileClose: %d (%s)",
1989 file, VfdCache[file].fileName));
1990
1991 vfdP = &VfdCache[file];
1992
1993 if (!FileIsNotOpen(file))
1994 {
1995 pgaio_closing_fd(vfdP->fd);
1996
1997 /* close the file */
1998 if (close(vfdP->fd) != 0)
1999 {
2000 /*
2001 * We may need to panic on failure to close non-temporary files;
2002 * see LruDelete.
2003 */
2005 "could not close file \"%s\": %m", vfdP->fileName);
2006 }
2007
2008 --nfile;
2009 vfdP->fd = VFD_CLOSED;
2010
2011 /* remove the file from the lru ring */
2012 Delete(file);
2013 }
2014
2015 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2016 {
2017 /* Subtract its size from current usage (do first in case of error) */
2019 vfdP->fileSize = 0;
2020 }
2021
2022 /*
2023 * Delete the file if it was temporary, and make a log entry if wanted
2024 */
2025 if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
2026 {
2027 struct stat filestats;
2028 int stat_errno;
2029
2030 /*
2031 * If we get an error, as could happen within the ereport/elog calls,
2032 * we'll come right back here during transaction abort. Reset the
2033 * flag to ensure that we can't get into an infinite loop. This code
2034 * is arranged to ensure that the worst-case consequence is failing to
2035 * emit log message(s), not failing to attempt the unlink.
2036 */
2037 vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
2038
2039
2040 /* first try the stat() */
2041 if (stat(vfdP->fileName, &filestats))
2042 stat_errno = errno;
2043 else
2044 stat_errno = 0;
2045
2046 /* in any case do the unlink */
2047 if (unlink(vfdP->fileName))
2048 ereport(LOG,
2050 errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
2051
2052 /* and last report the stat results */
2053 if (stat_errno == 0)
2054 ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2055 else
2056 {
/* Put the saved stat() error back so that %m reports it. */
2057 errno = stat_errno;
2058 ereport(LOG,
2060 errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2061 }
2062 }
2063
2064 /* Unregister it from the resource owner */
2065 if (vfdP->resowner)
2066 ResourceOwnerForgetFile(vfdP->resowner, file);
2067
2068 /*
2069 * Return the Vfd slot to the free list
2070 */
2071 FreeVfd(file);
2072}
2073
2074/*
2075 * FilePrefetch - initiate asynchronous read of a given range of the file.
2076 *
2077 * Returns 0 on success, otherwise an errno error code (like posix_fadvise()).
2078 *
2079 * posix_fadvise() is the simplest standardized interface that accomplishes
2080 * this.
2081 */
/*
 * file: VFD to advise on; offset/amount: byte range; wait_event_info:
 * passed to pgstat_report_wait_start() around the system call.
 *
 * Three build variants: posix_fadvise(POSIX_FADV_WILLNEED) where available,
 * fcntl(F_RDADVISE) on __darwin__, and a no-op returning 0 elsewhere.
 *
 * NOTE(review): when FileAccess() fails, its negative result is returned
 * as-is, which is not an errno code as the header comment promises.  Also,
 * the statement following each advisory call (presumably the matching
 * wait-end report) appears to be missing from this listing.
 * TODO: confirm both against upstream.
 */
2082int
2083FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
2084{
2085 Assert(FileIsValid(file));
2086
2087 DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2088 file, VfdCache[file].fileName,
2089 (int64) offset, (int64) amount));
2090
2091#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2092 {
2093 int returnCode;
2094
2095 returnCode = FileAccess(file);
2096 if (returnCode < 0)
2097 return returnCode;
2098
2099retry:
2100 pgstat_report_wait_start(wait_event_info);
2101 returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2102 POSIX_FADV_WILLNEED);
2104
/* posix_fadvise() reports errors through its return value, not errno. */
2105 if (returnCode == EINTR)
2106 goto retry;
2107
2108 return returnCode;
2109 }
2110#elif defined(__darwin__)
2111 {
2112 struct radvisory
2113 {
2114 off_t ra_offset; /* offset into the file */
2115 int ra_count; /* size of the read */
2116 } ra;
2117 int returnCode;
2118
2119 returnCode = FileAccess(file);
2120 if (returnCode < 0)
2121 return returnCode;
2122
2123 ra.ra_offset = offset;
/* NOTE(review): off_t 'amount' is narrowed to int here; confirm callers stay in range. */
2124 ra.ra_count = amount;
2125 pgstat_report_wait_start(wait_event_info);
2126 returnCode = fcntl(VfdCache[file].fd, F_RDADVISE, &ra);
2128 if (returnCode != -1)
2129 return 0;
2130 else
2131 return errno;
2132 }
2133#else
2134 return 0;
2135#endif
2136}
2137
/*
 * FileWriteback - pass the byte range [offset, offset + nbytes) of 'file'
 * to pg_flush_data().
 *
 * Does nothing when nbytes <= 0, when the VFD was opened with PG_O_DIRECT,
 * or when FileAccess() fails; none of these is reported to the caller.
 *
 * NOTE(review): the last statement of the body (presumably the wait-end
 * report matching pgstat_report_wait_start) appears to be missing from this
 * listing.  TODO: confirm against upstream.
 */
2138void
2139FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
2140{
2141 int returnCode;
2142
2143 Assert(FileIsValid(file));
2144
2145 DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2146 file, VfdCache[file].fileName,
2147 (int64) offset, (int64) nbytes));
2148
2149 if (nbytes <= 0)
2150 return;
2151
2152 if (VfdCache[file].fileFlags & PG_O_DIRECT)
2153 return;
2154
2155 returnCode = FileAccess(file);
2156 if (returnCode < 0)
2157 return;
2158
2159 pgstat_report_wait_start(wait_event_info);
2160 pg_flush_data(VfdCache[file].fd, offset, nbytes);
2162}
2163
/*
 * FileReadV - vectored read from 'file' at 'offset' via pg_preadv().
 *
 * iov/iovcnt describe the destination buffers; wait_event_info is passed to
 * pgstat_report_wait_start().  Returns pg_preadv()'s result, or the
 * negative FileAccess() result if the VFD could not be made ready.  The
 * read is retried while it fails with errno == EINTR.
 *
 * NOTE(review): the statement following the pg_preadv() call and the first
 * statement of the WIN32 "default:" case appear to be missing from this
 * listing.  TODO: confirm against upstream.
 */
2164ssize_t
2165FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset,
2166 uint32 wait_event_info)
2167{
2168 ssize_t returnCode;
2169 Vfd *vfdP;
2170
2171 Assert(FileIsValid(file));
2172
2173 DO_DB(elog(LOG, "FileReadV: %d (%s) " INT64_FORMAT " %d",
2174 file, VfdCache[file].fileName,
2175 (int64) offset,
2176 iovcnt));
2177
2178 returnCode = FileAccess(file);
2179 if (returnCode < 0)
2180 return returnCode;
2181
2182 vfdP = &VfdCache[file];
2183
2184retry:
2185 pgstat_report_wait_start(wait_event_info);
2186 returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
2188
2189 if (returnCode < 0)
2190 {
2191 /*
2192 * Windows may run out of kernel buffers and return "Insufficient
2193 * system resources" error. Wait a bit and retry to solve it.
2194 *
2195 * It is rumored that EINTR is also possible on some Unix filesystems,
2196 * in which case immediate retry is indicated.
2197 */
2198#ifdef WIN32
2199 DWORD error = GetLastError();
2200
2201 switch (error)
2202 {
2203 case ERROR_NO_SYSTEM_RESOURCES:
2204 pg_usleep(1000L);
/* Masquerade as EINTR so the shared retry test below fires. */
2205 errno = EINTR;
2206 break;
2207 default:
2209 break;
2210 }
2211#endif
2212 /* OK to retry if interrupted */
2213 if (errno == EINTR)
2214 goto retry;
2215 }
2216
2217 return returnCode;
2218}
2219
/*
 * FileStartReadV (name taken from the debug message): start a vectored read
 * on 'file' at 'offset' by handing the VFD's kernel descriptor to
 * pgaio_io_start_readv() together with ioh, iovcnt and offset.
 *
 * Returns 0 once the read has been started, or the negative FileAccess()
 * result if the VFD could not be made ready.  wait_event_info is not used
 * in the visible body.
 *
 * NOTE(review): the function-name line, which also declares the leading
 * parameters (ioh, file), appears to be missing from this listing.
 * TODO: confirm against upstream.
 */
2220int
2222 int iovcnt, off_t offset,
2223 uint32 wait_event_info)
2224{
2225 int returnCode;
2226 Vfd *vfdP;
2227
2228 Assert(FileIsValid(file));
2229
2230 DO_DB(elog(LOG, "FileStartReadV: %d (%s) " INT64_FORMAT " %d",
2231 file, VfdCache[file].fileName,
2232 (int64) offset,
2233 iovcnt));
2234
2235 returnCode = FileAccess(file);
2236 if (returnCode < 0)
2237 return returnCode;
2238
2239 vfdP = &VfdCache[file];
2240
2241 pgaio_io_start_readv(ioh, vfdP->fd, iovcnt, offset);
2242
2243 return 0;
2244}
2245
/*
 * FileWriteV - vectored write to 'file' at 'offset' via pg_pwritev().
 *
 * iov/iovcnt describe the source buffers; wait_event_info is passed to
 * pgstat_report_wait_start().  Returns pg_pwritev()'s result, or the
 * negative FileAccess() result if the VFD could not be made ready.
 *
 * For VFDs flagged FD_TEMP_FILE_LIMIT, an ERROR is raised before writing if
 * the write would push total temp file usage past temp_file_limit, and
 * fileSize / temporary_files_size are advanced after a successful write.
 * errno is set to ENOSPC after every successful write (see below).  The
 * write is retried while it fails with errno == EINTR.
 *
 * NOTE(review): the statement following the pg_pwritev() call and the first
 * statement of the WIN32 "default:" case appear to be missing from this
 * listing.  TODO: confirm against upstream.
 */
2246ssize_t
2247FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset,
2248 uint32 wait_event_info)
2249{
2250 ssize_t returnCode;
2251 Vfd *vfdP;
2252
2253 Assert(FileIsValid(file));
2254
2255 DO_DB(elog(LOG, "FileWriteV: %d (%s) " INT64_FORMAT " %d",
2256 file, VfdCache[file].fileName,
2257 (int64) offset,
2258 iovcnt));
2259
2260 returnCode = FileAccess(file);
2261 if (returnCode < 0)
2262 return returnCode;
2263
2264 vfdP = &VfdCache[file];
2265
2266 /*
2267 * If enforcing temp_file_limit and it's a temp file, check to see if the
2268 * write would overrun temp_file_limit, and throw error if so. Note: it's
2269 * really a modularity violation to throw error here; we should set errno
2270 * and return -1. However, there's no way to report a suitable error
2271 * message if we do that. All current callers would just throw error
2272 * immediately anyway, so this is safe at present.
2273 */
2274 if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2275 {
/* End position of the write if every byte of every iovec is written. */
2276 off_t past_write = offset;
2277
2278 for (int i = 0; i < iovcnt; ++i)
2279 past_write += iov[i].iov_len;
2280
2281 if (past_write > vfdP->fileSize)
2282 {
2283 uint64 newTotal = temporary_files_size;
2284
2285 newTotal += past_write - vfdP->fileSize;
/* temp_file_limit is scaled by 1024 for comparison with a byte count. */
2286 if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2287 ereport(ERROR,
2288 (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2289 errmsg("temporary file size exceeds \"temp_file_limit\" (%dkB)",
2290 temp_file_limit)));
2291 }
2292 }
2293
2294retry:
2295 pgstat_report_wait_start(wait_event_info);
2296 returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
2298
2299 if (returnCode >= 0)
2300 {
2301 /*
2302 * Some callers expect short writes to set errno, and traditionally we
2303 * have assumed that they imply disk space shortage. We don't want to
2304 * waste CPU cycles adding up the total size here, so we'll just set
2305 * it for all successful writes in case such a caller determines that
2306 * the write was short and ereports "%m".
2307 */
2308 errno = ENOSPC;
2309
2310 /*
2311 * Maintain fileSize and temporary_files_size if it's a temp file.
2312 */
2313 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2314 {
/* Uses the bytes actually written, which may be fewer than requested. */
2315 off_t past_write = offset + returnCode;
2316
2317 if (past_write > vfdP->fileSize)
2318 {
2319 temporary_files_size += past_write - vfdP->fileSize;
2320 vfdP->fileSize = past_write;
2321 }
2322 }
2323 }
2324 else
2325 {
2326 /*
2327 * See comments in FileReadV()
2328 */
2329#ifdef WIN32
2330 DWORD error = GetLastError();
2331
2332 switch (error)
2333 {
2334 case ERROR_NO_SYSTEM_RESOURCES:
2335 pg_usleep(1000L);
2336 errno = EINTR;
2337 break;
2338 default:
2340 break;
2341 }
2342#endif
2343 /* OK to retry if interrupted */
2344 if (errno == EINTR)
2345 goto retry;
2346 }
2347
2348 return returnCode;
2349}
2350
2351int
2352FileSync(File file, uint32 wait_event_info)
2353{
2354 int returnCode;
2355
2356 Assert(FileIsValid(file));
2357
2358 DO_DB(elog(LOG, "FileSync: %d (%s)",
2359 file, VfdCache[file].fileName));
2360
2361 returnCode = FileAccess(file);
2362 if (returnCode < 0)
2363 return returnCode;
2364
2365 pgstat_report_wait_start(wait_event_info);
2366 returnCode = pg_fsync(VfdCache[file].fd);
2368
2369 return returnCode;
2370}
2371
2372/*
2373 * Zero a region of the file.
2374 *
2375 * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2376 * appropriate error.
2377 */
2378int
2379FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
2380{
2381 int returnCode;
2382 ssize_t written;
2383
2384 Assert(FileIsValid(file));
2385
2386 DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2387 file, VfdCache[file].fileName,
2388 (int64) offset, (int64) amount));
2389
2390 returnCode = FileAccess(file);
2391 if (returnCode < 0)
2392 return returnCode;
2393
2394 pgstat_report_wait_start(wait_event_info);
2395 written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2397
2398 if (written < 0)
2399 return -1;
2400 else if (written != amount)
2401 {
2402 /* if errno is unset, assume problem is no disk space */
2403 if (errno == 0)
2404 errno = ENOSPC;
2405 return -1;
2406 }
2407
2408 return 0;
2409}
2410
2411/*
2412 * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
2413 * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
2414 * use FileZero() instead.
2415 *
2416 * Note that at least glibc() implements posix_fallocate() in userspace if not
2417 * implemented by the filesystem. That's not the case for all environments
2418 * though.
2419 *
2420 * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2421 * appropriate error.
2422 */
2423int
2424FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
2425{
2426#ifdef HAVE_POSIX_FALLOCATE
2427 int returnCode;
2428
2429 Assert(FileIsValid(file));
2430
2431 DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2432 file, VfdCache[file].fileName,
2433 (int64) offset, (int64) amount));
2434
2435 returnCode = FileAccess(file);
2436 if (returnCode < 0)
2437 return -1;
2438
2439retry:
2440 pgstat_report_wait_start(wait_event_info);
2441 returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2443
2444 if (returnCode == 0)
2445 return 0;
2446 else if (returnCode == EINTR)
2447 goto retry;
2448
2449 /* for compatibility with %m printing etc */
2450 errno = returnCode;
2451
2452 /*
2453 * Return in cases of a "real" failure, if fallocate is not supported,
2454 * fall through to the FileZero() backed implementation.
2455 */
2456 if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
2457 return -1;
2458#endif
2459
2460 return FileZero(file, offset, amount, wait_event_info);
2461}
2462
2463off_t
2465{
2466 Assert(FileIsValid(file));
2467
2468 DO_DB(elog(LOG, "FileSize %d (%s)",
2469 file, VfdCache[file].fileName));
2470
2471 if (FileIsNotOpen(file))
2472 {
2473 if (FileAccess(file) < 0)
2474 return (off_t) -1;
2475 }
2476
2477 return lseek(VfdCache[file].fd, 0, SEEK_END);
2478}
2479
2480int
2481FileTruncate(File file, off_t offset, uint32 wait_event_info)
2482{
2483 int returnCode;
2484
2485 Assert(FileIsValid(file));
2486
2487 DO_DB(elog(LOG, "FileTruncate %d (%s)",
2488 file, VfdCache[file].fileName));
2489
2490 returnCode = FileAccess(file);
2491 if (returnCode < 0)
2492 return returnCode;
2493
2494 pgstat_report_wait_start(wait_event_info);
2495 returnCode = pg_ftruncate(VfdCache[file].fd, offset);
2497
2498 if (returnCode == 0 && VfdCache[file].fileSize > offset)
2499 {
2500 /* adjust our state for truncation of a temp file */
2501 Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2502 temporary_files_size -= VfdCache[file].fileSize - offset;
2503 VfdCache[file].fileSize = offset;
2504 }
2505
2506 return returnCode;
2507}
2508
2509/*
2510 * Return the pathname associated with an open file.
2511 *
2512 * The returned string points to an internal buffer, which is valid until
2513 * the file is closed.
2514 */
2515char *
2517{
2518 Assert(FileIsValid(file));
2519
2520 return VfdCache[file].fileName;
2521}
2522
2523/*
2524 * Return the raw file descriptor of an opened file.
2525 *
2526 * The returned file descriptor will be valid until the file is closed, but
2527 * there are a lot of things that can make that happen. So the caller should
2528 * be careful not to do much of anything else before it finishes using the
2529 * returned file descriptor.
2530 */
2531int
2533{
2534 int returnCode;
2535
2536 returnCode = FileAccess(file);
2537 if (returnCode < 0)
2538 return returnCode;
2539
2540 Assert(FileIsValid(file));
2541 return VfdCache[file].fd;
2542}
2543
2544/*
2545 * FileGetRawFlags - returns the file flags on open(2)
2546 */
2547int
2549{
2550 Assert(FileIsValid(file));
2551 return VfdCache[file].fileFlags;
2552}
2553
2554/*
2555 * FileGetRawMode - returns the mode bitmask passed to open(2)
2556 */
2557mode_t
2559{
2560 Assert(FileIsValid(file));
2561 return VfdCache[file].fileMode;
2562}
2563
2564/*
2565 * Make room for another allocatedDescs[] array entry if needed and possible.
2566 * Returns true if an array element is available.
2567 */
2568static bool
2570{
2571 AllocateDesc *newDescs;
2572 int newMax;
2573
2574 /* Quick out if array already has a free slot. */
2576 return true;
2577
2578 /*
2579 * If the array hasn't yet been created in the current process, initialize
2580 * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2581 * we will ever need, anyway. We don't want to look at max_safe_fds
2582 * immediately because set_max_safe_fds() may not have run yet.
2583 */
2584 if (allocatedDescs == NULL)
2585 {
2586 newMax = FD_MINFREE / 3;
2587 newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2588 /* Out of memory already? Treat as fatal error. */
2589 if (newDescs == NULL)
2590 ereport(ERROR,
2591 (errcode(ERRCODE_OUT_OF_MEMORY),
2592 errmsg("out of memory")));
2593 allocatedDescs = newDescs;
2594 maxAllocatedDescs = newMax;
2595 return true;
2596 }
2597
2598 /*
2599 * Consider enlarging the array beyond the initial allocation used above.
2600 * By the time this happens, max_safe_fds should be known accurately.
2601 *
2602 * We mustn't let allocated descriptors hog all the available FDs, and in
2603 * practice we'd better leave a reasonable number of FDs for VFD use. So
2604 * set the maximum to max_safe_fds / 3. (This should certainly be at
2605 * least as large as the initial size, FD_MINFREE / 3, so we aren't
2606 * tightening the restriction here.) Recall that "external" FDs are
2607 * allowed to consume another third of max_safe_fds.
2608 */
2609 newMax = max_safe_fds / 3;
2610 if (newMax > maxAllocatedDescs)
2611 {
2612 newDescs = (AllocateDesc *) realloc(allocatedDescs,
2613 newMax * sizeof(AllocateDesc));
2614 /* Treat out-of-memory as a non-fatal error. */
2615 if (newDescs == NULL)
2616 return false;
2617 allocatedDescs = newDescs;
2618 maxAllocatedDescs = newMax;
2619 return true;
2620 }
2621
2622 /* Can't enlarge allocatedDescs[] any more. */
2623 return false;
2624}
2625
2626/*
2627 * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2628 * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2629 * necessary to open the file. When done, call FreeFile rather than fclose.
2630 *
2631 * Note that files that will be open for any significant length of time
2632 * should NOT be handled this way, since they cannot share kernel file
2633 * descriptors with other files; there is grave risk of running out of FDs
2634 * if anyone locks down too many FDs. Most callers of this routine are
2635 * simply reading a config file that they will read and close immediately.
2636 *
2637 * fd.c will automatically close all files opened with AllocateFile at
2638 * transaction commit or abort; this prevents FD leakage if a routine
2639 * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2640 *
2641 * Ideally this should be the *only* direct call of fopen() in the backend.
2642 */
2643FILE *
2644AllocateFile(const char *name, const char *mode)
2645{
2646 FILE *file;
2647
2648 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2650
2651 /* Can we allocate another non-virtual FD? */
2652 if (!reserveAllocatedDesc())
2653 ereport(ERROR,
2654 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2655 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2657
2658 /* Close excess kernel FDs. */
2660
2661TryAgain:
2662 if ((file = fopen(name, mode)) != NULL)
2663 {
2665
2666 desc->kind = AllocateDescFile;
2667 desc->desc.file = file;
2670 return desc->desc.file;
2671 }
2672
2673 if (errno == EMFILE || errno == ENFILE)
2674 {
2675 int save_errno = errno;
2676
2677 ereport(LOG,
2678 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2679 errmsg("out of file descriptors: %m; release and retry")));
2680 errno = 0;
2681 if (ReleaseLruFile())
2682 goto TryAgain;
2683 errno = save_errno;
2684 }
2685
2686 return NULL;
2687}
2688
2689/*
2690 * Open a file with OpenTransientFilePerm() and pass default file mode for
2691 * the fileMode parameter.
2692 */
2693int
2694OpenTransientFile(const char *fileName, int fileFlags)
2695{
2696 return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2697}
2698
2699/*
2700 * Like AllocateFile, but returns an unbuffered fd like open(2)
2701 */
2702int
2703OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2704{
2705 int fd;
2706
2707 DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2708 numAllocatedDescs, fileName));
2709
2710 /* Can we allocate another non-virtual FD? */
2711 if (!reserveAllocatedDesc())
2712 ereport(ERROR,
2713 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2714 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2715 maxAllocatedDescs, fileName)));
2716
2717 /* Close excess kernel FDs. */
2719
2720 fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2721
2722 if (fd >= 0)
2723 {
2725
2726 desc->kind = AllocateDescRawFD;
2727 desc->desc.fd = fd;
2730
2731 return fd;
2732 }
2733
2734 return -1; /* failure */
2735}
2736
2737/*
2738 * Routines that want to initiate a pipe stream should use OpenPipeStream
2739 * rather than plain popen(). This lets fd.c deal with freeing FDs if
2740 * necessary. When done, call ClosePipeStream rather than pclose.
2741 *
2742 * This function also ensures that the popen'd program is run with default
2743 * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2744 * uses. This ensures desirable response to, eg, closing a read pipe early.
2745 */
2746FILE *
2747OpenPipeStream(const char *command, const char *mode)
2748{
2749 FILE *file;
2750 int save_errno;
2751
2752 DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2753 numAllocatedDescs, command));
2754
2755 /* Can we allocate another non-virtual FD? */
2756 if (!reserveAllocatedDesc())
2757 ereport(ERROR,
2758 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2759 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2760 maxAllocatedDescs, command)));
2761
2762 /* Close excess kernel FDs. */
2764
2765TryAgain:
2766 fflush(NULL);
2767 pqsignal(SIGPIPE, SIG_DFL);
2768 errno = 0;
2769 file = popen(command, mode);
2770 save_errno = errno;
2771 pqsignal(SIGPIPE, SIG_IGN);
2772 errno = save_errno;
2773 if (file != NULL)
2774 {
2776
2777 desc->kind = AllocateDescPipe;
2778 desc->desc.file = file;
2781 return desc->desc.file;
2782 }
2783
2784 if (errno == EMFILE || errno == ENFILE)
2785 {
2786 ereport(LOG,
2787 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2788 errmsg("out of file descriptors: %m; release and retry")));
2789 if (ReleaseLruFile())
2790 goto TryAgain;
2791 errno = save_errno;
2792 }
2793
2794 return NULL;
2795}
2796
2797/*
2798 * Free an AllocateDesc of any type.
2799 *
2800 * The argument *must* point into the allocatedDescs[] array.
2801 */
2802static int
2804{
2805 int result;
2806
2807 /* Close the underlying object */
2808 switch (desc->kind)
2809 {
2810 case AllocateDescFile:
2811 result = fclose(desc->desc.file);
2812 break;
2813 case AllocateDescPipe:
2814 result = pclose(desc->desc.file);
2815 break;
2816 case AllocateDescDir:
2817 result = closedir(desc->desc.dir);
2818 break;
2819 case AllocateDescRawFD:
2820 pgaio_closing_fd(desc->desc.fd);
2821 result = close(desc->desc.fd);
2822 break;
2823 default:
2824 elog(ERROR, "AllocateDesc kind not recognized");
2825 result = 0; /* keep compiler quiet */
2826 break;
2827 }
2828
2829 /* Compact storage in the allocatedDescs array */
2832
2833 return result;
2834}
2835
2836/*
2837 * Close a file returned by AllocateFile.
2838 *
2839 * Note we do not check fclose's return value --- it is up to the caller
2840 * to handle close errors.
2841 */
2842int
2843FreeFile(FILE *file)
2844{
2845 int i;
2846
2847 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2848
2849 /* Remove file from list of allocated files, if it's present */
2850 for (i = numAllocatedDescs; --i >= 0;)
2851 {
2852 AllocateDesc *desc = &allocatedDescs[i];
2853
2854 if (desc->kind == AllocateDescFile && desc->desc.file == file)
2855 return FreeDesc(desc);
2856 }
2857
2858 /* Only get here if someone passes us a file not in allocatedDescs */
2859 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2860
2861 return fclose(file);
2862}
2863
2864/*
2865 * Close a file returned by OpenTransientFile.
2866 *
2867 * Note we do not check close's return value --- it is up to the caller
2868 * to handle close errors.
2869 */
2870int
2872{
2873 int i;
2874
2875 DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2876
2877 /* Remove fd from list of allocated files, if it's present */
2878 for (i = numAllocatedDescs; --i >= 0;)
2879 {
2880 AllocateDesc *desc = &allocatedDescs[i];
2881
2882 if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2883 return FreeDesc(desc);
2884 }
2885
2886 /* Only get here if someone passes us a file not in allocatedDescs */
2887 elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2888
2890
2891 return close(fd);
2892}
2893
2894/*
2895 * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2896 * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2897 * necessary to open the directory, and with closing it after an elog.
2898 * When done, call FreeDir rather than closedir.
2899 *
2900 * Returns NULL, with errno set, on failure. Note that failure detection
2901 * is commonly left to the following call of ReadDir or ReadDirExtended;
2902 * see the comments for ReadDir.
2903 *
2904 * Ideally this should be the *only* direct call of opendir() in the backend.
2905 */
2906DIR *
2907AllocateDir(const char *dirname)
2908{
2909 DIR *dir;
2910
2911 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2912 numAllocatedDescs, dirname));
2913
2914 /* Can we allocate another non-virtual FD? */
2915 if (!reserveAllocatedDesc())
2916 ereport(ERROR,
2917 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2918 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2919 maxAllocatedDescs, dirname)));
2920
2921 /* Close excess kernel FDs. */
2923
2924TryAgain:
2925 if ((dir = opendir(dirname)) != NULL)
2926 {
2928
2929 desc->kind = AllocateDescDir;
2930 desc->desc.dir = dir;
2933 return desc->desc.dir;
2934 }
2935
2936 if (errno == EMFILE || errno == ENFILE)
2937 {
2938 int save_errno = errno;
2939
2940 ereport(LOG,
2941 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2942 errmsg("out of file descriptors: %m; release and retry")));
2943 errno = 0;
2944 if (ReleaseLruFile())
2945 goto TryAgain;
2946 errno = save_errno;
2947 }
2948
2949 return NULL;
2950}
2951
2952/*
2953 * Read a directory opened with AllocateDir, ereport'ing any error.
2954 *
2955 * This is easier to use than raw readdir() since it takes care of some
2956 * otherwise rather tedious and error-prone manipulation of errno. Also,
2957 * if you are happy with a generic error message for AllocateDir failure,
2958 * you can just do
2959 *
2960 * dir = AllocateDir(path);
2961 * while ((dirent = ReadDir(dir, path)) != NULL)
2962 * process dirent;
2963 * FreeDir(dir);
2964 *
2965 * since a NULL dir parameter is taken as indicating AllocateDir failed.
2966 * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2967 * use this shortcut.)
2968 *
2969 * The pathname passed to AllocateDir must be passed to this routine too,
2970 * but it is only used for error reporting.
2971 */
2972struct dirent *
2973ReadDir(DIR *dir, const char *dirname)
2974{
2975 return ReadDirExtended(dir, dirname, ERROR);
2976}
2977
2978/*
2979 * Alternate version of ReadDir that allows caller to specify the elevel
2980 * for any error report (whether it's reporting an initial failure of
2981 * AllocateDir or a subsequent directory read failure).
2982 *
2983 * If elevel < ERROR, returns NULL after any error. With the normal coding
2984 * pattern, this will result in falling out of the loop immediately as
2985 * though the directory contained no (more) entries.
2986 */
2987struct dirent *
2988ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2989{
2990 struct dirent *dent;
2991
2992 /* Give a generic message for AllocateDir failure, if caller didn't */
2993 if (dir == NULL)
2994 {
2995 ereport(elevel,
2997 errmsg("could not open directory \"%s\": %m",
2998 dirname)));
2999 return NULL;
3000 }
3001
3002 errno = 0;
3003 if ((dent = readdir(dir)) != NULL)
3004 return dent;
3005
3006 if (errno)
3007 ereport(elevel,
3009 errmsg("could not read directory \"%s\": %m",
3010 dirname)));
3011 return NULL;
3012}
3013
3014/*
3015 * Close a directory opened with AllocateDir.
3016 *
3017 * Returns closedir's return value (with errno set if it's not 0).
3018 * Note we do not check the return value --- it is up to the caller
3019 * to handle close errors if wanted.
3020 *
3021 * Does nothing if dir == NULL; we assume that directory open failure was
3022 * already reported if desired.
3023 */
3024int
3026{
3027 int i;
3028
3029 /* Nothing to do if AllocateDir failed */
3030 if (dir == NULL)
3031 return 0;
3032
3033 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
3034
3035 /* Remove dir from list of allocated dirs, if it's present */
3036 for (i = numAllocatedDescs; --i >= 0;)
3037 {
3038 AllocateDesc *desc = &allocatedDescs[i];
3039
3040 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
3041 return FreeDesc(desc);
3042 }
3043
3044 /* Only get here if someone passes us a dir not in allocatedDescs */
3045 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
3046
3047 return closedir(dir);
3048}
3049
3050
3051/*
3052 * Close a pipe stream returned by OpenPipeStream.
3053 */
3054int
3056{
3057 int i;
3058
3059 DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
3060
3061 /* Remove file from list of allocated files, if it's present */
3062 for (i = numAllocatedDescs; --i >= 0;)
3063 {
3064 AllocateDesc *desc = &allocatedDescs[i];
3065
3066 if (desc->kind == AllocateDescPipe && desc->desc.file == file)
3067 return FreeDesc(desc);
3068 }
3069
3070 /* Only get here if someone passes us a file not in allocatedDescs */
3071 elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
3072
3073 return pclose(file);
3074}
3075
3076/*
3077 * closeAllVfds
3078 *
3079 * Force all VFDs into the physically-closed state, so that the fewest
3080 * possible number of kernel file descriptors are in use. There is no
3081 * change in the logical state of the VFDs.
3082 */
3083void
3085{
3086 Index i;
3087
3088 if (SizeVfdCache > 0)
3089 {
3090 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3091 for (i = 1; i < SizeVfdCache; i++)
3092 {
3093 if (!FileIsNotOpen(i))
3094 LruDelete(i);
3095 }
3096 }
3097}
3098
3099
3100/*
3101 * SetTempTablespaces
3102 *
3103 * Define a list (actually an array) of OIDs of tablespaces to use for
3104 * temporary files. This list will be used until end of transaction,
3105 * unless this function is called again before then. It is caller's
3106 * responsibility that the passed-in array has adequate lifespan (typically
3107 * it'd be allocated in TopTransactionContext).
3108 *
3109 * Some entries of the array may be InvalidOid, indicating that the current
3110 * database's default tablespace should be used.
3111 */
3112void
3113SetTempTablespaces(Oid *tableSpaces, int numSpaces)
3114{
3115 Assert(numSpaces >= 0);
3116 tempTableSpaces = tableSpaces;
3117 numTempTableSpaces = numSpaces;
3118
3119 /*
3120 * Select a random starting point in the list. This is to minimize
3121 * conflicts between backends that are most likely sharing the same list
3122 * of temp tablespaces. Note that if we create multiple temp files in the
3123 * same transaction, we'll advance circularly through the list --- this
3124 * ensures that large temporary sort files are nicely spread across all
3125 * available tablespaces.
3126 */
3127 if (numSpaces > 1)
3129 0, numSpaces - 1);
3130 else
3132}
3133
3134/*
3135 * TempTablespacesAreSet
3136 *
3137 * Returns true if SetTempTablespaces has been called in current transaction.
3138 * (This is just so that tablespaces.c doesn't need its own per-transaction
3139 * state.)
3140 */
3141bool
3143{
3144 return (numTempTableSpaces >= 0);
3145}
3146
3147/*
3148 * GetTempTablespaces
3149 *
3150 * Populate an array with the OIDs of the tablespaces that should be used for
3151 * temporary files. (Some entries may be InvalidOid, indicating that the
3152 * current database's default tablespace should be used.) At most numSpaces
3153 * entries will be filled.
3154 * Returns the number of OIDs that were copied into the output array.
3155 */
3156int
3157GetTempTablespaces(Oid *tableSpaces, int numSpaces)
3158{
3159 int i;
3160
3162 for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
3163 tableSpaces[i] = tempTableSpaces[i];
3164
3165 return i;
3166}
3167
3168/*
3169 * GetNextTempTableSpace
3170 *
3171 * Select the next temp tablespace to use. A result of InvalidOid means
3172 * to use the current database's default tablespace.
3173 */
3174Oid
3176{
3177 if (numTempTableSpaces > 0)
3178 {
3179 /* Advance nextTempTableSpace counter with wraparound */
3183 }
3184 return InvalidOid;
3185}
3186
3187
3188/*
3189 * AtEOSubXact_Files
3190 *
3191 * Take care of subtransaction commit/abort. At abort, we close temp files
3192 * that the subtransaction may have opened. At commit, we reassign the
3193 * files that were opened to the parent subtransaction.
3194 */
3195void
3197 SubTransactionId parentSubid)
3198{
3199 Index i;
3200
3201 for (i = 0; i < numAllocatedDescs; i++)
3202 {
3203 if (allocatedDescs[i].create_subid == mySubid)
3204 {
3205 if (isCommit)
3206 allocatedDescs[i].create_subid = parentSubid;
3207 else
3208 {
3209 /* have to recheck the item after FreeDesc (ugly) */
3211 }
3212 }
3213 }
3214}
3215
3216/*
3217 * AtEOXact_Files
3218 *
3219 * This routine is called during transaction commit or abort. All still-open
3220 * per-transaction temporary file VFDs are closed, which also causes the
3221 * underlying files to be deleted (although they should've been closed already
3222 * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
3223 * closed. We also forget any transaction-local temp tablespace list.
3224 *
3225 * The isCommit flag is used only to decide whether to emit warnings about
3226 * unclosed files.
3227 */
3228void
3229AtEOXact_Files(bool isCommit)
3230{
3231 CleanupTempFiles(isCommit, false);
3232 tempTableSpaces = NULL;
3233 numTempTableSpaces = -1;
3234}
3235
3236/*
3237 * BeforeShmemExit_Files
3238 *
3239 * before_shmem_exit hook to clean up temp files during backend shutdown.
3240 * Here, we want to clean up *all* temp files including interXact ones.
3241 */
3242static void
3244{
3245 CleanupTempFiles(false, true);
3246
3247 /* prevent further temp files from being created */
3248#ifdef USE_ASSERT_CHECKING
3249 temporary_files_allowed = false;
3250#endif
3251}
3252
3253/*
3254 * Close temporary files and delete their underlying files.
3255 *
3256 * isCommit: if true, this is normal transaction commit, and we don't
3257 * expect any remaining files; warn if there are some.
3258 *
3259 * isProcExit: if true, this is being called as the backend process is
3260 * exiting. If that's the case, we should remove all temporary files; if
3261 * that's not the case, we are being called for transaction commit/abort
3262 * and should only remove transaction-local temp files. In either case,
3263 * also clean up "allocated" stdio files, dirs and fds.
3264 */
3265static void
3266CleanupTempFiles(bool isCommit, bool isProcExit)
3267{
3268 Index i;
3269
3270 /*
3271 * Careful here: at proc_exit we need extra cleanup, not just
3272 * xact_temporary files.
3273 */
3274 if (isProcExit || have_xact_temporary_files)
3275 {
3276 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3277 for (i = 1; i < SizeVfdCache; i++)
3278 {
3279 unsigned short fdstate = VfdCache[i].fdstate;
3280
3281 if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3282 VfdCache[i].fileName != NULL)
3283 {
3284 /*
3285 * If we're in the process of exiting a backend process, close
3286 * all temporary files. Otherwise, only close temporary files
3287 * local to the current transaction. They should be closed by
3288 * the ResourceOwner mechanism already, so this is just a
3289 * debugging cross-check.
3290 */
3291 if (isProcExit)
3292 FileClose(i);
3293 else if (fdstate & FD_CLOSE_AT_EOXACT)
3294 {
3295 elog(WARNING,
3296 "temporary file %s not closed at end-of-transaction",
3297 VfdCache[i].fileName);
3298 FileClose(i);
3299 }
3300 }
3301 }
3302
3304 }
3305
3306 /* Complain if any allocated files remain open at commit. */
3307 if (isCommit && numAllocatedDescs > 0)
3308 elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3310
3311 /* Clean up "allocated" stdio files, dirs and fds. */
3312 while (numAllocatedDescs > 0)
3314}
3315
3316
3317/*
3318 * Remove temporary and temporary relation files left over from a prior
3319 * postmaster session
3320 *
3321 * This should be called during postmaster startup. It will forcibly
3322 * remove any leftover files created by OpenTemporaryFile and any leftover
3323 * temporary relation files created by mdcreate.
3324 *
3325 * During post-backend-crash restart cycle, this routine is called when
3326 * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3327 * queries are using temp files could result in useless storage usage that can
3328 * only be reclaimed by a service restart. The argument against enabling it is
3329 * that someone might want to examine the temporary files for debugging
3330 * purposes. This does however mean that OpenTemporaryFile had better allow for
3331 * collision with an existing temp file name.
3332 *
3333 * NOTE: this function and its subroutines generally report syscall failures
3334 * with ereport(LOG) and keep going. Removing temp files is not so critical
3335 * that we should fail to start the database when we can't do it.
3336 */
3337void
3339{
3340 char temp_path[MAXPGPATH + sizeof(PG_TBLSPC_DIR) + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3341 DIR *spc_dir;
3342 struct dirent *spc_de;
3343
3344 /*
3345 * First process temp files in pg_default ($PGDATA/base)
3346 */
3347 snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3348 RemovePgTempFilesInDir(temp_path, true, false);
3350
3351 /*
3352 * Cycle through temp directories for all non-default tablespaces.
3353 */
3354 spc_dir = AllocateDir(PG_TBLSPC_DIR);
3355
3356 while ((spc_de = ReadDirExtended(spc_dir, PG_TBLSPC_DIR, LOG)) != NULL)
3357 {
3358 if (strcmp(spc_de->d_name, ".") == 0 ||
3359 strcmp(spc_de->d_name, "..") == 0)
3360 continue;
3361
3362 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s/%s",
3365 RemovePgTempFilesInDir(temp_path, true, false);
3366
3367 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
3369 RemovePgTempRelationFiles(temp_path);
3370 }
3371
3372 FreeDir(spc_dir);
3373
3374 /*
3375 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3376 * DataDir as well. However, that is *not* cleaned here because doing so
3377 * would create a race condition. It's done separately, earlier in
3378 * postmaster startup.
3379 */
3380}
3381
3382/*
3383 * Process one pgsql_tmp directory for RemovePgTempFiles.
3384 *
3385 * If missing_ok is true, it's all right for the named directory to not exist.
3386 * Any other problem results in a LOG message. (missing_ok should be true at
3387 * the top level, since pgsql_tmp directories are not created until needed.)
3388 *
3389 * At the top level, this should be called with unlink_all = false, so that
3390 * only files matching the temporary name prefix will be unlinked. When
3391 * recursing it will be called with unlink_all = true to unlink everything
3392 * under a top-level temporary directory.
3393 *
3394 * (These two flags could be replaced by one, but it seems clearer to keep
3395 * them separate.)
3396 */
3397void
3398RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3399{
3400 DIR *temp_dir;
3401 struct dirent *temp_de;
3402 char rm_path[MAXPGPATH * 2];
3403
3404 temp_dir = AllocateDir(tmpdirname);
3405
3406 if (temp_dir == NULL && errno == ENOENT && missing_ok)
3407 return;
3408
3409 while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3410 {
3411 if (strcmp(temp_de->d_name, ".") == 0 ||
3412 strcmp(temp_de->d_name, "..") == 0)
3413 continue;
3414
3415 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3416 tmpdirname, temp_de->d_name);
3417
3418 if (unlink_all ||
3419 strncmp(temp_de->d_name,
3421 strlen(PG_TEMP_FILE_PREFIX)) == 0)
3422 {
3423 PGFileType type = get_dirent_type(rm_path, temp_de, false, LOG);
3424
3425 if (type == PGFILETYPE_ERROR)
3426 continue;
3427 else if (type == PGFILETYPE_DIR)
3428 {
3429 /* recursively remove contents, then directory itself */
3430 RemovePgTempFilesInDir(rm_path, false, true);
3431
3432 if (rmdir(rm_path) < 0)
3433 ereport(LOG,
3435 errmsg("could not remove directory \"%s\": %m",
3436 rm_path)));
3437 }
3438 else
3439 {
3440 if (unlink(rm_path) < 0)
3441 ereport(LOG,
3443 errmsg("could not remove file \"%s\": %m",
3444 rm_path)));
3445 }
3446 }
3447 else
3448 ereport(LOG,
3449 (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3450 rm_path)));
3451 }
3452
3453 FreeDir(temp_dir);
3454}
3455
3456/* Process one tablespace directory, look for per-DB subdirectories */
3457static void
3458RemovePgTempRelationFiles(const char *tsdirname)
3459{
3460 DIR *ts_dir;
3461 struct dirent *de;
3462 char dbspace_path[MAXPGPATH * 2];
3463
3464 ts_dir = AllocateDir(tsdirname);
3465
3466 while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3467 {
3468 /*
3469 * We're only interested in the per-database directories, which have
3470 * numeric names. Note that this code will also (properly) ignore "."
3471 * and "..".
3472 */
3473 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3474 continue;
3475
3476 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3477 tsdirname, de->d_name);
3479 }
3480
3481 FreeDir(ts_dir);
3482}
3483
3484/* Process one per-dbspace directory for RemovePgTempRelationFiles */
3485static void
3486RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3487{
3488 DIR *dbspace_dir;
3489 struct dirent *de;
3490 char rm_path[MAXPGPATH * 2];
3491
3492 dbspace_dir = AllocateDir(dbspacedirname);
3493
3494 while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3495 {
3497 continue;
3498
3499 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3500 dbspacedirname, de->d_name);
3501
3502 if (unlink(rm_path) < 0)
3503 ereport(LOG,
3505 errmsg("could not remove file \"%s\": %m",
3506 rm_path)));
3507 }
3508
3509 FreeDir(dbspace_dir);
3510}
3511
/*
 * Does "name" look like the file name of a temporary relation?
 *
 * Accepted forms are t<digits>_<digits>, optionally followed by
 * _<forkname>, optionally followed by .<digits> (a segment number).
 * Returns true only if the whole string matches.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	int			pos;
	int			savepos;

	/* Must start with "t". */
	if (name[0] != 't')
		return false;

	/* Followed by a non-empty string of digits and then an underscore. */
	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (pos == 1 || name[pos] != '_')
		return false;

	/* Followed by another nonempty string of digits. */
	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (savepos == pos)
		return false;

	/* We might have _forkname or .segment or both. */
	if (name[pos] == '_')
	{
		/* presumably the length of the fork name matched; <= 0 means none */
		int			forkchar = forkname_chars(&name[pos + 1], NULL);

		if (forkchar <= 0)
			return false;
		pos += forkchar + 1;	/* skip the underscore as well */
	}
	if (name[pos] == '.')
	{
		int			segchar;

		/* segchar counts the dot plus the digits that follow it */
		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
			;
		if (segchar <= 1)
			return false;
		pos += segchar;
	}

	/* Now we should be at the end. */
	if (name[pos] != '\0')
		return false;
	return true;
}
3560
3561#ifdef HAVE_SYNCFS
3562static void
3563do_syncfs(const char *path)
3564{
3565 int fd;
3566
3567 ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
3568 path);
3569
3570 fd = OpenTransientFile(path, O_RDONLY);
3571 if (fd < 0)
3572 {
3573 ereport(LOG,
3575 errmsg("could not open file \"%s\": %m", path)));
3576 return;
3577 }
3578 if (syncfs(fd) < 0)
3579 ereport(LOG,
3581 errmsg("could not synchronize file system for file \"%s\": %m", path)));
3583}
3584#endif
3585
3586/*
3587 * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3588 * all potential filesystem, depending on recovery_init_sync_method setting.
3589 *
3590 * We fsync regular files and directories wherever they are, but we
3591 * follow symlinks only for pg_wal and immediately under pg_tblspc.
3592 * Other symlinks are presumed to point at files we're not responsible
3593 * for fsyncing, and might not have privileges to write at all.
3594 *
3595 * Errors are logged but not considered fatal; that's because this is used
3596 * only during database startup, to deal with the possibility that there are
3597 * issued-but-unsynced writes pending against the data directory. We want to
3598 * ensure that such writes reach disk before anything that's done in the new
3599 * run. However, aborting on error would result in failure to start for
3600 * harmless cases such as read-only files in the data directory, and that's
3601 * not good either.
3602 *
3603 * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3604 * rewriting all changes again during recovery.
3605 *
3606 * Note we assume we're chdir'd into PGDATA to begin with.
3607 */
3608void
3610{
3611 bool xlog_is_symlink;
3612
3613 /* We can skip this whole thing if fsync is disabled. */
3614 if (!enableFsync)
3615 return;
3616
3617 /*
3618 * If pg_wal is a symlink, we'll need to recurse into it separately,
3619 * because the first walkdir below will ignore it.
3620 */
3621 xlog_is_symlink = false;
3622
3623 {
3624 struct stat st;
3625
3626 if (lstat("pg_wal", &st) < 0)
3627 ereport(LOG,
3629 errmsg("could not stat file \"%s\": %m",
3630 "pg_wal")));
3631 else if (S_ISLNK(st.st_mode))
3632 xlog_is_symlink = true;
3633 }
3634
3635#ifdef HAVE_SYNCFS
3637 {
3638 DIR *dir;
3639 struct dirent *de;
3640
3641 /*
3642 * On Linux, we don't have to open every single file one by one. We
3643 * can use syncfs() to sync whole filesystems. We only expect
3644 * filesystem boundaries to exist where we tolerate symlinks, namely
3645 * pg_wal and the tablespaces, so we call syncfs() for each of those
3646 * directories.
3647 */
3648
3649 /* Prepare to report progress syncing the data directory via syncfs. */
3651
3652 /* Sync the top level pgdata directory. */
3653 do_syncfs(".");
3654 /* If any tablespaces are configured, sync each of those. */
3656 while ((de = ReadDirExtended(dir, PG_TBLSPC_DIR, LOG)))
3657 {
3658 char path[MAXPGPATH];
3659
3660 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3661 continue;
3662
3663 snprintf(path, MAXPGPATH, "%s/%s", PG_TBLSPC_DIR, de->d_name);
3664 do_syncfs(path);
3665 }
3666 FreeDir(dir);
3667 /* If pg_wal is a symlink, process that too. */
3668 if (xlog_is_symlink)
3669 do_syncfs("pg_wal");
3670 return;
3671 }
3672#endif /* !HAVE_SYNCFS */
3673
3674#ifdef PG_FLUSH_DATA_WORKS
3675 /* Prepare to report progress of the pre-fsync phase. */
3677
3678 /*
3679 * If possible, hint to the kernel that we're soon going to fsync the data
3680 * directory and its contents. Errors in this step are even less
3681 * interesting than normal, so log them only at DEBUG1.
3682 */
3683 walkdir(".", pre_sync_fname, false, DEBUG1);
3684 if (xlog_is_symlink)
3685 walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3686 walkdir(PG_TBLSPC_DIR, pre_sync_fname, true, DEBUG1);
3687#endif
3688
3689 /* Prepare to report progress syncing the data directory via fsync. */
3691
3692 /*
3693 * Now we do the fsync()s in the same order.
3694 *
3695 * The main call ignores symlinks, so in addition to specially processing
3696 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3697 * process_symlinks = true. Note that if there are any plain directories
3698 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3699 * so we don't worry about optimizing it.
3700 */
3701 walkdir(".", datadir_fsync_fname, false, LOG);
3702 if (xlog_is_symlink)
3703 walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3705}
3706
3707/*
3708 * walkdir: recursively walk a directory, applying the action to each
3709 * regular file and directory (including the named directory itself).
3710 *
3711 * If process_symlinks is true, the action and recursion are also applied
3712 * to regular files and directories that are pointed to by symlinks in the
3713 * given directory; otherwise symlinks are ignored. Symlinks are always
3714 * ignored in subdirectories, ie we intentionally don't pass down the
3715 * process_symlinks flag to recursive calls.
3716 *
3717 * Errors are reported at level elevel, which might be ERROR or less.
3718 *
3719 * See also walkdir in file_utils.c, which is a frontend version of this
3720 * logic.
3721 */
3722static void
3723walkdir(const char *path,
3724 void (*action) (const char *fname, bool isdir, int elevel),
3725 bool process_symlinks,
3726 int elevel)
3727{
3728 DIR *dir;
3729 struct dirent *de;
3730
3731 dir = AllocateDir(path);
3732
3733 while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3734 {
3735 char subpath[MAXPGPATH * 2];
3736
3738
3739 if (strcmp(de->d_name, ".") == 0 ||
3740 strcmp(de->d_name, "..") == 0)
3741 continue;
3742
3743 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3744
3745 switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3746 {
3747 case PGFILETYPE_REG:
3748 (*action) (subpath, false, elevel);
3749 break;
3750 case PGFILETYPE_DIR:
3751 walkdir(subpath, action, false, elevel);
3752 break;
3753 default:
3754
3755 /*
3756 * Errors are already reported directly by get_dirent_type(),
3757 * and any remaining symlinks and unknown file types are
3758 * ignored.
3759 */
3760 break;
3761 }
3762 }
3763
3764 FreeDir(dir); /* we ignore any error here */
3765
3766 /*
3767 * It's important to fsync the destination directory itself as individual
3768 * file fsyncs don't guarantee that the directory entry for the file is
3769 * synced. However, skip this if AllocateDir failed; the action function
3770 * might not be robust against that.
3771 */
3772 if (dir)
3773 (*action) (path, true, elevel);
3774}
3775
3776
3777/*
3778 * Hint to the OS that it should get ready to fsync() this file.
3779 *
3780 * Ignores errors trying to open unreadable files, and logs other errors at a
3781 * caller-specified level.
3782 */
3783#ifdef PG_FLUSH_DATA_WORKS
3784
3785static void
3786pre_sync_fname(const char *fname, bool isdir, int elevel)
3787{
3788 int fd;
3789
3790 /* Don't try to flush directories, it'll likely just fail */
3791 if (isdir)
3792 return;
3793
3794 ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
3795 fname);
3796
3797 fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);
3798
3799 if (fd < 0)
3800 {
3801 if (errno == EACCES)
3802 return;
3803 ereport(elevel,
3805 errmsg("could not open file \"%s\": %m", fname)));
3806 return;
3807 }
3808
3809 /*
3810 * pg_flush_data() ignores errors, which is ok because this is only a
3811 * hint.
3812 */
3813 pg_flush_data(fd, 0, 0);
3814
3815 if (CloseTransientFile(fd) != 0)
3816 ereport(elevel,
3818 errmsg("could not close file \"%s\": %m", fname)));
3819}
3820
3821#endif /* PG_FLUSH_DATA_WORKS */
3822
/*
 * walkdir() action used by SyncDataDirectory: report startup progress for
 * the path, then fsync it.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * We want to silently ignore errors about unreadable files.  Pass that
	 * desire on to fsync_fname_ext().
	 */
	const bool	ignore_perm = true;

	ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	/* The result is intentionally unused; failures are logged at elevel. */
	(void) fsync_fname_ext(fname, isdir, ignore_perm, elevel);
}
3835
/*
 * Remove a file or directory, tolerating the case that it is already gone.
 *
 * Directories are removed with rmdir(); any failure other than ENOENT is
 * reported at elevel.  Plain files are handed to
 * PathNameDeleteTemporaryFile() with error_on_failure = false; elevel is not
 * used on that path.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}
3852
3853/*
3854 * fsync_fname_ext -- Try to fsync a file or directory
3855 *
3856 * If ignore_perm is true, ignore errors upon trying to open unreadable
3857 * files. Logs other errors at a caller-specified level.
3858 *
3859 * Returns 0 if the operation succeeded, -1 otherwise.
3860 */
3861int
3862fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3863{
3864 int fd;
3865 int flags;
3866 int returncode;
3867
3868 /*
3869 * Some OSs require directories to be opened read-only whereas other
3870 * systems don't allow us to fsync files opened read-only; so we need both
3871 * cases here. Using O_RDWR will cause us to fail to fsync files that are
3872 * not writable by our userid, but we assume that's OK.
3873 */
3874 flags = PG_BINARY;
3875 if (!isdir)
3876 flags |= O_RDWR;
3877 else
3878 flags |= O_RDONLY;
3879
3880 fd = OpenTransientFile(fname, flags);
3881
3882 /*
3883 * Some OSs don't allow us to open directories at all (Windows returns
3884 * EACCES), just ignore the error in that case. If desired also silently
3885 * ignoring errors about unreadable files. Log others.
3886 */
3887 if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3888 return 0;
3889 else if (fd < 0 && ignore_perm && errno == EACCES)
3890 return 0;
3891 else if (fd < 0)
3892 {
3893 ereport(elevel,
3895 errmsg("could not open file \"%s\": %m", fname)));
3896 return -1;
3897 }
3898
3899 returncode = pg_fsync(fd);
3900
3901 /*
3902 * Some OSes don't allow us to fsync directories at all, so we can ignore
3903 * those errors. Anything else needs to be logged.
3904 */
3905 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3906 {
3907 int save_errno;
3908
3909 /* close file upon error, might not be in transaction context */
3910 save_errno = errno;
3911 (void) CloseTransientFile(fd);
3912 errno = save_errno;
3913
3914 ereport(elevel,
3916 errmsg("could not fsync file \"%s\": %m", fname)));
3917 return -1;
3918 }
3919
3920 if (CloseTransientFile(fd) != 0)
3921 {
3922 ereport(elevel,
3924 errmsg("could not close file \"%s\": %m", fname)));
3925 return -1;
3926 }
3927
3928 return 0;
3929}
3930
3931/*
3932 * fsync_parent_path -- fsync the parent path of a file or directory
3933 *
3934 * This is aimed at making file operations persistent on disk in case of
3935 * an OS crash or power failure.
3936 */
3937static int
3938fsync_parent_path(const char *fname, int elevel)
3939{
3940 char parentpath[MAXPGPATH];
3941
3942 strlcpy(parentpath, fname, MAXPGPATH);
3943 get_parent_directory(parentpath);
3944
3945 /*
3946 * get_parent_directory() returns an empty string if the input argument is
3947 * just a file name (see comments in path.c), so handle that as being the
3948 * current directory.
3949 */
3950 if (strlen(parentpath) == 0)
3951 strlcpy(parentpath, ".", MAXPGPATH);
3952
3953 if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3954 return -1;
3955
3956 return 0;
3957}
3958
3959/*
3960 * Create a PostgreSQL data sub-directory
3961 *
3962 * The data directory itself, and most of its sub-directories, are created at
3963 * initdb time, but we do have some occasions when we create directories in
3964 * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3965 * make sure that those directories are created consistently. Today, that means
3966 * making sure that the created directory has the correct permissions, which is
3967 * what pg_dir_create_mode tracks for us.
3968 *
3969 * Note that we also set the umask() based on what we understand the correct
3970 * permissions to be (see file_perm.c).
3971 *
3972 * For permissions other than the default, mkdir() can be used directly, but
3973 * be sure to consider carefully such cases -- a sub-directory with incorrect
3974 * permissions in a PostgreSQL data directory could cause backups and other
3975 * processes to fail.
3976 */
3977int
3978MakePGDirectory(const char *directoryName)
3979{
3980 return mkdir(directoryName, pg_dir_create_mode);
3981}
3982
3983/*
3984 * Return the passed-in error level, or PANIC if data_sync_retry is off.
3985 *
3986 * Failure to fsync any data file is cause for immediate panic, unless
3987 * data_sync_retry is enabled. Data may have been written to the operating
3988 * system and removed from our buffer pool already, and if we are running on
3989 * an operating system that forgets dirty data on write-back failure, there
3990 * may be only one copy of the data remaining: in the WAL. A later attempt to
3991 * fsync again might falsely report success. Therefore we must not allow any
3992 * further checkpoints to be attempted. data_sync_retry can in theory be
3993 * enabled on systems known not to drop dirty buffered data on write-back
3994 * failure (with the likely outcome that checkpoints will continue to fail
3995 * until the underlying problem is fixed).
3996 *
3997 * Any code that reports a failure from fsync() or related functions should
3998 * filter the error level with this function.
3999 */
4000int
4002{
4003 return data_sync_retry ? elevel : PANIC;
4004}
4005
4006bool
4008{
4009 bool result = true;
4010 int flags;
4011
4012#if PG_O_DIRECT == 0
4013 if (strcmp(*newval, "") != 0)
4014 {
4015 GUC_check_errdetail("\"%s\" is not supported on this platform.",
4016 "debug_io_direct");
4017 result = false;
4018 }
4019 flags = 0;
4020#else
4021 List *elemlist;
4022 ListCell *l;
4023 char *rawstring;
4024
4025 /* Need a modifiable copy of string */
4026 rawstring = pstrdup(*newval);
4027
4028 if (!SplitGUCList(rawstring, ',', &elemlist))
4029 {
4030 GUC_check_errdetail("Invalid list syntax in parameter \"%s\".",
4031 "debug_io_direct");
4032 pfree(rawstring);
4033 list_free(elemlist);
4034 return false;
4035 }
4036
4037 flags = 0;
4038 foreach(l, elemlist)
4039 {
4040 char *item = (char *) lfirst(l);
4041
4042 if (pg_strcasecmp(item, "data") == 0)
4043 flags |= IO_DIRECT_DATA;
4044 else if (pg_strcasecmp(item, "wal") == 0)
4045 flags |= IO_DIRECT_WAL;
4046 else if (pg_strcasecmp(item, "wal_init") == 0)
4047 flags |= IO_DIRECT_WAL_INIT;
4048 else
4049 {
4050 GUC_check_errdetail("Invalid option \"%s\".", item);
4051 result = false;
4052 break;
4053 }
4054 }
4055
4056 /*
4057 * It's possible to configure block sizes smaller than our assumed I/O
4058 * alignment size, which could result in invalid I/O requests.
4059 */
4060#if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
4061 if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
4062 {
4063 GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small.",
4064 "debug_io_direct", "XLOG_BLCKSZ");
4065 result = false;
4066 }
4067#endif
4068#if BLCKSZ < PG_IO_ALIGN_SIZE
4069 if (result && (flags & IO_DIRECT_DATA))
4070 {
4071 GUC_check_errdetail("\"%s\" is not supported for data because %s is too small.",
4072 "debug_io_direct", "BLCKSZ");
4073 result = false;
4074 }
4075#endif
4076
4077 pfree(rawstring);
4078 list_free(elemlist);
4079#endif
4080
4081 if (!result)
4082 return result;
4083
4084 /* Save the flags in *extra, for use by assign_debug_io_direct */
4085 *extra = guc_malloc(LOG, sizeof(int));
4086 if (!*extra)
4087 return false;
4088 *((int *) *extra) = flags;
4089
4090 return result;
4091}
4092
4093void
4094assign_debug_io_direct(const char *newval, void *extra)
4095{
4096 int *flags = (int *) extra;
4097
4098 io_direct_flags = *flags;
4099}
4100
4101/* ResourceOwner callbacks */
4102
4103static void
4105{
4106 File file = (File) DatumGetInt32(res);
4107 Vfd *vfdP;
4108
4109 Assert(FileIsValid(file));
4110
4111 vfdP = &VfdCache[file];
4112 vfdP->resowner = NULL;
4113
4114 FileClose(file);
4115}
4116
4117static char *
4119{
4120 return psprintf("File %d", DatumGetInt32(res));
4121}
void pgaio_closing_fd(int fd)
Definition: aio.c:1117
void pgaio_io_start_readv(PgAioHandle *ioh, int fd, int iovcnt, uint64 offset)
Definition: aio_io.c:78
void begin_startup_progress_phase(void)
Definition: startup.c:347
#define Min(x, y)
Definition: c.h:975
uint32 SubTransactionId
Definition: c.h:627
#define INT64_FORMAT
Definition: c.h:520
int64_t int64
Definition: c.h:499
#define PG_BINARY
Definition: c.h:1244
uint64_t uint64
Definition: c.h:503
uint32_t uint32
Definition: c.h:502
unsigned int Index
Definition: c.h:585
#define MemSet(start, val, len)
Definition: c.h:991
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:909
int fdatasync(int fildes)
#define OidIsValid(objectId)
Definition: c.h:746
size_t Size
Definition: c.h:576
int closedir(DIR *)
Definition: dirent.c:127
struct dirent * readdir(DIR *)
Definition: dirent.c:78
DIR * opendir(const char *)
Definition: dirent.c:33
int errcode_for_file_access(void)
Definition: elog.c:877
int errdetail(const char *fmt,...)
Definition: elog.c:1204
int errcode(int sqlerrcode)
Definition: elog.c:854
int errmsg(const char *fmt,...)
Definition: elog.c:1071
#define LOG
Definition: elog.h:31
#define FATAL
Definition: elog.h:41
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:149
static int pg_ftruncate(int fd, off_t length)
Definition: fd.c:703
int max_files_per_process
Definition: fd.c:146
void pg_flush_data(int fd, off_t offset, off_t nbytes)
Definition: fd.c:525
int FileGetRawDesc(File file)
Definition: fd.c:2532
int MakePGDirectory(const char *directoryName)
Definition: fd.c:3978
int FreeDir(DIR *dir)
Definition: fd.c:3025
int recovery_init_sync_method
Definition: fd.c:165
static const ResourceOwnerDesc file_resowner_desc
Definition: fd.c:361
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:2139
int pg_fsync_no_writethrough(int fd)
Definition: fd.c:441
#define FD_MINFREE
Definition: fd.c:138
FILE * OpenPipeStream(const char *command, const char *mode)
Definition: fd.c:2747
static int numTempTableSpaces
Definition: fd.c:289
static bool ReleaseLruFile(void)
Definition: fd.c:1386
int io_direct_flags
Definition: fd.c:168
#define FD_DELETE_AT_CLOSE
Definition: fd.c:192
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1111
static int maxAllocatedDescs
Definition: fd.c:268
static void Delete(File file)
Definition: fd.c:1270
static int FreeDesc(AllocateDesc *desc)
Definition: fd.c:2803
static long tempFileCounter
Definition: fd.c:280
static char * ResOwnerPrintFile(Datum res)
Definition: fd.c:4118
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:782
char * FilePathName(File file)
Definition: fd.c:2516
static void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition: fd.c:377
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:3157
static int numAllocatedDescs
Definition: fd.c:267
File PathNameOpenTemporaryFile(const char *path, int mode)
Definition: fd.c:1905
static void LruDelete(File file)
Definition: fd.c:1289
int pg_fdatasync(int fd)
Definition: fd.c:480
#define FileIsValid(file)
Definition: fd.c:186
void assign_debug_io_direct(const char *newval, void *extra)
Definition: fd.c:4094
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2352
static int nfile
Definition: fd.c:222
int CloseTransientFile(int fd)
Definition: fd.c:2871
#define DO_DB(A)
Definition: fd.c:180
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1089
void closeAllVfds(void)
Definition: fd.c:3084
int max_safe_fds
Definition: fd.c:159
static File AllocateVfd(void)
Definition: fd.c:1418
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1865
void PathNameDeleteTemporaryDir(const char *dirname)
Definition: fd.c:1695
int ClosePipeStream(FILE *file)
Definition: fd.c:3055
void AtEOXact_Files(bool isCommit)
Definition: fd.c:3229
int FileGetRawFlags(File file)
Definition: fd.c:2548
static Size SizeVfdCache
Definition: fd.c:217
static int nextTempTableSpace
Definition: fd.c:290
#define FD_CLOSE_AT_EOXACT
Definition: fd.c:193
int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition: fd.c:3862
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3837
static void ResOwnerReleaseFile(Datum res)
Definition: fd.c:4104
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition: fd.c:3458
int FreeFile(FILE *file)
Definition: fd.c:2843
mode_t FileGetRawMode(File file)
Definition: fd.c:2558
static AllocateDesc * allocatedDescs
Definition: fd.c:269
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2988
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition: fd.c:964
static int FileAccess(File file)
Definition: fd.c:1496
static void FreeVfd(File file)
Definition: fd.c:1476
struct vfd Vfd
int pg_fsync_writethrough(int fd)
Definition: fd.c:461
void FileClose(File file)
Definition: fd.c:1982
int FileStartReadV(PgAioHandle *ioh, File file, int iovcnt, off_t offset, uint32 wait_event_info)
Definition: fd.c:2221
void ReleaseExternalFD(void)
Definition: fd.c:1241
#define FD_TEMP_FILE_LIMIT
Definition: fd.c:194
void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition: fd.c:3398
bool pg_file_exists(const char *name)
Definition: fd.c:503
void RemovePgTempFiles(void)
Definition: fd.c:3338
#define FileIsNotOpen(file)
Definition: fd.c:189
bool TempTablespacesAreSet(void)
Definition: fd.c:3142
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:756
int FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2424
int FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2083
int data_sync_elevel(int elevel)
Definition: fd.c:4001
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1579
static void Insert(File file)
Definition: fd.c:1317
AllocateDescKind
Definition: fd.c:248
@ AllocateDescDir
Definition: fd.c:251
@ AllocateDescPipe
Definition: fd.c:250
@ AllocateDescFile
Definition: fd.c:249
@ AllocateDescRawFD
Definition: fd.c:252
Oid GetNextTempTableSpace(void)
Definition: fd.c:3175
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1592
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3824
static void ReportTemporaryFileUsage(const char *path, off_t size)
Definition: fd.c:1532
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition: fd.c:1808
bool AcquireExternalFD(void)
Definition: fd.c:1188
static void RegisterTemporaryFile(File file)
Definition: fd.c:1551
#define NUM_RESERVED_FDS
Definition: fd.c:129
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2907
static Oid * tempTableSpaces
Definition: fd.c:288
static bool reserveAllocatedDesc(void)
Definition: fd.c:2569
void InitFileAccess(void)
Definition: fd.c:903
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition: fd.c:3486
File OpenTemporaryFile(bool interXact)
Definition: fd.c:1728
int durable_unlink(const char *fname, int elevel)
Definition: fd.c:872
static uint64 temporary_files_size
Definition: fd.c:236
void ReserveExternalFD(void)
Definition: fd.c:1223
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2973
bool looks_like_temp_rel_name(const char *name)
Definition: fd.c:3514
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1936
void set_max_safe_fds(void)
Definition: fd.c:1044
int pg_fsync(int fd)
Definition: fd.c:386
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition: fd.c:3266
#define VFD_CLOSED
Definition: fd.c:184
static bool have_xact_temporary_files
Definition: fd.c:228
static int LruInsert(File file)
Definition: fd.c:1339
static int numExternalFDs
Definition: fd.c:274
static int fsync_parent_path(const char *fname, int elevel)
Definition: fd.c:3938
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition: fd.c:1664
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2644
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition: fd.c:3196
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2694
void InitTemporaryFileAccess(void)
Definition: fd.c:933
static Vfd * VfdCache
Definition: fd.c:216
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:2703
bool data_sync_retry
Definition: fd.c:162
static void ReleaseLruFiles(void)
Definition: fd.c:1408
ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info)
Definition: fd.c:2247
void SyncDataDirectory(void)
Definition: fd.c:3609
int FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2379
off_t FileSize(File file)
Definition: fd.c:2464
ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info)
Definition: fd.c:2165
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2481
bool check_debug_io_direct(char **newval, void **extra, GucSource source)
Definition: fd.c:4007
static void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition: fd.c:372
static void BeforeShmemExit_Files(int code, Datum arg)
Definition: fd.c:3243
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition: fd.c:3723
int pg_truncate(const char *path, off_t length)
Definition: fd.c:720
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:3113
void TempTablespacePath(char *path, Oid tablespace)
Definition: fd.c:1783
#define IO_DIRECT_WAL
Definition: fd.h:55
#define IO_DIRECT_DATA
Definition: fd.h:54
#define IO_DIRECT_WAL_INIT
Definition: fd.h:56
int File
Definition: fd.h:51
#define PG_O_DIRECT
Definition: fd.h:97
int pg_file_create_mode
Definition: file_perm.c:19
int pg_dir_create_mode
Definition: file_perm.c:18
ssize_t pg_pwrite_zeros(int fd, size_t size, off_t offset)
Definition: file_utils.c:709
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:547
#define PG_TEMP_FILES_DIR
Definition: file_utils.h:63
#define PG_TEMP_FILE_PREFIX
Definition: file_utils.h:64
PGFileType
Definition: file_utils.h:19
@ PGFILETYPE_DIR
Definition: file_utils.h:23
@ PGFILETYPE_REG
Definition: file_utils.h:22
@ PGFILETYPE_ERROR
Definition: file_utils.h:20
@ DATA_DIR_SYNC_METHOD_SYNCFS
Definition: file_utils.h:30
@ DATA_DIR_SYNC_METHOD_FSYNC
Definition: file_utils.h:29
int MyProcPid
Definition: globals.c:48
bool enableFsync
Definition: globals.c:130
Oid MyDatabaseTableSpace
Definition: globals.c:97
void * guc_malloc(int elevel, size_t size)
Definition: guc.c:638
#define newval
#define GUC_check_errdetail
Definition: guc.h:481
GucSource
Definition: guc.h:112
int temp_file_limit
Definition: guc_tables.c:550
int log_temp_files
Definition: guc_tables.c:545
Assert(PointerIsAligned(start, uint64))
#define realloc(a, b)
Definition: header.h:60
#define free(a)
Definition: header.h:65
#define malloc(a)
Definition: header.h:50
#define close(a)
Definition: win32.h:12
void before_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:337
int j
Definition: isn.c:78
int i
Definition: isn.c:77
void list_free(List *list)
Definition: list.c:1546
Datum subpath(PG_FUNCTION_ARGS)
Definition: ltree_op.c:311
char * pstrdup(const char *in)
Definition: mcxt.c:2322
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:2167
void pfree(void *pointer)
Definition: mcxt.c:2147
void * palloc(Size size)
Definition: mcxt.c:1940
#define MAP_FAILED
Definition: mem.h:45
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:123
void * arg
static char * basedir
static PgChecksumMode mode
Definition: pg_checksums.c:55
#define MAXPGPATH
static ssize_t pg_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
Definition: pg_iovec.h:87
static ssize_t pg_preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset)
Definition: pg_iovec.h:48
#define lfirst(lc)
Definition: pg_list.h:172
uint64 pg_prng_uint64_range(pg_prng_state *state, uint64 rmin, uint64 rmax)
Definition: pg_prng.c:144
pg_prng_state pg_global_prng_state
Definition: pg_prng.c:34
static rewind_source * source
Definition: pg_rewind.c:89
static char * buf
Definition: pg_test_fsync.c:72
static char * tablespace
Definition: pgbench.c:217
void pgstat_report_tempfile(size_t filesize)
#define pqsignal
Definition: port.h:531
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
void get_parent_directory(char *path)
Definition: path.c:1068
#define snprintf
Definition: port.h:239
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
uintptr_t Datum
Definition: postgres.h:69
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:217
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:207
#define InvalidOid
Definition: postgres_ext.h:35
unsigned int Oid
Definition: postgres_ext.h:30
static int fd(const char *x, int i)
Definition: preproc-init.c:105
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
int forkname_chars(const char *str, ForkNumber *fork)
Definition: relpath.c:81
#define PG_TBLSPC_DIR
Definition: relpath.h:41
#define TABLESPACE_VERSION_DIRECTORY
Definition: relpath.h:33
ResourceOwner CurrentResourceOwner
Definition: resowner.c:173
void ResourceOwnerForget(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition: resowner.c:564
void ResourceOwnerRemember(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition: resowner.c:524
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition: resowner.c:452
@ RESOURCE_RELEASE_AFTER_LOCKS
Definition: resowner.h:56
#define RELEASE_PRIO_FILES
Definition: resowner.h:76
void pg_usleep(long microsec)
Definition: signal.c:53
static void error(void)
Definition: sql-dyntest.c:147
#define ereport_startup_progress(msg,...)
Definition: startup.h:18
SubTransactionId create_subid
Definition: fd.c:258
DIR * dir
Definition: fd.c:262
FILE * file
Definition: fd.c:261
int fd
Definition: fd.c:263
union AllocateDesc::@20 desc
AllocateDescKind kind
Definition: fd.c:257
Definition: dirent.c:26
Definition: pg_list.h:54
const char * name
Definition: resowner.h:93
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
__int64 st_size
Definition: win32_port.h:263
unsigned short st_mode
Definition: win32_port.h:258
Definition: fd.c:197
int fd
Definition: fd.c:198
int fileFlags
Definition: fd.c:207
File lruLessRecently
Definition: fd.c:203
File lruMoreRecently
Definition: fd.c:202
char * fileName
Definition: fd.c:205
ResourceOwner resowner
Definition: fd.c:200
unsigned short fdstate
Definition: fd.c:199
File nextFree
Definition: fd.c:201
mode_t fileMode
Definition: fd.c:208
off_t fileSize
Definition: fd.c:204
bool SplitGUCList(char *rawstring, char separator, List **namelist)
Definition: varlena.c:3773
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:85
static void pgstat_report_wait_end(void)
Definition: wait_event.h:101
const char * type
const char * name
#define fsync(fd)
Definition: win32_port.h:83
#define stat
Definition: win32_port.h:274
#define EINTR
Definition: win32_port.h:364
#define EOPNOTSUPP
Definition: win32_port.h:388
#define SIGPIPE
Definition: win32_port.h:163
#define lstat(path, sb)
Definition: win32_port.h:275
#define S_ISDIR(m)
Definition: win32_port.h:315
void _dosmaperr(unsigned long)
Definition: win32error.c:177
#define S_ISLNK(m)
Definition: win32_port.h:334
#define mkdir(a, b)
Definition: win32_port.h:80
#define fstat
Definition: win32_port.h:273
#define O_CLOEXEC
Definition: win32_port.h:349
#define O_DSYNC
Definition: win32_port.h:342
SubTransactionId GetCurrentSubTransactionId(void)
Definition: xact.c:791
int wal_sync_method
Definition: xlog.c:130
@ WAL_SYNC_METHOD_FSYNC_WRITETHROUGH
Definition: xlog.h:27
static const char * directory
Definition: zic.c:634