PostgreSQL Source Code git master
Loading...
Searching...
No Matches
fd.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * fd.c
4 * Virtual file descriptor code.
5 *
6 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * IDENTIFICATION
10 * src/backend/storage/file/fd.c
11 *
12 * NOTES:
13 *
14 * This code manages a cache of 'virtual' file descriptors (VFDs).
15 * The server opens many file descriptors for a variety of reasons,
16 * including base tables, scratch files (e.g., sort and hash spool
17 * files), and random calls to C library routines like system(3); it
18 * is quite easy to exceed system limits on the number of open files a
19 * single process can have. (This is around 1024 on many modern
20 * operating systems, but may be lower on others.)
21 *
22 * VFDs are managed as an LRU pool, with actual OS file descriptors
23 * being opened and closed as needed. Obviously, if a file is
24 * opened using these interfaces, all subsequent operations must also
25 * be through these interfaces (the File type is not a real file
26 * descriptor).
27 *
28 * For this scheme to work, most (if not all) routines throughout the
29 * server should use these interfaces instead of calling the C library
30 * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 * may find ourselves short of real file descriptors anyway.
32 *
33 * INTERFACE ROUTINES
34 *
35 * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 * A File opened with OpenTemporaryFile is automatically deleted when the
37 * File is closed, either explicitly or implicitly at end of transaction or
38 * process exit. PathNameOpenFile is intended for files that are held open
39 * for a long time, like relation files. It is the caller's responsibility
40 * to close them; there is no automatic mechanism in fd.c for that.
41 *
42 * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43 * temporary files that have names so that they can be shared between
44 * backends. Such files are automatically closed and count against the
45 * temporary file limit of the backend that creates them, but unlike anonymous
46 * files they are not automatically deleted. See sharedfileset.c for a shared
47 * ownership mechanism that provides automatic cleanup for shared files when
48 * the last of a group of backends detaches.
49 *
50 * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51 * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52 * They behave like the corresponding native functions, except that the handle
53 * is registered with the current subtransaction, and will be automatically
54 * closed at abort. These are intended mainly for short operations like
55 * reading a configuration file; there is a limit on the number of files that
56 * can be opened using these functions at any one time.
57 *
58 * Finally, BasicOpenFile is just a thin wrapper around open() that can
59 * release file descriptors in use by the virtual file descriptors if
60 * necessary. There is no automatic cleanup of file descriptors returned by
61 * BasicOpenFile; it is solely the caller's responsibility to close the file
62 * descriptor by calling close(2).
63 *
64 * If a non-virtual file descriptor needs to be held open for any length of
65 * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66 * (and eventually ReleaseExternalFD), so that we can take it into account
67 * while deciding how many VFDs can be open. This applies to FDs obtained
68 * with BasicOpenFile as well as those obtained without use of any fd.c API.
69 *
70 *-------------------------------------------------------------------------
71 */
72
73#include "postgres.h"
74
75#include <dirent.h>
76#include <sys/file.h>
77#include <sys/param.h>
78#include <sys/resource.h> /* for getrlimit */
79#include <sys/stat.h>
80#include <sys/types.h>
81#ifndef WIN32
82#include <sys/mman.h>
83#endif
84#include <limits.h>
85#include <unistd.h>
86#include <fcntl.h>
87
88#include "access/xact.h"
89#include "access/xlog.h"
91#include "common/file_perm.h"
92#include "common/file_utils.h"
93#include "common/pg_prng.h"
94#include "miscadmin.h"
95#include "pgstat.h"
96#include "postmaster/startup.h"
97#include "storage/aio.h"
98#include "storage/fd.h"
99#include "storage/ipc.h"
100#include "utils/guc.h"
101#include "utils/guc_hooks.h"
102#include "utils/resowner.h"
103#include "utils/varlena.h"
104
105/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
106#if defined(HAVE_SYNC_FILE_RANGE)
107#define PG_FLUSH_DATA_WORKS 1
108#elif !defined(WIN32) && defined(MS_ASYNC)
109#define PG_FLUSH_DATA_WORKS 1
110#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
111#define PG_FLUSH_DATA_WORKS 1
112#endif
113
114/*
115 * We must leave some file descriptors free for system(), the dynamic loader,
116 * and other code that tries to open files without consulting fd.c. This
117 * is the number left free. (While we try fairly hard to prevent EMFILE
118 * errors, there's never any guarantee that we won't get ENFILE due to
119 * other processes chewing up FDs. So it's a bad idea to try to open files
120 * without consulting fd.c. Nonetheless we cannot control all code.)
121 *
122 * Because this is just a fixed setting, we are effectively assuming that
123 * no such code will leave FDs open over the long term; otherwise the slop
124 * is likely to be insufficient. Note in particular that we expect that
125 * loading a shared library does not result in any permanent increase in
126 * the number of open files. (This appears to be true on most if not
127 * all platforms as of Feb 2004.)
128 */
129#define NUM_RESERVED_FDS 10
130
131/*
132 * If we have fewer than this many usable FDs after allowing for the reserved
133 * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
134 * much less than that. Note that this value ensures numExternalFDs can be
135 * at least 16; as of this writing, the contrib/postgres_fdw regression tests
136 * will not pass unless that can grow to at least 14.)
137 */
138#define FD_MINFREE 48
139
140/*
141 * A number of platforms allow individual processes to open many more files
142 * than they can really support when *many* processes do the same thing.
143 * This GUC parameter lets the DBA limit max_safe_fds to something less than
144 * what the postmaster's initial probe suggests will work.
145 */
147
148/*
149 * Maximum number of file descriptors to open for operations that fd.c knows
150 * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
151 * to a conservative value, and remains that way indefinitely in bootstrap or
152 * standalone-backend cases. In normal postmaster operation, the postmaster
153 * calls set_max_safe_fds() late in initialization to update the value, and
154 * that value is then inherited by forked subprocesses.
155 *
156 * Note: the value of max_files_per_process is taken into account while
157 * setting this variable, and so need not be tested separately.
158 */
159int max_safe_fds = FD_MINFREE; /* default if not changed */
160
161/* Whether it is safe to continue running after fsync() fails. */
162bool data_sync_retry = false;
163
164/* How SyncDataDirectory() should do its job. */
166
167/* How data files should be bulk-extended with zeros. */
169
170/* Which kinds of files should be opened with PG_O_DIRECT. */
172
173/* Debugging.... */
174
/*
 * DO_DB(A): debugging hook.  When FDDEBUG is defined, statement A is executed
 * with errno saved beforehand and restored afterwards, so the debug action
 * cannot disturb the errno value seen by the surrounding code.  When FDDEBUG
 * is not defined the macro expands to a no-op expression and A is not
 * evaluated at all.
 */
175#ifdef FDDEBUG
176#define DO_DB(A) \
177 do { \
178 int _do_db_save_errno = errno; \
179 A; \
180 errno = _do_db_save_errno; \
181 } while (0)
182#else
183#define DO_DB(A) \
184 ((void) 0)
185#endif
186
/* Value compared against a Vfd's fd field to mean "no kernel FD assigned". */
187#define VFD_CLOSED (-1)
188
/*
 * FileIsValid(file): true when 'file' is a positive index below SizeVfdCache
 * and the VfdCache slot it names has a non-NULL fileName.  Index 0 is
 * rejected by the "> 0" test.  Note that 'file' is evaluated more than once.
 */
189#define FileIsValid(file) \
190 ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
191
/*
 * FileIsNotOpen(file): true when the slot's fd field equals VFD_CLOSED.
 * No bounds check is done here; 'file' is used directly as an array index.
 */
192#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
193
194/* these are the assigned bits in fdstate below: */
195#define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
196#define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
197#define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
198
199typedef struct vfd
200{
201 int fd; /* current FD, or VFD_CLOSED if none */
202 unsigned short fdstate; /* bitflags for VFD's state */
203 ResourceOwner resowner; /* owner, for automatic cleanup */
204 File nextFree; /* link to next free VFD, if in freelist */
205 File lruMoreRecently; /* doubly linked recency-of-use list */
207 pgoff_t fileSize; /* current size of file (0 if not temporary) */
208 char *fileName; /* name of file, or NULL for unused VFD */
209 /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
210 int fileFlags; /* open(2) flags for (re)opening the file */
211 mode_t fileMode; /* mode to pass to open(2) */
213
214/*
215 * Virtual File Descriptor array pointer and size. This grows as
216 * needed. 'File' values are indexes into this array.
217 * Note that VfdCache[0] is not a usable VFD, just a list header.
218 */
219static Vfd *VfdCache;
221
222/*
223 * Number of file descriptors known to be in use by VFD entries.
224 */
225static int nfile = 0;
226
227/*
228 * Flag to tell whether it's worth scanning VfdCache looking for temp files
229 * to close
230 */
231static bool have_xact_temporary_files = false;
232
233/*
234 * Tracks the total size of all temporary files. Note: when temp_file_limit
235 * is being enforced, this cannot overflow since the limit cannot be more
236 * than INT_MAX kilobytes. When not enforcing, it could theoretically
237 * overflow, but we don't care.
238 */
240
241/* Temporary file access initialized and not yet shut down? */
242#ifdef USE_ASSERT_CHECKING
243static bool temporary_files_allowed = false;
244#endif
245
246/*
247 * List of OS handles opened with AllocateFile, AllocateDir and
248 * OpenTransientFile.
249 */
257
258typedef struct
259{
262 union
263 {
266 int fd;
267 } desc;
269
270static int numAllocatedDescs = 0;
271static int maxAllocatedDescs = 0;
273
274/*
275 * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
276 */
277static int numExternalFDs = 0;
278
279/*
280 * Number of temporary files opened during the current session;
281 * this is used in generation of tempfile names.
282 */
283static long tempFileCounter = 0;
284
285/*
286 * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
287 * indicating that the current database's default tablespace should be used.)
288 * When numTempTableSpaces is -1, this has not been set in the current
289 * transaction.
290 */
292static int numTempTableSpaces = -1;
293static int nextTempTableSpace = 0;
294
295
296/*--------------------
297 *
298 * Private Routines
299 *
300 * Delete - delete a file from the Lru ring
301 * LruDelete - remove a file from the Lru ring and close its FD
302 * Insert - put a file at the front of the Lru ring
303 * LruInsert - put a file at the front of the Lru ring and open it
304 * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
305 * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
306 * AllocateVfd - grab a free (or new) file record (from VfdCache)
307 * FreeVfd - free a file record
308 *
309 * The Least Recently Used ring is a doubly linked list that begins and
310 * ends on element zero. Element zero is special -- it doesn't represent
311 * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
312 * anchor that shows us the beginning/end of the ring.
313 * Only VFD elements that are currently really open (have an FD assigned) are
314 * in the Lru ring. Elements that are "virtually" open can be recognized
315 * by having a non-null fileName field.
316 *
317 * example:
318 *
319 * /--less----\ /---------\
320 * v \ v \
321 * #0 --more---> LeastRecentlyUsed --more-\ \
322 * ^\ | |
323 * \\less--> MostRecentlyUsedFile <---/ |
324 * \more---/ \--less--/
325 *
326 *--------------------
327 */
328static void Delete(File file);
329static void LruDelete(File file);
330static void Insert(File file);
331static int LruInsert(File file);
332static bool ReleaseLruFile(void);
333static void ReleaseLruFiles(void);
334static File AllocateVfd(void);
335static void FreeVfd(File file);
336
337static int FileAccess(File file);
339static bool reserveAllocatedDesc(void);
340static int FreeDesc(AllocateDesc *desc);
341
342static void BeforeShmemExit_Files(int code, Datum arg);
343static void CleanupTempFiles(bool isCommit, bool isProcExit);
344static void RemovePgTempRelationFiles(const char *tsdirname);
346
347static void walkdir(const char *path,
348 void (*action) (const char *fname, bool isdir, int elevel),
349 bool process_symlinks,
350 int elevel);
351#ifdef PG_FLUSH_DATA_WORKS
352static void pre_sync_fname(const char *fname, bool isdir, int elevel);
353#endif
354static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
355static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
356
357static int fsync_parent_path(const char *fname, int elevel);
358
359
360/* ResourceOwner callbacks to hold virtual file descriptors */
361static void ResOwnerReleaseFile(Datum res);
362static char *ResOwnerPrintFile(Datum res);
363
365{
366 .name = "File",
367 .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
368 .release_priority = RELEASE_PRIO_FILES,
369 .ReleaseResource = ResOwnerReleaseFile,
370 .DebugPrint = ResOwnerPrintFile
371};
372
373/* Convenience wrappers over ResourceOwnerRemember/Forget */
374static inline void
379static inline void
384
385/*
386 * pg_fsync --- do fsync with or without writethrough
387 */
388int
390{
391#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
392 struct stat st;
393
394 /*
395 * Some operating system implementations of fsync() have requirements
396 * about the file access modes that were used when their file descriptor
397 * argument was opened, and these requirements differ depending on whether
398 * the file descriptor is for a directory.
399 *
400 * For any file descriptor that may eventually be handed to fsync(), we
401 * should have opened it with access modes that are compatible with
402 * fsync() on all supported systems, otherwise the code may not be
403 * portable, even if it runs ok on the current system.
404 *
405 * We assert here that a descriptor for a file was opened with write
406 * permissions (i.e., not O_RDONLY) and for a directory without write
407 * permissions (O_RDONLY). Notice that the assertion check is made even
408 * if fsync() is disabled.
409 *
410 * If fstat() fails, ignore it and let the follow-up fsync() complain.
411 */
412 if (fstat(fd, &st) == 0)
413 {
414 int desc_flags = fcntl(fd, F_GETFL);
415
417
418 if (S_ISDIR(st.st_mode))
420 else
422 }
423 errno = 0;
424#endif
425
426 /* #if is to skip the wal_sync_method test if there's no need for it */
427#if defined(HAVE_FSYNC_WRITETHROUGH)
430 else
431#endif
433}
434
435
436/*
437 * pg_fsync_no_writethrough --- same as fsync except does nothing if
438 * enableFsync is off
439 */
440int
442{
443 int rc;
444
445 if (!enableFsync)
446 return 0;
447
448retry:
449 rc = fsync(fd);
450
451 if (rc == -1 && errno == EINTR)
452 goto retry;
453
454 return rc;
455}
456
457/*
458 * pg_fsync_writethrough
459 */
460int
462{
463 if (enableFsync)
464 {
465#if defined(F_FULLFSYNC)
466 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
467#else
468 errno = ENOSYS;
469 return -1;
470#endif
471 }
472 else
473 return 0;
474}
475
476/*
477 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
478 */
479int
481{
482 int rc;
483
484 if (!enableFsync)
485 return 0;
486
487retry:
488 rc = fdatasync(fd);
489
490 if (rc == -1 && errno == EINTR)
491 goto retry;
492
493 return rc;
494}
495
496/*
497 * pg_file_exists -- check that a file exists.
498 *
499 * This requires an absolute path to the file. Returns true if the file
500 * exists and is not a directory, false otherwise.
501 */
502bool
504{
505 struct stat st;
506
507 Assert(name != NULL);
508
509 if (stat(name, &st) == 0)
510 return !S_ISDIR(st.st_mode);
511 else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES))
514 errmsg("could not access file \"%s\": %m", name)));
515
516 return false;
517}
518
519/*
520 * pg_flush_data --- advise OS that the described dirty data should be flushed
521 *
522 * offset of 0 with nbytes 0 means that the entire file should be flushed
523 */
/*
 * Supplementary notes on pg_flush_data():
 *
 * fd is a kernel file descriptor; offset and nbytes describe the byte range
 * whose dirty data should be written back.  Nothing is returned.  The call
 * is a no-op when enableFsync is false.
 *
 * Up to three strategies are compiled in, depending on platform macros, and
 * are attempted in this order: sync_file_range(), mmap() + msync(MS_ASYNC),
 * and posix_fadvise(POSIX_FADV_DONTNEED).  The sync_file_range() and
 * posix_fadvise() blocks always return once entered; the msync() block falls
 * through to the next compiled-in strategy only when no mapping could be
 * established.
 */
524void
525pg_flush_data(int fd, pgoff_t offset, pgoff_t nbytes)
526{
527 /*
528 * Right now file flushing is primarily used to make later
529 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
530 * if fsyncs are disabled - that's a decision we might want to make
531 * configurable at some point.
532 */
533 if (!enableFsync)
534 return;
535
536 /*
537 * We compile all alternatives that are supported on the current platform,
538 * to find portability problems more easily.
539 */
540#if defined(HAVE_SYNC_FILE_RANGE)
541 {
542 int rc;
543 static bool not_implemented_by_kernel = false;
544
546 return;
547
548retry:
549
550 /*
551 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
552 * tells the OS that writeback for the specified blocks should be
553 * started, but that we don't want to wait for completion. Note that
554 * this call might block if too much dirty data exists in the range.
555 * This is the preferable method on OSs supporting it, as it works
556 * reliably when available (contrast to msync()) and doesn't flush out
557 * clean data (like FADV_DONTNEED).
558 */
559 rc = sync_file_range(fd, offset, nbytes,
561 if (rc != 0)
562 {
563 int elevel;
564
    /*
     * NOTE(review): sync_file_range() is documented to return -1 and set
     * errno on failure, so comparing rc itself against EINTR looks like
     * it can never match and the retry is effectively dead code.  This
     * presumably should test errno == EINTR -- confirm before changing.
     */
565 if (rc == EINTR)
566 goto retry;
567
568 /*
569 * For systems that don't have an implementation of
570 * sync_file_range() such as Windows WSL, generate only one
571 * warning and then suppress all further attempts by this process.
572 */
573 if (errno == ENOSYS)
574 {
575 elevel = WARNING;
577 }
578 else
579 elevel = data_sync_elevel(WARNING);
580
581 ereport(elevel,
583 errmsg("could not flush dirty data: %m")));
584 }
585
586 return;
587 }
588#endif
589#if !defined(WIN32) && defined(MS_ASYNC)
590 {
591 void *p;
592 static int pagesize = 0;
593
594 /*
595 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
596 * writeback. On linux it only does so if MS_SYNC is specified, but
597 * then it does the writeback synchronously. Luckily all common linux
598 * systems have sync_file_range(). This is preferable over
599 * FADV_DONTNEED because it doesn't flush out clean data.
600 *
601 * We map the file (mmap()), tell the kernel to sync back the contents
602 * (msync()), and then remove the mapping again (munmap()).
603 */
604
605 /* mmap() needs actual length if we want to map whole file */
606 if (offset == 0 && nbytes == 0)
607 {
    /* the end-of-file position doubles as the file length here */
608 nbytes = lseek(fd, 0, SEEK_END);
609 if (nbytes < 0)
610 {
613 errmsg("could not determine dirty data size: %m")));
614 return;
615 }
616 }
617
618 /*
619 * Some platforms reject partial-page mmap() attempts. To deal with
620 * that, just truncate the request to a page boundary. If any extra
621 * bytes don't get flushed, well, it's only a hint anyway.
622 */
623
624 /* fetch pagesize only once */
625 if (pagesize == 0)
627
628 /* align length to pagesize, dropping any fractional page */
629 if (pagesize > 0)
630 nbytes = (nbytes / pagesize) * pagesize;
631
632 /* fractional-page request is a no-op */
633 if (nbytes <= 0)
634 return;
635
636 /*
637 * mmap could well fail, particularly on 32-bit platforms where there
638 * may simply not be enough address space. If so, silently fall
639 * through to the next implementation.
640 */
    /* requests larger than SSIZE_MAX are not mapped at all */
641 if (nbytes <= (pgoff_t) SSIZE_MAX)
642 p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
643 else
644 p = MAP_FAILED;
645
646 if (p != MAP_FAILED)
647 {
648 int rc;
649
650 rc = msync(p, (size_t) nbytes, MS_ASYNC);
651 if (rc != 0)
652 {
655 errmsg("could not flush dirty data: %m")));
656 /* NB: need to fall through to munmap()! */
657 }
658
659 rc = munmap(p, (size_t) nbytes);
660 if (rc != 0)
661 {
662 /* FATAL error because mapping would remain */
665 errmsg("could not munmap() while flushing data: %m")));
666 }
667
668 return;
669 }
670 }
671#endif
672#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
673 {
674 int rc;
675
676 /*
677 * Signal the kernel that the passed in range should not be cached
678 * anymore. This has the desired side effect of writing out dirty
679 * data, and the undesired side effect of likely discarding useful
680 * clean cached blocks. For the latter reason this is the least
681 * preferable method.
682 */
683
    /*
     * NOTE(review): posix_fadvise() reports failure through its return
     * value (an error number) rather than through errno, so the %m in
     * the message below presumably does not describe the actual failure
     * unless errno is set from rc first -- confirm.
     */
684 rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
685
686 if (rc != 0)
687 {
688 /* don't error out, this is just a performance optimization */
691 errmsg("could not flush dirty data: %m")));
692 }
693
694 return;
695 }
696#endif
697}
698
699/*
700 * Truncate an open file to a given length.
701 */
702static int
704{
705 int ret;
706
707retry:
708 ret = ftruncate(fd, length);
709
710 if (ret == -1 && errno == EINTR)
711 goto retry;
712
713 return ret;
714}
715
716/*
717 * Truncate a file to a given length by name.
718 */
719int
720pg_truncate(const char *path, pgoff_t length)
721{
722 int ret;
723#ifdef WIN32
724 int save_errno;
725 int fd;
726
728 if (fd >= 0)
729 {
730 ret = pg_ftruncate(fd, length);
734 }
735 else
736 ret = -1;
737#else
738
739retry:
740 ret = truncate(path, length);
741
742 if (ret == -1 && errno == EINTR)
743 goto retry;
744#endif
745
746 return ret;
747}
748
749/*
750 * fsync_fname -- fsync a file or directory, handling errors properly
751 *
752 * Try to fsync a file or directory. When doing the latter, ignore errors that
753 * indicate the OS just doesn't allow/require fsyncing directories.
754 */
755void
756fsync_fname(const char *fname, bool isdir)
757{
759}
760
761/*
762 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
763 *
764 * This routine ensures that, after returning, the effect of renaming file
765 * persists in case of a crash. A crash while this routine is running will
766 * leave you with either the pre-existing or the moved file in place of the
767 * new file; no mixed state or truncated files are possible.
768 *
769 * It does so by using fsync on the old filename and the possibly existing
770 * target filename before the rename, and the target file and directory after.
771 *
772 * Note that rename() cannot be used across arbitrary directories, as they
773 * might not be on the same filesystem. Therefore this routine does not
774 * support renaming across directories.
775 *
776 * Log errors with the caller specified severity.
777 *
778 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
779 * valid upon return.
780 */
781int
782durable_rename(const char *oldfile, const char *newfile, int elevel)
783{
784 int fd;
785
786 /*
787 * First fsync the old and target path (if it exists), to ensure that they
788 * are properly persistent on disk. Syncing the target file is not
789 * strictly necessary, but it makes it easier to reason about crashes;
790 * because it's then guaranteed that either source or target file exists
791 * after a crash.
792 */
793 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
794 return -1;
795
797 if (fd < 0)
798 {
799 if (errno != ENOENT)
800 {
801 ereport(elevel,
803 errmsg("could not open file \"%s\": %m", newfile)));
804 return -1;
805 }
806 }
807 else
808 {
809 if (pg_fsync(fd) != 0)
810 {
811 int save_errno;
812
813 /* close file upon error, might not be in transaction context */
817
818 ereport(elevel,
820 errmsg("could not fsync file \"%s\": %m", newfile)));
821 return -1;
822 }
823
824 if (CloseTransientFile(fd) != 0)
825 {
826 ereport(elevel,
828 errmsg("could not close file \"%s\": %m", newfile)));
829 return -1;
830 }
831 }
832
833 /* Time to do the real deal... */
834 if (rename(oldfile, newfile) < 0)
835 {
836 ereport(elevel,
838 errmsg("could not rename file \"%s\" to \"%s\": %m",
839 oldfile, newfile)));
840 return -1;
841 }
842
843 /*
844 * To guarantee renaming the file is persistent, fsync the file with its
845 * new name, and its containing directory.
846 */
847 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
848 return -1;
849
850 if (fsync_parent_path(newfile, elevel) != 0)
851 return -1;
852
853 return 0;
854}
855
856/*
857 * durable_unlink -- remove a file in a durable manner
858 *
859 * This routine ensures that, after returning, the effect of removing file
860 * persists in case of a crash. A crash while this routine is running will
861 * not leave the system in a mixed state.
862 *
863 * It does so by using fsync on the parent directory of the file after the
864 * actual removal is done.
865 *
866 * Log errors with the severity specified by caller.
867 *
868 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
869 * valid upon return.
870 */
871int
872durable_unlink(const char *fname, int elevel)
873{
874 if (unlink(fname) < 0)
875 {
876 ereport(elevel,
878 errmsg("could not remove file \"%s\": %m",
879 fname)));
880 return -1;
881 }
882
883 /*
884 * To guarantee that the removal of the file is persistent, fsync its
885 * parent directory.
886 */
887 if (fsync_parent_path(fname, elevel) != 0)
888 return -1;
889
890 return 0;
891}
892
893/*
894 * InitFileAccess --- initialize this module during backend startup
895 *
896 * This is called during either normal or standalone backend start.
897 * It is *not* called in the postmaster.
898 *
899 * Note that this does not initialize temporary file access, that is
900 * separately initialized via InitTemporaryFileAccess().
901 */
902void
904{
905 Assert(SizeVfdCache == 0); /* call me only once */
906
907 /* initialize cache header entry */
908 VfdCache = (Vfd *) malloc(sizeof(Vfd));
909 if (VfdCache == NULL)
912 errmsg("out of memory")));
913
914 MemSet(&(VfdCache[0]), 0, sizeof(Vfd));
916
917 SizeVfdCache = 1;
918}
919
920/*
921 * InitTemporaryFileAccess --- initialize temporary file access during startup
922 *
923 * This is called during either normal or standalone backend start.
924 * It is *not* called in the postmaster.
925 *
926 * This is separate from InitFileAccess() because temporary file cleanup can
927 * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
928 * our reporting has to happen before that. Low level file access should be
929 * available for longer, hence the separate initialization / shutdown of
930 * temporary file handling.
931 */
932void
934{
935 Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
936 Assert(!temporary_files_allowed); /* call me only once */
937
938 /*
939 * Register before-shmem-exit hook to ensure temp files are dropped while
940 * we can still report stats.
941 */
943
944#ifdef USE_ASSERT_CHECKING
946#endif
947}
948
949/*
950 * count_usable_fds --- count how many FDs the system will let us open,
951 * and estimate how many are already open.
952 *
953 * We stop counting if usable_fds reaches max_to_probe. Note: a small
954 * value of max_to_probe might result in an underestimate of already_open;
955 * we must fill in any "gaps" in the set of used FDs before the calculation
956 * of already_open will give the right answer. In practice, max_to_probe
957 * of a couple of dozen should be enough to ensure good results.
958 *
959 * We assume stderr (FD 2) is available for dup'ing. While the calling
960 * script could theoretically close that, it would be a really bad idea,
961 * since then one risks loss of error messages from, e.g., libc.
962 */
963static void
965{
966 int *fd;
967 int size;
968 int used = 0;
969 int highestfd = 0;
970 int j;
971
972#ifdef HAVE_GETRLIMIT
973 struct rlimit rlim;
975#endif
976
977 size = 1024;
978 fd = (int *) palloc(size * sizeof(int));
979
980#ifdef HAVE_GETRLIMIT
982 if (getrlimit_status != 0)
983 ereport(WARNING, (errmsg("getrlimit failed: %m")));
984#endif /* HAVE_GETRLIMIT */
985
986 /* dup until failure or probe limit reached */
987 for (;;)
988 {
989 int thisfd;
990
991#ifdef HAVE_GETRLIMIT
992
993 /*
994 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
995 * some platforms
996 */
997 if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
998 break;
999#endif
1000
1001 thisfd = dup(2);
1002 if (thisfd < 0)
1003 {
1004 /* Expect EMFILE or ENFILE, else it's fishy */
1005 if (errno != EMFILE && errno != ENFILE)
1006 elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
1007 break;
1008 }
1009
1010 if (used >= size)
1011 {
1012 size *= 2;
1013 fd = (int *) repalloc(fd, size * sizeof(int));
1014 }
1015 fd[used++] = thisfd;
1016
1017 if (highestfd < thisfd)
1018 highestfd = thisfd;
1019
1020 if (used >= max_to_probe)
1021 break;
1022 }
1023
1024 /* release the files we opened */
1025 for (j = 0; j < used; j++)
1026 close(fd[j]);
1027
1028 pfree(fd);
1029
1030 /*
1031 * Return results. usable_fds is just the number of successful dups. We
1032 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
1033 * number) and so already_open is highestfd+1 - usable_fds.
1034 */
1035 *usable_fds = used;
1036 *already_open = highestfd + 1 - used;
1037}
1038
1039/*
1040 * set_max_safe_fds
1041 * Determine number of file descriptors that fd.c is allowed to use
1042 */
1043void
1045{
1046 int usable_fds;
1047 int already_open;
1048
1049 /*----------
1050 * We want to set max_safe_fds to
1051 * MIN(usable_fds, max_files_per_process)
1052 * less the slop factor for files that are opened without consulting
1053 * fd.c. This ensures that we won't allow opening more than
1054 * max_files_per_process, or the experimentally-determined EMFILE limit,
1055 * additional files.
1056 *----------
1057 */
1060
1062
1063 /*
1064 * Take off the FDs reserved for system() etc.
1065 */
1067
1068 /*
1069 * Make sure we still have enough to get by.
1070 */
1072 ereport(FATAL,
1074 errmsg("insufficient file descriptors available to start server process"),
1075 errdetail("System allows %d, server needs at least %d, %d files are already open.",
1078 already_open)));
1079
1080 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1082}
1083
1084/*
1085 * Open a file with BasicOpenFilePerm() and pass default file mode for the
1086 * fileMode parameter.
1087 */
1088int
1089BasicOpenFile(const char *fileName, int fileFlags)
1090{
1091 return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1092}
1093
1094/*
1095 * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1096 *
1097 * This is exported for use by places that really want a plain kernel FD,
1098 * but need to be proof against running out of FDs. Once an FD has been
1099 * successfully returned, it is the caller's responsibility to ensure that
1100 * it will not be leaked on ereport()! Most users should *not* call this
1101 * routine directly, but instead use the VFD abstraction level, which
1102 * provides protection against descriptor leaks as well as management of
1103 * files that need to be open for more than a short period of time.
1104 *
1105 * Ideally this should be the *only* direct call of open() in the backend.
1106 * In practice, the postmaster calls open() directly, and there are some
1107 * direct open() calls done early in backend startup. Those are OK since
1108 * this module wouldn't have any open files to close at that point anyway.
1109 */
/*
 * fileName, fileFlags and fileMode are passed to open(2).  Returns the new
 * kernel file descriptor (>= 0) on success.  Returns -1 on failure, with
 * errno left describing the failing call.
 */
1110int
1111BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1112{
1113 int fd;
1114
1115tryAgain:
    /*
     * When PG_O_DIRECT_USE_F_NOCACHE is defined, PG_O_DIRECT is masked out
     * of the flags given to open() and is instead applied afterwards with
     * fcntl(F_NOCACHE).
     */
1116#ifdef PG_O_DIRECT_USE_F_NOCACHE
1117 fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1118#else
1119 fd = open(fileName, fileFlags, fileMode);
1120#endif
1121
1122 if (fd >= 0)
1123 {
1124#ifdef PG_O_DIRECT_USE_F_NOCACHE
1125 if (fileFlags & PG_O_DIRECT)
1126 {
1127 if (fcntl(fd, F_NOCACHE, 1) < 0)
1128 {
    /* keep fcntl()'s errno across close(), and don't leak the new FD */
1129 int save_errno = errno;
1130
1131 close(fd);
1132 errno = save_errno;
1133 return -1;
1134 }
1135 }
1136#endif
1137
1138 return fd; /* success! */
1139 }
1140
    /*
     * open() failed.  If the cause was descriptor exhaustion (EMFILE or
     * ENFILE), log it and ask ReleaseLruFile() to free a descriptor; when it
     * reports success, retry the open from the top.
     */
1141 if (errno == EMFILE || errno == ENFILE)
1142 {
1143 int save_errno = errno;
1144
1145 ereport(LOG,
1147 errmsg("out of file descriptors: %m; release and retry")));
1148 errno = 0;
1149 if (ReleaseLruFile())
1150 goto tryAgain;
    /* nothing could be released: give the caller open()'s original errno */
1151 errno = save_errno;
1152 }
1153
1154 return -1; /* failure */
1155}
1156
1157/*
1158 * AcquireExternalFD - attempt to reserve an external file descriptor
1159 *
1160 * This should be used by callers that need to hold a file descriptor open
1161 * over more than a short interval, but cannot use any of the other facilities
1162 * provided by this module.
1163 *
1164 * The difference between this and the underlying ReserveExternalFD function
1165 * is that this will report failure (by setting errno and returning false)
1166 * if "too many" external FDs are already reserved. This should be used in
1167 * any code where the total number of FDs to be reserved is not predictable
1168 * and small.
1169 */
1170bool
1172{
1173 /*
1174 * We don't want more than max_safe_fds / 3 FDs to be consumed for
1175 * "external" FDs.
1176 */
1177 if (numExternalFDs < max_safe_fds / 3)
1178 {
1180 return true;
1181 }
1182 errno = EMFILE;
1183 return false;
1184}
1185
1186/*
1187 * ReserveExternalFD - report external consumption of a file descriptor
1188 *
1189 * This should be used by callers that need to hold a file descriptor open
1190 * over more than a short interval, but cannot use any of the other facilities
1191 * provided by this module. This just tracks the use of the FD and closes
1192 * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1193 *
1194 * Call this directly only in code where failure to reserve the FD would be
1195 * fatal; for example, the WAL-writing code does so, since the alternative is
1196 * session failure. Also, it's very unwise to do so in code that could
1197 * consume more than one FD per process.
1198 *
1199 * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1200 * available, it doesn't matter too much whether this is called before or
1201 * after actually opening the FD; but doing so beforehand reduces the risk of
1202 * an EMFILE failure if not everybody played nice. In any case, it's solely
1203 * caller's responsibility to keep the external-FD count in sync with reality.
1204 */
1205void
1207{
1208 /*
1209 * Release VFDs if needed to stay safe. Because we do this before
1210 * incrementing numExternalFDs, the final state will be as desired, i.e.,
1211 * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1212 */
1214
1216}
1217
1218/*
1219 * ReleaseExternalFD - report release of an external file descriptor
1220 *
1221 * This is guaranteed not to change errno, so it can be used in failure paths.
1222 */
1223void
1225{
1228}
1229
1230
#if defined(FDDEBUG)

/*
 * _dump_lru --- debugging aid: log the LRU ring, most-recent entry first
 *
 * Starts at VfdCache[0].lruLessRecently and follows the lruLessRecently
 * links until index 0 is reached again, emitting every index visited
 * (including the terminating 0) in a single LOG line.  Output that does not
 * fit in the local buffer is truncated by snprintf().
 */
static void
_dump_lru(void)
{
	char		buf[2048];
	int			cur = VfdCache[0].lruLessRecently;
	size_t		used;

	snprintf(buf, sizeof(buf), "LRU: MOST %d ", cur);

	while (cur != 0)
	{
		/* step to the next less-recently-used entry and append its index */
		cur = VfdCache[cur].lruLessRecently;
		used = strlen(buf);
		snprintf(buf + used, sizeof(buf) - used, "%d ", cur);
	}

	used = strlen(buf);
	snprintf(buf + used, sizeof(buf) - used, "LEAST");

	elog(LOG, "%s", buf);
}
#endif							/* FDDEBUG */
1251
1252static void
1254{
1255 Vfd *vfdP;
1256
1257 Assert(file != 0);
1258
1259 DO_DB(elog(LOG, "Delete %d (%s)",
1260 file, VfdCache[file].fileName));
1261 DO_DB(_dump_lru());
1262
1263 vfdP = &VfdCache[file];
1264
1265 VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1266 VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1267
1268 DO_DB(_dump_lru());
1269}
1270
1271static void
1273{
1274 Vfd *vfdP;
1275
1276 Assert(file != 0);
1277
1278 DO_DB(elog(LOG, "LruDelete %d (%s)",
1279 file, VfdCache[file].fileName));
1280
1281 vfdP = &VfdCache[file];
1282
1284
1285 /*
1286 * Close the file. We aren't expecting this to fail; if it does, better
1287 * to leak the FD than to mess up our internal state.
1288 */
1289 if (close(vfdP->fd) != 0)
1291 "could not close file \"%s\": %m", vfdP->fileName);
1292 vfdP->fd = VFD_CLOSED;
1293 --nfile;
1294
1295 /* delete the vfd record from the LRU ring */
1296 Delete(file);
1297}
1298
1299static void
1301{
1302 Vfd *vfdP;
1303
1304 Assert(file != 0);
1305
1306 DO_DB(elog(LOG, "Insert %d (%s)",
1307 file, VfdCache[file].fileName));
1308 DO_DB(_dump_lru());
1309
1310 vfdP = &VfdCache[file];
1311
1312 vfdP->lruMoreRecently = 0;
1313 vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1314 VfdCache[0].lruLessRecently = file;
1315 VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1316
1317 DO_DB(_dump_lru());
1318}
1319
1320/* returns 0 on success, -1 on re-open failure (with errno set) */
1321static int
1323{
1324 Vfd *vfdP;
1325
1326 Assert(file != 0);
1327
1328 DO_DB(elog(LOG, "LruInsert %d (%s)",
1329 file, VfdCache[file].fileName));
1330
1331 vfdP = &VfdCache[file];
1332
1333 if (FileIsNotOpen(file))
1334 {
1335 /* Close excess kernel FDs. */
1337
1338 /*
1339 * The open could still fail for lack of file descriptors, eg due to
1340 * overall system file table being full. So, be prepared to release
1341 * another FD if necessary...
1342 */
1343 vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1344 vfdP->fileMode);
1345 if (vfdP->fd < 0)
1346 {
1347 DO_DB(elog(LOG, "re-open failed: %m"));
1348 return -1;
1349 }
1350 else
1351 {
1352 ++nfile;
1353 }
1354 }
1355
1356 /*
1357 * put it at the head of the Lru ring
1358 */
1359
1360 Insert(file);
1361
1362 return 0;
1363}
1364
1365/*
1366 * Release one kernel FD by closing the least-recently-used VFD.
1367 */
1368static bool
1370{
1371 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1372
1373 if (nfile > 0)
1374 {
1375 /*
1376 * There are opened files and so there should be at least one used vfd
1377 * in the ring.
1378 */
1379 Assert(VfdCache[0].lruMoreRecently != 0);
1380 LruDelete(VfdCache[0].lruMoreRecently);
1381 return true; /* freed a file */
1382 }
1383 return false; /* no files available to free */
1384}
1385
1386/*
1387 * Release kernel FDs as needed to get under the max_safe_fds limit.
1388 * After calling this, it's OK to try to open another file.
1389 */
1390static void
1392{
1394 {
1395 if (!ReleaseLruFile())
1396 break;
1397 }
1398}
1399
1400static File
1402{
1403 Index i;
1404 File file;
1405
1406 DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1407
1408 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1409
1410 if (VfdCache[0].nextFree == 0)
1411 {
1412 /*
1413 * The free list is empty so it is time to increase the size of the
1414 * array. We choose to double it each time this happens. However,
1415 * there's not much point in starting *real* small.
1416 */
1419
1420 if (newCacheSize < 32)
1421 newCacheSize = 32;
1422
1423 /*
1424 * Be careful not to clobber VfdCache ptr if realloc fails.
1425 */
1426 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1427 if (newVfdCache == NULL)
1428 ereport(ERROR,
1430 errmsg("out of memory")));
1432
1433 /*
1434 * Initialize the new entries and link them into the free list.
1435 */
1436 for (i = SizeVfdCache; i < newCacheSize; i++)
1437 {
1438 MemSet(&(VfdCache[i]), 0, sizeof(Vfd));
1439 VfdCache[i].nextFree = i + 1;
1441 }
1444
1445 /*
1446 * Record the new size
1447 */
1449 }
1450
1451 file = VfdCache[0].nextFree;
1452
1454
1455 return file;
1456}
1457
1458static void
1460{
1461 Vfd *vfdP = &VfdCache[file];
1462
1463 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1464 file, vfdP->fileName ? vfdP->fileName : ""));
1465
1466 if (vfdP->fileName != NULL)
1467 {
1468 free(vfdP->fileName);
1469 vfdP->fileName = NULL;
1470 }
1471 vfdP->fdstate = 0x0;
1472
1473 vfdP->nextFree = VfdCache[0].nextFree;
1474 VfdCache[0].nextFree = file;
1475}
1476
1477/* returns 0 on success, -1 on re-open failure (with errno set) */
1478static int
1480{
1481 int returnValue;
1482
1483 DO_DB(elog(LOG, "FileAccess %d (%s)",
1484 file, VfdCache[file].fileName));
1485
1486 /*
1487 * Is the file open? If not, open it and put it at the head of the LRU
1488 * ring (possibly closing the least recently used file to get an FD).
1489 */
1490
1491 if (FileIsNotOpen(file))
1492 {
1493 returnValue = LruInsert(file);
1494 if (returnValue != 0)
1495 return returnValue;
1496 }
1497 else if (VfdCache[0].lruLessRecently != file)
1498 {
1499 /*
1500 * We now know that the file is open and that it is not the last one
1501 * accessed, so we need to move it to the head of the Lru ring.
1502 */
1503
1504 Delete(file);
1505 Insert(file);
1506 }
1507
1508 return 0;
1509}
1510
1511/*
1512 * Called whenever a temporary file is deleted to report its size.
1513 */
1514static void
1515ReportTemporaryFileUsage(const char *path, pgoff_t size)
1516{
1518
1519 if (log_temp_files >= 0)
1520 {
1521 if ((size / 1024) >= log_temp_files)
1522 ereport(LOG,
1523 (errmsg("temporary file: path \"%s\", size %lu",
1524 path, (unsigned long) size)));
1525 }
1526}
1527
1528/*
1529 * Called to register a temporary file for automatic close.
1530 * ResourceOwnerEnlarge(CurrentResourceOwner) must have been called
1531 * before the file was opened.
1532 */
1533static void
1535{
1538
1539 /* Backup mechanism for closing at end of xact. */
1542}
1543
/*
 * Called when we get a shared invalidation message on some relation.
 *
 * Closes the VFD's kernel FD (via LruDelete) if it currently has one; a VFD
 * that is already closed is left untouched.  Compiled only when NOT_USED is
 * defined.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	if (FileIsNotOpen(file))
		return;

	LruDelete(file);
}
#endif
1556
1557/*
1558 * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1559 * fileMode parameter.
1560 */
1561File
1562PathNameOpenFile(const char *fileName, int fileFlags)
1563{
1564 return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1565}
1566
1567/*
1568 * open a file in an arbitrary directory
1569 *
1570 * NB: if the passed pathname is relative (which it usually is),
1571 * it will be interpreted relative to the process' working directory
1572 * (which should always be $PGDATA when this code is running).
1573 */
1574File
1575PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1576{
1577 char *fnamecopy;
1578 File file;
1579 Vfd *vfdP;
1580
1581 DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1582 fileName, fileFlags, fileMode));
1583
1584 /*
1585 * We need a malloc'd copy of the file name; fail cleanly if no room.
1586 */
1587 fnamecopy = strdup(fileName);
1588 if (fnamecopy == NULL)
1589 ereport(ERROR,
1591 errmsg("out of memory")));
1592
1593 file = AllocateVfd();
1594 vfdP = &VfdCache[file];
1595
1596 /* Close excess kernel FDs. */
1598
1599 /*
1600 * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1601 * client shouldn't be expected to know which kernel descriptors are
1602 * currently open, so it wouldn't make sense for them to be inherited by
1603 * executed subprograms.
1604 */
1605 fileFlags |= O_CLOEXEC;
1606
1607 vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1608
1609 if (vfdP->fd < 0)
1610 {
1611 int save_errno = errno;
1612
1613 FreeVfd(file);
1614 free(fnamecopy);
1615 errno = save_errno;
1616 return -1;
1617 }
1618 ++nfile;
1619 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1620 vfdP->fd));
1621
1622 vfdP->fileName = fnamecopy;
1623 /* Saved flags are adjusted to be OK for re-opening file */
1624 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1625 vfdP->fileMode = fileMode;
1626 vfdP->fileSize = 0;
1627 vfdP->fdstate = 0x0;
1628 vfdP->resowner = NULL;
1629
1630 Insert(file);
1631
1632 return file;
1633}
1634
1635/*
1636 * Create directory 'directory'. If necessary, create 'basedir', which must
1637 * be the directory above it. This is designed for creating the top-level
1638 * temporary directory on demand before creating a directory underneath it.
1639 * Do nothing if the directory already exists.
1640 *
1641 * Directories created within the top-level temporary directory should begin
1642 * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1643 * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1644 * that do not need any particular prefix.
1645*/
1646void
1648{
1649 if (MakePGDirectory(directory) < 0)
1650 {
1651 if (errno == EEXIST)
1652 return;
1653
1654 /*
1655 * Failed. Try to create basedir first in case it's missing. Tolerate
1656 * EEXIST to close a race against another process following the same
1657 * algorithm.
1658 */
1659 if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1660 ereport(ERROR,
1662 errmsg("cannot create temporary directory \"%s\": %m",
1663 basedir)));
1664
1665 /* Try again. */
1666 if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1667 ereport(ERROR,
1669 errmsg("cannot create temporary subdirectory \"%s\": %m",
1670 directory)));
1671 }
1672}
1673
1674/*
1675 * Delete a directory and everything in it, if it exists.
1676 */
1677void
1678PathNameDeleteTemporaryDir(const char *dirname)
1679{
1680 struct stat statbuf;
1681
1682 /* Silently ignore missing directory. */
1683 if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1684 return;
1685
1686 /*
1687 * Currently, walkdir doesn't offer a way for our passed in function to
1688 * maintain state. Perhaps it should, so that we could tell the caller
1689 * whether this operation succeeded or failed. Since this operation is
1690 * used in a cleanup path, we wouldn't actually behave differently: we'll
1691 * just log failures.
1692 */
1693 walkdir(dirname, unlink_if_exists_fname, false, LOG);
1694}
1695
1696/*
1697 * Open a temporary file that will disappear when we close it.
1698 *
1699 * This routine takes care of generating an appropriate tempfile name.
1700 * There's no need to pass in fileFlags or fileMode either, since only
1701 * one setting makes any sense for a temp file.
1702 *
1703 * Unless interXact is true, the file is remembered by CurrentResourceOwner
1704 * to ensure it's closed and deleted when it's no longer needed, typically at
1705 * the end-of-transaction. In most cases, you don't want temporary files to
1706 * outlive the transaction that created them, so this should be false -- but
1707 * if you need "somewhat" temporary storage, this might be useful. In either
1708 * case, the file is removed when the File is explicitly closed.
1709 */
1710File
1711OpenTemporaryFile(bool interXact)
1712{
1713 File file = 0;
1714
1715 Assert(temporary_files_allowed); /* check temp file access is up */
1716
1717 /*
1718 * Make sure the current resource owner has space for this File before we
1719 * open it, if we'll be registering it below.
1720 */
1721 if (!interXact)
1723
1724 /*
1725 * If some temp tablespace(s) have been given to us, try to use the next
1726 * one. If a given tablespace can't be found, we silently fall back to
1727 * the database's default tablespace.
1728 *
1729 * BUT: if the temp file is slated to outlive the current transaction,
1730 * force it into the database's default tablespace, so that it will not
1731 * pose a threat to possible tablespace drop attempts.
1732 */
1733 if (numTempTableSpaces > 0 && !interXact)
1734 {
1736
1737 if (OidIsValid(tblspcOid))
1739 }
1740
1741 /*
1742 * If not, or if tablespace is bad, create in database's default
1743 * tablespace. MyDatabaseTableSpace should normally be set before we get
1744 * here, but just in case it isn't, fall back to pg_default tablespace.
1745 */
1746 if (file <= 0)
1750 true);
1751
1752 /* Mark it for deletion at close and temporary file size limit */
1754
1755 /* Register it with the current resource owner */
1756 if (!interXact)
1758
1759 return file;
1760}
1761
1762/*
1763 * Return the path of the temp directory in a given tablespace.
1764 */
1765void
1767{
1768 /*
1769 * Identify the tempfile directory for this tablespace.
1770 *
1771 * If someone tries to specify pg_global, use pg_default instead.
1772 */
1773 if (tablespace == InvalidOid ||
1776 snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1777 else
1778 {
1779 /* All other tablespaces are accessed via symlinks */
1780 snprintf(path, MAXPGPATH, "%s/%u/%s/%s",
1783 }
1784}
1785
1786/*
1787 * Open a temporary file in a specific tablespace.
1788 * Subroutine for OpenTemporaryFile, which see for details.
1789 */
1790static File
1792{
1793 char tempdirpath[MAXPGPATH];
1794 char tempfilepath[MAXPGPATH];
1795 File file;
1796
1798
1799 /*
1800 * Generate a tempfile name that should be unique within the current
1801 * database instance.
1802 */
1803 snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1805
1806 /*
1807 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1808 * temp file that can be reused.
1809 */
1812 if (file <= 0)
1813 {
1814 /*
1815 * We might need to create the tablespace's tempfile directory, if no
1816 * one has yet done so.
1817 *
1818 * Don't check for an error from MakePGDirectory; it could fail if
1819 * someone else just did the same thing. If it doesn't work then
1820 * we'll bomb out on the second create attempt, instead.
1821 */
1823
1826 if (file <= 0 && rejectError)
1827 elog(ERROR, "could not create temporary file \"%s\": %m",
1828 tempfilepath);
1829 }
1830
1831 return file;
1832}
1833
1834
1835/*
1836 * Create a new file. The directory containing it must already exist. Files
1837 * created this way are subject to temp_file_limit and are automatically
1838 * closed at end of transaction, but are not automatically deleted on close
1839 * because they are intended to be shared between cooperating backends.
1840 *
1841 * If the file is inside the top-level temporary directory, its name should
1842 * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1843 * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1844 * inside a directory created with PathNameCreateTemporaryDir(), in which case
1845 * the prefix isn't needed.
1846 */
1847File
1849{
1850 File file;
1851
1852 Assert(temporary_files_allowed); /* check temp file access is up */
1853
1855
1856 /*
1857 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1858 * temp file that can be reused.
1859 */
1860 file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1861 if (file <= 0)
1862 {
1863 if (error_on_failure)
1864 ereport(ERROR,
1866 errmsg("could not create temporary file \"%s\": %m",
1867 path)));
1868 else
1869 return file;
1870 }
1871
1872 /* Mark it for temp_file_limit accounting. */
1874
1875 /* Register it for automatic close. */
1877
1878 return file;
1879}
1880
1881/*
1882 * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1883 * another backend. Files opened this way don't count against the
1884 * temp_file_limit of the caller, are automatically closed at the end of the
1885 * transaction but are not deleted on close.
1886 */
1887File
1888PathNameOpenTemporaryFile(const char *path, int mode)
1889{
1890 File file;
1891
1892 Assert(temporary_files_allowed); /* check temp file access is up */
1893
1895
1896 file = PathNameOpenFile(path, mode | PG_BINARY);
1897
1898 /* If no such file, then we don't raise an error. */
1899 if (file <= 0 && errno != ENOENT)
1900 ereport(ERROR,
1902 errmsg("could not open temporary file \"%s\": %m",
1903 path)));
1904
1905 if (file > 0)
1906 {
1907 /* Register it for automatic close. */
1909 }
1910
1911 return file;
1912}
1913
1914/*
1915 * Delete a file by pathname. Return true if the file existed, false if
1916 * didn't.
1917 */
1918bool
1920{
1921 struct stat filestats;
1922 int stat_errno;
1923
1924 /* Get the final size for pgstat reporting. */
1925 if (stat(path, &filestats) != 0)
1926 stat_errno = errno;
1927 else
1928 stat_errno = 0;
1929
1930 /*
1931 * Unlike FileClose's automatic file deletion code, we tolerate
1932 * non-existence to support BufFileDeleteFileSet which doesn't know how
1933 * many segments it has to delete until it runs out.
1934 */
1935 if (stat_errno == ENOENT)
1936 return false;
1937
1938 if (unlink(path) < 0)
1939 {
1940 if (errno != ENOENT)
1943 errmsg("could not unlink temporary file \"%s\": %m",
1944 path)));
1945 return false;
1946 }
1947
1948 if (stat_errno == 0)
1949 ReportTemporaryFileUsage(path, filestats.st_size);
1950 else
1951 {
1952 errno = stat_errno;
1953 ereport(LOG,
1955 errmsg("could not stat file \"%s\": %m", path)));
1956 }
1957
1958 return true;
1959}
1960
1961/*
1962 * close a file when done with it
1963 */
1964void
1966{
1967 Vfd *vfdP;
1968
1969 Assert(FileIsValid(file));
1970
1971 DO_DB(elog(LOG, "FileClose: %d (%s)",
1972 file, VfdCache[file].fileName));
1973
1974 vfdP = &VfdCache[file];
1975
1976 if (!FileIsNotOpen(file))
1977 {
1979
1980 /* close the file */
1981 if (close(vfdP->fd) != 0)
1982 {
1983 /*
1984 * We may need to panic on failure to close non-temporary files;
1985 * see LruDelete.
1986 */
1988 "could not close file \"%s\": %m", vfdP->fileName);
1989 }
1990
1991 --nfile;
1992 vfdP->fd = VFD_CLOSED;
1993
1994 /* remove the file from the lru ring */
1995 Delete(file);
1996 }
1997
1998 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1999 {
2000 /* Subtract its size from current usage (do first in case of error) */
2001 temporary_files_size -= vfdP->fileSize;
2002 vfdP->fileSize = 0;
2003 }
2004
2005 /*
2006 * Delete the file if it was temporary, and make a log entry if wanted
2007 */
2008 if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
2009 {
2010 struct stat filestats;
2011 int stat_errno;
2012
2013 /*
2014 * If we get an error, as could happen within the ereport/elog calls,
2015 * we'll come right back here during transaction abort. Reset the
2016 * flag to ensure that we can't get into an infinite loop. This code
2017 * is arranged to ensure that the worst-case consequence is failing to
2018 * emit log message(s), not failing to attempt the unlink.
2019 */
2020 vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
2021
2022
2023 /* first try the stat() */
2024 if (stat(vfdP->fileName, &filestats))
2025 stat_errno = errno;
2026 else
2027 stat_errno = 0;
2028
2029 /* in any case do the unlink */
2030 if (unlink(vfdP->fileName))
2031 ereport(LOG,
2033 errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
2034
2035 /* and last report the stat results */
2036 if (stat_errno == 0)
2037 ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2038 else
2039 {
2040 errno = stat_errno;
2041 ereport(LOG,
2043 errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2044 }
2045 }
2046
2047 /* Unregister it from the resource owner */
2048 if (vfdP->resowner)
2049 ResourceOwnerForgetFile(vfdP->resowner, file);
2050
2051 /*
2052 * Return the Vfd slot to the free list
2053 */
2054 FreeVfd(file);
2055}
2056
2057/*
2058 * FilePrefetch - initiate asynchronous read of a given range of the file.
2059 *
2060 * Returns 0 on success, otherwise an errno error code (like posix_fadvise()).
2061 *
2062 * posix_fadvise() is the simplest standardized interface that accomplishes
2063 * this.
2064 */
2065int
2066FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2067{
2068 Assert(FileIsValid(file));
2069
2070 DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2071 file, VfdCache[file].fileName,
2072 (int64) offset, (int64) amount));
2073
2074#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2075 {
2076 int returnCode;
2077
2078 returnCode = FileAccess(file);
2079 if (returnCode < 0)
2080 return returnCode;
2081
2082retry:
2083 pgstat_report_wait_start(wait_event_info);
2084 returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2087
2088 if (returnCode == EINTR)
2089 goto retry;
2090
2091 return returnCode;
2092 }
2093#elif defined(__darwin__)
2094 {
2095 struct radvisory
2096 {
2097 off_t ra_offset; /* offset into the file */
2098 int ra_count; /* size of the read */
2099 } ra;
2100 int returnCode;
2101
2102 returnCode = FileAccess(file);
2103 if (returnCode < 0)
2104 return returnCode;
2105
2106 ra.ra_offset = offset;
2107 ra.ra_count = amount;
2108 pgstat_report_wait_start(wait_event_info);
2111 if (returnCode != -1)
2112 return 0;
2113 else
2114 return errno;
2115 }
2116#else
2117 return 0;
2118#endif
2119}
2120
2121void
2122FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
2123{
2124 int returnCode;
2125
2126 Assert(FileIsValid(file));
2127
2128 DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2129 file, VfdCache[file].fileName,
2130 (int64) offset, (int64) nbytes));
2131
2132 if (nbytes <= 0)
2133 return;
2134
2135 if (VfdCache[file].fileFlags & PG_O_DIRECT)
2136 return;
2137
2138 returnCode = FileAccess(file);
2139 if (returnCode < 0)
2140 return;
2141
2142 pgstat_report_wait_start(wait_event_info);
2143 pg_flush_data(VfdCache[file].fd, offset, nbytes);
2145}
2146
2147ssize_t
2148FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset,
2149 uint32 wait_event_info)
2150{
2152 Vfd *vfdP;
2153
2154 Assert(FileIsValid(file));
2155
2156 DO_DB(elog(LOG, "FileReadV: %d (%s) " INT64_FORMAT " %d",
2157 file, VfdCache[file].fileName,
2158 (int64) offset,
2159 iovcnt));
2160
2161 returnCode = FileAccess(file);
2162 if (returnCode < 0)
2163 return returnCode;
2164
2165 vfdP = &VfdCache[file];
2166
2167retry:
2168 pgstat_report_wait_start(wait_event_info);
2169 returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
2171
2172 if (returnCode < 0)
2173 {
2174 /*
2175 * Windows may run out of kernel buffers and return "Insufficient
2176 * system resources" error. Wait a bit and retry to solve it.
2177 *
2178 * It is rumored that EINTR is also possible on some Unix filesystems,
2179 * in which case immediate retry is indicated.
2180 */
2181#ifdef WIN32
2183
2184 switch (error)
2185 {
2187 pg_usleep(1000L);
2188 errno = EINTR;
2189 break;
2190 default:
2192 break;
2193 }
2194#endif
2195 /* OK to retry if interrupted */
2196 if (errno == EINTR)
2197 goto retry;
2198 }
2199
2200 return returnCode;
2201}
2202
2203int
2205 int iovcnt, pgoff_t offset,
2206 uint32 wait_event_info)
2207{
2208 int returnCode;
2209 Vfd *vfdP;
2210
2211 Assert(FileIsValid(file));
2212
2213 DO_DB(elog(LOG, "FileStartReadV: %d (%s) " INT64_FORMAT " %d",
2214 file, VfdCache[file].fileName,
2215 (int64) offset,
2216 iovcnt));
2217
2218 returnCode = FileAccess(file);
2219 if (returnCode < 0)
2220 return returnCode;
2221
2222 vfdP = &VfdCache[file];
2223
2224 pgaio_io_start_readv(ioh, vfdP->fd, iovcnt, offset);
2225
2226 return 0;
2227}
2228
2229ssize_t
2230FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset,
2231 uint32 wait_event_info)
2232{
2234 Vfd *vfdP;
2235
2236 Assert(FileIsValid(file));
2237
2238 DO_DB(elog(LOG, "FileWriteV: %d (%s) " INT64_FORMAT " %d",
2239 file, VfdCache[file].fileName,
2240 (int64) offset,
2241 iovcnt));
2242
2243 returnCode = FileAccess(file);
2244 if (returnCode < 0)
2245 return returnCode;
2246
2247 vfdP = &VfdCache[file];
2248
2249 /*
2250 * If enforcing temp_file_limit and it's a temp file, check to see if the
2251 * write would overrun temp_file_limit, and throw error if so. Note: it's
2252 * really a modularity violation to throw error here; we should set errno
2253 * and return -1. However, there's no way to report a suitable error
2254 * message if we do that. All current callers would just throw error
2255 * immediately anyway, so this is safe at present.
2256 */
2257 if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2258 {
2259 pgoff_t past_write = offset;
2260
2261 for (int i = 0; i < iovcnt; ++i)
2262 past_write += iov[i].iov_len;
2263
2264 if (past_write > vfdP->fileSize)
2265 {
2267
2269 if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2270 ereport(ERROR,
2272 errmsg("temporary file size exceeds \"temp_file_limit\" (%dkB)",
2273 temp_file_limit)));
2274 }
2275 }
2276
2277retry:
2278 pgstat_report_wait_start(wait_event_info);
2279 returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
2281
2282 if (returnCode >= 0)
2283 {
2284 /*
2285 * Some callers expect short writes to set errno, and traditionally we
2286 * have assumed that they imply disk space shortage. We don't want to
2287 * waste CPU cycles adding up the total size here, so we'll just set
2288 * it for all successful writes in case such a caller determines that
2289 * the write was short and ereports "%m".
2290 */
2291 errno = ENOSPC;
2292
2293 /*
2294 * Maintain fileSize and temporary_files_size if it's a temp file.
2295 */
2296 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2297 {
2298 pgoff_t past_write = offset + returnCode;
2299
2300 if (past_write > vfdP->fileSize)
2301 {
2302 temporary_files_size += past_write - vfdP->fileSize;
2303 vfdP->fileSize = past_write;
2304 }
2305 }
2306 }
2307 else
2308 {
2309 /*
2310 * See comments in FileReadV()
2311 */
2312#ifdef WIN32
2314
2315 switch (error)
2316 {
2318 pg_usleep(1000L);
2319 errno = EINTR;
2320 break;
2321 default:
2323 break;
2324 }
2325#endif
2326 /* OK to retry if interrupted */
2327 if (errno == EINTR)
2328 goto retry;
2329 }
2330
2331 return returnCode;
2332}
2333
2334int
2335FileSync(File file, uint32 wait_event_info)
2336{
2337 int returnCode;
2338
2339 Assert(FileIsValid(file));
2340
2341 DO_DB(elog(LOG, "FileSync: %d (%s)",
2342 file, VfdCache[file].fileName));
2343
2344 returnCode = FileAccess(file);
2345 if (returnCode < 0)
2346 return returnCode;
2347
2348 pgstat_report_wait_start(wait_event_info);
2349 returnCode = pg_fsync(VfdCache[file].fd);
2351
2352 return returnCode;
2353}
2354
2355/*
2356 * Zero a region of the file.
2357 *
2358 * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2359 * appropriate error.
2360 */
2361int
2362FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2363{
2364 int returnCode;
2366
2367 Assert(FileIsValid(file));
2368
2369 DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2370 file, VfdCache[file].fileName,
2371 (int64) offset, (int64) amount));
2372
2373 returnCode = FileAccess(file);
2374 if (returnCode < 0)
2375 return returnCode;
2376
2377 pgstat_report_wait_start(wait_event_info);
2378 written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2380
2381 if (written < 0)
2382 return -1;
2383 else if (written != amount)
2384 {
2385 /* if errno is unset, assume problem is no disk space */
2386 if (errno == 0)
2387 errno = ENOSPC;
2388 return -1;
2389 }
2390
2391 return 0;
2392}
2393
2394/*
2395 * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
2396 * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
2397 * use FileZero() instead.
2398 *
2399 * Note that at least glibc() implements posix_fallocate() in userspace if not
2400 * implemented by the filesystem. That's not the case for all environments
2401 * though.
2402 *
2403 * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2404 * appropriate error.
2405 */
2406int
2407FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2408{
2409#ifdef HAVE_POSIX_FALLOCATE
2410 int returnCode;
2411
2412 Assert(FileIsValid(file));
2413
2414 DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2415 file, VfdCache[file].fileName,
2416 (int64) offset, (int64) amount));
2417
2418 returnCode = FileAccess(file);
2419 if (returnCode < 0)
2420 return -1;
2421
2422retry:
2423 pgstat_report_wait_start(wait_event_info);
2424 returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2426
2427 if (returnCode == 0)
2428 return 0;
2429 else if (returnCode == EINTR)
2430 goto retry;
2431
2432 /* for compatibility with %m printing etc */
2433 errno = returnCode;
2434
2435 /*
2436 * Return in cases of a "real" failure, if fallocate is not supported,
2437 * fall through to the FileZero() backed implementation.
2438 */
2440 return -1;
2441#endif
2442
2443 return FileZero(file, offset, amount, wait_event_info);
2444}
2445
2446pgoff_t
2448{
2449 Assert(FileIsValid(file));
2450
2451 DO_DB(elog(LOG, "FileSize %d (%s)",
2452 file, VfdCache[file].fileName));
2453
2454 if (FileIsNotOpen(file))
2455 {
2456 if (FileAccess(file) < 0)
2457 return (pgoff_t) -1;
2458 }
2459
2460 return lseek(VfdCache[file].fd, 0, SEEK_END);
2461}
2462
2463int
2464FileTruncate(File file, pgoff_t offset, uint32 wait_event_info)
2465{
2466 int returnCode;
2467
2468 Assert(FileIsValid(file));
2469
2470 DO_DB(elog(LOG, "FileTruncate %d (%s)",
2471 file, VfdCache[file].fileName));
2472
2473 returnCode = FileAccess(file);
2474 if (returnCode < 0)
2475 return returnCode;
2476
2477 pgstat_report_wait_start(wait_event_info);
2478 returnCode = pg_ftruncate(VfdCache[file].fd, offset);
2480
2481 if (returnCode == 0 && VfdCache[file].fileSize > offset)
2482 {
2483 /* adjust our state for truncation of a temp file */
2484 Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2485 temporary_files_size -= VfdCache[file].fileSize - offset;
2486 VfdCache[file].fileSize = offset;
2487 }
2488
2489 return returnCode;
2490}
2491
2492/*
2493 * Return the pathname associated with an open file.
2494 *
2495 * The returned string points to an internal buffer, which is valid until
2496 * the file is closed.
2497 */
2498char *
2500{
2501 Assert(FileIsValid(file));
2502
2503 return VfdCache[file].fileName;
2504}
2505
2506/*
2507 * Return the raw file descriptor of an opened file.
2508 *
2509 * The returned file descriptor will be valid until the file is closed, but
2510 * there are a lot of things that can make that happen. So the caller should
2511 * be careful not to do much of anything else before it finishes using the
2512 * returned file descriptor.
2513 */
2514int
2516{
2517 int returnCode;
2518
2519 returnCode = FileAccess(file);
2520 if (returnCode < 0)
2521 return returnCode;
2522
2523 Assert(FileIsValid(file));
2524 return VfdCache[file].fd;
2525}
2526
2527/*
2528 * FileGetRawFlags - returns the file flags on open(2)
2529 */
2530int
2532{
2533 Assert(FileIsValid(file));
2534 return VfdCache[file].fileFlags;
2535}
2536
2537/*
2538 * FileGetRawMode - returns the mode bitmask passed to open(2)
2539 */
2540mode_t
2542{
2543 Assert(FileIsValid(file));
2544 return VfdCache[file].fileMode;
2545}
2546
2547/*
2548 * Make room for another allocatedDescs[] array entry if needed and possible.
2549 * Returns true if an array element is available.
2550 */
2551static bool
2553{
2555 int newMax;
2556
2557 /* Quick out if array already has a free slot. */
2559 return true;
2560
2561 /*
2562 * If the array hasn't yet been created in the current process, initialize
2563 * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2564 * we will ever need, anyway. We don't want to look at max_safe_fds
2565 * immediately because set_max_safe_fds() may not have run yet.
2566 */
2567 if (allocatedDescs == NULL)
2568 {
2569 newMax = FD_MINFREE / 3;
2571 /* Out of memory already? Treat as fatal error. */
2572 if (newDescs == NULL)
2573 ereport(ERROR,
2575 errmsg("out of memory")));
2578 return true;
2579 }
2580
2581 /*
2582 * Consider enlarging the array beyond the initial allocation used above.
2583 * By the time this happens, max_safe_fds should be known accurately.
2584 *
2585 * We mustn't let allocated descriptors hog all the available FDs, and in
2586 * practice we'd better leave a reasonable number of FDs for VFD use. So
2587 * set the maximum to max_safe_fds / 3. (This should certainly be at
2588 * least as large as the initial size, FD_MINFREE / 3, so we aren't
2589 * tightening the restriction here.) Recall that "external" FDs are
2590 * allowed to consume another third of max_safe_fds.
2591 */
2592 newMax = max_safe_fds / 3;
2594 {
2596 newMax * sizeof(AllocateDesc));
2597 /* Treat out-of-memory as a non-fatal error. */
2598 if (newDescs == NULL)
2599 return false;
2602 return true;
2603 }
2604
2605 /* Can't enlarge allocatedDescs[] any more. */
2606 return false;
2607}
2608
2609/*
2610 * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2611 * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2612 * necessary to open the file. When done, call FreeFile rather than fclose.
2613 *
2614 * Note that files that will be open for any significant length of time
2615 * should NOT be handled this way, since they cannot share kernel file
2616 * descriptors with other files; there is grave risk of running out of FDs
2617 * if anyone locks down too many FDs. Most callers of this routine are
2618 * simply reading a config file that they will read and close immediately.
2619 *
2620 * fd.c will automatically close all files opened with AllocateFile at
2621 * transaction commit or abort; this prevents FD leakage if a routine
2622 * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2623 *
2624 * Ideally this should be the *only* direct call of fopen() in the backend.
2625 */
2626FILE *
2627AllocateFile(const char *name, const char *mode)
2628{
2629 FILE *file;
2630
2631 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2633
2634 /* Can we allocate another non-virtual FD? */
2635 if (!reserveAllocatedDesc())
2636 ereport(ERROR,
2638 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2640
2641 /* Close excess kernel FDs. */
2643
2644TryAgain:
2645 if ((file = fopen(name, mode)) != NULL)
2646 {
2648
2649 desc->kind = AllocateDescFile;
2650 desc->desc.file = file;
2653 return desc->desc.file;
2654 }
2655
2656 if (errno == EMFILE || errno == ENFILE)
2657 {
2658 int save_errno = errno;
2659
2660 ereport(LOG,
2662 errmsg("out of file descriptors: %m; release and retry")));
2663 errno = 0;
2664 if (ReleaseLruFile())
2665 goto TryAgain;
2666 errno = save_errno;
2667 }
2668
2669 return NULL;
2670}
2671
2672/*
2673 * Open a file with OpenTransientFilePerm() and pass default file mode for
2674 * the fileMode parameter.
2675 */
2676int
2677OpenTransientFile(const char *fileName, int fileFlags)
2678{
2679 return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2680}
2681
2682/*
2683 * Like AllocateFile, but returns an unbuffered fd like open(2)
2684 */
2685int
2686OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2687{
2688 int fd;
2689
2690 DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2691 numAllocatedDescs, fileName));
2692
2693 /* Can we allocate another non-virtual FD? */
2694 if (!reserveAllocatedDesc())
2695 ereport(ERROR,
2697 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2698 maxAllocatedDescs, fileName)));
2699
2700 /* Close excess kernel FDs. */
2702
2703 fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2704
2705 if (fd >= 0)
2706 {
2708
2709 desc->kind = AllocateDescRawFD;
2710 desc->desc.fd = fd;
2713
2714 return fd;
2715 }
2716
2717 return -1; /* failure */
2718}
2719
2720/*
2721 * Routines that want to initiate a pipe stream should use OpenPipeStream
2722 * rather than plain popen(). This lets fd.c deal with freeing FDs if
2723 * necessary. When done, call ClosePipeStream rather than pclose.
2724 *
2725 * This function also ensures that the popen'd program is run with default
2726 * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2727 * uses. This ensures desirable response to, eg, closing a read pipe early.
2728 */
2729FILE *
2730OpenPipeStream(const char *command, const char *mode)
2731{
2732 FILE *file;
2733 int save_errno;
2734
2735 DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2736 numAllocatedDescs, command));
2737
2738 /* Can we allocate another non-virtual FD? */
2739 if (!reserveAllocatedDesc())
2740 ereport(ERROR,
2742 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2743 maxAllocatedDescs, command)));
2744
2745 /* Close excess kernel FDs. */
2747
2748TryAgain:
2749 fflush(NULL);
2751 errno = 0;
2752 file = popen(command, mode);
2753 save_errno = errno;
2755 errno = save_errno;
2756 if (file != NULL)
2757 {
2759
2760 desc->kind = AllocateDescPipe;
2761 desc->desc.file = file;
2764 return desc->desc.file;
2765 }
2766
2767 if (errno == EMFILE || errno == ENFILE)
2768 {
2769 ereport(LOG,
2771 errmsg("out of file descriptors: %m; release and retry")));
2772 if (ReleaseLruFile())
2773 goto TryAgain;
2774 errno = save_errno;
2775 }
2776
2777 return NULL;
2778}
2779
2780/*
2781 * Free an AllocateDesc of any type.
2782 *
2783 * The argument *must* point into the allocatedDescs[] array.
2784 */
2785static int
2787{
2788 int result;
2789
2790 /* Close the underlying object */
2791 switch (desc->kind)
2792 {
2793 case AllocateDescFile:
2794 result = fclose(desc->desc.file);
2795 break;
2796 case AllocateDescPipe:
2797 result = pclose(desc->desc.file);
2798 break;
2799 case AllocateDescDir:
2800 result = closedir(desc->desc.dir);
2801 break;
2802 case AllocateDescRawFD:
2803 pgaio_closing_fd(desc->desc.fd);
2804 result = close(desc->desc.fd);
2805 break;
2806 default:
2807 elog(ERROR, "AllocateDesc kind not recognized");
2808 result = 0; /* keep compiler quiet */
2809 break;
2810 }
2811
2812 /* Compact storage in the allocatedDescs array */
2815
2816 return result;
2817}
2818
2819/*
2820 * Close a file returned by AllocateFile.
2821 *
2822 * Note we do not check fclose's return value --- it is up to the caller
2823 * to handle close errors.
2824 */
2825int
2827{
2828 int i;
2829
2830 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2831
2832 /* Remove file from list of allocated files, if it's present */
2833 for (i = numAllocatedDescs; --i >= 0;)
2834 {
2835 AllocateDesc *desc = &allocatedDescs[i];
2836
2837 if (desc->kind == AllocateDescFile && desc->desc.file == file)
2838 return FreeDesc(desc);
2839 }
2840
2841 /* Only get here if someone passes us a file not in allocatedDescs */
2842 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2843
2844 return fclose(file);
2845}
2846
2847/*
2848 * Close a file returned by OpenTransientFile.
2849 *
2850 * Note we do not check close's return value --- it is up to the caller
2851 * to handle close errors.
2852 */
2853int
2855{
2856 int i;
2857
2858 DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2859
2860 /* Remove fd from list of allocated files, if it's present */
2861 for (i = numAllocatedDescs; --i >= 0;)
2862 {
2863 AllocateDesc *desc = &allocatedDescs[i];
2864
2865 if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2866 return FreeDesc(desc);
2867 }
2868
2869 /* Only get here if someone passes us a file not in allocatedDescs */
2870 elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2871
2873
2874 return close(fd);
2875}
2876
2877/*
2878 * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2879 * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2880 * necessary to open the directory, and with closing it after an elog.
2881 * When done, call FreeDir rather than closedir.
2882 *
2883 * Returns NULL, with errno set, on failure. Note that failure detection
2884 * is commonly left to the following call of ReadDir or ReadDirExtended;
2885 * see the comments for ReadDir.
2886 *
2887 * Ideally this should be the *only* direct call of opendir() in the backend.
2888 */
2889DIR *
2890AllocateDir(const char *dirname)
2891{
2892 DIR *dir;
2893
2894 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2895 numAllocatedDescs, dirname));
2896
2897 /* Can we allocate another non-virtual FD? */
2898 if (!reserveAllocatedDesc())
2899 ereport(ERROR,
2901 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2902 maxAllocatedDescs, dirname)));
2903
2904 /* Close excess kernel FDs. */
2906
2907TryAgain:
2908 if ((dir = opendir(dirname)) != NULL)
2909 {
2911
2912 desc->kind = AllocateDescDir;
2913 desc->desc.dir = dir;
2916 return desc->desc.dir;
2917 }
2918
2919 if (errno == EMFILE || errno == ENFILE)
2920 {
2921 int save_errno = errno;
2922
2923 ereport(LOG,
2925 errmsg("out of file descriptors: %m; release and retry")));
2926 errno = 0;
2927 if (ReleaseLruFile())
2928 goto TryAgain;
2929 errno = save_errno;
2930 }
2931
2932 return NULL;
2933}
2934
2935/*
2936 * Read a directory opened with AllocateDir, ereport'ing any error.
2937 *
2938 * This is easier to use than raw readdir() since it takes care of some
2939 * otherwise rather tedious and error-prone manipulation of errno. Also,
2940 * if you are happy with a generic error message for AllocateDir failure,
2941 * you can just do
2942 *
2943 * dir = AllocateDir(path);
2944 * while ((dirent = ReadDir(dir, path)) != NULL)
2945 * process dirent;
2946 * FreeDir(dir);
2947 *
2948 * since a NULL dir parameter is taken as indicating AllocateDir failed.
2949 * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2950 * use this shortcut.)
2951 *
2952 * The pathname passed to AllocateDir must be passed to this routine too,
2953 * but it is only used for error reporting.
2954 */
2955struct dirent *
2956ReadDir(DIR *dir, const char *dirname)
2957{
2958 return ReadDirExtended(dir, dirname, ERROR);
2959}
2960
2961/*
2962 * Alternate version of ReadDir that allows caller to specify the elevel
2963 * for any error report (whether it's reporting an initial failure of
2964 * AllocateDir or a subsequent directory read failure).
2965 *
2966 * If elevel < ERROR, returns NULL after any error. With the normal coding
2967 * pattern, this will result in falling out of the loop immediately as
2968 * though the directory contained no (more) entries.
2969 */
2970struct dirent *
2971ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2972{
2973 struct dirent *dent;
2974
2975 /* Give a generic message for AllocateDir failure, if caller didn't */
2976 if (dir == NULL)
2977 {
2978 ereport(elevel,
2980 errmsg("could not open directory \"%s\": %m",
2981 dirname)));
2982 return NULL;
2983 }
2984
2985 errno = 0;
2986 if ((dent = readdir(dir)) != NULL)
2987 return dent;
2988
2989 if (errno)
2990 ereport(elevel,
2992 errmsg("could not read directory \"%s\": %m",
2993 dirname)));
2994 return NULL;
2995}
2996
2997/*
2998 * Close a directory opened with AllocateDir.
2999 *
3000 * Returns closedir's return value (with errno set if it's not 0).
3001 * Note we do not check the return value --- it is up to the caller
3002 * to handle close errors if wanted.
3003 *
3004 * Does nothing if dir == NULL; we assume that directory open failure was
3005 * already reported if desired.
3006 */
3007int
3009{
3010 int i;
3011
3012 /* Nothing to do if AllocateDir failed */
3013 if (dir == NULL)
3014 return 0;
3015
3016 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
3017
3018 /* Remove dir from list of allocated dirs, if it's present */
3019 for (i = numAllocatedDescs; --i >= 0;)
3020 {
3021 AllocateDesc *desc = &allocatedDescs[i];
3022
3023 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
3024 return FreeDesc(desc);
3025 }
3026
3027 /* Only get here if someone passes us a dir not in allocatedDescs */
3028 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
3029
3030 return closedir(dir);
3031}
3032
3033
3034/*
3035 * Close a pipe stream returned by OpenPipeStream.
3036 */
3037int
3039{
3040 int i;
3041
3042 DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
3043
3044 /* Remove file from list of allocated files, if it's present */
3045 for (i = numAllocatedDescs; --i >= 0;)
3046 {
3047 AllocateDesc *desc = &allocatedDescs[i];
3048
3049 if (desc->kind == AllocateDescPipe && desc->desc.file == file)
3050 return FreeDesc(desc);
3051 }
3052
3053 /* Only get here if someone passes us a file not in allocatedDescs */
3054 elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
3055
3056 return pclose(file);
3057}
3058
3059/*
3060 * closeAllVfds
3061 *
3062 * Force all VFDs into the physically-closed state, so that the fewest
3063 * possible number of kernel file descriptors are in use. There is no
3064 * change in the logical state of the VFDs.
3065 */
3066void
3068{
3069 Index i;
3070
3071 if (SizeVfdCache > 0)
3072 {
3073 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3074 for (i = 1; i < SizeVfdCache; i++)
3075 {
3076 if (!FileIsNotOpen(i))
3077 LruDelete(i);
3078 }
3079 }
3080}
3081
3082
3083/*
3084 * SetTempTablespaces
3085 *
3086 * Define a list (actually an array) of OIDs of tablespaces to use for
3087 * temporary files. This list will be used until end of transaction,
3088 * unless this function is called again before then. It is caller's
3089 * responsibility that the passed-in array has adequate lifespan (typically
3090 * it'd be allocated in TopTransactionContext).
3091 *
3092 * Some entries of the array may be InvalidOid, indicating that the current
3093 * database's default tablespace should be used.
3094 */
3095void
3097{
3098 Assert(numSpaces >= 0);
3101
3102 /*
3103 * Select a random starting point in the list. This is to minimize
3104 * conflicts between backends that are most likely sharing the same list
3105 * of temp tablespaces. Note that if we create multiple temp files in the
3106 * same transaction, we'll advance circularly through the list --- this
3107 * ensures that large temporary sort files are nicely spread across all
3108 * available tablespaces.
3109 */
3110 if (numSpaces > 1)
3112 0, numSpaces - 1);
3113 else
3115}
3116
3117/*
3118 * TempTablespacesAreSet
3119 *
3120 * Returns true if SetTempTablespaces has been called in current transaction.
3121 * (This is just so that tablespaces.c doesn't need its own per-transaction
3122 * state.)
3123 */
3124bool
3126{
3127 return (numTempTableSpaces >= 0);
3128}
3129
3130/*
3131 * GetTempTablespaces
3132 *
3133 * Populate an array with the OIDs of the tablespaces that should be used for
3134 * temporary files. (Some entries may be InvalidOid, indicating that the
3135 * current database's default tablespace should be used.) At most numSpaces
3136 * entries will be filled.
3137 * Returns the number of OIDs that were copied into the output array.
3138 */
3139int
3141{
3142 int i;
3143
3145 for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
3147
3148 return i;
3149}
3150
3151/*
3152 * GetNextTempTableSpace
3153 *
3154 * Select the next temp tablespace to use. A result of InvalidOid means
3155 * to use the current database's default tablespace.
3156 */
3157Oid
3159{
3160 if (numTempTableSpaces > 0)
3161 {
3162 /* Advance nextTempTableSpace counter with wraparound */
3166 }
3167 return InvalidOid;
3168}
3169
3170
3171/*
3172 * AtEOSubXact_Files
3173 *
3174 * Take care of subtransaction commit/abort. At abort, we close AllocateDescs
3175 * that the subtransaction may have opened. At commit, we reassign them to
3176 * the parent subtransaction. (Temporary files are tracked by ResourceOwners
3177 * instead.)
3178 */
3179void
3182{
3183 Index i;
3184
3185 for (i = 0; i < numAllocatedDescs; i++)
3186 {
3187 if (allocatedDescs[i].create_subid == mySubid)
3188 {
3189 if (isCommit)
3191 else
3192 {
3193 /* have to recheck the item after FreeDesc (ugly) */
3195 }
3196 }
3197 }
3198}
3199
3200/*
3201 * AtEOXact_Files
3202 *
3203 * This routine is called during transaction commit or abort. All still-open
3204 * per-transaction temporary file VFDs are closed, which also causes the
3205 * underlying files to be deleted (although they should've been closed already
3206 * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
3207 * closed. We also forget any transaction-local temp tablespace list.
3208 *
3209 * The isCommit flag is used only to decide whether to emit warnings about
3210 * unclosed files.
3211 */
3212void
3219
3220/*
3221 * BeforeShmemExit_Files
3222 *
3223 * before_shmem_exit hook to clean up temp files during backend shutdown.
3224 * Here, we want to clean up *all* temp files including interXact ones.
3225 */
3226static void
3228{
3229 CleanupTempFiles(false, true);
3230
3231 /* prevent further temp files from being created */
3232#ifdef USE_ASSERT_CHECKING
3234#endif
3235}
3236
3237/*
3238 * Close temporary files and delete their underlying files.
3239 *
3240 * isCommit: if true, this is normal transaction commit, and we don't
3241 * expect any remaining files; warn if there are some.
3242 *
3243 * isProcExit: if true, this is being called as the backend process is
3244 * exiting. If that's the case, we should remove all temporary files; if
3245 * that's not the case, we are being called for transaction commit/abort
3246 * and should only remove transaction-local temp files. In either case,
3247 * also clean up "allocated" stdio files, dirs and fds.
3248 */
3249static void
3251{
3252 Index i;
3253
3254 /*
3255 * Careful here: at proc_exit we need extra cleanup, not just
3256 * xact_temporary files.
3257 */
3259 {
3260 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3261 for (i = 1; i < SizeVfdCache; i++)
3262 {
3263 unsigned short fdstate = VfdCache[i].fdstate;
3264
3265 if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3266 VfdCache[i].fileName != NULL)
3267 {
3268 /*
3269 * If we're in the process of exiting a backend process, close
3270 * all temporary files. Otherwise, only close temporary files
3271 * local to the current transaction. They should be closed by
3272 * the ResourceOwner mechanism already, so this is just a
3273 * debugging cross-check.
3274 */
3275 if (isProcExit)
3276 FileClose(i);
3277 else if (fdstate & FD_CLOSE_AT_EOXACT)
3278 {
3279 elog(WARNING,
3280 "temporary file %s not closed at end-of-transaction",
3281 VfdCache[i].fileName);
3282 FileClose(i);
3283 }
3284 }
3285 }
3286
3288 }
3289
3290 /* Complain if any allocated files remain open at commit. */
3291 if (isCommit && numAllocatedDescs > 0)
3292 elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3294
3295 /* Clean up "allocated" stdio files, dirs and fds. */
3296 while (numAllocatedDescs > 0)
3298}
3299
3300
3301/*
3302 * Remove temporary and temporary relation files left over from a prior
3303 * postmaster session
3304 *
3305 * This should be called during postmaster startup. It will forcibly
3306 * remove any leftover files created by OpenTemporaryFile and any leftover
3307 * temporary relation files created by mdcreate.
3308 *
3309 * During post-backend-crash restart cycle, this routine is called when
3310 * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3311 * queries are using temp files could result in useless storage usage that can
3312 * only be reclaimed by a service restart. The argument against enabling it is
3313 * that someone might want to examine the temporary files for debugging
3314 * purposes. This does however mean that OpenTemporaryFile had better allow for
3315 * collision with an existing temp file name.
3316 *
3317 * NOTE: this function and its subroutines generally report syscall failures
3318 * with ereport(LOG) and keep going. Removing temp files is not so critical
3319 * that we should fail to start the database when we can't do it.
3320 */
3321void
3323{
3325 DIR *spc_dir;
3326 struct dirent *spc_de;
3327
3328 /*
3329 * First process temp files in pg_default ($PGDATA/base)
3330 */
3331 snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3332 RemovePgTempFilesInDir(temp_path, true, false);
3334
3335 /*
3336 * Cycle through temp directories for all non-default tablespaces.
3337 */
3339
3341 {
3342 if (strcmp(spc_de->d_name, ".") == 0 ||
3343 strcmp(spc_de->d_name, "..") == 0)
3344 continue;
3345
3346 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s/%s",
3349 RemovePgTempFilesInDir(temp_path, true, false);
3350
3351 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
3354 }
3355
3357
3358 /*
3359 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3360 * DataDir as well. However, that is *not* cleaned here because doing so
3361 * would create a race condition. It's done separately, earlier in
3362 * postmaster startup.
3363 */
3364}
3365
3366/*
3367 * Process one pgsql_tmp directory for RemovePgTempFiles.
3368 *
3369 * If missing_ok is true, it's all right for the named directory to not exist.
3370 * Any other problem results in a LOG message. (missing_ok should be true at
3371 * the top level, since pgsql_tmp directories are not created until needed.)
3372 *
3373 * At the top level, this should be called with unlink_all = false, so that
3374 * only files matching the temporary name prefix will be unlinked. When
3375 * recursing it will be called with unlink_all = true to unlink everything
3376 * under a top-level temporary directory.
3377 *
3378 * (These two flags could be replaced by one, but it seems clearer to keep
3379 * them separate.)
3380 */
3381void
3382RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3383{
3384 DIR *temp_dir;
3385 struct dirent *temp_de;
3386 char rm_path[MAXPGPATH * 2];
3387
3389
3390 if (temp_dir == NULL && errno == ENOENT && missing_ok)
3391 return;
3392
3394 {
3395 if (strcmp(temp_de->d_name, ".") == 0 ||
3396 strcmp(temp_de->d_name, "..") == 0)
3397 continue;
3398
3399 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3400 tmpdirname, temp_de->d_name);
3401
3402 if (unlink_all ||
3403 strncmp(temp_de->d_name,
3406 {
3408
3409 if (type == PGFILETYPE_ERROR)
3410 continue;
3411 else if (type == PGFILETYPE_DIR)
3412 {
3413 /* recursively remove contents, then directory itself */
3414 RemovePgTempFilesInDir(rm_path, false, true);
3415
3416 if (rmdir(rm_path) < 0)
3417 ereport(LOG,
3419 errmsg("could not remove directory \"%s\": %m",
3420 rm_path)));
3421 }
3422 else
3423 {
3424 if (unlink(rm_path) < 0)
3425 ereport(LOG,
3427 errmsg("could not remove file \"%s\": %m",
3428 rm_path)));
3429 }
3430 }
3431 else
3432 ereport(LOG,
3433 (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3434 rm_path)));
3435 }
3436
3438}
3439
3440/* Process one tablespace directory, look for per-DB subdirectories */
3441static void
3443{
3444 DIR *ts_dir;
3445 struct dirent *de;
3446 char dbspace_path[MAXPGPATH * 2];
3447
3449
3450 while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3451 {
3452 /*
3453 * We're only interested in the per-database directories, which have
3454 * numeric names. Note that this code will also (properly) ignore "."
3455 * and "..".
3456 */
3457 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3458 continue;
3459
3460 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3461 tsdirname, de->d_name);
3463 }
3464
3465 FreeDir(ts_dir);
3466}
3467
3468/* Process one per-dbspace directory for RemovePgTempRelationFiles */
3469static void
3471{
3473 struct dirent *de;
3474 char rm_path[MAXPGPATH * 2];
3475
3477
3479 {
3480 if (!looks_like_temp_rel_name(de->d_name))
3481 continue;
3482
3483 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3484 dbspacedirname, de->d_name);
3485
3486 if (unlink(rm_path) < 0)
3487 ereport(LOG,
3489 errmsg("could not remove file \"%s\": %m",
3490 rm_path)));
3491 }
3492
3494}
3495
3496/* t<digits>_<digits>, or t<digits>_<digits>_<forkname> */
bool
looks_like_temp_rel_name(const char *name)
{
	int			pos;
	int			savepos;

	/* Must start with "t". */
	if (name[0] != 't')
		return false;

	/* Followed by a non-empty string of digits and then an underscore. */
	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (pos == 1 || name[pos] != '_')
		return false;

	/* Followed by another nonempty string of digits. */
	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (savepos == pos)
		return false;

	/* We might have _forkname or .segment or both. */
	if (name[pos] == '_')
	{
		int			forkchar = forkname_chars(&name[pos + 1], NULL);

		if (forkchar <= 0)
			return false;
		/* Skip the underscore plus the fork name itself. */
		pos += forkchar + 1;
	}
	if (name[pos] == '.')
	{
		int			segchar;

		/* segchar starts at 1 to account for the dot; require >= 1 digit. */
		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
			;
		if (segchar <= 1)
			return false;
		pos += segchar;
	}

	/* Now we should be at the end. */
	if (name[pos] != '\0')
		return false;
	return true;
}
3544
3545#ifdef HAVE_SYNCFS
3546static void
3547do_syncfs(const char *path)
3548{
3549 int fd;
3550
3551 ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
3552 path);
3553
3554 fd = OpenTransientFile(path, O_RDONLY);
3555 if (fd < 0)
3556 {
3557 ereport(LOG,
3559 errmsg("could not open file \"%s\": %m", path)));
3560 return;
3561 }
3562 if (syncfs(fd) < 0)
3563 ereport(LOG,
3565 errmsg("could not synchronize file system for file \"%s\": %m", path)));
3567}
3568#endif
3569
3570/*
3571 * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3572 * all potential filesystems, depending on recovery_init_sync_method setting.
3573 *
3574 * We fsync regular files and directories wherever they are, but we
3575 * follow symlinks only for pg_wal and immediately under pg_tblspc.
3576 * Other symlinks are presumed to point at files we're not responsible
3577 * for fsyncing, and might not have privileges to write at all.
3578 *
3579 * Errors are logged but not considered fatal; that's because this is used
3580 * only during database startup, to deal with the possibility that there are
3581 * issued-but-unsynced writes pending against the data directory. We want to
3582 * ensure that such writes reach disk before anything that's done in the new
3583 * run. However, aborting on error would result in failure to start for
3584 * harmless cases such as read-only files in the data directory, and that's
3585 * not good either.
3586 *
3587 * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3588 * rewriting all changes again during recovery.
3589 *
3590 * Note we assume we're chdir'd into PGDATA to begin with.
3591 */
3592void
3594{
3595 bool xlog_is_symlink;
3596
3597 /* We can skip this whole thing if fsync is disabled. */
3598 if (!enableFsync)
3599 return;
3600
3601 /*
3602 * If pg_wal is a symlink, we'll need to recurse into it separately,
3603 * because the first walkdir below will ignore it.
3604 */
3605 xlog_is_symlink = false;
3606
3607 {
3608 struct stat st;
3609
3610 if (lstat("pg_wal", &st) < 0)
3611 ereport(LOG,
3613 errmsg("could not stat file \"%s\": %m",
3614 "pg_wal")));
3615 else if (S_ISLNK(st.st_mode))
3616 xlog_is_symlink = true;
3617 }
3618
3619#ifdef HAVE_SYNCFS
3621 {
3622 DIR *dir;
3623 struct dirent *de;
3624
3625 /*
3626 * On Linux, we don't have to open every single file one by one. We
3627 * can use syncfs() to sync whole filesystems. We only expect
3628 * filesystem boundaries to exist where we tolerate symlinks, namely
3629 * pg_wal and the tablespaces, so we call syncfs() for each of those
3630 * directories.
3631 */
3632
3633 /* Prepare to report progress syncing the data directory via syncfs. */
3635
3636 /* Sync the top level pgdata directory. */
3637 do_syncfs(".");
3638 /* If any tablespaces are configured, sync each of those. */
3640 while ((de = ReadDirExtended(dir, PG_TBLSPC_DIR, LOG)))
3641 {
3642 char path[MAXPGPATH];
3643
3644 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3645 continue;
3646
3647 snprintf(path, MAXPGPATH, "%s/%s", PG_TBLSPC_DIR, de->d_name);
3648 do_syncfs(path);
3649 }
3650 FreeDir(dir);
3651 /* If pg_wal is a symlink, process that too. */
3652 if (xlog_is_symlink)
3653 do_syncfs("pg_wal");
3654 return;
3655 }
3656#endif /* !HAVE_SYNCFS */
3657
3658#ifdef PG_FLUSH_DATA_WORKS
3659 /* Prepare to report progress of the pre-fsync phase. */
3661
3662 /*
3663 * If possible, hint to the kernel that we're soon going to fsync the data
3664 * directory and its contents. Errors in this step are even less
3665 * interesting than normal, so log them only at DEBUG1.
3666 */
3667 walkdir(".", pre_sync_fname, false, DEBUG1);
3668 if (xlog_is_symlink)
3669 walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3671#endif
3672
3673 /* Prepare to report progress syncing the data directory via fsync. */
3675
3676 /*
3677 * Now we do the fsync()s in the same order.
3678 *
3679 * The main call ignores symlinks, so in addition to specially processing
3680 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3681 * process_symlinks = true. Note that if there are any plain directories
3682 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3683 * so we don't worry about optimizing it.
3684 */
3685 walkdir(".", datadir_fsync_fname, false, LOG);
3686 if (xlog_is_symlink)
3687 walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3689}
3690
3691/*
3692 * walkdir: recursively walk a directory, applying the action to each
3693 * regular file and directory (including the named directory itself).
3694 *
3695 * If process_symlinks is true, the action and recursion are also applied
3696 * to regular files and directories that are pointed to by symlinks in the
3697 * given directory; otherwise symlinks are ignored. Symlinks are always
3698 * ignored in subdirectories, ie we intentionally don't pass down the
3699 * process_symlinks flag to recursive calls.
3700 *
3701 * Errors are reported at level elevel, which might be ERROR or less.
3702 *
3703 * See also walkdir in file_utils.c, which is a frontend version of this
3704 * logic.
3705 */
3706static void
3707walkdir(const char *path,
3708 void (*action) (const char *fname, bool isdir, int elevel),
3709 bool process_symlinks,
3710 int elevel)
3711{
3712 DIR *dir;
3713 struct dirent *de;
3714
3715 dir = AllocateDir(path);
3716
3717 while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3718 {
3719 char subpath[MAXPGPATH * 2];
3720
3722
3723 if (strcmp(de->d_name, ".") == 0 ||
3724 strcmp(de->d_name, "..") == 0)
3725 continue;
3726
3727 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3728
3729 switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3730 {
3731 case PGFILETYPE_REG:
3732 (*action) (subpath, false, elevel);
3733 break;
3734 case PGFILETYPE_DIR:
3735 walkdir(subpath, action, false, elevel);
3736 break;
3737 default:
3738
3739 /*
3740 * Errors are already reported directly by get_dirent_type(),
3741 * and any remaining symlinks and unknown file types are
3742 * ignored.
3743 */
3744 break;
3745 }
3746 }
3747
3748 FreeDir(dir); /* we ignore any error here */
3749
3750 /*
3751 * It's important to fsync the destination directory itself as individual
3752 * file fsyncs don't guarantee that the directory entry for the file is
3753 * synced. However, skip this if AllocateDir failed; the action function
3754 * might not be robust against that.
3755 */
3756 if (dir)
3757 (*action) (path, true, elevel);
3758}
3759
3760
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * fname is the path to flush, isdir tells whether it is a directory (in
 * which case nothing is done), and elevel is the level used for reporting
 * failures.  Has the signature of a walkdir() action callback.
 *
 * Ignores errors trying to open unreadable files, and logs other errors at a
 * caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* Unreadable files are silently skipped. */
		if (errno == EACCES)
			return;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3806
/*
 * fsync one entry of the data directory, reporting startup progress.
 *
 * Has the signature of a walkdir() action callback: fname is the path,
 * isdir says whether it is a directory, and elevel is the level at which
 * fsync_fname_ext() logs failures.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * We want to silently ignore errors about unreadable files, so ask
	 * fsync_fname_ext() to do that for us.
	 */
	const bool	ignore_perm = true;

	ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	fsync_fname_ext(fname, isdir, ignore_perm, elevel);
}
3819
/*
 * Remove a file or directory, tolerating its absence in the directory case.
 *
 * Has the signature of a walkdir() action callback.  Directories are removed
 * with rmdir(); a failure other than ENOENT is reported at elevel.  Plain
 * files are handed to PathNameDeleteTemporaryFile() with error_on_failure
 * set to false.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}
3836
3837/*
3838 * fsync_fname_ext -- Try to fsync a file or directory
3839 *
3840 * If ignore_perm is true, ignore errors upon trying to open unreadable
3841 * files. Logs other errors at a caller-specified level.
3842 *
3843 * Returns 0 if the operation succeeded, -1 otherwise.
3844 */
3845int
3846fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3847{
3848 int fd;
3849 int flags;
3850 int returncode;
3851
3852 /*
3853 * Some OSs require directories to be opened read-only whereas other
3854 * systems don't allow us to fsync files opened read-only; so we need both
3855 * cases here. Using O_RDWR will cause us to fail to fsync files that are
3856 * not writable by our userid, but we assume that's OK.
3857 */
3858 flags = PG_BINARY;
3859 if (!isdir)
3860 flags |= O_RDWR;
3861 else
3862 flags |= O_RDONLY;
3863
3864 fd = OpenTransientFile(fname, flags);
3865
3866 /*
3867 * Some OSs don't allow us to open directories at all (Windows returns
3868 * EACCES), just ignore the error in that case. If desired also silently
3869 * ignoring errors about unreadable files. Log others.
3870 */
3871 if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3872 return 0;
3873 else if (fd < 0 && ignore_perm && errno == EACCES)
3874 return 0;
3875 else if (fd < 0)
3876 {
3877 ereport(elevel,
3879 errmsg("could not open file \"%s\": %m", fname)));
3880 return -1;
3881 }
3882
3884
3885 /*
3886 * Some OSes don't allow us to fsync directories at all, so we can ignore
3887 * those errors. Anything else needs to be logged.
3888 */
3889 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3890 {
3891 int save_errno;
3892
3893 /* close file upon error, might not be in transaction context */
3894 save_errno = errno;
3896 errno = save_errno;
3897
3898 ereport(elevel,
3900 errmsg("could not fsync file \"%s\": %m", fname)));
3901 return -1;
3902 }
3903
3904 if (CloseTransientFile(fd) != 0)
3905 {
3906 ereport(elevel,
3908 errmsg("could not close file \"%s\": %m", fname)));
3909 return -1;
3910 }
3911
3912 return 0;
3913}
3914
3915/*
3916 * fsync_parent_path -- fsync the parent path of a file or directory
3917 *
3918 * This is aimed at making file operations persistent on disk in case of
3919 * an OS crash or power failure.
3920 */
3921static int
3922fsync_parent_path(const char *fname, int elevel)
3923{
3924 char parentpath[MAXPGPATH];
3925
3926 strlcpy(parentpath, fname, MAXPGPATH);
3928
3929 /*
3930 * get_parent_directory() returns an empty string if the input argument is
3931 * just a file name (see comments in path.c), so handle that as being the
3932 * current directory.
3933 */
3934 if (strlen(parentpath) == 0)
3936
3937 if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3938 return -1;
3939
3940 return 0;
3941}
3942
3943/*
3944 * Create a PostgreSQL data sub-directory
3945 *
3946 * The data directory itself, and most of its sub-directories, are created at
3947 * initdb time, but we do have some occasions when we create directories in
3948 * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3949 * make sure that those directories are created consistently. Today, that means
3950 * making sure that the created directory has the correct permissions, which is
3951 * what pg_dir_create_mode tracks for us.
3952 *
3953 * Note that we also set the umask() based on what we understand the correct
3954 * permissions to be (see file_perm.c).
3955 *
3956 * For permissions other than the default, mkdir() can be used directly, but
3957 * be sure to consider carefully such cases -- a sub-directory with incorrect
3958 * permissions in a PostgreSQL data directory could cause backups and other
3959 * processes to fail.
3960 */
3961int
3966
3967/*
3968 * Return the passed-in error level, or PANIC if data_sync_retry is off.
3969 *
3970 * Failure to fsync any data file is cause for immediate panic, unless
3971 * data_sync_retry is enabled. Data may have been written to the operating
3972 * system and removed from our buffer pool already, and if we are running on
3973 * an operating system that forgets dirty data on write-back failure, there
3974 * may be only one copy of the data remaining: in the WAL. A later attempt to
3975 * fsync again might falsely report success. Therefore we must not allow any
3976 * further checkpoints to be attempted. data_sync_retry can in theory be
3977 * enabled on systems known not to drop dirty buffered data on write-back
3978 * failure (with the likely outcome that checkpoints will continue to fail
3979 * until the underlying problem is fixed).
3980 *
3981 * Any code that reports a failure from fsync() or related functions should
3982 * filter the error level with this function.
3983 */
3984int
3986{
3987 return data_sync_retry ? elevel : PANIC;
3988}
3989
3990bool
3992{
3993 bool result = true;
3994 int flags;
3995
3996#if PG_O_DIRECT == 0
3997 if (strcmp(*newval, "") != 0)
3998 {
3999 GUC_check_errdetail("\"%s\" is not supported on this platform.",
4000 "debug_io_direct");
4001 result = false;
4002 }
4003 flags = 0;
4004#else
4005 List *elemlist;
4006 ListCell *l;
4007 char *rawstring;
4008
4009 /* Need a modifiable copy of string */
4011
4012 if (!SplitGUCList(rawstring, ',', &elemlist))
4013 {
4014 GUC_check_errdetail("Invalid list syntax in parameter \"%s\".",
4015 "debug_io_direct");
4018 return false;
4019 }
4020
4021 flags = 0;
4022 foreach(l, elemlist)
4023 {
4024 char *item = (char *) lfirst(l);
4025
4026 if (pg_strcasecmp(item, "data") == 0)
4027 flags |= IO_DIRECT_DATA;
4028 else if (pg_strcasecmp(item, "wal") == 0)
4029 flags |= IO_DIRECT_WAL;
4030 else if (pg_strcasecmp(item, "wal_init") == 0)
4031 flags |= IO_DIRECT_WAL_INIT;
4032 else
4033 {
4034 GUC_check_errdetail("Invalid option \"%s\".", item);
4035 result = false;
4036 break;
4037 }
4038 }
4039
4040 /*
4041 * It's possible to configure block sizes smaller than our assumed I/O
4042 * alignment size, which could result in invalid I/O requests.
4043 */
4044#if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
4045 if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
4046 {
4047 GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small.",
4048 "debug_io_direct", "XLOG_BLCKSZ");
4049 result = false;
4050 }
4051#endif
4052#if BLCKSZ < PG_IO_ALIGN_SIZE
4053 if (result && (flags & IO_DIRECT_DATA))
4054 {
4055 GUC_check_errdetail("\"%s\" is not supported for data because %s is too small.",
4056 "debug_io_direct", "BLCKSZ");
4057 result = false;
4058 }
4059#endif
4060
4063#endif
4064
4065 if (!result)
4066 return result;
4067
4068 /* Save the flags in *extra, for use by assign_debug_io_direct */
4069 *extra = guc_malloc(LOG, sizeof(int));
4070 if (!*extra)
4071 return false;
4072 *((int *) *extra) = flags;
4073
4074 return result;
4075}
4076
4077void
4078assign_debug_io_direct(const char *newval, void *extra)
4079{
4080 int *flags = (int *) extra;
4081
4082 io_direct_flags = *flags;
4083}
4084
4085/* ResourceOwner callbacks */
4086
4087static void
4089{
4090 File file = (File) DatumGetInt32(res);
4091 Vfd *vfdP;
4092
4093 Assert(FileIsValid(file));
4094
4095 vfdP = &VfdCache[file];
4096 vfdP->resowner = NULL;
4097
4098 FileClose(file);
4099}
4100
4101static char *
4103{
4104 return psprintf("File %d", DatumGetInt32(res));
4105}
void pgaio_closing_fd(int fd)
Definition aio.c:1220
void pgaio_io_start_readv(PgAioHandle *ioh, int fd, int iovcnt, uint64 offset)
Definition aio_io.c:78
void begin_startup_progress_phase(void)
Definition startup.c:342
int fdatasync(int fd)
#define Min(x, y)
Definition c.h:997
uint32 SubTransactionId
Definition c.h:670
#define INT64_FORMAT
Definition c.h:564
#define Assert(condition)
Definition c.h:873
int64_t int64
Definition c.h:543
#define PG_BINARY
Definition c.h:1287
uint64_t uint64
Definition c.h:547
uint32_t uint32
Definition c.h:546
unsigned int Index
Definition c.h:628
#define MemSet(start, val, len)
Definition c.h:1013
#define OidIsValid(objectId)
Definition c.h:788
size_t Size
Definition c.h:619
int closedir(DIR *)
Definition dirent.c:127
struct dirent * readdir(DIR *)
Definition dirent.c:78
DIR * opendir(const char *)
Definition dirent.c:33
int errcode_for_file_access(void)
Definition elog.c:886
int errdetail(const char *fmt,...)
Definition elog.c:1216
int errcode(int sqlerrcode)
Definition elog.c:863
int errmsg(const char *fmt,...)
Definition elog.c:1080
#define LOG
Definition elog.h:31
#define FATAL
Definition elog.h:41
#define WARNING
Definition elog.h:36
#define DEBUG2
Definition elog.h:29
#define PANIC
Definition elog.h:42
#define DEBUG1
Definition elog.h:30
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
int pg_truncate(const char *path, pgoff_t length)
Definition fd.c:720
int max_files_per_process
Definition fd.c:146
int FileGetRawDesc(File file)
Definition fd.c:2515
int MakePGDirectory(const char *directoryName)
Definition fd.c:3962
int FreeDir(DIR *dir)
Definition fd.c:3008
int recovery_init_sync_method
Definition fd.c:165
static const ResourceOwnerDesc file_resowner_desc
Definition fd.c:364
int pg_fsync_no_writethrough(int fd)
Definition fd.c:441
#define FD_MINFREE
Definition fd.c:138
FILE * OpenPipeStream(const char *command, const char *mode)
Definition fd.c:2730
static int numTempTableSpaces
Definition fd.c:292
static bool ReleaseLruFile(void)
Definition fd.c:1369
void FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
Definition fd.c:2122
int io_direct_flags
Definition fd.c:171
#define FD_DELETE_AT_CLOSE
Definition fd.c:195
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition fd.c:1111
static int maxAllocatedDescs
Definition fd.c:271
int file_extend_method
Definition fd.c:168
static void Delete(File file)
Definition fd.c:1253
static int FreeDesc(AllocateDesc *desc)
Definition fd.c:2786
static long tempFileCounter
Definition fd.c:283
static char * ResOwnerPrintFile(Datum res)
Definition fd.c:4102
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition fd.c:782
char * FilePathName(File file)
Definition fd.c:2499
static void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition fd.c:380
static int pg_ftruncate(int fd, pgoff_t length)
Definition fd.c:703
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition fd.c:3140
static int numAllocatedDescs
Definition fd.c:270
File PathNameOpenTemporaryFile(const char *path, int mode)
Definition fd.c:1888
static void LruDelete(File file)
Definition fd.c:1272
int pg_fdatasync(int fd)
Definition fd.c:480
#define FileIsValid(file)
Definition fd.c:189
void assign_debug_io_direct(const char *newval, void *extra)
Definition fd.c:4078
int FileSync(File file, uint32 wait_event_info)
Definition fd.c:2335
int FileStartReadV(PgAioHandle *ioh, File file, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2204
static int nfile
Definition fd.c:225
int CloseTransientFile(int fd)
Definition fd.c:2854
#define DO_DB(A)
Definition fd.c:183
int BasicOpenFile(const char *fileName, int fileFlags)
Definition fd.c:1089
void closeAllVfds(void)
Definition fd.c:3067
int max_safe_fds
Definition fd.c:159
static File AllocateVfd(void)
Definition fd.c:1401
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition fd.c:1848
void PathNameDeleteTemporaryDir(const char *dirname)
Definition fd.c:1678
int ClosePipeStream(FILE *file)
Definition fd.c:3038
void AtEOXact_Files(bool isCommit)
Definition fd.c:3213
int FileGetRawFlags(File file)
Definition fd.c:2531
static Size SizeVfdCache
Definition fd.c:220
static int nextTempTableSpace
Definition fd.c:293
#define FD_CLOSE_AT_EOXACT
Definition fd.c:196
int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition fd.c:3846
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition fd.c:3821
static void ResOwnerReleaseFile(Datum res)
Definition fd.c:4088
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition fd.c:3442
int FreeFile(FILE *file)
Definition fd.c:2826
ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2148
mode_t FileGetRawMode(File file)
Definition fd.c:2541
static AllocateDesc * allocatedDescs
Definition fd.c:272
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition fd.c:2971
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition fd.c:964
int FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2407
static int FileAccess(File file)
Definition fd.c:1479
pgoff_t FileSize(File file)
Definition fd.c:2447
static void FreeVfd(File file)
Definition fd.c:1459
struct vfd Vfd
int pg_fsync_writethrough(int fd)
Definition fd.c:461
void FileClose(File file)
Definition fd.c:1965
void ReleaseExternalFD(void)
Definition fd.c:1224
#define FD_TEMP_FILE_LIMIT
Definition fd.c:197
void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition fd.c:3382
bool pg_file_exists(const char *name)
Definition fd.c:503
void RemovePgTempFiles(void)
Definition fd.c:3322
#define FileIsNotOpen(file)
Definition fd.c:192
bool TempTablespacesAreSet(void)
Definition fd.c:3125
void fsync_fname(const char *fname, bool isdir)
Definition fd.c:756
int data_sync_elevel(int elevel)
Definition fd.c:3985
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition fd.c:1562
static void Insert(File file)
Definition fd.c:1300
AllocateDescKind
Definition fd.c:251
@ AllocateDescDir
Definition fd.c:254
@ AllocateDescPipe
Definition fd.c:253
@ AllocateDescFile
Definition fd.c:252
@ AllocateDescRawFD
Definition fd.c:255
Oid GetNextTempTableSpace(void)
Definition fd.c:3158
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition fd.c:1575
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition fd.c:3808
static void ReportTemporaryFileUsage(const char *path, pgoff_t size)
Definition fd.c:1515
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition fd.c:1791
void pg_flush_data(int fd, pgoff_t offset, pgoff_t nbytes)
Definition fd.c:525
bool AcquireExternalFD(void)
Definition fd.c:1171
static void RegisterTemporaryFile(File file)
Definition fd.c:1534
#define NUM_RESERVED_FDS
Definition fd.c:129
DIR * AllocateDir(const char *dirname)
Definition fd.c:2890
static Oid * tempTableSpaces
Definition fd.c:291
int FileTruncate(File file, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2464
static bool reserveAllocatedDesc(void)
Definition fd.c:2552
void InitFileAccess(void)
Definition fd.c:903
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition fd.c:3470
File OpenTemporaryFile(bool interXact)
Definition fd.c:1711
int durable_unlink(const char *fname, int elevel)
Definition fd.c:872
static uint64 temporary_files_size
Definition fd.c:239
void ReserveExternalFD(void)
Definition fd.c:1206
int FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2362
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition fd.c:2956
bool looks_like_temp_rel_name(const char *name)
Definition fd.c:3498
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition fd.c:1919
void set_max_safe_fds(void)
Definition fd.c:1044
int pg_fsync(int fd)
Definition fd.c:389
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition fd.c:3250
#define VFD_CLOSED
Definition fd.c:187
static bool have_xact_temporary_files
Definition fd.c:231
static int LruInsert(File file)
Definition fd.c:1322
static int numExternalFDs
Definition fd.c:277
static int fsync_parent_path(const char *fname, int elevel)
Definition fd.c:3922
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition fd.c:1647
FILE * AllocateFile(const char *name, const char *mode)
Definition fd.c:2627
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition fd.c:3180
int OpenTransientFile(const char *fileName, int fileFlags)
Definition fd.c:2677
void InitTemporaryFileAccess(void)
Definition fd.c:933
static Vfd * VfdCache
Definition fd.c:219
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition fd.c:2686
bool data_sync_retry
Definition fd.c:162
int FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2066
ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2230
static void ReleaseLruFiles(void)
Definition fd.c:1391
void SyncDataDirectory(void)
Definition fd.c:3593
bool check_debug_io_direct(char **newval, void **extra, GucSource source)
Definition fd.c:3991
static void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition fd.c:375
static void BeforeShmemExit_Files(int code, Datum arg)
Definition fd.c:3227
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition fd.c:3707
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition fd.c:3096
void TempTablespacePath(char *path, Oid tablespace)
Definition fd.c:1766
#define IO_DIRECT_WAL
Definition fd.h:55
#define IO_DIRECT_DATA
Definition fd.h:54
#define DEFAULT_FILE_EXTEND_METHOD
Definition fd.h:67
#define IO_DIRECT_WAL_INIT
Definition fd.h:56
int File
Definition fd.h:51
#define PG_O_DIRECT
Definition fd.h:123
int pg_file_create_mode
Definition file_perm.c:19
int pg_dir_create_mode
Definition file_perm.c:18
ssize_t pg_pwrite_zeros(int fd, size_t size, pgoff_t offset)
Definition file_utils.c:709
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition file_utils.c:547
#define PG_TEMP_FILES_DIR
Definition file_utils.h:63
#define PG_TEMP_FILE_PREFIX
Definition file_utils.h:64
PGFileType
Definition file_utils.h:19
@ PGFILETYPE_DIR
Definition file_utils.h:23
@ PGFILETYPE_REG
Definition file_utils.h:22
@ PGFILETYPE_ERROR
Definition file_utils.h:20
@ DATA_DIR_SYNC_METHOD_SYNCFS
Definition file_utils.h:30
@ DATA_DIR_SYNC_METHOD_FSYNC
Definition file_utils.h:29
int MyProcPid
Definition globals.c:47
bool enableFsync
Definition globals.c:129
Oid MyDatabaseTableSpace
Definition globals.c:96
void * guc_malloc(int elevel, size_t size)
Definition guc.c:636
#define newval
#define GUC_check_errdetail
Definition guc.h:505
GucSource
Definition guc.h:112
int temp_file_limit
Definition guc_tables.c:560
int log_temp_files
Definition guc_tables.c:555
#define close(a)
Definition win32.h:12
void before_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition ipc.c:344
int j
Definition isn.c:78
int i
Definition isn.c:77
void list_free(List *list)
Definition list.c:1546
Datum subpath(PG_FUNCTION_ARGS)
Definition ltree_op.c:311
char * pstrdup(const char *in)
Definition mcxt.c:1781
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
#define MAP_FAILED
Definition mem.h:43
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:123
void * arg
static char * basedir
static PgChecksumMode mode
#define MAXPGPATH
static ssize_t pg_preadv(int fd, const struct iovec *iov, int iovcnt, pgoff_t offset)
Definition pg_iovec.h:54
static ssize_t pg_pwritev(int fd, const struct iovec *iov, int iovcnt, pgoff_t offset)
Definition pg_iovec.h:93
#define lfirst(lc)
Definition pg_list.h:172
uint64 pg_prng_uint64_range(pg_prng_state *state, uint64 rmin, uint64 rmax)
Definition pg_prng.c:144
pg_prng_state pg_global_prng_state
Definition pg_prng.c:34
static rewind_source * source
Definition pg_rewind.c:89
static char buf[DEFAULT_XLOG_SEG_SIZE]
static char * tablespace
Definition pgbench.c:217
void pgstat_report_tempfile(size_t filesize)
#define pqsignal
Definition port.h:547
int pg_strcasecmp(const char *s1, const char *s2)
void get_parent_directory(char *path)
Definition path.c:1068
#define snprintf
Definition port.h:260
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition strlcpy.c:45
off_t pgoff_t
Definition port.h:421
uint64_t Datum
Definition postgres.h:70
static Datum Int32GetDatum(int32 X)
Definition postgres.h:222
static int32 DatumGetInt32(Datum X)
Definition postgres.h:212
#define InvalidOid
unsigned int Oid
static int fd(const char *x, int i)
static int fb(int x)
char * psprintf(const char *fmt,...)
Definition psprintf.c:43
int forkname_chars(const char *str, ForkNumber *fork)
Definition relpath.c:81
#define PG_TBLSPC_DIR
Definition relpath.h:41
#define TABLESPACE_VERSION_DIRECTORY
Definition relpath.h:33
ResourceOwner CurrentResourceOwner
Definition resowner.c:173
void ResourceOwnerForget(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition resowner.c:561
void ResourceOwnerRemember(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition resowner.c:521
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition resowner.c:449
@ RESOURCE_RELEASE_AFTER_LOCKS
Definition resowner.h:56
#define RELEASE_PRIO_FILES
Definition resowner.h:76
void pg_usleep(long microsec)
Definition signal.c:53
#define realloc(a, b)
#define free(a)
#define malloc(a)
static void error(void)
#define ereport_startup_progress(msg,...)
Definition startup.h:18
SubTransactionId create_subid
Definition fd.c:261
DIR * dir
Definition fd.c:265
FILE * file
Definition fd.c:264
int fd
Definition fd.c:266
union AllocateDesc::@20 desc
AllocateDescKind kind
Definition fd.c:260
Definition dirent.c:26
Definition pg_list.h:54
const char * name
Definition resowner.h:93
unsigned short st_mode
Definition win32_port.h:258
Definition fd.c:200
int fd
Definition fd.c:201
int fileFlags
Definition fd.c:210
File lruLessRecently
Definition fd.c:206
File lruMoreRecently
Definition fd.c:205
pgoff_t fileSize
Definition fd.c:207
char * fileName
Definition fd.c:208
ResourceOwner resowner
Definition fd.c:203
unsigned short fdstate
Definition fd.c:202
File nextFree
Definition fd.c:204
mode_t fileMode
Definition fd.c:211
bool SplitGUCList(char *rawstring, char separator, List **namelist)
Definition varlena.c:2978
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:69
static void pgstat_report_wait_end(void)
Definition wait_event.h:85
const char * type
const char * name
#define fsync(fd)
Definition win32_port.h:83
#define stat
Definition win32_port.h:74
#define EINTR
Definition win32_port.h:361
#define EOPNOTSUPP
Definition win32_port.h:385
#define SIGPIPE
Definition win32_port.h:163
#define lstat(path, sb)
Definition win32_port.h:275
#define S_ISDIR(m)
Definition win32_port.h:315
void _dosmaperr(unsigned long)
Definition win32error.c:177
#define S_ISLNK(m)
Definition win32_port.h:334
#define mkdir(a, b)
Definition win32_port.h:80
#define fstat
Definition win32_port.h:73
#define O_CLOEXEC
Definition win32_port.h:344
SubTransactionId GetCurrentSubTransactionId(void)
Definition xact.c:792
int wal_sync_method
Definition xlog.c:133
@ WAL_SYNC_METHOD_FSYNC_WRITETHROUGH
Definition xlog.h:28
static const char * directory
Definition zic.c:648