PostgreSQL Source Code git master
Loading...
Searching...
No Matches
fd.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * fd.c
4 * Virtual file descriptor code.
5 *
6 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * IDENTIFICATION
10 * src/backend/storage/file/fd.c
11 *
12 * NOTES:
13 *
14 * This code manages a cache of 'virtual' file descriptors (VFDs).
15 * The server opens many file descriptors for a variety of reasons,
16 * including base tables, scratch files (e.g., sort and hash spool
17 * files), and random calls to C library routines like system(3); it
18 * is quite easy to exceed system limits on the number of open files a
19 * single process can have. (This is around 1024 on many modern
20 * operating systems, but may be lower on others.)
21 *
22 * VFDs are managed as an LRU pool, with actual OS file descriptors
23 * being opened and closed as needed. Obviously, if a routine is
24 * opened using these interfaces, all subsequent operations must also
25 * be through these interfaces (the File type is not a real file
26 * descriptor).
27 *
28 * For this scheme to work, most (if not all) routines throughout the
29 * server should use these interfaces instead of calling the C library
30 * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 * may find ourselves short of real file descriptors anyway.
32 *
33 * INTERFACE ROUTINES
34 *
35 * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 * A File opened with OpenTemporaryFile is automatically deleted when the
37 * File is closed, either explicitly or implicitly at end of transaction or
38 * process exit. PathNameOpenFile is intended for files that are held open
39 * for a long time, like relation files. It is the caller's responsibility
40 * to close them; there is no automatic mechanism in fd.c for that.
41 *
42 * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43 * temporary files that have names so that they can be shared between
44 * backends. Such files are automatically closed and count against the
45 * temporary file limit of the backend that creates them, but unlike anonymous
46 * files they are not automatically deleted. See sharedfileset.c for a shared
47 * ownership mechanism that provides automatic cleanup for shared files when
48 * the last of a group of backends detaches.
49 *
50 * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51 * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52 * They behave like the corresponding native functions, except that the handle
53 * is registered with the current subtransaction, and will be automatically
54 * closed at abort. These are intended mainly for short operations like
55 * reading a configuration file; there is a limit on the number of files that
56 * can be opened using these functions at any one time.
57 *
58 * Finally, BasicOpenFile is just a thin wrapper around open() that can
59 * release file descriptors in use by the virtual file descriptors if
60 * necessary. There is no automatic cleanup of file descriptors returned by
61 * BasicOpenFile; it is solely the caller's responsibility to close the file
62 * descriptor by calling close(2).
63 *
64 * If a non-virtual file descriptor needs to be held open for any length of
65 * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66 * (and eventually ReleaseExternalFD), so that we can take it into account
67 * while deciding how many VFDs can be open. This applies to FDs obtained
68 * with BasicOpenFile as well as those obtained without use of any fd.c API.
69 *
70 *-------------------------------------------------------------------------
71 */
72
73#include "postgres.h"
74
75#include <dirent.h>
76#include <sys/file.h>
77#include <sys/param.h>
78#include <sys/resource.h> /* for getrlimit */
79#include <sys/stat.h>
80#include <sys/types.h>
81#ifndef WIN32
82#include <sys/mman.h>
83#endif
84#include <limits.h>
85#include <unistd.h>
86#include <fcntl.h>
87
88#include "access/xact.h"
89#include "access/xlog.h"
91#include "common/file_perm.h"
92#include "common/file_utils.h"
93#include "common/pg_prng.h"
94#include "miscadmin.h"
95#include "pgstat.h"
96#include "postmaster/startup.h"
97#include "storage/aio.h"
98#include "storage/fd.h"
99#include "storage/ipc.h"
100#include "utils/guc.h"
101#include "utils/guc_hooks.h"
102#include "utils/resowner.h"
103#include "utils/varlena.h"
104#include "utils/wait_event.h"
105
106/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
107#if defined(HAVE_SYNC_FILE_RANGE)
108#define PG_FLUSH_DATA_WORKS 1
109#elif !defined(WIN32) && defined(MS_ASYNC)
110#define PG_FLUSH_DATA_WORKS 1
111#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
112#define PG_FLUSH_DATA_WORKS 1
113#endif
114
115/*
116 * We must leave some file descriptors free for system(), the dynamic loader,
117 * and other code that tries to open files without consulting fd.c. This
118 * is the number left free. (While we try fairly hard to prevent EMFILE
119 * errors, there's never any guarantee that we won't get ENFILE due to
120 * other processes chewing up FDs. So it's a bad idea to try to open files
121 * without consulting fd.c. Nonetheless we cannot control all code.)
122 *
123 * Because this is just a fixed setting, we are effectively assuming that
124 * no such code will leave FDs open over the long term; otherwise the slop
125 * is likely to be insufficient. Note in particular that we expect that
126 * loading a shared library does not result in any permanent increase in
127 * the number of open files. (This appears to be true on most if not
128 * all platforms as of Feb 2004.)
129 */
130#define NUM_RESERVED_FDS 10
131
132/*
133 * If we have fewer than this many usable FDs after allowing for the reserved
134 * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
135 * much less than that. Note that this value ensures numExternalFDs can be
136 * at least 16; as of this writing, the contrib/postgres_fdw regression tests
137 * will not pass unless that can grow to at least 14.)
138 */
139#define FD_MINFREE 48
140
141/*
142 * A number of platforms allow individual processes to open many more files
143 * than they can really support when *many* processes do the same thing.
144 * This GUC parameter lets the DBA limit max_safe_fds to something less than
145 * what the postmaster's initial probe suggests will work.
146 */
148
149/*
150 * Maximum number of file descriptors to open for operations that fd.c knows
151 * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
152 * to a conservative value, and remains that way indefinitely in bootstrap or
153 * standalone-backend cases. In normal postmaster operation, the postmaster
154 * calls set_max_safe_fds() late in initialization to update the value, and
155 * that value is then inherited by forked subprocesses.
156 *
157 * Note: the value of max_files_per_process is taken into account while
158 * setting this variable, and so need not be tested separately.
159 */
160int max_safe_fds = FD_MINFREE; /* default if not changed */
161
162/* Whether it is safe to continue running after fsync() fails. */
163bool data_sync_retry = false;
164
165/* How SyncDataDirectory() should do its job. */
167
168/* How data files should be bulk-extended with zeros. */
170
171/* Which kinds of files should be opened with PG_O_DIRECT. */
173
174/* Debugging.... */
175
176#ifdef FDDEBUG
177#define DO_DB(A) \
178 do { \
179 int _do_db_save_errno = errno; \
180 A; \
181 errno = _do_db_save_errno; \
182 } while (0)
183#else
184#define DO_DB(A) \
185 ((void) 0)
186#endif
187
188#define VFD_CLOSED (-1)
189
190#define FileIsValid(file) \
191 ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
192
193#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
194
195/* these are the assigned bits in fdstate below: */
196#define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
197#define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
198#define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
199
200typedef struct vfd
201{
202 int fd; /* current FD, or VFD_CLOSED if none */
203 unsigned short fdstate; /* bitflags for VFD's state */
204 ResourceOwner resowner; /* owner, for automatic cleanup */
205 File nextFree; /* link to next free VFD, if in freelist */
206 File lruMoreRecently; /* doubly linked recency-of-use list */
208 pgoff_t fileSize; /* current size of file (0 if not temporary) */
209 char *fileName; /* name of file, or NULL for unused VFD */
210 /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
211 int fileFlags; /* open(2) flags for (re)opening the file */
212 mode_t fileMode; /* mode to pass to open(2) */
214
215/*
216 * Virtual File Descriptor array pointer and size. This grows as
217 * needed. 'File' values are indexes into this array.
218 * Note that VfdCache[0] is not a usable VFD, just a list header.
219 */
220static Vfd *VfdCache;
222
223/*
224 * Number of file descriptors known to be in use by VFD entries.
225 */
226static int nfile = 0;
227
228/*
229 * Flag to tell whether it's worth scanning VfdCache looking for temp files
230 * to close
231 */
232static bool have_xact_temporary_files = false;
233
234/*
235 * Tracks the total size of all temporary files. Note: when temp_file_limit
236 * is being enforced, this cannot overflow since the limit cannot be more
237 * than INT_MAX kilobytes. When not enforcing, it could theoretically
238 * overflow, but we don't care.
239 */
241
242/* Temporary file access initialized and not yet shut down? */
243#ifdef USE_ASSERT_CHECKING
244static bool temporary_files_allowed = false;
245#endif
246
247/*
248 * List of OS handles opened with AllocateFile, AllocateDir and
249 * OpenTransientFile.
250 */
258
259typedef struct
260{
263 union
264 {
267 int fd;
268 } desc;
270
271static int numAllocatedDescs = 0;
272static int maxAllocatedDescs = 0;
274
275/*
276 * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
277 */
278static int numExternalFDs = 0;
279
280/*
281 * Number of temporary files opened during the current session;
282 * this is used in generation of tempfile names.
283 */
284static long tempFileCounter = 0;
285
286/*
287 * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
288 * indicating that the current database's default tablespace should be used.)
289 * When numTempTableSpaces is -1, this has not been set in the current
290 * transaction.
291 */
293static int numTempTableSpaces = -1;
294static int nextTempTableSpace = 0;
295
296
297/*--------------------
298 *
299 * Private Routines
300 *
301 * Delete - delete a file from the Lru ring
302 * LruDelete - remove a file from the Lru ring and close its FD
303 * Insert - put a file at the front of the Lru ring
304 * LruInsert - put a file at the front of the Lru ring and open it
305 * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
306 * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
307 * AllocateVfd - grab a free (or new) file record (from VfdCache)
308 * FreeVfd - free a file record
309 *
310 * The Least Recently Used ring is a doubly linked list that begins and
311 * ends on element zero. Element zero is special -- it doesn't represent
312 * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
313 * anchor that shows us the beginning/end of the ring.
314 * Only VFD elements that are currently really open (have an FD assigned) are
315 * in the Lru ring. Elements that are "virtually" open can be recognized
316 * by having a non-null fileName field.
317 *
318 * example:
319 *
320 * /--less----\ /---------\
321 * v \ v \
322 * #0 --more---> LeastRecentlyUsed --more-\ \
323 * ^\ | |
324 * \\less--> MostRecentlyUsedFile <---/ |
325 * \more---/ \--less--/
326 *
327 *--------------------
328 */
329static void Delete(File file);
330static void LruDelete(File file);
331static void Insert(File file);
332static int LruInsert(File file);
333static bool ReleaseLruFile(void);
334static void ReleaseLruFiles(void);
335static File AllocateVfd(void);
336static void FreeVfd(File file);
337
338static int FileAccess(File file);
340static bool reserveAllocatedDesc(void);
341static int FreeDesc(AllocateDesc *desc);
342
343static void BeforeShmemExit_Files(int code, Datum arg);
344static void CleanupTempFiles(bool isCommit, bool isProcExit);
345static void RemovePgTempRelationFiles(const char *tsdirname);
347
348static void walkdir(const char *path,
349 void (*action) (const char *fname, bool isdir, int elevel),
350 bool process_symlinks,
351 int elevel);
352#ifdef PG_FLUSH_DATA_WORKS
353static void pre_sync_fname(const char *fname, bool isdir, int elevel);
354#endif
355static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
356static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
357
358static int fsync_parent_path(const char *fname, int elevel);
359
360
361/* ResourceOwner callbacks to hold virtual file descriptors */
362static void ResOwnerReleaseFile(Datum res);
363static char *ResOwnerPrintFile(Datum res);
364
366{
367 .name = "File",
368 .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
369 .release_priority = RELEASE_PRIO_FILES,
370 .ReleaseResource = ResOwnerReleaseFile,
371 .DebugPrint = ResOwnerPrintFile
372};
373
374/* Convenience wrappers over ResourceOwnerRemember/Forget */
375static inline void
380static inline void
385
386/*
387 * pg_fsync --- do fsync with or without writethrough
388 */
389int
391{
392#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
393 struct stat st;
394
395 /*
396 * Some operating system implementations of fsync() have requirements
397 * about the file access modes that were used when their file descriptor
398 * argument was opened, and these requirements differ depending on whether
399 * the file descriptor is for a directory.
400 *
401 * For any file descriptor that may eventually be handed to fsync(), we
402 * should have opened it with access modes that are compatible with
403 * fsync() on all supported systems, otherwise the code may not be
404 * portable, even if it runs ok on the current system.
405 *
406 * We assert here that a descriptor for a file was opened with write
407 * permissions (i.e., not O_RDONLY) and for a directory without write
408 * permissions (O_RDONLY). Notice that the assertion check is made even
409 * if fsync() is disabled.
410 *
411 * If fstat() fails, ignore it and let the follow-up fsync() complain.
412 */
413 if (fstat(fd, &st) == 0)
414 {
415 int desc_flags = fcntl(fd, F_GETFL);
416
418
419 if (S_ISDIR(st.st_mode))
421 else
423 }
424 errno = 0;
425#endif
426
427 /* #if is to skip the wal_sync_method test if there's no need for it */
428#if defined(HAVE_FSYNC_WRITETHROUGH)
431 else
432#endif
434}
435
436
437/*
438 * pg_fsync_no_writethrough --- same as fsync except does nothing if
439 * enableFsync is off
440 */
441int
443{
444 int rc;
445
446 if (!enableFsync)
447 return 0;
448
449retry:
450 rc = fsync(fd);
451
452 if (rc == -1 && errno == EINTR)
453 goto retry;
454
455 return rc;
456}
457
458/*
459 * pg_fsync_writethrough
460 */
461int
463{
464 if (enableFsync)
465 {
466#if defined(F_FULLFSYNC)
467 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
468#else
469 errno = ENOSYS;
470 return -1;
471#endif
472 }
473 else
474 return 0;
475}
476
477/*
478 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
479 */
480int
482{
483 int rc;
484
485 if (!enableFsync)
486 return 0;
487
488retry:
489 rc = fdatasync(fd);
490
491 if (rc == -1 && errno == EINTR)
492 goto retry;
493
494 return rc;
495}
496
497/*
498 * pg_file_exists -- check that a file exists.
499 *
500 * This requires an absolute path to the file. Returns true if the path
501 * exists and is not a directory, false otherwise.
502 */
503bool
505{
506 struct stat st;
507
508 Assert(name != NULL);
509
510 if (stat(name, &st) == 0)
511 return !S_ISDIR(st.st_mode);
512 else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES))
515 errmsg("could not access file \"%s\": %m", name)));
516
517 return false;
518}
519
520/*
521 * pg_flush_data --- advise OS that the described dirty data should be flushed
522 *
523 * offset of 0 with nbytes 0 means that the entire file should be flushed
524 */
525void
526pg_flush_data(int fd, pgoff_t offset, pgoff_t nbytes)
527{
528 /*
529 * Right now file flushing is primarily used to avoid making later
530 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
531 * if fsyncs are disabled - that's a decision we might want to make
532 * configurable at some point.
533 */
534 if (!enableFsync)
535 return;
536
537 /*
538 * We compile all alternatives that are supported on the current platform,
539 * to find portability problems more easily.
540 */
541#if defined(HAVE_SYNC_FILE_RANGE)
542 {
543 int rc;
544 static bool not_implemented_by_kernel = false;
545
547 return;
548
549retry:
550
551 /*
552 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
553 * tells the OS that writeback for the specified blocks should be
554 * started, but that we don't want to wait for completion. Note that
555 * this call might block if too much dirty data exists in the range.
556 * This is the preferable method on OSs supporting it, as it works
557 * reliably when available (contrast to msync()) and doesn't flush out
558 * clean data (like FADV_DONTNEED).
559 */
560 rc = sync_file_range(fd, offset, nbytes,
562 if (rc != 0)
563 {
564 int elevel;
565
566 if (rc == EINTR)
567 goto retry;
568
569 /*
570 * For systems that don't have an implementation of
571 * sync_file_range() such as Windows WSL, generate only one
572 * warning and then suppress all further attempts by this process.
573 */
574 if (errno == ENOSYS)
575 {
576 elevel = WARNING;
578 }
579 else
580 elevel = data_sync_elevel(WARNING);
581
582 ereport(elevel,
584 errmsg("could not flush dirty data: %m")));
585 }
586
587 return;
588 }
589#endif
590#if !defined(WIN32) && defined(MS_ASYNC)
591 {
592 void *p;
593 static int pagesize = 0;
594
595 /*
596 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
597 * writeback. On linux it only does so if MS_SYNC is specified, but
598 * then it does the writeback synchronously. Luckily all common linux
599 * systems have sync_file_range(). This is preferable over
600 * FADV_DONTNEED because it doesn't flush out clean data.
601 *
602 * We map the file (mmap()), tell the kernel to sync back the contents
603 * (msync()), and then remove the mapping again (munmap()).
604 */
605
606 /* mmap() needs actual length if we want to map whole file */
607 if (offset == 0 && nbytes == 0)
608 {
609 nbytes = lseek(fd, 0, SEEK_END);
610 if (nbytes < 0)
611 {
614 errmsg("could not determine dirty data size: %m")));
615 return;
616 }
617 }
618
619 /*
620 * Some platforms reject partial-page mmap() attempts. To deal with
621 * that, just truncate the request to a page boundary. If any extra
622 * bytes don't get flushed, well, it's only a hint anyway.
623 */
624
625 /* fetch pagesize only once */
626 if (pagesize == 0)
628
629 /* align length to pagesize, dropping any fractional page */
630 if (pagesize > 0)
631 nbytes = (nbytes / pagesize) * pagesize;
632
633 /* fractional-page request is a no-op */
634 if (nbytes <= 0)
635 return;
636
637 /*
638 * mmap could well fail, particularly on 32-bit platforms where there
639 * may simply not be enough address space. If so, silently fall
640 * through to the next implementation.
641 */
642 if (nbytes <= (pgoff_t) SSIZE_MAX)
643 p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
644 else
645 p = MAP_FAILED;
646
647 if (p != MAP_FAILED)
648 {
649 int rc;
650
651 rc = msync(p, (size_t) nbytes, MS_ASYNC);
652 if (rc != 0)
653 {
656 errmsg("could not flush dirty data: %m")));
657 /* NB: need to fall through to munmap()! */
658 }
659
660 rc = munmap(p, (size_t) nbytes);
661 if (rc != 0)
662 {
663 /* FATAL error because mapping would remain */
666 errmsg("could not munmap() while flushing data: %m")));
667 }
668
669 return;
670 }
671 }
672#endif
673#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
674 {
675 int rc;
676
677 /*
678 * Signal the kernel that the passed in range should not be cached
679 * anymore. This has the, desired, side effect of writing out dirty
680 * data, and the, undesired, side effect of likely discarding useful
681 * clean cached blocks. For the latter reason this is the least
682 * preferable method.
683 */
684
685 rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
686
687 if (rc != 0)
688 {
689 /* don't error out, this is just a performance optimization */
692 errmsg("could not flush dirty data: %m")));
693 }
694
695 return;
696 }
697#endif
698}
699
700/*
701 * Truncate an open file to a given length.
702 */
703static int
705{
706 int ret;
707
708retry:
709 ret = ftruncate(fd, length);
710
711 if (ret == -1 && errno == EINTR)
712 goto retry;
713
714 return ret;
715}
716
717/*
718 * Truncate a file to a given length by name.
719 */
720int
721pg_truncate(const char *path, pgoff_t length)
722{
723 int ret;
724#ifdef WIN32
725 int save_errno;
726 int fd;
727
729 if (fd >= 0)
730 {
731 ret = pg_ftruncate(fd, length);
735 }
736 else
737 ret = -1;
738#else
739
740retry:
741 ret = truncate(path, length);
742
743 if (ret == -1 && errno == EINTR)
744 goto retry;
745#endif
746
747 return ret;
748}
749
750/*
751 * fsync_fname -- fsync a file or directory, handling errors properly
752 *
753 * Try to fsync a file or directory. When doing the latter, ignore errors that
754 * indicate the OS just doesn't allow/require fsyncing directories.
755 */
756void
757fsync_fname(const char *fname, bool isdir)
758{
760}
761
762/*
763 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
764 *
765 * This routine ensures that, after returning, the effect of renaming file
766 * persists in case of a crash. A crash while this routine is running will
767 * leave you with either the pre-existing or the moved file in place of the
768 * new file; no mixed state or truncated files are possible.
769 *
770 * It does so by using fsync on the old filename and the possibly existing
771 * target filename before the rename, and the target file and directory after.
772 *
773 * Note that rename() cannot be used across arbitrary directories, as they
774 * might not be on the same filesystem. Therefore this routine does not
775 * support renaming across directories.
776 *
777 * Log errors with the caller specified severity.
778 *
779 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
780 * valid upon return.
781 */
782int
783durable_rename(const char *oldfile, const char *newfile, int elevel)
784{
785 int fd;
786
787 /*
788 * First fsync the old and target path (if it exists), to ensure that they
789 * are properly persistent on disk. Syncing the target file is not
790 * strictly necessary, but it makes it easier to reason about crashes;
791 * because it's then guaranteed that either source or target file exists
792 * after a crash.
793 */
794 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
795 return -1;
796
798 if (fd < 0)
799 {
800 if (errno != ENOENT)
801 {
802 ereport(elevel,
804 errmsg("could not open file \"%s\": %m", newfile)));
805 return -1;
806 }
807 }
808 else
809 {
810 if (pg_fsync(fd) != 0)
811 {
812 int save_errno;
813
814 /* close file upon error, might not be in transaction context */
818
819 ereport(elevel,
821 errmsg("could not fsync file \"%s\": %m", newfile)));
822 return -1;
823 }
824
825 if (CloseTransientFile(fd) != 0)
826 {
827 ereport(elevel,
829 errmsg("could not close file \"%s\": %m", newfile)));
830 return -1;
831 }
832 }
833
834 /* Time to do the real deal... */
835 if (rename(oldfile, newfile) < 0)
836 {
837 ereport(elevel,
839 errmsg("could not rename file \"%s\" to \"%s\": %m",
840 oldfile, newfile)));
841 return -1;
842 }
843
844 /*
845 * To guarantee renaming the file is persistent, fsync the file with its
846 * new name, and its containing directory.
847 */
848 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
849 return -1;
850
851 if (fsync_parent_path(newfile, elevel) != 0)
852 return -1;
853
854 return 0;
855}
856
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * This routine ensures that, after returning, the effect of removing file
 * persists in case of a crash.  A crash while this routine is running will
 * leave the system in no mixed state.
 *
 * It does so by using fsync on the parent directory of the file after the
 * actual removal is done.
 *
 * fname is the path to remove; elevel is the severity used for any error
 * report made here or by fsync_parent_path().
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	/* Remove the directory entry first; nothing to sync if that fails. */
	if (unlink(fname) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/*
	 * To guarantee that the removal of the file is persistent, fsync its
	 * parent directory.
	 */
	if (fsync_parent_path(fname, elevel) != 0)
		return -1;

	return 0;
}
893
894/*
895 * InitFileAccess --- initialize this module during backend startup
896 *
897 * This is called during either normal or standalone backend start.
898 * It is *not* called in the postmaster.
899 *
900 * Note that this does not initialize temporary file access, that is
901 * separately initialized via InitTemporaryFileAccess().
902 */
903void
905{
906 Assert(SizeVfdCache == 0); /* call me only once */
907
908 /* initialize cache header entry */
909 VfdCache = (Vfd *) malloc(sizeof(Vfd));
910 if (VfdCache == NULL)
913 errmsg("out of memory")));
914
915 MemSet(&(VfdCache[0]), 0, sizeof(Vfd));
917
918 SizeVfdCache = 1;
919}
920
921/*
922 * InitTemporaryFileAccess --- initialize temporary file access during startup
923 *
924 * This is called during either normal or standalone backend start.
925 * It is *not* called in the postmaster.
926 *
927 * This is separate from InitFileAccess() because temporary file cleanup can
928 * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
929 * our reporting has to happen before that. Low level file access should be
930 * available for longer, hence the separate initialization / shutdown of
931 * temporary file handling.
932 */
933void
935{
936 Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
937 Assert(!temporary_files_allowed); /* call me only once */
938
939 /*
940 * Register before-shmem-exit hook to ensure temp files are dropped while
941 * we can still report stats.
942 */
944
945#ifdef USE_ASSERT_CHECKING
947#endif
948}
949
950/*
951 * count_usable_fds --- count how many FDs the system will let us open,
952 * and estimate how many are already open.
953 *
954 * We stop counting if usable_fds reaches max_to_probe. Note: a small
955 * value of max_to_probe might result in an underestimate of already_open;
956 * we must fill in any "gaps" in the set of used FDs before the calculation
957 * of already_open will give the right answer. In practice, max_to_probe
958 * of a couple of dozen should be enough to ensure good results.
959 *
960 * We assume stderr (FD 2) is available for dup'ing. While the calling
961 * script could theoretically close that, it would be a really bad idea,
962 * since then one risks loss of error messages from, e.g., libc.
963 */
964static void
966{
967 int *fd;
968 int size;
969 int used = 0;
970 int highestfd = 0;
971 int j;
972
973#ifdef HAVE_GETRLIMIT
974 struct rlimit rlim;
976#endif
977
978 size = 1024;
979 fd = (int *) palloc(size * sizeof(int));
980
981#ifdef HAVE_GETRLIMIT
983 if (getrlimit_status != 0)
984 ereport(WARNING, (errmsg("getrlimit failed: %m")));
985#endif /* HAVE_GETRLIMIT */
986
987 /* dup until failure or probe limit reached */
988 for (;;)
989 {
990 int thisfd;
991
992#ifdef HAVE_GETRLIMIT
993
994 /*
995 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
996 * some platforms
997 */
998 if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
999 break;
1000#endif
1001
1002 thisfd = dup(2);
1003 if (thisfd < 0)
1004 {
1005 /* Expect EMFILE or ENFILE, else it's fishy */
1006 if (errno != EMFILE && errno != ENFILE)
1007 elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
1008 break;
1009 }
1010
1011 if (used >= size)
1012 {
1013 size *= 2;
1014 fd = (int *) repalloc(fd, size * sizeof(int));
1015 }
1016 fd[used++] = thisfd;
1017
1018 if (highestfd < thisfd)
1019 highestfd = thisfd;
1020
1021 if (used >= max_to_probe)
1022 break;
1023 }
1024
1025 /* release the files we opened */
1026 for (j = 0; j < used; j++)
1027 close(fd[j]);
1028
1029 pfree(fd);
1030
1031 /*
1032 * Return results. usable_fds is just the number of successful dups. We
1033 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
1034 * number) and so already_open is highestfd+1 - usable_fds.
1035 */
1036 *usable_fds = used;
1037 *already_open = highestfd + 1 - used;
1038}
1039
1040/*
1041 * set_max_safe_fds
1042 * Determine number of file descriptors that fd.c is allowed to use
1043 */
1044void
1046{
1047 int usable_fds;
1048 int already_open;
1049
1050 /*----------
1051 * We want to set max_safe_fds to
1052 * MIN(usable_fds, max_files_per_process)
1053 * less the slop factor for files that are opened without consulting
1054 * fd.c. This ensures that we won't allow more than
1055 * max_files_per_process, or the experimentally-determined EMFILE limit,
1056 * additional files to be opened.
1057 *----------
1058 */
1061
1063
1064 /*
1065 * Take off the FDs reserved for system() etc.
1066 */
1068
1069 /*
1070 * Make sure we still have enough to get by.
1071 */
1073 ereport(FATAL,
1075 errmsg("insufficient file descriptors available to start server process"),
1076 errdetail("System allows %d, server needs at least %d, %d files are already open.",
1079 already_open)));
1080
1081 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1083}
1084
1085/*
1086 * Open a file with BasicOpenFilePerm() and pass default file mode for the
1087 * fileMode parameter.
1088 */
1089int
1090BasicOpenFile(const char *fileName, int fileFlags)
1091{
1092 return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1093}
1094
1095/*
1096 * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1097 *
1098 * This is exported for use by places that really want a plain kernel FD,
1099 * but need to be proof against running out of FDs. Once an FD has been
1100 * successfully returned, it is the caller's responsibility to ensure that
1101 * it will not be leaked on ereport()! Most users should *not* call this
1102 * routine directly, but instead use the VFD abstraction level, which
1103 * provides protection against descriptor leaks as well as management of
1104 * files that need to be open for more than a short period of time.
1105 *
1106 * Ideally this should be the *only* direct call of open() in the backend.
1107 * In practice, the postmaster calls open() directly, and there are some
1108 * direct open() calls done early in backend startup. Those are OK since
1109 * this module wouldn't have any open files to close at that point anyway.
1110 */
1111int
1112BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1113{
1114 int fd;
1115
1116tryAgain:
1117#ifdef PG_O_DIRECT_USE_F_NOCACHE
1118 fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1119#else
1120 fd = open(fileName, fileFlags, fileMode);
1121#endif
1122
1123 if (fd >= 0)
1124 {
1125#ifdef PG_O_DIRECT_USE_F_NOCACHE
1126 if (fileFlags & PG_O_DIRECT)
1127 {
1128 if (fcntl(fd, F_NOCACHE, 1) < 0)
1129 {
1130 int save_errno = errno;
1131
1132 close(fd);
1133 errno = save_errno;
1134 return -1;
1135 }
1136 }
1137#endif
1138
1139 return fd; /* success! */
1140 }
1141
1142 if (errno == EMFILE || errno == ENFILE)
1143 {
1144 int save_errno = errno;
1145
1146 ereport(LOG,
1148 errmsg("out of file descriptors: %m; release and retry")));
1149 errno = 0;
1150 if (ReleaseLruFile())
1151 goto tryAgain;
1152 errno = save_errno;
1153 }
1154
1155 return -1; /* failure */
1156}
1157
1158/*
1159 * AcquireExternalFD - attempt to reserve an external file descriptor
1160 *
1161 * This should be used by callers that need to hold a file descriptor open
1162 * over more than a short interval, but cannot use any of the other facilities
1163 * provided by this module.
1164 *
1165 * The difference between this and the underlying ReserveExternalFD function
1166 * is that this will report failure (by setting errno and returning false)
1167 * if "too many" external FDs are already reserved. This should be used in
1168 * any code where the total number of FDs to be reserved is not predictable
1169 * and small.
1170 */
1171bool
1173{
1174 /*
1175 * We don't want more than max_safe_fds / 3 FDs to be consumed for
1176 * "external" FDs.
1177 */
1178 if (numExternalFDs < max_safe_fds / 3)
1179 {
1181 return true;
1182 }
1183 errno = EMFILE;
1184 return false;
1185}
1186
1187/*
1188 * ReserveExternalFD - report external consumption of a file descriptor
1189 *
1190 * This should be used by callers that need to hold a file descriptor open
1191 * over more than a short interval, but cannot use any of the other facilities
1192 * provided by this module. This just tracks the use of the FD and closes
1193 * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1194 *
1195 * Call this directly only in code where failure to reserve the FD would be
1196 * fatal; for example, the WAL-writing code does so, since the alternative is
1197 * session failure. Also, it's very unwise to do so in code that could
1198 * consume more than one FD per process.
1199 *
1200 * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1201 * available, it doesn't matter too much whether this is called before or
1202 * after actually opening the FD; but doing so beforehand reduces the risk of
1203 * an EMFILE failure if not everybody played nice. In any case, it's solely
1204 * caller's responsibility to keep the external-FD count in sync with reality.
1205 */
1206void
1208{
1209 /*
1210 * Release VFDs if needed to stay safe. Because we do this before
1211 * incrementing numExternalFDs, the final state will be as desired, i.e.,
1212 * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1213 */
1215
1217}
1218
1219/*
1220 * ReleaseExternalFD - report release of an external file descriptor
1221 *
1222 * This is guaranteed not to change errno, so it can be used in failure paths.
1223 */
1224void
1226{
1229}
1230
1231
1232#if defined(FDDEBUG)
1233
1234static void
1235_dump_lru(void)
1236{
1237 int mru = VfdCache[0].lruLessRecently;
1238 Vfd *vfdP = &VfdCache[mru];
1239 char buf[2048];
1240
1241 snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
1242 while (mru != 0)
1243 {
1244 mru = vfdP->lruLessRecently;
1245 vfdP = &VfdCache[mru];
1246 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
1247 }
1248 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
1249 elog(LOG, "%s", buf);
1250}
1251#endif /* FDDEBUG */
1252
1253static void
1255{
1256 Vfd *vfdP;
1257
1258 Assert(file != 0);
1259
1260 DO_DB(elog(LOG, "Delete %d (%s)",
1261 file, VfdCache[file].fileName));
1262 DO_DB(_dump_lru());
1263
1264 vfdP = &VfdCache[file];
1265
1266 VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1267 VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1268
1269 DO_DB(_dump_lru());
1270}
1271
1272static void
1274{
1275 Vfd *vfdP;
1276
1277 Assert(file != 0);
1278
1279 DO_DB(elog(LOG, "LruDelete %d (%s)",
1280 file, VfdCache[file].fileName));
1281
1282 vfdP = &VfdCache[file];
1283
1285
1286 /*
1287 * Close the file. We aren't expecting this to fail; if it does, better
1288 * to leak the FD than to mess up our internal state.
1289 */
1290 if (close(vfdP->fd) != 0)
1292 "could not close file \"%s\": %m", vfdP->fileName);
1293 vfdP->fd = VFD_CLOSED;
1294 --nfile;
1295
1296 /* delete the vfd record from the LRU ring */
1297 Delete(file);
1298}
1299
1300static void
1302{
1303 Vfd *vfdP;
1304
1305 Assert(file != 0);
1306
1307 DO_DB(elog(LOG, "Insert %d (%s)",
1308 file, VfdCache[file].fileName));
1309 DO_DB(_dump_lru());
1310
1311 vfdP = &VfdCache[file];
1312
1313 vfdP->lruMoreRecently = 0;
1314 vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1315 VfdCache[0].lruLessRecently = file;
1316 VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1317
1318 DO_DB(_dump_lru());
1319}
1320
1321/* returns 0 on success, -1 on re-open failure (with errno set) */
1322static int
1324{
1325 Vfd *vfdP;
1326
1327 Assert(file != 0);
1328
1329 DO_DB(elog(LOG, "LruInsert %d (%s)",
1330 file, VfdCache[file].fileName));
1331
1332 vfdP = &VfdCache[file];
1333
1334 if (FileIsNotOpen(file))
1335 {
1336 /* Close excess kernel FDs. */
1338
1339 /*
1340 * The open could still fail for lack of file descriptors, eg due to
1341 * overall system file table being full. So, be prepared to release
1342 * another FD if necessary...
1343 */
1344 vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1345 vfdP->fileMode);
1346 if (vfdP->fd < 0)
1347 {
1348 DO_DB(elog(LOG, "re-open failed: %m"));
1349 return -1;
1350 }
1351 else
1352 {
1353 ++nfile;
1354 }
1355 }
1356
1357 /*
1358 * put it at the head of the Lru ring
1359 */
1360
1361 Insert(file);
1362
1363 return 0;
1364}
1365
1366/*
1367 * Release one kernel FD by closing the least-recently-used VFD.
1368 */
1369static bool
1371{
1372 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1373
1374 if (nfile > 0)
1375 {
1376 /*
1377 * There are opened files and so there should be at least one used vfd
1378 * in the ring.
1379 */
1380 Assert(VfdCache[0].lruMoreRecently != 0);
1381 LruDelete(VfdCache[0].lruMoreRecently);
1382 return true; /* freed a file */
1383 }
1384 return false; /* no files available to free */
1385}
1386
1387/*
1388 * Release kernel FDs as needed to get under the max_safe_fds limit.
1389 * After calling this, it's OK to try to open another file.
1390 */
1391static void
1393{
1395 {
1396 if (!ReleaseLruFile())
1397 break;
1398 }
1399}
1400
1401static File
1403{
1404 Index i;
1405 File file;
1406
1407 DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1408
1409 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1410
1411 if (VfdCache[0].nextFree == 0)
1412 {
1413 /*
1414 * The free list is empty so it is time to increase the size of the
1415 * array. We choose to double it each time this happens. However,
1416 * there's not much point in starting *real* small.
1417 */
1420
1421 if (newCacheSize < 32)
1422 newCacheSize = 32;
1423
1424 /*
1425 * Be careful not to clobber VfdCache ptr if realloc fails.
1426 */
1427 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1428 if (newVfdCache == NULL)
1429 ereport(ERROR,
1431 errmsg("out of memory")));
1433
1434 /*
1435 * Initialize the new entries and link them into the free list.
1436 */
1437 for (i = SizeVfdCache; i < newCacheSize; i++)
1438 {
1439 MemSet(&(VfdCache[i]), 0, sizeof(Vfd));
1440 VfdCache[i].nextFree = i + 1;
1442 }
1445
1446 /*
1447 * Record the new size
1448 */
1450 }
1451
1452 file = VfdCache[0].nextFree;
1453
1455
1456 return file;
1457}
1458
1459static void
1461{
1462 Vfd *vfdP = &VfdCache[file];
1463
1464 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1465 file, vfdP->fileName ? vfdP->fileName : ""));
1466
1467 if (vfdP->fileName != NULL)
1468 {
1469 free(vfdP->fileName);
1470 vfdP->fileName = NULL;
1471 }
1472 vfdP->fdstate = 0x0;
1473
1474 vfdP->nextFree = VfdCache[0].nextFree;
1475 VfdCache[0].nextFree = file;
1476}
1477
1478/* returns 0 on success, -1 on re-open failure (with errno set) */
1479static int
1481{
1482 int returnValue;
1483
1484 DO_DB(elog(LOG, "FileAccess %d (%s)",
1485 file, VfdCache[file].fileName));
1486
1487 /*
1488 * Is the file open? If not, open it and put it at the head of the LRU
1489 * ring (possibly closing the least recently used file to get an FD).
1490 */
1491
1492 if (FileIsNotOpen(file))
1493 {
1494 returnValue = LruInsert(file);
1495 if (returnValue != 0)
1496 return returnValue;
1497 }
1498 else if (VfdCache[0].lruLessRecently != file)
1499 {
1500 /*
1501 * We now know that the file is open and that it is not the last one
1502 * accessed, so we need to move it to the head of the Lru ring.
1503 */
1504
1505 Delete(file);
1506 Insert(file);
1507 }
1508
1509 return 0;
1510}
1511
1512/*
1513 * Called whenever a temporary file is deleted to report its size.
1514 */
1515static void
1516ReportTemporaryFileUsage(const char *path, pgoff_t size)
1517{
1519
1520 if (log_temp_files >= 0)
1521 {
1522 if ((size / 1024) >= log_temp_files)
1523 ereport(LOG,
1524 (errmsg("temporary file: path \"%s\", size %lu",
1525 path, (unsigned long) size)));
1526 }
1527}
1528
1529/*
1530 * Called to register a temporary file for automatic close.
1531 * ResourceOwnerEnlarge(CurrentResourceOwner) must have been called
1532 * before the file was opened.
1533 */
1534static void
1536{
1539
1540 /* Backup mechanism for closing at end of xact. */
1543}
1544
1545/*
1546 * Called when we get a shared invalidation message on some relation.
1547 */
1548#ifdef NOT_USED
1549void
1550FileInvalidate(File file)
1551{
1552 Assert(FileIsValid(file));
1553 if (!FileIsNotOpen(file))
1554 LruDelete(file);
1555}
1556#endif
1557
1558/*
1559 * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1560 * fileMode parameter.
1561 */
1562File
1563PathNameOpenFile(const char *fileName, int fileFlags)
1564{
1565 return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1566}
1567
1568/*
1569 * open a file in an arbitrary directory
1570 *
1571 * NB: if the passed pathname is relative (which it usually is),
1572 * it will be interpreted relative to the process' working directory
1573 * (which should always be $PGDATA when this code is running).
1574 */
1575File
1576PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1577{
1578 char *fnamecopy;
1579 File file;
1580 Vfd *vfdP;
1581
1582 DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1583 fileName, fileFlags, fileMode));
1584
1585 /*
1586 * We need a malloc'd copy of the file name; fail cleanly if no room.
1587 */
1588 fnamecopy = strdup(fileName);
1589 if (fnamecopy == NULL)
1590 ereport(ERROR,
1592 errmsg("out of memory")));
1593
1594 file = AllocateVfd();
1595 vfdP = &VfdCache[file];
1596
1597 /* Close excess kernel FDs. */
1599
1600 /*
1601 * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1602 * client shouldn't be expected to know which kernel descriptors are
1603 * currently open, so it wouldn't make sense for them to be inherited by
1604 * executed subprograms.
1605 */
1606 fileFlags |= O_CLOEXEC;
1607
1608 vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1609
1610 if (vfdP->fd < 0)
1611 {
1612 int save_errno = errno;
1613
1614 FreeVfd(file);
1615 free(fnamecopy);
1616 errno = save_errno;
1617 return -1;
1618 }
1619 ++nfile;
1620 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1621 vfdP->fd));
1622
1623 vfdP->fileName = fnamecopy;
1624 /* Saved flags are adjusted to be OK for re-opening file */
1625 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1626 vfdP->fileMode = fileMode;
1627 vfdP->fileSize = 0;
1628 vfdP->fdstate = 0x0;
1629 vfdP->resowner = NULL;
1630
1631 Insert(file);
1632
1633 return file;
1634}
1635
1636/*
1637 * Create directory 'directory'. If necessary, create 'basedir', which must
1638 * be the directory above it. This is designed for creating the top-level
1639 * temporary directory on demand before creating a directory underneath it.
1640 * Do nothing if the directory already exists.
1641 *
1642 * Directories created within the top-level temporary directory should begin
1643 * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1644 * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1645 * that do not need any particular prefix.
1646*/
1647void
1649{
1650 if (MakePGDirectory(directory) < 0)
1651 {
1652 if (errno == EEXIST)
1653 return;
1654
1655 /*
1656 * Failed. Try to create basedir first in case it's missing. Tolerate
1657 * EEXIST to close a race against another process following the same
1658 * algorithm.
1659 */
1660 if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1661 ereport(ERROR,
1663 errmsg("cannot create temporary directory \"%s\": %m",
1664 basedir)));
1665
1666 /* Try again. */
1667 if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1668 ereport(ERROR,
1670 errmsg("cannot create temporary subdirectory \"%s\": %m",
1671 directory)));
1672 }
1673}
1674
1675/*
1676 * Delete a directory and everything in it, if it exists.
1677 */
1678void
1679PathNameDeleteTemporaryDir(const char *dirname)
1680{
1681 struct stat statbuf;
1682
1683 /* Silently ignore missing directory. */
1684 if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1685 return;
1686
1687 /*
1688 * Currently, walkdir doesn't offer a way for our passed in function to
1689 * maintain state. Perhaps it should, so that we could tell the caller
1690 * whether this operation succeeded or failed. Since this operation is
1691 * used in a cleanup path, we wouldn't actually behave differently: we'll
1692 * just log failures.
1693 */
1694 walkdir(dirname, unlink_if_exists_fname, false, LOG);
1695}
1696
1697/*
1698 * Open a temporary file that will disappear when we close it.
1699 *
1700 * This routine takes care of generating an appropriate tempfile name.
1701 * There's no need to pass in fileFlags or fileMode either, since only
1702 * one setting makes any sense for a temp file.
1703 *
1704 * Unless interXact is true, the file is remembered by CurrentResourceOwner
1705 * to ensure it's closed and deleted when it's no longer needed, typically at
1706 * the end-of-transaction. In most cases, you don't want temporary files to
1707 * outlive the transaction that created them, so this should be false -- but
1708 * if you need "somewhat" temporary storage, this might be useful. In either
1709 * case, the file is removed when the File is explicitly closed.
1710 */
1711File
1712OpenTemporaryFile(bool interXact)
1713{
1714 File file = 0;
1715
1716 Assert(temporary_files_allowed); /* check temp file access is up */
1717
1718 /*
1719 * Make sure the current resource owner has space for this File before we
1720 * open it, if we'll be registering it below.
1721 */
1722 if (!interXact)
1724
1725 /*
1726 * If some temp tablespace(s) have been given to us, try to use the next
1727 * one. If a given tablespace can't be found, we silently fall back to
1728 * the database's default tablespace.
1729 *
1730 * BUT: if the temp file is slated to outlive the current transaction,
1731 * force it into the database's default tablespace, so that it will not
1732 * pose a threat to possible tablespace drop attempts.
1733 */
1734 if (numTempTableSpaces > 0 && !interXact)
1735 {
1737
1738 if (OidIsValid(tblspcOid))
1740 }
1741
1742 /*
1743 * If not, or if tablespace is bad, create in database's default
1744 * tablespace. MyDatabaseTableSpace should normally be set before we get
1745 * here, but just in case it isn't, fall back to pg_default tablespace.
1746 */
1747 if (file <= 0)
1751 true);
1752
1753 /* Mark it for deletion at close and temporary file size limit */
1755
1756 /* Register it with the current resource owner */
1757 if (!interXact)
1759
1760 return file;
1761}
1762
1763/*
1764 * Return the path of the temp directory in a given tablespace.
1765 */
1766void
1768{
1769 /*
1770 * Identify the tempfile directory for this tablespace.
1771 *
1772 * If someone tries to specify pg_global, use pg_default instead.
1773 */
1774 if (tablespace == InvalidOid ||
1777 snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1778 else
1779 {
1780 /* All other tablespaces are accessed via symlinks */
1781 snprintf(path, MAXPGPATH, "%s/%u/%s/%s",
1784 }
1785}
1786
1787/*
1788 * Open a temporary file in a specific tablespace.
1789 * Subroutine for OpenTemporaryFile, which see for details.
1790 */
1791static File
1793{
1794 char tempdirpath[MAXPGPATH];
1795 char tempfilepath[MAXPGPATH];
1796 File file;
1797
1799
1800 /*
1801 * Generate a tempfile name that should be unique within the current
1802 * database instance.
1803 */
1804 snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1806
1807 /*
1808 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1809 * temp file that can be reused.
1810 */
1813 if (file <= 0)
1814 {
1815 /*
1816 * We might need to create the tablespace's tempfile directory, if no
1817 * one has yet done so.
1818 *
1819 * Don't check for an error from MakePGDirectory; it could fail if
1820 * someone else just did the same thing. If it doesn't work then
1821 * we'll bomb out on the second create attempt, instead.
1822 */
1824
1827 if (file <= 0 && rejectError)
1828 elog(ERROR, "could not create temporary file \"%s\": %m",
1829 tempfilepath);
1830 }
1831
1832 return file;
1833}
1834
1835
1836/*
1837 * Create a new file. The directory containing it must already exist. Files
1838 * created this way are subject to temp_file_limit and are automatically
1839 * closed at end of transaction, but are not automatically deleted on close
1840 * because they are intended to be shared between cooperating backends.
1841 *
1842 * If the file is inside the top-level temporary directory, its name should
1843 * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1844 * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1845 * inside a directory created with PathNameCreateTemporaryDir(), in which case
1846 * the prefix isn't needed.
1847 */
1848File
1850{
1851 File file;
1852
1853 Assert(temporary_files_allowed); /* check temp file access is up */
1854
1856
1857 /*
1858 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1859 * temp file that can be reused.
1860 */
1861 file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1862 if (file <= 0)
1863 {
1864 if (error_on_failure)
1865 ereport(ERROR,
1867 errmsg("could not create temporary file \"%s\": %m",
1868 path)));
1869 else
1870 return file;
1871 }
1872
1873 /* Mark it for temp_file_limit accounting. */
1875
1876 /* Register it for automatic close. */
1878
1879 return file;
1880}
1881
1882/*
1883 * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1884 * another backend. Files opened this way don't count against the
1885 * temp_file_limit of the caller, are automatically closed at the end of the
1886 * transaction but are not deleted on close.
1887 */
1888File
1889PathNameOpenTemporaryFile(const char *path, int mode)
1890{
1891 File file;
1892
1893 Assert(temporary_files_allowed); /* check temp file access is up */
1894
1896
1897 file = PathNameOpenFile(path, mode | PG_BINARY);
1898
1899 /* If no such file, then we don't raise an error. */
1900 if (file <= 0 && errno != ENOENT)
1901 ereport(ERROR,
1903 errmsg("could not open temporary file \"%s\": %m",
1904 path)));
1905
1906 if (file > 0)
1907 {
1908 /* Register it for automatic close. */
1910 }
1911
1912 return file;
1913}
1914
1915/*
1916 * Delete a file by pathname. Return true if the file existed, false if
1917 * didn't.
1918 */
1919bool
1921{
1922 struct stat filestats;
1923 int stat_errno;
1924
1925 /* Get the final size for pgstat reporting. */
1926 if (stat(path, &filestats) != 0)
1927 stat_errno = errno;
1928 else
1929 stat_errno = 0;
1930
1931 /*
1932 * Unlike FileClose's automatic file deletion code, we tolerate
1933 * non-existence to support BufFileDeleteFileSet which doesn't know how
1934 * many segments it has to delete until it runs out.
1935 */
1936 if (stat_errno == ENOENT)
1937 return false;
1938
1939 if (unlink(path) < 0)
1940 {
1941 if (errno != ENOENT)
1944 errmsg("could not unlink temporary file \"%s\": %m",
1945 path)));
1946 return false;
1947 }
1948
1949 if (stat_errno == 0)
1950 ReportTemporaryFileUsage(path, filestats.st_size);
1951 else
1952 {
1953 errno = stat_errno;
1954 ereport(LOG,
1956 errmsg("could not stat file \"%s\": %m", path)));
1957 }
1958
1959 return true;
1960}
1961
1962/*
1963 * close a file when done with it
1964 */
1965void
1967{
1968 Vfd *vfdP;
1969
1970 Assert(FileIsValid(file));
1971
1972 DO_DB(elog(LOG, "FileClose: %d (%s)",
1973 file, VfdCache[file].fileName));
1974
1975 vfdP = &VfdCache[file];
1976
1977 if (!FileIsNotOpen(file))
1978 {
1980
1981 /* close the file */
1982 if (close(vfdP->fd) != 0)
1983 {
1984 /*
1985 * We may need to panic on failure to close non-temporary files;
1986 * see LruDelete.
1987 */
1989 "could not close file \"%s\": %m", vfdP->fileName);
1990 }
1991
1992 --nfile;
1993 vfdP->fd = VFD_CLOSED;
1994
1995 /* remove the file from the lru ring */
1996 Delete(file);
1997 }
1998
1999 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2000 {
2001 /* Subtract its size from current usage (do first in case of error) */
2002 temporary_files_size -= vfdP->fileSize;
2003 vfdP->fileSize = 0;
2004 }
2005
2006 /*
2007 * Delete the file if it was temporary, and make a log entry if wanted
2008 */
2009 if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
2010 {
2011 struct stat filestats;
2012 int stat_errno;
2013
2014 /*
2015 * If we get an error, as could happen within the ereport/elog calls,
2016 * we'll come right back here during transaction abort. Reset the
2017 * flag to ensure that we can't get into an infinite loop. This code
2018 * is arranged to ensure that the worst-case consequence is failing to
2019 * emit log message(s), not failing to attempt the unlink.
2020 */
2021 vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
2022
2023
2024 /* first try the stat() */
2025 if (stat(vfdP->fileName, &filestats))
2026 stat_errno = errno;
2027 else
2028 stat_errno = 0;
2029
2030 /* in any case do the unlink */
2031 if (unlink(vfdP->fileName))
2032 ereport(LOG,
2034 errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
2035
2036 /* and last report the stat results */
2037 if (stat_errno == 0)
2038 ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2039 else
2040 {
2041 errno = stat_errno;
2042 ereport(LOG,
2044 errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2045 }
2046 }
2047
2048 /* Unregister it from the resource owner */
2049 if (vfdP->resowner)
2050 ResourceOwnerForgetFile(vfdP->resowner, file);
2051
2052 /*
2053 * Return the Vfd slot to the free list
2054 */
2055 FreeVfd(file);
2056}
2057
2058/*
2059 * FilePrefetch - initiate asynchronous read of a given range of the file.
2060 *
2061 * Returns 0 on success, otherwise an errno error code (like posix_fadvise()).
2062 *
2063 * posix_fadvise() is the simplest standardized interface that accomplishes
2064 * this.
2065 */
2066int
2067FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2068{
2069 Assert(FileIsValid(file));
2070
2071 DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2072 file, VfdCache[file].fileName,
2073 (int64) offset, (int64) amount));
2074
2075#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2076 {
2077 int returnCode;
2078
2079 returnCode = FileAccess(file);
2080 if (returnCode < 0)
2081 return returnCode;
2082
2083retry:
2084 pgstat_report_wait_start(wait_event_info);
2085 returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2088
2089 if (returnCode == EINTR)
2090 goto retry;
2091
2092 return returnCode;
2093 }
2094#elif defined(__darwin__)
2095 {
2096 struct radvisory
2097 {
2098 off_t ra_offset; /* offset into the file */
2099 int ra_count; /* size of the read */
2100 } ra;
2101 int returnCode;
2102
2103 returnCode = FileAccess(file);
2104 if (returnCode < 0)
2105 return returnCode;
2106
2107 ra.ra_offset = offset;
2108 ra.ra_count = amount;
2109 pgstat_report_wait_start(wait_event_info);
2112 if (returnCode != -1)
2113 return 0;
2114 else
2115 return errno;
2116 }
2117#else
2118 return 0;
2119#endif
2120}
2121
2122void
2123FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
2124{
2125 int returnCode;
2126
2127 Assert(FileIsValid(file));
2128
2129 DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2130 file, VfdCache[file].fileName,
2131 (int64) offset, (int64) nbytes));
2132
2133 if (nbytes <= 0)
2134 return;
2135
2136 if (VfdCache[file].fileFlags & PG_O_DIRECT)
2137 return;
2138
2139 returnCode = FileAccess(file);
2140 if (returnCode < 0)
2141 return;
2142
2143 pgstat_report_wait_start(wait_event_info);
2144 pg_flush_data(VfdCache[file].fd, offset, nbytes);
2146}
2147
2148ssize_t
2149FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset,
2150 uint32 wait_event_info)
2151{
2153 Vfd *vfdP;
2154
2155 Assert(FileIsValid(file));
2156
2157 DO_DB(elog(LOG, "FileReadV: %d (%s) " INT64_FORMAT " %d",
2158 file, VfdCache[file].fileName,
2159 (int64) offset,
2160 iovcnt));
2161
2162 returnCode = FileAccess(file);
2163 if (returnCode < 0)
2164 return returnCode;
2165
2166 vfdP = &VfdCache[file];
2167
2168retry:
2169 pgstat_report_wait_start(wait_event_info);
2170 returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
2172
2173 if (returnCode < 0)
2174 {
2175 /*
2176 * Windows may run out of kernel buffers and return "Insufficient
2177 * system resources" error. Wait a bit and retry to solve it.
2178 *
2179 * It is rumored that EINTR is also possible on some Unix filesystems,
2180 * in which case immediate retry is indicated.
2181 */
2182#ifdef WIN32
2184
2185 switch (error)
2186 {
2188 pg_usleep(1000L);
2189 errno = EINTR;
2190 break;
2191 default:
2193 break;
2194 }
2195#endif
2196 /* OK to retry if interrupted */
2197 if (errno == EINTR)
2198 goto retry;
2199 }
2200
2201 return returnCode;
2202}
2203
2204int
2206 int iovcnt, pgoff_t offset,
2207 uint32 wait_event_info)
2208{
2209 int returnCode;
2210 Vfd *vfdP;
2211
2212 Assert(FileIsValid(file));
2213
2214 DO_DB(elog(LOG, "FileStartReadV: %d (%s) " INT64_FORMAT " %d",
2215 file, VfdCache[file].fileName,
2216 (int64) offset,
2217 iovcnt));
2218
2219 returnCode = FileAccess(file);
2220 if (returnCode < 0)
2221 return returnCode;
2222
2223 vfdP = &VfdCache[file];
2224
2225 pgaio_io_start_readv(ioh, vfdP->fd, iovcnt, offset);
2226
2227 return 0;
2228}
2229
2230ssize_t
2231FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset,
2232 uint32 wait_event_info)
2233{
2235 Vfd *vfdP;
2236
2237 Assert(FileIsValid(file));
2238
2239 DO_DB(elog(LOG, "FileWriteV: %d (%s) " INT64_FORMAT " %d",
2240 file, VfdCache[file].fileName,
2241 (int64) offset,
2242 iovcnt));
2243
2244 returnCode = FileAccess(file);
2245 if (returnCode < 0)
2246 return returnCode;
2247
2248 vfdP = &VfdCache[file];
2249
2250 /*
2251 * If enforcing temp_file_limit and it's a temp file, check to see if the
2252 * write would overrun temp_file_limit, and throw error if so. Note: it's
2253 * really a modularity violation to throw error here; we should set errno
2254 * and return -1. However, there's no way to report a suitable error
2255 * message if we do that. All current callers would just throw error
2256 * immediately anyway, so this is safe at present.
2257 */
2258 if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2259 {
2260 pgoff_t past_write = offset;
2261
2262 for (int i = 0; i < iovcnt; ++i)
2263 past_write += iov[i].iov_len;
2264
2265 if (past_write > vfdP->fileSize)
2266 {
2268
2270 if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2271 ereport(ERROR,
2273 errmsg("temporary file size exceeds \"temp_file_limit\" (%dkB)",
2274 temp_file_limit)));
2275 }
2276 }
2277
2278retry:
2279 pgstat_report_wait_start(wait_event_info);
2280 returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
2282
2283 if (returnCode >= 0)
2284 {
2285 /*
2286 * Some callers expect short writes to set errno, and traditionally we
2287 * have assumed that they imply disk space shortage. We don't want to
2288 * waste CPU cycles adding up the total size here, so we'll just set
2289 * it for all successful writes in case such a caller determines that
2290 * the write was short and ereports "%m".
2291 */
2292 errno = ENOSPC;
2293
2294 /*
2295 * Maintain fileSize and temporary_files_size if it's a temp file.
2296 */
2297 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2298 {
2299 pgoff_t past_write = offset + returnCode;
2300
2301 if (past_write > vfdP->fileSize)
2302 {
2303 temporary_files_size += past_write - vfdP->fileSize;
2304 vfdP->fileSize = past_write;
2305 }
2306 }
2307 }
2308 else
2309 {
2310 /*
2311 * See comments in FileReadV()
2312 */
2313#ifdef WIN32
2315
2316 switch (error)
2317 {
2319 pg_usleep(1000L);
2320 errno = EINTR;
2321 break;
2322 default:
2324 break;
2325 }
2326#endif
2327 /* OK to retry if interrupted */
2328 if (errno == EINTR)
2329 goto retry;
2330 }
2331
2332 return returnCode;
2333}
2334
2335int
2336FileSync(File file, uint32 wait_event_info)
2337{
2338 int returnCode;
2339
2340 Assert(FileIsValid(file));
2341
2342 DO_DB(elog(LOG, "FileSync: %d (%s)",
2343 file, VfdCache[file].fileName));
2344
2345 returnCode = FileAccess(file);
2346 if (returnCode < 0)
2347 return returnCode;
2348
2349 pgstat_report_wait_start(wait_event_info);
2350 returnCode = pg_fsync(VfdCache[file].fd);
2352
2353 return returnCode;
2354}
2355
2356/*
2357 * Zero a region of the file.
2358 *
2359 * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2360 * appropriate error.
2361 */
2362int
2363FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2364{
2365 int returnCode;
2367
2368 Assert(FileIsValid(file));
2369
2370 DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2371 file, VfdCache[file].fileName,
2372 (int64) offset, (int64) amount));
2373
2374 returnCode = FileAccess(file);
2375 if (returnCode < 0)
2376 return returnCode;
2377
2378 pgstat_report_wait_start(wait_event_info);
2379 written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2381
2382 if (written < 0)
2383 return -1;
2384 else if (written != amount)
2385 {
2386 /* if errno is unset, assume problem is no disk space */
2387 if (errno == 0)
2388 errno = ENOSPC;
2389 return -1;
2390 }
2391
2392 return 0;
2393}
2394
2395/*
2396 * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
2397 * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
2398 * use FileZero() instead.
2399 *
2400 * Note that at least glibc() implements posix_fallocate() in userspace if not
2401 * implemented by the filesystem. That's not the case for all environments
2402 * though.
2403 *
2404 * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2405 * appropriate error.
2406 */
2407int
2408FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2409{
2410#ifdef HAVE_POSIX_FALLOCATE
2411 int returnCode;
2412
2413 Assert(FileIsValid(file));
2414
2415 DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2416 file, VfdCache[file].fileName,
2417 (int64) offset, (int64) amount));
2418
2419 returnCode = FileAccess(file);
2420 if (returnCode < 0)
2421 return -1;
2422
2423retry:
2424 pgstat_report_wait_start(wait_event_info);
2425 returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2427
2428 if (returnCode == 0)
2429 return 0;
2430 else if (returnCode == EINTR)
2431 goto retry;
2432
2433 /* for compatibility with %m printing etc */
2434 errno = returnCode;
2435
2436 /*
2437 * Return in cases of a "real" failure, if fallocate is not supported,
2438 * fall through to the FileZero() backed implementation.
2439 */
2441 return -1;
2442#endif
2443
2444 return FileZero(file, offset, amount, wait_event_info);
2445}
2446
2447pgoff_t
2449{
2450 Assert(FileIsValid(file));
2451
2452 DO_DB(elog(LOG, "FileSize %d (%s)",
2453 file, VfdCache[file].fileName));
2454
2455 if (FileIsNotOpen(file))
2456 {
2457 if (FileAccess(file) < 0)
2458 return (pgoff_t) -1;
2459 }
2460
2461 return lseek(VfdCache[file].fd, 0, SEEK_END);
2462}
2463
2464int
2465FileTruncate(File file, pgoff_t offset, uint32 wait_event_info)
2466{
2467 int returnCode;
2468
2469 Assert(FileIsValid(file));
2470
2471 DO_DB(elog(LOG, "FileTruncate %d (%s)",
2472 file, VfdCache[file].fileName));
2473
2474 returnCode = FileAccess(file);
2475 if (returnCode < 0)
2476 return returnCode;
2477
2478 pgstat_report_wait_start(wait_event_info);
2479 returnCode = pg_ftruncate(VfdCache[file].fd, offset);
2481
2482 if (returnCode == 0 && VfdCache[file].fileSize > offset)
2483 {
2484 /* adjust our state for truncation of a temp file */
2485 Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2486 temporary_files_size -= VfdCache[file].fileSize - offset;
2487 VfdCache[file].fileSize = offset;
2488 }
2489
2490 return returnCode;
2491}
2492
2493/*
2494 * Return the pathname associated with an open file.
2495 *
2496 * The returned string points to an internal buffer, which is valid until
2497 * the file is closed.
2498 */
/*
 * NOTE(review): the declarator line that belongs between the return type and
 * the opening brace is absent from this listing.  The body reads a parameter
 * named "file" and indexes VfdCache[] with it; presumably the signature is
 * FilePathName(File file) -- confirm against upstream before compiling.
 */
2499char *
2501{
2502 Assert(FileIsValid(file));
2503
/* hand back the name stored in the VFD entry; no copy is made */
2504 return VfdCache[file].fileName;
2505}
2506
2507/*
2508 * Return the raw file descriptor of an opened file.
2509 *
2510 * The returned file descriptor will be valid until the file is closed, but
2511 * there are a lot of things that can make that happen. So the caller should
2512 * be careful not to do much of anything else before it finishes using the
2513 * returned file descriptor.
2514 */
/*
 * NOTE(review): the declarator line is absent from this listing.  The body
 * uses a parameter named "file"; presumably this is
 * FileGetRawDesc(File file) -- confirm against upstream.
 *
 * Returns the negative FileAccess() result on failure, otherwise the fd
 * stored in the VFD entry.
 */
2515int
2517{
2518 int returnCode;
2519
/* make the file available via FileAccess() before handing out its fd */
2520 returnCode = FileAccess(file);
2521 if (returnCode < 0)
2522 return returnCode;
2523
2524 Assert(FileIsValid(file));
2525 return VfdCache[file].fd;
2526}
2527
2528/*
2529 * FileGetRawFlags - returns the file flags on open(2)
2530 */
2531int
2533{
2534 Assert(FileIsValid(file));
2535 return VfdCache[file].fileFlags;
2536}
2537
2538/*
2539 * FileGetRawMode - returns the mode bitmask passed to open(2)
2540 */
2541mode_t
2543{
2544 Assert(FileIsValid(file));
2545 return VfdCache[file].fileMode;
2546}
2547
2548/*
2549 * Make room for another allocatedDescs[] array entry if needed and possible.
2550 * Returns true if an array element is available.
2551 */
/*
 * NOTE(review): several lines of this function are absent from this listing
 * (see the gaps in the embedded numbering): the declarator, the declaration
 * and assignments of newDescs, the conditions guarding the "quick out"
 * return and the enlargement block, and part of the ereport() call.  Callers
 * in this file invoke it as reserveAllocatedDesc() with no arguments.
 * Restore the missing lines from upstream before relying on this text.
 */
2552static bool
2554{
2556 int newMax;
2557
2558 /* Quick out if array already has a free slot. */
2560 return true;
2561
2562 /*
2563 * If the array hasn't yet been created in the current process, initialize
2564 * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2565 * we will ever need, anyway. We don't want to look at max_safe_fds
2566 * immediately because set_max_safe_fds() may not have run yet.
2567 */
2568 if (allocatedDescs == NULL)
2569 {
2570 newMax = FD_MINFREE / 3;
2572 /* Out of memory already? Treat as fatal error. */
2573 if (newDescs == NULL)
2574 ereport(ERROR,
2576 errmsg("out of memory")));
2579 return true;
2580 }
2581
2582 /*
2583 * Consider enlarging the array beyond the initial allocation used above.
2584 * By the time this happens, max_safe_fds should be known accurately.
2585 *
2586 * We mustn't let allocated descriptors hog all the available FDs, and in
2587 * practice we'd better leave a reasonable number of FDs for VFD use. So
2588 * set the maximum to max_safe_fds / 3. (This should certainly be at
2589 * least as large as the initial size, FD_MINFREE / 3, so we aren't
2590 * tightening the restriction here.) Recall that "external" FDs are
2591 * allowed to consume another third of max_safe_fds.
2592 */
2593 newMax = max_safe_fds / 3;
2595 {
2597 newMax * sizeof(AllocateDesc));
2598 /* Treat out-of-memory as a non-fatal error. */
2599 if (newDescs == NULL)
2600 return false;
2603 return true;
2604 }
2605
2606 /* Can't enlarge allocatedDescs[] any more. */
2607 return false;
2608}
2609
2610/*
2611 * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2612 * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2613 * necessary to open the file. When done, call FreeFile rather than fclose.
2614 *
2615 * Note that files that will be open for any significant length of time
2616 * should NOT be handled this way, since they cannot share kernel file
2617 * descriptors with other files; there is grave risk of running out of FDs
2618 * if anyone locks down too many FDs. Most callers of this routine are
2619 * simply reading a config file that they will read and close immediately.
2620 *
2621 * fd.c will automatically close all files opened with AllocateFile at
2622 * transaction commit or abort; this prevents FD leakage if a routine
2623 * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2624 *
2625 * Ideally this should be the *only* direct call of fopen() in the backend.
2626 */
/*
 * NOTE(review): this listing has lost lines inside the function (gaps in the
 * embedded numbering): the tail of the DO_DB() call, parts of both ereport()
 * calls, the statement following "Close excess kernel FDs", and the
 * declaration/bookkeeping of "desc" in the success branch.  Restore them
 * from upstream before relying on this text.
 *
 * Visible behavior: fopen() is retried after ReleaseLruFile() succeeds when
 * it failed with EMFILE or ENFILE; otherwise NULL is returned with errno
 * left describing the failure.
 */
2627FILE *
2628AllocateFile(const char *name, const char *mode)
2629{
2630 FILE *file;
2631
2632 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2634
2635 /* Can we allocate another non-virtual FD? */
2636 if (!reserveAllocatedDesc())
2637 ereport(ERROR,
2639 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2641
2642 /* Close excess kernel FDs. */
2644
2645TryAgain:
2646 if ((file = fopen(name, mode)) != NULL)
2647 {
2649
2650 desc->kind = AllocateDescFile;
2651 desc->desc.file = file;
2654 return desc->desc.file;
2655 }
2656
2657 if (errno == EMFILE || errno == ENFILE)
2658 {
/* remember fopen's errno; it is put back if no retry is possible */
2659 int save_errno = errno;
2660
2661 ereport(LOG,
2663 errmsg("out of file descriptors: %m; release and retry")));
2664 errno = 0;
2665 if (ReleaseLruFile())
2666 goto TryAgain;
2667 errno = save_errno;
2668 }
2669
2670 return NULL;
2671}
2672
2673/*
2674 * Open a file with OpenTransientFilePerm() and pass default file mode for
2675 * the fileMode parameter.
2676 */
2677int
2678OpenTransientFile(const char *fileName, int fileFlags)
2679{
2680 return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2681}
2682
2683/*
2684 * Like AllocateFile, but returns an unbuffered fd like open(2)
2685 */
/*
 * NOTE(review): this listing has lost lines inside the function (gaps in the
 * embedded numbering): part of the ereport() call, the statement following
 * "Close excess kernel FDs", and the declaration/bookkeeping of "desc" in
 * the success branch.  Restore them from upstream before relying on this
 * text.
 *
 * Visible behavior: returns the fd from BasicOpenFilePerm() when it is
 * non-negative, otherwise -1.
 */
2686int
2687OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2688{
2689 int fd;
2690
2691 DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2692 numAllocatedDescs, fileName));
2693
2694 /* Can we allocate another non-virtual FD? */
2695 if (!reserveAllocatedDesc())
2696 ereport(ERROR,
2698 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2699 maxAllocatedDescs, fileName)));
2700
2701 /* Close excess kernel FDs. */
2703
2704 fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2705
2706 if (fd >= 0)
2707 {
2709
2710 desc->kind = AllocateDescRawFD;
2711 desc->desc.fd = fd;
2714
2715 return fd;
2716 }
2717
2718 return -1; /* failure */
2719}
2720
2721/*
2722 * Routines that want to initiate a pipe stream should use OpenPipeStream
2723 * rather than plain popen(). This lets fd.c deal with freeing FDs if
2724 * necessary. When done, call ClosePipeStream rather than pclose.
2725 *
2726 * This function also ensures that the popen'd program is run with default
2727 * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2728 * uses. This ensures desirable response to, eg, closing a read pipe early.
2729 */
/*
 * NOTE(review): this listing has lost lines inside the function (gaps in the
 * embedded numbering): parts of both ereport() calls, the statement
 * following "Close excess kernel FDs", one statement on each side of the
 * popen() call, and the declaration/bookkeeping of "desc" in the success
 * branch.  Restore them from upstream before relying on this text.
 *
 * Visible behavior: popen() is retried after ReleaseLruFile() succeeds when
 * it failed with EMFILE or ENFILE; otherwise NULL is returned.
 */
2730FILE *
2731OpenPipeStream(const char *command, const char *mode)
2732{
2733 FILE *file;
2734 int save_errno;
2735
2736 DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2737 numAllocatedDescs, command));
2738
2739 /* Can we allocate another non-virtual FD? */
2740 if (!reserveAllocatedDesc())
2741 ereport(ERROR,
2743 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2744 maxAllocatedDescs, command)));
2745
2746 /* Close excess kernel FDs. */
2748
2749TryAgain:
/* flush all open output streams before starting the child */
2750 fflush(NULL);
2752 errno = 0;
2753 file = popen(command, mode);
/* capture popen's errno so it can be re-established below */
2754 save_errno = errno;
2756 errno = save_errno;
2757 if (file != NULL)
2758 {
2760
2761 desc->kind = AllocateDescPipe;
2762 desc->desc.file = file;
2765 return desc->desc.file;
2766 }
2767
2768 if (errno == EMFILE || errno == ENFILE)
2769 {
2770 ereport(LOG,
2772 errmsg("out of file descriptors: %m; release and retry")));
2773 if (ReleaseLruFile())
2774 goto TryAgain;
2775 errno = save_errno;
2776 }
2777
2778 return NULL;
2779}
2780
2781/*
2782 * Free an AllocateDesc of any type.
2783 *
2784 * The argument *must* point into the allocatedDescs[] array.
2785 */
/*
 * NOTE(review): the declarator line and the statements that follow the
 * "Compact storage" comment are absent from this listing.  Callers in this
 * file pass a pointer to an allocatedDescs[] element (FreeDesc(desc)).
 * Restore the missing lines from upstream before relying on this text.
 *
 * Returns the result of the close routine matching desc->kind.
 */
2786static int
2788{
2789 int result;
2790
2791 /* Close the underlying object */
2792 switch (desc->kind)
2793 {
2794 case AllocateDescFile:
2795 result = fclose(desc->desc.file);
2796 break;
2797 case AllocateDescPipe:
2798 result = pclose(desc->desc.file);
2799 break;
2800 case AllocateDescDir:
2801 result = closedir(desc->desc.dir);
2802 break;
2803 case AllocateDescRawFD:
/* raw fds get a pgaio_closing_fd() call before being closed */
2804 pgaio_closing_fd(desc->desc.fd);
2805 result = close(desc->desc.fd);
2806 break;
2807 default:
2808 elog(ERROR, "AllocateDesc kind not recognized");
2809 result = 0; /* keep compiler quiet */
2810 break;
2811 }
2812
2813 /* Compact storage in the allocatedDescs array */
2816
2817 return result;
2818}
2819
2820/*
2821 * Close a file returned by AllocateFile.
2822 *
2823 * Note we do not check fclose's return value --- it is up to the caller
2824 * to handle close errors.
2825 */
2826int
2828{
2829 int i;
2830
2831 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2832
2833 /* Remove file from list of allocated files, if it's present */
2834 for (i = numAllocatedDescs; --i >= 0;)
2835 {
2836 AllocateDesc *desc = &allocatedDescs[i];
2837
2838 if (desc->kind == AllocateDescFile && desc->desc.file == file)
2839 return FreeDesc(desc);
2840 }
2841
2842 /* Only get here if someone passes us a file not in allocatedDescs */
2843 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2844
2845 return fclose(file);
2846}
2847
2848/*
2849 * Close a file returned by OpenTransientFile.
2850 *
2851 * Note we do not check close's return value --- it is up to the caller
2852 * to handle close errors.
2853 */
2854int
2856{
2857 int i;
2858
2859 DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2860
2861 /* Remove fd from list of allocated files, if it's present */
2862 for (i = numAllocatedDescs; --i >= 0;)
2863 {
2864 AllocateDesc *desc = &allocatedDescs[i];
2865
2866 if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2867 return FreeDesc(desc);
2868 }
2869
2870 /* Only get here if someone passes us a file not in allocatedDescs */
2871 elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2872
2874
2875 return close(fd);
2876}
2877
2878/*
2879 * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2880 * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2881 * necessary to open the directory, and with closing it after an elog.
2882 * When done, call FreeDir rather than closedir.
2883 *
2884 * Returns NULL, with errno set, on failure. Note that failure detection
2885 * is commonly left to the following call of ReadDir or ReadDirExtended;
2886 * see the comments for ReadDir.
2887 *
2888 * Ideally this should be the *only* direct call of opendir() in the backend.
2889 */
/*
 * NOTE(review): this listing has lost lines inside the function (gaps in the
 * embedded numbering): parts of both ereport() calls, the statement
 * following "Close excess kernel FDs", and the declaration/bookkeeping of
 * "desc" in the success branch.  Restore them from upstream before relying
 * on this text.
 *
 * Visible behavior: opendir() is retried after ReleaseLruFile() succeeds
 * when it failed with EMFILE or ENFILE; otherwise NULL is returned with
 * errno left describing the failure.
 */
2890DIR *
2891AllocateDir(const char *dirname)
2892{
2893 DIR *dir;
2894
2895 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2896 numAllocatedDescs, dirname));
2897
2898 /* Can we allocate another non-virtual FD? */
2899 if (!reserveAllocatedDesc())
2900 ereport(ERROR,
2902 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2903 maxAllocatedDescs, dirname)));
2904
2905 /* Close excess kernel FDs. */
2907
2908TryAgain:
2909 if ((dir = opendir(dirname)) != NULL)
2910 {
2912
2913 desc->kind = AllocateDescDir;
2914 desc->desc.dir = dir;
2917 return desc->desc.dir;
2918 }
2919
2920 if (errno == EMFILE || errno == ENFILE)
2921 {
/* remember opendir's errno; it is put back if no retry is possible */
2922 int save_errno = errno;
2923
2924 ereport(LOG,
2926 errmsg("out of file descriptors: %m; release and retry")));
2927 errno = 0;
2928 if (ReleaseLruFile())
2929 goto TryAgain;
2930 errno = save_errno;
2931 }
2932
2933 return NULL;
2934}
2935
2936/*
2937 * Read a directory opened with AllocateDir, ereport'ing any error.
2938 *
2939 * This is easier to use than raw readdir() since it takes care of some
2940 * otherwise rather tedious and error-prone manipulation of errno. Also,
2941 * if you are happy with a generic error message for AllocateDir failure,
2942 * you can just do
2943 *
2944 * dir = AllocateDir(path);
2945 * while ((dirent = ReadDir(dir, path)) != NULL)
2946 * process dirent;
2947 * FreeDir(dir);
2948 *
2949 * since a NULL dir parameter is taken as indicating AllocateDir failed.
2950 * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2951 * use this shortcut.)
2952 *
2953 * The pathname passed to AllocateDir must be passed to this routine too,
2954 * but it is only used for error reporting.
2955 */
2956struct dirent *
2957ReadDir(DIR *dir, const char *dirname)
2958{
2959 return ReadDirExtended(dir, dirname, ERROR);
2960}
2961
2962/*
2963 * Alternate version of ReadDir that allows caller to specify the elevel
2964 * for any error report (whether it's reporting an initial failure of
2965 * AllocateDir or a subsequent directory read failure).
2966 *
2967 * If elevel < ERROR, returns NULL after any error. With the normal coding
2968 * pattern, this will result in falling out of the loop immediately as
2969 * though the directory contained no (more) entries.
2970 */
2971struct dirent *
2972ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2973{
2974 struct dirent *dent;
2975
2976 /* Give a generic message for AllocateDir failure, if caller didn't */
2977 if (dir == NULL)
2978 {
2979 ereport(elevel,
2981 errmsg("could not open directory \"%s\": %m",
2982 dirname)));
2983 return NULL;
2984 }
2985
2986 errno = 0;
2987 if ((dent = readdir(dir)) != NULL)
2988 return dent;
2989
2990 if (errno)
2991 ereport(elevel,
2993 errmsg("could not read directory \"%s\": %m",
2994 dirname)));
2995 return NULL;
2996}
2997
2998/*
2999 * Close a directory opened with AllocateDir.
3000 *
3001 * Returns closedir's return value (with errno set if it's not 0).
3002 * Note we do not check the return value --- it is up to the caller
3003 * to handle close errors if wanted.
3004 *
3005 * Does nothing if dir == NULL; we assume that directory open failure was
3006 * already reported if desired.
3007 */
3008int
3010{
3011 int i;
3012
3013 /* Nothing to do if AllocateDir failed */
3014 if (dir == NULL)
3015 return 0;
3016
3017 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
3018
3019 /* Remove dir from list of allocated dirs, if it's present */
3020 for (i = numAllocatedDescs; --i >= 0;)
3021 {
3022 AllocateDesc *desc = &allocatedDescs[i];
3023
3024 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
3025 return FreeDesc(desc);
3026 }
3027
3028 /* Only get here if someone passes us a dir not in allocatedDescs */
3029 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
3030
3031 return closedir(dir);
3032}
3033
3034
3035/*
3036 * Close a pipe stream returned by OpenPipeStream.
3037 */
3038int
3040{
3041 int i;
3042
3043 DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
3044
3045 /* Remove file from list of allocated files, if it's present */
3046 for (i = numAllocatedDescs; --i >= 0;)
3047 {
3048 AllocateDesc *desc = &allocatedDescs[i];
3049
3050 if (desc->kind == AllocateDescPipe && desc->desc.file == file)
3051 return FreeDesc(desc);
3052 }
3053
3054 /* Only get here if someone passes us a file not in allocatedDescs */
3055 elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
3056
3057 return pclose(file);
3058}
3059
3060/*
3061 * closeAllVfds
3062 *
3063 * Force all VFDs into the physically-closed state, so that the fewest
3064 * possible number of kernel file descriptors are in use. There is no
3065 * change in the logical state of the VFDs.
3066 */
3067void
3069{
3070 Index i;
3071
3072 if (SizeVfdCache > 0)
3073 {
3074 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3075 for (i = 1; i < SizeVfdCache; i++)
3076 {
3077 if (!FileIsNotOpen(i))
3078 LruDelete(i);
3079 }
3080 }
3081}
3082
3083
3084/*
3085 * SetTempTablespaces
3086 *
3087 * Define a list (actually an array) of OIDs of tablespaces to use for
3088 * temporary files. This list will be used until end of transaction,
3089 * unless this function is called again before then. It is caller's
3090 * responsibility that the passed-in array has adequate lifespan (typically
3091 * it'd be allocated in TopTransactionContext).
3092 *
3093 * Some entries of the array may be InvalidOid, indicating that the current
3094 * database's default tablespace should be used.
3095 */
/*
 * NOTE(review): the declarator line, the statements following the Assert,
 * and the left-hand sides of both branches of the final if/else are absent
 * from this listing (gaps in the embedded numbering).  Restore them from
 * upstream before relying on this text.
 */
3096void
3098{
3099 Assert(numSpaces >= 0);
3102
3103 /*
3104 * Select a random starting point in the list. This is to minimize
3105 * conflicts between backends that are most likely sharing the same list
3106 * of temp tablespaces. Note that if we create multiple temp files in the
3107 * same transaction, we'll advance circularly through the list --- this
3108 * ensures that large temporary sort files are nicely spread across all
3109 * available tablespaces.
3110 */
3111 if (numSpaces > 1)
3113 0, numSpaces - 1);
3114 else
3116}
3117
3118/*
3119 * TempTablespacesAreSet
3120 *
3121 * Returns true if SetTempTablespaces has been called in current transaction.
3122 * (This is just so that tablespaces.c doesn't need its own per-transaction
3123 * state.)
3124 */
3125bool
3127{
3128 return (numTempTableSpaces >= 0);
3129}
3130
3131/*
3132 * GetTempTablespaces
3133 *
3134 * Populate an array with the OIDs of the tablespaces that should be used for
3135 * temporary files. (Some entries may be InvalidOid, indicating that the
3136 * current database's default tablespace should be used.) At most numSpaces
3137 * entries will be filled.
3138 * Returns the number of OIDs that were copied into the output array.
3139 */
/*
 * NOTE(review): the declarator line, one statement ahead of the loop, and
 * the loop body are absent from this listing (gaps in the embedded
 * numbering).  Restore them from upstream before relying on this text.
 */
3140int
3142{
3143 int i;
3144
/* stops at whichever is smaller: numTempTableSpaces or numSpaces */
3146 for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
3148
3149 return i;
3150}
3151
3152/*
3153 * GetNextTempTableSpace
3154 *
3155 * Select the next temp tablespace to use. A result of InvalidOid means
3156 * to use the current database's default tablespace.
3157 */
/*
 * NOTE(review): the declarator line and the statements inside the
 * "numTempTableSpaces > 0" branch are absent from this listing (gaps in the
 * embedded numbering).  Restore them from upstream before relying on this
 * text.
 */
3158Oid
3160{
3161 if (numTempTableSpaces > 0)
3162 {
3163 /* Advance nextTempTableSpace counter with wraparound */
3167 }
/* InvalidOid is returned when control reaches this point */
3168 return InvalidOid;
3169}
3170
3171
3172/*
3173 * AtEOSubXact_Files
3174 *
3175 * Take care of subtransaction commit/abort. At abort, we close AllocateDescs
3176 * that the subtransaction may have opened. At commit, we reassign them to
3177 * the parent subtransaction. (Temporary files are tracked by ResourceOwners
3178 * instead.)
3179 */
/*
 * NOTE(review): the declarator lines and the statements in both the commit
 * and the abort branch are absent from this listing (gaps in the embedded
 * numbering).  The body refers to "isCommit" and "mySubid", which are
 * presumably parameters -- confirm against upstream.
 */
3180void
3183{
3184 Index i;
3185
3186 for (i = 0; i < numAllocatedDescs; i++)
3187 {
/* only entries whose create_subid equals mySubid are considered */
3188 if (allocatedDescs[i].create_subid == mySubid)
3189 {
3190 if (isCommit)
3192 else
3193 {
3194 /* have to recheck the item after FreeDesc (ugly) */
3196 }
3197 }
3198 }
3199}
3200
3201/*
3202 * AtEOXact_Files
3203 *
3204 * This routine is called during transaction commit or abort. All still-open
3205 * per-transaction temporary file VFDs are closed, which also causes the
3206 * underlying files to be deleted (although they should've been closed already
3207 * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
3208 * closed. We also forget any transaction-local temp tablespace list.
3209 *
3210 * The isCommit flag is used only to decide whether to emit warnings about
3211 * unclosed files.
3212 */
3213void
3220
3221/*
3222 * BeforeShmemExit_Files
3223 *
3224 * before_shmem_exit hook to clean up temp files during backend shutdown.
3225 * Here, we want to clean up *all* temp files including interXact ones.
3226 */
/*
 * NOTE(review): the declarator line and the statement inside the
 * USE_ASSERT_CHECKING section are absent from this listing (gaps in the
 * embedded numbering).  Restore them from upstream before relying on this
 * text.
 */
3227static void
3229{
/* arguments are (isCommit = false, isProcExit = true), per the CleanupTempFiles header comment */
3230 CleanupTempFiles(false, true);
3231
3232 /* prevent further temp files from being created */
3233#ifdef USE_ASSERT_CHECKING
3235#endif
3236}
3237
3238/*
3239 * Close temporary files and delete their underlying files.
3240 *
3241 * isCommit: if true, this is normal transaction commit, and we don't
3242 * expect any remaining files; warn if there are some.
3243 *
3244 * isProcExit: if true, this is being called as the backend process is
3245 * exiting. If that's the case, we should remove all temporary files; if
3246 * that's not the case, we are being called for transaction commit/abort
3247 * and should only remove transaction-local temp files. In either case,
3248 * also clean up "allocated" stdio files, dirs and fds.
3249 */
/*
 * NOTE(review): this listing has lost lines inside the function (gaps in the
 * embedded numbering): the declarator, the condition that opens the first
 * block, the statement after the VFD loop, the argument of the WARNING at
 * commit, and the body of the final while loop.  The body refers to
 * "isCommit" and "isProcExit", and BeforeShmemExit_Files calls
 * CleanupTempFiles(false, true).  Restore the missing lines from upstream
 * before relying on this text.
 */
3250static void
3252{
3253 Index i;
3254
3255 /*
3256 * Careful here: at proc_exit we need extra cleanup, not just
3257 * xact_temporary files.
3258 */
3260 {
3261 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3262 for (i = 1; i < SizeVfdCache; i++)
3263 {
3264 unsigned short fdstate = VfdCache[i].fdstate;
3265
/* consider only entries flagged for deletion or end-of-xact close that still have a name */
3266 if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3267 VfdCache[i].fileName != NULL)
3268 {
3269 /*
3270 * If we're in the process of exiting a backend process, close
3271 * all temporary files. Otherwise, only close temporary files
3272 * local to the current transaction. They should be closed by
3273 * the ResourceOwner mechanism already, so this is just a
3274 * debugging cross-check.
3275 */
3276 if (isProcExit)
3277 FileClose(i);
3278 else if (fdstate & FD_CLOSE_AT_EOXACT)
3279 {
3280 elog(WARNING,
3281 "temporary file %s not closed at end-of-transaction",
3282 VfdCache[i].fileName);
3283 FileClose(i);
3284 }
3285 }
3286 }
3287
3289 }
3290
3291 /* Complain if any allocated files remain open at commit. */
3292 if (isCommit && numAllocatedDescs > 0)
3293 elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3295
3296 /* Clean up "allocated" stdio files, dirs and fds. */
3297 while (numAllocatedDescs > 0)
3299}
3300
3301
3302/*
3303 * Remove temporary and temporary relation files left over from a prior
3304 * postmaster session
3305 *
3306 * This should be called during postmaster startup. It will forcibly
3307 * remove any leftover files created by OpenTemporaryFile and any leftover
3308 * temporary relation files created by mdcreate.
3309 *
3310 * During post-backend-crash restart cycle, this routine is called when
3311 * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3312 * queries are using temp files could result in useless storage usage that can
3313 * only be reclaimed by a service restart. The argument against enabling it is
3314 * that someone might want to examine the temporary files for debugging
3315 * purposes. This does however mean that OpenTemporaryFile had better allow for
3316 * collision with an existing temp file name.
3317 *
3318 * NOTE: this function and its subroutines generally report syscall failures
3319 * with ereport(LOG) and keep going. Removing temp files is not so critical
3320 * that we should fail to start the database when we can't do it.
3321 */
/*
 * NOTE(review): this listing has lost lines inside the function (gaps in the
 * embedded numbering): the declarator, the declaration of temp_path, the
 * statement after the first RemovePgTempFilesInDir() call, the opening of
 * the tablespace directory and the loop header, the argument lists of both
 * snprintf() calls in the loop, the call following the second snprintf(),
 * and the statement after the loop.  Restore them from upstream before
 * relying on this text.
 */
3322void
3324{
3326 DIR *spc_dir;
3327 struct dirent *spc_de;
3328
3329 /*
3330 * First process temp files in pg_default ($PGDATA/base)
3331 */
3332 snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3333 RemovePgTempFilesInDir(temp_path, true, false);
3335
3336 /*
3337 * Cycle through temp directories for all non-default tablespaces.
3338 */
3340
3342 {
/* skip the "." and ".." entries */
3343 if (strcmp(spc_de->d_name, ".") == 0 ||
3344 strcmp(spc_de->d_name, "..") == 0)
3345 continue;
3346
3347 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s/%s",
3350 RemovePgTempFilesInDir(temp_path, true, false);
3351
3352 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
3355 }
3356
3358
3359 /*
3360 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3361 * DataDir as well. However, that is *not* cleaned here because doing so
3362 * would create a race condition. It's done separately, earlier in
3363 * postmaster startup.
3364 */
3365}
3366
3367/*
3368 * Process one pgsql_tmp directory for RemovePgTempFiles.
3369 *
3370 * If missing_ok is true, it's all right for the named directory to not exist.
3371 * Any other problem results in a LOG message. (missing_ok should be true at
3372 * the top level, since pgsql_tmp directories are not created until needed.)
3373 *
3374 * At the top level, this should be called with unlink_all = false, so that
3375 * only files matching the temporary name prefix will be unlinked. When
3376 * recursing it will be called with unlink_all = true to unlink everything
3377 * under a top-level temporary directory.
3378 *
3379 * (These two flags could be replaced by one, but it seems clearer to keep
3380 * them separate.)
3381 */
/*
 * NOTE(review): this listing has lost lines inside the function (gaps in the
 * embedded numbering): the statement that opens temp_dir, the loop header,
 * the remaining arguments of the strncmp() test, the declaration of "type",
 * parts of both ereport() calls in the removal branches, and the statement
 * after the loop.  Restore them from upstream before relying on this text.
 */
3382void
3383RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3384{
3385 DIR *temp_dir;
3386 struct dirent *temp_de;
3387 char rm_path[MAXPGPATH * 2];
3388
3390
/* a nonexistent directory is silently accepted when missing_ok is set */
3391 if (temp_dir == NULL && errno == ENOENT && missing_ok)
3392 return;
3393
3395 {
3396 if (strcmp(temp_de->d_name, ".") == 0 ||
3397 strcmp(temp_de->d_name, "..") == 0)
3398 continue;
3399
3400 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3401 tmpdirname, temp_de->d_name);
3402
3403 if (unlink_all ||
3404 strncmp(temp_de->d_name,
3407 {
3409
3410 if (type == PGFILETYPE_ERROR)
3411 continue;
3412 else if (type == PGFILETYPE_DIR)
3413 {
3414 /* recursively remove contents, then directory itself */
3415 RemovePgTempFilesInDir(rm_path, false, true);
3416
3417 if (rmdir(rm_path) < 0)
3418 ereport(LOG,
3420 errmsg("could not remove directory \"%s\": %m",
3421 rm_path)));
3422 }
3423 else
3424 {
3425 if (unlink(rm_path) < 0)
3426 ereport(LOG,
3428 errmsg("could not remove file \"%s\": %m",
3429 rm_path)));
3430 }
3431 }
3432 else
3433 ereport(LOG,
3434 (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3435 rm_path)));
3436 }
3437
3439}
3440
3441/* Process one tablespace directory, look for per-DB subdirectories */
/*
 * NOTE(review): the declarator line, the statement that opens ts_dir, and
 * the call made for each per-database directory after the snprintf() are
 * absent from this listing (gaps in the embedded numbering).  The body uses
 * "tsdirname", presumably the sole parameter -- confirm against upstream.
 */
3442static void
3444{
3445 DIR *ts_dir;
3446 struct dirent *de;
3447 char dbspace_path[MAXPGPATH * 2];
3448
3450
3451 while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3452 {
3453 /*
3454 * We're only interested in the per-database directories, which have
3455 * numeric names. Note that this code will also (properly) ignore "."
3456 * and "..".
3457 */
3458 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3459 continue;
3460
3461 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3462 tsdirname, de->d_name);
3464 }
3465
3466 FreeDir(ts_dir);
3467}
3468
3469/* Process one per-dbspace directory for RemovePgTempRelationFiles */
/*
 * NOTE(review): the declarator line, the directory-handle declaration, the
 * statement that opens the directory, the loop header, part of the
 * ereport() call, and the statement after the loop are absent from this
 * listing (gaps in the embedded numbering).  The body uses
 * "dbspacedirname", presumably the sole parameter -- confirm against
 * upstream.
 */
3470static void
3472{
3474 struct dirent *de;
3475 char rm_path[MAXPGPATH * 2];
3476
3478
3480 {
/* leave alone anything that does not look like a temp relation file */
3481 if (!looks_like_temp_rel_name(de->d_name))
3482 continue;
3483
3484 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3485 dbspacedirname, de->d_name);
3486
3487 if (unlink(rm_path) < 0)
3488 ereport(LOG,
3490 errmsg("could not remove file \"%s\": %m",
3491 rm_path)));
3492 }
3493
3495}
3496
/*
 * looks_like_temp_rel_name
 *
 * Does "name" have the form of a temporary relation file name?  That is
 * t<digits>_<digits>, or t<digits>_<digits>_<forkname>, either of which may
 * be followed by .<digits> (a segment suffix).
 *
 * Returns true only if the entire string matches.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	int			pos;
	int			savepos;

	/* Must start with "t". */
	if (name[0] != 't')
		return false;

	/* Followed by a non-empty string of digits and then an underscore. */
	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (pos == 1 || name[pos] != '_')
		return false;

	/* Followed by another nonempty string of digits. */
	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (savepos == pos)
		return false;

	/* We might have _forkname or .segment or both. */
	if (name[pos] == '_')
	{
		int			forkchar = forkname_chars(&name[pos + 1], NULL);

		/* a non-positive result means no fork name follows the underscore */
		if (forkchar <= 0)
			return false;
		pos += forkchar + 1;
	}
	if (name[pos] == '.')
	{
		int			segchar;

		/* segchar counts the '.' plus the digits that follow it */
		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
			;
		if (segchar <= 1)
			return false;
		pos += segchar;
	}

	/* Now we should be at the end. */
	if (name[pos] != '\0')
		return false;
	return true;
}
3545
#ifdef HAVE_SYNCFS
/*
 * do_syncfs
 *
 * Call syncfs() on the file system containing "path".
 *
 * Progress is reported through ereport_startup_progress().  Failure to open
 * the path or to sync is reported at LOG level and otherwise ignored.  The
 * transient file descriptor is always released before returning.
 */
static void
do_syncfs(const char *path)
{
	int			fd;

	ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
							 path);

	fd = OpenTransientFile(path, O_RDONLY);
	if (fd < 0)
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
		return;
	}
	if (syncfs(fd) < 0)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not synchronize file system for file \"%s\": %m", path)));

	/* done with the descriptor whether or not syncfs() succeeded */
	CloseTransientFile(fd);
}
#endif
3570
3571/*
3572 * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3573 * all potential filesystem, depending on recovery_init_sync_method setting.
3574 *
3575 * We fsync regular files and directories wherever they are, but we
3576 * follow symlinks only for pg_wal and immediately under pg_tblspc.
3577 * Other symlinks are presumed to point at files we're not responsible
3578 * for fsyncing, and might not have privileges to write at all.
3579 *
3580 * Errors are logged but not considered fatal; that's because this is used
3581 * only during database startup, to deal with the possibility that there are
3582 * issued-but-unsynced writes pending against the data directory. We want to
3583 * ensure that such writes reach disk before anything that's done in the new
3584 * run. However, aborting on error would result in failure to start for
3585 * harmless cases such as read-only files in the data directory, and that's
3586 * not good either.
3587 *
3588 * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3589 * rewriting all changes again during recovery.
3590 *
3591 * Note we assume we're chdir'd into PGDATA to begin with.
3592 */
/*
 * NOTE(review): this listing has lost lines inside the function (gaps in the
 * embedded numbering): the declarator, part of the ereport() call after
 * lstat(), the condition that opens the HAVE_SYNCFS block, the statement
 * that opens "dir", the progress-reporting setup statements, and the final
 * call in each walkdir() group (the comments say pg_tblspc is visited
 * separately).  Restore them from upstream before relying on this text.
 */
3593void
3595{
3596 bool xlog_is_symlink;
3597
3598 /* We can skip this whole thing if fsync is disabled. */
3599 if (!enableFsync)
3600 return;
3601
3602 /*
3603 * If pg_wal is a symlink, we'll need to recurse into it separately,
3604 * because the first walkdir below will ignore it.
3605 */
3606 xlog_is_symlink = false;
3607
3608 {
3609 struct stat st;
3610
/* lstat() rather than stat(), so a symlink is seen as such */
3611 if (lstat("pg_wal", &st) < 0)
3612 ereport(LOG,
3614 errmsg("could not stat file \"%s\": %m",
3615 "pg_wal")));
3616 else if (S_ISLNK(st.st_mode))
3617 xlog_is_symlink = true;
3618 }
3619
3620#ifdef HAVE_SYNCFS
3622 {
3623 DIR *dir;
3624 struct dirent *de;
3625
3626 /*
3627 * On Linux, we don't have to open every single file one by one. We
3628 * can use syncfs() to sync whole filesystems. We only expect
3629 * filesystem boundaries to exist where we tolerate symlinks, namely
3630 * pg_wal and the tablespaces, so we call syncfs() for each of those
3631 * directories.
3632 */
3633
3634 /* Prepare to report progress syncing the data directory via syncfs. */
3636
3637 /* Sync the top level pgdata directory. */
3638 do_syncfs(".");
3639 /* If any tablespaces are configured, sync each of those. */
3641 while ((de = ReadDirExtended(dir, PG_TBLSPC_DIR, LOG)))
3642 {
3643 char path[MAXPGPATH];
3644
3645 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3646 continue;
3647
3648 snprintf(path, MAXPGPATH, "%s/%s", PG_TBLSPC_DIR, de->d_name);
3649 do_syncfs(path);
3650 }
3651 FreeDir(dir);
3652 /* If pg_wal is a symlink, process that too. */
3653 if (xlog_is_symlink)
3654 do_syncfs("pg_wal");
3655 return;
3656 }
3657#endif /* !HAVE_SYNCFS */
3658
3659#ifdef PG_FLUSH_DATA_WORKS
3660 /* Prepare to report progress of the pre-fsync phase. */
3662
3663 /*
3664 * If possible, hint to the kernel that we're soon going to fsync the data
3665 * directory and its contents. Errors in this step are even less
3666 * interesting than normal, so log them only at DEBUG1.
3667 */
3668 walkdir(".", pre_sync_fname, false, DEBUG1);
3669 if (xlog_is_symlink)
3670 walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3672#endif
3673
3674 /* Prepare to report progress syncing the data directory via fsync. */
3676
3677 /*
3678 * Now we do the fsync()s in the same order.
3679 *
3680 * The main call ignores symlinks, so in addition to specially processing
3681 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3682 * process_symlinks = true. Note that if there are any plain directories
3683 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3684 * so we don't worry about optimizing it.
3685 */
3686 walkdir(".", datadir_fsync_fname, false, LOG);
3687 if (xlog_is_symlink)
3688 walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3690}
3691
3692/*
3693 * walkdir: recursively walk a directory, applying the action to each
3694 * regular file and directory (including the named directory itself).
3695 *
3696 * If process_symlinks is true, the action and recursion are also applied
3697 * to regular files and directories that are pointed to by symlinks in the
3698 * given directory; otherwise symlinks are ignored. Symlinks are always
3699 * ignored in subdirectories, ie we intentionally don't pass down the
3700 * process_symlinks flag to recursive calls.
3701 *
3702 * Errors are reported at level elevel, which might be ERROR or less.
3703 *
3704 * See also walkdir in file_utils.c, which is a frontend version of this
3705 * logic.
3706 */
3707static void
3708walkdir(const char *path,
3709 void (*action) (const char *fname, bool isdir, int elevel),
3710 bool process_symlinks,
3711 int elevel)
3712{
3713 DIR *dir;
3714 struct dirent *de;
3715
3716 dir = AllocateDir(path);
3717
3718 while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3719 {
3720 char subpath[MAXPGPATH * 2];
3721
3723
3724 if (strcmp(de->d_name, ".") == 0 ||
3725 strcmp(de->d_name, "..") == 0)
3726 continue;
3727
3728 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3729
3730 switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3731 {
3732 case PGFILETYPE_REG:
3733 (*action) (subpath, false, elevel);
3734 break;
3735 case PGFILETYPE_DIR:
3736 walkdir(subpath, action, false, elevel);
3737 break;
3738 default:
3739
3740 /*
3741 * Errors are already reported directly by get_dirent_type(),
3742 * and any remaining symlinks and unknown file types are
3743 * ignored.
3744 */
3745 break;
3746 }
3747 }
3748
3749 FreeDir(dir); /* we ignore any error here */
3750
3751 /*
3752 * It's important to fsync the destination directory itself as individual
3753 * file fsyncs don't guarantee that the directory entry for the file is
3754 * synced. However, skip this if AllocateDir failed; the action function
3755 * might not be robust against that.
3756 */
3757 if (dir)
3758 (*action) (path, true, elevel);
3759}
3760
3761
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * This is a walkdir action.  Directories are skipped.  Ignores errors trying
 * to open unreadable files (EACCES), and logs other errors at a
 * caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* Unreadable files are skipped without complaint. */
		if (errno == EACCES)
			return;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3807
/*
 * walkdir action used when syncing the data directory: report startup
 * progress for the given path, then fsync it.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	/*
	 * Errors about unreadable files should be ignored silently, so ask
	 * fsync_fname_ext() for that by passing ignore_perm = true.  Its result
	 * is deliberately not examined here.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3820
/*
 * walkdir-style action: remove the named file or directory.
 *
 * A directory that is already gone (ENOENT) is not an error; any other
 * rmdir() failure is reported at elevel.  Plain files are handed to
 * PathNameDeleteTemporaryFile() with error_on_failure = false.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}
3837
3838/*
3839 * fsync_fname_ext -- Try to fsync a file or directory
3840 *
3841 * If ignore_perm is true, ignore errors upon trying to open unreadable
3842 * files. Logs other errors at a caller-specified level.
3843 *
3844 * Returns 0 if the operation succeeded, -1 otherwise.
3845 */
3846int
3847fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3848{
3849 int fd;
3850 int flags;
3851 int returncode;
3852
3853 /*
3854 * Some OSs require directories to be opened read-only whereas other
3855 * systems don't allow us to fsync files opened read-only; so we need both
3856 * cases here. Using O_RDWR will cause us to fail to fsync files that are
3857 * not writable by our userid, but we assume that's OK.
3858 */
3859 flags = PG_BINARY;
3860 if (!isdir)
3861 flags |= O_RDWR;
3862 else
3863 flags |= O_RDONLY;
3864
3865 fd = OpenTransientFile(fname, flags);
3866
3867 /*
3868 * Some OSs don't allow us to open directories at all (Windows returns
3869 * EACCES), just ignore the error in that case. If desired also silently
3870 * ignoring errors about unreadable files. Log others.
3871 */
3872 if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3873 return 0;
3874 else if (fd < 0 && ignore_perm && errno == EACCES)
3875 return 0;
3876 else if (fd < 0)
3877 {
3878 ereport(elevel,
3880 errmsg("could not open file \"%s\": %m", fname)));
3881 return -1;
3882 }
3883
3885
3886 /*
3887 * Some OSes don't allow us to fsync directories at all, so we can ignore
3888 * those errors. Anything else needs to be logged.
3889 */
3890 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3891 {
3892 int save_errno;
3893
3894 /* close file upon error, might not be in transaction context */
3895 save_errno = errno;
3897 errno = save_errno;
3898
3899 ereport(elevel,
3901 errmsg("could not fsync file \"%s\": %m", fname)));
3902 return -1;
3903 }
3904
3905 if (CloseTransientFile(fd) != 0)
3906 {
3907 ereport(elevel,
3909 errmsg("could not close file \"%s\": %m", fname)));
3910 return -1;
3911 }
3912
3913 return 0;
3914}
3915
3916/*
3917 * fsync_parent_path -- fsync the parent path of a file or directory
3918 *
3919 * This is aimed at making file operations persistent on disk in case of
3920 * an OS crash or power failure.
3921 */
3922static int
3923fsync_parent_path(const char *fname, int elevel)
3924{
3925 char parentpath[MAXPGPATH];
3926
3927 strlcpy(parentpath, fname, MAXPGPATH);
3929
3930 /*
3931 * get_parent_directory() returns an empty string if the input argument is
3932 * just a file name (see comments in path.c), so handle that as being the
3933 * current directory.
3934 */
3935 if (strlen(parentpath) == 0)
3937
3938 if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3939 return -1;
3940
3941 return 0;
3942}
3943
3944/*
3945 * Create a PostgreSQL data sub-directory
3946 *
3947 * The data directory itself, and most of its sub-directories, are created at
3948 * initdb time, but we do have some occasions when we create directories in
3949 * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3950 * make sure that those directories are created consistently. Today, that means
3951 * making sure that the created directory has the correct permissions, which is
3952 * what pg_dir_create_mode tracks for us.
3953 *
3954 * Note that we also set the umask() based on what we understand the correct
3955 * permissions to be (see file_perm.c).
3956 *
3957 * For permissions other than the default, mkdir() can be used directly, but
3958 * be sure to consider carefully such cases -- a sub-directory with incorrect
3959 * permissions in a PostgreSQL data directory could cause backups and other
3960 * processes to fail.
3961 */
3962int
3967
3968/*
3969 * Return the passed-in error level, or PANIC if data_sync_retry is off.
3970 *
3971 * Failure to fsync any data file is cause for immediate panic, unless
3972 * data_sync_retry is enabled. Data may have been written to the operating
3973 * system and removed from our buffer pool already, and if we are running on
3974 * an operating system that forgets dirty data on write-back failure, there
3975 * may be only one copy of the data remaining: in the WAL. A later attempt to
3976 * fsync again might falsely report success. Therefore we must not allow any
3977 * further checkpoints to be attempted. data_sync_retry can in theory be
3978 * enabled on systems known not to drop dirty buffered data on write-back
3979 * failure (with the likely outcome that checkpoints will continue to fail
3980 * until the underlying problem is fixed).
3981 *
3982 * Any code that reports a failure from fsync() or related functions should
3983 * filter the error level with this function.
3984 */
3985int
3987{
3988 return data_sync_retry ? elevel : PANIC;
3989}
3990
3991bool
3993{
3994 bool result = true;
3995 int flags;
3996
3997#if PG_O_DIRECT == 0
3998 if (strcmp(*newval, "") != 0)
3999 {
4000 GUC_check_errdetail("\"%s\" is not supported on this platform.",
4001 "debug_io_direct");
4002 result = false;
4003 }
4004 flags = 0;
4005#else
4006 List *elemlist;
4007 ListCell *l;
4008 char *rawstring;
4009
4010 /* Need a modifiable copy of string */
4012
4013 if (!SplitGUCList(rawstring, ',', &elemlist))
4014 {
4015 GUC_check_errdetail("Invalid list syntax in parameter \"%s\".",
4016 "debug_io_direct");
4019 return false;
4020 }
4021
4022 flags = 0;
4023 foreach(l, elemlist)
4024 {
4025 char *item = (char *) lfirst(l);
4026
4027 if (pg_strcasecmp(item, "data") == 0)
4028 flags |= IO_DIRECT_DATA;
4029 else if (pg_strcasecmp(item, "wal") == 0)
4030 flags |= IO_DIRECT_WAL;
4031 else if (pg_strcasecmp(item, "wal_init") == 0)
4032 flags |= IO_DIRECT_WAL_INIT;
4033 else
4034 {
4035 GUC_check_errdetail("Invalid option \"%s\".", item);
4036 result = false;
4037 break;
4038 }
4039 }
4040
4041 /*
4042 * It's possible to configure block sizes smaller than our assumed I/O
4043 * alignment size, which could result in invalid I/O requests.
4044 */
4045#if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
4046 if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
4047 {
4048 GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small.",
4049 "debug_io_direct", "XLOG_BLCKSZ");
4050 result = false;
4051 }
4052#endif
4053#if BLCKSZ < PG_IO_ALIGN_SIZE
4054 if (result && (flags & IO_DIRECT_DATA))
4055 {
4056 GUC_check_errdetail("\"%s\" is not supported for data because %s is too small.",
4057 "debug_io_direct", "BLCKSZ");
4058 result = false;
4059 }
4060#endif
4061
4064#endif
4065
4066 if (!result)
4067 return result;
4068
4069 /* Save the flags in *extra, for use by assign_debug_io_direct */
4070 *extra = guc_malloc(LOG, sizeof(int));
4071 if (!*extra)
4072 return false;
4073 *((int *) *extra) = flags;
4074
4075 return result;
4076}
4077
4078void
4079assign_debug_io_direct(const char *newval, void *extra)
4080{
4081 int *flags = (int *) extra;
4082
4083 io_direct_flags = *flags;
4084}
4085
4086/* ResourceOwner callbacks */
4087
4088static void
4090{
4091 File file = (File) DatumGetInt32(res);
4092 Vfd *vfdP;
4093
4094 Assert(FileIsValid(file));
4095
4096 vfdP = &VfdCache[file];
4097 vfdP->resowner = NULL;
4098
4099 FileClose(file);
4100}
4101
4102static char *
4104{
4105 return psprintf("File %d", DatumGetInt32(res));
4106}
void pgaio_closing_fd(int fd)
Definition aio.c:1220
void pgaio_io_start_readv(PgAioHandle *ioh, int fd, int iovcnt, uint64 offset)
Definition aio_io.c:78
void begin_startup_progress_phase(void)
Definition startup.c:342
int fdatasync(int fd)
#define Min(x, y)
Definition c.h:1093
uint32 SubTransactionId
Definition c.h:742
#define INT64_FORMAT
Definition c.h:636
#define Assert(condition)
Definition c.h:945
int64_t int64
Definition c.h:615
#define PG_BINARY
Definition c.h:1376
uint64_t uint64
Definition c.h:619
uint32_t uint32
Definition c.h:618
unsigned int Index
Definition c.h:700
#define MemSet(start, val, len)
Definition c.h:1109
#define OidIsValid(objectId)
Definition c.h:860
size_t Size
Definition c.h:691
int closedir(DIR *)
Definition dirent.c:127
struct dirent * readdir(DIR *)
Definition dirent.c:78
DIR * opendir(const char *)
Definition dirent.c:33
Datum arg
Definition elog.c:1322
int errcode_for_file_access(void)
Definition elog.c:897
int errcode(int sqlerrcode)
Definition elog.c:874
#define LOG
Definition elog.h:31
int errdetail(const char *fmt,...) pg_attribute_printf(1
#define FATAL
Definition elog.h:41
#define WARNING
Definition elog.h:36
#define DEBUG2
Definition elog.h:29
#define PANIC
Definition elog.h:42
#define DEBUG1
Definition elog.h:30
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
int pg_truncate(const char *path, pgoff_t length)
Definition fd.c:721
int max_files_per_process
Definition fd.c:147
int FileGetRawDesc(File file)
Definition fd.c:2516
int MakePGDirectory(const char *directoryName)
Definition fd.c:3963
int FreeDir(DIR *dir)
Definition fd.c:3009
int recovery_init_sync_method
Definition fd.c:166
static const ResourceOwnerDesc file_resowner_desc
Definition fd.c:365
int pg_fsync_no_writethrough(int fd)
Definition fd.c:442
#define FD_MINFREE
Definition fd.c:139
FILE * OpenPipeStream(const char *command, const char *mode)
Definition fd.c:2731
static int numTempTableSpaces
Definition fd.c:293
static bool ReleaseLruFile(void)
Definition fd.c:1370
void FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
Definition fd.c:2123
int io_direct_flags
Definition fd.c:172
#define FD_DELETE_AT_CLOSE
Definition fd.c:196
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition fd.c:1112
static int maxAllocatedDescs
Definition fd.c:272
int file_extend_method
Definition fd.c:169
static void Delete(File file)
Definition fd.c:1254
static int FreeDesc(AllocateDesc *desc)
Definition fd.c:2787
static long tempFileCounter
Definition fd.c:284
static char * ResOwnerPrintFile(Datum res)
Definition fd.c:4103
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition fd.c:783
char * FilePathName(File file)
Definition fd.c:2500
static void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition fd.c:381
static int pg_ftruncate(int fd, pgoff_t length)
Definition fd.c:704
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition fd.c:3141
static int numAllocatedDescs
Definition fd.c:271
File PathNameOpenTemporaryFile(const char *path, int mode)
Definition fd.c:1889
static void LruDelete(File file)
Definition fd.c:1273
int pg_fdatasync(int fd)
Definition fd.c:481
#define FileIsValid(file)
Definition fd.c:190
void assign_debug_io_direct(const char *newval, void *extra)
Definition fd.c:4079
int FileSync(File file, uint32 wait_event_info)
Definition fd.c:2336
int FileStartReadV(PgAioHandle *ioh, File file, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2205
static int nfile
Definition fd.c:226
int CloseTransientFile(int fd)
Definition fd.c:2855
#define DO_DB(A)
Definition fd.c:184
int BasicOpenFile(const char *fileName, int fileFlags)
Definition fd.c:1090
void closeAllVfds(void)
Definition fd.c:3068
int max_safe_fds
Definition fd.c:160
static File AllocateVfd(void)
Definition fd.c:1402
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition fd.c:1849
void PathNameDeleteTemporaryDir(const char *dirname)
Definition fd.c:1679
int ClosePipeStream(FILE *file)
Definition fd.c:3039
void AtEOXact_Files(bool isCommit)
Definition fd.c:3214
int FileGetRawFlags(File file)
Definition fd.c:2532
static Size SizeVfdCache
Definition fd.c:221
static int nextTempTableSpace
Definition fd.c:294
#define FD_CLOSE_AT_EOXACT
Definition fd.c:197
int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition fd.c:3847
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition fd.c:3822
static void ResOwnerReleaseFile(Datum res)
Definition fd.c:4089
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition fd.c:3443
int FreeFile(FILE *file)
Definition fd.c:2827
ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2149
mode_t FileGetRawMode(File file)
Definition fd.c:2542
static AllocateDesc * allocatedDescs
Definition fd.c:273
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition fd.c:2972
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition fd.c:965
int FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2408
static int FileAccess(File file)
Definition fd.c:1480
pgoff_t FileSize(File file)
Definition fd.c:2448
static void FreeVfd(File file)
Definition fd.c:1460
struct vfd Vfd
int pg_fsync_writethrough(int fd)
Definition fd.c:462
void FileClose(File file)
Definition fd.c:1966
void ReleaseExternalFD(void)
Definition fd.c:1225
#define FD_TEMP_FILE_LIMIT
Definition fd.c:198
void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition fd.c:3383
bool pg_file_exists(const char *name)
Definition fd.c:504
void RemovePgTempFiles(void)
Definition fd.c:3323
#define FileIsNotOpen(file)
Definition fd.c:193
bool TempTablespacesAreSet(void)
Definition fd.c:3126
void fsync_fname(const char *fname, bool isdir)
Definition fd.c:757
int data_sync_elevel(int elevel)
Definition fd.c:3986
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition fd.c:1563
static void Insert(File file)
Definition fd.c:1301
AllocateDescKind
Definition fd.c:252
@ AllocateDescDir
Definition fd.c:255
@ AllocateDescPipe
Definition fd.c:254
@ AllocateDescFile
Definition fd.c:253
@ AllocateDescRawFD
Definition fd.c:256
Oid GetNextTempTableSpace(void)
Definition fd.c:3159
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition fd.c:1576
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition fd.c:3809
static void ReportTemporaryFileUsage(const char *path, pgoff_t size)
Definition fd.c:1516
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition fd.c:1792
void pg_flush_data(int fd, pgoff_t offset, pgoff_t nbytes)
Definition fd.c:526
bool AcquireExternalFD(void)
Definition fd.c:1172
static void RegisterTemporaryFile(File file)
Definition fd.c:1535
#define NUM_RESERVED_FDS
Definition fd.c:130
DIR * AllocateDir(const char *dirname)
Definition fd.c:2891
static Oid * tempTableSpaces
Definition fd.c:292
int FileTruncate(File file, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2465
static bool reserveAllocatedDesc(void)
Definition fd.c:2553
void InitFileAccess(void)
Definition fd.c:904
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition fd.c:3471
File OpenTemporaryFile(bool interXact)
Definition fd.c:1712
int durable_unlink(const char *fname, int elevel)
Definition fd.c:873
static uint64 temporary_files_size
Definition fd.c:240
void ReserveExternalFD(void)
Definition fd.c:1207
int FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2363
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition fd.c:2957
bool looks_like_temp_rel_name(const char *name)
Definition fd.c:3499
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition fd.c:1920
void set_max_safe_fds(void)
Definition fd.c:1045
int pg_fsync(int fd)
Definition fd.c:390
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition fd.c:3251
#define VFD_CLOSED
Definition fd.c:188
static bool have_xact_temporary_files
Definition fd.c:232
static int LruInsert(File file)
Definition fd.c:1323
static int numExternalFDs
Definition fd.c:278
static int fsync_parent_path(const char *fname, int elevel)
Definition fd.c:3923
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition fd.c:1648
FILE * AllocateFile(const char *name, const char *mode)
Definition fd.c:2628
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition fd.c:3181
int OpenTransientFile(const char *fileName, int fileFlags)
Definition fd.c:2678
void InitTemporaryFileAccess(void)
Definition fd.c:934
static Vfd * VfdCache
Definition fd.c:220
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition fd.c:2687
bool data_sync_retry
Definition fd.c:163
int FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2067
ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2231
static void ReleaseLruFiles(void)
Definition fd.c:1392
void SyncDataDirectory(void)
Definition fd.c:3594
bool check_debug_io_direct(char **newval, void **extra, GucSource source)
Definition fd.c:3992
static void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition fd.c:376
static void BeforeShmemExit_Files(int code, Datum arg)
Definition fd.c:3228
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition fd.c:3708
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition fd.c:3097
void TempTablespacePath(char *path, Oid tablespace)
Definition fd.c:1767
#define IO_DIRECT_WAL
Definition fd.h:55
#define IO_DIRECT_DATA
Definition fd.h:54
#define DEFAULT_FILE_EXTEND_METHOD
Definition fd.h:67
#define IO_DIRECT_WAL_INIT
Definition fd.h:56
int File
Definition fd.h:51
#define PG_O_DIRECT
Definition fd.h:123
int pg_file_create_mode
Definition file_perm.c:19
int pg_dir_create_mode
Definition file_perm.c:18
ssize_t pg_pwrite_zeros(int fd, size_t size, pgoff_t offset)
Definition file_utils.c:709
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition file_utils.c:547
#define PG_TEMP_FILES_DIR
Definition file_utils.h:63
#define PG_TEMP_FILE_PREFIX
Definition file_utils.h:64
PGFileType
Definition file_utils.h:19
@ PGFILETYPE_DIR
Definition file_utils.h:23
@ PGFILETYPE_REG
Definition file_utils.h:22
@ PGFILETYPE_ERROR
Definition file_utils.h:20
@ DATA_DIR_SYNC_METHOD_SYNCFS
Definition file_utils.h:30
@ DATA_DIR_SYNC_METHOD_FSYNC
Definition file_utils.h:29
int MyProcPid
Definition globals.c:47
bool enableFsync
Definition globals.c:129
Oid MyDatabaseTableSpace
Definition globals.c:96
void * guc_malloc(int elevel, size_t size)
Definition guc.c:637
#define newval
#define GUC_check_errdetail
Definition guc.h:507
GucSource
Definition guc.h:112
int temp_file_limit
Definition guc_tables.c:560
int log_temp_files
Definition guc_tables.c:555
#define close(a)
Definition win32.h:12
void before_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition ipc.c:344
int j
Definition isn.c:78
int i
Definition isn.c:77
void list_free(List *list)
Definition list.c:1546
Datum subpath(PG_FUNCTION_ARGS)
Definition ltree_op.c:311
char * pstrdup(const char *in)
Definition mcxt.c:1781
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
#define MAP_FAILED
Definition mem.h:43
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:123
static char * errmsg
static char * basedir
static PgChecksumMode mode
#define MAXPGPATH
static ssize_t pg_preadv(int fd, const struct iovec *iov, int iovcnt, pgoff_t offset)
Definition pg_iovec.h:54
static ssize_t pg_pwritev(int fd, const struct iovec *iov, int iovcnt, pgoff_t offset)
Definition pg_iovec.h:93
#define lfirst(lc)
Definition pg_list.h:172
uint64 pg_prng_uint64_range(pg_prng_state *state, uint64 rmin, uint64 rmax)
Definition pg_prng.c:144
pg_prng_state pg_global_prng_state
Definition pg_prng.c:34
static rewind_source * source
Definition pg_rewind.c:89
static char buf[DEFAULT_XLOG_SEG_SIZE]
static char * tablespace
Definition pgbench.c:217
void pgstat_report_tempfile(size_t filesize)
#define pqsignal
Definition port.h:547
int pg_strcasecmp(const char *s1, const char *s2)
void get_parent_directory(char *path)
Definition path.c:1068
#define snprintf
Definition port.h:260
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition strlcpy.c:45
off_t pgoff_t
Definition port.h:421
uint64_t Datum
Definition postgres.h:70
static Datum Int32GetDatum(int32 X)
Definition postgres.h:212
static int32 DatumGetInt32(Datum X)
Definition postgres.h:202
#define InvalidOid
unsigned int Oid
static int fd(const char *x, int i)
static int fb(int x)
char * psprintf(const char *fmt,...)
Definition psprintf.c:43
int forkname_chars(const char *str, ForkNumber *fork)
Definition relpath.c:81
#define PG_TBLSPC_DIR
Definition relpath.h:41
#define TABLESPACE_VERSION_DIRECTORY
Definition relpath.h:33
ResourceOwner CurrentResourceOwner
Definition resowner.c:173
void ResourceOwnerForget(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition resowner.c:561
void ResourceOwnerRemember(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition resowner.c:521
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition resowner.c:449
@ RESOURCE_RELEASE_AFTER_LOCKS
Definition resowner.h:56
#define RELEASE_PRIO_FILES
Definition resowner.h:76
void pg_usleep(long microsec)
Definition signal.c:53
#define realloc(a, b)
#define free(a)
#define malloc(a)
static void error(void)
#define ereport_startup_progress(msg,...)
Definition startup.h:18
SubTransactionId create_subid
Definition fd.c:262
DIR * dir
Definition fd.c:266
FILE * file
Definition fd.c:265
int fd
Definition fd.c:267
union AllocateDesc::@20 desc
AllocateDescKind kind
Definition fd.c:261
Definition dirent.c:26
Definition pg_list.h:54
const char * name
Definition resowner.h:93
unsigned short st_mode
Definition win32_port.h:258
Definition fd.c:201
int fd
Definition fd.c:202
int fileFlags
Definition fd.c:211
File lruLessRecently
Definition fd.c:207
File lruMoreRecently
Definition fd.c:206
pgoff_t fileSize
Definition fd.c:208
char * fileName
Definition fd.c:209
ResourceOwner resowner
Definition fd.c:204
unsigned short fdstate
Definition fd.c:203
File nextFree
Definition fd.c:205
mode_t fileMode
Definition fd.c:212
bool SplitGUCList(char *rawstring, char separator, List **namelist)
Definition varlena.c:3025
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:69
static void pgstat_report_wait_end(void)
Definition wait_event.h:85
const char * type
const char * name
#define fsync(fd)
Definition win32_port.h:83
#define stat
Definition win32_port.h:74
#define EINTR
Definition win32_port.h:361
#define EOPNOTSUPP
Definition win32_port.h:385
#define SIGPIPE
Definition win32_port.h:163
#define lstat(path, sb)
Definition win32_port.h:275
#define S_ISDIR(m)
Definition win32_port.h:315
void _dosmaperr(unsigned long)
Definition win32error.c:177
#define S_ISLNK(m)
Definition win32_port.h:334
#define mkdir(a, b)
Definition win32_port.h:80
#define fstat
Definition win32_port.h:73
#define O_CLOEXEC
Definition win32_port.h:344
SubTransactionId GetCurrentSubTransactionId(void)
Definition xact.c:793
int wal_sync_method
Definition xlog.c:134
@ WAL_SYNC_METHOD_FSYNC_WRITETHROUGH
Definition xlog.h:28
static const char * directory
Definition zic.c:648