PostgreSQL Source Code git master
fd.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * fd.c
4 * Virtual file descriptor code.
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 * IDENTIFICATION
10 * src/backend/storage/file/fd.c
11 *
12 * NOTES:
13 *
14 * This code manages a cache of 'virtual' file descriptors (VFDs).
15 * The server opens many file descriptors for a variety of reasons,
16 * including base tables, scratch files (e.g., sort and hash spool
17 * files), and random calls to C library routines like system(3); it
18 * is quite easy to exceed system limits on the number of open files a
19 * single process can have. (This is around 1024 on many modern
20 * operating systems, but may be lower on others.)
21 *
22 * VFDs are managed as an LRU pool, with actual OS file descriptors
23 * being opened and closed as needed. Obviously, if a routine is
24 * opened using these interfaces, all subsequent operations must also
25 * be through these interfaces (the File type is not a real file
26 * descriptor).
27 *
28 * For this scheme to work, most (if not all) routines throughout the
29 * server should use these interfaces instead of calling the C library
30 * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31 * may find ourselves short of real file descriptors anyway.
32 *
33 * INTERFACE ROUTINES
34 *
35 * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36 * A File opened with OpenTemporaryFile is automatically deleted when the
37 * File is closed, either explicitly or implicitly at end of transaction or
38 * process exit. PathNameOpenFile is intended for files that are held open
39 * for a long time, like relation files. It is the caller's responsibility
40 * to close them; there is no automatic mechanism in fd.c for that.
41 *
42 * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43 * temporary files that have names so that they can be shared between
44 * backends. Such files are automatically closed and count against the
45 * temporary file limit of the backend that creates them, but unlike anonymous
46 * files they are not automatically deleted. See sharedfileset.c for a shared
47 * ownership mechanism that provides automatic cleanup for shared files when
48 * the last of a group of backends detaches.
49 *
50 * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51 * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52 * They behave like the corresponding native functions, except that the handle
53 * is registered with the current subtransaction, and will be automatically
54 * closed at abort. These are intended mainly for short operations like
55 * reading a configuration file; there is a limit on the number of files that
56 * can be opened using these functions at any one time.
57 *
58 * Finally, BasicOpenFile is just a thin wrapper around open() that can
59 * release file descriptors in use by the virtual file descriptors if
60 * necessary. There is no automatic cleanup of file descriptors returned by
61 * BasicOpenFile, it is solely the caller's responsibility to close the file
62 * descriptor by calling close(2).
63 *
64 * If a non-virtual file descriptor needs to be held open for any length of
65 * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66 * (and eventually ReleaseExternalFD), so that we can take it into account
67 * while deciding how many VFDs can be open. This applies to FDs obtained
68 * with BasicOpenFile as well as those obtained without use of any fd.c API.
69 *
70 *-------------------------------------------------------------------------
71 */
72
73#include "postgres.h"
74
75#include <dirent.h>
76#include <sys/file.h>
77#include <sys/param.h>
78#include <sys/resource.h> /* for getrlimit */
79#include <sys/stat.h>
80#include <sys/types.h>
81#ifndef WIN32
82#include <sys/mman.h>
83#endif
84#include <limits.h>
85#include <unistd.h>
86#include <fcntl.h>
87
88#include "access/xact.h"
89#include "access/xlog.h"
91#include "common/file_perm.h"
92#include "common/file_utils.h"
93#include "common/pg_prng.h"
94#include "miscadmin.h"
95#include "pgstat.h"
96#include "postmaster/startup.h"
97#include "storage/fd.h"
98#include "storage/ipc.h"
99#include "utils/guc.h"
100#include "utils/guc_hooks.h"
101#include "utils/resowner.h"
102#include "utils/varlena.h"
103
104/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
105#if defined(HAVE_SYNC_FILE_RANGE)
106#define PG_FLUSH_DATA_WORKS 1
107#elif !defined(WIN32) && defined(MS_ASYNC)
108#define PG_FLUSH_DATA_WORKS 1
109#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
110#define PG_FLUSH_DATA_WORKS 1
111#endif
112
113/*
114 * We must leave some file descriptors free for system(), the dynamic loader,
115 * and other code that tries to open files without consulting fd.c. This
116 * is the number left free. (While we try fairly hard to prevent EMFILE
117 * errors, there's never any guarantee that we won't get ENFILE due to
118 * other processes chewing up FDs. So it's a bad idea to try to open files
119 * without consulting fd.c. Nonetheless we cannot control all code.)
120 *
121 * Because this is just a fixed setting, we are effectively assuming that
122 * no such code will leave FDs open over the long term; otherwise the slop
123 * is likely to be insufficient. Note in particular that we expect that
124 * loading a shared library does not result in any permanent increase in
125 * the number of open files. (This appears to be true on most if not
126 * all platforms as of Feb 2004.)
127 */
128#define NUM_RESERVED_FDS 10
129
130/*
131 * If we have fewer than this many usable FDs after allowing for the reserved
132 * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
133 * much less than that. Note that this value ensures numExternalFDs can be
134 * at least 16; as of this writing, the contrib/postgres_fdw regression tests
135 * will not pass unless that can grow to at least 14.)
136 */
137#define FD_MINFREE 48
138
139/*
140 * A number of platforms allow individual processes to open many more files
141 * than they can really support when *many* processes do the same thing.
142 * This GUC parameter lets the DBA limit max_safe_fds to something less than
143 * what the postmaster's initial probe suggests will work.
144 */
146
147/*
148 * Maximum number of file descriptors to open for operations that fd.c knows
149 * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
150 * to a conservative value, and remains that way indefinitely in bootstrap or
151 * standalone-backend cases. In normal postmaster operation, the postmaster
152 * calls set_max_safe_fds() late in initialization to update the value, and
153 * that value is then inherited by forked subprocesses.
154 *
155 * Note: the value of max_files_per_process is taken into account while
156 * setting this variable, and so need not be tested separately.
157 */
158int max_safe_fds = FD_MINFREE; /* default if not changed */
159
160/* Whether it is safe to continue running after fsync() fails. */
161bool data_sync_retry = false;
162
163/* How SyncDataDirectory() should do its job. */
165
166/* Which kinds of files should be opened with PG_O_DIRECT. */
168
169/* Debugging.... */
170
171#ifdef FDDEBUG
172#define DO_DB(A) \
173 do { \
174 int _do_db_save_errno = errno; \
175 A; \
176 errno = _do_db_save_errno; \
177 } while (0)
178#else
179#define DO_DB(A) \
180 ((void) 0)
181#endif
182
183#define VFD_CLOSED (-1)
184
185#define FileIsValid(file) \
186 ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
187
188#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
189
190/* these are the assigned bits in fdstate below: */
191#define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
192#define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
193#define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
194
195typedef struct vfd
196{
197 int fd; /* current FD, or VFD_CLOSED if none */
198 unsigned short fdstate; /* bitflags for VFD's state */
199 ResourceOwner resowner; /* owner, for automatic cleanup */
200 File nextFree; /* link to next free VFD, if in freelist */
201 File lruMoreRecently; /* doubly linked recency-of-use list */
203 off_t fileSize; /* current size of file (0 if not temporary) */
204 char *fileName; /* name of file, or NULL for unused VFD */
205 /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
206 int fileFlags; /* open(2) flags for (re)opening the file */
207 mode_t fileMode; /* mode to pass to open(2) */
209
210/*
211 * Virtual File Descriptor array pointer and size. This grows as
212 * needed. 'File' values are indexes into this array.
213 * Note that VfdCache[0] is not a usable VFD, just a list header.
214 */
215static Vfd *VfdCache;
217
218/*
219 * Number of file descriptors known to be in use by VFD entries.
220 */
221static int nfile = 0;
222
223/*
224 * Flag to tell whether it's worth scanning VfdCache looking for temp files
225 * to close
226 */
227static bool have_xact_temporary_files = false;
228
229/*
230 * Tracks the total size of all temporary files. Note: when temp_file_limit
231 * is being enforced, this cannot overflow since the limit cannot be more
232 * than INT_MAX kilobytes. When not enforcing, it could theoretically
233 * overflow, but we don't care.
234 */
236
237/* Temporary file access initialized and not yet shut down? */
238#ifdef USE_ASSERT_CHECKING
239static bool temporary_files_allowed = false;
240#endif
241
242/*
243 * List of OS handles opened with AllocateFile, AllocateDir and
244 * OpenTransientFile.
245 */
246typedef enum
247{
253
254typedef struct
255{
258 union
259 {
260 FILE *file;
262 int fd;
263 } desc;
265
266static int numAllocatedDescs = 0;
267static int maxAllocatedDescs = 0;
269
270/*
271 * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
272 */
273static int numExternalFDs = 0;
274
275/*
276 * Number of temporary files opened during the current session;
277 * this is used in generation of tempfile names.
278 */
279static long tempFileCounter = 0;
280
281/*
282 * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
283 * indicating that the current database's default tablespace should be used.)
284 * When numTempTableSpaces is -1, this has not been set in the current
285 * transaction.
286 */
287static Oid *tempTableSpaces = NULL;
288static int numTempTableSpaces = -1;
289static int nextTempTableSpace = 0;
290
291
292/*--------------------
293 *
294 * Private Routines
295 *
296 * Delete - delete a file from the Lru ring
297 * LruDelete - remove a file from the Lru ring and close its FD
298 * Insert - put a file at the front of the Lru ring
299 * LruInsert - put a file at the front of the Lru ring and open it
300 * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
301 * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
302 * AllocateVfd - grab a free (or new) file record (from VfdCache)
303 * FreeVfd - free a file record
304 *
305 * The Least Recently Used ring is a doubly linked list that begins and
306 * ends on element zero. Element zero is special -- it doesn't represent
307 * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
308 * anchor that shows us the beginning/end of the ring.
309 * Only VFD elements that are currently really open (have an FD assigned) are
310 * in the Lru ring. Elements that are "virtually" open can be recognized
311 * by having a non-null fileName field.
312 *
313 * example:
314 *
315 * /--less----\ /---------\
316 * v \ v \
317 * #0 --more---> LeastRecentlyUsed --more-\ \
318 * ^\ | |
319 * \\less--> MostRecentlyUsedFile <---/ |
320 * \more---/ \--less--/
321 *
322 *--------------------
323 */
324static void Delete(File file);
325static void LruDelete(File file);
326static void Insert(File file);
327static int LruInsert(File file);
328static bool ReleaseLruFile(void);
329static void ReleaseLruFiles(void);
330static File AllocateVfd(void);
331static void FreeVfd(File file);
332
333static int FileAccess(File file);
334static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
335static bool reserveAllocatedDesc(void);
336static int FreeDesc(AllocateDesc *desc);
337
338static void BeforeShmemExit_Files(int code, Datum arg);
339static void CleanupTempFiles(bool isCommit, bool isProcExit);
340static void RemovePgTempRelationFiles(const char *tsdirname);
341static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
342
343static void walkdir(const char *path,
344 void (*action) (const char *fname, bool isdir, int elevel),
345 bool process_symlinks,
346 int elevel);
347#ifdef PG_FLUSH_DATA_WORKS
348static void pre_sync_fname(const char *fname, bool isdir, int elevel);
349#endif
350static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
351static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
352
353static int fsync_parent_path(const char *fname, int elevel);
354
355
356/* ResourceOwner callbacks to hold virtual file descriptors */
357static void ResOwnerReleaseFile(Datum res);
358static char *ResOwnerPrintFile(Datum res);
359
361{
362 .name = "File",
363 .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
364 .release_priority = RELEASE_PRIO_FILES,
365 .ReleaseResource = ResOwnerReleaseFile,
366 .DebugPrint = ResOwnerPrintFile
367};
368
369/* Convenience wrappers over ResourceOwnerRemember/Forget */
370static inline void
372{
374}
375static inline void
377{
379}
380
381/*
382 * pg_fsync --- do fsync with or without writethrough
383 */
384int
386{
387#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
388 struct stat st;
389
390 /*
391 * Some operating system implementations of fsync() have requirements
392 * about the file access modes that were used when their file descriptor
393 * argument was opened, and these requirements differ depending on whether
394 * the file descriptor is for a directory.
395 *
396 * For any file descriptor that may eventually be handed to fsync(), we
397 * should have opened it with access modes that are compatible with
398 * fsync() on all supported systems, otherwise the code may not be
399 * portable, even if it runs ok on the current system.
400 *
401 * We assert here that a descriptor for a file was opened with write
402 * permissions (either O_RDWR or O_WRONLY) and for a directory without
403 * write permissions (O_RDONLY).
404 *
405 * Ignore any fstat errors and let the follow-up fsync() do its work.
406 * Doing this sanity check here accounts for the case where fsync() is
407 * disabled.
408 */
409 if (fstat(fd, &st) == 0)
410 {
411 int desc_flags = fcntl(fd, F_GETFL);
412
413 /*
414 * O_RDONLY is historically 0, so just make sure that for directories
415 * no write flags are used.
416 */
417 if (S_ISDIR(st.st_mode))
418 Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
419 else
420 Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
421 }
422 errno = 0;
423#endif
424
425 /* #if is to skip the wal_sync_method test if there's no need for it */
426#if defined(HAVE_FSYNC_WRITETHROUGH)
429 else
430#endif
432}
433
434
435/*
436 * pg_fsync_no_writethrough --- same as fsync except does nothing if
437 * enableFsync is off
438 */
439int
441{
442 int rc;
443
444 if (!enableFsync)
445 return 0;
446
447retry:
448 rc = fsync(fd);
449
450 if (rc == -1 && errno == EINTR)
451 goto retry;
452
453 return rc;
454}
455
456/*
457 * pg_fsync_writethrough
458 */
459int
461{
462 if (enableFsync)
463 {
464#if defined(F_FULLFSYNC)
465 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
466#else
467 errno = ENOSYS;
468 return -1;
469#endif
470 }
471 else
472 return 0;
473}
474
475/*
476 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
477 */
478int
480{
481 int rc;
482
483 if (!enableFsync)
484 return 0;
485
486retry:
487 rc = fdatasync(fd);
488
489 if (rc == -1 && errno == EINTR)
490 goto retry;
491
492 return rc;
493}
494
495/*
496 * pg_file_exists -- check that a file exists.
497 *
498 * This requires an absolute path to the file. Returns true if the file is
499 * not a directory, false otherwise.
500 */
501bool
503{
504 struct stat st;
505
506 Assert(name != NULL);
507
508 if (stat(name, &st) == 0)
509 return !S_ISDIR(st.st_mode);
510 else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES))
513 errmsg("could not access file \"%s\": %m", name)));
514
515 return false;
516}
517
518/*
519 * pg_flush_data --- advise OS that the described dirty data should be flushed
520 *
521 * offset of 0 with nbytes 0 means that the entire file should be flushed
522 */
523void
524pg_flush_data(int fd, off_t offset, off_t nbytes)
525{
526 /*
527 * Right now file flushing is primarily used to avoid making later
528 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
529 * if fsyncs are disabled - that's a decision we might want to make
530 * configurable at some point.
531 */
532 if (!enableFsync)
533 return;
534
535 /*
536 * We compile all alternatives that are supported on the current platform,
537 * to find portability problems more easily.
538 */
539#if defined(HAVE_SYNC_FILE_RANGE)
540 {
541 int rc;
542 static bool not_implemented_by_kernel = false;
543
544 if (not_implemented_by_kernel)
545 return;
546
547retry:
548
549 /*
550 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
551 * tells the OS that writeback for the specified blocks should be
552 * started, but that we don't want to wait for completion. Note that
553 * this call might block if too much dirty data exists in the range.
554 * This is the preferable method on OSs supporting it, as it works
555 * reliably when available (contrast to msync()) and doesn't flush out
556 * clean data (like FADV_DONTNEED).
557 */
558 rc = sync_file_range(fd, offset, nbytes,
559 SYNC_FILE_RANGE_WRITE);
560 if (rc != 0)
561 {
562 int elevel;
563
564 if (rc == EINTR)
565 goto retry;
566
567 /*
568 * For systems that don't have an implementation of
569 * sync_file_range() such as Windows WSL, generate only one
570 * warning and then suppress all further attempts by this process.
571 */
572 if (errno == ENOSYS)
573 {
574 elevel = WARNING;
575 not_implemented_by_kernel = true;
576 }
577 else
578 elevel = data_sync_elevel(WARNING);
579
580 ereport(elevel,
582 errmsg("could not flush dirty data: %m")));
583 }
584
585 return;
586 }
587#endif
588#if !defined(WIN32) && defined(MS_ASYNC)
589 {
590 void *p;
591 static int pagesize = 0;
592
593 /*
594 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
595 * writeback. On linux it only does so if MS_SYNC is specified, but
596 * then it does the writeback synchronously. Luckily all common linux
597 * systems have sync_file_range(). This is preferable over
598 * FADV_DONTNEED because it doesn't flush out clean data.
599 *
600 * We map the file (mmap()), tell the kernel to sync back the contents
601 * (msync()), and then remove the mapping again (munmap()).
602 */
603
604 /* mmap() needs actual length if we want to map whole file */
605 if (offset == 0 && nbytes == 0)
606 {
607 nbytes = lseek(fd, 0, SEEK_END);
608 if (nbytes < 0)
609 {
612 errmsg("could not determine dirty data size: %m")));
613 return;
614 }
615 }
616
617 /*
618 * Some platforms reject partial-page mmap() attempts. To deal with
619 * that, just truncate the request to a page boundary. If any extra
620 * bytes don't get flushed, well, it's only a hint anyway.
621 */
622
623 /* fetch pagesize only once */
624 if (pagesize == 0)
625 pagesize = sysconf(_SC_PAGESIZE);
626
627 /* align length to pagesize, dropping any fractional page */
628 if (pagesize > 0)
629 nbytes = (nbytes / pagesize) * pagesize;
630
631 /* fractional-page request is a no-op */
632 if (nbytes <= 0)
633 return;
634
635 /*
636 * mmap could well fail, particularly on 32-bit platforms where there
637 * may simply not be enough address space. If so, silently fall
638 * through to the next implementation.
639 */
640 if (nbytes <= (off_t) SSIZE_MAX)
641 p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
642 else
643 p = MAP_FAILED;
644
645 if (p != MAP_FAILED)
646 {
647 int rc;
648
649 rc = msync(p, (size_t) nbytes, MS_ASYNC);
650 if (rc != 0)
651 {
654 errmsg("could not flush dirty data: %m")));
655 /* NB: need to fall through to munmap()! */
656 }
657
658 rc = munmap(p, (size_t) nbytes);
659 if (rc != 0)
660 {
661 /* FATAL error because mapping would remain */
664 errmsg("could not munmap() while flushing data: %m")));
665 }
666
667 return;
668 }
669 }
670#endif
671#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
672 {
673 int rc;
674
675 /*
676 * Signal the kernel that the passed in range should not be cached
677 * anymore. This has the, desired, side effect of writing out dirty
678 * data, and the, undesired, side effect of likely discarding useful
679 * clean cached blocks. For the latter reason this is the least
680 * preferable method.
681 */
682
683 rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
684
685 if (rc != 0)
686 {
687 /* don't error out, this is just a performance optimization */
690 errmsg("could not flush dirty data: %m")));
691 }
692
693 return;
694 }
695#endif
696}
697
/*
 * Truncate an open file to a given length.
 *
 * Thin wrapper around ftruncate() that transparently repeats the call for
 * as long as it fails with EINTR.  The final ftruncate() result is returned
 * unchanged.
 */
static int
pg_ftruncate(int fd, off_t length)
{
    int         rc;

    do
    {
        rc = ftruncate(fd, length);
    } while (rc == -1 && errno == EINTR);

    return rc;
}
714
/*
 * Truncate a file to a given length by name.
 *
 * Returns the result of the underlying truncation call (0 on success, -1 on
 * failure).  On WIN32 the file is opened and truncated through its
 * descriptor; -1 is also returned there if the file cannot be opened.
 * Elsewhere truncate() is used directly and retried on EINTR.
 */
int
pg_truncate(const char *path, off_t length)
{
    int         ret;
#ifdef WIN32
    int         save_errno;
    int         fd;

    fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
    if (fd >= 0)
    {
        ret = pg_ftruncate(fd, length);

        /*
         * Always release the descriptor, but keep the errno left behind by
         * the truncation so the caller sees the relevant failure.
         */
        save_errno = errno;
        CloseTransientFile(fd);
        errno = save_errno;
    }
    else
        ret = -1;
#else

retry:
    ret = truncate(path, length);

    if (ret == -1 && errno == EINTR)
        goto retry;
#endif

    return ret;
}
747
748/*
749 * fsync_fname -- fsync a file or directory, handling errors properly
750 *
751 * Try to fsync a file or directory. When doing the latter, ignore errors that
752 * indicate the OS just doesn't allow/require fsyncing directories.
753 */
754void
755fsync_fname(const char *fname, bool isdir)
756{
757 fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
758}
759
760/*
761 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
762 *
763 * This routine ensures that, after returning, the effect of renaming file
764 * persists in case of a crash. A crash while this routine is running will
765 * leave you with either the pre-existing or the moved file in place of the
766 * new file; no mixed state or truncated files are possible.
767 *
768 * It does so by using fsync on the old filename and the possibly existing
769 * target filename before the rename, and the target file and directory after.
770 *
771 * Note that rename() cannot be used across arbitrary directories, as they
772 * might not be on the same filesystem. Therefore this routine does not
773 * support renaming across directories.
774 *
775 * Log errors with the caller specified severity.
776 *
777 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
778 * valid upon return.
779 */
780int
781durable_rename(const char *oldfile, const char *newfile, int elevel)
782{
783 int fd;
784
785 /*
786 * First fsync the old and target path (if it exists), to ensure that they
787 * are properly persistent on disk. Syncing the target file is not
788 * strictly necessary, but it makes it easier to reason about crashes;
789 * because it's then guaranteed that either source or target file exists
790 * after a crash.
791 */
792 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
793 return -1;
794
795 fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
796 if (fd < 0)
797 {
798 if (errno != ENOENT)
799 {
800 ereport(elevel,
802 errmsg("could not open file \"%s\": %m", newfile)));
803 return -1;
804 }
805 }
806 else
807 {
808 if (pg_fsync(fd) != 0)
809 {
810 int save_errno;
811
812 /* close file upon error, might not be in transaction context */
813 save_errno = errno;
815 errno = save_errno;
816
817 ereport(elevel,
819 errmsg("could not fsync file \"%s\": %m", newfile)));
820 return -1;
821 }
822
823 if (CloseTransientFile(fd) != 0)
824 {
825 ereport(elevel,
827 errmsg("could not close file \"%s\": %m", newfile)));
828 return -1;
829 }
830 }
831
832 /* Time to do the real deal... */
833 if (rename(oldfile, newfile) < 0)
834 {
835 ereport(elevel,
837 errmsg("could not rename file \"%s\" to \"%s\": %m",
838 oldfile, newfile)));
839 return -1;
840 }
841
842 /*
843 * To guarantee renaming the file is persistent, fsync the file with its
844 * new name, and its containing directory.
845 */
846 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
847 return -1;
848
849 if (fsync_parent_path(newfile, elevel) != 0)
850 return -1;
851
852 return 0;
853}
854
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * This routine ensures that, after returning, the effect of removing file
 * persists in case of a crash.  A crash while this routine is running will
 * leave the system in no mixed state.
 *
 * It does so by using fsync on the parent directory of the file after the
 * actual removal is done.
 *
 * Log errors with the severity specified by caller.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
    if (unlink(fname) < 0)
    {
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not remove file \"%s\": %m",
                        fname)));
        return -1;
    }

    /*
     * To guarantee that the removal of the file is persistent, fsync its
     * parent directory.
     */
    if (fsync_parent_path(fname, elevel) != 0)
        return -1;

    return 0;
}
891
892/*
893 * InitFileAccess --- initialize this module during backend startup
894 *
895 * This is called during either normal or standalone backend start.
896 * It is *not* called in the postmaster.
897 *
898 * Note that this does not initialize temporary file access, that is
899 * separately initialized via InitTemporaryFileAccess().
900 */
901void
903{
904 Assert(SizeVfdCache == 0); /* call me only once */
905
906 /* initialize cache header entry */
907 VfdCache = (Vfd *) malloc(sizeof(Vfd));
908 if (VfdCache == NULL)
910 (errcode(ERRCODE_OUT_OF_MEMORY),
911 errmsg("out of memory")));
912
913 MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
915
916 SizeVfdCache = 1;
917}
918
919/*
920 * InitTemporaryFileAccess --- initialize temporary file access during startup
921 *
922 * This is called during either normal or standalone backend start.
923 * It is *not* called in the postmaster.
924 *
925 * This is separate from InitFileAccess() because temporary file cleanup can
926 * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
927 * our reporting has to happen before that. Low level file access should be
928 * available for longer, hence the separate initialization / shutdown of
929 * temporary file handling.
930 */
931void
933{
934 Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
935 Assert(!temporary_files_allowed); /* call me only once */
936
937 /*
938 * Register before-shmem-exit hook to ensure temp files are dropped while
939 * we can still report stats.
940 */
942
943#ifdef USE_ASSERT_CHECKING
944 temporary_files_allowed = true;
945#endif
946}
947
948/*
949 * count_usable_fds --- count how many FDs the system will let us open,
950 * and estimate how many are already open.
951 *
952 * We stop counting if usable_fds reaches max_to_probe. Note: a small
953 * value of max_to_probe might result in an underestimate of already_open;
954 * we must fill in any "gaps" in the set of used FDs before the calculation
955 * of already_open will give the right answer. In practice, max_to_probe
956 * of a couple of dozen should be enough to ensure good results.
957 *
958 * We assume stderr (FD 2) is available for dup'ing. While the calling
959 * script could theoretically close that, it would be a really bad idea,
960 * since then one risks loss of error messages from, e.g., libc.
961 */
962static void
963count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
964{
965 int *fd;
966 int size;
967 int used = 0;
968 int highestfd = 0;
969 int j;
970
971#ifdef HAVE_GETRLIMIT
972 struct rlimit rlim;
973 int getrlimit_status;
974#endif
975
976 size = 1024;
977 fd = (int *) palloc(size * sizeof(int));
978
979#ifdef HAVE_GETRLIMIT
980 getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
981 if (getrlimit_status != 0)
982 ereport(WARNING, (errmsg("getrlimit failed: %m")));
983#endif /* HAVE_GETRLIMIT */
984
985 /* dup until failure or probe limit reached */
986 for (;;)
987 {
988 int thisfd;
989
990#ifdef HAVE_GETRLIMIT
991
992 /*
993 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
994 * some platforms
995 */
996 if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
997 break;
998#endif
999
1000 thisfd = dup(2);
1001 if (thisfd < 0)
1002 {
1003 /* Expect EMFILE or ENFILE, else it's fishy */
1004 if (errno != EMFILE && errno != ENFILE)
1005 elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
1006 break;
1007 }
1008
1009 if (used >= size)
1010 {
1011 size *= 2;
1012 fd = (int *) repalloc(fd, size * sizeof(int));
1013 }
1014 fd[used++] = thisfd;
1015
1016 if (highestfd < thisfd)
1017 highestfd = thisfd;
1018
1019 if (used >= max_to_probe)
1020 break;
1021 }
1022
1023 /* release the files we opened */
1024 for (j = 0; j < used; j++)
1025 close(fd[j]);
1026
1027 pfree(fd);
1028
1029 /*
1030 * Return results. usable_fds is just the number of successful dups. We
1031 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
1032 * number) and so already_open is highestfd+1 - usable_fds.
1033 */
1034 *usable_fds = used;
1035 *already_open = highestfd + 1 - used;
1036}
1037
1038/*
1039 * set_max_safe_fds
1040 * Determine number of file descriptors that fd.c is allowed to use
1041 */
1042void
1044{
1045 int usable_fds;
1046 int already_open;
1047
1048 /*----------
1049 * We want to set max_safe_fds to
1050 * MIN(usable_fds, max_files_per_process - already_open)
1051 * less the slop factor for files that are opened without consulting
1052 * fd.c. This ensures that we won't exceed either max_files_per_process
1053 * or the experimentally-determined EMFILE limit.
1054 *----------
1055 */
1057 &usable_fds, &already_open);
1058
1059 max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
1060
1061 /*
1062 * Take off the FDs reserved for system() etc.
1063 */
1065
1066 /*
1067 * Make sure we still have enough to get by.
1068 */
1070 ereport(FATAL,
1071 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1072 errmsg("insufficient file descriptors available to start server process"),
1073 errdetail("System allows %d, server needs at least %d.",
1076
1077 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1078 max_safe_fds, usable_fds, already_open);
1079}
1080
1081/*
1082 * Open a file with BasicOpenFilePerm() and pass default file mode for the
1083 * fileMode parameter.
1084 */
1085int
1086BasicOpenFile(const char *fileName, int fileFlags)
1087{
1088 return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1089}
1090
1091/*
1092 * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1093 *
1094 * This is exported for use by places that really want a plain kernel FD,
1095 * but need to be proof against running out of FDs. Once an FD has been
1096 * successfully returned, it is the caller's responsibility to ensure that
1097 * it will not be leaked on ereport()! Most users should *not* call this
1098 * routine directly, but instead use the VFD abstraction level, which
1099 * provides protection against descriptor leaks as well as management of
1100 * files that need to be open for more than a short period of time.
1101 *
1102 * Ideally this should be the *only* direct call of open() in the backend.
1103 * In practice, the postmaster calls open() directly, and there are some
1104 * direct open() calls done early in backend startup. Those are OK since
1105 * this module wouldn't have any open files to close at that point anyway.
1106 */
1107int
1108BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1109{
1110 int fd;
1111
1112tryAgain:
1113#ifdef PG_O_DIRECT_USE_F_NOCACHE
1114
1115 /*
1116 * The value we defined to stand in for O_DIRECT when simulating it with
1117 * F_NOCACHE had better not collide with any of the standard flags.
1118 */
1120 (O_APPEND |
1121 O_CLOEXEC |
1122 O_CREAT |
1123 O_DSYNC |
1124 O_EXCL |
1125 O_RDWR |
1126 O_RDONLY |
1127 O_SYNC |
1128 O_TRUNC |
1129 O_WRONLY)) == 0,
1130 "PG_O_DIRECT value collides with standard flag");
1131 fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1132#else
1133 fd = open(fileName, fileFlags, fileMode);
1134#endif
1135
1136 if (fd >= 0)
1137 {
1138#ifdef PG_O_DIRECT_USE_F_NOCACHE
1139 if (fileFlags & PG_O_DIRECT)
1140 {
1141 if (fcntl(fd, F_NOCACHE, 1) < 0)
1142 {
1143 int save_errno = errno;
1144
1145 close(fd);
1146 errno = save_errno;
1147 return -1;
1148 }
1149 }
1150#endif
1151
1152 return fd; /* success! */
1153 }
1154
1155 if (errno == EMFILE || errno == ENFILE)
1156 {
1157 int save_errno = errno;
1158
1159 ereport(LOG,
1160 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1161 errmsg("out of file descriptors: %m; release and retry")));
1162 errno = 0;
1163 if (ReleaseLruFile())
1164 goto tryAgain;
1165 errno = save_errno;
1166 }
1167
1168 return -1; /* failure */
1169}
1170
1171/*
1172 * AcquireExternalFD - attempt to reserve an external file descriptor
1173 *
1174 * This should be used by callers that need to hold a file descriptor open
1175 * over more than a short interval, but cannot use any of the other facilities
1176 * provided by this module.
1177 *
1178 * The difference between this and the underlying ReserveExternalFD function
1179 * is that this will report failure (by setting errno and returning false)
1180 * if "too many" external FDs are already reserved. This should be used in
1181 * any code where the total number of FDs to be reserved is not predictable
1182 * and small.
1183 */
1184bool
1186{
1187 /*
1188 * We don't want more than max_safe_fds / 3 FDs to be consumed for
1189 * "external" FDs.
1190 */
1191 if (numExternalFDs < max_safe_fds / 3)
1192 {
1194 return true;
1195 }
1196 errno = EMFILE;
1197 return false;
1198}
1199
1200/*
1201 * ReserveExternalFD - report external consumption of a file descriptor
1202 *
1203 * This should be used by callers that need to hold a file descriptor open
1204 * over more than a short interval, but cannot use any of the other facilities
1205 * provided by this module. This just tracks the use of the FD and closes
1206 * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1207 *
1208 * Call this directly only in code where failure to reserve the FD would be
1209 * fatal; for example, the WAL-writing code does so, since the alternative is
1210 * session failure. Also, it's very unwise to do so in code that could
1211 * consume more than one FD per process.
1212 *
1213 * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1214 * available, it doesn't matter too much whether this is called before or
1215 * after actually opening the FD; but doing so beforehand reduces the risk of
1216 * an EMFILE failure if not everybody played nice. In any case, it's solely
1217 * caller's responsibility to keep the external-FD count in sync with reality.
1218 */
1219void
1221{
1222 /*
1223 * Release VFDs if needed to stay safe. Because we do this before
1224 * incrementing numExternalFDs, the final state will be as desired, i.e.,
1225 * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1226 */
1228
1230}
1231
1232/*
1233 * ReleaseExternalFD - report release of an external file descriptor
1234 *
1235 * This is guaranteed not to change errno, so it can be used in failure paths.
1236 */
1237void
1239{
1242}
1243
1244
#if defined(FDDEBUG)

/*
 * _dump_lru --- debugging aid: log the LRU ring as a single LOG line
 *
 * Starts at VfdCache[0].lruLessRecently and follows lruLessRecently links
 * until index 0 (the ring header) is reached again, printing each index
 * visited.  Output is accumulated in a fixed 2048-byte buffer.
 */
static void
_dump_lru(void)
{
	char		line[2048];
	size_t		used;
	int			cur = VfdCache[0].lruLessRecently;

	snprintf(line, sizeof(line), "LRU: MOST %d ", cur);

	while (cur != 0)
	{
		/* step to the next entry, then append its index */
		cur = VfdCache[cur].lruLessRecently;
		used = strlen(line);
		snprintf(line + used, sizeof(line) - used, "%d ", cur);
	}

	used = strlen(line);
	snprintf(line + used, sizeof(line) - used, "LEAST");

	elog(LOG, "%s", line);
}
#endif							/* FDDEBUG */
1265
1266static void
1268{
1269 Vfd *vfdP;
1270
1271 Assert(file != 0);
1272
1273 DO_DB(elog(LOG, "Delete %d (%s)",
1274 file, VfdCache[file].fileName));
1275 DO_DB(_dump_lru());
1276
1277 vfdP = &VfdCache[file];
1278
1281
1282 DO_DB(_dump_lru());
1283}
1284
1285static void
1287{
1288 Vfd *vfdP;
1289
1290 Assert(file != 0);
1291
1292 DO_DB(elog(LOG, "LruDelete %d (%s)",
1293 file, VfdCache[file].fileName));
1294
1295 vfdP = &VfdCache[file];
1296
1297 /*
1298 * Close the file. We aren't expecting this to fail; if it does, better
1299 * to leak the FD than to mess up our internal state.
1300 */
1301 if (close(vfdP->fd) != 0)
1303 "could not close file \"%s\": %m", vfdP->fileName);
1304 vfdP->fd = VFD_CLOSED;
1305 --nfile;
1306
1307 /* delete the vfd record from the LRU ring */
1308 Delete(file);
1309}
1310
1311static void
1313{
1314 Vfd *vfdP;
1315
1316 Assert(file != 0);
1317
1318 DO_DB(elog(LOG, "Insert %d (%s)",
1319 file, VfdCache[file].fileName));
1320 DO_DB(_dump_lru());
1321
1322 vfdP = &VfdCache[file];
1323
1324 vfdP->lruMoreRecently = 0;
1326 VfdCache[0].lruLessRecently = file;
1328
1329 DO_DB(_dump_lru());
1330}
1331
1332/* returns 0 on success, -1 on re-open failure (with errno set) */
1333static int
1335{
1336 Vfd *vfdP;
1337
1338 Assert(file != 0);
1339
1340 DO_DB(elog(LOG, "LruInsert %d (%s)",
1341 file, VfdCache[file].fileName));
1342
1343 vfdP = &VfdCache[file];
1344
1345 if (FileIsNotOpen(file))
1346 {
1347 /* Close excess kernel FDs. */
1349
1350 /*
1351 * The open could still fail for lack of file descriptors, eg due to
1352 * overall system file table being full. So, be prepared to release
1353 * another FD if necessary...
1354 */
1355 vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1356 vfdP->fileMode);
1357 if (vfdP->fd < 0)
1358 {
1359 DO_DB(elog(LOG, "re-open failed: %m"));
1360 return -1;
1361 }
1362 else
1363 {
1364 ++nfile;
1365 }
1366 }
1367
1368 /*
1369 * put it at the head of the Lru ring
1370 */
1371
1372 Insert(file);
1373
1374 return 0;
1375}
1376
1377/*
1378 * Release one kernel FD by closing the least-recently-used VFD.
1379 */
1380static bool
1382{
1383 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1384
1385 if (nfile > 0)
1386 {
1387 /*
1388 * There are opened files and so there should be at least one used vfd
1389 * in the ring.
1390 */
1391 Assert(VfdCache[0].lruMoreRecently != 0);
1392 LruDelete(VfdCache[0].lruMoreRecently);
1393 return true; /* freed a file */
1394 }
1395 return false; /* no files available to free */
1396}
1397
1398/*
1399 * Release kernel FDs as needed to get under the max_safe_fds limit.
1400 * After calling this, it's OK to try to open another file.
1401 */
1402static void
1404{
1406 {
1407 if (!ReleaseLruFile())
1408 break;
1409 }
1410}
1411
1412static File
1414{
1415 Index i;
1416 File file;
1417
1418 DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1419
1420 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1421
1422 if (VfdCache[0].nextFree == 0)
1423 {
1424 /*
1425 * The free list is empty so it is time to increase the size of the
1426 * array. We choose to double it each time this happens. However,
1427 * there's not much point in starting *real* small.
1428 */
1429 Size newCacheSize = SizeVfdCache * 2;
1430 Vfd *newVfdCache;
1431
1432 if (newCacheSize < 32)
1433 newCacheSize = 32;
1434
1435 /*
1436 * Be careful not to clobber VfdCache ptr if realloc fails.
1437 */
1438 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1439 if (newVfdCache == NULL)
1440 ereport(ERROR,
1441 (errcode(ERRCODE_OUT_OF_MEMORY),
1442 errmsg("out of memory")));
1443 VfdCache = newVfdCache;
1444
1445 /*
1446 * Initialize the new entries and link them into the free list.
1447 */
1448 for (i = SizeVfdCache; i < newCacheSize; i++)
1449 {
1450 MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1451 VfdCache[i].nextFree = i + 1;
1453 }
1454 VfdCache[newCacheSize - 1].nextFree = 0;
1456
1457 /*
1458 * Record the new size
1459 */
1460 SizeVfdCache = newCacheSize;
1461 }
1462
1463 file = VfdCache[0].nextFree;
1464
1466
1467 return file;
1468}
1469
1470static void
1472{
1473 Vfd *vfdP = &VfdCache[file];
1474
1475 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1476 file, vfdP->fileName ? vfdP->fileName : ""));
1477
1478 if (vfdP->fileName != NULL)
1479 {
1480 free(vfdP->fileName);
1481 vfdP->fileName = NULL;
1482 }
1483 vfdP->fdstate = 0x0;
1484
1485 vfdP->nextFree = VfdCache[0].nextFree;
1486 VfdCache[0].nextFree = file;
1487}
1488
1489/* returns 0 on success, -1 on re-open failure (with errno set) */
1490static int
1492{
1493 int returnValue;
1494
1495 DO_DB(elog(LOG, "FileAccess %d (%s)",
1496 file, VfdCache[file].fileName));
1497
1498 /*
1499 * Is the file open? If not, open it and put it at the head of the LRU
1500 * ring (possibly closing the least recently used file to get an FD).
1501 */
1502
1503 if (FileIsNotOpen(file))
1504 {
1505 returnValue = LruInsert(file);
1506 if (returnValue != 0)
1507 return returnValue;
1508 }
1509 else if (VfdCache[0].lruLessRecently != file)
1510 {
1511 /*
1512 * We now know that the file is open and that it is not the last one
1513 * accessed, so we need to move it to the head of the Lru ring.
1514 */
1515
1516 Delete(file);
1517 Insert(file);
1518 }
1519
1520 return 0;
1521}
1522
1523/*
1524 * Called whenever a temporary file is deleted to report its size.
1525 */
1526static void
1527ReportTemporaryFileUsage(const char *path, off_t size)
1528{
1530
1531 if (log_temp_files >= 0)
1532 {
1533 if ((size / 1024) >= log_temp_files)
1534 ereport(LOG,
1535 (errmsg("temporary file: path \"%s\", size %lu",
1536 path, (unsigned long) size)));
1537 }
1538}
1539
1540/*
1541 * Called to register a temporary file for automatic close.
1542 * ResourceOwnerEnlarge(CurrentResourceOwner) must have been called
1543 * before the file was opened.
1544 */
1545static void
1547{
1550
1551 /* Backup mechanism for closing at end of xact. */
1554}
1555
/*
 * FileInvalidate --- close the kernel descriptor behind a VFD, if it has one
 *
 * Called when we get a shared invalidation message on some relation.
 * Compiled only when NOT_USED is defined.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	/* Nothing to do when the VFD holds no open kernel descriptor */
	if (FileIsNotOpen(file))
		return;

	LruDelete(file);
}
#endif
1568
1569/*
1570 * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1571 * fileMode parameter.
1572 */
1573File
1574PathNameOpenFile(const char *fileName, int fileFlags)
1575{
1576 return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1577}
1578
1579/*
1580 * open a file in an arbitrary directory
1581 *
1582 * NB: if the passed pathname is relative (which it usually is),
1583 * it will be interpreted relative to the process' working directory
1584 * (which should always be $PGDATA when this code is running).
1585 */
1586File
1587PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1588{
1589 char *fnamecopy;
1590 File file;
1591 Vfd *vfdP;
1592
1593 DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1594 fileName, fileFlags, fileMode));
1595
1596 /*
1597 * We need a malloc'd copy of the file name; fail cleanly if no room.
1598 */
1599 fnamecopy = strdup(fileName);
1600 if (fnamecopy == NULL)
1601 ereport(ERROR,
1602 (errcode(ERRCODE_OUT_OF_MEMORY),
1603 errmsg("out of memory")));
1604
1605 file = AllocateVfd();
1606 vfdP = &VfdCache[file];
1607
1608 /* Close excess kernel FDs. */
1610
1611 /*
1612 * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1613 * client shouldn't be expected to know which kernel descriptors are
1614 * currently open, so it wouldn't make sense for them to be inherited by
1615 * executed subprograms.
1616 */
1617 fileFlags |= O_CLOEXEC;
1618
1619 vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1620
1621 if (vfdP->fd < 0)
1622 {
1623 int save_errno = errno;
1624
1625 FreeVfd(file);
1626 free(fnamecopy);
1627 errno = save_errno;
1628 return -1;
1629 }
1630 ++nfile;
1631 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1632 vfdP->fd));
1633
1634 vfdP->fileName = fnamecopy;
1635 /* Saved flags are adjusted to be OK for re-opening file */
1636 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1637 vfdP->fileMode = fileMode;
1638 vfdP->fileSize = 0;
1639 vfdP->fdstate = 0x0;
1640 vfdP->resowner = NULL;
1641
1642 Insert(file);
1643
1644 return file;
1645}
1646
1647/*
1648 * Create directory 'directory'. If necessary, create 'basedir', which must
1649 * be the directory above it. This is designed for creating the top-level
1650 * temporary directory on demand before creating a directory underneath it.
1651 * Do nothing if the directory already exists.
1652 *
1653 * Directories created within the top-level temporary directory should begin
1654 * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1655 * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1656 * that do not need any particular prefix.
1657*/
1658void
1660{
1661 if (MakePGDirectory(directory) < 0)
1662 {
1663 if (errno == EEXIST)
1664 return;
1665
1666 /*
1667 * Failed. Try to create basedir first in case it's missing. Tolerate
1668 * EEXIST to close a race against another process following the same
1669 * algorithm.
1670 */
1671 if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1672 ereport(ERROR,
1674 errmsg("cannot create temporary directory \"%s\": %m",
1675 basedir)));
1676
1677 /* Try again. */
1678 if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1679 ereport(ERROR,
1681 errmsg("cannot create temporary subdirectory \"%s\": %m",
1682 directory)));
1683 }
1684}
1685
1686/*
1687 * Delete a directory and everything in it, if it exists.
1688 */
1689void
1690PathNameDeleteTemporaryDir(const char *dirname)
1691{
1692 struct stat statbuf;
1693
1694 /* Silently ignore missing directory. */
1695 if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1696 return;
1697
1698 /*
1699 * Currently, walkdir doesn't offer a way for our passed in function to
1700 * maintain state. Perhaps it should, so that we could tell the caller
1701 * whether this operation succeeded or failed. Since this operation is
1702 * used in a cleanup path, we wouldn't actually behave differently: we'll
1703 * just log failures.
1704 */
1705 walkdir(dirname, unlink_if_exists_fname, false, LOG);
1706}
1707
1708/*
1709 * Open a temporary file that will disappear when we close it.
1710 *
1711 * This routine takes care of generating an appropriate tempfile name.
1712 * There's no need to pass in fileFlags or fileMode either, since only
1713 * one setting makes any sense for a temp file.
1714 *
1715 * Unless interXact is true, the file is remembered by CurrentResourceOwner
1716 * to ensure it's closed and deleted when it's no longer needed, typically at
1717 * the end-of-transaction. In most cases, you don't want temporary files to
1718 * outlive the transaction that created them, so this should be false -- but
1719 * if you need "somewhat" temporary storage, this might be useful. In either
1720 * case, the file is removed when the File is explicitly closed.
1721 */
1722File
1723OpenTemporaryFile(bool interXact)
1724{
1725 File file = 0;
1726
1727 Assert(temporary_files_allowed); /* check temp file access is up */
1728
1729 /*
1730 * Make sure the current resource owner has space for this File before we
1731 * open it, if we'll be registering it below.
1732 */
1733 if (!interXact)
1735
1736 /*
1737 * If some temp tablespace(s) have been given to us, try to use the next
1738 * one. If a given tablespace can't be found, we silently fall back to
1739 * the database's default tablespace.
1740 *
1741 * BUT: if the temp file is slated to outlive the current transaction,
1742 * force it into the database's default tablespace, so that it will not
1743 * pose a threat to possible tablespace drop attempts.
1744 */
1745 if (numTempTableSpaces > 0 && !interXact)
1746 {
1747 Oid tblspcOid = GetNextTempTableSpace();
1748
1749 if (OidIsValid(tblspcOid))
1750 file = OpenTemporaryFileInTablespace(tblspcOid, false);
1751 }
1752
1753 /*
1754 * If not, or if tablespace is bad, create in database's default
1755 * tablespace. MyDatabaseTableSpace should normally be set before we get
1756 * here, but just in case it isn't, fall back to pg_default tablespace.
1757 */
1758 if (file <= 0)
1761 DEFAULTTABLESPACE_OID,
1762 true);
1763
1764 /* Mark it for deletion at close and temporary file size limit */
1766
1767 /* Register it with the current resource owner */
1768 if (!interXact)
1770
1771 return file;
1772}
1773
1774/*
1775 * Return the path of the temp directory in a given tablespace.
1776 */
1777void
1779{
1780 /*
1781 * Identify the tempfile directory for this tablespace.
1782 *
1783 * If someone tries to specify pg_global, use pg_default instead.
1784 */
1785 if (tablespace == InvalidOid ||
1786 tablespace == DEFAULTTABLESPACE_OID ||
1787 tablespace == GLOBALTABLESPACE_OID)
1788 snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1789 else
1790 {
1791 /* All other tablespaces are accessed via symlinks */
1792 snprintf(path, MAXPGPATH, "%s/%u/%s/%s",
1795 }
1796}
1797
1798/*
1799 * Open a temporary file in a specific tablespace.
1800 * Subroutine for OpenTemporaryFile, which see for details.
1801 */
1802static File
1803OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1804{
1805 char tempdirpath[MAXPGPATH];
1806 char tempfilepath[MAXPGPATH];
1807 File file;
1808
1809 TempTablespacePath(tempdirpath, tblspcOid);
1810
1811 /*
1812 * Generate a tempfile name that should be unique within the current
1813 * database instance.
1814 */
1815 snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1817
1818 /*
1819 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1820 * temp file that can be reused.
1821 */
1822 file = PathNameOpenFile(tempfilepath,
1823 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1824 if (file <= 0)
1825 {
1826 /*
1827 * We might need to create the tablespace's tempfile directory, if no
1828 * one has yet done so.
1829 *
1830 * Don't check for an error from MakePGDirectory; it could fail if
1831 * someone else just did the same thing. If it doesn't work then
1832 * we'll bomb out on the second create attempt, instead.
1833 */
1834 (void) MakePGDirectory(tempdirpath);
1835
1836 file = PathNameOpenFile(tempfilepath,
1837 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1838 if (file <= 0 && rejectError)
1839 elog(ERROR, "could not create temporary file \"%s\": %m",
1840 tempfilepath);
1841 }
1842
1843 return file;
1844}
1845
1846
1847/*
1848 * Create a new file. The directory containing it must already exist. Files
1849 * created this way are subject to temp_file_limit and are automatically
1850 * closed at end of transaction, but are not automatically deleted on close
1851 * because they are intended to be shared between cooperating backends.
1852 *
1853 * If the file is inside the top-level temporary directory, its name should
1854 * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1855 * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1856 * inside a directory created with PathNameCreateTemporaryDir(), in which case
1857 * the prefix isn't needed.
1858 */
1859File
1860PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1861{
1862 File file;
1863
1864 Assert(temporary_files_allowed); /* check temp file access is up */
1865
1867
1868 /*
1869 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1870 * temp file that can be reused.
1871 */
1872 file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1873 if (file <= 0)
1874 {
1875 if (error_on_failure)
1876 ereport(ERROR,
1878 errmsg("could not create temporary file \"%s\": %m",
1879 path)));
1880 else
1881 return file;
1882 }
1883
1884 /* Mark it for temp_file_limit accounting. */
1886
1887 /* Register it for automatic close. */
1889
1890 return file;
1891}
1892
1893/*
1894 * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1895 * another backend. Files opened this way don't count against the
1896 * temp_file_limit of the caller, are automatically closed at the end of the
1897 * transaction but are not deleted on close.
1898 */
1899File
1900PathNameOpenTemporaryFile(const char *path, int mode)
1901{
1902 File file;
1903
1904 Assert(temporary_files_allowed); /* check temp file access is up */
1905
1907
1908 file = PathNameOpenFile(path, mode | PG_BINARY);
1909
1910 /* If no such file, then we don't raise an error. */
1911 if (file <= 0 && errno != ENOENT)
1912 ereport(ERROR,
1914 errmsg("could not open temporary file \"%s\": %m",
1915 path)));
1916
1917 if (file > 0)
1918 {
1919 /* Register it for automatic close. */
1921 }
1922
1923 return file;
1924}
1925
1926/*
1927 * Delete a file by pathname. Return true if the file existed, false if
1928 * didn't.
1929 */
1930bool
1931PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1932{
1933 struct stat filestats;
1934 int stat_errno;
1935
1936 /* Get the final size for pgstat reporting. */
1937 if (stat(path, &filestats) != 0)
1938 stat_errno = errno;
1939 else
1940 stat_errno = 0;
1941
1942 /*
1943 * Unlike FileClose's automatic file deletion code, we tolerate
1944 * non-existence to support BufFileDeleteFileSet which doesn't know how
1945 * many segments it has to delete until it runs out.
1946 */
1947 if (stat_errno == ENOENT)
1948 return false;
1949
1950 if (unlink(path) < 0)
1951 {
1952 if (errno != ENOENT)
1953 ereport(error_on_failure ? ERROR : LOG,
1955 errmsg("could not unlink temporary file \"%s\": %m",
1956 path)));
1957 return false;
1958 }
1959
1960 if (stat_errno == 0)
1961 ReportTemporaryFileUsage(path, filestats.st_size);
1962 else
1963 {
1964 errno = stat_errno;
1965 ereport(LOG,
1967 errmsg("could not stat file \"%s\": %m", path)));
1968 }
1969
1970 return true;
1971}
1972
1973/*
1974 * close a file when done with it
1975 */
1976void
1978{
1979 Vfd *vfdP;
1980
1981 Assert(FileIsValid(file));
1982
1983 DO_DB(elog(LOG, "FileClose: %d (%s)",
1984 file, VfdCache[file].fileName));
1985
1986 vfdP = &VfdCache[file];
1987
1988 if (!FileIsNotOpen(file))
1989 {
1990 /* close the file */
1991 if (close(vfdP->fd) != 0)
1992 {
1993 /*
1994 * We may need to panic on failure to close non-temporary files;
1995 * see LruDelete.
1996 */
1998 "could not close file \"%s\": %m", vfdP->fileName);
1999 }
2000
2001 --nfile;
2002 vfdP->fd = VFD_CLOSED;
2003
2004 /* remove the file from the lru ring */
2005 Delete(file);
2006 }
2007
2008 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2009 {
2010 /* Subtract its size from current usage (do first in case of error) */
2012 vfdP->fileSize = 0;
2013 }
2014
2015 /*
2016 * Delete the file if it was temporary, and make a log entry if wanted
2017 */
2018 if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
2019 {
2020 struct stat filestats;
2021 int stat_errno;
2022
2023 /*
2024 * If we get an error, as could happen within the ereport/elog calls,
2025 * we'll come right back here during transaction abort. Reset the
2026 * flag to ensure that we can't get into an infinite loop. This code
2027 * is arranged to ensure that the worst-case consequence is failing to
2028 * emit log message(s), not failing to attempt the unlink.
2029 */
2030 vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
2031
2032
2033 /* first try the stat() */
2034 if (stat(vfdP->fileName, &filestats))
2035 stat_errno = errno;
2036 else
2037 stat_errno = 0;
2038
2039 /* in any case do the unlink */
2040 if (unlink(vfdP->fileName))
2041 ereport(LOG,
2043 errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
2044
2045 /* and last report the stat results */
2046 if (stat_errno == 0)
2047 ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2048 else
2049 {
2050 errno = stat_errno;
2051 ereport(LOG,
2053 errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2054 }
2055 }
2056
2057 /* Unregister it from the resource owner */
2058 if (vfdP->resowner)
2059 ResourceOwnerForgetFile(vfdP->resowner, file);
2060
2061 /*
2062 * Return the Vfd slot to the free list
2063 */
2064 FreeVfd(file);
2065}
2066
2067/*
2068 * FilePrefetch - initiate asynchronous read of a given range of the file.
2069 *
2070 * Returns 0 on success, otherwise an errno error code (like posix_fadvise()).
2071 *
2072 * posix_fadvise() is the simplest standardized interface that accomplishes
2073 * this.
2074 */
2075int
2076FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
2077{
2078 Assert(FileIsValid(file));
2079
2080 DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2081 file, VfdCache[file].fileName,
2082 (int64) offset, (int64) amount));
2083
2084#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2085 {
2086 int returnCode;
2087
2088 returnCode = FileAccess(file);
2089 if (returnCode < 0)
2090 return returnCode;
2091
2092retry:
2093 pgstat_report_wait_start(wait_event_info);
2094 returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2095 POSIX_FADV_WILLNEED);
2097
2098 if (returnCode == EINTR)
2099 goto retry;
2100
2101 return returnCode;
2102 }
2103#elif defined(__darwin__)
2104 {
2105 struct radvisory
2106 {
2107 off_t ra_offset; /* offset into the file */
2108 int ra_count; /* size of the read */
2109 } ra;
2110 int returnCode;
2111
2112 returnCode = FileAccess(file);
2113 if (returnCode < 0)
2114 return returnCode;
2115
2116 ra.ra_offset = offset;
2117 ra.ra_count = amount;
2118 pgstat_report_wait_start(wait_event_info);
2119 returnCode = fcntl(VfdCache[file].fd, F_RDADVISE, &ra);
2121 if (returnCode != -1)
2122 return 0;
2123 else
2124 return errno;
2125 }
2126#else
2127 return 0;
2128#endif
2129}
2130
2131void
2132FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
2133{
2134 int returnCode;
2135
2136 Assert(FileIsValid(file));
2137
2138 DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2139 file, VfdCache[file].fileName,
2140 (int64) offset, (int64) nbytes));
2141
2142 if (nbytes <= 0)
2143 return;
2144
2145 if (VfdCache[file].fileFlags & PG_O_DIRECT)
2146 return;
2147
2148 returnCode = FileAccess(file);
2149 if (returnCode < 0)
2150 return;
2151
2152 pgstat_report_wait_start(wait_event_info);
2153 pg_flush_data(VfdCache[file].fd, offset, nbytes);
2155}
2156
2157ssize_t
2158FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset,
2159 uint32 wait_event_info)
2160{
2161 ssize_t returnCode;
2162 Vfd *vfdP;
2163
2164 Assert(FileIsValid(file));
2165
2166 DO_DB(elog(LOG, "FileReadV: %d (%s) " INT64_FORMAT " %d",
2167 file, VfdCache[file].fileName,
2168 (int64) offset,
2169 iovcnt));
2170
2171 returnCode = FileAccess(file);
2172 if (returnCode < 0)
2173 return returnCode;
2174
2175 vfdP = &VfdCache[file];
2176
2177retry:
2178 pgstat_report_wait_start(wait_event_info);
2179 returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
2181
2182 if (returnCode < 0)
2183 {
2184 /*
2185 * Windows may run out of kernel buffers and return "Insufficient
2186 * system resources" error. Wait a bit and retry to solve it.
2187 *
2188 * It is rumored that EINTR is also possible on some Unix filesystems,
2189 * in which case immediate retry is indicated.
2190 */
2191#ifdef WIN32
2192 DWORD error = GetLastError();
2193
2194 switch (error)
2195 {
2196 case ERROR_NO_SYSTEM_RESOURCES:
2197 pg_usleep(1000L);
2198 errno = EINTR;
2199 break;
2200 default:
2202 break;
2203 }
2204#endif
2205 /* OK to retry if interrupted */
2206 if (errno == EINTR)
2207 goto retry;
2208 }
2209
2210 return returnCode;
2211}
2212
2213ssize_t
2214FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset,
2215 uint32 wait_event_info)
2216{
2217 ssize_t returnCode;
2218 Vfd *vfdP;
2219
2220 Assert(FileIsValid(file));
2221
2222 DO_DB(elog(LOG, "FileWriteV: %d (%s) " INT64_FORMAT " %d",
2223 file, VfdCache[file].fileName,
2224 (int64) offset,
2225 iovcnt));
2226
2227 returnCode = FileAccess(file);
2228 if (returnCode < 0)
2229 return returnCode;
2230
2231 vfdP = &VfdCache[file];
2232
2233 /*
2234 * If enforcing temp_file_limit and it's a temp file, check to see if the
2235 * write would overrun temp_file_limit, and throw error if so. Note: it's
2236 * really a modularity violation to throw error here; we should set errno
2237 * and return -1. However, there's no way to report a suitable error
2238 * message if we do that. All current callers would just throw error
2239 * immediately anyway, so this is safe at present.
2240 */
2241 if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2242 {
2243 off_t past_write = offset;
2244
2245 for (int i = 0; i < iovcnt; ++i)
2246 past_write += iov[i].iov_len;
2247
2248 if (past_write > vfdP->fileSize)
2249 {
2250 uint64 newTotal = temporary_files_size;
2251
2252 newTotal += past_write - vfdP->fileSize;
2253 if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2254 ereport(ERROR,
2255 (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2256 errmsg("temporary file size exceeds \"temp_file_limit\" (%dkB)",
2257 temp_file_limit)));
2258 }
2259 }
2260
2261retry:
2262 pgstat_report_wait_start(wait_event_info);
2263 returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
2265
2266 if (returnCode >= 0)
2267 {
2268 /*
2269 * Some callers expect short writes to set errno, and traditionally we
2270 * have assumed that they imply disk space shortage. We don't want to
2271 * waste CPU cycles adding up the total size here, so we'll just set
2272 * it for all successful writes in case such a caller determines that
2273 * the write was short and ereports "%m".
2274 */
2275 errno = ENOSPC;
2276
2277 /*
2278 * Maintain fileSize and temporary_files_size if it's a temp file.
2279 */
2280 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2281 {
2282 off_t past_write = offset + returnCode;
2283
2284 if (past_write > vfdP->fileSize)
2285 {
2286 temporary_files_size += past_write - vfdP->fileSize;
2287 vfdP->fileSize = past_write;
2288 }
2289 }
2290 }
2291 else
2292 {
2293 /*
2294 * See comments in FileReadV()
2295 */
2296#ifdef WIN32
2297 DWORD error = GetLastError();
2298
2299 switch (error)
2300 {
2301 case ERROR_NO_SYSTEM_RESOURCES:
2302 pg_usleep(1000L);
2303 errno = EINTR;
2304 break;
2305 default:
2307 break;
2308 }
2309#endif
2310 /* OK to retry if interrupted */
2311 if (errno == EINTR)
2312 goto retry;
2313 }
2314
2315 return returnCode;
2316}
2317
2318int
2319FileSync(File file, uint32 wait_event_info)
2320{
2321 int returnCode;
2322
2323 Assert(FileIsValid(file));
2324
2325 DO_DB(elog(LOG, "FileSync: %d (%s)",
2326 file, VfdCache[file].fileName));
2327
2328 returnCode = FileAccess(file);
2329 if (returnCode < 0)
2330 return returnCode;
2331
2332 pgstat_report_wait_start(wait_event_info);
2333 returnCode = pg_fsync(VfdCache[file].fd);
2335
2336 return returnCode;
2337}
2338
2339/*
2340 * Zero a region of the file.
2341 *
 * 'amount' bytes starting at 'offset' are written via pg_pwrite_zeros().
 * A write that covers fewer than 'amount' bytes is treated as a failure.
 *
2342 * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2343 * appropriate error.
 *
 * (If FileAccess() fails, its negative result is returned as-is.)
 *
 * NOTE(review): listing number 2363 is skipped right after the
 * pg_pwrite_zeros() call, so a source line appears to be missing there --
 * presumably the call that ends the wait event.  Confirm against upstream.
2344 */
2345int
2346FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
2347{
2348 int returnCode;
2349 ssize_t written;
2350
2351 Assert(FileIsValid(file));
2352
2353 DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2354 file, VfdCache[file].fileName,
2355 (int64) offset, (int64) amount));
2356
2357 returnCode = FileAccess(file);
2358 if (returnCode < 0)
2359 return returnCode;
2360
2361 pgstat_report_wait_start(wait_event_info);
2362 written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2364
 /* A negative count is an outright failure; anything short of 'amount' is too. */
2365 if (written < 0)
2366 return -1;
2367 else if (written != amount)
2368 {
2369 /* if errno is unset, assume problem is no disk space */
2370 if (errno == 0)
2371 errno = ENOSPC;
2372 return -1;
2373 }
2374
2375 return 0;
2376}
2377
2378/*
2379 * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
2380 * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
2381 * use FileZero() instead.
2382 *
2383 * Note that at least glibc implements posix_fallocate() in userspace if not
2384 * implemented by the filesystem. That's not the case for all environments
2385 * though.
2386 *
2387 * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2388 * appropriate error.
 *
 * When HAVE_POSIX_FALLOCATE is not defined, the whole function reduces to
 * the FileZero() call at the bottom.
 *
 * NOTE(review): listing number 2409 is skipped right after the
 * posix_fallocate() call, so a source line appears to be missing there --
 * presumably the call that ends the wait event.  Confirm against upstream.
2389 */
2390int
2391FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
2392{
2393#ifdef HAVE_POSIX_FALLOCATE
2394 int returnCode;
2395
2396 Assert(FileIsValid(file));
2397
2398 DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2399 file, VfdCache[file].fileName,
2400 (int64) offset, (int64) amount));
2401
2402 returnCode = FileAccess(file);
2403 if (returnCode < 0)
2404 return -1;
2405
2406retry:
2407 pgstat_report_wait_start(wait_event_info);
2408 returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2410
 /*
  * The result is tested as an error number here (0, EINTR, EINVAL, ...)
  * rather than as a -1/errno pair; an interrupted call is simply retried.
  */
2411 if (returnCode == 0)
2412 return 0;
2413 else if (returnCode == EINTR)
2414 goto retry;
2415
2416 /* for compatibility with %m printing etc */
2417 errno = returnCode;
2418
2419 /*
2420 * Return in cases of a "real" failure, if fallocate is not supported,
2421 * fall through to the FileZero() backed implementation.
2422 */
2423 if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
2424 return -1;
2425#endif
2426
2427 return FileZero(file, offset, amount, wait_event_info);
2428}
2429
2430off_t
2432{
2433 Assert(FileIsValid(file));
2434
2435 DO_DB(elog(LOG, "FileSize %d (%s)",
2436 file, VfdCache[file].fileName));
2437
2438 if (FileIsNotOpen(file))
2439 {
2440 if (FileAccess(file) < 0)
2441 return (off_t) -1;
2442 }
2443
2444 return lseek(VfdCache[file].fd, 0, SEEK_END);
2445}
2446
/*
 * FileTruncate
 *
 * Truncate the file to 'offset' bytes with pg_ftruncate() and return that
 * call's result.  If FileAccess() fails first, its negative result is
 * returned instead and nothing is truncated.
 *
 * After a successful truncation, if the tracked fileSize was larger than
 * 'offset', both fileSize and the temporary_files_size total are reduced
 * to match.
 *
 * NOTE(review): listing number 2463 is skipped right after the
 * pg_ftruncate() call, so a source line appears to be missing there --
 * presumably the call that ends the wait event.  Confirm against upstream.
 */
2447int
2448FileTruncate(File file, off_t offset, uint32 wait_event_info)
2449{
2450 int returnCode;
2451
2452 Assert(FileIsValid(file));
2453
2454 DO_DB(elog(LOG, "FileTruncate %d (%s)",
2455 file, VfdCache[file].fileName));
2456
2457 returnCode = FileAccess(file);
2458 if (returnCode < 0)
2459 return returnCode;
2460
2461 pgstat_report_wait_start(wait_event_info);
2462 returnCode = pg_ftruncate(VfdCache[file].fd, offset);
2464
2465 if (returnCode == 0 && VfdCache[file].fileSize > offset)
2466 {
2467 /* adjust our state for truncation of a temp file */
 /* the Assert below expects a nonzero tracked size only on FD_TEMP_FILE_LIMIT files */
2468 Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2469 temporary_files_size -= VfdCache[file].fileSize - offset;
2470 VfdCache[file].fileSize = offset;
2471 }
2472
2473 return returnCode;
2474}
2475
2476/*
2477 * Return the pathname associated with an open file.
2478 *
2479 * The returned string points to an internal buffer, which is valid until
2480 * the file is closed.
 *
 * The value returned is the fileName field of the file's VfdCache entry;
 * no copy is made.
 *
 * NOTE(review): listing number 2483 (the declarator line between the
 * return type and the opening brace) is missing from this listing.  The
 * body takes a parameter named 'file'; presumably the line read
 * "FilePathName(File file)" -- confirm against upstream.
2481 */
2482char *
2484{
2485 Assert(FileIsValid(file));
2486
2487 return VfdCache[file].fileName;
2488}
2489
2490/*
2491 * Return the raw file descriptor of an opened file.
2492 *
2493 * The returned file descriptor will be valid until the file is closed, but
2494 * there are a lot of things that can make that happen. So the caller should
2495 * be careful not to do much of anything else before it finishes using the
2496 * returned file descriptor.
 *
 * The value returned is the fd field of the file's VfdCache entry, read
 * without calling FileAccess() first.
 *
 * NOTE(review): listing number 2499 (the declarator line) is missing from
 * this listing.  The body takes a parameter named 'file'; presumably the
 * line read "FileGetRawDesc(File file)" -- confirm against upstream.
2497 */
2498int
2500{
2501 Assert(FileIsValid(file));
2502 return VfdCache[file].fd;
2503}
2504
2505/*
2506 * FileGetRawFlags - returns the file flags on open(2)
2507 */
2508int
2510{
2511 Assert(FileIsValid(file));
2512 return VfdCache[file].fileFlags;
2513}
2514
2515/*
2516 * FileGetRawMode - returns the mode bitmask passed to open(2)
2517 */
2518mode_t
2520{
2521 Assert(FileIsValid(file));
2522 return VfdCache[file].fileMode;
2523}
2524
2525/*
2526 * Make room for another allocatedDescs[] array entry if needed and possible.
2527 * Returns true if an array element is available.
2528 */
2529static bool
2531{
2532 AllocateDesc *newDescs;
2533 int newMax;
2534
2535 /* Quick out if array already has a free slot. */
2537 return true;
2538
2539 /*
2540 * If the array hasn't yet been created in the current process, initialize
2541 * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2542 * we will ever need, anyway. We don't want to look at max_safe_fds
2543 * immediately because set_max_safe_fds() may not have run yet.
2544 */
2545 if (allocatedDescs == NULL)
2546 {
2547 newMax = FD_MINFREE / 3;
2548 newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2549 /* Out of memory already? Treat as fatal error. */
2550 if (newDescs == NULL)
2551 ereport(ERROR,
2552 (errcode(ERRCODE_OUT_OF_MEMORY),
2553 errmsg("out of memory")));
2554 allocatedDescs = newDescs;
2555 maxAllocatedDescs = newMax;
2556 return true;
2557 }
2558
2559 /*
2560 * Consider enlarging the array beyond the initial allocation used above.
2561 * By the time this happens, max_safe_fds should be known accurately.
2562 *
2563 * We mustn't let allocated descriptors hog all the available FDs, and in
2564 * practice we'd better leave a reasonable number of FDs for VFD use. So
2565 * set the maximum to max_safe_fds / 3. (This should certainly be at
2566 * least as large as the initial size, FD_MINFREE / 3, so we aren't
2567 * tightening the restriction here.) Recall that "external" FDs are
2568 * allowed to consume another third of max_safe_fds.
2569 */
2570 newMax = max_safe_fds / 3;
2571 if (newMax > maxAllocatedDescs)
2572 {
2573 newDescs = (AllocateDesc *) realloc(allocatedDescs,
2574 newMax * sizeof(AllocateDesc));
2575 /* Treat out-of-memory as a non-fatal error. */
2576 if (newDescs == NULL)
2577 return false;
2578 allocatedDescs = newDescs;
2579 maxAllocatedDescs = newMax;
2580 return true;
2581 }
2582
2583 /* Can't enlarge allocatedDescs[] any more. */
2584 return false;
2585}
2586
2587/*
2588 * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2589 * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2590 * necessary to open the file. When done, call FreeFile rather than fclose.
2591 *
2592 * Note that files that will be open for any significant length of time
2593 * should NOT be handled this way, since they cannot share kernel file
2594 * descriptors with other files; there is grave risk of running out of FDs
2595 * if anyone locks down too many FDs. Most callers of this routine are
2596 * simply reading a config file that they will read and close immediately.
2597 *
2598 * fd.c will automatically close all files opened with AllocateFile at
2599 * transaction commit or abort; this prevents FD leakage if a routine
2600 * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2601 *
2602 * Ideally this should be the *only* direct call of fopen() in the backend.
 *
 * Returns the opened stream, or NULL with errno set if fopen() fails.
 * Raises ERROR if no allocatedDescs[] slot can be reserved.
 *
 * NOTE(review): this listing has lost several source lines, visible as gaps
 * in the embedded numbering: 2610 (the argument line of the DO_DB call),
 * 2617 (the argument line of the errmsg call), 2620 (the statement under
 * "Close excess kernel FDs"), 2625 (presumably the declaration of 'desc'),
 * and 2629-2630 (presumably the remaining bookkeeping for the new entry).
 * As shown, the function is not compilable; restore those lines from the
 * upstream file.
2603 */
2604FILE *
2605AllocateFile(const char *name, const char *mode)
2606{
2607 FILE *file;
2608
2609 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2611
2612 /* Can we allocate another non-virtual FD? */
2613 if (!reserveAllocatedDesc())
2614 ereport(ERROR,
2615 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2616 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2618
2619 /* Close excess kernel FDs. */
2621
2622TryAgain:
2623 if ((file = fopen(name, mode)) != NULL)
2624 {
2626
2627 desc->kind = AllocateDescFile;
2628 desc->desc.file = file;
2631 return desc->desc.file;
2632 }
2633
 /*
  * Out of descriptors: log it, then retry if ReleaseLruFile() reports
  * success.  errno is saved first because the logging and release steps
  * may change it; it is put back before returning failure.
  */
2634 if (errno == EMFILE || errno == ENFILE)
2635 {
2636 int save_errno = errno;
2637
2638 ereport(LOG,
2639 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2640 errmsg("out of file descriptors: %m; release and retry")));
2641 errno = 0;
2642 if (ReleaseLruFile())
2643 goto TryAgain;
2644 errno = save_errno;
2645 }
2646
2647 return NULL;
2648}
2649
2650/*
2651 * Open a file with OpenTransientFilePerm() and pass default file mode for
2652 * the fileMode parameter.
2653 */
2654int
2655OpenTransientFile(const char *fileName, int fileFlags)
2656{
2657 return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2658}
2659
2660/*
2661 * Like AllocateFile, but returns an unbuffered fd like open(2)
 *
 * The file is opened with BasicOpenFilePerm(fileName, fileFlags, fileMode).
 * Returns the new descriptor, or -1 if the open fails.  Raises ERROR if no
 * allocatedDescs[] slot can be reserved.
 *
 * NOTE(review): this listing has lost several source lines, visible as gaps
 * in the embedded numbering: 2679 (the statement under "Close excess kernel
 * FDs"), 2685 (presumably the declaration of 'desc'), and 2689-2690
 * (presumably the remaining bookkeeping for the new entry).  Restore them
 * from the upstream file.
2662 */
2663int
2664OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2665{
2666 int fd;
2667
2668 DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2669 numAllocatedDescs, fileName));
2670
2671 /* Can we allocate another non-virtual FD? */
2672 if (!reserveAllocatedDesc())
2673 ereport(ERROR,
2674 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2675 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2676 maxAllocatedDescs, fileName)));
2677
2678 /* Close excess kernel FDs. */
2680
2681 fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2682
2683 if (fd >= 0)
2684 {
2686
2687 desc->kind = AllocateDescRawFD;
2688 desc->desc.fd = fd;
2691
2692 return fd;
2693 }
2694
2695 return -1; /* failure */
2696}
2697
2698/*
2699 * Routines that want to initiate a pipe stream should use OpenPipeStream
2700 * rather than plain popen(). This lets fd.c deal with freeing FDs if
2701 * necessary. When done, call ClosePipeStream rather than pclose.
2702 *
2703 * This function also ensures that the popen'd program is run with default
2704 * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2705 * uses. This ensures desirable response to, eg, closing a read pipe early.
 *
 * Returns the opened stream, or NULL with errno set if popen() fails.
 * Raises ERROR if no allocatedDescs[] slot can be reserved.
 *
 * NOTE(review): this listing has lost several source lines, visible as gaps
 * in the embedded numbering: 2724 (the statement under "Close excess kernel
 * FDs"), 2736 (presumably the declaration of 'desc'), and 2740-2741
 * (presumably the remaining bookkeeping for the new entry).  Restore them
 * from the upstream file.
2706 */
2707FILE *
2708OpenPipeStream(const char *command, const char *mode)
2709{
2710 FILE *file;
2711 int save_errno;
2712
2713 DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2714 numAllocatedDescs, command));
2715
2716 /* Can we allocate another non-virtual FD? */
2717 if (!reserveAllocatedDesc())
2718 ereport(ERROR,
2719 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2720 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2721 maxAllocatedDescs, command)));
2722
2723 /* Close excess kernel FDs. */
2725
2726TryAgain:
 /*
  * All stdio output streams are flushed before popen().  SIGPIPE is set to
  * SIG_DFL only around the popen() call and put back to SIG_IGN right
  * after; popen()'s errno is saved across that second pqsignal() call.
  */
2727 fflush(NULL);
2728 pqsignal(SIGPIPE, SIG_DFL);
2729 errno = 0;
2730 file = popen(command, mode);
2731 save_errno = errno;
2732 pqsignal(SIGPIPE, SIG_IGN);
2733 errno = save_errno;
2734 if (file != NULL)
2735 {
2737
2738 desc->kind = AllocateDescPipe;
2739 desc->desc.file = file;
2742 return desc->desc.file;
2743 }
2744
 /* Out of descriptors: log, release one LRU file if possible, and retry. */
2745 if (errno == EMFILE || errno == ENFILE)
2746 {
2747 ereport(LOG,
2748 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2749 errmsg("out of file descriptors: %m; release and retry")));
2750 if (ReleaseLruFile())
2751 goto TryAgain;
2752 errno = save_errno;
2753 }
2754
2755 return NULL;
2756}
2757
2758/*
2759 * Free an AllocateDesc of any type.
2760 *
2761 * The argument *must* point into the allocatedDescs[] array.
 *
 * The underlying object is closed with the function matching its kind
 * (fclose, pclose, closedir or close) and that function's result is
 * returned.  An unrecognized kind raises ERROR.
 *
 * NOTE(review): two pieces are missing from this listing, visible as gaps
 * in the embedded numbering: 2764 (the declarator line; callers invoke it
 * as FreeDesc(desc) with an AllocateDesc pointer) and 2790-2791 (the
 * statements under "Compact storage in the allocatedDescs array").  As
 * shown, the entry is closed but never removed from the array; restore the
 * lines from the upstream file.
2762 */
2763static int
2765{
2766 int result;
2767
2768 /* Close the underlying object */
2769 switch (desc->kind)
2770 {
2771 case AllocateDescFile:
2772 result = fclose(desc->desc.file);
2773 break;
2774 case AllocateDescPipe:
2775 result = pclose(desc->desc.file);
2776 break;
2777 case AllocateDescDir:
2778 result = closedir(desc->desc.dir);
2779 break;
2780 case AllocateDescRawFD:
2781 result = close(desc->desc.fd);
2782 break;
2783 default:
2784 elog(ERROR, "AllocateDesc kind not recognized");
2785 result = 0; /* keep compiler quiet */
2786 break;
2787 }
2788
2789 /* Compact storage in the allocatedDescs array */
2792
2793 return result;
2794}
2795
2796/*
2797 * Close a file returned by AllocateFile.
2798 *
2799 * Note we do not check fclose's return value --- it is up to the caller
2800 * to handle close errors.
2801 */
2802int
2803FreeFile(FILE *file)
2804{
2805 int i;
2806
2807 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2808
2809 /* Remove file from list of allocated files, if it's present */
2810 for (i = numAllocatedDescs; --i >= 0;)
2811 {
2812 AllocateDesc *desc = &allocatedDescs[i];
2813
2814 if (desc->kind == AllocateDescFile && desc->desc.file == file)
2815 return FreeDesc(desc);
2816 }
2817
2818 /* Only get here if someone passes us a file not in allocatedDescs */
2819 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2820
2821 return fclose(file);
2822}
2823
2824/*
2825 * Close a file returned by OpenTransientFile.
2826 *
2827 * Note we do not check close's return value --- it is up to the caller
2828 * to handle close errors.
2829 */
2830int
2832{
2833 int i;
2834
2835 DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2836
2837 /* Remove fd from list of allocated files, if it's present */
2838 for (i = numAllocatedDescs; --i >= 0;)
2839 {
2840 AllocateDesc *desc = &allocatedDescs[i];
2841
2842 if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2843 return FreeDesc(desc);
2844 }
2845
2846 /* Only get here if someone passes us a file not in allocatedDescs */
2847 elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2848
2849 return close(fd);
2850}
2851
2852/*
2853 * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2854 * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2855 * necessary to open the directory, and with closing it after an elog.
2856 * When done, call FreeDir rather than closedir.
2857 *
2858 * Returns NULL, with errno set, on failure. Note that failure detection
2859 * is commonly left to the following call of ReadDir or ReadDirExtended;
2860 * see the comments for ReadDir.
2861 *
2862 * Ideally this should be the *only* direct call of opendir() in the backend.
 *
 * Raises ERROR if no allocatedDescs[] slot can be reserved.
 *
 * NOTE(review): this listing has lost several source lines, visible as gaps
 * in the embedded numbering: 2880 (the statement under "Close excess kernel
 * FDs"), 2885 (presumably the declaration of 'desc'), and 2889-2890
 * (presumably the remaining bookkeeping for the new entry).  Restore them
 * from the upstream file.
2863 */
2864DIR *
2865AllocateDir(const char *dirname)
2866{
2867 DIR *dir;
2868
2869 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2870 numAllocatedDescs, dirname));
2871
2872 /* Can we allocate another non-virtual FD? */
2873 if (!reserveAllocatedDesc())
2874 ereport(ERROR,
2875 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2876 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2877 maxAllocatedDescs, dirname)));
2878
2879 /* Close excess kernel FDs. */
2881
2882TryAgain:
2883 if ((dir = opendir(dirname)) != NULL)
2884 {
2886
2887 desc->kind = AllocateDescDir;
2888 desc->desc.dir = dir;
2891 return desc->desc.dir;
2892 }
2893
 /*
  * Out of descriptors: log it, then retry if ReleaseLruFile() reports
  * success.  errno is saved first and put back before returning failure.
  */
2894 if (errno == EMFILE || errno == ENFILE)
2895 {
2896 int save_errno = errno;
2897
2898 ereport(LOG,
2899 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2900 errmsg("out of file descriptors: %m; release and retry")));
2901 errno = 0;
2902 if (ReleaseLruFile())
2903 goto TryAgain;
2904 errno = save_errno;
2905 }
2906
2907 return NULL;
2908}
2909
2910/*
2911 * Read a directory opened with AllocateDir, ereport'ing any error.
2912 *
2913 * This is easier to use than raw readdir() since it takes care of some
2914 * otherwise rather tedious and error-prone manipulation of errno. Also,
2915 * if you are happy with a generic error message for AllocateDir failure,
2916 * you can just do
2917 *
2918 * dir = AllocateDir(path);
2919 * while ((dirent = ReadDir(dir, path)) != NULL)
2920 * process dirent;
2921 * FreeDir(dir);
2922 *
2923 * since a NULL dir parameter is taken as indicating AllocateDir failed.
2924 * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2925 * use this shortcut.)
2926 *
2927 * The pathname passed to AllocateDir must be passed to this routine too,
2928 * but it is only used for error reporting.
2929 */
2930struct dirent *
2931ReadDir(DIR *dir, const char *dirname)
2932{
2933 return ReadDirExtended(dir, dirname, ERROR);
2934}
2935
2936/*
2937 * Alternate version of ReadDir that allows caller to specify the elevel
2938 * for any error report (whether it's reporting an initial failure of
2939 * AllocateDir or a subsequent directory read failure).
2940 *
2941 * If elevel < ERROR, returns NULL after any error. With the normal coding
2942 * pattern, this will result in falling out of the loop immediately as
2943 * though the directory contained no (more) entries.
 *
 * NULL is also the normal end-of-directory result: readdir() returning
 * NULL with errno still 0 is not reported as an error.
 *
 * NOTE(review): listing numbers 2954 and 2966 are skipped inside the two
 * ereport() calls, so the first argument line of each report is missing
 * (the closing parentheses no longer balance as shown).  Restore them from
 * the upstream file.
2944 */
2945struct dirent *
2946ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2947{
2948 struct dirent *dent;
2949
2950 /* Give a generic message for AllocateDir failure, if caller didn't */
2951 if (dir == NULL)
2952 {
2953 ereport(elevel,
2955 errmsg("could not open directory \"%s\": %m",
2956 dirname)));
2957 return NULL;
2958 }
2959
 /* clear errno so that a NULL result can be told apart from a read error */
2960 errno = 0;
2961 if ((dent = readdir(dir)) != NULL)
2962 return dent;
2963
2964 if (errno)
2965 ereport(elevel,
2967 errmsg("could not read directory \"%s\": %m",
2968 dirname)));
2969 return NULL;
2970}
2971
2972/*
2973 * Close a directory opened with AllocateDir.
2974 *
2975 * Returns closedir's return value (with errno set if it's not 0).
2976 * Note we do not check the return value --- it is up to the caller
2977 * to handle close errors if wanted.
2978 *
2979 * Does nothing if dir == NULL; we assume that directory open failure was
2980 * already reported if desired.
2981 */
2982int
2984{
2985 int i;
2986
2987 /* Nothing to do if AllocateDir failed */
2988 if (dir == NULL)
2989 return 0;
2990
2991 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2992
2993 /* Remove dir from list of allocated dirs, if it's present */
2994 for (i = numAllocatedDescs; --i >= 0;)
2995 {
2996 AllocateDesc *desc = &allocatedDescs[i];
2997
2998 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2999 return FreeDesc(desc);
3000 }
3001
3002 /* Only get here if someone passes us a dir not in allocatedDescs */
3003 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
3004
3005 return closedir(dir);
3006}
3007
3008
3009/*
3010 * Close a pipe stream returned by OpenPipeStream.
3011 */
3012int
3014{
3015 int i;
3016
3017 DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
3018
3019 /* Remove file from list of allocated files, if it's present */
3020 for (i = numAllocatedDescs; --i >= 0;)
3021 {
3022 AllocateDesc *desc = &allocatedDescs[i];
3023
3024 if (desc->kind == AllocateDescPipe && desc->desc.file == file)
3025 return FreeDesc(desc);
3026 }
3027
3028 /* Only get here if someone passes us a file not in allocatedDescs */
3029 elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
3030
3031 return pclose(file);
3032}
3033
3034/*
3035 * closeAllVfds
3036 *
3037 * Force all VFDs into the physically-closed state, so that the fewest
3038 * possible number of kernel file descriptors are in use. There is no
3039 * change in the logical state of the VFDs.
3040 */
3041void
3043{
3044 Index i;
3045
3046 if (SizeVfdCache > 0)
3047 {
3048 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3049 for (i = 1; i < SizeVfdCache; i++)
3050 {
3051 if (!FileIsNotOpen(i))
3052 LruDelete(i);
3053 }
3054 }
3055}
3056
3057
3058/*
3059 * SetTempTablespaces
3060 *
3061 * Define a list (actually an array) of OIDs of tablespaces to use for
3062 * temporary files. This list will be used until end of transaction,
3063 * unless this function is called again before then. It is caller's
3064 * responsibility that the passed-in array has adequate lifespan (typically
3065 * it'd be allocated in TopTransactionContext).
3066 *
3067 * Some entries of the array may be InvalidOid, indicating that the current
3068 * database's default tablespace should be used.
 *
 * The array pointer is stored as-is (not copied) in tempTableSpaces, and
 * numSpaces (which must be >= 0) in numTempTableSpaces.
 *
 * NOTE(review): listing numbers 3086 and 3089 are skipped, so the statement
 * under "if (numSpaces > 1)" has lost its first line and the statement under
 * "else" is missing entirely.  Per the comment below they choose the starting
 * position in the list; restore them from the upstream file.
3069 */
3070void
3071SetTempTablespaces(Oid *tableSpaces, int numSpaces)
3072{
3073 Assert(numSpaces >= 0);
3074 tempTableSpaces = tableSpaces;
3075 numTempTableSpaces = numSpaces;
3076
3077 /*
3078 * Select a random starting point in the list. This is to minimize
3079 * conflicts between backends that are most likely sharing the same list
3080 * of temp tablespaces. Note that if we create multiple temp files in the
3081 * same transaction, we'll advance circularly through the list --- this
3082 * ensures that large temporary sort files are nicely spread across all
3083 * available tablespaces.
3084 */
3085 if (numSpaces > 1)
3087 0, numSpaces - 1);
3088 else
3090}
3091
3092/*
3093 * TempTablespacesAreSet
3094 *
3095 * Returns true if SetTempTablespaces has been called in current transaction.
3096 * (This is just so that tablespaces.c doesn't need its own per-transaction
3097 * state.)
3098 */
3099bool
3101{
3102 return (numTempTableSpaces >= 0);
3103}
3104
3105/*
3106 * GetTempTablespaces
3107 *
3108 * Populate an array with the OIDs of the tablespaces that should be used for
3109 * temporary files. (Some entries may be InvalidOid, indicating that the
3110 * current database's default tablespace should be used.) At most numSpaces
3111 * entries will be filled.
3112 * Returns the number of OIDs that were copied into the output array.
 *
 * Entries are copied in order from the start of tempTableSpaces[]; the
 * count copied is the smaller of numTempTableSpaces and numSpaces.
 *
 * NOTE(review): listing number 3119 is skipped just before the loop, so one
 * source line appears to be missing there -- confirm against upstream.
3113 */
3114int
3115GetTempTablespaces(Oid *tableSpaces, int numSpaces)
3116{
3117 int i;
3118
3120 for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
3121 tableSpaces[i] = tempTableSpaces[i];
3122
3123 return i;
3124}
3125
3126/*
3127 * GetNextTempTableSpace
3128 *
3129 * Select the next temp tablespace to use. A result of InvalidOid means
3130 * to use the current database's default tablespace.
 *
 * InvalidOid is what the visible code returns whenever numTempTableSpaces
 * is not positive.
 *
 * NOTE(review): this listing has lost listing number 3133 (the declarator
 * line) and 3138-3140 (the statements that advance the counter and return
 * the selected entry).  As shown, the function is not compilable and has no
 * non-default return; restore those lines from the upstream file.
3131 */
3132Oid
3134{
3135 if (numTempTableSpaces > 0)
3136 {
3137 /* Advance nextTempTableSpace counter with wraparound */
3141 }
3142 return InvalidOid;
3143}
3144
3145
3146/*
3147 * AtEOSubXact_Files
3148 *
3149 * Take care of subtransaction commit/abort. At abort, we close temp files
3150 * that the subtransaction may have opened. At commit, we reassign the
3151 * files that were opened to the parent subtransaction.
 *
 * Only allocatedDescs[] entries whose create_subid equals mySubid are
 * touched; on commit their create_subid is changed to parentSubid.
 *
 * NOTE(review): this listing has lost listing number 3154 (the first line
 * of the declarator; the body uses parameters isCommit and mySubid in
 * addition to parentSubid) and 3168 (the statement in the abort branch,
 * which per the comment there frees the entry and rechecks the same index).
 * Restore those lines from the upstream file.
3152 */
3153void
3155 SubTransactionId parentSubid)
3156{
3157 Index i;
3158
3159 for (i = 0; i < numAllocatedDescs; i++)
3160 {
3161 if (allocatedDescs[i].create_subid == mySubid)
3162 {
3163 if (isCommit)
3164 allocatedDescs[i].create_subid = parentSubid;
3165 else
3166 {
3167 /* have to recheck the item after FreeDesc (ugly) */
3169 }
3170 }
3171 }
3172}
3173
3174/*
3175 * AtEOXact_Files
3176 *
3177 * This routine is called during transaction commit or abort. All still-open
3178 * per-transaction temporary file VFDs are closed, which also causes the
3179 * underlying files to be deleted (although they should've been closed already
3180 * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
3181 * closed. We also forget any transaction-local temp tablespace list.
3182 *
3183 * The isCommit flag is used only to decide whether to emit warnings about
3184 * unclosed files.
3185 */
3186void
3187AtEOXact_Files(bool isCommit)
3188{
3189 CleanupTempFiles(isCommit, false);
3190 tempTableSpaces = NULL;
3191 numTempTableSpaces = -1;
3192}
3193
3194/*
3195 * BeforeShmemExit_Files
3196 *
3197 * before_shmem_exit hook to clean up temp files during backend shutdown.
3198 * Here, we want to clean up *all* temp files including interXact ones.
 *
 * CleanupTempFiles() is called with isCommit = false and isProcExit = true.
 * In assert-enabled builds, temporary_files_allowed is then cleared.
 *
 * NOTE(review): listing number 3201 (the declarator line, including the
 * parameter list) is missing from this listing; restore it from the
 * upstream file.
3199 */
3200static void
3202{
3203 CleanupTempFiles(false, true);
3204
3205 /* prevent further temp files from being created */
3206#ifdef USE_ASSERT_CHECKING
3207 temporary_files_allowed = false;
3208#endif
3209}
3210
3211/*
3212 * Close temporary files and delete their underlying files.
3213 *
3214 * isCommit: if true, this is normal transaction commit, and we don't
3215 * expect any remaining files; warn if there are some.
3216 *
3217 * isProcExit: if true, this is being called as the backend process is
3218 * exiting. If that's the case, we should remove all temporary files; if
3219 * that's not the case, we are being called for transaction commit/abort
3220 * and should only remove transaction-local temp files. In either case,
3221 * also clean up "allocated" stdio files, dirs and fds.
 *
 * NOTE(review): this listing has lost listing numbers 3261 (a statement
 * after the VFD scan loop), 3267 (the argument line of the WARNING about
 * unclosed allocated files) and 3271 (the body of the final while loop,
 * which must reduce numAllocatedDescs for the loop to end).  Restore those
 * lines from the upstream file.
3222 */
3223static void
3224CleanupTempFiles(bool isCommit, bool isProcExit)
3225{
3226 Index i;
3227
3228 /*
3229 * Careful here: at proc_exit we need extra cleanup, not just
3230 * xact_temporary files.
3231 */
3232 if (isProcExit || have_xact_temporary_files)
3233 {
3234 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3235 for (i = 1; i < SizeVfdCache; i++)
3236 {
3237 unsigned short fdstate = VfdCache[i].fdstate;
3238
 /* only in-use entries (fileName set) carrying one of the two flags matter */
3239 if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3240 VfdCache[i].fileName != NULL)
3241 {
3242 /*
3243 * If we're in the process of exiting a backend process, close
3244 * all temporary files. Otherwise, only close temporary files
3245 * local to the current transaction. They should be closed by
3246 * the ResourceOwner mechanism already, so this is just a
3247 * debugging cross-check.
3248 */
3249 if (isProcExit)
3250 FileClose(i);
3251 else if (fdstate & FD_CLOSE_AT_EOXACT)
3252 {
3253 elog(WARNING,
3254 "temporary file %s not closed at end-of-transaction",
3255 VfdCache[i].fileName);
3256 FileClose(i);
3257 }
3258 }
3259 }
3260
3262 }
3263
3264 /* Complain if any allocated files remain open at commit. */
3265 if (isCommit && numAllocatedDescs > 0)
3266 elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3268
3269 /* Clean up "allocated" stdio files, dirs and fds. */
3270 while (numAllocatedDescs > 0)
3272}
3273
3274
3275/*
3276 * Remove temporary and temporary relation files left over from a prior
3277 * postmaster session
3278 *
3279 * This should be called during postmaster startup. It will forcibly
3280 * remove any leftover files created by OpenTemporaryFile and any leftover
3281 * temporary relation files created by mdcreate.
3282 *
3283 * During post-backend-crash restart cycle, this routine is called when
3284 * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3285 * queries are using temp files could result in useless storage usage that can
3286 * only be reclaimed by a service restart. The argument against enabling it is
3287 * that someone might want to examine the temporary files for debugging
3288 * purposes. This does however mean that OpenTemporaryFile had better allow for
3289 * collision with an existing temp file name.
3290 *
3291 * NOTE: this function and its subroutines generally report syscall failures
3292 * with ereport(LOG) and keep going. Removing temp files is not so critical
3293 * that we should fail to start the database when we can't do it.
 *
 * NOTE(review): this listing has lost listing numbers 3296 (the declarator
 * line), 3307 (a statement after the pg_default temp-file pass), 3321-3322
 * and 3326 (the argument lines of the two snprintf calls in the loop).
 * Restore those lines from the upstream file.
3294 */
3295void
3297{
3298 char temp_path[MAXPGPATH + sizeof(PG_TBLSPC_DIR) + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3299 DIR *spc_dir;
3300 struct dirent *spc_de;
3301
3302 /*
3303 * First process temp files in pg_default ($PGDATA/base)
3304 */
3305 snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3306 RemovePgTempFilesInDir(temp_path, true, false);
3308
3309 /*
3310 * Cycle through temp directories for all non-default tablespaces.
3311 */
3312 spc_dir = AllocateDir(PG_TBLSPC_DIR);
3313
 /* a failed AllocateDir is reported at LOG level by ReadDirExtended */
3314 while ((spc_de = ReadDirExtended(spc_dir, PG_TBLSPC_DIR, LOG)) != NULL)
3315 {
3316 if (strcmp(spc_de->d_name, ".") == 0 ||
3317 strcmp(spc_de->d_name, "..") == 0)
3318 continue;
3319
3320 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s/%s",
3323 RemovePgTempFilesInDir(temp_path, true, false);
3324
3325 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
3327 RemovePgTempRelationFiles(temp_path);
3328 }
3329
3330 FreeDir(spc_dir);
3331
3332 /*
3333 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3334 * DataDir as well. However, that is *not* cleaned here because doing so
3335 * would create a race condition. It's done separately, earlier in
3336 * postmaster startup.
3337 */
3338}
3339
3340/*
3341 * Process one pgsql_tmp directory for RemovePgTempFiles.
3342 *
3343 * If missing_ok is true, it's all right for the named directory to not exist.
3344 * Any other problem results in a LOG message. (missing_ok should be true at
3345 * the top level, since pgsql_tmp directories are not created until needed.)
3346 *
3347 * At the top level, this should be called with unlink_all = false, so that
3348 * only files matching the temporary name prefix will be unlinked. When
3349 * recursing it will be called with unlink_all = true to unlink everything
3350 * under a top-level temporary directory.
3351 *
3352 * (These two flags could be replaced by one, but it seems clearer to keep
3353 * them separate.)
 *
 * Entries that are directories are emptied recursively and then removed
 * with rmdir(); everything else is removed with unlink().  Entries that do
 * not qualify for removal are only reported at LOG level.
 *
 * NOTE(review): this listing has lost listing numbers 3378 (the second
 * argument of the strncmp call), 3392 and 3400 (the first argument line of
 * the two removal-failure reports).  Restore those lines from the upstream
 * file.
3354 */
3355void
3356RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3357{
3358 DIR *temp_dir;
3359 struct dirent *temp_de;
3360 char rm_path[MAXPGPATH * 2];
3361
3362 temp_dir = AllocateDir(tmpdirname);
3363
 /* a nonexistent directory is silently accepted only when missing_ok */
3364 if (temp_dir == NULL && errno == ENOENT && missing_ok)
3365 return;
3366
3367 while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3368 {
3369 if (strcmp(temp_de->d_name, ".") == 0 ||
3370 strcmp(temp_de->d_name, "..") == 0)
3371 continue;
3372
3373 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3374 tmpdirname, temp_de->d_name);
3375
3376 if (unlink_all ||
3377 strncmp(temp_de->d_name,
3379 strlen(PG_TEMP_FILE_PREFIX)) == 0)
3380 {
3381 PGFileType type = get_dirent_type(rm_path, temp_de, false, LOG);
3382
3383 if (type == PGFILETYPE_ERROR)
3384 continue;
3385 else if (type == PGFILETYPE_DIR)
3386 {
3387 /* recursively remove contents, then directory itself */
3388 RemovePgTempFilesInDir(rm_path, false, true);
3389
3390 if (rmdir(rm_path) < 0)
3391 ereport(LOG,
3393 errmsg("could not remove directory \"%s\": %m",
3394 rm_path)));
3395 }
3396 else
3397 {
3398 if (unlink(rm_path) < 0)
3399 ereport(LOG,
3401 errmsg("could not remove file \"%s\": %m",
3402 rm_path)));
3403 }
3404 }
3405 else
3406 ereport(LOG,
3407 (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3408 rm_path)));
3409 }
3410
3411 FreeDir(temp_dir);
3412}
3413
/*
 * Process one tablespace directory, look for per-DB subdirectories
 *
 * Each entry of tsdirname whose name consists solely of decimal digits is
 * taken to be a per-database directory, and its path ("tsdirname/name") is
 * built in dbspace_path.  Directory open or read failures are reported at
 * LOG level by ReadDirExtended and end the scan.
 *
 * NOTE(review): listing number 3436 is skipped right after the snprintf
 * call, so the statement that actually processes dbspace_path is missing
 * from this listing; restore it from the upstream file.
 */
3415static void
3416RemovePgTempRelationFiles(const char *tsdirname)
3417{
3418 DIR *ts_dir;
3419 struct dirent *de;
3420 char dbspace_path[MAXPGPATH * 2];
3421
3422 ts_dir = AllocateDir(tsdirname);
3423
3424 while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3425 {
3426 /*
3427 * We're only interested in the per-database directories, which have
3428 * numeric names. Note that this code will also (properly) ignore "."
3429 * and "..".
3430 */
3431 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3432 continue;
3433
3434 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3435 tsdirname, de->d_name);
3437 }
3438
3439 FreeDir(ts_dir);
3440}
3441
/*
 * Process one per-dbspace directory for RemovePgTempRelationFiles
 *
 * Entries of dbspacedirname that pass the (missing, see below) filter are
 * removed with unlink(); a failed unlink is reported at LOG level and the
 * scan continues.
 *
 * NOTE(review): listing numbers 3454 (the condition guarding "continue",
 * i.e. the test deciding which entries are skipped) and 3462 (the first
 * argument line of the failure report) are missing from this listing.  As
 * shown, "continue" is unconditional; restore the lines from the upstream
 * file.
 */
3443static void
3444RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3445{
3446 DIR *dbspace_dir;
3447 struct dirent *de;
3448 char rm_path[MAXPGPATH * 2];
3449
3450 dbspace_dir = AllocateDir(dbspacedirname);
3451
3452 while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3453 {
3455 continue;
3456
3457 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3458 dbspacedirname, de->d_name);
3459
3460 if (unlink(rm_path) < 0)
3461 ereport(LOG,
3463 errmsg("could not remove file \"%s\": %m",
3464 rm_path)));
3465 }
3466
3467 FreeDir(dbspace_dir);
3468}
3469
/*
 * t<digits>_<digits>, or t<digits>_<digits>_<forkname>
 *
 * Returns true if 'name' has one of the above shapes, optionally followed
 * by ".<digits>" (a segment suffix), and nothing else; false otherwise.
 * The fork-name part is validated by forkname_chars(), which must report a
 * positive length.
 *
 * NOTE(review): listing number 3472 (the declarator line) is missing from
 * this listing.  The body takes a string parameter called 'name';
 * presumably the line read "looks_like_temp_rel_name(const char *name)" --
 * confirm against upstream.
 */
3471bool
3473{
3474 int pos;
3475 int savepos;
3476
3477 /* Must start with "t". */
3478 if (name[0] != 't')
3479 return false;
3480
3481 /* Followed by a non-empty string of digits and then an underscore. */
3482 for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
3483 ;
3484 if (pos == 1 || name[pos] != '_')
3485 return false;
3486
3487 /* Followed by another nonempty string of digits. */
3488 for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
3489 ;
3490 if (savepos == pos)
3491 return false;
3492
3493 /* We might have _forkname or .segment or both. */
3494 if (name[pos] == '_')
3495 {
3496 int forkchar = forkname_chars(&name[pos + 1], NULL);
3497
3498 if (forkchar <= 0)
3499 return false;
 /* step over the underscore plus the fork name */
3500 pos += forkchar + 1;
3501 }
3502 if (name[pos] == '.')
3503 {
3504 int segchar;
3505
 /* segchar starts at 1 to account for the '.', so <= 1 means no digits */
3506 for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
3507 ;
3508 if (segchar <= 1)
3509 return false;
3510 pos += segchar;
3511 }
3512
3513 /* Now we should be at the end. */
3514 if (name[pos] != '\0')
3515 return false;
3516 return true;
3517}
3518
#ifdef HAVE_SYNCFS
/*
 * Issue syncfs() on the filesystem that contains "path".
 *
 * Failure to open the path or to sync is reported at LOG level and is not
 * treated as fatal.  The descriptor obtained from OpenTransientFile() is
 * always released again before returning.
 */
static void
do_syncfs(const char *path)
{
	int			fd;

	ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
							 path);

	fd = OpenTransientFile(path, O_RDONLY);
	if (fd < 0)
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
		return;
	}
	if (syncfs(fd) < 0)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not synchronize file system for file \"%s\": %m", path)));

	/* Release the descriptor whether or not syncfs() succeeded. */
	CloseTransientFile(fd);
}
#endif
3543
3544/*
3545 * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3546 * all potential filesystem, depending on recovery_init_sync_method setting.
3547 *
3548 * We fsync regular files and directories wherever they are, but we
3549 * follow symlinks only for pg_wal and immediately under pg_tblspc.
3550 * Other symlinks are presumed to point at files we're not responsible
3551 * for fsyncing, and might not have privileges to write at all.
3552 *
3553 * Errors are logged but not considered fatal; that's because this is used
3554 * only during database startup, to deal with the possibility that there are
3555 * issued-but-unsynced writes pending against the data directory. We want to
3556 * ensure that such writes reach disk before anything that's done in the new
3557 * run. However, aborting on error would result in failure to start for
3558 * harmless cases such as read-only files in the data directory, and that's
3559 * not good either.
3560 *
3561 * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3562 * rewriting all changes again during recovery.
3563 *
3564 * Note we assume we're chdir'd into PGDATA to begin with.
3565 */
3566void
3568{
3569 bool xlog_is_symlink;
3570
3571 /* We can skip this whole thing if fsync is disabled. */
3572 if (!enableFsync)
3573 return;
3574
3575 /*
3576 * If pg_wal is a symlink, we'll need to recurse into it separately,
3577 * because the first walkdir below will ignore it.
3578 */
3579 xlog_is_symlink = false;
3580
3581 {
3582 struct stat st;
3583
3584 if (lstat("pg_wal", &st) < 0)
3585 ereport(LOG,
3587 errmsg("could not stat file \"%s\": %m",
3588 "pg_wal")));
3589 else if (S_ISLNK(st.st_mode))
3590 xlog_is_symlink = true;
3591 }
3592
3593#ifdef HAVE_SYNCFS
3595 {
3596 DIR *dir;
3597 struct dirent *de;
3598
3599 /*
3600 * On Linux, we don't have to open every single file one by one. We
3601 * can use syncfs() to sync whole filesystems. We only expect
3602 * filesystem boundaries to exist where we tolerate symlinks, namely
3603 * pg_wal and the tablespaces, so we call syncfs() for each of those
3604 * directories.
3605 */
3606
3607 /* Prepare to report progress syncing the data directory via syncfs. */
3609
3610 /* Sync the top level pgdata directory. */
3611 do_syncfs(".");
3612 /* If any tablespaces are configured, sync each of those. */
3614 while ((de = ReadDirExtended(dir, PG_TBLSPC_DIR, LOG)))
3615 {
3616 char path[MAXPGPATH];
3617
3618 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3619 continue;
3620
3621 snprintf(path, MAXPGPATH, "%s/%s", PG_TBLSPC_DIR, de->d_name);
3622 do_syncfs(path);
3623 }
3624 FreeDir(dir);
3625 /* If pg_wal is a symlink, process that too. */
3626 if (xlog_is_symlink)
3627 do_syncfs("pg_wal");
3628 return;
3629 }
3630#endif /* !HAVE_SYNCFS */
3631
3632#ifdef PG_FLUSH_DATA_WORKS
3633 /* Prepare to report progress of the pre-fsync phase. */
3635
3636 /*
3637 * If possible, hint to the kernel that we're soon going to fsync the data
3638 * directory and its contents. Errors in this step are even less
3639 * interesting than normal, so log them only at DEBUG1.
3640 */
3641 walkdir(".", pre_sync_fname, false, DEBUG1);
3642 if (xlog_is_symlink)
3643 walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3644 walkdir(PG_TBLSPC_DIR, pre_sync_fname, true, DEBUG1);
3645#endif
3646
3647 /* Prepare to report progress syncing the data directory via fsync. */
3649
3650 /*
3651 * Now we do the fsync()s in the same order.
3652 *
3653 * The main call ignores symlinks, so in addition to specially processing
3654 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3655 * process_symlinks = true. Note that if there are any plain directories
3656 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3657 * so we don't worry about optimizing it.
3658 */
3659 walkdir(".", datadir_fsync_fname, false, LOG);
3660 if (xlog_is_symlink)
3661 walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3663}
3664
3665/*
3666 * walkdir: recursively walk a directory, applying the action to each
3667 * regular file and directory (including the named directory itself).
3668 *
3669 * If process_symlinks is true, the action and recursion are also applied
3670 * to regular files and directories that are pointed to by symlinks in the
3671 * given directory; otherwise symlinks are ignored. Symlinks are always
3672 * ignored in subdirectories, ie we intentionally don't pass down the
3673 * process_symlinks flag to recursive calls.
3674 *
3675 * Errors are reported at level elevel, which might be ERROR or less.
3676 *
3677 * See also walkdir in file_utils.c, which is a frontend version of this
3678 * logic.
3679 */
3680static void
3681walkdir(const char *path,
3682 void (*action) (const char *fname, bool isdir, int elevel),
3683 bool process_symlinks,
3684 int elevel)
3685{
3686 DIR *dir;
3687 struct dirent *de;
3688
3689 dir = AllocateDir(path);
3690
3691 while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3692 {
3693 char subpath[MAXPGPATH * 2];
3694
3696
3697 if (strcmp(de->d_name, ".") == 0 ||
3698 strcmp(de->d_name, "..") == 0)
3699 continue;
3700
3701 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3702
3703 switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3704 {
3705 case PGFILETYPE_REG:
3706 (*action) (subpath, false, elevel);
3707 break;
3708 case PGFILETYPE_DIR:
3709 walkdir(subpath, action, false, elevel);
3710 break;
3711 default:
3712
3713 /*
3714 * Errors are already reported directly by get_dirent_type(),
3715 * and any remaining symlinks and unknown file types are
3716 * ignored.
3717 */
3718 break;
3719 }
3720 }
3721
3722 FreeDir(dir); /* we ignore any error here */
3723
3724 /*
3725 * It's important to fsync the destination directory itself as individual
3726 * file fsyncs don't guarantee that the directory entry for the file is
3727 * synced. However, skip this if AllocateDir failed; the action function
3728 * might not be robust against that.
3729 */
3730 if (dir)
3731 (*action) (path, true, elevel);
3732}
3733
3734
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Ignores errors trying to open unreadable files, and logs other errors at a
 * caller-specified level.  Directories are skipped entirely.
 *
 * The signature matches the action callback expected by walkdir().
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* Unreadable files are silently skipped. */
		if (errno == EACCES)
			return;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3780
/*
 * walkdir() action used by SyncDataDirectory: report startup progress for
 * the given path, then fsync it.
 *
 * fname is the path, isdir tells whether it is a directory, and elevel is
 * the level at which fsync_fname_ext() should report failures.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	/*
	 * We want to silently ignore errors about unreadable files, so ask
	 * fsync_fname_ext() for that with ignore_perm = true.  Its result is
	 * intentionally discarded.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3793
/*
 * Remove one file or directory, tolerating the case where a directory is
 * already gone.
 *
 * The signature matches the action callback expected by walkdir().  For a
 * directory, rmdir() failures other than ENOENT are reported at elevel.  For
 * a file, removal is delegated to PathNameDeleteTemporaryFile() with
 * error_on_failure = false; elevel is not used on that path.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}
3810
3811/*
3812 * fsync_fname_ext -- Try to fsync a file or directory
3813 *
3814 * If ignore_perm is true, ignore errors upon trying to open unreadable
3815 * files. Logs other errors at a caller-specified level.
3816 *
3817 * Returns 0 if the operation succeeded, -1 otherwise.
3818 */
3819int
3820fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3821{
3822 int fd;
3823 int flags;
3824 int returncode;
3825
3826 /*
3827 * Some OSs require directories to be opened read-only whereas other
3828 * systems don't allow us to fsync files opened read-only; so we need both
3829 * cases here. Using O_RDWR will cause us to fail to fsync files that are
3830 * not writable by our userid, but we assume that's OK.
3831 */
3832 flags = PG_BINARY;
3833 if (!isdir)
3834 flags |= O_RDWR;
3835 else
3836 flags |= O_RDONLY;
3837
3838 fd = OpenTransientFile(fname, flags);
3839
3840 /*
3841 * Some OSs don't allow us to open directories at all (Windows returns
3842 * EACCES), just ignore the error in that case. If desired also silently
3843 * ignoring errors about unreadable files. Log others.
3844 */
3845 if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3846 return 0;
3847 else if (fd < 0 && ignore_perm && errno == EACCES)
3848 return 0;
3849 else if (fd < 0)
3850 {
3851 ereport(elevel,
3853 errmsg("could not open file \"%s\": %m", fname)));
3854 return -1;
3855 }
3856
3857 returncode = pg_fsync(fd);
3858
3859 /*
3860 * Some OSes don't allow us to fsync directories at all, so we can ignore
3861 * those errors. Anything else needs to be logged.
3862 */
3863 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3864 {
3865 int save_errno;
3866
3867 /* close file upon error, might not be in transaction context */
3868 save_errno = errno;
3869 (void) CloseTransientFile(fd);
3870 errno = save_errno;
3871
3872 ereport(elevel,
3874 errmsg("could not fsync file \"%s\": %m", fname)));
3875 return -1;
3876 }
3877
3878 if (CloseTransientFile(fd) != 0)
3879 {
3880 ereport(elevel,
3882 errmsg("could not close file \"%s\": %m", fname)));
3883 return -1;
3884 }
3885
3886 return 0;
3887}
3888
3889/*
3890 * fsync_parent_path -- fsync the parent path of a file or directory
3891 *
3892 * This is aimed at making file operations persistent on disk in case of
3893 * an OS crash or power failure.
3894 */
3895static int
3896fsync_parent_path(const char *fname, int elevel)
3897{
3898 char parentpath[MAXPGPATH];
3899
3900 strlcpy(parentpath, fname, MAXPGPATH);
3901 get_parent_directory(parentpath);
3902
3903 /*
3904 * get_parent_directory() returns an empty string if the input argument is
3905 * just a file name (see comments in path.c), so handle that as being the
3906 * current directory.
3907 */
3908 if (strlen(parentpath) == 0)
3909 strlcpy(parentpath, ".", MAXPGPATH);
3910
3911 if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3912 return -1;
3913
3914 return 0;
3915}
3916
3917/*
3918 * Create a PostgreSQL data sub-directory
3919 *
3920 * The data directory itself, and most of its sub-directories, are created at
3921 * initdb time, but we do have some occasions when we create directories in
3922 * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3923 * make sure that those directories are created consistently. Today, that means
3924 * making sure that the created directory has the correct permissions, which is
3925 * what pg_dir_create_mode tracks for us.
3926 *
3927 * Note that we also set the umask() based on what we understand the correct
3928 * permissions to be (see file_perm.c).
3929 *
3930 * For permissions other than the default, mkdir() can be used directly, but
3931 * be sure to consider carefully such cases -- a sub-directory with incorrect
3932 * permissions in a PostgreSQL data directory could cause backups and other
3933 * processes to fail.
3934 */
3935int
3936MakePGDirectory(const char *directoryName)
3937{
3938 return mkdir(directoryName, pg_dir_create_mode);
3939}
3940
3941/*
3942 * Return the passed-in error level, or PANIC if data_sync_retry is off.
3943 *
3944 * Failure to fsync any data file is cause for immediate panic, unless
3945 * data_sync_retry is enabled. Data may have been written to the operating
3946 * system and removed from our buffer pool already, and if we are running on
3947 * an operating system that forgets dirty data on write-back failure, there
3948 * may be only one copy of the data remaining: in the WAL. A later attempt to
3949 * fsync again might falsely report success. Therefore we must not allow any
3950 * further checkpoints to be attempted. data_sync_retry can in theory be
3951 * enabled on systems known not to drop dirty buffered data on write-back
3952 * failure (with the likely outcome that checkpoints will continue to fail
3953 * until the underlying problem is fixed).
3954 *
3955 * Any code that reports a failure from fsync() or related functions should
3956 * filter the error level with this function.
3957 */
3958int
3960{
3961 return data_sync_retry ? elevel : PANIC;
3962}
3963
3964bool
3966{
3967 bool result = true;
3968 int flags;
3969
3970#if PG_O_DIRECT == 0
3971 if (strcmp(*newval, "") != 0)
3972 {
3973 GUC_check_errdetail("\"%s\" is not supported on this platform.",
3974 "debug_io_direct");
3975 result = false;
3976 }
3977 flags = 0;
3978#else
3979 List *elemlist;
3980 ListCell *l;
3981 char *rawstring;
3982
3983 /* Need a modifiable copy of string */
3984 rawstring = pstrdup(*newval);
3985
3986 if (!SplitGUCList(rawstring, ',', &elemlist))
3987 {
3988 GUC_check_errdetail("Invalid list syntax in parameter \"%s\".",
3989 "debug_io_direct");
3990 pfree(rawstring);
3991 list_free(elemlist);
3992 return false;
3993 }
3994
3995 flags = 0;
3996 foreach(l, elemlist)
3997 {
3998 char *item = (char *) lfirst(l);
3999
4000 if (pg_strcasecmp(item, "data") == 0)
4001 flags |= IO_DIRECT_DATA;
4002 else if (pg_strcasecmp(item, "wal") == 0)
4003 flags |= IO_DIRECT_WAL;
4004 else if (pg_strcasecmp(item, "wal_init") == 0)
4005 flags |= IO_DIRECT_WAL_INIT;
4006 else
4007 {
4008 GUC_check_errdetail("Invalid option \"%s\".", item);
4009 result = false;
4010 break;
4011 }
4012 }
4013
4014 /*
4015 * It's possible to configure block sizes smaller than our assumed I/O
4016 * alignment size, which could result in invalid I/O requests.
4017 */
4018#if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
4019 if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
4020 {
4021 GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small.",
4022 "debug_io_direct", "XLOG_BLCKSZ");
4023 result = false;
4024 }
4025#endif
4026#if BLCKSZ < PG_IO_ALIGN_SIZE
4027 if (result && (flags & IO_DIRECT_DATA))
4028 {
4029 GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small.",
4030 "debug_io_direct", "BLCKSZ");
4031 result = false;
4032 }
4033#endif
4034
4035 pfree(rawstring);
4036 list_free(elemlist);
4037#endif
4038
4039 if (!result)
4040 return result;
4041
4042 /* Save the flags in *extra, for use by assign_debug_io_direct */
4043 *extra = guc_malloc(ERROR, sizeof(int));
4044 *((int *) *extra) = flags;
4045
4046 return result;
4047}
4048
4049void
4050assign_debug_io_direct(const char *newval, void *extra)
4051{
4052 int *flags = (int *) extra;
4053
4054 io_direct_flags = *flags;
4055}
4056
4057/* ResourceOwner callbacks */
4058
4059static void
4061{
4062 File file = (File) DatumGetInt32(res);
4063 Vfd *vfdP;
4064
4065 Assert(FileIsValid(file));
4066
4067 vfdP = &VfdCache[file];
4068 vfdP->resowner = NULL;
4069
4070 FileClose(file);
4071}
4072
4073static char *
4075{
4076 return psprintf("File %d", DatumGetInt32(res));
4077}
void begin_startup_progress_phase(void)
Definition: startup.c:343
#define Min(x, y)
Definition: c.h:961
uint32 SubTransactionId
Definition: c.h:613
#define INT64_FORMAT
Definition: c.h:506
#define Assert(condition)
Definition: c.h:815
int64_t int64
Definition: c.h:485
#define PG_BINARY
Definition: c.h:1230
uint64_t uint64
Definition: c.h:489
uint32_t uint32
Definition: c.h:488
unsigned int Index
Definition: c.h:571
#define MemSet(start, val, len)
Definition: c.h:977
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:895
int fdatasync(int fildes)
#define OidIsValid(objectId)
Definition: c.h:732
size_t Size
Definition: c.h:562
int closedir(DIR *)
Definition: dirent.c:127
struct dirent * readdir(DIR *)
Definition: dirent.c:78
DIR * opendir(const char *)
Definition: dirent.c:33
int errcode_for_file_access(void)
Definition: elog.c:876
int errdetail(const char *fmt,...)
Definition: elog.c:1203
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define LOG
Definition: elog.h:31
#define FATAL
Definition: elog.h:41
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
static int pg_ftruncate(int fd, off_t length)
Definition: fd.c:702
int max_files_per_process
Definition: fd.c:145
void pg_flush_data(int fd, off_t offset, off_t nbytes)
Definition: fd.c:524
int FileGetRawDesc(File file)
Definition: fd.c:2499
int MakePGDirectory(const char *directoryName)
Definition: fd.c:3936
int FreeDir(DIR *dir)
Definition: fd.c:2983
int recovery_init_sync_method
Definition: fd.c:164
static const ResourceOwnerDesc file_resowner_desc
Definition: fd.c:360
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:2132
int pg_fsync_no_writethrough(int fd)
Definition: fd.c:440
#define FD_MINFREE
Definition: fd.c:137
FILE * OpenPipeStream(const char *command, const char *mode)
Definition: fd.c:2708
static int numTempTableSpaces
Definition: fd.c:288
static bool ReleaseLruFile(void)
Definition: fd.c:1381
int io_direct_flags
Definition: fd.c:167
#define FD_DELETE_AT_CLOSE
Definition: fd.c:191
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1108
static int maxAllocatedDescs
Definition: fd.c:267
static void Delete(File file)
Definition: fd.c:1267
static int FreeDesc(AllocateDesc *desc)
Definition: fd.c:2764
static long tempFileCounter
Definition: fd.c:279
static char * ResOwnerPrintFile(Datum res)
Definition: fd.c:4074
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:781
char * FilePathName(File file)
Definition: fd.c:2483
static void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition: fd.c:376
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:3115
static int numAllocatedDescs
Definition: fd.c:266
File PathNameOpenTemporaryFile(const char *path, int mode)
Definition: fd.c:1900
static void LruDelete(File file)
Definition: fd.c:1286
int pg_fdatasync(int fd)
Definition: fd.c:479
#define FileIsValid(file)
Definition: fd.c:185
void assign_debug_io_direct(const char *newval, void *extra)
Definition: fd.c:4050
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2319
static int nfile
Definition: fd.c:221
int CloseTransientFile(int fd)
Definition: fd.c:2831
#define DO_DB(A)
Definition: fd.c:179
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1086
void closeAllVfds(void)
Definition: fd.c:3042
int max_safe_fds
Definition: fd.c:158
static File AllocateVfd(void)
Definition: fd.c:1413
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1860
void PathNameDeleteTemporaryDir(const char *dirname)
Definition: fd.c:1690
int ClosePipeStream(FILE *file)
Definition: fd.c:3013
void AtEOXact_Files(bool isCommit)
Definition: fd.c:3187
int FileGetRawFlags(File file)
Definition: fd.c:2509
static Size SizeVfdCache
Definition: fd.c:216
static int nextTempTableSpace
Definition: fd.c:289
#define FD_CLOSE_AT_EOXACT
Definition: fd.c:192
int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition: fd.c:3820
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3795
static void ResOwnerReleaseFile(Datum res)
Definition: fd.c:4060
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition: fd.c:3416
int FreeFile(FILE *file)
Definition: fd.c:2803
mode_t FileGetRawMode(File file)
Definition: fd.c:2519
static AllocateDesc * allocatedDescs
Definition: fd.c:268
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2946
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition: fd.c:963
static int FileAccess(File file)
Definition: fd.c:1491
static void FreeVfd(File file)
Definition: fd.c:1471
struct vfd Vfd
int pg_fsync_writethrough(int fd)
Definition: fd.c:460
void FileClose(File file)
Definition: fd.c:1977
void ReleaseExternalFD(void)
Definition: fd.c:1238
#define FD_TEMP_FILE_LIMIT
Definition: fd.c:193
void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition: fd.c:3356
bool pg_file_exists(const char *name)
Definition: fd.c:502
void RemovePgTempFiles(void)
Definition: fd.c:3296
#define FileIsNotOpen(file)
Definition: fd.c:188
bool TempTablespacesAreSet(void)
Definition: fd.c:3100
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:755
int FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2391
int FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2076
int data_sync_elevel(int elevel)
Definition: fd.c:3959
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1574
static void Insert(File file)
Definition: fd.c:1312
AllocateDescKind
Definition: fd.c:247
@ AllocateDescDir
Definition: fd.c:250
@ AllocateDescPipe
Definition: fd.c:249
@ AllocateDescFile
Definition: fd.c:248
@ AllocateDescRawFD
Definition: fd.c:251
Oid GetNextTempTableSpace(void)
Definition: fd.c:3133
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1587
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3782
static void ReportTemporaryFileUsage(const char *path, off_t size)
Definition: fd.c:1527
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition: fd.c:1803
bool AcquireExternalFD(void)
Definition: fd.c:1185
static void RegisterTemporaryFile(File file)
Definition: fd.c:1546
#define NUM_RESERVED_FDS
Definition: fd.c:128
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2865
static Oid * tempTableSpaces
Definition: fd.c:287
static bool reserveAllocatedDesc(void)
Definition: fd.c:2530
void InitFileAccess(void)
Definition: fd.c:902
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition: fd.c:3444
File OpenTemporaryFile(bool interXact)
Definition: fd.c:1723
int durable_unlink(const char *fname, int elevel)
Definition: fd.c:871
static uint64 temporary_files_size
Definition: fd.c:235
void ReserveExternalFD(void)
Definition: fd.c:1220
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2931
bool looks_like_temp_rel_name(const char *name)
Definition: fd.c:3472
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1931
void set_max_safe_fds(void)
Definition: fd.c:1043
int pg_fsync(int fd)
Definition: fd.c:385
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition: fd.c:3224
#define VFD_CLOSED
Definition: fd.c:183
static bool have_xact_temporary_files
Definition: fd.c:227
static int LruInsert(File file)
Definition: fd.c:1334
static int numExternalFDs
Definition: fd.c:273
static int fsync_parent_path(const char *fname, int elevel)
Definition: fd.c:3896
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition: fd.c:1659
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2605
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition: fd.c:3154
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2655
void InitTemporaryFileAccess(void)
Definition: fd.c:932
static Vfd * VfdCache
Definition: fd.c:215
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:2664
bool data_sync_retry
Definition: fd.c:161
static void ReleaseLruFiles(void)
Definition: fd.c:1403
ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info)
Definition: fd.c:2214
void SyncDataDirectory(void)
Definition: fd.c:3567
int FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2346
off_t FileSize(File file)
Definition: fd.c:2431
ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info)
Definition: fd.c:2158
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2448
bool check_debug_io_direct(char **newval, void **extra, GucSource source)
Definition: fd.c:3965
static void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition: fd.c:371
static void BeforeShmemExit_Files(int code, Datum arg)
Definition: fd.c:3201
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition: fd.c:3681
int pg_truncate(const char *path, off_t length)
Definition: fd.c:719
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:3071
void TempTablespacePath(char *path, Oid tablespace)
Definition: fd.c:1778
#define IO_DIRECT_WAL
Definition: fd.h:55
#define IO_DIRECT_DATA
Definition: fd.h:54
#define IO_DIRECT_WAL_INIT
Definition: fd.h:56
int File
Definition: fd.h:51
#define PG_O_DIRECT
Definition: fd.h:97
int pg_file_create_mode
Definition: file_perm.c:19
int pg_dir_create_mode
Definition: file_perm.c:18
ssize_t pg_pwrite_zeros(int fd, size_t size, off_t offset)
Definition: file_utils.c:688
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:526
#define PG_TEMP_FILES_DIR
Definition: file_utils.h:62
#define PG_TEMP_FILE_PREFIX
Definition: file_utils.h:63
PGFileType
Definition: file_utils.h:19
@ PGFILETYPE_DIR
Definition: file_utils.h:23
@ PGFILETYPE_REG
Definition: file_utils.h:22
@ PGFILETYPE_ERROR
Definition: file_utils.h:20
@ DATA_DIR_SYNC_METHOD_SYNCFS
Definition: file_utils.h:30
@ DATA_DIR_SYNC_METHOD_FSYNC
Definition: file_utils.h:29
int MyProcPid
Definition: globals.c:46
bool enableFsync
Definition: globals.c:128
Oid MyDatabaseTableSpace
Definition: globals.c:95
void * guc_malloc(int elevel, size_t size)
Definition: guc.c:638
#define newval
#define GUC_check_errdetail
Definition: guc.h:476
GucSource
Definition: guc.h:108
int temp_file_limit
Definition: guc_tables.c:533
int log_temp_files
Definition: guc_tables.c:528
#define realloc(a, b)
Definition: header.h:60
#define free(a)
Definition: header.h:65
#define malloc(a)
Definition: header.h:50
#define close(a)
Definition: win32.h:12
void before_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:337
int j
Definition: isn.c:73
int i
Definition: isn.c:72
static void const char fflush(stdout)
void list_free(List *list)
Definition: list.c:1546
Datum subpath(PG_FUNCTION_ARGS)
Definition: ltree_op.c:308
char * pstrdup(const char *in)
Definition: mcxt.c:1696
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1541
void pfree(void *pointer)
Definition: mcxt.c:1521
void * palloc(Size size)
Definition: mcxt.c:1317
#define MAP_FAILED
Definition: mem.h:45
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
void * arg
static char * basedir
static PgChecksumMode mode
Definition: pg_checksums.c:55
#define MAXPGPATH
static ssize_t pg_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
Definition: pg_iovec.h:83
static ssize_t pg_preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset)
Definition: pg_iovec.h:44
#define lfirst(lc)
Definition: pg_list.h:172
uint64 pg_prng_uint64_range(pg_prng_state *state, uint64 rmin, uint64 rmax)
Definition: pg_prng.c:144
pg_prng_state pg_global_prng_state
Definition: pg_prng.c:34
static rewind_source * source
Definition: pg_rewind.c:89
static char * buf
Definition: pg_test_fsync.c:72
static char * tablespace
Definition: pgbench.c:216
void pgstat_report_tempfile(size_t filesize)
#define pqsignal
Definition: port.h:520
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
void get_parent_directory(char *path)
Definition: path.c:991
#define snprintf
Definition: port.h:238
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
uintptr_t Datum
Definition: postgres.h:69
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:217
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:207
#define InvalidOid
Definition: postgres_ext.h:37
unsigned int Oid
Definition: postgres_ext.h:32
static int fd(const char *x, int i)
Definition: preproc-init.c:105
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
int forkname_chars(const char *str, ForkNumber *fork)
Definition: relpath.c:81
#define PG_TBLSPC_DIR
Definition: relpath.h:41
#define TABLESPACE_VERSION_DIRECTORY
Definition: relpath.h:33
ResourceOwner CurrentResourceOwner
Definition: resowner.c:165
void ResourceOwnerForget(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition: resowner.c:554
void ResourceOwnerRemember(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition: resowner.c:514
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition: resowner.c:442
@ RESOURCE_RELEASE_AFTER_LOCKS
Definition: resowner.h:56
#define RELEASE_PRIO_FILES
Definition: resowner.h:76
void pg_usleep(long microsec)
Definition: signal.c:53
static pg_noinline void Size size
Definition: slab.c:607
static void error(void)
Definition: sql-dyntest.c:147
#define ereport_startup_progress(msg,...)
Definition: startup.h:18
SubTransactionId create_subid
Definition: fd.c:257
DIR * dir
Definition: fd.c:261
FILE * file
Definition: fd.c:260
int fd
Definition: fd.c:262
union AllocateDesc::@20 desc
AllocateDescKind kind
Definition: fd.c:256
Definition: dirent.c:26
Definition: pg_list.h:54
const char * name
Definition: resowner.h:93
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
__int64 st_size
Definition: win32_port.h:263
unsigned short st_mode
Definition: win32_port.h:258
Definition: fd.c:196
int fd
Definition: fd.c:197
int fileFlags
Definition: fd.c:206
File lruLessRecently
Definition: fd.c:202
File lruMoreRecently
Definition: fd.c:201
char * fileName
Definition: fd.c:204
ResourceOwner resowner
Definition: fd.c:199
unsigned short fdstate
Definition: fd.c:198
File nextFree
Definition: fd.c:200
mode_t fileMode
Definition: fd.c:207
off_t fileSize
Definition: fd.c:203
bool SplitGUCList(char *rawstring, char separator, List **namelist)
Definition: varlena.c:3680
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:85
static void pgstat_report_wait_end(void)
Definition: wait_event.h:101
const char * type
const char * name
#define fsync(fd)
Definition: win32_port.h:83
#define stat
Definition: win32_port.h:274
#define EINTR
Definition: win32_port.h:364
#define EOPNOTSUPP
Definition: win32_port.h:388
#define SIGPIPE
Definition: win32_port.h:163
#define lstat(path, sb)
Definition: win32_port.h:275
#define S_ISDIR(m)
Definition: win32_port.h:315
void _dosmaperr(unsigned long)
Definition: win32error.c:177
#define S_ISLNK(m)
Definition: win32_port.h:334
#define mkdir(a, b)
Definition: win32_port.h:80
#define fstat
Definition: win32_port.h:273
#define O_CLOEXEC
Definition: win32_port.h:349
#define O_DSYNC
Definition: win32_port.h:342
SubTransactionId GetCurrentSubTransactionId(void)
Definition: xact.c:790
int wal_sync_method
Definition: xlog.c:130
@ WAL_SYNC_METHOD_FSYNC_WRITETHROUGH
Definition: xlog.h:27
static const char * directory
Definition: zic.c:634