PostgreSQL Source Code  git master
fd.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  * Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have. (This is around 1024 on many modern
20  * operating systems, but may be lower on others.)
21  *
22  * VFDs are managed as an LRU pool, with actual OS file descriptors
23  * being opened and closed as needed. Obviously, if a routine is
24  * opened using these interfaces, all subsequent operations must also
25  * be through these interfaces (the File type is not a real file
26  * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
39  * for a long time, like relation files. It is the caller's responsibility
40  * to close them; there is no automatic mechanism in fd.c for that.
41  *
42  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43  * temporary files that have names so that they can be shared between
44  * backends. Such files are automatically closed and count against the
45  * temporary file limit of the backend that creates them, but unlike anonymous
46  * files they are not automatically deleted. See sharedfileset.c for a shared
47  * ownership mechanism that provides automatic cleanup for shared files when
48  * the last of a group of backends detaches.
49  *
50  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52  * They behave like the corresponding native functions, except that the handle
53  * is registered with the current subtransaction, and will be automatically
54  * closed at abort. These are intended mainly for short operations like
55  * reading a configuration file; there is a limit on the number of files that
56  * can be opened using these functions at any one time.
57  *
58  * Finally, BasicOpenFile is just a thin wrapper around open() that can
59  * release file descriptors in use by the virtual file descriptors if
60  * necessary. There is no automatic cleanup of file descriptors returned by
61  * BasicOpenFile; it is solely the caller's responsibility to close the file
62  * descriptor by calling close(2).
63  *
64  * If a non-virtual file descriptor needs to be held open for any length of
65  * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66  * (and eventually ReleaseExternalFD), so that we can take it into account
67  * while deciding how many VFDs can be open. This applies to FDs obtained
68  * with BasicOpenFile as well as those obtained without use of any fd.c API.
69  *
70  *-------------------------------------------------------------------------
71  */
72 
73 #include "postgres.h"
74 
75 #include <dirent.h>
76 #include <sys/file.h>
77 #include <sys/param.h>
78 #include <sys/resource.h> /* for getrlimit */
79 #include <sys/stat.h>
80 #include <sys/types.h>
81 #ifndef WIN32
82 #include <sys/mman.h>
83 #endif
84 #include <limits.h>
85 #include <unistd.h>
86 #include <fcntl.h>
87 
88 #include "access/xact.h"
89 #include "access/xlog.h"
90 #include "catalog/pg_tablespace.h"
91 #include "common/file_perm.h"
92 #include "common/file_utils.h"
93 #include "common/pg_prng.h"
94 #include "miscadmin.h"
95 #include "pgstat.h"
96 #include "portability/mem.h"
97 #include "postmaster/startup.h"
98 #include "storage/fd.h"
99 #include "storage/ipc.h"
100 #include "utils/guc.h"
101 #include "utils/guc_hooks.h"
102 #include "utils/resowner_private.h"
103 #include "utils/varlena.h"
104 
105 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
106 #if defined(HAVE_SYNC_FILE_RANGE)
107 #define PG_FLUSH_DATA_WORKS 1
108 #elif !defined(WIN32) && defined(MS_ASYNC)
109 #define PG_FLUSH_DATA_WORKS 1
110 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
111 #define PG_FLUSH_DATA_WORKS 1
112 #endif
113 
114 /*
115  * We must leave some file descriptors free for system(), the dynamic loader,
116  * and other code that tries to open files without consulting fd.c. This
117  * is the number left free. (While we try fairly hard to prevent EMFILE
118  * errors, there's never any guarantee that we won't get ENFILE due to
119  * other processes chewing up FDs. So it's a bad idea to try to open files
120  * without consulting fd.c. Nonetheless we cannot control all code.)
121  *
122  * Because this is just a fixed setting, we are effectively assuming that
123  * no such code will leave FDs open over the long term; otherwise the slop
124  * is likely to be insufficient. Note in particular that we expect that
125  * loading a shared library does not result in any permanent increase in
126  * the number of open files. (This appears to be true on most if not
127  * all platforms as of Feb 2004.)
128  */
129 #define NUM_RESERVED_FDS 10
130 
131 /*
132  * If we have fewer than this many usable FDs after allowing for the reserved
133  * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
134  * much less than that. Note that this value ensures numExternalFDs can be
135  * at least 16; as of this writing, the contrib/postgres_fdw regression tests
136  * will not pass unless that can grow to at least 14.)
137  */
138 #define FD_MINFREE 48
139 
/*
 * A number of platforms allow individual processes to open many more files
 * than they can really support when *many* processes do the same thing.
 * This GUC parameter lets the DBA limit max_safe_fds to something less than
 * what the postmaster's initial probe suggests will work.
 */
int			max_files_per_process = 1000;

148 /*
149  * Maximum number of file descriptors to open for operations that fd.c knows
150  * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
151  * to a conservative value, and remains that way indefinitely in bootstrap or
152  * standalone-backend cases. In normal postmaster operation, the postmaster
153  * calls set_max_safe_fds() late in initialization to update the value, and
154  * that value is then inherited by forked subprocesses.
155  *
156  * Note: the value of max_files_per_process is taken into account while
157  * setting this variable, and so need not be tested separately.
158  */
159 int max_safe_fds = FD_MINFREE; /* default if not changed */
160 
161 /* Whether it is safe to continue running after fsync() fails. */
162 bool data_sync_retry = false;
163 
164 /* How SyncDataDirectory() should do its job. */
166 
167 /* Which kinds of files should be opened with PG_O_DIRECT. */
169 
170 /* Debugging.... */
171 
172 #ifdef FDDEBUG
173 #define DO_DB(A) \
174  do { \
175  int _do_db_save_errno = errno; \
176  A; \
177  errno = _do_db_save_errno; \
178  } while (0)
179 #else
180 #define DO_DB(A) \
181  ((void) 0)
182 #endif
183 
184 #define VFD_CLOSED (-1)
185 
186 #define FileIsValid(file) \
187  ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
188 
189 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
190 
191 /* these are the assigned bits in fdstate below: */
192 #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
193 #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
194 #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
195 
196 typedef struct vfd
197 {
198  int fd; /* current FD, or VFD_CLOSED if none */
199  unsigned short fdstate; /* bitflags for VFD's state */
200  ResourceOwner resowner; /* owner, for automatic cleanup */
201  File nextFree; /* link to next free VFD, if in freelist */
202  File lruMoreRecently; /* doubly linked recency-of-use list */
204  off_t fileSize; /* current size of file (0 if not temporary) */
205  char *fileName; /* name of file, or NULL for unused VFD */
206  /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
207  int fileFlags; /* open(2) flags for (re)opening the file */
208  mode_t fileMode; /* mode to pass to open(2) */
209 } Vfd;
210 
211 /*
212  * Virtual File Descriptor array pointer and size. This grows as
213  * needed. 'File' values are indexes into this array.
214  * Note that VfdCache[0] is not a usable VFD, just a list header.
215  */
216 static Vfd *VfdCache;
217 static Size SizeVfdCache = 0;
218 
219 /*
220  * Number of file descriptors known to be in use by VFD entries.
221  */
222 static int nfile = 0;
223 
224 /*
225  * Flag to tell whether it's worth scanning VfdCache looking for temp files
226  * to close
227  */
228 static bool have_xact_temporary_files = false;
229 
230 /*
231  * Tracks the total size of all temporary files. Note: when temp_file_limit
232  * is being enforced, this cannot overflow since the limit cannot be more
233  * than INT_MAX kilobytes. When not enforcing, it could theoretically
234  * overflow, but we don't care.
235  */
236 static uint64 temporary_files_size = 0;
237 
238 /* Temporary file access initialized and not yet shut down? */
239 #ifdef USE_ASSERT_CHECKING
240 static bool temporary_files_allowed = false;
241 #endif
242 
243 /*
244  * List of OS handles opened with AllocateFile, AllocateDir and
245  * OpenTransientFile.
246  */
247 typedef enum
248 {
254 
255 typedef struct
256 {
259  union
260  {
261  FILE *file;
263  int fd;
264  } desc;
265 } AllocateDesc;
266 
267 static int numAllocatedDescs = 0;
268 static int maxAllocatedDescs = 0;
270 
271 /*
272  * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
273  */
274 static int numExternalFDs = 0;
275 
276 /*
277  * Number of temporary files opened during the current session;
278  * this is used in generation of tempfile names.
279  */
280 static long tempFileCounter = 0;
281 
282 /*
283  * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
284  * indicating that the current database's default tablespace should be used.)
285  * When numTempTableSpaces is -1, this has not been set in the current
286  * transaction.
287  */
288 static Oid *tempTableSpaces = NULL;
289 static int numTempTableSpaces = -1;
290 static int nextTempTableSpace = 0;
291 
292 
293 /*--------------------
294  *
295  * Private Routines
296  *
297  * Delete - delete a file from the Lru ring
298  * LruDelete - remove a file from the Lru ring and close its FD
299  * Insert - put a file at the front of the Lru ring
300  * LruInsert - put a file at the front of the Lru ring and open it
301  * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
302  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
303  * AllocateVfd - grab a free (or new) file record (from VfdCache)
304  * FreeVfd - free a file record
305  *
306  * The Least Recently Used ring is a doubly linked list that begins and
307  * ends on element zero. Element zero is special -- it doesn't represent
308  * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
309  * anchor that shows us the beginning/end of the ring.
310  * Only VFD elements that are currently really open (have an FD assigned) are
311  * in the Lru ring. Elements that are "virtually" open can be recognized
312  * by having a non-null fileName field.
313  *
314  * example:
315  *
316  * /--less----\ /---------\
317  * v \ v \
318  * #0 --more---> LeastRecentlyUsed --more-\ \
319  * ^\ | |
320  * \\less--> MostRecentlyUsedFile <---/ |
321  * \more---/ \--less--/
322  *
323  *--------------------
324  */
325 static void Delete(File file);
326 static void LruDelete(File file);
327 static void Insert(File file);
328 static int LruInsert(File file);
329 static bool ReleaseLruFile(void);
330 static void ReleaseLruFiles(void);
331 static File AllocateVfd(void);
332 static void FreeVfd(File file);
333 
334 static int FileAccess(File file);
335 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
336 static bool reserveAllocatedDesc(void);
337 static int FreeDesc(AllocateDesc *desc);
338 
339 static void BeforeShmemExit_Files(int code, Datum arg);
340 static void CleanupTempFiles(bool isCommit, bool isProcExit);
341 static void RemovePgTempRelationFiles(const char *tsdirname);
342 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
343 
344 static void walkdir(const char *path,
345  void (*action) (const char *fname, bool isdir, int elevel),
346  bool process_symlinks,
347  int elevel);
348 #ifdef PG_FLUSH_DATA_WORKS
349 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
350 #endif
351 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
352 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
353 
354 static int fsync_parent_path(const char *fname, int elevel);
355 
356 
357 /*
358  * pg_fsync --- do fsync with or without writethrough
359  */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
	struct stat st;

	/*
	 * Some operating system implementations of fsync() have requirements
	 * about the file access modes that were used when their file descriptor
	 * argument was opened, and these requirements differ depending on whether
	 * the file descriptor is for a directory.
	 *
	 * For any file descriptor that may eventually be handed to fsync(), we
	 * should have opened it with access modes that are compatible with
	 * fsync() on all supported systems, otherwise the code may not be
	 * portable, even if it runs ok on the current system.
	 *
	 * We assert here that a descriptor for a file was opened with write
	 * permissions (either O_RDWR or O_WRONLY) and for a directory without
	 * write permissions (O_RDONLY).
	 *
	 * Ignore any fstat errors and let the follow-up fsync() do its work.
	 * Doing this sanity check here, ahead of the enableFsync tests in the
	 * callees, means it also covers the case where fsync() is disabled.
	 */
	if (fstat(fd, &st) == 0)
	{
		int			desc_flags = fcntl(fd, F_GETFL);

		/*
		 * O_RDONLY is historically 0, so just make sure that for directories
		 * no write flags are used.
		 */
		if (S_ISDIR(st.st_mode))
			Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
		else
			Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
	}
	/* don't let a failed fstat()/fcntl() above leak its errno to the caller */
	errno = 0;
#endif

	/* #if is to skip the sync_method test if there's no need for it */
#if defined(HAVE_FSYNC_WRITETHROUGH)
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
	else
#endif
		return pg_fsync_no_writethrough(fd);
}
409 
410 
411 /*
412  * pg_fsync_no_writethrough --- same as fsync except does nothing if
413  * enableFsync is off
414  */
415 int
417 {
418  int rc;
419 
420  if (!enableFsync)
421  return 0;
422 
423 retry:
424  rc = fsync(fd);
425 
426  if (rc == -1 && errno == EINTR)
427  goto retry;
428 
429  return rc;
430 }
431 
432 /*
433  * pg_fsync_writethrough
434  */
435 int
437 {
438  if (enableFsync)
439  {
440 #if defined(F_FULLFSYNC)
441  return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
442 #else
443  errno = ENOSYS;
444  return -1;
445 #endif
446  }
447  else
448  return 0;
449 }
450 
451 /*
452  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
453  */
454 int
456 {
457  int rc;
458 
459  if (!enableFsync)
460  return 0;
461 
462 retry:
463  rc = fdatasync(fd);
464 
465  if (rc == -1 && errno == EINTR)
466  goto retry;
467 
468  return rc;
469 }
470 
471 /*
472  * pg_flush_data --- advise OS that the described dirty data should be flushed
473  *
474  * offset of 0 with nbytes 0 means that the entire file should be flushed
475  */
476 void
477 pg_flush_data(int fd, off_t offset, off_t nbytes)
478 {
479  /*
480  * Right now file flushing is primarily used to avoid making later
481  * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
482  * if fsyncs are disabled - that's a decision we might want to make
483  * configurable at some point.
484  */
485  if (!enableFsync)
486  return;
487 
488  /*
489  * We compile all alternatives that are supported on the current platform,
490  * to find portability problems more easily.
491  */
492 #if defined(HAVE_SYNC_FILE_RANGE)
493  {
494  int rc;
495  static bool not_implemented_by_kernel = false;
496 
497  if (not_implemented_by_kernel)
498  return;
499 
500 retry:
501 
502  /*
503  * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
504  * tells the OS that writeback for the specified blocks should be
505  * started, but that we don't want to wait for completion. Note that
506  * this call might block if too much dirty data exists in the range.
507  * This is the preferable method on OSs supporting it, as it works
508  * reliably when available (contrast to msync()) and doesn't flush out
509  * clean data (like FADV_DONTNEED).
510  */
511  rc = sync_file_range(fd, offset, nbytes,
512  SYNC_FILE_RANGE_WRITE);
513  if (rc != 0)
514  {
515  int elevel;
516 
517  if (rc == EINTR)
518  goto retry;
519 
520  /*
521  * For systems that don't have an implementation of
522  * sync_file_range() such as Windows WSL, generate only one
523  * warning and then suppress all further attempts by this process.
524  */
525  if (errno == ENOSYS)
526  {
527  elevel = WARNING;
528  not_implemented_by_kernel = true;
529  }
530  else
531  elevel = data_sync_elevel(WARNING);
532 
533  ereport(elevel,
535  errmsg("could not flush dirty data: %m")));
536  }
537 
538  return;
539  }
540 #endif
541 #if !defined(WIN32) && defined(MS_ASYNC)
542  {
543  void *p;
544  static int pagesize = 0;
545 
546  /*
547  * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
548  * writeback. On linux it only does so if MS_SYNC is specified, but
549  * then it does the writeback synchronously. Luckily all common linux
550  * systems have sync_file_range(). This is preferable over
551  * FADV_DONTNEED because it doesn't flush out clean data.
552  *
553  * We map the file (mmap()), tell the kernel to sync back the contents
554  * (msync()), and then remove the mapping again (munmap()).
555  */
556 
557  /* mmap() needs actual length if we want to map whole file */
558  if (offset == 0 && nbytes == 0)
559  {
560  nbytes = lseek(fd, 0, SEEK_END);
561  if (nbytes < 0)
562  {
565  errmsg("could not determine dirty data size: %m")));
566  return;
567  }
568  }
569 
570  /*
571  * Some platforms reject partial-page mmap() attempts. To deal with
572  * that, just truncate the request to a page boundary. If any extra
573  * bytes don't get flushed, well, it's only a hint anyway.
574  */
575 
576  /* fetch pagesize only once */
577  if (pagesize == 0)
578  pagesize = sysconf(_SC_PAGESIZE);
579 
580  /* align length to pagesize, dropping any fractional page */
581  if (pagesize > 0)
582  nbytes = (nbytes / pagesize) * pagesize;
583 
584  /* fractional-page request is a no-op */
585  if (nbytes <= 0)
586  return;
587 
588  /*
589  * mmap could well fail, particularly on 32-bit platforms where there
590  * may simply not be enough address space. If so, silently fall
591  * through to the next implementation.
592  */
593  if (nbytes <= (off_t) SSIZE_MAX)
594  p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
595  else
596  p = MAP_FAILED;
597 
598  if (p != MAP_FAILED)
599  {
600  int rc;
601 
602  rc = msync(p, (size_t) nbytes, MS_ASYNC);
603  if (rc != 0)
604  {
607  errmsg("could not flush dirty data: %m")));
608  /* NB: need to fall through to munmap()! */
609  }
610 
611  rc = munmap(p, (size_t) nbytes);
612  if (rc != 0)
613  {
614  /* FATAL error because mapping would remain */
615  ereport(FATAL,
617  errmsg("could not munmap() while flushing data: %m")));
618  }
619 
620  return;
621  }
622  }
623 #endif
624 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
625  {
626  int rc;
627 
628  /*
629  * Signal the kernel that the passed in range should not be cached
630  * anymore. This has the, desired, side effect of writing out dirty
631  * data, and the, undesired, side effect of likely discarding useful
632  * clean cached blocks. For the latter reason this is the least
633  * preferable method.
634  */
635 
636  rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
637 
638  if (rc != 0)
639  {
640  /* don't error out, this is just a performance optimization */
643  errmsg("could not flush dirty data: %m")));
644  }
645 
646  return;
647  }
648 #endif
649 }
650 
651 /*
652  * Truncate an open file to a given length.
653  */
static int
pg_ftruncate(int fd, off_t length)
{
	int			result;

	/* Keep reissuing ftruncate() while it fails only because of a signal. */
	do
	{
		result = ftruncate(fd, length);
	} while (result == -1 && errno == EINTR);

	return result;
}
667 
668 /*
669  * Truncate a file to a given length by name.
670  */
int
pg_truncate(const char *path, off_t length)
{
	int			ret;
#ifdef WIN32
	int			save_errno;
	int			fd;

	/* This branch goes through a descriptor and truncates via pg_ftruncate() */
	fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
	if (fd >= 0)
	{
		ret = pg_ftruncate(fd, length);

		/*
		 * Release the descriptor again, keeping the errno of the truncate
		 * itself so the caller sees why it failed rather than whatever the
		 * close reports.
		 */
		save_errno = errno;
		CloseTransientFile(fd);
		errno = save_errno;
	}
	else
		ret = -1;
#else

retry:
	ret = truncate(path, length);

	/* a signal interrupted the call; just try again */
	if (ret == -1 && errno == EINTR)
		goto retry;
#endif

	return ret;
}
700 
701 /*
702  * fsync_fname -- fsync a file or directory, handling errors properly
703  *
704  * Try to fsync a file or directory. When doing the latter, ignore errors that
705  * indicate the OS just doesn't allow/require fsyncing directories.
706  */
707 void
708 fsync_fname(const char *fname, bool isdir)
709 {
710  fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
711 }
712 
713 /*
714  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
715  *
716  * This routine ensures that, after returning, the effect of renaming file
717  * persists in case of a crash. A crash while this routine is running will
718  * leave you with either the pre-existing or the moved file in place of the
719  * new file; no mixed state or truncated files are possible.
720  *
721  * It does so by using fsync on the old filename and the possibly existing
722  * target filename before the rename, and the target file and directory after.
723  *
724  * Note that rename() cannot be used across arbitrary directories, as they
725  * might not be on the same filesystem. Therefore this routine does not
726  * support renaming across directories.
727  *
728  * Log errors with the caller specified severity.
729  *
730  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
731  * valid upon return.
732  */
733 int
734 durable_rename(const char *oldfile, const char *newfile, int elevel)
735 {
736  int fd;
737 
738  /*
739  * First fsync the old and target path (if it exists), to ensure that they
740  * are properly persistent on disk. Syncing the target file is not
741  * strictly necessary, but it makes it easier to reason about crashes;
742  * because it's then guaranteed that either source or target file exists
743  * after a crash.
744  */
745  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
746  return -1;
747 
748  fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
749  if (fd < 0)
750  {
751  if (errno != ENOENT)
752  {
753  ereport(elevel,
755  errmsg("could not open file \"%s\": %m", newfile)));
756  return -1;
757  }
758  }
759  else
760  {
761  if (pg_fsync(fd) != 0)
762  {
763  int save_errno;
764 
765  /* close file upon error, might not be in transaction context */
766  save_errno = errno;
768  errno = save_errno;
769 
770  ereport(elevel,
772  errmsg("could not fsync file \"%s\": %m", newfile)));
773  return -1;
774  }
775 
776  if (CloseTransientFile(fd) != 0)
777  {
778  ereport(elevel,
780  errmsg("could not close file \"%s\": %m", newfile)));
781  return -1;
782  }
783  }
784 
785  /* Time to do the real deal... */
786  if (rename(oldfile, newfile) < 0)
787  {
788  ereport(elevel,
790  errmsg("could not rename file \"%s\" to \"%s\": %m",
791  oldfile, newfile)));
792  return -1;
793  }
794 
795  /*
796  * To guarantee renaming the file is persistent, fsync the file with its
797  * new name, and its containing directory.
798  */
799  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
800  return -1;
801 
802  if (fsync_parent_path(newfile, elevel) != 0)
803  return -1;
804 
805  return 0;
806 }
807 
808 /*
809  * durable_unlink -- remove a file in a durable manner
810  *
811  * This routine ensures that, after returning, the effect of removing file
812  * persists in case of a crash. A crash while this routine is running will
813  * leave the system in no mixed state.
814  *
815  * It does so by using fsync on the parent directory of the file after the
816  * actual removal is done.
817  *
818  * Log errors with the severity specified by caller.
819  *
820  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
821  * valid upon return.
822  */
int
durable_unlink(const char *fname, int elevel)
{
	if (unlink(fname) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/*
	 * To guarantee that the removal of the file is persistent, fsync its
	 * parent directory.
	 */
	if (fsync_parent_path(fname, elevel) != 0)
		return -1;

	return 0;
}
844 
845 /*
846  * InitFileAccess --- initialize this module during backend startup
847  *
848  * This is called during either normal or standalone backend start.
849  * It is *not* called in the postmaster.
850  *
851  * Note that this does not initialize temporary file access, that is
852  * separately initialized via InitTemporaryFileAccess().
853  */
854 void
856 {
857  Assert(SizeVfdCache == 0); /* call me only once */
858 
859  /* initialize cache header entry */
860  VfdCache = (Vfd *) malloc(sizeof(Vfd));
861  if (VfdCache == NULL)
862  ereport(FATAL,
863  (errcode(ERRCODE_OUT_OF_MEMORY),
864  errmsg("out of memory")));
865 
866  MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
868 
869  SizeVfdCache = 1;
870 }
871 
872 /*
873  * InitTemporaryFileAccess --- initialize temporary file access during startup
874  *
875  * This is called during either normal or standalone backend start.
876  * It is *not* called in the postmaster.
877  *
878  * This is separate from InitFileAccess() because temporary file cleanup can
879  * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
880  * our reporting has to happen before that. Low level file access should be
881  * available for longer, hence the separate initialization / shutdown of
882  * temporary file handling.
883  */
884 void
886 {
887  Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
888  Assert(!temporary_files_allowed); /* call me only once */
889 
890  /*
891  * Register before-shmem-exit hook to ensure temp files are dropped while
892  * we can still report stats.
893  */
895 
896 #ifdef USE_ASSERT_CHECKING
897  temporary_files_allowed = true;
898 #endif
899 }
900 
901 /*
902  * count_usable_fds --- count how many FDs the system will let us open,
903  * and estimate how many are already open.
904  *
905  * We stop counting if usable_fds reaches max_to_probe. Note: a small
906  * value of max_to_probe might result in an underestimate of already_open;
907  * we must fill in any "gaps" in the set of used FDs before the calculation
908  * of already_open will give the right answer. In practice, max_to_probe
909  * of a couple of dozen should be enough to ensure good results.
910  *
911  * We assume stderr (FD 2) is available for dup'ing. While the calling
912  * script could theoretically close that, it would be a really bad idea,
913  * since then one risks loss of error messages from, e.g., libc.
914  */
915 static void
916 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
917 {
918  int *fd;
919  int size;
920  int used = 0;
921  int highestfd = 0;
922  int j;
923 
924 #ifdef HAVE_GETRLIMIT
925  struct rlimit rlim;
926  int getrlimit_status;
927 #endif
928 
929  size = 1024;
930  fd = (int *) palloc(size * sizeof(int));
931 
932 #ifdef HAVE_GETRLIMIT
933  getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
934  if (getrlimit_status != 0)
935  ereport(WARNING, (errmsg("getrlimit failed: %m")));
936 #endif /* HAVE_GETRLIMIT */
937 
938  /* dup until failure or probe limit reached */
939  for (;;)
940  {
941  int thisfd;
942 
943 #ifdef HAVE_GETRLIMIT
944 
945  /*
946  * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
947  * some platforms
948  */
949  if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
950  break;
951 #endif
952 
953  thisfd = dup(2);
954  if (thisfd < 0)
955  {
956  /* Expect EMFILE or ENFILE, else it's fishy */
957  if (errno != EMFILE && errno != ENFILE)
958  elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
959  break;
960  }
961 
962  if (used >= size)
963  {
964  size *= 2;
965  fd = (int *) repalloc(fd, size * sizeof(int));
966  }
967  fd[used++] = thisfd;
968 
969  if (highestfd < thisfd)
970  highestfd = thisfd;
971 
972  if (used >= max_to_probe)
973  break;
974  }
975 
976  /* release the files we opened */
977  for (j = 0; j < used; j++)
978  close(fd[j]);
979 
980  pfree(fd);
981 
982  /*
983  * Return results. usable_fds is just the number of successful dups. We
984  * assume that the system limit is highestfd+1 (remember 0 is a legal FD
985  * number) and so already_open is highestfd+1 - usable_fds.
986  */
987  *usable_fds = used;
988  *already_open = highestfd + 1 - used;
989 }
990 
991 /*
992  * set_max_safe_fds
993  * Determine number of file descriptors that fd.c is allowed to use
994  */
995 void
997 {
998  int usable_fds;
999  int already_open;
1000 
1001  /*----------
1002  * We want to set max_safe_fds to
1003  * MIN(usable_fds, max_files_per_process - already_open)
1004  * less the slop factor for files that are opened without consulting
1005  * fd.c. This ensures that we won't exceed either max_files_per_process
1006  * or the experimentally-determined EMFILE limit.
1007  *----------
1008  */
1010  &usable_fds, &already_open);
1011 
1012  max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
1013 
1014  /*
1015  * Take off the FDs reserved for system() etc.
1016  */
1018 
1019  /*
1020  * Make sure we still have enough to get by.
1021  */
1022  if (max_safe_fds < FD_MINFREE)
1023  ereport(FATAL,
1024  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1025  errmsg("insufficient file descriptors available to start server process"),
1026  errdetail("System allows %d, server needs at least %d.",
1029 
1030  elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1031  max_safe_fds, usable_fds, already_open);
1032 }
1033 
1034 /*
1035  * Open a file with BasicOpenFilePerm() and pass default file mode for the
1036  * fileMode parameter.
1037  */
1038 int
1039 BasicOpenFile(const char *fileName, int fileFlags)
1040 {
1041  return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1042 }
1043 
1044 /*
1045  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1046  *
1047  * This is exported for use by places that really want a plain kernel FD,
1048  * but need to be proof against running out of FDs. Once an FD has been
1049  * successfully returned, it is the caller's responsibility to ensure that
1050  * it will not be leaked on ereport()! Most users should *not* call this
1051  * routine directly, but instead use the VFD abstraction level, which
1052  * provides protection against descriptor leaks as well as management of
1053  * files that need to be open for more than a short period of time.
1054  *
1055  * Ideally this should be the *only* direct call of open() in the backend.
1056  * In practice, the postmaster calls open() directly, and there are some
1057  * direct open() calls done early in backend startup. Those are OK since
1058  * this module wouldn't have any open files to close at that point anyway.
1059  */
1060 int
1061 BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1062 {
1063  int fd;
1064 
1065 tryAgain:
1066 #ifdef PG_O_DIRECT_USE_F_NOCACHE
1067 
1068  /*
1069  * The value we defined to stand in for O_DIRECT when simulating it with
1070  * F_NOCACHE had better not collide with any of the standard flags.
1071  */
1073  (O_APPEND |
1074  O_CLOEXEC |
1075  O_CREAT |
1076  O_DSYNC |
1077  O_EXCL |
1078  O_RDWR |
1079  O_RDONLY |
1080  O_SYNC |
1081  O_TRUNC |
1082  O_WRONLY)) == 0,
1083  "PG_O_DIRECT value collides with standard flag");
1084  fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1085 #else
1086  fd = open(fileName, fileFlags, fileMode);
1087 #endif
1088 
1089  if (fd >= 0)
1090  {
1091 #ifdef PG_O_DIRECT_USE_F_NOCACHE
1092  if (fileFlags & PG_O_DIRECT)
1093  {
1094  if (fcntl(fd, F_NOCACHE, 1) < 0)
1095  {
1096  int save_errno = errno;
1097 
1098  close(fd);
1099  errno = save_errno;
1100  return -1;
1101  }
1102  }
1103 #endif
1104 
1105  return fd; /* success! */
1106  }
1107 
1108  if (errno == EMFILE || errno == ENFILE)
1109  {
1110  int save_errno = errno;
1111 
1112  ereport(LOG,
1113  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1114  errmsg("out of file descriptors: %m; release and retry")));
1115  errno = 0;
1116  if (ReleaseLruFile())
1117  goto tryAgain;
1118  errno = save_errno;
1119  }
1120 
1121  return -1; /* failure */
1122 }
1123 
1124 /*
1125  * AcquireExternalFD - attempt to reserve an external file descriptor
1126  *
1127  * This should be used by callers that need to hold a file descriptor open
1128  * over more than a short interval, but cannot use any of the other facilities
1129  * provided by this module.
1130  *
1131  * The difference between this and the underlying ReserveExternalFD function
1132  * is that this will report failure (by setting errno and returning false)
1133  * if "too many" external FDs are already reserved. This should be used in
1134  * any code where the total number of FDs to be reserved is not predictable
1135  * and small.
1136  */
1137 bool
1139 {
1140  /*
1141  * We don't want more than max_safe_fds / 3 FDs to be consumed for
1142  * "external" FDs.
1143  */
1144  if (numExternalFDs < max_safe_fds / 3)
1145  {
1147  return true;
1148  }
1149  errno = EMFILE;
1150  return false;
1151 }
1152 
1153 /*
1154  * ReserveExternalFD - report external consumption of a file descriptor
1155  *
1156  * This should be used by callers that need to hold a file descriptor open
1157  * over more than a short interval, but cannot use any of the other facilities
1158  * provided by this module. This just tracks the use of the FD and closes
1159  * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1160  *
1161  * Call this directly only in code where failure to reserve the FD would be
1162  * fatal; for example, the WAL-writing code does so, since the alternative is
1163  * session failure. Also, it's very unwise to do so in code that could
1164  * consume more than one FD per process.
1165  *
1166  * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1167  * available, it doesn't matter too much whether this is called before or
1168  * after actually opening the FD; but doing so beforehand reduces the risk of
1169  * an EMFILE failure if not everybody played nice. In any case, it's solely
1170  * caller's responsibility to keep the external-FD count in sync with reality.
1171  */
1172 void
1174 {
1175  /*
1176  * Release VFDs if needed to stay safe. Because we do this before
1177  * incrementing numExternalFDs, the final state will be as desired, i.e.,
1178  * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1179  */
1180  ReleaseLruFiles();
1181 
1182  numExternalFDs++;
1183 }
1184 
1185 /*
1186  * ReleaseExternalFD - report release of an external file descriptor
1187  *
1188  * This is guaranteed not to change errno, so it can be used in failure paths.
1189  */
1190 void
1192 {
1193  Assert(numExternalFDs > 0);
1194  numExternalFDs--;
1195 }
1196 
1197 
1198 #if defined(FDDEBUG)
1199 
1200 static void
1201 _dump_lru(void)
1202 {
1203  int mru = VfdCache[0].lruLessRecently;
1204  Vfd *vfdP = &VfdCache[mru];
1205  char buf[2048];
1206 
1207  snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
1208  while (mru != 0)
1209  {
1210  mru = vfdP->lruLessRecently;
1211  vfdP = &VfdCache[mru];
1212  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
1213  }
1214  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
1215  elog(LOG, "%s", buf);
1216 }
1217 #endif /* FDDEBUG */
1218 
1219 static void
1221 {
1222  Vfd *vfdP;
1223 
1224  Assert(file != 0);
1225 
1226  DO_DB(elog(LOG, "Delete %d (%s)",
1227  file, VfdCache[file].fileName));
1228  DO_DB(_dump_lru());
1229 
1230  vfdP = &VfdCache[file];
1231 
1234 
1235  DO_DB(_dump_lru());
1236 }
1237 
1238 static void
1240 {
1241  Vfd *vfdP;
1242 
1243  Assert(file != 0);
1244 
1245  DO_DB(elog(LOG, "LruDelete %d (%s)",
1246  file, VfdCache[file].fileName));
1247 
1248  vfdP = &VfdCache[file];
1249 
1250  /*
1251  * Close the file. We aren't expecting this to fail; if it does, better
1252  * to leak the FD than to mess up our internal state.
1253  */
1254  if (close(vfdP->fd) != 0)
1256  "could not close file \"%s\": %m", vfdP->fileName);
1257  vfdP->fd = VFD_CLOSED;
1258  --nfile;
1259 
1260  /* delete the vfd record from the LRU ring */
1261  Delete(file);
1262 }
1263 
1264 static void
1266 {
1267  Vfd *vfdP;
1268 
1269  Assert(file != 0);
1270 
1271  DO_DB(elog(LOG, "Insert %d (%s)",
1272  file, VfdCache[file].fileName));
1273  DO_DB(_dump_lru());
1274 
1275  vfdP = &VfdCache[file];
1276 
1277  vfdP->lruMoreRecently = 0;
1279  VfdCache[0].lruLessRecently = file;
1281 
1282  DO_DB(_dump_lru());
1283 }
1284 
1285 /* returns 0 on success, -1 on re-open failure (with errno set) */
1286 static int
1288 {
1289  Vfd *vfdP;
1290 
1291  Assert(file != 0);
1292 
1293  DO_DB(elog(LOG, "LruInsert %d (%s)",
1294  file, VfdCache[file].fileName));
1295 
1296  vfdP = &VfdCache[file];
1297 
1298  if (FileIsNotOpen(file))
1299  {
1300  /* Close excess kernel FDs. */
1301  ReleaseLruFiles();
1302 
1303  /*
1304  * The open could still fail for lack of file descriptors, eg due to
1305  * overall system file table being full. So, be prepared to release
1306  * another FD if necessary...
1307  */
1308  vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1309  vfdP->fileMode);
1310  if (vfdP->fd < 0)
1311  {
1312  DO_DB(elog(LOG, "re-open failed: %m"));
1313  return -1;
1314  }
1315  else
1316  {
1317  ++nfile;
1318  }
1319  }
1320 
1321  /*
1322  * put it at the head of the Lru ring
1323  */
1324 
1325  Insert(file);
1326 
1327  return 0;
1328 }
1329 
1330 /*
1331  * Release one kernel FD by closing the least-recently-used VFD.
1332  */
1333 static bool
1335 {
1336  DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1337 
1338  if (nfile > 0)
1339  {
1340  /*
1341  * There are opened files and so there should be at least one used vfd
1342  * in the ring.
1343  */
1344  Assert(VfdCache[0].lruMoreRecently != 0);
1345  LruDelete(VfdCache[0].lruMoreRecently);
1346  return true; /* freed a file */
1347  }
1348  return false; /* no files available to free */
1349 }
1350 
1351 /*
1352  * Release kernel FDs as needed to get under the max_safe_fds limit.
1353  * After calling this, it's OK to try to open another file.
1354  */
1355 static void
1357 {
1359  {
1360  if (!ReleaseLruFile())
1361  break;
1362  }
1363 }
1364 
1365 static File
1367 {
1368  Index i;
1369  File file;
1370 
1371  DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1372 
1373  Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1374 
1375  if (VfdCache[0].nextFree == 0)
1376  {
1377  /*
1378  * The free list is empty so it is time to increase the size of the
1379  * array. We choose to double it each time this happens. However,
1380  * there's not much point in starting *real* small.
1381  */
1382  Size newCacheSize = SizeVfdCache * 2;
1383  Vfd *newVfdCache;
1384 
1385  if (newCacheSize < 32)
1386  newCacheSize = 32;
1387 
1388  /*
1389  * Be careful not to clobber VfdCache ptr if realloc fails.
1390  */
1391  newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1392  if (newVfdCache == NULL)
1393  ereport(ERROR,
1394  (errcode(ERRCODE_OUT_OF_MEMORY),
1395  errmsg("out of memory")));
1396  VfdCache = newVfdCache;
1397 
1398  /*
1399  * Initialize the new entries and link them into the free list.
1400  */
1401  for (i = SizeVfdCache; i < newCacheSize; i++)
1402  {
1403  MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1404  VfdCache[i].nextFree = i + 1;
1405  VfdCache[i].fd = VFD_CLOSED;
1406  }
1407  VfdCache[newCacheSize - 1].nextFree = 0;
1409 
1410  /*
1411  * Record the new size
1412  */
1413  SizeVfdCache = newCacheSize;
1414  }
1415 
1416  file = VfdCache[0].nextFree;
1417 
1418  VfdCache[0].nextFree = VfdCache[file].nextFree;
1419 
1420  return file;
1421 }
1422 
1423 static void
1425 {
1426  Vfd *vfdP = &VfdCache[file];
1427 
1428  DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1429  file, vfdP->fileName ? vfdP->fileName : ""));
1430 
1431  if (vfdP->fileName != NULL)
1432  {
1433  free(vfdP->fileName);
1434  vfdP->fileName = NULL;
1435  }
1436  vfdP->fdstate = 0x0;
1437 
1438  vfdP->nextFree = VfdCache[0].nextFree;
1439  VfdCache[0].nextFree = file;
1440 }
1441 
1442 /* returns 0 on success, -1 on re-open failure (with errno set) */
1443 static int
1445 {
1446  int returnValue;
1447 
1448  DO_DB(elog(LOG, "FileAccess %d (%s)",
1449  file, VfdCache[file].fileName));
1450 
1451  /*
1452  * Is the file open? If not, open it and put it at the head of the LRU
1453  * ring (possibly closing the least recently used file to get an FD).
1454  */
1455 
1456  if (FileIsNotOpen(file))
1457  {
1458  returnValue = LruInsert(file);
1459  if (returnValue != 0)
1460  return returnValue;
1461  }
1462  else if (VfdCache[0].lruLessRecently != file)
1463  {
1464  /*
1465  * We now know that the file is open and that it is not the last one
1466  * accessed, so we need to move it to the head of the Lru ring.
1467  */
1468 
1469  Delete(file);
1470  Insert(file);
1471  }
1472 
1473  return 0;
1474 }
1475 
1476 /*
1477  * Called whenever a temporary file is deleted to report its size.
1478  */
1479 static void
1480 ReportTemporaryFileUsage(const char *path, off_t size)
1481 {
1482  pgstat_report_tempfile(size);
1483 
1484  if (log_temp_files >= 0)
1485  {
1486  if ((size / 1024) >= log_temp_files)
1487  ereport(LOG,
1488  (errmsg("temporary file: path \"%s\", size %lu",
1489  path, (unsigned long) size)));
1490  }
1491 }
1492 
1493 /*
1494  * Called to register a temporary file for automatic close.
1495  * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1496  * before the file was opened.
1497  */
1498 static void
1500 {
1503 
1504  /* Backup mechanism for closing at end of xact. */
1507 }
1508 
1509 /*
1510  * Called when we get a shared invalidation message on some relation.
1511  */
1512 #ifdef NOT_USED
1513 void
1514 FileInvalidate(File file)
1515 {
1516  Assert(FileIsValid(file));
1517  if (!FileIsNotOpen(file))
1518  LruDelete(file);
1519 }
1520 #endif
1521 
1522 /*
1523  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1524  * fileMode parameter.
1525  */
1526 File
1527 PathNameOpenFile(const char *fileName, int fileFlags)
1528 {
1529  return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1530 }
1531 
1532 /*
1533  * open a file in an arbitrary directory
1534  *
1535  * NB: if the passed pathname is relative (which it usually is),
1536  * it will be interpreted relative to the process' working directory
1537  * (which should always be $PGDATA when this code is running).
1538  */
1539 File
1540 PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1541 {
1542  char *fnamecopy;
1543  File file;
1544  Vfd *vfdP;
1545 
1546  DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1547  fileName, fileFlags, fileMode));
1548 
1549  /*
1550  * We need a malloc'd copy of the file name; fail cleanly if no room.
1551  */
1552  fnamecopy = strdup(fileName);
1553  if (fnamecopy == NULL)
1554  ereport(ERROR,
1555  (errcode(ERRCODE_OUT_OF_MEMORY),
1556  errmsg("out of memory")));
1557 
1558  file = AllocateVfd();
1559  vfdP = &VfdCache[file];
1560 
1561  /* Close excess kernel FDs. */
1562  ReleaseLruFiles();
1563 
1564  /*
1565  * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1566  * client shouldn't be expected to know which kernel descriptors are
1567  * currently open, so it wouldn't make sense for them to be inherited by
1568  * executed subprograms.
1569  */
1570  fileFlags |= O_CLOEXEC;
1571 
1572  vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1573 
1574  if (vfdP->fd < 0)
1575  {
1576  int save_errno = errno;
1577 
1578  FreeVfd(file);
1579  free(fnamecopy);
1580  errno = save_errno;
1581  return -1;
1582  }
1583  ++nfile;
1584  DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1585  vfdP->fd));
1586 
1587  vfdP->fileName = fnamecopy;
1588  /* Saved flags are adjusted to be OK for re-opening file */
1589  vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1590  vfdP->fileMode = fileMode;
1591  vfdP->fileSize = 0;
1592  vfdP->fdstate = 0x0;
1593  vfdP->resowner = NULL;
1594 
1595  Insert(file);
1596 
1597  return file;
1598 }
1599 
1600 /*
1601  * Create directory 'directory'. If necessary, create 'basedir', which must
1602  * be the directory above it. This is designed for creating the top-level
1603  * temporary directory on demand before creating a directory underneath it.
1604  * Do nothing if the directory already exists.
1605  *
1606  * Directories created within the top-level temporary directory should begin
1607  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1608  * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1609  * that do not need any particular prefix.
1610 */
1611 void
1613 {
1614  if (MakePGDirectory(directory) < 0)
1615  {
1616  if (errno == EEXIST)
1617  return;
1618 
1619  /*
1620  * Failed. Try to create basedir first in case it's missing. Tolerate
1621  * EEXIST to close a race against another process following the same
1622  * algorithm.
1623  */
1624  if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1625  ereport(ERROR,
1627  errmsg("cannot create temporary directory \"%s\": %m",
1628  basedir)));
1629 
1630  /* Try again. */
1631  if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1632  ereport(ERROR,
1634  errmsg("cannot create temporary subdirectory \"%s\": %m",
1635  directory)));
1636  }
1637 }
1638 
1639 /*
1640  * Delete a directory and everything in it, if it exists.
1641  */
1642 void
1643 PathNameDeleteTemporaryDir(const char *dirname)
1644 {
1645  struct stat statbuf;
1646 
1647  /* Silently ignore missing directory. */
1648  if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1649  return;
1650 
1651  /*
1652  * Currently, walkdir doesn't offer a way for our passed in function to
1653  * maintain state. Perhaps it should, so that we could tell the caller
1654  * whether this operation succeeded or failed. Since this operation is
1655  * used in a cleanup path, we wouldn't actually behave differently: we'll
1656  * just log failures.
1657  */
1658  walkdir(dirname, unlink_if_exists_fname, false, LOG);
1659 }
1660 
1661 /*
1662  * Open a temporary file that will disappear when we close it.
1663  *
1664  * This routine takes care of generating an appropriate tempfile name.
1665  * There's no need to pass in fileFlags or fileMode either, since only
1666  * one setting makes any sense for a temp file.
1667  *
1668  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1669  * to ensure it's closed and deleted when it's no longer needed, typically at
1670  * the end-of-transaction. In most cases, you don't want temporary files to
1671  * outlive the transaction that created them, so this should be false -- but
1672  * if you need "somewhat" temporary storage, this might be useful. In either
1673  * case, the file is removed when the File is explicitly closed.
1674  */
1675 File
1676 OpenTemporaryFile(bool interXact)
1677 {
1678  File file = 0;
1679 
1680  Assert(temporary_files_allowed); /* check temp file access is up */
1681 
1682  /*
1683  * Make sure the current resource owner has space for this File before we
1684  * open it, if we'll be registering it below.
1685  */
1686  if (!interXact)
1688 
1689  /*
1690  * If some temp tablespace(s) have been given to us, try to use the next
1691  * one. If a given tablespace can't be found, we silently fall back to
1692  * the database's default tablespace.
1693  *
1694  * BUT: if the temp file is slated to outlive the current transaction,
1695  * force it into the database's default tablespace, so that it will not
1696  * pose a threat to possible tablespace drop attempts.
1697  */
1698  if (numTempTableSpaces > 0 && !interXact)
1699  {
1700  Oid tblspcOid = GetNextTempTableSpace();
1701 
1702  if (OidIsValid(tblspcOid))
1703  file = OpenTemporaryFileInTablespace(tblspcOid, false);
1704  }
1705 
1706  /*
1707  * If not, or if tablespace is bad, create in database's default
1708  * tablespace. MyDatabaseTableSpace should normally be set before we get
1709  * here, but just in case it isn't, fall back to pg_default tablespace.
1710  */
1711  if (file <= 0)
1714  DEFAULTTABLESPACE_OID,
1715  true);
1716 
1717  /* Mark it for deletion at close and temporary file size limit */
1719 
1720  /* Register it with the current resource owner */
1721  if (!interXact)
1722  RegisterTemporaryFile(file);
1723 
1724  return file;
1725 }
1726 
1727 /*
1728  * Return the path of the temp directory in a given tablespace.
1729  */
1730 void
1732 {
1733  /*
1734  * Identify the tempfile directory for this tablespace.
1735  *
1736  * If someone tries to specify pg_global, use pg_default instead.
1737  */
1738  if (tablespace == InvalidOid ||
1739  tablespace == DEFAULTTABLESPACE_OID ||
1740  tablespace == GLOBALTABLESPACE_OID)
1741  snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1742  else
1743  {
1744  /* All other tablespaces are accessed via symlinks */
1745  snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1748  }
1749 }
1750 
1751 /*
1752  * Open a temporary file in a specific tablespace.
1753  * Subroutine for OpenTemporaryFile, which see for details.
1754  */
1755 static File
1756 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1757 {
1758  char tempdirpath[MAXPGPATH];
1759  char tempfilepath[MAXPGPATH];
1760  File file;
1761 
1762  TempTablespacePath(tempdirpath, tblspcOid);
1763 
1764  /*
1765  * Generate a tempfile name that should be unique within the current
1766  * database instance.
1767  */
1768  snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1769  tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1770 
1771  /*
1772  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1773  * temp file that can be reused.
1774  */
1775  file = PathNameOpenFile(tempfilepath,
1776  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1777  if (file <= 0)
1778  {
1779  /*
1780  * We might need to create the tablespace's tempfile directory, if no
1781  * one has yet done so.
1782  *
1783  * Don't check for an error from MakePGDirectory; it could fail if
1784  * someone else just did the same thing. If it doesn't work then
1785  * we'll bomb out on the second create attempt, instead.
1786  */
1787  (void) MakePGDirectory(tempdirpath);
1788 
1789  file = PathNameOpenFile(tempfilepath,
1790  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1791  if (file <= 0 && rejectError)
1792  elog(ERROR, "could not create temporary file \"%s\": %m",
1793  tempfilepath);
1794  }
1795 
1796  return file;
1797 }
1798 
1799 
1800 /*
1801  * Create a new file. The directory containing it must already exist. Files
1802  * created this way are subject to temp_file_limit and are automatically
1803  * closed at end of transaction, but are not automatically deleted on close
1804  * because they are intended to be shared between cooperating backends.
1805  *
1806  * If the file is inside the top-level temporary directory, its name should
1807  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1808  * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1809  * inside a directory created with PathNameCreateTemporaryDir(), in which case
1810  * the prefix isn't needed.
1811  */
1812 File
1813 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1814 {
1815  File file;
1816 
1817  Assert(temporary_files_allowed); /* check temp file access is up */
1818 
1820 
1821  /*
1822  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1823  * temp file that can be reused.
1824  */
1825  file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1826  if (file <= 0)
1827  {
1828  if (error_on_failure)
1829  ereport(ERROR,
1831  errmsg("could not create temporary file \"%s\": %m",
1832  path)));
1833  else
1834  return file;
1835  }
1836 
1837  /* Mark it for temp_file_limit accounting. */
1839 
1840  /* Register it for automatic close. */
1841  RegisterTemporaryFile(file);
1842 
1843  return file;
1844 }
1845 
1846 /*
1847  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1848  * another backend. Files opened this way don't count against the
1849  * temp_file_limit of the caller, are automatically closed at the end of the
1850  * transaction but are not deleted on close.
1851  */
1852 File
1853 PathNameOpenTemporaryFile(const char *path, int mode)
1854 {
1855  File file;
1856 
1857  Assert(temporary_files_allowed); /* check temp file access is up */
1858 
1860 
1861  file = PathNameOpenFile(path, mode | PG_BINARY);
1862 
1863  /* If no such file, then we don't raise an error. */
1864  if (file <= 0 && errno != ENOENT)
1865  ereport(ERROR,
1867  errmsg("could not open temporary file \"%s\": %m",
1868  path)));
1869 
1870  if (file > 0)
1871  {
1872  /* Register it for automatic close. */
1873  RegisterTemporaryFile(file);
1874  }
1875 
1876  return file;
1877 }
1878 
1879 /*
1880  * Delete a file by pathname. Return true if the file existed, false if
1881  * it didn't.
1882  */
1883 bool
1884 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1885 {
1886  struct stat filestats;
1887  int stat_errno;
1888 
1889  /* Get the final size for pgstat reporting. */
1890  if (stat(path, &filestats) != 0)
1891  stat_errno = errno;
1892  else
1893  stat_errno = 0;
1894 
1895  /*
1896  * Unlike FileClose's automatic file deletion code, we tolerate
1897  * non-existence to support BufFileDeleteFileSet which doesn't know how
1898  * many segments it has to delete until it runs out.
1899  */
1900  if (stat_errno == ENOENT)
1901  return false;
1902 
1903  if (unlink(path) < 0)
1904  {
1905  if (errno != ENOENT)
1906  ereport(error_on_failure ? ERROR : LOG,
1908  errmsg("could not unlink temporary file \"%s\": %m",
1909  path)));
1910  return false;
1911  }
1912 
1913  if (stat_errno == 0)
1914  ReportTemporaryFileUsage(path, filestats.st_size);
1915  else
1916  {
1917  errno = stat_errno;
1918  ereport(LOG,
1920  errmsg("could not stat file \"%s\": %m", path)));
1921  }
1922 
1923  return true;
1924 }
1925 
1926 /*
1927  * close a file when done with it
1928  */
1929 void
1931 {
1932  Vfd *vfdP;
1933 
1934  Assert(FileIsValid(file));
1935 
1936  DO_DB(elog(LOG, "FileClose: %d (%s)",
1937  file, VfdCache[file].fileName));
1938 
1939  vfdP = &VfdCache[file];
1940 
1941  if (!FileIsNotOpen(file))
1942  {
1943  /* close the file */
1944  if (close(vfdP->fd) != 0)
1945  {
1946  /*
1947  * We may need to panic on failure to close non-temporary files;
1948  * see LruDelete.
1949  */
1951  "could not close file \"%s\": %m", vfdP->fileName);
1952  }
1953 
1954  --nfile;
1955  vfdP->fd = VFD_CLOSED;
1956 
1957  /* remove the file from the lru ring */
1958  Delete(file);
1959  }
1960 
1961  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1962  {
1963  /* Subtract its size from current usage (do first in case of error) */
1964  temporary_files_size -= vfdP->fileSize;
1965  vfdP->fileSize = 0;
1966  }
1967 
1968  /*
1969  * Delete the file if it was temporary, and make a log entry if wanted
1970  */
1971  if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
1972  {
1973  struct stat filestats;
1974  int stat_errno;
1975 
1976  /*
1977  * If we get an error, as could happen within the ereport/elog calls,
1978  * we'll come right back here during transaction abort. Reset the
1979  * flag to ensure that we can't get into an infinite loop. This code
1980  * is arranged to ensure that the worst-case consequence is failing to
1981  * emit log message(s), not failing to attempt the unlink.
1982  */
1983  vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
1984 
1985 
1986  /* first try the stat() */
1987  if (stat(vfdP->fileName, &filestats))
1988  stat_errno = errno;
1989  else
1990  stat_errno = 0;
1991 
1992  /* in any case do the unlink */
1993  if (unlink(vfdP->fileName))
1994  ereport(LOG,
1996  errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
1997 
1998  /* and last report the stat results */
1999  if (stat_errno == 0)
2000  ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2001  else
2002  {
2003  errno = stat_errno;
2004  ereport(LOG,
2006  errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2007  }
2008  }
2009 
2010  /* Unregister it from the resource owner */
2011  if (vfdP->resowner)
2012  ResourceOwnerForgetFile(vfdP->resowner, file);
2013 
2014  /*
2015  * Return the Vfd slot to the free list
2016  */
2017  FreeVfd(file);
2018 }
2019 
2020 /*
2021  * FilePrefetch - initiate asynchronous read of a given range of the file.
2022  *
2023  * Currently the only implementation of this function is using posix_fadvise
2024  * which is the simplest standardized interface that accomplishes this.
2025  * We could add an implementation using libaio in the future; but note that
2026  * this API is inappropriate for libaio, which wants to have a buffer provided
2027  * to read into.
2028  */
2029 int
2030 FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
2031 {
2032 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2033  int returnCode;
2034 
2035  Assert(FileIsValid(file));
2036 
2037  DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2038  file, VfdCache[file].fileName,
2039  (int64) offset, (int64) amount));
2040 
2041  returnCode = FileAccess(file);
2042  if (returnCode < 0)
2043  return returnCode;
2044 
2045 retry:
2046  pgstat_report_wait_start(wait_event_info);
2047  returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2048  POSIX_FADV_WILLNEED);
2050 
2051  if (returnCode == EINTR)
2052  goto retry;
2053 
2054  return returnCode;
2055 #else
2056  Assert(FileIsValid(file));
2057  return 0;
2058 #endif
2059 }
2060 
2061 void
2062 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
2063 {
2064  int returnCode;
2065 
2066  Assert(FileIsValid(file));
2067 
2068  DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2069  file, VfdCache[file].fileName,
2070  (int64) offset, (int64) nbytes));
2071 
2072  if (nbytes <= 0)
2073  return;
2074 
2075  if (VfdCache[file].fileFlags & PG_O_DIRECT)
2076  return;
2077 
2078  returnCode = FileAccess(file);
2079  if (returnCode < 0)
2080  return;
2081 
2082  pgstat_report_wait_start(wait_event_info);
2083  pg_flush_data(VfdCache[file].fd, offset, nbytes);
2085 }
2086 
2087 int
2088 FileRead(File file, void *buffer, size_t amount, off_t offset,
2089  uint32 wait_event_info)
2090 {
2091  int returnCode;
2092  Vfd *vfdP;
2093 
2094  Assert(FileIsValid(file));
2095 
2096  DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %zu %p",
2097  file, VfdCache[file].fileName,
2098  (int64) offset,
2099  amount, buffer));
2100 
2101  returnCode = FileAccess(file);
2102  if (returnCode < 0)
2103  return returnCode;
2104 
2105  vfdP = &VfdCache[file];
2106 
2107 retry:
2108  pgstat_report_wait_start(wait_event_info);
2109  returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
2111 
2112  if (returnCode < 0)
2113  {
2114  /*
2115  * Windows may run out of kernel buffers and return "Insufficient
2116  * system resources" error. Wait a bit and retry to solve it.
2117  *
2118  * It is rumored that EINTR is also possible on some Unix filesystems,
2119  * in which case immediate retry is indicated.
2120  */
2121 #ifdef WIN32
2122  DWORD error = GetLastError();
2123 
2124  switch (error)
2125  {
2126  case ERROR_NO_SYSTEM_RESOURCES:
2127  pg_usleep(1000L);
2128  errno = EINTR;
2129  break;
2130  default:
2131  _dosmaperr(error);
2132  break;
2133  }
2134 #endif
2135  /* OK to retry if interrupted */
2136  if (errno == EINTR)
2137  goto retry;
2138  }
2139 
2140  return returnCode;
2141 }
2142 
2143 int
2144 FileWrite(File file, const void *buffer, size_t amount, off_t offset,
2145  uint32 wait_event_info)
2146 {
2147  int returnCode;
2148  Vfd *vfdP;
2149 
2150  Assert(FileIsValid(file));
2151 
2152  DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %zu %p",
2153  file, VfdCache[file].fileName,
2154  (int64) offset,
2155  amount, buffer));
2156 
2157  returnCode = FileAccess(file);
2158  if (returnCode < 0)
2159  return returnCode;
2160 
2161  vfdP = &VfdCache[file];
2162 
2163  /*
2164  * If enforcing temp_file_limit and it's a temp file, check to see if the
2165  * write would overrun temp_file_limit, and throw error if so. Note: it's
2166  * really a modularity violation to throw error here; we should set errno
2167  * and return -1. However, there's no way to report a suitable error
2168  * message if we do that. All current callers would just throw error
2169  * immediately anyway, so this is safe at present.
2170  */
2171  if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2172  {
2173  off_t past_write = offset + amount;
2174 
2175  if (past_write > vfdP->fileSize)
2176  {
2177  uint64 newTotal = temporary_files_size;
2178 
2179  newTotal += past_write - vfdP->fileSize;
2180  if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2181  ereport(ERROR,
2182  (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2183  errmsg("temporary file size exceeds temp_file_limit (%dkB)",
2184  temp_file_limit)));
2185  }
2186  }
2187 
2188 retry:
2189  errno = 0;
2190  pgstat_report_wait_start(wait_event_info);
2191  returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
2193 
2194  /* if write didn't set errno, assume problem is no disk space */
2195  if (returnCode != amount && errno == 0)
2196  errno = ENOSPC;
2197 
2198  if (returnCode >= 0)
2199  {
2200  /*
2201  * Maintain fileSize and temporary_files_size if it's a temp file.
2202  */
2203  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2204  {
2205  off_t past_write = offset + amount;
2206 
2207  if (past_write > vfdP->fileSize)
2208  {
2209  temporary_files_size += past_write - vfdP->fileSize;
2210  vfdP->fileSize = past_write;
2211  }
2212  }
2213  }
2214  else
2215  {
2216  /*
2217  * See comments in FileRead()
2218  */
2219 #ifdef WIN32
2220  DWORD error = GetLastError();
2221 
2222  switch (error)
2223  {
2224  case ERROR_NO_SYSTEM_RESOURCES:
2225  pg_usleep(1000L);
2226  errno = EINTR;
2227  break;
2228  default:
2229  _dosmaperr(error);
2230  break;
2231  }
2232 #endif
2233  /* OK to retry if interrupted */
2234  if (errno == EINTR)
2235  goto retry;
2236  }
2237 
2238  return returnCode;
2239 }
2240 
2241 int
2242 FileSync(File file, uint32 wait_event_info)
2243 {
2244  int returnCode;
2245 
2246  Assert(FileIsValid(file));
2247 
2248  DO_DB(elog(LOG, "FileSync: %d (%s)",
2249  file, VfdCache[file].fileName));
2250 
2251  returnCode = FileAccess(file);
2252  if (returnCode < 0)
2253  return returnCode;
2254 
2255  pgstat_report_wait_start(wait_event_info);
2256  returnCode = pg_fsync(VfdCache[file].fd);
2258 
2259  return returnCode;
2260 }
2261 
2262 /*
2263  * Zero a region of the file.
2264  *
2265  * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2266  * appropriate error.
2267  */
2268 int
2269 FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
2270 {
2271  int returnCode;
2272  ssize_t written;
2273 
2274  Assert(FileIsValid(file));
2275 
2276  DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2277  file, VfdCache[file].fileName,
2278  (int64) offset, (int64) amount));
2279 
2280  returnCode = FileAccess(file);
2281  if (returnCode < 0)
2282  return returnCode;
2283 
2284  pgstat_report_wait_start(wait_event_info);
2285  written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2287 
2288  if (written < 0)
2289  return -1;
2290  else if (written != amount)
2291  {
2292  /* if errno is unset, assume problem is no disk space */
2293  if (errno == 0)
2294  errno = ENOSPC;
2295  return -1;
2296  }
2297 
2298  return 0;
2299 }
2300 
2301 /*
2302  * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
2303  * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
2304  * use FileZero() instead.
2305  *
2306  * Note that at least glibc() implements posix_fallocate() in userspace if not
2307  * implemented by the filesystem. That's not the case for all environments
2308  * though.
2309  *
2310  * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2311  * appropriate error.
2312  */
2313 int
2314 FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
2315 {
2316 #ifdef HAVE_POSIX_FALLOCATE
2317  int returnCode;
2318 
2319  Assert(FileIsValid(file));
2320 
2321  DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2322  file, VfdCache[file].fileName,
2323  (int64) offset, (int64) amount));
2324 
2325  returnCode = FileAccess(file);
2326  if (returnCode < 0)
2327  return -1;
2328 
2329 retry:
2330  pgstat_report_wait_start(wait_event_info);
2331  returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2333 
2334  if (returnCode == 0)
2335  return 0;
2336  else if (returnCode == EINTR)
2337  goto retry;
2338 
2339  /* for compatibility with %m printing etc */
2340  errno = returnCode;
2341 
2342  /*
2343  * Return in cases of a "real" failure, if fallocate is not supported,
2344  * fall through to the FileZero() backed implementation.
2345  */
2346  if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
2347  return -1;
2348 #endif
2349 
2350  return FileZero(file, offset, amount, wait_event_info);
2351 }
2352 
2353 off_t
2355 {
2356  Assert(FileIsValid(file));
2357 
2358  DO_DB(elog(LOG, "FileSize %d (%s)",
2359  file, VfdCache[file].fileName));
2360 
2361  if (FileIsNotOpen(file))
2362  {
2363  if (FileAccess(file) < 0)
2364  return (off_t) -1;
2365  }
2366 
2367  return lseek(VfdCache[file].fd, 0, SEEK_END);
2368 }
2369 
2370 int
2371 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2372 {
2373  int returnCode;
2374 
2375  Assert(FileIsValid(file));
2376 
2377  DO_DB(elog(LOG, "FileTruncate %d (%s)",
2378  file, VfdCache[file].fileName));
2379 
2380  returnCode = FileAccess(file);
2381  if (returnCode < 0)
2382  return returnCode;
2383 
2384  pgstat_report_wait_start(wait_event_info);
2385  returnCode = pg_ftruncate(VfdCache[file].fd, offset);
2387 
2388  if (returnCode == 0 && VfdCache[file].fileSize > offset)
2389  {
2390  /* adjust our state for truncation of a temp file */
2391  Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2392  temporary_files_size -= VfdCache[file].fileSize - offset;
2393  VfdCache[file].fileSize = offset;
2394  }
2395 
2396  return returnCode;
2397 }
2398 
2399 /*
2400  * Return the pathname associated with an open file.
2401  *
2402  * The returned string points to an internal buffer, which is valid until
2403  * the file is closed.
2404  */
2405 char *
2407 {
2408  Assert(FileIsValid(file));
2409 
2410  return VfdCache[file].fileName;
2411 }
2412 
2413 /*
2414  * Return the raw file descriptor of an opened file.
2415  *
2416  * The returned file descriptor will be valid until the file is closed, but
2417  * there are a lot of things that can make that happen. So the caller should
2418  * be careful not to do much of anything else before it finishes using the
2419  * returned file descriptor.
2420  */
2421 int
2423 {
2424  Assert(FileIsValid(file));
2425  return VfdCache[file].fd;
2426 }
2427 
2428 /*
2429  * FileGetRawFlags - returns the file flags on open(2)
2430  */
2431 int
2433 {
2434  Assert(FileIsValid(file));
2435  return VfdCache[file].fileFlags;
2436 }
2437 
2438 /*
2439  * FileGetRawMode - returns the mode bitmask passed to open(2)
2440  */
2441 mode_t
2443 {
2444  Assert(FileIsValid(file));
2445  return VfdCache[file].fileMode;
2446 }
2447 
2448 /*
2449  * Make room for another allocatedDescs[] array entry if needed and possible.
2450  * Returns true if an array element is available.
2451  */
2452 static bool
2454 {
2455  AllocateDesc *newDescs;
2456  int newMax;
2457 
2458  /* Quick out if array already has a free slot. */
2460  return true;
2461 
2462  /*
2463  * If the array hasn't yet been created in the current process, initialize
2464  * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2465  * we will ever need, anyway. We don't want to look at max_safe_fds
2466  * immediately because set_max_safe_fds() may not have run yet.
2467  */
2468  if (allocatedDescs == NULL)
2469  {
2470  newMax = FD_MINFREE / 3;
2471  newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2472  /* Out of memory already? Treat as fatal error. */
2473  if (newDescs == NULL)
2474  ereport(ERROR,
2475  (errcode(ERRCODE_OUT_OF_MEMORY),
2476  errmsg("out of memory")));
2477  allocatedDescs = newDescs;
2478  maxAllocatedDescs = newMax;
2479  return true;
2480  }
2481 
2482  /*
2483  * Consider enlarging the array beyond the initial allocation used above.
2484  * By the time this happens, max_safe_fds should be known accurately.
2485  *
2486  * We mustn't let allocated descriptors hog all the available FDs, and in
2487  * practice we'd better leave a reasonable number of FDs for VFD use. So
2488  * set the maximum to max_safe_fds / 3. (This should certainly be at
2489  * least as large as the initial size, FD_MINFREE / 3, so we aren't
2490  * tightening the restriction here.) Recall that "external" FDs are
2491  * allowed to consume another third of max_safe_fds.
2492  */
2493  newMax = max_safe_fds / 3;
2494  if (newMax > maxAllocatedDescs)
2495  {
2496  newDescs = (AllocateDesc *) realloc(allocatedDescs,
2497  newMax * sizeof(AllocateDesc));
2498  /* Treat out-of-memory as a non-fatal error. */
2499  if (newDescs == NULL)
2500  return false;
2501  allocatedDescs = newDescs;
2502  maxAllocatedDescs = newMax;
2503  return true;
2504  }
2505 
2506  /* Can't enlarge allocatedDescs[] any more. */
2507  return false;
2508 }
2509 
2510 /*
2511  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2512  * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2513  * necessary to open the file. When done, call FreeFile rather than fclose.
2514  *
2515  * Note that files that will be open for any significant length of time
2516  * should NOT be handled this way, since they cannot share kernel file
2517  * descriptors with other files; there is grave risk of running out of FDs
2518  * if anyone locks down too many FDs. Most callers of this routine are
2519  * simply reading a config file that they will read and close immediately.
2520  *
2521  * fd.c will automatically close all files opened with AllocateFile at
2522  * transaction commit or abort; this prevents FD leakage if a routine
2523  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2524  *
2525  * Ideally this should be the *only* direct call of fopen() in the backend.
2526  */
2527 FILE *
2528 AllocateFile(const char *name, const char *mode)
2529 {
2530  FILE *file;
2531 
2532  DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2534 
2535  /* Can we allocate another non-virtual FD? */
2536  if (!reserveAllocatedDesc())
2537  ereport(ERROR,
2538  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2539  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2540  maxAllocatedDescs, name)));
2541 
2542  /* Close excess kernel FDs. */
2543  ReleaseLruFiles();
2544 
2545 TryAgain:
2546  if ((file = fopen(name, mode)) != NULL)
2547  {
2549 
2550  desc->kind = AllocateDescFile;
2551  desc->desc.file = file;
2554  return desc->desc.file;
2555  }
2556 
2557  if (errno == EMFILE || errno == ENFILE)
2558  {
2559  int save_errno = errno;
2560 
2561  ereport(LOG,
2562  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2563  errmsg("out of file descriptors: %m; release and retry")));
2564  errno = 0;
2565  if (ReleaseLruFile())
2566  goto TryAgain;
2567  errno = save_errno;
2568  }
2569 
2570  return NULL;
2571 }
2572 
2573 /*
2574  * Open a file with OpenTransientFilePerm() and pass default file mode for
2575  * the fileMode parameter.
2576  */
2577 int
2578 OpenTransientFile(const char *fileName, int fileFlags)
2579 {
2580  return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2581 }
2582 
2583 /*
2584  * Like AllocateFile, but returns an unbuffered fd like open(2)
2585  */
2586 int
2587 OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2588 {
2589  int fd;
2590 
2591  DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2592  numAllocatedDescs, fileName));
2593 
2594  /* Can we allocate another non-virtual FD? */
2595  if (!reserveAllocatedDesc())
2596  ereport(ERROR,
2597  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2598  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2599  maxAllocatedDescs, fileName)));
2600 
2601  /* Close excess kernel FDs. */
2602  ReleaseLruFiles();
2603 
2604  fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2605 
2606  if (fd >= 0)
2607  {
2609 
2610  desc->kind = AllocateDescRawFD;
2611  desc->desc.fd = fd;
2614 
2615  return fd;
2616  }
2617 
2618  return -1; /* failure */
2619 }
2620 
2621 /*
2622  * Routines that want to initiate a pipe stream should use OpenPipeStream
2623  * rather than plain popen(). This lets fd.c deal with freeing FDs if
2624  * necessary. When done, call ClosePipeStream rather than pclose.
2625  *
2626  * This function also ensures that the popen'd program is run with default
2627  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2628  * uses. This ensures desirable response to, eg, closing a read pipe early.
2629  */
2630 FILE *
2631 OpenPipeStream(const char *command, const char *mode)
2632 {
2633  FILE *file;
2634  int save_errno;
2635 
2636  DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2637  numAllocatedDescs, command));
2638 
2639  /* Can we allocate another non-virtual FD? */
2640  if (!reserveAllocatedDesc())
2641  ereport(ERROR,
2642  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2643  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2644  maxAllocatedDescs, command)));
2645 
2646  /* Close excess kernel FDs. */
2647  ReleaseLruFiles();
2648 
2649 TryAgain:
2650  fflush(NULL);
2652  errno = 0;
2653  file = popen(command, mode);
2654  save_errno = errno;
2656  errno = save_errno;
2657  if (file != NULL)
2658  {
2660 
2661  desc->kind = AllocateDescPipe;
2662  desc->desc.file = file;
2665  return desc->desc.file;
2666  }
2667 
2668  if (errno == EMFILE || errno == ENFILE)
2669  {
2670  ereport(LOG,
2671  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2672  errmsg("out of file descriptors: %m; release and retry")));
2673  if (ReleaseLruFile())
2674  goto TryAgain;
2675  errno = save_errno;
2676  }
2677 
2678  return NULL;
2679 }
2680 
2681 /*
2682  * Free an AllocateDesc of any type.
2683  *
2684  * The argument *must* point into the allocatedDescs[] array.
2685  */
2686 static int
2688 {
2689  int result;
2690 
2691  /* Close the underlying object */
2692  switch (desc->kind)
2693  {
2694  case AllocateDescFile:
2695  result = fclose(desc->desc.file);
2696  break;
2697  case AllocateDescPipe:
2698  result = pclose(desc->desc.file);
2699  break;
2700  case AllocateDescDir:
2701  result = closedir(desc->desc.dir);
2702  break;
2703  case AllocateDescRawFD:
2704  result = close(desc->desc.fd);
2705  break;
2706  default:
2707  elog(ERROR, "AllocateDesc kind not recognized");
2708  result = 0; /* keep compiler quiet */
2709  break;
2710  }
2711 
2712  /* Compact storage in the allocatedDescs array */
2715 
2716  return result;
2717 }
2718 
2719 /*
2720  * Close a file returned by AllocateFile.
2721  *
2722  * Note we do not check fclose's return value --- it is up to the caller
2723  * to handle close errors.
2724  */
2725 int
2726 FreeFile(FILE *file)
2727 {
2728  int i;
2729 
2730  DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2731 
2732  /* Remove file from list of allocated files, if it's present */
2733  for (i = numAllocatedDescs; --i >= 0;)
2734  {
2735  AllocateDesc *desc = &allocatedDescs[i];
2736 
2737  if (desc->kind == AllocateDescFile && desc->desc.file == file)
2738  return FreeDesc(desc);
2739  }
2740 
2741  /* Only get here if someone passes us a file not in allocatedDescs */
2742  elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2743 
2744  return fclose(file);
2745 }
2746 
2747 /*
2748  * Close a file returned by OpenTransientFile.
2749  *
2750  * Note we do not check close's return value --- it is up to the caller
2751  * to handle close errors.
2752  */
2753 int
2755 {
2756  int i;
2757 
2758  DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2759 
2760  /* Remove fd from list of allocated files, if it's present */
2761  for (i = numAllocatedDescs; --i >= 0;)
2762  {
2763  AllocateDesc *desc = &allocatedDescs[i];
2764 
2765  if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2766  return FreeDesc(desc);
2767  }
2768 
2769  /* Only get here if someone passes us a file not in allocatedDescs */
2770  elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2771 
2772  return close(fd);
2773 }
2774 
2775 /*
2776  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2777  * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2778  * necessary to open the directory, and with closing it after an elog.
2779  * When done, call FreeDir rather than closedir.
2780  *
2781  * Returns NULL, with errno set, on failure. Note that failure detection
2782  * is commonly left to the following call of ReadDir or ReadDirExtended;
2783  * see the comments for ReadDir.
2784  *
2785  * Ideally this should be the *only* direct call of opendir() in the backend.
2786  */
2787 DIR *
2788 AllocateDir(const char *dirname)
2789 {
2790  DIR *dir;
2791 
2792  DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2793  numAllocatedDescs, dirname));
2794 
2795  /* Can we allocate another non-virtual FD? */
2796  if (!reserveAllocatedDesc())
2797  ereport(ERROR,
2798  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2799  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2800  maxAllocatedDescs, dirname)));
2801 
2802  /* Close excess kernel FDs. */
2803  ReleaseLruFiles();
2804 
2805 TryAgain:
2806  if ((dir = opendir(dirname)) != NULL)
2807  {
2809 
2810  desc->kind = AllocateDescDir;
2811  desc->desc.dir = dir;
2814  return desc->desc.dir;
2815  }
2816 
2817  if (errno == EMFILE || errno == ENFILE)
2818  {
2819  int save_errno = errno;
2820 
2821  ereport(LOG,
2822  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2823  errmsg("out of file descriptors: %m; release and retry")));
2824  errno = 0;
2825  if (ReleaseLruFile())
2826  goto TryAgain;
2827  errno = save_errno;
2828  }
2829 
2830  return NULL;
2831 }
2832 
2833 /*
2834  * Read a directory opened with AllocateDir, ereport'ing any error.
2835  *
2836  * This is easier to use than raw readdir() since it takes care of some
2837  * otherwise rather tedious and error-prone manipulation of errno. Also,
2838  * if you are happy with a generic error message for AllocateDir failure,
2839  * you can just do
2840  *
2841  * dir = AllocateDir(path);
2842  * while ((dirent = ReadDir(dir, path)) != NULL)
2843  * process dirent;
2844  * FreeDir(dir);
2845  *
2846  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2847  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2848  * use this shortcut.)
2849  *
2850  * The pathname passed to AllocateDir must be passed to this routine too,
2851  * but it is only used for error reporting.
2852  */
2853 struct dirent *
2854 ReadDir(DIR *dir, const char *dirname)
2855 {
2856  return ReadDirExtended(dir, dirname, ERROR);
2857 }
2858 
2859 /*
2860  * Alternate version of ReadDir that allows caller to specify the elevel
2861  * for any error report (whether it's reporting an initial failure of
2862  * AllocateDir or a subsequent directory read failure).
2863  *
2864  * If elevel < ERROR, returns NULL after any error. With the normal coding
2865  * pattern, this will result in falling out of the loop immediately as
2866  * though the directory contained no (more) entries.
2867  */
2868 struct dirent *
2869 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2870 {
2871  struct dirent *dent;
2872 
2873  /* Give a generic message for AllocateDir failure, if caller didn't */
2874  if (dir == NULL)
2875  {
2876  ereport(elevel,
2878  errmsg("could not open directory \"%s\": %m",
2879  dirname)));
2880  return NULL;
2881  }
2882 
2883  errno = 0;
2884  if ((dent = readdir(dir)) != NULL)
2885  return dent;
2886 
2887  if (errno)
2888  ereport(elevel,
2890  errmsg("could not read directory \"%s\": %m",
2891  dirname)));
2892  return NULL;
2893 }
2894 
2895 /*
2896  * Close a directory opened with AllocateDir.
2897  *
2898  * Returns closedir's return value (with errno set if it's not 0).
2899  * Note we do not check the return value --- it is up to the caller
2900  * to handle close errors if wanted.
2901  *
2902  * Does nothing if dir == NULL; we assume that directory open failure was
2903  * already reported if desired.
2904  */
2905 int
2907 {
2908  int i;
2909 
2910  /* Nothing to do if AllocateDir failed */
2911  if (dir == NULL)
2912  return 0;
2913 
2914  DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2915 
2916  /* Remove dir from list of allocated dirs, if it's present */
2917  for (i = numAllocatedDescs; --i >= 0;)
2918  {
2919  AllocateDesc *desc = &allocatedDescs[i];
2920 
2921  if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2922  return FreeDesc(desc);
2923  }
2924 
2925  /* Only get here if someone passes us a dir not in allocatedDescs */
2926  elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2927 
2928  return closedir(dir);
2929 }
2930 
2931 
2932 /*
2933  * Close a pipe stream returned by OpenPipeStream.
2934  */
2935 int
2936 ClosePipeStream(FILE *file)
2937 {
2938  int i;
2939 
2940  DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2941 
2942  /* Remove file from list of allocated files, if it's present */
2943  for (i = numAllocatedDescs; --i >= 0;)
2944  {
2945  AllocateDesc *desc = &allocatedDescs[i];
2946 
2947  if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2948  return FreeDesc(desc);
2949  }
2950 
2951  /* Only get here if someone passes us a file not in allocatedDescs */
2952  elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2953 
2954  return pclose(file);
2955 }
2956 
2957 /*
2958  * closeAllVfds
2959  *
2960  * Force all VFDs into the physically-closed state, so that the fewest
2961  * possible number of kernel file descriptors are in use. There is no
2962  * change in the logical state of the VFDs.
2963  */
2964 void
2966 {
2967  Index i;
2968 
2969  if (SizeVfdCache > 0)
2970  {
2971  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2972  for (i = 1; i < SizeVfdCache; i++)
2973  {
2974  if (!FileIsNotOpen(i))
2975  LruDelete(i);
2976  }
2977  }
2978 }
2979 
2980 
2981 /*
2982  * SetTempTablespaces
2983  *
2984  * Define a list (actually an array) of OIDs of tablespaces to use for
2985  * temporary files. This list will be used until end of transaction,
2986  * unless this function is called again before then. It is caller's
2987  * responsibility that the passed-in array has adequate lifespan (typically
2988  * it'd be allocated in TopTransactionContext).
2989  *
2990  * Some entries of the array may be InvalidOid, indicating that the current
2991  * database's default tablespace should be used.
2992  */
2993 void
2994 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2995 {
2996  Assert(numSpaces >= 0);
2997  tempTableSpaces = tableSpaces;
2998  numTempTableSpaces = numSpaces;
2999 
3000  /*
3001  * Select a random starting point in the list. This is to minimize
3002  * conflicts between backends that are most likely sharing the same list
3003  * of temp tablespaces. Note that if we create multiple temp files in the
3004  * same transaction, we'll advance circularly through the list --- this
3005  * ensures that large temporary sort files are nicely spread across all
3006  * available tablespaces.
3007  */
3008  if (numSpaces > 1)
3010  0, numSpaces - 1);
3011  else
3012  nextTempTableSpace = 0;
3013 }
3014 
3015 /*
3016  * TempTablespacesAreSet
3017  *
3018  * Returns true if SetTempTablespaces has been called in current transaction.
3019  * (This is just so that tablespaces.c doesn't need its own per-transaction
3020  * state.)
3021  */
3022 bool
3024 {
3025  return (numTempTableSpaces >= 0);
3026 }
3027 
3028 /*
3029  * GetTempTablespaces
3030  *
3031  * Populate an array with the OIDs of the tablespaces that should be used for
3032  * temporary files. (Some entries may be InvalidOid, indicating that the
3033  * current database's default tablespace should be used.) At most numSpaces
3034  * entries will be filled.
3035  * Returns the number of OIDs that were copied into the output array.
3036  */
3037 int
3038 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
3039 {
3040  int i;
3041 
3043  for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
3044  tableSpaces[i] = tempTableSpaces[i];
3045 
3046  return i;
3047 }
3048 
3049 /*
3050  * GetNextTempTableSpace
3051  *
3052  * Select the next temp tablespace to use. A result of InvalidOid means
3053  * to use the current database's default tablespace.
3054  */
3055 Oid
3057 {
3058  if (numTempTableSpaces > 0)
3059  {
3060  /* Advance nextTempTableSpace counter with wraparound */
3062  nextTempTableSpace = 0;
3064  }
3065  return InvalidOid;
3066 }
3067 
3068 
3069 /*
3070  * AtEOSubXact_Files
3071  *
3072  * Take care of subtransaction commit/abort. At abort, we close temp files
3073  * that the subtransaction may have opened. At commit, we reassign the
3074  * files that were opened to the parent subtransaction.
3075  */
3076 void
3077 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
3078  SubTransactionId parentSubid)
3079 {
3080  Index i;
3081 
3082  for (i = 0; i < numAllocatedDescs; i++)
3083  {
3084  if (allocatedDescs[i].create_subid == mySubid)
3085  {
3086  if (isCommit)
3087  allocatedDescs[i].create_subid = parentSubid;
3088  else
3089  {
3090  /* have to recheck the item after FreeDesc (ugly) */
3091  FreeDesc(&allocatedDescs[i--]);
3092  }
3093  }
3094  }
3095 }
3096 
3097 /*
3098  * AtEOXact_Files
3099  *
3100  * This routine is called during transaction commit or abort. All still-open
3101  * per-transaction temporary file VFDs are closed, which also causes the
3102  * underlying files to be deleted (although they should've been closed already
3103  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
3104  * closed. We also forget any transaction-local temp tablespace list.
3105  *
3106  * The isCommit flag is used only to decide whether to emit warnings about
3107  * unclosed files.
3108  */
3109 void
3110 AtEOXact_Files(bool isCommit)
3111 {
3112  CleanupTempFiles(isCommit, false);
3113  tempTableSpaces = NULL;
3114  numTempTableSpaces = -1;
3115 }
3116 
3117 /*
3118  * BeforeShmemExit_Files
3119  *
3120  * before_shmem_exit hook to clean up temp files during backend shutdown.
3121  * Here, we want to clean up *all* temp files including interXact ones.
3122  */
3123 static void
3125 {
3126  CleanupTempFiles(false, true);
3127 
3128  /* prevent further temp files from being created */
3129 #ifdef USE_ASSERT_CHECKING
3130  temporary_files_allowed = false;
3131 #endif
3132 }
3133 
3134 /*
3135  * Close temporary files and delete their underlying files.
3136  *
3137  * isCommit: if true, this is normal transaction commit, and we don't
3138  * expect any remaining files; warn if there are some.
3139  *
3140  * isProcExit: if true, this is being called as the backend process is
3141  * exiting. If that's the case, we should remove all temporary files; if
3142  * that's not the case, we are being called for transaction commit/abort
3143  * and should only remove transaction-local temp files. In either case,
3144  * also clean up "allocated" stdio files, dirs and fds.
3145  */
3146 static void
3147 CleanupTempFiles(bool isCommit, bool isProcExit)
3148 {
3149  Index i;
3150 
3151  /*
3152  * Careful here: at proc_exit we need extra cleanup, not just
3153  * xact_temporary files.
3154  */
3155  if (isProcExit || have_xact_temporary_files)
3156  {
3157  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3158  for (i = 1; i < SizeVfdCache; i++)
3159  {
3160  unsigned short fdstate = VfdCache[i].fdstate;
3161 
3162  if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3163  VfdCache[i].fileName != NULL)
3164  {
3165  /*
3166  * If we're in the process of exiting a backend process, close
3167  * all temporary files. Otherwise, only close temporary files
3168  * local to the current transaction. They should be closed by
3169  * the ResourceOwner mechanism already, so this is just a
3170  * debugging cross-check.
3171  */
3172  if (isProcExit)
3173  FileClose(i);
3174  else if (fdstate & FD_CLOSE_AT_EOXACT)
3175  {
3176  elog(WARNING,
3177  "temporary file %s not closed at end-of-transaction",
3178  VfdCache[i].fileName);
3179  FileClose(i);
3180  }
3181  }
3182  }
3183 
3184  have_xact_temporary_files = false;
3185  }
3186 
3187  /* Complain if any allocated files remain open at commit. */
3188  if (isCommit && numAllocatedDescs > 0)
3189  elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3191 
3192  /* Clean up "allocated" stdio files, dirs and fds. */
3193  while (numAllocatedDescs > 0)
3194  FreeDesc(&allocatedDescs[0]);
3195 }
3196 
3197 
3198 /*
3199  * Remove temporary and temporary relation files left over from a prior
3200  * postmaster session
3201  *
3202  * This should be called during postmaster startup. It will forcibly
3203  * remove any leftover files created by OpenTemporaryFile and any leftover
3204  * temporary relation files created by mdcreate.
3205  *
3206  * During post-backend-crash restart cycle, this routine is called when
3207  * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3208  * queries are using temp files could result in useless storage usage that can
3209  * only be reclaimed by a service restart. The argument against enabling it is
3210  * that someone might want to examine the temporary files for debugging
3211  * purposes. This does however mean that OpenTemporaryFile had better allow for
3212  * collision with an existing temp file name.
3213  *
3214  * NOTE: this function and its subroutines generally report syscall failures
3215  * with ereport(LOG) and keep going. Removing temp files is not so critical
3216  * that we should fail to start the database when we can't do it.
3217  */
3218 void
3220 {
3221  char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3222  DIR *spc_dir;
3223  struct dirent *spc_de;
3224 
3225  /*
3226  * First process temp files in pg_default ($PGDATA/base)
3227  */
3228  snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3229  RemovePgTempFilesInDir(temp_path, true, false);
3230  RemovePgTempRelationFiles("base");
3231 
3232  /*
3233  * Cycle through temp directories for all non-default tablespaces.
3234  */
3235  spc_dir = AllocateDir("pg_tblspc");
3236 
3237  while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
3238  {
3239  if (strcmp(spc_de->d_name, ".") == 0 ||
3240  strcmp(spc_de->d_name, "..") == 0)
3241  continue;
3242 
3243  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
3245  RemovePgTempFilesInDir(temp_path, true, false);
3246 
3247  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
3249  RemovePgTempRelationFiles(temp_path);
3250  }
3251 
3252  FreeDir(spc_dir);
3253 
3254  /*
3255  * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3256  * DataDir as well. However, that is *not* cleaned here because doing so
3257  * would create a race condition. It's done separately, earlier in
3258  * postmaster startup.
3259  */
3260 }
3261 
3262 /*
3263  * Process one pgsql_tmp directory for RemovePgTempFiles.
3264  *
3265  * If missing_ok is true, it's all right for the named directory to not exist.
3266  * Any other problem results in a LOG message. (missing_ok should be true at
3267  * the top level, since pgsql_tmp directories are not created until needed.)
3268  *
3269  * At the top level, this should be called with unlink_all = false, so that
3270  * only files matching the temporary name prefix will be unlinked. When
3271  * recursing it will be called with unlink_all = true to unlink everything
3272  * under a top-level temporary directory.
3273  *
3274  * (These two flags could be replaced by one, but it seems clearer to keep
3275  * them separate.)
3276  */
3277 void
3278 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3279 {
3280  DIR *temp_dir;
3281  struct dirent *temp_de;
3282  char rm_path[MAXPGPATH * 2];
3283 
3284  temp_dir = AllocateDir(tmpdirname);
3285 
3286  if (temp_dir == NULL && errno == ENOENT && missing_ok)
3287  return;
3288 
3289  while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3290  {
3291  if (strcmp(temp_de->d_name, ".") == 0 ||
3292  strcmp(temp_de->d_name, "..") == 0)
3293  continue;
3294 
3295  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3296  tmpdirname, temp_de->d_name);
3297 
3298  if (unlink_all ||
3299  strncmp(temp_de->d_name,
3301  strlen(PG_TEMP_FILE_PREFIX)) == 0)
3302  {
3303  PGFileType type = get_dirent_type(rm_path, temp_de, false, LOG);
3304 
3305  if (type == PGFILETYPE_ERROR)
3306  continue;
3307  else if (type == PGFILETYPE_DIR)
3308  {
3309  /* recursively remove contents, then directory itself */
3310  RemovePgTempFilesInDir(rm_path, false, true);
3311 
3312  if (rmdir(rm_path) < 0)
3313  ereport(LOG,
3315  errmsg("could not remove directory \"%s\": %m",
3316  rm_path)));
3317  }
3318  else
3319  {
3320  if (unlink(rm_path) < 0)
3321  ereport(LOG,
3323  errmsg("could not remove file \"%s\": %m",
3324  rm_path)));
3325  }
3326  }
3327  else
3328  ereport(LOG,
3329  (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3330  rm_path)));
3331  }
3332 
3333  FreeDir(temp_dir);
3334 }
3335 
3336 /* Process one tablespace directory, look for per-DB subdirectories */
3337 static void
3338 RemovePgTempRelationFiles(const char *tsdirname)
3339 {
3340  DIR *ts_dir;
3341  struct dirent *de;
3342  char dbspace_path[MAXPGPATH * 2];
3343 
3344  ts_dir = AllocateDir(tsdirname);
3345 
3346  while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3347  {
3348  /*
3349  * We're only interested in the per-database directories, which have
3350  * numeric names. Note that this code will also (properly) ignore "."
3351  * and "..".
3352  */
3353  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3354  continue;
3355 
3356  snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3357  tsdirname, de->d_name);
3358  RemovePgTempRelationFilesInDbspace(dbspace_path);
3359  }
3360 
3361  FreeDir(ts_dir);
3362 }
3363 
3364 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3365 static void
3366 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3367 {
3368  DIR *dbspace_dir;
3369  struct dirent *de;
3370  char rm_path[MAXPGPATH * 2];
3371 
3372  dbspace_dir = AllocateDir(dbspacedirname);
3373 
3374  while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3375  {
3376  if (!looks_like_temp_rel_name(de->d_name))
3377  continue;
3378 
3379  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3380  dbspacedirname, de->d_name);
3381 
3382  if (unlink(rm_path) < 0)
3383  ereport(LOG,
3385  errmsg("could not remove file \"%s\": %m",
3386  rm_path)));
3387  }
3388 
3389  FreeDir(dbspace_dir);
3390 }
3391 
/*
 * Does "name" look like the file name of a temporary relation?
 *
 * Accepted forms are t<digits>_<digits>, optionally followed by _<forkname>,
 * optionally followed by .<digits> (a segment number), and nothing else.
 *
 * Returns true if the whole string matches, false otherwise.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	int			pos;
	int			savepos;

	/* Must start with "t". */
	if (name[0] != 't')
		return false;

	/* Followed by a non-empty string of digits and then an underscore. */
	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (pos == 1 || name[pos] != '_')
		return false;

	/* Followed by another nonempty string of digits. */
	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (savepos == pos)
		return false;

	/* We might have _forkname or .segment or both. */
	if (name[pos] == '_')
	{
		int			forkchar = forkname_chars(&name[pos + 1], NULL);

		/* No recognizable fork name after the underscore: not a match. */
		if (forkchar <= 0)
			return false;
		pos += forkchar + 1;
	}
	if (name[pos] == '.')
	{
		int			segchar;

		/* segchar counts the '.' itself plus the digits that follow it. */
		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
			;
		if (segchar <= 1)
			return false;
		pos += segchar;
	}

	/* Now we should be at the end. */
	if (name[pos] != '\0')
		return false;
	return true;
}
3440 
#ifdef HAVE_SYNCFS
/*
 * Call syncfs() on the file system containing "path".
 *
 * Progress is reported through ereport_startup_progress.  Failure to open
 * the path or to sync is reported at LOG and is not treated as fatal.
 */
static void
do_syncfs(const char *path)
{
	int			fd;

	ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
							 path);

	fd = OpenTransientFile(path, O_RDONLY);
	if (fd < 0)
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
		return;
	}
	if (syncfs(fd) < 0)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not synchronize file system for file \"%s\": %m", path)));

	/* Release the descriptor whether or not syncfs() succeeded. */
	CloseTransientFile(fd);
}
#endif
3465 
3466 /*
3467  * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3468  * all potential filesystem, depending on recovery_init_sync_method setting.
3469  *
3470  * We fsync regular files and directories wherever they are, but we
3471  * follow symlinks only for pg_wal and immediately under pg_tblspc.
3472  * Other symlinks are presumed to point at files we're not responsible
3473  * for fsyncing, and might not have privileges to write at all.
3474  *
3475  * Errors are logged but not considered fatal; that's because this is used
3476  * only during database startup, to deal with the possibility that there are
3477  * issued-but-unsynced writes pending against the data directory. We want to
3478  * ensure that such writes reach disk before anything that's done in the new
3479  * run. However, aborting on error would result in failure to start for
3480  * harmless cases such as read-only files in the data directory, and that's
3481  * not good either.
3482  *
3483  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3484  * rewriting all changes again during recovery.
3485  *
3486  * Note we assume we're chdir'd into PGDATA to begin with.
3487  */
3488 void
3490 {
3491  bool xlog_is_symlink;
3492 
3493  /* We can skip this whole thing if fsync is disabled. */
3494  if (!enableFsync)
3495  return;
3496 
3497  /*
3498  * If pg_wal is a symlink, we'll need to recurse into it separately,
3499  * because the first walkdir below will ignore it.
3500  */
3501  xlog_is_symlink = false;
3502 
3503  {
3504  struct stat st;
3505 
3506  if (lstat("pg_wal", &st) < 0)
3507  ereport(LOG,
3509  errmsg("could not stat file \"%s\": %m",
3510  "pg_wal")));
3511  else if (S_ISLNK(st.st_mode))
3512  xlog_is_symlink = true;
3513  }
3514 
3515 #ifdef HAVE_SYNCFS
3517  {
3518  DIR *dir;
3519  struct dirent *de;
3520 
3521  /*
3522  * On Linux, we don't have to open every single file one by one. We
3523  * can use syncfs() to sync whole filesystems. We only expect
3524  * filesystem boundaries to exist where we tolerate symlinks, namely
3525  * pg_wal and the tablespaces, so we call syncfs() for each of those
3526  * directories.
3527  */
3528 
3529  /* Prepare to report progress syncing the data directory via syncfs. */
3531 
3532  /* Sync the top level pgdata directory. */
3533  do_syncfs(".");
3534  /* If any tablespaces are configured, sync each of those. */
3535  dir = AllocateDir("pg_tblspc");
3536  while ((de = ReadDirExtended(dir, "pg_tblspc", LOG)))
3537  {
3538  char path[MAXPGPATH];
3539 
3540  if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3541  continue;
3542 
3543  snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
3544  do_syncfs(path);
3545  }
3546  FreeDir(dir);
3547  /* If pg_wal is a symlink, process that too. */
3548  if (xlog_is_symlink)
3549  do_syncfs("pg_wal");
3550  return;
3551  }
3552 #endif /* !HAVE_SYNCFS */
3553 
3554 #ifdef PG_FLUSH_DATA_WORKS
3555  /* Prepare to report progress of the pre-fsync phase. */
3557 
3558  /*
3559  * If possible, hint to the kernel that we're soon going to fsync the data
3560  * directory and its contents. Errors in this step are even less
3561  * interesting than normal, so log them only at DEBUG1.
3562  */
3563  walkdir(".", pre_sync_fname, false, DEBUG1);
3564  if (xlog_is_symlink)
3565  walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3566  walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3567 #endif
3568 
3569  /* Prepare to report progress syncing the data directory via fsync. */
3571 
3572  /*
3573  * Now we do the fsync()s in the same order.
3574  *
3575  * The main call ignores symlinks, so in addition to specially processing
3576  * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3577  * process_symlinks = true. Note that if there are any plain directories
3578  * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3579  * so we don't worry about optimizing it.
3580  */
3581  walkdir(".", datadir_fsync_fname, false, LOG);
3582  if (xlog_is_symlink)
3583  walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3584  walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3585 }
3586 
3587 /*
3588  * walkdir: recursively walk a directory, applying the action to each
3589  * regular file and directory (including the named directory itself).
3590  *
3591  * If process_symlinks is true, the action and recursion are also applied
3592  * to regular files and directories that are pointed to by symlinks in the
3593  * given directory; otherwise symlinks are ignored. Symlinks are always
3594  * ignored in subdirectories, ie we intentionally don't pass down the
3595  * process_symlinks flag to recursive calls.
3596  *
3597  * Errors are reported at level elevel, which might be ERROR or less.
3598  *
3599  * See also walkdir in file_utils.c, which is a frontend version of this
3600  * logic.
3601  */
3602 static void
3603 walkdir(const char *path,
3604  void (*action) (const char *fname, bool isdir, int elevel),
3605  bool process_symlinks,
3606  int elevel)
3607 {
3608  DIR *dir;
3609  struct dirent *de;
3610 
3611  dir = AllocateDir(path);
3612 
3613  while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3614  {
3615  char subpath[MAXPGPATH * 2];
3616 
3618 
3619  if (strcmp(de->d_name, ".") == 0 ||
3620  strcmp(de->d_name, "..") == 0)
3621  continue;
3622 
3623  snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3624 
3625  switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3626  {
3627  case PGFILETYPE_REG:
3628  (*action) (subpath, false, elevel);
3629  break;
3630  case PGFILETYPE_DIR:
3631  walkdir(subpath, action, false, elevel);
3632  break;
3633  default:
3634 
3635  /*
3636  * Errors are already reported directly by get_dirent_type(),
3637  * and any remaining symlinks and unknown file types are
3638  * ignored.
3639  */
3640  break;
3641  }
3642  }
3643 
3644  FreeDir(dir); /* we ignore any error here */
3645 
3646  /*
3647  * It's important to fsync the destination directory itself as individual
3648  * file fsyncs don't guarantee that the directory entry for the file is
3649  * synced. However, skip this if AllocateDir failed; the action function
3650  * might not be robust against that.
3651  */
3652  if (dir)
3653  (*action) (path, true, elevel);
3654 }
3655 
3656 
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Directories are skipped.  Errors trying to open unreadable files (EACCES)
 * are ignored; other open or close errors are logged at the caller-specified
 * level "elevel".
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* An unreadable file is not worth a message. */
		if (errno == EACCES)
			return;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3702 
/*
 * walkdir action used while syncing the data directory: report progress for
 * "fname", then fsync it via fsync_fname_ext().
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/* Errors about unreadable files are to be silently ignored. */
	const bool	ignore_perm = true;

	ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	/* The result is deliberately unused; failures are logged by the callee. */
	(void) fsync_fname_ext(fname, isdir, ignore_perm, elevel);
}
3715 
/*
 * Remove "fname", tolerating the case where it is already gone.
 *
 * For a directory, rmdir() is used and any failure other than ENOENT is
 * reported at "elevel".  For anything else the removal is delegated to
 * PathNameDeleteTemporaryFile() with error_on_failure = false.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}
3732 
3733 /*
3734  * fsync_fname_ext -- Try to fsync a file or directory
3735  *
3736  * If ignore_perm is true, ignore errors upon trying to open unreadable
3737  * files. Logs other errors at a caller-specified level.
3738  *
3739  * Returns 0 if the operation succeeded, -1 otherwise.
3740  */
3741 int
3742 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3743 {
3744  int fd;
3745  int flags;
3746  int returncode;
3747 
3748  /*
3749  * Some OSs require directories to be opened read-only whereas other
3750  * systems don't allow us to fsync files opened read-only; so we need both
3751  * cases here. Using O_RDWR will cause us to fail to fsync files that are
3752  * not writable by our userid, but we assume that's OK.
3753  */
3754  flags = PG_BINARY;
3755  if (!isdir)
3756  flags |= O_RDWR;
3757  else
3758  flags |= O_RDONLY;
3759 
3760  fd = OpenTransientFile(fname, flags);
3761 
3762  /*
3763  * Some OSs don't allow us to open directories at all (Windows returns
3764  * EACCES), just ignore the error in that case. If desired also silently
3765  * ignoring errors about unreadable files. Log others.
3766  */
3767  if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3768  return 0;
3769  else if (fd < 0 && ignore_perm && errno == EACCES)
3770  return 0;
3771  else if (fd < 0)
3772  {
3773  ereport(elevel,
3775  errmsg("could not open file \"%s\": %m", fname)));
3776  return -1;
3777  }
3778 
3779  returncode = pg_fsync(fd);
3780 
3781  /*
3782  * Some OSes don't allow us to fsync directories at all, so we can ignore
3783  * those errors. Anything else needs to be logged.
3784  */
3785  if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3786  {
3787  int save_errno;
3788 
3789  /* close file upon error, might not be in transaction context */
3790  save_errno = errno;
3791  (void) CloseTransientFile(fd);
3792  errno = save_errno;
3793 
3794  ereport(elevel,
3796  errmsg("could not fsync file \"%s\": %m", fname)));
3797  return -1;
3798  }
3799 
3800  if (CloseTransientFile(fd) != 0)
3801  {
3802  ereport(elevel,
3804  errmsg("could not close file \"%s\": %m", fname)));
3805  return -1;
3806  }
3807 
3808  return 0;
3809 }
3810 
3811 /*
3812  * fsync_parent_path -- fsync the parent path of a file or directory
3813  *
3814  * This is aimed at making file operations persistent on disk in case of
3815  * an OS crash or power failure.
3816  */
3817 static int
3818 fsync_parent_path(const char *fname, int elevel)
3819 {
3820  char parentpath[MAXPGPATH];
3821 
3822  strlcpy(parentpath, fname, MAXPGPATH);
3823  get_parent_directory(parentpath);
3824 
3825  /*
3826  * get_parent_directory() returns an empty string if the input argument is
3827  * just a file name (see comments in path.c), so handle that as being the
3828  * current directory.
3829  */
3830  if (strlen(parentpath) == 0)
3831  strlcpy(parentpath, ".", MAXPGPATH);
3832 
3833  if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3834  return -1;
3835 
3836  return 0;
3837 }
3838 
3839 /*
3840  * Create a PostgreSQL data sub-directory
3841  *
3842  * The data directory itself, and most of its sub-directories, are created at
3843  * initdb time, but we do have some occasions when we create directories in
3844  * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3845  * make sure that those directories are created consistently. Today, that means
3846  * making sure that the created directory has the correct permissions, which is
3847  * what pg_dir_create_mode tracks for us.
3848  *
3849  * Note that we also set the umask() based on what we understand the correct
3850  * permissions to be (see file_perm.c).
3851  *
3852  * For permissions other than the default, mkdir() can be used directly, but
3853  * be sure to consider carefully such cases -- a sub-directory with incorrect
3854  * permissions in a PostgreSQL data directory could cause backups and other
3855  * processes to fail.
3856  */
3857 int
3858 MakePGDirectory(const char *directoryName)
3859 {
3860  return mkdir(directoryName, pg_dir_create_mode);
3861 }
3862 
3863 /*
3864  * Return the passed-in error level, or PANIC if data_sync_retry is off.
3865  *
3866  * Failure to fsync any data file is cause for immediate panic, unless
3867  * data_sync_retry is enabled. Data may have been written to the operating
3868  * system and removed from our buffer pool already, and if we are running on
3869  * an operating system that forgets dirty data on write-back failure, there
3870  * may be only one copy of the data remaining: in the WAL. A later attempt to
3871  * fsync again might falsely report success. Therefore we must not allow any
3872  * further checkpoints to be attempted. data_sync_retry can in theory be
3873  * enabled on systems known not to drop dirty buffered data on write-back
3874  * failure (with the likely outcome that checkpoints will continue to fail
3875  * until the underlying problem is fixed).
3876  *
3877  * Any code that reports a failure from fsync() or related functions should
3878  * filter the error level with this function.
3879  */
3880 int
3881 data_sync_elevel(int elevel)
3882 {
3883  return data_sync_retry ? elevel : PANIC;
3884 }
3885 
3886 bool
3888 {
3889  bool result = true;
3890  int flags;
3891 
3892 #if PG_O_DIRECT == 0
3893  if (strcmp(*newval, "") != 0)
3894  {
3895  GUC_check_errdetail("debug_io_direct is not supported on this platform.");
3896  result = false;
3897  }
3898  flags = 0;
3899 #else
3900  List *elemlist;
3901  ListCell *l;
3902  char *rawstring;
3903 
3904  /* Need a modifiable copy of string */
3905  rawstring = pstrdup(*newval);
3906 
3907  if (!SplitGUCList(rawstring, ',', &elemlist))
3908  {
3909  GUC_check_errdetail("invalid list syntax in parameter \"%s\"",
3910  "debug_io_direct");
3911  pfree(rawstring);
3912  list_free(elemlist);
3913  return false;
3914  }
3915 
3916  flags = 0;
3917  foreach(l, elemlist)
3918  {
3919  char *item = (char *) lfirst(l);
3920 
3921  if (pg_strcasecmp(item, "data") == 0)
3922  flags |= IO_DIRECT_DATA;
3923  else if (pg_strcasecmp(item, "wal") == 0)
3924  flags |= IO_DIRECT_WAL;
3925  else if (pg_strcasecmp(item, "wal_init") == 0)
3926  flags |= IO_DIRECT_WAL_INIT;
3927  else
3928  {
3929  GUC_check_errdetail("invalid option \"%s\"", item);
3930  result = false;
3931  break;
3932  }
3933  }
3934 
3935  /*
3936  * It's possible to configure block sizes smaller than our assumed I/O
3937  * alignment size, which could result in invalid I/O requests.
3938  */
3939 #if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
3940  if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
3941  {
3942  GUC_check_errdetail("debug_io_direct is not supported for WAL because XLOG_BLCKSZ is too small");
3943  result = false;
3944  }
3945 #endif
3946 #if BLCKSZ < PG_IO_ALIGN_SIZE
3947  if (result && (flags & IO_DIRECT_DATA))
3948  {
3949  GUC_check_errdetail("debug_io_direct is not supported for data because BLCKSZ is too small");
3950  result = false;
3951  }
3952 #endif
3953 
3954  pfree(rawstring);
3955  list_free(elemlist);
3956 #endif
3957 
3958  if (!result)
3959  return result;
3960 
3961  /* Save the flags in *extra, for use by assign_debug_io_direct */
3962  *extra = guc_malloc(ERROR, sizeof(int));
3963  *((int *) *extra) = flags;
3964 
3965  return result;
3966 }
3967 
3968 extern void
3969 assign_debug_io_direct(const char *newval, void *extra)
3970 {
3971  int *flags = (int *) extra;
3972 
3973  io_direct_flags = *flags;
3974 }
void begin_startup_progress_phase(void)
Definition: startup.c:352
unsigned int uint32
Definition: c.h:495
#define Min(x, y)
Definition: c.h:993
uint32 SubTransactionId
Definition: c.h:645
#define INT64_FORMAT
Definition: c.h:537
#define PG_BINARY
Definition: c.h:1283
unsigned int Index
Definition: c.h:603
#define MemSet(start, val, len)
Definition: c.h:1009
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:927
int fdatasync(int fildes)
#define OidIsValid(objectId)
Definition: c.h:764
size_t Size
Definition: c.h:594
int closedir(DIR *)
Definition: dirent.c:127
struct dirent * readdir(DIR *)
Definition: dirent.c:78
DIR * opendir(const char *)
Definition: dirent.c:33
int errcode_for_file_access(void)
Definition: elog.c:881
int errdetail(const char *fmt,...)
Definition: elog.c:1202
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define LOG
Definition: elog.h:31
#define FATAL
Definition: elog.h:41
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2854
static int pg_ftruncate(int fd, off_t length)
Definition: fd.c:655
int max_files_per_process
Definition: fd.c:146
void pg_flush_data(int fd, off_t offset, off_t nbytes)
Definition: fd.c:477
int FileGetRawDesc(File file)
Definition: fd.c:2422
int MakePGDirectory(const char *directoryName)
Definition: fd.c:3858
int FreeDir(DIR *dir)
Definition: fd.c:2906
int recovery_init_sync_method
Definition: fd.c:165
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:2062
int pg_fsync_no_writethrough(int fd)
Definition: fd.c:416
#define FD_MINFREE
Definition: fd.c:138
static int numTempTableSpaces
Definition: fd.c:289
static bool ReleaseLruFile(void)
Definition: fd.c:1334
int io_direct_flags
Definition: fd.c:168
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2528
#define FD_DELETE_AT_CLOSE
Definition: fd.c:192
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1061
static int maxAllocatedDescs
Definition: fd.c:268
static void Delete(File file)
Definition: fd.c:1220
static int FreeDesc(AllocateDesc *desc)
Definition: fd.c:2687
static long tempFileCounter
Definition: fd.c:280
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:734
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:3038
static int numAllocatedDescs
Definition: fd.c:267
File PathNameOpenTemporaryFile(const char *path, int mode)
Definition: fd.c:1853
static void LruDelete(File file)
Definition: fd.c:1239
int pg_fdatasync(int fd)
Definition: fd.c:455
#define FileIsValid(file)
Definition: fd.c:186
void assign_debug_io_direct(const char *newval, void *extra)
Definition: fd.c:3969
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2242
static int nfile
Definition: fd.c:222
int CloseTransientFile(int fd)
Definition: fd.c:2754
#define DO_DB(A)
Definition: fd.c:180
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1039
void closeAllVfds(void)
Definition: fd.c:2965
int max_safe_fds
Definition: fd.c:159
static File AllocateVfd(void)
Definition: fd.c:1366
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1813
void PathNameDeleteTemporaryDir(const char *dirname)
Definition: fd.c:1643
int ClosePipeStream(FILE *file)
Definition: fd.c:2936
void AtEOXact_Files(bool isCommit)
Definition: fd.c:3110
int FileGetRawFlags(File file)
Definition: fd.c:2432
static Size SizeVfdCache
Definition: fd.c:217
static int nextTempTableSpace
Definition: fd.c:290
#define FD_CLOSE_AT_EOXACT
Definition: fd.c:193
int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition: fd.c:3742
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3717
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition: fd.c:3338
int FreeFile(FILE *file)
Definition: fd.c:2726
mode_t FileGetRawMode(File file)
Definition: fd.c:2442
static AllocateDesc * allocatedDescs
Definition: fd.c:269
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition: fd.c:916
static int FileAccess(File file)
Definition: fd.c:1444
static void FreeVfd(File file)
Definition: fd.c:1424
struct vfd Vfd
int pg_fsync_writethrough(int fd)
Definition: fd.c:436
void FileClose(File file)
Definition: fd.c:1930
FILE * OpenPipeStream(const char *command, const char *mode)
Definition: fd.c:2631
void ReleaseExternalFD(void)
Definition: fd.c:1191
#define FD_TEMP_FILE_LIMIT
Definition: fd.c:194
void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition: fd.c:3278
void RemovePgTempFiles(void)
Definition: fd.c:3219
#define FileIsNotOpen(file)
Definition: fd.c:189
bool TempTablespacesAreSet(void)
Definition: fd.c:3023
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:708
int FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2314
int FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2030
int data_sync_elevel(int elevel)
Definition: fd.c:3881
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1527
static void Insert(File file)
Definition: fd.c:1265
AllocateDescKind
Definition: fd.c:248
@ AllocateDescDir
Definition: fd.c:251
@ AllocateDescPipe
Definition: fd.c:250
@ AllocateDescFile
Definition: fd.c:249
@ AllocateDescRawFD
Definition: fd.c:252
int FileWrite(File file, const void *buffer, size_t amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:2144
Oid GetNextTempTableSpace(void)
Definition: fd.c:3056
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1540
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3704
static void ReportTemporaryFileUsage(const char *path, off_t size)
Definition: fd.c:1480
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition: fd.c:1756
bool AcquireExternalFD(void)
Definition: fd.c:1138
static void RegisterTemporaryFile(File file)
Definition: fd.c:1499
int FileRead(File file, void *buffer, size_t amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:2088
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2869
#define NUM_RESERVED_FDS
Definition: fd.c:129
static Oid * tempTableSpaces
Definition: fd.c:288
static bool reserveAllocatedDesc(void)
Definition: fd.c:2453
void InitFileAccess(void)
Definition: fd.c:855
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition: fd.c:3366
File OpenTemporaryFile(bool interXact)
Definition: fd.c:1676
int durable_unlink(const char *fname, int elevel)
Definition: fd.c:824
static uint64 temporary_files_size
Definition: fd.c:236
void ReserveExternalFD(void)
Definition: fd.c:1173
char * FilePathName(File file)
Definition: fd.c:2406
bool looks_like_temp_rel_name(const char *name)
Definition: fd.c:3394
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1884
void set_max_safe_fds(void)
Definition: fd.c:996
int pg_fsync(int fd)
Definition: fd.c:361
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition: fd.c:3147
#define VFD_CLOSED
Definition: fd.c:184
static bool have_xact_temporary_files
Definition: fd.c:228
static int LruInsert(File file)
Definition: fd.c:1287
static int numExternalFDs
Definition: fd.c:274
static int fsync_parent_path(const char *fname, int elevel)
Definition: fd.c:3818
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition: fd.c:1612
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition: fd.c:3077
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2578
void InitTemporaryFileAccess(void)
Definition: fd.c:885
static Vfd * VfdCache
Definition: fd.c:216
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:2587
bool data_sync_retry
Definition: fd.c:162
static void ReleaseLruFiles(void)
Definition: fd.c:1356
void SyncDataDirectory(void)
Definition: fd.c:3489
int FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2269
off_t FileSize(File file)
Definition: fd.c:2354
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2371
bool check_debug_io_direct(char **newval, void **extra, GucSource source)
Definition: fd.c:3887
static void BeforeShmemExit_Files(int code, Datum arg)
Definition: fd.c:3124
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition: fd.c:3603
int pg_truncate(const char *path, off_t length)
Definition: fd.c:672
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2994
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2788
void TempTablespacePath(char *path, Oid tablespace)
Definition: fd.c:1731
#define IO_DIRECT_WAL
Definition: fd.h:53
#define IO_DIRECT_DATA
Definition: fd.h:52
#define IO_DIRECT_WAL_INIT
Definition: fd.h:54
int File
Definition: fd.h:49
#define PG_O_DIRECT
Definition: fd.h:95
int pg_file_create_mode
Definition: file_perm.c:19
int pg_dir_create_mode
Definition: file_perm.c:18
ssize_t pg_pwrite_zeros(int fd, size_t size, off_t offset)
Definition: file_utils.c:660
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:525
#define PG_TEMP_FILES_DIR
Definition: file_utils.h:57
#define PG_TEMP_FILE_PREFIX
Definition: file_utils.h:58
PGFileType
Definition: file_utils.h:19
@ PGFILETYPE_DIR
Definition: file_utils.h:23
@ PGFILETYPE_REG
Definition: file_utils.h:22
@ PGFILETYPE_ERROR
Definition: file_utils.h:20
@ DATA_DIR_SYNC_METHOD_SYNCFS
Definition: file_utils.h:30
@ DATA_DIR_SYNC_METHOD_FSYNC
Definition: file_utils.h:29
int MyProcPid
Definition: globals.c:44
bool enableFsync
Definition: globals.c:123
Oid MyDatabaseTableSpace
Definition: globals.c:91
void * guc_malloc(int elevel, size_t size)
Definition: guc.c:631
#define newval
#define GUC_check_errdetail
Definition: guc.h:436
GucSource
Definition: guc.h:108
int temp_file_limit
Definition: guc_tables.c:531
int log_temp_files
Definition: guc_tables.c:525
#define realloc(a, b)
Definition: header.h:60
#define free(a)
Definition: header.h:65
#define malloc(a)
Definition: header.h:50
#define close(a)
Definition: win32.h:12
void before_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:333
int j
Definition: isn.c:74
int i
Definition: isn.c:73
static void const char fflush(stdout)
Assert(fmt[strlen(fmt) - 1] !='\n')
void list_free(List *list)
Definition: list.c:1545
Datum subpath(PG_FUNCTION_ARGS)
Definition: ltree_op.c:241
char * pstrdup(const char *in)
Definition: mcxt.c:1644
void pfree(void *pointer)
Definition: mcxt.c:1456
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1476
void * palloc(Size size)
Definition: mcxt.c:1226
#define MAP_FAILED
Definition: mem.h:45
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:121
void * arg
static char * basedir
static PgChecksumMode mode
Definition: pg_checksums.c:56
#define MAXPGPATH
#define lfirst(lc)
Definition: pg_list.h:172
uint64 pg_prng_uint64_range(pg_prng_state *state, uint64 rmin, uint64 rmax)
Definition: pg_prng.c:144
pg_prng_state pg_global_prng_state
Definition: pg_prng.c:34
static rewind_source * source
Definition: pg_rewind.c:89
static char * buf
Definition: pg_test_fsync.c:67
char * tablespace
Definition: pgbench.c:216
void pgstat_report_tempfile(size_t filesize)
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
#define pg_pwrite
Definition: port.h:226
#define pg_pread
Definition: port.h:225
void get_parent_directory(char *path)
Definition: path.c:977
pqsigfunc pqsignal(int signo, pqsigfunc func)
#define snprintf
Definition: port.h:238
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
uintptr_t Datum
Definition: postgres.h:64
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
static int fd(const char *x, int i)
Definition: preproc-init.c:105
int forkname_chars(const char *str, ForkNumber *fork)
Definition: relpath.c:81
#define TABLESPACE_VERSION_DIRECTORY
Definition: relpath.h:33
void ResourceOwnerEnlargeFiles(ResourceOwner owner)
Definition: resowner.c:1350
void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition: resowner.c:1370
ResourceOwner CurrentResourceOwner
Definition: resowner.c:147
void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition: resowner.c:1361
void pg_usleep(long microsec)
Definition: signal.c:53
static void error(void)
Definition: sql-dyntest.c:147
#define ereport_startup_progress(msg,...)
Definition: startup.h:18
SubTransactionId create_subid
Definition: fd.c:258
DIR * dir
Definition: fd.c:262
FILE * file
Definition: fd.c:261
int fd
Definition: fd.c:263
union AllocateDesc::@19 desc
AllocateDescKind kind
Definition: fd.c:257
Definition: dirent.c:26
Definition: pg_list.h:54
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
__int64 st_size
Definition: win32_port.h:273
unsigned short st_mode
Definition: win32_port.h:268
Definition: fd.c:197
int fd
Definition: fd.c:198
int fileFlags
Definition: fd.c:207
File lruLessRecently
Definition: fd.c:203
File lruMoreRecently
Definition: fd.c:202
char * fileName
Definition: fd.c:205
ResourceOwner resowner
Definition: fd.c:200
unsigned short fdstate
Definition: fd.c:199
File nextFree
Definition: fd.c:201
mode_t fileMode
Definition: fd.c:208
off_t fileSize
Definition: fd.c:204
bool SplitGUCList(char *rawstring, char separator, List **namelist)
Definition: varlena.c:3702
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:88
static void pgstat_report_wait_end(void)
Definition: wait_event.h:104
const char * type
const char * name
#define fsync(fd)
Definition: win32_port.h:85
#define stat
Definition: win32_port.h:284
#define SIG_DFL
Definition: win32_port.h:163
#define EINTR
Definition: win32_port.h:374
#define EOPNOTSUPP
Definition: win32_port.h:398
#define SIGPIPE
Definition: win32_port.h:173
#define lstat(path, sb)
Definition: win32_port.h:285
#define S_ISDIR(m)
Definition: win32_port.h:325
void _dosmaperr(unsigned long)
Definition: win32error.c:177
#define S_ISLNK(m)
Definition: win32_port.h:344
#define mkdir(a, b)
Definition: win32_port.h:80
#define fstat
Definition: win32_port.h:283
#define ftruncate(a, b)
Definition: win32_port.h:82
#define SIG_IGN
Definition: win32_port.h:165
#define O_CLOEXEC
Definition: win32_port.h:359
#define O_DSYNC
Definition: win32_port.h:352
SubTransactionId GetCurrentSubTransactionId(void)
Definition: xact.c:780
int sync_method
Definition: xlog.c:133
#define SYNC_METHOD_FSYNC_WRITETHROUGH
Definition: xlog.h:25
static const char * directory
Definition: zic.c:634