PostgreSQL Source Code  git master
fd.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  * Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have. (This is around 1024 on many modern
20  * operating systems, but may be lower on others.)
21  *
22  * VFDs are managed as an LRU pool, with actual OS file descriptors
23  * being opened and closed as needed. Obviously, if a routine is
24  * opened using these interfaces, all subsequent operations must also
25  * be through these interfaces (the File type is not a real file
26  * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
39  * for a long time, like relation files. It is the caller's responsibility
40  * to close them; there is no automatic mechanism in fd.c for that.
41  *
42  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43  * temporary files that have names so that they can be shared between
44  * backends. Such files are automatically closed and count against the
45  * temporary file limit of the backend that creates them, but unlike anonymous
46  * files they are not automatically deleted. See sharedfileset.c for a shared
47  * ownership mechanism that provides automatic cleanup for shared files when
48  * the last of a group of backends detaches.
49  *
50  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52  * They behave like the corresponding native functions, except that the handle
53  * is registered with the current subtransaction, and will be automatically
54  * closed at abort. These are intended mainly for short operations like
55  * reading a configuration file; there is a limit on the number of files that
56  * can be opened using these functions at any one time.
57  *
58  * Finally, BasicOpenFile is just a thin wrapper around open() that can
59  * release file descriptors in use by the virtual file descriptors if
60  * necessary. There is no automatic cleanup of file descriptors returned by
61  * BasicOpenFile; it is solely the caller's responsibility to close the file
62  * descriptor by calling close(2).
63  *
64  * If a non-virtual file descriptor needs to be held open for any length of
65  * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66  * (and eventually ReleaseExternalFD), so that we can take it into account
67  * while deciding how many VFDs can be open. This applies to FDs obtained
68  * with BasicOpenFile as well as those obtained without use of any fd.c API.
69  *
70  *-------------------------------------------------------------------------
71  */
72 
73 #include "postgres.h"
74 
75 #include <dirent.h>
76 #include <sys/file.h>
77 #include <sys/param.h>
78 #include <sys/resource.h> /* for getrlimit */
79 #include <sys/stat.h>
80 #include <sys/types.h>
81 #ifndef WIN32
82 #include <sys/mman.h>
83 #endif
84 #include <limits.h>
85 #include <unistd.h>
86 #include <fcntl.h>
87 
88 #include "access/xact.h"
89 #include "access/xlog.h"
90 #include "catalog/pg_tablespace.h"
91 #include "common/file_perm.h"
92 #include "common/file_utils.h"
93 #include "common/pg_prng.h"
94 #include "miscadmin.h"
95 #include "pgstat.h"
96 #include "portability/mem.h"
97 #include "postmaster/startup.h"
98 #include "storage/fd.h"
99 #include "storage/ipc.h"
100 #include "utils/guc.h"
101 #include "utils/resowner_private.h"
102 
/*
 * PG_FLUSH_DATA_WORKS is defined (as 1) whenever at least one of the
 * pg_flush_data() implementations is available on this platform:
 * sync_file_range(), msync(MS_ASYNC) on non-Windows systems, or
 * posix_fadvise(POSIX_FADV_DONTNEED).
 */
#if defined(HAVE_SYNC_FILE_RANGE) || \
	(!defined(WIN32) && defined(MS_ASYNC)) || \
	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
#define PG_FLUSH_DATA_WORKS 1
#endif
111 
112 /*
113  * We must leave some file descriptors free for system(), the dynamic loader,
114  * and other code that tries to open files without consulting fd.c. This
115  * is the number left free. (While we try fairly hard to prevent EMFILE
116  * errors, there's never any guarantee that we won't get ENFILE due to
117  * other processes chewing up FDs. So it's a bad idea to try to open files
118  * without consulting fd.c. Nonetheless we cannot control all code.)
119  *
120  * Because this is just a fixed setting, we are effectively assuming that
121  * no such code will leave FDs open over the long term; otherwise the slop
122  * is likely to be insufficient. Note in particular that we expect that
123  * loading a shared library does not result in any permanent increase in
124  * the number of open files. (This appears to be true on most if not
125  * all platforms as of Feb 2004.)
126  */
127 #define NUM_RESERVED_FDS 10
128 
129 /*
130  * If we have fewer than this many usable FDs after allowing for the reserved
131  * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
132  * much less than that. Note that this value ensures numExternalFDs can be
133  * at least 16; as of this writing, the contrib/postgres_fdw regression tests
134  * will not pass unless that can grow to at least 14.)
135  */
136 #define FD_MINFREE 48
137 
138 /*
139  * A number of platforms allow individual processes to open many more files
140  * than they can really support when *many* processes do the same thing.
141  * This GUC parameter lets the DBA limit max_safe_fds to something less than
142  * what the postmaster's initial probe suggests will work.
143  */
145 
146 /*
147  * Maximum number of file descriptors to open for operations that fd.c knows
148  * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
149  * to a conservative value, and remains that way indefinitely in bootstrap or
150  * standalone-backend cases. In normal postmaster operation, the postmaster
151  * calls set_max_safe_fds() late in initialization to update the value, and
152  * that value is then inherited by forked subprocesses.
153  *
154  * Note: the value of max_files_per_process is taken into account while
155  * setting this variable, and so need not be tested separately.
156  */
157 int max_safe_fds = FD_MINFREE; /* default if not changed */
158 
159 /* Whether it is safe to continue running after fsync() fails. */
160 bool data_sync_retry = false;
161 
162 /* How SyncDataDirectory() should do its job. */
164 
165 /* Debugging.... */
166 
/*
 * DO_DB(A) executes the debugging statement A only in FDDEBUG builds, and
 * restores errno afterwards so that the debug output cannot disturb the
 * caller's error state.  In all other builds it expands to a no-op.
 */
#ifndef FDDEBUG
#define DO_DB(A) \
	((void) 0)
#else
#define DO_DB(A) \
	do { \
		int			do_db_errno_backup_ = errno; \
		A; \
		errno = do_db_errno_backup_; \
	} while (0)
#endif
178 
/* Value of Vfd.fd while the VFD has no kernel file descriptor assigned. */
#define VFD_CLOSED (-1)

/*
 * A File is valid when it is a positive index below SizeVfdCache and the
 * corresponding VfdCache entry has a file name.
 */
#define FileIsValid(file) \
	((file) > 0 && \
	 (file) < (int) SizeVfdCache && \
	 VfdCache[file].fileName != NULL)

/* True when the VFD's fd field is VFD_CLOSED. */
#define FileIsNotOpen(file) \
	(VfdCache[file].fd == VFD_CLOSED)

/* bit assignments for the fdstate field of a Vfd: */
#define FD_DELETE_AT_CLOSE	0x0001	/* T = delete when closed */
#define FD_CLOSE_AT_EOXACT	0x0002	/* T = close at eoXact */
#define FD_TEMP_FILE_LIMIT	0x0004	/* T = respect temp_file_limit */
190 
191 typedef struct vfd
192 {
193  int fd; /* current FD, or VFD_CLOSED if none */
194  unsigned short fdstate; /* bitflags for VFD's state */
195  ResourceOwner resowner; /* owner, for automatic cleanup */
196  File nextFree; /* link to next free VFD, if in freelist */
197  File lruMoreRecently; /* doubly linked recency-of-use list */
199  off_t fileSize; /* current size of file (0 if not temporary) */
200  char *fileName; /* name of file, or NULL for unused VFD */
201  /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
202  int fileFlags; /* open(2) flags for (re)opening the file */
203  mode_t fileMode; /* mode to pass to open(2) */
204 } Vfd;
205 
206 /*
207  * Virtual File Descriptor array pointer and size. This grows as
208  * needed. 'File' values are indexes into this array.
209  * Note that VfdCache[0] is not a usable VFD, just a list header.
210  */
211 static Vfd *VfdCache;
212 static Size SizeVfdCache = 0;
213 
214 /*
215  * Number of file descriptors known to be in use by VFD entries.
216  */
217 static int nfile = 0;
218 
219 /*
220  * Flag to tell whether it's worth scanning VfdCache looking for temp files
221  * to close
222  */
223 static bool have_xact_temporary_files = false;
224 
225 /*
226  * Tracks the total size of all temporary files. Note: when temp_file_limit
227  * is being enforced, this cannot overflow since the limit cannot be more
228  * than INT_MAX kilobytes. When not enforcing, it could theoretically
229  * overflow, but we don't care.
230  */
231 static uint64 temporary_files_size = 0;
232 
233 /* Temporary file access initialized and not yet shut down? */
234 #ifdef USE_ASSERT_CHECKING
235 static bool temporary_files_allowed = false;
236 #endif
237 
238 /*
239  * List of OS handles opened with AllocateFile, AllocateDir and
240  * OpenTransientFile.
241  */
242 typedef enum
243 {
249 
250 typedef struct
251 {
254  union
255  {
256  FILE *file;
258  int fd;
259  } desc;
260 } AllocateDesc;
261 
262 static int numAllocatedDescs = 0;
263 static int maxAllocatedDescs = 0;
265 
266 /*
267  * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
268  */
269 static int numExternalFDs = 0;
270 
271 /*
272  * Number of temporary files opened during the current session;
273  * this is used in generation of tempfile names.
274  */
275 static long tempFileCounter = 0;
276 
277 /*
278  * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
279  * indicating that the current database's default tablespace should be used.)
280  * When numTempTableSpaces is -1, this has not been set in the current
281  * transaction.
282  */
283 static Oid *tempTableSpaces = NULL;
284 static int numTempTableSpaces = -1;
285 static int nextTempTableSpace = 0;
286 
287 
288 /*--------------------
289  *
290  * Private Routines
291  *
292  * Delete - delete a file from the Lru ring
293  * LruDelete - remove a file from the Lru ring and close its FD
294  * Insert - put a file at the front of the Lru ring
295  * LruInsert - put a file at the front of the Lru ring and open it
296  * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
297  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
298  * AllocateVfd - grab a free (or new) file record (from VfdCache)
299  * FreeVfd - free a file record
300  *
301  * The Least Recently Used ring is a doubly linked list that begins and
302  * ends on element zero. Element zero is special -- it doesn't represent
303  * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
304  * anchor that shows us the beginning/end of the ring.
305  * Only VFD elements that are currently really open (have an FD assigned) are
306  * in the Lru ring. Elements that are "virtually" open can be recognized
307  * by having a non-null fileName field.
308  *
309  * example:
310  *
311  * /--less----\ /---------\
312  * v \ v \
313  * #0 --more---> LeastRecentlyUsed --more-\ \
314  * ^\ | |
315  * \\less--> MostRecentlyUsedFile <---/ |
316  * \more---/ \--less--/
317  *
318  *--------------------
319  */
320 static void Delete(File file);
321 static void LruDelete(File file);
322 static void Insert(File file);
323 static int LruInsert(File file);
324 static bool ReleaseLruFile(void);
325 static void ReleaseLruFiles(void);
326 static File AllocateVfd(void);
327 static void FreeVfd(File file);
328 
329 static int FileAccess(File file);
330 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
331 static bool reserveAllocatedDesc(void);
332 static int FreeDesc(AllocateDesc *desc);
333 
334 static void BeforeShmemExit_Files(int code, Datum arg);
335 static void CleanupTempFiles(bool isCommit, bool isProcExit);
336 static void RemovePgTempRelationFiles(const char *tsdirname);
337 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
338 
339 static void walkdir(const char *path,
340  void (*action) (const char *fname, bool isdir, int elevel),
341  bool process_symlinks,
342  int elevel);
343 #ifdef PG_FLUSH_DATA_WORKS
344 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
345 #endif
346 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
347 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
348 
349 static int fsync_parent_path(const char *fname, int elevel);
350 
351 
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * Dispatches to pg_fsync_writethrough() when the configured sync method asks
 * for write-through (and the platform distinguishes it from plain fsync),
 * otherwise to pg_fsync_no_writethrough().  Returns whatever the selected
 * helper returns.
 */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
	struct stat st;

	/*
	 * Some operating system implementations of fsync() have requirements
	 * about the file access modes that were used when their file descriptor
	 * argument was opened, and these requirements differ depending on whether
	 * the file descriptor is for a directory.
	 *
	 * For any file descriptor that may eventually be handed to fsync(), we
	 * should have opened it with access modes that are compatible with
	 * fsync() on all supported systems, otherwise the code may not be
	 * portable, even if it runs ok on the current system.
	 *
	 * We assert here that a descriptor for a file was opened with write
	 * permissions (either O_RDWR or O_WRONLY) and for a directory without
	 * write permissions (O_RDONLY).
	 *
	 * Ignore any fstat errors and let the follow-up fsync() do its work.
	 * Doing this sanity check here counts for the case where fsync() is
	 * disabled.
	 */
	if (fstat(fd, &st) == 0)
	{
		int			desc_flags = fcntl(fd, F_GETFL);

		/*
		 * O_RDONLY is historically 0, so just make sure that for directories
		 * no write flags are used.
		 */
		if (S_ISDIR(st.st_mode))
			Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
		else
			Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
	}
	/* don't let the sanity check leave a stray errno behind */
	errno = 0;
#endif

	/* #if is to skip the sync_method test if there's no need for it */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
	else
#endif
		return pg_fsync_no_writethrough(fd);
}
404 
405 
406 /*
407  * pg_fsync_no_writethrough --- same as fsync except does nothing if
408  * enableFsync is off
409  */
410 int
412 {
413  if (enableFsync)
414  return fsync(fd);
415  else
416  return 0;
417 }
418 
419 /*
420  * pg_fsync_writethrough
421  */
422 int
424 {
425  if (enableFsync)
426  {
427 #ifdef WIN32
428  return _commit(fd);
429 #elif defined(F_FULLFSYNC)
430  return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
431 #else
432  errno = ENOSYS;
433  return -1;
434 #endif
435  }
436  else
437  return 0;
438 }
439 
440 /*
441  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
442  */
443 int
445 {
446  if (enableFsync)
447  return fdatasync(fd);
448  else
449  return 0;
450 }
451 
452 /*
453  * pg_flush_data --- advise OS that the described dirty data should be flushed
454  *
455  * offset of 0 with nbytes 0 means that the entire file should be flushed
456  */
457 void
458 pg_flush_data(int fd, off_t offset, off_t nbytes)
459 {
460  /*
461  * Right now file flushing is primarily used to avoid making later
462  * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
463  * if fsyncs are disabled - that's a decision we might want to make
464  * configurable at some point.
465  */
466  if (!enableFsync)
467  return;
468 
469  /*
470  * We compile all alternatives that are supported on the current platform,
471  * to find portability problems more easily.
472  */
473 #if defined(HAVE_SYNC_FILE_RANGE)
474  {
475  int rc;
476  static bool not_implemented_by_kernel = false;
477 
478  if (not_implemented_by_kernel)
479  return;
480 
481  /*
482  * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
483  * tells the OS that writeback for the specified blocks should be
484  * started, but that we don't want to wait for completion. Note that
485  * this call might block if too much dirty data exists in the range.
486  * This is the preferable method on OSs supporting it, as it works
487  * reliably when available (contrast to msync()) and doesn't flush out
488  * clean data (like FADV_DONTNEED).
489  */
490  rc = sync_file_range(fd, offset, nbytes,
491  SYNC_FILE_RANGE_WRITE);
492  if (rc != 0)
493  {
494  int elevel;
495 
496  /*
497  * For systems that don't have an implementation of
498  * sync_file_range() such as Windows WSL, generate only one
499  * warning and then suppress all further attempts by this process.
500  */
501  if (errno == ENOSYS)
502  {
503  elevel = WARNING;
504  not_implemented_by_kernel = true;
505  }
506  else
507  elevel = data_sync_elevel(WARNING);
508 
509  ereport(elevel,
511  errmsg("could not flush dirty data: %m")));
512  }
513 
514  return;
515  }
516 #endif
517 #if !defined(WIN32) && defined(MS_ASYNC)
518  {
519  void *p;
520  static int pagesize = 0;
521 
522  /*
523  * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
524  * writeback. On linux it only does so if MS_SYNC is specified, but
525  * then it does the writeback synchronously. Luckily all common linux
526  * systems have sync_file_range(). This is preferable over
527  * FADV_DONTNEED because it doesn't flush out clean data.
528  *
529  * We map the file (mmap()), tell the kernel to sync back the contents
530  * (msync()), and then remove the mapping again (munmap()).
531  */
532 
533  /* mmap() needs actual length if we want to map whole file */
534  if (offset == 0 && nbytes == 0)
535  {
536  nbytes = lseek(fd, 0, SEEK_END);
537  if (nbytes < 0)
538  {
541  errmsg("could not determine dirty data size: %m")));
542  return;
543  }
544  }
545 
546  /*
547  * Some platforms reject partial-page mmap() attempts. To deal with
548  * that, just truncate the request to a page boundary. If any extra
549  * bytes don't get flushed, well, it's only a hint anyway.
550  */
551 
552  /* fetch pagesize only once */
553  if (pagesize == 0)
554  pagesize = sysconf(_SC_PAGESIZE);
555 
556  /* align length to pagesize, dropping any fractional page */
557  if (pagesize > 0)
558  nbytes = (nbytes / pagesize) * pagesize;
559 
560  /* fractional-page request is a no-op */
561  if (nbytes <= 0)
562  return;
563 
564  /*
565  * mmap could well fail, particularly on 32-bit platforms where there
566  * may simply not be enough address space. If so, silently fall
567  * through to the next implementation.
568  */
569  if (nbytes <= (off_t) SSIZE_MAX)
570  p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
571  else
572  p = MAP_FAILED;
573 
574  if (p != MAP_FAILED)
575  {
576  int rc;
577 
578  rc = msync(p, (size_t) nbytes, MS_ASYNC);
579  if (rc != 0)
580  {
583  errmsg("could not flush dirty data: %m")));
584  /* NB: need to fall through to munmap()! */
585  }
586 
587  rc = munmap(p, (size_t) nbytes);
588  if (rc != 0)
589  {
590  /* FATAL error because mapping would remain */
591  ereport(FATAL,
593  errmsg("could not munmap() while flushing data: %m")));
594  }
595 
596  return;
597  }
598  }
599 #endif
600 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
601  {
602  int rc;
603 
604  /*
605  * Signal the kernel that the passed in range should not be cached
606  * anymore. This has the, desired, side effect of writing out dirty
607  * data, and the, undesired, side effect of likely discarding useful
608  * clean cached blocks. For the latter reason this is the least
609  * preferable method.
610  */
611 
612  rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
613 
614  if (rc != 0)
615  {
616  /* don't error out, this is just a performance optimization */
619  errmsg("could not flush dirty data: %m")));
620  }
621 
622  return;
623  }
624 #endif
625 }
626 
/*
 * Truncate a file to a given length by name.
 *
 * On Windows the file is opened, truncated through its descriptor and closed
 * again; the errno left by ftruncate() is preserved across the close.
 * Elsewhere this is simply truncate(2).  Returns the result of the underlying
 * truncate call, or -1 if the file could not be opened.
 */
int
pg_truncate(const char *path, off_t length)
{
#ifdef WIN32
	int			save_errno;
	int			ret;
	int			fd;

	fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
	if (fd >= 0)
	{
		ret = ftruncate(fd, length);
		save_errno = errno;
		/* always release the descriptor, but report ftruncate's errno */
		CloseTransientFile(fd);
		errno = save_errno;
	}
	else
		ret = -1;

	return ret;
#else
	return truncate(path, length);
#endif
}
654 
655 /*
656  * fsync_fname -- fsync a file or directory, handling errors properly
657  *
658  * Try to fsync a file or directory. When doing the latter, ignore errors that
659  * indicate the OS just doesn't allow/require fsyncing directories.
660  */
661 void
662 fsync_fname(const char *fname, bool isdir)
663 {
664  fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
665 }
666 
667 /*
668  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
669  *
670  * This routine ensures that, after returning, the effect of renaming file
671  * persists in case of a crash. A crash while this routine is running will
672  * leave you with either the pre-existing or the moved file in place of the
673  * new file; no mixed state or truncated files are possible.
674  *
675  * It does so by using fsync on the old filename and the possibly existing
676  * target filename before the rename, and the target file and directory after.
677  *
678  * Note that rename() cannot be used across arbitrary directories, as they
679  * might not be on the same filesystem. Therefore this routine does not
680  * support renaming across directories.
681  *
682  * Log errors with the caller specified severity.
683  *
684  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
685  * valid upon return.
686  */
687 int
688 durable_rename(const char *oldfile, const char *newfile, int elevel)
689 {
690  int fd;
691 
692  /*
693  * First fsync the old and target path (if it exists), to ensure that they
694  * are properly persistent on disk. Syncing the target file is not
695  * strictly necessary, but it makes it easier to reason about crashes;
696  * because it's then guaranteed that either source or target file exists
697  * after a crash.
698  */
699  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
700  return -1;
701 
702  fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
703  if (fd < 0)
704  {
705  if (errno != ENOENT)
706  {
707  ereport(elevel,
709  errmsg("could not open file \"%s\": %m", newfile)));
710  return -1;
711  }
712  }
713  else
714  {
715  if (pg_fsync(fd) != 0)
716  {
717  int save_errno;
718 
719  /* close file upon error, might not be in transaction context */
720  save_errno = errno;
722  errno = save_errno;
723 
724  ereport(elevel,
726  errmsg("could not fsync file \"%s\": %m", newfile)));
727  return -1;
728  }
729 
730  if (CloseTransientFile(fd) != 0)
731  {
732  ereport(elevel,
734  errmsg("could not close file \"%s\": %m", newfile)));
735  return -1;
736  }
737  }
738 
739  /* Time to do the real deal... */
740  if (rename(oldfile, newfile) < 0)
741  {
742  ereport(elevel,
744  errmsg("could not rename file \"%s\" to \"%s\": %m",
745  oldfile, newfile)));
746  return -1;
747  }
748 
749  /*
750  * To guarantee renaming the file is persistent, fsync the file with its
751  * new name, and its containing directory.
752  */
753  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
754  return -1;
755 
756  if (fsync_parent_path(newfile, elevel) != 0)
757  return -1;
758 
759  return 0;
760 }
761 
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * This routine ensures that, after returning, the effect of removing file
 * persists in case of a crash. A crash while this routine is running will
 * leave the system in no mixed state.
 *
 * It does so by using fsync on the parent directory of the file after the
 * actual removal is done.
 *
 * Log errors with the severity specified by caller.
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	if (unlink(fname) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/*
	 * To guarantee that the removal of the file is persistent, fsync its
	 * parent directory.
	 */
	if (fsync_parent_path(fname, elevel) != 0)
		return -1;

	return 0;
}
798 
799 /*
800  * InitFileAccess --- initialize this module during backend startup
801  *
802  * This is called during either normal or standalone backend start.
803  * It is *not* called in the postmaster.
804  *
805  * Note that this does not initialize temporary file access, that is
806  * separately initialized via InitTemporaryFileAccess().
807  */
808 void
810 {
811  Assert(SizeVfdCache == 0); /* call me only once */
812 
813  /* initialize cache header entry */
814  VfdCache = (Vfd *) malloc(sizeof(Vfd));
815  if (VfdCache == NULL)
816  ereport(FATAL,
817  (errcode(ERRCODE_OUT_OF_MEMORY),
818  errmsg("out of memory")));
819 
820  MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
822 
823  SizeVfdCache = 1;
824 }
825 
826 /*
827  * InitTemporaryFileAccess --- initialize temporary file access during startup
828  *
829  * This is called during either normal or standalone backend start.
830  * It is *not* called in the postmaster.
831  *
832  * This is separate from InitFileAccess() because temporary file cleanup can
833  * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
834  * our reporting has to happen before that. Low level file access should be
835  * available for longer, hence the separate initialization / shutdown of
836  * temporary file handling.
837  */
838 void
840 {
841  Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
842  Assert(!temporary_files_allowed); /* call me only once */
843 
844  /*
845  * Register before-shmem-exit hook to ensure temp files are dropped while
846  * we can still report stats.
847  */
849 
850 #ifdef USE_ASSERT_CHECKING
851  temporary_files_allowed = true;
852 #endif
853 }
854 
855 /*
856  * count_usable_fds --- count how many FDs the system will let us open,
857  * and estimate how many are already open.
858  *
859  * We stop counting if usable_fds reaches max_to_probe. Note: a small
860  * value of max_to_probe might result in an underestimate of already_open;
861  * we must fill in any "gaps" in the set of used FDs before the calculation
862  * of already_open will give the right answer. In practice, max_to_probe
863  * of a couple of dozen should be enough to ensure good results.
864  *
865  * We assume stderr (FD 2) is available for dup'ing. While the calling
866  * script could theoretically close that, it would be a really bad idea,
867  * since then one risks loss of error messages from, e.g., libc.
868  */
869 static void
870 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
871 {
872  int *fd;
873  int size;
874  int used = 0;
875  int highestfd = 0;
876  int j;
877 
878 #ifdef HAVE_GETRLIMIT
879  struct rlimit rlim;
880  int getrlimit_status;
881 #endif
882 
883  size = 1024;
884  fd = (int *) palloc(size * sizeof(int));
885 
886 #ifdef HAVE_GETRLIMIT
887  getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
888  if (getrlimit_status != 0)
889  ereport(WARNING, (errmsg("getrlimit failed: %m")));
890 #endif /* HAVE_GETRLIMIT */
891 
892  /* dup until failure or probe limit reached */
893  for (;;)
894  {
895  int thisfd;
896 
897 #ifdef HAVE_GETRLIMIT
898 
899  /*
900  * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
901  * some platforms
902  */
903  if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
904  break;
905 #endif
906 
907  thisfd = dup(2);
908  if (thisfd < 0)
909  {
910  /* Expect EMFILE or ENFILE, else it's fishy */
911  if (errno != EMFILE && errno != ENFILE)
912  elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
913  break;
914  }
915 
916  if (used >= size)
917  {
918  size *= 2;
919  fd = (int *) repalloc(fd, size * sizeof(int));
920  }
921  fd[used++] = thisfd;
922 
923  if (highestfd < thisfd)
924  highestfd = thisfd;
925 
926  if (used >= max_to_probe)
927  break;
928  }
929 
930  /* release the files we opened */
931  for (j = 0; j < used; j++)
932  close(fd[j]);
933 
934  pfree(fd);
935 
936  /*
937  * Return results. usable_fds is just the number of successful dups. We
938  * assume that the system limit is highestfd+1 (remember 0 is a legal FD
939  * number) and so already_open is highestfd+1 - usable_fds.
940  */
941  *usable_fds = used;
942  *already_open = highestfd + 1 - used;
943 }
944 
945 /*
946  * set_max_safe_fds
947  * Determine number of file descriptors that fd.c is allowed to use
948  */
949 void
951 {
952  int usable_fds;
953  int already_open;
954 
955  /*----------
956  * We want to set max_safe_fds to
957  * MIN(usable_fds, max_files_per_process - already_open)
958  * less the slop factor for files that are opened without consulting
959  * fd.c. This ensures that we won't exceed either max_files_per_process
960  * or the experimentally-determined EMFILE limit.
961  *----------
962  */
964  &usable_fds, &already_open);
965 
966  max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
967 
968  /*
969  * Take off the FDs reserved for system() etc.
970  */
972 
973  /*
974  * Make sure we still have enough to get by.
975  */
976  if (max_safe_fds < FD_MINFREE)
977  ereport(FATAL,
978  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
979  errmsg("insufficient file descriptors available to start server process"),
980  errdetail("System allows %d, server needs at least %d.",
983 
984  elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
985  max_safe_fds, usable_fds, already_open);
986 }
987 
988 /*
989  * Open a file with BasicOpenFilePerm() and pass default file mode for the
990  * fileMode parameter.
991  */
992 int
993 BasicOpenFile(const char *fileName, int fileFlags)
994 {
995  return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
996 }
997 
998 /*
999  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1000  *
1001  * This is exported for use by places that really want a plain kernel FD,
1002  * but need to be proof against running out of FDs. Once an FD has been
1003  * successfully returned, it is the caller's responsibility to ensure that
1004  * it will not be leaked on ereport()! Most users should *not* call this
1005  * routine directly, but instead use the VFD abstraction level, which
1006  * provides protection against descriptor leaks as well as management of
1007  * files that need to be open for more than a short period of time.
1008  *
1009  * Ideally this should be the *only* direct call of open() in the backend.
1010  * In practice, the postmaster calls open() directly, and there are some
1011  * direct open() calls done early in backend startup. Those are OK since
1012  * this module wouldn't have any open files to close at that point anyway.
1013  */
1014 int
1015 BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1016 {
1017  int fd;
1018 
1019 tryAgain:
1020 #ifdef PG_O_DIRECT_USE_F_NOCACHE
1021 
1022  /*
1023  * The value we defined to stand in for O_DIRECT when simulating it with
1024  * F_NOCACHE had better not collide with any of the standard flags.
1025  */
1027  (O_APPEND |
1028  O_CLOEXEC |
1029  O_CREAT |
1030  O_DSYNC |
1031  O_EXCL |
1032  O_RDWR |
1033  O_RDONLY |
1034  O_SYNC |
1035  O_TRUNC |
1036  O_WRONLY)) == 0,
1037  "PG_O_DIRECT value collides with standard flag");
1038  fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1039 #else
1040  fd = open(fileName, fileFlags, fileMode);
1041 #endif
1042 
1043  if (fd >= 0)
1044  {
1045 #ifdef PG_O_DIRECT_USE_F_NOCACHE
1046  if (fileFlags & PG_O_DIRECT)
1047  {
1048  if (fcntl(fd, F_NOCACHE, 1) < 0)
1049  {
1050  int save_errno = errno;
1051 
1052  close(fd);
1053  errno = save_errno;
1054  return -1;
1055  }
1056  }
1057 #endif
1058 
1059  return fd; /* success! */
1060  }
1061 
1062  if (errno == EMFILE || errno == ENFILE)
1063  {
1064  int save_errno = errno;
1065 
1066  ereport(LOG,
1067  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1068  errmsg("out of file descriptors: %m; release and retry")));
1069  errno = 0;
1070  if (ReleaseLruFile())
1071  goto tryAgain;
1072  errno = save_errno;
1073  }
1074 
1075  return -1; /* failure */
1076 }
1077 
1078 /*
1079  * AcquireExternalFD - attempt to reserve an external file descriptor
1080  *
1081  * This should be used by callers that need to hold a file descriptor open
1082  * over more than a short interval, but cannot use any of the other facilities
1083  * provided by this module.
1084  *
1085  * The difference between this and the underlying ReserveExternalFD function
1086  * is that this will report failure (by setting errno and returning false)
1087  * if "too many" external FDs are already reserved. This should be used in
1088  * any code where the total number of FDs to be reserved is not predictable
1089  * and small.
1090  */
1091 bool
1093 {
1094  /*
1095  * We don't want more than max_safe_fds / 3 FDs to be consumed for
1096  * "external" FDs.
1097  */
1098  if (numExternalFDs < max_safe_fds / 3)
1099  {
1101  return true;
1102  }
1103  errno = EMFILE;
1104  return false;
1105 }
1106 
1107 /*
1108  * ReserveExternalFD - report external consumption of a file descriptor
1109  *
1110  * This should be used by callers that need to hold a file descriptor open
1111  * over more than a short interval, but cannot use any of the other facilities
1112  * provided by this module. This just tracks the use of the FD and closes
1113  * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1114  *
1115  * Call this directly only in code where failure to reserve the FD would be
1116  * fatal; for example, the WAL-writing code does so, since the alternative is
1117  * session failure. Also, it's very unwise to do so in code that could
1118  * consume more than one FD per process.
1119  *
1120  * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1121  * available, it doesn't matter too much whether this is called before or
1122  * after actually opening the FD; but doing so beforehand reduces the risk of
1123  * an EMFILE failure if not everybody played nice. In any case, it's solely
1124  * caller's responsibility to keep the external-FD count in sync with reality.
1125  */
1126 void
1128 {
1129  /*
1130  * Release VFDs if needed to stay safe. Because we do this before
1131  * incrementing numExternalFDs, the final state will be as desired, i.e.,
1132  * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1133  */
1134  ReleaseLruFiles();
1135 
1136  numExternalFDs++;
1137 }
1138 
1139 /*
1140  * ReleaseExternalFD - report release of an external file descriptor
1141  *
1142  * This is guaranteed not to change errno, so it can be used in failure paths.
1143  */
1144 void
1146 {
1147  Assert(numExternalFDs > 0);
1148  numExternalFDs--;
1149 }
1150 
1151 
1152 #if defined(FDDEBUG)
1153 
1154 static void
1155 _dump_lru(void)
1156 {
1157  int mru = VfdCache[0].lruLessRecently;
1158  Vfd *vfdP = &VfdCache[mru];
1159  char buf[2048];
1160 
1161  snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
1162  while (mru != 0)
1163  {
1164  mru = vfdP->lruLessRecently;
1165  vfdP = &VfdCache[mru];
1166  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
1167  }
1168  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
1169  elog(LOG, "%s", buf);
1170 }
1171 #endif /* FDDEBUG */
1172 
1173 static void
1175 {
1176  Vfd *vfdP;
1177 
1178  Assert(file != 0);
1179 
1180  DO_DB(elog(LOG, "Delete %d (%s)",
1181  file, VfdCache[file].fileName));
1182  DO_DB(_dump_lru());
1183 
1184  vfdP = &VfdCache[file];
1185 
1188 
1189  DO_DB(_dump_lru());
1190 }
1191 
1192 static void
1194 {
1195  Vfd *vfdP;
1196 
1197  Assert(file != 0);
1198 
1199  DO_DB(elog(LOG, "LruDelete %d (%s)",
1200  file, VfdCache[file].fileName));
1201 
1202  vfdP = &VfdCache[file];
1203 
1204  /*
1205  * Close the file. We aren't expecting this to fail; if it does, better
1206  * to leak the FD than to mess up our internal state.
1207  */
1208  if (close(vfdP->fd) != 0)
1210  "could not close file \"%s\": %m", vfdP->fileName);
1211  vfdP->fd = VFD_CLOSED;
1212  --nfile;
1213 
1214  /* delete the vfd record from the LRU ring */
1215  Delete(file);
1216 }
1217 
1218 static void
1220 {
1221  Vfd *vfdP;
1222 
1223  Assert(file != 0);
1224 
1225  DO_DB(elog(LOG, "Insert %d (%s)",
1226  file, VfdCache[file].fileName));
1227  DO_DB(_dump_lru());
1228 
1229  vfdP = &VfdCache[file];
1230 
1231  vfdP->lruMoreRecently = 0;
1233  VfdCache[0].lruLessRecently = file;
1235 
1236  DO_DB(_dump_lru());
1237 }
1238 
1239 /* returns 0 on success, -1 on re-open failure (with errno set) */
1240 static int
1242 {
1243  Vfd *vfdP;
1244 
1245  Assert(file != 0);
1246 
1247  DO_DB(elog(LOG, "LruInsert %d (%s)",
1248  file, VfdCache[file].fileName));
1249 
1250  vfdP = &VfdCache[file];
1251 
1252  if (FileIsNotOpen(file))
1253  {
1254  /* Close excess kernel FDs. */
1255  ReleaseLruFiles();
1256 
1257  /*
1258  * The open could still fail for lack of file descriptors, eg due to
1259  * overall system file table being full. So, be prepared to release
1260  * another FD if necessary...
1261  */
1262  vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1263  vfdP->fileMode);
1264  if (vfdP->fd < 0)
1265  {
1266  DO_DB(elog(LOG, "re-open failed: %m"));
1267  return -1;
1268  }
1269  else
1270  {
1271  ++nfile;
1272  }
1273  }
1274 
1275  /*
1276  * put it at the head of the Lru ring
1277  */
1278 
1279  Insert(file);
1280 
1281  return 0;
1282 }
1283 
1284 /*
1285  * Release one kernel FD by closing the least-recently-used VFD.
1286  */
1287 static bool
1289 {
1290  DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1291 
1292  if (nfile > 0)
1293  {
1294  /*
1295  * There are opened files and so there should be at least one used vfd
1296  * in the ring.
1297  */
1298  Assert(VfdCache[0].lruMoreRecently != 0);
1299  LruDelete(VfdCache[0].lruMoreRecently);
1300  return true; /* freed a file */
1301  }
1302  return false; /* no files available to free */
1303 }
1304 
1305 /*
1306  * Release kernel FDs as needed to get under the max_safe_fds limit.
1307  * After calling this, it's OK to try to open another file.
1308  */
1309 static void
1311 {
1313  {
1314  if (!ReleaseLruFile())
1315  break;
1316  }
1317 }
1318 
1319 static File
1321 {
1322  Index i;
1323  File file;
1324 
1325  DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1326 
1327  Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1328 
1329  if (VfdCache[0].nextFree == 0)
1330  {
1331  /*
1332  * The free list is empty so it is time to increase the size of the
1333  * array. We choose to double it each time this happens. However,
1334  * there's not much point in starting *real* small.
1335  */
1336  Size newCacheSize = SizeVfdCache * 2;
1337  Vfd *newVfdCache;
1338 
1339  if (newCacheSize < 32)
1340  newCacheSize = 32;
1341 
1342  /*
1343  * Be careful not to clobber VfdCache ptr if realloc fails.
1344  */
1345  newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1346  if (newVfdCache == NULL)
1347  ereport(ERROR,
1348  (errcode(ERRCODE_OUT_OF_MEMORY),
1349  errmsg("out of memory")));
1350  VfdCache = newVfdCache;
1351 
1352  /*
1353  * Initialize the new entries and link them into the free list.
1354  */
1355  for (i = SizeVfdCache; i < newCacheSize; i++)
1356  {
1357  MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1358  VfdCache[i].nextFree = i + 1;
1359  VfdCache[i].fd = VFD_CLOSED;
1360  }
1361  VfdCache[newCacheSize - 1].nextFree = 0;
1363 
1364  /*
1365  * Record the new size
1366  */
1367  SizeVfdCache = newCacheSize;
1368  }
1369 
1370  file = VfdCache[0].nextFree;
1371 
1372  VfdCache[0].nextFree = VfdCache[file].nextFree;
1373 
1374  return file;
1375 }
1376 
1377 static void
1379 {
1380  Vfd *vfdP = &VfdCache[file];
1381 
1382  DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1383  file, vfdP->fileName ? vfdP->fileName : ""));
1384 
1385  if (vfdP->fileName != NULL)
1386  {
1387  free(vfdP->fileName);
1388  vfdP->fileName = NULL;
1389  }
1390  vfdP->fdstate = 0x0;
1391 
1392  vfdP->nextFree = VfdCache[0].nextFree;
1393  VfdCache[0].nextFree = file;
1394 }
1395 
1396 /* returns 0 on success, -1 on re-open failure (with errno set) */
1397 static int
1399 {
1400  int returnValue;
1401 
1402  DO_DB(elog(LOG, "FileAccess %d (%s)",
1403  file, VfdCache[file].fileName));
1404 
1405  /*
1406  * Is the file open? If not, open it and put it at the head of the LRU
1407  * ring (possibly closing the least recently used file to get an FD).
1408  */
1409 
1410  if (FileIsNotOpen(file))
1411  {
1412  returnValue = LruInsert(file);
1413  if (returnValue != 0)
1414  return returnValue;
1415  }
1416  else if (VfdCache[0].lruLessRecently != file)
1417  {
1418  /*
1419  * We now know that the file is open and that it is not the last one
1420  * accessed, so we need to move it to the head of the Lru ring.
1421  */
1422 
1423  Delete(file);
1424  Insert(file);
1425  }
1426 
1427  return 0;
1428 }
1429 
1430 /*
1431  * Called whenever a temporary file is deleted to report its size.
1432  */
1433 static void
1434 ReportTemporaryFileUsage(const char *path, off_t size)
1435 {
1436  pgstat_report_tempfile(size);
1437 
1438  if (log_temp_files >= 0)
1439  {
1440  if ((size / 1024) >= log_temp_files)
1441  ereport(LOG,
1442  (errmsg("temporary file: path \"%s\", size %lu",
1443  path, (unsigned long) size)));
1444  }
1445 }
1446 
1447 /*
1448  * Called to register a temporary file for automatic close.
1449  * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1450  * before the file was opened.
1451  */
1452 static void
1454 {
1457 
1458  /* Backup mechanism for closing at end of xact. */
1461 }
1462 
/*
 * FileInvalidate --- called when we get a shared invalidation message on
 * some relation.
 *
 * If the VFD currently holds a kernel FD, that FD is closed via LruDelete();
 * a VFD that is already closed is left alone.  Compiled only when NOT_USED
 * is defined.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	if (FileIsNotOpen(file))
		return;

	LruDelete(file);
}
#endif
1475 
1476 /*
1477  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1478  * fileMode parameter.
1479  */
1480 File
1481 PathNameOpenFile(const char *fileName, int fileFlags)
1482 {
1483  return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1484 }
1485 
1486 /*
1487  * open a file in an arbitrary directory
1488  *
1489  * NB: if the passed pathname is relative (which it usually is),
1490  * it will be interpreted relative to the process' working directory
1491  * (which should always be $PGDATA when this code is running).
1492  */
1493 File
1494 PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1495 {
1496  char *fnamecopy;
1497  File file;
1498  Vfd *vfdP;
1499 
1500  DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1501  fileName, fileFlags, fileMode));
1502 
1503  /*
1504  * We need a malloc'd copy of the file name; fail cleanly if no room.
1505  */
1506  fnamecopy = strdup(fileName);
1507  if (fnamecopy == NULL)
1508  ereport(ERROR,
1509  (errcode(ERRCODE_OUT_OF_MEMORY),
1510  errmsg("out of memory")));
1511 
1512  file = AllocateVfd();
1513  vfdP = &VfdCache[file];
1514 
1515  /* Close excess kernel FDs. */
1516  ReleaseLruFiles();
1517 
1518  /*
1519  * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1520  * client shouldn't be expected to know which kernel descriptors are
1521  * currently open, so it wouldn't make sense for them to be inherited by
1522  * executed subprograms.
1523  */
1524  fileFlags |= O_CLOEXEC;
1525 
1526  vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1527 
1528  if (vfdP->fd < 0)
1529  {
1530  int save_errno = errno;
1531 
1532  FreeVfd(file);
1533  free(fnamecopy);
1534  errno = save_errno;
1535  return -1;
1536  }
1537  ++nfile;
1538  DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1539  vfdP->fd));
1540 
1541  vfdP->fileName = fnamecopy;
1542  /* Saved flags are adjusted to be OK for re-opening file */
1543  vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1544  vfdP->fileMode = fileMode;
1545  vfdP->fileSize = 0;
1546  vfdP->fdstate = 0x0;
1547  vfdP->resowner = NULL;
1548 
1549  Insert(file);
1550 
1551  return file;
1552 }
1553 
1554 /*
1555  * Create directory 'directory'. If necessary, create 'basedir', which must
1556  * be the directory above it. This is designed for creating the top-level
1557  * temporary directory on demand before creating a directory underneath it.
1558  * Do nothing if the directory already exists.
1559  *
1560  * Directories created within the top-level temporary directory should begin
1561  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1562  * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1563  * that do not need any particular prefix.
1564 */
1565 void
1567 {
1568  if (MakePGDirectory(directory) < 0)
1569  {
1570  if (errno == EEXIST)
1571  return;
1572 
1573  /*
1574  * Failed. Try to create basedir first in case it's missing. Tolerate
1575  * EEXIST to close a race against another process following the same
1576  * algorithm.
1577  */
1578  if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1579  ereport(ERROR,
1581  errmsg("cannot create temporary directory \"%s\": %m",
1582  basedir)));
1583 
1584  /* Try again. */
1585  if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1586  ereport(ERROR,
1588  errmsg("cannot create temporary subdirectory \"%s\": %m",
1589  directory)));
1590  }
1591 }
1592 
1593 /*
1594  * Delete a directory and everything in it, if it exists.
1595  */
1596 void
1597 PathNameDeleteTemporaryDir(const char *dirname)
1598 {
1599  struct stat statbuf;
1600 
1601  /* Silently ignore missing directory. */
1602  if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1603  return;
1604 
1605  /*
1606  * Currently, walkdir doesn't offer a way for our passed in function to
1607  * maintain state. Perhaps it should, so that we could tell the caller
1608  * whether this operation succeeded or failed. Since this operation is
1609  * used in a cleanup path, we wouldn't actually behave differently: we'll
1610  * just log failures.
1611  */
1612  walkdir(dirname, unlink_if_exists_fname, false, LOG);
1613 }
1614 
1615 /*
1616  * Open a temporary file that will disappear when we close it.
1617  *
1618  * This routine takes care of generating an appropriate tempfile name.
1619  * There's no need to pass in fileFlags or fileMode either, since only
1620  * one setting makes any sense for a temp file.
1621  *
1622  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1623  * to ensure it's closed and deleted when it's no longer needed, typically at
1624  * the end-of-transaction. In most cases, you don't want temporary files to
1625  * outlive the transaction that created them, so this should be false -- but
1626  * if you need "somewhat" temporary storage, this might be useful. In either
1627  * case, the file is removed when the File is explicitly closed.
1628  */
1629 File
1630 OpenTemporaryFile(bool interXact)
1631 {
1632  File file = 0;
1633 
1634  Assert(temporary_files_allowed); /* check temp file access is up */
1635 
1636  /*
1637  * Make sure the current resource owner has space for this File before we
1638  * open it, if we'll be registering it below.
1639  */
1640  if (!interXact)
1642 
1643  /*
1644  * If some temp tablespace(s) have been given to us, try to use the next
1645  * one. If a given tablespace can't be found, we silently fall back to
1646  * the database's default tablespace.
1647  *
1648  * BUT: if the temp file is slated to outlive the current transaction,
1649  * force it into the database's default tablespace, so that it will not
1650  * pose a threat to possible tablespace drop attempts.
1651  */
1652  if (numTempTableSpaces > 0 && !interXact)
1653  {
1654  Oid tblspcOid = GetNextTempTableSpace();
1655 
1656  if (OidIsValid(tblspcOid))
1657  file = OpenTemporaryFileInTablespace(tblspcOid, false);
1658  }
1659 
1660  /*
1661  * If not, or if tablespace is bad, create in database's default
1662  * tablespace. MyDatabaseTableSpace should normally be set before we get
1663  * here, but just in case it isn't, fall back to pg_default tablespace.
1664  */
1665  if (file <= 0)
1668  DEFAULTTABLESPACE_OID,
1669  true);
1670 
1671  /* Mark it for deletion at close and temporary file size limit */
1673 
1674  /* Register it with the current resource owner */
1675  if (!interXact)
1676  RegisterTemporaryFile(file);
1677 
1678  return file;
1679 }
1680 
1681 /*
1682  * Return the path of the temp directory in a given tablespace.
1683  */
1684 void
1686 {
1687  /*
1688  * Identify the tempfile directory for this tablespace.
1689  *
1690  * If someone tries to specify pg_global, use pg_default instead.
1691  */
1692  if (tablespace == InvalidOid ||
1693  tablespace == DEFAULTTABLESPACE_OID ||
1694  tablespace == GLOBALTABLESPACE_OID)
1695  snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1696  else
1697  {
1698  /* All other tablespaces are accessed via symlinks */
1699  snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1702  }
1703 }
1704 
1705 /*
1706  * Open a temporary file in a specific tablespace.
1707  * Subroutine for OpenTemporaryFile, which see for details.
1708  */
1709 static File
1710 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1711 {
1712  char tempdirpath[MAXPGPATH];
1713  char tempfilepath[MAXPGPATH];
1714  File file;
1715 
1716  TempTablespacePath(tempdirpath, tblspcOid);
1717 
1718  /*
1719  * Generate a tempfile name that should be unique within the current
1720  * database instance.
1721  */
1722  snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1723  tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1724 
1725  /*
1726  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1727  * temp file that can be reused.
1728  */
1729  file = PathNameOpenFile(tempfilepath,
1730  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1731  if (file <= 0)
1732  {
1733  /*
1734  * We might need to create the tablespace's tempfile directory, if no
1735  * one has yet done so.
1736  *
1737  * Don't check for an error from MakePGDirectory; it could fail if
1738  * someone else just did the same thing. If it doesn't work then
1739  * we'll bomb out on the second create attempt, instead.
1740  */
1741  (void) MakePGDirectory(tempdirpath);
1742 
1743  file = PathNameOpenFile(tempfilepath,
1744  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1745  if (file <= 0 && rejectError)
1746  elog(ERROR, "could not create temporary file \"%s\": %m",
1747  tempfilepath);
1748  }
1749 
1750  return file;
1751 }
1752 
1753 
1754 /*
1755  * Create a new file. The directory containing it must already exist. Files
1756  * created this way are subject to temp_file_limit and are automatically
1757  * closed at end of transaction, but are not automatically deleted on close
1758  * because they are intended to be shared between cooperating backends.
1759  *
1760  * If the file is inside the top-level temporary directory, its name should
1761  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1762  * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1763  * inside a directory created with PathNameCreateTemporaryDir(), in which case
1764  * the prefix isn't needed.
1765  */
1766 File
1767 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1768 {
1769  File file;
1770 
1771  Assert(temporary_files_allowed); /* check temp file access is up */
1772 
1774 
1775  /*
1776  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1777  * temp file that can be reused.
1778  */
1779  file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1780  if (file <= 0)
1781  {
1782  if (error_on_failure)
1783  ereport(ERROR,
1785  errmsg("could not create temporary file \"%s\": %m",
1786  path)));
1787  else
1788  return file;
1789  }
1790 
1791  /* Mark it for temp_file_limit accounting. */
1793 
1794  /* Register it for automatic close. */
1795  RegisterTemporaryFile(file);
1796 
1797  return file;
1798 }
1799 
1800 /*
1801  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1802  * another backend. Files opened this way don't count against the
1803  * temp_file_limit of the caller, are automatically closed at the end of the
1804  * transaction but are not deleted on close.
1805  */
1806 File
1807 PathNameOpenTemporaryFile(const char *path, int mode)
1808 {
1809  File file;
1810 
1811  Assert(temporary_files_allowed); /* check temp file access is up */
1812 
1814 
1815  file = PathNameOpenFile(path, mode | PG_BINARY);
1816 
1817  /* If no such file, then we don't raise an error. */
1818  if (file <= 0 && errno != ENOENT)
1819  ereport(ERROR,
1821  errmsg("could not open temporary file \"%s\": %m",
1822  path)));
1823 
1824  if (file > 0)
1825  {
1826  /* Register it for automatic close. */
1827  RegisterTemporaryFile(file);
1828  }
1829 
1830  return file;
1831 }
1832 
1833 /*
1834  * Delete a file by pathname. Return true if the file existed, false if
1835  * didn't.
1836  */
1837 bool
1838 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1839 {
1840  struct stat filestats;
1841  int stat_errno;
1842 
1843  /* Get the final size for pgstat reporting. */
1844  if (stat(path, &filestats) != 0)
1845  stat_errno = errno;
1846  else
1847  stat_errno = 0;
1848 
1849  /*
1850  * Unlike FileClose's automatic file deletion code, we tolerate
1851  * non-existence to support BufFileDeleteFileSet which doesn't know how
1852  * many segments it has to delete until it runs out.
1853  */
1854  if (stat_errno == ENOENT)
1855  return false;
1856 
1857  if (unlink(path) < 0)
1858  {
1859  if (errno != ENOENT)
1860  ereport(error_on_failure ? ERROR : LOG,
1862  errmsg("could not unlink temporary file \"%s\": %m",
1863  path)));
1864  return false;
1865  }
1866 
1867  if (stat_errno == 0)
1868  ReportTemporaryFileUsage(path, filestats.st_size);
1869  else
1870  {
1871  errno = stat_errno;
1872  ereport(LOG,
1874  errmsg("could not stat file \"%s\": %m", path)));
1875  }
1876 
1877  return true;
1878 }
1879 
1880 /*
1881  * close a file when done with it
1882  */
1883 void
1885 {
1886  Vfd *vfdP;
1887 
1888  Assert(FileIsValid(file));
1889 
1890  DO_DB(elog(LOG, "FileClose: %d (%s)",
1891  file, VfdCache[file].fileName));
1892 
1893  vfdP = &VfdCache[file];
1894 
1895  if (!FileIsNotOpen(file))
1896  {
1897  /* close the file */
1898  if (close(vfdP->fd) != 0)
1899  {
1900  /*
1901  * We may need to panic on failure to close non-temporary files;
1902  * see LruDelete.
1903  */
1905  "could not close file \"%s\": %m", vfdP->fileName);
1906  }
1907 
1908  --nfile;
1909  vfdP->fd = VFD_CLOSED;
1910 
1911  /* remove the file from the lru ring */
1912  Delete(file);
1913  }
1914 
1915  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1916  {
1917  /* Subtract its size from current usage (do first in case of error) */
1918  temporary_files_size -= vfdP->fileSize;
1919  vfdP->fileSize = 0;
1920  }
1921 
1922  /*
1923  * Delete the file if it was temporary, and make a log entry if wanted
1924  */
1925  if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
1926  {
1927  struct stat filestats;
1928  int stat_errno;
1929 
1930  /*
1931  * If we get an error, as could happen within the ereport/elog calls,
1932  * we'll come right back here during transaction abort. Reset the
1933  * flag to ensure that we can't get into an infinite loop. This code
1934  * is arranged to ensure that the worst-case consequence is failing to
1935  * emit log message(s), not failing to attempt the unlink.
1936  */
1937  vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
1938 
1939 
1940  /* first try the stat() */
1941  if (stat(vfdP->fileName, &filestats))
1942  stat_errno = errno;
1943  else
1944  stat_errno = 0;
1945 
1946  /* in any case do the unlink */
1947  if (unlink(vfdP->fileName))
1948  ereport(LOG,
1950  errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
1951 
1952  /* and last report the stat results */
1953  if (stat_errno == 0)
1954  ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
1955  else
1956  {
1957  errno = stat_errno;
1958  ereport(LOG,
1960  errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
1961  }
1962  }
1963 
1964  /* Unregister it from the resource owner */
1965  if (vfdP->resowner)
1966  ResourceOwnerForgetFile(vfdP->resowner, file);
1967 
1968  /*
1969  * Return the Vfd slot to the free list
1970  */
1971  FreeVfd(file);
1972 }
1973 
1974 /*
1975  * FilePrefetch - initiate asynchronous read of a given range of the file.
1976  *
1977  * Currently the only implementation of this function is using posix_fadvise
1978  * which is the simplest standardized interface that accomplishes this.
1979  * We could add an implementation using libaio in the future; but note that
1980  * this API is inappropriate for libaio, which wants to have a buffer provided
1981  * to read into.
1982  */
1983 int
1984 FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
1985 {
1986 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1987  int returnCode;
1988 
1989  Assert(FileIsValid(file));
1990 
1991  DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1992  file, VfdCache[file].fileName,
1993  (int64) offset, amount));
1994 
1995  returnCode = FileAccess(file);
1996  if (returnCode < 0)
1997  return returnCode;
1998 
1999  pgstat_report_wait_start(wait_event_info);
2000  returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2001  POSIX_FADV_WILLNEED);
2003 
2004  return returnCode;
2005 #else
2006  Assert(FileIsValid(file));
2007  return 0;
2008 #endif
2009 }
2010 
2011 void
2012 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
2013 {
2014  int returnCode;
2015 
2016  Assert(FileIsValid(file));
2017 
2018  DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2019  file, VfdCache[file].fileName,
2020  (int64) offset, (int64) nbytes));
2021 
2022  if (nbytes <= 0)
2023  return;
2024 
2025  returnCode = FileAccess(file);
2026  if (returnCode < 0)
2027  return;
2028 
2029  pgstat_report_wait_start(wait_event_info);
2030  pg_flush_data(VfdCache[file].fd, offset, nbytes);
2032 }
2033 
2034 int
2035 FileRead(File file, void *buffer, size_t amount, off_t offset,
2036  uint32 wait_event_info)
2037 {
2038  int returnCode;
2039  Vfd *vfdP;
2040 
2041  Assert(FileIsValid(file));
2042 
2043  DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %zu %p",
2044  file, VfdCache[file].fileName,
2045  (int64) offset,
2046  amount, buffer));
2047 
2048  returnCode = FileAccess(file);
2049  if (returnCode < 0)
2050  return returnCode;
2051 
2052  vfdP = &VfdCache[file];
2053 
2054 retry:
2055  pgstat_report_wait_start(wait_event_info);
2056  returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
2058 
2059  if (returnCode < 0)
2060  {
2061  /*
2062  * Windows may run out of kernel buffers and return "Insufficient
2063  * system resources" error. Wait a bit and retry to solve it.
2064  *
2065  * It is rumored that EINTR is also possible on some Unix filesystems,
2066  * in which case immediate retry is indicated.
2067  */
2068 #ifdef WIN32
2069  DWORD error = GetLastError();
2070 
2071  switch (error)
2072  {
2073  case ERROR_NO_SYSTEM_RESOURCES:
2074  pg_usleep(1000L);
2075  errno = EINTR;
2076  break;
2077  default:
2078  _dosmaperr(error);
2079  break;
2080  }
2081 #endif
2082  /* OK to retry if interrupted */
2083  if (errno == EINTR)
2084  goto retry;
2085  }
2086 
2087  return returnCode;
2088 }
2089 
2090 int
2091 FileWrite(File file, const void *buffer, size_t amount, off_t offset,
2092  uint32 wait_event_info)
2093 {
2094  int returnCode;
2095  Vfd *vfdP;
2096 
2097  Assert(FileIsValid(file));
2098 
2099  DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
2100  file, VfdCache[file].fileName,
2101  (int64) offset,
2102  amount, buffer));
2103 
2104  returnCode = FileAccess(file);
2105  if (returnCode < 0)
2106  return returnCode;
2107 
2108  vfdP = &VfdCache[file];
2109 
2110  /*
2111  * If enforcing temp_file_limit and it's a temp file, check to see if the
2112  * write would overrun temp_file_limit, and throw error if so. Note: it's
2113  * really a modularity violation to throw error here; we should set errno
2114  * and return -1. However, there's no way to report a suitable error
2115  * message if we do that. All current callers would just throw error
2116  * immediately anyway, so this is safe at present.
2117  */
2118  if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2119  {
2120  off_t past_write = offset + amount;
2121 
2122  if (past_write > vfdP->fileSize)
2123  {
2124  uint64 newTotal = temporary_files_size;
2125 
2126  newTotal += past_write - vfdP->fileSize;
2127  if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2128  ereport(ERROR,
2129  (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2130  errmsg("temporary file size exceeds temp_file_limit (%dkB)",
2131  temp_file_limit)));
2132  }
2133  }
2134 
2135 retry:
2136  errno = 0;
2137  pgstat_report_wait_start(wait_event_info);
2138  returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
2140 
2141  /* if write didn't set errno, assume problem is no disk space */
2142  if (returnCode != amount && errno == 0)
2143  errno = ENOSPC;
2144 
2145  if (returnCode >= 0)
2146  {
2147  /*
2148  * Maintain fileSize and temporary_files_size if it's a temp file.
2149  */
2150  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2151  {
2152  off_t past_write = offset + amount;
2153 
2154  if (past_write > vfdP->fileSize)
2155  {
2156  temporary_files_size += past_write - vfdP->fileSize;
2157  vfdP->fileSize = past_write;
2158  }
2159  }
2160  }
2161  else
2162  {
2163  /*
2164  * See comments in FileRead()
2165  */
2166 #ifdef WIN32
2167  DWORD error = GetLastError();
2168 
2169  switch (error)
2170  {
2171  case ERROR_NO_SYSTEM_RESOURCES:
2172  pg_usleep(1000L);
2173  errno = EINTR;
2174  break;
2175  default:
2176  _dosmaperr(error);
2177  break;
2178  }
2179 #endif
2180  /* OK to retry if interrupted */
2181  if (errno == EINTR)
2182  goto retry;
2183  }
2184 
2185  return returnCode;
2186 }
2187 
2188 int
2189 FileSync(File file, uint32 wait_event_info)
2190 {
2191  int returnCode;
2192 
2193  Assert(FileIsValid(file));
2194 
2195  DO_DB(elog(LOG, "FileSync: %d (%s)",
2196  file, VfdCache[file].fileName));
2197 
2198  returnCode = FileAccess(file);
2199  if (returnCode < 0)
2200  return returnCode;
2201 
2202  pgstat_report_wait_start(wait_event_info);
2203  returnCode = pg_fsync(VfdCache[file].fd);
2205 
2206  return returnCode;
2207 }
2208 
2209 off_t
2211 {
2212  Assert(FileIsValid(file));
2213 
2214  DO_DB(elog(LOG, "FileSize %d (%s)",
2215  file, VfdCache[file].fileName));
2216 
2217  if (FileIsNotOpen(file))
2218  {
2219  if (FileAccess(file) < 0)
2220  return (off_t) -1;
2221  }
2222 
2223  return lseek(VfdCache[file].fd, 0, SEEK_END);
2224 }
2225 
2226 int
2227 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2228 {
2229  int returnCode;
2230 
2231  Assert(FileIsValid(file));
2232 
2233  DO_DB(elog(LOG, "FileTruncate %d (%s)",
2234  file, VfdCache[file].fileName));
2235 
2236  returnCode = FileAccess(file);
2237  if (returnCode < 0)
2238  return returnCode;
2239 
2240  pgstat_report_wait_start(wait_event_info);
2241  returnCode = ftruncate(VfdCache[file].fd, offset);
2243 
2244  if (returnCode == 0 && VfdCache[file].fileSize > offset)
2245  {
2246  /* adjust our state for truncation of a temp file */
2247  Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2248  temporary_files_size -= VfdCache[file].fileSize - offset;
2249  VfdCache[file].fileSize = offset;
2250  }
2251 
2252  return returnCode;
2253 }
2254 
2255 /*
2256  * Return the pathname associated with an open file.
2257  *
2258  * The returned string points to an internal buffer, which is valid until
2259  * the file is closed.
2260  */
2261 char *
2263 {
2264  Assert(FileIsValid(file));
2265 
2266  return VfdCache[file].fileName;
2267 }
2268 
2269 /*
2270  * Return the raw file descriptor of an opened file.
2271  *
2272  * The returned file descriptor will be valid until the file is closed, but
2273  * there are a lot of things that can make that happen. So the caller should
2274  * be careful not to do much of anything else before it finishes using the
2275  * returned file descriptor.
2276  */
2277 int
2279 {
2280  Assert(FileIsValid(file));
2281  return VfdCache[file].fd;
2282 }
2283 
2284 /*
2285  * FileGetRawFlags - returns the file flags on open(2)
2286  */
2287 int
2289 {
2290  Assert(FileIsValid(file));
2291  return VfdCache[file].fileFlags;
2292 }
2293 
2294 /*
2295  * FileGetRawMode - returns the mode bitmask passed to open(2)
2296  */
2297 mode_t
2299 {
2300  Assert(FileIsValid(file));
2301  return VfdCache[file].fileMode;
2302 }
2303 
2304 /*
2305  * Make room for another allocatedDescs[] array entry if needed and possible.
2306  * Returns true if an array element is available.
2307  */
2308 static bool
2310 {
2311  AllocateDesc *newDescs;
2312  int newMax;
2313 
2314  /* Quick out if array already has a free slot. */
2316  return true;
2317 
2318  /*
2319  * If the array hasn't yet been created in the current process, initialize
2320  * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2321  * we will ever need, anyway. We don't want to look at max_safe_fds
2322  * immediately because set_max_safe_fds() may not have run yet.
2323  */
2324  if (allocatedDescs == NULL)
2325  {
2326  newMax = FD_MINFREE / 3;
2327  newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2328  /* Out of memory already? Treat as fatal error. */
2329  if (newDescs == NULL)
2330  ereport(ERROR,
2331  (errcode(ERRCODE_OUT_OF_MEMORY),
2332  errmsg("out of memory")));
2333  allocatedDescs = newDescs;
2334  maxAllocatedDescs = newMax;
2335  return true;
2336  }
2337 
2338  /*
2339  * Consider enlarging the array beyond the initial allocation used above.
2340  * By the time this happens, max_safe_fds should be known accurately.
2341  *
2342  * We mustn't let allocated descriptors hog all the available FDs, and in
2343  * practice we'd better leave a reasonable number of FDs for VFD use. So
2344  * set the maximum to max_safe_fds / 3. (This should certainly be at
2345  * least as large as the initial size, FD_MINFREE / 3, so we aren't
2346  * tightening the restriction here.) Recall that "external" FDs are
2347  * allowed to consume another third of max_safe_fds.
2348  */
2349  newMax = max_safe_fds / 3;
2350  if (newMax > maxAllocatedDescs)
2351  {
2352  newDescs = (AllocateDesc *) realloc(allocatedDescs,
2353  newMax * sizeof(AllocateDesc));
2354  /* Treat out-of-memory as a non-fatal error. */
2355  if (newDescs == NULL)
2356  return false;
2357  allocatedDescs = newDescs;
2358  maxAllocatedDescs = newMax;
2359  return true;
2360  }
2361 
2362  /* Can't enlarge allocatedDescs[] any more. */
2363  return false;
2364 }
2365 
2366 /*
2367  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2368  * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2369  * necessary to open the file. When done, call FreeFile rather than fclose.
2370  *
2371  * Note that files that will be open for any significant length of time
2372  * should NOT be handled this way, since they cannot share kernel file
2373  * descriptors with other files; there is grave risk of running out of FDs
2374  * if anyone locks down too many FDs. Most callers of this routine are
2375  * simply reading a config file that they will read and close immediately.
2376  *
2377  * fd.c will automatically close all files opened with AllocateFile at
2378  * transaction commit or abort; this prevents FD leakage if a routine
2379  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2380  *
2381  * Ideally this should be the *only* direct call of fopen() in the backend.
2382  */
2383 FILE *
2384 AllocateFile(const char *name, const char *mode)
2385 {
2386  FILE *file;
2387 
2388  DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2390 
2391  /* Can we allocate another non-virtual FD? */
2392  if (!reserveAllocatedDesc())
2393  ereport(ERROR,
2394  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2395  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2396  maxAllocatedDescs, name)));
2397 
2398  /* Close excess kernel FDs. */
2399  ReleaseLruFiles();
2400 
2401 TryAgain:
2402  if ((file = fopen(name, mode)) != NULL)
2403  {
2405 
2406  desc->kind = AllocateDescFile;
2407  desc->desc.file = file;
2410  return desc->desc.file;
2411  }
2412 
2413  if (errno == EMFILE || errno == ENFILE)
2414  {
2415  int save_errno = errno;
2416 
2417  ereport(LOG,
2418  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2419  errmsg("out of file descriptors: %m; release and retry")));
2420  errno = 0;
2421  if (ReleaseLruFile())
2422  goto TryAgain;
2423  errno = save_errno;
2424  }
2425 
2426  return NULL;
2427 }
2428 
2429 /*
2430  * Open a file with OpenTransientFilePerm() and pass default file mode for
2431  * the fileMode parameter.
2432  */
2433 int
2434 OpenTransientFile(const char *fileName, int fileFlags)
2435 {
2436  return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2437 }
2438 
2439 /*
2440  * Like AllocateFile, but returns an unbuffered fd like open(2)
2441  */
2442 int
2443 OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2444 {
2445  int fd;
2446 
2447  DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2448  numAllocatedDescs, fileName));
2449 
2450  /* Can we allocate another non-virtual FD? */
2451  if (!reserveAllocatedDesc())
2452  ereport(ERROR,
2453  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2454  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2455  maxAllocatedDescs, fileName)));
2456 
2457  /* Close excess kernel FDs. */
2458  ReleaseLruFiles();
2459 
2460  fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2461 
2462  if (fd >= 0)
2463  {
2465 
2466  desc->kind = AllocateDescRawFD;
2467  desc->desc.fd = fd;
2470 
2471  return fd;
2472  }
2473 
2474  return -1; /* failure */
2475 }
2476 
2477 /*
2478  * Routines that want to initiate a pipe stream should use OpenPipeStream
2479  * rather than plain popen(). This lets fd.c deal with freeing FDs if
2480  * necessary. When done, call ClosePipeStream rather than pclose.
2481  *
2482  * This function also ensures that the popen'd program is run with default
2483  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2484  * uses. This ensures desirable response to, eg, closing a read pipe early.
2485  */
2486 FILE *
2487 OpenPipeStream(const char *command, const char *mode)
2488 {
2489  FILE *file;
2490  int save_errno;
2491 
2492  DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2493  numAllocatedDescs, command));
2494 
2495  /* Can we allocate another non-virtual FD? */
2496  if (!reserveAllocatedDesc())
2497  ereport(ERROR,
2498  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2499  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2500  maxAllocatedDescs, command)));
2501 
2502  /* Close excess kernel FDs. */
2503  ReleaseLruFiles();
2504 
2505 TryAgain:
2506  fflush(NULL);
2508  errno = 0;
2509  file = popen(command, mode);
2510  save_errno = errno;
2512  errno = save_errno;
2513  if (file != NULL)
2514  {
2516 
2517  desc->kind = AllocateDescPipe;
2518  desc->desc.file = file;
2521  return desc->desc.file;
2522  }
2523 
2524  if (errno == EMFILE || errno == ENFILE)
2525  {
2526  ereport(LOG,
2527  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2528  errmsg("out of file descriptors: %m; release and retry")));
2529  if (ReleaseLruFile())
2530  goto TryAgain;
2531  errno = save_errno;
2532  }
2533 
2534  return NULL;
2535 }
2536 
2537 /*
2538  * Free an AllocateDesc of any type.
2539  *
2540  * The argument *must* point into the allocatedDescs[] array.
2541  */
2542 static int
2544 {
2545  int result;
2546 
2547  /* Close the underlying object */
2548  switch (desc->kind)
2549  {
2550  case AllocateDescFile:
2551  result = fclose(desc->desc.file);
2552  break;
2553  case AllocateDescPipe:
2554  result = pclose(desc->desc.file);
2555  break;
2556  case AllocateDescDir:
2557  result = closedir(desc->desc.dir);
2558  break;
2559  case AllocateDescRawFD:
2560  result = close(desc->desc.fd);
2561  break;
2562  default:
2563  elog(ERROR, "AllocateDesc kind not recognized");
2564  result = 0; /* keep compiler quiet */
2565  break;
2566  }
2567 
2568  /* Compact storage in the allocatedDescs array */
2571 
2572  return result;
2573 }
2574 
2575 /*
2576  * Close a file returned by AllocateFile.
2577  *
2578  * Note we do not check fclose's return value --- it is up to the caller
2579  * to handle close errors.
2580  */
2581 int
2582 FreeFile(FILE *file)
2583 {
2584  int i;
2585 
2586  DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2587 
2588  /* Remove file from list of allocated files, if it's present */
2589  for (i = numAllocatedDescs; --i >= 0;)
2590  {
2591  AllocateDesc *desc = &allocatedDescs[i];
2592 
2593  if (desc->kind == AllocateDescFile && desc->desc.file == file)
2594  return FreeDesc(desc);
2595  }
2596 
2597  /* Only get here if someone passes us a file not in allocatedDescs */
2598  elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2599 
2600  return fclose(file);
2601 }
2602 
2603 /*
2604  * Close a file returned by OpenTransientFile.
2605  *
2606  * Note we do not check close's return value --- it is up to the caller
2607  * to handle close errors.
2608  */
2609 int
2611 {
2612  int i;
2613 
2614  DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2615 
2616  /* Remove fd from list of allocated files, if it's present */
2617  for (i = numAllocatedDescs; --i >= 0;)
2618  {
2619  AllocateDesc *desc = &allocatedDescs[i];
2620 
2621  if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2622  return FreeDesc(desc);
2623  }
2624 
2625  /* Only get here if someone passes us a file not in allocatedDescs */
2626  elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2627 
2628  return close(fd);
2629 }
2630 
2631 /*
2632  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2633  * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2634  * necessary to open the directory, and with closing it after an elog.
2635  * When done, call FreeDir rather than closedir.
2636  *
2637  * Returns NULL, with errno set, on failure. Note that failure detection
2638  * is commonly left to the following call of ReadDir or ReadDirExtended;
2639  * see the comments for ReadDir.
2640  *
2641  * Ideally this should be the *only* direct call of opendir() in the backend.
2642  */
2643 DIR *
2644 AllocateDir(const char *dirname)
2645 {
2646  DIR *dir;
2647 
2648  DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2649  numAllocatedDescs, dirname));
2650 
2651  /* Can we allocate another non-virtual FD? */
2652  if (!reserveAllocatedDesc())
2653  ereport(ERROR,
2654  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2655  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2656  maxAllocatedDescs, dirname)));
2657 
2658  /* Close excess kernel FDs. */
2659  ReleaseLruFiles();
2660 
2661 TryAgain:
2662  if ((dir = opendir(dirname)) != NULL)
2663  {
2665 
2666  desc->kind = AllocateDescDir;
2667  desc->desc.dir = dir;
2670  return desc->desc.dir;
2671  }
2672 
2673  if (errno == EMFILE || errno == ENFILE)
2674  {
2675  int save_errno = errno;
2676 
2677  ereport(LOG,
2678  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2679  errmsg("out of file descriptors: %m; release and retry")));
2680  errno = 0;
2681  if (ReleaseLruFile())
2682  goto TryAgain;
2683  errno = save_errno;
2684  }
2685 
2686  return NULL;
2687 }
2688 
2689 /*
2690  * Read a directory opened with AllocateDir, ereport'ing any error.
2691  *
2692  * This is easier to use than raw readdir() since it takes care of some
2693  * otherwise rather tedious and error-prone manipulation of errno. Also,
2694  * if you are happy with a generic error message for AllocateDir failure,
2695  * you can just do
2696  *
2697  * dir = AllocateDir(path);
2698  * while ((dirent = ReadDir(dir, path)) != NULL)
2699  * process dirent;
2700  * FreeDir(dir);
2701  *
2702  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2703  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2704  * use this shortcut.)
2705  *
2706  * The pathname passed to AllocateDir must be passed to this routine too,
2707  * but it is only used for error reporting.
2708  */
2709 struct dirent *
2710 ReadDir(DIR *dir, const char *dirname)
2711 {
2712  return ReadDirExtended(dir, dirname, ERROR);
2713 }
2714 
2715 /*
2716  * Alternate version of ReadDir that allows caller to specify the elevel
2717  * for any error report (whether it's reporting an initial failure of
2718  * AllocateDir or a subsequent directory read failure).
2719  *
2720  * If elevel < ERROR, returns NULL after any error. With the normal coding
2721  * pattern, this will result in falling out of the loop immediately as
2722  * though the directory contained no (more) entries.
2723  */
2724 struct dirent *
2725 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2726 {
2727  struct dirent *dent;
2728 
2729  /* Give a generic message for AllocateDir failure, if caller didn't */
2730  if (dir == NULL)
2731  {
2732  ereport(elevel,
2734  errmsg("could not open directory \"%s\": %m",
2735  dirname)));
2736  return NULL;
2737  }
2738 
2739  errno = 0;
2740  if ((dent = readdir(dir)) != NULL)
2741  return dent;
2742 
2743  if (errno)
2744  ereport(elevel,
2746  errmsg("could not read directory \"%s\": %m",
2747  dirname)));
2748  return NULL;
2749 }
2750 
2751 /*
2752  * Close a directory opened with AllocateDir.
2753  *
2754  * Returns closedir's return value (with errno set if it's not 0).
2755  * Note we do not check the return value --- it is up to the caller
2756  * to handle close errors if wanted.
2757  *
2758  * Does nothing if dir == NULL; we assume that directory open failure was
2759  * already reported if desired.
2760  */
2761 int
2763 {
2764  int i;
2765 
2766  /* Nothing to do if AllocateDir failed */
2767  if (dir == NULL)
2768  return 0;
2769 
2770  DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2771 
2772  /* Remove dir from list of allocated dirs, if it's present */
2773  for (i = numAllocatedDescs; --i >= 0;)
2774  {
2775  AllocateDesc *desc = &allocatedDescs[i];
2776 
2777  if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2778  return FreeDesc(desc);
2779  }
2780 
2781  /* Only get here if someone passes us a dir not in allocatedDescs */
2782  elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2783 
2784  return closedir(dir);
2785 }
2786 
2787 
2788 /*
2789  * Close a pipe stream returned by OpenPipeStream.
2790  */
2791 int
2792 ClosePipeStream(FILE *file)
2793 {
2794  int i;
2795 
2796  DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2797 
2798  /* Remove file from list of allocated files, if it's present */
2799  for (i = numAllocatedDescs; --i >= 0;)
2800  {
2801  AllocateDesc *desc = &allocatedDescs[i];
2802 
2803  if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2804  return FreeDesc(desc);
2805  }
2806 
2807  /* Only get here if someone passes us a file not in allocatedDescs */
2808  elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2809 
2810  return pclose(file);
2811 }
2812 
2813 /*
2814  * closeAllVfds
2815  *
2816  * Force all VFDs into the physically-closed state, so that the fewest
2817  * possible number of kernel file descriptors are in use. There is no
2818  * change in the logical state of the VFDs.
2819  */
2820 void
2822 {
2823  Index i;
2824 
2825  if (SizeVfdCache > 0)
2826  {
2827  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2828  for (i = 1; i < SizeVfdCache; i++)
2829  {
2830  if (!FileIsNotOpen(i))
2831  LruDelete(i);
2832  }
2833  }
2834 }
2835 
2836 
2837 /*
2838  * SetTempTablespaces
2839  *
2840  * Define a list (actually an array) of OIDs of tablespaces to use for
2841  * temporary files. This list will be used until end of transaction,
2842  * unless this function is called again before then. It is caller's
2843  * responsibility that the passed-in array has adequate lifespan (typically
2844  * it'd be allocated in TopTransactionContext).
2845  *
2846  * Some entries of the array may be InvalidOid, indicating that the current
2847  * database's default tablespace should be used.
2848  */
2849 void
2850 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2851 {
2852  Assert(numSpaces >= 0);
2853  tempTableSpaces = tableSpaces;
2854  numTempTableSpaces = numSpaces;
2855 
2856  /*
2857  * Select a random starting point in the list. This is to minimize
2858  * conflicts between backends that are most likely sharing the same list
2859  * of temp tablespaces. Note that if we create multiple temp files in the
2860  * same transaction, we'll advance circularly through the list --- this
2861  * ensures that large temporary sort files are nicely spread across all
2862  * available tablespaces.
2863  */
2864  if (numSpaces > 1)
2866  0, numSpaces - 1);
2867  else
2868  nextTempTableSpace = 0;
2869 }
2870 
2871 /*
2872  * TempTablespacesAreSet
2873  *
2874  * Returns true if SetTempTablespaces has been called in current transaction.
2875  * (This is just so that tablespaces.c doesn't need its own per-transaction
2876  * state.)
2877  */
2878 bool
2880 {
2881  return (numTempTableSpaces >= 0);
2882 }
2883 
2884 /*
2885  * GetTempTablespaces
2886  *
2887  * Populate an array with the OIDs of the tablespaces that should be used for
2888  * temporary files. (Some entries may be InvalidOid, indicating that the
2889  * current database's default tablespace should be used.) At most numSpaces
2890  * entries will be filled.
2891  * Returns the number of OIDs that were copied into the output array.
2892  */
2893 int
2894 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2895 {
2896  int i;
2897 
2899  for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2900  tableSpaces[i] = tempTableSpaces[i];
2901 
2902  return i;
2903 }
2904 
2905 /*
2906  * GetNextTempTableSpace
2907  *
2908  * Select the next temp tablespace to use. A result of InvalidOid means
2909  * to use the current database's default tablespace.
2910  */
2911 Oid
2913 {
2914  if (numTempTableSpaces > 0)
2915  {
2916  /* Advance nextTempTableSpace counter with wraparound */
2918  nextTempTableSpace = 0;
2920  }
2921  return InvalidOid;
2922 }
2923 
2924 
2925 /*
2926  * AtEOSubXact_Files
2927  *
2928  * Take care of subtransaction commit/abort. At abort, we close temp files
2929  * that the subtransaction may have opened. At commit, we reassign the
2930  * files that were opened to the parent subtransaction.
2931  */
2932 void
2933 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2934  SubTransactionId parentSubid)
2935 {
2936  Index i;
2937 
2938  for (i = 0; i < numAllocatedDescs; i++)
2939  {
2940  if (allocatedDescs[i].create_subid == mySubid)
2941  {
2942  if (isCommit)
2943  allocatedDescs[i].create_subid = parentSubid;
2944  else
2945  {
2946  /* have to recheck the item after FreeDesc (ugly) */
2947  FreeDesc(&allocatedDescs[i--]);
2948  }
2949  }
2950  }
2951 }
2952 
2953 /*
2954  * AtEOXact_Files
2955  *
2956  * This routine is called during transaction commit or abort. All still-open
2957  * per-transaction temporary file VFDs are closed, which also causes the
2958  * underlying files to be deleted (although they should've been closed already
2959  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
2960  * closed. We also forget any transaction-local temp tablespace list.
2961  *
2962  * The isCommit flag is used only to decide whether to emit warnings about
2963  * unclosed files.
2964  */
2965 void
2966 AtEOXact_Files(bool isCommit)
2967 {
2968  CleanupTempFiles(isCommit, false);
2969  tempTableSpaces = NULL;
2970  numTempTableSpaces = -1;
2971 }
2972 
2973 /*
2974  * BeforeShmemExit_Files
2975  *
2976  * before_shmem_access hook to clean up temp files during backend shutdown.
2977  * Here, we want to clean up *all* temp files including interXact ones.
2978  */
2979 static void
2981 {
2982  CleanupTempFiles(false, true);
2983 
2984  /* prevent further temp files from being created */
2985 #ifdef USE_ASSERT_CHECKING
2986  temporary_files_allowed = false;
2987 #endif
2988 }
2989 
2990 /*
2991  * Close temporary files and delete their underlying files.
2992  *
2993  * isCommit: if true, this is normal transaction commit, and we don't
2994  * expect any remaining files; warn if there are some.
2995  *
2996  * isProcExit: if true, this is being called as the backend process is
2997  * exiting. If that's the case, we should remove all temporary files; if
2998  * that's not the case, we are being called for transaction commit/abort
2999  * and should only remove transaction-local temp files. In either case,
3000  * also clean up "allocated" stdio files, dirs and fds.
3001  */
3002 static void
3003 CleanupTempFiles(bool isCommit, bool isProcExit)
3004 {
3005  Index i;
3006 
3007  /*
3008  * Careful here: at proc_exit we need extra cleanup, not just
3009  * xact_temporary files.
3010  */
3011  if (isProcExit || have_xact_temporary_files)
3012  {
3013  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3014  for (i = 1; i < SizeVfdCache; i++)
3015  {
3016  unsigned short fdstate = VfdCache[i].fdstate;
3017 
3018  if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3019  VfdCache[i].fileName != NULL)
3020  {
3021  /*
3022  * If we're in the process of exiting a backend process, close
3023  * all temporary files. Otherwise, only close temporary files
3024  * local to the current transaction. They should be closed by
3025  * the ResourceOwner mechanism already, so this is just a
3026  * debugging cross-check.
3027  */
3028  if (isProcExit)
3029  FileClose(i);
3030  else if (fdstate & FD_CLOSE_AT_EOXACT)
3031  {
3032  elog(WARNING,
3033  "temporary file %s not closed at end-of-transaction",
3034  VfdCache[i].fileName);
3035  FileClose(i);
3036  }
3037  }
3038  }
3039 
3040  have_xact_temporary_files = false;
3041  }
3042 
3043  /* Complain if any allocated files remain open at commit. */
3044  if (isCommit && numAllocatedDescs > 0)
3045  elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3047 
3048  /* Clean up "allocated" stdio files, dirs and fds. */
3049  while (numAllocatedDescs > 0)
3050  FreeDesc(&allocatedDescs[0]);
3051 }
3052 
3053 
3054 /*
3055  * Remove temporary and temporary relation files left over from a prior
3056  * postmaster session
3057  *
3058  * This should be called during postmaster startup. It will forcibly
3059  * remove any leftover files created by OpenTemporaryFile and any leftover
3060  * temporary relation files created by mdcreate.
3061  *
3062  * During post-backend-crash restart cycle, this routine is called when
3063  * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3064  * queries are using temp files could result in useless storage usage that can
3065  * only be reclaimed by a service restart. The argument against enabling it is
3066  * that someone might want to examine the temporary files for debugging
3067  * purposes. This does however mean that OpenTemporaryFile had better allow for
3068  * collision with an existing temp file name.
3069  *
3070  * NOTE: this function and its subroutines generally report syscall failures
3071  * with ereport(LOG) and keep going. Removing temp files is not so critical
3072  * that we should fail to start the database when we can't do it.
3073  */
3074 void
3076 {
3077  char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3078  DIR *spc_dir;
3079  struct dirent *spc_de;
3080 
3081  /*
3082  * First process temp files in pg_default ($PGDATA/base)
3083  */
3084  snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3085  RemovePgTempFilesInDir(temp_path, true, false);
3086  RemovePgTempRelationFiles("base");
3087 
3088  /*
3089  * Cycle through temp directories for all non-default tablespaces.
3090  */
3091  spc_dir = AllocateDir("pg_tblspc");
3092 
3093  while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
3094  {
3095  if (strcmp(spc_de->d_name, ".") == 0 ||
3096  strcmp(spc_de->d_name, "..") == 0)
3097  continue;
3098 
3099  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
3101  RemovePgTempFilesInDir(temp_path, true, false);
3102 
3103  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
3105  RemovePgTempRelationFiles(temp_path);
3106  }
3107 
3108  FreeDir(spc_dir);
3109 
3110  /*
3111  * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3112  * DataDir as well. However, that is *not* cleaned here because doing so
3113  * would create a race condition. It's done separately, earlier in
3114  * postmaster startup.
3115  */
3116 }
3117 
3118 /*
3119  * Process one pgsql_tmp directory for RemovePgTempFiles.
3120  *
3121  * If missing_ok is true, it's all right for the named directory to not exist.
3122  * Any other problem results in a LOG message. (missing_ok should be true at
3123  * the top level, since pgsql_tmp directories are not created until needed.)
3124  *
3125  * At the top level, this should be called with unlink_all = false, so that
3126  * only files matching the temporary name prefix will be unlinked. When
3127  * recursing it will be called with unlink_all = true to unlink everything
3128  * under a top-level temporary directory.
3129  *
3130  * (These two flags could be replaced by one, but it seems clearer to keep
3131  * them separate.)
3132  */
3133 void
3134 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3135 {
3136  DIR *temp_dir;
3137  struct dirent *temp_de;
3138  char rm_path[MAXPGPATH * 2];
3139 
3140  temp_dir = AllocateDir(tmpdirname);
3141 
3142  if (temp_dir == NULL && errno == ENOENT && missing_ok)
3143  return;
3144 
3145  while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3146  {
3147  if (strcmp(temp_de->d_name, ".") == 0 ||
3148  strcmp(temp_de->d_name, "..") == 0)
3149  continue;
3150 
3151  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3152  tmpdirname, temp_de->d_name);
3153 
3154  if (unlink_all ||
3155  strncmp(temp_de->d_name,
3157  strlen(PG_TEMP_FILE_PREFIX)) == 0)
3158  {
3159  PGFileType type = get_dirent_type(rm_path, temp_de, false, LOG);
3160 
3161  if (type == PGFILETYPE_ERROR)
3162  continue;
3163  else if (type == PGFILETYPE_DIR)
3164  {
3165  /* recursively remove contents, then directory itself */
3166  RemovePgTempFilesInDir(rm_path, false, true);
3167 
3168  if (rmdir(rm_path) < 0)
3169  ereport(LOG,
3171  errmsg("could not remove directory \"%s\": %m",
3172  rm_path)));
3173  }
3174  else
3175  {
3176  if (unlink(rm_path) < 0)
3177  ereport(LOG,
3179  errmsg("could not remove file \"%s\": %m",
3180  rm_path)));
3181  }
3182  }
3183  else
3184  ereport(LOG,
3185  (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3186  rm_path)));
3187  }
3188 
3189  FreeDir(temp_dir);
3190 }
3191 
3192 /* Process one tablespace directory, look for per-DB subdirectories */
3193 static void
3194 RemovePgTempRelationFiles(const char *tsdirname)
3195 {
3196  DIR *ts_dir;
3197  struct dirent *de;
3198  char dbspace_path[MAXPGPATH * 2];
3199 
3200  ts_dir = AllocateDir(tsdirname);
3201 
3202  while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3203  {
3204  /*
3205  * We're only interested in the per-database directories, which have
3206  * numeric names. Note that this code will also (properly) ignore "."
3207  * and "..".
3208  */
3209  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3210  continue;
3211 
3212  snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3213  tsdirname, de->d_name);
3214  RemovePgTempRelationFilesInDbspace(dbspace_path);
3215  }
3216 
3217  FreeDir(ts_dir);
3218 }
3219 
3220 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3221 static void
3222 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3223 {
3224  DIR *dbspace_dir;
3225  struct dirent *de;
3226  char rm_path[MAXPGPATH * 2];
3227 
3228  dbspace_dir = AllocateDir(dbspacedirname);
3229 
3230  while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3231  {
3232  if (!looks_like_temp_rel_name(de->d_name))
3233  continue;
3234 
3235  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3236  dbspacedirname, de->d_name);
3237 
3238  if (unlink(rm_path) < 0)
3239  ereport(LOG,
3241  errmsg("could not remove file \"%s\": %m",
3242  rm_path)));
3243  }
3244 
3245  FreeDir(dbspace_dir);
3246 }
3247 
/*
 * looks_like_temp_rel_name
 *
 * Decide whether "name" has the form of a temporary relation file name:
 *
 *		t<digits>_<digits>
 *
 * optionally followed by _<forkname> and/or .<digits> (a segment number).
 * The fork name part is validated by forkname_chars(), whose positive
 * result is used here as the number of characters to skip.
 *
 * Returns true only if the whole string matches.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	int			pos;
	int			savepos;

	/* Must start with "t". */
	if (name[0] != 't')
		return false;

	/* Followed by a non-empty string of digits and then an underscore. */
	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (pos == 1 || name[pos] != '_')
		return false;

	/* Followed by another nonempty string of digits. */
	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (savepos == pos)
		return false;

	/* We might have _forkname or .segment or both. */
	if (name[pos] == '_')
	{
		int			forkchar = forkname_chars(&name[pos + 1], NULL);

		if (forkchar <= 0)
			return false;
		/* skip the underscore plus the fork name */
		pos += forkchar + 1;
	}
	if (name[pos] == '.')
	{
		int			segchar;

		/* segchar counts the dot plus the digits after it */
		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
			;
		if (segchar <= 1)
			return false;
		pos += segchar;
	}

	/* Now we should be at the end. */
	if (name[pos] != '\0')
		return false;
	return true;
}
3296 
#ifdef HAVE_SYNCFS
/*
 * Call syncfs() on the file system containing "path".
 *
 * Progress is reported via ereport_startup_progress().  Failure to open the
 * path or to sync is reported at LOG level and is not treated as fatal.
 */
static void
do_syncfs(const char *path)
{
	int			fd;

	ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
							 path);

	fd = OpenTransientFile(path, O_RDONLY);
	if (fd < 0)
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
		return;
	}
	if (syncfs(fd) < 0)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not synchronize file system for file \"%s\": %m", path)));

	/* Release the descriptor whether or not syncfs() succeeded. */
	CloseTransientFile(fd);
}
#endif
3321 
3322 /*
3323  * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3324  * all potential filesystem, depending on recovery_init_sync_method setting.
3325  *
3326  * We fsync regular files and directories wherever they are, but we
3327  * follow symlinks only for pg_wal and immediately under pg_tblspc.
3328  * Other symlinks are presumed to point at files we're not responsible
3329  * for fsyncing, and might not have privileges to write at all.
3330  *
3331  * Errors are logged but not considered fatal; that's because this is used
3332  * only during database startup, to deal with the possibility that there are
3333  * issued-but-unsynced writes pending against the data directory. We want to
3334  * ensure that such writes reach disk before anything that's done in the new
3335  * run. However, aborting on error would result in failure to start for
3336  * harmless cases such as read-only files in the data directory, and that's
3337  * not good either.
3338  *
3339  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3340  * rewriting all changes again during recovery.
3341  *
3342  * Note we assume we're chdir'd into PGDATA to begin with.
3343  */
3344 void
3346 {
3347  bool xlog_is_symlink;
3348 
3349  /* We can skip this whole thing if fsync is disabled. */
3350  if (!enableFsync)
3351  return;
3352 
3353  /*
3354  * If pg_wal is a symlink, we'll need to recurse into it separately,
3355  * because the first walkdir below will ignore it.
3356  */
3357  xlog_is_symlink = false;
3358 
3359  {
3360  struct stat st;
3361 
3362  if (lstat("pg_wal", &st) < 0)
3363  ereport(LOG,
3365  errmsg("could not stat file \"%s\": %m",
3366  "pg_wal")));
3367  else if (S_ISLNK(st.st_mode))
3368  xlog_is_symlink = true;
3369  }
3370 
3371 #ifdef HAVE_SYNCFS
3373  {
3374  DIR *dir;
3375  struct dirent *de;
3376 
3377  /*
3378  * On Linux, we don't have to open every single file one by one. We
3379  * can use syncfs() to sync whole filesystems. We only expect
3380  * filesystem boundaries to exist where we tolerate symlinks, namely
3381  * pg_wal and the tablespaces, so we call syncfs() for each of those
3382  * directories.
3383  */
3384 
3385  /* Prepare to report progress syncing the data directory via syncfs. */
3387 
3388  /* Sync the top level pgdata directory. */
3389  do_syncfs(".");
3390  /* If any tablespaces are configured, sync each of those. */
3391  dir = AllocateDir("pg_tblspc");
3392  while ((de = ReadDirExtended(dir, "pg_tblspc", LOG)))
3393  {
3394  char path[MAXPGPATH];
3395 
3396  if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3397  continue;
3398 
3399  snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
3400  do_syncfs(path);
3401  }
3402  FreeDir(dir);
3403  /* If pg_wal is a symlink, process that too. */
3404  if (xlog_is_symlink)
3405  do_syncfs("pg_wal");
3406  return;
3407  }
3408 #endif /* !HAVE_SYNCFS */
3409 
3410 #ifdef PG_FLUSH_DATA_WORKS
3411  /* Prepare to report progress of the pre-fsync phase. */
3413 
3414  /*
3415  * If possible, hint to the kernel that we're soon going to fsync the data
3416  * directory and its contents. Errors in this step are even less
3417  * interesting than normal, so log them only at DEBUG1.
3418  */
3419  walkdir(".", pre_sync_fname, false, DEBUG1);
3420  if (xlog_is_symlink)
3421  walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3422  walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3423 #endif
3424 
3425  /* Prepare to report progress syncing the data directory via fsync. */
3427 
3428  /*
3429  * Now we do the fsync()s in the same order.
3430  *
3431  * The main call ignores symlinks, so in addition to specially processing
3432  * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3433  * process_symlinks = true. Note that if there are any plain directories
3434  * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3435  * so we don't worry about optimizing it.
3436  */
3437  walkdir(".", datadir_fsync_fname, false, LOG);
3438  if (xlog_is_symlink)
3439  walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3440  walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3441 }
3442 
3443 /*
3444  * walkdir: recursively walk a directory, applying the action to each
3445  * regular file and directory (including the named directory itself).
3446  *
3447  * If process_symlinks is true, the action and recursion are also applied
3448  * to regular files and directories that are pointed to by symlinks in the
3449  * given directory; otherwise symlinks are ignored. Symlinks are always
3450  * ignored in subdirectories, ie we intentionally don't pass down the
3451  * process_symlinks flag to recursive calls.
3452  *
3453  * Errors are reported at level elevel, which might be ERROR or less.
3454  *
3455  * See also walkdir in file_utils.c, which is a frontend version of this
3456  * logic.
3457  */
3458 static void
3459 walkdir(const char *path,
3460  void (*action) (const char *fname, bool isdir, int elevel),
3461  bool process_symlinks,
3462  int elevel)
3463 {
3464  DIR *dir;
3465  struct dirent *de;
3466 
3467  dir = AllocateDir(path);
3468 
3469  while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3470  {
3471  char subpath[MAXPGPATH * 2];
3472 
3474 
3475  if (strcmp(de->d_name, ".") == 0 ||
3476  strcmp(de->d_name, "..") == 0)
3477  continue;
3478 
3479  snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3480 
3481  switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3482  {
3483  case PGFILETYPE_REG:
3484  (*action) (subpath, false, elevel);
3485  break;
3486  case PGFILETYPE_DIR:
3487  walkdir(subpath, action, false, elevel);
3488  break;
3489  default:
3490 
3491  /*
3492  * Errors are already reported directly by get_dirent_type(),
3493  * and any remaining symlinks and unknown file types are
3494  * ignored.
3495  */
3496  break;
3497  }
3498  }
3499 
3500  FreeDir(dir); /* we ignore any error here */
3501 
3502  /*
3503  * It's important to fsync the destination directory itself as individual
3504  * file fsyncs don't guarantee that the directory entry for the file is
3505  * synced. However, skip this if AllocateDir failed; the action function
3506  * might not be robust against that.
3507  */
3508  if (dir)
3509  (*action) (path, true, elevel);
3510 }
3511 
3512 
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Ignores errors trying to open unreadable files, and logs other errors at a
 * caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* Permission failures are skipped silently; anything else is logged. */
		if (errno == EACCES)
			return;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3558 
/*
 * walkdir action used when syncing the data directory: report progress for
 * fname, then fsync it via fsync_fname_ext().
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * Errors about unreadable files should be silently ignored here, so ask
	 * fsync_fname_ext() to do that.
	 */
	const bool	ignore_perm = true;

	ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	/* The result is intentionally discarded. */
	(void) fsync_fname_ext(fname, isdir, ignore_perm, elevel);
}
3571 
/*
 * Remove fname: rmdir() it when isdir is true, otherwise delete it through
 * PathNameDeleteTemporaryFile().
 *
 * For directories, a failure other than ENOENT is reported at elevel.  The
 * signature matches the action callback type taken by walkdir().
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		/* A directory that is already gone is not an error. */
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}
3588 
3589 /*
3590  * fsync_fname_ext -- Try to fsync a file or directory
3591  *
3592  * If ignore_perm is true, ignore errors upon trying to open unreadable
3593  * files. Logs other errors at a caller-specified level.
3594  *
3595  * Returns 0 if the operation succeeded, -1 otherwise.
3596  */
3597 int
3598 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3599 {
3600  int fd;
3601  int flags;
3602  int returncode;
3603 
3604  /*
3605  * Some OSs require directories to be opened read-only whereas other
3606  * systems don't allow us to fsync files opened read-only; so we need both
3607  * cases here. Using O_RDWR will cause us to fail to fsync files that are
3608  * not writable by our userid, but we assume that's OK.
3609  */
3610  flags = PG_BINARY;
3611  if (!isdir)
3612  flags |= O_RDWR;
3613  else
3614  flags |= O_RDONLY;
3615 
3616  fd = OpenTransientFile(fname, flags);
3617 
3618  /*
3619  * Some OSs don't allow us to open directories at all (Windows returns
3620  * EACCES), just ignore the error in that case. If desired also silently
3621  * ignoring errors about unreadable files. Log others.
3622  */
3623  if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3624  return 0;
3625  else if (fd < 0 && ignore_perm && errno == EACCES)
3626  return 0;
3627  else if (fd < 0)
3628  {
3629  ereport(elevel,
3631  errmsg("could not open file \"%s\": %m", fname)));
3632  return -1;
3633  }
3634 
3635  returncode = pg_fsync(fd);
3636 
3637  /*
3638  * Some OSes don't allow us to fsync directories at all, so we can ignore
3639  * those errors. Anything else needs to be logged.
3640  */
3641  if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3642  {
3643  int save_errno;
3644 
3645  /* close file upon error, might not be in transaction context */
3646  save_errno = errno;
3647  (void) CloseTransientFile(fd);
3648  errno = save_errno;
3649 
3650  ereport(elevel,
3652  errmsg("could not fsync file \"%s\": %m", fname)));
3653  return -1;
3654  }
3655 
3656  if (CloseTransientFile(fd) != 0)
3657  {
3658  ereport(elevel,
3660  errmsg("could not close file \"%s\": %m", fname)));
3661  return -1;
3662  }
3663 
3664  return 0;
3665 }
3666 
3667 /*
3668  * fsync_parent_path -- fsync the parent path of a file or directory
3669  *
3670  * This is aimed at making file operations persistent on disk in case of
3671  * an OS crash or power failure.
3672  */
3673 static int
3674 fsync_parent_path(const char *fname, int elevel)
3675 {
3676  char parentpath[MAXPGPATH];
3677 
3678  strlcpy(parentpath, fname, MAXPGPATH);
3679  get_parent_directory(parentpath);
3680 
3681  /*
3682  * get_parent_directory() returns an empty string if the input argument is
3683  * just a file name (see comments in path.c), so handle that as being the
3684  * current directory.
3685  */
3686  if (strlen(parentpath) == 0)
3687  strlcpy(parentpath, ".", MAXPGPATH);
3688 
3689  if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3690  return -1;
3691 
3692  return 0;
3693 }
3694 
3695 /*
3696  * Create a PostgreSQL data sub-directory
3697  *
3698  * The data directory itself, and most of its sub-directories, are created at
3699  * initdb time, but we do have some occasions when we create directories in
3700  * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3701  * make sure that those directories are created consistently. Today, that means
3702  * making sure that the created directory has the correct permissions, which is
3703  * what pg_dir_create_mode tracks for us.
3704  *
3705  * Note that we also set the umask() based on what we understand the correct
3706  * permissions to be (see file_perm.c).
3707  *
3708  * For permissions other than the default, mkdir() can be used directly, but
3709  * be sure to consider carefully such cases -- a sub-directory with incorrect
3710  * permissions in a PostgreSQL data directory could cause backups and other
3711  * processes to fail.
3712  */
3713 int
3714 MakePGDirectory(const char *directoryName)
3715 {
3716  return mkdir(directoryName, pg_dir_create_mode);
3717 }
3718 
3719 /*
3720  * Return the passed-in error level, or PANIC if data_sync_retry is off.
3721  *
3722  * Failure to fsync any data file is cause for immediate panic, unless
3723  * data_sync_retry is enabled. Data may have been written to the operating
3724  * system and removed from our buffer pool already, and if we are running on
3725  * an operating system that forgets dirty data on write-back failure, there
3726  * may be only one copy of the data remaining: in the WAL. A later attempt to
3727  * fsync again might falsely report success. Therefore we must not allow any
3728  * further checkpoints to be attempted. data_sync_retry can in theory be
3729  * enabled on systems known not to drop dirty buffered data on write-back
3730  * failure (with the likely outcome that checkpoints will continue to fail
3731  * until the underlying problem is fixed).
3732  *
3733  * Any code that reports a failure from fsync() or related functions should
3734  * filter the error level with this function.
3735  */
3736 int
3737 data_sync_elevel(int elevel)
3738 {
3739  return data_sync_retry ? elevel : PANIC;
3740 }
void begin_startup_progress_phase(void)
Definition: startup.c:352
unsigned int uint32
Definition: c.h:490
#define Min(x, y)
Definition: c.h:988
uint32 SubTransactionId
Definition: c.h:640
#define INT64_FORMAT
Definition: c.h:532
#define PG_BINARY
Definition: c.h:1260
unsigned int Index
Definition: c.h:598
#define MemSet(start, val, len)
Definition: c.h:1004
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:922
int fdatasync(int fildes)
#define OidIsValid(objectId)
Definition: c.h:759
size_t Size
Definition: c.h:589
int closedir(DIR *)
Definition: dirent.c:127
struct dirent * readdir(DIR *)
Definition: dirent.c:78
DIR * opendir(const char *)
Definition: dirent.c:33
int errcode_for_file_access(void)
Definition: elog.c:881
int errdetail(const char *fmt,...)
Definition: elog.c:1202
int errcode(int sqlerrcode)
Definition: elog.c:858
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define LOG
Definition: elog.h:31
#define FATAL
Definition: elog.h:41
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define ereport(elevel,...)
Definition: elog.h:149
const char * name
Definition: encode.c:571
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2710
int max_files_per_process
Definition: fd.c:144
void pg_flush_data(int fd, off_t offset, off_t nbytes)
Definition: fd.c:458
int FileGetRawDesc(File file)
Definition: fd.c:2278
int MakePGDirectory(const char *directoryName)
Definition: fd.c:3714
int FreeDir(DIR *dir)
Definition: fd.c:2762
int recovery_init_sync_method
Definition: fd.c:163
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:2012
int pg_fsync_no_writethrough(int fd)
Definition: fd.c:411
#define FD_MINFREE
Definition: fd.c:136
static int numTempTableSpaces
Definition: fd.c:284
static bool ReleaseLruFile(void)
Definition: fd.c:1288
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2384
#define FD_DELETE_AT_CLOSE
Definition: fd.c:187
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1015
static int maxAllocatedDescs
Definition: fd.c:263
static void Delete(File file)
Definition: fd.c:1174
static int FreeDesc(AllocateDesc *desc)
Definition: fd.c:2543
static long tempFileCounter
Definition: fd.c:275
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:688
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2894
static int numAllocatedDescs
Definition: fd.c:262
File PathNameOpenTemporaryFile(const char *path, int mode)
Definition: fd.c:1807
static void LruDelete(File file)
Definition: fd.c:1193
int pg_fdatasync(int fd)
Definition: fd.c:444
#define FileIsValid(file)
Definition: fd.c:181
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2189
static int nfile
Definition: fd.c:217
int CloseTransientFile(int fd)
Definition: fd.c:2610
#define DO_DB(A)
Definition: fd.c:175
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:993
void closeAllVfds(void)
Definition: fd.c:2821
int max_safe_fds
Definition: fd.c:157
static File AllocateVfd(void)
Definition: fd.c:1320
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1767
void PathNameDeleteTemporaryDir(const char *dirname)
Definition: fd.c:1597
int ClosePipeStream(FILE *file)
Definition: fd.c:2792
void AtEOXact_Files(bool isCommit)
Definition: fd.c:2966
int FileGetRawFlags(File file)
Definition: fd.c:2288
static Size SizeVfdCache
Definition: fd.c:212
static int nextTempTableSpace
Definition: fd.c:285
#define FD_CLOSE_AT_EOXACT
Definition: fd.c:188
int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition: fd.c:3598
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3573
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition: fd.c:3194
int FreeFile(FILE *file)
Definition: fd.c:2582
mode_t FileGetRawMode(File file)
Definition: fd.c:2298
static AllocateDesc * allocatedDescs
Definition: fd.c:264
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition: fd.c:870
static int FileAccess(File file)
Definition: fd.c:1398
static void FreeVfd(File file)
Definition: fd.c:1378
struct vfd Vfd
int pg_fsync_writethrough(int fd)
Definition: fd.c:423
void FileClose(File file)
Definition: fd.c:1884
FILE * OpenPipeStream(const char *command, const char *mode)
Definition: fd.c:2487
void ReleaseExternalFD(void)
Definition: fd.c:1145
#define FD_TEMP_FILE_LIMIT
Definition: fd.c:189
void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition: fd.c:3134
void RemovePgTempFiles(void)
Definition: fd.c:3075
#define FileIsNotOpen(file)
Definition: fd.c:184
bool TempTablespacesAreSet(void)
Definition: fd.c:2879
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:662
int FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:1984
int data_sync_elevel(int elevel)
Definition: fd.c:3737
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1481
static void Insert(File file)
Definition: fd.c:1219
AllocateDescKind
Definition: fd.c:243
@ AllocateDescDir
Definition: fd.c:246
@ AllocateDescPipe
Definition: fd.c:245
@ AllocateDescFile
Definition: fd.c:244
@ AllocateDescRawFD
Definition: fd.c:247
int FileWrite(File file, const void *buffer, size_t amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:2091
Oid GetNextTempTableSpace(void)
Definition: fd.c:2912
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1494
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3560
static void ReportTemporaryFileUsage(const char *path, off_t size)
Definition: fd.c:1434
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition: fd.c:1710
bool AcquireExternalFD(void)
Definition: fd.c:1092
static void RegisterTemporaryFile(File file)
Definition: fd.c:1453
int FileRead(File file, void *buffer, size_t amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:2035
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2725
#define NUM_RESERVED_FDS
Definition: fd.c:127
static Oid * tempTableSpaces
Definition: fd.c:283
static bool reserveAllocatedDesc(void)
Definition: fd.c:2309
void InitFileAccess(void)
Definition: fd.c:809
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition: fd.c:3222
File OpenTemporaryFile(bool interXact)
Definition: fd.c:1630
int durable_unlink(const char *fname, int elevel)
Definition: fd.c:778
static uint64 temporary_files_size
Definition: fd.c:231
void ReserveExternalFD(void)
Definition: fd.c:1127
char * FilePathName(File file)
Definition: fd.c:2262
bool looks_like_temp_rel_name(const char *name)
Definition: fd.c:3250
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1838
void set_max_safe_fds(void)
Definition: fd.c:950
int pg_fsync(int fd)
Definition: fd.c:356
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition: fd.c:3003
#define VFD_CLOSED
Definition: fd.c:179
static bool have_xact_temporary_files
Definition: fd.c:223
static int LruInsert(File file)
Definition: fd.c:1241
static int numExternalFDs
Definition: fd.c:269
static int fsync_parent_path(const char *fname, int elevel)
Definition: fd.c:3674
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition: fd.c:1566
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition: fd.c:2933
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2434
void InitTemporaryFileAccess(void)
Definition: fd.c:839
static Vfd * VfdCache
Definition: fd.c:211
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:2443
bool data_sync_retry
Definition: fd.c:160
static void ReleaseLruFiles(void)
Definition: fd.c:1310
void SyncDataDirectory(void)
Definition: fd.c:3345
off_t FileSize(File file)
Definition: fd.c:2210
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2227
static void BeforeShmemExit_Files(int code, Datum arg)
Definition: fd.c:2980
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition: fd.c:3459
int pg_truncate(const char *path, off_t length)
Definition: fd.c:631
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2850
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2644
void TempTablespacePath(char *path, Oid tablespace)
Definition: fd.c:1685
int File
Definition: fd.h:54
@ RECOVERY_INIT_SYNC_METHOD_SYNCFS
Definition: fd.h:51
@ RECOVERY_INIT_SYNC_METHOD_FSYNC
Definition: fd.h:50
#define PG_O_DIRECT
Definition: fd.h:93
int pg_file_create_mode
Definition: file_perm.c:19
int pg_dir_create_mode
Definition: file_perm.c:18
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:406
PGFileType
Definition: file_utils.h:19
@ PGFILETYPE_DIR
Definition: file_utils.h:23
@ PGFILETYPE_REG
Definition: file_utils.h:22
@ PGFILETYPE_ERROR
Definition: file_utils.h:20
int MyProcPid
Definition: globals.c:44
bool enableFsync
Definition: globals.c:123
Oid MyDatabaseTableSpace
Definition: globals.c:91
int temp_file_limit
Definition: guc_tables.c:505
int log_temp_files
Definition: guc_tables.c:499
#define realloc(a, b)
Definition: header.h:60
#define free(a)
Definition: header.h:65
#define malloc(a)
Definition: header.h:50
#define close(a)
Definition: win32.h:12
void before_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:333
int j
Definition: isn.c:74
int i
Definition: isn.c:73
static void const char fflush(stdout)
Assert(fmt[strlen(fmt) - 1] !='\n')
Datum subpath(PG_FUNCTION_ARGS)
Definition: ltree_op.c:241
void pfree(void *pointer)
Definition: mcxt.c:1436
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1456
void * palloc(Size size)
Definition: mcxt.c:1210
#define MAP_FAILED
Definition: mem.h:45
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:121
void * arg
static char * basedir
static PgChecksumMode mode
Definition: pg_checksums.c:65
#define PG_TEMP_FILES_DIR
Definition: pg_checksums.c:62
#define PG_TEMP_FILE_PREFIX
Definition: pg_checksums.c:63
#define MAXPGPATH
uint64 pg_prng_uint64_range(pg_prng_state *state, uint64 rmin, uint64 rmax)
Definition: pg_prng.c:144
pg_prng_state pg_global_prng_state
Definition: pg_prng.c:34
static char * buf
Definition: pg_test_fsync.c:67
char * tablespace
Definition: pgbench.c:226
void pgstat_report_tempfile(size_t filesize)
#define pg_pwrite
Definition: port.h:226
#define pg_pread
Definition: port.h:225
void get_parent_directory(char *path)
Definition: path.c:977
pqsigfunc pqsignal(int signo, pqsigfunc func)
#define snprintf
Definition: port.h:238
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
uintptr_t Datum
Definition: postgres.h:64
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
static int fd(const char *x, int i)
Definition: preproc-init.c:105
int forkname_chars(const char *str, ForkNumber *fork)
Definition: relpath.c:81
#define TABLESPACE_VERSION_DIRECTORY
Definition: relpath.h:33
void ResourceOwnerEnlargeFiles(ResourceOwner owner)
Definition: resowner.c:1290
void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition: resowner.c:1310
ResourceOwner CurrentResourceOwner
Definition: resowner.c:146
void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition: resowner.c:1301
void pg_usleep(long microsec)
Definition: signal.c:53
static void error(void)
Definition: sql-dyntest.c:147
#define ereport_startup_progress(msg,...)
Definition: startup.h:18
SubTransactionId create_subid
Definition: fd.c:253
DIR * dir
Definition: fd.c:257
FILE * file
Definition: fd.c:256
int fd
Definition: fd.c:258
union AllocateDesc::@19 desc
AllocateDescKind kind
Definition: fd.c:252
Definition: dirent.c:26
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
__int64 st_size
Definition: win32_port.h:275
unsigned short st_mode
Definition: win32_port.h:270
Definition: fd.c:192
int fd
Definition: fd.c:193
int fileFlags
Definition: fd.c:202
File lruLessRecently
Definition: fd.c:198
File lruMoreRecently
Definition: fd.c:197
char * fileName
Definition: fd.c:200
ResourceOwner resowner
Definition: fd.c:195
unsigned short fdstate
Definition: fd.c:194
File nextFree
Definition: fd.c:196
mode_t fileMode
Definition: fd.c:203
off_t fileSize
Definition: fd.c:199
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:271
static void pgstat_report_wait_end(void)
Definition: wait_event.h:287
#define fsync(fd)
Definition: win32_port.h:85
#define stat
Definition: win32_port.h:286
#define SIG_DFL
Definition: win32_port.h:171
#define EINTR
Definition: win32_port.h:376
#define SIGPIPE
Definition: win32_port.h:181
#define lstat(path, sb)
Definition: win32_port.h:287
#define S_ISDIR(m)
Definition: win32_port.h:327
void _dosmaperr(unsigned long)
Definition: win32error.c:177
#define S_ISLNK(m)
Definition: win32_port.h:346
#define mkdir(a, b)
Definition: win32_port.h:80
#define fstat
Definition: win32_port.h:285
#define ftruncate(a, b)
Definition: win32_port.h:82
#define SIG_IGN
Definition: win32_port.h:173
#define O_CLOEXEC
Definition: win32_port.h:361
#define O_DSYNC
Definition: win32_port.h:354
SubTransactionId GetCurrentSubTransactionId(void)
Definition: xact.c:780
int sync_method
Definition: xlog.c:133
#define SYNC_METHOD_FSYNC_WRITETHROUGH
Definition: xlog.h:25
static const char * directory
Definition: zic.c:634