PostgreSQL Source Code  git master
fd.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  * Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have. (This is around 1024 on many modern
20  * operating systems, but may be lower on others.)
21  *
22  * VFDs are managed as an LRU pool, with actual OS file descriptors
23  * being opened and closed as needed. Obviously, if a routine is
24  * opened using these interfaces, all subsequent operations must also
25  * be through these interfaces (the File type is not a real file
26  * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
39  * for a long time, like relation files. It is the caller's responsibility
40  * to close them; there is no automatic mechanism in fd.c for that.
41  *
42  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43  * temporary files that have names so that they can be shared between
44  * backends. Such files are automatically closed and count against the
45  * temporary file limit of the backend that creates them, but unlike anonymous
46  * files they are not automatically deleted. See sharedfileset.c for a shared
47  * ownership mechanism that provides automatic cleanup for shared files when
48  * the last of a group of backends detaches.
49  *
50  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52  * They behave like the corresponding native functions, except that the handle
53  * is registered with the current subtransaction, and will be automatically
54  * closed at abort. These are intended mainly for short operations like
55  * reading a configuration file; there is a limit on the number of files that
56  * can be opened using these functions at any one time.
57  *
58  * Finally, BasicOpenFile is just a thin wrapper around open() that can
59  * release file descriptors in use by the virtual file descriptors if
60  * necessary. There is no automatic cleanup of file descriptors returned by
61  * BasicOpenFile, it is solely the caller's responsibility to close the file
62  * descriptor by calling close(2).
63  *
64  * If a non-virtual file descriptor needs to be held open for any length of
65  * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66  * (and eventually ReleaseExternalFD), so that we can take it into account
67  * while deciding how many VFDs can be open. This applies to FDs obtained
68  * with BasicOpenFile as well as those obtained without use of any fd.c API.
69  *
70  *-------------------------------------------------------------------------
71  */
72 
73 #include "postgres.h"
74 
75 #include <sys/file.h>
76 #include <sys/param.h>
77 #include <sys/stat.h>
78 #ifndef WIN32
79 #include <sys/mman.h>
80 #endif
81 #include <limits.h>
82 #include <unistd.h>
83 #include <fcntl.h>
84 #ifdef HAVE_SYS_RESOURCE_H
85 #include <sys/resource.h> /* for getrlimit */
86 #endif
87 
88 #include "access/xact.h"
89 #include "access/xlog.h"
90 #include "catalog/pg_tablespace.h"
91 #include "common/file_perm.h"
92 #include "common/file_utils.h"
93 #include "miscadmin.h"
94 #include "pgstat.h"
95 #include "portability/mem.h"
96 #include "storage/fd.h"
97 #include "storage/ipc.h"
98 #include "utils/guc.h"
99 #include "utils/resowner_private.h"
100 
101 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
102 #if defined(HAVE_SYNC_FILE_RANGE)
103 #define PG_FLUSH_DATA_WORKS 1
104 #elif !defined(WIN32) && defined(MS_ASYNC)
105 #define PG_FLUSH_DATA_WORKS 1
106 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
107 #define PG_FLUSH_DATA_WORKS 1
108 #endif
109 
110 /*
111  * We must leave some file descriptors free for system(), the dynamic loader,
112  * and other code that tries to open files without consulting fd.c. This
113  * is the number left free. (While we try fairly hard to prevent EMFILE
114  * errors, there's never any guarantee that we won't get ENFILE due to
115  * other processes chewing up FDs. So it's a bad idea to try to open files
116  * without consulting fd.c. Nonetheless we cannot control all code.)
117  *
118  * Because this is just a fixed setting, we are effectively assuming that
119  * no such code will leave FDs open over the long term; otherwise the slop
120  * is likely to be insufficient. Note in particular that we expect that
121  * loading a shared library does not result in any permanent increase in
122  * the number of open files. (This appears to be true on most if not
123  * all platforms as of Feb 2004.)
124  */
125 #define NUM_RESERVED_FDS 10
126 
127 /*
128  * If we have fewer than this many usable FDs after allowing for the reserved
129  * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
130  * much less than that. Note that this value ensures numExternalFDs can be
131  * at least 16; as of this writing, the contrib/postgres_fdw regression tests
132  * will not pass unless that can grow to at least 14.)
133  */
134 #define FD_MINFREE 48
135 
/*
 * A number of platforms allow individual processes to open many more files
 * than they can really support when *many* processes do the same thing.
 * This GUC parameter lets the DBA limit max_safe_fds to something less than
 * what the postmaster's initial probe suggests will work.
 *
 * It is consulted by set_max_safe_fds() when computing max_safe_fds.
 */
int			max_files_per_process = 1000;
143 
144 /*
145  * Maximum number of file descriptors to open for operations that fd.c knows
146  * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
147  * to a conservative value, and remains that way indefinitely in bootstrap or
148  * standalone-backend cases. In normal postmaster operation, the postmaster
149  * calls set_max_safe_fds() late in initialization to update the value, and
150  * that value is then inherited by forked subprocesses.
151  *
152  * Note: the value of max_files_per_process is taken into account while
153  * setting this variable, and so need not be tested separately.
154  */
155 int max_safe_fds = FD_MINFREE; /* default if not changed */
156 
157 /* Whether it is safe to continue running after fsync() fails. */
158 bool data_sync_retry = false;
159 
160 /* Debugging.... */
161 
162 #ifdef FDDEBUG
163 #define DO_DB(A) \
164  do { \
165  int _do_db_save_errno = errno; \
166  A; \
167  errno = _do_db_save_errno; \
168  } while (0)
169 #else
170 #define DO_DB(A) \
171  ((void) 0)
172 #endif
173 
174 #define VFD_CLOSED (-1)
175 
176 #define FileIsValid(file) \
177  ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
178 
179 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
180 
181 /* these are the assigned bits in fdstate below: */
182 #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
183 #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
184 #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
185 
186 typedef struct vfd
187 {
188  int fd; /* current FD, or VFD_CLOSED if none */
189  unsigned short fdstate; /* bitflags for VFD's state */
190  ResourceOwner resowner; /* owner, for automatic cleanup */
191  File nextFree; /* link to next free VFD, if in freelist */
192  File lruMoreRecently; /* doubly linked recency-of-use list */
194  off_t fileSize; /* current size of file (0 if not temporary) */
195  char *fileName; /* name of file, or NULL for unused VFD */
196  /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
197  int fileFlags; /* open(2) flags for (re)opening the file */
198  mode_t fileMode; /* mode to pass to open(2) */
199 } Vfd;
200 
201 /*
202  * Virtual File Descriptor array pointer and size. This grows as
203  * needed. 'File' values are indexes into this array.
204  * Note that VfdCache[0] is not a usable VFD, just a list header.
205  */
206 static Vfd *VfdCache;
207 static Size SizeVfdCache = 0;
208 
209 /*
210  * Number of file descriptors known to be in use by VFD entries.
211  */
212 static int nfile = 0;
213 
214 /*
215  * Flag to tell whether it's worth scanning VfdCache looking for temp files
216  * to close
217  */
218 static bool have_xact_temporary_files = false;
219 
220 /*
221  * Tracks the total size of all temporary files. Note: when temp_file_limit
222  * is being enforced, this cannot overflow since the limit cannot be more
223  * than INT_MAX kilobytes. When not enforcing, it could theoretically
224  * overflow, but we don't care.
225  */
226 static uint64 temporary_files_size = 0;
227 
228 /*
229  * List of OS handles opened with AllocateFile, AllocateDir and
230  * OpenTransientFile.
231  */
232 typedef enum
233 {
239 
240 typedef struct
241 {
244  union
245  {
246  FILE *file;
248  int fd;
249  } desc;
250 } AllocateDesc;
251 
252 static int numAllocatedDescs = 0;
253 static int maxAllocatedDescs = 0;
255 
256 /*
257  * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
258  */
259 static int numExternalFDs = 0;
260 
261 /*
262  * Number of temporary files opened during the current session;
263  * this is used in generation of tempfile names.
264  */
265 static long tempFileCounter = 0;
266 
267 /*
268  * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
269  * indicating that the current database's default tablespace should be used.)
270  * When numTempTableSpaces is -1, this has not been set in the current
271  * transaction.
272  */
273 static Oid *tempTableSpaces = NULL;
274 static int numTempTableSpaces = -1;
275 static int nextTempTableSpace = 0;
276 
277 
278 /*--------------------
279  *
280  * Private Routines
281  *
282  * Delete - delete a file from the Lru ring
283  * LruDelete - remove a file from the Lru ring and close its FD
284  * Insert - put a file at the front of the Lru ring
285  * LruInsert - put a file at the front of the Lru ring and open it
286  * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
287  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
288  * AllocateVfd - grab a free (or new) file record (from VfdCache)
289  * FreeVfd - free a file record
290  *
291  * The Least Recently Used ring is a doubly linked list that begins and
292  * ends on element zero. Element zero is special -- it doesn't represent
293  * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
294  * anchor that shows us the beginning/end of the ring.
295  * Only VFD elements that are currently really open (have an FD assigned) are
296  * in the Lru ring. Elements that are "virtually" open can be recognized
297  * by having a non-null fileName field.
298  *
299  * example:
300  *
301  * /--less----\ /---------\
302  * v \ v \
303  * #0 --more---> LeastRecentlyUsed --more-\ \
304  * ^\ | |
305  * \\less--> MostRecentlyUsedFile <---/ |
306  * \more---/ \--less--/
307  *
308  *--------------------
309  */
310 static void Delete(File file);
311 static void LruDelete(File file);
312 static void Insert(File file);
313 static int LruInsert(File file);
314 static bool ReleaseLruFile(void);
315 static void ReleaseLruFiles(void);
316 static File AllocateVfd(void);
317 static void FreeVfd(File file);
318 
319 static int FileAccess(File file);
320 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
321 static bool reserveAllocatedDesc(void);
322 static int FreeDesc(AllocateDesc *desc);
323 
324 static void AtProcExit_Files(int code, Datum arg);
325 static void CleanupTempFiles(bool isCommit, bool isProcExit);
326 static void RemovePgTempRelationFiles(const char *tsdirname);
327 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
328 
329 static void walkdir(const char *path,
330  void (*action) (const char *fname, bool isdir, int elevel),
331  bool process_symlinks,
332  int elevel);
333 #ifdef PG_FLUSH_DATA_WORKS
334 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
335 #endif
336 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
337 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
338 
339 static int fsync_parent_path(const char *fname, int elevel);
340 
341 
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * Dispatches to pg_fsync_writethrough() or pg_fsync_no_writethrough()
 * depending on the configured sync method, and returns whatever the chosen
 * routine returns (0 on success, -1 with errno set on failure).
 */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
	struct stat st;

	/*
	 * Some operating system implementations of fsync() have requirements
	 * about the file access modes that were used when their file descriptor
	 * argument was opened, and these requirements differ depending on whether
	 * the file descriptor is for a directory.
	 *
	 * For any file descriptor that may eventually be handed to fsync(), we
	 * should have opened it with access modes that are compatible with
	 * fsync() on all supported systems, otherwise the code may not be
	 * portable, even if it runs ok on the current system.
	 *
	 * We assert here that a descriptor for a file was opened with write
	 * permissions (either O_RDWR or O_WRONLY) and for a directory without
	 * write permissions (O_RDONLY).
	 *
	 * Ignore any fstat errors and let the follow-up fsync() do its work.
	 * Doing this sanity check here, ahead of the dispatch below, means it
	 * also covers the case where fsync() is disabled.
	 */
	if (fstat(fd, &st) == 0)
	{
		int			desc_flags = fcntl(fd, F_GETFL);

		/*
		 * O_RDONLY is historically 0, so just make sure that for directories
		 * no write flags are used.
		 */
		if (S_ISDIR(st.st_mode))
			Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
		else
			Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
	}
	/* don't let the checks above leak an errno value to the caller */
	errno = 0;
#endif

	/* #if is to skip the sync_method test if there's no need for it */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
	else
#endif
		return pg_fsync_no_writethrough(fd);
}
394 
395 
396 /*
397  * pg_fsync_no_writethrough --- same as fsync except does nothing if
398  * enableFsync is off
399  */
400 int
402 {
403  if (enableFsync)
404  return fsync(fd);
405  else
406  return 0;
407 }
408 
409 /*
410  * pg_fsync_writethrough
411  */
412 int
414 {
415  if (enableFsync)
416  {
417 #ifdef WIN32
418  return _commit(fd);
419 #elif defined(F_FULLFSYNC)
420  return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
421 #else
422  errno = ENOSYS;
423  return -1;
424 #endif
425  }
426  else
427  return 0;
428 }
429 
430 /*
431  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
432  *
433  * Not all platforms have fdatasync; treat as fsync if not available.
434  */
435 int
437 {
438  if (enableFsync)
439  {
440 #ifdef HAVE_FDATASYNC
441  return fdatasync(fd);
442 #else
443  return fsync(fd);
444 #endif
445  }
446  else
447  return 0;
448 }
449 
450 /*
451  * pg_flush_data --- advise OS that the described dirty data should be flushed
452  *
453  * offset of 0 with nbytes 0 means that the entire file should be flushed
454  */
455 void
456 pg_flush_data(int fd, off_t offset, off_t nbytes)
457 {
458  /*
459  * Right now file flushing is primarily used to avoid making later
460  * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
461  * if fsyncs are disabled - that's a decision we might want to make
462  * configurable at some point.
463  */
464  if (!enableFsync)
465  return;
466 
467  /*
468  * We compile all alternatives that are supported on the current platform,
469  * to find portability problems more easily.
470  */
471 #if defined(HAVE_SYNC_FILE_RANGE)
472  {
473  int rc;
474  static bool not_implemented_by_kernel = false;
475 
476  if (not_implemented_by_kernel)
477  return;
478 
479  /*
480  * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
481  * tells the OS that writeback for the specified blocks should be
482  * started, but that we don't want to wait for completion. Note that
483  * this call might block if too much dirty data exists in the range.
484  * This is the preferable method on OSs supporting it, as it works
485  * reliably when available (contrast to msync()) and doesn't flush out
486  * clean data (like FADV_DONTNEED).
487  */
488  rc = sync_file_range(fd, offset, nbytes,
489  SYNC_FILE_RANGE_WRITE);
490  if (rc != 0)
491  {
492  int elevel;
493 
494  /*
495  * For systems that don't have an implementation of
496  * sync_file_range() such as Windows WSL, generate only one
497  * warning and then suppress all further attempts by this process.
498  */
499  if (errno == ENOSYS)
500  {
501  elevel = WARNING;
502  not_implemented_by_kernel = true;
503  }
504  else
505  elevel = data_sync_elevel(WARNING);
506 
507  ereport(elevel,
509  errmsg("could not flush dirty data: %m")));
510  }
511 
512  return;
513  }
514 #endif
515 #if !defined(WIN32) && defined(MS_ASYNC)
516  {
517  void *p;
518  static int pagesize = 0;
519 
520  /*
521  * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
522  * writeback. On linux it only does so if MS_SYNC is specified, but
523  * then it does the writeback synchronously. Luckily all common linux
524  * systems have sync_file_range(). This is preferable over
525  * FADV_DONTNEED because it doesn't flush out clean data.
526  *
527  * We map the file (mmap()), tell the kernel to sync back the contents
528  * (msync()), and then remove the mapping again (munmap()).
529  */
530 
531  /* mmap() needs actual length if we want to map whole file */
532  if (offset == 0 && nbytes == 0)
533  {
534  nbytes = lseek(fd, 0, SEEK_END);
535  if (nbytes < 0)
536  {
539  errmsg("could not determine dirty data size: %m")));
540  return;
541  }
542  }
543 
544  /*
545  * Some platforms reject partial-page mmap() attempts. To deal with
546  * that, just truncate the request to a page boundary. If any extra
547  * bytes don't get flushed, well, it's only a hint anyway.
548  */
549 
550  /* fetch pagesize only once */
551  if (pagesize == 0)
552  pagesize = sysconf(_SC_PAGESIZE);
553 
554  /* align length to pagesize, dropping any fractional page */
555  if (pagesize > 0)
556  nbytes = (nbytes / pagesize) * pagesize;
557 
558  /* fractional-page request is a no-op */
559  if (nbytes <= 0)
560  return;
561 
562  /*
563  * mmap could well fail, particularly on 32-bit platforms where there
564  * may simply not be enough address space. If so, silently fall
565  * through to the next implementation.
566  */
567  if (nbytes <= (off_t) SSIZE_MAX)
568  p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
569  else
570  p = MAP_FAILED;
571 
572  if (p != MAP_FAILED)
573  {
574  int rc;
575 
576  rc = msync(p, (size_t) nbytes, MS_ASYNC);
577  if (rc != 0)
578  {
581  errmsg("could not flush dirty data: %m")));
582  /* NB: need to fall through to munmap()! */
583  }
584 
585  rc = munmap(p, (size_t) nbytes);
586  if (rc != 0)
587  {
588  /* FATAL error because mapping would remain */
589  ereport(FATAL,
591  errmsg("could not munmap() while flushing data: %m")));
592  }
593 
594  return;
595  }
596  }
597 #endif
598 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
599  {
600  int rc;
601 
602  /*
603  * Signal the kernel that the passed in range should not be cached
604  * anymore. This has the, desired, side effect of writing out dirty
605  * data, and the, undesired, side effect of likely discarding useful
606  * clean cached blocks. For the latter reason this is the least
607  * preferable method.
608  */
609 
610  rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
611 
612  if (rc != 0)
613  {
614  /* don't error out, this is just a performance optimization */
617  errmsg("could not flush dirty data: %m")));
618  }
619 
620  return;
621  }
622 #endif
623 }
624 
625 
626 /*
627  * fsync_fname -- fsync a file or directory, handling errors properly
628  *
629  * Try to fsync a file or directory. When doing the latter, ignore errors that
630  * indicate the OS just doesn't allow/require fsyncing directories.
631  */
632 void
633 fsync_fname(const char *fname, bool isdir)
634 {
635  fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
636 }
637 
638 /*
639  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
640  *
641  * This routine ensures that, after returning, the effect of renaming file
642  * persists in case of a crash. A crash while this routine is running will
643  * leave you with either the pre-existing or the moved file in place of the
644  * new file; no mixed state or truncated files are possible.
645  *
646  * It does so by using fsync on the old filename and the possibly existing
647  * target filename before the rename, and the target file and directory after.
648  *
649  * Note that rename() cannot be used across arbitrary directories, as they
650  * might not be on the same filesystem. Therefore this routine does not
651  * support renaming across directories.
652  *
653  * Log errors with the caller specified severity.
654  *
655  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
656  * valid upon return.
657  */
658 int
659 durable_rename(const char *oldfile, const char *newfile, int elevel)
660 {
661  int fd;
662 
663  /*
664  * First fsync the old and target path (if it exists), to ensure that they
665  * are properly persistent on disk. Syncing the target file is not
666  * strictly necessary, but it makes it easier to reason about crashes;
667  * because it's then guaranteed that either source or target file exists
668  * after a crash.
669  */
670  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
671  return -1;
672 
673  fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
674  if (fd < 0)
675  {
676  if (errno != ENOENT)
677  {
678  ereport(elevel,
680  errmsg("could not open file \"%s\": %m", newfile)));
681  return -1;
682  }
683  }
684  else
685  {
686  if (pg_fsync(fd) != 0)
687  {
688  int save_errno;
689 
690  /* close file upon error, might not be in transaction context */
691  save_errno = errno;
692  CloseTransientFile(fd);
693  errno = save_errno;
694 
695  ereport(elevel,
697  errmsg("could not fsync file \"%s\": %m", newfile)));
698  return -1;
699  }
700 
701  if (CloseTransientFile(fd) != 0)
702  {
703  ereport(elevel,
705  errmsg("could not close file \"%s\": %m", newfile)));
706  return -1;
707  }
708  }
709 
710  /* Time to do the real deal... */
711  if (rename(oldfile, newfile) < 0)
712  {
713  ereport(elevel,
715  errmsg("could not rename file \"%s\" to \"%s\": %m",
716  oldfile, newfile)));
717  return -1;
718  }
719 
720  /*
721  * To guarantee renaming the file is persistent, fsync the file with its
722  * new name, and its containing directory.
723  */
724  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
725  return -1;
726 
727  if (fsync_parent_path(newfile, elevel) != 0)
728  return -1;
729 
730  return 0;
731 }
732 
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * This routine ensures that, after returning, the effect of removing file
 * persists in case of a crash. A crash while this routine is running will
 * leave the system in no mixed state.
 *
 * It does so by using fsync on the parent directory of the file after the
 * actual removal is done.
 *
 * Log errors with the severity specified by caller.
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	if (unlink(fname) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/*
	 * To guarantee that the removal of the file is persistent, fsync its
	 * parent directory.
	 */
	if (fsync_parent_path(fname, elevel) != 0)
		return -1;

	return 0;
}
769 
/*
 * durable_rename_excl -- rename a file in a durable manner, without
 * overwriting an existing target file
 *
 * Similar to durable_rename(), except that this routine will fail if the
 * target file already exists.
 *
 * The rename is done as link() to the new name followed by unlink() of the
 * old one.  Note that a crash in an unfortunate moment can therefore leave
 * you with two links to the target file.
 *
 * Log errors with the caller specified severity.
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_rename_excl(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * Ensure that, if we crash directly after the rename/link, a file with
	 * valid contents is moved into place.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

	/* link() refuses to replace an existing newfile */
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}

	/*
	 * NOTE(review): the result of unlink() is not checked; if it fails, both
	 * names remain linked and the function still goes on to report success.
	 */
	unlink(oldfile);

	/*
	 * Make change persistent in case of an OS crash, both the new entry and
	 * its parent directory need to be flushed.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0)
		return -1;

	/* Same for parent directory */
	if (fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}
818 
819 /*
820  * InitFileAccess --- initialize this module during backend startup
821  *
822  * This is called during either normal or standalone backend start.
823  * It is *not* called in the postmaster.
824  */
825 void
827 {
828  Assert(SizeVfdCache == 0); /* call me only once */
829 
830  /* initialize cache header entry */
831  VfdCache = (Vfd *) malloc(sizeof(Vfd));
832  if (VfdCache == NULL)
833  ereport(FATAL,
834  (errcode(ERRCODE_OUT_OF_MEMORY),
835  errmsg("out of memory")));
836 
837  MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
838  VfdCache->fd = VFD_CLOSED;
839 
840  SizeVfdCache = 1;
841 
842  /* register proc-exit hook to ensure temp files are dropped at exit */
844 }
845 
846 /*
847  * count_usable_fds --- count how many FDs the system will let us open,
848  * and estimate how many are already open.
849  *
850  * We stop counting if usable_fds reaches max_to_probe. Note: a small
851  * value of max_to_probe might result in an underestimate of already_open;
852  * we must fill in any "gaps" in the set of used FDs before the calculation
853  * of already_open will give the right answer. In practice, max_to_probe
854  * of a couple of dozen should be enough to ensure good results.
855  *
856  * We assume stdin (FD 0) is available for dup'ing
857  */
858 static void
859 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
860 {
861  int *fd;
862  int size;
863  int used = 0;
864  int highestfd = 0;
865  int j;
866 
867 #ifdef HAVE_GETRLIMIT
868  struct rlimit rlim;
869  int getrlimit_status;
870 #endif
871 
872  size = 1024;
873  fd = (int *) palloc(size * sizeof(int));
874 
875 #ifdef HAVE_GETRLIMIT
876 #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
877  getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
878 #else /* but BSD doesn't ... */
879  getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
880 #endif /* RLIMIT_NOFILE */
881  if (getrlimit_status != 0)
882  ereport(WARNING, (errmsg("getrlimit failed: %m")));
883 #endif /* HAVE_GETRLIMIT */
884 
885  /* dup until failure or probe limit reached */
886  for (;;)
887  {
888  int thisfd;
889 
890 #ifdef HAVE_GETRLIMIT
891 
892  /*
893  * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
894  * some platforms
895  */
896  if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
897  break;
898 #endif
899 
900  thisfd = dup(0);
901  if (thisfd < 0)
902  {
903  /* Expect EMFILE or ENFILE, else it's fishy */
904  if (errno != EMFILE && errno != ENFILE)
905  elog(WARNING, "dup(0) failed after %d successes: %m", used);
906  break;
907  }
908 
909  if (used >= size)
910  {
911  size *= 2;
912  fd = (int *) repalloc(fd, size * sizeof(int));
913  }
914  fd[used++] = thisfd;
915 
916  if (highestfd < thisfd)
917  highestfd = thisfd;
918 
919  if (used >= max_to_probe)
920  break;
921  }
922 
923  /* release the files we opened */
924  for (j = 0; j < used; j++)
925  close(fd[j]);
926 
927  pfree(fd);
928 
929  /*
930  * Return results. usable_fds is just the number of successful dups. We
931  * assume that the system limit is highestfd+1 (remember 0 is a legal FD
932  * number) and so already_open is highestfd+1 - usable_fds.
933  */
934  *usable_fds = used;
935  *already_open = highestfd + 1 - used;
936 }
937 
938 /*
939  * set_max_safe_fds
940  * Determine number of file descriptors that fd.c is allowed to use
941  */
942 void
944 {
945  int usable_fds;
946  int already_open;
947 
948  /*----------
949  * We want to set max_safe_fds to
950  * MIN(usable_fds, max_files_per_process - already_open)
951  * less the slop factor for files that are opened without consulting
952  * fd.c. This ensures that we won't exceed either max_files_per_process
953  * or the experimentally-determined EMFILE limit.
954  *----------
955  */
957  &usable_fds, &already_open);
958 
959  max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
960 
961  /*
962  * Take off the FDs reserved for system() etc.
963  */
965 
966  /*
967  * Make sure we still have enough to get by.
968  */
969  if (max_safe_fds < FD_MINFREE)
970  ereport(FATAL,
971  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
972  errmsg("insufficient file descriptors available to start server process"),
973  errdetail("System allows %d, we need at least %d.",
976 
977  elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
978  max_safe_fds, usable_fds, already_open);
979 }
980 
981 /*
982  * Open a file with BasicOpenFilePerm() and pass default file mode for the
983  * fileMode parameter.
984  */
985 int
987 {
988  return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
989 }
990 
991 /*
992  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
993  *
994  * This is exported for use by places that really want a plain kernel FD,
995  * but need to be proof against running out of FDs. Once an FD has been
996  * successfully returned, it is the caller's responsibility to ensure that
997  * it will not be leaked on ereport()! Most users should *not* call this
998  * routine directly, but instead use the VFD abstraction level, which
999  * provides protection against descriptor leaks as well as management of
1000  * files that need to be open for more than a short period of time.
1001  *
1002  * Ideally this should be the *only* direct call of open() in the backend.
1003  * In practice, the postmaster calls open() directly, and there are some
1004  * direct open() calls done early in backend startup. Those are OK since
1005  * this module wouldn't have any open files to close at that point anyway.
1006  */
1007 int
1009 {
1010  int fd;
1011 
1012 tryAgain:
1013  fd = open(fileName, fileFlags, fileMode);
1014 
1015  if (fd >= 0)
1016  return fd; /* success! */
1017 
1018  if (errno == EMFILE || errno == ENFILE)
1019  {
1020  int save_errno = errno;
1021 
1022  ereport(LOG,
1023  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1024  errmsg("out of file descriptors: %m; release and retry")));
1025  errno = 0;
1026  if (ReleaseLruFile())
1027  goto tryAgain;
1028  errno = save_errno;
1029  }
1030 
1031  return -1; /* failure */
1032 }
1033 
1034 /*
1035  * AcquireExternalFD - attempt to reserve an external file descriptor
1036  *
1037  * This should be used by callers that need to hold a file descriptor open
1038  * over more than a short interval, but cannot use any of the other facilities
1039  * provided by this module.
1040  *
1041  * The difference between this and the underlying ReserveExternalFD function
1042  * is that this will report failure (by setting errno and returning false)
1043  * if "too many" external FDs are already reserved. This should be used in
1044  * any code where the total number of FDs to be reserved is not predictable
1045  * and small.
1046  */
1047 bool
1049 {
1050  /*
1051  * We don't want more than max_safe_fds / 3 FDs to be consumed for
1052  * "external" FDs.
1053  */
1054  if (numExternalFDs < max_safe_fds / 3)
1055  {
1057  return true;
1058  }
1059  errno = EMFILE;
1060  return false;
1061 }
1062 
1063 /*
1064  * ReserveExternalFD - report external consumption of a file descriptor
1065  *
1066  * This should be used by callers that need to hold a file descriptor open
1067  * over more than a short interval, but cannot use any of the other facilities
1068  * provided by this module. This just tracks the use of the FD and closes
1069  * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1070  *
1071  * Call this directly only in code where failure to reserve the FD would be
1072  * fatal; for example, the WAL-writing code does so, since the alternative is
1073  * session failure. Also, it's very unwise to do so in code that could
1074  * consume more than one FD per process.
1075  *
1076  * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1077  * available, it doesn't matter too much whether this is called before or
1078  * after actually opening the FD; but doing so beforehand reduces the risk of
1079  * an EMFILE failure if not everybody played nice. In any case, it's solely
1080  * caller's responsibility to keep the external-FD count in sync with reality.
1081  */
1082 void
1084 {
1085  /*
1086  * Release VFDs if needed to stay safe. Because we do this before
1087  * incrementing numExternalFDs, the final state will be as desired, i.e.,
1088  * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1089  */
1090  ReleaseLruFiles();
1091 
1092  numExternalFDs++;
1093 }
1094 
1095 /*
1096  * ReleaseExternalFD - report release of an external file descriptor
1097  *
1098  * This is guaranteed not to change errno, so it can be used in failure paths.
1099  */
1100 void
1102 {
1103  Assert(numExternalFDs > 0);
1104  numExternalFDs--;
1105 }
1106 
1107 
1108 #if defined(FDDEBUG)
1109 
1110 static void
1111 _dump_lru(void)
1112 {
1113  int mru = VfdCache[0].lruLessRecently;
1114  Vfd *vfdP = &VfdCache[mru];
1115  char buf[2048];
1116 
1117  snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
1118  while (mru != 0)
1119  {
1120  mru = vfdP->lruLessRecently;
1121  vfdP = &VfdCache[mru];
1122  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
1123  }
1124  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
1125  elog(LOG, "%s", buf);
1126 }
1127 #endif /* FDDEBUG */
1128 
1129 static void
1131 {
1132  Vfd *vfdP;
1133 
1134  Assert(file != 0);
1135 
1136  DO_DB(elog(LOG, "Delete %d (%s)",
1137  file, VfdCache[file].fileName));
1138  DO_DB(_dump_lru());
1139 
1140  vfdP = &VfdCache[file];
1141 
1142  VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1143  VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1144 
1145  DO_DB(_dump_lru());
1146 }
1147 
1148 static void
1150 {
1151  Vfd *vfdP;
1152 
1153  Assert(file != 0);
1154 
1155  DO_DB(elog(LOG, "LruDelete %d (%s)",
1156  file, VfdCache[file].fileName));
1157 
1158  vfdP = &VfdCache[file];
1159 
1160  /*
1161  * Close the file. We aren't expecting this to fail; if it does, better
1162  * to leak the FD than to mess up our internal state.
1163  */
1164  if (close(vfdP->fd) != 0)
1166  "could not close file \"%s\": %m", vfdP->fileName);
1167  vfdP->fd = VFD_CLOSED;
1168  --nfile;
1169 
1170  /* delete the vfd record from the LRU ring */
1171  Delete(file);
1172 }
1173 
1174 static void
1176 {
1177  Vfd *vfdP;
1178 
1179  Assert(file != 0);
1180 
1181  DO_DB(elog(LOG, "Insert %d (%s)",
1182  file, VfdCache[file].fileName));
1183  DO_DB(_dump_lru());
1184 
1185  vfdP = &VfdCache[file];
1186 
1187  vfdP->lruMoreRecently = 0;
1188  vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1189  VfdCache[0].lruLessRecently = file;
1190  VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1191 
1192  DO_DB(_dump_lru());
1193 }
1194 
1195 /* returns 0 on success, -1 on re-open failure (with errno set) */
1196 static int
1198 {
1199  Vfd *vfdP;
1200 
1201  Assert(file != 0);
1202 
1203  DO_DB(elog(LOG, "LruInsert %d (%s)",
1204  file, VfdCache[file].fileName));
1205 
1206  vfdP = &VfdCache[file];
1207 
1208  if (FileIsNotOpen(file))
1209  {
1210  /* Close excess kernel FDs. */
1211  ReleaseLruFiles();
1212 
1213  /*
1214  * The open could still fail for lack of file descriptors, eg due to
1215  * overall system file table being full. So, be prepared to release
1216  * another FD if necessary...
1217  */
1218  vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1219  vfdP->fileMode);
1220  if (vfdP->fd < 0)
1221  {
1222  DO_DB(elog(LOG, "re-open failed: %m"));
1223  return -1;
1224  }
1225  else
1226  {
1227  ++nfile;
1228  }
1229  }
1230 
1231  /*
1232  * put it at the head of the Lru ring
1233  */
1234 
1235  Insert(file);
1236 
1237  return 0;
1238 }
1239 
1240 /*
1241  * Release one kernel FD by closing the least-recently-used VFD.
1242  */
1243 static bool
1245 {
1246  DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1247 
1248  if (nfile > 0)
1249  {
1250  /*
1251  * There are opened files and so there should be at least one used vfd
1252  * in the ring.
1253  */
1254  Assert(VfdCache[0].lruMoreRecently != 0);
1255  LruDelete(VfdCache[0].lruMoreRecently);
1256  return true; /* freed a file */
1257  }
1258  return false; /* no files available to free */
1259 }
1260 
1261 /*
1262  * Release kernel FDs as needed to get under the max_safe_fds limit.
1263  * After calling this, it's OK to try to open another file.
1264  */
1265 static void
1267 {
1269  {
1270  if (!ReleaseLruFile())
1271  break;
1272  }
1273 }
1274 
1275 static File
1277 {
1278  Index i;
1279  File file;
1280 
1281  DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1282 
1283  Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1284 
1285  if (VfdCache[0].nextFree == 0)
1286  {
1287  /*
1288  * The free list is empty so it is time to increase the size of the
1289  * array. We choose to double it each time this happens. However,
1290  * there's not much point in starting *real* small.
1291  */
1292  Size newCacheSize = SizeVfdCache * 2;
1293  Vfd *newVfdCache;
1294 
1295  if (newCacheSize < 32)
1296  newCacheSize = 32;
1297 
1298  /*
1299  * Be careful not to clobber VfdCache ptr if realloc fails.
1300  */
1301  newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1302  if (newVfdCache == NULL)
1303  ereport(ERROR,
1304  (errcode(ERRCODE_OUT_OF_MEMORY),
1305  errmsg("out of memory")));
1306  VfdCache = newVfdCache;
1307 
1308  /*
1309  * Initialize the new entries and link them into the free list.
1310  */
1311  for (i = SizeVfdCache; i < newCacheSize; i++)
1312  {
1313  MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1314  VfdCache[i].nextFree = i + 1;
1315  VfdCache[i].fd = VFD_CLOSED;
1316  }
1317  VfdCache[newCacheSize - 1].nextFree = 0;
1318  VfdCache[0].nextFree = SizeVfdCache;
1319 
1320  /*
1321  * Record the new size
1322  */
1323  SizeVfdCache = newCacheSize;
1324  }
1325 
1326  file = VfdCache[0].nextFree;
1327 
1328  VfdCache[0].nextFree = VfdCache[file].nextFree;
1329 
1330  return file;
1331 }
1332 
1333 static void
1335 {
1336  Vfd *vfdP = &VfdCache[file];
1337 
1338  DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1339  file, vfdP->fileName ? vfdP->fileName : ""));
1340 
1341  if (vfdP->fileName != NULL)
1342  {
1343  free(vfdP->fileName);
1344  vfdP->fileName = NULL;
1345  }
1346  vfdP->fdstate = 0x0;
1347 
1348  vfdP->nextFree = VfdCache[0].nextFree;
1349  VfdCache[0].nextFree = file;
1350 }
1351 
1352 /* returns 0 on success, -1 on re-open failure (with errno set) */
1353 static int
1355 {
1356  int returnValue;
1357 
1358  DO_DB(elog(LOG, "FileAccess %d (%s)",
1359  file, VfdCache[file].fileName));
1360 
1361  /*
1362  * Is the file open? If not, open it and put it at the head of the LRU
1363  * ring (possibly closing the least recently used file to get an FD).
1364  */
1365 
1366  if (FileIsNotOpen(file))
1367  {
1368  returnValue = LruInsert(file);
1369  if (returnValue != 0)
1370  return returnValue;
1371  }
1372  else if (VfdCache[0].lruLessRecently != file)
1373  {
1374  /*
1375  * We now know that the file is open and that it is not the last one
1376  * accessed, so we need to move it to the head of the Lru ring.
1377  */
1378 
1379  Delete(file);
1380  Insert(file);
1381  }
1382 
1383  return 0;
1384 }
1385 
1386 /*
1387  * Called whenever a temporary file is deleted to report its size.
1388  */
1389 static void
1390 ReportTemporaryFileUsage(const char *path, off_t size)
1391 {
1392  pgstat_report_tempfile(size);
1393 
1394  if (log_temp_files >= 0)
1395  {
1396  if ((size / 1024) >= log_temp_files)
1397  ereport(LOG,
1398  (errmsg("temporary file: path \"%s\", size %lu",
1399  path, (unsigned long) size)));
1400  }
1401 }
1402 
1403 /*
1404  * Called to register a temporary file for automatic close.
1405  * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1406  * before the file was opened.
1407  */
1408 static void
1410 {
1412  VfdCache[file].resowner = CurrentResourceOwner;
1413 
1414  /* Backup mechanism for closing at end of xact. */
1415  VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1417 }
1418 
1419 /*
1420  * Called when we get a shared invalidation message on some relation.
1421  */
1422 #ifdef NOT_USED
1423 void
1424 FileInvalidate(File file)
1425 {
1426  Assert(FileIsValid(file));
1427  if (!FileIsNotOpen(file))
1428  LruDelete(file);
1429 }
1430 #endif
1431 
1432 /*
1433  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1434  * fileMode parameter.
1435  */
1436 File
1438 {
1439  return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1440 }
1441 
1442 /*
1443  * open a file in an arbitrary directory
1444  *
1445  * NB: if the passed pathname is relative (which it usually is),
1446  * it will be interpreted relative to the process' working directory
1447  * (which should always be $PGDATA when this code is running).
1448  */
1449 File
1451 {
1452  char *fnamecopy;
1453  File file;
1454  Vfd *vfdP;
1455 
1456  DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1457  fileName, fileFlags, fileMode));
1458 
1459  /*
1460  * We need a malloc'd copy of the file name; fail cleanly if no room.
1461  */
1462  fnamecopy = strdup(fileName);
1463  if (fnamecopy == NULL)
1464  ereport(ERROR,
1465  (errcode(ERRCODE_OUT_OF_MEMORY),
1466  errmsg("out of memory")));
1467 
1468  file = AllocateVfd();
1469  vfdP = &VfdCache[file];
1470 
1471  /* Close excess kernel FDs. */
1472  ReleaseLruFiles();
1473 
1474  vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1475 
1476  if (vfdP->fd < 0)
1477  {
1478  int save_errno = errno;
1479 
1480  FreeVfd(file);
1481  free(fnamecopy);
1482  errno = save_errno;
1483  return -1;
1484  }
1485  ++nfile;
1486  DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1487  vfdP->fd));
1488 
1489  Insert(file);
1490 
1491  vfdP->fileName = fnamecopy;
1492  /* Saved flags are adjusted to be OK for re-opening file */
1493  vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1494  vfdP->fileMode = fileMode;
1495  vfdP->fileSize = 0;
1496  vfdP->fdstate = 0x0;
1497  vfdP->resowner = NULL;
1498 
1499  return file;
1500 }
1501 
1502 /*
1503  * Create directory 'directory'. If necessary, create 'basedir', which must
1504  * be the directory above it. This is designed for creating the top-level
1505  * temporary directory on demand before creating a directory underneath it.
1506  * Do nothing if the directory already exists.
1507  *
1508  * Directories created within the top-level temporary directory should begin
1509  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1510  * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1511  * that do not need any particular prefix.
1512 */
1513 void
1515 {
1516  if (MakePGDirectory(directory) < 0)
1517  {
1518  if (errno == EEXIST)
1519  return;
1520 
1521  /*
1522  * Failed. Try to create basedir first in case it's missing. Tolerate
1523  * EEXIST to close a race against another process following the same
1524  * algorithm.
1525  */
1526  if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1527  ereport(ERROR,
1529  errmsg("cannot create temporary directory \"%s\": %m",
1530  basedir)));
1531 
1532  /* Try again. */
1533  if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1534  ereport(ERROR,
1536  errmsg("cannot create temporary subdirectory \"%s\": %m",
1537  directory)));
1538  }
1539 }
1540 
1541 /*
1542  * Delete a directory and everything in it, if it exists.
1543  */
1544 void
1545 PathNameDeleteTemporaryDir(const char *dirname)
1546 {
1547  struct stat statbuf;
1548 
1549  /* Silently ignore missing directory. */
1550  if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1551  return;
1552 
1553  /*
1554  * Currently, walkdir doesn't offer a way for our passed in function to
1555  * maintain state. Perhaps it should, so that we could tell the caller
1556  * whether this operation succeeded or failed. Since this operation is
1557  * used in a cleanup path, we wouldn't actually behave differently: we'll
1558  * just log failures.
1559  */
1560  walkdir(dirname, unlink_if_exists_fname, false, LOG);
1561 }
1562 
1563 /*
1564  * Open a temporary file that will disappear when we close it.
1565  *
1566  * This routine takes care of generating an appropriate tempfile name.
1567  * There's no need to pass in fileFlags or fileMode either, since only
1568  * one setting makes any sense for a temp file.
1569  *
1570  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1571  * to ensure it's closed and deleted when it's no longer needed, typically at
1572  * the end-of-transaction. In most cases, you don't want temporary files to
1573  * outlive the transaction that created them, so this should be false -- but
1574  * if you need "somewhat" temporary storage, this might be useful. In either
1575  * case, the file is removed when the File is explicitly closed.
1576  */
1577 File
1578 OpenTemporaryFile(bool interXact)
1579 {
1580  File file = 0;
1581 
1582  /*
1583  * Make sure the current resource owner has space for this File before we
1584  * open it, if we'll be registering it below.
1585  */
1586  if (!interXact)
1588 
1589  /*
1590  * If some temp tablespace(s) have been given to us, try to use the next
1591  * one. If a given tablespace can't be found, we silently fall back to
1592  * the database's default tablespace.
1593  *
1594  * BUT: if the temp file is slated to outlive the current transaction,
1595  * force it into the database's default tablespace, so that it will not
1596  * pose a threat to possible tablespace drop attempts.
1597  */
1598  if (numTempTableSpaces > 0 && !interXact)
1599  {
1600  Oid tblspcOid = GetNextTempTableSpace();
1601 
1602  if (OidIsValid(tblspcOid))
1603  file = OpenTemporaryFileInTablespace(tblspcOid, false);
1604  }
1605 
1606  /*
1607  * If not, or if tablespace is bad, create in database's default
1608  * tablespace. MyDatabaseTableSpace should normally be set before we get
1609  * here, but just in case it isn't, fall back to pg_default tablespace.
1610  */
1611  if (file <= 0)
1614  DEFAULTTABLESPACE_OID,
1615  true);
1616 
1617  /* Mark it for deletion at close and temporary file size limit */
1618  VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1619 
1620  /* Register it with the current resource owner */
1621  if (!interXact)
1622  RegisterTemporaryFile(file);
1623 
1624  return file;
1625 }
1626 
1627 /*
1628  * Return the path of the temp directory in a given tablespace.
1629  */
1630 void
1632 {
1633  /*
1634  * Identify the tempfile directory for this tablespace.
1635  *
1636  * If someone tries to specify pg_global, use pg_default instead.
1637  */
1638  if (tablespace == InvalidOid ||
1639  tablespace == DEFAULTTABLESPACE_OID ||
1640  tablespace == GLOBALTABLESPACE_OID)
1641  snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1642  else
1643  {
1644  /* All other tablespaces are accessed via symlinks */
1645  snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1646  tablespace, TABLESPACE_VERSION_DIRECTORY,
1648  }
1649 }
1650 
1651 /*
1652  * Open a temporary file in a specific tablespace.
1653  * Subroutine for OpenTemporaryFile, which see for details.
1654  */
1655 static File
1656 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1657 {
1658  char tempdirpath[MAXPGPATH];
1659  char tempfilepath[MAXPGPATH];
1660  File file;
1661 
1662  TempTablespacePath(tempdirpath, tblspcOid);
1663 
1664  /*
1665  * Generate a tempfile name that should be unique within the current
1666  * database instance.
1667  */
1668  snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1669  tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1670 
1671  /*
1672  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1673  * temp file that can be reused.
1674  */
1675  file = PathNameOpenFile(tempfilepath,
1676  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1677  if (file <= 0)
1678  {
1679  /*
1680  * We might need to create the tablespace's tempfile directory, if no
1681  * one has yet done so.
1682  *
1683  * Don't check for an error from MakePGDirectory; it could fail if
1684  * someone else just did the same thing. If it doesn't work then
1685  * we'll bomb out on the second create attempt, instead.
1686  */
1687  (void) MakePGDirectory(tempdirpath);
1688 
1689  file = PathNameOpenFile(tempfilepath,
1690  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1691  if (file <= 0 && rejectError)
1692  elog(ERROR, "could not create temporary file \"%s\": %m",
1693  tempfilepath);
1694  }
1695 
1696  return file;
1697 }
1698 
1699 
1700 /*
1701  * Create a new file. The directory containing it must already exist. Files
1702  * created this way are subject to temp_file_limit and are automatically
1703  * closed at end of transaction, but are not automatically deleted on close
1704  * because they are intended to be shared between cooperating backends.
1705  *
1706  * If the file is inside the top-level temporary directory, its name should
1707  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1708  * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1709  * inside a directory created with PathNameCreateTemporaryDir(), in which case
1710  * the prefix isn't needed.
1711  */
1712 File
1713 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1714 {
1715  File file;
1716 
1718 
1719  /*
1720  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1721  * temp file that can be reused.
1722  */
1723  file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1724  if (file <= 0)
1725  {
1726  if (error_on_failure)
1727  ereport(ERROR,
1729  errmsg("could not create temporary file \"%s\": %m",
1730  path)));
1731  else
1732  return file;
1733  }
1734 
1735  /* Mark it for temp_file_limit accounting. */
1736  VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1737 
1738  /* Register it for automatic close. */
1739  RegisterTemporaryFile(file);
1740 
1741  return file;
1742 }
1743 
1744 /*
1745  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1746  * another backend. Files opened this way don't count against the
1747  * temp_file_limit of the caller, are automatically closed at the end of the
1748  * transaction but are not deleted on close.
1749  */
1750 File
1751 PathNameOpenTemporaryFile(const char *path, int mode)
1752 {
1753  File file;
1754 
1756 
1757  file = PathNameOpenFile(path, mode | PG_BINARY);
1758 
1759  /* If no such file, then we don't raise an error. */
1760  if (file <= 0 && errno != ENOENT)
1761  ereport(ERROR,
1763  errmsg("could not open temporary file \"%s\": %m",
1764  path)));
1765 
1766  if (file > 0)
1767  {
1768  /* Register it for automatic close. */
1769  RegisterTemporaryFile(file);
1770  }
1771 
1772  return file;
1773 }
1774 
1775 /*
1776  * Delete a file by pathname. Return true if the file existed, false if
1777  * didn't.
1778  */
1779 bool
1780 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1781 {
1782  struct stat filestats;
1783  int stat_errno;
1784 
1785  /* Get the final size for pgstat reporting. */
1786  if (stat(path, &filestats) != 0)
1787  stat_errno = errno;
1788  else
1789  stat_errno = 0;
1790 
1791  /*
1792  * Unlike FileClose's automatic file deletion code, we tolerate
1793  * non-existence to support BufFileDeleteShared which doesn't know how
1794  * many segments it has to delete until it runs out.
1795  */
1796  if (stat_errno == ENOENT)
1797  return false;
1798 
1799  if (unlink(path) < 0)
1800  {
1801  if (errno != ENOENT)
1802  ereport(error_on_failure ? ERROR : LOG,
1804  errmsg("could not unlink temporary file \"%s\": %m",
1805  path)));
1806  return false;
1807  }
1808 
1809  if (stat_errno == 0)
1810  ReportTemporaryFileUsage(path, filestats.st_size);
1811  else
1812  {
1813  errno = stat_errno;
1814  ereport(LOG,
1816  errmsg("could not stat file \"%s\": %m", path)));
1817  }
1818 
1819  return true;
1820 }
1821 
1822 /*
1823  * close a file when done with it
1824  */
1825 void
1827 {
1828  Vfd *vfdP;
1829 
1830  Assert(FileIsValid(file));
1831 
1832  DO_DB(elog(LOG, "FileClose: %d (%s)",
1833  file, VfdCache[file].fileName));
1834 
1835  vfdP = &VfdCache[file];
1836 
1837  if (!FileIsNotOpen(file))
1838  {
1839  /* close the file */
1840  if (close(vfdP->fd) != 0)
1841  {
1842  /*
1843  * We may need to panic on failure to close non-temporary files;
1844  * see LruDelete.
1845  */
1847  "could not close file \"%s\": %m", vfdP->fileName);
1848  }
1849 
1850  --nfile;
1851  vfdP->fd = VFD_CLOSED;
1852 
1853  /* remove the file from the lru ring */
1854  Delete(file);
1855  }
1856 
1857  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1858  {
1859  /* Subtract its size from current usage (do first in case of error) */
1860  temporary_files_size -= vfdP->fileSize;
1861  vfdP->fileSize = 0;
1862  }
1863 
1864  /*
1865  * Delete the file if it was temporary, and make a log entry if wanted
1866  */
1867  if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
1868  {
1869  struct stat filestats;
1870  int stat_errno;
1871 
1872  /*
1873  * If we get an error, as could happen within the ereport/elog calls,
1874  * we'll come right back here during transaction abort. Reset the
1875  * flag to ensure that we can't get into an infinite loop. This code
1876  * is arranged to ensure that the worst-case consequence is failing to
1877  * emit log message(s), not failing to attempt the unlink.
1878  */
1879  vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
1880 
1881 
1882  /* first try the stat() */
1883  if (stat(vfdP->fileName, &filestats))
1884  stat_errno = errno;
1885  else
1886  stat_errno = 0;
1887 
1888  /* in any case do the unlink */
1889  if (unlink(vfdP->fileName))
1890  elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
1891 
1892  /* and last report the stat results */
1893  if (stat_errno == 0)
1894  ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
1895  else
1896  {
1897  errno = stat_errno;
1898  elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
1899  }
1900  }
1901 
1902  /* Unregister it from the resource owner */
1903  if (vfdP->resowner)
1904  ResourceOwnerForgetFile(vfdP->resowner, file);
1905 
1906  /*
1907  * Return the Vfd slot to the free list
1908  */
1909  FreeVfd(file);
1910 }
1911 
1912 /*
1913  * FilePrefetch - initiate asynchronous read of a given range of the file.
1914  *
1915  * Currently the only implementation of this function is using posix_fadvise
1916  * which is the simplest standardized interface that accomplishes this.
1917  * We could add an implementation using libaio in the future; but note that
1918  * this API is inappropriate for libaio, which wants to have a buffer provided
1919  * to read into.
1920  */
1921 int
1922 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1923 {
1924 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1925  int returnCode;
1926 
1927  Assert(FileIsValid(file));
1928 
1929  DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1930  file, VfdCache[file].fileName,
1931  (int64) offset, amount));
1932 
1933  returnCode = FileAccess(file);
1934  if (returnCode < 0)
1935  return returnCode;
1936 
1937  pgstat_report_wait_start(wait_event_info);
1938  returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1939  POSIX_FADV_WILLNEED);
1941 
1942  return returnCode;
1943 #else
1944  Assert(FileIsValid(file));
1945  return 0;
1946 #endif
1947 }
1948 
1949 void
1950 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1951 {
1952  int returnCode;
1953 
1954  Assert(FileIsValid(file));
1955 
1956  DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1957  file, VfdCache[file].fileName,
1958  (int64) offset, (int64) nbytes));
1959 
1960  if (nbytes <= 0)
1961  return;
1962 
1963  returnCode = FileAccess(file);
1964  if (returnCode < 0)
1965  return;
1966 
1967  pgstat_report_wait_start(wait_event_info);
1968  pg_flush_data(VfdCache[file].fd, offset, nbytes);
1970 }
1971 
1972 int
1973 FileRead(File file, char *buffer, int amount, off_t offset,
1974  uint32 wait_event_info)
1975 {
1976  int returnCode;
1977  Vfd *vfdP;
1978 
1979  Assert(FileIsValid(file));
1980 
1981  DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1982  file, VfdCache[file].fileName,
1983  (int64) offset,
1984  amount, buffer));
1985 
1986  returnCode = FileAccess(file);
1987  if (returnCode < 0)
1988  return returnCode;
1989 
1990  vfdP = &VfdCache[file];
1991 
1992 retry:
1993  pgstat_report_wait_start(wait_event_info);
1994  returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
1996 
1997  if (returnCode < 0)
1998  {
1999  /*
2000  * Windows may run out of kernel buffers and return "Insufficient
2001  * system resources" error. Wait a bit and retry to solve it.
2002  *
2003  * It is rumored that EINTR is also possible on some Unix filesystems,
2004  * in which case immediate retry is indicated.
2005  */
2006 #ifdef WIN32
2007  DWORD error = GetLastError();
2008 
2009  switch (error)
2010  {
2011  case ERROR_NO_SYSTEM_RESOURCES:
2012  pg_usleep(1000L);
2013  errno = EINTR;
2014  break;
2015  default:
2016  _dosmaperr(error);
2017  break;
2018  }
2019 #endif
2020  /* OK to retry if interrupted */
2021  if (errno == EINTR)
2022  goto retry;
2023  }
2024 
2025  return returnCode;
2026 }
2027 
2028 int
2029 FileWrite(File file, char *buffer, int amount, off_t offset,
2030  uint32 wait_event_info)
2031 {
2032  int returnCode;
2033  Vfd *vfdP;
2034 
2035  Assert(FileIsValid(file));
2036 
2037  DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
2038  file, VfdCache[file].fileName,
2039  (int64) offset,
2040  amount, buffer));
2041 
2042  returnCode = FileAccess(file);
2043  if (returnCode < 0)
2044  return returnCode;
2045 
2046  vfdP = &VfdCache[file];
2047 
2048  /*
2049  * If enforcing temp_file_limit and it's a temp file, check to see if the
2050  * write would overrun temp_file_limit, and throw error if so. Note: it's
2051  * really a modularity violation to throw error here; we should set errno
2052  * and return -1. However, there's no way to report a suitable error
2053  * message if we do that. All current callers would just throw error
2054  * immediately anyway, so this is safe at present.
2055  */
2056  if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2057  {
2058  off_t past_write = offset + amount;
2059 
2060  if (past_write > vfdP->fileSize)
2061  {
2062  uint64 newTotal = temporary_files_size;
2063 
2064  newTotal += past_write - vfdP->fileSize;
2065  if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2066  ereport(ERROR,
2067  (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2068  errmsg("temporary file size exceeds temp_file_limit (%dkB)",
2069  temp_file_limit)));
2070  }
2071  }
2072 
2073 retry:
2074  errno = 0;
2075  pgstat_report_wait_start(wait_event_info);
2076  returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
2078 
2079  /* if write didn't set errno, assume problem is no disk space */
2080  if (returnCode != amount && errno == 0)
2081  errno = ENOSPC;
2082 
2083  if (returnCode >= 0)
2084  {
2085  /*
2086  * Maintain fileSize and temporary_files_size if it's a temp file.
2087  */
2088  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2089  {
2090  off_t past_write = offset + amount;
2091 
2092  if (past_write > vfdP->fileSize)
2093  {
2094  temporary_files_size += past_write - vfdP->fileSize;
2095  vfdP->fileSize = past_write;
2096  }
2097  }
2098  }
2099  else
2100  {
2101  /*
2102  * See comments in FileRead()
2103  */
2104 #ifdef WIN32
2105  DWORD error = GetLastError();
2106 
2107  switch (error)
2108  {
2109  case ERROR_NO_SYSTEM_RESOURCES:
2110  pg_usleep(1000L);
2111  errno = EINTR;
2112  break;
2113  default:
2114  _dosmaperr(error);
2115  break;
2116  }
2117 #endif
2118  /* OK to retry if interrupted */
2119  if (errno == EINTR)
2120  goto retry;
2121  }
2122 
2123  return returnCode;
2124 }
2125 
2126 int
2127 FileSync(File file, uint32 wait_event_info)
2128 {
2129  int returnCode;
2130 
2131  Assert(FileIsValid(file));
2132 
2133  DO_DB(elog(LOG, "FileSync: %d (%s)",
2134  file, VfdCache[file].fileName));
2135 
2136  returnCode = FileAccess(file);
2137  if (returnCode < 0)
2138  return returnCode;
2139 
2140  pgstat_report_wait_start(wait_event_info);
2141  returnCode = pg_fsync(VfdCache[file].fd);
2143 
2144  return returnCode;
2145 }
2146 
2147 off_t
2149 {
2150  Assert(FileIsValid(file));
2151 
2152  DO_DB(elog(LOG, "FileSize %d (%s)",
2153  file, VfdCache[file].fileName));
2154 
2155  if (FileIsNotOpen(file))
2156  {
2157  if (FileAccess(file) < 0)
2158  return (off_t) -1;
2159  }
2160 
2161  return lseek(VfdCache[file].fd, 0, SEEK_END);
2162 }
2163 
2164 int
2165 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2166 {
2167  int returnCode;
2168 
2169  Assert(FileIsValid(file));
2170 
2171  DO_DB(elog(LOG, "FileTruncate %d (%s)",
2172  file, VfdCache[file].fileName));
2173 
2174  returnCode = FileAccess(file);
2175  if (returnCode < 0)
2176  return returnCode;
2177 
2178  pgstat_report_wait_start(wait_event_info);
2179  returnCode = ftruncate(VfdCache[file].fd, offset);
2181 
2182  if (returnCode == 0 && VfdCache[file].fileSize > offset)
2183  {
2184  /* adjust our state for truncation of a temp file */
2185  Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2186  temporary_files_size -= VfdCache[file].fileSize - offset;
2187  VfdCache[file].fileSize = offset;
2188  }
2189 
2190  return returnCode;
2191 }
2192 
2193 /*
2194  * Return the pathname associated with an open file.
2195  *
2196  * The returned string points to an internal buffer, which is valid until
2197  * the file is closed.
2198  */
2199 char *
2201 {
2202  Assert(FileIsValid(file));
2203 
2204  return VfdCache[file].fileName;
2205 }
2206 
2207 /*
2208  * Return the raw file descriptor of an opened file.
2209  *
2210  * The returned file descriptor will be valid until the file is closed, but
2211  * there are a lot of things that can make that happen. So the caller should
2212  * be careful not to do much of anything else before it finishes using the
2213  * returned file descriptor.
2214  */
2215 int
2217 {
2218  Assert(FileIsValid(file));
2219  return VfdCache[file].fd;
2220 }
2221 
2222 /*
2223  * FileGetRawFlags - returns the file flags on open(2)
2224  */
2225 int
2227 {
2228  Assert(FileIsValid(file));
2229  return VfdCache[file].fileFlags;
2230 }
2231 
2232 /*
2233  * FileGetRawMode - returns the mode bitmask passed to open(2)
2234  */
2235 mode_t
2237 {
2238  Assert(FileIsValid(file));
2239  return VfdCache[file].fileMode;
2240 }
2241 
2242 /*
2243  * Make room for another allocatedDescs[] array entry if needed and possible.
2244  * Returns true if an array element is available.
2245  */
2246 static bool
2248 {
2249  AllocateDesc *newDescs;
2250  int newMax;
2251 
2252  /* Quick out if array already has a free slot. */
2254  return true;
2255 
2256  /*
2257  * If the array hasn't yet been created in the current process, initialize
2258  * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2259  * we will ever need, anyway. We don't want to look at max_safe_fds
2260  * immediately because set_max_safe_fds() may not have run yet.
2261  */
2262  if (allocatedDescs == NULL)
2263  {
2264  newMax = FD_MINFREE / 3;
2265  newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2266  /* Out of memory already? Treat as fatal error. */
2267  if (newDescs == NULL)
2268  ereport(ERROR,
2269  (errcode(ERRCODE_OUT_OF_MEMORY),
2270  errmsg("out of memory")));
2271  allocatedDescs = newDescs;
2272  maxAllocatedDescs = newMax;
2273  return true;
2274  }
2275 
2276  /*
2277  * Consider enlarging the array beyond the initial allocation used above.
2278  * By the time this happens, max_safe_fds should be known accurately.
2279  *
2280  * We mustn't let allocated descriptors hog all the available FDs, and in
2281  * practice we'd better leave a reasonable number of FDs for VFD use. So
2282  * set the maximum to max_safe_fds / 3. (This should certainly be at
2283  * least as large as the initial size, FD_MINFREE / 3, so we aren't
2284  * tightening the restriction here.) Recall that "external" FDs are
2285  * allowed to consume another third of max_safe_fds.
2286  */
2287  newMax = max_safe_fds / 3;
2288  if (newMax > maxAllocatedDescs)
2289  {
2290  newDescs = (AllocateDesc *) realloc(allocatedDescs,
2291  newMax * sizeof(AllocateDesc));
2292  /* Treat out-of-memory as a non-fatal error. */
2293  if (newDescs == NULL)
2294  return false;
2295  allocatedDescs = newDescs;
2296  maxAllocatedDescs = newMax;
2297  return true;
2298  }
2299 
2300  /* Can't enlarge allocatedDescs[] any more. */
2301  return false;
2302 }
2303 
2304 /*
2305  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2306  * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2307  * necessary to open the file. When done, call FreeFile rather than fclose.
2308  *
2309  * Note that files that will be open for any significant length of time
2310  * should NOT be handled this way, since they cannot share kernel file
2311  * descriptors with other files; there is grave risk of running out of FDs
2312  * if anyone locks down too many FDs. Most callers of this routine are
2313  * simply reading a config file that they will read and close immediately.
2314  *
2315  * fd.c will automatically close all files opened with AllocateFile at
2316  * transaction commit or abort; this prevents FD leakage if a routine
2317  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2318  *
2319  * Ideally this should be the *only* direct call of fopen() in the backend.
2320  */
2321 FILE *
2322 AllocateFile(const char *name, const char *mode)
2323 {
2324  FILE *file;
2325 
2326  DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2327  numAllocatedDescs, name));
2328 
2329  /* Can we allocate another non-virtual FD? */
2330  if (!reserveAllocatedDesc())
2331  ereport(ERROR,
2332  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2333  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2334  maxAllocatedDescs, name)));
2335 
2336  /* Close excess kernel FDs. */
2337  ReleaseLruFiles();
2338 
2339 TryAgain:
2340  if ((file = fopen(name, mode)) != NULL)
2341  {
2342  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2343 
2344  desc->kind = AllocateDescFile;
2345  desc->desc.file = file;
2348  return desc->desc.file;
2349  }
2350 
2351  if (errno == EMFILE || errno == ENFILE)
2352  {
2353  int save_errno = errno;
2354 
2355  ereport(LOG,
2356  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2357  errmsg("out of file descriptors: %m; release and retry")));
2358  errno = 0;
2359  if (ReleaseLruFile())
2360  goto TryAgain;
2361  errno = save_errno;
2362  }
2363 
2364  return NULL;
2365 }
2366 
2367 /*
2368  * Open a file with OpenTransientFilePerm() and pass default file mode for
2369  * the fileMode parameter.
2370  */
2371 int
2373 {
2374  return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2375 }
2376 
2377 /*
2378  * Like AllocateFile, but returns an unbuffered fd like open(2)
2379  */
2380 int
2382 {
2383  int fd;
2384 
2385  DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2386  numAllocatedDescs, fileName));
2387 
2388  /* Can we allocate another non-virtual FD? */
2389  if (!reserveAllocatedDesc())
2390  ereport(ERROR,
2391  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2392  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2393  maxAllocatedDescs, fileName)));
2394 
2395  /* Close excess kernel FDs. */
2396  ReleaseLruFiles();
2397 
2398  fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2399 
2400  if (fd >= 0)
2401  {
2402  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2403 
2404  desc->kind = AllocateDescRawFD;
2405  desc->desc.fd = fd;
2408 
2409  return fd;
2410  }
2411 
2412  return -1; /* failure */
2413 }
2414 
2415 /*
2416  * Routines that want to initiate a pipe stream should use OpenPipeStream
2417  * rather than plain popen(). This lets fd.c deal with freeing FDs if
2418  * necessary. When done, call ClosePipeStream rather than pclose.
2419  *
2420  * This function also ensures that the popen'd program is run with default
2421  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2422  * uses. This ensures desirable response to, eg, closing a read pipe early.
2423  */
2424 FILE *
2425 OpenPipeStream(const char *command, const char *mode)
2426 {
2427  FILE *file;
2428  int save_errno;
2429 
2430  DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2431  numAllocatedDescs, command));
2432 
2433  /* Can we allocate another non-virtual FD? */
2434  if (!reserveAllocatedDesc())
2435  ereport(ERROR,
2436  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2437  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2438  maxAllocatedDescs, command)));
2439 
2440  /* Close excess kernel FDs. */
2441  ReleaseLruFiles();
2442 
2443 TryAgain:
2444  fflush(stdout);
2445  fflush(stderr);
2447  errno = 0;
2448  file = popen(command, mode);
2449  save_errno = errno;
2451  errno = save_errno;
2452  if (file != NULL)
2453  {
2454  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2455 
2456  desc->kind = AllocateDescPipe;
2457  desc->desc.file = file;
2460  return desc->desc.file;
2461  }
2462 
2463  if (errno == EMFILE || errno == ENFILE)
2464  {
2465  ereport(LOG,
2466  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2467  errmsg("out of file descriptors: %m; release and retry")));
2468  if (ReleaseLruFile())
2469  goto TryAgain;
2470  errno = save_errno;
2471  }
2472 
2473  return NULL;
2474 }
2475 
2476 /*
2477  * Free an AllocateDesc of any type.
2478  *
2479  * The argument *must* point into the allocatedDescs[] array.
2480  */
2481 static int
2483 {
2484  int result;
2485 
2486  /* Close the underlying object */
2487  switch (desc->kind)
2488  {
2489  case AllocateDescFile:
2490  result = fclose(desc->desc.file);
2491  break;
2492  case AllocateDescPipe:
2493  result = pclose(desc->desc.file);
2494  break;
2495  case AllocateDescDir:
2496  result = closedir(desc->desc.dir);
2497  break;
2498  case AllocateDescRawFD:
2499  result = close(desc->desc.fd);
2500  break;
2501  default:
2502  elog(ERROR, "AllocateDesc kind not recognized");
2503  result = 0; /* keep compiler quiet */
2504  break;
2505  }
2506 
2507  /* Compact storage in the allocatedDescs array */
2509  *desc = allocatedDescs[numAllocatedDescs];
2510 
2511  return result;
2512 }
2513 
2514 /*
2515  * Close a file returned by AllocateFile.
2516  *
2517  * Note we do not check fclose's return value --- it is up to the caller
2518  * to handle close errors.
2519  */
2520 int
2521 FreeFile(FILE *file)
2522 {
2523  int i;
2524 
2525  DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2526 
2527  /* Remove file from list of allocated files, if it's present */
2528  for (i = numAllocatedDescs; --i >= 0;)
2529  {
2530  AllocateDesc *desc = &allocatedDescs[i];
2531 
2532  if (desc->kind == AllocateDescFile && desc->desc.file == file)
2533  return FreeDesc(desc);
2534  }
2535 
2536  /* Only get here if someone passes us a file not in allocatedDescs */
2537  elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2538 
2539  return fclose(file);
2540 }
2541 
2542 /*
2543  * Close a file returned by OpenTransientFile.
2544  *
2545  * Note we do not check close's return value --- it is up to the caller
2546  * to handle close errors.
2547  */
2548 int
2550 {
2551  int i;
2552 
2553  DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2554 
2555  /* Remove fd from list of allocated files, if it's present */
2556  for (i = numAllocatedDescs; --i >= 0;)
2557  {
2558  AllocateDesc *desc = &allocatedDescs[i];
2559 
2560  if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2561  return FreeDesc(desc);
2562  }
2563 
2564  /* Only get here if someone passes us a file not in allocatedDescs */
2565  elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2566 
2567  return close(fd);
2568 }
2569 
2570 /*
2571  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2572  * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2573  * necessary to open the directory, and with closing it after an elog.
2574  * When done, call FreeDir rather than closedir.
2575  *
2576  * Returns NULL, with errno set, on failure. Note that failure detection
2577  * is commonly left to the following call of ReadDir or ReadDirExtended;
2578  * see the comments for ReadDir.
2579  *
2580  * Ideally this should be the *only* direct call of opendir() in the backend.
2581  */
2582 DIR *
2583 AllocateDir(const char *dirname)
2584 {
2585  DIR *dir;
2586 
2587  DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2588  numAllocatedDescs, dirname));
2589 
2590  /* Can we allocate another non-virtual FD? */
2591  if (!reserveAllocatedDesc())
2592  ereport(ERROR,
2593  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2594  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2595  maxAllocatedDescs, dirname)));
2596 
2597  /* Close excess kernel FDs. */
2598  ReleaseLruFiles();
2599 
2600 TryAgain:
2601  if ((dir = opendir(dirname)) != NULL)
2602  {
2603  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2604 
2605  desc->kind = AllocateDescDir;
2606  desc->desc.dir = dir;
2609  return desc->desc.dir;
2610  }
2611 
2612  if (errno == EMFILE || errno == ENFILE)
2613  {
2614  int save_errno = errno;
2615 
2616  ereport(LOG,
2617  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2618  errmsg("out of file descriptors: %m; release and retry")));
2619  errno = 0;
2620  if (ReleaseLruFile())
2621  goto TryAgain;
2622  errno = save_errno;
2623  }
2624 
2625  return NULL;
2626 }
2627 
2628 /*
2629  * Read a directory opened with AllocateDir, ereport'ing any error.
2630  *
2631  * This is easier to use than raw readdir() since it takes care of some
2632  * otherwise rather tedious and error-prone manipulation of errno. Also,
2633  * if you are happy with a generic error message for AllocateDir failure,
2634  * you can just do
2635  *
2636  * dir = AllocateDir(path);
2637  * while ((dirent = ReadDir(dir, path)) != NULL)
2638  * process dirent;
2639  * FreeDir(dir);
2640  *
2641  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2642  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2643  * use this shortcut.)
2644  *
2645  * The pathname passed to AllocateDir must be passed to this routine too,
2646  * but it is only used for error reporting.
2647  */
2648 struct dirent *
2649 ReadDir(DIR *dir, const char *dirname)
2650 {
2651  return ReadDirExtended(dir, dirname, ERROR);
2652 }
2653 
2654 /*
2655  * Alternate version of ReadDir that allows caller to specify the elevel
2656  * for any error report (whether it's reporting an initial failure of
2657  * AllocateDir or a subsequent directory read failure).
2658  *
2659  * If elevel < ERROR, returns NULL after any error. With the normal coding
2660  * pattern, this will result in falling out of the loop immediately as
2661  * though the directory contained no (more) entries.
2662  */
2663 struct dirent *
2664 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2665 {
2666  struct dirent *dent;
2667 
2668  /* Give a generic message for AllocateDir failure, if caller didn't */
2669  if (dir == NULL)
2670  {
2671  ereport(elevel,
2673  errmsg("could not open directory \"%s\": %m",
2674  dirname)));
2675  return NULL;
2676  }
2677 
2678  errno = 0;
2679  if ((dent = readdir(dir)) != NULL)
2680  return dent;
2681 
2682  if (errno)
2683  ereport(elevel,
2685  errmsg("could not read directory \"%s\": %m",
2686  dirname)));
2687  return NULL;
2688 }
2689 
2690 /*
2691  * Close a directory opened with AllocateDir.
2692  *
2693  * Returns closedir's return value (with errno set if it's not 0).
2694  * Note we do not check the return value --- it is up to the caller
2695  * to handle close errors if wanted.
2696  *
2697  * Does nothing if dir == NULL; we assume that directory open failure was
2698  * already reported if desired.
2699  */
2700 int
2702 {
2703  int i;
2704 
2705  /* Nothing to do if AllocateDir failed */
2706  if (dir == NULL)
2707  return 0;
2708 
2709  DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2710 
2711  /* Remove dir from list of allocated dirs, if it's present */
2712  for (i = numAllocatedDescs; --i >= 0;)
2713  {
2714  AllocateDesc *desc = &allocatedDescs[i];
2715 
2716  if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2717  return FreeDesc(desc);
2718  }
2719 
2720  /* Only get here if someone passes us a dir not in allocatedDescs */
2721  elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2722 
2723  return closedir(dir);
2724 }
2725 
2726 
2727 /*
2728  * Close a pipe stream returned by OpenPipeStream.
2729  */
2730 int
2731 ClosePipeStream(FILE *file)
2732 {
2733  int i;
2734 
2735  DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2736 
2737  /* Remove file from list of allocated files, if it's present */
2738  for (i = numAllocatedDescs; --i >= 0;)
2739  {
2740  AllocateDesc *desc = &allocatedDescs[i];
2741 
2742  if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2743  return FreeDesc(desc);
2744  }
2745 
2746  /* Only get here if someone passes us a file not in allocatedDescs */
2747  elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2748 
2749  return pclose(file);
2750 }
2751 
2752 /*
2753  * closeAllVfds
2754  *
2755  * Force all VFDs into the physically-closed state, so that the fewest
2756  * possible number of kernel file descriptors are in use. There is no
2757  * change in the logical state of the VFDs.
2758  */
2759 void
2761 {
2762  Index i;
2763 
2764  if (SizeVfdCache > 0)
2765  {
2766  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2767  for (i = 1; i < SizeVfdCache; i++)
2768  {
2769  if (!FileIsNotOpen(i))
2770  LruDelete(i);
2771  }
2772  }
2773 }
2774 
2775 
2776 /*
2777  * SetTempTablespaces
2778  *
2779  * Define a list (actually an array) of OIDs of tablespaces to use for
2780  * temporary files. This list will be used until end of transaction,
2781  * unless this function is called again before then. It is caller's
2782  * responsibility that the passed-in array has adequate lifespan (typically
2783  * it'd be allocated in TopTransactionContext).
2784  *
2785  * Some entries of the array may be InvalidOid, indicating that the current
2786  * database's default tablespace should be used.
2787  */
2788 void
2789 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2790 {
2791  Assert(numSpaces >= 0);
2792  tempTableSpaces = tableSpaces;
2793  numTempTableSpaces = numSpaces;
2794 
2795  /*
2796  * Select a random starting point in the list. This is to minimize
2797  * conflicts between backends that are most likely sharing the same list
2798  * of temp tablespaces. Note that if we create multiple temp files in the
2799  * same transaction, we'll advance circularly through the list --- this
2800  * ensures that large temporary sort files are nicely spread across all
2801  * available tablespaces.
2802  */
2803  if (numSpaces > 1)
2804  nextTempTableSpace = random() % numSpaces;
2805  else
2806  nextTempTableSpace = 0;
2807 }
2808 
2809 /*
2810  * TempTablespacesAreSet
2811  *
2812  * Returns true if SetTempTablespaces has been called in current transaction.
2813  * (This is just so that tablespaces.c doesn't need its own per-transaction
2814  * state.)
2815  */
2816 bool
2818 {
2819  return (numTempTableSpaces >= 0);
2820 }
2821 
2822 /*
2823  * GetTempTablespaces
2824  *
2825  * Populate an array with the OIDs of the tablespaces that should be used for
2826  * temporary files. (Some entries may be InvalidOid, indicating that the
2827  * current database's default tablespace should be used.) At most numSpaces
2828  * entries will be filled.
2829  * Returns the number of OIDs that were copied into the output array.
2830  */
2831 int
2832 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2833 {
2834  int i;
2835 
2837  for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2838  tableSpaces[i] = tempTableSpaces[i];
2839 
2840  return i;
2841 }
2842 
2843 /*
2844  * GetNextTempTableSpace
2845  *
2846  * Select the next temp tablespace to use. A result of InvalidOid means
2847  * to use the current database's default tablespace.
2848  */
2849 Oid
2851 {
2852  if (numTempTableSpaces > 0)
2853  {
2854  /* Advance nextTempTableSpace counter with wraparound */
2856  nextTempTableSpace = 0;
2858  }
2859  return InvalidOid;
2860 }
2861 
2862 
2863 /*
2864  * AtEOSubXact_Files
2865  *
2866  * Take care of subtransaction commit/abort. At abort, we close temp files
2867  * that the subtransaction may have opened. At commit, we reassign the
2868  * files that were opened to the parent subtransaction.
2869  */
2870 void
2871 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2872  SubTransactionId parentSubid)
2873 {
2874  Index i;
2875 
2876  for (i = 0; i < numAllocatedDescs; i++)
2877  {
2878  if (allocatedDescs[i].create_subid == mySubid)
2879  {
2880  if (isCommit)
2881  allocatedDescs[i].create_subid = parentSubid;
2882  else
2883  {
2884  /* have to recheck the item after FreeDesc (ugly) */
2885  FreeDesc(&allocatedDescs[i--]);
2886  }
2887  }
2888  }
2889 }
2890 
2891 /*
2892  * AtEOXact_Files
2893  *
2894  * This routine is called during transaction commit or abort. All still-open
2895  * per-transaction temporary file VFDs are closed, which also causes the
2896  * underlying files to be deleted (although they should've been closed already
2897  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
2898  * closed. We also forget any transaction-local temp tablespace list.
2899  *
2900  * The isCommit flag is used only to decide whether to emit warnings about
2901  * unclosed files.
2902  */
2903 void
2904 AtEOXact_Files(bool isCommit)
2905 {
2906  CleanupTempFiles(isCommit, false);
2907  tempTableSpaces = NULL;
2908  numTempTableSpaces = -1;
2909 }
2910 
2911 /*
2912  * AtProcExit_Files
2913  *
2914  * on_proc_exit hook to clean up temp files during backend shutdown.
2915  * Here, we want to clean up *all* temp files including interXact ones.
2916  */
2917 static void
2919 {
2920  CleanupTempFiles(false, true);
2921 }
2922 
2923 /*
2924  * Close temporary files and delete their underlying files.
2925  *
2926  * isCommit: if true, this is normal transaction commit, and we don't
2927  * expect any remaining files; warn if there are some.
2928  *
2929  * isProcExit: if true, this is being called as the backend process is
2930  * exiting. If that's the case, we should remove all temporary files; if
2931  * that's not the case, we are being called for transaction commit/abort
2932  * and should only remove transaction-local temp files. In either case,
2933  * also clean up "allocated" stdio files, dirs and fds.
2934  */
2935 static void
2936 CleanupTempFiles(bool isCommit, bool isProcExit)
2937 {
2938  Index i;
2939 
2940  /*
2941  * Careful here: at proc_exit we need extra cleanup, not just
2942  * xact_temporary files.
2943  */
2944  if (isProcExit || have_xact_temporary_files)
2945  {
2946  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2947  for (i = 1; i < SizeVfdCache; i++)
2948  {
2949  unsigned short fdstate = VfdCache[i].fdstate;
2950 
2951  if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
2952  VfdCache[i].fileName != NULL)
2953  {
2954  /*
2955  * If we're in the process of exiting a backend process, close
2956  * all temporary files. Otherwise, only close temporary files
2957  * local to the current transaction. They should be closed by
2958  * the ResourceOwner mechanism already, so this is just a
2959  * debugging cross-check.
2960  */
2961  if (isProcExit)
2962  FileClose(i);
2963  else if (fdstate & FD_CLOSE_AT_EOXACT)
2964  {
2965  elog(WARNING,
2966  "temporary file %s not closed at end-of-transaction",
2967  VfdCache[i].fileName);
2968  FileClose(i);
2969  }
2970  }
2971  }
2972 
2973  have_xact_temporary_files = false;
2974  }
2975 
2976  /* Complain if any allocated files remain open at commit. */
2977  if (isCommit && numAllocatedDescs > 0)
2978  elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
2980 
2981  /* Clean up "allocated" stdio files, dirs and fds. */
2982  while (numAllocatedDescs > 0)
2983  FreeDesc(&allocatedDescs[0]);
2984 }
2985 
2986 
2987 /*
2988  * Remove temporary and temporary relation files left over from a prior
2989  * postmaster session
2990  *
2991  * This should be called during postmaster startup. It will forcibly
2992  * remove any leftover files created by OpenTemporaryFile and any leftover
2993  * temporary relation files created by mdcreate.
2994  *
2995  * NOTE: we could, but don't, call this during a post-backend-crash restart
2996  * cycle. The argument for not doing it is that someone might want to examine
2997  * the temp files for debugging purposes. This does however mean that
2998  * OpenTemporaryFile had better allow for collision with an existing temp
2999  * file name.
3000  *
3001  * NOTE: this function and its subroutines generally report syscall failures
3002  * with ereport(LOG) and keep going. Removing temp files is not so critical
3003  * that we should fail to start the database when we can't do it.
3004  */
3005 void
3007 {
3008  char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3009  DIR *spc_dir;
3010  struct dirent *spc_de;
3011 
3012  /*
3013  * First process temp files in pg_default ($PGDATA/base)
3014  */
3015  snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3016  RemovePgTempFilesInDir(temp_path, true, false);
3017  RemovePgTempRelationFiles("base");
3018 
3019  /*
3020  * Cycle through temp directories for all non-default tablespaces.
3021  */
3022  spc_dir = AllocateDir("pg_tblspc");
3023 
3024  while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
3025  {
3026  if (strcmp(spc_de->d_name, ".") == 0 ||
3027  strcmp(spc_de->d_name, "..") == 0)
3028  continue;
3029 
3030  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
3032  RemovePgTempFilesInDir(temp_path, true, false);
3033 
3034  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
3036  RemovePgTempRelationFiles(temp_path);
3037  }
3038 
3039  FreeDir(spc_dir);
3040 
3041  /*
3042  * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3043  * DataDir as well. However, that is *not* cleaned here because doing so
3044  * would create a race condition. It's done separately, earlier in
3045  * postmaster startup.
3046  */
3047 }
3048 
3049 /*
3050  * Process one pgsql_tmp directory for RemovePgTempFiles.
3051  *
3052  * If missing_ok is true, it's all right for the named directory to not exist.
3053  * Any other problem results in a LOG message. (missing_ok should be true at
3054  * the top level, since pgsql_tmp directories are not created until needed.)
3055  *
3056  * At the top level, this should be called with unlink_all = false, so that
3057  * only files matching the temporary name prefix will be unlinked. When
3058  * recursing it will be called with unlink_all = true to unlink everything
3059  * under a top-level temporary directory.
3060  *
3061  * (These two flags could be replaced by one, but it seems clearer to keep
3062  * them separate.)
3063  */
3064 void
3065 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3066 {
3067  DIR *temp_dir;
3068  struct dirent *temp_de;
3069  char rm_path[MAXPGPATH * 2];
3070 
3071  temp_dir = AllocateDir(tmpdirname);
3072 
3073  if (temp_dir == NULL && errno == ENOENT && missing_ok)
3074  return;
3075 
3076  while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3077  {
3078  if (strcmp(temp_de->d_name, ".") == 0 ||
3079  strcmp(temp_de->d_name, "..") == 0)
3080  continue;
3081 
3082  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3083  tmpdirname, temp_de->d_name);
3084 
3085  if (unlink_all ||
3086  strncmp(temp_de->d_name,
3088  strlen(PG_TEMP_FILE_PREFIX)) == 0)
3089  {
3090  struct stat statbuf;
3091 
3092  if (lstat(rm_path, &statbuf) < 0)
3093  {
3094  ereport(LOG,
3096  errmsg("could not stat file \"%s\": %m", rm_path)));
3097  continue;
3098  }
3099 
3100  if (S_ISDIR(statbuf.st_mode))
3101  {
3102  /* recursively remove contents, then directory itself */
3103  RemovePgTempFilesInDir(rm_path, false, true);
3104 
3105  if (rmdir(rm_path) < 0)
3106  ereport(LOG,
3108  errmsg("could not remove directory \"%s\": %m",
3109  rm_path)));
3110  }
3111  else
3112  {
3113  if (unlink(rm_path) < 0)
3114  ereport(LOG,
3116  errmsg("could not remove file \"%s\": %m",
3117  rm_path)));
3118  }
3119  }
3120  else
3121  ereport(LOG,
3122  (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3123  rm_path)));
3124  }
3125 
3126  FreeDir(temp_dir);
3127 }
3128 
3129 /* Process one tablespace directory, look for per-DB subdirectories */
3130 static void
3131 RemovePgTempRelationFiles(const char *tsdirname)
3132 {
3133  DIR *ts_dir;
3134  struct dirent *de;
3135  char dbspace_path[MAXPGPATH * 2];
3136 
3137  ts_dir = AllocateDir(tsdirname);
3138 
3139  while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3140  {
3141  /*
3142  * We're only interested in the per-database directories, which have
3143  * numeric names. Note that this code will also (properly) ignore "."
3144  * and "..".
3145  */
3146  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3147  continue;
3148 
3149  snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3150  tsdirname, de->d_name);
3151  RemovePgTempRelationFilesInDbspace(dbspace_path);
3152  }
3153 
3154  FreeDir(ts_dir);
3155 }
3156 
3157 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3158 static void
3159 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3160 {
3161  DIR *dbspace_dir;
3162  struct dirent *de;
3163  char rm_path[MAXPGPATH * 2];
3164 
3165  dbspace_dir = AllocateDir(dbspacedirname);
3166 
3167  while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3168  {
3169  if (!looks_like_temp_rel_name(de->d_name))
3170  continue;
3171 
3172  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3173  dbspacedirname, de->d_name);
3174 
3175  if (unlink(rm_path) < 0)
3176  ereport(LOG,
3178  errmsg("could not remove file \"%s\": %m",
3179  rm_path)));
3180  }
3181 
3182  FreeDir(dbspace_dir);
3183 }
3184 
/*
 * Does the file name look like that of a temporary relation file?
 *
 * Accepted forms are t<digits>_<digits> or t<digits>_<digits>_<forkname>,
 * either of which may be followed by .<digits> (a segment suffix).
 * Returns true only if the whole name matches.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	int			pos;
	int			savepos;

	/* Must start with "t". */
	if (name[0] != 't')
		return false;

	/* Followed by a non-empty string of digits and then an underscore. */
	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (pos == 1 || name[pos] != '_')
		return false;

	/* Followed by another nonempty string of digits. */
	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (savepos == pos)
		return false;

	/* We might have _forkname or .segment or both. */
	if (name[pos] == '_')
	{
		int			forkchar = forkname_chars(&name[pos + 1], NULL);

		if (forkchar <= 0)
			return false;
		/* skip the underscore plus the fork name */
		pos += forkchar + 1;
	}
	if (name[pos] == '.')
	{
		int			segchar;

		/* segchar counts the dot plus the digits that follow it */
		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
			;
		if (segchar <= 1)
			return false;
		pos += segchar;
	}

	/* Now we should be at the end. */
	if (name[pos] != '\0')
		return false;
	return true;
}
3233 
3234 
3235 /*
3236  * Issue fsync recursively on PGDATA and all its contents.
3237  *
3238  * We fsync regular files and directories wherever they are, but we
3239  * follow symlinks only for pg_wal and immediately under pg_tblspc.
3240  * Other symlinks are presumed to point at files we're not responsible
3241  * for fsyncing, and might not have privileges to write at all.
3242  *
3243  * Errors are logged but not considered fatal; that's because this is used
3244  * only during database startup, to deal with the possibility that there are
3245  * issued-but-unsynced writes pending against the data directory. We want to
3246  * ensure that such writes reach disk before anything that's done in the new
3247  * run. However, aborting on error would result in failure to start for
3248  * harmless cases such as read-only files in the data directory, and that's
3249  * not good either.
3250  *
3251  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3252  * rewriting all changes again during recovery.
3253  *
3254  * Note we assume we're chdir'd into PGDATA to begin with.
3255  */
3256 void
3258 {
3259  bool xlog_is_symlink;
3260 
3261  /* We can skip this whole thing if fsync is disabled. */
3262  if (!enableFsync)
3263  return;
3264 
3265  /*
3266  * If pg_wal is a symlink, we'll need to recurse into it separately,
3267  * because the first walkdir below will ignore it.
3268  */
3269  xlog_is_symlink = false;
3270 
3271 #ifndef WIN32
3272  {
3273  struct stat st;
3274 
3275  if (lstat("pg_wal", &st) < 0)
3276  ereport(LOG,
3278  errmsg("could not stat file \"%s\": %m",
3279  "pg_wal")));
3280  else if (S_ISLNK(st.st_mode))
3281  xlog_is_symlink = true;
3282  }
3283 #else
3284  if (pgwin32_is_junction("pg_wal"))
3285  xlog_is_symlink = true;
3286 #endif
3287 
3288  /*
3289  * If possible, hint to the kernel that we're soon going to fsync the data
3290  * directory and its contents. Errors in this step are even less
3291  * interesting than normal, so log them only at DEBUG1.
3292  */
3293 #ifdef PG_FLUSH_DATA_WORKS
3294  walkdir(".", pre_sync_fname, false, DEBUG1);
3295  if (xlog_is_symlink)
3296  walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3297  walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3298 #endif
3299 
3300  /*
3301  * Now we do the fsync()s in the same order.
3302  *
3303  * The main call ignores symlinks, so in addition to specially processing
3304  * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3305  * process_symlinks = true. Note that if there are any plain directories
3306  * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3307  * so we don't worry about optimizing it.
3308  */
3309  walkdir(".", datadir_fsync_fname, false, LOG);
3310  if (xlog_is_symlink)
3311  walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3312  walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3313 }
3314 
3315 /*
3316  * walkdir: recursively walk a directory, applying the action to each
3317  * regular file and directory (including the named directory itself).
3318  *
3319  * If process_symlinks is true, the action and recursion are also applied
3320  * to regular files and directories that are pointed to by symlinks in the
3321  * given directory; otherwise symlinks are ignored. Symlinks are always
3322  * ignored in subdirectories, ie we intentionally don't pass down the
3323  * process_symlinks flag to recursive calls.
3324  *
3325  * Errors are reported at level elevel, which might be ERROR or less.
3326  *
3327  * See also walkdir in file_utils.c, which is a frontend version of this
3328  * logic.
3329  */
3330 static void
3331 walkdir(const char *path,
3332  void (*action) (const char *fname, bool isdir, int elevel),
3333  bool process_symlinks,
3334  int elevel)
3335 {
3336  DIR *dir;
3337  struct dirent *de;
3338 
3339  dir = AllocateDir(path);
3340 
3341  while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3342  {
3343  char subpath[MAXPGPATH * 2];
3344 
3346 
3347  if (strcmp(de->d_name, ".") == 0 ||
3348  strcmp(de->d_name, "..") == 0)
3349  continue;
3350 
3351  snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3352 
3353  switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3354  {
3355  case PGFILETYPE_REG:
3356  (*action) (subpath, false, elevel);
3357  break;
3358  case PGFILETYPE_DIR:
3359  walkdir(subpath, action, false, elevel);
3360  break;
3361  default:
3362 
3363  /*
3364  * Errors are already reported directly by get_dirent_type(),
3365  * and any remaining symlinks and unknown file types are
3366  * ignored.
3367  */
3368  break;
3369  }
3370  }
3371 
3372  FreeDir(dir); /* we ignore any error here */
3373 
3374  /*
3375  * It's important to fsync the destination directory itself as individual
3376  * file fsyncs don't guarantee that the directory entry for the file is
3377  * synced. However, skip this if AllocateDir failed; the action function
3378  * might not be robust against that.
3379  */
3380  if (dir)
3381  (*action) (path, true, elevel);
3382 }
3383 
3384 
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Directories are skipped entirely.  Failure to open the file with EACCES
 * is silently ignored; any other open or close failure is reported at the
 * caller-specified elevel.  Nothing is returned to the caller.
 *
 * Only compiled when PG_FLUSH_DATA_WORKS is defined.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* Unreadable files are not worth a log entry. */
		if (errno == EACCES)
			return;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3427 
/*
 * fsync one file or directory, using the (fname, isdir, elevel) callback
 * signature that walkdir() expects.
 *
 * This is a thin adapter over fsync_fname_ext() that always passes
 * ignore_perm = true, so that errors about unreadable files are silently
 * ignored.  The result of fsync_fname_ext() is discarded.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3437 
/*
 * Remove a file or directory, tolerating the case where it is already gone.
 *
 * The signature matches the action callback taken by walkdir().
 *
 * For a directory, rmdir() is used and any failure other than ENOENT is
 * reported at elevel.  For a regular file, removal is delegated to
 * PathNameDeleteTemporaryFile() with error_on_failure = false; elevel is not
 * used on that path and the helper's result is ignored.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}
3454 
3455 /*
3456  * fsync_fname_ext -- Try to fsync a file or directory
3457  *
3458  * If ignore_perm is true, ignore errors upon trying to open unreadable
3459  * files. Logs other errors at a caller-specified level.
3460  *
3461  * Returns 0 if the operation succeeded, -1 otherwise.
3462  */
3463 int
3464 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3465 {
3466  int fd;
3467  int flags;
3468  int returncode;
3469 
3470  /*
3471  * Some OSs require directories to be opened read-only whereas other
3472  * systems don't allow us to fsync files opened read-only; so we need both
3473  * cases here. Using O_RDWR will cause us to fail to fsync files that are
3474  * not writable by our userid, but we assume that's OK.
3475  */
3476  flags = PG_BINARY;
3477  if (!isdir)
3478  flags |= O_RDWR;
3479  else
3480  flags |= O_RDONLY;
3481 
3482  fd = OpenTransientFile(fname, flags);
3483 
3484  /*
3485  * Some OSs don't allow us to open directories at all (Windows returns
3486  * EACCES), just ignore the error in that case. If desired also silently
3487  * ignoring errors about unreadable files. Log others.
3488  */
3489  if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3490  return 0;
3491  else if (fd < 0 && ignore_perm && errno == EACCES)
3492  return 0;
3493  else if (fd < 0)
3494  {
3495  ereport(elevel,
3497  errmsg("could not open file \"%s\": %m", fname)));
3498  return -1;
3499  }
3500 
3501  returncode = pg_fsync(fd);
3502 
3503  /*
3504  * Some OSes don't allow us to fsync directories at all, so we can ignore
3505  * those errors. Anything else needs to be logged.
3506  */
3507  if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3508  {
3509  int save_errno;
3510 
3511  /* close file upon error, might not be in transaction context */
3512  save_errno = errno;
3513  (void) CloseTransientFile(fd);
3514  errno = save_errno;
3515 
3516  ereport(elevel,
3518  errmsg("could not fsync file \"%s\": %m", fname)));
3519  return -1;
3520  }
3521 
3522  if (CloseTransientFile(fd) != 0)
3523  {
3524  ereport(elevel,
3526  errmsg("could not close file \"%s\": %m", fname)));
3527  return -1;
3528  }
3529 
3530  return 0;
3531 }
3532 
3533 /*
3534  * fsync_parent_path -- fsync the parent path of a file or directory
3535  *
3536  * This is aimed at making file operations persistent on disk in case of
3537  * an OS crash or power failure.
3538  */
3539 static int
3540 fsync_parent_path(const char *fname, int elevel)
3541 {
3542  char parentpath[MAXPGPATH];
3543 
3544  strlcpy(parentpath, fname, MAXPGPATH);
3545  get_parent_directory(parentpath);
3546 
3547  /*
3548  * get_parent_directory() returns an empty string if the input argument is
3549  * just a file name (see comments in path.c), so handle that as being the
3550  * current directory.
3551  */
3552  if (strlen(parentpath) == 0)
3553  strlcpy(parentpath, ".", MAXPGPATH);
3554 
3555  if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3556  return -1;
3557 
3558  return 0;
3559 }
3560 
3561 /*
3562  * Create a PostgreSQL data sub-directory
3563  *
3564  * The data directory itself, and most of its sub-directories, are created at
3565  * initdb time, but we do have some occasions when we create directories in
3566  * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3567  * make sure that those directories are created consistently. Today, that means
3568  * making sure that the created directory has the correct permissions, which is
3569  * what pg_dir_create_mode tracks for us.
3570  *
3571  * Note that we also set the umask() based on what we understand the correct
3572  * permissions to be (see file_perm.c).
3573  *
3574  * For permissions other than the default, mkdir() can be used directly, but
3575  * be sure to consider carefully such cases -- a sub-directory with incorrect
3576  * permissions in a PostgreSQL data directory could cause backups and other
3577  * processes to fail.
3578  */
3579 int
3580 MakePGDirectory(const char *directoryName)
3581 {
3582  return mkdir(directoryName, pg_dir_create_mode);
3583 }
3584 
3585 /*
3586  * Return the passed-in error level, or PANIC if data_sync_retry is off.
3587  *
3588  * Failure to fsync any data file is cause for immediate panic, unless
3589  * data_sync_retry is enabled. Data may have been written to the operating
3590  * system and removed from our buffer pool already, and if we are running on
3591  * an operating system that forgets dirty data on write-back failure, there
3592  * may be only one copy of the data remaining: in the WAL. A later attempt to
3593  * fsync again might falsely report success. Therefore we must not allow any
3594  * further checkpoints to be attempted. data_sync_retry can in theory be
3595  * enabled on systems known not to drop dirty buffered data on write-back
3596  * failure (with the likely outcome that checkpoints will continue to fail
3597  * until the underlying problem is fixed).
3598  *
3599  * Any code that reports a failure from fsync() or related functions should
3600  * filter the error level with this function.
3601  */
3602 int
3603 data_sync_elevel(int elevel)
3604 {
3605  return data_sync_retry ? elevel : PANIC;
3606 }
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1450
File lruLessRecently
Definition: fd.c:193
void closeAllVfds(void)
Definition: fd.c:2760
static PgChecksumMode mode
Definition: pg_checksums.c:61
File nextFree
Definition: fd.c:191
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition: fd.c:859
int pg_file_create_mode
Definition: file_perm.c:19
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1780
#define MAP_FAILED
Definition: mem.h:45
#define DEBUG1
Definition: elog.h:25
int MyProcPid
Definition: globals.c:40
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1713
#define NUM_RESERVED_FDS
Definition: fd.c:125
static AllocateDesc * allocatedDescs
Definition: fd.c:254
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1437
int pg_fdatasync(int fd)
Definition: fd.c:436
static void error(void)
Definition: sql-dyntest.c:147
#define SYNC_METHOD_FSYNC_WRITETHROUGH
Definition: xlog.h:28
AllocateDescKind
Definition: fd.c:232
DIR * dir
Definition: fd.c:247
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition: fd.c:1656
static void AtProcExit_Files(int code, Datum arg)
Definition: fd.c:2918
static Size SizeVfdCache
Definition: fd.c:207
#define FD_TEMP_FILE_LIMIT
Definition: fd.c:184
void on_proc_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:305
#define DO_DB(A)
Definition: fd.c:170
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2832
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition: fd.c:3331
long random(void)
Definition: random.c:22
ResourceOwner CurrentResourceOwner
Definition: resowner.c:142
static int numExternalFDs
Definition: fd.c:259
int pg_fsync_writethrough(int fd)
Definition: fd.c:413
int forkname_chars(const char *str, ForkNumber *fork)
Definition: relpath.c:81
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2664
int max_safe_fds
Definition: fd.c:155
#define Min(x, y)
Definition: c.h:928
off_t FileSize(File file)
Definition: fd.c:2148
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:633
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:2381
#define FD_DELETE_AT_CLOSE
Definition: fd.c:182
int log_temp_files
Definition: guc.c:544
mode_t FileGetRawMode(File file)
Definition: fd.c:2236
void _dosmaperr(unsigned long)
Definition: win32error.c:171
static Vfd * VfdCache
Definition: fd.c:206
static void Delete(File file)
Definition: fd.c:1130
int closedir(DIR *)
Definition: dirent.c:123
static int numTempTableSpaces
Definition: fd.c:274
#define PG_TEMP_FILES_DIR
Definition: pg_checksums.c:58
int errcode(int sqlerrcode)
Definition: elog.c:610
#define MemSet(start, val, len)
Definition: c.h:950
void PathNameDeleteTemporaryDir(const char *dirname)
Definition: fd.c:1545
int pg_fsync_no_writethrough(int fd)
Definition: fd.c:401
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition: fd.c:3159
void pgstat_report_tempfile(size_t filesize)
Definition: pgstat.c:1690
static bool reserveAllocatedDesc(void)
Definition: fd.c:2247
uint32 SubTransactionId
Definition: c.h:525
#define SIGPIPE
Definition: win32_port.h:164
void TempTablespacePath(char *path, Oid tablespace)
Definition: fd.c:1631
#define LOG
Definition: elog.h:26
unsigned int Oid
Definition: postgres_ext.h:31
AllocateDescKind kind
Definition: fd.c:242
char * FilePathName(File file)
Definition: fd.c:2200
Definition: dirent.h:9
#define OidIsValid(objectId)
Definition: c.h:652
#define PANIC
Definition: elog.h:53
#define PG_BINARY
Definition: c.h:1213
static char * basedir
ssize_t pg_pwrite(int fd, const void *buf, size_t nbyte, off_t offset)
Definition: pwrite.c:27
void AtEOXact_Files(bool isCommit)
Definition: fd.c:2904
Oid MyDatabaseTableSpace
Definition: globals.c:87
int ClosePipeStream(FILE *file)
Definition: fd.c:2731
ssize_t pg_pread(int fd, void *buf, size_t nbyte, off_t offset)
Definition: pread.c:27
#define malloc(a)
Definition: header.h:50
static void LruDelete(File file)
Definition: fd.c:1149
void pg_usleep(long microsec)
Definition: signal.c:53
bool TempTablespacesAreSet(void)
Definition: fd.c:2817
#define fstat
Definition: win32_port.h:274
#define fsync(fd)
Definition: win32_port.h:68
static int FreeDesc(AllocateDesc *desc)
Definition: fd.c:2482
void pfree(void *pointer)
Definition: mcxt.c:1057
mode_t fileMode
Definition: fd.c:198
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition: fd.c:3131
static bool ReleaseLruFile(void)
Definition: fd.c:1244
Definition: dirent.c:25
int durable_rename_excl(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:786
#define ERROR
Definition: elog.h:43
#define PG_TEMP_FILE_PREFIX
Definition: pg_checksums.c:59
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2372
static int LruInsert(File file)
Definition: fd.c:1197
#define FATAL
Definition: elog.h:52
static bool have_xact_temporary_files
Definition: fd.c:218
#define MAXPGPATH
void ReserveExternalFD(void)
Definition: fd.c:1083
DIR * opendir(const char *)
Definition: dirent.c:33
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2127
#define DEBUG2
Definition: elog.h:24
#define TABLESPACE_VERSION_DIRECTORY
Definition: relpath.h:26
char * fileName
Definition: fd.c:195
static char * buf
Definition: pg_test_fsync.c:68
Oid GetNextTempTableSpace(void)
Definition: fd.c:2850
void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition: resowner.c:1268
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3439
File PathNameOpenTemporaryFile(const char *path, int mode)
Definition: fd.c:1751
int errdetail(const char *fmt,...)
Definition: elog.c:954
void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition: fd.c:3065
char * tablespace
Definition: pgbench.c:189
int errcode_for_file_access(void)
Definition: elog.c:633
void get_parent_directory(char *path)
Definition: path.c:854
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2322
static int nfile
Definition: fd.c:212
unsigned int uint32
Definition: c.h:375
void SyncDataDirectory(void)
Definition: fd.c:3257
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2583
static int nextTempTableSpace
Definition: fd.c:275
static void pgstat_report_wait_end(void)
Definition: pgstat.h:1460
__int64 st_size
Definition: win32_port.h:265
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:410
int max_files_per_process
Definition: fd.c:142
static File AllocateVfd(void)
Definition: fd.c:1276
FILE * OpenPipeStream(const char *command, const char *mode)
Definition: fd.c:2425
unsigned short fdstate
Definition: fd.c:189
Definition: fd.c:186
off_t fileSize
Definition: fd.c:194
int fd
Definition: fd.c:188
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2789
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:659
static void Insert(File file)
Definition: fd.c:1175
ResourceOwner resowner
Definition: fd.c:190
bool data_sync_retry
Definition: fd.c:158
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3429
int CloseTransientFile(int fd)
Definition: fd.c:2549
#define SIG_IGN
Definition: win32_port.h:156
static void ReportTemporaryFileUsage(const char *path, off_t size)
Definition: fd.c:1390
static void ReleaseLruFiles(void)
Definition: fd.c:1266
#define WARNING
Definition: elog.h:40
#define FileIsNotOpen(file)
Definition: fd.c:179
int pg_dir_create_mode
Definition: file_perm.c:18
static int elevel
Definition: vacuumlazy.c:333
int FileWrite(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:2029
struct vfd Vfd
int data_sync_elevel(int elevel)
Definition: fd.c:3603
uintptr_t Datum
Definition: postgres.h:367
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition: fd.c:2871
unsigned short st_mode
Definition: win32_port.h:260
unsigned int Index
Definition: c.h:483
void pg_flush_data(int fd, off_t offset, off_t nbytes)
Definition: fd.c:456
#define FileIsValid(file)
Definition: fd.c:176
bool AcquireExternalFD(void)
Definition: fd.c:1048
FILE * file
Definition: fd.c:246
#define InvalidOid
Definition: postgres_ext.h:36
#define VFD_CLOSED
Definition: fd.c:174
static uint64 temporary_files_size
Definition: fd.c:226
#define ereport(elevel,...)
Definition: elog.h:144
int MakePGDirectory(const char *directoryName)
Definition: fd.c:3580
pqsigfunc pqsignal(int signum, pqsigfunc handler)
Definition: signal.c:170
#define free(a)
Definition: header.h:65
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static void RegisterTemporaryFile(File file)
Definition: fd.c:1409
void FileClose(File file)
Definition: fd.c:1826
#define SIG_DFL
Definition: win32_port.h:154
int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
Definition: fd.c:1922
static int FileAccess(File file)
Definition: fd.c:1354
#define Assert(condition)
Definition: c.h:746
SubTransactionId GetCurrentSubTransactionId(void)
Definition: xact.c:723
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2649
File lruMoreRecently
Definition: fd.c:192
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:1950
void RemovePgTempFiles(void)
Definition: fd.c:3006
SubTransactionId create_subid
Definition: fd.c:243
File OpenTemporaryFile(bool interXact)
Definition: fd.c:1578
size_t Size
Definition: c.h:474
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: pgstat.h:1436
static const char * directory
Definition: zic.c:632
int sync_method
Definition: xlog.c:106
struct dirent * readdir(DIR *)
Definition: dirent.c:78
#define FD_MINFREE
Definition: fd.c:134
bool looks_like_temp_rel_name(const char *name)
Definition: fd.c:3187
#define realloc(a, b)
Definition: header.h:60
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1070
#define INT64_FORMAT
Definition: c.h:417
const char * name
Definition: encode.c:561
static long tempFileCounter
Definition: fd.c:265
int fd
Definition: fd.c:248
#define S_ISDIR(m)
Definition: win32_port.h:316
#define lstat(path, sb)
Definition: win32_port.h:276
int durable_unlink(const char *fname, int elevel)
Definition: fd.c:749
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:986
int FreeFile(FILE *file)
Definition: fd.c:2521
void set_max_safe_fds(void)
Definition: fd.c:943
union AllocateDesc::@25 desc
bool enableFsync
Definition: globals.c:119
static Oid * tempTableSpaces
Definition: fd.c:273
void ReleaseExternalFD(void)
Definition: fd.c:1101
void * palloc(Size size)
Definition: mcxt.c:950
int errmsg(const char *fmt,...)
Definition: elog.c:821
int FileGetRawFlags(File file)
Definition: fd.c:2226
void ResourceOwnerEnlargeFiles(ResourceOwner owner)
Definition: resowner.c:1257
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1008
#define elog(elevel,...)
Definition: elog.h:214
int i
#define FD_CLOSE_AT_EOXACT
Definition: fd.c:183
void * arg
int FileGetRawDesc(File file)
Definition: fd.c:2216
static void FreeVfd(File file)
Definition: fd.c:1334
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:99
int pg_fsync(int fd)
Definition: fd.c:346
char d_name[MAX_PATH]
Definition: dirent.h:15
#define mkdir(a, b)
Definition: win32_port.h:63
int link(const char *src, const char *dst)
#define close(a)
Definition: win32.h:12
#define EINTR
Definition: win32_port.h:343
int fileFlags
Definition: fd.c:197
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition: fd.c:1514
int FileRead(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:1973
void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition: resowner.c:1277
#define snprintf
Definition: port.h:215
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2165
int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition: fd.c:3464
static int maxAllocatedDescs
Definition: fd.c:253
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition: fd.c:2936
static int fsync_parent_path(const char *fname, int elevel)
Definition: fd.c:3540
int File
Definition: fd.h:49
int FreeDir(DIR *dir)
Definition: fd.c:2701
int temp_file_limit
Definition: guc.c:551
Datum subpath(PG_FUNCTION_ARGS)
Definition: ltree_op.c:241
void InitFileAccess(void)
Definition: fd.c:826
#define stat
Definition: win32_port.h:275
static int numAllocatedDescs
Definition: fd.c:252
bool pgwin32_is_junction(const char *path)
#define ftruncate(a, b)
Definition: win32_port.h:65