PostgreSQL Source Code  git master
fd.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  * Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have. (This is around 1024 on many modern
20  * operating systems, but may be lower on others.)
21  *
22  * VFDs are managed as an LRU pool, with actual OS file descriptors
23  * being opened and closed as needed. Obviously, if a routine is
24  * opened using these interfaces, all subsequent operations must also
25  * be through these interfaces (the File type is not a real file
26  * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
39  * for a long time, like relation files. It is the caller's responsibility
40  * to close them, there is no automatic mechanism in fd.c for that.
41  *
42  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43  * temporary files that have names so that they can be shared between
44  * backends. Such files are automatically closed and count against the
45  * temporary file limit of the backend that creates them, but unlike anonymous
46  * files they are not automatically deleted. See sharedfileset.c for a shared
47  * ownership mechanism that provides automatic cleanup for shared files when
48  * the last of a group of backends detaches.
49  *
50  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52  * They behave like the corresponding native functions, except that the handle
53  * is registered with the current subtransaction, and will be automatically
54  * closed at abort. These are intended mainly for short operations like
55  * reading a configuration file; there is a limit on the number of files that
56  * can be opened using these functions at any one time.
57  *
58  * Finally, BasicOpenFile is just a thin wrapper around open() that can
59  * release file descriptors in use by the virtual file descriptors if
60  * necessary. There is no automatic cleanup of file descriptors returned by
61  * BasicOpenFile, it is solely the caller's responsibility to close the file
62  * descriptor by calling close(2).
63  *
64  * If a non-virtual file descriptor needs to be held open for any length of
65  * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66  * (and eventually ReleaseExternalFD), so that we can take it into account
67  * while deciding how many VFDs can be open. This applies to FDs obtained
68  * with BasicOpenFile as well as those obtained without use of any fd.c API.
69  *
70  *-------------------------------------------------------------------------
71  */
72 
73 #include "postgres.h"
74 
75 #include <sys/file.h>
76 #include <sys/param.h>
77 #include <sys/stat.h>
78 #ifndef WIN32
79 #include <sys/mman.h>
80 #endif
81 #include <limits.h>
82 #include <unistd.h>
83 #include <fcntl.h>
84 #ifdef HAVE_SYS_RESOURCE_H
85 #include <sys/resource.h> /* for getrlimit */
86 #endif
87 
88 #include "access/xact.h"
89 #include "access/xlog.h"
90 #include "catalog/pg_tablespace.h"
91 #include "common/file_perm.h"
92 #include "common/file_utils.h"
93 #include "miscadmin.h"
94 #include "pgstat.h"
95 #include "port/pg_iovec.h"
96 #include "portability/mem.h"
97 #include "storage/fd.h"
98 #include "storage/ipc.h"
99 #include "utils/guc.h"
100 #include "utils/resowner_private.h"
101 
102 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
103 #if defined(HAVE_SYNC_FILE_RANGE)
104 #define PG_FLUSH_DATA_WORKS 1
105 #elif !defined(WIN32) && defined(MS_ASYNC)
106 #define PG_FLUSH_DATA_WORKS 1
107 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
108 #define PG_FLUSH_DATA_WORKS 1
109 #endif
110 
111 /*
112  * We must leave some file descriptors free for system(), the dynamic loader,
113  * and other code that tries to open files without consulting fd.c. This
114  * is the number left free. (While we try fairly hard to prevent EMFILE
115  * errors, there's never any guarantee that we won't get ENFILE due to
116  * other processes chewing up FDs. So it's a bad idea to try to open files
117  * without consulting fd.c. Nonetheless we cannot control all code.)
118  *
119  * Because this is just a fixed setting, we are effectively assuming that
120  * no such code will leave FDs open over the long term; otherwise the slop
121  * is likely to be insufficient. Note in particular that we expect that
122  * loading a shared library does not result in any permanent increase in
123  * the number of open files. (This appears to be true on most if not
124  * all platforms as of Feb 2004.)
125  */
126 #define NUM_RESERVED_FDS 10
127 
128 /*
129  * If we have fewer than this many usable FDs after allowing for the reserved
130  * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
131  * much less than that. Note that this value ensures numExternalFDs can be
132  * at least 16; as of this writing, the contrib/postgres_fdw regression tests
133  * will not pass unless that can grow to at least 14.)
134  */
135 #define FD_MINFREE 48
136 
137 /*
138  * A number of platforms allow individual processes to open many more files
139  * than they can really support when *many* processes do the same thing.
140  * This GUC parameter lets the DBA limit max_safe_fds to something less than
141  * what the postmaster's initial probe suggests will work.
142  */
144 
145 /*
146  * Maximum number of file descriptors to open for operations that fd.c knows
147  * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
148  * to a conservative value, and remains that way indefinitely in bootstrap or
149  * standalone-backend cases. In normal postmaster operation, the postmaster
150  * calls set_max_safe_fds() late in initialization to update the value, and
151  * that value is then inherited by forked subprocesses.
152  *
153  * Note: the value of max_files_per_process is taken into account while
154  * setting this variable, and so need not be tested separately.
155  */
156 int max_safe_fds = FD_MINFREE; /* default if not changed */
157 
158 /* Whether it is safe to continue running after fsync() fails. */
159 bool data_sync_retry = false;
160 
161 /* Debugging.... */
162 
163 #ifdef FDDEBUG
164 #define DO_DB(A) \
165  do { \
166  int _do_db_save_errno = errno; \
167  A; \
168  errno = _do_db_save_errno; \
169  } while (0)
170 #else
171 #define DO_DB(A) \
172  ((void) 0)
173 #endif
174 
175 #define VFD_CLOSED (-1)
176 
177 #define FileIsValid(file) \
178  ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
179 
180 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
181 
182 /* these are the assigned bits in fdstate below: */
183 #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
184 #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
185 #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
186 
187 typedef struct vfd
188 {
189  int fd; /* current FD, or VFD_CLOSED if none */
190  unsigned short fdstate; /* bitflags for VFD's state */
191  ResourceOwner resowner; /* owner, for automatic cleanup */
192  File nextFree; /* link to next free VFD, if in freelist */
193  File lruMoreRecently; /* doubly linked recency-of-use list */
195  off_t fileSize; /* current size of file (0 if not temporary) */
196  char *fileName; /* name of file, or NULL for unused VFD */
197  /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
198  int fileFlags; /* open(2) flags for (re)opening the file */
199  mode_t fileMode; /* mode to pass to open(2) */
200 } Vfd;
201 
202 /*
203  * Virtual File Descriptor array pointer and size. This grows as
204  * needed. 'File' values are indexes into this array.
205  * Note that VfdCache[0] is not a usable VFD, just a list header.
206  */
207 static Vfd *VfdCache;
208 static Size SizeVfdCache = 0;
209 
210 /*
211  * Number of file descriptors known to be in use by VFD entries.
212  */
213 static int nfile = 0;
214 
215 /*
216  * Flag to tell whether it's worth scanning VfdCache looking for temp files
217  * to close
218  */
219 static bool have_xact_temporary_files = false;
220 
221 /*
222  * Tracks the total size of all temporary files. Note: when temp_file_limit
223  * is being enforced, this cannot overflow since the limit cannot be more
224  * than INT_MAX kilobytes. When not enforcing, it could theoretically
225  * overflow, but we don't care.
226  */
227 static uint64 temporary_files_size = 0;
228 
229 /*
230  * List of OS handles opened with AllocateFile, AllocateDir and
231  * OpenTransientFile.
232  */
233 typedef enum
234 {
240 
241 typedef struct
242 {
245  union
246  {
247  FILE *file;
249  int fd;
250  } desc;
251 } AllocateDesc;
252 
253 static int numAllocatedDescs = 0;
254 static int maxAllocatedDescs = 0;
256 
257 /*
258  * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
259  */
260 static int numExternalFDs = 0;
261 
262 /*
263  * Number of temporary files opened during the current session;
264  * this is used in generation of tempfile names.
265  */
266 static long tempFileCounter = 0;
267 
268 /*
269  * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
270  * indicating that the current database's default tablespace should be used.)
271  * When numTempTableSpaces is -1, this has not been set in the current
272  * transaction.
273  */
274 static Oid *tempTableSpaces = NULL;
275 static int numTempTableSpaces = -1;
276 static int nextTempTableSpace = 0;
277 
278 
279 /*--------------------
280  *
281  * Private Routines
282  *
283  * Delete - delete a file from the Lru ring
284  * LruDelete - remove a file from the Lru ring and close its FD
285  * Insert - put a file at the front of the Lru ring
286  * LruInsert - put a file at the front of the Lru ring and open it
287  * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
288  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
289  * AllocateVfd - grab a free (or new) file record (from VfdCache)
290  * FreeVfd - free a file record
291  *
292  * The Least Recently Used ring is a doubly linked list that begins and
293  * ends on element zero. Element zero is special -- it doesn't represent
294  * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
295  * anchor that shows us the beginning/end of the ring.
296  * Only VFD elements that are currently really open (have an FD assigned) are
297  * in the Lru ring. Elements that are "virtually" open can be recognized
298  * by having a non-null fileName field.
299  *
300  * example:
301  *
302  * /--less----\ /---------\
303  * v \ v \
304  * #0 --more---> LeastRecentlyUsed --more-\ \
305  * ^\ | |
306  * \\less--> MostRecentlyUsedFile <---/ |
307  * \more---/ \--less--/
308  *
309  *--------------------
310  */
311 static void Delete(File file);
312 static void LruDelete(File file);
313 static void Insert(File file);
314 static int LruInsert(File file);
315 static bool ReleaseLruFile(void);
316 static void ReleaseLruFiles(void);
317 static File AllocateVfd(void);
318 static void FreeVfd(File file);
319 
320 static int FileAccess(File file);
321 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
322 static bool reserveAllocatedDesc(void);
323 static int FreeDesc(AllocateDesc *desc);
324 
325 static void AtProcExit_Files(int code, Datum arg);
326 static void CleanupTempFiles(bool isCommit, bool isProcExit);
327 static void RemovePgTempRelationFiles(const char *tsdirname);
328 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
329 
330 static void walkdir(const char *path,
331  void (*action) (const char *fname, bool isdir, int elevel),
332  bool process_symlinks,
333  int elevel);
334 #ifdef PG_FLUSH_DATA_WORKS
335 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
336 #endif
337 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
338 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
339 
340 static int fsync_parent_path(const char *fname, int elevel);
341 
342 
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * fd is the kernel file descriptor to flush.  The call is forwarded to
 * pg_fsync_writethrough() when the writethrough sync method is selected
 * (and the platform distinguishes it from plain fsync), otherwise to
 * pg_fsync_no_writethrough(); the helper's result is returned unchanged.
 */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
	struct stat st;

	/*
	 * Some operating system implementations of fsync() have requirements
	 * about the file access modes that were used when their file descriptor
	 * argument was opened, and these requirements differ depending on whether
	 * the file descriptor is for a directory.
	 *
	 * For any file descriptor that may eventually be handed to fsync(), we
	 * should have opened it with access modes that are compatible with
	 * fsync() on all supported systems, otherwise the code may not be
	 * portable, even if it runs ok on the current system.
	 *
	 * We assert here that a descriptor for a file was opened with write
	 * permissions (either O_RDWR or O_WRONLY) and for a directory without
	 * write permissions (O_RDONLY).
	 *
	 * Ignore any fstat errors and let the follow-up fsync() do its work.
	 * Doing this sanity check here also covers the case where fsync() is
	 * disabled.
	 */
	if (fstat(fd, &st) == 0)
	{
		int			desc_flags = fcntl(fd, F_GETFL);

		/*
		 * O_RDONLY is historically 0, so just make sure that for directories
		 * no write flags are used.
		 */
		if (S_ISDIR(st.st_mode))
			Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
		else
			Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
	}
	/* don't let the checks above leak an errno value to the caller */
	errno = 0;
#endif

	/* #if is to skip the sync_method test if there's no need for it */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
	else
#endif
		return pg_fsync_no_writethrough(fd);
}
395 
396 
397 /*
398  * pg_fsync_no_writethrough --- same as fsync except does nothing if
399  * enableFsync is off
400  */
401 int
403 {
404  if (enableFsync)
405  return fsync(fd);
406  else
407  return 0;
408 }
409 
410 /*
411  * pg_fsync_writethrough
412  */
413 int
415 {
416  if (enableFsync)
417  {
418 #ifdef WIN32
419  return _commit(fd);
420 #elif defined(F_FULLFSYNC)
421  return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
422 #else
423  errno = ENOSYS;
424  return -1;
425 #endif
426  }
427  else
428  return 0;
429 }
430 
431 /*
432  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
433  *
434  * Not all platforms have fdatasync; treat as fsync if not available.
435  */
436 int
438 {
439  if (enableFsync)
440  {
441 #ifdef HAVE_FDATASYNC
442  return fdatasync(fd);
443 #else
444  return fsync(fd);
445 #endif
446  }
447  else
448  return 0;
449 }
450 
451 /*
452  * pg_flush_data --- advise OS that the described dirty data should be flushed
453  *
454  * offset of 0 with nbytes 0 means that the entire file should be flushed
455  */
456 void
457 pg_flush_data(int fd, off_t offset, off_t nbytes)
458 {
459  /*
460  * Right now file flushing is primarily used to make later
461  * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
462  * if fsyncs are disabled - that's a decision we might want to make
463  * configurable at some point.
464  */
465  if (!enableFsync)
466  return;
467 
468  /*
469  * We compile all alternatives that are supported on the current platform,
470  * to find portability problems more easily.
471  */
472 #if defined(HAVE_SYNC_FILE_RANGE)
473  {
474  int rc;
475  static bool not_implemented_by_kernel = false;
476 
477  if (not_implemented_by_kernel)
478  return;
479 
480  /*
481  * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
482  * tells the OS that writeback for the specified blocks should be
483  * started, but that we don't want to wait for completion. Note that
484  * this call might block if too much dirty data exists in the range.
485  * This is the preferable method on OSs supporting it, as it works
486  * reliably when available (contrast to msync()) and doesn't flush out
487  * clean data (like FADV_DONTNEED).
488  */
489  rc = sync_file_range(fd, offset, nbytes,
490  SYNC_FILE_RANGE_WRITE);
491  if (rc != 0)
492  {
493  int elevel;
494 
495  /*
496  * For systems that don't have an implementation of
497  * sync_file_range() such as Windows WSL, generate only one
498  * warning and then suppress all further attempts by this process.
499  */
500  if (errno == ENOSYS)
501  {
502  elevel = WARNING;
503  not_implemented_by_kernel = true;
504  }
505  else
506  elevel = data_sync_elevel(WARNING);
507 
508  ereport(elevel,
510  errmsg("could not flush dirty data: %m")));
511  }
512 
513  return;
514  }
515 #endif
516 #if !defined(WIN32) && defined(MS_ASYNC)
517  {
518  void *p;
519  static int pagesize = 0;
520 
521  /*
522  * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
523  * writeback. On linux it only does so if MS_SYNC is specified, but
524  * then it does the writeback synchronously. Luckily all common linux
525  * systems have sync_file_range(). This is preferable over
526  * FADV_DONTNEED because it doesn't flush out clean data.
527  *
528  * We map the file (mmap()), tell the kernel to sync back the contents
529  * (msync()), and then remove the mapping again (munmap()).
530  */
531 
532  /* mmap() needs actual length if we want to map whole file */
533  if (offset == 0 && nbytes == 0)
534  {
535  nbytes = lseek(fd, 0, SEEK_END);
536  if (nbytes < 0)
537  {
540  errmsg("could not determine dirty data size: %m")));
541  return;
542  }
543  }
544 
545  /*
546  * Some platforms reject partial-page mmap() attempts. To deal with
547  * that, just truncate the request to a page boundary. If any extra
548  * bytes don't get flushed, well, it's only a hint anyway.
549  */
550 
551  /* fetch pagesize only once */
552  if (pagesize == 0)
553  pagesize = sysconf(_SC_PAGESIZE);
554 
555  /* align length to pagesize, dropping any fractional page */
556  if (pagesize > 0)
557  nbytes = (nbytes / pagesize) * pagesize;
558 
559  /* fractional-page request is a no-op */
560  if (nbytes <= 0)
561  return;
562 
563  /*
564  * mmap could well fail, particularly on 32-bit platforms where there
565  * may simply not be enough address space. If so, silently fall
566  * through to the next implementation.
567  */
568  if (nbytes <= (off_t) SSIZE_MAX)
569  p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
570  else
571  p = MAP_FAILED;
572 
573  if (p != MAP_FAILED)
574  {
575  int rc;
576 
577  rc = msync(p, (size_t) nbytes, MS_ASYNC);
578  if (rc != 0)
579  {
582  errmsg("could not flush dirty data: %m")));
583  /* NB: need to fall through to munmap()! */
584  }
585 
586  rc = munmap(p, (size_t) nbytes);
587  if (rc != 0)
588  {
589  /* FATAL error because mapping would remain */
590  ereport(FATAL,
592  errmsg("could not munmap() while flushing data: %m")));
593  }
594 
595  return;
596  }
597  }
598 #endif
599 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
600  {
601  int rc;
602 
603  /*
604  * Signal the kernel that the passed-in range should not be cached
605  * anymore. This has the desired side effect of writing out dirty
606  * data, and the undesired side effect of likely discarding useful
607  * clean cached blocks. For the latter reason this is the least
608  * preferable method.
609  */
610 
611  rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
612 
613  if (rc != 0)
614  {
615  /* don't error out, this is just a performance optimization */
618  errmsg("could not flush dirty data: %m")));
619  }
620 
621  return;
622  }
623 #endif
624 }
625 
/*
 * pg_truncate -- truncate a file to a given length by name.
 *
 * path names the file; length is the size in bytes the file should have
 * afterwards.
 *
 * On WIN32 the file is opened with OpenTransientFile() and resized with
 * ftruncate(); everywhere else this is a direct call to truncate(2).
 * Returns the result of the truncation call, or -1 if the WIN32 branch
 * could not open the file.
 */
int
pg_truncate(const char *path, off_t length)
{
#ifdef WIN32
	int			save_errno;
	int			ret;
	int			fd;

	fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
	if (fd >= 0)
	{
		/* honor the requested length, exactly as truncate() does elsewhere */
		ret = ftruncate(fd, length);

		/* keep ftruncate's errno; closing the descriptor may clobber it */
		save_errno = errno;
		CloseTransientFile(fd);
		errno = save_errno;
	}
	else
		ret = -1;

	return ret;
#else
	return truncate(path, length);
#endif
}
653 
654 /*
655  * fsync_fname -- fsync a file or directory, handling errors properly
656  *
657  * Try to fsync a file or directory. When doing the latter, ignore errors that
658  * indicate the OS just doesn't allow/require fsyncing directories.
659  */
660 void
661 fsync_fname(const char *fname, bool isdir)
662 {
663  fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
664 }
665 
666 /*
667  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
668  *
669  * This routine ensures that, after returning, the effect of renaming file
670  * persists in case of a crash. A crash while this routine is running will
671  * leave you with either the pre-existing or the moved file in place of the
672  * new file; no mixed state or truncated files are possible.
673  *
674  * It does so by using fsync on the old filename and the possibly existing
675  * target filename before the rename, and the target file and directory after.
676  *
677  * Note that rename() cannot be used across arbitrary directories, as they
678  * might not be on the same filesystem. Therefore this routine does not
679  * support renaming across directories.
680  *
681  * Log errors with the caller specified severity.
682  *
683  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
684  * valid upon return.
685  */
686 int
687 durable_rename(const char *oldfile, const char *newfile, int elevel)
688 {
689  int fd;
690 
691  /*
692  * First fsync the old and target path (if it exists), to ensure that they
693  * are properly persistent on disk. Syncing the target file is not
694  * strictly necessary, but it makes it easier to reason about crashes;
695  * because it's then guaranteed that either source or target file exists
696  * after a crash.
697  */
698  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
699  return -1;
700 
701  fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
702  if (fd < 0)
703  {
704  if (errno != ENOENT)
705  {
706  ereport(elevel,
708  errmsg("could not open file \"%s\": %m", newfile)));
709  return -1;
710  }
711  }
712  else
713  {
714  if (pg_fsync(fd) != 0)
715  {
716  int save_errno;
717 
718  /* close file upon error, might not be in transaction context */
719  save_errno = errno;
720  CloseTransientFile(fd);
721  errno = save_errno;
722 
723  ereport(elevel,
725  errmsg("could not fsync file \"%s\": %m", newfile)));
726  return -1;
727  }
728 
729  if (CloseTransientFile(fd) != 0)
730  {
731  ereport(elevel,
733  errmsg("could not close file \"%s\": %m", newfile)));
734  return -1;
735  }
736  }
737 
738  /* Time to do the real deal... */
739  if (rename(oldfile, newfile) < 0)
740  {
741  ereport(elevel,
743  errmsg("could not rename file \"%s\" to \"%s\": %m",
744  oldfile, newfile)));
745  return -1;
746  }
747 
748  /*
749  * To guarantee renaming the file is persistent, fsync the file with its
750  * new name, and its containing directory.
751  */
752  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
753  return -1;
754 
755  if (fsync_parent_path(newfile, elevel) != 0)
756  return -1;
757 
758  return 0;
759 }
760 
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * This routine ensures that, after returning, the effect of removing file
 * persists in case of a crash. A crash while this routine is running will
 * leave the system in no mixed state.
 *
 * It does so by using fsync on the parent directory of the file after the
 * actual removal is done.
 *
 * fname is the path to remove; elevel is the severity used for error
 * reports issued here.
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	if (unlink(fname) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/*
	 * To guarantee that the removal of the file is persistent, fsync its
	 * parent directory.
	 */
	if (fsync_parent_path(fname, elevel) != 0)
		return -1;

	return 0;
}
797 
/*
 * durable_rename_excl -- rename a file in a durable manner, without
 * overwriting an existing target file
 *
 * Similar to durable_rename(), except that this routine will fail if the
 * target file already exists.
 *
 * Note that a crash in an unfortunate moment can leave you with two links to
 * the target file.
 *
 * oldfile is the existing path, newfile the path it should end up under,
 * and elevel the severity used for error reports issued here.
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_rename_excl(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * Ensure that, if we crash directly after the rename/link, a file with
	 * valid contents is moved into place.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

	/* link() refuses to replace an existing newfile, unlike rename() */
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}

	/*
	 * NOTE(review): the result of removing the old name is not checked; if
	 * it fails, both names keep pointing at the file.
	 */
	(void) unlink(oldfile);

	/*
	 * Make change persistent in case of an OS crash, both the new entry and
	 * its parent directory need to be flushed.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0)
		return -1;

	/* Same for parent directory */
	if (fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}
846 
847 /*
848  * InitFileAccess --- initialize this module during backend startup
849  *
850  * This is called during either normal or standalone backend start.
851  * It is *not* called in the postmaster.
852  */
853 void
855 {
856  Assert(SizeVfdCache == 0); /* call me only once */
857 
858  /* initialize cache header entry */
859  VfdCache = (Vfd *) malloc(sizeof(Vfd));
860  if (VfdCache == NULL)
861  ereport(FATAL,
862  (errcode(ERRCODE_OUT_OF_MEMORY),
863  errmsg("out of memory")));
864 
865  MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
866  VfdCache->fd = VFD_CLOSED;
867 
868  SizeVfdCache = 1;
869 
870  /* register proc-exit hook to ensure temp files are dropped at exit */
872 }
873 
874 /*
875  * count_usable_fds --- count how many FDs the system will let us open,
876  * and estimate how many are already open.
877  *
878  * We stop counting if usable_fds reaches max_to_probe. Note: a small
879  * value of max_to_probe might result in an underestimate of already_open;
880  * we must fill in any "gaps" in the set of used FDs before the calculation
881  * of already_open will give the right answer. In practice, max_to_probe
882  * of a couple of dozen should be enough to ensure good results.
883  *
884  * We assume stdin (FD 0) is available for dup'ing
885  */
886 static void
887 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
888 {
889  int *fd;
890  int size;
891  int used = 0;
892  int highestfd = 0;
893  int j;
894 
895 #ifdef HAVE_GETRLIMIT
896  struct rlimit rlim;
897  int getrlimit_status;
898 #endif
899 
900  size = 1024;
901  fd = (int *) palloc(size * sizeof(int));
902 
903 #ifdef HAVE_GETRLIMIT
904 #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
905  getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
906 #else /* but BSD doesn't ... */
907  getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
908 #endif /* RLIMIT_NOFILE */
909  if (getrlimit_status != 0)
910  ereport(WARNING, (errmsg("getrlimit failed: %m")));
911 #endif /* HAVE_GETRLIMIT */
912 
913  /* dup until failure or probe limit reached */
914  for (;;)
915  {
916  int thisfd;
917 
918 #ifdef HAVE_GETRLIMIT
919 
920  /*
921  * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
922  * some platforms
923  */
924  if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
925  break;
926 #endif
927 
928  thisfd = dup(0);
929  if (thisfd < 0)
930  {
931  /* Expect EMFILE or ENFILE, else it's fishy */
932  if (errno != EMFILE && errno != ENFILE)
933  elog(WARNING, "dup(0) failed after %d successes: %m", used);
934  break;
935  }
936 
937  if (used >= size)
938  {
939  size *= 2;
940  fd = (int *) repalloc(fd, size * sizeof(int));
941  }
942  fd[used++] = thisfd;
943 
944  if (highestfd < thisfd)
945  highestfd = thisfd;
946 
947  if (used >= max_to_probe)
948  break;
949  }
950 
951  /* release the files we opened */
952  for (j = 0; j < used; j++)
953  close(fd[j]);
954 
955  pfree(fd);
956 
957  /*
958  * Return results. usable_fds is just the number of successful dups. We
959  * assume that the system limit is highestfd+1 (remember 0 is a legal FD
960  * number) and so already_open is highestfd+1 - usable_fds.
961  */
962  *usable_fds = used;
963  *already_open = highestfd + 1 - used;
964 }
965 
966 /*
967  * set_max_safe_fds
968  * Determine number of file descriptors that fd.c is allowed to use
969  */
970 void
972 {
973  int usable_fds;
974  int already_open;
975 
976  /*----------
977  * We want to set max_safe_fds to
978  * MIN(usable_fds, max_files_per_process - already_open)
979  * less the slop factor for files that are opened without consulting
980  * fd.c. This ensures that we won't exceed either max_files_per_process
981  * or the experimentally-determined EMFILE limit.
982  *----------
983  */
985  &usable_fds, &already_open);
986 
987  max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
988 
989  /*
990  * Take off the FDs reserved for system() etc.
991  */
993 
994  /*
995  * Make sure we still have enough to get by.
996  */
997  if (max_safe_fds < FD_MINFREE)
998  ereport(FATAL,
999  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1000  errmsg("insufficient file descriptors available to start server process"),
1001  errdetail("System allows %d, we need at least %d.",
1004 
1005  elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1006  max_safe_fds, usable_fds, already_open);
1007 }
1008 
1009 /*
1010  * Open a file with BasicOpenFilePerm() and pass default file mode for the
1011  * fileMode parameter.
1012  */
1013 int
1015 {
1016  return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1017 }
1018 
1019 /*
1020  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1021  *
1022  * This is exported for use by places that really want a plain kernel FD,
1023  * but need to be proof against running out of FDs. Once an FD has been
1024  * successfully returned, it is the caller's responsibility to ensure that
1025  * it will not be leaked on ereport()! Most users should *not* call this
1026  * routine directly, but instead use the VFD abstraction level, which
1027  * provides protection against descriptor leaks as well as management of
1028  * files that need to be open for more than a short period of time.
1029  *
1030  * Ideally this should be the *only* direct call of open() in the backend.
1031  * In practice, the postmaster calls open() directly, and there are some
1032  * direct open() calls done early in backend startup. Those are OK since
1033  * this module wouldn't have any open files to close at that point anyway.
1034  */
1035 int
1037 {
1038  int fd;
1039 
1040 tryAgain:
1041  fd = open(fileName, fileFlags, fileMode);
1042 
1043  if (fd >= 0)
1044  return fd; /* success! */
1045 
1046  if (errno == EMFILE || errno == ENFILE)
1047  {
1048  int save_errno = errno;
1049 
1050  ereport(LOG,
1051  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1052  errmsg("out of file descriptors: %m; release and retry")));
1053  errno = 0;
1054  if (ReleaseLruFile())
1055  goto tryAgain;
1056  errno = save_errno;
1057  }
1058 
1059  return -1; /* failure */
1060 }
1061 
1062 /*
1063  * AcquireExternalFD - attempt to reserve an external file descriptor
1064  *
1065  * This should be used by callers that need to hold a file descriptor open
1066  * over more than a short interval, but cannot use any of the other facilities
1067  * provided by this module.
1068  *
1069  * The difference between this and the underlying ReserveExternalFD function
1070  * is that this will report failure (by setting errno and returning false)
1071  * if "too many" external FDs are already reserved. This should be used in
1072  * any code where the total number of FDs to be reserved is not predictable
1073  * and small.
1074  */
1075 bool
1077 {
1078  /*
1079  * We don't want more than max_safe_fds / 3 FDs to be consumed for
1080  * "external" FDs.
1081  */
1082  if (numExternalFDs < max_safe_fds / 3)
1083  {
1085  return true;
1086  }
1087  errno = EMFILE;
1088  return false;
1089 }
1090 
1091 /*
1092  * ReserveExternalFD - report external consumption of a file descriptor
1093  *
1094  * This should be used by callers that need to hold a file descriptor open
1095  * over more than a short interval, but cannot use any of the other facilities
1096  * provided by this module. This just tracks the use of the FD and closes
1097  * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1098  *
1099  * Call this directly only in code where failure to reserve the FD would be
1100  * fatal; for example, the WAL-writing code does so, since the alternative is
1101  * session failure. Also, it's very unwise to do so in code that could
1102  * consume more than one FD per process.
1103  *
1104  * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1105  * available, it doesn't matter too much whether this is called before or
1106  * after actually opening the FD; but doing so beforehand reduces the risk of
1107  * an EMFILE failure if not everybody played nice. In any case, it's solely
1108  * caller's responsibility to keep the external-FD count in sync with reality.
1109  */
1110 void
1112 {
1113  /*
1114  * Release VFDs if needed to stay safe. Because we do this before
1115  * incrementing numExternalFDs, the final state will be as desired, i.e.,
1116  * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1117  */
1118  ReleaseLruFiles();
1119 
1120  numExternalFDs++;
1121 }
1122 
1123 /*
1124  * ReleaseExternalFD - report release of an external file descriptor
1125  *
1126  * This is guaranteed not to change errno, so it can be used in failure paths.
1127  */
1128 void
1130 {
1131  Assert(numExternalFDs > 0);
1132  numExternalFDs--;
1133 }
1134 
1135 
#if defined(FDDEBUG)

/*
 * _dump_lru
 *		Debugging aid: log the VFD indexes found by following the
 *		lruLessRecently links from the ring header (slot 0) until the walk
 *		arrives back at slot 0.
 *
 * The text is assembled in a fixed-size buffer; snprintf keeps each append
 * within the space that is left.
 */
static void
_dump_lru(void)
{
	char		buf[2048];
	size_t		used;
	int			cur = VfdCache[0].lruLessRecently;

	snprintf(buf, sizeof(buf), "LRU: MOST %d ", cur);

	while (cur != 0)
	{
		/* step to the next-less-recently-used entry and append its index */
		cur = VfdCache[cur].lruLessRecently;
		used = strlen(buf);
		snprintf(buf + used, sizeof(buf) - used, "%d ", cur);
	}

	used = strlen(buf);
	snprintf(buf + used, sizeof(buf) - used, "LEAST");

	elog(LOG, "%s", buf);
}
#endif							/* FDDEBUG */
1156 
1157 static void
1159 {
1160  Vfd *vfdP;
1161 
1162  Assert(file != 0);
1163 
1164  DO_DB(elog(LOG, "Delete %d (%s)",
1165  file, VfdCache[file].fileName));
1166  DO_DB(_dump_lru());
1167 
1168  vfdP = &VfdCache[file];
1169 
1170  VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1171  VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1172 
1173  DO_DB(_dump_lru());
1174 }
1175 
1176 static void
1178 {
1179  Vfd *vfdP;
1180 
1181  Assert(file != 0);
1182 
1183  DO_DB(elog(LOG, "LruDelete %d (%s)",
1184  file, VfdCache[file].fileName));
1185 
1186  vfdP = &VfdCache[file];
1187 
1188  /*
1189  * Close the file. We aren't expecting this to fail; if it does, better
1190  * to leak the FD than to mess up our internal state.
1191  */
1192  if (close(vfdP->fd) != 0)
1194  "could not close file \"%s\": %m", vfdP->fileName);
1195  vfdP->fd = VFD_CLOSED;
1196  --nfile;
1197 
1198  /* delete the vfd record from the LRU ring */
1199  Delete(file);
1200 }
1201 
1202 static void
1204 {
1205  Vfd *vfdP;
1206 
1207  Assert(file != 0);
1208 
1209  DO_DB(elog(LOG, "Insert %d (%s)",
1210  file, VfdCache[file].fileName));
1211  DO_DB(_dump_lru());
1212 
1213  vfdP = &VfdCache[file];
1214 
1215  vfdP->lruMoreRecently = 0;
1216  vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1217  VfdCache[0].lruLessRecently = file;
1218  VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1219 
1220  DO_DB(_dump_lru());
1221 }
1222 
1223 /* returns 0 on success, -1 on re-open failure (with errno set) */
1224 static int
1226 {
1227  Vfd *vfdP;
1228 
1229  Assert(file != 0);
1230 
1231  DO_DB(elog(LOG, "LruInsert %d (%s)",
1232  file, VfdCache[file].fileName));
1233 
1234  vfdP = &VfdCache[file];
1235 
1236  if (FileIsNotOpen(file))
1237  {
1238  /* Close excess kernel FDs. */
1239  ReleaseLruFiles();
1240 
1241  /*
1242  * The open could still fail for lack of file descriptors, eg due to
1243  * overall system file table being full. So, be prepared to release
1244  * another FD if necessary...
1245  */
1246  vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1247  vfdP->fileMode);
1248  if (vfdP->fd < 0)
1249  {
1250  DO_DB(elog(LOG, "re-open failed: %m"));
1251  return -1;
1252  }
1253  else
1254  {
1255  ++nfile;
1256  }
1257  }
1258 
1259  /*
1260  * put it at the head of the Lru ring
1261  */
1262 
1263  Insert(file);
1264 
1265  return 0;
1266 }
1267 
1268 /*
1269  * Release one kernel FD by closing the least-recently-used VFD.
1270  */
1271 static bool
1273 {
1274  DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1275 
1276  if (nfile > 0)
1277  {
1278  /*
1279  * There are opened files and so there should be at least one used vfd
1280  * in the ring.
1281  */
1282  Assert(VfdCache[0].lruMoreRecently != 0);
1283  LruDelete(VfdCache[0].lruMoreRecently);
1284  return true; /* freed a file */
1285  }
1286  return false; /* no files available to free */
1287 }
1288 
1289 /*
1290  * Release kernel FDs as needed to get under the max_safe_fds limit.
1291  * After calling this, it's OK to try to open another file.
1292  */
1293 static void
1295 {
1297  {
1298  if (!ReleaseLruFile())
1299  break;
1300  }
1301 }
1302 
1303 static File
1305 {
1306  Index i;
1307  File file;
1308 
1309  DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1310 
1311  Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1312 
1313  if (VfdCache[0].nextFree == 0)
1314  {
1315  /*
1316  * The free list is empty so it is time to increase the size of the
1317  * array. We choose to double it each time this happens. However,
1318  * there's not much point in starting *real* small.
1319  */
1320  Size newCacheSize = SizeVfdCache * 2;
1321  Vfd *newVfdCache;
1322 
1323  if (newCacheSize < 32)
1324  newCacheSize = 32;
1325 
1326  /*
1327  * Be careful not to clobber VfdCache ptr if realloc fails.
1328  */
1329  newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1330  if (newVfdCache == NULL)
1331  ereport(ERROR,
1332  (errcode(ERRCODE_OUT_OF_MEMORY),
1333  errmsg("out of memory")));
1334  VfdCache = newVfdCache;
1335 
1336  /*
1337  * Initialize the new entries and link them into the free list.
1338  */
1339  for (i = SizeVfdCache; i < newCacheSize; i++)
1340  {
1341  MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1342  VfdCache[i].nextFree = i + 1;
1343  VfdCache[i].fd = VFD_CLOSED;
1344  }
1345  VfdCache[newCacheSize - 1].nextFree = 0;
1346  VfdCache[0].nextFree = SizeVfdCache;
1347 
1348  /*
1349  * Record the new size
1350  */
1351  SizeVfdCache = newCacheSize;
1352  }
1353 
1354  file = VfdCache[0].nextFree;
1355 
1356  VfdCache[0].nextFree = VfdCache[file].nextFree;
1357 
1358  return file;
1359 }
1360 
1361 static void
1363 {
1364  Vfd *vfdP = &VfdCache[file];
1365 
1366  DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1367  file, vfdP->fileName ? vfdP->fileName : ""));
1368 
1369  if (vfdP->fileName != NULL)
1370  {
1371  free(vfdP->fileName);
1372  vfdP->fileName = NULL;
1373  }
1374  vfdP->fdstate = 0x0;
1375 
1376  vfdP->nextFree = VfdCache[0].nextFree;
1377  VfdCache[0].nextFree = file;
1378 }
1379 
1380 /* returns 0 on success, -1 on re-open failure (with errno set) */
1381 static int
1383 {
1384  int returnValue;
1385 
1386  DO_DB(elog(LOG, "FileAccess %d (%s)",
1387  file, VfdCache[file].fileName));
1388 
1389  /*
1390  * Is the file open? If not, open it and put it at the head of the LRU
1391  * ring (possibly closing the least recently used file to get an FD).
1392  */
1393 
1394  if (FileIsNotOpen(file))
1395  {
1396  returnValue = LruInsert(file);
1397  if (returnValue != 0)
1398  return returnValue;
1399  }
1400  else if (VfdCache[0].lruLessRecently != file)
1401  {
1402  /*
1403  * We now know that the file is open and that it is not the last one
1404  * accessed, so we need to move it to the head of the Lru ring.
1405  */
1406 
1407  Delete(file);
1408  Insert(file);
1409  }
1410 
1411  return 0;
1412 }
1413 
1414 /*
1415  * Called whenever a temporary file is deleted to report its size.
1416  */
1417 static void
1418 ReportTemporaryFileUsage(const char *path, off_t size)
1419 {
1420  pgstat_report_tempfile(size);
1421 
1422  if (log_temp_files >= 0)
1423  {
1424  if ((size / 1024) >= log_temp_files)
1425  ereport(LOG,
1426  (errmsg("temporary file: path \"%s\", size %lu",
1427  path, (unsigned long) size)));
1428  }
1429 }
1430 
1431 /*
1432  * Called to register a temporary file for automatic close.
1433  * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1434  * before the file was opened.
1435  */
1436 static void
1438 {
1440  VfdCache[file].resowner = CurrentResourceOwner;
1441 
1442  /* Backup mechanism for closing at end of xact. */
1443  VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1445 }
1446 
/*
 * Called when we get a shared invalidation message on some relation.
 *
 * If the VFD currently holds an open kernel descriptor, close it; the VFD
 * itself stays allocated.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	if (FileIsNotOpen(file))
		return;					/* no kernel descriptor to close */

	LruDelete(file);
}
#endif
1459 
1460 /*
1461  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1462  * fileMode parameter.
1463  */
1464 File
1466 {
1467  return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1468 }
1469 
1470 /*
1471  * open a file in an arbitrary directory
1472  *
1473  * NB: if the passed pathname is relative (which it usually is),
1474  * it will be interpreted relative to the process' working directory
1475  * (which should always be $PGDATA when this code is running).
1476  */
1477 File
1479 {
1480  char *fnamecopy;
1481  File file;
1482  Vfd *vfdP;
1483 
1484  DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1485  fileName, fileFlags, fileMode));
1486 
1487  /*
1488  * We need a malloc'd copy of the file name; fail cleanly if no room.
1489  */
1490  fnamecopy = strdup(fileName);
1491  if (fnamecopy == NULL)
1492  ereport(ERROR,
1493  (errcode(ERRCODE_OUT_OF_MEMORY),
1494  errmsg("out of memory")));
1495 
1496  file = AllocateVfd();
1497  vfdP = &VfdCache[file];
1498 
1499  /* Close excess kernel FDs. */
1500  ReleaseLruFiles();
1501 
1502  vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1503 
1504  if (vfdP->fd < 0)
1505  {
1506  int save_errno = errno;
1507 
1508  FreeVfd(file);
1509  free(fnamecopy);
1510  errno = save_errno;
1511  return -1;
1512  }
1513  ++nfile;
1514  DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1515  vfdP->fd));
1516 
1517  vfdP->fileName = fnamecopy;
1518  /* Saved flags are adjusted to be OK for re-opening file */
1519  vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1520  vfdP->fileMode = fileMode;
1521  vfdP->fileSize = 0;
1522  vfdP->fdstate = 0x0;
1523  vfdP->resowner = NULL;
1524 
1525  Insert(file);
1526 
1527  return file;
1528 }
1529 
1530 /*
1531  * Create directory 'directory'. If necessary, create 'basedir', which must
1532  * be the directory above it. This is designed for creating the top-level
1533  * temporary directory on demand before creating a directory underneath it.
1534  * Do nothing if the directory already exists.
1535  *
1536  * Directories created within the top-level temporary directory should begin
1537  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1538  * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1539  * that do not need any particular prefix.
1540 */
1541 void
1543 {
1544  if (MakePGDirectory(directory) < 0)
1545  {
1546  if (errno == EEXIST)
1547  return;
1548 
1549  /*
1550  * Failed. Try to create basedir first in case it's missing. Tolerate
1551  * EEXIST to close a race against another process following the same
1552  * algorithm.
1553  */
1554  if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1555  ereport(ERROR,
1557  errmsg("cannot create temporary directory \"%s\": %m",
1558  basedir)));
1559 
1560  /* Try again. */
1561  if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1562  ereport(ERROR,
1564  errmsg("cannot create temporary subdirectory \"%s\": %m",
1565  directory)));
1566  }
1567 }
1568 
1569 /*
1570  * Delete a directory and everything in it, if it exists.
1571  */
1572 void
1573 PathNameDeleteTemporaryDir(const char *dirname)
1574 {
1575  struct stat statbuf;
1576 
1577  /* Silently ignore missing directory. */
1578  if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1579  return;
1580 
1581  /*
1582  * Currently, walkdir doesn't offer a way for our passed in function to
1583  * maintain state. Perhaps it should, so that we could tell the caller
1584  * whether this operation succeeded or failed. Since this operation is
1585  * used in a cleanup path, we wouldn't actually behave differently: we'll
1586  * just log failures.
1587  */
1588  walkdir(dirname, unlink_if_exists_fname, false, LOG);
1589 }
1590 
1591 /*
1592  * Open a temporary file that will disappear when we close it.
1593  *
1594  * This routine takes care of generating an appropriate tempfile name.
1595  * There's no need to pass in fileFlags or fileMode either, since only
1596  * one setting makes any sense for a temp file.
1597  *
1598  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1599  * to ensure it's closed and deleted when it's no longer needed, typically at
1600  * the end-of-transaction. In most cases, you don't want temporary files to
1601  * outlive the transaction that created them, so this should be false -- but
1602  * if you need "somewhat" temporary storage, this might be useful. In either
1603  * case, the file is removed when the File is explicitly closed.
1604  */
1605 File
1606 OpenTemporaryFile(bool interXact)
1607 {
1608  File file = 0;
1609 
1610  /*
1611  * Make sure the current resource owner has space for this File before we
1612  * open it, if we'll be registering it below.
1613  */
1614  if (!interXact)
1616 
1617  /*
1618  * If some temp tablespace(s) have been given to us, try to use the next
1619  * one. If a given tablespace can't be found, we silently fall back to
1620  * the database's default tablespace.
1621  *
1622  * BUT: if the temp file is slated to outlive the current transaction,
1623  * force it into the database's default tablespace, so that it will not
1624  * pose a threat to possible tablespace drop attempts.
1625  */
1626  if (numTempTableSpaces > 0 && !interXact)
1627  {
1628  Oid tblspcOid = GetNextTempTableSpace();
1629 
1630  if (OidIsValid(tblspcOid))
1631  file = OpenTemporaryFileInTablespace(tblspcOid, false);
1632  }
1633 
1634  /*
1635  * If not, or if tablespace is bad, create in database's default
1636  * tablespace. MyDatabaseTableSpace should normally be set before we get
1637  * here, but just in case it isn't, fall back to pg_default tablespace.
1638  */
1639  if (file <= 0)
1642  DEFAULTTABLESPACE_OID,
1643  true);
1644 
1645  /* Mark it for deletion at close and temporary file size limit */
1646  VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1647 
1648  /* Register it with the current resource owner */
1649  if (!interXact)
1650  RegisterTemporaryFile(file);
1651 
1652  return file;
1653 }
1654 
1655 /*
1656  * Return the path of the temp directory in a given tablespace.
1657  */
1658 void
1660 {
1661  /*
1662  * Identify the tempfile directory for this tablespace.
1663  *
1664  * If someone tries to specify pg_global, use pg_default instead.
1665  */
1666  if (tablespace == InvalidOid ||
1667  tablespace == DEFAULTTABLESPACE_OID ||
1668  tablespace == GLOBALTABLESPACE_OID)
1669  snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1670  else
1671  {
1672  /* All other tablespaces are accessed via symlinks */
1673  snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1674  tablespace, TABLESPACE_VERSION_DIRECTORY,
1676  }
1677 }
1678 
1679 /*
1680  * Open a temporary file in a specific tablespace.
1681  * Subroutine for OpenTemporaryFile, which see for details.
1682  */
1683 static File
1684 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1685 {
1686  char tempdirpath[MAXPGPATH];
1687  char tempfilepath[MAXPGPATH];
1688  File file;
1689 
1690  TempTablespacePath(tempdirpath, tblspcOid);
1691 
1692  /*
1693  * Generate a tempfile name that should be unique within the current
1694  * database instance.
1695  */
1696  snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1697  tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1698 
1699  /*
1700  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1701  * temp file that can be reused.
1702  */
1703  file = PathNameOpenFile(tempfilepath,
1704  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1705  if (file <= 0)
1706  {
1707  /*
1708  * We might need to create the tablespace's tempfile directory, if no
1709  * one has yet done so.
1710  *
1711  * Don't check for an error from MakePGDirectory; it could fail if
1712  * someone else just did the same thing. If it doesn't work then
1713  * we'll bomb out on the second create attempt, instead.
1714  */
1715  (void) MakePGDirectory(tempdirpath);
1716 
1717  file = PathNameOpenFile(tempfilepath,
1718  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1719  if (file <= 0 && rejectError)
1720  elog(ERROR, "could not create temporary file \"%s\": %m",
1721  tempfilepath);
1722  }
1723 
1724  return file;
1725 }
1726 
1727 
1728 /*
1729  * Create a new file. The directory containing it must already exist. Files
1730  * created this way are subject to temp_file_limit and are automatically
1731  * closed at end of transaction, but are not automatically deleted on close
1732  * because they are intended to be shared between cooperating backends.
1733  *
1734  * If the file is inside the top-level temporary directory, its name should
1735  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1736  * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1737  * inside a directory created with PathNameCreateTemporaryDir(), in which case
1738  * the prefix isn't needed.
1739  */
1740 File
1741 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1742 {
1743  File file;
1744 
1746 
1747  /*
1748  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1749  * temp file that can be reused.
1750  */
1751  file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1752  if (file <= 0)
1753  {
1754  if (error_on_failure)
1755  ereport(ERROR,
1757  errmsg("could not create temporary file \"%s\": %m",
1758  path)));
1759  else
1760  return file;
1761  }
1762 
1763  /* Mark it for temp_file_limit accounting. */
1764  VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1765 
1766  /* Register it for automatic close. */
1767  RegisterTemporaryFile(file);
1768 
1769  return file;
1770 }
1771 
1772 /*
1773  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1774  * another backend. Files opened this way don't count against the
1775  * temp_file_limit of the caller, are automatically closed at the end of the
1776  * transaction but are not deleted on close.
1777  */
1778 File
1779 PathNameOpenTemporaryFile(const char *path, int mode)
1780 {
1781  File file;
1782 
1784 
1785  file = PathNameOpenFile(path, mode | PG_BINARY);
1786 
1787  /* If no such file, then we don't raise an error. */
1788  if (file <= 0 && errno != ENOENT)
1789  ereport(ERROR,
1791  errmsg("could not open temporary file \"%s\": %m",
1792  path)));
1793 
1794  if (file > 0)
1795  {
1796  /* Register it for automatic close. */
1797  RegisterTemporaryFile(file);
1798  }
1799 
1800  return file;
1801 }
1802 
1803 /*
1804  * Delete a file by pathname. Return true if the file existed, false if
1805  * didn't.
1806  */
1807 bool
1808 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1809 {
1810  struct stat filestats;
1811  int stat_errno;
1812 
1813  /* Get the final size for pgstat reporting. */
1814  if (stat(path, &filestats) != 0)
1815  stat_errno = errno;
1816  else
1817  stat_errno = 0;
1818 
1819  /*
1820  * Unlike FileClose's automatic file deletion code, we tolerate
1821  * non-existence to support BufFileDeleteShared which doesn't know how
1822  * many segments it has to delete until it runs out.
1823  */
1824  if (stat_errno == ENOENT)
1825  return false;
1826 
1827  if (unlink(path) < 0)
1828  {
1829  if (errno != ENOENT)
1830  ereport(error_on_failure ? ERROR : LOG,
1832  errmsg("could not unlink temporary file \"%s\": %m",
1833  path)));
1834  return false;
1835  }
1836 
1837  if (stat_errno == 0)
1838  ReportTemporaryFileUsage(path, filestats.st_size);
1839  else
1840  {
1841  errno = stat_errno;
1842  ereport(LOG,
1844  errmsg("could not stat file \"%s\": %m", path)));
1845  }
1846 
1847  return true;
1848 }
1849 
1850 /*
1851  * close a file when done with it
1852  */
1853 void
1855 {
1856  Vfd *vfdP;
1857 
1858  Assert(FileIsValid(file));
1859 
1860  DO_DB(elog(LOG, "FileClose: %d (%s)",
1861  file, VfdCache[file].fileName));
1862 
1863  vfdP = &VfdCache[file];
1864 
1865  if (!FileIsNotOpen(file))
1866  {
1867  /* close the file */
1868  if (close(vfdP->fd) != 0)
1869  {
1870  /*
1871  * We may need to panic on failure to close non-temporary files;
1872  * see LruDelete.
1873  */
1875  "could not close file \"%s\": %m", vfdP->fileName);
1876  }
1877 
1878  --nfile;
1879  vfdP->fd = VFD_CLOSED;
1880 
1881  /* remove the file from the lru ring */
1882  Delete(file);
1883  }
1884 
1885  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1886  {
1887  /* Subtract its size from current usage (do first in case of error) */
1888  temporary_files_size -= vfdP->fileSize;
1889  vfdP->fileSize = 0;
1890  }
1891 
1892  /*
1893  * Delete the file if it was temporary, and make a log entry if wanted
1894  */
1895  if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
1896  {
1897  struct stat filestats;
1898  int stat_errno;
1899 
1900  /*
1901  * If we get an error, as could happen within the ereport/elog calls,
1902  * we'll come right back here during transaction abort. Reset the
1903  * flag to ensure that we can't get into an infinite loop. This code
1904  * is arranged to ensure that the worst-case consequence is failing to
1905  * emit log message(s), not failing to attempt the unlink.
1906  */
1907  vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
1908 
1909 
1910  /* first try the stat() */
1911  if (stat(vfdP->fileName, &filestats))
1912  stat_errno = errno;
1913  else
1914  stat_errno = 0;
1915 
1916  /* in any case do the unlink */
1917  if (unlink(vfdP->fileName))
1918  ereport(LOG,
1920  errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
1921 
1922  /* and last report the stat results */
1923  if (stat_errno == 0)
1924  ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
1925  else
1926  {
1927  errno = stat_errno;
1928  ereport(LOG,
1930  errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
1931  }
1932  }
1933 
1934  /* Unregister it from the resource owner */
1935  if (vfdP->resowner)
1936  ResourceOwnerForgetFile(vfdP->resowner, file);
1937 
1938  /*
1939  * Return the Vfd slot to the free list
1940  */
1941  FreeVfd(file);
1942 }
1943 
1944 /*
1945  * FilePrefetch - initiate asynchronous read of a given range of the file.
1946  *
1947  * Currently the only implementation of this function is using posix_fadvise
1948  * which is the simplest standardized interface that accomplishes this.
1949  * We could add an implementation using libaio in the future; but note that
1950  * this API is inappropriate for libaio, which wants to have a buffer provided
1951  * to read into.
1952  */
1953 int
1954 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1955 {
1956 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1957  int returnCode;
1958 
1959  Assert(FileIsValid(file));
1960 
1961  DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1962  file, VfdCache[file].fileName,
1963  (int64) offset, amount));
1964 
1965  returnCode = FileAccess(file);
1966  if (returnCode < 0)
1967  return returnCode;
1968 
1969  pgstat_report_wait_start(wait_event_info);
1970  returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1971  POSIX_FADV_WILLNEED);
1973 
1974  return returnCode;
1975 #else
1976  Assert(FileIsValid(file));
1977  return 0;
1978 #endif
1979 }
1980 
1981 void
1982 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1983 {
1984  int returnCode;
1985 
1986  Assert(FileIsValid(file));
1987 
1988  DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1989  file, VfdCache[file].fileName,
1990  (int64) offset, (int64) nbytes));
1991 
1992  if (nbytes <= 0)
1993  return;
1994 
1995  returnCode = FileAccess(file);
1996  if (returnCode < 0)
1997  return;
1998 
1999  pgstat_report_wait_start(wait_event_info);
2000  pg_flush_data(VfdCache[file].fd, offset, nbytes);
2002 }
2003 
2004 int
2005 FileRead(File file, char *buffer, int amount, off_t offset,
2006  uint32 wait_event_info)
2007 {
2008  int returnCode;
2009  Vfd *vfdP;
2010 
2011  Assert(FileIsValid(file));
2012 
2013  DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
2014  file, VfdCache[file].fileName,
2015  (int64) offset,
2016  amount, buffer));
2017 
2018  returnCode = FileAccess(file);
2019  if (returnCode < 0)
2020  return returnCode;
2021 
2022  vfdP = &VfdCache[file];
2023 
2024 retry:
2025  pgstat_report_wait_start(wait_event_info);
2026  returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
2028 
2029  if (returnCode < 0)
2030  {
2031  /*
2032  * Windows may run out of kernel buffers and return "Insufficient
2033  * system resources" error. Wait a bit and retry to solve it.
2034  *
2035  * It is rumored that EINTR is also possible on some Unix filesystems,
2036  * in which case immediate retry is indicated.
2037  */
2038 #ifdef WIN32
2039  DWORD error = GetLastError();
2040 
2041  switch (error)
2042  {
2043  case ERROR_NO_SYSTEM_RESOURCES:
2044  pg_usleep(1000L);
2045  errno = EINTR;
2046  break;
2047  default:
2048  _dosmaperr(error);
2049  break;
2050  }
2051 #endif
2052  /* OK to retry if interrupted */
2053  if (errno == EINTR)
2054  goto retry;
2055  }
2056 
2057  return returnCode;
2058 }
2059 
2060 int
2061 FileWrite(File file, char *buffer, int amount, off_t offset,
2062  uint32 wait_event_info)
2063 {
2064  int returnCode;
2065  Vfd *vfdP;
2066 
2067  Assert(FileIsValid(file));
2068 
2069  DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
2070  file, VfdCache[file].fileName,
2071  (int64) offset,
2072  amount, buffer));
2073 
2074  returnCode = FileAccess(file);
2075  if (returnCode < 0)
2076  return returnCode;
2077 
2078  vfdP = &VfdCache[file];
2079 
2080  /*
2081  * If enforcing temp_file_limit and it's a temp file, check to see if the
2082  * write would overrun temp_file_limit, and throw error if so. Note: it's
2083  * really a modularity violation to throw error here; we should set errno
2084  * and return -1. However, there's no way to report a suitable error
2085  * message if we do that. All current callers would just throw error
2086  * immediately anyway, so this is safe at present.
2087  */
2088  if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2089  {
2090  off_t past_write = offset + amount;
2091 
2092  if (past_write > vfdP->fileSize)
2093  {
2094  uint64 newTotal = temporary_files_size;
2095 
2096  newTotal += past_write - vfdP->fileSize;
2097  if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2098  ereport(ERROR,
2099  (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2100  errmsg("temporary file size exceeds temp_file_limit (%dkB)",
2101  temp_file_limit)));
2102  }
2103  }
2104 
2105 retry:
2106  errno = 0;
2107  pgstat_report_wait_start(wait_event_info);
2108  returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
2110 
2111  /* if write didn't set errno, assume problem is no disk space */
2112  if (returnCode != amount && errno == 0)
2113  errno = ENOSPC;
2114 
2115  if (returnCode >= 0)
2116  {
2117  /*
2118  * Maintain fileSize and temporary_files_size if it's a temp file.
2119  */
2120  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2121  {
2122  off_t past_write = offset + amount;
2123 
2124  if (past_write > vfdP->fileSize)
2125  {
2126  temporary_files_size += past_write - vfdP->fileSize;
2127  vfdP->fileSize = past_write;
2128  }
2129  }
2130  }
2131  else
2132  {
2133  /*
2134  * See comments in FileRead()
2135  */
2136 #ifdef WIN32
2137  DWORD error = GetLastError();
2138 
2139  switch (error)
2140  {
2141  case ERROR_NO_SYSTEM_RESOURCES:
2142  pg_usleep(1000L);
2143  errno = EINTR;
2144  break;
2145  default:
2146  _dosmaperr(error);
2147  break;
2148  }
2149 #endif
2150  /* OK to retry if interrupted */
2151  if (errno == EINTR)
2152  goto retry;
2153  }
2154 
2155  return returnCode;
2156 }
2157 
2158 int
2159 FileSync(File file, uint32 wait_event_info)
2160 {
2161  int returnCode;
2162 
2163  Assert(FileIsValid(file));
2164 
2165  DO_DB(elog(LOG, "FileSync: %d (%s)",
2166  file, VfdCache[file].fileName));
2167 
2168  returnCode = FileAccess(file);
2169  if (returnCode < 0)
2170  return returnCode;
2171 
2172  pgstat_report_wait_start(wait_event_info);
2173  returnCode = pg_fsync(VfdCache[file].fd);
2175 
2176  return returnCode;
2177 }
2178 
2179 off_t
2181 {
2182  Assert(FileIsValid(file));
2183 
2184  DO_DB(elog(LOG, "FileSize %d (%s)",
2185  file, VfdCache[file].fileName));
2186 
2187  if (FileIsNotOpen(file))
2188  {
2189  if (FileAccess(file) < 0)
2190  return (off_t) -1;
2191  }
2192 
2193  return lseek(VfdCache[file].fd, 0, SEEK_END);
2194 }
2195 
2196 int
2197 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2198 {
2199  int returnCode;
2200 
2201  Assert(FileIsValid(file));
2202 
2203  DO_DB(elog(LOG, "FileTruncate %d (%s)",
2204  file, VfdCache[file].fileName));
2205 
2206  returnCode = FileAccess(file);
2207  if (returnCode < 0)
2208  return returnCode;
2209 
2210  pgstat_report_wait_start(wait_event_info);
2211  returnCode = ftruncate(VfdCache[file].fd, offset);
2213 
2214  if (returnCode == 0 && VfdCache[file].fileSize > offset)
2215  {
2216  /* adjust our state for truncation of a temp file */
2217  Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2218  temporary_files_size -= VfdCache[file].fileSize - offset;
2219  VfdCache[file].fileSize = offset;
2220  }
2221 
2222  return returnCode;
2223 }
2224 
2225 /*
2226  * Return the pathname associated with an open file.
2227  *
2228  * The returned string points to an internal buffer, which is valid until
2229  * the file is closed.
2230  */
2231 char *
2233 {
2234  Assert(FileIsValid(file));
2235 
2236  return VfdCache[file].fileName;
2237 }
2238 
2239 /*
2240  * Return the raw file descriptor of an opened file.
2241  *
2242  * The returned file descriptor will be valid until the file is closed, but
2243  * there are a lot of things that can make that happen. So the caller should
2244  * be careful not to do much of anything else before it finishes using the
2245  * returned file descriptor.
2246  */
2247 int
2249 {
2250  Assert(FileIsValid(file));
2251  return VfdCache[file].fd;
2252 }
2253 
2254 /*
2255  * FileGetRawFlags - returns the file flags on open(2)
2256  */
2257 int
2259 {
2260  Assert(FileIsValid(file));
2261  return VfdCache[file].fileFlags;
2262 }
2263 
2264 /*
2265  * FileGetRawMode - returns the mode bitmask passed to open(2)
2266  */
2267 mode_t
2269 {
2270  Assert(FileIsValid(file));
2271  return VfdCache[file].fileMode;
2272 }
2273 
2274 /*
2275  * Make room for another allocatedDescs[] array entry if needed and possible.
2276  * Returns true if an array element is available.
2277  */
2278 static bool
2280 {
2281  AllocateDesc *newDescs;
2282  int newMax;
2283 
2284  /* Quick out if array already has a free slot. */
2286  return true;
2287 
2288  /*
2289  * If the array hasn't yet been created in the current process, initialize
2290  * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2291  * we will ever need, anyway. We don't want to look at max_safe_fds
2292  * immediately because set_max_safe_fds() may not have run yet.
2293  */
2294  if (allocatedDescs == NULL)
2295  {
2296  newMax = FD_MINFREE / 3;
2297  newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2298  /* Out of memory already? Treat as fatal error. */
2299  if (newDescs == NULL)
2300  ereport(ERROR,
2301  (errcode(ERRCODE_OUT_OF_MEMORY),
2302  errmsg("out of memory")));
2303  allocatedDescs = newDescs;
2304  maxAllocatedDescs = newMax;
2305  return true;
2306  }
2307 
2308  /*
2309  * Consider enlarging the array beyond the initial allocation used above.
2310  * By the time this happens, max_safe_fds should be known accurately.
2311  *
2312  * We mustn't let allocated descriptors hog all the available FDs, and in
2313  * practice we'd better leave a reasonable number of FDs for VFD use. So
2314  * set the maximum to max_safe_fds / 3. (This should certainly be at
2315  * least as large as the initial size, FD_MINFREE / 3, so we aren't
2316  * tightening the restriction here.) Recall that "external" FDs are
2317  * allowed to consume another third of max_safe_fds.
2318  */
2319  newMax = max_safe_fds / 3;
2320  if (newMax > maxAllocatedDescs)
2321  {
2322  newDescs = (AllocateDesc *) realloc(allocatedDescs,
2323  newMax * sizeof(AllocateDesc));
2324  /* Treat out-of-memory as a non-fatal error. */
2325  if (newDescs == NULL)
2326  return false;
2327  allocatedDescs = newDescs;
2328  maxAllocatedDescs = newMax;
2329  return true;
2330  }
2331 
2332  /* Can't enlarge allocatedDescs[] any more. */
2333  return false;
2334 }
2335 
2336 /*
2337  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2338  * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2339  * necessary to open the file. When done, call FreeFile rather than fclose.
2340  *
2341  * Note that files that will be open for any significant length of time
2342  * should NOT be handled this way, since they cannot share kernel file
2343  * descriptors with other files; there is grave risk of running out of FDs
2344  * if anyone locks down too many FDs. Most callers of this routine are
2345  * simply reading a config file that they will read and close immediately.
2346  *
2347  * fd.c will automatically close all files opened with AllocateFile at
2348  * transaction commit or abort; this prevents FD leakage if a routine
2349  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2350  *
2351  * Ideally this should be the *only* direct call of fopen() in the backend.
2352  */
2353 FILE *
2354 AllocateFile(const char *name, const char *mode)
2355 {
2356  FILE *file;
2357 
2358  DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2359  numAllocatedDescs, name));
2360 
2361  /* Can we allocate another non-virtual FD? */
2362  if (!reserveAllocatedDesc())
2363  ereport(ERROR,
2364  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2365  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2366  maxAllocatedDescs, name)));
2367 
2368  /* Close excess kernel FDs. */
2369  ReleaseLruFiles();
2370 
2371 TryAgain:
2372  if ((file = fopen(name, mode)) != NULL)
2373  {
2374  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2375 
2376  desc->kind = AllocateDescFile;
2377  desc->desc.file = file;
2380  return desc->desc.file;
2381  }
2382 
2383  if (errno == EMFILE || errno == ENFILE)
2384  {
2385  int save_errno = errno;
2386 
2387  ereport(LOG,
2388  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2389  errmsg("out of file descriptors: %m; release and retry")));
2390  errno = 0;
2391  if (ReleaseLruFile())
2392  goto TryAgain;
2393  errno = save_errno;
2394  }
2395 
2396  return NULL;
2397 }
2398 
2399 /*
2400  * Open a file with OpenTransientFilePerm() and pass default file mode for
2401  * the fileMode parameter.
2402  */
2403 int
2405 {
2406  return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2407 }
2408 
2409 /*
2410  * Like AllocateFile, but returns an unbuffered fd like open(2)
2411  */
2412 int
2414 {
2415  int fd;
2416 
2417  DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2418  numAllocatedDescs, fileName));
2419 
2420  /* Can we allocate another non-virtual FD? */
2421  if (!reserveAllocatedDesc())
2422  ereport(ERROR,
2423  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2424  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2425  maxAllocatedDescs, fileName)));
2426 
2427  /* Close excess kernel FDs. */
2428  ReleaseLruFiles();
2429 
2430  fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2431 
2432  if (fd >= 0)
2433  {
2434  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2435 
2436  desc->kind = AllocateDescRawFD;
2437  desc->desc.fd = fd;
2440 
2441  return fd;
2442  }
2443 
2444  return -1; /* failure */
2445 }
2446 
2447 /*
2448  * Routines that want to initiate a pipe stream should use OpenPipeStream
2449  * rather than plain popen(). This lets fd.c deal with freeing FDs if
2450  * necessary. When done, call ClosePipeStream rather than pclose.
2451  *
2452  * This function also ensures that the popen'd program is run with default
2453  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2454  * uses. This ensures desirable response to, eg, closing a read pipe early.
2455  */
2456 FILE *
2457 OpenPipeStream(const char *command, const char *mode)
2458 {
2459  FILE *file;
2460  int save_errno;
2461 
2462  DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2463  numAllocatedDescs, command));
2464 
2465  /* Can we allocate another non-virtual FD? */
2466  if (!reserveAllocatedDesc())
2467  ereport(ERROR,
2468  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2469  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2470  maxAllocatedDescs, command)));
2471 
2472  /* Close excess kernel FDs. */
2473  ReleaseLruFiles();
2474 
2475 TryAgain:
2476  fflush(stdout);
2477  fflush(stderr);
2479  errno = 0;
2480  file = popen(command, mode);
2481  save_errno = errno;
2483  errno = save_errno;
2484  if (file != NULL)
2485  {
2486  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2487 
2488  desc->kind = AllocateDescPipe;
2489  desc->desc.file = file;
2492  return desc->desc.file;
2493  }
2494 
2495  if (errno == EMFILE || errno == ENFILE)
2496  {
2497  ereport(LOG,
2498  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2499  errmsg("out of file descriptors: %m; release and retry")));
2500  if (ReleaseLruFile())
2501  goto TryAgain;
2502  errno = save_errno;
2503  }
2504 
2505  return NULL;
2506 }
2507 
2508 /*
2509  * Free an AllocateDesc of any type.
2510  *
2511  * The argument *must* point into the allocatedDescs[] array.
2512  */
2513 static int
2515 {
2516  int result;
2517 
2518  /* Close the underlying object */
2519  switch (desc->kind)
2520  {
2521  case AllocateDescFile:
2522  result = fclose(desc->desc.file);
2523  break;
2524  case AllocateDescPipe:
2525  result = pclose(desc->desc.file);
2526  break;
2527  case AllocateDescDir:
2528  result = closedir(desc->desc.dir);
2529  break;
2530  case AllocateDescRawFD:
2531  result = close(desc->desc.fd);
2532  break;
2533  default:
2534  elog(ERROR, "AllocateDesc kind not recognized");
2535  result = 0; /* keep compiler quiet */
2536  break;
2537  }
2538 
2539  /* Compact storage in the allocatedDescs array */
2541  *desc = allocatedDescs[numAllocatedDescs];
2542 
2543  return result;
2544 }
2545 
2546 /*
2547  * Close a file returned by AllocateFile.
2548  *
2549  * Note we do not check fclose's return value --- it is up to the caller
2550  * to handle close errors.
2551  */
2552 int
2553 FreeFile(FILE *file)
2554 {
2555  int i;
2556 
2557  DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2558 
2559  /* Remove file from list of allocated files, if it's present */
2560  for (i = numAllocatedDescs; --i >= 0;)
2561  {
2562  AllocateDesc *desc = &allocatedDescs[i];
2563 
2564  if (desc->kind == AllocateDescFile && desc->desc.file == file)
2565  return FreeDesc(desc);
2566  }
2567 
2568  /* Only get here if someone passes us a file not in allocatedDescs */
2569  elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2570 
2571  return fclose(file);
2572 }
2573 
2574 /*
2575  * Close a file returned by OpenTransientFile.
2576  *
2577  * Note we do not check close's return value --- it is up to the caller
2578  * to handle close errors.
2579  */
2580 int
2582 {
2583  int i;
2584 
2585  DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2586 
2587  /* Remove fd from list of allocated files, if it's present */
2588  for (i = numAllocatedDescs; --i >= 0;)
2589  {
2590  AllocateDesc *desc = &allocatedDescs[i];
2591 
2592  if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2593  return FreeDesc(desc);
2594  }
2595 
2596  /* Only get here if someone passes us a file not in allocatedDescs */
2597  elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2598 
2599  return close(fd);
2600 }
2601 
2602 /*
2603  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2604  * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2605  * necessary to open the directory, and with closing it after an elog.
2606  * When done, call FreeDir rather than closedir.
2607  *
2608  * Returns NULL, with errno set, on failure. Note that failure detection
2609  * is commonly left to the following call of ReadDir or ReadDirExtended;
2610  * see the comments for ReadDir.
2611  *
2612  * Ideally this should be the *only* direct call of opendir() in the backend.
2613  */
2614 DIR *
2615 AllocateDir(const char *dirname)
2616 {
2617  DIR *dir;
2618 
2619  DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2620  numAllocatedDescs, dirname));
2621 
2622  /* Can we allocate another non-virtual FD? */
2623  if (!reserveAllocatedDesc())
2624  ereport(ERROR,
2625  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2626  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2627  maxAllocatedDescs, dirname)));
2628 
2629  /* Close excess kernel FDs. */
2630  ReleaseLruFiles();
2631 
2632 TryAgain:
2633  if ((dir = opendir(dirname)) != NULL)
2634  {
2635  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2636 
2637  desc->kind = AllocateDescDir;
2638  desc->desc.dir = dir;
2641  return desc->desc.dir;
2642  }
2643 
2644  if (errno == EMFILE || errno == ENFILE)
2645  {
2646  int save_errno = errno;
2647 
2648  ereport(LOG,
2649  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2650  errmsg("out of file descriptors: %m; release and retry")));
2651  errno = 0;
2652  if (ReleaseLruFile())
2653  goto TryAgain;
2654  errno = save_errno;
2655  }
2656 
2657  return NULL;
2658 }
2659 
2660 /*
2661  * Read a directory opened with AllocateDir, ereport'ing any error.
2662  *
2663  * This is easier to use than raw readdir() since it takes care of some
2664  * otherwise rather tedious and error-prone manipulation of errno. Also,
2665  * if you are happy with a generic error message for AllocateDir failure,
2666  * you can just do
2667  *
2668  * dir = AllocateDir(path);
2669  * while ((dirent = ReadDir(dir, path)) != NULL)
2670  * process dirent;
2671  * FreeDir(dir);
2672  *
2673  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2674  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2675  * use this shortcut.)
2676  *
2677  * The pathname passed to AllocateDir must be passed to this routine too,
2678  * but it is only used for error reporting.
2679  */
2680 struct dirent *
2681 ReadDir(DIR *dir, const char *dirname)
2682 {
2683  return ReadDirExtended(dir, dirname, ERROR);
2684 }
2685 
2686 /*
2687  * Alternate version of ReadDir that allows caller to specify the elevel
2688  * for any error report (whether it's reporting an initial failure of
2689  * AllocateDir or a subsequent directory read failure).
2690  *
2691  * If elevel < ERROR, returns NULL after any error. With the normal coding
2692  * pattern, this will result in falling out of the loop immediately as
2693  * though the directory contained no (more) entries.
2694  */
2695 struct dirent *
2696 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2697 {
2698  struct dirent *dent;
2699 
2700  /* Give a generic message for AllocateDir failure, if caller didn't */
2701  if (dir == NULL)
2702  {
2703  ereport(elevel,
2705  errmsg("could not open directory \"%s\": %m",
2706  dirname)));
2707  return NULL;
2708  }
2709 
2710  errno = 0;
2711  if ((dent = readdir(dir)) != NULL)
2712  return dent;
2713 
2714  if (errno)
2715  ereport(elevel,
2717  errmsg("could not read directory \"%s\": %m",
2718  dirname)));
2719  return NULL;
2720 }
2721 
2722 /*
2723  * Close a directory opened with AllocateDir.
2724  *
2725  * Returns closedir's return value (with errno set if it's not 0).
2726  * Note we do not check the return value --- it is up to the caller
2727  * to handle close errors if wanted.
2728  *
2729  * Does nothing if dir == NULL; we assume that directory open failure was
2730  * already reported if desired.
2731  */
2732 int
2734 {
2735  int i;
2736 
2737  /* Nothing to do if AllocateDir failed */
2738  if (dir == NULL)
2739  return 0;
2740 
2741  DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2742 
2743  /* Remove dir from list of allocated dirs, if it's present */
2744  for (i = numAllocatedDescs; --i >= 0;)
2745  {
2746  AllocateDesc *desc = &allocatedDescs[i];
2747 
2748  if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2749  return FreeDesc(desc);
2750  }
2751 
2752  /* Only get here if someone passes us a dir not in allocatedDescs */
2753  elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2754 
2755  return closedir(dir);
2756 }
2757 
2758 
2759 /*
2760  * Close a pipe stream returned by OpenPipeStream.
2761  */
2762 int
2763 ClosePipeStream(FILE *file)
2764 {
2765  int i;
2766 
2767  DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2768 
2769  /* Remove file from list of allocated files, if it's present */
2770  for (i = numAllocatedDescs; --i >= 0;)
2771  {
2772  AllocateDesc *desc = &allocatedDescs[i];
2773 
2774  if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2775  return FreeDesc(desc);
2776  }
2777 
2778  /* Only get here if someone passes us a file not in allocatedDescs */
2779  elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2780 
2781  return pclose(file);
2782 }
2783 
2784 /*
2785  * closeAllVfds
2786  *
2787  * Force all VFDs into the physically-closed state, so that the fewest
2788  * possible number of kernel file descriptors are in use. There is no
2789  * change in the logical state of the VFDs.
2790  */
2791 void
2793 {
2794  Index i;
2795 
2796  if (SizeVfdCache > 0)
2797  {
2798  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2799  for (i = 1; i < SizeVfdCache; i++)
2800  {
2801  if (!FileIsNotOpen(i))
2802  LruDelete(i);
2803  }
2804  }
2805 }
2806 
2807 
2808 /*
2809  * SetTempTablespaces
2810  *
2811  * Define a list (actually an array) of OIDs of tablespaces to use for
2812  * temporary files. This list will be used until end of transaction,
2813  * unless this function is called again before then. It is caller's
2814  * responsibility that the passed-in array has adequate lifespan (typically
2815  * it'd be allocated in TopTransactionContext).
2816  *
2817  * Some entries of the array may be InvalidOid, indicating that the current
2818  * database's default tablespace should be used.
2819  */
2820 void
2821 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2822 {
2823  Assert(numSpaces >= 0);
2824  tempTableSpaces = tableSpaces;
2825  numTempTableSpaces = numSpaces;
2826 
2827  /*
2828  * Select a random starting point in the list. This is to minimize
2829  * conflicts between backends that are most likely sharing the same list
2830  * of temp tablespaces. Note that if we create multiple temp files in the
2831  * same transaction, we'll advance circularly through the list --- this
2832  * ensures that large temporary sort files are nicely spread across all
2833  * available tablespaces.
2834  */
2835  if (numSpaces > 1)
2836  nextTempTableSpace = random() % numSpaces;
2837  else
2838  nextTempTableSpace = 0;
2839 }
2840 
2841 /*
2842  * TempTablespacesAreSet
2843  *
2844  * Returns true if SetTempTablespaces has been called in current transaction.
2845  * (This is just so that tablespaces.c doesn't need its own per-transaction
2846  * state.)
2847  */
2848 bool
2850 {
2851  return (numTempTableSpaces >= 0);
2852 }
2853 
2854 /*
2855  * GetTempTablespaces
2856  *
2857  * Populate an array with the OIDs of the tablespaces that should be used for
2858  * temporary files. (Some entries may be InvalidOid, indicating that the
2859  * current database's default tablespace should be used.) At most numSpaces
2860  * entries will be filled.
2861  * Returns the number of OIDs that were copied into the output array.
2862  */
2863 int
2864 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2865 {
2866  int i;
2867 
2869  for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2870  tableSpaces[i] = tempTableSpaces[i];
2871 
2872  return i;
2873 }
2874 
2875 /*
2876  * GetNextTempTableSpace
2877  *
2878  * Select the next temp tablespace to use. A result of InvalidOid means
2879  * to use the current database's default tablespace.
2880  */
2881 Oid
2883 {
2884  if (numTempTableSpaces > 0)
2885  {
2886  /* Advance nextTempTableSpace counter with wraparound */
2888  nextTempTableSpace = 0;
2890  }
2891  return InvalidOid;
2892 }
2893 
2894 
2895 /*
2896  * AtEOSubXact_Files
2897  *
2898  * Take care of subtransaction commit/abort. At abort, we close temp files
2899  * that the subtransaction may have opened. At commit, we reassign the
2900  * files that were opened to the parent subtransaction.
2901  */
2902 void
2903 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2904  SubTransactionId parentSubid)
2905 {
2906  Index i;
2907 
2908  for (i = 0; i < numAllocatedDescs; i++)
2909  {
2910  if (allocatedDescs[i].create_subid == mySubid)
2911  {
2912  if (isCommit)
2913  allocatedDescs[i].create_subid = parentSubid;
2914  else
2915  {
2916  /* have to recheck the item after FreeDesc (ugly) */
2917  FreeDesc(&allocatedDescs[i--]);
2918  }
2919  }
2920  }
2921 }
2922 
2923 /*
2924  * AtEOXact_Files
2925  *
2926  * This routine is called during transaction commit or abort. All still-open
2927  * per-transaction temporary file VFDs are closed, which also causes the
2928  * underlying files to be deleted (although they should've been closed already
2929  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
2930  * closed. We also forget any transaction-local temp tablespace list.
2931  *
2932  * The isCommit flag is used only to decide whether to emit warnings about
2933  * unclosed files.
2934  */
2935 void
2936 AtEOXact_Files(bool isCommit)
2937 {
2938  CleanupTempFiles(isCommit, false);
2939  tempTableSpaces = NULL;
2940  numTempTableSpaces = -1;
2941 }
2942 
2943 /*
2944  * AtProcExit_Files
2945  *
2946  * on_proc_exit hook to clean up temp files during backend shutdown.
2947  * Here, we want to clean up *all* temp files including interXact ones.
2948  */
2949 static void
2951 {
2952  CleanupTempFiles(false, true);
2953 }
2954 
2955 /*
2956  * Close temporary files and delete their underlying files.
2957  *
2958  * isCommit: if true, this is normal transaction commit, and we don't
2959  * expect any remaining files; warn if there are some.
2960  *
2961  * isProcExit: if true, this is being called as the backend process is
2962  * exiting. If that's the case, we should remove all temporary files; if
2963  * that's not the case, we are being called for transaction commit/abort
2964  * and should only remove transaction-local temp files. In either case,
2965  * also clean up "allocated" stdio files, dirs and fds.
2966  */
2967 static void
2968 CleanupTempFiles(bool isCommit, bool isProcExit)
2969 {
2970  Index i;
2971 
2972  /*
2973  * Careful here: at proc_exit we need extra cleanup, not just
2974  * xact_temporary files.
2975  */
2976  if (isProcExit || have_xact_temporary_files)
2977  {
2978  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2979  for (i = 1; i < SizeVfdCache; i++)
2980  {
2981  unsigned short fdstate = VfdCache[i].fdstate;
2982 
2983  if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
2984  VfdCache[i].fileName != NULL)
2985  {
2986  /*
2987  * If we're in the process of exiting a backend process, close
2988  * all temporary files. Otherwise, only close temporary files
2989  * local to the current transaction. They should be closed by
2990  * the ResourceOwner mechanism already, so this is just a
2991  * debugging cross-check.
2992  */
2993  if (isProcExit)
2994  FileClose(i);
2995  else if (fdstate & FD_CLOSE_AT_EOXACT)
2996  {
2997  elog(WARNING,
2998  "temporary file %s not closed at end-of-transaction",
2999  VfdCache[i].fileName);
3000  FileClose(i);
3001  }
3002  }
3003  }
3004 
3005  have_xact_temporary_files = false;
3006  }
3007 
3008  /* Complain if any allocated files remain open at commit. */
3009  if (isCommit && numAllocatedDescs > 0)
3010  elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3012 
3013  /* Clean up "allocated" stdio files, dirs and fds. */
3014  while (numAllocatedDescs > 0)
3015  FreeDesc(&allocatedDescs[0]);
3016 }
3017 
3018 
3019 /*
3020  * Remove temporary and temporary relation files left over from a prior
3021  * postmaster session
3022  *
3023  * This should be called during postmaster startup. It will forcibly
3024  * remove any leftover files created by OpenTemporaryFile and any leftover
3025  * temporary relation files created by mdcreate.
3026  *
3027  * NOTE: we could, but don't, call this during a post-backend-crash restart
3028  * cycle. The argument for not doing it is that someone might want to examine
3029  * the temp files for debugging purposes. This does however mean that
3030  * OpenTemporaryFile had better allow for collision with an existing temp
3031  * file name.
3032  *
3033  * NOTE: this function and its subroutines generally report syscall failures
3034  * with ereport(LOG) and keep going. Removing temp files is not so critical
3035  * that we should fail to start the database when we can't do it.
3036  */
3037 void
3039 {
3040  char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3041  DIR *spc_dir;
3042  struct dirent *spc_de;
3043 
3044  /*
3045  * First process temp files in pg_default ($PGDATA/base)
3046  */
3047  snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3048  RemovePgTempFilesInDir(temp_path, true, false);
3049  RemovePgTempRelationFiles("base");
3050 
3051  /*
3052  * Cycle through temp directories for all non-default tablespaces.
3053  */
3054  spc_dir = AllocateDir("pg_tblspc");
3055 
3056  while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
3057  {
3058  if (strcmp(spc_de->d_name, ".") == 0 ||
3059  strcmp(spc_de->d_name, "..") == 0)
3060  continue;
3061 
3062  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
3064  RemovePgTempFilesInDir(temp_path, true, false);
3065 
3066  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
3068  RemovePgTempRelationFiles(temp_path);
3069  }
3070 
3071  FreeDir(spc_dir);
3072 
3073  /*
3074  * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3075  * DataDir as well. However, that is *not* cleaned here because doing so
3076  * would create a race condition. It's done separately, earlier in
3077  * postmaster startup.
3078  */
3079 }
3080 
3081 /*
3082  * Process one pgsql_tmp directory for RemovePgTempFiles.
3083  *
3084  * If missing_ok is true, it's all right for the named directory to not exist.
3085  * Any other problem results in a LOG message. (missing_ok should be true at
3086  * the top level, since pgsql_tmp directories are not created until needed.)
3087  *
3088  * At the top level, this should be called with unlink_all = false, so that
3089  * only files matching the temporary name prefix will be unlinked. When
3090  * recursing it will be called with unlink_all = true to unlink everything
3091  * under a top-level temporary directory.
3092  *
3093  * (These two flags could be replaced by one, but it seems clearer to keep
3094  * them separate.)
3095  */
3096 void
3097 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3098 {
3099  DIR *temp_dir;
3100  struct dirent *temp_de;
3101  char rm_path[MAXPGPATH * 2];
3102 
3103  temp_dir = AllocateDir(tmpdirname);
3104 
3105  if (temp_dir == NULL && errno == ENOENT && missing_ok)
3106  return;
3107 
3108  while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3109  {
3110  if (strcmp(temp_de->d_name, ".") == 0 ||
3111  strcmp(temp_de->d_name, "..") == 0)
3112  continue;
3113 
3114  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3115  tmpdirname, temp_de->d_name);
3116 
3117  if (unlink_all ||
3118  strncmp(temp_de->d_name,
3120  strlen(PG_TEMP_FILE_PREFIX)) == 0)
3121  {
3122  struct stat statbuf;
3123 
3124  if (lstat(rm_path, &statbuf) < 0)
3125  {
3126  ereport(LOG,
3128  errmsg("could not stat file \"%s\": %m", rm_path)));
3129  continue;
3130  }
3131 
3132  if (S_ISDIR(statbuf.st_mode))
3133  {
3134  /* recursively remove contents, then directory itself */
3135  RemovePgTempFilesInDir(rm_path, false, true);
3136 
3137  if (rmdir(rm_path) < 0)
3138  ereport(LOG,
3140  errmsg("could not remove directory \"%s\": %m",
3141  rm_path)));
3142  }
3143  else
3144  {
3145  if (unlink(rm_path) < 0)
3146  ereport(LOG,
3148  errmsg("could not remove file \"%s\": %m",
3149  rm_path)));
3150  }
3151  }
3152  else
3153  ereport(LOG,
3154  (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3155  rm_path)));
3156  }
3157 
3158  FreeDir(temp_dir);
3159 }
3160 
3161 /* Process one tablespace directory, look for per-DB subdirectories */
3162 static void
3163 RemovePgTempRelationFiles(const char *tsdirname)
3164 {
3165  DIR *ts_dir;
3166  struct dirent *de;
3167  char dbspace_path[MAXPGPATH * 2];
3168 
3169  ts_dir = AllocateDir(tsdirname);
3170 
3171  while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3172  {
3173  /*
3174  * We're only interested in the per-database directories, which have
3175  * numeric names. Note that this code will also (properly) ignore "."
3176  * and "..".
3177  */
3178  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3179  continue;
3180 
3181  snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3182  tsdirname, de->d_name);
3183  RemovePgTempRelationFilesInDbspace(dbspace_path);
3184  }
3185 
3186  FreeDir(ts_dir);
3187 }
3188 
3189 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3190 static void
3191 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3192 {
3193  DIR *dbspace_dir;
3194  struct dirent *de;
3195  char rm_path[MAXPGPATH * 2];
3196 
3197  dbspace_dir = AllocateDir(dbspacedirname);
3198 
3199  while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3200  {
3201  if (!looks_like_temp_rel_name(de->d_name))
3202  continue;
3203 
3204  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3205  dbspacedirname, de->d_name);
3206 
3207  if (unlink(rm_path) < 0)
3208  ereport(LOG,
3210  errmsg("could not remove file \"%s\": %m",
3211  rm_path)));
3212  }
3213 
3214  FreeDir(dbspace_dir);
3215 }
3216 
/*
 * Does "name" look like a temporary relation file name?
 *
 * Accepted forms are t<digits>_<digits> or t<digits>_<digits>_<forkname>,
 * each optionally followed by .<digits> (a segment suffix).  The fork name
 * is validated by forkname_chars().  Returns true only if the whole string
 * matches.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	int			pos;
	int			savepos;

	/* Must start with "t". */
	if (name[0] != 't')
		return false;

	/* Followed by a non-empty string of digits and then an underscore. */
	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (pos == 1 || name[pos] != '_')
		return false;

	/* Followed by another nonempty string of digits. */
	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (savepos == pos)
		return false;

	/* We might have _forkname or .segment or both. */
	if (name[pos] == '_')
	{
		int			forkchar = forkname_chars(&name[pos + 1], NULL);

		if (forkchar <= 0)
			return false;
		/* skip the underscore plus the fork name */
		pos += forkchar + 1;
	}
	if (name[pos] == '.')
	{
		int			segchar;

		/* segchar starts at 1 to account for the dot itself */
		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
			;
		if (segchar <= 1)
			return false;
		pos += segchar;
	}

	/* Now we should be at the end. */
	if (name[pos] != '\0')
		return false;
	return true;
}
3265 
3266 
3267 /*
3268  * Issue fsync recursively on PGDATA and all its contents.
3269  *
3270  * We fsync regular files and directories wherever they are, but we
3271  * follow symlinks only for pg_wal and immediately under pg_tblspc.
3272  * Other symlinks are presumed to point at files we're not responsible
3273  * for fsyncing, and might not have privileges to write at all.
3274  *
3275  * Errors are logged but not considered fatal; that's because this is used
3276  * only during database startup, to deal with the possibility that there are
3277  * issued-but-unsynced writes pending against the data directory. We want to
3278  * ensure that such writes reach disk before anything that's done in the new
3279  * run. However, aborting on error would result in failure to start for
3280  * harmless cases such as read-only files in the data directory, and that's
3281  * not good either.
3282  *
3283  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3284  * rewriting all changes again during recovery.
3285  *
3286  * Note we assume we're chdir'd into PGDATA to begin with.
3287  */
3288 void
3290 {
3291  bool xlog_is_symlink;
3292 
3293  /* We can skip this whole thing if fsync is disabled. */
3294  if (!enableFsync)
3295  return;
3296 
3297  /*
3298  * If pg_wal is a symlink, we'll need to recurse into it separately,
3299  * because the first walkdir below will ignore it.
3300  */
3301  xlog_is_symlink = false;
3302 
3303 #ifndef WIN32
3304  {
3305  struct stat st;
3306 
3307  if (lstat("pg_wal", &st) < 0)
3308  ereport(LOG,
3310  errmsg("could not stat file \"%s\": %m",
3311  "pg_wal")));
3312  else if (S_ISLNK(st.st_mode))
3313  xlog_is_symlink = true;
3314  }
3315 #else
3316  if (pgwin32_is_junction("pg_wal"))
3317  xlog_is_symlink = true;
3318 #endif
3319 
3320  /*
3321  * If possible, hint to the kernel that we're soon going to fsync the data
3322  * directory and its contents. Errors in this step are even less
3323  * interesting than normal, so log them only at DEBUG1.
3324  */
3325 #ifdef PG_FLUSH_DATA_WORKS
3326  walkdir(".", pre_sync_fname, false, DEBUG1);
3327  if (xlog_is_symlink)
3328  walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3329  walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3330 #endif
3331 
3332  /*
3333  * Now we do the fsync()s in the same order.
3334  *
3335  * The main call ignores symlinks, so in addition to specially processing
3336  * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3337  * process_symlinks = true. Note that if there are any plain directories
3338  * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3339  * so we don't worry about optimizing it.
3340  */
3341  walkdir(".", datadir_fsync_fname, false, LOG);
3342  if (xlog_is_symlink)
3343  walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3344  walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3345 }
3346 
3347 /*
3348  * walkdir: recursively walk a directory, applying the action to each
3349  * regular file and directory (including the named directory itself).
3350  *
3351  * If process_symlinks is true, the action and recursion are also applied
3352  * to regular files and directories that are pointed to by symlinks in the
3353  * given directory; otherwise symlinks are ignored. Symlinks are always
3354  * ignored in subdirectories, ie we intentionally don't pass down the
3355  * process_symlinks flag to recursive calls.
3356  *
3357  * Errors are reported at level elevel, which might be ERROR or less.
3358  *
3359  * See also walkdir in file_utils.c, which is a frontend version of this
3360  * logic.
3361  */
3362 static void
3363 walkdir(const char *path,
3364  void (*action) (const char *fname, bool isdir, int elevel),
3365  bool process_symlinks,
3366  int elevel)
3367 {
3368  DIR *dir;
3369  struct dirent *de;
3370 
3371  dir = AllocateDir(path);
3372 
3373  while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3374  {
3375  char subpath[MAXPGPATH * 2];
3376 
3378 
3379  if (strcmp(de->d_name, ".") == 0 ||
3380  strcmp(de->d_name, "..") == 0)
3381  continue;
3382 
3383  snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3384 
3385  switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3386  {
3387  case PGFILETYPE_REG:
3388  (*action) (subpath, false, elevel);
3389  break;
3390  case PGFILETYPE_DIR:
3391  walkdir(subpath, action, false, elevel);
3392  break;
3393  default:
3394 
3395  /*
3396  * Errors are already reported directly by get_dirent_type(),
3397  * and any remaining symlinks and unknown file types are
3398  * ignored.
3399  */
3400  break;
3401  }
3402  }
3403 
3404  FreeDir(dir); /* we ignore any error here */
3405 
3406  /*
3407  * It's important to fsync the destination directory itself as individual
3408  * file fsyncs don't guarantee that the directory entry for the file is
3409  * synced. However, skip this if AllocateDir failed; the action function
3410  * might not be robust against that.
3411  */
3412  if (dir)
3413  (*action) (path, true, elevel);
3414 }
3415 
3416 
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Does nothing for directories.  Ignores errors trying to open unreadable
 * files (EACCES), and logs other open or close errors at the
 * caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* Unreadable files are silently skipped. */
		if (errno == EACCES)
			return;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3459 
/*
 * walkdir action that fsyncs one file or directory via fsync_fname_ext().
 * The result of fsync_fname_ext() is discarded.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * We want to silently ignore errors about unreadable files.  Pass that
	 * desire on to fsync_fname_ext().
	 */
	fsync_fname_ext(fname, isdir, true, elevel);
}
3469 
/*
 * Remove one file or directory, in the (fname, isdir, elevel) callback shape
 * that walkdir expects.
 *
 * Directories are removed with rmdir(); a directory that is already gone
 * (ENOENT) is not an error, and other failures are reported at elevel.
 * Non-directories are handed to PathNameDeleteTemporaryFile() with
 * error_on_failure = false.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}
3486 
3487 /*
3488  * fsync_fname_ext -- Try to fsync a file or directory
3489  *
3490  * If ignore_perm is true, ignore errors upon trying to open unreadable
3491  * files. Logs other errors at a caller-specified level.
3492  *
3493  * Returns 0 if the operation succeeded, -1 otherwise.
3494  */
3495 int
3496 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3497 {
3498  int fd;
3499  int flags;
3500  int returncode;
3501 
3502  /*
3503  * Some OSs require directories to be opened read-only whereas other
3504  * systems don't allow us to fsync files opened read-only; so we need both
3505  * cases here. Using O_RDWR will cause us to fail to fsync files that are
3506  * not writable by our userid, but we assume that's OK.
3507  */
3508  flags = PG_BINARY;
3509  if (!isdir)
3510  flags |= O_RDWR;
3511  else
3512  flags |= O_RDONLY;
3513 
3514  fd = OpenTransientFile(fname, flags);
3515 
3516  /*
3517  * Some OSs don't allow us to open directories at all (Windows returns
3518  * EACCES), just ignore the error in that case. If desired also silently
3519  * ignoring errors about unreadable files. Log others.
3520  */
3521  if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3522  return 0;
3523  else if (fd < 0 && ignore_perm && errno == EACCES)
3524  return 0;
3525  else if (fd < 0)
3526  {
3527  ereport(elevel,
3529  errmsg("could not open file \"%s\": %m", fname)));
3530  return -1;
3531  }
3532 
3533  returncode = pg_fsync(fd);
3534 
3535  /*
3536  * Some OSes don't allow us to fsync directories at all, so we can ignore
3537  * those errors. Anything else needs to be logged.
3538  */
3539  if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3540  {
3541  int save_errno;
3542 
3543  /* close file upon error, might not be in transaction context */
3544  save_errno = errno;
3545  (void) CloseTransientFile(fd);
3546  errno = save_errno;
3547 
3548  ereport(elevel,
3550  errmsg("could not fsync file \"%s\": %m", fname)));
3551  return -1;
3552  }
3553 
3554  if (CloseTransientFile(fd) != 0)
3555  {
3556  ereport(elevel,
3558  errmsg("could not close file \"%s\": %m", fname)));
3559  return -1;
3560  }
3561 
3562  return 0;
3563 }
3564 
3565 /*
3566  * fsync_parent_path -- fsync the parent path of a file or directory
3567  *
3568  * This is aimed at making file operations persistent on disk in case of
3569  * an OS crash or power failure.
3570  */
3571 static int
3572 fsync_parent_path(const char *fname, int elevel)
3573 {
3574  char parentpath[MAXPGPATH];
3575 
3576  strlcpy(parentpath, fname, MAXPGPATH);
3577  get_parent_directory(parentpath);
3578 
3579  /*
3580  * get_parent_directory() returns an empty string if the input argument is
3581  * just a file name (see comments in path.c), so handle that as being the
3582  * current directory.
3583  */
3584  if (strlen(parentpath) == 0)
3585  strlcpy(parentpath, ".", MAXPGPATH);
3586 
3587  if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3588  return -1;
3589 
3590  return 0;
3591 }
3592 
3593 /*
3594  * Create a PostgreSQL data sub-directory
3595  *
3596  * The data directory itself, and most of its sub-directories, are created at
3597  * initdb time, but we do have some occasions when we create directories in
3598  * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3599  * make sure that those directories are created consistently. Today, that means
3600  * making sure that the created directory has the correct permissions, which is
3601  * what pg_dir_create_mode tracks for us.
3602  *
3603  * Note that we also set the umask() based on what we understand the correct
3604  * permissions to be (see file_perm.c).
3605  *
3606  * For permissions other than the default, mkdir() can be used directly, but
3607  * be sure to consider carefully such cases -- a sub-directory with incorrect
3608  * permissions in a PostgreSQL data directory could cause backups and other
3609  * processes to fail.
3610  */
3611 int
3612 MakePGDirectory(const char *directoryName)
3613 {
3614  return mkdir(directoryName, pg_dir_create_mode);
3615 }
3616 
3617 /*
3618  * Return the passed-in error level, or PANIC if data_sync_retry is off.
3619  *
3620  * Failure to fsync any data file is cause for immediate panic, unless
3621  * data_sync_retry is enabled. Data may have been written to the operating
3622  * system and removed from our buffer pool already, and if we are running on
3623  * an operating system that forgets dirty data on write-back failure, there
3624  * may be only one copy of the data remaining: in the WAL. A later attempt to
3625  * fsync again might falsely report success. Therefore we must not allow any
3626  * further checkpoints to be attempted. data_sync_retry can in theory be
3627  * enabled on systems known not to drop dirty buffered data on write-back
3628  * failure (with the likely outcome that checkpoints will continue to fail
3629  * until the underlying problem is fixed).
3630  *
3631  * Any code that reports a failure from fsync() or related functions should
3632  * filter the error level with this function.
3633  */
3634 int
3635 data_sync_elevel(int elevel)
3636 {
3637  return data_sync_retry ? elevel : PANIC;
3638 }
3639 
3640 /*
3641  * A convenience wrapper for pg_pwritev() that retries on partial write. If an
3642  * error is returned, it is unspecified how much has been written.
3643  */
3644 ssize_t
3645 pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
3646 {
3647  struct iovec iov_copy[PG_IOV_MAX];
3648  ssize_t sum = 0;
3649  ssize_t part;
3650 
3651  /* We'd better have space to make a copy, in case we need to retry. */
3652  if (iovcnt > PG_IOV_MAX)
3653  {
3654  errno = EINVAL;
3655  return -1;
3656  }
3657 
3658  for (;;)
3659  {
3660  /* Write as much as we can. */
3661  part = pg_pwritev(fd, iov, iovcnt, offset);
3662  if (part < 0)
3663  return -1;
3664 
3665 #ifdef SIMULATE_SHORT_WRITE
3666  part = Min(part, 4096);
3667 #endif
3668 
3669  /* Count our progress. */
3670  sum += part;
3671  offset += part;
3672 
3673  /* Step over iovecs that are done. */
3674  while (iovcnt > 0 && iov->iov_len <= part)
3675  {
3676  part -= iov->iov_len;
3677  ++iov;
3678  --iovcnt;
3679  }
3680 
3681  /* Are they all done? */
3682  if (iovcnt == 0)
3683  {
3684  /* We don't expect the kernel to write more than requested. */
3685  Assert(part == 0);
3686  break;
3687  }
3688 
3689  /*
3690  * Move whatever's left to the front of our mutable copy and adjust
3691  * the leading iovec.
3692  */
3693  Assert(iovcnt > 0);
3694  memmove(iov_copy, iov, sizeof(*iov) * iovcnt);
3695  Assert(iov->iov_len > part);
3696  iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part;
3697  iov_copy[0].iov_len -= part;
3698  iov = iov_copy;
3699  }
3700 
3701  return sum;
3702 }
size_t iov_len
Definition: pg_iovec.h:27
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1478
File lruLessRecently
Definition: fd.c:194
void closeAllVfds(void)
Definition: fd.c:2792
static PgChecksumMode mode
Definition: pg_checksums.c:61
File nextFree
Definition: fd.c:192
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition: fd.c:887
int pg_file_create_mode
Definition: file_perm.c:19
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1808
#define MAP_FAILED
Definition: mem.h:45
#define DEBUG1
Definition: elog.h:25
int MyProcPid
Definition: globals.c:41
void * iov_base
Definition: pg_iovec.h:26
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1741
#define NUM_RESERVED_FDS
Definition: fd.c:126
static AllocateDesc * allocatedDescs
Definition: fd.c:255
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1465
int pg_fdatasync(int fd)
Definition: fd.c:437
static void error(void)
Definition: sql-dyntest.c:147
#define SYNC_METHOD_FSYNC_WRITETHROUGH
Definition: xlog.h:28
AllocateDescKind
Definition: fd.c:233
DIR * dir
Definition: fd.c:248
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition: fd.c:1684
static void AtProcExit_Files(int code, Datum arg)
Definition: fd.c:2950
static Size SizeVfdCache
Definition: fd.c:208
#define FD_TEMP_FILE_LIMIT
Definition: fd.c:185
void on_proc_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:305
#define DO_DB(A)
Definition: fd.c:171
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2864
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition: fd.c:3363
int pg_truncate(const char *path, off_t length)
Definition: fd.c:630
long random(void)
Definition: random.c:22
ResourceOwner CurrentResourceOwner
Definition: resowner.c:144
static int numExternalFDs
Definition: fd.c:260
int pg_fsync_writethrough(int fd)
Definition: fd.c:414
int forkname_chars(const char *str, ForkNumber *fork)
Definition: relpath.c:81
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2696
int max_safe_fds
Definition: fd.c:156
#define Min(x, y)
Definition: c.h:974
off_t FileSize(File file)
Definition: fd.c:2180
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:661
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:2413
#define FD_DELETE_AT_CLOSE
Definition: fd.c:183
int log_temp_files
Definition: guc.c:546
mode_t FileGetRawMode(File file)
Definition: fd.c:2268
void _dosmaperr(unsigned long)
Definition: win32error.c:171
static Vfd * VfdCache
Definition: fd.c:207
static void Delete(File file)
Definition: fd.c:1158
int closedir(DIR *)
Definition: dirent.c:123
static int numTempTableSpaces
Definition: fd.c:275
#define PG_TEMP_FILES_DIR
Definition: pg_checksums.c:58
int errcode(int sqlerrcode)
Definition: elog.c:704
#define MemSet(start, val, len)
Definition: c.h:996
void PathNameDeleteTemporaryDir(const char *dirname)
Definition: fd.c:1573
int pg_fsync_no_writethrough(int fd)
Definition: fd.c:402
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition: fd.c:3191
void pgstat_report_tempfile(size_t filesize)
Definition: pgstat.c:1756
static bool reserveAllocatedDesc(void)
Definition: fd.c:2279
uint32 SubTransactionId
Definition: c.h:579
#define SIGPIPE
Definition: win32_port.h:164
void TempTablespacePath(char *path, Oid tablespace)
Definition: fd.c:1659
#define LOG
Definition: elog.h:26
unsigned int Oid
Definition: postgres_ext.h:31
AllocateDescKind kind
Definition: fd.c:243
char * FilePathName(File file)
Definition: fd.c:2232
Definition: dirent.h:9
#define OidIsValid(objectId)
Definition: c.h:698
#define PANIC
Definition: elog.h:55
#define PG_BINARY
Definition: c.h:1259
static char * basedir
ssize_t pg_pwrite(int fd, const void *buf, size_t nbyte, off_t offset)
Definition: pwrite.c:27
void AtEOXact_Files(bool isCommit)
Definition: fd.c:2936
Oid MyDatabaseTableSpace
Definition: globals.c:88
int ClosePipeStream(FILE *file)
Definition: fd.c:2763
ssize_t pg_pread(int fd, void *buf, size_t nbyte, off_t offset)
Definition: pread.c:27
#define malloc(a)
Definition: header.h:50
static void LruDelete(File file)
Definition: fd.c:1177
void pg_usleep(long microsec)
Definition: signal.c:53
bool TempTablespacesAreSet(void)
Definition: fd.c:2849
#define fstat
Definition: win32_port.h:274
ssize_t pg_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
Definition: pwritev.c:29
#define fsync(fd)
Definition: win32_port.h:68
static int FreeDesc(AllocateDesc *desc)
Definition: fd.c:2514
void pfree(void *pointer)
Definition: mcxt.c:1057
mode_t fileMode
Definition: fd.c:199
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition: fd.c:3163
static bool ReleaseLruFile(void)
Definition: fd.c:1272
Definition: dirent.c:25
int durable_rename_excl(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:814
#define ERROR
Definition: elog.h:45
#define PG_TEMP_FILE_PREFIX
Definition: pg_checksums.c:59
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2404
static int LruInsert(File file)
Definition: fd.c:1225
#define FATAL
Definition: elog.h:54
static bool have_xact_temporary_files
Definition: fd.c:219
#define MAXPGPATH
void ReserveExternalFD(void)
Definition: fd.c:1111
DIR * opendir(const char *)
Definition: dirent.c:33
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2159
#define DEBUG2
Definition: elog.h:24
ssize_t pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
Definition: fd.c:3645
#define TABLESPACE_VERSION_DIRECTORY
Definition: relpath.h:26
char * fileName
Definition: fd.c:196
static char * buf
Definition: pg_test_fsync.c:68
Oid GetNextTempTableSpace(void)
Definition: fd.c:2882
void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition: resowner.c:1285
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3471
File PathNameOpenTemporaryFile(const char *path, int mode)
Definition: fd.c:1779
int errdetail(const char *fmt,...)
Definition: elog.c:1048
void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition: fd.c:3097
char * tablespace
Definition: pgbench.c:189
int errcode_for_file_access(void)
Definition: elog.c:727
void get_parent_directory(char *path)
Definition: path.c:854
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2354
static int nfile
Definition: fd.c:213
unsigned int uint32
Definition: c.h:429
void SyncDataDirectory(void)
Definition: fd.c:3289
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2615
static int nextTempTableSpace
Definition: fd.c:276
static void pgstat_report_wait_end(void)
Definition: pgstat.h:1512
__int64 st_size
Definition: win32_port.h:265
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:410
int max_files_per_process
Definition: fd.c:143
static File AllocateVfd(void)
Definition: fd.c:1304
FILE * OpenPipeStream(const char *command, const char *mode)
Definition: fd.c:2457
unsigned short fdstate
Definition: fd.c:190
Definition: fd.c:187
off_t fileSize
Definition: fd.c:195
int fd
Definition: fd.c:189
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2821
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:687
static void Insert(File file)
Definition: fd.c:1203
ResourceOwner resowner
Definition: fd.c:191
bool data_sync_retry
Definition: fd.c:159
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3461
int CloseTransientFile(int fd)
Definition: fd.c:2581
#define SIG_IGN
Definition: win32_port.h:156
static void ReportTemporaryFileUsage(const char *path, off_t size)
Definition: fd.c:1418
static void ReleaseLruFiles(void)
Definition: fd.c:1294
#define WARNING
Definition: elog.h:40
#define FileIsNotOpen(file)
Definition: fd.c:180
int pg_dir_create_mode
Definition: file_perm.c:18
static int elevel
Definition: vacuumlazy.c:333
int FileWrite(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:2061
struct vfd Vfd
int data_sync_elevel(int elevel)
Definition: fd.c:3635
uintptr_t Datum
Definition: postgres.h:367
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition: fd.c:2903
unsigned short st_mode
Definition: win32_port.h:260
Definition: pg_iovec.h:24
unsigned int Index
Definition: c.h:537
void pg_flush_data(int fd, off_t offset, off_t nbytes)
Definition: fd.c:457
#define FileIsValid(file)
Definition: fd.c:177
bool AcquireExternalFD(void)
Definition: fd.c:1076
FILE * file
Definition: fd.c:247
#define InvalidOid
Definition: postgres_ext.h:36
#define VFD_CLOSED
Definition: fd.c:175
static uint64 temporary_files_size
Definition: fd.c:227
#define ereport(elevel,...)
Definition: elog.h:155
int MakePGDirectory(const char *directoryName)
Definition: fd.c:3612
pqsigfunc pqsignal(int signum, pqsigfunc handler)
Definition: signal.c:170
#define free(a)
Definition: header.h:65
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static void RegisterTemporaryFile(File file)
Definition: fd.c:1437
void FileClose(File file)
Definition: fd.c:1854
#define SIG_DFL
Definition: win32_port.h:154
int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
Definition: fd.c:1954
static int FileAccess(File file)
Definition: fd.c:1382
#define Assert(condition)
Definition: c.h:792
SubTransactionId GetCurrentSubTransactionId(void)
Definition: xact.c:723
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2681
File lruMoreRecently
Definition: fd.c:193
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:1982
void RemovePgTempFiles(void)
Definition: fd.c:3038
SubTransactionId create_subid
Definition: fd.c:244
File OpenTemporaryFile(bool interXact)
Definition: fd.c:1606
size_t Size
Definition: c.h:528
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: pgstat.h:1488
static const char * directory
Definition: zic.c:632
int sync_method
Definition: xlog.c:107
struct dirent * readdir(DIR *)
Definition: dirent.c:78
#define FD_MINFREE
Definition: fd.c:135
bool looks_like_temp_rel_name(const char *name)
Definition: fd.c:3219
#define realloc(a, b)
Definition: header.h:60
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1070
#define INT64_FORMAT
Definition: c.h:471
const char * name
Definition: encode.c:515
static long tempFileCounter
Definition: fd.c:266
int fd
Definition: fd.c:249
#define S_ISDIR(m)
Definition: win32_port.h:316
#define lstat(path, sb)
Definition: win32_port.h:276
int durable_unlink(const char *fname, int elevel)
Definition: fd.c:777
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1014
int FreeFile(FILE *file)
Definition: fd.c:2553
void set_max_safe_fds(void)
Definition: fd.c:971
bool enableFsync
Definition: globals.c:120
static Oid * tempTableSpaces
Definition: fd.c:274
void ReleaseExternalFD(void)
Definition: fd.c:1129
void * palloc(Size size)
Definition: mcxt.c:950
int errmsg(const char *fmt,...)
Definition: elog.c:915
int FileGetRawFlags(File file)
Definition: fd.c:2258
void ResourceOwnerEnlargeFiles(ResourceOwner owner)
Definition: resowner.c:1274
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1036
#define elog(elevel,...)
Definition: elog.h:228
int i
#define FD_CLOSE_AT_EOXACT
Definition: fd.c:184
void * arg
int FileGetRawDesc(File file)
Definition: fd.c:2248
static void FreeVfd(File file)
Definition: fd.c:1362
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:100
int pg_fsync(int fd)
Definition: fd.c:347
char d_name[MAX_PATH]
Definition: dirent.h:15
#define mkdir(a, b)
Definition: win32_port.h:63
int link(const char *src, const char *dst)
#define close(a)
Definition: win32.h:12
#define EINTR
Definition: win32_port.h:343
int fileFlags
Definition: fd.c:198
union AllocateDesc::@23 desc
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition: fd.c:1542
int FileRead(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:2005
void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition: resowner.c:1294
#define snprintf
Definition: port.h:215
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2197
int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition: fd.c:3496
static int maxAllocatedDescs
Definition: fd.c:254
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition: fd.c:2968
static int fsync_parent_path(const char *fname, int elevel)
Definition: fd.c:3572
int File
Definition: fd.h:48
int FreeDir(DIR *dir)
Definition: fd.c:2733
int temp_file_limit
Definition: guc.c:553
Datum subpath(PG_FUNCTION_ARGS)
Definition: ltree_op.c:241
void InitFileAccess(void)
Definition: fd.c:854
#define stat
Definition: win32_port.h:275
static int numAllocatedDescs
Definition: fd.c:253
bool pgwin32_is_junction(const char *path)
#define ftruncate(a, b)
Definition: win32_port.h:65