PostgreSQL Source Code  git master
fd.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  * Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have. (This is around 1024 on many modern
20  * operating systems, but may be lower on others.)
21  *
22  * VFDs are managed as an LRU pool, with actual OS file descriptors
23  * being opened and closed as needed. Obviously, if a routine is
24  * opened using these interfaces, all subsequent operations must also
25  * be through these interfaces (the File type is not a real file
26  * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
39  * for a long time, like relation files. It is the caller's responsibility
40  * to close them; there is no automatic mechanism in fd.c for that.
41  *
42  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43  * temporary files that have names so that they can be shared between
44  * backends. Such files are automatically closed and count against the
45  * temporary file limit of the backend that creates them, but unlike anonymous
46  * files they are not automatically deleted. See sharedfileset.c for a shared
47  * ownership mechanism that provides automatic cleanup for shared files when
48  * the last of a group of backends detaches.
49  *
50  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52  * They behave like the corresponding native functions, except that the handle
53  * is registered with the current subtransaction, and will be automatically
54  * closed at abort. These are intended mainly for short operations like
55  * reading a configuration file; there is a limit on the number of files that
56  * can be opened using these functions at any one time.
57  *
58  * Finally, BasicOpenFile is just a thin wrapper around open() that can
59  * release file descriptors in use by the virtual file descriptors if
60  * necessary. There is no automatic cleanup of file descriptors returned by
61  * BasicOpenFile; it is solely the caller's responsibility to close the file
62  * descriptor by calling close(2).
63  *
64  * If a non-virtual file descriptor needs to be held open for any length of
65  * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66  * (and eventually ReleaseExternalFD), so that we can take it into account
67  * while deciding how many VFDs can be open. This applies to FDs obtained
68  * with BasicOpenFile as well as those obtained without use of any fd.c API.
69  *
70  *-------------------------------------------------------------------------
71  */
72 
73 #include "postgres.h"
74 
75 #include <dirent.h>
76 #include <sys/file.h>
77 #include <sys/param.h>
78 #include <sys/stat.h>
79 #include <sys/types.h>
80 #ifndef WIN32
81 #include <sys/mman.h>
82 #endif
83 #include <limits.h>
84 #include <unistd.h>
85 #include <fcntl.h>
86 #ifdef HAVE_SYS_RESOURCE_H
87 #include <sys/resource.h> /* for getrlimit */
88 #endif
89 
90 #include "access/xact.h"
91 #include "access/xlog.h"
92 #include "catalog/pg_tablespace.h"
93 #include "common/file_perm.h"
94 #include "common/file_utils.h"
95 #include "common/pg_prng.h"
96 #include "miscadmin.h"
97 #include "pgstat.h"
98 #include "port/pg_iovec.h"
99 #include "portability/mem.h"
100 #include "postmaster/startup.h"
101 #include "storage/fd.h"
102 #include "storage/ipc.h"
103 #include "utils/guc.h"
104 #include "utils/resowner_private.h"
105 
106 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
107 #if defined(HAVE_SYNC_FILE_RANGE)
108 #define PG_FLUSH_DATA_WORKS 1
109 #elif !defined(WIN32) && defined(MS_ASYNC)
110 #define PG_FLUSH_DATA_WORKS 1
111 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
112 #define PG_FLUSH_DATA_WORKS 1
113 #endif
114 
115 /*
116  * We must leave some file descriptors free for system(), the dynamic loader,
117  * and other code that tries to open files without consulting fd.c. This
118  * is the number left free. (While we try fairly hard to prevent EMFILE
119  * errors, there's never any guarantee that we won't get ENFILE due to
120  * other processes chewing up FDs. So it's a bad idea to try to open files
121  * without consulting fd.c. Nonetheless we cannot control all code.)
122  *
123  * Because this is just a fixed setting, we are effectively assuming that
124  * no such code will leave FDs open over the long term; otherwise the slop
125  * is likely to be insufficient. Note in particular that we expect that
126  * loading a shared library does not result in any permanent increase in
127  * the number of open files. (This appears to be true on most if not
128  * all platforms as of Feb 2004.)
129  */
130 #define NUM_RESERVED_FDS 10
131 
132 /*
133  * If we have fewer than this many usable FDs after allowing for the reserved
134  * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
135  * much less than that. Note that this value ensures numExternalFDs can be
136  * at least 16; as of this writing, the contrib/postgres_fdw regression tests
137  * will not pass unless that can grow to at least 14.)
138  */
139 #define FD_MINFREE 48
140 
/*
 * A number of platforms allow individual processes to open many more files
 * than they can really support when *many* processes do the same thing.
 * This GUC parameter lets the DBA limit max_safe_fds to something less than
 * what the postmaster's initial probe suggests will work.
 *
 * set_max_safe_fds() reads this value when it computes max_safe_fds.
 */
int			max_files_per_process = 1000;
148 
149 /*
150  * Maximum number of file descriptors to open for operations that fd.c knows
151  * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
152  * to a conservative value, and remains that way indefinitely in bootstrap or
153  * standalone-backend cases. In normal postmaster operation, the postmaster
154  * calls set_max_safe_fds() late in initialization to update the value, and
155  * that value is then inherited by forked subprocesses.
156  *
157  * Note: the value of max_files_per_process is taken into account while
158  * setting this variable, and so need not be tested separately.
159  */
160 int max_safe_fds = FD_MINFREE; /* default if not changed */
161 
162 /* Whether it is safe to continue running after fsync() fails. */
163 bool data_sync_retry = false;
164 
165 /* How SyncDataDirectory() should do its job. */
167 
168 /* Debugging.... */
169 
170 #ifdef FDDEBUG
171 #define DO_DB(A) \
172  do { \
173  int _do_db_save_errno = errno; \
174  A; \
175  errno = _do_db_save_errno; \
176  } while (0)
177 #else
178 #define DO_DB(A) \
179  ((void) 0)
180 #endif
181 
182 #define VFD_CLOSED (-1)
183 
184 #define FileIsValid(file) \
185  ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
186 
187 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
188 
189 /* these are the assigned bits in fdstate below: */
190 #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
191 #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
192 #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
193 
194 typedef struct vfd
195 {
196  int fd; /* current FD, or VFD_CLOSED if none */
197  unsigned short fdstate; /* bitflags for VFD's state */
198  ResourceOwner resowner; /* owner, for automatic cleanup */
199  File nextFree; /* link to next free VFD, if in freelist */
200  File lruMoreRecently; /* doubly linked recency-of-use list */
202  off_t fileSize; /* current size of file (0 if not temporary) */
203  char *fileName; /* name of file, or NULL for unused VFD */
204  /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
205  int fileFlags; /* open(2) flags for (re)opening the file */
206  mode_t fileMode; /* mode to pass to open(2) */
207 } Vfd;
208 
209 /*
210  * Virtual File Descriptor array pointer and size. This grows as
211  * needed. 'File' values are indexes into this array.
212  * Note that VfdCache[0] is not a usable VFD, just a list header.
213  */
214 static Vfd *VfdCache;
215 static Size SizeVfdCache = 0;
216 
217 /*
218  * Number of file descriptors known to be in use by VFD entries.
219  */
220 static int nfile = 0;
221 
222 /*
223  * Flag to tell whether it's worth scanning VfdCache looking for temp files
224  * to close
225  */
226 static bool have_xact_temporary_files = false;
227 
228 /*
229  * Tracks the total size of all temporary files. Note: when temp_file_limit
230  * is being enforced, this cannot overflow since the limit cannot be more
231  * than INT_MAX kilobytes. When not enforcing, it could theoretically
232  * overflow, but we don't care.
233  */
234 static uint64 temporary_files_size = 0;
235 
236 /* Temporary file access initialized and not yet shut down? */
237 #ifdef USE_ASSERT_CHECKING
238 static bool temporary_files_allowed = false;
239 #endif
240 
241 /*
242  * List of OS handles opened with AllocateFile, AllocateDir and
243  * OpenTransientFile.
244  */
245 typedef enum
246 {
252 
253 typedef struct
254 {
257  union
258  {
259  FILE *file;
261  int fd;
262  } desc;
263 } AllocateDesc;
264 
265 static int numAllocatedDescs = 0;
266 static int maxAllocatedDescs = 0;
268 
269 /*
270  * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
271  */
272 static int numExternalFDs = 0;
273 
274 /*
275  * Number of temporary files opened during the current session;
276  * this is used in generation of tempfile names.
277  */
278 static long tempFileCounter = 0;
279 
280 /*
281  * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
282  * indicating that the current database's default tablespace should be used.)
283  * When numTempTableSpaces is -1, this has not been set in the current
284  * transaction.
285  */
286 static Oid *tempTableSpaces = NULL;
287 static int numTempTableSpaces = -1;
288 static int nextTempTableSpace = 0;
289 
290 
291 /*--------------------
292  *
293  * Private Routines
294  *
295  * Delete - delete a file from the Lru ring
296  * LruDelete - remove a file from the Lru ring and close its FD
297  * Insert - put a file at the front of the Lru ring
298  * LruInsert - put a file at the front of the Lru ring and open it
299  * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
300  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
301  * AllocateVfd - grab a free (or new) file record (from VfdCache)
302  * FreeVfd - free a file record
303  *
304  * The Least Recently Used ring is a doubly linked list that begins and
305  * ends on element zero. Element zero is special -- it doesn't represent
306  * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
307  * anchor that shows us the beginning/end of the ring.
308  * Only VFD elements that are currently really open (have an FD assigned) are
309  * in the Lru ring. Elements that are "virtually" open can be recognized
310  * by having a non-null fileName field.
311  *
312  * example:
313  *
314  * /--less----\ /---------\
315  * v \ v \
316  * #0 --more---> LeastRecentlyUsed --more-\ \
317  * ^\ | |
318  * \\less--> MostRecentlyUsedFile <---/ |
319  * \more---/ \--less--/
320  *
321  *--------------------
322  */
323 static void Delete(File file);
324 static void LruDelete(File file);
325 static void Insert(File file);
326 static int LruInsert(File file);
327 static bool ReleaseLruFile(void);
328 static void ReleaseLruFiles(void);
329 static File AllocateVfd(void);
330 static void FreeVfd(File file);
331 
332 static int FileAccess(File file);
333 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
334 static bool reserveAllocatedDesc(void);
335 static int FreeDesc(AllocateDesc *desc);
336 
337 static void BeforeShmemExit_Files(int code, Datum arg);
338 static void CleanupTempFiles(bool isCommit, bool isProcExit);
339 static void RemovePgTempRelationFiles(const char *tsdirname);
340 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
341 
342 static void walkdir(const char *path,
343  void (*action) (const char *fname, bool isdir, int elevel),
344  bool process_symlinks,
345  int elevel);
346 #ifdef PG_FLUSH_DATA_WORKS
347 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
348 #endif
349 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
350 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
351 
352 static int fsync_parent_path(const char *fname, int elevel);
353 
354 
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * Dispatches to pg_fsync_writethrough() when the configured sync method asks
 * for write-through (and the platform distinguishes it from plain fsync),
 * otherwise to pg_fsync_no_writethrough().
 *
 * Returns whatever the selected helper returns: 0 on success, -1 with errno
 * set on failure.
 */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
	struct stat st;

	/*
	 * Some operating system implementations of fsync() have requirements
	 * about the file access modes that were used when their file descriptor
	 * argument was opened, and these requirements differ depending on whether
	 * the file descriptor is for a directory.
	 *
	 * For any file descriptor that may eventually be handed to fsync(), we
	 * should have opened it with access modes that are compatible with
	 * fsync() on all supported systems, otherwise the code may not be
	 * portable, even if it runs ok on the current system.
	 *
	 * We assert here that a descriptor for a file was opened with write
	 * permissions (either O_RDWR or O_WRONLY) and for a directory without
	 * write permissions (O_RDONLY).
	 *
	 * Ignore any fstat errors and let the follow-up fsync() do its work.
	 * Doing this sanity check here counts for the case where fsync() is
	 * disabled.
	 */
	if (fstat(fd, &st) == 0)
	{
		int			desc_flags = fcntl(fd, F_GETFL);

		/*
		 * O_RDONLY is historically 0, so just make sure that for directories
		 * no write flags are used.
		 */
		if (S_ISDIR(st.st_mode))
			Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
		else
			Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
	}
	/* don't let a failed fstat()/fcntl() leak its errno to the caller */
	errno = 0;
#endif

	/* #if is to skip the sync_method test if there's no need for it */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
	else
#endif
		return pg_fsync_no_writethrough(fd);
}
407 
408 
409 /*
410  * pg_fsync_no_writethrough --- same as fsync except does nothing if
411  * enableFsync is off
412  */
413 int
415 {
416  if (enableFsync)
417  return fsync(fd);
418  else
419  return 0;
420 }
421 
422 /*
423  * pg_fsync_writethrough
424  */
425 int
427 {
428  if (enableFsync)
429  {
430 #ifdef WIN32
431  return _commit(fd);
432 #elif defined(F_FULLFSYNC)
433  return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
434 #else
435  errno = ENOSYS;
436  return -1;
437 #endif
438  }
439  else
440  return 0;
441 }
442 
443 /*
444  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
445  *
446  * Not all platforms have fdatasync; treat as fsync if not available.
447  */
448 int
450 {
451  if (enableFsync)
452  {
453 #ifdef HAVE_FDATASYNC
454  return fdatasync(fd);
455 #else
456  return fsync(fd);
457 #endif
458  }
459  else
460  return 0;
461 }
462 
463 /*
464  * pg_flush_data --- advise OS that the described dirty data should be flushed
465  *
466  * offset of 0 with nbytes 0 means that the entire file should be flushed
467  */
468 void
469 pg_flush_data(int fd, off_t offset, off_t nbytes)
470 {
471  /*
472  * Right now file flushing is primarily used to avoid making later
473  * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
474  * if fsyncs are disabled - that's a decision we might want to make
475  * configurable at some point.
476  */
477  if (!enableFsync)
478  return;
479 
480  /*
481  * We compile all alternatives that are supported on the current platform,
482  * to find portability problems more easily.
483  */
484 #if defined(HAVE_SYNC_FILE_RANGE)
485  {
486  int rc;
487  static bool not_implemented_by_kernel = false;
488 
489  if (not_implemented_by_kernel)
490  return;
491 
492  /*
493  * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
494  * tells the OS that writeback for the specified blocks should be
495  * started, but that we don't want to wait for completion. Note that
496  * this call might block if too much dirty data exists in the range.
497  * This is the preferable method on OSs supporting it, as it works
498  * reliably when available (contrast to msync()) and doesn't flush out
499  * clean data (like FADV_DONTNEED).
500  */
501  rc = sync_file_range(fd, offset, nbytes,
502  SYNC_FILE_RANGE_WRITE);
503  if (rc != 0)
504  {
505  int elevel;
506 
507  /*
508  * For systems that don't have an implementation of
509  * sync_file_range() such as Windows WSL, generate only one
510  * warning and then suppress all further attempts by this process.
511  */
512  if (errno == ENOSYS)
513  {
514  elevel = WARNING;
515  not_implemented_by_kernel = true;
516  }
517  else
518  elevel = data_sync_elevel(WARNING);
519 
520  ereport(elevel,
522  errmsg("could not flush dirty data: %m")));
523  }
524 
525  return;
526  }
527 #endif
528 #if !defined(WIN32) && defined(MS_ASYNC)
529  {
530  void *p;
531  static int pagesize = 0;
532 
533  /*
534  * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
535  * writeback. On linux it only does so if MS_SYNC is specified, but
536  * then it does the writeback synchronously. Luckily all common linux
537  * systems have sync_file_range(). This is preferable over
538  * FADV_DONTNEED because it doesn't flush out clean data.
539  *
540  * We map the file (mmap()), tell the kernel to sync back the contents
541  * (msync()), and then remove the mapping again (munmap()).
542  */
543 
544  /* mmap() needs actual length if we want to map whole file */
545  if (offset == 0 && nbytes == 0)
546  {
547  nbytes = lseek(fd, 0, SEEK_END);
548  if (nbytes < 0)
549  {
552  errmsg("could not determine dirty data size: %m")));
553  return;
554  }
555  }
556 
557  /*
558  * Some platforms reject partial-page mmap() attempts. To deal with
559  * that, just truncate the request to a page boundary. If any extra
560  * bytes don't get flushed, well, it's only a hint anyway.
561  */
562 
563  /* fetch pagesize only once */
564  if (pagesize == 0)
565  pagesize = sysconf(_SC_PAGESIZE);
566 
567  /* align length to pagesize, dropping any fractional page */
568  if (pagesize > 0)
569  nbytes = (nbytes / pagesize) * pagesize;
570 
571  /* fractional-page request is a no-op */
572  if (nbytes <= 0)
573  return;
574 
575  /*
576  * mmap could well fail, particularly on 32-bit platforms where there
577  * may simply not be enough address space. If so, silently fall
578  * through to the next implementation.
579  */
580  if (nbytes <= (off_t) SSIZE_MAX)
581  p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
582  else
583  p = MAP_FAILED;
584 
585  if (p != MAP_FAILED)
586  {
587  int rc;
588 
589  rc = msync(p, (size_t) nbytes, MS_ASYNC);
590  if (rc != 0)
591  {
594  errmsg("could not flush dirty data: %m")));
595  /* NB: need to fall through to munmap()! */
596  }
597 
598  rc = munmap(p, (size_t) nbytes);
599  if (rc != 0)
600  {
601  /* FATAL error because mapping would remain */
602  ereport(FATAL,
604  errmsg("could not munmap() while flushing data: %m")));
605  }
606 
607  return;
608  }
609  }
610 #endif
611 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
612  {
613  int rc;
614 
615  /*
616  * Signal the kernel that the passed in range should not be cached
617  * anymore. This has the, desired, side effect of writing out dirty
618  * data, and the, undesired, side effect of likely discarding useful
619  * clean cached blocks. For the latter reason this is the least
620  * preferable method.
621  */
622 
623  rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
624 
625  if (rc != 0)
626  {
627  /* don't error out, this is just a performance optimization */
630  errmsg("could not flush dirty data: %m")));
631  }
632 
633  return;
634  }
635 #endif
636 }
637 
/*
 * Truncate a file to a given length by name.
 *
 * On Windows there is no truncate(), so the file is opened, truncated through
 * its descriptor, and closed again; errno from the truncation is preserved
 * across the close.
 *
 * Returns 0 on success, -1 with errno set on failure.
 */
int
pg_truncate(const char *path, off_t length)
{
#ifdef WIN32
	int			save_errno;
	int			ret;
	int			fd;

	fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
	if (fd >= 0)
	{
		/* honor the requested length, same as the truncate() branch below */
		ret = ftruncate(fd, length);
		save_errno = errno;
		CloseTransientFile(fd);
		errno = save_errno;
	}
	else
		ret = -1;

	return ret;
#else
	return truncate(path, length);
#endif
}
665 
666 /*
667  * fsync_fname -- fsync a file or directory, handling errors properly
668  *
669  * Try to fsync a file or directory. When doing the latter, ignore errors that
670  * indicate the OS just doesn't allow/require fsyncing directories.
671  */
672 void
673 fsync_fname(const char *fname, bool isdir)
674 {
675  fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
676 }
677 
678 /*
679  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
680  *
681  * This routine ensures that, after returning, the effect of renaming file
682  * persists in case of a crash. A crash while this routine is running will
683  * leave you with either the pre-existing or the moved file in place of the
684  * new file; no mixed state or truncated files are possible.
685  *
686  * It does so by using fsync on the old filename and the possibly existing
687  * target filename before the rename, and the target file and directory after.
688  *
689  * Note that rename() cannot be used across arbitrary directories, as they
690  * might not be on the same filesystem. Therefore this routine does not
691  * support renaming across directories.
692  *
693  * Log errors with the caller specified severity.
694  *
695  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
696  * valid upon return.
697  */
698 int
699 durable_rename(const char *oldfile, const char *newfile, int elevel)
700 {
701  int fd;
702 
703  /*
704  * First fsync the old and target path (if it exists), to ensure that they
705  * are properly persistent on disk. Syncing the target file is not
706  * strictly necessary, but it makes it easier to reason about crashes;
707  * because it's then guaranteed that either source or target file exists
708  * after a crash.
709  */
710  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
711  return -1;
712 
713  fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
714  if (fd < 0)
715  {
716  if (errno != ENOENT)
717  {
718  ereport(elevel,
720  errmsg("could not open file \"%s\": %m", newfile)));
721  return -1;
722  }
723  }
724  else
725  {
726  if (pg_fsync(fd) != 0)
727  {
728  int save_errno;
729 
730  /* close file upon error, might not be in transaction context */
731  save_errno = errno;
733  errno = save_errno;
734 
735  ereport(elevel,
737  errmsg("could not fsync file \"%s\": %m", newfile)));
738  return -1;
739  }
740 
741  if (CloseTransientFile(fd) != 0)
742  {
743  ereport(elevel,
745  errmsg("could not close file \"%s\": %m", newfile)));
746  return -1;
747  }
748  }
749 
750  /* Time to do the real deal... */
751  if (rename(oldfile, newfile) < 0)
752  {
753  ereport(elevel,
755  errmsg("could not rename file \"%s\" to \"%s\": %m",
756  oldfile, newfile)));
757  return -1;
758  }
759 
760  /*
761  * To guarantee renaming the file is persistent, fsync the file with its
762  * new name, and its containing directory.
763  */
764  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
765  return -1;
766 
767  if (fsync_parent_path(newfile, elevel) != 0)
768  return -1;
769 
770  return 0;
771 }
772 
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * This routine ensures that, after returning, the effect of removing file
 * persists in case of a crash. A crash while this routine is running will
 * leave the system in no mixed state.
 *
 * It does so by using fsync on the parent directory of the file after the
 * actual removal is done.
 *
 * Log errors with the severity specified by caller.
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	if (unlink(fname) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/*
	 * To guarantee that the removal of the file is persistent, fsync its
	 * parent directory.
	 */
	if (fsync_parent_path(fname, elevel) != 0)
		return -1;

	return 0;
}
809 
/*
 * durable_rename_excl -- rename a file in a durable manner.
 *
 * Similar to durable_rename(), except that this routine tries (but does not
 * guarantee) not to overwrite the target file.
 *
 * Note that a crash in an unfortunate moment can leave you with two links to
 * the target file.
 *
 * Log errors with the caller specified severity.
 *
 * On Windows, using a hard link followed by unlink() causes concurrency
 * issues, while a simple rename() does not cause that, so be careful when
 * changing the logic of this routine.
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_rename_excl(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * Ensure that, if we crash directly after the rename/link, a file with
	 * valid contents is moved into place.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

#ifdef HAVE_WORKING_LINK
	/* link() fails if newfile already exists, which gives the "excl" part */
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
	/* result deliberately unchecked: a failure merely leaves the old link */
	unlink(oldfile);
#else
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
#endif

	/*
	 * Make change persistent in case of an OS crash, both the new entry and
	 * its parent directory need to be flushed.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0)
		return -1;

	/* Same for parent directory */
	if (fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}
872 
873 /*
874  * InitFileAccess --- initialize this module during backend startup
875  *
876  * This is called during either normal or standalone backend start.
877  * It is *not* called in the postmaster.
878  *
879  * Note that this does not initialize temporary file access, that is
880  * separately initialized via InitTemporaryFileAccess().
881  */
882 void
884 {
885  Assert(SizeVfdCache == 0); /* call me only once */
886 
887  /* initialize cache header entry */
888  VfdCache = (Vfd *) malloc(sizeof(Vfd));
889  if (VfdCache == NULL)
890  ereport(FATAL,
891  (errcode(ERRCODE_OUT_OF_MEMORY),
892  errmsg("out of memory")));
893 
894  MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
896 
897  SizeVfdCache = 1;
898 }
899 
900 /*
901  * InitTemporaryFileAccess --- initialize temporary file access during startup
902  *
903  * This is called during either normal or standalone backend start.
904  * It is *not* called in the postmaster.
905  *
906  * This is separate from InitFileAccess() because temporary file cleanup can
907  * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
908  * our reporting has to happen before that. Low level file access should be
909  * available for longer, hence the separate initialization / shutdown of
910  * temporary file handling.
911  */
912 void
914 {
915  Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run*/
916  Assert(!temporary_files_allowed); /* call me only once */
917 
918  /*
919  * Register before-shmem-exit hook to ensure temp files are dropped while
920  * we can still report stats.
921  */
923 
924 #ifdef USE_ASSERT_CHECKING
925  temporary_files_allowed = true;
926 #endif
927 }
928 
929 /*
930  * count_usable_fds --- count how many FDs the system will let us open,
931  * and estimate how many are already open.
932  *
933  * We stop counting if usable_fds reaches max_to_probe. Note: a small
934  * value of max_to_probe might result in an underestimate of already_open;
935  * we must fill in any "gaps" in the set of used FDs before the calculation
936  * of already_open will give the right answer. In practice, max_to_probe
937  * of a couple of dozen should be enough to ensure good results.
938  *
939  * We assume stderr (FD 2) is available for dup'ing. While the calling
940  * script could theoretically close that, it would be a really bad idea,
941  * since then one risks loss of error messages from, e.g., libc.
942  */
943 static void
944 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
945 {
946  int *fd;
947  int size;
948  int used = 0;
949  int highestfd = 0;
950  int j;
951 
952 #ifdef HAVE_GETRLIMIT
953  struct rlimit rlim;
954  int getrlimit_status;
955 #endif
956 
957  size = 1024;
958  fd = (int *) palloc(size * sizeof(int));
959 
960 #ifdef HAVE_GETRLIMIT
961 #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
962  getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
963 #else /* but BSD doesn't ... */
964  getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
965 #endif /* RLIMIT_NOFILE */
966  if (getrlimit_status != 0)
967  ereport(WARNING, (errmsg("getrlimit failed: %m")));
968 #endif /* HAVE_GETRLIMIT */
969 
970  /* dup until failure or probe limit reached */
971  for (;;)
972  {
973  int thisfd;
974 
975 #ifdef HAVE_GETRLIMIT
976 
977  /*
978  * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
979  * some platforms
980  */
981  if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
982  break;
983 #endif
984 
985  thisfd = dup(2);
986  if (thisfd < 0)
987  {
988  /* Expect EMFILE or ENFILE, else it's fishy */
989  if (errno != EMFILE && errno != ENFILE)
990  elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
991  break;
992  }
993 
994  if (used >= size)
995  {
996  size *= 2;
997  fd = (int *) repalloc(fd, size * sizeof(int));
998  }
999  fd[used++] = thisfd;
1000 
1001  if (highestfd < thisfd)
1002  highestfd = thisfd;
1003 
1004  if (used >= max_to_probe)
1005  break;
1006  }
1007 
1008  /* release the files we opened */
1009  for (j = 0; j < used; j++)
1010  close(fd[j]);
1011 
1012  pfree(fd);
1013 
1014  /*
1015  * Return results. usable_fds is just the number of successful dups. We
1016  * assume that the system limit is highestfd+1 (remember 0 is a legal FD
1017  * number) and so already_open is highestfd+1 - usable_fds.
1018  */
1019  *usable_fds = used;
1020  *already_open = highestfd + 1 - used;
1021 }
1022 
1023 /*
1024  * set_max_safe_fds
1025  * Determine number of file descriptors that fd.c is allowed to use
1026  */
/*
 * NOTE(review): the embedded numbering of this listing skips 1028, 1041,
 * 1049 and 1059-1060.  The function-name line, the first line of the call
 * that fills usable_fds/already_open, the statement that follows the
 * "Take off the FDs reserved" comment, and the errdetail() arguments are
 * therefore not visible here; confirm them against the real file.
 */
1027 void
1029 {
1030  int usable_fds;
1031  int already_open;
1032 
1033  /*----------
1034  * We want to set max_safe_fds to
1035  * MIN(usable_fds, max_files_per_process - already_open)
1036  * less the slop factor for files that are opened without consulting
1037  * fd.c. This ensures that we won't exceed either max_files_per_process
1038  * or the experimentally-determined EMFILE limit.
1039  *----------
1040  */
1042  &usable_fds, &already_open);
1043 
/* Start from the smaller of the probed count and the configured budget. */
1044  max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
1045 
1046  /*
1047  * Take off the FDs reserved for system() etc.
1048  */
1050 
1051  /*
1052  * Make sure we still have enough to get by.
1053  */
/* Falling below FD_MINFREE is reported at FATAL level. */
1054  if (max_safe_fds < FD_MINFREE)
1055  ereport(FATAL,
1056  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1057  errmsg("insufficient file descriptors available to start server process"),
1058  errdetail("System allows %d, we need at least %d.",
1061 
1062  elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1063  max_safe_fds, usable_fds, already_open);
1064 }
1065 
1066 /*
1067  * Open a file with BasicOpenFilePerm() and pass default file mode for the
1068  * fileMode parameter.
1069  */
1070 int
1071 BasicOpenFile(const char *fileName, int fileFlags)
1072 {
1073  return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1074 }
1075 
1076 /*
1077  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1078  *
1079  * This is exported for use by places that really want a plain kernel FD,
1080  * but need to be proof against running out of FDs. Once an FD has been
1081  * successfully returned, it is the caller's responsibility to ensure that
1082  * it will not be leaked on ereport()! Most users should *not* call this
1083  * routine directly, but instead use the VFD abstraction level, which
1084  * provides protection against descriptor leaks as well as management of
1085  * files that need to be open for more than a short period of time.
1086  *
1087  * Ideally this should be the *only* direct call of open() in the backend.
1088  * In practice, the postmaster calls open() directly, and there are some
1089  * direct open() calls done early in backend startup. Those are OK since
1090  * this module wouldn't have any open files to close at that point anyway.
1091  */
/*
 * Returns the descriptor obtained from open() on success, or -1 with errno
 * set on failure.
 *
 * NOTE(review): the embedded numbering of this listing skips 1104 and 1119,
 * so the opening lines of two of the compile-time collision checks are not
 * visible here.
 */
1092 int
1093 BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1094 {
1095  int fd;
1096 
/* Re-entered after ReleaseLruFile() manages to free a descriptor. */
1097 tryAgain:
1098 #ifdef PG_O_DIRECT_USE_F_NOCACHE
1099 
1100  /*
1101  * The value we defined to stand in for O_DIRECT when simulating it with
1102  * F_NOCACHE had better not collide with any of the standard flags.
1103  */
1105  (O_APPEND |
1106  O_CREAT |
1107  O_EXCL |
1108  O_RDWR |
1109  O_RDONLY |
1110  O_SYNC |
1111  O_TRUNC |
1112  O_WRONLY)) == 0,
1113  "PG_O_DIRECT value collides with standard flag");
1114 #if defined(O_CLOEXEC)
1115  StaticAssertStmt((PG_O_DIRECT & O_CLOEXEC) == 0,
1116  "PG_O_DIRECT value collides with O_CLOEXEC");
1117 #endif
1118 #if defined(O_DSYNC)
1120  "PG_O_DIRECT value collides with O_DSYNC");
1121 #endif
1122 
/* The stand-in bit is masked out here; it is applied via fcntl() below. */
1123  fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1124 #else
1125  fd = open(fileName, fileFlags, fileMode);
1126 #endif
1127 
1128  if (fd >= 0)
1129  {
1130 #ifdef PG_O_DIRECT_USE_F_NOCACHE
1131  if (fileFlags & PG_O_DIRECT)
1132  {
1133  if (fcntl(fd, F_NOCACHE, 1) < 0)
1134  {
/* Keep fcntl()'s errno across close() so the caller sees the real cause. */
1135  int save_errno = errno;
1136 
1137  close(fd);
1138  errno = save_errno;
1139  return -1;
1140  }
1141  }
1142 #endif
1143 
1144  return fd; /* success! */
1145  }
1146 
/* Only descriptor exhaustion is retried; any other errno falls through. */
1147  if (errno == EMFILE || errno == ENFILE)
1148  {
1149  int save_errno = errno;
1150 
1151  ereport(LOG,
1152  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1153  errmsg("out of file descriptors: %m; release and retry")));
1154  errno = 0;
1155  if (ReleaseLruFile())
1156  goto tryAgain;
/* Nothing could be released: report the original open() failure. */
1157  errno = save_errno;
1158  }
1159 
1160  return -1; /* failure */
1161 }
1162 
1163 /*
1164  * AcquireExternalFD - attempt to reserve an external file descriptor
1165  *
1166  * This should be used by callers that need to hold a file descriptor open
1167  * over more than a short interval, but cannot use any of the other facilities
1168  * provided by this module.
1169  *
1170  * The difference between this and the underlying ReserveExternalFD function
1171  * is that this will report failure (by setting errno and returning false)
1172  * if "too many" external FDs are already reserved. This should be used in
1173  * any code where the total number of FDs to be reserved is not predictable
1174  * and small.
1175  */
/*
 * Returns true when the reservation is accepted; otherwise sets errno to
 * EMFILE and returns false.
 *
 * NOTE(review): the embedded numbering of this listing skips 1177 and 1185,
 * so the function-name line and the statement executed before "return true"
 * are not visible here.
 */
1176 bool
1178 {
1179  /*
1180  * We don't want more than max_safe_fds / 3 FDs to be consumed for
1181  * "external" FDs.
1182  */
1183  if (numExternalFDs < max_safe_fds / 3)
1184  {
1186  return true;
1187  }
1188  errno = EMFILE;
1189  return false;
1190 }
1191 
1192 /*
1193  * ReserveExternalFD - report external consumption of a file descriptor
1194  *
1195  * This should be used by callers that need to hold a file descriptor open
1196  * over more than a short interval, but cannot use any of the other facilities
1197  * provided by this module. This just tracks the use of the FD and closes
1198  * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1199  *
1200  * Call this directly only in code where failure to reserve the FD would be
1201  * fatal; for example, the WAL-writing code does so, since the alternative is
1202  * session failure. Also, it's very unwise to do so in code that could
1203  * consume more than one FD per process.
1204  *
1205  * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1206  * available, it doesn't matter too much whether this is called before or
1207  * after actually opening the FD; but doing so beforehand reduces the risk of
1208  * an EMFILE failure if not everybody played nice. In any case, it's solely
1209  * caller's responsibility to keep the external-FD count in sync with reality.
1210  */
/*
 * NOTE(review): the embedded numbering of this listing skips 1212, so the
 * function-name line is not visible here.
 */
1211 void
1213 {
1214  /*
1215  * Release VFDs if needed to stay safe. Because we do this before
1216  * incrementing numExternalFDs, the final state will be as desired, i.e.,
1217  * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1218  */
1219  ReleaseLruFiles();
1220 
/* Account for the descriptor the caller holds (or is about to open). */
1221  numExternalFDs++;
1222 }
1223 
1224 /*
1225  * ReleaseExternalFD - report release of an external file descriptor
1226  *
1227  * This is guaranteed not to change errno, so it can be used in failure paths.
1228  */
/*
 * NOTE(review): the embedded numbering of this listing skips 1230, so the
 * function-name line is not visible here.
 */
1229 void
1231 {
/* A release without a matching reservation trips this assertion. */
1232  Assert(numExternalFDs > 0);
1233  numExternalFDs--;
1234 }
1235 
1236 
1237 #if defined(FDDEBUG)
1238 
1239 static void
1240 _dump_lru(void)
1241 {
1242  int mru = VfdCache[0].lruLessRecently;
1243  Vfd *vfdP = &VfdCache[mru];
1244  char buf[2048];
1245 
1246  snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
1247  while (mru != 0)
1248  {
1249  mru = vfdP->lruLessRecently;
1250  vfdP = &VfdCache[mru];
1251  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
1252  }
1253  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
1254  elog(LOG, "%s", buf);
1255 }
1256 #endif /* FDDEBUG */
1257 
/*
 * LRU-ring helper; presumably Delete(), judging from the debug message and
 * from the Delete(file) calls elsewhere in this file.
 *
 * NOTE(review): the embedded numbering of this listing skips 1259 and
 * 1271-1272, so the function-name line and the statements that follow the
 * vfdP assignment are not visible here.
 */
1258 static void
1260 {
1261  Vfd *vfdP;
1262 
/* Index 0 is never a valid argument here. */
1263  Assert(file != 0);
1264 
1265  DO_DB(elog(LOG, "Delete %d (%s)",
1266  file, VfdCache[file].fileName));
1267  DO_DB(_dump_lru());
1268 
1269  vfdP = &VfdCache[file];
1270 
1273 
1274  DO_DB(_dump_lru());
1275 }
1276 
/*
 * Closes the kernel descriptor of a VFD, marks it VFD_CLOSED, decrements
 * nfile and removes the entry from the LRU ring.  Presumably LruDelete(),
 * judging from the debug message.
 *
 * NOTE(review): the embedded numbering of this listing skips 1278 and 1294,
 * so the function-name line and the first line of the close-failure report
 * are not visible here.
 */
1277 static void
1279 {
1280  Vfd *vfdP;
1281 
1282  Assert(file != 0);
1283 
1284  DO_DB(elog(LOG, "LruDelete %d (%s)",
1285  file, VfdCache[file].fileName));
1286 
1287  vfdP = &VfdCache[file];
1288 
1289  /*
1290  * Close the file. We aren't expecting this to fail; if it does, better
1291  * to leak the FD than to mess up our internal state.
1292  */
1293  if (close(vfdP->fd) != 0)
1295  "could not close file \"%s\": %m", vfdP->fileName);
/* The bookkeeping is updated whether or not close() succeeded. */
1296  vfdP->fd = VFD_CLOSED;
1297  --nfile;
1298 
1299  /* delete the vfd record from the LRU ring */
1300  Delete(file);
1301 }
1302 
/*
 * LRU-ring helper; presumably Insert(), judging from the debug message and
 * from the Insert(file) calls elsewhere in this file.
 *
 * NOTE(review): the embedded numbering of this listing skips 1304, 1317 and
 * 1319, so the function-name line and two of the link updates are not
 * visible here.
 */
1303 static void
1305 {
1306  Vfd *vfdP;
1307 
1308  Assert(file != 0);
1309 
1310  DO_DB(elog(LOG, "Insert %d (%s)",
1311  file, VfdCache[file].fileName));
1312  DO_DB(_dump_lru());
1313 
1314  vfdP = &VfdCache[file];
1315 
/* Entry 0 serves as the ring header; the new entry is linked next to it. */
1316  vfdP->lruMoreRecently = 0;
1318  VfdCache[0].lruLessRecently = file;
1320 
1321  DO_DB(_dump_lru());
1322 }
1323 
1324 /* returns 0 on success, -1 on re-open failure (with errno set) */
/*
 * Re-opens the kernel file behind a VFD when it is not currently open, then
 * places the VFD at the head of the LRU ring.  Presumably LruInsert(),
 * judging from the debug message.
 *
 * NOTE(review): the embedded numbering of this listing skips 1326, so the
 * function-name line is not visible here.
 */
1325 static int
1327 {
1328  Vfd *vfdP;
1329 
1330  Assert(file != 0);
1331 
1332  DO_DB(elog(LOG, "LruInsert %d (%s)",
1333  file, VfdCache[file].fileName));
1334 
1335  vfdP = &VfdCache[file];
1336 
1337  if (FileIsNotOpen(file))
1338  {
1339  /* Close excess kernel FDs. */
1340  ReleaseLruFiles();
1341 
1342  /*
1343  * The open could still fail for lack of file descriptors, eg due to
1344  * overall system file table being full. So, be prepared to release
1345  * another FD if necessary...
1346  */
/* Re-open with the name, flags and mode saved in the VFD. */
1347  vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1348  vfdP->fileMode);
1349  if (vfdP->fd < 0)
1350  {
/* On failure the VFD is not inserted into the ring. */
1351  DO_DB(elog(LOG, "re-open failed: %m"));
1352  return -1;
1353  }
1354  else
1355  {
1356  ++nfile;
1357  }
1358  }
1359 
1360  /*
1361  * put it at the head of the Lru ring
1362  */
1363 
1364  Insert(file);
1365 
1366  return 0;
1367 }
1368 
1369 /*
1370  * Release one kernel FD by closing the least-recently-used VFD.
1371  */
/*
 * Returns true if a file was closed, false if nfile is zero and there was
 * nothing to close.
 *
 * NOTE(review): the embedded numbering of this listing skips 1373, so the
 * function-name line is not visible here.
 */
1372 static bool
1374 {
1375  DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1376 
1377  if (nfile > 0)
1378  {
1379  /*
1380  * There are opened files and so there should be at least one used vfd
1381  * in the ring.
1382  */
1383  Assert(VfdCache[0].lruMoreRecently != 0);
/* The victim is the entry the ring header links to via lruMoreRecently. */
1384  LruDelete(VfdCache[0].lruMoreRecently);
1385  return true; /* freed a file */
1386  }
1387  return false; /* no files available to free */
1388 }
1389 
1390 /*
1391  * Release kernel FDs as needed to get under the max_safe_fds limit.
1392  * After calling this, it's OK to try to open another file.
1393  */
/*
 * NOTE(review): the embedded numbering of this listing skips 1395 and 1397,
 * so the function-name line and the loop header (with its limit condition)
 * are not visible here; only the loop body survives.
 */
1394 static void
1396 {
1398  {
/* Stop as soon as there is nothing left that can be closed. */
1399  if (!ReleaseLruFile())
1400  break;
1401  }
1402 }
1403 
/*
 * Takes a slot off the VFD free list, first enlarging the VfdCache array if
 * the free list is empty, and returns the slot's index.  Raises
 * ereport(ERROR) if the array cannot be enlarged.  Presumably AllocateVfd(),
 * judging from the debug message.
 *
 * NOTE(review): the embedded numbering of this listing skips 1405 and 1447,
 * so the function-name line and the statement that follows termination of
 * the new free-list chain are not visible here.
 */
1404 static File
1406 {
1407  Index i;
1408  File file;
1409 
1410  DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1411 
1412  Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1413 
/* The free list is threaded through nextFree, headed by entry 0. */
1414  if (VfdCache[0].nextFree == 0)
1415  {
1416  /*
1417  * The free list is empty so it is time to increase the size of the
1418  * array. We choose to double it each time this happens. However,
1419  * there's not much point in starting *real* small.
1420  */
1421  Size newCacheSize = SizeVfdCache * 2;
1422  Vfd *newVfdCache;
1423 
1424  if (newCacheSize < 32)
1425  newCacheSize = 32;
1426 
1427  /*
1428  * Be careful not to clobber VfdCache ptr if realloc fails.
1429  */
1430  newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1431  if (newVfdCache == NULL)
1432  ereport(ERROR,
1433  (errcode(ERRCODE_OUT_OF_MEMORY),
1434  errmsg("out of memory")));
1435  VfdCache = newVfdCache;
1436 
1437  /*
1438  * Initialize the new entries and link them into the free list.
1439  */
1440  for (i = SizeVfdCache; i < newCacheSize; i++)
1441  {
1442  MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1443  VfdCache[i].nextFree = i + 1;
1444  VfdCache[i].fd = VFD_CLOSED;
1445  }
/* The last new entry ends the chain; 0 means "no next free slot". */
1446  VfdCache[newCacheSize - 1].nextFree = 0;
1448 
1449  /*
1450  * Record the new size
1451  */
1452  SizeVfdCache = newCacheSize;
1453  }
1454 
/* Pop the first free slot. */
1455  file = VfdCache[0].nextFree;
1456 
1457  VfdCache[0].nextFree = VfdCache[file].nextFree;
1458 
1459  return file;
1460 }
1461 
/*
 * Returns a VFD slot to the free list: frees the stored file name if there
 * is one, clears fdstate and pushes the slot onto the list headed by
 * VfdCache[0].nextFree.  Presumably FreeVfd(), judging from the debug
 * message.
 *
 * NOTE(review): the embedded numbering of this listing skips 1463, so the
 * function-name line is not visible here.
 */
1462 static void
1464 {
1465  Vfd *vfdP = &VfdCache[file];
1466 
1467  DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1468  file, vfdP->fileName ? vfdP->fileName : ""));
1469 
1470  if (vfdP->fileName != NULL)
1471  {
/* The name is released with free(), not pfree(). */
1472  free(vfdP->fileName);
1473  vfdP->fileName = NULL;
1474  }
1475  vfdP->fdstate = 0x0;
1476 
/* Push onto the front of the free list. */
1477  vfdP->nextFree = VfdCache[0].nextFree;
1478  VfdCache[0].nextFree = file;
1479 }
1480 
1481 /* returns 0 on success, -1 on re-open failure (with errno set) */
/*
 * Makes sure the VFD has an open kernel descriptor and is the most recently
 * used ring entry.  Presumably FileAccess(), judging from the debug message.
 *
 * NOTE(review): the embedded numbering of this listing skips 1483, so the
 * function-name line is not visible here.
 */
1482 static int
1484 {
1485  int returnValue;
1486 
1487  DO_DB(elog(LOG, "FileAccess %d (%s)",
1488  file, VfdCache[file].fileName));
1489 
1490  /*
1491  * Is the file open? If not, open it and put it at the head of the LRU
1492  * ring (possibly closing the least recently used file to get an FD).
1493  */
1494 
1495  if (FileIsNotOpen(file))
1496  {
/* A failed re-open is passed straight back to the caller. */
1497  returnValue = LruInsert(file);
1498  if (returnValue != 0)
1499  return returnValue;
1500  }
1501  else if (VfdCache[0].lruLessRecently != file)
1502  {
1503  /*
1504  * We now know that the file is open and that it is not the last one
1505  * accessed, so we need to move it to the head of the Lru ring.
1506  */
1507 
1508  Delete(file);
1509  Insert(file);
1510  }
1511 
1512  return 0;
1513 }
1514 
1515 /*
1516  * Called whenever a temporary file is deleted to report its size.
1517  */
1518 static void
1519 ReportTemporaryFileUsage(const char *path, off_t size)
1520 {
1521  pgstat_report_tempfile(size);
1522 
1523  if (log_temp_files >= 0)
1524  {
1525  if ((size / 1024) >= log_temp_files)
1526  ereport(LOG,
1527  (errmsg("temporary file: path \"%s\", size %lu",
1528  path, (unsigned long) size)));
1529  }
1530 }
1531 
1532 /*
1533  * Called to register a temporary file for automatic close.
1534  * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1535  * before the file was opened.
1536  */
/*
 * NOTE(review): the embedded numbering of this listing skips 1538, 1540-1541
 * and 1544-1545, so the function-name line and every statement of the body
 * are missing here; only one comment survives.  Consult the real file
 * before relying on this block.
 */
1537 static void
1539 {
1542 
1543  /* Backup mechanism for closing at end of xact. */
1546 }
1547 
1548 /*
1549  * Called when we get a shared invalidation message on some relation.
1550  */
1551 #ifdef NOT_USED
1552 void
1553 FileInvalidate(File file)
1554 {
1555  Assert(FileIsValid(file));
1556  if (!FileIsNotOpen(file))
1557  LruDelete(file);
1558 }
1559 #endif
1560 
1561 /*
1562  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1563  * fileMode parameter.
1564  */
1565 File
1566 PathNameOpenFile(const char *fileName, int fileFlags)
1567 {
1568  return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1569 }
1570 
1571 /*
1572  * open a file in an arbitrary directory
1573  *
1574  * NB: if the passed pathname is relative (which it usually is),
1575  * it will be interpreted relative to the process' working directory
1576  * (which should always be $PGDATA when this code is running).
1577  */
1578 File
1579 PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1580 {
1581  char *fnamecopy;
1582  File file;
1583  Vfd *vfdP;
1584 
1585  DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1586  fileName, fileFlags, fileMode));
1587 
1588  /*
1589  * We need a malloc'd copy of the file name; fail cleanly if no room.
1590  */
1591  fnamecopy = strdup(fileName);
1592  if (fnamecopy == NULL)
1593  ereport(ERROR,
1594  (errcode(ERRCODE_OUT_OF_MEMORY),
1595  errmsg("out of memory")));
1596 
1597  file = AllocateVfd();
1598  vfdP = &VfdCache[file];
1599 
1600  /* Close excess kernel FDs. */
1601  ReleaseLruFiles();
1602 
1603  vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1604 
1605  if (vfdP->fd < 0)
1606  {
1607  int save_errno = errno;
1608 
1609  FreeVfd(file);
1610  free(fnamecopy);
1611  errno = save_errno;
1612  return -1;
1613  }
1614  ++nfile;
1615  DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1616  vfdP->fd));
1617 
1618  vfdP->fileName = fnamecopy;
1619  /* Saved flags are adjusted to be OK for re-opening file */
1620  vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1621  vfdP->fileMode = fileMode;
1622  vfdP->fileSize = 0;
1623  vfdP->fdstate = 0x0;
1624  vfdP->resowner = NULL;
1625 
1626  Insert(file);
1627 
1628  return file;
1629 }
1630 
1631 /*
1632  * Create directory 'directory'. If necessary, create 'basedir', which must
1633  * be the directory above it. This is designed for creating the top-level
1634  * temporary directory on demand before creating a directory underneath it.
1635  * Do nothing if the directory already exists.
1636  *
1637  * Directories created within the top-level temporary directory should begin
1638  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1639  * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1640  * that do not need any particular prefix.
1641 */
/*
 * NOTE(review): the embedded numbering of this listing skips 1643, 1657 and
 * 1664, so the function-name line and the first argument line of each
 * ereport() call are not visible here.
 */
1642 void
1644 {
1645  if (MakePGDirectory(directory) < 0)
1646  {
/* An already-existing directory counts as success. */
1647  if (errno == EEXIST)
1648  return;
1649 
1650  /*
1651  * Failed. Try to create basedir first in case it's missing. Tolerate
1652  * EEXIST to close a race against another process following the same
1653  * algorithm.
1654  */
1655  if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1656  ereport(ERROR,
1658  errmsg("cannot create temporary directory \"%s\": %m",
1659  basedir)));
1660 
1661  /* Try again. */
1662  if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1663  ereport(ERROR,
1665  errmsg("cannot create temporary subdirectory \"%s\": %m",
1666  directory)));
1667  }
1668 }
1669 
1670 /*
1671  * Delete a directory and everything in it, if it exists.
1672  */
1673 void
1674 PathNameDeleteTemporaryDir(const char *dirname)
1675 {
1676  struct stat statbuf;
1677 
1678  /* Silently ignore missing directory. */
1679  if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1680  return;
1681 
1682  /*
1683  * Currently, walkdir doesn't offer a way for our passed in function to
1684  * maintain state. Perhaps it should, so that we could tell the caller
1685  * whether this operation succeeded or failed. Since this operation is
1686  * used in a cleanup path, we wouldn't actually behave differently: we'll
1687  * just log failures.
1688  */
1689  walkdir(dirname, unlink_if_exists_fname, false, LOG);
1690 }
1691 
1692 /*
1693  * Open a temporary file that will disappear when we close it.
1694  *
1695  * This routine takes care of generating an appropriate tempfile name.
1696  * There's no need to pass in fileFlags or fileMode either, since only
1697  * one setting makes any sense for a temp file.
1698  *
1699  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1700  * to ensure it's closed and deleted when it's no longer needed, typically at
1701  * the end-of-transaction. In most cases, you don't want temporary files to
1702  * outlive the transaction that created them, so this should be false -- but
1703  * if you need "somewhat" temporary storage, this might be useful. In either
1704  * case, the file is removed when the File is explicitly closed.
1705  */
/*
 * NOTE(review): the embedded numbering of this listing skips 1718,
 * 1743-1744 and 1749.  The statement guarded by the first "if (!interXact)",
 * the start of the fallback open call, and the statement that follows the
 * "Mark it for deletion" comment are therefore not visible here.
 */
1706 File
1707 OpenTemporaryFile(bool interXact)
1708 {
1709  File file = 0;
1710 
1711  Assert(temporary_files_allowed); /* check temp file access is up */
1712 
1713  /*
1714  * Make sure the current resource owner has space for this File before we
1715  * open it, if we'll be registering it below.
1716  */
1717  if (!interXact)
1719 
1720  /*
1721  * If some temp tablespace(s) have been given to us, try to use the next
1722  * one. If a given tablespace can't be found, we silently fall back to
1723  * the database's default tablespace.
1724  *
1725  * BUT: if the temp file is slated to outlive the current transaction,
1726  * force it into the database's default tablespace, so that it will not
1727  * pose a threat to possible tablespace drop attempts.
1728  */
1729  if (numTempTableSpaces > 0 && !interXact)
1730  {
1731  Oid tblspcOid = GetNextTempTableSpace();
1732 
/* rejectError is false here, so a failed open falls through to the fallback below. */
1733  if (OidIsValid(tblspcOid))
1734  file = OpenTemporaryFileInTablespace(tblspcOid, false);
1735  }
1736 
1737  /*
1738  * If not, or if tablespace is bad, create in database's default
1739  * tablespace. MyDatabaseTableSpace should normally be set before we get
1740  * here, but just in case it isn't, fall back to pg_default tablespace.
1741  */
1742  if (file <= 0)
1745  DEFAULTTABLESPACE_OID,
1746  true);
1747 
1748  /* Mark it for deletion at close and temporary file size limit */
1750 
1751  /* Register it with the current resource owner */
1752  if (!interXact)
1753  RegisterTemporaryFile(file);
1754 
1755  return file;
1756 }
1757 
1758 /*
1759  * Return the path of the temp directory in a given tablespace.
1760  */
/*
 * Writes the result into "path" using snprintf() bounded by MAXPGPATH.
 *
 * NOTE(review): the embedded numbering of this listing skips 1762 and
 * 1777-1778, so the function-name line and the arguments of the second
 * snprintf() call are not visible here.
 */
1761 void
1763 {
1764  /*
1765  * Identify the tempfile directory for this tablespace.
1766  *
1767  * If someone tries to specify pg_global, use pg_default instead.
1768  */
1769  if (tablespace == InvalidOid ||
1770  tablespace == DEFAULTTABLESPACE_OID ||
1771  tablespace == GLOBALTABLESPACE_OID)
1772  snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1773  else
1774  {
1775  /* All other tablespaces are accessed via symlinks */
1776  snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1779  }
1780 }
1781 
1782 /*
1783  * Open a temporary file in a specific tablespace.
1784  * Subroutine for OpenTemporaryFile, which see for details.
1785  */
1786 static File
1787 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1788 {
1789  char tempdirpath[MAXPGPATH];
1790  char tempfilepath[MAXPGPATH];
1791  File file;
1792 
1793  TempTablespacePath(tempdirpath, tblspcOid);
1794 
1795  /*
1796  * Generate a tempfile name that should be unique within the current
1797  * database instance.
1798  */
1799  snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1800  tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1801 
1802  /*
1803  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1804  * temp file that can be reused.
1805  */
1806  file = PathNameOpenFile(tempfilepath,
1807  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1808  if (file <= 0)
1809  {
1810  /*
1811  * We might need to create the tablespace's tempfile directory, if no
1812  * one has yet done so.
1813  *
1814  * Don't check for an error from MakePGDirectory; it could fail if
1815  * someone else just did the same thing. If it doesn't work then
1816  * we'll bomb out on the second create attempt, instead.
1817  */
1818  (void) MakePGDirectory(tempdirpath);
1819 
1820  file = PathNameOpenFile(tempfilepath,
1821  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1822  if (file <= 0 && rejectError)
1823  elog(ERROR, "could not create temporary file \"%s\": %m",
1824  tempfilepath);
1825  }
1826 
1827  return file;
1828 }
1829 
1830 
1831 /*
1832  * Create a new file. The directory containing it must already exist. Files
1833  * created this way are subject to temp_file_limit and are automatically
1834  * closed at end of transaction, but are not automatically deleted on close
1835  * because they are intended to be shared between cooperating backends.
1836  *
1837  * If the file is inside the top-level temporary directory, its name should
1838  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1839  * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1840  * inside a directory created with PathNameCreateTemporaryDir(), in which case
1841  * the prefix isn't needed.
1842  */
/*
 * With error_on_failure false, a failed open returns the failing result of
 * PathNameOpenFile() instead of raising an error.
 *
 * NOTE(review): the embedded numbering of this listing skips 1850, 1861 and
 * 1869.  A statement before the open, the first argument line of the
 * ereport() call, and the statement that follows the "Mark it for
 * temp_file_limit accounting" comment are therefore not visible here.
 */
1843 File
1844 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1845 {
1846  File file;
1847 
1848  Assert(temporary_files_allowed); /* check temp file access is up */
1849 
1851 
1852  /*
1853  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1854  * temp file that can be reused.
1855  */
1856  file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1857  if (file <= 0)
1858  {
1859  if (error_on_failure)
1860  ereport(ERROR,
1862  errmsg("could not create temporary file \"%s\": %m",
1863  path)));
1864  else
1865  return file;
1866  }
1867 
1868  /* Mark it for temp_file_limit accounting. */
1870 
1871  /* Register it for automatic close. */
1872  RegisterTemporaryFile(file);
1873 
1874  return file;
1875 }
1876 
1877 /*
1878  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1879  * another backend. Files opened this way don't count against the
1880  * temp_file_limit of the caller, are automatically closed at the end of the
1881  * transaction but are not deleted on close.
1882  */
/*
 * Returns the opened File, or the failing result of PathNameOpenFile() when
 * the file does not exist (ENOENT); any other open failure raises an error.
 *
 * NOTE(review): the embedded numbering of this listing skips 1890 and 1897,
 * so a statement before the open and the first argument line of the
 * ereport() call are not visible here.
 */
1883 File
1884 PathNameOpenTemporaryFile(const char *path, int mode)
1885 {
1886  File file;
1887 
1888  Assert(temporary_files_allowed); /* check temp file access is up */
1889 
1891 
1892  file = PathNameOpenFile(path, mode | PG_BINARY);
1893 
1894  /* If no such file, then we don't raise an error. */
1895  if (file <= 0 && errno != ENOENT)
1896  ereport(ERROR,
1898  errmsg("could not open temporary file \"%s\": %m",
1899  path)));
1900 
/* Only a successfully opened file is registered. */
1901  if (file > 0)
1902  {
1903  /* Register it for automatic close. */
1904  RegisterTemporaryFile(file);
1905  }
1906 
1907  return file;
1908 }
1909 
1910 /*
1911  * Delete a file by pathname. Return true if the file existed, false if
1912  * it didn't.
1913  */
/*
 * Returns false if stat() reports ENOENT or if unlink() fails; returns true
 * once the file has been unlinked.  Unlink failures other than ENOENT are
 * reported at ERROR when error_on_failure is true, at LOG otherwise.
 *
 * NOTE(review): the embedded numbering of this listing skips 1938 and 1950,
 * so the first argument line of two ereport() calls is not visible here.
 */
1914 bool
1915 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1916 {
1917  struct stat filestats;
1918  int stat_errno;
1919 
1920  /* Get the final size for pgstat reporting. */
1921  if (stat(path, &filestats) != 0)
1922  stat_errno = errno;
1923  else
1924  stat_errno = 0;
1925 
1926  /*
1927  * Unlike FileClose's automatic file deletion code, we tolerate
1928  * non-existence to support BufFileDeleteFileSet which doesn't know how
1929  * many segments it has to delete until it runs out.
1930  */
1931  if (stat_errno == ENOENT)
1932  return false;
1933 
1934  if (unlink(path) < 0)
1935  {
/* ENOENT from unlink() is not reported, but still returns false. */
1936  if (errno != ENOENT)
1937  ereport(error_on_failure ? ERROR : LOG,
1939  errmsg("could not unlink temporary file \"%s\": %m",
1940  path)));
1941  return false;
1942  }
1943 
/* The size is reported only if the earlier stat() succeeded. */
1944  if (stat_errno == 0)
1945  ReportTemporaryFileUsage(path, filestats.st_size);
1946  else
1947  {
/* Restore stat()'s errno so that %m describes the stat failure. */
1948  errno = stat_errno;
1949  ereport(LOG,
1951  errmsg("could not stat file \"%s\": %m", path)));
1952  }
1953 
1954  return true;
1955 }
1956 
1957 /*
1958  * close a file when done with it
1959  */
/*
 * Closes the kernel descriptor if one is open, undoes temp-file size
 * accounting, unlinks the file if it is flagged FD_DELETE_AT_CLOSE, detaches
 * it from its resource owner and returns the VFD slot to the free list.
 *
 * NOTE(review): the embedded numbering of this listing skips 1961, 1981,
 * 2026 and 2036.  The function-name line, the first line of the
 * close-failure report, and the first argument line of two ereport() calls
 * are therefore not visible here.
 */
1960 void
1962 {
1963  Vfd *vfdP;
1964 
1965  Assert(FileIsValid(file));
1966 
1967  DO_DB(elog(LOG, "FileClose: %d (%s)",
1968  file, VfdCache[file].fileName));
1969 
1970  vfdP = &VfdCache[file];
1971 
1972  if (!FileIsNotOpen(file))
1973  {
1974  /* close the file */
1975  if (close(vfdP->fd) != 0)
1976  {
1977  /*
1978  * We may need to panic on failure to close non-temporary files;
1979  * see LruDelete.
1980  */
1982  "could not close file \"%s\": %m", vfdP->fileName);
1983  }
1984 
1985  --nfile;
1986  vfdP->fd = VFD_CLOSED;
1987 
1988  /* remove the file from the lru ring */
1989  Delete(file);
1990  }
1991 
1992  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1993  {
1994  /* Subtract its size from current usage (do first in case of error) */
1995  temporary_files_size -= vfdP->fileSize;
1996  vfdP->fileSize = 0;
1997  }
1998 
1999  /*
2000  * Delete the file if it was temporary, and make a log entry if wanted
2001  */
2002  if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
2003  {
2004  struct stat filestats;
2005  int stat_errno;
2006 
2007  /*
2008  * If we get an error, as could happen within the ereport/elog calls,
2009  * we'll come right back here during transaction abort. Reset the
2010  * flag to ensure that we can't get into an infinite loop. This code
2011  * is arranged to ensure that the worst-case consequence is failing to
2012  * emit log message(s), not failing to attempt the unlink.
2013  */
2014  vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
2015 
2016 
2017  /* first try the stat() */
2018  if (stat(vfdP->fileName, &filestats))
2019  stat_errno = errno;
2020  else
2021  stat_errno = 0;
2022 
2023  /* in any case do the unlink */
2024  if (unlink(vfdP->fileName))
2025  ereport(LOG,
2027  errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
2028 
2029  /* and last report the stat results */
2030  if (stat_errno == 0)
2031  ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2032  else
2033  {
/* Restore stat()'s errno so that %m describes the stat failure. */
2034  errno = stat_errno;
2035  ereport(LOG,
2037  errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2038  }
2039  }
2040 
2041  /* Unregister it from the resource owner */
2042  if (vfdP->resowner)
2043  ResourceOwnerForgetFile(vfdP->resowner, file);
2044 
2045  /*
2046  * Return the Vfd slot to the free list
2047  */
2048  FreeVfd(file);
2049 }
2050 
2051 /*
2052  * FilePrefetch - initiate asynchronous read of a given range of the file.
2053  *
2054  * Currently the only implementation of this function is using posix_fadvise
2055  * which is the simplest standardized interface that accomplishes this.
2056  * We could add an implementation using libaio in the future; but note that
2057  * this API is inappropriate for libaio, which wants to have a buffer provided
2058  * to read into.
2059  */
/*
 * Returns the FileAccess() result if that is negative, otherwise the
 * posix_fadvise() result.  Where posix_fadvise or POSIX_FADV_WILLNEED is
 * unavailable, the function only validates the File and returns 0.
 *
 * NOTE(review): the embedded numbering of this listing skips 2079, so the
 * statement between the posix_fadvise() call and the return is not visible
 * here (presumably the wait-event end call - confirm).
 */
2060 int
2061 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
2062 {
2063 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2064  int returnCode;
2065 
2066  Assert(FileIsValid(file));
2067 
2068  DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
2069  file, VfdCache[file].fileName,
2070  (int64) offset, amount));
2071 
/* Make sure a kernel descriptor is open before using VfdCache[file].fd. */
2072  returnCode = FileAccess(file);
2073  if (returnCode < 0)
2074  return returnCode;
2075 
2076  pgstat_report_wait_start(wait_event_info);
2077  returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2078  POSIX_FADV_WILLNEED);
2080 
2081  return returnCode;
2082 #else
2083  Assert(FileIsValid(file));
2084  return 0;
2085 #endif
2086 }
2087 
/*
 * FileWriteback - hand a byte range of the file to pg_flush_data().
 *
 * Does nothing when nbytes is not positive or when FileAccess() fails; no
 * status is reported to the caller.
 *
 * NOTE(review): the embedded numbering of this listing skips 2108, so the
 * statement after the pg_flush_data() call is not visible here (presumably
 * the wait-event end call - confirm).
 */
2088 void
2089 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
2090 {
2091  int returnCode;
2092 
2093  Assert(FileIsValid(file));
2094 
2095  DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2096  file, VfdCache[file].fileName,
2097  (int64) offset, (int64) nbytes));
2098 
2099  if (nbytes <= 0)
2100  return;
2101 
/* A failed re-open is silently ignored here. */
2102  returnCode = FileAccess(file);
2103  if (returnCode < 0)
2104  return;
2105 
2106  pgstat_report_wait_start(wait_event_info);
2107  pg_flush_data(VfdCache[file].fd, offset, nbytes);
2109 }
2110 
/*
 * FileRead - read up to "amount" bytes at "offset" through pg_pread().
 *
 * Returns the FileAccess() result if that is negative, otherwise the
 * pg_pread() result.  The read is repeated for as long as it fails with
 * errno equal to EINTR.
 *
 * NOTE(review): the embedded numbering of this listing skips 2134, so the
 * statement after the pg_pread() call is not visible here (presumably the
 * wait-event end call - confirm).
 */
2111 int
2112 FileRead(File file, char *buffer, int amount, off_t offset,
2113  uint32 wait_event_info)
2114 {
2115  int returnCode;
2116  Vfd *vfdP;
2117 
2118  Assert(FileIsValid(file));
2119 
2120  DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
2121  file, VfdCache[file].fileName,
2122  (int64) offset,
2123  amount, buffer));
2124 
2125  returnCode = FileAccess(file);
2126  if (returnCode < 0)
2127  return returnCode;
2128 
2129  vfdP = &VfdCache[file];
2130 
2131 retry:
2132  pgstat_report_wait_start(wait_event_info);
2133  returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
2135 
2136  if (returnCode < 0)
2137  {
2138  /*
2139  * Windows may run out of kernel buffers and return "Insufficient
2140  * system resources" error. Wait a bit and retry to solve it.
2141  *
2142  * It is rumored that EINTR is also possible on some Unix filesystems,
2143  * in which case immediate retry is indicated.
2144  */
2145 #ifdef WIN32
2146  DWORD error = GetLastError();
2147 
2148  switch (error)
2149  {
2150  case ERROR_NO_SYSTEM_RESOURCES:
/* Sleep, then set EINTR so the generic retry check below fires. */
2151  pg_usleep(1000L);
2152  errno = EINTR;
2153  break;
2154  default:
2155  _dosmaperr(error);
2156  break;
2157  }
2158 #endif
2159  /* OK to retry if interrupted */
2160  if (errno == EINTR)
2161  goto retry;
2162  }
2163 
2164  return returnCode;
2165 }
2166 
2167 int
2168 FileWrite(File file, char *buffer, int amount, off_t offset,
2169  uint32 wait_event_info)
2170 {
2171  int returnCode;
2172  Vfd *vfdP;
2173 
2174  Assert(FileIsValid(file));
2175 
2176  DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
2177  file, VfdCache[file].fileName,
2178  (int64) offset,
2179  amount, buffer));
2180 
2181  returnCode = FileAccess(file);
2182  if (returnCode < 0)
2183  return returnCode;
2184 
2185  vfdP = &VfdCache[file];
2186 
2187  /*
2188  * If enforcing temp_file_limit and it's a temp file, check to see if the
2189  * write would overrun temp_file_limit, and throw error if so. Note: it's
2190  * really a modularity violation to throw error here; we should set errno
2191  * and return -1. However, there's no way to report a suitable error
2192  * message if we do that. All current callers would just throw error
2193  * immediately anyway, so this is safe at present.
2194  */
2195  if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2196  {
2197  off_t past_write = offset + amount;
2198 
2199  if (past_write > vfdP->fileSize)
2200  {
2201  uint64 newTotal = temporary_files_size;
2202 
2203  newTotal += past_write - vfdP->fileSize;
2204  if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2205  ereport(ERROR,
2206  (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2207  errmsg("temporary file size exceeds temp_file_limit (%dkB)",
2208  temp_file_limit)));
2209  }
2210  }
2211 
2212 retry:
2213  errno = 0;
2214  pgstat_report_wait_start(wait_event_info);
2215  returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
2217 
2218  /* if write didn't set errno, assume problem is no disk space */
2219  if (returnCode != amount && errno == 0)
2220  errno = ENOSPC;
2221 
2222  if (returnCode >= 0)
2223  {
2224  /*
2225  * Maintain fileSize and temporary_files_size if it's a temp file.
2226  */
2227  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2228  {
2229  off_t past_write = offset + amount;
2230 
2231  if (past_write > vfdP->fileSize)
2232  {
2233  temporary_files_size += past_write - vfdP->fileSize;
2234  vfdP->fileSize = past_write;
2235  }
2236  }
2237  }
2238  else
2239  {
2240  /*
2241  * See comments in FileRead()
2242  */
2243 #ifdef WIN32
2244  DWORD error = GetLastError();
2245 
2246  switch (error)
2247  {
2248  case ERROR_NO_SYSTEM_RESOURCES:
2249  pg_usleep(1000L);
2250  errno = EINTR;
2251  break;
2252  default:
2253  _dosmaperr(error);
2254  break;
2255  }
2256 #endif
2257  /* OK to retry if interrupted */
2258  if (errno == EINTR)
2259  goto retry;
2260  }
2261 
2262  return returnCode;
2263 }
2264 
2265 int
2266 FileSync(File file, uint32 wait_event_info)
2267 {
2268  int returnCode;
2269 
2270  Assert(FileIsValid(file));
2271 
2272  DO_DB(elog(LOG, "FileSync: %d (%s)",
2273  file, VfdCache[file].fileName));
2274 
2275  returnCode = FileAccess(file);
2276  if (returnCode < 0)
2277  return returnCode;
2278 
2279  pgstat_report_wait_start(wait_event_info);
2280  returnCode = pg_fsync(VfdCache[file].fd);
2282 
2283  return returnCode;
2284 }
2285 
2286 off_t
2288 {
2289  Assert(FileIsValid(file));
2290 
2291  DO_DB(elog(LOG, "FileSize %d (%s)",
2292  file, VfdCache[file].fileName));
2293 
2294  if (FileIsNotOpen(file))
2295  {
2296  if (FileAccess(file) < 0)
2297  return (off_t) -1;
2298  }
2299 
2300  return lseek(VfdCache[file].fd, 0, SEEK_END);
2301 }
2302 
2303 int
2304 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2305 {
2306  int returnCode;
2307 
2308  Assert(FileIsValid(file));
2309 
2310  DO_DB(elog(LOG, "FileTruncate %d (%s)",
2311  file, VfdCache[file].fileName));
2312 
2313  returnCode = FileAccess(file);
2314  if (returnCode < 0)
2315  return returnCode;
2316 
2317  pgstat_report_wait_start(wait_event_info);
2318  returnCode = ftruncate(VfdCache[file].fd, offset);
2320 
2321  if (returnCode == 0 && VfdCache[file].fileSize > offset)
2322  {
2323  /* adjust our state for truncation of a temp file */
2324  Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2325  temporary_files_size -= VfdCache[file].fileSize - offset;
2326  VfdCache[file].fileSize = offset;
2327  }
2328 
2329  return returnCode;
2330 }
2331 
2332 /*
2333  * Return the pathname associated with an open file.
2334  *
2335  * The returned string points to an internal buffer, which is valid until
2336  * the file is closed.
2337  */
2338 char *
2340 {
2341  Assert(FileIsValid(file));
2342 
2343  return VfdCache[file].fileName;
2344 }
2345 
2346 /*
2347  * Return the raw file descriptor of an opened file.
2348  *
2349  * The returned file descriptor will be valid until the file is closed, but
2350  * there are a lot of things that can make that happen. So the caller should
2351  * be careful not to do much of anything else before it finishes using the
2352  * returned file descriptor.
2353  */
2354 int
2356 {
2357  Assert(FileIsValid(file));
2358  return VfdCache[file].fd;
2359 }
2360 
2361 /*
2362  * FileGetRawFlags - returns the file flags on open(2)
2363  */
2364 int
2366 {
2367  Assert(FileIsValid(file));
2368  return VfdCache[file].fileFlags;
2369 }
2370 
2371 /*
2372  * FileGetRawMode - returns the mode bitmask passed to open(2)
2373  */
2374 mode_t
2376 {
2377  Assert(FileIsValid(file));
2378  return VfdCache[file].fileMode;
2379 }
2380 
2381 /*
2382  * Make room for another allocatedDescs[] array entry if needed and possible.
2383  * Returns true if an array element is available.
2384  */
2385 static bool
2387 {
2388  AllocateDesc *newDescs;
2389  int newMax;
2390 
2391  /* Quick out if array already has a free slot. */
2393  return true;
2394 
2395  /*
2396  * If the array hasn't yet been created in the current process, initialize
2397  * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2398  * we will ever need, anyway. We don't want to look at max_safe_fds
2399  * immediately because set_max_safe_fds() may not have run yet.
2400  */
2401  if (allocatedDescs == NULL)
2402  {
2403  newMax = FD_MINFREE / 3;
2404  newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2405  /* Out of memory already? Treat as fatal error. */
2406  if (newDescs == NULL)
2407  ereport(ERROR,
2408  (errcode(ERRCODE_OUT_OF_MEMORY),
2409  errmsg("out of memory")));
2410  allocatedDescs = newDescs;
2411  maxAllocatedDescs = newMax;
2412  return true;
2413  }
2414 
2415  /*
2416  * Consider enlarging the array beyond the initial allocation used above.
2417  * By the time this happens, max_safe_fds should be known accurately.
2418  *
2419  * We mustn't let allocated descriptors hog all the available FDs, and in
2420  * practice we'd better leave a reasonable number of FDs for VFD use. So
2421  * set the maximum to max_safe_fds / 3. (This should certainly be at
2422  * least as large as the initial size, FD_MINFREE / 3, so we aren't
2423  * tightening the restriction here.) Recall that "external" FDs are
2424  * allowed to consume another third of max_safe_fds.
2425  */
2426  newMax = max_safe_fds / 3;
2427  if (newMax > maxAllocatedDescs)
2428  {
2429  newDescs = (AllocateDesc *) realloc(allocatedDescs,
2430  newMax * sizeof(AllocateDesc));
2431  /* Treat out-of-memory as a non-fatal error. */
2432  if (newDescs == NULL)
2433  return false;
2434  allocatedDescs = newDescs;
2435  maxAllocatedDescs = newMax;
2436  return true;
2437  }
2438 
2439  /* Can't enlarge allocatedDescs[] any more. */
2440  return false;
2441 }
2442 
2443 /*
2444  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2445  * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2446  * necessary to open the file. When done, call FreeFile rather than fclose.
2447  *
2448  * Note that files that will be open for any significant length of time
2449  * should NOT be handled this way, since they cannot share kernel file
2450  * descriptors with other files; there is grave risk of running out of FDs
2451  * if anyone locks down too many FDs. Most callers of this routine are
2452  * simply reading a config file that they will read and close immediately.
2453  *
2454  * fd.c will automatically close all files opened with AllocateFile at
2455  * transaction commit or abort; this prevents FD leakage if a routine
2456  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2457  *
2458  * Ideally this should be the *only* direct call of fopen() in the backend.
2459  */
2460 FILE *
2461 AllocateFile(const char *name, const char *mode)
2462 {
2463  FILE *file;
2464 
2465  DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2467 
2468  /* Can we allocate another non-virtual FD? */
2469  if (!reserveAllocatedDesc())
2470  ereport(ERROR,
2471  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2472  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2473  maxAllocatedDescs, name)));
2474 
2475  /* Close excess kernel FDs. */
2476  ReleaseLruFiles();
2477 
2478 TryAgain:
2479  if ((file = fopen(name, mode)) != NULL)
2480  {
2482 
2483  desc->kind = AllocateDescFile;
2484  desc->desc.file = file;
2487  return desc->desc.file;
2488  }
2489 
2490  if (errno == EMFILE || errno == ENFILE)
2491  {
2492  int save_errno = errno;
2493 
2494  ereport(LOG,
2495  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2496  errmsg("out of file descriptors: %m; release and retry")));
2497  errno = 0;
2498  if (ReleaseLruFile())
2499  goto TryAgain;
2500  errno = save_errno;
2501  }
2502 
2503  return NULL;
2504 }
2505 
2506 /*
2507  * Open a file with OpenTransientFilePerm() and pass default file mode for
2508  * the fileMode parameter.
2509  */
2510 int
2511 OpenTransientFile(const char *fileName, int fileFlags)
2512 {
2513  return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2514 }
2515 
2516 /*
2517  * Like AllocateFile, but returns an unbuffered fd like open(2)
2518  */
2519 int
2520 OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2521 {
2522  int fd;
2523 
2524  DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2525  numAllocatedDescs, fileName));
2526 
2527  /* Can we allocate another non-virtual FD? */
2528  if (!reserveAllocatedDesc())
2529  ereport(ERROR,
2530  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2531  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2532  maxAllocatedDescs, fileName)));
2533 
2534  /* Close excess kernel FDs. */
2535  ReleaseLruFiles();
2536 
2537  fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2538 
2539  if (fd >= 0)
2540  {
2542 
2543  desc->kind = AllocateDescRawFD;
2544  desc->desc.fd = fd;
2547 
2548  return fd;
2549  }
2550 
2551  return -1; /* failure */
2552 }
2553 
2554 /*
2555  * Routines that want to initiate a pipe stream should use OpenPipeStream
2556  * rather than plain popen(). This lets fd.c deal with freeing FDs if
2557  * necessary. When done, call ClosePipeStream rather than pclose.
2558  *
2559  * This function also ensures that the popen'd program is run with default
2560  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2561  * uses. This ensures desirable response to, eg, closing a read pipe early.
2562  */
2563 FILE *
2564 OpenPipeStream(const char *command, const char *mode)
2565 {
2566  FILE *file;
2567  int save_errno;
2568 
2569  DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2570  numAllocatedDescs, command));
2571 
2572  /* Can we allocate another non-virtual FD? */
2573  if (!reserveAllocatedDesc())
2574  ereport(ERROR,
2575  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2576  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2577  maxAllocatedDescs, command)));
2578 
2579  /* Close excess kernel FDs. */
2580  ReleaseLruFiles();
2581 
2582 TryAgain:
2583  fflush(stdout);
2584  fflush(stderr);
2586  errno = 0;
2587  file = popen(command, mode);
2588  save_errno = errno;
2590  errno = save_errno;
2591  if (file != NULL)
2592  {
2594 
2595  desc->kind = AllocateDescPipe;
2596  desc->desc.file = file;
2599  return desc->desc.file;
2600  }
2601 
2602  if (errno == EMFILE || errno == ENFILE)
2603  {
2604  ereport(LOG,
2605  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2606  errmsg("out of file descriptors: %m; release and retry")));
2607  if (ReleaseLruFile())
2608  goto TryAgain;
2609  errno = save_errno;
2610  }
2611 
2612  return NULL;
2613 }
2614 
2615 /*
2616  * Free an AllocateDesc of any type.
2617  *
2618  * The argument *must* point into the allocatedDescs[] array.
2619  */
2620 static int
2622 {
2623  int result;
2624 
2625  /* Close the underlying object */
2626  switch (desc->kind)
2627  {
2628  case AllocateDescFile:
2629  result = fclose(desc->desc.file);
2630  break;
2631  case AllocateDescPipe:
2632  result = pclose(desc->desc.file);
2633  break;
2634  case AllocateDescDir:
2635  result = closedir(desc->desc.dir);
2636  break;
2637  case AllocateDescRawFD:
2638  result = close(desc->desc.fd);
2639  break;
2640  default:
2641  elog(ERROR, "AllocateDesc kind not recognized");
2642  result = 0; /* keep compiler quiet */
2643  break;
2644  }
2645 
2646  /* Compact storage in the allocatedDescs array */
2649 
2650  return result;
2651 }
2652 
2653 /*
2654  * Close a file returned by AllocateFile.
2655  *
2656  * Note we do not check fclose's return value --- it is up to the caller
2657  * to handle close errors.
2658  */
2659 int
2660 FreeFile(FILE *file)
2661 {
2662  int i;
2663 
2664  DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2665 
2666  /* Remove file from list of allocated files, if it's present */
2667  for (i = numAllocatedDescs; --i >= 0;)
2668  {
2669  AllocateDesc *desc = &allocatedDescs[i];
2670 
2671  if (desc->kind == AllocateDescFile && desc->desc.file == file)
2672  return FreeDesc(desc);
2673  }
2674 
2675  /* Only get here if someone passes us a file not in allocatedDescs */
2676  elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2677 
2678  return fclose(file);
2679 }
2680 
2681 /*
2682  * Close a file returned by OpenTransientFile.
2683  *
2684  * Note we do not check close's return value --- it is up to the caller
2685  * to handle close errors.
2686  */
2687 int
2689 {
2690  int i;
2691 
2692  DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2693 
2694  /* Remove fd from list of allocated files, if it's present */
2695  for (i = numAllocatedDescs; --i >= 0;)
2696  {
2697  AllocateDesc *desc = &allocatedDescs[i];
2698 
2699  if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2700  return FreeDesc(desc);
2701  }
2702 
2703  /* Only get here if someone passes us a file not in allocatedDescs */
2704  elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2705 
2706  return close(fd);
2707 }
2708 
2709 /*
2710  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2711  * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2712  * necessary to open the directory, and with closing it after an elog.
2713  * When done, call FreeDir rather than closedir.
2714  *
2715  * Returns NULL, with errno set, on failure. Note that failure detection
2716  * is commonly left to the following call of ReadDir or ReadDirExtended;
2717  * see the comments for ReadDir.
2718  *
2719  * Ideally this should be the *only* direct call of opendir() in the backend.
2720  */
2721 DIR *
2722 AllocateDir(const char *dirname)
2723 {
2724  DIR *dir;
2725 
2726  DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2727  numAllocatedDescs, dirname));
2728 
2729  /* Can we allocate another non-virtual FD? */
2730  if (!reserveAllocatedDesc())
2731  ereport(ERROR,
2732  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2733  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2734  maxAllocatedDescs, dirname)));
2735 
2736  /* Close excess kernel FDs. */
2737  ReleaseLruFiles();
2738 
2739 TryAgain:
2740  if ((dir = opendir(dirname)) != NULL)
2741  {
2743 
2744  desc->kind = AllocateDescDir;
2745  desc->desc.dir = dir;
2748  return desc->desc.dir;
2749  }
2750 
2751  if (errno == EMFILE || errno == ENFILE)
2752  {
2753  int save_errno = errno;
2754 
2755  ereport(LOG,
2756  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2757  errmsg("out of file descriptors: %m; release and retry")));
2758  errno = 0;
2759  if (ReleaseLruFile())
2760  goto TryAgain;
2761  errno = save_errno;
2762  }
2763 
2764  return NULL;
2765 }
2766 
2767 /*
2768  * Read a directory opened with AllocateDir, ereport'ing any error.
2769  *
2770  * This is easier to use than raw readdir() since it takes care of some
2771  * otherwise rather tedious and error-prone manipulation of errno. Also,
2772  * if you are happy with a generic error message for AllocateDir failure,
2773  * you can just do
2774  *
2775  * dir = AllocateDir(path);
2776  * while ((dirent = ReadDir(dir, path)) != NULL)
2777  * process dirent;
2778  * FreeDir(dir);
2779  *
2780  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2781  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2782  * use this shortcut.)
2783  *
2784  * The pathname passed to AllocateDir must be passed to this routine too,
2785  * but it is only used for error reporting.
2786  */
2787 struct dirent *
2788 ReadDir(DIR *dir, const char *dirname)
2789 {
2790  return ReadDirExtended(dir, dirname, ERROR);
2791 }
2792 
2793 /*
2794  * Alternate version of ReadDir that allows caller to specify the elevel
2795  * for any error report (whether it's reporting an initial failure of
2796  * AllocateDir or a subsequent directory read failure).
2797  *
2798  * If elevel < ERROR, returns NULL after any error. With the normal coding
2799  * pattern, this will result in falling out of the loop immediately as
2800  * though the directory contained no (more) entries.
2801  */
2802 struct dirent *
2803 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2804 {
2805  struct dirent *dent;
2806 
2807  /* Give a generic message for AllocateDir failure, if caller didn't */
2808  if (dir == NULL)
2809  {
2810  ereport(elevel,
2812  errmsg("could not open directory \"%s\": %m",
2813  dirname)));
2814  return NULL;
2815  }
2816 
2817  errno = 0;
2818  if ((dent = readdir(dir)) != NULL)
2819  return dent;
2820 
2821  if (errno)
2822  ereport(elevel,
2824  errmsg("could not read directory \"%s\": %m",
2825  dirname)));
2826  return NULL;
2827 }
2828 
2829 /*
2830  * Close a directory opened with AllocateDir.
2831  *
2832  * Returns closedir's return value (with errno set if it's not 0).
2833  * Note we do not check the return value --- it is up to the caller
2834  * to handle close errors if wanted.
2835  *
2836  * Does nothing if dir == NULL; we assume that directory open failure was
2837  * already reported if desired.
2838  */
2839 int
2841 {
2842  int i;
2843 
2844  /* Nothing to do if AllocateDir failed */
2845  if (dir == NULL)
2846  return 0;
2847 
2848  DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2849 
2850  /* Remove dir from list of allocated dirs, if it's present */
2851  for (i = numAllocatedDescs; --i >= 0;)
2852  {
2853  AllocateDesc *desc = &allocatedDescs[i];
2854 
2855  if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2856  return FreeDesc(desc);
2857  }
2858 
2859  /* Only get here if someone passes us a dir not in allocatedDescs */
2860  elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2861 
2862  return closedir(dir);
2863 }
2864 
2865 
2866 /*
2867  * Close a pipe stream returned by OpenPipeStream.
2868  */
2869 int
2870 ClosePipeStream(FILE *file)
2871 {
2872  int i;
2873 
2874  DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2875 
2876  /* Remove file from list of allocated files, if it's present */
2877  for (i = numAllocatedDescs; --i >= 0;)
2878  {
2879  AllocateDesc *desc = &allocatedDescs[i];
2880 
2881  if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2882  return FreeDesc(desc);
2883  }
2884 
2885  /* Only get here if someone passes us a file not in allocatedDescs */
2886  elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2887 
2888  return pclose(file);
2889 }
2890 
2891 /*
2892  * closeAllVfds
2893  *
2894  * Force all VFDs into the physically-closed state, so that the fewest
2895  * possible number of kernel file descriptors are in use. There is no
2896  * change in the logical state of the VFDs.
2897  */
2898 void
2900 {
2901  Index i;
2902 
2903  if (SizeVfdCache > 0)
2904  {
2905  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2906  for (i = 1; i < SizeVfdCache; i++)
2907  {
2908  if (!FileIsNotOpen(i))
2909  LruDelete(i);
2910  }
2911  }
2912 }
2913 
2914 
2915 /*
2916  * SetTempTablespaces
2917  *
2918  * Define a list (actually an array) of OIDs of tablespaces to use for
2919  * temporary files. This list will be used until end of transaction,
2920  * unless this function is called again before then. It is caller's
2921  * responsibility that the passed-in array has adequate lifespan (typically
2922  * it'd be allocated in TopTransactionContext).
2923  *
2924  * Some entries of the array may be InvalidOid, indicating that the current
2925  * database's default tablespace should be used.
2926  */
2927 void
2928 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2929 {
2930  Assert(numSpaces >= 0);
2931  tempTableSpaces = tableSpaces;
2932  numTempTableSpaces = numSpaces;
2933 
2934  /*
2935  * Select a random starting point in the list. This is to minimize
2936  * conflicts between backends that are most likely sharing the same list
2937  * of temp tablespaces. Note that if we create multiple temp files in the
2938  * same transaction, we'll advance circularly through the list --- this
2939  * ensures that large temporary sort files are nicely spread across all
2940  * available tablespaces.
2941  */
2942  if (numSpaces > 1)
2944  0, numSpaces - 1);
2945  else
2946  nextTempTableSpace = 0;
2947 }
2948 
2949 /*
2950  * TempTablespacesAreSet
2951  *
2952  * Returns true if SetTempTablespaces has been called in current transaction.
2953  * (This is just so that tablespaces.c doesn't need its own per-transaction
2954  * state.)
2955  */
2956 bool
2958 {
2959  return (numTempTableSpaces >= 0);
2960 }
2961 
2962 /*
2963  * GetTempTablespaces
2964  *
2965  * Populate an array with the OIDs of the tablespaces that should be used for
2966  * temporary files. (Some entries may be InvalidOid, indicating that the
2967  * current database's default tablespace should be used.) At most numSpaces
2968  * entries will be filled.
2969  * Returns the number of OIDs that were copied into the output array.
2970  */
2971 int
2972 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2973 {
2974  int i;
2975 
2977  for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2978  tableSpaces[i] = tempTableSpaces[i];
2979 
2980  return i;
2981 }
2982 
2983 /*
2984  * GetNextTempTableSpace
2985  *
2986  * Select the next temp tablespace to use. A result of InvalidOid means
2987  * to use the current database's default tablespace.
2988  */
2989 Oid
2991 {
2992  if (numTempTableSpaces > 0)
2993  {
2994  /* Advance nextTempTableSpace counter with wraparound */
2996  nextTempTableSpace = 0;
2998  }
2999  return InvalidOid;
3000 }
3001 
3002 
3003 /*
3004  * AtEOSubXact_Files
3005  *
3006  * Take care of subtransaction commit/abort. At abort, we close temp files
3007  * that the subtransaction may have opened. At commit, we reassign the
3008  * files that were opened to the parent subtransaction.
3009  */
3010 void
3011 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
3012  SubTransactionId parentSubid)
3013 {
3014  Index i;
3015 
3016  for (i = 0; i < numAllocatedDescs; i++)
3017  {
3018  if (allocatedDescs[i].create_subid == mySubid)
3019  {
3020  if (isCommit)
3021  allocatedDescs[i].create_subid = parentSubid;
3022  else
3023  {
3024  /* have to recheck the item after FreeDesc (ugly) */
3025  FreeDesc(&allocatedDescs[i--]);
3026  }
3027  }
3028  }
3029 }
3030 
3031 /*
3032  * AtEOXact_Files
3033  *
3034  * This routine is called during transaction commit or abort. All still-open
3035  * per-transaction temporary file VFDs are closed, which also causes the
3036  * underlying files to be deleted (although they should've been closed already
3037  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
3038  * closed. We also forget any transaction-local temp tablespace list.
3039  *
3040  * The isCommit flag is used only to decide whether to emit warnings about
3041  * unclosed files.
3042  */
3043 void
3044 AtEOXact_Files(bool isCommit)
3045 {
3046  CleanupTempFiles(isCommit, false);
3047  tempTableSpaces = NULL;
3048  numTempTableSpaces = -1;
3049 }
3050 
3051 /*
3052  * BeforeShmemExit_Files
3053  *
3054  * before_shmem_access hook to clean up temp files during backend shutdown.
3055  * Here, we want to clean up *all* temp files including interXact ones.
3056  */
3057 static void
3059 {
3060  CleanupTempFiles(false, true);
3061 
3062  /* prevent further temp files from being created */
3063 #ifdef USE_ASSERT_CHECKING
3064  temporary_files_allowed = false;
3065 #endif
3066 }
3067 
3068 /*
3069  * Close temporary files and delete their underlying files.
3070  *
3071  * isCommit: if true, this is normal transaction commit, and we don't
3072  * expect any remaining files; warn if there are some.
3073  *
3074  * isProcExit: if true, this is being called as the backend process is
3075  * exiting. If that's the case, we should remove all temporary files; if
3076  * that's not the case, we are being called for transaction commit/abort
3077  * and should only remove transaction-local temp files. In either case,
3078  * also clean up "allocated" stdio files, dirs and fds.
3079  */
3080 static void
3081 CleanupTempFiles(bool isCommit, bool isProcExit)
3082 {
3083  Index i;
3084 
3085  /*
3086  * Careful here: at proc_exit we need extra cleanup, not just
3087  * xact_temporary files.
3088  */
3089  if (isProcExit || have_xact_temporary_files)
3090  {
3091  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3092  for (i = 1; i < SizeVfdCache; i++)
3093  {
3094  unsigned short fdstate = VfdCache[i].fdstate;
3095 
3096  if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3097  VfdCache[i].fileName != NULL)
3098  {
3099  /*
3100  * If we're in the process of exiting a backend process, close
3101  * all temporary files. Otherwise, only close temporary files
3102  * local to the current transaction. They should be closed by
3103  * the ResourceOwner mechanism already, so this is just a
3104  * debugging cross-check.
3105  */
3106  if (isProcExit)
3107  FileClose(i);
3108  else if (fdstate & FD_CLOSE_AT_EOXACT)
3109  {
3110  elog(WARNING,
3111  "temporary file %s not closed at end-of-transaction",
3112  VfdCache[i].fileName);
3113  FileClose(i);
3114  }
3115  }
3116  }
3117 
3118  have_xact_temporary_files = false;
3119  }
3120 
3121  /* Complain if any allocated files remain open at commit. */
3122  if (isCommit && numAllocatedDescs > 0)
3123  elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3125 
3126  /* Clean up "allocated" stdio files, dirs and fds. */
3127  while (numAllocatedDescs > 0)
3128  FreeDesc(&allocatedDescs[0]);
3129 }
3130 
3131 
3132 /*
3133  * Remove temporary and temporary relation files left over from a prior
3134  * postmaster session
3135  *
3136  * This should be called during postmaster startup. It will forcibly
3137  * remove any leftover files created by OpenTemporaryFile and any leftover
3138  * temporary relation files created by mdcreate.
3139  *
3140  * During post-backend-crash restart cycle, this routine is called when
3141  * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3142  * queries are using temp files could result in useless storage usage that can
3143  * only be reclaimed by a service restart. The argument against enabling it is
3144  * that someone might want to examine the temporary files for debugging
3145  * purposes. This does however mean that OpenTemporaryFile had better allow for
3146  * collision with an existing temp file name.
3147  *
3148  * NOTE: this function and its subroutines generally report syscall failures
3149  * with ereport(LOG) and keep going. Removing temp files is not so critical
3150  * that we should fail to start the database when we can't do it.
3151  */
3152 void
3154 {
3155  char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3156  DIR *spc_dir;
3157  struct dirent *spc_de;
3158 
3159  /*
3160  * First process temp files in pg_default ($PGDATA/base)
3161  */
3162  snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3163  RemovePgTempFilesInDir(temp_path, true, false);
3164  RemovePgTempRelationFiles("base");
3165 
3166  /*
3167  * Cycle through temp directories for all non-default tablespaces.
3168  */
3169  spc_dir = AllocateDir("pg_tblspc");
3170 
3171  while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
3172  {
3173  if (strcmp(spc_de->d_name, ".") == 0 ||
3174  strcmp(spc_de->d_name, "..") == 0)
3175  continue;
3176 
3177  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
3179  RemovePgTempFilesInDir(temp_path, true, false);
3180 
3181  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
3183  RemovePgTempRelationFiles(temp_path);
3184  }
3185 
3186  FreeDir(spc_dir);
3187 
3188  /*
3189  * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3190  * DataDir as well. However, that is *not* cleaned here because doing so
3191  * would create a race condition. It's done separately, earlier in
3192  * postmaster startup.
3193  */
3194 }
3195 
3196 /*
3197  * Process one pgsql_tmp directory for RemovePgTempFiles.
3198  *
3199  * If missing_ok is true, it's all right for the named directory to not exist.
3200  * Any other problem results in a LOG message. (missing_ok should be true at
3201  * the top level, since pgsql_tmp directories are not created until needed.)
3202  *
3203  * At the top level, this should be called with unlink_all = false, so that
3204  * only files matching the temporary name prefix will be unlinked. When
3205  * recursing it will be called with unlink_all = true to unlink everything
3206  * under a top-level temporary directory.
3207  *
3208  * (These two flags could be replaced by one, but it seems clearer to keep
3209  * them separate.)
3210  */
3211 void
3212 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3213 {
3214  DIR *temp_dir;
3215  struct dirent *temp_de;
3216  char rm_path[MAXPGPATH * 2];
3217 
3218  temp_dir = AllocateDir(tmpdirname);
3219 
3220  if (temp_dir == NULL && errno == ENOENT && missing_ok)
3221  return;
3222 
3223  while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3224  {
3225  if (strcmp(temp_de->d_name, ".") == 0 ||
3226  strcmp(temp_de->d_name, "..") == 0)
3227  continue;
3228 
3229  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3230  tmpdirname, temp_de->d_name);
3231 
3232  if (unlink_all ||
3233  strncmp(temp_de->d_name,
3235  strlen(PG_TEMP_FILE_PREFIX)) == 0)
3236  {
3237  struct stat statbuf;
3238 
3239  if (lstat(rm_path, &statbuf) < 0)
3240  {
3241  ereport(LOG,
3243  errmsg("could not stat file \"%s\": %m", rm_path)));
3244  continue;
3245  }
3246 
3247  if (S_ISDIR(statbuf.st_mode))
3248  {
3249  /* recursively remove contents, then directory itself */
3250  RemovePgTempFilesInDir(rm_path, false, true);
3251 
3252  if (rmdir(rm_path) < 0)
3253  ereport(LOG,
3255  errmsg("could not remove directory \"%s\": %m",
3256  rm_path)));
3257  }
3258  else
3259  {
3260  if (unlink(rm_path) < 0)
3261  ereport(LOG,
3263  errmsg("could not remove file \"%s\": %m",
3264  rm_path)));
3265  }
3266  }
3267  else
3268  ereport(LOG,
3269  (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3270  rm_path)));
3271  }
3272 
3273  FreeDir(temp_dir);
3274 }
3275 
3276 /* Process one tablespace directory, look for per-DB subdirectories */
3277 static void
3278 RemovePgTempRelationFiles(const char *tsdirname)
3279 {
3280  DIR *ts_dir;
3281  struct dirent *de;
3282  char dbspace_path[MAXPGPATH * 2];
3283 
3284  ts_dir = AllocateDir(tsdirname);
3285 
3286  while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3287  {
3288  /*
3289  * We're only interested in the per-database directories, which have
3290  * numeric names. Note that this code will also (properly) ignore "."
3291  * and "..".
3292  */
3293  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3294  continue;
3295 
3296  snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3297  tsdirname, de->d_name);
3298  RemovePgTempRelationFilesInDbspace(dbspace_path);
3299  }
3300 
3301  FreeDir(ts_dir);
3302 }
3303 
3304 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3305 static void
3306 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3307 {
3308  DIR *dbspace_dir;
3309  struct dirent *de;
3310  char rm_path[MAXPGPATH * 2];
3311 
3312  dbspace_dir = AllocateDir(dbspacedirname);
3313 
3314  while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3315  {
3316  if (!looks_like_temp_rel_name(de->d_name))
3317  continue;
3318 
3319  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3320  dbspacedirname, de->d_name);
3321 
3322  if (unlink(rm_path) < 0)
3323  ereport(LOG,
3325  errmsg("could not remove file \"%s\": %m",
3326  rm_path)));
3327  }
3328 
3329  FreeDir(dbspace_dir);
3330 }
3331 
/*
 * looks_like_temp_rel_name -- does "name" have the form of a temporary
 * relation file name?
 *
 * Accepted forms are
 *		t<digits>_<digits>
 *		t<digits>_<digits>_<forkname>
 * each optionally followed by ".<digits>" (a segment suffix).
 *
 * Returns true only if the whole string matches; any trailing characters
 * cause rejection.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	int			pos;
	int			savepos;

	/* Must start with "t". */
	if (name[0] != 't')
		return false;

	/* Followed by a non-empty string of digits and then an underscore. */
	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (pos == 1 || name[pos] != '_')
		return false;

	/* Followed by another nonempty string of digits. */
	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (savepos == pos)
		return false;

	/* We might have _forkname or .segment or both. */
	if (name[pos] == '_')
	{
		/* a result <= 0 means no fork name was recognized after the '_' */
		int			forkchar = forkname_chars(&name[pos + 1], NULL);

		if (forkchar <= 0)
			return false;
		pos += forkchar + 1;	/* skip the '_' plus the fork name */
	}
	if (name[pos] == '.')
	{
		int			segchar;

		/* segchar counts the '.' itself, so it must exceed 1 */
		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
			;
		if (segchar <= 1)
			return false;
		pos += segchar;
	}

	/* Now we should be at the end. */
	if (name[pos] != '\0')
		return false;
	return true;
}
3380 
#ifdef HAVE_SYNCFS
/*
 * do_syncfs -- call syncfs() on the file system containing "path".
 *
 * Reports startup progress first, then opens the path read-only and calls
 * syncfs() on the descriptor.  Failure to open or to sync is reported at
 * LOG level only; the descriptor is always closed once it has been opened.
 */
static void
do_syncfs(const char *path)
{
	int			fd;

	ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
							 path);

	fd = OpenTransientFile(path, O_RDONLY);
	if (fd < 0)
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
		return;
	}
	if (syncfs(fd) < 0)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not synchronize file system for file \"%s\": %m", path)));
	CloseTransientFile(fd);
}
#endif
3405 
3406 /*
3407  * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3408  * all potential filesystem, depending on recovery_init_sync_method setting.
3409  *
3410  * We fsync regular files and directories wherever they are, but we
3411  * follow symlinks only for pg_wal and immediately under pg_tblspc.
3412  * Other symlinks are presumed to point at files we're not responsible
3413  * for fsyncing, and might not have privileges to write at all.
3414  *
3415  * Errors are logged but not considered fatal; that's because this is used
3416  * only during database startup, to deal with the possibility that there are
3417  * issued-but-unsynced writes pending against the data directory. We want to
3418  * ensure that such writes reach disk before anything that's done in the new
3419  * run. However, aborting on error would result in failure to start for
3420  * harmless cases such as read-only files in the data directory, and that's
3421  * not good either.
3422  *
3423  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3424  * rewriting all changes again during recovery.
3425  *
3426  * Note we assume we're chdir'd into PGDATA to begin with.
3427  */
3428 void
3430 {
3431  bool xlog_is_symlink;
3432 
3433  /* We can skip this whole thing if fsync is disabled. */
3434  if (!enableFsync)
3435  return;
3436 
3437  /*
3438  * If pg_wal is a symlink, we'll need to recurse into it separately,
3439  * because the first walkdir below will ignore it.
3440  */
3441  xlog_is_symlink = false;
3442 
3443 #ifndef WIN32
3444  {
3445  struct stat st;
3446 
3447  if (lstat("pg_wal", &st) < 0)
3448  ereport(LOG,
3450  errmsg("could not stat file \"%s\": %m",
3451  "pg_wal")));
3452  else if (S_ISLNK(st.st_mode))
3453  xlog_is_symlink = true;
3454  }
3455 #else
3456  if (pgwin32_is_junction("pg_wal"))
3457  xlog_is_symlink = true;
3458 #endif
3459 
3460 #ifdef HAVE_SYNCFS
3462  {
3463  DIR *dir;
3464  struct dirent *de;
3465 
3466  /*
3467  * On Linux, we don't have to open every single file one by one. We
3468  * can use syncfs() to sync whole filesystems. We only expect
3469  * filesystem boundaries to exist where we tolerate symlinks, namely
3470  * pg_wal and the tablespaces, so we call syncfs() for each of those
3471  * directories.
3472  */
3473 
3474  /* Prepare to report progress syncing the data directory via syncfs. */
3476 
3477  /* Sync the top level pgdata directory. */
3478  do_syncfs(".");
3479  /* If any tablespaces are configured, sync each of those. */
3480  dir = AllocateDir("pg_tblspc");
3481  while ((de = ReadDirExtended(dir, "pg_tblspc", LOG)))
3482  {
3483  char path[MAXPGPATH];
3484 
3485  if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3486  continue;
3487 
3488  snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
3489  do_syncfs(path);
3490  }
3491  FreeDir(dir);
3492  /* If pg_wal is a symlink, process that too. */
3493  if (xlog_is_symlink)
3494  do_syncfs("pg_wal");
3495  return;
3496  }
3497 #endif /* !HAVE_SYNCFS */
3498 
3499 #ifdef PG_FLUSH_DATA_WORKS
3500  /* Prepare to report progress of the pre-fsync phase. */
3502 
3503  /*
3504  * If possible, hint to the kernel that we're soon going to fsync the data
3505  * directory and its contents. Errors in this step are even less
3506  * interesting than normal, so log them only at DEBUG1.
3507  */
3508  walkdir(".", pre_sync_fname, false, DEBUG1);
3509  if (xlog_is_symlink)
3510  walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3511  walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3512 #endif
3513 
3514  /* Prepare to report progress syncing the data directory via fsync. */
3516 
3517  /*
3518  * Now we do the fsync()s in the same order.
3519  *
3520  * The main call ignores symlinks, so in addition to specially processing
3521  * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3522  * process_symlinks = true. Note that if there are any plain directories
3523  * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3524  * so we don't worry about optimizing it.
3525  */
3526  walkdir(".", datadir_fsync_fname, false, LOG);
3527  if (xlog_is_symlink)
3528  walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3529  walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3530 }
3531 
3532 /*
3533  * walkdir: recursively walk a directory, applying the action to each
3534  * regular file and directory (including the named directory itself).
3535  *
3536  * If process_symlinks is true, the action and recursion are also applied
3537  * to regular files and directories that are pointed to by symlinks in the
3538  * given directory; otherwise symlinks are ignored. Symlinks are always
3539  * ignored in subdirectories, ie we intentionally don't pass down the
3540  * process_symlinks flag to recursive calls.
3541  *
3542  * Errors are reported at level elevel, which might be ERROR or less.
3543  *
3544  * See also walkdir in file_utils.c, which is a frontend version of this
3545  * logic.
3546  */
3547 static void
3548 walkdir(const char *path,
3549  void (*action) (const char *fname, bool isdir, int elevel),
3550  bool process_symlinks,
3551  int elevel)
3552 {
3553  DIR *dir;
3554  struct dirent *de;
3555 
3556  dir = AllocateDir(path);
3557 
3558  while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3559  {
3560  char subpath[MAXPGPATH * 2];
3561 
3563 
3564  if (strcmp(de->d_name, ".") == 0 ||
3565  strcmp(de->d_name, "..") == 0)
3566  continue;
3567 
3568  snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3569 
3570  switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3571  {
3572  case PGFILETYPE_REG:
3573  (*action) (subpath, false, elevel);
3574  break;
3575  case PGFILETYPE_DIR:
3576  walkdir(subpath, action, false, elevel);
3577  break;
3578  default:
3579 
3580  /*
3581  * Errors are already reported directly by get_dirent_type(),
3582  * and any remaining symlinks and unknown file types are
3583  * ignored.
3584  */
3585  break;
3586  }
3587  }
3588 
3589  FreeDir(dir); /* we ignore any error here */
3590 
3591  /*
3592  * It's important to fsync the destination directory itself as individual
3593  * file fsyncs don't guarantee that the directory entry for the file is
3594  * synced. However, skip this if AllocateDir failed; the action function
3595  * might not be robust against that.
3596  */
3597  if (dir)
3598  (*action) (path, true, elevel);
3599 }
3600 
3601 
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Directories are skipped.  Ignores errors trying to open unreadable files
 * (EACCES), and logs other open/close errors at the caller-specified level.
 * The signature matches the action callback expected by walkdir().
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* unreadable files are silently skipped */
		if (errno == EACCES)
			return;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3647 
/*
 * walkdir() action: report startup progress for "fname", then fsync it via
 * fsync_fname_ext().
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	/*
	 * Errors about unreadable files should be ignored silently, so ask
	 * fsync_fname_ext() for that with ignore_perm = true.  Its result is
	 * deliberately not examined.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3660 
/*
 * Remove "fname" if it exists; the signature matches the walkdir() action
 * callback.
 *
 * Directories are removed with rmdir(); a failure other than ENOENT is
 * reported at elevel.  Plain files are handed to
 * PathNameDeleteTemporaryFile() with error_on_failure = false.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}
3677 
3678 /*
3679  * fsync_fname_ext -- Try to fsync a file or directory
3680  *
3681  * If ignore_perm is true, ignore errors upon trying to open unreadable
3682  * files. Logs other errors at a caller-specified level.
3683  *
3684  * Returns 0 if the operation succeeded, -1 otherwise.
3685  */
3686 int
3687 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3688 {
3689  int fd;
3690  int flags;
3691  int returncode;
3692 
3693  /*
3694  * Some OSs require directories to be opened read-only whereas other
3695  * systems don't allow us to fsync files opened read-only; so we need both
3696  * cases here. Using O_RDWR will cause us to fail to fsync files that are
3697  * not writable by our userid, but we assume that's OK.
3698  */
3699  flags = PG_BINARY;
3700  if (!isdir)
3701  flags |= O_RDWR;
3702  else
3703  flags |= O_RDONLY;
3704 
3705  fd = OpenTransientFile(fname, flags);
3706 
3707  /*
3708  * Some OSs don't allow us to open directories at all (Windows returns
3709  * EACCES), just ignore the error in that case. If desired also silently
3710  * ignoring errors about unreadable files. Log others.
3711  */
3712  if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3713  return 0;
3714  else if (fd < 0 && ignore_perm && errno == EACCES)
3715  return 0;
3716  else if (fd < 0)
3717  {
3718  ereport(elevel,
3720  errmsg("could not open file \"%s\": %m", fname)));
3721  return -1;
3722  }
3723 
3724  returncode = pg_fsync(fd);
3725 
3726  /*
3727  * Some OSes don't allow us to fsync directories at all, so we can ignore
3728  * those errors. Anything else needs to be logged.
3729  */
3730  if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3731  {
3732  int save_errno;
3733 
3734  /* close file upon error, might not be in transaction context */
3735  save_errno = errno;
3736  (void) CloseTransientFile(fd);
3737  errno = save_errno;
3738 
3739  ereport(elevel,
3741  errmsg("could not fsync file \"%s\": %m", fname)));
3742  return -1;
3743  }
3744 
3745  if (CloseTransientFile(fd) != 0)
3746  {
3747  ereport(elevel,
3749  errmsg("could not close file \"%s\": %m", fname)));
3750  return -1;
3751  }
3752 
3753  return 0;
3754 }
3755 
3756 /*
3757  * fsync_parent_path -- fsync the parent path of a file or directory
3758  *
3759  * This is aimed at making file operations persistent on disk in case of
3760  * an OS crash or power failure.
3761  */
3762 static int
3763 fsync_parent_path(const char *fname, int elevel)
3764 {
3765  char parentpath[MAXPGPATH];
3766 
3767  strlcpy(parentpath, fname, MAXPGPATH);
3768  get_parent_directory(parentpath);
3769 
3770  /*
3771  * get_parent_directory() returns an empty string if the input argument is
3772  * just a file name (see comments in path.c), so handle that as being the
3773  * current directory.
3774  */
3775  if (strlen(parentpath) == 0)
3776  strlcpy(parentpath, ".", MAXPGPATH);
3777 
3778  if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3779  return -1;
3780 
3781  return 0;
3782 }
3783 
3784 /*
3785  * Create a PostgreSQL data sub-directory
3786  *
3787  * The data directory itself, and most of its sub-directories, are created at
3788  * initdb time, but we do have some occasions when we create directories in
3789  * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3790  * make sure that those directories are created consistently. Today, that means
3791  * making sure that the created directory has the correct permissions, which is
3792  * what pg_dir_create_mode tracks for us.
3793  *
3794  * Note that we also set the umask() based on what we understand the correct
3795  * permissions to be (see file_perm.c).
3796  *
3797  * For permissions other than the default, mkdir() can be used directly, but
3798  * be sure to consider carefully such cases -- a sub-directory with incorrect
3799  * permissions in a PostgreSQL data directory could cause backups and other
3800  * processes to fail.
3801  */
3802 int
3803 MakePGDirectory(const char *directoryName)
3804 {
3805  return mkdir(directoryName, pg_dir_create_mode);
3806 }
3807 
3808 /*
3809  * Return the passed-in error level, or PANIC if data_sync_retry is off.
3810  *
3811  * Failure to fsync any data file is cause for immediate panic, unless
3812  * data_sync_retry is enabled. Data may have been written to the operating
3813  * system and removed from our buffer pool already, and if we are running on
3814  * an operating system that forgets dirty data on write-back failure, there
3815  * may be only one copy of the data remaining: in the WAL. A later attempt to
3816  * fsync again might falsely report success. Therefore we must not allow any
3817  * further checkpoints to be attempted. data_sync_retry can in theory be
3818  * enabled on systems known not to drop dirty buffered data on write-back
3819  * failure (with the likely outcome that checkpoints will continue to fail
3820  * until the underlying problem is fixed).
3821  *
3822  * Any code that reports a failure from fsync() or related functions should
3823  * filter the error level with this function.
3824  */
3825 int
3826 data_sync_elevel(int elevel)
3827 {
3828  return data_sync_retry ? elevel : PANIC;
3829 }
3830 
3831 /*
3832  * A convenience wrapper for pg_pwritev() that retries on partial write. If an
3833  * error is returned, it is unspecified how much has been written.
3834  */
3835 ssize_t
3836 pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
3837 {
3838  struct iovec iov_copy[PG_IOV_MAX];
3839  ssize_t sum = 0;
3840  ssize_t part;
3841 
3842  /* We'd better have space to make a copy, in case we need to retry. */
3843  if (iovcnt > PG_IOV_MAX)
3844  {
3845  errno = EINVAL;
3846  return -1;
3847  }
3848 
3849  for (;;)
3850  {
3851  /* Write as much as we can. */
3852  part = pg_pwritev(fd, iov, iovcnt, offset);
3853  if (part < 0)
3854  return -1;
3855 
3856 #ifdef SIMULATE_SHORT_WRITE
3857  part = Min(part, 4096);
3858 #endif
3859 
3860  /* Count our progress. */
3861  sum += part;
3862  offset += part;
3863 
3864  /* Step over iovecs that are done. */
3865  while (iovcnt > 0 && iov->iov_len <= part)
3866  {
3867  part -= iov->iov_len;
3868  ++iov;
3869  --iovcnt;
3870  }
3871 
3872  /* Are they all done? */
3873  if (iovcnt == 0)
3874  {
3875  /* We don't expect the kernel to write more than requested. */
3876  Assert(part == 0);
3877  break;
3878  }
3879 
3880  /*
3881  * Move whatever's left to the front of our mutable copy and adjust
3882  * the leading iovec.
3883  */
3884  Assert(iovcnt > 0);
3885  memmove(iov_copy, iov, sizeof(*iov) * iovcnt);
3886  Assert(iov->iov_len > part);
3887  iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part;
3888  iov_copy[0].iov_len -= part;
3889  iov = iov_copy;
3890  }
3891 
3892  return sum;
3893 }
void begin_startup_progress_phase(void)
Definition: startup.c:320
unsigned int uint32
Definition: c.h:441
#define Min(x, y)
Definition: c.h:986
uint32 SubTransactionId
Definition: c.h:591
#define INT64_FORMAT
Definition: c.h:483
#define PG_BINARY
Definition: c.h:1268
unsigned int Index
Definition: c.h:549
#define MemSet(start, val, len)
Definition: c.h:1008
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:918
#define OidIsValid(objectId)
Definition: c.h:710
size_t Size
Definition: c.h:540
int closedir(DIR *)
Definition: dirent.c:123
struct dirent * readdir(DIR *)
Definition: dirent.c:78
DIR * opendir(const char *)
Definition: dirent.c:33
int errcode_for_file_access(void)
Definition: elog.c:716
int errdetail(const char *fmt,...)
Definition: elog.c:1037
int errcode(int sqlerrcode)
Definition: elog.c:693
int errmsg(const char *fmt,...)
Definition: elog.c:904
#define LOG
Definition: elog.h:25
#define FATAL
Definition: elog.h:35
#define WARNING
Definition: elog.h:30
#define DEBUG2
Definition: elog.h:23
#define PANIC
Definition: elog.h:36
#define DEBUG1
Definition: elog.h:24
#define ERROR
Definition: elog.h:33
#define elog(elevel,...)
Definition: elog.h:218
#define ereport(elevel,...)
Definition: elog.h:143
const char * name
Definition: encode.c:561
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2788
int max_files_per_process
Definition: fd.c:147
void pg_flush_data(int fd, off_t offset, off_t nbytes)
Definition: fd.c:469
int FileGetRawDesc(File file)
Definition: fd.c:2355
int MakePGDirectory(const char *directoryName)
Definition: fd.c:3803
int FreeDir(DIR *dir)
Definition: fd.c:2840
int recovery_init_sync_method
Definition: fd.c:166
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:2089
int pg_fsync_no_writethrough(int fd)
Definition: fd.c:414
#define FD_MINFREE
Definition: fd.c:139
static int numTempTableSpaces
Definition: fd.c:287
static bool ReleaseLruFile(void)
Definition: fd.c:1373
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2461
#define FD_DELETE_AT_CLOSE
Definition: fd.c:190
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1093
static int maxAllocatedDescs
Definition: fd.c:266
static void Delete(File file)
Definition: fd.c:1259
static int FreeDesc(AllocateDesc *desc)
Definition: fd.c:2621
static long tempFileCounter
Definition: fd.c:278
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:699
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2972
int durable_rename_excl(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:829
static int numAllocatedDescs
Definition: fd.c:265
File PathNameOpenTemporaryFile(const char *path, int mode)
Definition: fd.c:1884
static void LruDelete(File file)
Definition: fd.c:1278
int pg_fdatasync(int fd)
Definition: fd.c:449
#define FileIsValid(file)
Definition: fd.c:184
int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
Definition: fd.c:2061
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2266
static int nfile
Definition: fd.c:220
int CloseTransientFile(int fd)
Definition: fd.c:2688
#define DO_DB(A)
Definition: fd.c:178
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1071
void closeAllVfds(void)
Definition: fd.c:2899
int max_safe_fds
Definition: fd.c:160
static File AllocateVfd(void)
Definition: fd.c:1405
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1844
void PathNameDeleteTemporaryDir(const char *dirname)
Definition: fd.c:1674
int ClosePipeStream(FILE *file)
Definition: fd.c:2870
void AtEOXact_Files(bool isCommit)
Definition: fd.c:3044
int FileGetRawFlags(File file)
Definition: fd.c:2365
static Size SizeVfdCache
Definition: fd.c:215
static int nextTempTableSpace
Definition: fd.c:288
#define FD_CLOSE_AT_EOXACT
Definition: fd.c:191
int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition: fd.c:3687
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3662
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition: fd.c:3278
int FreeFile(FILE *file)
Definition: fd.c:2660
mode_t FileGetRawMode(File file)
Definition: fd.c:2375
static AllocateDesc * allocatedDescs
Definition: fd.c:267
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition: fd.c:944
static int FileAccess(File file)
Definition: fd.c:1483
static void FreeVfd(File file)
Definition: fd.c:1463
struct vfd Vfd
int pg_fsync_writethrough(int fd)
Definition: fd.c:426
void FileClose(File file)
Definition: fd.c:1961
FILE * OpenPipeStream(const char *command, const char *mode)
Definition: fd.c:2564
void ReleaseExternalFD(void)
Definition: fd.c:1230
#define FD_TEMP_FILE_LIMIT
Definition: fd.c:192
void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition: fd.c:3212
void RemovePgTempFiles(void)
Definition: fd.c:3153
#define FileIsNotOpen(file)
Definition: fd.c:187
bool TempTablespacesAreSet(void)
Definition: fd.c:2957
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:673
int data_sync_elevel(int elevel)
Definition: fd.c:3826
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1566
static void Insert(File file)
Definition: fd.c:1304
AllocateDescKind
Definition: fd.c:246
@ AllocateDescDir
Definition: fd.c:249
@ AllocateDescPipe
Definition: fd.c:248
@ AllocateDescFile
Definition: fd.c:247
@ AllocateDescRawFD
Definition: fd.c:250
Oid GetNextTempTableSpace(void)
Definition: fd.c:2990
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1579
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3649
static void ReportTemporaryFileUsage(const char *path, off_t size)
Definition: fd.c:1519
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition: fd.c:1787
bool AcquireExternalFD(void)
Definition: fd.c:1177
static void RegisterTemporaryFile(File file)
Definition: fd.c:1538
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2803
#define NUM_RESERVED_FDS
Definition: fd.c:130
static Oid * tempTableSpaces
Definition: fd.c:286
int FileWrite(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:2168
static bool reserveAllocatedDesc(void)
Definition: fd.c:2386
void InitFileAccess(void)
Definition: fd.c:883
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition: fd.c:3306
File OpenTemporaryFile(bool interXact)
Definition: fd.c:1707
int durable_unlink(const char *fname, int elevel)
Definition: fd.c:789
static uint64 temporary_files_size
Definition: fd.c:234
void ReserveExternalFD(void)
Definition: fd.c:1212
char * FilePathName(File file)
Definition: fd.c:2339
bool looks_like_temp_rel_name(const char *name)
Definition: fd.c:3334
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1915
void set_max_safe_fds(void)
Definition: fd.c:1028
int pg_fsync(int fd)
Definition: fd.c:359
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition: fd.c:3081
#define VFD_CLOSED
Definition: fd.c:182
static bool have_xact_temporary_files
Definition: fd.c:226
static int LruInsert(File file)
Definition: fd.c:1326
static int numExternalFDs
Definition: fd.c:272
static int fsync_parent_path(const char *fname, int elevel)
Definition: fd.c:3763
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition: fd.c:1643
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition: fd.c:3011
int FileRead(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:2112
ssize_t pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
Definition: fd.c:3836
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2511
void InitTemporaryFileAccess(void)
Definition: fd.c:913
static Vfd * VfdCache
Definition: fd.c:214
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:2520
bool data_sync_retry
Definition: fd.c:163
static void ReleaseLruFiles(void)
Definition: fd.c:1395
void SyncDataDirectory(void)
Definition: fd.c:3429
off_t FileSize(File file)
Definition: fd.c:2287
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2304
static void BeforeShmemExit_Files(int code, Datum arg)
Definition: fd.c:3058
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition: fd.c:3548
int pg_truncate(const char *path, off_t length)
Definition: fd.c:642
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2928
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2722
void TempTablespacePath(char *path, Oid tablespace)
Definition: fd.c:1762
int File
Definition: fd.h:54
@ RECOVERY_INIT_SYNC_METHOD_SYNCFS
Definition: fd.h:51
@ RECOVERY_INIT_SYNC_METHOD_FSYNC
Definition: fd.h:50
#define PG_O_DIRECT
Definition: fd.h:95
int pg_file_create_mode
Definition: file_perm.c:19
int pg_dir_create_mode
Definition: file_perm.c:18
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:410
@ PGFILETYPE_DIR
Definition: file_utils.h:23
@ PGFILETYPE_REG
Definition: file_utils.h:22
int MyProcPid
Definition: globals.c:43
bool enableFsync
Definition: globals.c:122
Oid MyDatabaseTableSpace
Definition: globals.c:90
int temp_file_limit
Definition: guc.c:611
int log_temp_files
Definition: guc.c:604
#define realloc(a, b)
Definition: header.h:60
#define free(a)
Definition: header.h:65
#define malloc(a)
Definition: header.h:50
#define close(a)
Definition: win32.h:12
void before_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:333
int j
Definition: isn.c:74
int i
Definition: isn.c:73
static void const char fflush(stdout)
Assert(fmt[strlen(fmt) - 1] !='\n')
Datum subpath(PG_FUNCTION_ARGS)
Definition: ltree_op.c:241
void pfree(void *pointer)
Definition: mcxt.c:1169
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1182
void * palloc(Size size)
Definition: mcxt.c:1062
#define MAP_FAILED
Definition: mem.h:45
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:120
void * arg
static char * basedir
static PgChecksumMode mode
Definition: pg_checksums.c:65
#define PG_TEMP_FILES_DIR
Definition: pg_checksums.c:62
#define PG_TEMP_FILE_PREFIX
Definition: pg_checksums.c:63
#define MAXPGPATH
ssize_t pg_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
Definition: pwritev.c:29
#define PG_IOV_MAX
Definition: pg_iovec.h:40
uint64 pg_prng_uint64_range(pg_prng_state *state, uint64 rmin, uint64 rmax)
Definition: pg_prng.c:138
pg_prng_state pg_global_prng_state
Definition: pg_prng.c:28
static char * buf
Definition: pg_test_fsync.c:70
char * tablespace
Definition: pgbench.c:227
void pgstat_report_tempfile(size_t filesize)
Definition: pgstat.c:1810
void get_parent_directory(char *path)
Definition: path.c:856
ssize_t pg_pwrite(int fd, const void *buf, size_t nbyte, off_t offset)
Definition: pwrite.c:27
int link(const char *src, const char *dst)
ssize_t pg_pread(int fd, void *buf, size_t nbyte, off_t offset)
Definition: pread.c:27
#define snprintf
Definition: port.h:225
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
uintptr_t Datum
Definition: postgres.h:411
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
static int fd(const char *x, int i)
Definition: preproc-init.c:105
int forkname_chars(const char *str, ForkNumber *fork)
Definition: relpath.c:81
#define TABLESPACE_VERSION_DIRECTORY
Definition: relpath.h:26
void ResourceOwnerEnlargeFiles(ResourceOwner owner)
Definition: resowner.c:1286
void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition: resowner.c:1306
ResourceOwner CurrentResourceOwner
Definition: resowner.c:146
void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition: resowner.c:1297
void pg_usleep(long microsec)
Definition: signal.c:53
pqsigfunc pqsignal(int signum, pqsigfunc handler)
Definition: signal.c:180
static void error(void)
Definition: sql-dyntest.c:147
#define ereport_startup_progress(msg,...)
Definition: startup.h:18
SubTransactionId create_subid
Definition: fd.c:256
DIR * dir
Definition: fd.c:260
FILE * file
Definition: fd.c:259
int fd
Definition: fd.c:261
union AllocateDesc::@17 desc
AllocateDescKind kind
Definition: fd.c:255
Definition: dirent.c:26
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
Definition: pg_iovec.h:25
void * iov_base
Definition: pg_iovec.h:26
size_t iov_len
Definition: pg_iovec.h:27
__int64 st_size
Definition: win32_port.h:273
unsigned short st_mode
Definition: win32_port.h:268
Definition: fd.c:195
int fd
Definition: fd.c:196
int fileFlags
Definition: fd.c:205
File lruLessRecently
Definition: fd.c:201
File lruMoreRecently
Definition: fd.c:200
char * fileName
Definition: fd.c:203
ResourceOwner resowner
Definition: fd.c:198
unsigned short fdstate
Definition: fd.c:197
File nextFree
Definition: fd.c:199
mode_t fileMode
Definition: fd.c:206
off_t fileSize
Definition: fd.c:202
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:262
static void pgstat_report_wait_end(void)
Definition: wait_event.h:278
#define fsync(fd)
Definition: win32_port.h:76
#define stat
Definition: win32_port.h:283
#define SIG_DFL
Definition: win32_port.h:162
#define EINTR
Definition: win32_port.h:351
#define SIGPIPE
Definition: win32_port.h:172
#define lstat(path, sb)
Definition: win32_port.h:284
#define S_ISDIR(m)
Definition: win32_port.h:324
void _dosmaperr(unsigned long)
Definition: win32error.c:171
#define mkdir(a, b)
Definition: win32_port.h:71
#define fstat
Definition: win32_port.h:282
bool pgwin32_is_junction(const char *path)
#define ftruncate(a, b)
Definition: win32_port.h:73
#define SIG_IGN
Definition: win32_port.h:164
#define O_DSYNC
Definition: win32_port.h:336
SubTransactionId GetCurrentSubTransactionId(void)
Definition: xact.c:775
int sync_method
Definition: xlog.c:111
#define SYNC_METHOD_FSYNC_WRITETHROUGH
Definition: xlog.h:28
static const char * directory
Definition: zic.c:632