PostgreSQL Source Code  git master
fd.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  * Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have. (This is around 1024 on many modern
20  * operating systems, but may be lower on others.)
21  *
22  * VFDs are managed as an LRU pool, with actual OS file descriptors
23  * being opened and closed as needed. Obviously, if a routine is
24  * opened using these interfaces, all subsequent operations must also
25  * be through these interfaces (the File type is not a real file
26  * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
39  * for a long time, like relation files. It is the caller's responsibility
40  * to close them, there is no automatic mechanism in fd.c for that.
41  *
42  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43  * temporary files that have names so that they can be shared between
44  * backends. Such files are automatically closed and count against the
45  * temporary file limit of the backend that creates them, but unlike anonymous
46  * files they are not automatically deleted. See sharedfileset.c for a shared
47  * ownership mechanism that provides automatic cleanup for shared files when
48  * the last of a group of backends detaches.
49  *
50  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52  * They behave like the corresponding native functions, except that the handle
53  * is registered with the current subtransaction, and will be automatically
54  * closed at abort. These are intended mainly for short operations like
55  * reading a configuration file; there is a limit on the number of files that
56  * can be opened using these functions at any one time.
57  *
58  * Finally, BasicOpenFile is just a thin wrapper around open() that can
59  * release file descriptors in use by the virtual file descriptors if
60  * necessary. There is no automatic cleanup of file descriptors returned by
61  * BasicOpenFile, it is solely the caller's responsibility to close the file
62  * descriptor by calling close(2).
63  *
64  * If a non-virtual file descriptor needs to be held open for any length of
65  * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66  * (and eventually ReleaseExternalFD), so that we can take it into account
67  * while deciding how many VFDs can be open. This applies to FDs obtained
68  * with BasicOpenFile as well as those obtained without use of any fd.c API.
69  *
70  *-------------------------------------------------------------------------
71  */
72 
73 #include "postgres.h"
74 
75 #include <dirent.h>
76 #include <sys/file.h>
77 #include <sys/param.h>
78 #include <sys/stat.h>
79 #include <sys/types.h>
80 #ifndef WIN32
81 #include <sys/mman.h>
82 #endif
83 #include <limits.h>
84 #include <unistd.h>
85 #include <fcntl.h>
86 #ifdef HAVE_SYS_RESOURCE_H
87 #include <sys/resource.h> /* for getrlimit */
88 #endif
89 
90 #include "access/xact.h"
91 #include "access/xlog.h"
92 #include "catalog/pg_tablespace.h"
93 #include "common/file_perm.h"
94 #include "common/file_utils.h"
95 #include "miscadmin.h"
96 #include "pgstat.h"
97 #include "port/pg_iovec.h"
98 #include "portability/mem.h"
99 #include "storage/fd.h"
100 #include "storage/ipc.h"
101 #include "utils/guc.h"
102 #include "utils/resowner_private.h"
103 
104 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
105 #if defined(HAVE_SYNC_FILE_RANGE)
106 #define PG_FLUSH_DATA_WORKS 1
107 #elif !defined(WIN32) && defined(MS_ASYNC)
108 #define PG_FLUSH_DATA_WORKS 1
109 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
110 #define PG_FLUSH_DATA_WORKS 1
111 #endif
112 
113 /*
114  * We must leave some file descriptors free for system(), the dynamic loader,
115  * and other code that tries to open files without consulting fd.c. This
116  * is the number left free. (While we try fairly hard to prevent EMFILE
117  * errors, there's never any guarantee that we won't get ENFILE due to
118  * other processes chewing up FDs. So it's a bad idea to try to open files
119  * without consulting fd.c. Nonetheless we cannot control all code.)
120  *
121  * Because this is just a fixed setting, we are effectively assuming that
122  * no such code will leave FDs open over the long term; otherwise the slop
123  * is likely to be insufficient. Note in particular that we expect that
124  * loading a shared library does not result in any permanent increase in
125  * the number of open files. (This appears to be true on most if not
126  * all platforms as of Feb 2004.)
127  */
128 #define NUM_RESERVED_FDS 10
129 
130 /*
131  * If we have fewer than this many usable FDs after allowing for the reserved
132  * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
133  * much less than that. Note that this value ensures numExternalFDs can be
134  * at least 16; as of this writing, the contrib/postgres_fdw regression tests
135  * will not pass unless that can grow to at least 14.)
136  */
137 #define FD_MINFREE 48
138 
139 /*
140  * A number of platforms allow individual processes to open many more files
141  * than they can really support when *many* processes do the same thing.
142  * This GUC parameter lets the DBA limit max_safe_fds to something less than
143  * what the postmaster's initial probe suggests will work.
144  */
146 
147 /*
148  * Maximum number of file descriptors to open for operations that fd.c knows
149  * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
150  * to a conservative value, and remains that way indefinitely in bootstrap or
151  * standalone-backend cases. In normal postmaster operation, the postmaster
152  * calls set_max_safe_fds() late in initialization to update the value, and
153  * that value is then inherited by forked subprocesses.
154  *
155  * Note: the value of max_files_per_process is taken into account while
156  * setting this variable, and so need not be tested separately.
157  */
158 int max_safe_fds = FD_MINFREE; /* default if not changed */
159 
160 /* Whether it is safe to continue running after fsync() fails. */
161 bool data_sync_retry = false;
162 
163 /* How SyncDataDirectory() should do its job. */
165 
166 /* Debugging.... */
167 
168 #ifdef FDDEBUG
169 #define DO_DB(A) \
170  do { \
171  int _do_db_save_errno = errno; \
172  A; \
173  errno = _do_db_save_errno; \
174  } while (0)
175 #else
176 #define DO_DB(A) \
177  ((void) 0)
178 #endif
179 
180 #define VFD_CLOSED (-1)
181 
182 #define FileIsValid(file) \
183  ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
184 
185 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
186 
187 /* these are the assigned bits in fdstate below: */
188 #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
189 #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
190 #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
191 
192 typedef struct vfd
193 {
194  int fd; /* current FD, or VFD_CLOSED if none */
195  unsigned short fdstate; /* bitflags for VFD's state */
196  ResourceOwner resowner; /* owner, for automatic cleanup */
197  File nextFree; /* link to next free VFD, if in freelist */
198  File lruMoreRecently; /* doubly linked recency-of-use list */
200  off_t fileSize; /* current size of file (0 if not temporary) */
201  char *fileName; /* name of file, or NULL for unused VFD */
202  /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
203  int fileFlags; /* open(2) flags for (re)opening the file */
204  mode_t fileMode; /* mode to pass to open(2) */
205 } Vfd;
206 
207 /*
208  * Virtual File Descriptor array pointer and size. This grows as
209  * needed. 'File' values are indexes into this array.
210  * Note that VfdCache[0] is not a usable VFD, just a list header.
211  */
212 static Vfd *VfdCache;
213 static Size SizeVfdCache = 0;
214 
215 /*
216  * Number of file descriptors known to be in use by VFD entries.
217  */
218 static int nfile = 0;
219 
220 /*
221  * Flag to tell whether it's worth scanning VfdCache looking for temp files
222  * to close
223  */
224 static bool have_xact_temporary_files = false;
225 
226 /*
227  * Tracks the total size of all temporary files. Note: when temp_file_limit
228  * is being enforced, this cannot overflow since the limit cannot be more
229  * than INT_MAX kilobytes. When not enforcing, it could theoretically
230  * overflow, but we don't care.
231  */
232 static uint64 temporary_files_size = 0;
233 
234 /* Temporary file access initialized and not yet shut down? */
235 #ifdef USE_ASSERT_CHECKING
236 static bool temporary_files_allowed = false;
237 #endif
238 
239 /*
240  * List of OS handles opened with AllocateFile, AllocateDir and
241  * OpenTransientFile.
242  */
243 typedef enum
244 {
250 
251 typedef struct
252 {
255  union
256  {
257  FILE *file;
259  int fd;
260  } desc;
261 } AllocateDesc;
262 
263 static int numAllocatedDescs = 0;
264 static int maxAllocatedDescs = 0;
266 
267 /*
268  * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
269  */
270 static int numExternalFDs = 0;
271 
272 /*
273  * Number of temporary files opened during the current session;
274  * this is used in generation of tempfile names.
275  */
276 static long tempFileCounter = 0;
277 
278 /*
279  * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
280  * indicating that the current database's default tablespace should be used.)
281  * When numTempTableSpaces is -1, this has not been set in the current
282  * transaction.
283  */
284 static Oid *tempTableSpaces = NULL;
285 static int numTempTableSpaces = -1;
286 static int nextTempTableSpace = 0;
287 
288 
289 /*--------------------
290  *
291  * Private Routines
292  *
293  * Delete - delete a file from the Lru ring
294  * LruDelete - remove a file from the Lru ring and close its FD
295  * Insert - put a file at the front of the Lru ring
296  * LruInsert - put a file at the front of the Lru ring and open it
297  * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
298  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
299  * AllocateVfd - grab a free (or new) file record (from VfdCache)
300  * FreeVfd - free a file record
301  *
302  * The Least Recently Used ring is a doubly linked list that begins and
303  * ends on element zero. Element zero is special -- it doesn't represent
304  * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
305  * anchor that shows us the beginning/end of the ring.
306  * Only VFD elements that are currently really open (have an FD assigned) are
307  * in the Lru ring. Elements that are "virtually" open can be recognized
308  * by having a non-null fileName field.
309  *
310  * example:
311  *
312  * /--less----\ /---------\
313  * v \ v \
314  * #0 --more---> LeastRecentlyUsed --more-\ \
315  * ^\ | |
316  * \\less--> MostRecentlyUsedFile <---/ |
317  * \more---/ \--less--/
318  *
319  *--------------------
320  */
321 static void Delete(File file);
322 static void LruDelete(File file);
323 static void Insert(File file);
324 static int LruInsert(File file);
325 static bool ReleaseLruFile(void);
326 static void ReleaseLruFiles(void);
327 static File AllocateVfd(void);
328 static void FreeVfd(File file);
329 
330 static int FileAccess(File file);
331 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
332 static bool reserveAllocatedDesc(void);
333 static int FreeDesc(AllocateDesc *desc);
334 
335 static void BeforeShmemExit_Files(int code, Datum arg);
336 static void CleanupTempFiles(bool isCommit, bool isProcExit);
337 static void RemovePgTempRelationFiles(const char *tsdirname);
338 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
339 
340 static void walkdir(const char *path,
341  void (*action) (const char *fname, bool isdir, int elevel),
342  bool process_symlinks,
343  int elevel);
344 #ifdef PG_FLUSH_DATA_WORKS
345 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
346 #endif
347 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
348 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
349 
350 static int fsync_parent_path(const char *fname, int elevel);
351 
352 
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * Forwards to pg_fsync_writethrough() when the platform distinguishes
 * write-through fsync from plain fsync and sync_method selects it; in every
 * other case forwards to pg_fsync_no_writethrough().  The return value is
 * that of the routine called.
 */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
	struct stat st;

	/*
	 * Some operating system implementations of fsync() have requirements
	 * about the file access modes that were used when their file descriptor
	 * argument was opened, and these requirements differ depending on whether
	 * the file descriptor is for a directory.
	 *
	 * For any file descriptor that may eventually be handed to fsync(), we
	 * should have opened it with access modes that are compatible with
	 * fsync() on all supported systems, otherwise the code may not be
	 * portable, even if it runs ok on the current system.
	 *
	 * We assert here that a descriptor for a file was opened with write
	 * permissions (either O_RDWR or O_WRONLY) and for a directory without
	 * write permissions (O_RDONLY).
	 *
	 * Ignore any fstat errors and let the follow-up fsync() do its work.
	 * Doing this sanity check here counts for the case where fsync() is
	 * disabled.
	 */
	if (fstat(fd, &st) == 0)
	{
		int			desc_flags = fcntl(fd, F_GETFL);

		/*
		 * O_RDONLY is historically 0, so just make sure that for directories
		 * no write flags are used.
		 */
		if (S_ISDIR(st.st_mode))
			Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
		else
			Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
	}
	/* don't let a failed fstat() leak its errno into the caller's view */
	errno = 0;
#endif

	/* #if is to skip the sync_method test if there's no need for it */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
	else
#endif
		return pg_fsync_no_writethrough(fd);
}
405 
406 
407 /*
408  * pg_fsync_no_writethrough --- same as fsync except does nothing if
409  * enableFsync is off
410  */
411 int
413 {
414  if (enableFsync)
415  return fsync(fd);
416  else
417  return 0;
418 }
419 
420 /*
421  * pg_fsync_writethrough
422  */
423 int
425 {
426  if (enableFsync)
427  {
428 #ifdef WIN32
429  return _commit(fd);
430 #elif defined(F_FULLFSYNC)
431  return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
432 #else
433  errno = ENOSYS;
434  return -1;
435 #endif
436  }
437  else
438  return 0;
439 }
440 
441 /*
442  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
443  *
444  * Not all platforms have fdatasync; treat as fsync if not available.
445  */
446 int
448 {
449  if (enableFsync)
450  {
451 #ifdef HAVE_FDATASYNC
452  return fdatasync(fd);
453 #else
454  return fsync(fd);
455 #endif
456  }
457  else
458  return 0;
459 }
460 
461 /*
462  * pg_flush_data --- advise OS that the described dirty data should be flushed
463  *
464  * offset of 0 with nbytes 0 means that the entire file should be flushed
465  */
466 void
467 pg_flush_data(int fd, off_t offset, off_t nbytes)
468 {
469  /*
470  * Right now file flushing is primarily used to avoid making later
471  * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
472  * if fsyncs are disabled - that's a decision we might want to make
473  * configurable at some point.
474  */
475  if (!enableFsync)
476  return;
477 
478  /*
479  * We compile all alternatives that are supported on the current platform,
480  * to find portability problems more easily.
481  */
482 #if defined(HAVE_SYNC_FILE_RANGE)
483  {
484  int rc;
485  static bool not_implemented_by_kernel = false;
486 
487  if (not_implemented_by_kernel)
488  return;
489 
490  /*
491  * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
492  * tells the OS that writeback for the specified blocks should be
493  * started, but that we don't want to wait for completion. Note that
494  * this call might block if too much dirty data exists in the range.
495  * This is the preferable method on OSs supporting it, as it works
496  * reliably when available (contrast to msync()) and doesn't flush out
497  * clean data (like FADV_DONTNEED).
498  */
499  rc = sync_file_range(fd, offset, nbytes,
500  SYNC_FILE_RANGE_WRITE);
501  if (rc != 0)
502  {
503  int elevel;
504 
505  /*
506  * For systems that don't have an implementation of
507  * sync_file_range() such as Windows WSL, generate only one
508  * warning and then suppress all further attempts by this process.
509  */
510  if (errno == ENOSYS)
511  {
512  elevel = WARNING;
513  not_implemented_by_kernel = true;
514  }
515  else
516  elevel = data_sync_elevel(WARNING);
517 
518  ereport(elevel,
520  errmsg("could not flush dirty data: %m")));
521  }
522 
523  return;
524  }
525 #endif
526 #if !defined(WIN32) && defined(MS_ASYNC)
527  {
528  void *p;
529  static int pagesize = 0;
530 
531  /*
532  * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
533  * writeback. On linux it only does so if MS_SYNC is specified, but
534  * then it does the writeback synchronously. Luckily all common linux
535  * systems have sync_file_range(). This is preferable over
536  * FADV_DONTNEED because it doesn't flush out clean data.
537  *
538  * We map the file (mmap()), tell the kernel to sync back the contents
539  * (msync()), and then remove the mapping again (munmap()).
540  */
541 
542  /* mmap() needs actual length if we want to map whole file */
543  if (offset == 0 && nbytes == 0)
544  {
545  nbytes = lseek(fd, 0, SEEK_END);
546  if (nbytes < 0)
547  {
550  errmsg("could not determine dirty data size: %m")));
551  return;
552  }
553  }
554 
555  /*
556  * Some platforms reject partial-page mmap() attempts. To deal with
557  * that, just truncate the request to a page boundary. If any extra
558  * bytes don't get flushed, well, it's only a hint anyway.
559  */
560 
561  /* fetch pagesize only once */
562  if (pagesize == 0)
563  pagesize = sysconf(_SC_PAGESIZE);
564 
565  /* align length to pagesize, dropping any fractional page */
566  if (pagesize > 0)
567  nbytes = (nbytes / pagesize) * pagesize;
568 
569  /* fractional-page request is a no-op */
570  if (nbytes <= 0)
571  return;
572 
573  /*
574  * mmap could well fail, particularly on 32-bit platforms where there
575  * may simply not be enough address space. If so, silently fall
576  * through to the next implementation.
577  */
578  if (nbytes <= (off_t) SSIZE_MAX)
579  p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
580  else
581  p = MAP_FAILED;
582 
583  if (p != MAP_FAILED)
584  {
585  int rc;
586 
587  rc = msync(p, (size_t) nbytes, MS_ASYNC);
588  if (rc != 0)
589  {
592  errmsg("could not flush dirty data: %m")));
593  /* NB: need to fall through to munmap()! */
594  }
595 
596  rc = munmap(p, (size_t) nbytes);
597  if (rc != 0)
598  {
599  /* FATAL error because mapping would remain */
600  ereport(FATAL,
602  errmsg("could not munmap() while flushing data: %m")));
603  }
604 
605  return;
606  }
607  }
608 #endif
609 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
610  {
611  int rc;
612 
613  /*
614  * Signal the kernel that the passed in range should not be cached
615  * anymore. This has the, desired, side effect of writing out dirty
616  * data, and the, undesired, side effect of likely discarding useful
617  * clean cached blocks. For the latter reason this is the least
618  * preferable method.
619  */
620 
621  rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
622 
623  if (rc != 0)
624  {
625  /* don't error out, this is just a performance optimization */
628  errmsg("could not flush dirty data: %m")));
629  }
630 
631  return;
632  }
633 #endif
634 }
635 
/*
 * Truncate a file to a given length by name.
 *
 * 'path' names the file and 'length' is the size it should have afterwards.
 * Returns 0 on success, or -1 with errno set on failure.
 *
 * Windows has no truncate(), so there the file is opened, truncated through
 * its descriptor, and closed again; errno from ftruncate() is preserved
 * across the close.
 */
int
pg_truncate(const char *path, off_t length)
{
#ifdef WIN32
	int			save_errno;
	int			ret;
	int			fd;

	fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
	if (fd >= 0)
	{
		/* honor the requested length, exactly as truncate() does elsewhere */
		ret = ftruncate(fd, length);
		save_errno = errno;
		CloseTransientFile(fd);
		errno = save_errno;
	}
	else
		ret = -1;

	return ret;
#else
	return truncate(path, length);
#endif
}
663 
664 /*
665  * fsync_fname -- fsync a file or directory, handling errors properly
666  *
667  * Try to fsync a file or directory. When doing the latter, ignore errors that
668  * indicate the OS just doesn't allow/require fsyncing directories.
669  */
670 void
671 fsync_fname(const char *fname, bool isdir)
672 {
673  fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
674 }
675 
676 /*
677  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
678  *
679  * This routine ensures that, after returning, the effect of renaming file
680  * persists in case of a crash. A crash while this routine is running will
681  * leave you with either the pre-existing or the moved file in place of the
682  * new file; no mixed state or truncated files are possible.
683  *
684  * It does so by using fsync on the old filename and the possibly existing
685  * target filename before the rename, and the target file and directory after.
686  *
687  * Note that rename() cannot be used across arbitrary directories, as they
688  * might not be on the same filesystem. Therefore this routine does not
689  * support renaming across directories.
690  *
691  * Log errors with the caller specified severity.
692  *
693  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
694  * valid upon return.
695  */
696 int
697 durable_rename(const char *oldfile, const char *newfile, int elevel)
698 {
699  int fd;
700 
701  /*
702  * First fsync the old and target path (if it exists), to ensure that they
703  * are properly persistent on disk. Syncing the target file is not
704  * strictly necessary, but it makes it easier to reason about crashes;
705  * because it's then guaranteed that either source or target file exists
706  * after a crash.
707  */
708  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
709  return -1;
710 
711  fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
712  if (fd < 0)
713  {
714  if (errno != ENOENT)
715  {
716  ereport(elevel,
718  errmsg("could not open file \"%s\": %m", newfile)));
719  return -1;
720  }
721  }
722  else
723  {
724  if (pg_fsync(fd) != 0)
725  {
726  int save_errno;
727 
728  /* close file upon error, might not be in transaction context */
729  save_errno = errno;
730  CloseTransientFile(fd);
731  errno = save_errno;
732 
733  ereport(elevel,
735  errmsg("could not fsync file \"%s\": %m", newfile)));
736  return -1;
737  }
738 
739  if (CloseTransientFile(fd) != 0)
740  {
741  ereport(elevel,
743  errmsg("could not close file \"%s\": %m", newfile)));
744  return -1;
745  }
746  }
747 
748  /* Time to do the real deal... */
749  if (rename(oldfile, newfile) < 0)
750  {
751  ereport(elevel,
753  errmsg("could not rename file \"%s\" to \"%s\": %m",
754  oldfile, newfile)));
755  return -1;
756  }
757 
758  /*
759  * To guarantee renaming the file is persistent, fsync the file with its
760  * new name, and its containing directory.
761  */
762  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
763  return -1;
764 
765  if (fsync_parent_path(newfile, elevel) != 0)
766  return -1;
767 
768  return 0;
769 }
770 
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * This routine ensures that, after returning, the effect of removing file
 * persists in case of a crash. A crash while this routine is running will
 * leave the system in no mixed state.
 *
 * It does so by using fsync on the parent directory of the file after the
 * actual removal is done.
 *
 * Log errors with the severity specified by caller.
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	if (unlink(fname) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/*
	 * To guarantee that the removal of the file is persistent, fsync its
	 * parent directory.
	 */
	if (fsync_parent_path(fname, elevel) != 0)
		return -1;

	return 0;
}
807 
/*
 * durable_rename_excl -- rename a file in a durable manner.
 *
 * Similar to durable_rename(), except that this routine tries (but does not
 * guarantee) not to overwrite the target file.
 *
 * Note that a crash in an unfortunate moment can leave you with two links to
 * the target file.
 *
 * Log errors with the caller specified severity.
 *
 * On Windows, using a hard link followed by unlink() causes concurrency
 * issues, while a simple rename() does not cause that, so be careful when
 * changing the logic of this routine.
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_rename_excl(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * Ensure that, if we crash directly after the rename/link, a file with
	 * valid contents is moved into place.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

#ifdef HAVE_WORKING_LINK
	/* link() fails if newfile already exists, which gives the "excl" part */
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
	/* NOTE(review): the result of unlink() is not checked here */
	unlink(oldfile);
#else
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
#endif

	/*
	 * Make change persistent in case of an OS crash, both the new entry and
	 * its parent directory need to be flushed.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0)
		return -1;

	/* Same for parent directory */
	if (fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}
870 
871 /*
872  * InitFileAccess --- initialize this module during backend startup
873  *
874  * This is called during either normal or standalone backend start.
875  * It is *not* called in the postmaster.
876  *
877  * Note that this does not initialize temporary file access, that is
878  * separately initialized via InitTemporaryFileAccess().
879  */
880 void
882 {
883  Assert(SizeVfdCache == 0); /* call me only once */
884 
885  /* initialize cache header entry */
886  VfdCache = (Vfd *) malloc(sizeof(Vfd));
887  if (VfdCache == NULL)
888  ereport(FATAL,
889  (errcode(ERRCODE_OUT_OF_MEMORY),
890  errmsg("out of memory")));
891 
892  MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
893  VfdCache->fd = VFD_CLOSED;
894 
895  SizeVfdCache = 1;
896 }
897 
898 /*
899  * InitTemporaryFileAccess --- initialize temporary file access during startup
900  *
901  * This is called during either normal or standalone backend start.
902  * It is *not* called in the postmaster.
903  *
904  * This is separate from InitFileAccess() because temporary file cleanup can
905  * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
906  * our reporting has to happen before that. Low level file access should be
907  * available for longer, hence the separate initialization / shutdown of
908  * temporary file handling.
909  */
910 void
912 {
913  Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run*/
914  Assert(!temporary_files_allowed); /* call me only once */
915 
916  /*
917  * Register before-shmem-exit hook to ensure temp files are dropped while
918  * we can still report stats.
919  */
921 
922 #ifdef USE_ASSERT_CHECKING
923  temporary_files_allowed = true;
924 #endif
925 }
926 
927 /*
928  * count_usable_fds --- count how many FDs the system will let us open,
929  * and estimate how many are already open.
930  *
931  * We stop counting if usable_fds reaches max_to_probe. Note: a small
932  * value of max_to_probe might result in an underestimate of already_open;
933  * we must fill in any "gaps" in the set of used FDs before the calculation
934  * of already_open will give the right answer. In practice, max_to_probe
935  * of a couple of dozen should be enough to ensure good results.
936  *
937  * We assume stderr (FD 2) is available for dup'ing. While the calling
938  * script could theoretically close that, it would be a really bad idea,
939  * since then one risks loss of error messages from, e.g., libc.
940  */
941 static void
942 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
943 {
944  int *fd;
945  int size;
946  int used = 0;
947  int highestfd = 0;
948  int j;
949 
950 #ifdef HAVE_GETRLIMIT
951  struct rlimit rlim;
952  int getrlimit_status;
953 #endif
954 
955  size = 1024;
956  fd = (int *) palloc(size * sizeof(int));
957 
958 #ifdef HAVE_GETRLIMIT
959 #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
960  getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
961 #else /* but BSD doesn't ... */
962  getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
963 #endif /* RLIMIT_NOFILE */
964  if (getrlimit_status != 0)
965  ereport(WARNING, (errmsg("getrlimit failed: %m")));
966 #endif /* HAVE_GETRLIMIT */
967 
968  /* dup until failure or probe limit reached */
969  for (;;)
970  {
971  int thisfd;
972 
973 #ifdef HAVE_GETRLIMIT
974 
975  /*
976  * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
977  * some platforms
978  */
979  if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
980  break;
981 #endif
982 
983  thisfd = dup(2);
984  if (thisfd < 0)
985  {
986  /* Expect EMFILE or ENFILE, else it's fishy */
987  if (errno != EMFILE && errno != ENFILE)
988  elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
989  break;
990  }
991 
992  if (used >= size)
993  {
994  size *= 2;
995  fd = (int *) repalloc(fd, size * sizeof(int));
996  }
997  fd[used++] = thisfd;
998 
999  if (highestfd < thisfd)
1000  highestfd = thisfd;
1001 
1002  if (used >= max_to_probe)
1003  break;
1004  }
1005 
1006  /* release the files we opened */
1007  for (j = 0; j < used; j++)
1008  close(fd[j]);
1009 
1010  pfree(fd);
1011 
1012  /*
1013  * Return results. usable_fds is just the number of successful dups. We
1014  * assume that the system limit is highestfd+1 (remember 0 is a legal FD
1015  * number) and so already_open is highestfd+1 - usable_fds.
1016  */
1017  *usable_fds = used;
1018  *already_open = highestfd + 1 - used;
1019 }
1020 
1021 /*
1022  * set_max_safe_fds
1023  * Determine number of file descriptors that fd.c is allowed to use
1024  */
1025 void
1027 {
1028  int usable_fds;
1029  int already_open;
1030 
1031  /*----------
1032  * We want to set max_safe_fds to
1033  * MIN(usable_fds, max_files_per_process - already_open)
1034  * less the slop factor for files that are opened without consulting
1035  * fd.c. This ensures that we won't exceed either max_files_per_process
1036  * or the experimentally-determined EMFILE limit.
1037  *----------
1038  */
1040  &usable_fds, &already_open);
1041 
1042  max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
1043 
1044  /*
1045  * Take off the FDs reserved for system() etc.
1046  */
1048 
1049  /*
1050  * Make sure we still have enough to get by.
1051  */
1052  if (max_safe_fds < FD_MINFREE)
1053  ereport(FATAL,
1054  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1055  errmsg("insufficient file descriptors available to start server process"),
1056  errdetail("System allows %d, we need at least %d.",
1059 
1060  elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1061  max_safe_fds, usable_fds, already_open);
1062 }
1063 
1064 /*
1065  * Open a file with BasicOpenFilePerm() and pass default file mode for the
1066  * fileMode parameter.
1067  */
1068 int
1070 {
1071  return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1072 }
1073 
1074 /*
1075  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1076  *
1077  * This is exported for use by places that really want a plain kernel FD,
1078  * but need to be proof against running out of FDs. Once an FD has been
1079  * successfully returned, it is the caller's responsibility to ensure that
1080  * it will not be leaked on ereport()! Most users should *not* call this
1081  * routine directly, but instead use the VFD abstraction level, which
1082  * provides protection against descriptor leaks as well as management of
1083  * files that need to be open for more than a short period of time.
1084  *
1085  * Ideally this should be the *only* direct call of open() in the backend.
1086  * In practice, the postmaster calls open() directly, and there are some
1087  * direct open() calls done early in backend startup. Those are OK since
1088  * this module wouldn't have any open files to close at that point anyway.
1089  */
1090 int
1092 {
1093  int fd;
1094 
1095 tryAgain:
1096 #ifdef PG_O_DIRECT_USE_F_NOCACHE
1097 
1098  /*
1099  * The value we defined to stand in for O_DIRECT when simulating it with
1100  * F_NOCACHE had better not collide with any of the standard flags.
1101  */
1103  (O_APPEND |
1104  O_CREAT |
1105  O_EXCL |
1106  O_RDWR |
1107  O_RDONLY |
1108  O_SYNC |
1109  O_TRUNC |
1110  O_WRONLY)) == 0,
1111  "PG_O_DIRECT value collides with standard flag");
1112 #if defined(O_CLOEXEC)
1113  StaticAssertStmt((PG_O_DIRECT & O_CLOEXEC) == 0,
1114  "PG_O_DIRECT value collides with O_CLOEXEC");
1115 #endif
1116 #if defined(O_DSYNC)
1118  "PG_O_DIRECT value collides with O_DSYNC");
1119 #endif
1120 
1121  fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1122 #else
1123  fd = open(fileName, fileFlags, fileMode);
1124 #endif
1125 
1126  if (fd >= 0)
1127  {
1128 #ifdef PG_O_DIRECT_USE_F_NOCACHE
1129  if (fileFlags & PG_O_DIRECT)
1130  {
1131  if (fcntl(fd, F_NOCACHE, 1) < 0)
1132  {
1133  int save_errno = errno;
1134 
1135  close(fd);
1136  errno = save_errno;
1137  return -1;
1138  }
1139  }
1140 #endif
1141 
1142  return fd; /* success! */
1143  }
1144 
1145  if (errno == EMFILE || errno == ENFILE)
1146  {
1147  int save_errno = errno;
1148 
1149  ereport(LOG,
1150  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1151  errmsg("out of file descriptors: %m; release and retry")));
1152  errno = 0;
1153  if (ReleaseLruFile())
1154  goto tryAgain;
1155  errno = save_errno;
1156  }
1157 
1158  return -1; /* failure */
1159 }
1160 
1161 /*
1162  * AcquireExternalFD - attempt to reserve an external file descriptor
1163  *
1164  * This should be used by callers that need to hold a file descriptor open
1165  * over more than a short interval, but cannot use any of the other facilities
1166  * provided by this module.
1167  *
1168  * The difference between this and the underlying ReserveExternalFD function
1169  * is that this will report failure (by setting errno and returning false)
1170  * if "too many" external FDs are already reserved. This should be used in
1171  * any code where the total number of FDs to be reserved is not predictable
1172  * and small.
1173  */
1174 bool
1176 {
1177  /*
1178  * We don't want more than max_safe_fds / 3 FDs to be consumed for
1179  * "external" FDs.
1180  */
1181  if (numExternalFDs < max_safe_fds / 3)
1182  {
1184  return true;
1185  }
1186  errno = EMFILE;
1187  return false;
1188 }
1189 
1190 /*
1191  * ReserveExternalFD - report external consumption of a file descriptor
1192  *
1193  * This should be used by callers that need to hold a file descriptor open
1194  * over more than a short interval, but cannot use any of the other facilities
1195  * provided by this module. This just tracks the use of the FD and closes
1196  * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1197  *
1198  * Call this directly only in code where failure to reserve the FD would be
1199  * fatal; for example, the WAL-writing code does so, since the alternative is
1200  * session failure. Also, it's very unwise to do so in code that could
1201  * consume more than one FD per process.
1202  *
1203  * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1204  * available, it doesn't matter too much whether this is called before or
1205  * after actually opening the FD; but doing so beforehand reduces the risk of
1206  * an EMFILE failure if not everybody played nice. In any case, it's solely
1207  * caller's responsibility to keep the external-FD count in sync with reality.
1208  */
1209 void
1211 {
1212  /*
1213  * Release VFDs if needed to stay safe. Because we do this before
1214  * incrementing numExternalFDs, the final state will be as desired, i.e.,
1215  * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1216  */
1217  ReleaseLruFiles();
1218 
1219  numExternalFDs++;
1220 }
1221 
1222 /*
1223  * ReleaseExternalFD - report release of an external file descriptor
1224  *
1225  * This is guaranteed not to change errno, so it can be used in failure paths.
1226  */
1227 void
1229 {
1230  Assert(numExternalFDs > 0);
1231  numExternalFDs--;
1232 }
1233 
1234 
1235 #if defined(FDDEBUG)
1236 
1237 static void
1238 _dump_lru(void)
1239 {
1240  int mru = VfdCache[0].lruLessRecently;
1241  Vfd *vfdP = &VfdCache[mru];
1242  char buf[2048];
1243 
1244  snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
1245  while (mru != 0)
1246  {
1247  mru = vfdP->lruLessRecently;
1248  vfdP = &VfdCache[mru];
1249  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
1250  }
1251  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
1252  elog(LOG, "%s", buf);
1253 }
1254 #endif /* FDDEBUG */
1255 
1256 static void
1258 {
1259  Vfd *vfdP;
1260 
1261  Assert(file != 0);
1262 
1263  DO_DB(elog(LOG, "Delete %d (%s)",
1264  file, VfdCache[file].fileName));
1265  DO_DB(_dump_lru());
1266 
1267  vfdP = &VfdCache[file];
1268 
1269  VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1270  VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1271 
1272  DO_DB(_dump_lru());
1273 }
1274 
1275 static void
1277 {
1278  Vfd *vfdP;
1279 
1280  Assert(file != 0);
1281 
1282  DO_DB(elog(LOG, "LruDelete %d (%s)",
1283  file, VfdCache[file].fileName));
1284 
1285  vfdP = &VfdCache[file];
1286 
1287  /*
1288  * Close the file. We aren't expecting this to fail; if it does, better
1289  * to leak the FD than to mess up our internal state.
1290  */
1291  if (close(vfdP->fd) != 0)
1293  "could not close file \"%s\": %m", vfdP->fileName);
1294  vfdP->fd = VFD_CLOSED;
1295  --nfile;
1296 
1297  /* delete the vfd record from the LRU ring */
1298  Delete(file);
1299 }
1300 
1301 static void
1303 {
1304  Vfd *vfdP;
1305 
1306  Assert(file != 0);
1307 
1308  DO_DB(elog(LOG, "Insert %d (%s)",
1309  file, VfdCache[file].fileName));
1310  DO_DB(_dump_lru());
1311 
1312  vfdP = &VfdCache[file];
1313 
1314  vfdP->lruMoreRecently = 0;
1315  vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1316  VfdCache[0].lruLessRecently = file;
1317  VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1318 
1319  DO_DB(_dump_lru());
1320 }
1321 
1322 /* returns 0 on success, -1 on re-open failure (with errno set) */
1323 static int
1325 {
1326  Vfd *vfdP;
1327 
1328  Assert(file != 0);
1329 
1330  DO_DB(elog(LOG, "LruInsert %d (%s)",
1331  file, VfdCache[file].fileName));
1332 
1333  vfdP = &VfdCache[file];
1334 
1335  if (FileIsNotOpen(file))
1336  {
1337  /* Close excess kernel FDs. */
1338  ReleaseLruFiles();
1339 
1340  /*
1341  * The open could still fail for lack of file descriptors, eg due to
1342  * overall system file table being full. So, be prepared to release
1343  * another FD if necessary...
1344  */
1345  vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1346  vfdP->fileMode);
1347  if (vfdP->fd < 0)
1348  {
1349  DO_DB(elog(LOG, "re-open failed: %m"));
1350  return -1;
1351  }
1352  else
1353  {
1354  ++nfile;
1355  }
1356  }
1357 
1358  /*
1359  * put it at the head of the Lru ring
1360  */
1361 
1362  Insert(file);
1363 
1364  return 0;
1365 }
1366 
1367 /*
1368  * Release one kernel FD by closing the least-recently-used VFD.
1369  */
1370 static bool
1372 {
1373  DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1374 
1375  if (nfile > 0)
1376  {
1377  /*
1378  * There are opened files and so there should be at least one used vfd
1379  * in the ring.
1380  */
1381  Assert(VfdCache[0].lruMoreRecently != 0);
1382  LruDelete(VfdCache[0].lruMoreRecently);
1383  return true; /* freed a file */
1384  }
1385  return false; /* no files available to free */
1386 }
1387 
1388 /*
1389  * Release kernel FDs as needed to get under the max_safe_fds limit.
1390  * After calling this, it's OK to try to open another file.
1391  */
1392 static void
1394 {
1396  {
1397  if (!ReleaseLruFile())
1398  break;
1399  }
1400 }
1401 
1402 static File
1404 {
1405  Index i;
1406  File file;
1407 
1408  DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1409 
1410  Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1411 
1412  if (VfdCache[0].nextFree == 0)
1413  {
1414  /*
1415  * The free list is empty so it is time to increase the size of the
1416  * array. We choose to double it each time this happens. However,
1417  * there's not much point in starting *real* small.
1418  */
1419  Size newCacheSize = SizeVfdCache * 2;
1420  Vfd *newVfdCache;
1421 
1422  if (newCacheSize < 32)
1423  newCacheSize = 32;
1424 
1425  /*
1426  * Be careful not to clobber VfdCache ptr if realloc fails.
1427  */
1428  newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1429  if (newVfdCache == NULL)
1430  ereport(ERROR,
1431  (errcode(ERRCODE_OUT_OF_MEMORY),
1432  errmsg("out of memory")));
1433  VfdCache = newVfdCache;
1434 
1435  /*
1436  * Initialize the new entries and link them into the free list.
1437  */
1438  for (i = SizeVfdCache; i < newCacheSize; i++)
1439  {
1440  MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1441  VfdCache[i].nextFree = i + 1;
1442  VfdCache[i].fd = VFD_CLOSED;
1443  }
1444  VfdCache[newCacheSize - 1].nextFree = 0;
1445  VfdCache[0].nextFree = SizeVfdCache;
1446 
1447  /*
1448  * Record the new size
1449  */
1450  SizeVfdCache = newCacheSize;
1451  }
1452 
1453  file = VfdCache[0].nextFree;
1454 
1455  VfdCache[0].nextFree = VfdCache[file].nextFree;
1456 
1457  return file;
1458 }
1459 
1460 static void
1462 {
1463  Vfd *vfdP = &VfdCache[file];
1464 
1465  DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1466  file, vfdP->fileName ? vfdP->fileName : ""));
1467 
1468  if (vfdP->fileName != NULL)
1469  {
1470  free(vfdP->fileName);
1471  vfdP->fileName = NULL;
1472  }
1473  vfdP->fdstate = 0x0;
1474 
1475  vfdP->nextFree = VfdCache[0].nextFree;
1476  VfdCache[0].nextFree = file;
1477 }
1478 
1479 /* returns 0 on success, -1 on re-open failure (with errno set) */
1480 static int
1482 {
1483  int returnValue;
1484 
1485  DO_DB(elog(LOG, "FileAccess %d (%s)",
1486  file, VfdCache[file].fileName));
1487 
1488  /*
1489  * Is the file open? If not, open it and put it at the head of the LRU
1490  * ring (possibly closing the least recently used file to get an FD).
1491  */
1492 
1493  if (FileIsNotOpen(file))
1494  {
1495  returnValue = LruInsert(file);
1496  if (returnValue != 0)
1497  return returnValue;
1498  }
1499  else if (VfdCache[0].lruLessRecently != file)
1500  {
1501  /*
1502  * We now know that the file is open and that it is not the last one
1503  * accessed, so we need to move it to the head of the Lru ring.
1504  */
1505 
1506  Delete(file);
1507  Insert(file);
1508  }
1509 
1510  return 0;
1511 }
1512 
1513 /*
1514  * Called whenever a temporary file is deleted to report its size.
1515  */
1516 static void
1517 ReportTemporaryFileUsage(const char *path, off_t size)
1518 {
1519  pgstat_report_tempfile(size);
1520 
1521  if (log_temp_files >= 0)
1522  {
1523  if ((size / 1024) >= log_temp_files)
1524  ereport(LOG,
1525  (errmsg("temporary file: path \"%s\", size %lu",
1526  path, (unsigned long) size)));
1527  }
1528 }
1529 
1530 /*
1531  * Called to register a temporary file for automatic close.
1532  * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1533  * before the file was opened.
1534  */
1535 static void
1537 {
1539  VfdCache[file].resowner = CurrentResourceOwner;
1540 
1541  /* Backup mechanism for closing at end of xact. */
1542  VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1544 }
1545 
1546 /*
1547  * Called when we get a shared invalidation message on some relation.
1548  */
1549 #ifdef NOT_USED
1550 void
1551 FileInvalidate(File file)
1552 {
1553  Assert(FileIsValid(file));
1554  if (!FileIsNotOpen(file))
1555  LruDelete(file);
1556 }
1557 #endif
1558 
1559 /*
1560  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1561  * fileMode parameter.
1562  */
1563 File
1565 {
1566  return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1567 }
1568 
1569 /*
1570  * open a file in an arbitrary directory
1571  *
1572  * NB: if the passed pathname is relative (which it usually is),
1573  * it will be interpreted relative to the process' working directory
1574  * (which should always be $PGDATA when this code is running).
1575  */
1576 File
1578 {
1579  char *fnamecopy;
1580  File file;
1581  Vfd *vfdP;
1582 
1583  DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1584  fileName, fileFlags, fileMode));
1585 
1586  /*
1587  * We need a malloc'd copy of the file name; fail cleanly if no room.
1588  */
1589  fnamecopy = strdup(fileName);
1590  if (fnamecopy == NULL)
1591  ereport(ERROR,
1592  (errcode(ERRCODE_OUT_OF_MEMORY),
1593  errmsg("out of memory")));
1594 
1595  file = AllocateVfd();
1596  vfdP = &VfdCache[file];
1597 
1598  /* Close excess kernel FDs. */
1599  ReleaseLruFiles();
1600 
1601  vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1602 
1603  if (vfdP->fd < 0)
1604  {
1605  int save_errno = errno;
1606 
1607  FreeVfd(file);
1608  free(fnamecopy);
1609  errno = save_errno;
1610  return -1;
1611  }
1612  ++nfile;
1613  DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1614  vfdP->fd));
1615 
1616  vfdP->fileName = fnamecopy;
1617  /* Saved flags are adjusted to be OK for re-opening file */
1618  vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1619  vfdP->fileMode = fileMode;
1620  vfdP->fileSize = 0;
1621  vfdP->fdstate = 0x0;
1622  vfdP->resowner = NULL;
1623 
1624  Insert(file);
1625 
1626  return file;
1627 }
1628 
1629 /*
1630  * Create directory 'directory'. If necessary, create 'basedir', which must
1631  * be the directory above it. This is designed for creating the top-level
1632  * temporary directory on demand before creating a directory underneath it.
1633  * Do nothing if the directory already exists.
1634  *
1635  * Directories created within the top-level temporary directory should begin
1636  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1637  * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1638  * that do not need any particular prefix.
1639 */
1640 void
1642 {
1643  if (MakePGDirectory(directory) < 0)
1644  {
1645  if (errno == EEXIST)
1646  return;
1647 
1648  /*
1649  * Failed. Try to create basedir first in case it's missing. Tolerate
1650  * EEXIST to close a race against another process following the same
1651  * algorithm.
1652  */
1653  if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1654  ereport(ERROR,
1656  errmsg("cannot create temporary directory \"%s\": %m",
1657  basedir)));
1658 
1659  /* Try again. */
1660  if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1661  ereport(ERROR,
1663  errmsg("cannot create temporary subdirectory \"%s\": %m",
1664  directory)));
1665  }
1666 }
1667 
1668 /*
1669  * Delete a directory and everything in it, if it exists.
1670  */
1671 void
1672 PathNameDeleteTemporaryDir(const char *dirname)
1673 {
1674  struct stat statbuf;
1675 
1676  /* Silently ignore missing directory. */
1677  if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1678  return;
1679 
1680  /*
1681  * Currently, walkdir doesn't offer a way for our passed in function to
1682  * maintain state. Perhaps it should, so that we could tell the caller
1683  * whether this operation succeeded or failed. Since this operation is
1684  * used in a cleanup path, we wouldn't actually behave differently: we'll
1685  * just log failures.
1686  */
1687  walkdir(dirname, unlink_if_exists_fname, false, LOG);
1688 }
1689 
1690 /*
1691  * Open a temporary file that will disappear when we close it.
1692  *
1693  * This routine takes care of generating an appropriate tempfile name.
1694  * There's no need to pass in fileFlags or fileMode either, since only
1695  * one setting makes any sense for a temp file.
1696  *
1697  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1698  * to ensure it's closed and deleted when it's no longer needed, typically at
1699  * the end-of-transaction. In most cases, you don't want temporary files to
1700  * outlive the transaction that created them, so this should be false -- but
1701  * if you need "somewhat" temporary storage, this might be useful. In either
1702  * case, the file is removed when the File is explicitly closed.
1703  */
1704 File
1705 OpenTemporaryFile(bool interXact)
1706 {
1707  File file = 0;
1708 
1709  Assert(temporary_files_allowed); /* check temp file access is up */
1710 
1711  /*
1712  * Make sure the current resource owner has space for this File before we
1713  * open it, if we'll be registering it below.
1714  */
1715  if (!interXact)
1717 
1718  /*
1719  * If some temp tablespace(s) have been given to us, try to use the next
1720  * one. If a given tablespace can't be found, we silently fall back to
1721  * the database's default tablespace.
1722  *
1723  * BUT: if the temp file is slated to outlive the current transaction,
1724  * force it into the database's default tablespace, so that it will not
1725  * pose a threat to possible tablespace drop attempts.
1726  */
1727  if (numTempTableSpaces > 0 && !interXact)
1728  {
1729  Oid tblspcOid = GetNextTempTableSpace();
1730 
1731  if (OidIsValid(tblspcOid))
1732  file = OpenTemporaryFileInTablespace(tblspcOid, false);
1733  }
1734 
1735  /*
1736  * If not, or if tablespace is bad, create in database's default
1737  * tablespace. MyDatabaseTableSpace should normally be set before we get
1738  * here, but just in case it isn't, fall back to pg_default tablespace.
1739  */
1740  if (file <= 0)
1743  DEFAULTTABLESPACE_OID,
1744  true);
1745 
1746  /* Mark it for deletion at close and temporary file size limit */
1747  VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1748 
1749  /* Register it with the current resource owner */
1750  if (!interXact)
1751  RegisterTemporaryFile(file);
1752 
1753  return file;
1754 }
1755 
1756 /*
1757  * Return the path of the temp directory in a given tablespace.
1758  */
1759 void
1761 {
1762  /*
1763  * Identify the tempfile directory for this tablespace.
1764  *
1765  * If someone tries to specify pg_global, use pg_default instead.
1766  */
1767  if (tablespace == InvalidOid ||
1768  tablespace == DEFAULTTABLESPACE_OID ||
1769  tablespace == GLOBALTABLESPACE_OID)
1770  snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1771  else
1772  {
1773  /* All other tablespaces are accessed via symlinks */
1774  snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1775  tablespace, TABLESPACE_VERSION_DIRECTORY,
1777  }
1778 }
1779 
1780 /*
1781  * Open a temporary file in a specific tablespace.
1782  * Subroutine for OpenTemporaryFile, which see for details.
1783  */
1784 static File
1785 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1786 {
1787  char tempdirpath[MAXPGPATH];
1788  char tempfilepath[MAXPGPATH];
1789  File file;
1790 
1791  TempTablespacePath(tempdirpath, tblspcOid);
1792 
1793  /*
1794  * Generate a tempfile name that should be unique within the current
1795  * database instance.
1796  */
1797  snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1798  tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1799 
1800  /*
1801  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1802  * temp file that can be reused.
1803  */
1804  file = PathNameOpenFile(tempfilepath,
1805  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1806  if (file <= 0)
1807  {
1808  /*
1809  * We might need to create the tablespace's tempfile directory, if no
1810  * one has yet done so.
1811  *
1812  * Don't check for an error from MakePGDirectory; it could fail if
1813  * someone else just did the same thing. If it doesn't work then
1814  * we'll bomb out on the second create attempt, instead.
1815  */
1816  (void) MakePGDirectory(tempdirpath);
1817 
1818  file = PathNameOpenFile(tempfilepath,
1819  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1820  if (file <= 0 && rejectError)
1821  elog(ERROR, "could not create temporary file \"%s\": %m",
1822  tempfilepath);
1823  }
1824 
1825  return file;
1826 }
1827 
1828 
1829 /*
1830  * Create a new file. The directory containing it must already exist. Files
1831  * created this way are subject to temp_file_limit and are automatically
1832  * closed at end of transaction, but are not automatically deleted on close
1833  * because they are intended to be shared between cooperating backends.
1834  *
1835  * If the file is inside the top-level temporary directory, its name should
1836  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1837  * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1838  * inside a directory created with PathNameCreateTemporaryDir(), in which case
1839  * the prefix isn't needed.
1840  */
1841 File
1842 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1843 {
1844  File file;
1845 
1846  Assert(temporary_files_allowed); /* check temp file access is up */
1847 
1849 
1850  /*
1851  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1852  * temp file that can be reused.
1853  */
1854  file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1855  if (file <= 0)
1856  {
1857  if (error_on_failure)
1858  ereport(ERROR,
1860  errmsg("could not create temporary file \"%s\": %m",
1861  path)));
1862  else
1863  return file;
1864  }
1865 
1866  /* Mark it for temp_file_limit accounting. */
1867  VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1868 
1869  /* Register it for automatic close. */
1870  RegisterTemporaryFile(file);
1871 
1872  return file;
1873 }
1874 
1875 /*
1876  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1877  * another backend. Files opened this way don't count against the
1878  * temp_file_limit of the caller, are automatically closed at the end of the
1879  * transaction but are not deleted on close.
1880  */
1881 File
1882 PathNameOpenTemporaryFile(const char *path, int mode)
1883 {
1884  File file;
1885 
1886  Assert(temporary_files_allowed); /* check temp file access is up */
1887 
1889 
1890  file = PathNameOpenFile(path, mode | PG_BINARY);
1891 
1892  /* If no such file, then we don't raise an error. */
1893  if (file <= 0 && errno != ENOENT)
1894  ereport(ERROR,
1896  errmsg("could not open temporary file \"%s\": %m",
1897  path)));
1898 
1899  if (file > 0)
1900  {
1901  /* Register it for automatic close. */
1902  RegisterTemporaryFile(file);
1903  }
1904 
1905  return file;
1906 }
1907 
1908 /*
1909  * Delete a file by pathname. Return true if the file existed, false if
1910  * it didn't.
1911  */
1912 bool
1913 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1914 {
1915  struct stat filestats;
1916  int stat_errno;
1917 
1918  /* Get the final size for pgstat reporting. */
1919  if (stat(path, &filestats) != 0)
1920  stat_errno = errno;
1921  else
1922  stat_errno = 0;
1923 
1924  /*
1925  * Unlike FileClose's automatic file deletion code, we tolerate
1926  * non-existence to support BufFileDeleteFileSet which doesn't know how
1927  * many segments it has to delete until it runs out.
1928  */
1929  if (stat_errno == ENOENT)
1930  return false;
1931 
1932  if (unlink(path) < 0)
1933  {
1934  if (errno != ENOENT)
1935  ereport(error_on_failure ? ERROR : LOG,
1937  errmsg("could not unlink temporary file \"%s\": %m",
1938  path)));
1939  return false;
1940  }
1941 
1942  if (stat_errno == 0)
1943  ReportTemporaryFileUsage(path, filestats.st_size);
1944  else
1945  {
1946  errno = stat_errno;
1947  ereport(LOG,
1949  errmsg("could not stat file \"%s\": %m", path)));
1950  }
1951 
1952  return true;
1953 }
1954 
1955 /*
1956  * close a file when done with it
1957  */
1958 void
1960 {
1961  Vfd *vfdP;
1962 
1963  Assert(FileIsValid(file));
1964 
1965  DO_DB(elog(LOG, "FileClose: %d (%s)",
1966  file, VfdCache[file].fileName));
1967 
1968  vfdP = &VfdCache[file];
1969 
1970  if (!FileIsNotOpen(file))
1971  {
1972  /* close the file */
1973  if (close(vfdP->fd) != 0)
1974  {
1975  /*
1976  * We may need to panic on failure to close non-temporary files;
1977  * see LruDelete.
1978  */
1980  "could not close file \"%s\": %m", vfdP->fileName);
1981  }
1982 
1983  --nfile;
1984  vfdP->fd = VFD_CLOSED;
1985 
1986  /* remove the file from the lru ring */
1987  Delete(file);
1988  }
1989 
1990  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1991  {
1992  /* Subtract its size from current usage (do first in case of error) */
1993  temporary_files_size -= vfdP->fileSize;
1994  vfdP->fileSize = 0;
1995  }
1996 
1997  /*
1998  * Delete the file if it was temporary, and make a log entry if wanted
1999  */
2000  if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
2001  {
2002  struct stat filestats;
2003  int stat_errno;
2004 
2005  /*
2006  * If we get an error, as could happen within the ereport/elog calls,
2007  * we'll come right back here during transaction abort. Reset the
2008  * flag to ensure that we can't get into an infinite loop. This code
2009  * is arranged to ensure that the worst-case consequence is failing to
2010  * emit log message(s), not failing to attempt the unlink.
2011  */
2012  vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
2013 
2014 
2015  /* first try the stat() */
2016  if (stat(vfdP->fileName, &filestats))
2017  stat_errno = errno;
2018  else
2019  stat_errno = 0;
2020 
2021  /* in any case do the unlink */
2022  if (unlink(vfdP->fileName))
2023  ereport(LOG,
2025  errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
2026 
2027  /* and last report the stat results */
2028  if (stat_errno == 0)
2029  ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2030  else
2031  {
2032  errno = stat_errno;
2033  ereport(LOG,
2035  errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2036  }
2037  }
2038 
2039  /* Unregister it from the resource owner */
2040  if (vfdP->resowner)
2041  ResourceOwnerForgetFile(vfdP->resowner, file);
2042 
2043  /*
2044  * Return the Vfd slot to the free list
2045  */
2046  FreeVfd(file);
2047 }
2048 
2049 /*
2050  * FilePrefetch - initiate asynchronous read of a given range of the file.
2051  *
2052  * Currently the only implementation of this function is using posix_fadvise
2053  * which is the simplest standardized interface that accomplishes this.
2054  * We could add an implementation using libaio in the future; but note that
2055  * this API is inappropriate for libaio, which wants to have a buffer provided
2056  * to read into.
2057  */
2058 int
2059 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
2060 {
2061 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2062  int returnCode;
2063 
2064  Assert(FileIsValid(file));
2065 
2066  DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
2067  file, VfdCache[file].fileName,
2068  (int64) offset, amount));
2069 
2070  returnCode = FileAccess(file);
2071  if (returnCode < 0)
2072  return returnCode;
2073 
2074  pgstat_report_wait_start(wait_event_info);
2075  returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2076  POSIX_FADV_WILLNEED);
2078 
2079  return returnCode;
2080 #else
2081  Assert(FileIsValid(file));
2082  return 0;
2083 #endif
2084 }
2085 
2086 void
2087 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
2088 {
2089  int returnCode;
2090 
2091  Assert(FileIsValid(file));
2092 
2093  DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2094  file, VfdCache[file].fileName,
2095  (int64) offset, (int64) nbytes));
2096 
2097  if (nbytes <= 0)
2098  return;
2099 
2100  returnCode = FileAccess(file);
2101  if (returnCode < 0)
2102  return;
2103 
2104  pgstat_report_wait_start(wait_event_info);
2105  pg_flush_data(VfdCache[file].fd, offset, nbytes);
2107 }
2108 
2109 int
2110 FileRead(File file, char *buffer, int amount, off_t offset,
2111  uint32 wait_event_info)
2112 {
2113  int returnCode;
2114  Vfd *vfdP;
2115 
2116  Assert(FileIsValid(file));
2117 
2118  DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
2119  file, VfdCache[file].fileName,
2120  (int64) offset,
2121  amount, buffer));
2122 
2123  returnCode = FileAccess(file);
2124  if (returnCode < 0)
2125  return returnCode;
2126 
2127  vfdP = &VfdCache[file];
2128 
2129 retry:
2130  pgstat_report_wait_start(wait_event_info);
2131  returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
2133 
2134  if (returnCode < 0)
2135  {
2136  /*
2137  * Windows may run out of kernel buffers and return "Insufficient
2138  * system resources" error. Wait a bit and retry to solve it.
2139  *
2140  * It is rumored that EINTR is also possible on some Unix filesystems,
2141  * in which case immediate retry is indicated.
2142  */
2143 #ifdef WIN32
2144  DWORD error = GetLastError();
2145 
2146  switch (error)
2147  {
2148  case ERROR_NO_SYSTEM_RESOURCES:
2149  pg_usleep(1000L);
2150  errno = EINTR;
2151  break;
2152  default:
2153  _dosmaperr(error);
2154  break;
2155  }
2156 #endif
2157  /* OK to retry if interrupted */
2158  if (errno == EINTR)
2159  goto retry;
2160  }
2161 
2162  return returnCode;
2163 }
2164 
2165 int
2166 FileWrite(File file, char *buffer, int amount, off_t offset,
2167  uint32 wait_event_info)
2168 {
2169  int returnCode;
2170  Vfd *vfdP;
2171 
2172  Assert(FileIsValid(file));
2173 
2174  DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
2175  file, VfdCache[file].fileName,
2176  (int64) offset,
2177  amount, buffer));
2178 
2179  returnCode = FileAccess(file);
2180  if (returnCode < 0)
2181  return returnCode;
2182 
2183  vfdP = &VfdCache[file];
2184 
2185  /*
2186  * If enforcing temp_file_limit and it's a temp file, check to see if the
2187  * write would overrun temp_file_limit, and throw error if so. Note: it's
2188  * really a modularity violation to throw error here; we should set errno
2189  * and return -1. However, there's no way to report a suitable error
2190  * message if we do that. All current callers would just throw error
2191  * immediately anyway, so this is safe at present.
2192  */
2193  if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2194  {
2195  off_t past_write = offset + amount;
2196 
2197  if (past_write > vfdP->fileSize)
2198  {
2199  uint64 newTotal = temporary_files_size;
2200 
2201  newTotal += past_write - vfdP->fileSize;
2202  if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2203  ereport(ERROR,
2204  (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2205  errmsg("temporary file size exceeds temp_file_limit (%dkB)",
2206  temp_file_limit)));
2207  }
2208  }
2209 
2210 retry:
2211  errno = 0;
2212  pgstat_report_wait_start(wait_event_info);
2213  returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
2215 
2216  /* if write didn't set errno, assume problem is no disk space */
2217  if (returnCode != amount && errno == 0)
2218  errno = ENOSPC;
2219 
2220  if (returnCode >= 0)
2221  {
2222  /*
2223  * Maintain fileSize and temporary_files_size if it's a temp file.
2224  */
2225  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2226  {
2227  off_t past_write = offset + amount;
2228 
2229  if (past_write > vfdP->fileSize)
2230  {
2231  temporary_files_size += past_write - vfdP->fileSize;
2232  vfdP->fileSize = past_write;
2233  }
2234  }
2235  }
2236  else
2237  {
2238  /*
2239  * See comments in FileRead()
2240  */
2241 #ifdef WIN32
2242  DWORD error = GetLastError();
2243 
2244  switch (error)
2245  {
2246  case ERROR_NO_SYSTEM_RESOURCES:
2247  pg_usleep(1000L);
2248  errno = EINTR;
2249  break;
2250  default:
2251  _dosmaperr(error);
2252  break;
2253  }
2254 #endif
2255  /* OK to retry if interrupted */
2256  if (errno == EINTR)
2257  goto retry;
2258  }
2259 
2260  return returnCode;
2261 }
2262 
2263 int
2264 FileSync(File file, uint32 wait_event_info)
2265 {
2266  int returnCode;
2267 
2268  Assert(FileIsValid(file));
2269 
2270  DO_DB(elog(LOG, "FileSync: %d (%s)",
2271  file, VfdCache[file].fileName));
2272 
2273  returnCode = FileAccess(file);
2274  if (returnCode < 0)
2275  return returnCode;
2276 
2277  pgstat_report_wait_start(wait_event_info);
2278  returnCode = pg_fsync(VfdCache[file].fd);
2280 
2281  return returnCode;
2282 }
2283 
2284 off_t
2286 {
2287  Assert(FileIsValid(file));
2288 
2289  DO_DB(elog(LOG, "FileSize %d (%s)",
2290  file, VfdCache[file].fileName));
2291 
2292  if (FileIsNotOpen(file))
2293  {
2294  if (FileAccess(file) < 0)
2295  return (off_t) -1;
2296  }
2297 
2298  return lseek(VfdCache[file].fd, 0, SEEK_END);
2299 }
2300 
2301 int
2302 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2303 {
2304  int returnCode;
2305 
2306  Assert(FileIsValid(file));
2307 
2308  DO_DB(elog(LOG, "FileTruncate %d (%s)",
2309  file, VfdCache[file].fileName));
2310 
2311  returnCode = FileAccess(file);
2312  if (returnCode < 0)
2313  return returnCode;
2314 
2315  pgstat_report_wait_start(wait_event_info);
2316  returnCode = ftruncate(VfdCache[file].fd, offset);
2318 
2319  if (returnCode == 0 && VfdCache[file].fileSize > offset)
2320  {
2321  /* adjust our state for truncation of a temp file */
2322  Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2323  temporary_files_size -= VfdCache[file].fileSize - offset;
2324  VfdCache[file].fileSize = offset;
2325  }
2326 
2327  return returnCode;
2328 }
2329 
2330 /*
2331  * Return the pathname associated with an open file.
2332  *
2333  * The returned string points to an internal buffer, which is valid until
2334  * the file is closed.
2335  */
2336 char *
2338 {
2339  Assert(FileIsValid(file));
2340 
2341  return VfdCache[file].fileName;
2342 }
2343 
2344 /*
2345  * Return the raw file descriptor of an opened file.
2346  *
2347  * The returned file descriptor will be valid until the file is closed, but
2348  * there are a lot of things that can make that happen. So the caller should
2349  * be careful not to do much of anything else before it finishes using the
2350  * returned file descriptor.
2351  */
2352 int
2354 {
2355  Assert(FileIsValid(file));
2356  return VfdCache[file].fd;
2357 }
2358 
2359 /*
2360  * FileGetRawFlags - returns the file flags on open(2)
2361  */
2362 int
2364 {
2365  Assert(FileIsValid(file));
2366  return VfdCache[file].fileFlags;
2367 }
2368 
2369 /*
2370  * FileGetRawMode - returns the mode bitmask passed to open(2)
2371  */
2372 mode_t
2374 {
2375  Assert(FileIsValid(file));
2376  return VfdCache[file].fileMode;
2377 }
2378 
2379 /*
2380  * Make room for another allocatedDescs[] array entry if needed and possible.
2381  * Returns true if an array element is available.
2382  */
2383 static bool
2385 {
2386  AllocateDesc *newDescs;
2387  int newMax;
2388 
2389  /* Quick out if array already has a free slot. */
2391  return true;
2392 
2393  /*
2394  * If the array hasn't yet been created in the current process, initialize
2395  * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2396  * we will ever need, anyway. We don't want to look at max_safe_fds
2397  * immediately because set_max_safe_fds() may not have run yet.
2398  */
2399  if (allocatedDescs == NULL)
2400  {
2401  newMax = FD_MINFREE / 3;
2402  newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2403  /* Out of memory already? Treat as fatal error. */
2404  if (newDescs == NULL)
2405  ereport(ERROR,
2406  (errcode(ERRCODE_OUT_OF_MEMORY),
2407  errmsg("out of memory")));
2408  allocatedDescs = newDescs;
2409  maxAllocatedDescs = newMax;
2410  return true;
2411  }
2412 
2413  /*
2414  * Consider enlarging the array beyond the initial allocation used above.
2415  * By the time this happens, max_safe_fds should be known accurately.
2416  *
2417  * We mustn't let allocated descriptors hog all the available FDs, and in
2418  * practice we'd better leave a reasonable number of FDs for VFD use. So
2419  * set the maximum to max_safe_fds / 3. (This should certainly be at
2420  * least as large as the initial size, FD_MINFREE / 3, so we aren't
2421  * tightening the restriction here.) Recall that "external" FDs are
2422  * allowed to consume another third of max_safe_fds.
2423  */
2424  newMax = max_safe_fds / 3;
2425  if (newMax > maxAllocatedDescs)
2426  {
2427  newDescs = (AllocateDesc *) realloc(allocatedDescs,
2428  newMax * sizeof(AllocateDesc));
2429  /* Treat out-of-memory as a non-fatal error. */
2430  if (newDescs == NULL)
2431  return false;
2432  allocatedDescs = newDescs;
2433  maxAllocatedDescs = newMax;
2434  return true;
2435  }
2436 
2437  /* Can't enlarge allocatedDescs[] any more. */
2438  return false;
2439 }
2440 
2441 /*
2442  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2443  * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2444  * necessary to open the file. When done, call FreeFile rather than fclose.
2445  *
2446  * Note that files that will be open for any significant length of time
2447  * should NOT be handled this way, since they cannot share kernel file
2448  * descriptors with other files; there is grave risk of running out of FDs
2449  * if anyone locks down too many FDs. Most callers of this routine are
2450  * simply reading a config file that they will read and close immediately.
2451  *
2452  * fd.c will automatically close all files opened with AllocateFile at
2453  * transaction commit or abort; this prevents FD leakage if a routine
2454  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2455  *
2456  * Ideally this should be the *only* direct call of fopen() in the backend.
2457  */
2458 FILE *
2459 AllocateFile(const char *name, const char *mode)
2460 {
2461  FILE *file;
2462 
2463  DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2464  numAllocatedDescs, name));
2465 
2466  /* Can we allocate another non-virtual FD? */
2467  if (!reserveAllocatedDesc())
2468  ereport(ERROR,
2469  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2470  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2471  maxAllocatedDescs, name)));
2472 
2473  /* Close excess kernel FDs. */
2474  ReleaseLruFiles();
2475 
2476 TryAgain:
2477  if ((file = fopen(name, mode)) != NULL)
2478  {
2479  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2480 
2481  desc->kind = AllocateDescFile;
2482  desc->desc.file = file;
2485  return desc->desc.file;
2486  }
2487 
2488  if (errno == EMFILE || errno == ENFILE)
2489  {
2490  int save_errno = errno;
2491 
2492  ereport(LOG,
2493  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2494  errmsg("out of file descriptors: %m; release and retry")));
2495  errno = 0;
2496  if (ReleaseLruFile())
2497  goto TryAgain;
2498  errno = save_errno;
2499  }
2500 
2501  return NULL;
2502 }
2503 
2504 /*
2505  * Open a file with OpenTransientFilePerm() and pass default file mode for
2506  * the fileMode parameter.
2507  */
2508 int
2510 {
2511  return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2512 }
2513 
2514 /*
2515  * Like AllocateFile, but returns an unbuffered fd like open(2)
2516  */
2517 int
2519 {
2520  int fd;
2521 
2522  DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2523  numAllocatedDescs, fileName));
2524 
2525  /* Can we allocate another non-virtual FD? */
2526  if (!reserveAllocatedDesc())
2527  ereport(ERROR,
2528  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2529  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2530  maxAllocatedDescs, fileName)));
2531 
2532  /* Close excess kernel FDs. */
2533  ReleaseLruFiles();
2534 
2535  fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2536 
2537  if (fd >= 0)
2538  {
2539  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2540 
2541  desc->kind = AllocateDescRawFD;
2542  desc->desc.fd = fd;
2545 
2546  return fd;
2547  }
2548 
2549  return -1; /* failure */
2550 }
2551 
2552 /*
2553  * Routines that want to initiate a pipe stream should use OpenPipeStream
2554  * rather than plain popen(). This lets fd.c deal with freeing FDs if
2555  * necessary. When done, call ClosePipeStream rather than pclose.
2556  *
2557  * This function also ensures that the popen'd program is run with default
2558  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2559  * uses. This ensures desirable response to, eg, closing a read pipe early.
2560  */
2561 FILE *
2562 OpenPipeStream(const char *command, const char *mode)
2563 {
2564  FILE *file;
2565  int save_errno;
2566 
2567  DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2568  numAllocatedDescs, command));
2569 
2570  /* Can we allocate another non-virtual FD? */
2571  if (!reserveAllocatedDesc())
2572  ereport(ERROR,
2573  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2574  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2575  maxAllocatedDescs, command)));
2576 
2577  /* Close excess kernel FDs. */
2578  ReleaseLruFiles();
2579 
2580 TryAgain:
2581  fflush(stdout);
2582  fflush(stderr);
2584  errno = 0;
2585  file = popen(command, mode);
2586  save_errno = errno;
2588  errno = save_errno;
2589  if (file != NULL)
2590  {
2591  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2592 
2593  desc->kind = AllocateDescPipe;
2594  desc->desc.file = file;
2597  return desc->desc.file;
2598  }
2599 
2600  if (errno == EMFILE || errno == ENFILE)
2601  {
2602  ereport(LOG,
2603  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2604  errmsg("out of file descriptors: %m; release and retry")));
2605  if (ReleaseLruFile())
2606  goto TryAgain;
2607  errno = save_errno;
2608  }
2609 
2610  return NULL;
2611 }
2612 
2613 /*
2614  * Free an AllocateDesc of any type.
2615  *
2616  * The argument *must* point into the allocatedDescs[] array.
2617  */
2618 static int
2620 {
2621  int result;
2622 
2623  /* Close the underlying object */
2624  switch (desc->kind)
2625  {
2626  case AllocateDescFile:
2627  result = fclose(desc->desc.file);
2628  break;
2629  case AllocateDescPipe:
2630  result = pclose(desc->desc.file);
2631  break;
2632  case AllocateDescDir:
2633  result = closedir(desc->desc.dir);
2634  break;
2635  case AllocateDescRawFD:
2636  result = close(desc->desc.fd);
2637  break;
2638  default:
2639  elog(ERROR, "AllocateDesc kind not recognized");
2640  result = 0; /* keep compiler quiet */
2641  break;
2642  }
2643 
2644  /* Compact storage in the allocatedDescs array */
2646  *desc = allocatedDescs[numAllocatedDescs];
2647 
2648  return result;
2649 }
2650 
2651 /*
2652  * Close a file returned by AllocateFile.
2653  *
2654  * Note we do not check fclose's return value --- it is up to the caller
2655  * to handle close errors.
2656  */
2657 int
2658 FreeFile(FILE *file)
2659 {
2660  int i;
2661 
2662  DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2663 
2664  /* Remove file from list of allocated files, if it's present */
2665  for (i = numAllocatedDescs; --i >= 0;)
2666  {
2667  AllocateDesc *desc = &allocatedDescs[i];
2668 
2669  if (desc->kind == AllocateDescFile && desc->desc.file == file)
2670  return FreeDesc(desc);
2671  }
2672 
2673  /* Only get here if someone passes us a file not in allocatedDescs */
2674  elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2675 
2676  return fclose(file);
2677 }
2678 
2679 /*
2680  * Close a file returned by OpenTransientFile.
2681  *
2682  * Note we do not check close's return value --- it is up to the caller
2683  * to handle close errors.
2684  */
2685 int
2687 {
2688  int i;
2689 
2690  DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2691 
2692  /* Remove fd from list of allocated files, if it's present */
2693  for (i = numAllocatedDescs; --i >= 0;)
2694  {
2695  AllocateDesc *desc = &allocatedDescs[i];
2696 
2697  if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2698  return FreeDesc(desc);
2699  }
2700 
2701  /* Only get here if someone passes us a file not in allocatedDescs */
2702  elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2703 
2704  return close(fd);
2705 }
2706 
2707 /*
2708  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2709  * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2710  * necessary to open the directory, and with closing it after an elog.
2711  * When done, call FreeDir rather than closedir.
2712  *
2713  * Returns NULL, with errno set, on failure. Note that failure detection
2714  * is commonly left to the following call of ReadDir or ReadDirExtended;
2715  * see the comments for ReadDir.
2716  *
2717  * Ideally this should be the *only* direct call of opendir() in the backend.
2718  */
2719 DIR *
2720 AllocateDir(const char *dirname)
2721 {
2722  DIR *dir;
2723 
2724  DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2725  numAllocatedDescs, dirname));
2726 
2727  /* Can we allocate another non-virtual FD? */
2728  if (!reserveAllocatedDesc())
2729  ereport(ERROR,
2730  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2731  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2732  maxAllocatedDescs, dirname)));
2733 
2734  /* Close excess kernel FDs. */
2735  ReleaseLruFiles();
2736 
2737 TryAgain:
2738  if ((dir = opendir(dirname)) != NULL)
2739  {
2740  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2741 
2742  desc->kind = AllocateDescDir;
2743  desc->desc.dir = dir;
2746  return desc->desc.dir;
2747  }
2748 
2749  if (errno == EMFILE || errno == ENFILE)
2750  {
2751  int save_errno = errno;
2752 
2753  ereport(LOG,
2754  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2755  errmsg("out of file descriptors: %m; release and retry")));
2756  errno = 0;
2757  if (ReleaseLruFile())
2758  goto TryAgain;
2759  errno = save_errno;
2760  }
2761 
2762  return NULL;
2763 }
2764 
2765 /*
2766  * Read a directory opened with AllocateDir, ereport'ing any error.
2767  *
2768  * This is easier to use than raw readdir() since it takes care of some
2769  * otherwise rather tedious and error-prone manipulation of errno. Also,
2770  * if you are happy with a generic error message for AllocateDir failure,
2771  * you can just do
2772  *
2773  * dir = AllocateDir(path);
2774  * while ((dirent = ReadDir(dir, path)) != NULL)
2775  * process dirent;
2776  * FreeDir(dir);
2777  *
2778  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2779  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2780  * use this shortcut.)
2781  *
2782  * The pathname passed to AllocateDir must be passed to this routine too,
2783  * but it is only used for error reporting.
2784  */
2785 struct dirent *
2786 ReadDir(DIR *dir, const char *dirname)
2787 {
2788  return ReadDirExtended(dir, dirname, ERROR);
2789 }
2790 
2791 /*
2792  * Alternate version of ReadDir that allows caller to specify the elevel
2793  * for any error report (whether it's reporting an initial failure of
2794  * AllocateDir or a subsequent directory read failure).
2795  *
2796  * If elevel < ERROR, returns NULL after any error. With the normal coding
2797  * pattern, this will result in falling out of the loop immediately as
2798  * though the directory contained no (more) entries.
2799  */
2800 struct dirent *
2801 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2802 {
2803  struct dirent *dent;
2804 
2805  /* Give a generic message for AllocateDir failure, if caller didn't */
2806  if (dir == NULL)
2807  {
2808  ereport(elevel,
2810  errmsg("could not open directory \"%s\": %m",
2811  dirname)));
2812  return NULL;
2813  }
2814 
2815  errno = 0;
2816  if ((dent = readdir(dir)) != NULL)
2817  return dent;
2818 
2819  if (errno)
2820  ereport(elevel,
2822  errmsg("could not read directory \"%s\": %m",
2823  dirname)));
2824  return NULL;
2825 }
2826 
2827 /*
2828  * Close a directory opened with AllocateDir.
2829  *
2830  * Returns closedir's return value (with errno set if it's not 0).
2831  * Note we do not check the return value --- it is up to the caller
2832  * to handle close errors if wanted.
2833  *
2834  * Does nothing if dir == NULL; we assume that directory open failure was
2835  * already reported if desired.
2836  */
2837 int
2839 {
2840  int i;
2841 
2842  /* Nothing to do if AllocateDir failed */
2843  if (dir == NULL)
2844  return 0;
2845 
2846  DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2847 
2848  /* Remove dir from list of allocated dirs, if it's present */
2849  for (i = numAllocatedDescs; --i >= 0;)
2850  {
2851  AllocateDesc *desc = &allocatedDescs[i];
2852 
2853  if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2854  return FreeDesc(desc);
2855  }
2856 
2857  /* Only get here if someone passes us a dir not in allocatedDescs */
2858  elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2859 
2860  return closedir(dir);
2861 }
2862 
2863 
2864 /*
2865  * Close a pipe stream returned by OpenPipeStream.
2866  */
2867 int
2868 ClosePipeStream(FILE *file)
2869 {
2870  int i;
2871 
2872  DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2873 
2874  /* Remove file from list of allocated files, if it's present */
2875  for (i = numAllocatedDescs; --i >= 0;)
2876  {
2877  AllocateDesc *desc = &allocatedDescs[i];
2878 
2879  if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2880  return FreeDesc(desc);
2881  }
2882 
2883  /* Only get here if someone passes us a file not in allocatedDescs */
2884  elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2885 
2886  return pclose(file);
2887 }
2888 
2889 /*
2890  * closeAllVfds
2891  *
2892  * Force all VFDs into the physically-closed state, so that the fewest
2893  * possible number of kernel file descriptors are in use. There is no
2894  * change in the logical state of the VFDs.
2895  */
2896 void
2898 {
2899  Index i;
2900 
2901  if (SizeVfdCache > 0)
2902  {
2903  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2904  for (i = 1; i < SizeVfdCache; i++)
2905  {
2906  if (!FileIsNotOpen(i))
2907  LruDelete(i);
2908  }
2909  }
2910 }
2911 
2912 
2913 /*
2914  * SetTempTablespaces
2915  *
2916  * Define a list (actually an array) of OIDs of tablespaces to use for
2917  * temporary files. This list will be used until end of transaction,
2918  * unless this function is called again before then. It is caller's
2919  * responsibility that the passed-in array has adequate lifespan (typically
2920  * it'd be allocated in TopTransactionContext).
2921  *
2922  * Some entries of the array may be InvalidOid, indicating that the current
2923  * database's default tablespace should be used.
2924  */
2925 void
2926 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2927 {
2928  Assert(numSpaces >= 0);
2929  tempTableSpaces = tableSpaces;
2930  numTempTableSpaces = numSpaces;
2931 
2932  /*
2933  * Select a random starting point in the list. This is to minimize
2934  * conflicts between backends that are most likely sharing the same list
2935  * of temp tablespaces. Note that if we create multiple temp files in the
2936  * same transaction, we'll advance circularly through the list --- this
2937  * ensures that large temporary sort files are nicely spread across all
2938  * available tablespaces.
2939  */
2940  if (numSpaces > 1)
2941  nextTempTableSpace = random() % numSpaces;
2942  else
2943  nextTempTableSpace = 0;
2944 }
2945 
2946 /*
2947  * TempTablespacesAreSet
2948  *
2949  * Returns true if SetTempTablespaces has been called in current transaction.
2950  * (This is just so that tablespaces.c doesn't need its own per-transaction
2951  * state.)
2952  */
2953 bool
2955 {
2956  return (numTempTableSpaces >= 0);
2957 }
2958 
2959 /*
2960  * GetTempTablespaces
2961  *
2962  * Populate an array with the OIDs of the tablespaces that should be used for
2963  * temporary files. (Some entries may be InvalidOid, indicating that the
2964  * current database's default tablespace should be used.) At most numSpaces
2965  * entries will be filled.
2966  * Returns the number of OIDs that were copied into the output array.
2967  */
2968 int
2969 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2970 {
2971  int i;
2972 
2974  for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2975  tableSpaces[i] = tempTableSpaces[i];
2976 
2977  return i;
2978 }
2979 
2980 /*
2981  * GetNextTempTableSpace
2982  *
2983  * Select the next temp tablespace to use. A result of InvalidOid means
2984  * to use the current database's default tablespace.
2985  */
2986 Oid
2988 {
2989  if (numTempTableSpaces > 0)
2990  {
2991  /* Advance nextTempTableSpace counter with wraparound */
2993  nextTempTableSpace = 0;
2995  }
2996  return InvalidOid;
2997 }
2998 
2999 
3000 /*
3001  * AtEOSubXact_Files
3002  *
3003  * Take care of subtransaction commit/abort. At abort, we close temp files
3004  * that the subtransaction may have opened. At commit, we reassign the
3005  * files that were opened to the parent subtransaction.
3006  */
3007 void
3008 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
3009  SubTransactionId parentSubid)
3010 {
3011  Index i;
3012 
3013  for (i = 0; i < numAllocatedDescs; i++)
3014  {
3015  if (allocatedDescs[i].create_subid == mySubid)
3016  {
3017  if (isCommit)
3018  allocatedDescs[i].create_subid = parentSubid;
3019  else
3020  {
3021  /* have to recheck the item after FreeDesc (ugly) */
3022  FreeDesc(&allocatedDescs[i--]);
3023  }
3024  }
3025  }
3026 }
3027 
3028 /*
3029  * AtEOXact_Files
3030  *
3031  * This routine is called during transaction commit or abort. All still-open
3032  * per-transaction temporary file VFDs are closed, which also causes the
3033  * underlying files to be deleted (although they should've been closed already
3034  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
3035  * closed. We also forget any transaction-local temp tablespace list.
3036  *
3037  * The isCommit flag is used only to decide whether to emit warnings about
3038  * unclosed files.
3039  */
3040 void
3041 AtEOXact_Files(bool isCommit)
3042 {
3043  CleanupTempFiles(isCommit, false);
3044  tempTableSpaces = NULL;
3045  numTempTableSpaces = -1;
3046 }
3047 
3048 /*
3049  * BeforeShmemExit_Files
3050  *
3051  * before_shmem_access hook to clean up temp files during backend shutdown.
3052  * Here, we want to clean up *all* temp files including interXact ones.
3053  */
3054 static void
3056 {
3057  CleanupTempFiles(false, true);
3058 
3059  /* prevent further temp files from being created */
3060 #ifdef USE_ASSERT_CHECKING
3061  temporary_files_allowed = false;
3062 #endif
3063 }
3064 
3065 /*
3066  * Close temporary files and delete their underlying files.
3067  *
3068  * isCommit: if true, this is normal transaction commit, and we don't
3069  * expect any remaining files; warn if there are some.
3070  *
3071  * isProcExit: if true, this is being called as the backend process is
3072  * exiting. If that's the case, we should remove all temporary files; if
3073  * that's not the case, we are being called for transaction commit/abort
3074  * and should only remove transaction-local temp files. In either case,
3075  * also clean up "allocated" stdio files, dirs and fds.
3076  */
3077 static void
3078 CleanupTempFiles(bool isCommit, bool isProcExit)
3079 {
3080  Index i;
3081 
3082  /*
3083  * Careful here: at proc_exit we need extra cleanup, not just
3084  * xact_temporary files.
3085  */
3086  if (isProcExit || have_xact_temporary_files)
3087  {
3088  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3089  for (i = 1; i < SizeVfdCache; i++)
3090  {
3091  unsigned short fdstate = VfdCache[i].fdstate;
3092 
3093  if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3094  VfdCache[i].fileName != NULL)
3095  {
3096  /*
3097  * If we're in the process of exiting a backend process, close
3098  * all temporary files. Otherwise, only close temporary files
3099  * local to the current transaction. They should be closed by
3100  * the ResourceOwner mechanism already, so this is just a
3101  * debugging cross-check.
3102  */
3103  if (isProcExit)
3104  FileClose(i);
3105  else if (fdstate & FD_CLOSE_AT_EOXACT)
3106  {
3107  elog(WARNING,
3108  "temporary file %s not closed at end-of-transaction",
3109  VfdCache[i].fileName);
3110  FileClose(i);
3111  }
3112  }
3113  }
3114 
3115  have_xact_temporary_files = false;
3116  }
3117 
3118  /* Complain if any allocated files remain open at commit. */
3119  if (isCommit && numAllocatedDescs > 0)
3120  elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3122 
3123  /* Clean up "allocated" stdio files, dirs and fds. */
3124  while (numAllocatedDescs > 0)
3125  FreeDesc(&allocatedDescs[0]);
3126 }
3127 
3128 
3129 /*
3130  * Remove temporary and temporary relation files left over from a prior
3131  * postmaster session
3132  *
3133  * This should be called during postmaster startup. It will forcibly
3134  * remove any leftover files created by OpenTemporaryFile and any leftover
3135  * temporary relation files created by mdcreate.
3136  *
3137  * During post-backend-crash restart cycle, this routine is called when
3138  * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3139  * queries are using temp files could result in useless storage usage that can
3140  * only be reclaimed by a service restart. The argument against enabling it is
3141  * that someone might want to examine the temporary files for debugging
3142  * purposes. This does however mean that OpenTemporaryFile had better allow for
3143  * collision with an existing temp file name.
3144  *
3145  * NOTE: this function and its subroutines generally report syscall failures
3146  * with ereport(LOG) and keep going. Removing temp files is not so critical
3147  * that we should fail to start the database when we can't do it.
3148  */
3149 void
3151 {
3152  char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3153  DIR *spc_dir;
3154  struct dirent *spc_de;
3155 
3156  /*
3157  * First process temp files in pg_default ($PGDATA/base)
3158  */
3159  snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3160  RemovePgTempFilesInDir(temp_path, true, false);
3161  RemovePgTempRelationFiles("base");
3162 
3163  /*
3164  * Cycle through temp directories for all non-default tablespaces.
3165  */
3166  spc_dir = AllocateDir("pg_tblspc");
3167 
3168  while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
3169  {
3170  if (strcmp(spc_de->d_name, ".") == 0 ||
3171  strcmp(spc_de->d_name, "..") == 0)
3172  continue;
3173 
3174  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
3176  RemovePgTempFilesInDir(temp_path, true, false);
3177 
3178  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
3180  RemovePgTempRelationFiles(temp_path);
3181  }
3182 
3183  FreeDir(spc_dir);
3184 
3185  /*
3186  * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3187  * DataDir as well. However, that is *not* cleaned here because doing so
3188  * would create a race condition. It's done separately, earlier in
3189  * postmaster startup.
3190  */
3191 }
3192 
3193 /*
3194  * Process one pgsql_tmp directory for RemovePgTempFiles.
3195  *
3196  * If missing_ok is true, it's all right for the named directory to not exist.
3197  * Any other problem results in a LOG message. (missing_ok should be true at
3198  * the top level, since pgsql_tmp directories are not created until needed.)
3199  *
3200  * At the top level, this should be called with unlink_all = false, so that
3201  * only files matching the temporary name prefix will be unlinked. When
3202  * recursing it will be called with unlink_all = true to unlink everything
3203  * under a top-level temporary directory.
3204  *
3205  * (These two flags could be replaced by one, but it seems clearer to keep
3206  * them separate.)
3207  */
3208 void
3209 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3210 {
3211  DIR *temp_dir;
3212  struct dirent *temp_de;
3213  char rm_path[MAXPGPATH * 2];
3214 
3215  temp_dir = AllocateDir(tmpdirname);
3216 
3217  if (temp_dir == NULL && errno == ENOENT && missing_ok)
3218  return;
3219 
3220  while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3221  {
3222  if (strcmp(temp_de->d_name, ".") == 0 ||
3223  strcmp(temp_de->d_name, "..") == 0)
3224  continue;
3225 
3226  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3227  tmpdirname, temp_de->d_name);
3228 
3229  if (unlink_all ||
3230  strncmp(temp_de->d_name,
3232  strlen(PG_TEMP_FILE_PREFIX)) == 0)
3233  {
3234  struct stat statbuf;
3235 
3236  if (lstat(rm_path, &statbuf) < 0)
3237  {
3238  ereport(LOG,
3240  errmsg("could not stat file \"%s\": %m", rm_path)));
3241  continue;
3242  }
3243 
3244  if (S_ISDIR(statbuf.st_mode))
3245  {
3246  /* recursively remove contents, then directory itself */
3247  RemovePgTempFilesInDir(rm_path, false, true);
3248 
3249  if (rmdir(rm_path) < 0)
3250  ereport(LOG,
3252  errmsg("could not remove directory \"%s\": %m",
3253  rm_path)));
3254  }
3255  else
3256  {
3257  if (unlink(rm_path) < 0)
3258  ereport(LOG,
3260  errmsg("could not remove file \"%s\": %m",
3261  rm_path)));
3262  }
3263  }
3264  else
3265  ereport(LOG,
3266  (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3267  rm_path)));
3268  }
3269 
3270  FreeDir(temp_dir);
3271 }
3272 
3273 /* Process one tablespace directory, look for per-DB subdirectories */
3274 static void
3275 RemovePgTempRelationFiles(const char *tsdirname)
3276 {
3277  DIR *ts_dir;
3278  struct dirent *de;
3279  char dbspace_path[MAXPGPATH * 2];
3280 
3281  ts_dir = AllocateDir(tsdirname);
3282 
3283  while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3284  {
3285  /*
3286  * We're only interested in the per-database directories, which have
3287  * numeric names. Note that this code will also (properly) ignore "."
3288  * and "..".
3289  */
3290  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3291  continue;
3292 
3293  snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3294  tsdirname, de->d_name);
3295  RemovePgTempRelationFilesInDbspace(dbspace_path);
3296  }
3297 
3298  FreeDir(ts_dir);
3299 }
3300 
3301 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3302 static void
3303 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3304 {
3305  DIR *dbspace_dir;
3306  struct dirent *de;
3307  char rm_path[MAXPGPATH * 2];
3308 
3309  dbspace_dir = AllocateDir(dbspacedirname);
3310 
3311  while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3312  {
3313  if (!looks_like_temp_rel_name(de->d_name))
3314  continue;
3315 
3316  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3317  dbspacedirname, de->d_name);
3318 
3319  if (unlink(rm_path) < 0)
3320  ereport(LOG,
3322  errmsg("could not remove file \"%s\": %m",
3323  rm_path)));
3324  }
3325 
3326  FreeDir(dbspace_dir);
3327 }
3328 
/*
 * Does "name" have the form of a temporary relation file name?
 *
 * Accepted shapes are t<digits>_<digits>, optionally followed by
 * _<forkname>, optionally followed by .<digits> (a segment number), with
 * nothing after that.  Returns true on a match, false otherwise.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	int			pos;
	int			savepos;

	/* Must start with "t". */
	if (name[0] != 't')
		return false;

	/* Followed by a non-empty string of digits and then an underscore. */
	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (pos == 1 || name[pos] != '_')
		return false;

	/* Followed by another nonempty string of digits. */
	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (savepos == pos)
		return false;

	/* We might have _forkname or .segment or both. */
	if (name[pos] == '_')
	{
		int			forkchar = forkname_chars(&name[pos + 1], NULL);

		/* A result of zero or less is treated as "not a fork name". */
		if (forkchar <= 0)
			return false;
		pos += forkchar + 1;	/* +1 steps over the underscore */
	}
	if (name[pos] == '.')
	{
		int			segchar;

		/* segchar starts at 1 so that it counts the dot itself */
		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
			;
		if (segchar <= 1)
			return false;		/* dot with no digits after it */
		pos += segchar;
	}

	/* Now we should be at the end. */
	if (name[pos] != '\0')
		return false;
	return true;
}
3377 
#ifdef HAVE_SYNCFS
/*
 * Open "path" read-only and call syncfs() on the resulting descriptor.
 *
 * Failure to open the path or to sync is reported at LOG level only; the
 * function returns nothing to the caller either way.
 */
static void
do_syncfs(const char *path)
{
	int			fd;

	fd = OpenTransientFile(path, O_RDONLY);
	if (fd < 0)
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
		return;
	}
	if (syncfs(fd) < 0)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not synchronize file system for file \"%s\": %m", path)));
	CloseTransientFile(fd);
}
#endif
3399 
3400 /*
3401  * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3402  * all potential filesystem, depending on recovery_init_sync_method setting.
3403  *
3404  * We fsync regular files and directories wherever they are, but we
3405  * follow symlinks only for pg_wal and immediately under pg_tblspc.
3406  * Other symlinks are presumed to point at files we're not responsible
3407  * for fsyncing, and might not have privileges to write at all.
3408  *
3409  * Errors are logged but not considered fatal; that's because this is used
3410  * only during database startup, to deal with the possibility that there are
3411  * issued-but-unsynced writes pending against the data directory. We want to
3412  * ensure that such writes reach disk before anything that's done in the new
3413  * run. However, aborting on error would result in failure to start for
3414  * harmless cases such as read-only files in the data directory, and that's
3415  * not good either.
3416  *
3417  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3418  * rewriting all changes again during recovery.
3419  *
3420  * Note we assume we're chdir'd into PGDATA to begin with.
3421  */
3422 void
3424 {
3425  bool xlog_is_symlink;
3426 
3427  /* We can skip this whole thing if fsync is disabled. */
3428  if (!enableFsync)
3429  return;
3430 
3431  /*
3432  * If pg_wal is a symlink, we'll need to recurse into it separately,
3433  * because the first walkdir below will ignore it.
3434  */
3435  xlog_is_symlink = false;
3436 
3437 #ifndef WIN32
3438  {
3439  struct stat st;
3440 
3441  if (lstat("pg_wal", &st) < 0)
3442  ereport(LOG,
3444  errmsg("could not stat file \"%s\": %m",
3445  "pg_wal")));
3446  else if (S_ISLNK(st.st_mode))
3447  xlog_is_symlink = true;
3448  }
3449 #else
3450  if (pgwin32_is_junction("pg_wal"))
3451  xlog_is_symlink = true;
3452 #endif
3453 
3454 #ifdef HAVE_SYNCFS
3456  {
3457  DIR *dir;
3458  struct dirent *de;
3459 
3460  /*
3461  * On Linux, we don't have to open every single file one by one. We
3462  * can use syncfs() to sync whole filesystems. We only expect
3463  * filesystem boundaries to exist where we tolerate symlinks, namely
3464  * pg_wal and the tablespaces, so we call syncfs() for each of those
3465  * directories.
3466  */
3467 
3468  /* Sync the top level pgdata directory. */
3469  do_syncfs(".");
3470  /* If any tablespaces are configured, sync each of those. */
3471  dir = AllocateDir("pg_tblspc");
3472  while ((de = ReadDirExtended(dir, "pg_tblspc", LOG)))
3473  {
3474  char path[MAXPGPATH];
3475 
3476  if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3477  continue;
3478 
3479  snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
3480  do_syncfs(path);
3481  }
3482  FreeDir(dir);
3483  /* If pg_wal is a symlink, process that too. */
3484  if (xlog_is_symlink)
3485  do_syncfs("pg_wal");
3486  return;
3487  }
3488 #endif /* !HAVE_SYNCFS */
3489 
3490  /*
3491  * If possible, hint to the kernel that we're soon going to fsync the data
3492  * directory and its contents. Errors in this step are even less
3493  * interesting than normal, so log them only at DEBUG1.
3494  */
3495 #ifdef PG_FLUSH_DATA_WORKS
3496  walkdir(".", pre_sync_fname, false, DEBUG1);
3497  if (xlog_is_symlink)
3498  walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3499  walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3500 #endif
3501 
3502  /*
3503  * Now we do the fsync()s in the same order.
3504  *
3505  * The main call ignores symlinks, so in addition to specially processing
3506  * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3507  * process_symlinks = true. Note that if there are any plain directories
3508  * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3509  * so we don't worry about optimizing it.
3510  */
3511  walkdir(".", datadir_fsync_fname, false, LOG);
3512  if (xlog_is_symlink)
3513  walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3514  walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3515 }
3516 
3517 /*
3518  * walkdir: recursively walk a directory, applying the action to each
3519  * regular file and directory (including the named directory itself).
3520  *
3521  * If process_symlinks is true, the action and recursion are also applied
3522  * to regular files and directories that are pointed to by symlinks in the
3523  * given directory; otherwise symlinks are ignored. Symlinks are always
3524  * ignored in subdirectories, ie we intentionally don't pass down the
3525  * process_symlinks flag to recursive calls.
3526  *
3527  * Errors are reported at level elevel, which might be ERROR or less.
3528  *
3529  * See also walkdir in file_utils.c, which is a frontend version of this
3530  * logic.
3531  */
3532 static void
3533 walkdir(const char *path,
3534  void (*action) (const char *fname, bool isdir, int elevel),
3535  bool process_symlinks,
3536  int elevel)
3537 {
3538  DIR *dir;
3539  struct dirent *de;
3540 
3541  dir = AllocateDir(path);
3542 
3543  while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3544  {
3545  char subpath[MAXPGPATH * 2];
3546 
3548 
3549  if (strcmp(de->d_name, ".") == 0 ||
3550  strcmp(de->d_name, "..") == 0)
3551  continue;
3552 
3553  snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3554 
3555  switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3556  {
3557  case PGFILETYPE_REG:
3558  (*action) (subpath, false, elevel);
3559  break;
3560  case PGFILETYPE_DIR:
3561  walkdir(subpath, action, false, elevel);
3562  break;
3563  default:
3564 
3565  /*
3566  * Errors are already reported directly by get_dirent_type(),
3567  * and any remaining symlinks and unknown file types are
3568  * ignored.
3569  */
3570  break;
3571  }
3572  }
3573 
3574  FreeDir(dir); /* we ignore any error here */
3575 
3576  /*
3577  * It's important to fsync the destination directory itself as individual
3578  * file fsyncs don't guarantee that the directory entry for the file is
3579  * synced. However, skip this if AllocateDir failed; the action function
3580  * might not be robust against that.
3581  */
3582  if (dir)
3583  (*action) (path, true, elevel);
3584 }
3585 
3586 
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Directories are skipped.  Ignores errors trying to open unreadable files
 * (EACCES), and logs other open or close errors at a caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* An unreadable file is silently skipped. */
		if (errno == EACCES)
			return;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3629 
/*
 * walkdir-style action: fsync one file or directory via fsync_fname_ext().
 * The result of fsync_fname_ext() is discarded.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * Errors about unreadable files should be ignored silently, so ask
	 * fsync_fname_ext() to do exactly that.
	 */
	const bool	ignore_perm = true;

	(void) fsync_fname_ext(fname, isdir, ignore_perm, elevel);
}
3639 
/*
 * Remove one file or directory, using the (fname, isdir, elevel) action
 * signature.
 *
 * Directories are removed with rmdir(); a directory that is already gone
 * (ENOENT) is not reported, any other failure is reported at elevel.  Files
 * are handed to PathNameDeleteTemporaryFile() with error_on_failure = false.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}
3656 
3657 /*
3658  * fsync_fname_ext -- Try to fsync a file or directory
3659  *
3660  * If ignore_perm is true, ignore errors upon trying to open unreadable
3661  * files. Logs other errors at a caller-specified level.
3662  *
3663  * Returns 0 if the operation succeeded, -1 otherwise.
3664  */
3665 int
3666 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3667 {
3668  int fd;
3669  int flags;
3670  int returncode;
3671 
3672  /*
3673  * Some OSs require directories to be opened read-only whereas other
3674  * systems don't allow us to fsync files opened read-only; so we need both
3675  * cases here. Using O_RDWR will cause us to fail to fsync files that are
3676  * not writable by our userid, but we assume that's OK.
3677  */
3678  flags = PG_BINARY;
3679  if (!isdir)
3680  flags |= O_RDWR;
3681  else
3682  flags |= O_RDONLY;
3683 
3684  fd = OpenTransientFile(fname, flags);
3685 
3686  /*
3687  * Some OSs don't allow us to open directories at all (Windows returns
3688  * EACCES), just ignore the error in that case. If desired also silently
3689  * ignoring errors about unreadable files. Log others.
3690  */
3691  if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3692  return 0;
3693  else if (fd < 0 && ignore_perm && errno == EACCES)
3694  return 0;
3695  else if (fd < 0)
3696  {
3697  ereport(elevel,
3699  errmsg("could not open file \"%s\": %m", fname)));
3700  return -1;
3701  }
3702 
3703  returncode = pg_fsync(fd);
3704 
3705  /*
3706  * Some OSes don't allow us to fsync directories at all, so we can ignore
3707  * those errors. Anything else needs to be logged.
3708  */
3709  if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3710  {
3711  int save_errno;
3712 
3713  /* close file upon error, might not be in transaction context */
3714  save_errno = errno;
3715  (void) CloseTransientFile(fd);
3716  errno = save_errno;
3717 
3718  ereport(elevel,
3720  errmsg("could not fsync file \"%s\": %m", fname)));
3721  return -1;
3722  }
3723 
3724  if (CloseTransientFile(fd) != 0)
3725  {
3726  ereport(elevel,
3728  errmsg("could not close file \"%s\": %m", fname)));
3729  return -1;
3730  }
3731 
3732  return 0;
3733 }
3734 
3735 /*
3736  * fsync_parent_path -- fsync the parent path of a file or directory
3737  *
3738  * This is aimed at making file operations persistent on disk in case of
3739  * an OS crash or power failure.
3740  */
3741 static int
3742 fsync_parent_path(const char *fname, int elevel)
3743 {
3744  char parentpath[MAXPGPATH];
3745 
3746  strlcpy(parentpath, fname, MAXPGPATH);
3747  get_parent_directory(parentpath);
3748 
3749  /*
3750  * get_parent_directory() returns an empty string if the input argument is
3751  * just a file name (see comments in path.c), so handle that as being the
3752  * current directory.
3753  */
3754  if (strlen(parentpath) == 0)
3755  strlcpy(parentpath, ".", MAXPGPATH);
3756 
3757  if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3758  return -1;
3759 
3760  return 0;
3761 }
3762 
3763 /*
3764  * Create a PostgreSQL data sub-directory
3765  *
3766  * The data directory itself, and most of its sub-directories, are created at
3767  * initdb time, but we do have some occasions when we create directories in
3768  * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3769  * make sure that those directories are created consistently. Today, that means
3770  * making sure that the created directory has the correct permissions, which is
3771  * what pg_dir_create_mode tracks for us.
3772  *
3773  * Note that we also set the umask() based on what we understand the correct
3774  * permissions to be (see file_perm.c).
3775  *
3776  * For permissions other than the default, mkdir() can be used directly, but
3777  * be sure to consider carefully such cases -- a sub-directory with incorrect
3778  * permissions in a PostgreSQL data directory could cause backups and other
3779  * processes to fail.
3780  */
3781 int
3782 MakePGDirectory(const char *directoryName)
3783 {
3784  return mkdir(directoryName, pg_dir_create_mode);
3785 }
3786 
3787 /*
3788  * Return the passed-in error level, or PANIC if data_sync_retry is off.
3789  *
3790  * Failure to fsync any data file is cause for immediate panic, unless
3791  * data_sync_retry is enabled. Data may have been written to the operating
3792  * system and removed from our buffer pool already, and if we are running on
3793  * an operating system that forgets dirty data on write-back failure, there
3794  * may be only one copy of the data remaining: in the WAL. A later attempt to
3795  * fsync again might falsely report success. Therefore we must not allow any
3796  * further checkpoints to be attempted. data_sync_retry can in theory be
3797  * enabled on systems known not to drop dirty buffered data on write-back
3798  * failure (with the likely outcome that checkpoints will continue to fail
3799  * until the underlying problem is fixed).
3800  *
3801  * Any code that reports a failure from fsync() or related functions should
3802  * filter the error level with this function.
3803  */
3804 int
3805 data_sync_elevel(int elevel)
3806 {
3807  return data_sync_retry ? elevel : PANIC;
3808 }
3809 
3810 /*
3811  * A convenience wrapper for pg_pwritev() that retries on partial write. If an
3812  * error is returned, it is unspecified how much has been written.
3813  */
3814 ssize_t
3815 pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
3816 {
3817  struct iovec iov_copy[PG_IOV_MAX];
3818  ssize_t sum = 0;
3819  ssize_t part;
3820 
3821  /* We'd better have space to make a copy, in case we need to retry. */
3822  if (iovcnt > PG_IOV_MAX)
3823  {
3824  errno = EINVAL;
3825  return -1;
3826  }
3827 
3828  for (;;)
3829  {
3830  /* Write as much as we can. */
3831  part = pg_pwritev(fd, iov, iovcnt, offset);
3832  if (part < 0)
3833  return -1;
3834 
3835 #ifdef SIMULATE_SHORT_WRITE
3836  part = Min(part, 4096);
3837 #endif
3838 
3839  /* Count our progress. */
3840  sum += part;
3841  offset += part;
3842 
3843  /* Step over iovecs that are done. */
3844  while (iovcnt > 0 && iov->iov_len <= part)
3845  {
3846  part -= iov->iov_len;
3847  ++iov;
3848  --iovcnt;
3849  }
3850 
3851  /* Are they all done? */
3852  if (iovcnt == 0)
3853  {
3854  /* We don't expect the kernel to write more than requested. */
3855  Assert(part == 0);
3856  break;
3857  }
3858 
3859  /*
3860  * Move whatever's left to the front of our mutable copy and adjust
3861  * the leading iovec.
3862  */
3863  Assert(iovcnt > 0);
3864  memmove(iov_copy, iov, sizeof(*iov) * iovcnt);
3865  Assert(iov->iov_len > part);
3866  iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part;
3867  iov_copy[0].iov_len -= part;
3868  iov = iov_copy;
3869  }
3870 
3871  return sum;
3872 }
size_t iov_len
Definition: pg_iovec.h:27
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1577
File lruLessRecently
Definition: fd.c:199
void closeAllVfds(void)
Definition: fd.c:2897
static PgChecksumMode mode
Definition: pg_checksums.c:65
static void BeforeShmemExit_Files(int code, Datum arg)
Definition: fd.c:3055
union AllocateDesc::@18 desc
File nextFree
Definition: fd.c:197
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition: fd.c:942
int pg_file_create_mode
Definition: file_perm.c:19
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1913
#define MAP_FAILED
Definition: mem.h:45
#define DEBUG1
Definition: elog.h:25
int MyProcPid
Definition: globals.c:43
void * iov_base
Definition: pg_iovec.h:26
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1842
#define NUM_RESERVED_FDS
Definition: fd.c:128
static AllocateDesc * allocatedDescs
Definition: fd.c:265
static void pgstat_report_wait_end(void)
Definition: wait_event.h:274
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1564
int pg_fdatasync(int fd)
Definition: fd.c:447
static void error(void)
Definition: sql-dyntest.c:147
#define SYNC_METHOD_FSYNC_WRITETHROUGH
Definition: xlog.h:28
AllocateDescKind
Definition: fd.c:243
DIR * dir
Definition: fd.c:258
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition: fd.c:1785
static Size SizeVfdCache
Definition: fd.c:213
#define FD_TEMP_FILE_LIMIT
Definition: fd.c:190
#define DO_DB(A)
Definition: fd.c:176
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2969
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition: fd.c:3533
int pg_truncate(const char *path, off_t length)
Definition: fd.c:640
long random(void)
Definition: random.c:22
ResourceOwner CurrentResourceOwner
Definition: resowner.c:146
static int numExternalFDs
Definition: fd.c:270
int pg_fsync_writethrough(int fd)
Definition: fd.c:424
int forkname_chars(const char *str, ForkNumber *fork)
Definition: relpath.c:81
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2801
int max_safe_fds
Definition: fd.c:158
#define Min(x, y)
Definition: c.h:986
off_t FileSize(File file)
Definition: fd.c:2285
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:671
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:2518
#define FD_DELETE_AT_CLOSE
Definition: fd.c:188
int log_temp_files
Definition: guc.c:602
mode_t FileGetRawMode(File file)
Definition: fd.c:2373
void _dosmaperr(unsigned long)
Definition: win32error.c:171
static Vfd * VfdCache
Definition: fd.c:212
static void Delete(File file)
Definition: fd.c:1257
int closedir(DIR *)
Definition: dirent.c:123
static int numTempTableSpaces
Definition: fd.c:285
#define PG_TEMP_FILES_DIR
Definition: pg_checksums.c:62
int errcode(int sqlerrcode)
Definition: elog.c:698
#define MemSet(start, val, len)
Definition: c.h:1008
void PathNameDeleteTemporaryDir(const char *dirname)
Definition: fd.c:1672
int pg_fsync_no_writethrough(int fd)
Definition: fd.c:412
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition: fd.c:3303
void pgstat_report_tempfile(size_t filesize)
Definition: pgstat.c:1737
static bool reserveAllocatedDesc(void)
Definition: fd.c:2384
uint32 SubTransactionId
Definition: c.h:591
#define SIGPIPE
Definition: win32_port.h:172
void TempTablespacePath(char *path, Oid tablespace)
Definition: fd.c:1760
#define LOG
Definition: elog.h:26
unsigned int Oid
Definition: postgres_ext.h:31
AllocateDescKind kind
Definition: fd.c:253
char * FilePathName(File file)
Definition: fd.c:2337
Definition: dirent.h:9
#define OidIsValid(objectId)
Definition: c.h:710
#define PANIC
Definition: elog.h:50
#define PG_BINARY
Definition: c.h:1271
static char * basedir
ssize_t pg_pwrite(int fd, const void *buf, size_t nbyte, off_t offset)
Definition: pwrite.c:27
void AtEOXact_Files(bool isCommit)
Definition: fd.c:3041
Oid MyDatabaseTableSpace
Definition: globals.c:90
int ClosePipeStream(FILE *file)
Definition: fd.c:2868
ssize_t pg_pread(int fd, void *buf, size_t nbyte, off_t offset)
Definition: pread.c:27
#define malloc(a)
Definition: header.h:50
static void LruDelete(File file)
Definition: fd.c:1276
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:918
void pg_usleep(long microsec)
Definition: signal.c:53
bool TempTablespacesAreSet(void)
Definition: fd.c:2954
#define fstat
Definition: win32_port.h:282
ssize_t pg_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
Definition: pwritev.c:29
#define fsync(fd)
Definition: win32_port.h:76
static int FreeDesc(AllocateDesc *desc)
Definition: fd.c:2619
void pfree(void *pointer)
Definition: mcxt.c:1169
mode_t fileMode
Definition: fd.c:204
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition: fd.c:3275
static bool ReleaseLruFile(void)
Definition: fd.c:1371
Definition: dirent.c:25
int durable_rename_excl(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:827
#define ERROR
Definition: elog.h:46
#define PG_TEMP_FILE_PREFIX
Definition: pg_checksums.c:63
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2509
static int LruInsert(File file)
Definition: fd.c:1324
#define FATAL
Definition: elog.h:49
int recovery_init_sync_method
Definition: fd.c:164
static bool have_xact_temporary_files
Definition: fd.c:224
#define MAXPGPATH
#define PG_O_DIRECT
Definition: fd.h:95
void ReserveExternalFD(void)
Definition: fd.c:1210
DIR * opendir(const char *)
Definition: dirent.c:33
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2264
#define DEBUG2
Definition: elog.h:24
ssize_t pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
Definition: fd.c:3815
#define TABLESPACE_VERSION_DIRECTORY
Definition: relpath.h:26
char * fileName
Definition: fd.c:201
static char * buf
Definition: pg_test_fsync.c:68
Oid GetNextTempTableSpace(void)
Definition: fd.c:2987
void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition: resowner.c:1297
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3641
File PathNameOpenTemporaryFile(const char *path, int mode)
Definition: fd.c:1882
int errdetail(const char *fmt,...)
Definition: elog.c:1042
void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition: fd.c:3209
char * tablespace
Definition: pgbench.c:226
int errcode_for_file_access(void)
Definition: elog.c:721
void get_parent_directory(char *path)
Definition: path.c:854
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2459
void before_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:333
static int nfile
Definition: fd.c:218
unsigned int uint32
Definition: c.h:441
void SyncDataDirectory(void)
Definition: fd.c:3423
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2720
static int nextTempTableSpace
Definition: fd.c:286
__int64 st_size
Definition: win32_port.h:273
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:410
int max_files_per_process
Definition: fd.c:145
static File AllocateVfd(void)
Definition: fd.c:1403
FILE * OpenPipeStream(const char *command, const char *mode)
Definition: fd.c:2562
unsigned short fdstate
Definition: fd.c:195
Definition: fd.c:192
off_t fileSize
Definition: fd.c:200
int fd
Definition: fd.c:194
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2926
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:697
static void Insert(File file)
Definition: fd.c:1302
ResourceOwner resowner
Definition: fd.c:196
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:258
bool data_sync_retry
Definition: fd.c:161
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3631
int CloseTransientFile(int fd)
Definition: fd.c:2686
#define SIG_IGN
Definition: win32_port.h:164
static void ReportTemporaryFileUsage(const char *path, off_t size)
Definition: fd.c:1517
static void ReleaseLruFiles(void)
Definition: fd.c:1393
#define WARNING
Definition: elog.h:40
#define FileIsNotOpen(file)
Definition: fd.c:185
int pg_dir_create_mode
Definition: file_perm.c:18
static int elevel
Definition: vacuumlazy.c:401
int FileWrite(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:2166
struct vfd Vfd
#define O_DSYNC
Definition: win32_port.h:336
int data_sync_elevel(int elevel)
Definition: fd.c:3805
uintptr_t Datum
Definition: postgres.h:411
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition: fd.c:3008
unsigned short st_mode
Definition: win32_port.h:268
Definition: pg_iovec.h:24
unsigned int Index
Definition: c.h:549
void pg_flush_data(int fd, off_t offset, off_t nbytes)
Definition: fd.c:467
#define FileIsValid(file)
Definition: fd.c:182
bool AcquireExternalFD(void)
Definition: fd.c:1175
FILE * file
Definition: fd.c:257
#define InvalidOid
Definition: postgres_ext.h:36
#define VFD_CLOSED
Definition: fd.c:180
static uint64 temporary_files_size
Definition: fd.c:232
#define ereport(elevel,...)
Definition: elog.h:157
int MakePGDirectory(const char *directoryName)
Definition: fd.c:3782
pqsigfunc pqsignal(int signum, pqsigfunc handler)
Definition: signal.c:170
#define free(a)
Definition: header.h:65
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static void RegisterTemporaryFile(File file)
Definition: fd.c:1536
void FileClose(File file)
Definition: fd.c:1959
#define SIG_DFL
Definition: win32_port.h:162
int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
Definition: fd.c:2059
static int FileAccess(File file)
Definition: fd.c:1481
#define Assert(condition)
Definition: c.h:804
SubTransactionId GetCurrentSubTransactionId(void)
Definition: xact.c:724
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2786
File lruMoreRecently
Definition: fd.c:198
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:2087
void RemovePgTempFiles(void)
Definition: fd.c:3150
SubTransactionId create_subid
Definition: fd.c:254
File OpenTemporaryFile(bool interXact)
Definition: fd.c:1705
size_t Size
Definition: c.h:540
static const char * directory
Definition: zic.c:632
int sync_method
Definition: xlog.c:107
struct dirent * readdir(DIR *)
Definition: dirent.c:78
#define FD_MINFREE
Definition: fd.c:137
bool looks_like_temp_rel_name(const char *name)
Definition: fd.c:3331
#define realloc(a, b)
Definition: header.h:60
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1182
#define INT64_FORMAT
Definition: c.h:483
const char * name
Definition: encode.c:561
static long tempFileCounter
Definition: fd.c:276
int fd
Definition: fd.c:259
#define S_ISDIR(m)
Definition: win32_port.h:324
#define lstat(path, sb)
Definition: win32_port.h:284
int durable_unlink(const char *fname, int elevel)
Definition: fd.c:787
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1069
int FreeFile(FILE *file)
Definition: fd.c:2658
void set_max_safe_fds(void)
Definition: fd.c:1026
bool enableFsync
Definition: globals.c:122
static Oid * tempTableSpaces
Definition: fd.c:284
void ReleaseExternalFD(void)
Definition: fd.c:1228
void * palloc(Size size)
Definition: mcxt.c:1062
int errmsg(const char *fmt,...)
Definition: elog.c:909
int FileGetRawFlags(File file)
Definition: fd.c:2363
void ResourceOwnerEnlargeFiles(ResourceOwner owner)
Definition: resowner.c:1286
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1091
#define elog(elevel,...)
Definition: elog.h:232
int i
#define FD_CLOSE_AT_EOXACT
Definition: fd.c:189
void * arg
int FileGetRawDesc(File file)
Definition: fd.c:2353
static void FreeVfd(File file)
Definition: fd.c:1461
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:120
int pg_fsync(int fd)
Definition: fd.c:357
char d_name[MAX_PATH]
Definition: dirent.h:15
#define mkdir(a, b)
Definition: win32_port.h:71
int link(const char *src, const char *dst)
#define close(a)
Definition: win32.h:12
#define EINTR
Definition: win32_port.h:351
int fileFlags
Definition: fd.c:203
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition: fd.c:1641
int FileRead(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:2110
void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition: resowner.c:1306
#define snprintf
Definition: port.h:217
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2302
int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition: fd.c:3666
static int maxAllocatedDescs
Definition: fd.c:264
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition: fd.c:3078
static int fsync_parent_path(const char *fname, int elevel)
Definition: fd.c:3742
int File
Definition: fd.h:54
int FreeDir(DIR *dir)
Definition: fd.c:2838
int temp_file_limit
Definition: guc.c:609
void InitTemporaryFileAccess(void)
Definition: fd.c:911
Datum subpath(PG_FUNCTION_ARGS)
Definition: ltree_op.c:241
void InitFileAccess(void)
Definition: fd.c:881
#define stat
Definition: win32_port.h:283
static int numAllocatedDescs
Definition: fd.c:263
bool pgwin32_is_junction(const char *path)
#define ftruncate(a, b)
Definition: win32_port.h:73