PostgreSQL Source Code  git master
fd.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  * Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have. (This is around 1024 on many modern
20  * operating systems, but may be lower on others.)
21  *
22  * VFDs are managed as an LRU pool, with actual OS file descriptors
23  * being opened and closed as needed. Obviously, if a file is
24  * opened using these interfaces, all subsequent operations must also
25  * be through these interfaces (the File type is not a real file
26  * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
39  * for a long time, like relation files. It is the caller's responsibility
40  * to close them; there is no automatic mechanism in fd.c for that.
41  *
42  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43  * temporary files that have names so that they can be shared between
44  * backends. Such files are automatically closed and count against the
45  * temporary file limit of the backend that creates them, but unlike anonymous
46  * files they are not automatically deleted. See sharedfileset.c for a shared
47  * ownership mechanism that provides automatic cleanup for shared files when
48  * the last of a group of backends detaches.
49  *
50  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52  * They behave like the corresponding native functions, except that the handle
53  * is registered with the current subtransaction, and will be automatically
54  * closed at abort. These are intended mainly for short operations like
55  * reading a configuration file; there is a limit on the number of files that
56  * can be opened using these functions at any one time.
57  *
58  * Finally, BasicOpenFile is just a thin wrapper around open() that can
59  * release file descriptors in use by the virtual file descriptors if
60  * necessary. There is no automatic cleanup of file descriptors returned by
61  * BasicOpenFile, it is solely the caller's responsibility to close the file
62  * descriptor by calling close(2).
63  *
64  * If a non-virtual file descriptor needs to be held open for any length of
65  * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66  * (and eventually ReleaseExternalFD), so that we can take it into account
67  * while deciding how many VFDs can be open. This applies to FDs obtained
68  * with BasicOpenFile as well as those obtained without use of any fd.c API.
69  *
70  *-------------------------------------------------------------------------
71  */
72 
73 #include "postgres.h"
74 
75 #include <dirent.h>
76 #include <sys/file.h>
77 #include <sys/param.h>
78 #include <sys/stat.h>
79 #include <sys/types.h>
80 #ifndef WIN32
81 #include <sys/mman.h>
82 #endif
83 #include <limits.h>
84 #include <unistd.h>
85 #include <fcntl.h>
86 #ifdef HAVE_SYS_RESOURCE_H
87 #include <sys/resource.h> /* for getrlimit */
88 #endif
89 
90 #include "access/xact.h"
91 #include "access/xlog.h"
92 #include "catalog/pg_tablespace.h"
93 #include "common/file_perm.h"
94 #include "common/file_utils.h"
95 #include "miscadmin.h"
96 #include "pgstat.h"
97 #include "port/pg_iovec.h"
98 #include "portability/mem.h"
99 #include "storage/fd.h"
100 #include "storage/ipc.h"
101 #include "utils/guc.h"
102 #include "utils/resowner_private.h"
103 
104 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
105 #if defined(HAVE_SYNC_FILE_RANGE)
106 #define PG_FLUSH_DATA_WORKS 1
107 #elif !defined(WIN32) && defined(MS_ASYNC)
108 #define PG_FLUSH_DATA_WORKS 1
109 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
110 #define PG_FLUSH_DATA_WORKS 1
111 #endif
112 
113 /*
114  * We must leave some file descriptors free for system(), the dynamic loader,
115  * and other code that tries to open files without consulting fd.c. This
116  * is the number left free. (While we try fairly hard to prevent EMFILE
117  * errors, there's never any guarantee that we won't get ENFILE due to
118  * other processes chewing up FDs. So it's a bad idea to try to open files
119  * without consulting fd.c. Nonetheless we cannot control all code.)
120  *
121  * Because this is just a fixed setting, we are effectively assuming that
122  * no such code will leave FDs open over the long term; otherwise the slop
123  * is likely to be insufficient. Note in particular that we expect that
124  * loading a shared library does not result in any permanent increase in
125  * the number of open files. (This appears to be true on most if not
126  * all platforms as of Feb 2004.)
127  */
128 #define NUM_RESERVED_FDS 10
129 
130 /*
131  * If we have fewer than this many usable FDs after allowing for the reserved
132  * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
133  * much less than that. Note that this value ensures numExternalFDs can be
134  * at least 16; as of this writing, the contrib/postgres_fdw regression tests
135  * will not pass unless that can grow to at least 14.)
136  */
137 #define FD_MINFREE 48
138 
139 /*
140  * A number of platforms allow individual processes to open many more files
141  * than they can really support when *many* processes do the same thing.
142  * This GUC parameter lets the DBA limit max_safe_fds to something less than
143  * what the postmaster's initial probe suggests will work.
144  */
146 
147 /*
148  * Maximum number of file descriptors to open for operations that fd.c knows
149  * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
150  * to a conservative value, and remains that way indefinitely in bootstrap or
151  * standalone-backend cases. In normal postmaster operation, the postmaster
152  * calls set_max_safe_fds() late in initialization to update the value, and
153  * that value is then inherited by forked subprocesses.
154  *
155  * Note: the value of max_files_per_process is taken into account while
156  * setting this variable, and so need not be tested separately.
157  */
158 int max_safe_fds = FD_MINFREE; /* default if not changed */
159 
160 /* Whether it is safe to continue running after fsync() fails. */
161 bool data_sync_retry = false;
162 
163 /* How SyncDataDirectory() should do its job. */
165 
166 /* Debugging.... */
167 
168 #ifdef FDDEBUG
169 #define DO_DB(A) \
170  do { \
171  int _do_db_save_errno = errno; \
172  A; \
173  errno = _do_db_save_errno; \
174  } while (0)
175 #else
176 #define DO_DB(A) \
177  ((void) 0)
178 #endif
179 
180 #define VFD_CLOSED (-1)
181 
182 #define FileIsValid(file) \
183  ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
184 
185 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
186 
187 /* these are the assigned bits in fdstate below: */
188 #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
189 #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
190 #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
191 
192 typedef struct vfd
193 {
194  int fd; /* current FD, or VFD_CLOSED if none */
195  unsigned short fdstate; /* bitflags for VFD's state */
196  ResourceOwner resowner; /* owner, for automatic cleanup */
197  File nextFree; /* link to next free VFD, if in freelist */
198  File lruMoreRecently; /* doubly linked recency-of-use list */
200  off_t fileSize; /* current size of file (0 if not temporary) */
201  char *fileName; /* name of file, or NULL for unused VFD */
202  /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
203  int fileFlags; /* open(2) flags for (re)opening the file */
204  mode_t fileMode; /* mode to pass to open(2) */
205 } Vfd;
206 
207 /*
208  * Virtual File Descriptor array pointer and size. This grows as
209  * needed. 'File' values are indexes into this array.
210  * Note that VfdCache[0] is not a usable VFD, just a list header.
211  */
212 static Vfd *VfdCache;
213 static Size SizeVfdCache = 0;
214 
215 /*
216  * Number of file descriptors known to be in use by VFD entries.
217  */
218 static int nfile = 0;
219 
220 /*
221  * Flag to tell whether it's worth scanning VfdCache looking for temp files
222  * to close
223  */
224 static bool have_xact_temporary_files = false;
225 
226 /*
227  * Tracks the total size of all temporary files. Note: when temp_file_limit
228  * is being enforced, this cannot overflow since the limit cannot be more
229  * than INT_MAX kilobytes. When not enforcing, it could theoretically
230  * overflow, but we don't care.
231  */
232 static uint64 temporary_files_size = 0;
233 
234 /*
235  * List of OS handles opened with AllocateFile, AllocateDir and
236  * OpenTransientFile.
237  */
238 typedef enum
239 {
245 
246 typedef struct
247 {
250  union
251  {
252  FILE *file;
254  int fd;
255  } desc;
256 } AllocateDesc;
257 
258 static int numAllocatedDescs = 0;
259 static int maxAllocatedDescs = 0;
261 
262 /*
263  * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
264  */
265 static int numExternalFDs = 0;
266 
267 /*
268  * Number of temporary files opened during the current session;
269  * this is used in generation of tempfile names.
270  */
271 static long tempFileCounter = 0;
272 
273 /*
274  * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
275  * indicating that the current database's default tablespace should be used.)
276  * When numTempTableSpaces is -1, this has not been set in the current
277  * transaction.
278  */
279 static Oid *tempTableSpaces = NULL;
280 static int numTempTableSpaces = -1;
281 static int nextTempTableSpace = 0;
282 
283 
284 /*--------------------
285  *
286  * Private Routines
287  *
288  * Delete - delete a file from the Lru ring
289  * LruDelete - remove a file from the Lru ring and close its FD
290  * Insert - put a file at the front of the Lru ring
291  * LruInsert - put a file at the front of the Lru ring and open it
292  * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
293  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
294  * AllocateVfd - grab a free (or new) file record (from VfdCache)
295  * FreeVfd - free a file record
296  *
297  * The Least Recently Used ring is a doubly linked list that begins and
298  * ends on element zero. Element zero is special -- it doesn't represent
299  * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
300  * anchor that shows us the beginning/end of the ring.
301  * Only VFD elements that are currently really open (have an FD assigned) are
302  * in the Lru ring. Elements that are "virtually" open can be recognized
303  * by having a non-null fileName field.
304  *
305  * example:
306  *
307  * /--less----\ /---------\
308  * v \ v \
309  * #0 --more---> LeastRecentlyUsed --more-\ \
310  * ^\ | |
311  * \\less--> MostRecentlyUsedFile <---/ |
312  * \more---/ \--less--/
313  *
314  *--------------------
315  */
316 static void Delete(File file);
317 static void LruDelete(File file);
318 static void Insert(File file);
319 static int LruInsert(File file);
320 static bool ReleaseLruFile(void);
321 static void ReleaseLruFiles(void);
322 static File AllocateVfd(void);
323 static void FreeVfd(File file);
324 
325 static int FileAccess(File file);
326 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
327 static bool reserveAllocatedDesc(void);
328 static int FreeDesc(AllocateDesc *desc);
329 
330 static void AtProcExit_Files(int code, Datum arg);
331 static void CleanupTempFiles(bool isCommit, bool isProcExit);
332 static void RemovePgTempRelationFiles(const char *tsdirname);
333 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
334 
335 static void walkdir(const char *path,
336  void (*action) (const char *fname, bool isdir, int elevel),
337  bool process_symlinks,
338  int elevel);
339 #ifdef PG_FLUSH_DATA_WORKS
340 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
341 #endif
342 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
343 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
344 
345 static int fsync_parent_path(const char *fname, int elevel);
346 
347 
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * fd is an open kernel file descriptor.  The return value is that of
 * pg_fsync_writethrough() or pg_fsync_no_writethrough(), whichever is
 * selected below.
 */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
	struct stat st;

	/*
	 * Some operating system implementations of fsync() have requirements
	 * about the file access modes that were used when their file descriptor
	 * argument was opened, and these requirements differ depending on whether
	 * the file descriptor is for a directory.
	 *
	 * For any file descriptor that may eventually be handed to fsync(), we
	 * should have opened it with access modes that are compatible with
	 * fsync() on all supported systems, otherwise the code may not be
	 * portable, even if it runs ok on the current system.
	 *
	 * We assert here that a descriptor for a file was opened with write
	 * permissions (either O_RDWR or O_WRONLY) and for a directory without
	 * write permissions (O_RDONLY).
	 *
	 * Ignore any fstat errors and let the follow-up fsync() do its work.
	 * Doing this sanity check here counts for the case where fsync() is
	 * disabled.
	 */
	if (fstat(fd, &st) == 0)
	{
		int			desc_flags = fcntl(fd, F_GETFL);

		/*
		 * O_RDONLY is historically 0, so just make sure that for directories
		 * no write flags are used.
		 */
		if (S_ISDIR(st.st_mode))
			Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
		else
			Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
	}
	/* don't let the sanity check leak an errno value to the caller */
	errno = 0;
#endif

	/* #if is to skip the sync_method test if there's no need for it */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
	else
#endif
		return pg_fsync_no_writethrough(fd);
}
400 
401 
402 /*
403  * pg_fsync_no_writethrough --- same as fsync except does nothing if
404  * enableFsync is off
405  */
406 int
408 {
409  if (enableFsync)
410  return fsync(fd);
411  else
412  return 0;
413 }
414 
415 /*
416  * pg_fsync_writethrough
417  */
418 int
420 {
421  if (enableFsync)
422  {
423 #ifdef WIN32
424  return _commit(fd);
425 #elif defined(F_FULLFSYNC)
426  return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
427 #else
428  errno = ENOSYS;
429  return -1;
430 #endif
431  }
432  else
433  return 0;
434 }
435 
436 /*
437  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
438  *
439  * Not all platforms have fdatasync; treat as fsync if not available.
440  */
441 int
443 {
444  if (enableFsync)
445  {
446 #ifdef HAVE_FDATASYNC
447  return fdatasync(fd);
448 #else
449  return fsync(fd);
450 #endif
451  }
452  else
453  return 0;
454 }
455 
456 /*
457  * pg_flush_data --- advise OS that the described dirty data should be flushed
458  *
459  * offset of 0 with nbytes 0 means that the entire file should be flushed
460  */
461 void
462 pg_flush_data(int fd, off_t offset, off_t nbytes)
463 {
464  /*
465  * Right now file flushing is primarily used to avoid making later
466  * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
467  * if fsyncs are disabled - that's a decision we might want to make
468  * configurable at some point.
469  */
470  if (!enableFsync)
471  return;
472 
473  /*
474  * We compile all alternatives that are supported on the current platform,
475  * to find portability problems more easily.
476  */
477 #if defined(HAVE_SYNC_FILE_RANGE)
478  {
479  int rc;
480  static bool not_implemented_by_kernel = false;
481 
482  if (not_implemented_by_kernel)
483  return;
484 
485  /*
486  * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
487  * tells the OS that writeback for the specified blocks should be
488  * started, but that we don't want to wait for completion. Note that
489  * this call might block if too much dirty data exists in the range.
490  * This is the preferable method on OSs supporting it, as it works
491  * reliably when available (contrast to msync()) and doesn't flush out
492  * clean data (like FADV_DONTNEED).
493  */
494  rc = sync_file_range(fd, offset, nbytes,
495  SYNC_FILE_RANGE_WRITE);
496  if (rc != 0)
497  {
498  int elevel;
499 
500  /*
501  * For systems that don't have an implementation of
502  * sync_file_range() such as Windows WSL, generate only one
503  * warning and then suppress all further attempts by this process.
504  */
505  if (errno == ENOSYS)
506  {
507  elevel = WARNING;
508  not_implemented_by_kernel = true;
509  }
510  else
511  elevel = data_sync_elevel(WARNING);
512 
513  ereport(elevel,
515  errmsg("could not flush dirty data: %m")));
516  }
517 
518  return;
519  }
520 #endif
521 #if !defined(WIN32) && defined(MS_ASYNC)
522  {
523  void *p;
524  static int pagesize = 0;
525 
526  /*
527  * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
528  * writeback. On linux it only does so if MS_SYNC is specified, but
529  * then it does the writeback synchronously. Luckily all common linux
530  * systems have sync_file_range(). This is preferable over
531  * FADV_DONTNEED because it doesn't flush out clean data.
532  *
533  * We map the file (mmap()), tell the kernel to sync back the contents
534  * (msync()), and then remove the mapping again (munmap()).
535  */
536 
537  /* mmap() needs actual length if we want to map whole file */
538  if (offset == 0 && nbytes == 0)
539  {
540  nbytes = lseek(fd, 0, SEEK_END);
541  if (nbytes < 0)
542  {
545  errmsg("could not determine dirty data size: %m")));
546  return;
547  }
548  }
549 
550  /*
551  * Some platforms reject partial-page mmap() attempts. To deal with
552  * that, just truncate the request to a page boundary. If any extra
553  * bytes don't get flushed, well, it's only a hint anyway.
554  */
555 
556  /* fetch pagesize only once */
557  if (pagesize == 0)
558  pagesize = sysconf(_SC_PAGESIZE);
559 
560  /* align length to pagesize, dropping any fractional page */
561  if (pagesize > 0)
562  nbytes = (nbytes / pagesize) * pagesize;
563 
564  /* fractional-page request is a no-op */
565  if (nbytes <= 0)
566  return;
567 
568  /*
569  * mmap could well fail, particularly on 32-bit platforms where there
570  * may simply not be enough address space. If so, silently fall
571  * through to the next implementation.
572  */
573  if (nbytes <= (off_t) SSIZE_MAX)
574  p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
575  else
576  p = MAP_FAILED;
577 
578  if (p != MAP_FAILED)
579  {
580  int rc;
581 
582  rc = msync(p, (size_t) nbytes, MS_ASYNC);
583  if (rc != 0)
584  {
587  errmsg("could not flush dirty data: %m")));
588  /* NB: need to fall through to munmap()! */
589  }
590 
591  rc = munmap(p, (size_t) nbytes);
592  if (rc != 0)
593  {
594  /* FATAL error because mapping would remain */
595  ereport(FATAL,
597  errmsg("could not munmap() while flushing data: %m")));
598  }
599 
600  return;
601  }
602  }
603 #endif
604 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
605  {
606  int rc;
607 
608  /*
609  * Signal the kernel that the passed in range should not be cached
610  * anymore. This has the, desired, side effect of writing out dirty
611  * data, and the, undesired, side effect of likely discarding useful
612  * clean cached blocks. For the latter reason this is the least
613  * preferable method.
614  */
615 
616  rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
617 
618  if (rc != 0)
619  {
620  /* don't error out, this is just a performance optimization */
623  errmsg("could not flush dirty data: %m")));
624  }
625 
626  return;
627  }
628 #endif
629 }
630 
/*
 * Truncate a file to a given length by name.
 *
 * path names the file and length is the size it should have afterwards.
 * Returns 0 on success, or -1 with errno set on failure.
 *
 * On Windows the file is opened and ftruncate() is applied to the
 * descriptor; elsewhere truncate() is called directly.
 */
int
pg_truncate(const char *path, off_t length)
{
#ifdef WIN32
	int			save_errno;
	int			ret;
	int			fd;

	fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
	if (fd >= 0)
	{
		/* honor the requested length, as the non-Windows branch does */
		ret = ftruncate(fd, length);

		/* keep ftruncate's errno from being clobbered by the close */
		save_errno = errno;
		CloseTransientFile(fd);
		errno = save_errno;
	}
	else
		ret = -1;

	return ret;
#else
	return truncate(path, length);
#endif
}
658 
659 /*
660  * fsync_fname -- fsync a file or directory, handling errors properly
661  *
662  * Try to fsync a file or directory. When doing the latter, ignore errors that
663  * indicate the OS just doesn't allow/require fsyncing directories.
664  */
665 void
666 fsync_fname(const char *fname, bool isdir)
667 {
668  fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
669 }
670 
671 /*
672  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
673  *
674  * This routine ensures that, after returning, the effect of renaming file
675  * persists in case of a crash. A crash while this routine is running will
676  * leave you with either the pre-existing or the moved file in place of the
677  * new file; no mixed state or truncated files are possible.
678  *
679  * It does so by using fsync on the old filename and the possibly existing
680  * target filename before the rename, and the target file and directory after.
681  *
682  * Note that rename() cannot be used across arbitrary directories, as they
683  * might not be on the same filesystem. Therefore this routine does not
684  * support renaming across directories.
685  *
686  * Log errors with the caller specified severity.
687  *
688  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
689  * valid upon return.
690  */
691 int
692 durable_rename(const char *oldfile, const char *newfile, int elevel)
693 {
694  int fd;
695 
696  /*
697  * First fsync the old and target path (if it exists), to ensure that they
698  * are properly persistent on disk. Syncing the target file is not
699  * strictly necessary, but it makes it easier to reason about crashes;
700  * because it's then guaranteed that either source or target file exists
701  * after a crash.
702  */
703  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
704  return -1;
705 
706  fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
707  if (fd < 0)
708  {
709  if (errno != ENOENT)
710  {
711  ereport(elevel,
713  errmsg("could not open file \"%s\": %m", newfile)));
714  return -1;
715  }
716  }
717  else
718  {
719  if (pg_fsync(fd) != 0)
720  {
721  int save_errno;
722 
723  /* close file upon error, might not be in transaction context */
724  save_errno = errno;
725  CloseTransientFile(fd);
726  errno = save_errno;
727 
728  ereport(elevel,
730  errmsg("could not fsync file \"%s\": %m", newfile)));
731  return -1;
732  }
733 
734  if (CloseTransientFile(fd) != 0)
735  {
736  ereport(elevel,
738  errmsg("could not close file \"%s\": %m", newfile)));
739  return -1;
740  }
741  }
742 
743  /* Time to do the real deal... */
744  if (rename(oldfile, newfile) < 0)
745  {
746  ereport(elevel,
748  errmsg("could not rename file \"%s\" to \"%s\": %m",
749  oldfile, newfile)));
750  return -1;
751  }
752 
753  /*
754  * To guarantee renaming the file is persistent, fsync the file with its
755  * new name, and its containing directory.
756  */
757  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
758  return -1;
759 
760  if (fsync_parent_path(newfile, elevel) != 0)
761  return -1;
762 
763  return 0;
764 }
765 
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * This routine ensures that, after returning, the effect of removing file
 * persists in case of a crash. A crash while this routine is running will
 * leave the system in no mixed state.
 *
 * It does so by using fsync on the parent directory of the file after the
 * actual removal is done.
 *
 * fname is the file to remove; errors are logged with the severity
 * specified by caller (elevel).
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	if (unlink(fname) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/*
	 * To guarantee that the removal of the file is persistent, fsync its
	 * parent directory.
	 */
	if (fsync_parent_path(fname, elevel) != 0)
		return -1;

	return 0;
}
802 
/*
 * durable_rename_excl -- rename a file in a durable manner.
 *
 * Similar to durable_rename(), except that this routine tries (but does not
 * guarantee) not to overwrite the target file.
 *
 * Note that a crash in an unfortunate moment can leave you with two links to
 * the target file.
 *
 * Log errors with the caller specified severity.
 *
 * On Windows, using a hard link followed by unlink() causes concurrency
 * issues, while a simple rename() does not cause that, so be careful when
 * changing the logic of this routine.
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_rename_excl(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * Ensure that, if we crash directly after the rename/link, a file with
	 * valid contents is moved into place.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

#ifdef HAVE_WORKING_LINK
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
	/* the result of removing the old name is not checked */
	unlink(oldfile);
#else
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
#endif

	/*
	 * Make change persistent in case of an OS crash, both the new entry and
	 * its parent directory need to be flushed.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0)
		return -1;

	/* Same for parent directory */
	if (fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}
865 
866 /*
867  * InitFileAccess --- initialize this module during backend startup
868  *
869  * This is called during either normal or standalone backend start.
870  * It is *not* called in the postmaster.
871  */
872 void
874 {
875  Assert(SizeVfdCache == 0); /* call me only once */
876 
877  /* initialize cache header entry */
878  VfdCache = (Vfd *) malloc(sizeof(Vfd));
879  if (VfdCache == NULL)
880  ereport(FATAL,
881  (errcode(ERRCODE_OUT_OF_MEMORY),
882  errmsg("out of memory")));
883 
884  MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
885  VfdCache->fd = VFD_CLOSED;
886 
887  SizeVfdCache = 1;
888 
889  /* register proc-exit hook to ensure temp files are dropped at exit */
891 }
892 
893 /*
894  * count_usable_fds --- count how many FDs the system will let us open,
895  * and estimate how many are already open.
896  *
897  * We stop counting if usable_fds reaches max_to_probe. Note: a small
898  * value of max_to_probe might result in an underestimate of already_open;
899  * we must fill in any "gaps" in the set of used FDs before the calculation
900  * of already_open will give the right answer. In practice, max_to_probe
901  * of a couple of dozen should be enough to ensure good results.
902  *
903  * We assume stdin (FD 0) is available for dup'ing
904  */
905 static void
906 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
907 {
908  int *fd;
909  int size;
910  int used = 0;
911  int highestfd = 0;
912  int j;
913 
914 #ifdef HAVE_GETRLIMIT
915  struct rlimit rlim;
916  int getrlimit_status;
917 #endif
918 
919  size = 1024;
920  fd = (int *) palloc(size * sizeof(int));
921 
922 #ifdef HAVE_GETRLIMIT
923 #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
924  getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
925 #else /* but BSD doesn't ... */
926  getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
927 #endif /* RLIMIT_NOFILE */
928  if (getrlimit_status != 0)
929  ereport(WARNING, (errmsg("getrlimit failed: %m")));
930 #endif /* HAVE_GETRLIMIT */
931 
932  /* dup until failure or probe limit reached */
933  for (;;)
934  {
935  int thisfd;
936 
937 #ifdef HAVE_GETRLIMIT
938 
939  /*
940  * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
941  * some platforms
942  */
943  if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
944  break;
945 #endif
946 
947  thisfd = dup(0);
948  if (thisfd < 0)
949  {
950  /* Expect EMFILE or ENFILE, else it's fishy */
951  if (errno != EMFILE && errno != ENFILE)
952  elog(WARNING, "dup(0) failed after %d successes: %m", used);
953  break;
954  }
955 
956  if (used >= size)
957  {
958  size *= 2;
959  fd = (int *) repalloc(fd, size * sizeof(int));
960  }
961  fd[used++] = thisfd;
962 
963  if (highestfd < thisfd)
964  highestfd = thisfd;
965 
966  if (used >= max_to_probe)
967  break;
968  }
969 
970  /* release the files we opened */
971  for (j = 0; j < used; j++)
972  close(fd[j]);
973 
974  pfree(fd);
975 
976  /*
977  * Return results. usable_fds is just the number of successful dups. We
978  * assume that the system limit is highestfd+1 (remember 0 is a legal FD
979  * number) and so already_open is highestfd+1 - usable_fds.
980  */
981  *usable_fds = used;
982  *already_open = highestfd + 1 - used;
983 }
984 
985 /*
986  * set_max_safe_fds
987  * Determine number of file descriptors that fd.c is allowed to use
988  */
989 void
991 {
992  int usable_fds;
993  int already_open;
994 
995  /*----------
996  * We want to set max_safe_fds to
997  * MIN(usable_fds, max_files_per_process - already_open)
998  * less the slop factor for files that are opened without consulting
999  * fd.c. This ensures that we won't exceed either max_files_per_process
1000  * or the experimentally-determined EMFILE limit.
1001  *----------
1002  */
1004  &usable_fds, &already_open);
1005 
1006  max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
1007 
1008  /*
1009  * Take off the FDs reserved for system() etc.
1010  */
1012 
1013  /*
1014  * Make sure we still have enough to get by.
1015  */
1016  if (max_safe_fds < FD_MINFREE)
1017  ereport(FATAL,
1018  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1019  errmsg("insufficient file descriptors available to start server process"),
1020  errdetail("System allows %d, we need at least %d.",
1023 
1024  elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1025  max_safe_fds, usable_fds, already_open);
1026 }
1027 
1028 /*
1029  * Open a file with BasicOpenFilePerm() and pass default file mode for the
1030  * fileMode parameter.
1031  */
1032 int
1034 {
1035  return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1036 }
1037 
1038 /*
1039  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1040  *
1041  * This is exported for use by places that really want a plain kernel FD,
1042  * but need to be proof against running out of FDs. Once an FD has been
1043  * successfully returned, it is the caller's responsibility to ensure that
1044  * it will not be leaked on ereport()! Most users should *not* call this
1045  * routine directly, but instead use the VFD abstraction level, which
1046  * provides protection against descriptor leaks as well as management of
1047  * files that need to be open for more than a short period of time.
1048  *
1049  * Ideally this should be the *only* direct call of open() in the backend.
1050  * In practice, the postmaster calls open() directly, and there are some
1051  * direct open() calls done early in backend startup. Those are OK since
1052  * this module wouldn't have any open files to close at that point anyway.
1053  */
1054 int
1056 {
1057  int fd;
1058 
1059 tryAgain:
1060 #ifdef PG_O_DIRECT_USE_F_NOCACHE
1061 
1062  /*
1063  * The value we defined to stand in for O_DIRECT when simulating it with
1064  * F_NOCACHE had better not collide with any of the standard flags.
1065  */
1067  (O_APPEND |
1068  O_CREAT |
1069  O_EXCL |
1070  O_RDWR |
1071  O_RDONLY |
1072  O_SYNC |
1073  O_TRUNC |
1074  O_WRONLY)) == 0,
1075  "PG_O_DIRECT value collides with standard flag");
1076 #if defined(O_CLOEXEC)
1077  StaticAssertStmt((PG_O_DIRECT & O_CLOEXEC) == 0,
1078  "PG_O_DIRECT value collides with O_CLOEXEC");
1079 #endif
1080 #if defined(O_DSYNC)
1082  "PG_O_DIRECT value collides with O_DSYNC");
1083 #endif
1084 
1085  fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1086 #else
1087  fd = open(fileName, fileFlags, fileMode);
1088 #endif
1089 
1090  if (fd >= 0)
1091  {
1092 #ifdef PG_O_DIRECT_USE_F_NOCACHE
1093  if (fileFlags & PG_O_DIRECT)
1094  {
1095  if (fcntl(fd, F_NOCACHE, 1) < 0)
1096  {
1097  int save_errno = errno;
1098 
1099  close(fd);
1100  errno = save_errno;
1101  return -1;
1102  }
1103  }
1104 #endif
1105 
1106  return fd; /* success! */
1107  }
1108 
1109  if (errno == EMFILE || errno == ENFILE)
1110  {
1111  int save_errno = errno;
1112 
1113  ereport(LOG,
1114  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1115  errmsg("out of file descriptors: %m; release and retry")));
1116  errno = 0;
1117  if (ReleaseLruFile())
1118  goto tryAgain;
1119  errno = save_errno;
1120  }
1121 
1122  return -1; /* failure */
1123 }
1124 
1125 /*
1126  * AcquireExternalFD - attempt to reserve an external file descriptor
1127  *
1128  * This should be used by callers that need to hold a file descriptor open
1129  * over more than a short interval, but cannot use any of the other facilities
1130  * provided by this module.
1131  *
1132  * The difference between this and the underlying ReserveExternalFD function
1133  * is that this will report failure (by setting errno and returning false)
1134  * if "too many" external FDs are already reserved. This should be used in
1135  * any code where the total number of FDs to be reserved is not predictable
1136  * and small.
1137  */
1138 bool
1140 {
1141  /*
1142  * We don't want more than max_safe_fds / 3 FDs to be consumed for
1143  * "external" FDs.
1144  */
1145  if (numExternalFDs < max_safe_fds / 3)
1146  {
1148  return true;
1149  }
1150  errno = EMFILE;
1151  return false;
1152 }
1153 
1154 /*
1155  * ReserveExternalFD - report external consumption of a file descriptor
1156  *
1157  * This should be used by callers that need to hold a file descriptor open
1158  * over more than a short interval, but cannot use any of the other facilities
1159  * provided by this module. This just tracks the use of the FD and closes
1160  * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1161  *
1162  * Call this directly only in code where failure to reserve the FD would be
1163  * fatal; for example, the WAL-writing code does so, since the alternative is
1164  * session failure. Also, it's very unwise to do so in code that could
1165  * consume more than one FD per process.
1166  *
1167  * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1168  * available, it doesn't matter too much whether this is called before or
1169  * after actually opening the FD; but doing so beforehand reduces the risk of
1170  * an EMFILE failure if not everybody played nice. In any case, it's solely
1171  * caller's responsibility to keep the external-FD count in sync with reality.
1172  */
1173 void
1175 {
1176  /*
1177  * Release VFDs if needed to stay safe. Because we do this before
1178  * incrementing numExternalFDs, the final state will be as desired, i.e.,
1179  * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1180  */
1181  ReleaseLruFiles();
1182 
1183  numExternalFDs++;
1184 }
1185 
1186 /*
1187  * ReleaseExternalFD - report release of an external file descriptor
1188  *
1189  * This is guaranteed not to change errno, so it can be used in failure paths.
1190  */
1191 void
1193 {
1194  Assert(numExternalFDs > 0);
1195  numExternalFDs--;
1196 }
1197 
1198 
1199 #if defined(FDDEBUG)
1200 
1201 static void
1202 _dump_lru(void)
1203 {
1204  int mru = VfdCache[0].lruLessRecently;
1205  Vfd *vfdP = &VfdCache[mru];
1206  char buf[2048];
1207 
1208  snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
1209  while (mru != 0)
1210  {
1211  mru = vfdP->lruLessRecently;
1212  vfdP = &VfdCache[mru];
1213  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
1214  }
1215  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
1216  elog(LOG, "%s", buf);
1217 }
1218 #endif /* FDDEBUG */
1219 
1220 static void
1222 {
1223  Vfd *vfdP;
1224 
1225  Assert(file != 0);
1226 
1227  DO_DB(elog(LOG, "Delete %d (%s)",
1228  file, VfdCache[file].fileName));
1229  DO_DB(_dump_lru());
1230 
1231  vfdP = &VfdCache[file];
1232 
1233  VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1234  VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1235 
1236  DO_DB(_dump_lru());
1237 }
1238 
1239 static void
1241 {
1242  Vfd *vfdP;
1243 
1244  Assert(file != 0);
1245 
1246  DO_DB(elog(LOG, "LruDelete %d (%s)",
1247  file, VfdCache[file].fileName));
1248 
1249  vfdP = &VfdCache[file];
1250 
1251  /*
1252  * Close the file. We aren't expecting this to fail; if it does, better
1253  * to leak the FD than to mess up our internal state.
1254  */
1255  if (close(vfdP->fd) != 0)
1257  "could not close file \"%s\": %m", vfdP->fileName);
1258  vfdP->fd = VFD_CLOSED;
1259  --nfile;
1260 
1261  /* delete the vfd record from the LRU ring */
1262  Delete(file);
1263 }
1264 
1265 static void
1267 {
1268  Vfd *vfdP;
1269 
1270  Assert(file != 0);
1271 
1272  DO_DB(elog(LOG, "Insert %d (%s)",
1273  file, VfdCache[file].fileName));
1274  DO_DB(_dump_lru());
1275 
1276  vfdP = &VfdCache[file];
1277 
1278  vfdP->lruMoreRecently = 0;
1279  vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1280  VfdCache[0].lruLessRecently = file;
1281  VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1282 
1283  DO_DB(_dump_lru());
1284 }
1285 
1286 /* returns 0 on success, -1 on re-open failure (with errno set) */
1287 static int
1289 {
1290  Vfd *vfdP;
1291 
1292  Assert(file != 0);
1293 
1294  DO_DB(elog(LOG, "LruInsert %d (%s)",
1295  file, VfdCache[file].fileName));
1296 
1297  vfdP = &VfdCache[file];
1298 
1299  if (FileIsNotOpen(file))
1300  {
1301  /* Close excess kernel FDs. */
1302  ReleaseLruFiles();
1303 
1304  /*
1305  * The open could still fail for lack of file descriptors, eg due to
1306  * overall system file table being full. So, be prepared to release
1307  * another FD if necessary...
1308  */
1309  vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1310  vfdP->fileMode);
1311  if (vfdP->fd < 0)
1312  {
1313  DO_DB(elog(LOG, "re-open failed: %m"));
1314  return -1;
1315  }
1316  else
1317  {
1318  ++nfile;
1319  }
1320  }
1321 
1322  /*
1323  * put it at the head of the Lru ring
1324  */
1325 
1326  Insert(file);
1327 
1328  return 0;
1329 }
1330 
1331 /*
1332  * Release one kernel FD by closing the least-recently-used VFD.
1333  */
1334 static bool
1336 {
1337  DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1338 
1339  if (nfile > 0)
1340  {
1341  /*
1342  * There are opened files and so there should be at least one used vfd
1343  * in the ring.
1344  */
1345  Assert(VfdCache[0].lruMoreRecently != 0);
1346  LruDelete(VfdCache[0].lruMoreRecently);
1347  return true; /* freed a file */
1348  }
1349  return false; /* no files available to free */
1350 }
1351 
1352 /*
1353  * Release kernel FDs as needed to get under the max_safe_fds limit.
1354  * After calling this, it's OK to try to open another file.
1355  */
1356 static void
1358 {
1360  {
1361  if (!ReleaseLruFile())
1362  break;
1363  }
1364 }
1365 
1366 static File
1368 {
1369  Index i;
1370  File file;
1371 
1372  DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1373 
1374  Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1375 
1376  if (VfdCache[0].nextFree == 0)
1377  {
1378  /*
1379  * The free list is empty so it is time to increase the size of the
1380  * array. We choose to double it each time this happens. However,
1381  * there's not much point in starting *real* small.
1382  */
1383  Size newCacheSize = SizeVfdCache * 2;
1384  Vfd *newVfdCache;
1385 
1386  if (newCacheSize < 32)
1387  newCacheSize = 32;
1388 
1389  /*
1390  * Be careful not to clobber VfdCache ptr if realloc fails.
1391  */
1392  newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1393  if (newVfdCache == NULL)
1394  ereport(ERROR,
1395  (errcode(ERRCODE_OUT_OF_MEMORY),
1396  errmsg("out of memory")));
1397  VfdCache = newVfdCache;
1398 
1399  /*
1400  * Initialize the new entries and link them into the free list.
1401  */
1402  for (i = SizeVfdCache; i < newCacheSize; i++)
1403  {
1404  MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1405  VfdCache[i].nextFree = i + 1;
1406  VfdCache[i].fd = VFD_CLOSED;
1407  }
1408  VfdCache[newCacheSize - 1].nextFree = 0;
1409  VfdCache[0].nextFree = SizeVfdCache;
1410 
1411  /*
1412  * Record the new size
1413  */
1414  SizeVfdCache = newCacheSize;
1415  }
1416 
1417  file = VfdCache[0].nextFree;
1418 
1419  VfdCache[0].nextFree = VfdCache[file].nextFree;
1420 
1421  return file;
1422 }
1423 
1424 static void
1426 {
1427  Vfd *vfdP = &VfdCache[file];
1428 
1429  DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1430  file, vfdP->fileName ? vfdP->fileName : ""));
1431 
1432  if (vfdP->fileName != NULL)
1433  {
1434  free(vfdP->fileName);
1435  vfdP->fileName = NULL;
1436  }
1437  vfdP->fdstate = 0x0;
1438 
1439  vfdP->nextFree = VfdCache[0].nextFree;
1440  VfdCache[0].nextFree = file;
1441 }
1442 
1443 /* returns 0 on success, -1 on re-open failure (with errno set) */
1444 static int
1446 {
1447  int returnValue;
1448 
1449  DO_DB(elog(LOG, "FileAccess %d (%s)",
1450  file, VfdCache[file].fileName));
1451 
1452  /*
1453  * Is the file open? If not, open it and put it at the head of the LRU
1454  * ring (possibly closing the least recently used file to get an FD).
1455  */
1456 
1457  if (FileIsNotOpen(file))
1458  {
1459  returnValue = LruInsert(file);
1460  if (returnValue != 0)
1461  return returnValue;
1462  }
1463  else if (VfdCache[0].lruLessRecently != file)
1464  {
1465  /*
1466  * We now know that the file is open and that it is not the last one
1467  * accessed, so we need to move it to the head of the Lru ring.
1468  */
1469 
1470  Delete(file);
1471  Insert(file);
1472  }
1473 
1474  return 0;
1475 }
1476 
1477 /*
1478  * Called whenever a temporary file is deleted to report its size.
1479  */
1480 static void
1481 ReportTemporaryFileUsage(const char *path, off_t size)
1482 {
1483  pgstat_report_tempfile(size);
1484 
1485  if (log_temp_files >= 0)
1486  {
1487  if ((size / 1024) >= log_temp_files)
1488  ereport(LOG,
1489  (errmsg("temporary file: path \"%s\", size %lu",
1490  path, (unsigned long) size)));
1491  }
1492 }
1493 
1494 /*
1495  * Called to register a temporary file for automatic close.
1496  * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1497  * before the file was opened.
1498  */
1499 static void
1501 {
1503  VfdCache[file].resowner = CurrentResourceOwner;
1504 
1505  /* Backup mechanism for closing at end of xact. */
1506  VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1508 }
1509 
1510 /*
1511  * Called when we get a shared invalidation message on some relation.
1512  */
1513 #ifdef NOT_USED
1514 void
1515 FileInvalidate(File file)
1516 {
1517  Assert(FileIsValid(file));
1518  if (!FileIsNotOpen(file))
1519  LruDelete(file);
1520 }
1521 #endif
1522 
1523 /*
1524  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1525  * fileMode parameter.
1526  */
1527 File
1529 {
1530  return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1531 }
1532 
1533 /*
1534  * open a file in an arbitrary directory
1535  *
1536  * NB: if the passed pathname is relative (which it usually is),
1537  * it will be interpreted relative to the process' working directory
1538  * (which should always be $PGDATA when this code is running).
1539  */
1540 File
1542 {
1543  char *fnamecopy;
1544  File file;
1545  Vfd *vfdP;
1546 
1547  DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1548  fileName, fileFlags, fileMode));
1549 
1550  /*
1551  * We need a malloc'd copy of the file name; fail cleanly if no room.
1552  */
1553  fnamecopy = strdup(fileName);
1554  if (fnamecopy == NULL)
1555  ereport(ERROR,
1556  (errcode(ERRCODE_OUT_OF_MEMORY),
1557  errmsg("out of memory")));
1558 
1559  file = AllocateVfd();
1560  vfdP = &VfdCache[file];
1561 
1562  /* Close excess kernel FDs. */
1563  ReleaseLruFiles();
1564 
1565  vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1566 
1567  if (vfdP->fd < 0)
1568  {
1569  int save_errno = errno;
1570 
1571  FreeVfd(file);
1572  free(fnamecopy);
1573  errno = save_errno;
1574  return -1;
1575  }
1576  ++nfile;
1577  DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1578  vfdP->fd));
1579 
1580  vfdP->fileName = fnamecopy;
1581  /* Saved flags are adjusted to be OK for re-opening file */
1582  vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1583  vfdP->fileMode = fileMode;
1584  vfdP->fileSize = 0;
1585  vfdP->fdstate = 0x0;
1586  vfdP->resowner = NULL;
1587 
1588  Insert(file);
1589 
1590  return file;
1591 }
1592 
1593 /*
1594  * Create directory 'directory'. If necessary, create 'basedir', which must
1595  * be the directory above it. This is designed for creating the top-level
1596  * temporary directory on demand before creating a directory underneath it.
1597  * Do nothing if the directory already exists.
1598  *
1599  * Directories created within the top-level temporary directory should begin
1600  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1601  * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1602  * that do not need any particular prefix.
1603 */
1604 void
1606 {
1607  if (MakePGDirectory(directory) < 0)
1608  {
1609  if (errno == EEXIST)
1610  return;
1611 
1612  /*
1613  * Failed. Try to create basedir first in case it's missing. Tolerate
1614  * EEXIST to close a race against another process following the same
1615  * algorithm.
1616  */
1617  if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1618  ereport(ERROR,
1620  errmsg("cannot create temporary directory \"%s\": %m",
1621  basedir)));
1622 
1623  /* Try again. */
1624  if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1625  ereport(ERROR,
1627  errmsg("cannot create temporary subdirectory \"%s\": %m",
1628  directory)));
1629  }
1630 }
1631 
1632 /*
1633  * Delete a directory and everything in it, if it exists.
1634  */
1635 void
1636 PathNameDeleteTemporaryDir(const char *dirname)
1637 {
1638  struct stat statbuf;
1639 
1640  /* Silently ignore missing directory. */
1641  if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1642  return;
1643 
1644  /*
1645  * Currently, walkdir doesn't offer a way for our passed in function to
1646  * maintain state. Perhaps it should, so that we could tell the caller
1647  * whether this operation succeeded or failed. Since this operation is
1648  * used in a cleanup path, we wouldn't actually behave differently: we'll
1649  * just log failures.
1650  */
1651  walkdir(dirname, unlink_if_exists_fname, false, LOG);
1652 }
1653 
1654 /*
1655  * Open a temporary file that will disappear when we close it.
1656  *
1657  * This routine takes care of generating an appropriate tempfile name.
1658  * There's no need to pass in fileFlags or fileMode either, since only
1659  * one setting makes any sense for a temp file.
1660  *
1661  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1662  * to ensure it's closed and deleted when it's no longer needed, typically at
1663  * the end-of-transaction. In most cases, you don't want temporary files to
1664  * outlive the transaction that created them, so this should be false -- but
1665  * if you need "somewhat" temporary storage, this might be useful. In either
1666  * case, the file is removed when the File is explicitly closed.
1667  */
1668 File
1669 OpenTemporaryFile(bool interXact)
1670 {
1671  File file = 0;
1672 
1673  /*
1674  * Make sure the current resource owner has space for this File before we
1675  * open it, if we'll be registering it below.
1676  */
1677  if (!interXact)
1679 
1680  /*
1681  * If some temp tablespace(s) have been given to us, try to use the next
1682  * one. If a given tablespace can't be found, we silently fall back to
1683  * the database's default tablespace.
1684  *
1685  * BUT: if the temp file is slated to outlive the current transaction,
1686  * force it into the database's default tablespace, so that it will not
1687  * pose a threat to possible tablespace drop attempts.
1688  */
1689  if (numTempTableSpaces > 0 && !interXact)
1690  {
1691  Oid tblspcOid = GetNextTempTableSpace();
1692 
1693  if (OidIsValid(tblspcOid))
1694  file = OpenTemporaryFileInTablespace(tblspcOid, false);
1695  }
1696 
1697  /*
1698  * If not, or if tablespace is bad, create in database's default
1699  * tablespace. MyDatabaseTableSpace should normally be set before we get
1700  * here, but just in case it isn't, fall back to pg_default tablespace.
1701  */
1702  if (file <= 0)
1705  DEFAULTTABLESPACE_OID,
1706  true);
1707 
1708  /* Mark it for deletion at close and temporary file size limit */
1709  VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1710 
1711  /* Register it with the current resource owner */
1712  if (!interXact)
1713  RegisterTemporaryFile(file);
1714 
1715  return file;
1716 }
1717 
1718 /*
1719  * Return the path of the temp directory in a given tablespace.
1720  */
1721 void
1723 {
1724  /*
1725  * Identify the tempfile directory for this tablespace.
1726  *
1727  * If someone tries to specify pg_global, use pg_default instead.
1728  */
1729  if (tablespace == InvalidOid ||
1730  tablespace == DEFAULTTABLESPACE_OID ||
1731  tablespace == GLOBALTABLESPACE_OID)
1732  snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1733  else
1734  {
1735  /* All other tablespaces are accessed via symlinks */
1736  snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1737  tablespace, TABLESPACE_VERSION_DIRECTORY,
1739  }
1740 }
1741 
1742 /*
1743  * Open a temporary file in a specific tablespace.
1744  * Subroutine for OpenTemporaryFile, which see for details.
1745  */
1746 static File
1747 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1748 {
1749  char tempdirpath[MAXPGPATH];
1750  char tempfilepath[MAXPGPATH];
1751  File file;
1752 
1753  TempTablespacePath(tempdirpath, tblspcOid);
1754 
1755  /*
1756  * Generate a tempfile name that should be unique within the current
1757  * database instance.
1758  */
1759  snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1760  tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1761 
1762  /*
1763  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1764  * temp file that can be reused.
1765  */
1766  file = PathNameOpenFile(tempfilepath,
1767  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1768  if (file <= 0)
1769  {
1770  /*
1771  * We might need to create the tablespace's tempfile directory, if no
1772  * one has yet done so.
1773  *
1774  * Don't check for an error from MakePGDirectory; it could fail if
1775  * someone else just did the same thing. If it doesn't work then
1776  * we'll bomb out on the second create attempt, instead.
1777  */
1778  (void) MakePGDirectory(tempdirpath);
1779 
1780  file = PathNameOpenFile(tempfilepath,
1781  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1782  if (file <= 0 && rejectError)
1783  elog(ERROR, "could not create temporary file \"%s\": %m",
1784  tempfilepath);
1785  }
1786 
1787  return file;
1788 }
1789 
1790 
1791 /*
1792  * Create a new file. The directory containing it must already exist. Files
1793  * created this way are subject to temp_file_limit and are automatically
1794  * closed at end of transaction, but are not automatically deleted on close
1795  * because they are intended to be shared between cooperating backends.
1796  *
1797  * If the file is inside the top-level temporary directory, its name should
1798  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1799  * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1800  * inside a directory created with PathNameCreateTemporaryDir(), in which case
1801  * the prefix isn't needed.
1802  */
1803 File
1804 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1805 {
1806  File file;
1807 
1809 
1810  /*
1811  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1812  * temp file that can be reused.
1813  */
1814  file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1815  if (file <= 0)
1816  {
1817  if (error_on_failure)
1818  ereport(ERROR,
1820  errmsg("could not create temporary file \"%s\": %m",
1821  path)));
1822  else
1823  return file;
1824  }
1825 
1826  /* Mark it for temp_file_limit accounting. */
1827  VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1828 
1829  /* Register it for automatic close. */
1830  RegisterTemporaryFile(file);
1831 
1832  return file;
1833 }
1834 
1835 /*
1836  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1837  * another backend. Files opened this way don't count against the
1838  * temp_file_limit of the caller, are automatically closed at the end of the
1839  * transaction but are not deleted on close.
1840  */
1841 File
1842 PathNameOpenTemporaryFile(const char *path, int mode)
1843 {
1844  File file;
1845 
1847 
1848  file = PathNameOpenFile(path, mode | PG_BINARY);
1849 
1850  /* If no such file, then we don't raise an error. */
1851  if (file <= 0 && errno != ENOENT)
1852  ereport(ERROR,
1854  errmsg("could not open temporary file \"%s\": %m",
1855  path)));
1856 
1857  if (file > 0)
1858  {
1859  /* Register it for automatic close. */
1860  RegisterTemporaryFile(file);
1861  }
1862 
1863  return file;
1864 }
1865 
1866 /*
1867  * Delete a file by pathname. Return true if the file existed, false if it
1868  * didn't.
1869  */
1870 bool
1871 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1872 {
1873  struct stat filestats;
1874  int stat_errno;
1875 
1876  /* Get the final size for pgstat reporting. */
1877  if (stat(path, &filestats) != 0)
1878  stat_errno = errno;
1879  else
1880  stat_errno = 0;
1881 
1882  /*
1883  * Unlike FileClose's automatic file deletion code, we tolerate
1884  * non-existence to support BufFileDeleteShared which doesn't know how
1885  * many segments it has to delete until it runs out.
1886  */
1887  if (stat_errno == ENOENT)
1888  return false;
1889 
1890  if (unlink(path) < 0)
1891  {
1892  if (errno != ENOENT)
1893  ereport(error_on_failure ? ERROR : LOG,
1895  errmsg("could not unlink temporary file \"%s\": %m",
1896  path)));
1897  return false;
1898  }
1899 
1900  if (stat_errno == 0)
1901  ReportTemporaryFileUsage(path, filestats.st_size);
1902  else
1903  {
1904  errno = stat_errno;
1905  ereport(LOG,
1907  errmsg("could not stat file \"%s\": %m", path)));
1908  }
1909 
1910  return true;
1911 }
1912 
1913 /*
1914  * close a file when done with it
1915  */
1916 void
1918 {
1919  Vfd *vfdP;
1920 
1921  Assert(FileIsValid(file));
1922 
1923  DO_DB(elog(LOG, "FileClose: %d (%s)",
1924  file, VfdCache[file].fileName));
1925 
1926  vfdP = &VfdCache[file];
1927 
1928  if (!FileIsNotOpen(file))
1929  {
1930  /* close the file */
1931  if (close(vfdP->fd) != 0)
1932  {
1933  /*
1934  * We may need to panic on failure to close non-temporary files;
1935  * see LruDelete.
1936  */
1938  "could not close file \"%s\": %m", vfdP->fileName);
1939  }
1940 
1941  --nfile;
1942  vfdP->fd = VFD_CLOSED;
1943 
1944  /* remove the file from the lru ring */
1945  Delete(file);
1946  }
1947 
1948  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1949  {
1950  /* Subtract its size from current usage (do first in case of error) */
1951  temporary_files_size -= vfdP->fileSize;
1952  vfdP->fileSize = 0;
1953  }
1954 
1955  /*
1956  * Delete the file if it was temporary, and make a log entry if wanted
1957  */
1958  if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
1959  {
1960  struct stat filestats;
1961  int stat_errno;
1962 
1963  /*
1964  * If we get an error, as could happen within the ereport/elog calls,
1965  * we'll come right back here during transaction abort. Reset the
1966  * flag to ensure that we can't get into an infinite loop. This code
1967  * is arranged to ensure that the worst-case consequence is failing to
1968  * emit log message(s), not failing to attempt the unlink.
1969  */
1970  vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
1971 
1972 
1973  /* first try the stat() */
1974  if (stat(vfdP->fileName, &filestats))
1975  stat_errno = errno;
1976  else
1977  stat_errno = 0;
1978 
1979  /* in any case do the unlink */
1980  if (unlink(vfdP->fileName))
1981  ereport(LOG,
1983  errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
1984 
1985  /* and last report the stat results */
1986  if (stat_errno == 0)
1987  ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
1988  else
1989  {
1990  errno = stat_errno;
1991  ereport(LOG,
1993  errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
1994  }
1995  }
1996 
1997  /* Unregister it from the resource owner */
1998  if (vfdP->resowner)
1999  ResourceOwnerForgetFile(vfdP->resowner, file);
2000 
2001  /*
2002  * Return the Vfd slot to the free list
2003  */
2004  FreeVfd(file);
2005 }
2006 
2007 /*
2008  * FilePrefetch - initiate asynchronous read of a given range of the file.
2009  *
2010  * Currently the only implementation of this function is using posix_fadvise
2011  * which is the simplest standardized interface that accomplishes this.
2012  * We could add an implementation using libaio in the future; but note that
2013  * this API is inappropriate for libaio, which wants to have a buffer provided
2014  * to read into.
2015  */
2016 int
2017 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
2018 {
2019 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2020  int returnCode;
2021 
2022  Assert(FileIsValid(file));
2023 
2024  DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
2025  file, VfdCache[file].fileName,
2026  (int64) offset, amount));
2027 
2028  returnCode = FileAccess(file);
2029  if (returnCode < 0)
2030  return returnCode;
2031 
2032  pgstat_report_wait_start(wait_event_info);
2033  returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2034  POSIX_FADV_WILLNEED);
2036 
2037  return returnCode;
2038 #else
2039  Assert(FileIsValid(file));
2040  return 0;
2041 #endif
2042 }
2043 
2044 void
2045 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
2046 {
2047  int returnCode;
2048 
2049  Assert(FileIsValid(file));
2050 
2051  DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2052  file, VfdCache[file].fileName,
2053  (int64) offset, (int64) nbytes));
2054 
2055  if (nbytes <= 0)
2056  return;
2057 
2058  returnCode = FileAccess(file);
2059  if (returnCode < 0)
2060  return;
2061 
2062  pgstat_report_wait_start(wait_event_info);
2063  pg_flush_data(VfdCache[file].fd, offset, nbytes);
2065 }
2066 
2067 int
2068 FileRead(File file, char *buffer, int amount, off_t offset,
2069  uint32 wait_event_info)
2070 {
2071  int returnCode;
2072  Vfd *vfdP;
2073 
2074  Assert(FileIsValid(file));
2075 
2076  DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
2077  file, VfdCache[file].fileName,
2078  (int64) offset,
2079  amount, buffer));
2080 
2081  returnCode = FileAccess(file);
2082  if (returnCode < 0)
2083  return returnCode;
2084 
2085  vfdP = &VfdCache[file];
2086 
2087 retry:
2088  pgstat_report_wait_start(wait_event_info);
2089  returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
2091 
2092  if (returnCode < 0)
2093  {
2094  /*
2095  * Windows may run out of kernel buffers and return "Insufficient
2096  * system resources" error. Wait a bit and retry to solve it.
2097  *
2098  * It is rumored that EINTR is also possible on some Unix filesystems,
2099  * in which case immediate retry is indicated.
2100  */
2101 #ifdef WIN32
2102  DWORD error = GetLastError();
2103 
2104  switch (error)
2105  {
2106  case ERROR_NO_SYSTEM_RESOURCES:
2107  pg_usleep(1000L);
2108  errno = EINTR;
2109  break;
2110  default:
2111  _dosmaperr(error);
2112  break;
2113  }
2114 #endif
2115  /* OK to retry if interrupted */
2116  if (errno == EINTR)
2117  goto retry;
2118  }
2119 
2120  return returnCode;
2121 }
2122 
2123 int
2124 FileWrite(File file, char *buffer, int amount, off_t offset,
2125  uint32 wait_event_info)
2126 {
2127  int returnCode;
2128  Vfd *vfdP;
2129 
2130  Assert(FileIsValid(file));
2131 
2132  DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
2133  file, VfdCache[file].fileName,
2134  (int64) offset,
2135  amount, buffer));
2136 
2137  returnCode = FileAccess(file);
2138  if (returnCode < 0)
2139  return returnCode;
2140 
2141  vfdP = &VfdCache[file];
2142 
2143  /*
2144  * If enforcing temp_file_limit and it's a temp file, check to see if the
2145  * write would overrun temp_file_limit, and throw error if so. Note: it's
2146  * really a modularity violation to throw error here; we should set errno
2147  * and return -1. However, there's no way to report a suitable error
2148  * message if we do that. All current callers would just throw error
2149  * immediately anyway, so this is safe at present.
2150  */
2151  if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2152  {
2153  off_t past_write = offset + amount;
2154 
2155  if (past_write > vfdP->fileSize)
2156  {
2157  uint64 newTotal = temporary_files_size;
2158 
2159  newTotal += past_write - vfdP->fileSize;
2160  if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2161  ereport(ERROR,
2162  (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2163  errmsg("temporary file size exceeds temp_file_limit (%dkB)",
2164  temp_file_limit)));
2165  }
2166  }
2167 
2168 retry:
2169  errno = 0;
2170  pgstat_report_wait_start(wait_event_info);
2171  returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
2173 
2174  /* if write didn't set errno, assume problem is no disk space */
2175  if (returnCode != amount && errno == 0)
2176  errno = ENOSPC;
2177 
2178  if (returnCode >= 0)
2179  {
2180  /*
2181  * Maintain fileSize and temporary_files_size if it's a temp file.
2182  */
2183  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2184  {
2185  off_t past_write = offset + amount;
2186 
2187  if (past_write > vfdP->fileSize)
2188  {
2189  temporary_files_size += past_write - vfdP->fileSize;
2190  vfdP->fileSize = past_write;
2191  }
2192  }
2193  }
2194  else
2195  {
2196  /*
2197  * See comments in FileRead()
2198  */
2199 #ifdef WIN32
2200  DWORD error = GetLastError();
2201 
2202  switch (error)
2203  {
2204  case ERROR_NO_SYSTEM_RESOURCES:
2205  pg_usleep(1000L);
2206  errno = EINTR;
2207  break;
2208  default:
2209  _dosmaperr(error);
2210  break;
2211  }
2212 #endif
2213  /* OK to retry if interrupted */
2214  if (errno == EINTR)
2215  goto retry;
2216  }
2217 
2218  return returnCode;
2219 }
2220 
2221 int
2222 FileSync(File file, uint32 wait_event_info)
2223 {
2224  int returnCode;
2225 
2226  Assert(FileIsValid(file));
2227 
2228  DO_DB(elog(LOG, "FileSync: %d (%s)",
2229  file, VfdCache[file].fileName));
2230 
2231  returnCode = FileAccess(file);
2232  if (returnCode < 0)
2233  return returnCode;
2234 
2235  pgstat_report_wait_start(wait_event_info);
2236  returnCode = pg_fsync(VfdCache[file].fd);
2238 
2239  return returnCode;
2240 }
2241 
2242 off_t
2244 {
2245  Assert(FileIsValid(file));
2246 
2247  DO_DB(elog(LOG, "FileSize %d (%s)",
2248  file, VfdCache[file].fileName));
2249 
2250  if (FileIsNotOpen(file))
2251  {
2252  if (FileAccess(file) < 0)
2253  return (off_t) -1;
2254  }
2255 
2256  return lseek(VfdCache[file].fd, 0, SEEK_END);
2257 }
2258 
2259 int
2260 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2261 {
2262  int returnCode;
2263 
2264  Assert(FileIsValid(file));
2265 
2266  DO_DB(elog(LOG, "FileTruncate %d (%s)",
2267  file, VfdCache[file].fileName));
2268 
2269  returnCode = FileAccess(file);
2270  if (returnCode < 0)
2271  return returnCode;
2272 
2273  pgstat_report_wait_start(wait_event_info);
2274  returnCode = ftruncate(VfdCache[file].fd, offset);
2276 
2277  if (returnCode == 0 && VfdCache[file].fileSize > offset)
2278  {
2279  /* adjust our state for truncation of a temp file */
2280  Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2281  temporary_files_size -= VfdCache[file].fileSize - offset;
2282  VfdCache[file].fileSize = offset;
2283  }
2284 
2285  return returnCode;
2286 }
2287 
2288 /*
2289  * Return the pathname associated with an open file.
2290  *
2291  * The returned string points to an internal buffer, which is valid until
2292  * the file is closed.
2293  */
2294 char *
2296 {
2297  Assert(FileIsValid(file));
2298 
2299  return VfdCache[file].fileName;
2300 }
2301 
2302 /*
2303  * Return the raw file descriptor of an opened file.
2304  *
2305  * The returned file descriptor will be valid until the file is closed, but
2306  * there are a lot of things that can make that happen. So the caller should
2307  * be careful not to do much of anything else before it finishes using the
2308  * returned file descriptor.
2309  */
2310 int
2312 {
2313  Assert(FileIsValid(file));
2314  return VfdCache[file].fd;
2315 }
2316 
2317 /*
2318  * FileGetRawFlags - returns the file flags on open(2)
2319  */
2320 int
2322 {
2323  Assert(FileIsValid(file));
2324  return VfdCache[file].fileFlags;
2325 }
2326 
2327 /*
2328  * FileGetRawMode - returns the mode bitmask passed to open(2)
2329  */
2330 mode_t
2332 {
2333  Assert(FileIsValid(file));
2334  return VfdCache[file].fileMode;
2335 }
2336 
2337 /*
2338  * Make room for another allocatedDescs[] array entry if needed and possible.
2339  * Returns true if an array element is available.
2340  */
2341 static bool
2343 {
2344  AllocateDesc *newDescs;
2345  int newMax;
2346 
2347  /* Quick out if array already has a free slot. */
2349  return true;
2350 
2351  /*
2352  * If the array hasn't yet been created in the current process, initialize
2353  * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2354  * we will ever need, anyway. We don't want to look at max_safe_fds
2355  * immediately because set_max_safe_fds() may not have run yet.
2356  */
2357  if (allocatedDescs == NULL)
2358  {
2359  newMax = FD_MINFREE / 3;
2360  newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2361  /* Out of memory already? Treat as fatal error. */
2362  if (newDescs == NULL)
2363  ereport(ERROR,
2364  (errcode(ERRCODE_OUT_OF_MEMORY),
2365  errmsg("out of memory")));
2366  allocatedDescs = newDescs;
2367  maxAllocatedDescs = newMax;
2368  return true;
2369  }
2370 
2371  /*
2372  * Consider enlarging the array beyond the initial allocation used above.
2373  * By the time this happens, max_safe_fds should be known accurately.
2374  *
2375  * We mustn't let allocated descriptors hog all the available FDs, and in
2376  * practice we'd better leave a reasonable number of FDs for VFD use. So
2377  * set the maximum to max_safe_fds / 3. (This should certainly be at
2378  * least as large as the initial size, FD_MINFREE / 3, so we aren't
2379  * tightening the restriction here.) Recall that "external" FDs are
2380  * allowed to consume another third of max_safe_fds.
2381  */
2382  newMax = max_safe_fds / 3;
2383  if (newMax > maxAllocatedDescs)
2384  {
2385  newDescs = (AllocateDesc *) realloc(allocatedDescs,
2386  newMax * sizeof(AllocateDesc));
2387  /* Treat out-of-memory as a non-fatal error. */
2388  if (newDescs == NULL)
2389  return false;
2390  allocatedDescs = newDescs;
2391  maxAllocatedDescs = newMax;
2392  return true;
2393  }
2394 
2395  /* Can't enlarge allocatedDescs[] any more. */
2396  return false;
2397 }
2398 
2399 /*
2400  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2401  * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2402  * necessary to open the file. When done, call FreeFile rather than fclose.
2403  *
2404  * Note that files that will be open for any significant length of time
2405  * should NOT be handled this way, since they cannot share kernel file
2406  * descriptors with other files; there is grave risk of running out of FDs
2407  * if anyone locks down too many FDs. Most callers of this routine are
2408  * simply reading a config file that they will read and close immediately.
2409  *
2410  * fd.c will automatically close all files opened with AllocateFile at
2411  * transaction commit or abort; this prevents FD leakage if a routine
2412  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2413  *
2414  * Ideally this should be the *only* direct call of fopen() in the backend.
2415  */
2416 FILE *
2417 AllocateFile(const char *name, const char *mode)
2418 {
2419  FILE *file;
2420 
2421  DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2422  numAllocatedDescs, name));
2423 
2424  /* Can we allocate another non-virtual FD? */
2425  if (!reserveAllocatedDesc())
2426  ereport(ERROR,
2427  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2428  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2429  maxAllocatedDescs, name)));
2430 
2431  /* Close excess kernel FDs. */
2432  ReleaseLruFiles();
2433 
2434 TryAgain:
2435  if ((file = fopen(name, mode)) != NULL)
2436  {
2437  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2438 
2439  desc->kind = AllocateDescFile;
2440  desc->desc.file = file;
2443  return desc->desc.file;
2444  }
2445 
2446  if (errno == EMFILE || errno == ENFILE)
2447  {
2448  int save_errno = errno;
2449 
2450  ereport(LOG,
2451  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2452  errmsg("out of file descriptors: %m; release and retry")));
2453  errno = 0;
2454  if (ReleaseLruFile())
2455  goto TryAgain;
2456  errno = save_errno;
2457  }
2458 
2459  return NULL;
2460 }
2461 
2462 /*
2463  * Open a file with OpenTransientFilePerm() and pass default file mode for
2464  * the fileMode parameter.
2465  */
2466 int
2468 {
2469  return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2470 }
2471 
2472 /*
2473  * Like AllocateFile, but returns an unbuffered fd like open(2)
2474  */
2475 int
2477 {
2478  int fd;
2479 
2480  DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2481  numAllocatedDescs, fileName));
2482 
2483  /* Can we allocate another non-virtual FD? */
2484  if (!reserveAllocatedDesc())
2485  ereport(ERROR,
2486  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2487  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2488  maxAllocatedDescs, fileName)));
2489 
2490  /* Close excess kernel FDs. */
2491  ReleaseLruFiles();
2492 
2493  fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2494 
2495  if (fd >= 0)
2496  {
2497  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2498 
2499  desc->kind = AllocateDescRawFD;
2500  desc->desc.fd = fd;
2503 
2504  return fd;
2505  }
2506 
2507  return -1; /* failure */
2508 }
2509 
2510 /*
2511  * Routines that want to initiate a pipe stream should use OpenPipeStream
2512  * rather than plain popen(). This lets fd.c deal with freeing FDs if
2513  * necessary. When done, call ClosePipeStream rather than pclose.
2514  *
2515  * This function also ensures that the popen'd program is run with default
2516  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2517  * uses. This ensures desirable response to, eg, closing a read pipe early.
2518  */
2519 FILE *
2520 OpenPipeStream(const char *command, const char *mode)
2521 {
2522  FILE *file;
2523  int save_errno;
2524 
2525  DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2526  numAllocatedDescs, command));
2527 
2528  /* Can we allocate another non-virtual FD? */
2529  if (!reserveAllocatedDesc())
2530  ereport(ERROR,
2531  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2532  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2533  maxAllocatedDescs, command)));
2534 
2535  /* Close excess kernel FDs. */
2536  ReleaseLruFiles();
2537 
2538 TryAgain:
2539  fflush(stdout);
2540  fflush(stderr);
2542  errno = 0;
2543  file = popen(command, mode);
2544  save_errno = errno;
2546  errno = save_errno;
2547  if (file != NULL)
2548  {
2549  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2550 
2551  desc->kind = AllocateDescPipe;
2552  desc->desc.file = file;
2555  return desc->desc.file;
2556  }
2557 
2558  if (errno == EMFILE || errno == ENFILE)
2559  {
2560  ereport(LOG,
2561  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2562  errmsg("out of file descriptors: %m; release and retry")));
2563  if (ReleaseLruFile())
2564  goto TryAgain;
2565  errno = save_errno;
2566  }
2567 
2568  return NULL;
2569 }
2570 
2571 /*
2572  * Free an AllocateDesc of any type.
2573  *
2574  * The argument *must* point into the allocatedDescs[] array.
2575  */
2576 static int
2578 {
2579  int result;
2580 
2581  /* Close the underlying object */
2582  switch (desc->kind)
2583  {
2584  case AllocateDescFile:
2585  result = fclose(desc->desc.file);
2586  break;
2587  case AllocateDescPipe:
2588  result = pclose(desc->desc.file);
2589  break;
2590  case AllocateDescDir:
2591  result = closedir(desc->desc.dir);
2592  break;
2593  case AllocateDescRawFD:
2594  result = close(desc->desc.fd);
2595  break;
2596  default:
2597  elog(ERROR, "AllocateDesc kind not recognized");
2598  result = 0; /* keep compiler quiet */
2599  break;
2600  }
2601 
2602  /* Compact storage in the allocatedDescs array */
2604  *desc = allocatedDescs[numAllocatedDescs];
2605 
2606  return result;
2607 }
2608 
2609 /*
2610  * Close a file returned by AllocateFile.
2611  *
2612  * Note we do not check fclose's return value --- it is up to the caller
2613  * to handle close errors.
2614  */
2615 int
2616 FreeFile(FILE *file)
2617 {
2618  int i;
2619 
2620  DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2621 
2622  /* Remove file from list of allocated files, if it's present */
2623  for (i = numAllocatedDescs; --i >= 0;)
2624  {
2625  AllocateDesc *desc = &allocatedDescs[i];
2626 
2627  if (desc->kind == AllocateDescFile && desc->desc.file == file)
2628  return FreeDesc(desc);
2629  }
2630 
2631  /* Only get here if someone passes us a file not in allocatedDescs */
2632  elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2633 
2634  return fclose(file);
2635 }
2636 
2637 /*
2638  * Close a file returned by OpenTransientFile.
2639  *
2640  * Note we do not check close's return value --- it is up to the caller
2641  * to handle close errors.
2642  */
2643 int
2645 {
2646  int i;
2647 
2648  DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2649 
2650  /* Remove fd from list of allocated files, if it's present */
2651  for (i = numAllocatedDescs; --i >= 0;)
2652  {
2653  AllocateDesc *desc = &allocatedDescs[i];
2654 
2655  if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2656  return FreeDesc(desc);
2657  }
2658 
2659  /* Only get here if someone passes us a file not in allocatedDescs */
2660  elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2661 
2662  return close(fd);
2663 }
2664 
2665 /*
2666  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2667  * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2668  * necessary to open the directory, and with closing it after an elog.
2669  * When done, call FreeDir rather than closedir.
2670  *
2671  * Returns NULL, with errno set, on failure. Note that failure detection
2672  * is commonly left to the following call of ReadDir or ReadDirExtended;
2673  * see the comments for ReadDir.
2674  *
2675  * Ideally this should be the *only* direct call of opendir() in the backend.
2676  */
2677 DIR *
2678 AllocateDir(const char *dirname)
2679 {
2680  DIR *dir;
2681 
2682  DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2683  numAllocatedDescs, dirname));
2684 
2685  /* Can we allocate another non-virtual FD? */
2686  if (!reserveAllocatedDesc())
2687  ereport(ERROR,
2688  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2689  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2690  maxAllocatedDescs, dirname)));
2691 
2692  /* Close excess kernel FDs. */
2693  ReleaseLruFiles();
2694 
2695 TryAgain:
2696  if ((dir = opendir(dirname)) != NULL)
2697  {
2698  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2699 
2700  desc->kind = AllocateDescDir;
2701  desc->desc.dir = dir;
2704  return desc->desc.dir;
2705  }
2706 
2707  if (errno == EMFILE || errno == ENFILE)
2708  {
2709  int save_errno = errno;
2710 
2711  ereport(LOG,
2712  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2713  errmsg("out of file descriptors: %m; release and retry")));
2714  errno = 0;
2715  if (ReleaseLruFile())
2716  goto TryAgain;
2717  errno = save_errno;
2718  }
2719 
2720  return NULL;
2721 }
2722 
2723 /*
2724  * Read a directory opened with AllocateDir, ereport'ing any error.
2725  *
2726  * This is easier to use than raw readdir() since it takes care of some
2727  * otherwise rather tedious and error-prone manipulation of errno. Also,
2728  * if you are happy with a generic error message for AllocateDir failure,
2729  * you can just do
2730  *
2731  * dir = AllocateDir(path);
2732  * while ((dirent = ReadDir(dir, path)) != NULL)
2733  * process dirent;
2734  * FreeDir(dir);
2735  *
2736  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2737  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2738  * use this shortcut.)
2739  *
2740  * The pathname passed to AllocateDir must be passed to this routine too,
2741  * but it is only used for error reporting.
2742  */
2743 struct dirent *
2744 ReadDir(DIR *dir, const char *dirname)
2745 {
2746  return ReadDirExtended(dir, dirname, ERROR);
2747 }
2748 
2749 /*
2750  * Alternate version of ReadDir that allows caller to specify the elevel
2751  * for any error report (whether it's reporting an initial failure of
2752  * AllocateDir or a subsequent directory read failure).
2753  *
2754  * If elevel < ERROR, returns NULL after any error. With the normal coding
2755  * pattern, this will result in falling out of the loop immediately as
2756  * though the directory contained no (more) entries.
2757  */
2758 struct dirent *
2759 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2760 {
2761  struct dirent *dent;
2762 
2763  /* Give a generic message for AllocateDir failure, if caller didn't */
2764  if (dir == NULL)
2765  {
2766  ereport(elevel,
2768  errmsg("could not open directory \"%s\": %m",
2769  dirname)));
2770  return NULL;
2771  }
2772 
2773  errno = 0;
2774  if ((dent = readdir(dir)) != NULL)
2775  return dent;
2776 
2777  if (errno)
2778  ereport(elevel,
2780  errmsg("could not read directory \"%s\": %m",
2781  dirname)));
2782  return NULL;
2783 }
2784 
2785 /*
2786  * Close a directory opened with AllocateDir.
2787  *
2788  * Returns closedir's return value (with errno set if it's not 0).
2789  * Note we do not check the return value --- it is up to the caller
2790  * to handle close errors if wanted.
2791  *
2792  * Does nothing if dir == NULL; we assume that directory open failure was
2793  * already reported if desired.
2794  */
2795 int
2797 {
2798  int i;
2799 
2800  /* Nothing to do if AllocateDir failed */
2801  if (dir == NULL)
2802  return 0;
2803 
2804  DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2805 
2806  /* Remove dir from list of allocated dirs, if it's present */
2807  for (i = numAllocatedDescs; --i >= 0;)
2808  {
2809  AllocateDesc *desc = &allocatedDescs[i];
2810 
2811  if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2812  return FreeDesc(desc);
2813  }
2814 
2815  /* Only get here if someone passes us a dir not in allocatedDescs */
2816  elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2817 
2818  return closedir(dir);
2819 }
2820 
2821 
2822 /*
2823  * Close a pipe stream returned by OpenPipeStream.
2824  */
2825 int
2826 ClosePipeStream(FILE *file)
2827 {
2828  int i;
2829 
2830  DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2831 
2832  /* Remove file from list of allocated files, if it's present */
2833  for (i = numAllocatedDescs; --i >= 0;)
2834  {
2835  AllocateDesc *desc = &allocatedDescs[i];
2836 
2837  if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2838  return FreeDesc(desc);
2839  }
2840 
2841  /* Only get here if someone passes us a file not in allocatedDescs */
2842  elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2843 
2844  return pclose(file);
2845 }
2846 
2847 /*
2848  * closeAllVfds
2849  *
2850  * Force all VFDs into the physically-closed state, so that the fewest
2851  * possible number of kernel file descriptors are in use. There is no
2852  * change in the logical state of the VFDs.
2853  */
2854 void
2856 {
2857  Index i;
2858 
2859  if (SizeVfdCache > 0)
2860  {
2861  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2862  for (i = 1; i < SizeVfdCache; i++)
2863  {
2864  if (!FileIsNotOpen(i))
2865  LruDelete(i);
2866  }
2867  }
2868 }
2869 
2870 
2871 /*
2872  * SetTempTablespaces
2873  *
2874  * Define a list (actually an array) of OIDs of tablespaces to use for
2875  * temporary files. This list will be used until end of transaction,
2876  * unless this function is called again before then. It is caller's
2877  * responsibility that the passed-in array has adequate lifespan (typically
2878  * it'd be allocated in TopTransactionContext).
2879  *
2880  * Some entries of the array may be InvalidOid, indicating that the current
2881  * database's default tablespace should be used.
2882  */
2883 void
2884 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2885 {
2886  Assert(numSpaces >= 0);
2887  tempTableSpaces = tableSpaces;
2888  numTempTableSpaces = numSpaces;
2889 
2890  /*
2891  * Select a random starting point in the list. This is to minimize
2892  * conflicts between backends that are most likely sharing the same list
2893  * of temp tablespaces. Note that if we create multiple temp files in the
2894  * same transaction, we'll advance circularly through the list --- this
2895  * ensures that large temporary sort files are nicely spread across all
2896  * available tablespaces.
2897  */
2898  if (numSpaces > 1)
2899  nextTempTableSpace = random() % numSpaces;
2900  else
2901  nextTempTableSpace = 0;
2902 }
2903 
2904 /*
2905  * TempTablespacesAreSet
2906  *
2907  * Returns true if SetTempTablespaces has been called in current transaction.
2908  * (This is just so that tablespaces.c doesn't need its own per-transaction
2909  * state.)
2910  */
2911 bool
2913 {
2914  return (numTempTableSpaces >= 0);
2915 }
2916 
2917 /*
2918  * GetTempTablespaces
2919  *
2920  * Populate an array with the OIDs of the tablespaces that should be used for
2921  * temporary files. (Some entries may be InvalidOid, indicating that the
2922  * current database's default tablespace should be used.) At most numSpaces
2923  * entries will be filled.
2924  * Returns the number of OIDs that were copied into the output array.
2925  */
2926 int
2927 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2928 {
2929  int i;
2930 
2932  for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2933  tableSpaces[i] = tempTableSpaces[i];
2934 
2935  return i;
2936 }
2937 
2938 /*
2939  * GetNextTempTableSpace
2940  *
2941  * Select the next temp tablespace to use. A result of InvalidOid means
2942  * to use the current database's default tablespace.
2943  */
2944 Oid
2946 {
2947  if (numTempTableSpaces > 0)
2948  {
2949  /* Advance nextTempTableSpace counter with wraparound */
2951  nextTempTableSpace = 0;
2953  }
2954  return InvalidOid;
2955 }
2956 
2957 
2958 /*
2959  * AtEOSubXact_Files
2960  *
2961  * Take care of subtransaction commit/abort. At abort, we close temp files
2962  * that the subtransaction may have opened. At commit, we reassign the
2963  * files that were opened to the parent subtransaction.
2964  */
2965 void
2966 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2967  SubTransactionId parentSubid)
2968 {
2969  Index i;
2970 
2971  for (i = 0; i < numAllocatedDescs; i++)
2972  {
2973  if (allocatedDescs[i].create_subid == mySubid)
2974  {
2975  if (isCommit)
2976  allocatedDescs[i].create_subid = parentSubid;
2977  else
2978  {
2979  /* have to recheck the item after FreeDesc (ugly) */
2980  FreeDesc(&allocatedDescs[i--]);
2981  }
2982  }
2983  }
2984 }
2985 
2986 /*
2987  * AtEOXact_Files
2988  *
2989  * This routine is called during transaction commit or abort. All still-open
2990  * per-transaction temporary file VFDs are closed, which also causes the
2991  * underlying files to be deleted (although they should've been closed already
2992  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
2993  * closed. We also forget any transaction-local temp tablespace list.
2994  *
2995  * The isCommit flag is used only to decide whether to emit warnings about
2996  * unclosed files.
2997  */
2998 void
2999 AtEOXact_Files(bool isCommit)
3000 {
3001  CleanupTempFiles(isCommit, false);
3002  tempTableSpaces = NULL;
3003  numTempTableSpaces = -1;
3004 }
3005 
3006 /*
3007  * AtProcExit_Files
3008  *
3009  * on_proc_exit hook to clean up temp files during backend shutdown.
3010  * Here, we want to clean up *all* temp files including interXact ones.
3011  */
3012 static void
3014 {
3015  CleanupTempFiles(false, true);
3016 }
3017 
3018 /*
3019  * Close temporary files and delete their underlying files.
3020  *
3021  * isCommit: if true, this is normal transaction commit, and we don't
3022  * expect any remaining files; warn if there are some.
3023  *
3024  * isProcExit: if true, this is being called as the backend process is
3025  * exiting. If that's the case, we should remove all temporary files; if
3026  * that's not the case, we are being called for transaction commit/abort
3027  * and should only remove transaction-local temp files. In either case,
3028  * also clean up "allocated" stdio files, dirs and fds.
3029  */
3030 static void
3031 CleanupTempFiles(bool isCommit, bool isProcExit)
3032 {
3033  Index i;
3034 
3035  /*
3036  * Careful here: at proc_exit we need extra cleanup, not just
3037  * xact_temporary files.
3038  */
3039  if (isProcExit || have_xact_temporary_files)
3040  {
3041  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3042  for (i = 1; i < SizeVfdCache; i++)
3043  {
3044  unsigned short fdstate = VfdCache[i].fdstate;
3045 
3046  if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3047  VfdCache[i].fileName != NULL)
3048  {
3049  /*
3050  * If we're in the process of exiting a backend process, close
3051  * all temporary files. Otherwise, only close temporary files
3052  * local to the current transaction. They should be closed by
3053  * the ResourceOwner mechanism already, so this is just a
3054  * debugging cross-check.
3055  */
3056  if (isProcExit)
3057  FileClose(i);
3058  else if (fdstate & FD_CLOSE_AT_EOXACT)
3059  {
3060  elog(WARNING,
3061  "temporary file %s not closed at end-of-transaction",
3062  VfdCache[i].fileName);
3063  FileClose(i);
3064  }
3065  }
3066  }
3067 
3068  have_xact_temporary_files = false;
3069  }
3070 
3071  /* Complain if any allocated files remain open at commit. */
3072  if (isCommit && numAllocatedDescs > 0)
3073  elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3075 
3076  /* Clean up "allocated" stdio files, dirs and fds. */
3077  while (numAllocatedDescs > 0)
3078  FreeDesc(&allocatedDescs[0]);
3079 }
3080 
3081 
3082 /*
3083  * Remove temporary and temporary relation files left over from a prior
3084  * postmaster session
3085  *
3086  * This should be called during postmaster startup. It will forcibly
3087  * remove any leftover files created by OpenTemporaryFile and any leftover
3088  * temporary relation files created by mdcreate.
3089  *
3090  * During post-backend-crash restart cycle, this routine is called when
3091  * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3092  * queries are using temp files could result in useless storage usage that can
3093  * only be reclaimed by a service restart. The argument against enabling it is
3094  * that someone might want to examine the temporary files for debugging
3095  * purposes. This does however mean that OpenTemporaryFile had better allow for
3096  * collision with an existing temp file name.
3097  *
3098  * NOTE: this function and its subroutines generally report syscall failures
3099  * with ereport(LOG) and keep going. Removing temp files is not so critical
3100  * that we should fail to start the database when we can't do it.
3101  */
3102 void
3104 {
3105  char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3106  DIR *spc_dir;
3107  struct dirent *spc_de;
3108 
3109  /*
3110  * First process temp files in pg_default ($PGDATA/base)
3111  */
3112  snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3113  RemovePgTempFilesInDir(temp_path, true, false);
3114  RemovePgTempRelationFiles("base");
3115 
3116  /*
3117  * Cycle through temp directories for all non-default tablespaces.
3118  */
3119  spc_dir = AllocateDir("pg_tblspc");
3120 
3121  while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
3122  {
3123  if (strcmp(spc_de->d_name, ".") == 0 ||
3124  strcmp(spc_de->d_name, "..") == 0)
3125  continue;
3126 
3127  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
3129  RemovePgTempFilesInDir(temp_path, true, false);
3130 
3131  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
3133  RemovePgTempRelationFiles(temp_path);
3134  }
3135 
3136  FreeDir(spc_dir);
3137 
3138  /*
3139  * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3140  * DataDir as well. However, that is *not* cleaned here because doing so
3141  * would create a race condition. It's done separately, earlier in
3142  * postmaster startup.
3143  */
3144 }
3145 
3146 /*
3147  * Process one pgsql_tmp directory for RemovePgTempFiles.
3148  *
3149  * If missing_ok is true, it's all right for the named directory to not exist.
3150  * Any other problem results in a LOG message. (missing_ok should be true at
3151  * the top level, since pgsql_tmp directories are not created until needed.)
3152  *
3153  * At the top level, this should be called with unlink_all = false, so that
3154  * only files matching the temporary name prefix will be unlinked. When
3155  * recursing it will be called with unlink_all = true to unlink everything
3156  * under a top-level temporary directory.
3157  *
3158  * (These two flags could be replaced by one, but it seems clearer to keep
3159  * them separate.)
3160  */
3161 void
3162 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3163 {
3164  DIR *temp_dir;
3165  struct dirent *temp_de;
3166  char rm_path[MAXPGPATH * 2];
3167 
3168  temp_dir = AllocateDir(tmpdirname);
3169 
3170  if (temp_dir == NULL && errno == ENOENT && missing_ok)
3171  return;
3172 
3173  while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3174  {
3175  if (strcmp(temp_de->d_name, ".") == 0 ||
3176  strcmp(temp_de->d_name, "..") == 0)
3177  continue;
3178 
3179  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3180  tmpdirname, temp_de->d_name);
3181 
3182  if (unlink_all ||
3183  strncmp(temp_de->d_name,
3185  strlen(PG_TEMP_FILE_PREFIX)) == 0)
3186  {
3187  struct stat statbuf;
3188 
3189  if (lstat(rm_path, &statbuf) < 0)
3190  {
3191  ereport(LOG,
3193  errmsg("could not stat file \"%s\": %m", rm_path)));
3194  continue;
3195  }
3196 
3197  if (S_ISDIR(statbuf.st_mode))
3198  {
3199  /* recursively remove contents, then directory itself */
3200  RemovePgTempFilesInDir(rm_path, false, true);
3201 
3202  if (rmdir(rm_path) < 0)
3203  ereport(LOG,
3205  errmsg("could not remove directory \"%s\": %m",
3206  rm_path)));
3207  }
3208  else
3209  {
3210  if (unlink(rm_path) < 0)
3211  ereport(LOG,
3213  errmsg("could not remove file \"%s\": %m",
3214  rm_path)));
3215  }
3216  }
3217  else
3218  ereport(LOG,
3219  (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3220  rm_path)));
3221  }
3222 
3223  FreeDir(temp_dir);
3224 }
3225 
3226 /* Process one tablespace directory, look for per-DB subdirectories */
3227 static void
3228 RemovePgTempRelationFiles(const char *tsdirname)
3229 {
3230  DIR *ts_dir;
3231  struct dirent *de;
3232  char dbspace_path[MAXPGPATH * 2];
3233 
3234  ts_dir = AllocateDir(tsdirname);
3235 
3236  while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3237  {
3238  /*
3239  * We're only interested in the per-database directories, which have
3240  * numeric names. Note that this code will also (properly) ignore "."
3241  * and "..".
3242  */
3243  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3244  continue;
3245 
3246  snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3247  tsdirname, de->d_name);
3248  RemovePgTempRelationFilesInDbspace(dbspace_path);
3249  }
3250 
3251  FreeDir(ts_dir);
3252 }
3253 
3254 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3255 static void
3256 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3257 {
3258  DIR *dbspace_dir;
3259  struct dirent *de;
3260  char rm_path[MAXPGPATH * 2];
3261 
3262  dbspace_dir = AllocateDir(dbspacedirname);
3263 
3264  while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3265  {
3266  if (!looks_like_temp_rel_name(de->d_name))
3267  continue;
3268 
3269  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3270  dbspacedirname, de->d_name);
3271 
3272  if (unlink(rm_path) < 0)
3273  ereport(LOG,
3275  errmsg("could not remove file \"%s\": %m",
3276  rm_path)));
3277  }
3278 
3279  FreeDir(dbspace_dir);
3280 }
3281 
/*
 * Does "name" have the form of a temporary relation file name?
 *
 * Accepted: t<digits>_<digits>, or t<digits>_<digits>_<forkname>, each
 * optionally followed by .<digits> (a segment suffix).  Nothing may follow.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	int			pos;
	int			savepos;

	/* Must start with "t". */
	if (name[0] != 't')
		return false;

	/* Followed by a non-empty string of digits and then an underscore. */
	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (pos == 1 || name[pos] != '_')
		return false;

	/* Followed by another nonempty string of digits. */
	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (savepos == pos)
		return false;

	/* We might have _forkname or .segment or both. */
	if (name[pos] == '_')
	{
		int			forkchar = forkname_chars(&name[pos + 1], NULL);

		if (forkchar <= 0)
			return false;
		/* skip the underscore plus the fork name itself */
		pos += forkchar + 1;
	}
	if (name[pos] == '.')
	{
		int			segchar;

		/* segchar starts at 1 to step over the dot */
		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
			;
		if (segchar <= 1)
			return false;
		pos += segchar;
	}

	/* Now we should be at the end. */
	if (name[pos] != '\0')
		return false;
	return true;
}
3330 
#ifdef HAVE_SYNCFS
/*
 * do_syncfs -- call syncfs() on the filesystem that contains "path".
 *
 * The path is opened read-only just to obtain a descriptor for syncfs().
 * Failure to open, or failure of syncfs() itself, is reported at LOG level
 * and otherwise ignored; nothing is returned to the caller.
 */
static void
do_syncfs(const char *path)
{
    int         fd;

    fd = OpenTransientFile(path, O_RDONLY);
    if (fd < 0)
    {
        ereport(LOG,
                (errcode_for_file_access(),
                 errmsg("could not open file \"%s\": %m", path)));
        return;
    }
    if (syncfs(fd) < 0)
        ereport(LOG,
                (errcode_for_file_access(),
                 errmsg("could not synchronize file system for file \"%s\": %m", path)));
    CloseTransientFile(fd);
}
#endif
3352 
3353 /*
3354  * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3355  * all potential filesystem, depending on recovery_init_sync_method setting.
3356  *
3357  * We fsync regular files and directories wherever they are, but we
3358  * follow symlinks only for pg_wal and immediately under pg_tblspc.
3359  * Other symlinks are presumed to point at files we're not responsible
3360  * for fsyncing, and might not have privileges to write at all.
3361  *
3362  * Errors are logged but not considered fatal; that's because this is used
3363  * only during database startup, to deal with the possibility that there are
3364  * issued-but-unsynced writes pending against the data directory. We want to
3365  * ensure that such writes reach disk before anything that's done in the new
3366  * run. However, aborting on error would result in failure to start for
3367  * harmless cases such as read-only files in the data directory, and that's
3368  * not good either.
3369  *
3370  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3371  * rewriting all changes again during recovery.
3372  *
3373  * Note we assume we're chdir'd into PGDATA to begin with.
3374  */
3375 void
3377 {
3378  bool xlog_is_symlink;
3379 
3380  /* We can skip this whole thing if fsync is disabled. */
3381  if (!enableFsync)
3382  return;
3383 
3384  /*
3385  * If pg_wal is a symlink, we'll need to recurse into it separately,
3386  * because the first walkdir below will ignore it.
3387  */
3388  xlog_is_symlink = false;
3389 
3390 #ifndef WIN32
3391  {
3392  struct stat st;
3393 
3394  if (lstat("pg_wal", &st) < 0)
3395  ereport(LOG,
3397  errmsg("could not stat file \"%s\": %m",
3398  "pg_wal")));
3399  else if (S_ISLNK(st.st_mode))
3400  xlog_is_symlink = true;
3401  }
3402 #else
3403  if (pgwin32_is_junction("pg_wal"))
3404  xlog_is_symlink = true;
3405 #endif
3406 
3407 #ifdef HAVE_SYNCFS
3409  {
3410  DIR *dir;
3411  struct dirent *de;
3412 
3413  /*
3414  * On Linux, we don't have to open every single file one by one. We
3415  * can use syncfs() to sync whole filesystems. We only expect
3416  * filesystem boundaries to exist where we tolerate symlinks, namely
3417  * pg_wal and the tablespaces, so we call syncfs() for each of those
3418  * directories.
3419  */
3420 
3421  /* Sync the top level pgdata directory. */
3422  do_syncfs(".");
3423  /* If any tablespaces are configured, sync each of those. */
3424  dir = AllocateDir("pg_tblspc");
3425  while ((de = ReadDirExtended(dir, "pg_tblspc", LOG)))
3426  {
3427  char path[MAXPGPATH];
3428 
3429  if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3430  continue;
3431 
3432  snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
3433  do_syncfs(path);
3434  }
3435  FreeDir(dir);
3436  /* If pg_wal is a symlink, process that too. */
3437  if (xlog_is_symlink)
3438  do_syncfs("pg_wal");
3439  return;
3440  }
3441 #endif /* !HAVE_SYNCFS */
3442 
3443  /*
3444  * If possible, hint to the kernel that we're soon going to fsync the data
3445  * directory and its contents. Errors in this step are even less
3446  * interesting than normal, so log them only at DEBUG1.
3447  */
3448 #ifdef PG_FLUSH_DATA_WORKS
3449  walkdir(".", pre_sync_fname, false, DEBUG1);
3450  if (xlog_is_symlink)
3451  walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3452  walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3453 #endif
3454 
3455  /*
3456  * Now we do the fsync()s in the same order.
3457  *
3458  * The main call ignores symlinks, so in addition to specially processing
3459  * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3460  * process_symlinks = true. Note that if there are any plain directories
3461  * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3462  * so we don't worry about optimizing it.
3463  */
3464  walkdir(".", datadir_fsync_fname, false, LOG);
3465  if (xlog_is_symlink)
3466  walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3467  walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3468 }
3469 
3470 /*
3471  * walkdir: recursively walk a directory, applying the action to each
3472  * regular file and directory (including the named directory itself).
3473  *
3474  * If process_symlinks is true, the action and recursion are also applied
3475  * to regular files and directories that are pointed to by symlinks in the
3476  * given directory; otherwise symlinks are ignored. Symlinks are always
3477  * ignored in subdirectories, ie we intentionally don't pass down the
3478  * process_symlinks flag to recursive calls.
3479  *
3480  * Errors are reported at level elevel, which might be ERROR or less.
3481  *
3482  * See also walkdir in file_utils.c, which is a frontend version of this
3483  * logic.
3484  */
3485 static void
3486 walkdir(const char *path,
3487  void (*action) (const char *fname, bool isdir, int elevel),
3488  bool process_symlinks,
3489  int elevel)
3490 {
3491  DIR *dir;
3492  struct dirent *de;
3493 
3494  dir = AllocateDir(path);
3495 
3496  while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3497  {
3498  char subpath[MAXPGPATH * 2];
3499 
3501 
3502  if (strcmp(de->d_name, ".") == 0 ||
3503  strcmp(de->d_name, "..") == 0)
3504  continue;
3505 
3506  snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3507 
3508  switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3509  {
3510  case PGFILETYPE_REG:
3511  (*action) (subpath, false, elevel);
3512  break;
3513  case PGFILETYPE_DIR:
3514  walkdir(subpath, action, false, elevel);
3515  break;
3516  default:
3517 
3518  /*
3519  * Errors are already reported directly by get_dirent_type(),
3520  * and any remaining symlinks and unknown file types are
3521  * ignored.
3522  */
3523  break;
3524  }
3525  }
3526 
3527  FreeDir(dir); /* we ignore any error here */
3528 
3529  /*
3530  * It's important to fsync the destination directory itself as individual
3531  * file fsyncs don't guarantee that the directory entry for the file is
3532  * synced. However, skip this if AllocateDir failed; the action function
3533  * might not be robust against that.
3534  */
3535  if (dir)
3536  (*action) (path, true, elevel);
3537 }
3538 
3539 
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Directories are skipped outright.  Errors trying to open unreadable files
 * (EACCES) are ignored silently; other open or close failures are logged at
 * the caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
    int         fd;

    /* Don't try to flush directories, it'll likely just fail */
    if (isdir)
        return;

    fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

    if (fd < 0)
    {
        if (errno == EACCES)
            return;
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not open file \"%s\": %m", fname)));
        return;
    }

    /*
     * pg_flush_data() ignores errors, which is ok because this is only a
     * hint.
     */
    pg_flush_data(fd, 0, 0);

    if (CloseTransientFile(fd) != 0)
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not close file \"%s\": %m", fname)));
}

#endif                          /* PG_FLUSH_DATA_WORKS */
3582 
/*
 * walkdir() action that fsyncs one file or directory of the data directory.
 *
 * Errors about unreadable files should be ignored silently here, so
 * fsync_fname_ext() is called with ignore_perm = true.  Its result is
 * deliberately discarded.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
    (void) fsync_fname_ext(fname, isdir, true, elevel);
}
3592 
/*
 * Remove one file or directory, tolerating the case where it is already
 * gone.
 *
 * For a directory, rmdir() is used and any failure other than ENOENT is
 * reported at elevel.  For a file, removal is delegated to
 * PathNameDeleteTemporaryFile() with error_on_failure = false; elevel is
 * not consulted on that path.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
    if (isdir)
    {
        if (rmdir(fname) != 0 && errno != ENOENT)
            ereport(elevel,
                    (errcode_for_file_access(),
                     errmsg("could not remove directory \"%s\": %m", fname)));
    }
    else
    {
        /* Use PathNameDeleteTemporaryFile to report filesize */
        PathNameDeleteTemporaryFile(fname, false);
    }
}
3609 
3610 /*
3611  * fsync_fname_ext -- Try to fsync a file or directory
3612  *
3613  * If ignore_perm is true, ignore errors upon trying to open unreadable
3614  * files. Logs other errors at a caller-specified level.
3615  *
3616  * Returns 0 if the operation succeeded, -1 otherwise.
3617  */
3618 int
3619 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3620 {
3621  int fd;
3622  int flags;
3623  int returncode;
3624 
3625  /*
3626  * Some OSs require directories to be opened read-only whereas other
3627  * systems don't allow us to fsync files opened read-only; so we need both
3628  * cases here. Using O_RDWR will cause us to fail to fsync files that are
3629  * not writable by our userid, but we assume that's OK.
3630  */
3631  flags = PG_BINARY;
3632  if (!isdir)
3633  flags |= O_RDWR;
3634  else
3635  flags |= O_RDONLY;
3636 
3637  fd = OpenTransientFile(fname, flags);
3638 
3639  /*
3640  * Some OSs don't allow us to open directories at all (Windows returns
3641  * EACCES), just ignore the error in that case. If desired also silently
3642  * ignoring errors about unreadable files. Log others.
3643  */
3644  if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3645  return 0;
3646  else if (fd < 0 && ignore_perm && errno == EACCES)
3647  return 0;
3648  else if (fd < 0)
3649  {
3650  ereport(elevel,
3652  errmsg("could not open file \"%s\": %m", fname)));
3653  return -1;
3654  }
3655 
3656  returncode = pg_fsync(fd);
3657 
3658  /*
3659  * Some OSes don't allow us to fsync directories at all, so we can ignore
3660  * those errors. Anything else needs to be logged.
3661  */
3662  if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3663  {
3664  int save_errno;
3665 
3666  /* close file upon error, might not be in transaction context */
3667  save_errno = errno;
3668  (void) CloseTransientFile(fd);
3669  errno = save_errno;
3670 
3671  ereport(elevel,
3673  errmsg("could not fsync file \"%s\": %m", fname)));
3674  return -1;
3675  }
3676 
3677  if (CloseTransientFile(fd) != 0)
3678  {
3679  ereport(elevel,
3681  errmsg("could not close file \"%s\": %m", fname)));
3682  return -1;
3683  }
3684 
3685  return 0;
3686 }
3687 
3688 /*
3689  * fsync_parent_path -- fsync the parent path of a file or directory
3690  *
3691  * This is aimed at making file operations persistent on disk in case of
3692  * an OS crash or power failure.
3693  */
3694 static int
3695 fsync_parent_path(const char *fname, int elevel)
3696 {
3697  char parentpath[MAXPGPATH];
3698 
3699  strlcpy(parentpath, fname, MAXPGPATH);
3700  get_parent_directory(parentpath);
3701 
3702  /*
3703  * get_parent_directory() returns an empty string if the input argument is
3704  * just a file name (see comments in path.c), so handle that as being the
3705  * current directory.
3706  */
3707  if (strlen(parentpath) == 0)
3708  strlcpy(parentpath, ".", MAXPGPATH);
3709 
3710  if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3711  return -1;
3712 
3713  return 0;
3714 }
3715 
3716 /*
3717  * Create a PostgreSQL data sub-directory
3718  *
3719  * The data directory itself, and most of its sub-directories, are created at
3720  * initdb time, but we do have some occasions when we create directories in
3721  * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3722  * make sure that those directories are created consistently. Today, that means
3723  * making sure that the created directory has the correct permissions, which is
3724  * what pg_dir_create_mode tracks for us.
3725  *
3726  * Note that we also set the umask() based on what we understand the correct
3727  * permissions to be (see file_perm.c).
3728  *
3729  * For permissions other than the default, mkdir() can be used directly, but
3730  * be sure to consider carefully such cases -- a sub-directory with incorrect
3731  * permissions in a PostgreSQL data directory could cause backups and other
3732  * processes to fail.
3733  */
3734 int
3735 MakePGDirectory(const char *directoryName)
3736 {
3737  return mkdir(directoryName, pg_dir_create_mode);
3738 }
3739 
3740 /*
3741  * Return the passed-in error level, or PANIC if data_sync_retry is off.
3742  *
3743  * Failure to fsync any data file is cause for immediate panic, unless
3744  * data_sync_retry is enabled. Data may have been written to the operating
3745  * system and removed from our buffer pool already, and if we are running on
3746  * an operating system that forgets dirty data on write-back failure, there
3747  * may be only one copy of the data remaining: in the WAL. A later attempt to
3748  * fsync again might falsely report success. Therefore we must not allow any
3749  * further checkpoints to be attempted. data_sync_retry can in theory be
3750  * enabled on systems known not to drop dirty buffered data on write-back
3751  * failure (with the likely outcome that checkpoints will continue to fail
3752  * until the underlying problem is fixed).
3753  *
3754  * Any code that reports a failure from fsync() or related functions should
3755  * filter the error level with this function.
3756  */
3757 int
3758 data_sync_elevel(int elevel)
3759 {
3760  return data_sync_retry ? elevel : PANIC;
3761 }
3762 
3763 /*
3764  * A convenience wrapper for pg_pwritev() that retries on partial write. If an
3765  * error is returned, it is unspecified how much has been written.
3766  */
3767 ssize_t
3768 pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
3769 {
3770  struct iovec iov_copy[PG_IOV_MAX];
3771  ssize_t sum = 0;
3772  ssize_t part;
3773 
3774  /* We'd better have space to make a copy, in case we need to retry. */
3775  if (iovcnt > PG_IOV_MAX)
3776  {
3777  errno = EINVAL;
3778  return -1;
3779  }
3780 
3781  for (;;)
3782  {
3783  /* Write as much as we can. */
3784  part = pg_pwritev(fd, iov, iovcnt, offset);
3785  if (part < 0)
3786  return -1;
3787 
3788 #ifdef SIMULATE_SHORT_WRITE
3789  part = Min(part, 4096);
3790 #endif
3791 
3792  /* Count our progress. */
3793  sum += part;
3794  offset += part;
3795 
3796  /* Step over iovecs that are done. */
3797  while (iovcnt > 0 && iov->iov_len <= part)
3798  {
3799  part -= iov->iov_len;
3800  ++iov;
3801  --iovcnt;
3802  }
3803 
3804  /* Are they all done? */
3805  if (iovcnt == 0)
3806  {
3807  /* We don't expect the kernel to write more than requested. */
3808  Assert(part == 0);
3809  break;
3810  }
3811 
3812  /*
3813  * Move whatever's left to the front of our mutable copy and adjust
3814  * the leading iovec.
3815  */
3816  Assert(iovcnt > 0);
3817  memmove(iov_copy, iov, sizeof(*iov) * iovcnt);
3818  Assert(iov->iov_len > part);
3819  iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part;
3820  iov_copy[0].iov_len -= part;
3821  iov = iov_copy;
3822  }
3823 
3824  return sum;
3825 }
size_t iov_len
Definition: pg_iovec.h:27
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1541
File lruLessRecently
Definition: fd.c:199
void closeAllVfds(void)
Definition: fd.c:2855
static PgChecksumMode mode
Definition: pg_checksums.c:63
union AllocateDesc::@18 desc
File nextFree
Definition: fd.c:197
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition: fd.c:906
int pg_file_create_mode
Definition: file_perm.c:19
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1871
#define MAP_FAILED
Definition: mem.h:45
#define DEBUG1
Definition: elog.h:25
int MyProcPid
Definition: globals.c:43
void * iov_base
Definition: pg_iovec.h:26
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1804
#define NUM_RESERVED_FDS
Definition: fd.c:128
static AllocateDesc * allocatedDescs
Definition: fd.c:260
static void pgstat_report_wait_end(void)
Definition: wait_event.h:278
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1528
int pg_fdatasync(int fd)
Definition: fd.c:442
static void error(void)
Definition: sql-dyntest.c:147
#define SYNC_METHOD_FSYNC_WRITETHROUGH
Definition: xlog.h:28
AllocateDescKind
Definition: fd.c:238
DIR * dir
Definition: fd.c:253
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition: fd.c:1747
static void AtProcExit_Files(int code, Datum arg)
Definition: fd.c:3013
static Size SizeVfdCache
Definition: fd.c:213
#define FD_TEMP_FILE_LIMIT
Definition: fd.c:190
void on_proc_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:305
#define DO_DB(A)
Definition: fd.c:176
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2927
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition: fd.c:3486
int pg_truncate(const char *path, off_t length)
Definition: fd.c:635
long random(void)
Definition: random.c:22
ResourceOwner CurrentResourceOwner
Definition: resowner.c:146
static int numExternalFDs
Definition: fd.c:265
int pg_fsync_writethrough(int fd)
Definition: fd.c:419
int forkname_chars(const char *str, ForkNumber *fork)
Definition: relpath.c:81
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2759
int max_safe_fds
Definition: fd.c:158
#define Min(x, y)
Definition: c.h:986
off_t FileSize(File file)
Definition: fd.c:2243
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:666
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:2476
#define FD_DELETE_AT_CLOSE
Definition: fd.c:188
int log_temp_files
Definition: guc.c:602
mode_t FileGetRawMode(File file)
Definition: fd.c:2331
void _dosmaperr(unsigned long)
Definition: win32error.c:171
static Vfd * VfdCache
Definition: fd.c:212
static void Delete(File file)
Definition: fd.c:1221
int closedir(DIR *)
Definition: dirent.c:123
static int numTempTableSpaces
Definition: fd.c:280
#define PG_TEMP_FILES_DIR
Definition: pg_checksums.c:60
int errcode(int sqlerrcode)
Definition: elog.c:698
#define MemSet(start, val, len)
Definition: c.h:1008
void PathNameDeleteTemporaryDir(const char *dirname)
Definition: fd.c:1636
int pg_fsync_no_writethrough(int fd)
Definition: fd.c:407
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition: fd.c:3256
void pgstat_report_tempfile(size_t filesize)
Definition: pgstat.c:1788
static bool reserveAllocatedDesc(void)
Definition: fd.c:2342
uint32 SubTransactionId
Definition: c.h:591
#define SIGPIPE
Definition: win32_port.h:164
void TempTablespacePath(char *path, Oid tablespace)
Definition: fd.c:1722
#define LOG
Definition: elog.h:26
unsigned int Oid
Definition: postgres_ext.h:31
AllocateDescKind kind
Definition: fd.c:248
char * FilePathName(File file)
Definition: fd.c:2295
Definition: dirent.h:9
#define OidIsValid(objectId)
Definition: c.h:710
#define PANIC
Definition: elog.h:50
#define PG_BINARY
Definition: c.h:1271
static char * basedir
ssize_t pg_pwrite(int fd, const void *buf, size_t nbyte, off_t offset)
Definition: pwrite.c:27
void AtEOXact_Files(bool isCommit)
Definition: fd.c:2999
Oid MyDatabaseTableSpace
Definition: globals.c:90
int ClosePipeStream(FILE *file)
Definition: fd.c:2826
ssize_t pg_pread(int fd, void *buf, size_t nbyte, off_t offset)
Definition: pread.c:27
#define malloc(a)
Definition: header.h:50
static void LruDelete(File file)
Definition: fd.c:1240
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:918
void pg_usleep(long microsec)
Definition: signal.c:53
bool TempTablespacesAreSet(void)
Definition: fd.c:2912
#define fstat
Definition: win32_port.h:274
ssize_t pg_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
Definition: pwritev.c:29
#define fsync(fd)
Definition: win32_port.h:68
static int FreeDesc(AllocateDesc *desc)
Definition: fd.c:2577
void pfree(void *pointer)
Definition: mcxt.c:1169
mode_t fileMode
Definition: fd.c:204
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition: fd.c:3228
static bool ReleaseLruFile(void)
Definition: fd.c:1335
Definition: dirent.c:25
int durable_rename_excl(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:822
#define ERROR
Definition: elog.h:46
#define PG_TEMP_FILE_PREFIX
Definition: pg_checksums.c:61
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2467
static int LruInsert(File file)
Definition: fd.c:1288
#define FATAL
Definition: elog.h:49
int recovery_init_sync_method
Definition: fd.c:164
static bool have_xact_temporary_files
Definition: fd.c:224
#define MAXPGPATH
#define PG_O_DIRECT
Definition: fd.h:95
void ReserveExternalFD(void)
Definition: fd.c:1174
DIR * opendir(const char *)
Definition: dirent.c:33
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2222
#define DEBUG2
Definition: elog.h:24
ssize_t pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
Definition: fd.c:3768
#define TABLESPACE_VERSION_DIRECTORY
Definition: relpath.h:26
char * fileName
Definition: fd.c:201
static char * buf
Definition: pg_test_fsync.c:68
Oid GetNextTempTableSpace(void)
Definition: fd.c:2945
void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition: resowner.c:1297
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3594
File PathNameOpenTemporaryFile(const char *path, int mode)
Definition: fd.c:1842
int errdetail(const char *fmt,...)
Definition: elog.c:1042
void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition: fd.c:3162
char * tablespace
Definition: pgbench.c:225
int errcode_for_file_access(void)
Definition: elog.c:721
void get_parent_directory(char *path)
Definition: path.c:854
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2417
static int nfile
Definition: fd.c:218
unsigned int uint32
Definition: c.h:441
void SyncDataDirectory(void)
Definition: fd.c:3376
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2678
static int nextTempTableSpace
Definition: fd.c:281
__int64 st_size
Definition: win32_port.h:265
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:410
int max_files_per_process
Definition: fd.c:145
static File AllocateVfd(void)
Definition: fd.c:1367
FILE * OpenPipeStream(const char *command, const char *mode)
Definition: fd.c:2520
unsigned short fdstate
Definition: fd.c:195
Definition: fd.c:192
off_t fileSize
Definition: fd.c:200
int fd
Definition: fd.c:194
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2884
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:692
static void Insert(File file)
Definition: fd.c:1266
ResourceOwner resowner
Definition: fd.c:196
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:262
bool data_sync_retry
Definition: fd.c:161
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3584
int CloseTransientFile(int fd)
Definition: fd.c:2644
#define SIG_IGN
Definition: win32_port.h:156
static void ReportTemporaryFileUsage(const char *path, off_t size)
Definition: fd.c:1481
static void ReleaseLruFiles(void)
Definition: fd.c:1357
#define WARNING
Definition: elog.h:40
#define FileIsNotOpen(file)
Definition: fd.c:185
int pg_dir_create_mode
Definition: file_perm.c:18
static int elevel
Definition: vacuumlazy.c:403
int FileWrite(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:2124
struct vfd Vfd
#define O_DSYNC
Definition: win32_port.h:328
int data_sync_elevel(int elevel)
Definition: fd.c:3758
uintptr_t Datum
Definition: postgres.h:411
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition: fd.c:2966
unsigned short st_mode
Definition: win32_port.h:260
Definition: pg_iovec.h:24
unsigned int Index
Definition: c.h:549
void pg_flush_data(int fd, off_t offset, off_t nbytes)
Definition: fd.c:462
#define FileIsValid(file)
Definition: fd.c:182
bool AcquireExternalFD(void)
Definition: fd.c:1139
FILE * file
Definition: fd.c:252
#define InvalidOid
Definition: postgres_ext.h:36
#define VFD_CLOSED
Definition: fd.c:180
static uint64 temporary_files_size
Definition: fd.c:232
#define ereport(elevel,...)
Definition: elog.h:157
int MakePGDirectory(const char *directoryName)
Definition: fd.c:3735
pqsigfunc pqsignal(int signum, pqsigfunc handler)
Definition: signal.c:170
#define free(a)
Definition: header.h:65
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static void RegisterTemporaryFile(File file)
Definition: fd.c:1500
void FileClose(File file)
Definition: fd.c:1917
#define SIG_DFL
Definition: win32_port.h:154
int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
Definition: fd.c:2017
static int FileAccess(File file)
Definition: fd.c:1445
#define Assert(condition)
Definition: c.h:804
SubTransactionId GetCurrentSubTransactionId(void)
Definition: xact.c:723
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2744
File lruMoreRecently
Definition: fd.c:198
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:2045
void RemovePgTempFiles(void)
Definition: fd.c:3103
SubTransactionId create_subid
Definition: fd.c:249
File OpenTemporaryFile(bool interXact)
Definition: fd.c:1669
size_t Size
Definition: c.h:540
static const char * directory
Definition: zic.c:632
int sync_method
Definition: xlog.c:107
struct dirent * readdir(DIR *)
Definition: dirent.c:78
#define FD_MINFREE
Definition: fd.c:137
bool looks_like_temp_rel_name(const char *name)
Definition: fd.c:3284
#define realloc(a, b)
Definition: header.h:60
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1182
#define INT64_FORMAT
Definition: c.h:483
const char * name
Definition: encode.c:515
static long tempFileCounter
Definition: fd.c:271
int fd
Definition: fd.c:254
#define S_ISDIR(m)
Definition: win32_port.h:316
#define lstat(path, sb)
Definition: win32_port.h:276
int durable_unlink(const char *fname, int elevel)
Definition: fd.c:782
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1033
int FreeFile(FILE *file)
Definition: fd.c:2616
void set_max_safe_fds(void)
Definition: fd.c:990
bool enableFsync
Definition: globals.c:122
static Oid * tempTableSpaces
Definition: fd.c:279
void ReleaseExternalFD(void)
Definition: fd.c:1192
void * palloc(Size size)
Definition: mcxt.c:1062
int errmsg(const char *fmt,...)
Definition: elog.c:909
int FileGetRawFlags(File file)
Definition: fd.c:2321
void ResourceOwnerEnlargeFiles(ResourceOwner owner)
Definition: resowner.c:1286
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1055
#define elog(elevel,...)
Definition: elog.h:232
int i
#define FD_CLOSE_AT_EOXACT
Definition: fd.c:189
void * arg
int FileGetRawDesc(File file)
Definition: fd.c:2311
static void FreeVfd(File file)
Definition: fd.c:1425
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:120
int pg_fsync(int fd)
Definition: fd.c:352
char d_name[MAX_PATH]
Definition: dirent.h:15
#define mkdir(a, b)
Definition: win32_port.h:63
int link(const char *src, const char *dst)
#define close(a)
Definition: win32.h:12
#define EINTR
Definition: win32_port.h:343
int fileFlags
Definition: fd.c:203
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition: fd.c:1605
int FileRead(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:2068
void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition: resowner.c:1306
#define snprintf
Definition: port.h:216
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2260
int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition: fd.c:3619
static int maxAllocatedDescs
Definition: fd.c:259
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition: fd.c:3031
static int fsync_parent_path(const char *fname, int elevel)
Definition: fd.c:3695
int File
Definition: fd.h:54
int FreeDir(DIR *dir)
Definition: fd.c:2796
int temp_file_limit
Definition: guc.c:609
Datum subpath(PG_FUNCTION_ARGS)
Definition: ltree_op.c:241
void InitFileAccess(void)
Definition: fd.c:873
#define stat
Definition: win32_port.h:275
static int numAllocatedDescs
Definition: fd.c:258
bool pgwin32_is_junction(const char *path)
#define ftruncate(a, b)
Definition: win32_port.h:65