PostgreSQL Source Code  git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
fd.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  * Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have. (This is around 1024 on many modern
20  * operating systems, but may be lower on others.)
21  *
22  * VFDs are managed as an LRU pool, with actual OS file descriptors
23  * being opened and closed as needed. Obviously, if a routine is
24  * opened using these interfaces, all subsequent operations must also
25  * be through these interfaces (the File type is not a real file
26  * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
39  * for a long time, like relation files. It is the caller's responsibility
40  * to close them, there is no automatic mechanism in fd.c for that.
41  *
42  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43  * temporary files that have names so that they can be shared between
44  * backends. Such files are automatically closed and count against the
45  * temporary file limit of the backend that creates them, but unlike anonymous
46  * files they are not automatically deleted. See sharedfileset.c for a shared
47  * ownership mechanism that provides automatic cleanup for shared files when
48  * the last of a group of backends detaches.
49  *
50  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52  * They behave like the corresponding native functions, except that the handle
53  * is registered with the current subtransaction, and will be automatically
54  * closed at abort. These are intended mainly for short operations like
55  * reading a configuration file; there is a limit on the number of files that
56  * can be opened using these functions at any one time.
57  *
58  * Finally, BasicOpenFile is just a thin wrapper around open() that can
59  * release file descriptors in use by the virtual file descriptors if
60  * necessary. There is no automatic cleanup of file descriptors returned by
61  * BasicOpenFile, it is solely the caller's responsibility to close the file
62  * descriptor by calling close(2).
63  *
64  * If a non-virtual file descriptor needs to be held open for any length of
65  * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66  * (and eventually ReleaseExternalFD), so that we can take it into account
67  * while deciding how many VFDs can be open. This applies to FDs obtained
68  * with BasicOpenFile as well as those obtained without use of any fd.c API.
69  *
70  *-------------------------------------------------------------------------
71  */
72 
73 #include "postgres.h"
74 
75 #include <dirent.h>
76 #include <sys/file.h>
77 #include <sys/param.h>
78 #include <sys/resource.h> /* for getrlimit */
79 #include <sys/stat.h>
80 #include <sys/types.h>
81 #ifndef WIN32
82 #include <sys/mman.h>
83 #endif
84 #include <limits.h>
85 #include <unistd.h>
86 #include <fcntl.h>
87 
88 #include "access/xact.h"
89 #include "access/xlog.h"
90 #include "catalog/pg_tablespace.h"
91 #include "common/file_perm.h"
92 #include "common/file_utils.h"
93 #include "common/pg_prng.h"
94 #include "miscadmin.h"
95 #include "pgstat.h"
96 #include "postmaster/startup.h"
97 #include "storage/fd.h"
98 #include "storage/ipc.h"
99 #include "utils/guc.h"
100 #include "utils/guc_hooks.h"
101 #include "utils/resowner.h"
102 #include "utils/varlena.h"
103 
104 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
105 #if defined(HAVE_SYNC_FILE_RANGE)
106 #define PG_FLUSH_DATA_WORKS 1
107 #elif !defined(WIN32) && defined(MS_ASYNC)
108 #define PG_FLUSH_DATA_WORKS 1
109 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
110 #define PG_FLUSH_DATA_WORKS 1
111 #endif
112 
113 /*
114  * We must leave some file descriptors free for system(), the dynamic loader,
115  * and other code that tries to open files without consulting fd.c. This
116  * is the number left free. (While we try fairly hard to prevent EMFILE
117  * errors, there's never any guarantee that we won't get ENFILE due to
118  * other processes chewing up FDs. So it's a bad idea to try to open files
119  * without consulting fd.c. Nonetheless we cannot control all code.)
120  *
121  * Because this is just a fixed setting, we are effectively assuming that
122  * no such code will leave FDs open over the long term; otherwise the slop
123  * is likely to be insufficient. Note in particular that we expect that
124  * loading a shared library does not result in any permanent increase in
125  * the number of open files. (This appears to be true on most if not
126  * all platforms as of Feb 2004.)
127  */
128 #define NUM_RESERVED_FDS 10
129 
130 /*
131  * If we have fewer than this many usable FDs after allowing for the reserved
132  * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
133  * much less than that. Note that this value ensures numExternalFDs can be
134  * at least 16; as of this writing, the contrib/postgres_fdw regression tests
135  * will not pass unless that can grow to at least 14.)
136  */
137 #define FD_MINFREE 48
138 
139 /*
140  * A number of platforms allow individual processes to open many more files
141  * than they can really support when *many* processes do the same thing.
142  * This GUC parameter lets the DBA limit max_safe_fds to something less than
143  * what the postmaster's initial probe suggests will work.
144  */
146 
147 /*
148  * Maximum number of file descriptors to open for operations that fd.c knows
149  * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
150  * to a conservative value, and remains that way indefinitely in bootstrap or
151  * standalone-backend cases. In normal postmaster operation, the postmaster
152  * calls set_max_safe_fds() late in initialization to update the value, and
153  * that value is then inherited by forked subprocesses.
154  *
155  * Note: the value of max_files_per_process is taken into account while
156  * setting this variable, and so need not be tested separately.
157  */
158 int max_safe_fds = FD_MINFREE; /* default if not changed */
159 
160 /* Whether it is safe to continue running after fsync() fails. */
161 bool data_sync_retry = false;
162 
163 /* How SyncDataDirectory() should do its job. */
165 
166 /* Which kinds of files should be opened with PG_O_DIRECT. */
168 
169 /* Debugging.... */
170 
171 #ifdef FDDEBUG
172 #define DO_DB(A) \
173  do { \
174  int _do_db_save_errno = errno; \
175  A; \
176  errno = _do_db_save_errno; \
177  } while (0)
178 #else
179 #define DO_DB(A) \
180  ((void) 0)
181 #endif
182 
183 #define VFD_CLOSED (-1)
184 
185 #define FileIsValid(file) \
186  ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
187 
188 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
189 
190 /* these are the assigned bits in fdstate below: */
191 #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
192 #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
193 #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
194 
195 typedef struct vfd
196 {
197  int fd; /* current FD, or VFD_CLOSED if none */
198  unsigned short fdstate; /* bitflags for VFD's state */
199  ResourceOwner resowner; /* owner, for automatic cleanup */
200  File nextFree; /* link to next free VFD, if in freelist */
201  File lruMoreRecently; /* doubly linked recency-of-use list */
203  off_t fileSize; /* current size of file (0 if not temporary) */
204  char *fileName; /* name of file, or NULL for unused VFD */
205  /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
206  int fileFlags; /* open(2) flags for (re)opening the file */
207  mode_t fileMode; /* mode to pass to open(2) */
208 } Vfd;
209 
210 /*
211  * Virtual File Descriptor array pointer and size. This grows as
212  * needed. 'File' values are indexes into this array.
213  * Note that VfdCache[0] is not a usable VFD, just a list header.
214  */
215 static Vfd *VfdCache;
216 static Size SizeVfdCache = 0;
217 
218 /*
219  * Number of file descriptors known to be in use by VFD entries.
220  */
221 static int nfile = 0;
222 
223 /*
224  * Flag to tell whether it's worth scanning VfdCache looking for temp files
225  * to close
226  */
227 static bool have_xact_temporary_files = false;
228 
229 /*
230  * Tracks the total size of all temporary files. Note: when temp_file_limit
231  * is being enforced, this cannot overflow since the limit cannot be more
232  * than INT_MAX kilobytes. When not enforcing, it could theoretically
233  * overflow, but we don't care.
234  */
235 static uint64 temporary_files_size = 0;
236 
237 /* Temporary file access initialized and not yet shut down? */
238 #ifdef USE_ASSERT_CHECKING
239 static bool temporary_files_allowed = false;
240 #endif
241 
242 /*
243  * List of OS handles opened with AllocateFile, AllocateDir and
244  * OpenTransientFile.
245  */
246 typedef enum
247 {
253 
254 typedef struct
255 {
258  union
259  {
260  FILE *file;
262  int fd;
263  } desc;
264 } AllocateDesc;
265 
266 static int numAllocatedDescs = 0;
267 static int maxAllocatedDescs = 0;
269 
270 /*
271  * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
272  */
273 static int numExternalFDs = 0;
274 
275 /*
276  * Number of temporary files opened during the current session;
277  * this is used in generation of tempfile names.
278  */
279 static long tempFileCounter = 0;
280 
281 /*
282  * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
283  * indicating that the current database's default tablespace should be used.)
284  * When numTempTableSpaces is -1, this has not been set in the current
285  * transaction.
286  */
287 static Oid *tempTableSpaces = NULL;
288 static int numTempTableSpaces = -1;
289 static int nextTempTableSpace = 0;
290 
291 
292 /*--------------------
293  *
294  * Private Routines
295  *
296  * Delete - delete a file from the Lru ring
297  * LruDelete - remove a file from the Lru ring and close its FD
298  * Insert - put a file at the front of the Lru ring
299  * LruInsert - put a file at the front of the Lru ring and open it
300  * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
301  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
302  * AllocateVfd - grab a free (or new) file record (from VfdCache)
303  * FreeVfd - free a file record
304  *
305  * The Least Recently Used ring is a doubly linked list that begins and
306  * ends on element zero. Element zero is special -- it doesn't represent
307  * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
308  * anchor that shows us the beginning/end of the ring.
309  * Only VFD elements that are currently really open (have an FD assigned) are
310  * in the Lru ring. Elements that are "virtually" open can be recognized
311  * by having a non-null fileName field.
312  *
313  * example:
314  *
315  * /--less----\ /---------\
316  * v \ v \
317  * #0 --more---> LeastRecentlyUsed --more-\ \
318  * ^\ | |
319  * \\less--> MostRecentlyUsedFile <---/ |
320  * \more---/ \--less--/
321  *
322  *--------------------
323  */
324 static void Delete(File file);
325 static void LruDelete(File file);
326 static void Insert(File file);
327 static int LruInsert(File file);
328 static bool ReleaseLruFile(void);
329 static void ReleaseLruFiles(void);
330 static File AllocateVfd(void);
331 static void FreeVfd(File file);
332 
333 static int FileAccess(File file);
334 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
335 static bool reserveAllocatedDesc(void);
336 static int FreeDesc(AllocateDesc *desc);
337 
338 static void BeforeShmemExit_Files(int code, Datum arg);
339 static void CleanupTempFiles(bool isCommit, bool isProcExit);
340 static void RemovePgTempRelationFiles(const char *tsdirname);
341 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
342 
343 static void walkdir(const char *path,
344  void (*action) (const char *fname, bool isdir, int elevel),
345  bool process_symlinks,
346  int elevel);
347 #ifdef PG_FLUSH_DATA_WORKS
348 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
349 #endif
350 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
351 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
352 
353 static int fsync_parent_path(const char *fname, int elevel);
354 
355 
356 /* ResourceOwner callbacks to hold virtual file descriptors */
357 static void ResOwnerReleaseFile(Datum res);
358 static char *ResOwnerPrintFile(Datum res);
359 
361 {
362  .name = "File",
363  .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
364  .release_priority = RELEASE_PRIO_FILES,
365  .ReleaseResource = ResOwnerReleaseFile,
366  .DebugPrint = ResOwnerPrintFile
367 };
368 
369 /* Convenience wrappers over ResourceOwnerRemember/Forget */
370 static inline void
372 {
374 }
375 static inline void
377 {
379 }
380 
381 /*
382  * pg_fsync --- do fsync with or without writethrough
383  */
384 int
386 {
387 #if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
388  struct stat st;
389 
390  /*
391  * Some operating system implementations of fsync() have requirements
392  * about the file access modes that were used when their file descriptor
393  * argument was opened, and these requirements differ depending on whether
394  * the file descriptor is for a directory.
395  *
396  * For any file descriptor that may eventually be handed to fsync(), we
397  * should have opened it with access modes that are compatible with
398  * fsync() on all supported systems, otherwise the code may not be
399  * portable, even if it runs ok on the current system.
400  *
401  * We assert here that a descriptor for a file was opened with write
402  * permissions (either O_RDWR or O_WRONLY) and for a directory without
403  * write permissions (O_RDONLY).
404  *
405  * Ignore any fstat errors and let the follow-up fsync() do its work.
406  * Doing this sanity check here counts for the case where fsync() is
407  * disabled.
408  */
409  if (fstat(fd, &st) == 0)
410  {
411  int desc_flags = fcntl(fd, F_GETFL);
412 
413  /*
414  * O_RDONLY is historically 0, so just make sure that for directories
415  * no write flags are used.
416  */
417  if (S_ISDIR(st.st_mode))
418  Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
419  else
420  Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
421  }
422  errno = 0;
423 #endif
424 
425  /* #if is to skip the wal_sync_method test if there's no need for it */
426 #if defined(HAVE_FSYNC_WRITETHROUGH)
428  return pg_fsync_writethrough(fd);
429  else
430 #endif
432 }
433 
434 
435 /*
436  * pg_fsync_no_writethrough --- same as fsync except does nothing if
437  * enableFsync is off
438  */
439 int
441 {
442  int rc;
443 
444  if (!enableFsync)
445  return 0;
446 
447 retry:
448  rc = fsync(fd);
449 
450  if (rc == -1 && errno == EINTR)
451  goto retry;
452 
453  return rc;
454 }
455 
456 /*
457  * pg_fsync_writethrough
458  */
459 int
461 {
462  if (enableFsync)
463  {
464 #if defined(F_FULLFSYNC)
465  return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
466 #else
467  errno = ENOSYS;
468  return -1;
469 #endif
470  }
471  else
472  return 0;
473 }
474 
475 /*
476  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
477  */
478 int
480 {
481  int rc;
482 
483  if (!enableFsync)
484  return 0;
485 
486 retry:
487  rc = fdatasync(fd);
488 
489  if (rc == -1 && errno == EINTR)
490  goto retry;
491 
492  return rc;
493 }
494 
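/*
 * pg_file_exists -- check that a file exists.
 *
 * This requires an absolute path to the file. Returns true if the path can
 * be stat()'ed and is not a directory. Returns false if it is a directory,
 * or if stat() fails with ENOENT, ENOTDIR or EACCES. Any other stat()
 * failure is reported with ereport(ERROR).
 */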
501 bool
502 pg_file_exists(const char *name)
503 {
504  struct stat st;
505 
506  Assert(name != NULL);
507 
508  if (stat(name, &st) == 0)
509  return !S_ISDIR(st.st_mode);
510  else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES))
511  ereport(ERROR,
513  errmsg("could not access file \"%s\": %m", name)));
514 
515  return false;
516 }
517 
518 /*
519  * pg_flush_data --- advise OS that the described dirty data should be flushed
520  *
521  * offset of 0 with nbytes 0 means that the entire file should be flushed
522  */
523 void
524 pg_flush_data(int fd, off_t offset, off_t nbytes)
525 {
526  /*
527  * Right now file flushing is primarily used to avoid making later
528  * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
529  * if fsyncs are disabled - that's a decision we might want to make
530  * configurable at some point.
531  */
532  if (!enableFsync)
533  return;
534 
535  /*
536  * We compile all alternatives that are supported on the current platform,
537  * to find portability problems more easily.
538  */
539 #if defined(HAVE_SYNC_FILE_RANGE)
540  {
541  int rc;
542  static bool not_implemented_by_kernel = false;
543 
544  if (not_implemented_by_kernel)
545  return;
546 
547 retry:
548 
549  /*
550  * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
551  * tells the OS that writeback for the specified blocks should be
552  * started, but that we don't want to wait for completion. Note that
553  * this call might block if too much dirty data exists in the range.
554  * This is the preferable method on OSs supporting it, as it works
555  * reliably when available (contrast to msync()) and doesn't flush out
556  * clean data (like FADV_DONTNEED).
557  */
558  rc = sync_file_range(fd, offset, nbytes,
559  SYNC_FILE_RANGE_WRITE);
560  if (rc != 0)
561  {
562  int elevel;
563 
564  if (rc == EINTR)
565  goto retry;
566 
567  /*
568  * For systems that don't have an implementation of
569  * sync_file_range() such as Windows WSL, generate only one
570  * warning and then suppress all further attempts by this process.
571  */
572  if (errno == ENOSYS)
573  {
574  elevel = WARNING;
575  not_implemented_by_kernel = true;
576  }
577  else
578  elevel = data_sync_elevel(WARNING);
579 
580  ereport(elevel,
582  errmsg("could not flush dirty data: %m")));
583  }
584 
585  return;
586  }
587 #endif
588 #if !defined(WIN32) && defined(MS_ASYNC)
589  {
590  void *p;
591  static int pagesize = 0;
592 
593  /*
594  * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
595  * writeback. On linux it only does so if MS_SYNC is specified, but
596  * then it does the writeback synchronously. Luckily all common linux
597  * systems have sync_file_range(). This is preferable over
598  * FADV_DONTNEED because it doesn't flush out clean data.
599  *
600  * We map the file (mmap()), tell the kernel to sync back the contents
601  * (msync()), and then remove the mapping again (munmap()).
602  */
603 
604  /* mmap() needs actual length if we want to map whole file */
605  if (offset == 0 && nbytes == 0)
606  {
607  nbytes = lseek(fd, 0, SEEK_END);
608  if (nbytes < 0)
609  {
612  errmsg("could not determine dirty data size: %m")));
613  return;
614  }
615  }
616 
617  /*
618  * Some platforms reject partial-page mmap() attempts. To deal with
619  * that, just truncate the request to a page boundary. If any extra
620  * bytes don't get flushed, well, it's only a hint anyway.
621  */
622 
623  /* fetch pagesize only once */
624  if (pagesize == 0)
625  pagesize = sysconf(_SC_PAGESIZE);
626 
627  /* align length to pagesize, dropping any fractional page */
628  if (pagesize > 0)
629  nbytes = (nbytes / pagesize) * pagesize;
630 
631  /* fractional-page request is a no-op */
632  if (nbytes <= 0)
633  return;
634 
635  /*
636  * mmap could well fail, particularly on 32-bit platforms where there
637  * may simply not be enough address space. If so, silently fall
638  * through to the next implementation.
639  */
640  if (nbytes <= (off_t) SSIZE_MAX)
641  p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
642  else
643  p = MAP_FAILED;
644 
645  if (p != MAP_FAILED)
646  {
647  int rc;
648 
649  rc = msync(p, (size_t) nbytes, MS_ASYNC);
650  if (rc != 0)
651  {
654  errmsg("could not flush dirty data: %m")));
655  /* NB: need to fall through to munmap()! */
656  }
657 
658  rc = munmap(p, (size_t) nbytes);
659  if (rc != 0)
660  {
661  /* FATAL error because mapping would remain */
662  ereport(FATAL,
664  errmsg("could not munmap() while flushing data: %m")));
665  }
666 
667  return;
668  }
669  }
670 #endif
671 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
672  {
673  int rc;
674 
675  /*
676  * Signal the kernel that the passed in range should not be cached
677  * anymore. This has the, desired, side effect of writing out dirty
678  * data, and the, undesired, side effect of likely discarding useful
679  * clean cached blocks. For the latter reason this is the least
680  * preferable method.
681  */
682 
683  rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
684 
685  if (rc != 0)
686  {
687  /* don't error out, this is just a performance optimization */
690  errmsg("could not flush dirty data: %m")));
691  }
692 
693  return;
694  }
695 #endif
696 }
697 
/*
 * pg_ftruncate --- set the length of an already-open file.
 *
 * Thin wrapper around ftruncate(2) that transparently repeats the call for
 * as long as it fails with EINTR.  Returns whatever the final ftruncate()
 * call returned: 0 on success, or -1 with errno set.
 */
static int
pg_ftruncate(int fd, off_t length)
{
	int			rc;

	/* keep trying while the call is merely interrupted by a signal */
	do
	{
		rc = ftruncate(fd, length);
	} while (rc == -1 && errno == EINTR);

	return rc;
}
714 
715 /*
716  * Truncate a file to a given length by name.
717  */
718 int
719 pg_truncate(const char *path, off_t length)
720 {
721  int ret;
722 #ifdef WIN32
723  int save_errno;
724  int fd;
725 
726  fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
727  if (fd >= 0)
728  {
729  ret = pg_ftruncate(fd, length);
730  save_errno = errno;
732  errno = save_errno;
733  }
734  else
735  ret = -1;
736 #else
737 
738 retry:
739  ret = truncate(path, length);
740 
741  if (ret == -1 && errno == EINTR)
742  goto retry;
743 #endif
744 
745  return ret;
746 }
747 
748 /*
749  * fsync_fname -- fsync a file or directory, handling errors properly
750  *
751  * Try to fsync a file or directory. When doing the latter, ignore errors that
752  * indicate the OS just doesn't allow/require fsyncing directories.
753  */
754 void
755 fsync_fname(const char *fname, bool isdir)
756 {
757  fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
758 }
759 
760 /*
761  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
762  *
763  * This routine ensures that, after returning, the effect of renaming file
764  * persists in case of a crash. A crash while this routine is running will
765  * leave you with either the pre-existing or the moved file in place of the
766  * new file; no mixed state or truncated files are possible.
767  *
768  * It does so by using fsync on the old filename and the possibly existing
769  * target filename before the rename, and the target file and directory after.
770  *
771  * Note that rename() cannot be used across arbitrary directories, as they
772  * might not be on the same filesystem. Therefore this routine does not
773  * support renaming across directories.
774  *
775  * Log errors with the caller specified severity.
776  *
777  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
778  * valid upon return.
779  */
780 int
781 durable_rename(const char *oldfile, const char *newfile, int elevel)
782 {
783  int fd;
784 
785  /*
786  * First fsync the old and target path (if it exists), to ensure that they
787  * are properly persistent on disk. Syncing the target file is not
788  * strictly necessary, but it makes it easier to reason about crashes;
789  * because it's then guaranteed that either source or target file exists
790  * after a crash.
791  */
792  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
793  return -1;
794 
795  fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
796  if (fd < 0)
797  {
798  if (errno != ENOENT)
799  {
800  ereport(elevel,
802  errmsg("could not open file \"%s\": %m", newfile)));
803  return -1;
804  }
805  }
806  else
807  {
808  if (pg_fsync(fd) != 0)
809  {
810  int save_errno;
811 
812  /* close file upon error, might not be in transaction context */
813  save_errno = errno;
815  errno = save_errno;
816 
817  ereport(elevel,
819  errmsg("could not fsync file \"%s\": %m", newfile)));
820  return -1;
821  }
822 
823  if (CloseTransientFile(fd) != 0)
824  {
825  ereport(elevel,
827  errmsg("could not close file \"%s\": %m", newfile)));
828  return -1;
829  }
830  }
831 
832  /* Time to do the real deal... */
833  if (rename(oldfile, newfile) < 0)
834  {
835  ereport(elevel,
837  errmsg("could not rename file \"%s\" to \"%s\": %m",
838  oldfile, newfile)));
839  return -1;
840  }
841 
842  /*
843  * To guarantee renaming the file is persistent, fsync the file with its
844  * new name, and its containing directory.
845  */
846  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
847  return -1;
848 
849  if (fsync_parent_path(newfile, elevel) != 0)
850  return -1;
851 
852  return 0;
853 }
854 
855 /*
856  * durable_unlink -- remove a file in a durable manner
857  *
858  * This routine ensures that, after returning, the effect of removing file
859  * persists in case of a crash. A crash while this routine is running will
860  * leave the system in no mixed state.
861  *
862  * It does so by using fsync on the parent directory of the file after the
863  * actual removal is done.
864  *
865  * Log errors with the severity specified by caller.
866  *
867  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
868  * valid upon return.
869  */
870 int
871 durable_unlink(const char *fname, int elevel)
872 {
873  if (unlink(fname) < 0)
874  {
875  ereport(elevel,
877  errmsg("could not remove file \"%s\": %m",
878  fname)));
879  return -1;
880  }
881 
882  /*
883  * To guarantee that the removal of the file is persistent, fsync its
884  * parent directory.
885  */
886  if (fsync_parent_path(fname, elevel) != 0)
887  return -1;
888 
889  return 0;
890 }
891 
892 /*
893  * InitFileAccess --- initialize this module during backend startup
894  *
895  * This is called during either normal or standalone backend start.
896  * It is *not* called in the postmaster.
897  *
898  * Note that this does not initialize temporary file access, that is
899  * separately initialized via InitTemporaryFileAccess().
900  */
901 void
903 {
904  Assert(SizeVfdCache == 0); /* call me only once */
905 
906  /* initialize cache header entry */
907  VfdCache = (Vfd *) malloc(sizeof(Vfd));
908  if (VfdCache == NULL)
909  ereport(FATAL,
910  (errcode(ERRCODE_OUT_OF_MEMORY),
911  errmsg("out of memory")));
912 
913  MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
915 
916  SizeVfdCache = 1;
917 }
918 
919 /*
920  * InitTemporaryFileAccess --- initialize temporary file access during startup
921  *
922  * This is called during either normal or standalone backend start.
923  * It is *not* called in the postmaster.
924  *
925  * This is separate from InitFileAccess() because temporary file cleanup can
926  * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
927  * our reporting has to happen before that. Low level file access should be
928  * available for longer, hence the separate initialization / shutdown of
929  * temporary file handling.
930  */
931 void
933 {
934  Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
935  Assert(!temporary_files_allowed); /* call me only once */
936 
937  /*
938  * Register before-shmem-exit hook to ensure temp files are dropped while
939  * we can still report stats.
940  */
942 
943 #ifdef USE_ASSERT_CHECKING
944  temporary_files_allowed = true;
945 #endif
946 }
947 
948 /*
949  * count_usable_fds --- count how many FDs the system will let us open,
950  * and estimate how many are already open.
951  *
952  * We stop counting if usable_fds reaches max_to_probe. Note: a small
953  * value of max_to_probe might result in an underestimate of already_open;
954  * we must fill in any "gaps" in the set of used FDs before the calculation
955  * of already_open will give the right answer. In practice, max_to_probe
956  * of a couple of dozen should be enough to ensure good results.
957  *
958  * We assume stderr (FD 2) is available for dup'ing. While the calling
959  * script could theoretically close that, it would be a really bad idea,
960  * since then one risks loss of error messages from, e.g., libc.
961  */
962 static void
963 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
964 {
965  int *fd;
966  int size;
967  int used = 0;
968  int highestfd = 0;
969  int j;
970 
971 #ifdef HAVE_GETRLIMIT
972  struct rlimit rlim;
973  int getrlimit_status;
974 #endif
975 
976  size = 1024;
977  fd = (int *) palloc(size * sizeof(int));
978 
979 #ifdef HAVE_GETRLIMIT
980  getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
981  if (getrlimit_status != 0)
982  ereport(WARNING, (errmsg("getrlimit failed: %m")));
983 #endif /* HAVE_GETRLIMIT */
984 
985  /* dup until failure or probe limit reached */
986  for (;;)
987  {
988  int thisfd;
989 
990 #ifdef HAVE_GETRLIMIT
991 
992  /*
993  * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
994  * some platforms
995  */
996  if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
997  break;
998 #endif
999 
1000  thisfd = dup(2);
1001  if (thisfd < 0)
1002  {
1003  /* Expect EMFILE or ENFILE, else it's fishy */
1004  if (errno != EMFILE && errno != ENFILE)
1005  elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
1006  break;
1007  }
1008 
1009  if (used >= size)
1010  {
1011  size *= 2;
1012  fd = (int *) repalloc(fd, size * sizeof(int));
1013  }
1014  fd[used++] = thisfd;
1015 
1016  if (highestfd < thisfd)
1017  highestfd = thisfd;
1018 
1019  if (used >= max_to_probe)
1020  break;
1021  }
1022 
1023  /* release the files we opened */
1024  for (j = 0; j < used; j++)
1025  close(fd[j]);
1026 
1027  pfree(fd);
1028 
1029  /*
1030  * Return results. usable_fds is just the number of successful dups. We
1031  * assume that the system limit is highestfd+1 (remember 0 is a legal FD
1032  * number) and so already_open is highestfd+1 - usable_fds.
1033  */
1034  *usable_fds = used;
1035  *already_open = highestfd + 1 - used;
1036 }
1037 
1038 /*
1039  * set_max_safe_fds
1040  * Determine number of file descriptors that fd.c is allowed to use
1041  */
1042 void
1044 {
1045  int usable_fds;
1046  int already_open;
1047 
1048  /*----------
1049  * We want to set max_safe_fds to
1050  * MIN(usable_fds, max_files_per_process - already_open)
1051  * less the slop factor for files that are opened without consulting
1052  * fd.c. This ensures that we won't exceed either max_files_per_process
1053  * or the experimentally-determined EMFILE limit.
1054  *----------
1055  */
1057  &usable_fds, &already_open);
1058 
1059  max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
1060 
1061  /*
1062  * Take off the FDs reserved for system() etc.
1063  */
1065 
1066  /*
1067  * Make sure we still have enough to get by.
1068  */
1069  if (max_safe_fds < FD_MINFREE)
1070  ereport(FATAL,
1071  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1072  errmsg("insufficient file descriptors available to start server process"),
1073  errdetail("System allows %d, server needs at least %d.",
1076 
1077  elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1078  max_safe_fds, usable_fds, already_open);
1079 }
1080 
1081 /*
1082  * Open a file with BasicOpenFilePerm() and pass default file mode for the
1083  * fileMode parameter.
1084  */
1085 int
1086 BasicOpenFile(const char *fileName, int fileFlags)
1087 {
1088  return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1089 }
1090 
1091 /*
1092  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1093  *
1094  * This is exported for use by places that really want a plain kernel FD,
1095  * but need to be proof against running out of FDs. Once an FD has been
1096  * successfully returned, it is the caller's responsibility to ensure that
1097  * it will not be leaked on ereport()! Most users should *not* call this
1098  * routine directly, but instead use the VFD abstraction level, which
1099  * provides protection against descriptor leaks as well as management of
1100  * files that need to be open for more than a short period of time.
1101  *
1102  * Ideally this should be the *only* direct call of open() in the backend.
1103  * In practice, the postmaster calls open() directly, and there are some
1104  * direct open() calls done early in backend startup. Those are OK since
1105  * this module wouldn't have any open files to close at that point anyway.
1106  */
1107 int
1108 BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1109 {
1110  int fd;
1111 
1112 tryAgain:
1113 #ifdef PG_O_DIRECT_USE_F_NOCACHE
1114 
1115  /*
1116  * The value we defined to stand in for O_DIRECT when simulating it with
1117  * F_NOCACHE had better not collide with any of the standard flags.
1118  */
1120  (O_APPEND |
1121  O_CLOEXEC |
1122  O_CREAT |
1123  O_DSYNC |
1124  O_EXCL |
1125  O_RDWR |
1126  O_RDONLY |
1127  O_SYNC |
1128  O_TRUNC |
1129  O_WRONLY)) == 0,
1130  "PG_O_DIRECT value collides with standard flag");
1131  fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1132 #else
1133  fd = open(fileName, fileFlags, fileMode);
1134 #endif
1135 
1136  if (fd >= 0)
1137  {
1138 #ifdef PG_O_DIRECT_USE_F_NOCACHE
1139  if (fileFlags & PG_O_DIRECT)
1140  {
1141  if (fcntl(fd, F_NOCACHE, 1) < 0)
1142  {
1143  int save_errno = errno;
1144 
1145  close(fd);
1146  errno = save_errno;
1147  return -1;
1148  }
1149  }
1150 #endif
1151 
1152  return fd; /* success! */
1153  }
1154 
1155  if (errno == EMFILE || errno == ENFILE)
1156  {
1157  int save_errno = errno;
1158 
1159  ereport(LOG,
1160  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1161  errmsg("out of file descriptors: %m; release and retry")));
1162  errno = 0;
1163  if (ReleaseLruFile())
1164  goto tryAgain;
1165  errno = save_errno;
1166  }
1167 
1168  return -1; /* failure */
1169 }
1170 
1171 /*
1172  * AcquireExternalFD - attempt to reserve an external file descriptor
1173  *
1174  * This should be used by callers that need to hold a file descriptor open
1175  * over more than a short interval, but cannot use any of the other facilities
1176  * provided by this module.
1177  *
1178  * The difference between this and the underlying ReserveExternalFD function
1179  * is that this will report failure (by setting errno and returning false)
1180  * if "too many" external FDs are already reserved. This should be used in
1181  * any code where the total number of FDs to be reserved is not predictable
1182  * and small.
1183  */
1184 bool
1186 {
1187  /*
1188  * We don't want more than max_safe_fds / 3 FDs to be consumed for
1189  * "external" FDs.
1190  */
1191  if (numExternalFDs < max_safe_fds / 3)
1192  {
1194  return true;
1195  }
1196  errno = EMFILE;
1197  return false;
1198 }
1199 
1200 /*
1201  * ReserveExternalFD - report external consumption of a file descriptor
1202  *
1203  * This should be used by callers that need to hold a file descriptor open
1204  * over more than a short interval, but cannot use any of the other facilities
1205  * provided by this module. This just tracks the use of the FD and closes
1206  * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1207  *
1208  * Call this directly only in code where failure to reserve the FD would be
1209  * fatal; for example, the WAL-writing code does so, since the alternative is
1210  * session failure. Also, it's very unwise to do so in code that could
1211  * consume more than one FD per process.
1212  *
1213  * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1214  * available, it doesn't matter too much whether this is called before or
1215  * after actually opening the FD; but doing so beforehand reduces the risk of
1216  * an EMFILE failure if not everybody played nice. In any case, it's solely
1217  * caller's responsibility to keep the external-FD count in sync with reality.
1218  */
1219 void
1221 {
1222  /*
1223  * Release VFDs if needed to stay safe. Because we do this before
1224  * incrementing numExternalFDs, the final state will be as desired, i.e.,
1225  * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1226  */
1227  ReleaseLruFiles();
1228 
1229  numExternalFDs++;
1230 }
1231 
1232 /*
1233  * ReleaseExternalFD - report release of an external file descriptor
1234  *
1235  * This is guaranteed not to change errno, so it can be used in failure paths.
1236  */
1237 void
1239 {
1240  Assert(numExternalFDs > 0);
1241  numExternalFDs--;
1242 }
1243 
1244 
1245 #if defined(FDDEBUG)
1246 
1247 static void
1248 _dump_lru(void)
1249 {
1250  int mru = VfdCache[0].lruLessRecently;
1251  Vfd *vfdP = &VfdCache[mru];
1252  char buf[2048];
1253 
1254  snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
1255  while (mru != 0)
1256  {
1257  mru = vfdP->lruLessRecently;
1258  vfdP = &VfdCache[mru];
1259  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
1260  }
1261  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
1262  elog(LOG, "%s", buf);
1263 }
1264 #endif /* FDDEBUG */
1265 
1266 static void
1268 {
1269  Vfd *vfdP;
1270 
1271  Assert(file != 0);
1272 
1273  DO_DB(elog(LOG, "Delete %d (%s)",
1274  file, VfdCache[file].fileName));
1275  DO_DB(_dump_lru());
1276 
1277  vfdP = &VfdCache[file];
1278 
1281 
1282  DO_DB(_dump_lru());
1283 }
1284 
1285 static void
1287 {
1288  Vfd *vfdP;
1289 
1290  Assert(file != 0);
1291 
1292  DO_DB(elog(LOG, "LruDelete %d (%s)",
1293  file, VfdCache[file].fileName));
1294 
1295  vfdP = &VfdCache[file];
1296 
1297  /*
1298  * Close the file. We aren't expecting this to fail; if it does, better
1299  * to leak the FD than to mess up our internal state.
1300  */
1301  if (close(vfdP->fd) != 0)
1303  "could not close file \"%s\": %m", vfdP->fileName);
1304  vfdP->fd = VFD_CLOSED;
1305  --nfile;
1306 
1307  /* delete the vfd record from the LRU ring */
1308  Delete(file);
1309 }
1310 
1311 static void
1313 {
1314  Vfd *vfdP;
1315 
1316  Assert(file != 0);
1317 
1318  DO_DB(elog(LOG, "Insert %d (%s)",
1319  file, VfdCache[file].fileName));
1320  DO_DB(_dump_lru());
1321 
1322  vfdP = &VfdCache[file];
1323 
1324  vfdP->lruMoreRecently = 0;
1326  VfdCache[0].lruLessRecently = file;
1328 
1329  DO_DB(_dump_lru());
1330 }
1331 
1332 /* returns 0 on success, -1 on re-open failure (with errno set) */
1333 static int
1335 {
1336  Vfd *vfdP;
1337 
1338  Assert(file != 0);
1339 
1340  DO_DB(elog(LOG, "LruInsert %d (%s)",
1341  file, VfdCache[file].fileName));
1342 
1343  vfdP = &VfdCache[file];
1344 
1345  if (FileIsNotOpen(file))
1346  {
1347  /* Close excess kernel FDs. */
1348  ReleaseLruFiles();
1349 
1350  /*
1351  * The open could still fail for lack of file descriptors, eg due to
1352  * overall system file table being full. So, be prepared to release
1353  * another FD if necessary...
1354  */
1355  vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1356  vfdP->fileMode);
1357  if (vfdP->fd < 0)
1358  {
1359  DO_DB(elog(LOG, "re-open failed: %m"));
1360  return -1;
1361  }
1362  else
1363  {
1364  ++nfile;
1365  }
1366  }
1367 
1368  /*
1369  * put it at the head of the Lru ring
1370  */
1371 
1372  Insert(file);
1373 
1374  return 0;
1375 }
1376 
1377 /*
1378  * Release one kernel FD by closing the least-recently-used VFD.
1379  */
1380 static bool
1382 {
1383  DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1384 
1385  if (nfile > 0)
1386  {
1387  /*
1388  * There are opened files and so there should be at least one used vfd
1389  * in the ring.
1390  */
1391  Assert(VfdCache[0].lruMoreRecently != 0);
1392  LruDelete(VfdCache[0].lruMoreRecently);
1393  return true; /* freed a file */
1394  }
1395  return false; /* no files available to free */
1396 }
1397 
1398 /*
1399  * Release kernel FDs as needed to get under the max_safe_fds limit.
1400  * After calling this, it's OK to try to open another file.
1401  */
1402 static void
1404 {
1406  {
1407  if (!ReleaseLruFile())
1408  break;
1409  }
1410 }
1411 
1412 static File
1414 {
1415  Index i;
1416  File file;
1417 
1418  DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1419 
1420  Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1421 
1422  if (VfdCache[0].nextFree == 0)
1423  {
1424  /*
1425  * The free list is empty so it is time to increase the size of the
1426  * array. We choose to double it each time this happens. However,
1427  * there's not much point in starting *real* small.
1428  */
1429  Size newCacheSize = SizeVfdCache * 2;
1430  Vfd *newVfdCache;
1431 
1432  if (newCacheSize < 32)
1433  newCacheSize = 32;
1434 
1435  /*
1436  * Be careful not to clobber VfdCache ptr if realloc fails.
1437  */
1438  newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1439  if (newVfdCache == NULL)
1440  ereport(ERROR,
1441  (errcode(ERRCODE_OUT_OF_MEMORY),
1442  errmsg("out of memory")));
1443  VfdCache = newVfdCache;
1444 
1445  /*
1446  * Initialize the new entries and link them into the free list.
1447  */
1448  for (i = SizeVfdCache; i < newCacheSize; i++)
1449  {
1450  MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1451  VfdCache[i].nextFree = i + 1;
1452  VfdCache[i].fd = VFD_CLOSED;
1453  }
1454  VfdCache[newCacheSize - 1].nextFree = 0;
1456 
1457  /*
1458  * Record the new size
1459  */
1460  SizeVfdCache = newCacheSize;
1461  }
1462 
1463  file = VfdCache[0].nextFree;
1464 
1465  VfdCache[0].nextFree = VfdCache[file].nextFree;
1466 
1467  return file;
1468 }
1469 
1470 static void
1472 {
1473  Vfd *vfdP = &VfdCache[file];
1474 
1475  DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1476  file, vfdP->fileName ? vfdP->fileName : ""));
1477 
1478  if (vfdP->fileName != NULL)
1479  {
1480  free(vfdP->fileName);
1481  vfdP->fileName = NULL;
1482  }
1483  vfdP->fdstate = 0x0;
1484 
1485  vfdP->nextFree = VfdCache[0].nextFree;
1486  VfdCache[0].nextFree = file;
1487 }
1488 
1489 /* returns 0 on success, -1 on re-open failure (with errno set) */
1490 static int
1492 {
1493  int returnValue;
1494 
1495  DO_DB(elog(LOG, "FileAccess %d (%s)",
1496  file, VfdCache[file].fileName));
1497 
1498  /*
1499  * Is the file open? If not, open it and put it at the head of the LRU
1500  * ring (possibly closing the least recently used file to get an FD).
1501  */
1502 
1503  if (FileIsNotOpen(file))
1504  {
1505  returnValue = LruInsert(file);
1506  if (returnValue != 0)
1507  return returnValue;
1508  }
1509  else if (VfdCache[0].lruLessRecently != file)
1510  {
1511  /*
1512  * We now know that the file is open and that it is not the last one
1513  * accessed, so we need to move it to the head of the Lru ring.
1514  */
1515 
1516  Delete(file);
1517  Insert(file);
1518  }
1519 
1520  return 0;
1521 }
1522 
1523 /*
1524  * Called whenever a temporary file is deleted to report its size.
1525  */
1526 static void
1527 ReportTemporaryFileUsage(const char *path, off_t size)
1528 {
1530 
1531  if (log_temp_files >= 0)
1532  {
1533  if ((size / 1024) >= log_temp_files)
1534  ereport(LOG,
1535  (errmsg("temporary file: path \"%s\", size %lu",
1536  path, (unsigned long) size)));
1537  }
1538 }
1539 
1540 /*
1541  * Called to register a temporary file for automatic close.
1542  * ResourceOwnerEnlarge(CurrentResourceOwner) must have been called
1543  * before the file was opened.
1544  */
1545 static void
1547 {
1550 
1551  /* Backup mechanism for closing at end of xact. */
1554 }
1555 
1556 /*
1557  * Called when we get a shared invalidation message on some relation.
1558  */
1559 #ifdef NOT_USED
1560 void
1561 FileInvalidate(File file)
1562 {
1563  Assert(FileIsValid(file));
1564  if (!FileIsNotOpen(file))
1565  LruDelete(file);
1566 }
1567 #endif
1568 
1569 /*
1570  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1571  * fileMode parameter.
1572  */
1573 File
1574 PathNameOpenFile(const char *fileName, int fileFlags)
1575 {
1576  return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1577 }
1578 
1579 /*
1580  * open a file in an arbitrary directory
1581  *
1582  * NB: if the passed pathname is relative (which it usually is),
1583  * it will be interpreted relative to the process' working directory
1584  * (which should always be $PGDATA when this code is running).
1585  */
1586 File
1587 PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1588 {
1589  char *fnamecopy;
1590  File file;
1591  Vfd *vfdP;
1592 
1593  DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1594  fileName, fileFlags, fileMode));
1595 
1596  /*
1597  * We need a malloc'd copy of the file name; fail cleanly if no room.
1598  */
1599  fnamecopy = strdup(fileName);
1600  if (fnamecopy == NULL)
1601  ereport(ERROR,
1602  (errcode(ERRCODE_OUT_OF_MEMORY),
1603  errmsg("out of memory")));
1604 
1605  file = AllocateVfd();
1606  vfdP = &VfdCache[file];
1607 
1608  /* Close excess kernel FDs. */
1609  ReleaseLruFiles();
1610 
1611  /*
1612  * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1613  * client shouldn't be expected to know which kernel descriptors are
1614  * currently open, so it wouldn't make sense for them to be inherited by
1615  * executed subprograms.
1616  */
1617  fileFlags |= O_CLOEXEC;
1618 
1619  vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1620 
1621  if (vfdP->fd < 0)
1622  {
1623  int save_errno = errno;
1624 
1625  FreeVfd(file);
1626  free(fnamecopy);
1627  errno = save_errno;
1628  return -1;
1629  }
1630  ++nfile;
1631  DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1632  vfdP->fd));
1633 
1634  vfdP->fileName = fnamecopy;
1635  /* Saved flags are adjusted to be OK for re-opening file */
1636  vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1637  vfdP->fileMode = fileMode;
1638  vfdP->fileSize = 0;
1639  vfdP->fdstate = 0x0;
1640  vfdP->resowner = NULL;
1641 
1642  Insert(file);
1643 
1644  return file;
1645 }
1646 
1647 /*
1648  * Create directory 'directory'. If necessary, create 'basedir', which must
1649  * be the directory above it. This is designed for creating the top-level
1650  * temporary directory on demand before creating a directory underneath it.
1651  * Do nothing if the directory already exists.
1652  *
1653  * Directories created within the top-level temporary directory should begin
1654  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1655  * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1656  * that do not need any particular prefix.
1657 */
1658 void
1660 {
1661  if (MakePGDirectory(directory) < 0)
1662  {
1663  if (errno == EEXIST)
1664  return;
1665 
1666  /*
1667  * Failed. Try to create basedir first in case it's missing. Tolerate
1668  * EEXIST to close a race against another process following the same
1669  * algorithm.
1670  */
1671  if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1672  ereport(ERROR,
1674  errmsg("cannot create temporary directory \"%s\": %m",
1675  basedir)));
1676 
1677  /* Try again. */
1678  if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1679  ereport(ERROR,
1681  errmsg("cannot create temporary subdirectory \"%s\": %m",
1682  directory)));
1683  }
1684 }
1685 
1686 /*
1687  * Delete a directory and everything in it, if it exists.
1688  */
1689 void
1690 PathNameDeleteTemporaryDir(const char *dirname)
1691 {
1692  struct stat statbuf;
1693 
1694  /* Silently ignore missing directory. */
1695  if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1696  return;
1697 
1698  /*
1699  * Currently, walkdir doesn't offer a way for our passed in function to
1700  * maintain state. Perhaps it should, so that we could tell the caller
1701  * whether this operation succeeded or failed. Since this operation is
1702  * used in a cleanup path, we wouldn't actually behave differently: we'll
1703  * just log failures.
1704  */
1705  walkdir(dirname, unlink_if_exists_fname, false, LOG);
1706 }
1707 
1708 /*
1709  * Open a temporary file that will disappear when we close it.
1710  *
1711  * This routine takes care of generating an appropriate tempfile name.
1712  * There's no need to pass in fileFlags or fileMode either, since only
1713  * one setting makes any sense for a temp file.
1714  *
1715  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1716  * to ensure it's closed and deleted when it's no longer needed, typically at
1717  * the end-of-transaction. In most cases, you don't want temporary files to
1718  * outlive the transaction that created them, so this should be false -- but
1719  * if you need "somewhat" temporary storage, this might be useful. In either
1720  * case, the file is removed when the File is explicitly closed.
1721  */
1722 File
1723 OpenTemporaryFile(bool interXact)
1724 {
1725  File file = 0;
1726 
1727  Assert(temporary_files_allowed); /* check temp file access is up */
1728 
1729  /*
1730  * Make sure the current resource owner has space for this File before we
1731  * open it, if we'll be registering it below.
1732  */
1733  if (!interXact)
1735 
1736  /*
1737  * If some temp tablespace(s) have been given to us, try to use the next
1738  * one. If a given tablespace can't be found, we silently fall back to
1739  * the database's default tablespace.
1740  *
1741  * BUT: if the temp file is slated to outlive the current transaction,
1742  * force it into the database's default tablespace, so that it will not
1743  * pose a threat to possible tablespace drop attempts.
1744  */
1745  if (numTempTableSpaces > 0 && !interXact)
1746  {
1747  Oid tblspcOid = GetNextTempTableSpace();
1748 
1749  if (OidIsValid(tblspcOid))
1750  file = OpenTemporaryFileInTablespace(tblspcOid, false);
1751  }
1752 
1753  /*
1754  * If not, or if tablespace is bad, create in database's default
1755  * tablespace. MyDatabaseTableSpace should normally be set before we get
1756  * here, but just in case it isn't, fall back to pg_default tablespace.
1757  */
1758  if (file <= 0)
1761  DEFAULTTABLESPACE_OID,
1762  true);
1763 
1764  /* Mark it for deletion at close and temporary file size limit */
1766 
1767  /* Register it with the current resource owner */
1768  if (!interXact)
1769  RegisterTemporaryFile(file);
1770 
1771  return file;
1772 }
1773 
1774 /*
1775  * Return the path of the temp directory in a given tablespace.
1776  */
1777 void
1779 {
1780  /*
1781  * Identify the tempfile directory for this tablespace.
1782  *
1783  * If someone tries to specify pg_global, use pg_default instead.
1784  */
1785  if (tablespace == InvalidOid ||
1786  tablespace == DEFAULTTABLESPACE_OID ||
1787  tablespace == GLOBALTABLESPACE_OID)
1788  snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1789  else
1790  {
1791  /* All other tablespaces are accessed via symlinks */
1792  snprintf(path, MAXPGPATH, "%s/%u/%s/%s",
1795  }
1796 }
1797 
1798 /*
1799  * Open a temporary file in a specific tablespace.
1800  * Subroutine for OpenTemporaryFile, which see for details.
1801  */
1802 static File
1803 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1804 {
1805  char tempdirpath[MAXPGPATH];
1806  char tempfilepath[MAXPGPATH];
1807  File file;
1808 
1809  TempTablespacePath(tempdirpath, tblspcOid);
1810 
1811  /*
1812  * Generate a tempfile name that should be unique within the current
1813  * database instance.
1814  */
1815  snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1816  tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1817 
1818  /*
1819  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1820  * temp file that can be reused.
1821  */
1822  file = PathNameOpenFile(tempfilepath,
1823  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1824  if (file <= 0)
1825  {
1826  /*
1827  * We might need to create the tablespace's tempfile directory, if no
1828  * one has yet done so.
1829  *
1830  * Don't check for an error from MakePGDirectory; it could fail if
1831  * someone else just did the same thing. If it doesn't work then
1832  * we'll bomb out on the second create attempt, instead.
1833  */
1834  (void) MakePGDirectory(tempdirpath);
1835 
1836  file = PathNameOpenFile(tempfilepath,
1837  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1838  if (file <= 0 && rejectError)
1839  elog(ERROR, "could not create temporary file \"%s\": %m",
1840  tempfilepath);
1841  }
1842 
1843  return file;
1844 }
1845 
1846 
1847 /*
1848  * Create a new file. The directory containing it must already exist. Files
1849  * created this way are subject to temp_file_limit and are automatically
1850  * closed at end of transaction, but are not automatically deleted on close
1851  * because they are intended to be shared between cooperating backends.
1852  *
1853  * If the file is inside the top-level temporary directory, its name should
1854  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1855  * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1856  * inside a directory created with PathNameCreateTemporaryDir(), in which case
1857  * the prefix isn't needed.
1858  */
1859 File
1860 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1861 {
1862  File file;
1863 
1864  Assert(temporary_files_allowed); /* check temp file access is up */
1865 
1867 
1868  /*
1869  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1870  * temp file that can be reused.
1871  */
1872  file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1873  if (file <= 0)
1874  {
1875  if (error_on_failure)
1876  ereport(ERROR,
1878  errmsg("could not create temporary file \"%s\": %m",
1879  path)));
1880  else
1881  return file;
1882  }
1883 
1884  /* Mark it for temp_file_limit accounting. */
1886 
1887  /* Register it for automatic close. */
1888  RegisterTemporaryFile(file);
1889 
1890  return file;
1891 }
1892 
1893 /*
1894  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1895  * another backend. Files opened this way don't count against the
1896  * temp_file_limit of the caller, are automatically closed at the end of the
1897  * transaction but are not deleted on close.
1898  */
1899 File
1900 PathNameOpenTemporaryFile(const char *path, int mode)
1901 {
1902  File file;
1903 
1904  Assert(temporary_files_allowed); /* check temp file access is up */
1905 
1907 
1908  file = PathNameOpenFile(path, mode | PG_BINARY);
1909 
1910  /* If no such file, then we don't raise an error. */
1911  if (file <= 0 && errno != ENOENT)
1912  ereport(ERROR,
1914  errmsg("could not open temporary file \"%s\": %m",
1915  path)));
1916 
1917  if (file > 0)
1918  {
1919  /* Register it for automatic close. */
1920  RegisterTemporaryFile(file);
1921  }
1922 
1923  return file;
1924 }
1925 
1926 /*
1927  * Delete a file by pathname. Return true if the file existed, false if it
1928  * didn't.
1929  */
1930 bool
1931 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1932 {
1933  struct stat filestats;
1934  int stat_errno;
1935 
1936  /* Get the final size for pgstat reporting. */
1937  if (stat(path, &filestats) != 0)
1938  stat_errno = errno;
1939  else
1940  stat_errno = 0;
1941 
1942  /*
1943  * Unlike FileClose's automatic file deletion code, we tolerate
1944  * non-existence to support BufFileDeleteFileSet which doesn't know how
1945  * many segments it has to delete until it runs out.
1946  */
1947  if (stat_errno == ENOENT)
1948  return false;
1949 
1950  if (unlink(path) < 0)
1951  {
1952  if (errno != ENOENT)
1953  ereport(error_on_failure ? ERROR : LOG,
1955  errmsg("could not unlink temporary file \"%s\": %m",
1956  path)));
1957  return false;
1958  }
1959 
1960  if (stat_errno == 0)
1961  ReportTemporaryFileUsage(path, filestats.st_size);
1962  else
1963  {
1964  errno = stat_errno;
1965  ereport(LOG,
1967  errmsg("could not stat file \"%s\": %m", path)));
1968  }
1969 
1970  return true;
1971 }
1972 
1973 /*
1974  * close a file when done with it
1975  */
1976 void
1978 {
1979  Vfd *vfdP;
1980 
1981  Assert(FileIsValid(file));
1982 
1983  DO_DB(elog(LOG, "FileClose: %d (%s)",
1984  file, VfdCache[file].fileName));
1985 
1986  vfdP = &VfdCache[file];
1987 
1988  if (!FileIsNotOpen(file))
1989  {
1990  /* close the file */
1991  if (close(vfdP->fd) != 0)
1992  {
1993  /*
1994  * We may need to panic on failure to close non-temporary files;
1995  * see LruDelete.
1996  */
1998  "could not close file \"%s\": %m", vfdP->fileName);
1999  }
2000 
2001  --nfile;
2002  vfdP->fd = VFD_CLOSED;
2003 
2004  /* remove the file from the lru ring */
2005  Delete(file);
2006  }
2007 
2008  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2009  {
2010  /* Subtract its size from current usage (do first in case of error) */
2011  temporary_files_size -= vfdP->fileSize;
2012  vfdP->fileSize = 0;
2013  }
2014 
2015  /*
2016  * Delete the file if it was temporary, and make a log entry if wanted
2017  */
2018  if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
2019  {
2020  struct stat filestats;
2021  int stat_errno;
2022 
2023  /*
2024  * If we get an error, as could happen within the ereport/elog calls,
2025  * we'll come right back here during transaction abort. Reset the
2026  * flag to ensure that we can't get into an infinite loop. This code
2027  * is arranged to ensure that the worst-case consequence is failing to
2028  * emit log message(s), not failing to attempt the unlink.
2029  */
2030  vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
2031 
2032 
2033  /* first try the stat() */
2034  if (stat(vfdP->fileName, &filestats))
2035  stat_errno = errno;
2036  else
2037  stat_errno = 0;
2038 
2039  /* in any case do the unlink */
2040  if (unlink(vfdP->fileName))
2041  ereport(LOG,
2043  errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
2044 
2045  /* and last report the stat results */
2046  if (stat_errno == 0)
2047  ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2048  else
2049  {
2050  errno = stat_errno;
2051  ereport(LOG,
2053  errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2054  }
2055  }
2056 
2057  /* Unregister it from the resource owner */
2058  if (vfdP->resowner)
2059  ResourceOwnerForgetFile(vfdP->resowner, file);
2060 
2061  /*
2062  * Return the Vfd slot to the free list
2063  */
2064  FreeVfd(file);
2065 }
2066 
2067 /*
2068  * FilePrefetch - initiate asynchronous read of a given range of the file.
2069  *
2070  * Returns 0 on success, otherwise an errno error code (like posix_fadvise()).
2071  *
2072  * posix_fadvise() is the simplest standardized interface that accomplishes
2073  * this.
2074  */
2075 int
2076 FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
2077 {
2078  Assert(FileIsValid(file));
2079 
2080  DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2081  file, VfdCache[file].fileName,
2082  (int64) offset, (int64) amount));
2083 
2084 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2085  {
2086  int returnCode;
2087 
2088  returnCode = FileAccess(file);
2089  if (returnCode < 0)
2090  return returnCode;
2091 
2092 retry:
2093  pgstat_report_wait_start(wait_event_info);
2094  returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2095  POSIX_FADV_WILLNEED);
2097 
2098  if (returnCode == EINTR)
2099  goto retry;
2100 
2101  return returnCode;
2102  }
2103 #elif defined(__darwin__)
2104  {
2105  struct radvisory
2106  {
2107  off_t ra_offset; /* offset into the file */
2108  int ra_count; /* size of the read */
2109  } ra;
2110  int returnCode;
2111 
2112  returnCode = FileAccess(file);
2113  if (returnCode < 0)
2114  return returnCode;
2115 
2116  ra.ra_offset = offset;
2117  ra.ra_count = amount;
2118  pgstat_report_wait_start(wait_event_info);
2119  returnCode = fcntl(VfdCache[file].fd, F_RDADVISE, &ra);
2121  if (returnCode != -1)
2122  return 0;
2123  else
2124  return errno;
2125  }
2126 #else
2127  return 0;
2128 #endif
2129 }
2130 
2131 void
2132 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
2133 {
2134  int returnCode;
2135 
2136  Assert(FileIsValid(file));
2137 
2138  DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2139  file, VfdCache[file].fileName,
2140  (int64) offset, (int64) nbytes));
2141 
2142  if (nbytes <= 0)
2143  return;
2144 
2145  if (VfdCache[file].fileFlags & PG_O_DIRECT)
2146  return;
2147 
2148  returnCode = FileAccess(file);
2149  if (returnCode < 0)
2150  return;
2151 
2152  pgstat_report_wait_start(wait_event_info);
2153  pg_flush_data(VfdCache[file].fd, offset, nbytes);
2155 }
2156 
2157 ssize_t
2158 FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset,
2159  uint32 wait_event_info)
2160 {
2161  ssize_t returnCode;
2162  Vfd *vfdP;
2163 
2164  Assert(FileIsValid(file));
2165 
2166  DO_DB(elog(LOG, "FileReadV: %d (%s) " INT64_FORMAT " %d",
2167  file, VfdCache[file].fileName,
2168  (int64) offset,
2169  iovcnt));
2170 
2171  returnCode = FileAccess(file);
2172  if (returnCode < 0)
2173  return returnCode;
2174 
2175  vfdP = &VfdCache[file];
2176 
2177 retry:
2178  pgstat_report_wait_start(wait_event_info);
2179  returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
2181 
2182  if (returnCode < 0)
2183  {
2184  /*
2185  * Windows may run out of kernel buffers and return "Insufficient
2186  * system resources" error. Wait a bit and retry to solve it.
2187  *
2188  * It is rumored that EINTR is also possible on some Unix filesystems,
2189  * in which case immediate retry is indicated.
2190  */
2191 #ifdef WIN32
2192  DWORD error = GetLastError();
2193 
2194  switch (error)
2195  {
2196  case ERROR_NO_SYSTEM_RESOURCES:
2197  pg_usleep(1000L);
2198  errno = EINTR;
2199  break;
2200  default:
2201  _dosmaperr(error);
2202  break;
2203  }
2204 #endif
2205  /* OK to retry if interrupted */
2206  if (errno == EINTR)
2207  goto retry;
2208  }
2209 
2210  return returnCode;
2211 }
2212 
2213 ssize_t
2214 FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset,
2215  uint32 wait_event_info)
2216 {
2217  ssize_t returnCode;
2218  Vfd *vfdP;
2219 
2220  Assert(FileIsValid(file));
2221 
2222  DO_DB(elog(LOG, "FileWriteV: %d (%s) " INT64_FORMAT " %d",
2223  file, VfdCache[file].fileName,
2224  (int64) offset,
2225  iovcnt));
2226 
2227  returnCode = FileAccess(file);
2228  if (returnCode < 0)
2229  return returnCode;
2230 
2231  vfdP = &VfdCache[file];
2232 
2233  /*
2234  * If enforcing temp_file_limit and it's a temp file, check to see if the
2235  * write would overrun temp_file_limit, and throw error if so. Note: it's
2236  * really a modularity violation to throw error here; we should set errno
2237  * and return -1. However, there's no way to report a suitable error
2238  * message if we do that. All current callers would just throw error
2239  * immediately anyway, so this is safe at present.
2240  */
2241  if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2242  {
2243  off_t past_write = offset;
2244 
2245  for (int i = 0; i < iovcnt; ++i)
2246  past_write += iov[i].iov_len;
2247 
2248  if (past_write > vfdP->fileSize)
2249  {
2250  uint64 newTotal = temporary_files_size;
2251 
2252  newTotal += past_write - vfdP->fileSize;
2253  if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2254  ereport(ERROR,
2255  (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2256  errmsg("temporary file size exceeds \"temp_file_limit\" (%dkB)",
2257  temp_file_limit)));
2258  }
2259  }
2260 
2261 retry:
2262  pgstat_report_wait_start(wait_event_info);
2263  returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
2265 
2266  if (returnCode >= 0)
2267  {
2268  /*
2269  * Some callers expect short writes to set errno, and traditionally we
2270  * have assumed that they imply disk space shortage. We don't want to
2271  * waste CPU cycles adding up the total size here, so we'll just set
2272  * it for all successful writes in case such a caller determines that
2273  * the write was short and ereports "%m".
2274  */
2275  errno = ENOSPC;
2276 
2277  /*
2278  * Maintain fileSize and temporary_files_size if it's a temp file.
2279  */
2280  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2281  {
2282  off_t past_write = offset + returnCode;
2283 
2284  if (past_write > vfdP->fileSize)
2285  {
2286  temporary_files_size += past_write - vfdP->fileSize;
2287  vfdP->fileSize = past_write;
2288  }
2289  }
2290  }
2291  else
2292  {
2293  /*
2294  * See comments in FileReadV()
2295  */
2296 #ifdef WIN32
2297  DWORD error = GetLastError();
2298 
2299  switch (error)
2300  {
2301  case ERROR_NO_SYSTEM_RESOURCES:
2302  pg_usleep(1000L);
2303  errno = EINTR;
2304  break;
2305  default:
2306  _dosmaperr(error);
2307  break;
2308  }
2309 #endif
2310  /* OK to retry if interrupted */
2311  if (errno == EINTR)
2312  goto retry;
2313  }
2314 
2315  return returnCode;
2316 }
2317 
2318 int
2319 FileSync(File file, uint32 wait_event_info)
2320 {
2321  int returnCode;
2322 
2323  Assert(FileIsValid(file));
2324 
2325  DO_DB(elog(LOG, "FileSync: %d (%s)",
2326  file, VfdCache[file].fileName));
2327 
2328  returnCode = FileAccess(file);
2329  if (returnCode < 0)
2330  return returnCode;
2331 
2332  pgstat_report_wait_start(wait_event_info);
2333  returnCode = pg_fsync(VfdCache[file].fd);
2335 
2336  return returnCode;
2337 }
2338 
2339 /*
2340  * Zero a region of the file.
2341  *
2342  * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2343  * appropriate error.
2344  */
2345 int
2346 FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
2347 {
2348  int returnCode;
2349  ssize_t written;
2350 
2351  Assert(FileIsValid(file));
2352 
2353  DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2354  file, VfdCache[file].fileName,
2355  (int64) offset, (int64) amount));
2356 
2357  returnCode = FileAccess(file);
2358  if (returnCode < 0)
2359  return returnCode;
2360 
2361  pgstat_report_wait_start(wait_event_info);
2362  written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2364 
2365  if (written < 0)
2366  return -1;
2367  else if (written != amount)
2368  {
2369  /* if errno is unset, assume problem is no disk space */
2370  if (errno == 0)
2371  errno = ENOSPC;
2372  return -1;
2373  }
2374 
2375  return 0;
2376 }
2377 
2378 /*
2379  * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
2380  * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
2381  * use FileZero() instead.
2382  *
2383  * Note that at least glibc() implements posix_fallocate() in userspace if not
2384  * implemented by the filesystem. That's not the case for all environments
2385  * though.
2386  *
2387  * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2388  * appropriate error.
2389  */
2390 int
2391 FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
2392 {
2393 #ifdef HAVE_POSIX_FALLOCATE
2394  int returnCode;
2395 
2396  Assert(FileIsValid(file));
2397 
2398  DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2399  file, VfdCache[file].fileName,
2400  (int64) offset, (int64) amount));
2401 
2402  returnCode = FileAccess(file);
2403  if (returnCode < 0)
2404  return -1;
2405 
2406 retry:
2407  pgstat_report_wait_start(wait_event_info);
2408  returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2410 
2411  if (returnCode == 0)
2412  return 0;
2413  else if (returnCode == EINTR)
2414  goto retry;
2415 
2416  /* for compatibility with %m printing etc */
2417  errno = returnCode;
2418 
2419  /*
2420  * Return in cases of a "real" failure, if fallocate is not supported,
2421  * fall through to the FileZero() backed implementation.
2422  */
2423  if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
2424  return -1;
2425 #endif
2426 
2427  return FileZero(file, offset, amount, wait_event_info);
2428 }
2429 
2430 off_t
2432 {
2433  Assert(FileIsValid(file));
2434 
2435  DO_DB(elog(LOG, "FileSize %d (%s)",
2436  file, VfdCache[file].fileName));
2437 
2438  if (FileIsNotOpen(file))
2439  {
2440  if (FileAccess(file) < 0)
2441  return (off_t) -1;
2442  }
2443 
2444  return lseek(VfdCache[file].fd, 0, SEEK_END);
2445 }
2446 
2447 int
2448 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2449 {
2450  int returnCode;
2451 
2452  Assert(FileIsValid(file));
2453 
2454  DO_DB(elog(LOG, "FileTruncate %d (%s)",
2455  file, VfdCache[file].fileName));
2456 
2457  returnCode = FileAccess(file);
2458  if (returnCode < 0)
2459  return returnCode;
2460 
2461  pgstat_report_wait_start(wait_event_info);
2462  returnCode = pg_ftruncate(VfdCache[file].fd, offset);
2464 
2465  if (returnCode == 0 && VfdCache[file].fileSize > offset)
2466  {
2467  /* adjust our state for truncation of a temp file */
2468  Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2469  temporary_files_size -= VfdCache[file].fileSize - offset;
2470  VfdCache[file].fileSize = offset;
2471  }
2472 
2473  return returnCode;
2474 }
2475 
2476 /*
2477  * Return the pathname associated with an open file.
2478  *
2479  * The returned string points to an internal buffer, which is valid until
2480  * the file is closed.
2481  */
2482 char *
2484 {
2485  Assert(FileIsValid(file));
2486 
2487  return VfdCache[file].fileName;
2488 }
2489 
2490 /*
2491  * Return the raw file descriptor of an opened file.
2492  *
2493  * The returned file descriptor will be valid until the file is closed, but
2494  * there are a lot of things that can make that happen. So the caller should
2495  * be careful not to do much of anything else before it finishes using the
2496  * returned file descriptor.
2497  */
2498 int
2500 {
2501  Assert(FileIsValid(file));
2502  return VfdCache[file].fd;
2503 }
2504 
2505 /*
2506  * FileGetRawFlags - returns the file flags on open(2)
2507  */
2508 int
2510 {
2511  Assert(FileIsValid(file));
2512  return VfdCache[file].fileFlags;
2513 }
2514 
2515 /*
2516  * FileGetRawMode - returns the mode bitmask passed to open(2)
2517  */
2518 mode_t
2520 {
2521  Assert(FileIsValid(file));
2522  return VfdCache[file].fileMode;
2523 }
2524 
2525 /*
2526  * Make room for another allocatedDescs[] array entry if needed and possible.
2527  * Returns true if an array element is available.
2528  */
2529 static bool
2531 {
2532  AllocateDesc *newDescs;
2533  int newMax;
2534 
2535  /* Quick out if array already has a free slot. */
2537  return true;
2538 
2539  /*
2540  * If the array hasn't yet been created in the current process, initialize
2541  * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2542  * we will ever need, anyway. We don't want to look at max_safe_fds
2543  * immediately because set_max_safe_fds() may not have run yet.
2544  */
2545  if (allocatedDescs == NULL)
2546  {
2547  newMax = FD_MINFREE / 3;
2548  newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2549  /* Out of memory already? Treat as fatal error. */
2550  if (newDescs == NULL)
2551  ereport(ERROR,
2552  (errcode(ERRCODE_OUT_OF_MEMORY),
2553  errmsg("out of memory")));
2554  allocatedDescs = newDescs;
2555  maxAllocatedDescs = newMax;
2556  return true;
2557  }
2558 
2559  /*
2560  * Consider enlarging the array beyond the initial allocation used above.
2561  * By the time this happens, max_safe_fds should be known accurately.
2562  *
2563  * We mustn't let allocated descriptors hog all the available FDs, and in
2564  * practice we'd better leave a reasonable number of FDs for VFD use. So
2565  * set the maximum to max_safe_fds / 3. (This should certainly be at
2566  * least as large as the initial size, FD_MINFREE / 3, so we aren't
2567  * tightening the restriction here.) Recall that "external" FDs are
2568  * allowed to consume another third of max_safe_fds.
2569  */
2570  newMax = max_safe_fds / 3;
2571  if (newMax > maxAllocatedDescs)
2572  {
2573  newDescs = (AllocateDesc *) realloc(allocatedDescs,
2574  newMax * sizeof(AllocateDesc));
2575  /* Treat out-of-memory as a non-fatal error. */
2576  if (newDescs == NULL)
2577  return false;
2578  allocatedDescs = newDescs;
2579  maxAllocatedDescs = newMax;
2580  return true;
2581  }
2582 
2583  /* Can't enlarge allocatedDescs[] any more. */
2584  return false;
2585 }
2586 
2587 /*
2588  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2589  * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2590  * necessary to open the file. When done, call FreeFile rather than fclose.
2591  *
2592  * Note that files that will be open for any significant length of time
2593  * should NOT be handled this way, since they cannot share kernel file
2594  * descriptors with other files; there is grave risk of running out of FDs
2595  * if anyone locks down too many FDs. Most callers of this routine are
2596  * simply reading a config file that they will read and close immediately.
2597  *
2598  * fd.c will automatically close all files opened with AllocateFile at
2599  * transaction commit or abort; this prevents FD leakage if a routine
2600  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2601  *
2602  * Ideally this should be the *only* direct call of fopen() in the backend.
2603  */
2604 FILE *
2605 AllocateFile(const char *name, const char *mode)
2606 {
2607  FILE *file;
2608 
2609  DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2611 
2612  /* Can we allocate another non-virtual FD? */
2613  if (!reserveAllocatedDesc())
2614  ereport(ERROR,
2615  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2616  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2617  maxAllocatedDescs, name)));
2618 
2619  /* Close excess kernel FDs. */
2620  ReleaseLruFiles();
2621 
2622 TryAgain:
2623  if ((file = fopen(name, mode)) != NULL)
2624  {
2626 
2627  desc->kind = AllocateDescFile;
2628  desc->desc.file = file;
2631  return desc->desc.file;
2632  }
2633 
2634  if (errno == EMFILE || errno == ENFILE)
2635  {
2636  int save_errno = errno;
2637 
2638  ereport(LOG,
2639  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2640  errmsg("out of file descriptors: %m; release and retry")));
2641  errno = 0;
2642  if (ReleaseLruFile())
2643  goto TryAgain;
2644  errno = save_errno;
2645  }
2646 
2647  return NULL;
2648 }
2649 
2650 /*
2651  * Open a file with OpenTransientFilePerm() and pass default file mode for
2652  * the fileMode parameter.
2653  */
2654 int
2655 OpenTransientFile(const char *fileName, int fileFlags)
2656 {
2657  return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2658 }
2659 
2660 /*
2661  * Like AllocateFile, but returns an unbuffered fd like open(2)
2662  */
2663 int
2664 OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2665 {
2666  int fd;
2667 
2668  DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2669  numAllocatedDescs, fileName));
2670 
2671  /* Can we allocate another non-virtual FD? */
2672  if (!reserveAllocatedDesc())
2673  ereport(ERROR,
2674  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2675  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2676  maxAllocatedDescs, fileName)));
2677 
2678  /* Close excess kernel FDs. */
2679  ReleaseLruFiles();
2680 
2681  fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2682 
2683  if (fd >= 0)
2684  {
2686 
2687  desc->kind = AllocateDescRawFD;
2688  desc->desc.fd = fd;
2691 
2692  return fd;
2693  }
2694 
2695  return -1; /* failure */
2696 }
2697 
2698 /*
2699  * Routines that want to initiate a pipe stream should use OpenPipeStream
2700  * rather than plain popen(). This lets fd.c deal with freeing FDs if
2701  * necessary. When done, call ClosePipeStream rather than pclose.
2702  *
2703  * This function also ensures that the popen'd program is run with default
2704  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2705  * uses. This ensures desirable response to, eg, closing a read pipe early.
2706  */
2707 FILE *
2708 OpenPipeStream(const char *command, const char *mode)
2709 {
2710  FILE *file;
2711  int save_errno;
2712 
2713  DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2714  numAllocatedDescs, command));
2715 
2716  /* Can we allocate another non-virtual FD? */
2717  if (!reserveAllocatedDesc())
2718  ereport(ERROR,
2719  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2720  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2721  maxAllocatedDescs, command)));
2722 
2723  /* Close excess kernel FDs. */
2724  ReleaseLruFiles();
2725 
2726 TryAgain:
2727  fflush(NULL);
2729  errno = 0;
2730  file = popen(command, mode);
2731  save_errno = errno;
2733  errno = save_errno;
2734  if (file != NULL)
2735  {
2737 
2738  desc->kind = AllocateDescPipe;
2739  desc->desc.file = file;
2742  return desc->desc.file;
2743  }
2744 
2745  if (errno == EMFILE || errno == ENFILE)
2746  {
2747  ereport(LOG,
2748  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2749  errmsg("out of file descriptors: %m; release and retry")));
2750  if (ReleaseLruFile())
2751  goto TryAgain;
2752  errno = save_errno;
2753  }
2754 
2755  return NULL;
2756 }
2757 
2758 /*
2759  * Free an AllocateDesc of any type.
2760  *
2761  * The argument *must* point into the allocatedDescs[] array.
2762  */
2763 static int
2765 {
2766  int result;
2767 
2768  /* Close the underlying object */
2769  switch (desc->kind)
2770  {
2771  case AllocateDescFile:
2772  result = fclose(desc->desc.file);
2773  break;
2774  case AllocateDescPipe:
2775  result = pclose(desc->desc.file);
2776  break;
2777  case AllocateDescDir:
2778  result = closedir(desc->desc.dir);
2779  break;
2780  case AllocateDescRawFD:
2781  result = close(desc->desc.fd);
2782  break;
2783  default:
2784  elog(ERROR, "AllocateDesc kind not recognized");
2785  result = 0; /* keep compiler quiet */
2786  break;
2787  }
2788 
2789  /* Compact storage in the allocatedDescs array */
2792 
2793  return result;
2794 }
2795 
2796 /*
2797  * Close a file returned by AllocateFile.
2798  *
2799  * Note we do not check fclose's return value --- it is up to the caller
2800  * to handle close errors.
2801  */
2802 int
2803 FreeFile(FILE *file)
2804 {
2805  int i;
2806 
2807  DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2808 
2809  /* Remove file from list of allocated files, if it's present */
2810  for (i = numAllocatedDescs; --i >= 0;)
2811  {
2812  AllocateDesc *desc = &allocatedDescs[i];
2813 
2814  if (desc->kind == AllocateDescFile && desc->desc.file == file)
2815  return FreeDesc(desc);
2816  }
2817 
2818  /* Only get here if someone passes us a file not in allocatedDescs */
2819  elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2820 
2821  return fclose(file);
2822 }
2823 
2824 /*
2825  * Close a file returned by OpenTransientFile.
2826  *
2827  * Note we do not check close's return value --- it is up to the caller
2828  * to handle close errors.
2829  */
2830 int
2832 {
2833  int i;
2834 
2835  DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2836 
2837  /* Remove fd from list of allocated files, if it's present */
2838  for (i = numAllocatedDescs; --i >= 0;)
2839  {
2840  AllocateDesc *desc = &allocatedDescs[i];
2841 
2842  if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2843  return FreeDesc(desc);
2844  }
2845 
2846  /* Only get here if someone passes us a file not in allocatedDescs */
2847  elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2848 
2849  return close(fd);
2850 }
2851 
2852 /*
2853  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2854  * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2855  * necessary to open the directory, and with closing it after an elog.
2856  * When done, call FreeDir rather than closedir.
2857  *
2858  * Returns NULL, with errno set, on failure. Note that failure detection
2859  * is commonly left to the following call of ReadDir or ReadDirExtended;
2860  * see the comments for ReadDir.
2861  *
2862  * Ideally this should be the *only* direct call of opendir() in the backend.
2863  */
2864 DIR *
2865 AllocateDir(const char *dirname)
2866 {
2867  DIR *dir;
2868 
2869  DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2870  numAllocatedDescs, dirname));
2871 
2872  /* Can we allocate another non-virtual FD? */
2873  if (!reserveAllocatedDesc())
2874  ereport(ERROR,
2875  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2876  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2877  maxAllocatedDescs, dirname)));
2878 
2879  /* Close excess kernel FDs. */
2880  ReleaseLruFiles();
2881 
2882 TryAgain:
2883  if ((dir = opendir(dirname)) != NULL)
2884  {
2886 
2887  desc->kind = AllocateDescDir;
2888  desc->desc.dir = dir;
2891  return desc->desc.dir;
2892  }
2893 
2894  if (errno == EMFILE || errno == ENFILE)
2895  {
2896  int save_errno = errno;
2897 
2898  ereport(LOG,
2899  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2900  errmsg("out of file descriptors: %m; release and retry")));
2901  errno = 0;
2902  if (ReleaseLruFile())
2903  goto TryAgain;
2904  errno = save_errno;
2905  }
2906 
2907  return NULL;
2908 }
2909 
2910 /*
2911  * Read a directory opened with AllocateDir, ereport'ing any error.
2912  *
2913  * This is easier to use than raw readdir() since it takes care of some
2914  * otherwise rather tedious and error-prone manipulation of errno. Also,
2915  * if you are happy with a generic error message for AllocateDir failure,
2916  * you can just do
2917  *
2918  * dir = AllocateDir(path);
2919  * while ((dirent = ReadDir(dir, path)) != NULL)
2920  * process dirent;
2921  * FreeDir(dir);
2922  *
2923  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2924  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2925  * use this shortcut.)
2926  *
2927  * The pathname passed to AllocateDir must be passed to this routine too,
2928  * but it is only used for error reporting.
2929  */
2930 struct dirent *
2931 ReadDir(DIR *dir, const char *dirname)
2932 {
2933  return ReadDirExtended(dir, dirname, ERROR);
2934 }
2935 
2936 /*
2937  * Alternate version of ReadDir that allows caller to specify the elevel
2938  * for any error report (whether it's reporting an initial failure of
2939  * AllocateDir or a subsequent directory read failure).
2940  *
2941  * If elevel < ERROR, returns NULL after any error. With the normal coding
2942  * pattern, this will result in falling out of the loop immediately as
2943  * though the directory contained no (more) entries.
2944  */
2945 struct dirent *
2946 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2947 {
2948  struct dirent *dent;
2949 
2950  /* Give a generic message for AllocateDir failure, if caller didn't */
2951  if (dir == NULL)
2952  {
2953  ereport(elevel,
2955  errmsg("could not open directory \"%s\": %m",
2956  dirname)));
2957  return NULL;
2958  }
2959 
2960  errno = 0;
2961  if ((dent = readdir(dir)) != NULL)
2962  return dent;
2963 
2964  if (errno)
2965  ereport(elevel,
2967  errmsg("could not read directory \"%s\": %m",
2968  dirname)));
2969  return NULL;
2970 }
2971 
2972 /*
2973  * Close a directory opened with AllocateDir.
2974  *
2975  * Returns closedir's return value (with errno set if it's not 0).
2976  * Note we do not check the return value --- it is up to the caller
2977  * to handle close errors if wanted.
2978  *
2979  * Does nothing if dir == NULL; we assume that directory open failure was
2980  * already reported if desired.
2981  */
2982 int
2984 {
2985  int i;
2986 
2987  /* Nothing to do if AllocateDir failed */
2988  if (dir == NULL)
2989  return 0;
2990 
2991  DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2992 
2993  /* Remove dir from list of allocated dirs, if it's present */
2994  for (i = numAllocatedDescs; --i >= 0;)
2995  {
2996  AllocateDesc *desc = &allocatedDescs[i];
2997 
2998  if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2999  return FreeDesc(desc);
3000  }
3001 
3002  /* Only get here if someone passes us a dir not in allocatedDescs */
3003  elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
3004 
3005  return closedir(dir);
3006 }
3007 
3008 
3009 /*
3010  * Close a pipe stream returned by OpenPipeStream.
3011  */
3012 int
3013 ClosePipeStream(FILE *file)
3014 {
3015  int i;
3016 
3017  DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
3018 
3019  /* Remove file from list of allocated files, if it's present */
3020  for (i = numAllocatedDescs; --i >= 0;)
3021  {
3022  AllocateDesc *desc = &allocatedDescs[i];
3023 
3024  if (desc->kind == AllocateDescPipe && desc->desc.file == file)
3025  return FreeDesc(desc);
3026  }
3027 
3028  /* Only get here if someone passes us a file not in allocatedDescs */
3029  elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
3030 
3031  return pclose(file);
3032 }
3033 
3034 /*
3035  * closeAllVfds
3036  *
3037  * Force all VFDs into the physically-closed state, so that the fewest
3038  * possible number of kernel file descriptors are in use. There is no
3039  * change in the logical state of the VFDs.
3040  */
3041 void
3043 {
3044  Index i;
3045 
3046  if (SizeVfdCache > 0)
3047  {
3048  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3049  for (i = 1; i < SizeVfdCache; i++)
3050  {
3051  if (!FileIsNotOpen(i))
3052  LruDelete(i);
3053  }
3054  }
3055 }
3056 
3057 
3058 /*
3059  * SetTempTablespaces
3060  *
3061  * Define a list (actually an array) of OIDs of tablespaces to use for
3062  * temporary files. This list will be used until end of transaction,
3063  * unless this function is called again before then. It is caller's
3064  * responsibility that the passed-in array has adequate lifespan (typically
3065  * it'd be allocated in TopTransactionContext).
3066  *
3067  * Some entries of the array may be InvalidOid, indicating that the current
3068  * database's default tablespace should be used.
3069  */
3070 void
3071 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
3072 {
3073  Assert(numSpaces >= 0);
3074  tempTableSpaces = tableSpaces;
3075  numTempTableSpaces = numSpaces;
3076 
3077  /*
3078  * Select a random starting point in the list. This is to minimize
3079  * conflicts between backends that are most likely sharing the same list
3080  * of temp tablespaces. Note that if we create multiple temp files in the
3081  * same transaction, we'll advance circularly through the list --- this
3082  * ensures that large temporary sort files are nicely spread across all
3083  * available tablespaces.
3084  */
3085  if (numSpaces > 1)
3087  0, numSpaces - 1);
3088  else
3089  nextTempTableSpace = 0;
3090 }
3091 
3092 /*
3093  * TempTablespacesAreSet
3094  *
3095  * Returns true if SetTempTablespaces has been called in current transaction.
3096  * (This is just so that tablespaces.c doesn't need its own per-transaction
3097  * state.)
3098  */
3099 bool
3101 {
3102  return (numTempTableSpaces >= 0);
3103 }
3104 
3105 /*
3106  * GetTempTablespaces
3107  *
3108  * Populate an array with the OIDs of the tablespaces that should be used for
3109  * temporary files. (Some entries may be InvalidOid, indicating that the
3110  * current database's default tablespace should be used.) At most numSpaces
3111  * entries will be filled.
3112  * Returns the number of OIDs that were copied into the output array.
3113  */
3114 int
3115 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
3116 {
3117  int i;
3118 
3120  for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
3121  tableSpaces[i] = tempTableSpaces[i];
3122 
3123  return i;
3124 }
3125 
3126 /*
3127  * GetNextTempTableSpace
3128  *
3129  * Select the next temp tablespace to use. A result of InvalidOid means
3130  * to use the current database's default tablespace.
3131  */
3132 Oid
3134 {
3135  if (numTempTableSpaces > 0)
3136  {
3137  /* Advance nextTempTableSpace counter with wraparound */
3139  nextTempTableSpace = 0;
3141  }
3142  return InvalidOid;
3143 }
3144 
3145 
3146 /*
3147  * AtEOSubXact_Files
3148  *
3149  * Take care of subtransaction commit/abort. At abort, we close temp files
3150  * that the subtransaction may have opened. At commit, we reassign the
3151  * files that were opened to the parent subtransaction.
3152  */
3153 void
3154 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
3155  SubTransactionId parentSubid)
3156 {
3157  Index i;
3158 
3159  for (i = 0; i < numAllocatedDescs; i++)
3160  {
3161  if (allocatedDescs[i].create_subid == mySubid)
3162  {
3163  if (isCommit)
3164  allocatedDescs[i].create_subid = parentSubid;
3165  else
3166  {
3167  /* have to recheck the item after FreeDesc (ugly) */
3168  FreeDesc(&allocatedDescs[i--]);
3169  }
3170  }
3171  }
3172 }
3173 
3174 /*
3175  * AtEOXact_Files
3176  *
3177  * This routine is called during transaction commit or abort. All still-open
3178  * per-transaction temporary file VFDs are closed, which also causes the
3179  * underlying files to be deleted (although they should've been closed already
3180  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
3181  * closed. We also forget any transaction-local temp tablespace list.
3182  *
3183  * The isCommit flag is used only to decide whether to emit warnings about
3184  * unclosed files.
3185  */
3186 void
3187 AtEOXact_Files(bool isCommit)
3188 {
3189  CleanupTempFiles(isCommit, false);
3190  tempTableSpaces = NULL;
3191  numTempTableSpaces = -1;
3192 }
3193 
3194 /*
3195  * BeforeShmemExit_Files
3196  *
3197  * before_shmem_exit hook to clean up temp files during backend shutdown.
3198  * Here, we want to clean up *all* temp files including interXact ones.
3199  */
3200 static void
3202 {
3203  CleanupTempFiles(false, true);
3204 
3205  /* prevent further temp files from being created */
3206 #ifdef USE_ASSERT_CHECKING
3207  temporary_files_allowed = false;
3208 #endif
3209 }
3210 
3211 /*
3212  * Close temporary files and delete their underlying files.
3213  *
3214  * isCommit: if true, this is normal transaction commit, and we don't
3215  * expect any remaining files; warn if there are some.
3216  *
3217  * isProcExit: if true, this is being called as the backend process is
3218  * exiting. If that's the case, we should remove all temporary files; if
3219  * that's not the case, we are being called for transaction commit/abort
3220  * and should only remove transaction-local temp files. In either case,
3221  * also clean up "allocated" stdio files, dirs and fds.
3222  */
3223 static void
3224 CleanupTempFiles(bool isCommit, bool isProcExit)
3225 {
3226  Index i;
3227 
3228  /*
3229  * Careful here: at proc_exit we need extra cleanup, not just
3230  * xact_temporary files.
3231  */
3232  if (isProcExit || have_xact_temporary_files)
3233  {
3234  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3235  for (i = 1; i < SizeVfdCache; i++)
3236  {
3237  unsigned short fdstate = VfdCache[i].fdstate;
3238 
3239  if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3240  VfdCache[i].fileName != NULL)
3241  {
3242  /*
3243  * If we're in the process of exiting a backend process, close
3244  * all temporary files. Otherwise, only close temporary files
3245  * local to the current transaction. They should be closed by
3246  * the ResourceOwner mechanism already, so this is just a
3247  * debugging cross-check.
3248  */
3249  if (isProcExit)
3250  FileClose(i);
3251  else if (fdstate & FD_CLOSE_AT_EOXACT)
3252  {
3253  elog(WARNING,
3254  "temporary file %s not closed at end-of-transaction",
3255  VfdCache[i].fileName);
3256  FileClose(i);
3257  }
3258  }
3259  }
3260 
3261  have_xact_temporary_files = false;
3262  }
3263 
3264  /* Complain if any allocated files remain open at commit. */
3265  if (isCommit && numAllocatedDescs > 0)
3266  elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3268 
3269  /* Clean up "allocated" stdio files, dirs and fds. */
3270  while (numAllocatedDescs > 0)
3271  FreeDesc(&allocatedDescs[0]);
3272 }
3273 
3274 
3275 /*
3276  * Remove temporary and temporary relation files left over from a prior
3277  * postmaster session
3278  *
3279  * This should be called during postmaster startup. It will forcibly
3280  * remove any leftover files created by OpenTemporaryFile and any leftover
3281  * temporary relation files created by mdcreate.
3282  *
3283  * During post-backend-crash restart cycle, this routine is called when
3284  * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3285  * queries are using temp files could result in useless storage usage that can
3286  * only be reclaimed by a service restart. The argument against enabling it is
3287  * that someone might want to examine the temporary files for debugging
3288  * purposes. This does however mean that OpenTemporaryFile had better allow for
3289  * collision with an existing temp file name.
3290  *
3291  * NOTE: this function and its subroutines generally report syscall failures
3292  * with ereport(LOG) and keep going. Removing temp files is not so critical
3293  * that we should fail to start the database when we can't do it.
3294  */
3295 void
3297 {
3298  char temp_path[MAXPGPATH + sizeof(PG_TBLSPC_DIR) + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3299  DIR *spc_dir;
3300  struct dirent *spc_de;
3301 
3302  /*
3303  * First process temp files in pg_default ($PGDATA/base)
3304  */
3305  snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3306  RemovePgTempFilesInDir(temp_path, true, false);
3307  RemovePgTempRelationFiles("base");
3308 
3309  /*
3310  * Cycle through temp directories for all non-default tablespaces.
3311  */
3312  spc_dir = AllocateDir(PG_TBLSPC_DIR);
3313 
3314  while ((spc_de = ReadDirExtended(spc_dir, PG_TBLSPC_DIR, LOG)) != NULL)
3315  {
3316  if (strcmp(spc_de->d_name, ".") == 0 ||
3317  strcmp(spc_de->d_name, "..") == 0)
3318  continue;
3319 
3320  snprintf(temp_path, sizeof(temp_path), "%s/%s/%s/%s",
3323  RemovePgTempFilesInDir(temp_path, true, false);
3324 
3325  snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
3327  RemovePgTempRelationFiles(temp_path);
3328  }
3329 
3330  FreeDir(spc_dir);
3331 
3332  /*
3333  * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3334  * DataDir as well. However, that is *not* cleaned here because doing so
3335  * would create a race condition. It's done separately, earlier in
3336  * postmaster startup.
3337  */
3338 }
3339 
3340 /*
3341  * Process one pgsql_tmp directory for RemovePgTempFiles.
3342  *
3343  * If missing_ok is true, it's all right for the named directory to not exist.
3344  * Any other problem results in a LOG message. (missing_ok should be true at
3345  * the top level, since pgsql_tmp directories are not created until needed.)
3346  *
3347  * At the top level, this should be called with unlink_all = false, so that
3348  * only files matching the temporary name prefix will be unlinked. When
3349  * recursing it will be called with unlink_all = true to unlink everything
3350  * under a top-level temporary directory.
3351  *
3352  * (These two flags could be replaced by one, but it seems clearer to keep
3353  * them separate.)
3354  */
3355 void
3356 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3357 {
3358  DIR *temp_dir;
3359  struct dirent *temp_de;
3360  char rm_path[MAXPGPATH * 2];
3361 
3362  temp_dir = AllocateDir(tmpdirname);
3363 
3364  if (temp_dir == NULL && errno == ENOENT && missing_ok)
3365  return;
3366 
3367  while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3368  {
3369  if (strcmp(temp_de->d_name, ".") == 0 ||
3370  strcmp(temp_de->d_name, "..") == 0)
3371  continue;
3372 
3373  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3374  tmpdirname, temp_de->d_name);
3375 
3376  if (unlink_all ||
3377  strncmp(temp_de->d_name,
3379  strlen(PG_TEMP_FILE_PREFIX)) == 0)
3380  {
3381  PGFileType type = get_dirent_type(rm_path, temp_de, false, LOG);
3382 
3383  if (type == PGFILETYPE_ERROR)
3384  continue;
3385  else if (type == PGFILETYPE_DIR)
3386  {
3387  /* recursively remove contents, then directory itself */
3388  RemovePgTempFilesInDir(rm_path, false, true);
3389 
3390  if (rmdir(rm_path) < 0)
3391  ereport(LOG,
3393  errmsg("could not remove directory \"%s\": %m",
3394  rm_path)));
3395  }
3396  else
3397  {
3398  if (unlink(rm_path) < 0)
3399  ereport(LOG,
3401  errmsg("could not remove file \"%s\": %m",
3402  rm_path)));
3403  }
3404  }
3405  else
3406  ereport(LOG,
3407  (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3408  rm_path)));
3409  }
3410 
3411  FreeDir(temp_dir);
3412 }
3413 
3414 /* Process one tablespace directory, look for per-DB subdirectories */
3415 static void
3416 RemovePgTempRelationFiles(const char *tsdirname)
3417 {
3418  DIR *ts_dir;
3419  struct dirent *de;
3420  char dbspace_path[MAXPGPATH * 2];
3421 
3422  ts_dir = AllocateDir(tsdirname);
3423 
3424  while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3425  {
3426  /*
3427  * We're only interested in the per-database directories, which have
3428  * numeric names. Note that this code will also (properly) ignore "."
3429  * and "..".
3430  */
3431  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3432  continue;
3433 
3434  snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3435  tsdirname, de->d_name);
3436  RemovePgTempRelationFilesInDbspace(dbspace_path);
3437  }
3438 
3439  FreeDir(ts_dir);
3440 }
3441 
3442 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3443 static void
3444 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3445 {
3446  DIR *dbspace_dir;
3447  struct dirent *de;
3448  char rm_path[MAXPGPATH * 2];
3449 
3450  dbspace_dir = AllocateDir(dbspacedirname);
3451 
3452  while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3453  {
3454  if (!looks_like_temp_rel_name(de->d_name))
3455  continue;
3456 
3457  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3458  dbspacedirname, de->d_name);
3459 
3460  if (unlink(rm_path) < 0)
3461  ereport(LOG,
3463  errmsg("could not remove file \"%s\": %m",
3464  rm_path)));
3465  }
3466 
3467  FreeDir(dbspace_dir);
3468 }
3469 
/*
 * Does "name" have the form of a temporary relation file name?
 *
 * Accepted shapes: t<digits>_<digits>, or t<digits>_<digits>_<forkname>,
 * each optionally followed by .<digits> (a segment suffix).
 *
 * Returns true only if the whole string matches; any trailing characters
 * cause a false result.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	int			pos;
	int			savepos;

	/* Must start with "t". */
	if (name[0] != 't')
		return false;

	/* Followed by a non-empty string of digits and then an underscore. */
	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (pos == 1 || name[pos] != '_')
		return false;

	/* Followed by another nonempty string of digits. */
	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (savepos == pos)
		return false;

	/* We might have _forkname or .segment or both. */
	if (name[pos] == '_')
	{
		int			forkchar = forkname_chars(&name[pos + 1], NULL);

		if (forkchar <= 0)
			return false;
		pos += forkchar + 1;	/* skip the underscore plus the fork name */
	}
	if (name[pos] == '.')
	{
		int			segchar;

		/* segchar counts the dot itself, so 1 means "no digits followed" */
		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
			;
		if (segchar <= 1)
			return false;
		pos += segchar;
	}

	/* Now we should be at the end. */
	if (name[pos] != '\0')
		return false;
	return true;
}
3518 
#ifdef HAVE_SYNCFS
/*
 * Call syncfs() on the file system containing "path".
 *
 * Failures to open the path or to sync are reported at LOG level and are
 * not treated as fatal.  The descriptor opened here is always closed
 * before returning.
 */
static void
do_syncfs(const char *path)
{
	int			fd;

	ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
							 path);

	fd = OpenTransientFile(path, O_RDONLY);
	if (fd < 0)
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
		return;
	}
	if (syncfs(fd) < 0)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not synchronize file system for file \"%s\": %m", path)));
	/* Release the descriptor whether or not syncfs() succeeded */
	CloseTransientFile(fd);
}
#endif
3543 
3544 /*
3545  * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3546  * all potential filesystem, depending on recovery_init_sync_method setting.
3547  *
3548  * We fsync regular files and directories wherever they are, but we
3549  * follow symlinks only for pg_wal and immediately under pg_tblspc.
3550  * Other symlinks are presumed to point at files we're not responsible
3551  * for fsyncing, and might not have privileges to write at all.
3552  *
3553  * Errors are logged but not considered fatal; that's because this is used
3554  * only during database startup, to deal with the possibility that there are
3555  * issued-but-unsynced writes pending against the data directory. We want to
3556  * ensure that such writes reach disk before anything that's done in the new
3557  * run. However, aborting on error would result in failure to start for
3558  * harmless cases such as read-only files in the data directory, and that's
3559  * not good either.
3560  *
3561  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3562  * rewriting all changes again during recovery.
3563  *
3564  * Note we assume we're chdir'd into PGDATA to begin with.
3565  */
3566 void
3568 {
3569  bool xlog_is_symlink;
3570 
3571  /* We can skip this whole thing if fsync is disabled. */
3572  if (!enableFsync)
3573  return;
3574 
3575  /*
3576  * If pg_wal is a symlink, we'll need to recurse into it separately,
3577  * because the first walkdir below will ignore it.
3578  */
3579  xlog_is_symlink = false;
3580 
3581  {
3582  struct stat st;
3583 
3584  if (lstat("pg_wal", &st) < 0)
3585  ereport(LOG,
3587  errmsg("could not stat file \"%s\": %m",
3588  "pg_wal")));
3589  else if (S_ISLNK(st.st_mode))
3590  xlog_is_symlink = true;
3591  }
3592 
3593 #ifdef HAVE_SYNCFS
3595  {
3596  DIR *dir;
3597  struct dirent *de;
3598 
3599  /*
3600  * On Linux, we don't have to open every single file one by one. We
3601  * can use syncfs() to sync whole filesystems. We only expect
3602  * filesystem boundaries to exist where we tolerate symlinks, namely
3603  * pg_wal and the tablespaces, so we call syncfs() for each of those
3604  * directories.
3605  */
3606 
3607  /* Prepare to report progress syncing the data directory via syncfs. */
3609 
3610  /* Sync the top level pgdata directory. */
3611  do_syncfs(".");
3612  /* If any tablespaces are configured, sync each of those. */
3613  dir = AllocateDir(PG_TBLSPC_DIR);
3614  while ((de = ReadDirExtended(dir, PG_TBLSPC_DIR, LOG)))
3615  {
3616  char path[MAXPGPATH];
3617 
3618  if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3619  continue;
3620 
3621  snprintf(path, MAXPGPATH, "%s/%s", PG_TBLSPC_DIR, de->d_name);
3622  do_syncfs(path);
3623  }
3624  FreeDir(dir);
3625  /* If pg_wal is a symlink, process that too. */
3626  if (xlog_is_symlink)
3627  do_syncfs("pg_wal");
3628  return;
3629  }
3630 #endif /* !HAVE_SYNCFS */
3631 
3632 #ifdef PG_FLUSH_DATA_WORKS
3633  /* Prepare to report progress of the pre-fsync phase. */
3635 
3636  /*
3637  * If possible, hint to the kernel that we're soon going to fsync the data
3638  * directory and its contents. Errors in this step are even less
3639  * interesting than normal, so log them only at DEBUG1.
3640  */
3641  walkdir(".", pre_sync_fname, false, DEBUG1);
3642  if (xlog_is_symlink)
3643  walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3644  walkdir(PG_TBLSPC_DIR, pre_sync_fname, true, DEBUG1);
3645 #endif
3646 
3647  /* Prepare to report progress syncing the data directory via fsync. */
3649 
3650  /*
3651  * Now we do the fsync()s in the same order.
3652  *
3653  * The main call ignores symlinks, so in addition to specially processing
3654  * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3655  * process_symlinks = true. Note that if there are any plain directories
3656  * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3657  * so we don't worry about optimizing it.
3658  */
3659  walkdir(".", datadir_fsync_fname, false, LOG);
3660  if (xlog_is_symlink)
3661  walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3663 }
3664 
3665 /*
3666  * walkdir: recursively walk a directory, applying the action to each
3667  * regular file and directory (including the named directory itself).
3668  *
3669  * If process_symlinks is true, the action and recursion are also applied
3670  * to regular files and directories that are pointed to by symlinks in the
3671  * given directory; otherwise symlinks are ignored. Symlinks are always
3672  * ignored in subdirectories, ie we intentionally don't pass down the
3673  * process_symlinks flag to recursive calls.
3674  *
3675  * Errors are reported at level elevel, which might be ERROR or less.
3676  *
3677  * See also walkdir in file_utils.c, which is a frontend version of this
3678  * logic.
3679  */
3680 static void
3681 walkdir(const char *path,
3682  void (*action) (const char *fname, bool isdir, int elevel),
3683  bool process_symlinks,
3684  int elevel)
3685 {
3686  DIR *dir;
3687  struct dirent *de;
3688 
3689  dir = AllocateDir(path);
3690 
3691  while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3692  {
3693  char subpath[MAXPGPATH * 2];
3694 
3696 
3697  if (strcmp(de->d_name, ".") == 0 ||
3698  strcmp(de->d_name, "..") == 0)
3699  continue;
3700 
3701  snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3702 
3703  switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3704  {
3705  case PGFILETYPE_REG:
3706  (*action) (subpath, false, elevel);
3707  break;
3708  case PGFILETYPE_DIR:
3709  walkdir(subpath, action, false, elevel);
3710  break;
3711  default:
3712 
3713  /*
3714  * Errors are already reported directly by get_dirent_type(),
3715  * and any remaining symlinks and unknown file types are
3716  * ignored.
3717  */
3718  break;
3719  }
3720  }
3721 
3722  FreeDir(dir); /* we ignore any error here */
3723 
3724  /*
3725  * It's important to fsync the destination directory itself as individual
3726  * file fsyncs don't guarantee that the directory entry for the file is
3727  * synced. However, skip this if AllocateDir failed; the action function
3728  * might not be robust against that.
3729  */
3730  if (dir)
3731  (*action) (path, true, elevel);
3732 }
3733 
3734 
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Ignores errors trying to open unreadable files, and logs other errors at a
 * caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* Unreadable files are skipped without any report */
		if (errno == EACCES)
			return;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3780 
/*
 * walkdir action used when fsyncing the data directory: report startup
 * progress for "fname", then fsync it via fsync_fname_ext().
 *
 * The result of fsync_fname_ext() is not examined here.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/* Errors about unreadable files are to be ignored silently. */
	const bool	ignore_unreadable = true;

	ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	(void) fsync_fname_ext(fname, isdir, ignore_unreadable, elevel);
}
3793 
/*
 * Remove "fname", tolerating the case where it no longer exists.
 *
 * Directories are removed with rmdir(); a failure other than ENOENT is
 * reported at elevel.  Anything else is handed to
 * PathNameDeleteTemporaryFile() with error_on_failure = false.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}
3810 
3811 /*
3812  * fsync_fname_ext -- Try to fsync a file or directory
3813  *
3814  * If ignore_perm is true, ignore errors upon trying to open unreadable
3815  * files. Logs other errors at a caller-specified level.
3816  *
3817  * Returns 0 if the operation succeeded, -1 otherwise.
3818  */
3819 int
3820 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3821 {
3822  int fd;
3823  int flags;
3824  int returncode;
3825 
3826  /*
3827  * Some OSs require directories to be opened read-only whereas other
3828  * systems don't allow us to fsync files opened read-only; so we need both
3829  * cases here. Using O_RDWR will cause us to fail to fsync files that are
3830  * not writable by our userid, but we assume that's OK.
3831  */
3832  flags = PG_BINARY;
3833  if (!isdir)
3834  flags |= O_RDWR;
3835  else
3836  flags |= O_RDONLY;
3837 
3838  fd = OpenTransientFile(fname, flags);
3839 
3840  /*
3841  * Some OSs don't allow us to open directories at all (Windows returns
3842  * EACCES), just ignore the error in that case. If desired also silently
3843  * ignoring errors about unreadable files. Log others.
3844  */
3845  if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3846  return 0;
3847  else if (fd < 0 && ignore_perm && errno == EACCES)
3848  return 0;
3849  else if (fd < 0)
3850  {
3851  ereport(elevel,
3853  errmsg("could not open file \"%s\": %m", fname)));
3854  return -1;
3855  }
3856 
3857  returncode = pg_fsync(fd);
3858 
3859  /*
3860  * Some OSes don't allow us to fsync directories at all, so we can ignore
3861  * those errors. Anything else needs to be logged.
3862  */
3863  if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3864  {
3865  int save_errno;
3866 
3867  /* close file upon error, might not be in transaction context */
3868  save_errno = errno;
3869  (void) CloseTransientFile(fd);
3870  errno = save_errno;
3871 
3872  ereport(elevel,
3874  errmsg("could not fsync file \"%s\": %m", fname)));
3875  return -1;
3876  }
3877 
3878  if (CloseTransientFile(fd) != 0)
3879  {
3880  ereport(elevel,
3882  errmsg("could not close file \"%s\": %m", fname)));
3883  return -1;
3884  }
3885 
3886  return 0;
3887 }
3888 
3889 /*
3890  * fsync_parent_path -- fsync the parent path of a file or directory
3891  *
3892  * This is aimed at making file operations persistent on disk in case of
3893  * an OS crash or power failure.
3894  */
3895 static int
3896 fsync_parent_path(const char *fname, int elevel)
3897 {
3898  char parentpath[MAXPGPATH];
3899 
3900  strlcpy(parentpath, fname, MAXPGPATH);
3901  get_parent_directory(parentpath);
3902 
3903  /*
3904  * get_parent_directory() returns an empty string if the input argument is
3905  * just a file name (see comments in path.c), so handle that as being the
3906  * current directory.
3907  */
3908  if (strlen(parentpath) == 0)
3909  strlcpy(parentpath, ".", MAXPGPATH);
3910 
3911  if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3912  return -1;
3913 
3914  return 0;
3915 }
3916 
3917 /*
3918  * Create a PostgreSQL data sub-directory
3919  *
3920  * The data directory itself, and most of its sub-directories, are created at
3921  * initdb time, but we do have some occasions when we create directories in
3922  * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3923  * make sure that those directories are created consistently. Today, that means
3924  * making sure that the created directory has the correct permissions, which is
3925  * what pg_dir_create_mode tracks for us.
3926  *
3927  * Note that we also set the umask() based on what we understand the correct
3928  * permissions to be (see file_perm.c).
3929  *
3930  * For permissions other than the default, mkdir() can be used directly, but
3931  * be sure to consider carefully such cases -- a sub-directory with incorrect
3932  * permissions in a PostgreSQL data directory could cause backups and other
3933  * processes to fail.
3934  */
3935 int
3936 MakePGDirectory(const char *directoryName)
3937 {
3938  return mkdir(directoryName, pg_dir_create_mode);
3939 }
3940 
3941 /*
3942  * Return the passed-in error level, or PANIC if data_sync_retry is off.
3943  *
3944  * Failure to fsync any data file is cause for immediate panic, unless
3945  * data_sync_retry is enabled. Data may have been written to the operating
3946  * system and removed from our buffer pool already, and if we are running on
3947  * an operating system that forgets dirty data on write-back failure, there
3948  * may be only one copy of the data remaining: in the WAL. A later attempt to
3949  * fsync again might falsely report success. Therefore we must not allow any
3950  * further checkpoints to be attempted. data_sync_retry can in theory be
3951  * enabled on systems known not to drop dirty buffered data on write-back
3952  * failure (with the likely outcome that checkpoints will continue to fail
3953  * until the underlying problem is fixed).
3954  *
3955  * Any code that reports a failure from fsync() or related functions should
3956  * filter the error level with this function.
3957  */
3958 int
3959 data_sync_elevel(int elevel)
3960 {
3961  return data_sync_retry ? elevel : PANIC;
3962 }
3963 
3964 bool
3966 {
3967  bool result = true;
3968  int flags;
3969 
3970 #if PG_O_DIRECT == 0
3971  if (strcmp(*newval, "") != 0)
3972  {
3973  GUC_check_errdetail("\"%s\" is not supported on this platform.",
3974  "debug_io_direct");
3975  result = false;
3976  }
3977  flags = 0;
3978 #else
3979  List *elemlist;
3980  ListCell *l;
3981  char *rawstring;
3982 
3983  /* Need a modifiable copy of string */
3984  rawstring = pstrdup(*newval);
3985 
3986  if (!SplitGUCList(rawstring, ',', &elemlist))
3987  {
3988  GUC_check_errdetail("Invalid list syntax in parameter \"%s\"",
3989  "debug_io_direct");
3990  pfree(rawstring);
3991  list_free(elemlist);
3992  return false;
3993  }
3994 
3995  flags = 0;
3996  foreach(l, elemlist)
3997  {
3998  char *item = (char *) lfirst(l);
3999 
4000  if (pg_strcasecmp(item, "data") == 0)
4001  flags |= IO_DIRECT_DATA;
4002  else if (pg_strcasecmp(item, "wal") == 0)
4003  flags |= IO_DIRECT_WAL;
4004  else if (pg_strcasecmp(item, "wal_init") == 0)
4005  flags |= IO_DIRECT_WAL_INIT;
4006  else
4007  {
4008  GUC_check_errdetail("Invalid option \"%s\"", item);
4009  result = false;
4010  break;
4011  }
4012  }
4013 
4014  /*
4015  * It's possible to configure block sizes smaller than our assumed I/O
4016  * alignment size, which could result in invalid I/O requests.
4017  */
4018 #if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
4019  if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
4020  {
4021  GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small",
4022  "debug_io_direct", "XLOG_BLCKSZ");
4023  result = false;
4024  }
4025 #endif
4026 #if BLCKSZ < PG_IO_ALIGN_SIZE
4027  if (result && (flags & IO_DIRECT_DATA))
4028  {
4029  GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small",
4030  "debug_io_direct", "BLCKSZ");
4031  result = false;
4032  }
4033 #endif
4034 
4035  pfree(rawstring);
4036  list_free(elemlist);
4037 #endif
4038 
4039  if (!result)
4040  return result;
4041 
4042  /* Save the flags in *extra, for use by assign_debug_io_direct */
4043  *extra = guc_malloc(ERROR, sizeof(int));
4044  *((int *) *extra) = flags;
4045 
4046  return result;
4047 }
4048 
4049 void
4050 assign_debug_io_direct(const char *newval, void *extra)
4051 {
4052  int *flags = (int *) extra;
4053 
4054  io_direct_flags = *flags;
4055 }
4056 
4057 /* ResourceOwner callbacks */
4058 
4059 static void
4061 {
4062  File file = (File) DatumGetInt32(res);
4063  Vfd *vfdP;
4064 
4065  Assert(FileIsValid(file));
4066 
4067  vfdP = &VfdCache[file];
4068  vfdP->resowner = NULL;
4069 
4070  FileClose(file);
4071 }
4072 
4073 static char *
4075 {
4076  return psprintf("File %d", DatumGetInt32(res));
4077 }
void begin_startup_progress_phase(void)
Definition: startup.c:343
unsigned int uint32
Definition: c.h:518
#define Min(x, y)
Definition: c.h:1009
uint32 SubTransactionId
Definition: c.h:661
#define INT64_FORMAT
Definition: c.h:551
#define Assert(condition)
Definition: c.h:863
#define PG_BINARY
Definition: c.h:1278
unsigned int Index
Definition: c.h:619
#define MemSet(start, val, len)
Definition: c.h:1025
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:943
int fdatasync(int fildes)
#define OidIsValid(objectId)
Definition: c.h:780
size_t Size
Definition: c.h:610
int closedir(DIR *)
Definition: dirent.c:127
struct dirent * readdir(DIR *)
Definition: dirent.c:78
DIR * opendir(const char *)
Definition: dirent.c:33
int errcode_for_file_access(void)
Definition: elog.c:876
int errdetail(const char *fmt,...)
Definition: elog.c:1203
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define LOG
Definition: elog.h:31
#define FATAL
Definition: elog.h:41
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2931
static int pg_ftruncate(int fd, off_t length)
Definition: fd.c:702
int max_files_per_process
Definition: fd.c:145
void pg_flush_data(int fd, off_t offset, off_t nbytes)
Definition: fd.c:524
int FileGetRawDesc(File file)
Definition: fd.c:2499
int MakePGDirectory(const char *directoryName)
Definition: fd.c:3936
int FreeDir(DIR *dir)
Definition: fd.c:2983
int recovery_init_sync_method
Definition: fd.c:164
static const ResourceOwnerDesc file_resowner_desc
Definition: fd.c:360
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:2132
int pg_fsync_no_writethrough(int fd)
Definition: fd.c:440
#define FD_MINFREE
Definition: fd.c:137
static int numTempTableSpaces
Definition: fd.c:288
static bool ReleaseLruFile(void)
Definition: fd.c:1381
int io_direct_flags
Definition: fd.c:167
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2605
#define FD_DELETE_AT_CLOSE
Definition: fd.c:191
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1108
static int maxAllocatedDescs
Definition: fd.c:267
static void Delete(File file)
Definition: fd.c:1267
static int FreeDesc(AllocateDesc *desc)
Definition: fd.c:2764
static long tempFileCounter
Definition: fd.c:279
static char * ResOwnerPrintFile(Datum res)
Definition: fd.c:4074
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:781
static void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition: fd.c:376
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:3115
static int numAllocatedDescs
Definition: fd.c:266
File PathNameOpenTemporaryFile(const char *path, int mode)
Definition: fd.c:1900
static void LruDelete(File file)
Definition: fd.c:1286
int pg_fdatasync(int fd)
Definition: fd.c:479
#define FileIsValid(file)
Definition: fd.c:185
void assign_debug_io_direct(const char *newval, void *extra)
Definition: fd.c:4050
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2319
static int nfile
Definition: fd.c:221
int CloseTransientFile(int fd)
Definition: fd.c:2831
#define DO_DB(A)
Definition: fd.c:179
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1086
void closeAllVfds(void)
Definition: fd.c:3042
int max_safe_fds
Definition: fd.c:158
static File AllocateVfd(void)
Definition: fd.c:1413
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1860
void PathNameDeleteTemporaryDir(const char *dirname)
Definition: fd.c:1690
int ClosePipeStream(FILE *file)
Definition: fd.c:3013
void AtEOXact_Files(bool isCommit)
Definition: fd.c:3187
int FileGetRawFlags(File file)
Definition: fd.c:2509
static Size SizeVfdCache
Definition: fd.c:216
static int nextTempTableSpace
Definition: fd.c:289
#define FD_CLOSE_AT_EOXACT
Definition: fd.c:192
int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition: fd.c:3820
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3795
static void ResOwnerReleaseFile(Datum res)
Definition: fd.c:4060
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition: fd.c:3416
int FreeFile(FILE *file)
Definition: fd.c:2803
mode_t FileGetRawMode(File file)
Definition: fd.c:2519
static AllocateDesc * allocatedDescs
Definition: fd.c:268
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition: fd.c:963
static int FileAccess(File file)
Definition: fd.c:1491
static void FreeVfd(File file)
Definition: fd.c:1471
struct vfd Vfd
int pg_fsync_writethrough(int fd)
Definition: fd.c:460
void FileClose(File file)
Definition: fd.c:1977
FILE * OpenPipeStream(const char *command, const char *mode)
Definition: fd.c:2708
void ReleaseExternalFD(void)
Definition: fd.c:1238
#define FD_TEMP_FILE_LIMIT
Definition: fd.c:193
void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition: fd.c:3356
bool pg_file_exists(const char *name)
Definition: fd.c:502
void RemovePgTempFiles(void)
Definition: fd.c:3296
#define FileIsNotOpen(file)
Definition: fd.c:188
bool TempTablespacesAreSet(void)
Definition: fd.c:3100
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:755
int FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2391
int FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2076
int data_sync_elevel(int elevel)
Definition: fd.c:3959
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1574
static void Insert(File file)
Definition: fd.c:1312
AllocateDescKind
Definition: fd.c:247
@ AllocateDescDir
Definition: fd.c:250
@ AllocateDescPipe
Definition: fd.c:249
@ AllocateDescFile
Definition: fd.c:248
@ AllocateDescRawFD
Definition: fd.c:251
Oid GetNextTempTableSpace(void)
Definition: fd.c:3133
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1587
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3782
static void ReportTemporaryFileUsage(const char *path, off_t size)
Definition: fd.c:1527
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition: fd.c:1803
bool AcquireExternalFD(void)
Definition: fd.c:1185
static void RegisterTemporaryFile(File file)
Definition: fd.c:1546
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2946
#define NUM_RESERVED_FDS
Definition: fd.c:128
static Oid * tempTableSpaces
Definition: fd.c:287
static bool reserveAllocatedDesc(void)
Definition: fd.c:2530
void InitFileAccess(void)
Definition: fd.c:902
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition: fd.c:3444
File OpenTemporaryFile(bool interXact)
Definition: fd.c:1723
int durable_unlink(const char *fname, int elevel)
Definition: fd.c:871
static uint64 temporary_files_size
Definition: fd.c:235
void ReserveExternalFD(void)
Definition: fd.c:1220
char * FilePathName(File file)
Definition: fd.c:2483
bool looks_like_temp_rel_name(const char *name)
Definition: fd.c:3472
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1931
void set_max_safe_fds(void)
Definition: fd.c:1043
int pg_fsync(int fd)
Definition: fd.c:385
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition: fd.c:3224
#define VFD_CLOSED
Definition: fd.c:183
static bool have_xact_temporary_files
Definition: fd.c:227
static int LruInsert(File file)
Definition: fd.c:1334
static int numExternalFDs
Definition: fd.c:273
static int fsync_parent_path(const char *fname, int elevel)
Definition: fd.c:3896
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition: fd.c:1659
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition: fd.c:3154
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2655
void InitTemporaryFileAccess(void)
Definition: fd.c:932
static Vfd * VfdCache
Definition: fd.c:215
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:2664
bool data_sync_retry
Definition: fd.c:161
static void ReleaseLruFiles(void)
Definition: fd.c:1403
ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info)
Definition: fd.c:2214
void SyncDataDirectory(void)
Definition: fd.c:3567
int FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2346
off_t FileSize(File file)
Definition: fd.c:2431
ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info)
Definition: fd.c:2158
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2448
bool check_debug_io_direct(char **newval, void **extra, GucSource source)
Definition: fd.c:3965
static void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition: fd.c:371
static void BeforeShmemExit_Files(int code, Datum arg)
Definition: fd.c:3201
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition: fd.c:3681
int pg_truncate(const char *path, off_t length)
Definition: fd.c:719
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:3071
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2865
void TempTablespacePath(char *path, Oid tablespace)
Definition: fd.c:1778
#define IO_DIRECT_WAL
Definition: fd.h:55
#define IO_DIRECT_DATA
Definition: fd.h:54
#define IO_DIRECT_WAL_INIT
Definition: fd.h:56
int File
Definition: fd.h:51
#define PG_O_DIRECT
Definition: fd.h:97
int pg_file_create_mode
Definition: file_perm.c:19
int pg_dir_create_mode
Definition: file_perm.c:18
ssize_t pg_pwrite_zeros(int fd, size_t size, off_t offset)
Definition: file_utils.c:688
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:526
#define PG_TEMP_FILES_DIR
Definition: file_utils.h:62
#define PG_TEMP_FILE_PREFIX
Definition: file_utils.h:63
PGFileType
Definition: file_utils.h:19
@ PGFILETYPE_DIR
Definition: file_utils.h:23
@ PGFILETYPE_REG
Definition: file_utils.h:22
@ PGFILETYPE_ERROR
Definition: file_utils.h:20
@ DATA_DIR_SYNC_METHOD_SYNCFS
Definition: file_utils.h:30
@ DATA_DIR_SYNC_METHOD_FSYNC
Definition: file_utils.h:29
int MyProcPid
Definition: globals.c:46
bool enableFsync
Definition: globals.c:128
Oid MyDatabaseTableSpace
Definition: globals.c:95
void * guc_malloc(int elevel, size_t size)
Definition: guc.c:638
#define newval
#define GUC_check_errdetail
Definition: guc.h:476
GucSource
Definition: guc.h:108
int temp_file_limit
Definition: guc_tables.c:533
int log_temp_files
Definition: guc_tables.c:528
#define realloc(a, b)
Definition: header.h:60
#define free(a)
Definition: header.h:65
#define malloc(a)
Definition: header.h:50
#define close(a)
Definition: win32.h:12
void before_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:337
int j
Definition: isn.c:73
int i
Definition: isn.c:72
static void const char fflush(stdout)
void list_free(List *list)
Definition: list.c:1546
Datum subpath(PG_FUNCTION_ARGS)
Definition: ltree_op.c:308
char * pstrdup(const char *in)
Definition: mcxt.c:1696
void pfree(void *pointer)
Definition: mcxt.c:1521
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1541
void * palloc(Size size)
Definition: mcxt.c:1317
#define MAP_FAILED
Definition: mem.h:45
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
void * arg
static char * basedir
static PgChecksumMode mode
Definition: pg_checksums.c:55
#define MAXPGPATH
static ssize_t pg_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
Definition: pg_iovec.h:83
static ssize_t pg_preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset)
Definition: pg_iovec.h:44
#define lfirst(lc)
Definition: pg_list.h:172
uint64 pg_prng_uint64_range(pg_prng_state *state, uint64 rmin, uint64 rmax)
Definition: pg_prng.c:144
pg_prng_state pg_global_prng_state
Definition: pg_prng.c:34
static rewind_source * source
Definition: pg_rewind.c:89
static char * buf
Definition: pg_test_fsync.c:72
static char * tablespace
Definition: pgbench.c:216
void pgstat_report_tempfile(size_t filesize)
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
void get_parent_directory(char *path)
Definition: path.c:991
pqsigfunc pqsignal(int signo, pqsigfunc func)
#define snprintf
Definition: port.h:238
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
uintptr_t Datum
Definition: postgres.h:64
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:212
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:202
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
static int fd(const char *x, int i)
Definition: preproc-init.c:105
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
int forkname_chars(const char *str, ForkNumber *fork)
Definition: relpath.c:81
#define PG_TBLSPC_DIR
Definition: relpath.h:41
#define TABLESPACE_VERSION_DIRECTORY
Definition: relpath.h:33
ResourceOwner CurrentResourceOwner
Definition: resowner.c:165
void ResourceOwnerForget(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition: resowner.c:554
void ResourceOwnerRemember(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition: resowner.c:514
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition: resowner.c:442
@ RESOURCE_RELEASE_AFTER_LOCKS
Definition: resowner.h:56
#define RELEASE_PRIO_FILES
Definition: resowner.h:76
void pg_usleep(long microsec)
Definition: signal.c:53
static pg_noinline void Size size
Definition: slab.c:607
static void error(void)
Definition: sql-dyntest.c:147
#define ereport_startup_progress(msg,...)
Definition: startup.h:18
SubTransactionId create_subid
Definition: fd.c:257
DIR * dir
Definition: fd.c:261
FILE * file
Definition: fd.c:260
int fd
Definition: fd.c:262
union AllocateDesc::@20 desc
AllocateDescKind kind
Definition: fd.c:256
Definition: dirent.c:26
Definition: pg_list.h:54
const char * name
Definition: resowner.h:93
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
__int64 st_size
Definition: win32_port.h:273
unsigned short st_mode
Definition: win32_port.h:268
Definition: fd.c:196
int fd
Definition: fd.c:197
int fileFlags
Definition: fd.c:206
File lruLessRecently
Definition: fd.c:202
File lruMoreRecently
Definition: fd.c:201
char * fileName
Definition: fd.c:204
ResourceOwner resowner
Definition: fd.c:199
unsigned short fdstate
Definition: fd.c:198
File nextFree
Definition: fd.c:200
mode_t fileMode
Definition: fd.c:207
off_t fileSize
Definition: fd.c:203
bool SplitGUCList(char *rawstring, char separator, List **namelist)
Definition: varlena.c:3680
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:85
static void pgstat_report_wait_end(void)
Definition: wait_event.h:101
const char * type
const char * name
#define fsync(fd)
Definition: win32_port.h:85
#define stat
Definition: win32_port.h:284
#define SIG_DFL
Definition: win32_port.h:163
#define EINTR
Definition: win32_port.h:374
#define EOPNOTSUPP
Definition: win32_port.h:398
#define SIGPIPE
Definition: win32_port.h:173
#define lstat(path, sb)
Definition: win32_port.h:285
#define S_ISDIR(m)
Definition: win32_port.h:325
void _dosmaperr(unsigned long)
Definition: win32error.c:177
#define S_ISLNK(m)
Definition: win32_port.h:344
#define mkdir(a, b)
Definition: win32_port.h:80
#define fstat
Definition: win32_port.h:283
#define ftruncate(a, b)
Definition: win32_port.h:82
#define SIG_IGN
Definition: win32_port.h:165
#define O_CLOEXEC
Definition: win32_port.h:359
#define O_DSYNC
Definition: win32_port.h:352
SubTransactionId GetCurrentSubTransactionId(void)
Definition: xact.c:790
int wal_sync_method
Definition: xlog.c:130
@ WAL_SYNC_METHOD_FSYNC_WRITETHROUGH
Definition: xlog.h:27
static const char * directory
Definition: zic.c:634