PostgreSQL Source Code  git master
fd.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  * Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have. (This is around 1024 on many modern
20  * operating systems, but may be lower on others.)
21  *
22  * VFDs are managed as an LRU pool, with actual OS file descriptors
23  * being opened and closed as needed. Obviously, if a routine is
24  * opened using these interfaces, all subsequent operations must also
25  * be through these interfaces (the File type is not a real file
26  * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
39  * for a long time, like relation files. It is the caller's responsibility
40  * to close them; there is no automatic mechanism in fd.c for that.
41  *
42  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43  * temporary files that have names so that they can be shared between
44  * backends. Such files are automatically closed and count against the
45  * temporary file limit of the backend that creates them, but unlike anonymous
46  * files they are not automatically deleted. See sharedfileset.c for a shared
47  * ownership mechanism that provides automatic cleanup for shared files when
48  * the last of a group of backends detaches.
49  *
50  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52  * They behave like the corresponding native functions, except that the handle
53  * is registered with the current subtransaction, and will be automatically
54  * closed at abort. These are intended mainly for short operations like
55  * reading a configuration file; there is a limit on the number of files that
56  * can be opened using these functions at any one time.
57  *
58  * Finally, BasicOpenFile is just a thin wrapper around open() that can
59  * release file descriptors in use by the virtual file descriptors if
60  * necessary. There is no automatic cleanup of file descriptors returned by
61  * BasicOpenFile, it is solely the caller's responsibility to close the file
62  * descriptor by calling close(2).
63  *
64  * If a non-virtual file descriptor needs to be held open for any length of
65  * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66  * (and eventually ReleaseExternalFD), so that we can take it into account
67  * while deciding how many VFDs can be open. This applies to FDs obtained
68  * with BasicOpenFile as well as those obtained without use of any fd.c API.
69  *
70  *-------------------------------------------------------------------------
71  */
72 
73 #include "postgres.h"
74 
75 #include <dirent.h>
76 #include <sys/file.h>
77 #include <sys/param.h>
78 #include <sys/resource.h> /* for getrlimit */
79 #include <sys/stat.h>
80 #include <sys/types.h>
81 #ifndef WIN32
82 #include <sys/mman.h>
83 #endif
84 #include <limits.h>
85 #include <unistd.h>
86 #include <fcntl.h>
87 
88 #include "access/xact.h"
89 #include "access/xlog.h"
90 #include "catalog/pg_tablespace.h"
91 #include "common/file_perm.h"
92 #include "common/file_utils.h"
93 #include "common/pg_prng.h"
94 #include "miscadmin.h"
95 #include "pgstat.h"
96 #include "portability/mem.h"
97 #include "postmaster/startup.h"
98 #include "storage/fd.h"
99 #include "storage/ipc.h"
100 #include "utils/guc.h"
101 #include "utils/guc_hooks.h"
102 #include "utils/resowner.h"
103 #include "utils/varlena.h"
104 
105 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
106 #if defined(HAVE_SYNC_FILE_RANGE)
107 #define PG_FLUSH_DATA_WORKS 1
108 #elif !defined(WIN32) && defined(MS_ASYNC)
109 #define PG_FLUSH_DATA_WORKS 1
110 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
111 #define PG_FLUSH_DATA_WORKS 1
112 #endif
113 
114 /*
115  * We must leave some file descriptors free for system(), the dynamic loader,
116  * and other code that tries to open files without consulting fd.c. This
117  * is the number left free. (While we try fairly hard to prevent EMFILE
118  * errors, there's never any guarantee that we won't get ENFILE due to
119  * other processes chewing up FDs. So it's a bad idea to try to open files
120  * without consulting fd.c. Nonetheless we cannot control all code.)
121  *
122  * Because this is just a fixed setting, we are effectively assuming that
123  * no such code will leave FDs open over the long term; otherwise the slop
124  * is likely to be insufficient. Note in particular that we expect that
125  * loading a shared library does not result in any permanent increase in
126  * the number of open files. (This appears to be true on most if not
127  * all platforms as of Feb 2004.)
128  */
129 #define NUM_RESERVED_FDS 10
130 
131 /*
132  * If we have fewer than this many usable FDs after allowing for the reserved
133  * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
134  * much less than that. Note that this value ensures numExternalFDs can be
135  * at least 16; as of this writing, the contrib/postgres_fdw regression tests
136  * will not pass unless that can grow to at least 14.)
137  */
138 #define FD_MINFREE 48
139 
140 /*
141  * A number of platforms allow individual processes to open many more files
142  * than they can really support when *many* processes do the same thing.
143  * This GUC parameter lets the DBA limit max_safe_fds to something less than
144  * what the postmaster's initial probe suggests will work.
145  */
147 
148 /*
149  * Maximum number of file descriptors to open for operations that fd.c knows
150  * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
151  * to a conservative value, and remains that way indefinitely in bootstrap or
152  * standalone-backend cases. In normal postmaster operation, the postmaster
153  * calls set_max_safe_fds() late in initialization to update the value, and
154  * that value is then inherited by forked subprocesses.
155  *
156  * Note: the value of max_files_per_process is taken into account while
157  * setting this variable, and so need not be tested separately.
158  */
159 int max_safe_fds = FD_MINFREE; /* default if not changed */
160 
161 /* Whether it is safe to continue running after fsync() fails. */
162 bool data_sync_retry = false;
163 
164 /* How SyncDataDirectory() should do its job. */
166 
167 /* Which kinds of files should be opened with PG_O_DIRECT. */
169 
170 /* Debugging.... */
171 
172 #ifdef FDDEBUG
173 #define DO_DB(A) \
174  do { \
175  int _do_db_save_errno = errno; \
176  A; \
177  errno = _do_db_save_errno; \
178  } while (0)
179 #else
180 #define DO_DB(A) \
181  ((void) 0)
182 #endif
183 
184 #define VFD_CLOSED (-1)
185 
186 #define FileIsValid(file) \
187  ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
188 
189 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
190 
191 /* these are the assigned bits in fdstate below: */
192 #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
193 #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
194 #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
195 
196 typedef struct vfd
197 {
198  int fd; /* current FD, or VFD_CLOSED if none */
199  unsigned short fdstate; /* bitflags for VFD's state */
200  ResourceOwner resowner; /* owner, for automatic cleanup */
201  File nextFree; /* link to next free VFD, if in freelist */
202  File lruMoreRecently; /* doubly linked recency-of-use list */
204  off_t fileSize; /* current size of file (0 if not temporary) */
205  char *fileName; /* name of file, or NULL for unused VFD */
206  /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
207  int fileFlags; /* open(2) flags for (re)opening the file */
208  mode_t fileMode; /* mode to pass to open(2) */
209 } Vfd;
210 
211 /*
212  * Virtual File Descriptor array pointer and size. This grows as
213  * needed. 'File' values are indexes into this array.
214  * Note that VfdCache[0] is not a usable VFD, just a list header.
215  */
216 static Vfd *VfdCache;
217 static Size SizeVfdCache = 0;
218 
219 /*
220  * Number of file descriptors known to be in use by VFD entries.
221  */
222 static int nfile = 0;
223 
224 /*
225  * Flag to tell whether it's worth scanning VfdCache looking for temp files
226  * to close
227  */
228 static bool have_xact_temporary_files = false;
229 
230 /*
231  * Tracks the total size of all temporary files. Note: when temp_file_limit
232  * is being enforced, this cannot overflow since the limit cannot be more
233  * than INT_MAX kilobytes. When not enforcing, it could theoretically
234  * overflow, but we don't care.
235  */
236 static uint64 temporary_files_size = 0;
237 
238 /* Temporary file access initialized and not yet shut down? */
239 #ifdef USE_ASSERT_CHECKING
240 static bool temporary_files_allowed = false;
241 #endif
242 
243 /*
244  * List of OS handles opened with AllocateFile, AllocateDir and
245  * OpenTransientFile.
246  */
247 typedef enum
248 {
254 
255 typedef struct
256 {
259  union
260  {
261  FILE *file;
263  int fd;
264  } desc;
265 } AllocateDesc;
266 
267 static int numAllocatedDescs = 0;
268 static int maxAllocatedDescs = 0;
270 
271 /*
272  * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
273  */
274 static int numExternalFDs = 0;
275 
276 /*
277  * Number of temporary files opened during the current session;
278  * this is used in generation of tempfile names.
279  */
280 static long tempFileCounter = 0;
281 
282 /*
283  * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
284  * indicating that the current database's default tablespace should be used.)
285  * When numTempTableSpaces is -1, this has not been set in the current
286  * transaction.
287  */
288 static Oid *tempTableSpaces = NULL;
289 static int numTempTableSpaces = -1;
290 static int nextTempTableSpace = 0;
291 
292 
293 /*--------------------
294  *
295  * Private Routines
296  *
297  * Delete - delete a file from the Lru ring
298  * LruDelete - remove a file from the Lru ring and close its FD
299  * Insert - put a file at the front of the Lru ring
300  * LruInsert - put a file at the front of the Lru ring and open it
301  * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
302  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
303  * AllocateVfd - grab a free (or new) file record (from VfdCache)
304  * FreeVfd - free a file record
305  *
306  * The Least Recently Used ring is a doubly linked list that begins and
307  * ends on element zero. Element zero is special -- it doesn't represent
308  * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
309  * anchor that shows us the beginning/end of the ring.
310  * Only VFD elements that are currently really open (have an FD assigned) are
311  * in the Lru ring. Elements that are "virtually" open can be recognized
312  * by having a non-null fileName field.
313  *
314  * example:
315  *
316  * /--less----\ /---------\
317  * v \ v \
318  * #0 --more---> LeastRecentlyUsed --more-\ \
319  * ^\ | |
320  * \\less--> MostRecentlyUsedFile <---/ |
321  * \more---/ \--less--/
322  *
323  *--------------------
324  */
325 static void Delete(File file);
326 static void LruDelete(File file);
327 static void Insert(File file);
328 static int LruInsert(File file);
329 static bool ReleaseLruFile(void);
330 static void ReleaseLruFiles(void);
331 static File AllocateVfd(void);
332 static void FreeVfd(File file);
333 
334 static int FileAccess(File file);
335 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
336 static bool reserveAllocatedDesc(void);
337 static int FreeDesc(AllocateDesc *desc);
338 
339 static void BeforeShmemExit_Files(int code, Datum arg);
340 static void CleanupTempFiles(bool isCommit, bool isProcExit);
341 static void RemovePgTempRelationFiles(const char *tsdirname);
342 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
343 
344 static void walkdir(const char *path,
345  void (*action) (const char *fname, bool isdir, int elevel),
346  bool process_symlinks,
347  int elevel);
348 #ifdef PG_FLUSH_DATA_WORKS
349 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
350 #endif
351 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
352 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
353 
354 static int fsync_parent_path(const char *fname, int elevel);
355 
356 
357 /* ResourceOwner callbacks to hold virtual file descriptors */
358 static void ResOwnerReleaseFile(Datum res);
359 static char *ResOwnerPrintFile(Datum res);
360 
362 {
363  .name = "File",
364  .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
365  .release_priority = RELEASE_PRIO_FILES,
366  .ReleaseResource = ResOwnerReleaseFile,
367  .DebugPrint = ResOwnerPrintFile
368 };
369 
370 /* Convenience wrappers over ResourceOwnerRemember/Forget */
371 static inline void
373 {
375 }
376 static inline void
378 {
380 }
381 
382 /*
383  * pg_fsync --- do fsync with or without writethrough
384  */
385 int
387 {
388 #if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
389  struct stat st;
390 
391  /*
392  * Some operating system implementations of fsync() have requirements
393  * about the file access modes that were used when their file descriptor
394  * argument was opened, and these requirements differ depending on whether
395  * the file descriptor is for a directory.
396  *
397  * For any file descriptor that may eventually be handed to fsync(), we
398  * should have opened it with access modes that are compatible with
399  * fsync() on all supported systems, otherwise the code may not be
400  * portable, even if it runs ok on the current system.
401  *
402  * We assert here that a descriptor for a file was opened with write
403  * permissions (either O_RDWR or O_WRONLY) and for a directory without
404  * write permissions (O_RDONLY).
405  *
406  * Ignore any fstat errors and let the follow-up fsync() do its work.
407  * Doing this sanity check here counts for the case where fsync() is
408  * disabled.
409  */
410  if (fstat(fd, &st) == 0)
411  {
412  int desc_flags = fcntl(fd, F_GETFL);
413 
414  /*
415  * O_RDONLY is historically 0, so just make sure that for directories
416  * no write flags are used.
417  */
418  if (S_ISDIR(st.st_mode))
419  Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
420  else
421  Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
422  }
423  errno = 0;
424 #endif
425 
426  /* #if is to skip the wal_sync_method test if there's no need for it */
427 #if defined(HAVE_FSYNC_WRITETHROUGH)
429  return pg_fsync_writethrough(fd);
430  else
431 #endif
433 }
434 
435 
436 /*
437  * pg_fsync_no_writethrough --- same as fsync except does nothing if
438  * enableFsync is off
439  */
440 int
442 {
443  int rc;
444 
445  if (!enableFsync)
446  return 0;
447 
448 retry:
449  rc = fsync(fd);
450 
451  if (rc == -1 && errno == EINTR)
452  goto retry;
453 
454  return rc;
455 }
456 
457 /*
458  * pg_fsync_writethrough
459  */
460 int
462 {
463  if (enableFsync)
464  {
465 #if defined(F_FULLFSYNC)
466  return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
467 #else
468  errno = ENOSYS;
469  return -1;
470 #endif
471  }
472  else
473  return 0;
474 }
475 
476 /*
477  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
478  */
479 int
481 {
482  int rc;
483 
484  if (!enableFsync)
485  return 0;
486 
487 retry:
488  rc = fdatasync(fd);
489 
490  if (rc == -1 && errno == EINTR)
491  goto retry;
492 
493  return rc;
494 }
495 
496 /*
497  * pg_file_exists -- check that a file exists.
498  *
499  * This requires an absolute path to the file. Returns true if the path exists and
500  * is not a directory; false if it is a directory, is missing, or is inaccessible.
501  */
502 bool
503 pg_file_exists(const char *name)
504 {
505  struct stat st;
506 
507  Assert(name != NULL);
508 
509  if (stat(name, &st) == 0)
510  return !S_ISDIR(st.st_mode);
511  else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES))
512  ereport(ERROR,
514  errmsg("could not access file \"%s\": %m", name)));
515 
516  return false;
517 }
518 
519 /*
520  * pg_flush_data --- advise OS that the described dirty data should be flushed
521  *
522  * offset of 0 with nbytes 0 means that the entire file should be flushed
523  */
524 void
525 pg_flush_data(int fd, off_t offset, off_t nbytes)
526 {
527  /*
528  * Right now file flushing is primarily used so that later
529  * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
530  * if fsyncs are disabled - that's a decision we might want to make
531  * configurable at some point.
532  */
533  if (!enableFsync)
534  return;
535 
536  /*
537  * We compile all alternatives that are supported on the current platform,
538  * to find portability problems more easily.
539  */
#if defined(HAVE_SYNC_FILE_RANGE)
	{
		int			rc;
		static bool not_implemented_by_kernel = false;

		/* A previous call found the syscall missing; don't try again. */
		if (not_implemented_by_kernel)
			return;

retry:

		/*
		 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
		 * tells the OS that writeback for the specified blocks should be
		 * started, but that we don't want to wait for completion.  Note that
		 * this call might block if too much dirty data exists in the range.
		 * This is the preferable method on OSs supporting it, as it works
		 * reliably when available (contrast to msync()) and doesn't flush out
		 * clean data (like FADV_DONTNEED).
		 */
		rc = sync_file_range(fd, offset, nbytes,
							 SYNC_FILE_RANGE_WRITE);
		if (rc != 0)
		{
			int			elevel;

			/*
			 * sync_file_range() signals failure by returning -1 and storing
			 * the cause in errno, so an interrupted call has to be detected
			 * through errno; the return value itself is never EINTR.
			 */
			if (errno == EINTR)
				goto retry;

			/*
			 * For systems that don't have an implementation of
			 * sync_file_range() such as Windows WSL, generate only one
			 * warning and then suppress all further attempts by this process.
			 */
			if (errno == ENOSYS)
			{
				elevel = WARNING;
				not_implemented_by_kernel = true;
			}
			else
				elevel = data_sync_elevel(WARNING);

			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not flush dirty data: %m")));
		}

		return;
	}
#endif
589 #if !defined(WIN32) && defined(MS_ASYNC)
590  {
591  void *p;
592  static int pagesize = 0;
593 
594  /*
595  * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
596  * writeback. On linux it only does so if MS_SYNC is specified, but
597  * then it does the writeback synchronously. Luckily all common linux
598  * systems have sync_file_range(). This is preferable over
599  * FADV_DONTNEED because it doesn't flush out clean data.
600  *
601  * We map the file (mmap()), tell the kernel to sync back the contents
602  * (msync()), and then remove the mapping again (munmap()).
603  */
604 
605  /* mmap() needs actual length if we want to map whole file */
606  if (offset == 0 && nbytes == 0)
607  {
608  nbytes = lseek(fd, 0, SEEK_END);
609  if (nbytes < 0)
610  {
613  errmsg("could not determine dirty data size: %m")));
614  return;
615  }
616  }
617 
618  /*
619  * Some platforms reject partial-page mmap() attempts. To deal with
620  * that, just truncate the request to a page boundary. If any extra
621  * bytes don't get flushed, well, it's only a hint anyway.
622  */
623 
624  /* fetch pagesize only once */
625  if (pagesize == 0)
626  pagesize = sysconf(_SC_PAGESIZE);
627 
628  /* align length to pagesize, dropping any fractional page */
629  if (pagesize > 0)
630  nbytes = (nbytes / pagesize) * pagesize;
631 
632  /* fractional-page request is a no-op */
633  if (nbytes <= 0)
634  return;
635 
636  /*
637  * mmap could well fail, particularly on 32-bit platforms where there
638  * may simply not be enough address space. If so, silently fall
639  * through to the next implementation.
640  */
641  if (nbytes <= (off_t) SSIZE_MAX)
642  p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
643  else
644  p = MAP_FAILED;
645 
646  if (p != MAP_FAILED)
647  {
648  int rc;
649 
650  rc = msync(p, (size_t) nbytes, MS_ASYNC);
651  if (rc != 0)
652  {
655  errmsg("could not flush dirty data: %m")));
656  /* NB: need to fall through to munmap()! */
657  }
658 
659  rc = munmap(p, (size_t) nbytes);
660  if (rc != 0)
661  {
662  /* FATAL error because mapping would remain */
663  ereport(FATAL,
665  errmsg("could not munmap() while flushing data: %m")));
666  }
667 
668  return;
669  }
670  }
671 #endif
672 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
673  {
674  int rc;
675 
676  /*
677  * Signal the kernel that the passed in range should not be cached
678  * anymore. This has the, desired, side effect of writing out dirty
679  * data, and the, undesired, side effect of likely discarding useful
680  * clean cached blocks. For the latter reason this is the least
681  * preferable method.
682  */
683 
684  rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
685 
686  if (rc != 0)
687  {
688  /* don't error out, this is just a performance optimization */
691  errmsg("could not flush dirty data: %m")));
692  }
693 
694  return;
695  }
696 #endif
697 }
698 
/*
 * pg_ftruncate --- set the length of an already-open file
 *
 * Calls ftruncate(2) on the given descriptor, re-issuing the call for as
 * long as it fails with EINTR.  The result of the final ftruncate() call is
 * returned unchanged, with errno left as that call set it.
 */
static int
pg_ftruncate(int fd, off_t length)
{
	int			rc;

	do
	{
		rc = ftruncate(fd, length);
	} while (rc == -1 && errno == EINTR);

	return rc;
}
715 
716 /*
717  * Truncate a file to a given length by name.
718  */
719 int
720 pg_truncate(const char *path, off_t length)
721 {
722  int ret;
723 #ifdef WIN32
724  int save_errno;
725  int fd;
726 
727  fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
728  if (fd >= 0)
729  {
730  ret = pg_ftruncate(fd, length);
731  save_errno = errno;
733  errno = save_errno;
734  }
735  else
736  ret = -1;
737 #else
738 
739 retry:
740  ret = truncate(path, length);
741 
742  if (ret == -1 && errno == EINTR)
743  goto retry;
744 #endif
745 
746  return ret;
747 }
748 
749 /*
750  * fsync_fname -- fsync a file or directory, handling errors properly
751  *
752  * Try to fsync a file or directory. When doing the latter, ignore errors that
753  * indicate the OS just doesn't allow/require fsyncing directories.
754  */
755 void
756 fsync_fname(const char *fname, bool isdir)
757 {
758  fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
759 }
760 
761 /*
762  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
763  *
764  * This routine ensures that, after returning, the effect of renaming file
765  * persists in case of a crash. A crash while this routine is running will
766  * leave you with either the pre-existing or the moved file in place of the
767  * new file; no mixed state or truncated files are possible.
768  *
769  * It does so by using fsync on the old filename and the possibly existing
770  * target filename before the rename, and the target file and directory after.
771  *
772  * Note that rename() cannot be used across arbitrary directories, as they
773  * might not be on the same filesystem. Therefore this routine does not
774  * support renaming across directories.
775  *
776  * Log errors with the caller specified severity.
777  *
778  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
779  * valid upon return.
780  */
781 int
782 durable_rename(const char *oldfile, const char *newfile, int elevel)
783 {
784  int fd;
785 
786  /*
787  * First fsync the old and target path (if it exists), to ensure that they
788  * are properly persistent on disk. Syncing the target file is not
789  * strictly necessary, but it makes it easier to reason about crashes;
790  * because it's then guaranteed that either source or target file exists
791  * after a crash.
792  */
793  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
794  return -1;
795 
796  fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
797  if (fd < 0)
798  {
799  if (errno != ENOENT)
800  {
801  ereport(elevel,
803  errmsg("could not open file \"%s\": %m", newfile)));
804  return -1;
805  }
806  }
807  else
808  {
809  if (pg_fsync(fd) != 0)
810  {
811  int save_errno;
812 
813  /* close file upon error, might not be in transaction context */
814  save_errno = errno;
816  errno = save_errno;
817 
818  ereport(elevel,
820  errmsg("could not fsync file \"%s\": %m", newfile)));
821  return -1;
822  }
823 
824  if (CloseTransientFile(fd) != 0)
825  {
826  ereport(elevel,
828  errmsg("could not close file \"%s\": %m", newfile)));
829  return -1;
830  }
831  }
832 
833  /* Time to do the real deal... */
834  if (rename(oldfile, newfile) < 0)
835  {
836  ereport(elevel,
838  errmsg("could not rename file \"%s\" to \"%s\": %m",
839  oldfile, newfile)));
840  return -1;
841  }
842 
843  /*
844  * To guarantee renaming the file is persistent, fsync the file with its
845  * new name, and its containing directory.
846  */
847  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
848  return -1;
849 
850  if (fsync_parent_path(newfile, elevel) != 0)
851  return -1;
852 
853  return 0;
854 }
855 
856 /*
857  * durable_unlink -- remove a file in a durable manner
858  *
859  * This routine ensures that, after returning, the effect of removing the file
860  * persists in case of a crash. A crash while this routine is running will
861  * not leave the system in a mixed state.
862  *
863  * It does so by using fsync on the parent directory of the file after the
864  * actual removal is done.
865  *
866  * Log errors with the severity specified by caller.
867  *
868  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
869  * valid upon return.
870  */
871 int
872 durable_unlink(const char *fname, int elevel)
873 {
874  if (unlink(fname) < 0)
875  {
876  ereport(elevel,
878  errmsg("could not remove file \"%s\": %m",
879  fname)));
880  return -1;
881  }
882 
883  /*
884  * To guarantee that the removal of the file is persistent, fsync its
885  * parent directory.
886  */
887  if (fsync_parent_path(fname, elevel) != 0)
888  return -1;
889 
890  return 0;
891 }
892 
893 /*
894  * InitFileAccess --- initialize this module during backend startup
895  *
896  * This is called during either normal or standalone backend start.
897  * It is *not* called in the postmaster.
898  *
899  * Note that this does not initialize temporary file access, that is
900  * separately initialized via InitTemporaryFileAccess().
901  */
902 void
904 {
905  Assert(SizeVfdCache == 0); /* call me only once */
906 
907  /* initialize cache header entry */
908  VfdCache = (Vfd *) malloc(sizeof(Vfd));
909  if (VfdCache == NULL)
910  ereport(FATAL,
911  (errcode(ERRCODE_OUT_OF_MEMORY),
912  errmsg("out of memory")));
913 
914  MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
916 
917  SizeVfdCache = 1;
918 }
919 
920 /*
921  * InitTemporaryFileAccess --- initialize temporary file access during startup
922  *
923  * This is called during either normal or standalone backend start.
924  * It is *not* called in the postmaster.
925  *
926  * This is separate from InitFileAccess() because temporary file cleanup can
927  * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
928  * our reporting has to happen before that. Low level file access should be
929  * available for longer, hence the separate initialization / shutdown of
930  * temporary file handling.
931  */
932 void
934 {
935  Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
936  Assert(!temporary_files_allowed); /* call me only once */
937 
938  /*
939  * Register before-shmem-exit hook to ensure temp files are dropped while
940  * we can still report stats.
941  */
943 
944 #ifdef USE_ASSERT_CHECKING
945  temporary_files_allowed = true;
946 #endif
947 }
948 
949 /*
950  * count_usable_fds --- count how many FDs the system will let us open,
951  * and estimate how many are already open.
952  *
953  * We stop counting if usable_fds reaches max_to_probe. Note: a small
954  * value of max_to_probe might result in an underestimate of already_open;
955  * we must fill in any "gaps" in the set of used FDs before the calculation
956  * of already_open will give the right answer. In practice, max_to_probe
957  * of a couple of dozen should be enough to ensure good results.
958  *
959  * We assume stderr (FD 2) is available for dup'ing. While the calling
960  * script could theoretically close that, it would be a really bad idea,
961  * since then one risks loss of error messages from, e.g., libc.
962  */
963 static void
964 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
965 {
966  int *fd;
967  int size;
968  int used = 0;
969  int highestfd = 0;
970  int j;
971 
972 #ifdef HAVE_GETRLIMIT
973  struct rlimit rlim;
974  int getrlimit_status;
975 #endif
976 
977  size = 1024;
978  fd = (int *) palloc(size * sizeof(int));
979 
980 #ifdef HAVE_GETRLIMIT
981  getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
982  if (getrlimit_status != 0)
983  ereport(WARNING, (errmsg("getrlimit failed: %m")));
984 #endif /* HAVE_GETRLIMIT */
985 
986  /* dup until failure or probe limit reached */
987  for (;;)
988  {
989  int thisfd;
990 
991 #ifdef HAVE_GETRLIMIT
992 
993  /*
994  * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
995  * some platforms
996  */
997  if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
998  break;
999 #endif
1000 
1001  thisfd = dup(2);
1002  if (thisfd < 0)
1003  {
1004  /* Expect EMFILE or ENFILE, else it's fishy */
1005  if (errno != EMFILE && errno != ENFILE)
1006  elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
1007  break;
1008  }
1009 
1010  if (used >= size)
1011  {
1012  size *= 2;
1013  fd = (int *) repalloc(fd, size * sizeof(int));
1014  }
1015  fd[used++] = thisfd;
1016 
1017  if (highestfd < thisfd)
1018  highestfd = thisfd;
1019 
1020  if (used >= max_to_probe)
1021  break;
1022  }
1023 
1024  /* release the files we opened */
1025  for (j = 0; j < used; j++)
1026  close(fd[j]);
1027 
1028  pfree(fd);
1029 
1030  /*
1031  * Return results. usable_fds is just the number of successful dups. We
1032  * assume that the system limit is highestfd+1 (remember 0 is a legal FD
1033  * number) and so already_open is highestfd+1 - usable_fds.
1034  */
1035  *usable_fds = used;
1036  *already_open = highestfd + 1 - used;
1037 }
1038 
1039 /*
1040  * set_max_safe_fds
1041  * Determine number of file descriptors that fd.c is allowed to use
1042  */
1043 void
1045 {
 /*
  * NOTE(review): listing line 1044, which should hold the function name and
  * parameter list, is absent from this extract; the header comment names the
  * routine set_max_safe_fds.
  */
1046  int usable_fds;
1047  int already_open;
1048 
1049  /*----------
1050  * We want to set max_safe_fds to
1051  * MIN(usable_fds, max_files_per_process - already_open)
1052  * less the slop factor for files that are opened without consulting
1053  * fd.c. This ensures that we won't exceed either max_files_per_process
1054  * or the experimentally-determined EMFILE limit.
1055  *----------
1056  */
 /*
  * NOTE(review): listing line 1057 is absent; the line below is the tail of
  * a call that receives the addresses of both locals as output arguments.
  */
1058  &usable_fds, &already_open);
1059 
1060  max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
1061 
1062  /*
1063  * Take off the FDs reserved for system() etc.
1064  */
 /* NOTE(review): listing line 1065, the statement described just above, is absent. */
1066 
1067  /*
1068  * Make sure we still have enough to get by.
1069  */
 /* Reported at FATAL level when the computed limit is below FD_MINFREE. */
1070  if (max_safe_fds < FD_MINFREE)
1071  ereport(FATAL,
1072  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1073  errmsg("insufficient file descriptors available to start server process"),
1074  errdetail("System allows %d, server needs at least %d.",
 /* NOTE(review): listing lines 1075-1076 (the two errdetail arguments) are absent. */
1077 
 /* Debug trace of the final limit together with both probe results. */
1078  elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1079  max_safe_fds, usable_fds, already_open);
1080 }
1081 
1082 /*
1083  * Open a file with BasicOpenFilePerm() and pass default file mode for the
1084  * fileMode parameter.
1085  */
1086 int
1087 BasicOpenFile(const char *fileName, int fileFlags)
1088 {
1089  return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1090 }
1091 
1092 /*
1093  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1094  *
1095  * This is exported for use by places that really want a plain kernel FD,
1096  * but need to be proof against running out of FDs. Once an FD has been
1097  * successfully returned, it is the caller's responsibility to ensure that
1098  * it will not be leaked on ereport()! Most users should *not* call this
1099  * routine directly, but instead use the VFD abstraction level, which
1100  * provides protection against descriptor leaks as well as management of
1101  * files that need to be open for more than a short period of time.
1102  *
1103  * Ideally this should be the *only* direct call of open() in the backend.
1104  * In practice, the postmaster calls open() directly, and there are some
1105  * direct open() calls done early in backend startup. Those are OK since
1106  * this module wouldn't have any open files to close at that point anyway.
1107  */
1108 int
1109 BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1110 {
1111  int fd;
1112 
 /* Re-entered via goto after ReleaseLruFile() reports that it freed an FD. */
1113 tryAgain:
1114 #ifdef PG_O_DIRECT_USE_F_NOCACHE
1115 
1116  /*
1117  * The value we defined to stand in for O_DIRECT when simulating it with
1118  * F_NOCACHE had better not collide with any of the standard flags.
1119  */
 /*
  * NOTE(review): listing line 1120, which opens the check whose operands
  * and message follow, is absent from this extract.
  */
1121  (O_APPEND |
1122  O_CLOEXEC |
1123  O_CREAT |
1124  O_DSYNC |
1125  O_EXCL |
1126  O_RDWR |
1127  O_RDONLY |
1128  O_SYNC |
1129  O_TRUNC |
1130  O_WRONLY)) == 0,
1131  "PG_O_DIRECT value collides with standard flag");
 /* The stand-in bit is masked out so open() never sees it. */
1132  fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1133 #else
1134  fd = open(fileName, fileFlags, fileMode);
1135 #endif
1136 
1137  if (fd >= 0)
1138  {
1139 #ifdef PG_O_DIRECT_USE_F_NOCACHE
 /* Apply the direct-I/O request after the open, through fcntl(F_NOCACHE). */
1140  if (fileFlags & PG_O_DIRECT)
1141  {
1142  if (fcntl(fd, F_NOCACHE, 1) < 0)
1143  {
 /* Keep fcntl's errno across close() so the caller sees the real cause. */
1144  int save_errno = errno;
1145 
1146  close(fd);
1147  errno = save_errno;
1148  return -1;
1149  }
1150  }
1151 #endif
1152 
1153  return fd; /* success! */
1154  }
1155 
 /* Only FD exhaustion is retried; any other errno falls through to failure. */
1156  if (errno == EMFILE || errno == ENFILE)
1157  {
1158  int save_errno = errno;
1159 
1160  ereport(LOG,
1161  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1162  errmsg("out of file descriptors: %m; release and retry")));
1163  errno = 0;
1164  if (ReleaseLruFile())
1165  goto tryAgain;
 /* Nothing could be released: restore the errno of the failed open(). */
1166  errno = save_errno;
1167  }
1168 
1169  return -1; /* failure */
1170 }
1171 
1172 /*
1173  * AcquireExternalFD - attempt to reserve an external file descriptor
1174  *
1175  * This should be used by callers that need to hold a file descriptor open
1176  * over more than a short interval, but cannot use any of the other facilities
1177  * provided by this module.
1178  *
1179  * The difference between this and the underlying ReserveExternalFD function
1180  * is that this will report failure (by setting errno and returning false)
1181  * if "too many" external FDs are already reserved. This should be used in
1182  * any code where the total number of FDs to be reserved is not predictable
1183  * and small.
1184  */
1185 bool
1187 {
 /*
  * NOTE(review): listing line 1186 (function name and parameter list) is
  * absent from this extract; the header comment names it AcquireExternalFD.
  */
1188  /*
1189  * We don't want more than max_safe_fds / 3 FDs to be consumed for
1190  * "external" FDs.
1191  */
1192  if (numExternalFDs < max_safe_fds / 3)
1193  {
 /* NOTE(review): listing line 1194, the statement executed before returning true, is absent. */
1195  return true;
1196  }
 /* Over the cap: report failure through errno and the return value. */
1197  errno = EMFILE;
1198  return false;
1199 }
1200 
1201 /*
1202  * ReserveExternalFD - report external consumption of a file descriptor
1203  *
1204  * This should be used by callers that need to hold a file descriptor open
1205  * over more than a short interval, but cannot use any of the other facilities
1206  * provided by this module. This just tracks the use of the FD and closes
1207  * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1208  *
1209  * Call this directly only in code where failure to reserve the FD would be
1210  * fatal; for example, the WAL-writing code does so, since the alternative is
1211  * session failure. Also, it's very unwise to do so in code that could
1212  * consume more than one FD per process.
1213  *
1214  * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1215  * available, it doesn't matter too much whether this is called before or
1216  * after actually opening the FD; but doing so beforehand reduces the risk of
1217  * an EMFILE failure if not everybody played nice. In any case, it's solely
1218  * caller's responsibility to keep the external-FD count in sync with reality.
1219  */
1220 void
1222 {
 /*
  * NOTE(review): listing line 1221 (function name and parameter list) is
  * absent from this extract; the header comment names it ReserveExternalFD.
  */
1223  /*
1224  * Release VFDs if needed to stay safe. Because we do this before
1225  * incrementing numExternalFDs, the final state will be as desired, i.e.,
1226  * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1227  */
1228  ReleaseLruFiles();
1229 
 /* Unconditional: unlike the checked variant there is no upper bound test here. */
1230  numExternalFDs++;
1231 }
1232 
1233 /*
1234  * ReleaseExternalFD - report release of an external file descriptor
1235  *
1236  * This is guaranteed not to change errno, so it can be used in failure paths.
1237  */
1238 void
1240 {
 /*
  * NOTE(review): listing line 1239 (function name and parameter list) is
  * absent from this extract; the header comment names it ReleaseExternalFD.
  */
 /* The counter must not go negative; in assert-enabled builds this is checked. */
1241  Assert(numExternalFDs > 0);
1242  numExternalFDs--;
1243 }
1244 
1245 
1246 #if defined(FDDEBUG)
1247 
1248 static void
1249 _dump_lru(void)
1250 {
1251  int mru = VfdCache[0].lruLessRecently;
1252  Vfd *vfdP = &VfdCache[mru];
1253  char buf[2048];
1254 
1255  snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
1256  while (mru != 0)
1257  {
1258  mru = vfdP->lruLessRecently;
1259  vfdP = &VfdCache[mru];
1260  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
1261  }
1262  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
1263  elog(LOG, "%s", buf);
1264 }
1265 #endif /* FDDEBUG */
1266 
1267 static void
1269 {
 /*
  * NOTE(review): listing line 1268 (function name and parameter list) is
  * absent from this extract; the debug message below identifies the routine
  * as "Delete", and the body uses a VFD index named file.
  */
1270  Vfd *vfdP;
1271 
 /* Index 0 is the ring header, never a real file. */
1272  Assert(file != 0);
1273 
1274  DO_DB(elog(LOG, "Delete %d (%s)",
1275  file, VfdCache[file].fileName));
1276  DO_DB(_dump_lru());
1277 
1278  vfdP = &VfdCache[file];
1279 
 /*
  * NOTE(review): listing lines 1280-1281 are absent; presumably they unlink
  * vfdP from its ring neighbours -- confirm against the full file.
  */
1282 
1283  DO_DB(_dump_lru());
1284 }
1285 
1286 static void
1288 {
 /*
  * NOTE(review): listing line 1287 (function name and parameter list) is
  * absent from this extract; the debug message below identifies the routine
  * as "LruDelete".
  */
1289  Vfd *vfdP;
1290 
1291  Assert(file != 0);
1292 
1293  DO_DB(elog(LOG, "LruDelete %d (%s)",
1294  file, VfdCache[file].fileName));
1295 
1296  vfdP = &VfdCache[file];
1297 
1298  /*
1299  * Close the file. We aren't expecting this to fail; if it does, better
1300  * to leak the FD than to mess up our internal state.
1301  */
1302  if (close(vfdP->fd) != 0)
 /*
  * NOTE(review): listing line 1303, the start of the error report whose
  * message arguments follow, is absent from this extract.
  */
1304  "could not close file \"%s\": %m", vfdP->fileName);
 /* Bookkeeping is updated whether or not close() succeeded. */
1305  vfdP->fd = VFD_CLOSED;
1306  --nfile;
1307 
1308  /* delete the vfd record from the LRU ring */
1309  Delete(file);
1310 }
1311 
1312 static void
1314 {
 /*
  * NOTE(review): listing line 1313 (function name and parameter list) is
  * absent from this extract; the debug message below identifies the routine
  * as "Insert".
  */
1315  Vfd *vfdP;
1316 
1317  Assert(file != 0);
1318 
1319  DO_DB(elog(LOG, "Insert %d (%s)",
1320  file, VfdCache[file].fileName));
1321  DO_DB(_dump_lru());
1322 
1323  vfdP = &VfdCache[file];
1324 
 /* The entry's more-recent neighbour becomes the ring header (index 0). */
1325  vfdP->lruMoreRecently = 0;
 /* NOTE(review): listing line 1326 is absent from this extract. */
 /* The header's less-recent link now points at this entry. */
1327  VfdCache[0].lruLessRecently = file;
 /* NOTE(review): listing line 1328 is absent from this extract. */
1329 
1330  DO_DB(_dump_lru());
1331 }
1332 
1333 /* returns 0 on success, -1 on re-open failure (with errno set) */
1334 static int
1336 {
 /*
  * NOTE(review): listing line 1335 (function name and parameter list) is
  * absent from this extract; the debug message below identifies the routine
  * as "LruInsert".
  */
1337  Vfd *vfdP;
1338 
1339  Assert(file != 0);
1340 
1341  DO_DB(elog(LOG, "LruInsert %d (%s)",
1342  file, VfdCache[file].fileName));
1343 
1344  vfdP = &VfdCache[file];
1345 
 /* Re-open only if the VFD currently has no kernel FD behind it. */
1346  if (FileIsNotOpen(file))
1347  {
1348  /* Close excess kernel FDs. */
1349  ReleaseLruFiles();
1350 
1351  /*
1352  * The open could still fail for lack of file descriptors, eg due to
1353  * overall system file table being full. So, be prepared to release
1354  * another FD if necessary...
1355  */
 /* Re-open using the name, flags and mode saved in the VFD entry. */
1356  vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1357  vfdP->fileMode);
1358  if (vfdP->fd < 0)
1359  {
 /* Failure: return before the entry is linked into the ring. */
1360  DO_DB(elog(LOG, "re-open failed: %m"));
1361  return -1;
1362  }
1363  else
1364  {
 /* One more kernel FD is now held by the VFD layer. */
1365  ++nfile;
1366  }
1367  }
1368 
1369  /*
1370  * put it at the head of the Lru ring
1371  */
1372 
1373  Insert(file);
1374 
1375  return 0;
1376 }
1377 
1378 /*
1379  * Release one kernel FD by closing the least-recently-used VFD.
1380  */
1381 static bool
1383 {
 /*
  * NOTE(review): listing line 1382 (function name and parameter list) is
  * absent from this extract; the debug message below identifies the routine
  * as "ReleaseLruFile".
  */
1384  DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1385 
1386  if (nfile > 0)
1387  {
1388  /*
1389  * There are opened files and so there should be at least one used vfd
1390  * in the ring.
1391  */
1392  Assert(VfdCache[0].lruMoreRecently != 0);
 /* The header's lruMoreRecently link designates the entry to close. */
1393  LruDelete(VfdCache[0].lruMoreRecently);
1394  return true; /* freed a file */
1395  }
1396  return false; /* no files available to free */
1397 }
1398 
1399 /*
1400  * Release kernel FDs as needed to get under the max_safe_fds limit.
1401  * After calling this, it's OK to try to open another file.
1402  */
1403 static void
1405 {
 /*
  * NOTE(review): listing lines 1404 (function name and parameter list) and
  * 1406 (the loop header guarding the block below) are absent from this
  * extract; the header comment and callers name the routine
  * ReleaseLruFiles.
  */
1407  {
 /* Stop as soon as there is nothing left that can be closed. */
1408  if (!ReleaseLruFile())
1409  break;
1410  }
1411 }
1412 
1413 static File
1415 {
 /*
  * NOTE(review): listing line 1414 (function name and parameter list) is
  * absent from this extract; the debug message below identifies the routine
  * as "AllocateVfd".  It returns the index of a free VfdCache slot, growing
  * the array when the free list is empty.
  */
1416  Index i;
1417  File file;
1418 
1419  DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1420 
1421  Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1422 
 /* Slot 0 is the list header; a nextFree of 0 there means "no free slots". */
1423  if (VfdCache[0].nextFree == 0)
1424  {
1425  /*
1426  * The free list is empty so it is time to increase the size of the
1427  * array. We choose to double it each time this happens. However,
1428  * there's not much point in starting *real* small.
1429  */
1430  Size newCacheSize = SizeVfdCache * 2;
1431  Vfd *newVfdCache;
1432 
1433  if (newCacheSize < 32)
1434  newCacheSize = 32;
1435 
1436  /*
1437  * Be careful not to clobber VfdCache ptr if realloc fails.
1438  */
1439  newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1440  if (newVfdCache == NULL)
1441  ereport(ERROR,
1442  (errcode(ERRCODE_OUT_OF_MEMORY),
1443  errmsg("out of memory")));
1444  VfdCache = newVfdCache;
1445 
1446  /*
1447  * Initialize the new entries and link them into the free list.
1448  */
1449  for (i = SizeVfdCache; i < newCacheSize; i++)
1450  {
1451  MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1452  VfdCache[i].nextFree = i + 1;
1453  VfdCache[i].fd = VFD_CLOSED;
1454  }
 /* Terminate the chain: the last new slot has no successor. */
1455  VfdCache[newCacheSize - 1].nextFree = 0;
 /* NOTE(review): listing line 1456 is absent from this extract. */
1457 
1458  /*
1459  * Record the new size
1460  */
1461  SizeVfdCache = newCacheSize;
1462  }
1463 
 /* Pop the first slot off the free list. */
1464  file = VfdCache[0].nextFree;
1465 
1466  VfdCache[0].nextFree = VfdCache[file].nextFree;
1467 
1468  return file;
1469 }
1470 
1471 static void
1473 {
 /*
  * NOTE(review): listing line 1472 (function name and parameter list) is
  * absent from this extract; the debug message below identifies the routine
  * as "FreeVfd".  It releases the slot's file name and pushes the slot onto
  * the free list.
  */
1474  Vfd *vfdP = &VfdCache[file];
1475 
1476  DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1477  file, vfdP->fileName ? vfdP->fileName : ""));
1478 
 /* The name may still be unset, so a NULL fileName is tolerated. */
1479  if (vfdP->fileName != NULL)
1480  {
1481  free(vfdP->fileName);
1482  vfdP->fileName = NULL;
1483  }
1484  vfdP->fdstate = 0x0;
1485 
 /* Push onto the head of the free list kept in slot 0. */
1486  vfdP->nextFree = VfdCache[0].nextFree;
1487  VfdCache[0].nextFree = file;
1488 }
1489 
1490 /* returns 0 on success, -1 on re-open failure (with errno set) */
1491 static int
1493 {
 /*
  * NOTE(review): listing line 1492 (function name and parameter list) is
  * absent from this extract; the debug message below identifies the routine
  * as "FileAccess".
  */
1494  int returnValue;
1495 
1496  DO_DB(elog(LOG, "FileAccess %d (%s)",
1497  file, VfdCache[file].fileName));
1498 
1499  /*
1500  * Is the file open? If not, open it and put it at the head of the LRU
1501  * ring (possibly closing the least recently used file to get an FD).
1502  */
1503 
1504  if (FileIsNotOpen(file))
1505  {
 /* Propagate a re-open failure unchanged. */
1506  returnValue = LruInsert(file);
1507  if (returnValue != 0)
1508  return returnValue;
1509  }
 /* Already open: skip the relinking when it is already the entry after the header. */
1510  else if (VfdCache[0].lruLessRecently != file)
1511  {
1512  /*
1513  * We now know that the file is open and that it is not the last one
1514  * accessed, so we need to move it to the head of the Lru ring.
1515  */
1516 
1517  Delete(file);
1518  Insert(file);
1519  }
1520 
1521  return 0;
1522 }
1523 
1524 /*
1525  * Called whenever a temporary file is deleted to report its size.
1526  */
1527 static void
1528 ReportTemporaryFileUsage(const char *path, off_t size)
1529 {
 /* NOTE(review): listing line 1530 is absent from this extract. */
1531 
 /* A negative log_temp_files skips the logging below. */
1532  if (log_temp_files >= 0)
1533  {
 /* size is divided by 1024 before the comparison with the threshold. */
1534  if ((size / 1024) >= log_temp_files)
1535  ereport(LOG,
1536  (errmsg("temporary file: path \"%s\", size %lu",
1537  path, (unsigned long) size)));
1538  }
1539 }
1540 
1541 /*
1542  * Called to register a temporary file for automatic close.
1543  * ResourceOwnerEnlarge(CurrentResourceOwner) must have been called
1544  * before the file was opened.
1545  */
1546 static void
1548 {
 /*
  * NOTE(review): listing lines 1547 (function name and parameter list),
  * 1549-1550 and 1553-1554 are absent from this extract, so none of this
  * routine's statements are visible here.  Callers in this file invoke it as
  * RegisterTemporaryFile(file).
  */
1551 
1552  /* Backup mechanism for closing at end of xact. */
1555 }
1556 
1557 /*
1558  * Called when we get a shared invalidation message on some relation.
1559  */
1560 #ifdef NOT_USED
1561 void
1562 FileInvalidate(File file)
1563 {
1564  Assert(FileIsValid(file));
1565  if (!FileIsNotOpen(file))
1566  LruDelete(file);
1567 }
1568 #endif
1569 
1570 /*
1571  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1572  * fileMode parameter.
1573  */
1574 File
1575 PathNameOpenFile(const char *fileName, int fileFlags)
1576 {
1577  return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1578 }
1579 
1580 /*
1581  * open a file in an arbitrary directory
1582  *
1583  * NB: if the passed pathname is relative (which it usually is),
1584  * it will be interpreted relative to the process' working directory
1585  * (which should always be $PGDATA when this code is running).
1586  */
1587 File
1588 PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1589 {
1590  char *fnamecopy;
1591  File file;
1592  Vfd *vfdP;
1593 
1594  DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1595  fileName, fileFlags, fileMode));
1596 
1597  /*
1598  * We need a malloc'd copy of the file name; fail cleanly if no room.
1599  */
1600  fnamecopy = strdup(fileName);
1601  if (fnamecopy == NULL)
1602  ereport(ERROR,
1603  (errcode(ERRCODE_OUT_OF_MEMORY),
1604  errmsg("out of memory")));
1605 
1606  file = AllocateVfd();
1607  vfdP = &VfdCache[file];
1608 
1609  /* Close excess kernel FDs. */
1610  ReleaseLruFiles();
1611 
1612  /*
1613  * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1614  * client shouldn't be expected to know which kernel descriptors are
1615  * currently open, so it wouldn't make sense for them to be inherited by
1616  * executed subprograms.
1617  */
1618  fileFlags |= O_CLOEXEC;
1619 
1620  vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1621 
1622  if (vfdP->fd < 0)
1623  {
1624  int save_errno = errno;
1625 
1626  FreeVfd(file);
1627  free(fnamecopy);
1628  errno = save_errno;
1629  return -1;
1630  }
1631  ++nfile;
1632  DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1633  vfdP->fd));
1634 
1635  vfdP->fileName = fnamecopy;
1636  /* Saved flags are adjusted to be OK for re-opening file */
1637  vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1638  vfdP->fileMode = fileMode;
1639  vfdP->fileSize = 0;
1640  vfdP->fdstate = 0x0;
1641  vfdP->resowner = NULL;
1642 
1643  Insert(file);
1644 
1645  return file;
1646 }
1647 
1648 /*
1649  * Create directory 'directory'. If necessary, create 'basedir', which must
1650  * be the directory above it. This is designed for creating the top-level
1651  * temporary directory on demand before creating a directory underneath it.
1652  * Do nothing if the directory already exists.
1653  *
1654  * Directories created within the top-level temporary directory should begin
1655  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1656  * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1657  * that do not need any particular prefix.
1658 */
1659 void
1661 {
 /*
  * NOTE(review): listing line 1660 (function name and parameter list) is
  * absent from this extract; the body uses two path arguments named basedir
  * and directory.
  */
1662  if (MakePGDirectory(directory) < 0)
1663  {
 /* Already present: nothing to do. */
1664  if (errno == EEXIST)
1665  return;
1666 
1667  /*
1668  * Failed. Try to create basedir first in case it's missing. Tolerate
1669  * EEXIST to close a race against another process following the same
1670  * algorithm.
1671  */
1672  if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1673  ereport(ERROR,
 /* NOTE(review): listing line 1674 is absent from this extract. */
1675  errmsg("cannot create temporary directory \"%s\": %m",
1676  basedir)));
1677 
1678  /* Try again. */
1679  if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1680  ereport(ERROR,
 /* NOTE(review): listing line 1681 is absent from this extract. */
1682  errmsg("cannot create temporary subdirectory \"%s\": %m",
1683  directory)));
1684  }
1685 }
1686 
1687 /*
1688  * Delete a directory and everything in it, if it exists.
1689  */
1690 void
1691 PathNameDeleteTemporaryDir(const char *dirname)
1692 {
1693  struct stat statbuf;
1694 
1695  /* Silently ignore missing directory. */
1696  if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1697  return;
1698 
1699  /*
1700  * Currently, walkdir doesn't offer a way for our passed in function to
1701  * maintain state. Perhaps it should, so that we could tell the caller
1702  * whether this operation succeeded or failed. Since this operation is
1703  * used in a cleanup path, we wouldn't actually behave differently: we'll
1704  * just log failures.
1705  */
1706  walkdir(dirname, unlink_if_exists_fname, false, LOG);
1707 }
1708 
1709 /*
1710  * Open a temporary file that will disappear when we close it.
1711  *
1712  * This routine takes care of generating an appropriate tempfile name.
1713  * There's no need to pass in fileFlags or fileMode either, since only
1714  * one setting makes any sense for a temp file.
1715  *
1716  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1717  * to ensure it's closed and deleted when it's no longer needed, typically at
1718  * the end-of-transaction. In most cases, you don't want temporary files to
1719  * outlive the transaction that created them, so this should be false -- but
1720  * if you need "somewhat" temporary storage, this might be useful. In either
1721  * case, the file is removed when the File is explicitly closed.
1722  */
1723 File
1724 OpenTemporaryFile(bool interXact)
1725 {
 /* 0 doubles as "not opened yet"; see the file <= 0 test further down. */
1726  File file = 0;
1727 
1728  Assert(temporary_files_allowed); /* check temp file access is up */
1729 
1730  /*
1731  * Make sure the current resource owner has space for this File before we
1732  * open it, if we'll be registering it below.
1733  */
1734  if (!interXact)
 /* NOTE(review): listing line 1735, the statement controlled by this if, is absent. */
1736 
1737  /*
1738  * If some temp tablespace(s) have been given to us, try to use the next
1739  * one. If a given tablespace can't be found, we silently fall back to
1740  * the database's default tablespace.
1741  *
1742  * BUT: if the temp file is slated to outlive the current transaction,
1743  * force it into the database's default tablespace, so that it will not
1744  * pose a threat to possible tablespace drop attempts.
1745  */
1746  if (numTempTableSpaces > 0 && !interXact)
1747  {
1748  Oid tblspcOid = GetNextTempTableSpace();
1749 
 /* rejectError is false here, so a failure falls through to the fallback below. */
1750  if (OidIsValid(tblspcOid))
1751  file = OpenTemporaryFileInTablespace(tblspcOid, false);
1752  }
1753 
1754  /*
1755  * If not, or if tablespace is bad, create in database's default
1756  * tablespace. MyDatabaseTableSpace should normally be set before we get
1757  * here, but just in case it isn't, fall back to pg_default tablespace.
1758  */
1759  if (file <= 0)
 /*
  * NOTE(review): listing lines 1760-1761, the start of the call whose
  * trailing arguments follow, are absent from this extract.
  */
1762  DEFAULTTABLESPACE_OID,
1763  true);
1764 
1765  /* Mark it for deletion at close and temporary file size limit */
 /* NOTE(review): listing line 1766, the statement this comment describes, is absent. */
1767 
1768  /* Register it with the current resource owner */
1769  if (!interXact)
1770  RegisterTemporaryFile(file);
1771 
1772  return file;
1773 }
1774 
1775 /*
1776  * Return the path of the temp directory in a given tablespace.
1777  */
1778 void
1780 {
 /*
  * NOTE(review): listing line 1779 (function name and parameter list) is
  * absent from this extract; a caller in this file invokes it as
  * TempTablespacePath(tempdirpath, tblspcOid).  Both visible writes are
  * bounded by MAXPGPATH.
  */
1781  /*
1782  * Identify the tempfile directory for this tablespace.
1783  *
1784  * If someone tries to specify pg_global, use pg_default instead.
1785  */
1786  if (tablespace == InvalidOid ||
1787  tablespace == DEFAULTTABLESPACE_OID ||
1788  tablespace == GLOBALTABLESPACE_OID)
1789  snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1790  else
1791  {
1792  /* All other tablespaces are accessed via symlinks */
1793  snprintf(path, MAXPGPATH, "%s/%u/%s/%s",
 /* NOTE(review): listing lines 1794-1795 (the four format arguments) are absent. */
1796  }
1797 }
1798 
1799 /*
1800  * Open a temporary file in a specific tablespace.
1801  * Subroutine for OpenTemporaryFile, which see for details.
1802  */
1803 static File
1804 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1805 {
1806  char tempdirpath[MAXPGPATH];
1807  char tempfilepath[MAXPGPATH];
1808  File file;
1809 
1810  TempTablespacePath(tempdirpath, tblspcOid);
1811 
1812  /*
1813  * Generate a tempfile name that should be unique within the current
1814  * database instance.
1815  */
1816  snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1817  tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1818 
1819  /*
1820  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1821  * temp file that can be reused.
1822  */
1823  file = PathNameOpenFile(tempfilepath,
1824  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1825  if (file <= 0)
1826  {
1827  /*
1828  * We might need to create the tablespace's tempfile directory, if no
1829  * one has yet done so.
1830  *
1831  * Don't check for an error from MakePGDirectory; it could fail if
1832  * someone else just did the same thing. If it doesn't work then
1833  * we'll bomb out on the second create attempt, instead.
1834  */
1835  (void) MakePGDirectory(tempdirpath);
1836 
1837  file = PathNameOpenFile(tempfilepath,
1838  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1839  if (file <= 0 && rejectError)
1840  elog(ERROR, "could not create temporary file \"%s\": %m",
1841  tempfilepath);
1842  }
1843 
1844  return file;
1845 }
1846 
1847 
1848 /*
1849  * Create a new file. The directory containing it must already exist. Files
1850  * created this way are subject to temp_file_limit and are automatically
1851  * closed at end of transaction, but are not automatically deleted on close
1852  * because they are intended to be shared between cooperating backends.
1853  *
1854  * If the file is inside the top-level temporary directory, its name should
1855  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1856  * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1857  * inside a directory created with PathNameCreateTemporaryDir(), in which case
1858  * the prefix isn't needed.
1859  */
1860 File
1861 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1862 {
1863  File file;
1864 
1865  Assert(temporary_files_allowed); /* check temp file access is up */
1866 
 /* NOTE(review): listing line 1867 is absent from this extract. */
1868 
1869  /*
1870  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1871  * temp file that can be reused.
1872  */
1873  file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1874  if (file <= 0)
1875  {
 /* Either raise ERROR or hand the failing value straight back to the caller. */
1876  if (error_on_failure)
1877  ereport(ERROR,
 /* NOTE(review): listing line 1878 is absent from this extract. */
1879  errmsg("could not create temporary file \"%s\": %m",
1880  path)));
1881  else
1882  return file;
1883  }
1884 
1885  /* Mark it for temp_file_limit accounting. */
 /* NOTE(review): listing line 1886, the statement this comment describes, is absent. */
1887 
1888  /* Register it for automatic close. */
1889  RegisterTemporaryFile(file);
1890 
1891  return file;
1892 }
1893 
1894 /*
1895  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1896  * another backend. Files opened this way don't count against the
1897  * temp_file_limit of the caller, are automatically closed at the end of the
1898  * transaction but are not deleted on close.
1899  */
1900 File
1901 PathNameOpenTemporaryFile(const char *path, int mode)
1902 {
1903  File file;
1904 
1905  Assert(temporary_files_allowed); /* check temp file access is up */
1906 
 /* NOTE(review): listing line 1907 is absent from this extract. */
1908 
 /* mode is passed as the open flags, with PG_BINARY added. */
1909  file = PathNameOpenFile(path, mode | PG_BINARY);
1910 
1911  /* If no such file, then we don't raise an error. */
1912  if (file <= 0 && errno != ENOENT)
1913  ereport(ERROR,
 /* NOTE(review): listing line 1914 is absent from this extract. */
1915  errmsg("could not open temporary file \"%s\": %m",
1916  path)));
1917 
 /* In the ENOENT case the non-positive value is returned without registration. */
1918  if (file > 0)
1919  {
1920  /* Register it for automatic close. */
1921  RegisterTemporaryFile(file);
1922  }
1923 
1924  return file;
1925 }
1926 
1927 /*
1928  * Delete a file by pathname. Return true if the file existed, false if
1929  * it didn't.
1930  */
1931 bool
1932 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1933 {
1934  struct stat filestats;
 /* errno captured from stat(), or 0 if stat() succeeded. */
1935  int stat_errno;
1936 
1937  /* Get the final size for pgstat reporting. */
1938  if (stat(path, &filestats) != 0)
1939  stat_errno = errno;
1940  else
1941  stat_errno = 0;
1942 
1943  /*
1944  * Unlike FileClose's automatic file deletion code, we tolerate
1945  * non-existence to support BufFileDeleteFileSet which doesn't know how
1946  * many segments it has to delete until it runs out.
1947  */
1948  if (stat_errno == ENOENT)
1949  return false;
1950 
1951  if (unlink(path) < 0)
1952  {
 /*
  * ENOENT from unlink is silent; other failures are reported at ERROR or
  * LOG depending on error_on_failure.  In the LOG case control continues
  * to the return false below.
  */
1953  if (errno != ENOENT)
1954  ereport(error_on_failure ? ERROR : LOG,
 /* NOTE(review): listing line 1955 is absent from this extract. */
1956  errmsg("could not unlink temporary file \"%s\": %m",
1957  path)));
1958  return false;
1959  }
1960 
 /* The unlink succeeded: report the size if stat() worked, else log why it did not. */
1961  if (stat_errno == 0)
1962  ReportTemporaryFileUsage(path, filestats.st_size);
1963  else
1964  {
 /* Restore stat()'s errno so that %m in the message below describes it. */
1965  errno = stat_errno;
1966  ereport(LOG,
 /* NOTE(review): listing line 1967 is absent from this extract. */
1968  errmsg("could not stat file \"%s\": %m", path)));
1969  }
1970 
1971  return true;
1972 }
1973 
1974 /*
1975  * close a file when done with it
1976  */
1977 void
1979 {
 /*
  * NOTE(review): listing line 1978 (function name and parameter list) is
  * absent from this extract; the debug message below identifies the routine
  * as "FileClose".
  */
1980  Vfd *vfdP;
1981 
1982  Assert(FileIsValid(file));
1983 
1984  DO_DB(elog(LOG, "FileClose: %d (%s)",
1985  file, VfdCache[file].fileName));
1986 
1987  vfdP = &VfdCache[file];
1988 
 /* Only a VFD that currently holds a kernel FD needs the close and ring removal. */
1989  if (!FileIsNotOpen(file))
1990  {
1991  /* close the file */
1992  if (close(vfdP->fd) != 0)
1993  {
1994  /*
1995  * We may need to panic on failure to close non-temporary files;
1996  * see LruDelete.
1997  */
 /*
  * NOTE(review): listing line 1998, the start of the error report whose
  * message arguments follow, is absent from this extract.
  */
1999  "could not close file \"%s\": %m", vfdP->fileName);
2000  }
2001 
2002  --nfile;
2003  vfdP->fd = VFD_CLOSED;
2004 
2005  /* remove the file from the lru ring */
2006  Delete(file);
2007  }
2008 
2009  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2010  {
2011  /* Subtract its size from current usage (do first in case of error) */
2012  temporary_files_size -= vfdP->fileSize;
2013  vfdP->fileSize = 0;
2014  }
2015 
2016  /*
2017  * Delete the file if it was temporary, and make a log entry if wanted
2018  */
2019  if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
2020  {
2021  struct stat filestats;
 /* errno captured from stat(), or 0 if stat() succeeded. */
2022  int stat_errno;
2023 
2024  /*
2025  * If we get an error, as could happen within the ereport/elog calls,
2026  * we'll come right back here during transaction abort. Reset the
2027  * flag to ensure that we can't get into an infinite loop. This code
2028  * is arranged to ensure that the worst-case consequence is failing to
2029  * emit log message(s), not failing to attempt the unlink.
2030  */
2031  vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
2032 
2033 
2034  /* first try the stat() */
2035  if (stat(vfdP->fileName, &filestats))
2036  stat_errno = errno;
2037  else
2038  stat_errno = 0;
2039 
2040  /* in any case do the unlink */
2041  if (unlink(vfdP->fileName))
2042  ereport(LOG,
 /* NOTE(review): listing line 2043 is absent from this extract. */
2044  errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
2045 
2046  /* and last report the stat results */
2047  if (stat_errno == 0)
2048  ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2049  else
2050  {
 /* Restore stat()'s errno so that %m in the message below describes it. */
2051  errno = stat_errno;
2052  ereport(LOG,
 /* NOTE(review): listing line 2053 is absent from this extract. */
2054  errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2055  }
2056  }
2057 
2058  /* Unregister it from the resource owner */
2059  if (vfdP->resowner)
2060  ResourceOwnerForgetFile(vfdP->resowner, file);
2061 
2062  /*
2063  * Return the Vfd slot to the free list
2064  */
2065  FreeVfd(file);
2066 }
2067 
2068 /*
2069  * FilePrefetch - initiate asynchronous read of a given range of the file.
2070  *
2071  * Returns 0 on success, otherwise an errno error code (like posix_fadvise()).
2072  *
2073  * posix_fadvise() is the simplest standardized interface that accomplishes
2074  * this.
2075  */
2076 int
2077 FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
2078 {
2079  Assert(FileIsValid(file));
2080 
2081  DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2082  file, VfdCache[file].fileName,
2083  (int64) offset, (int64) amount));
2084 
 /* Three build variants: posix_fadvise, the Darwin fcntl advisory, or a no-op. */
2085 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2086  {
2087  int returnCode;
2088 
 /* A negative FileAccess() result is passed straight back to the caller. */
2089  returnCode = FileAccess(file);
2090  if (returnCode < 0)
2091  return returnCode;
2092 
2093 retry:
2094  pgstat_report_wait_start(wait_event_info);
2095  returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2096  POSIX_FADV_WILLNEED);
 /* NOTE(review): listing line 2097 is absent from this extract. */
2098 
 /* The return value itself is tested for EINTR here, not errno. */
2099  if (returnCode == EINTR)
2100  goto retry;
2101 
2102  return returnCode;
2103  }
2104 #elif defined(__darwin__)
2105  {
2106  struct radvisory
2107  {
2108  off_t ra_offset; /* offset into the file */
2109  int ra_count; /* size of the read */
2110  } ra;
2111  int returnCode;
2112 
2113  returnCode = FileAccess(file);
2114  if (returnCode < 0)
2115  return returnCode;
2116 
2117  ra.ra_offset = offset;
 /* ra_count is an int, so the off_t amount is converted on assignment. */
2118  ra.ra_count = amount;
2119  pgstat_report_wait_start(wait_event_info);
2120  returnCode = fcntl(VfdCache[file].fd, F_RDADVISE, &ra);
 /* NOTE(review): listing line 2121 is absent from this extract. */
 /* Translate fcntl's -1/errno convention into the return-an-errno-code convention. */
2122  if (returnCode != -1)
2123  return 0;
2124  else
2125  return errno;
2126  }
2127 #else
 /* Neither mechanism is available in this build: report success without doing anything. */
2128  return 0;
2129 #endif
2130 }
2131 
2132 void
2133 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
2134 {
2135  int returnCode;
2136 
2137  Assert(FileIsValid(file));
2138 
2139  DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2140  file, VfdCache[file].fileName,
2141  (int64) offset, (int64) nbytes));
2142 
 /* An empty or negative range is a no-op. */
2143  if (nbytes <= 0)
2144  return;
2145 
 /* Files whose saved flags include PG_O_DIRECT are skipped. */
2146  if (VfdCache[file].fileFlags & PG_O_DIRECT)
2147  return;
2148 
 /* A failure to (re)open the file is silently ignored; there is no return value. */
2149  returnCode = FileAccess(file);
2150  if (returnCode < 0)
2151  return;
2152 
2153  pgstat_report_wait_start(wait_event_info);
2154  pg_flush_data(VfdCache[file].fd, offset, nbytes);
 /* NOTE(review): listing line 2155 is absent from this extract. */
2156 }
2157 
2158 ssize_t
2159 FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset,
2160  uint32 wait_event_info)
2161 {
2162  ssize_t returnCode;
2163  Vfd *vfdP;
2164 
2165  Assert(FileIsValid(file));
2166 
2167  DO_DB(elog(LOG, "FileReadV: %d (%s) " INT64_FORMAT " %d",
2168  file, VfdCache[file].fileName,
2169  (int64) offset,
2170  iovcnt));
2171 
 /* A negative FileAccess() result is passed straight back to the caller. */
2172  returnCode = FileAccess(file);
2173  if (returnCode < 0)
2174  return returnCode;
2175 
2176  vfdP = &VfdCache[file];
2177 
2178 retry:
2179  pgstat_report_wait_start(wait_event_info);
 /* Vectored read at an explicit offset. */
2180  returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
 /* NOTE(review): listing line 2181 is absent from this extract. */
2182 
2183  if (returnCode < 0)
2184  {
2185  /*
2186  * Windows may run out of kernel buffers and return "Insufficient
2187  * system resources" error. Wait a bit and retry to solve it.
2188  *
2189  * It is rumored that EINTR is also possible on some Unix filesystems,
2190  * in which case immediate retry is indicated.
2191  */
2192 #ifdef WIN32
2193  DWORD error = GetLastError();
2194 
2195  switch (error)
2196  {
2197  case ERROR_NO_SYSTEM_RESOURCES:
 /* Sleep briefly, then set errno to EINTR so the retry test below fires. */
2198  pg_usleep(1000L);
2199  errno = EINTR;
2200  break;
2201  default:
2202  _dosmaperr(error);
2203  break;
2204  }
2205 #endif
2206  /* OK to retry if interrupted */
2207  if (errno == EINTR)
2208  goto retry;
2209  }
2210 
 /* Otherwise return pg_preadv()'s result unchanged. */
2211  return returnCode;
2212 }
2213 
2214 ssize_t
2215 FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset,
2216  uint32 wait_event_info)
2217 {
2218  ssize_t returnCode;
2219  Vfd *vfdP;
2220 
2221  Assert(FileIsValid(file));
2222 
2223  DO_DB(elog(LOG, "FileWriteV: %d (%s) " INT64_FORMAT " %d",
2224  file, VfdCache[file].fileName,
2225  (int64) offset,
2226  iovcnt));
2227 
2228  returnCode = FileAccess(file);
2229  if (returnCode < 0)
2230  return returnCode;
2231 
2232  vfdP = &VfdCache[file];
2233 
2234  /*
2235  * If enforcing temp_file_limit and it's a temp file, check to see if the
2236  * write would overrun temp_file_limit, and throw error if so. Note: it's
2237  * really a modularity violation to throw error here; we should set errno
2238  * and return -1. However, there's no way to report a suitable error
2239  * message if we do that. All current callers would just throw error
2240  * immediately anyway, so this is safe at present.
2241  */
2242  if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2243  {
2244  off_t past_write = offset;
2245 
2246  for (int i = 0; i < iovcnt; ++i)
2247  past_write += iov[i].iov_len;
2248 
2249  if (past_write > vfdP->fileSize)
2250  {
2251  uint64 newTotal = temporary_files_size;
2252 
2253  newTotal += past_write - vfdP->fileSize;
2254  if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2255  ereport(ERROR,
2256  (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2257  errmsg("temporary file size exceeds \"temp_file_limit\" (%dkB)",
2258  temp_file_limit)));
2259  }
2260  }
2261 
2262 retry:
2263  pgstat_report_wait_start(wait_event_info);
2264  returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
2266 
2267  if (returnCode >= 0)
2268  {
2269  /*
2270  * Some callers expect short writes to set errno, and traditionally we
2271  * have assumed that they imply disk space shortage. We don't want to
2272  * waste CPU cycles adding up the total size here, so we'll just set
2273  * it for all successful writes in case such a caller determines that
2274  * the write was short and ereports "%m".
2275  */
2276  errno = ENOSPC;
2277 
2278  /*
2279  * Maintain fileSize and temporary_files_size if it's a temp file.
2280  */
2281  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2282  {
2283  off_t past_write = offset + returnCode;
2284 
2285  if (past_write > vfdP->fileSize)
2286  {
2287  temporary_files_size += past_write - vfdP->fileSize;
2288  vfdP->fileSize = past_write;
2289  }
2290  }
2291  }
2292  else
2293  {
2294  /*
2295  * See comments in FileReadV()
2296  */
2297 #ifdef WIN32
2298  DWORD error = GetLastError();
2299 
2300  switch (error)
2301  {
2302  case ERROR_NO_SYSTEM_RESOURCES:
2303  pg_usleep(1000L);
2304  errno = EINTR;
2305  break;
2306  default:
2307  _dosmaperr(error);
2308  break;
2309  }
2310 #endif
2311  /* OK to retry if interrupted */
2312  if (errno == EINTR)
2313  goto retry;
2314  }
2315 
2316  return returnCode;
2317 }
2318 
2319 int
2320 FileSync(File file, uint32 wait_event_info)
2321 {
2322  int returnCode;
2323 
2324  Assert(FileIsValid(file));
2325 
2326  DO_DB(elog(LOG, "FileSync: %d (%s)",
2327  file, VfdCache[file].fileName));
2328 
2329  returnCode = FileAccess(file);
2330  if (returnCode < 0)
2331  return returnCode;
2332 
2333  pgstat_report_wait_start(wait_event_info);
2334  returnCode = pg_fsync(VfdCache[file].fd);
2336 
2337  return returnCode;
2338 }
2339 
2340 /*
2341  * Zero a region of the file.
2342  *
2343  * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2344  * appropriate error.
2345  */
2346 int
2347 FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
2348 {
2349  int returnCode;
2350  ssize_t written;
2351 
2352  Assert(FileIsValid(file));
2353 
2354  DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2355  file, VfdCache[file].fileName,
2356  (int64) offset, (int64) amount));
2357 
2358  returnCode = FileAccess(file);
2359  if (returnCode < 0)
2360  return returnCode;
2361 
2362  pgstat_report_wait_start(wait_event_info);
2363  written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2365 
2366  if (written < 0)
2367  return -1;
2368  else if (written != amount)
2369  {
2370  /* if errno is unset, assume problem is no disk space */
2371  if (errno == 0)
2372  errno = ENOSPC;
2373  return -1;
2374  }
2375 
2376  return 0;
2377 }
2378 
2379 /*
2380  * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
2381  * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
2382  * use FileZero() instead.
2383  *
2384  * Note that at least glibc() implements posix_fallocate() in userspace if not
2385  * implemented by the filesystem. That's not the case for all environments
2386  * though.
2387  *
2388  * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2389  * appropriate error.
2390  */
2391 int
2392 FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
2393 {
2394 #ifdef HAVE_POSIX_FALLOCATE
2395  int returnCode;
2396 
2397  Assert(FileIsValid(file));
2398 
2399  DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2400  file, VfdCache[file].fileName,
2401  (int64) offset, (int64) amount));
2402 
2403  returnCode = FileAccess(file);
2404  if (returnCode < 0)
2405  return -1;
2406 
2407 retry:
2408  pgstat_report_wait_start(wait_event_info);
2409  returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2411 
2412  if (returnCode == 0)
2413  return 0;
2414  else if (returnCode == EINTR)
2415  goto retry;
2416 
2417  /* for compatibility with %m printing etc */
2418  errno = returnCode;
2419 
2420  /*
2421  * Return in cases of a "real" failure, if fallocate is not supported,
2422  * fall through to the FileZero() backed implementation.
2423  */
2424  if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
2425  return -1;
2426 #endif
2427 
2428  return FileZero(file, offset, amount, wait_event_info);
2429 }
2430 
2431 off_t
2433 {
2434  Assert(FileIsValid(file));
2435 
2436  DO_DB(elog(LOG, "FileSize %d (%s)",
2437  file, VfdCache[file].fileName));
2438 
2439  if (FileIsNotOpen(file))
2440  {
2441  if (FileAccess(file) < 0)
2442  return (off_t) -1;
2443  }
2444 
2445  return lseek(VfdCache[file].fd, 0, SEEK_END);
2446 }
2447 
2448 int
2449 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2450 {
2451  int returnCode;
2452 
2453  Assert(FileIsValid(file));
2454 
2455  DO_DB(elog(LOG, "FileTruncate %d (%s)",
2456  file, VfdCache[file].fileName));
2457 
2458  returnCode = FileAccess(file);
2459  if (returnCode < 0)
2460  return returnCode;
2461 
2462  pgstat_report_wait_start(wait_event_info);
2463  returnCode = pg_ftruncate(VfdCache[file].fd, offset);
2465 
2466  if (returnCode == 0 && VfdCache[file].fileSize > offset)
2467  {
2468  /* adjust our state for truncation of a temp file */
2469  Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2470  temporary_files_size -= VfdCache[file].fileSize - offset;
2471  VfdCache[file].fileSize = offset;
2472  }
2473 
2474  return returnCode;
2475 }
2476 
2477 /*
2478  * Return the pathname associated with an open file.
2479  *
2480  * The returned string points to an internal buffer, which is valid until
2481  * the file is closed.
2482  */
2483 char *
2485 {
2486  Assert(FileIsValid(file));
2487 
2488  return VfdCache[file].fileName;
2489 }
2490 
2491 /*
2492  * Return the raw file descriptor of an opened file.
2493  *
2494  * The returned file descriptor will be valid until the file is closed, but
2495  * there are a lot of things that can make that happen. So the caller should
2496  * be careful not to do much of anything else before it finishes using the
2497  * returned file descriptor.
2498  */
2499 int
2501 {
2502  Assert(FileIsValid(file));
2503  return VfdCache[file].fd;
2504 }
2505 
2506 /*
2507  * FileGetRawFlags - returns the file flags on open(2)
2508  */
2509 int
2511 {
2512  Assert(FileIsValid(file));
2513  return VfdCache[file].fileFlags;
2514 }
2515 
2516 /*
2517  * FileGetRawMode - returns the mode bitmask passed to open(2)
2518  */
2519 mode_t
2521 {
2522  Assert(FileIsValid(file));
2523  return VfdCache[file].fileMode;
2524 }
2525 
2526 /*
2527  * Make room for another allocatedDescs[] array entry if needed and possible.
2528  * Returns true if an array element is available.
2529  */
2530 static bool
2532 {
2533  AllocateDesc *newDescs;
2534  int newMax;
2535 
2536  /* Quick out if array already has a free slot. */
2538  return true;
2539 
2540  /*
2541  * If the array hasn't yet been created in the current process, initialize
2542  * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2543  * we will ever need, anyway. We don't want to look at max_safe_fds
2544  * immediately because set_max_safe_fds() may not have run yet.
2545  */
2546  if (allocatedDescs == NULL)
2547  {
2548  newMax = FD_MINFREE / 3;
2549  newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2550  /* Out of memory already? Treat as fatal error. */
2551  if (newDescs == NULL)
2552  ereport(ERROR,
2553  (errcode(ERRCODE_OUT_OF_MEMORY),
2554  errmsg("out of memory")));
2555  allocatedDescs = newDescs;
2556  maxAllocatedDescs = newMax;
2557  return true;
2558  }
2559 
2560  /*
2561  * Consider enlarging the array beyond the initial allocation used above.
2562  * By the time this happens, max_safe_fds should be known accurately.
2563  *
2564  * We mustn't let allocated descriptors hog all the available FDs, and in
2565  * practice we'd better leave a reasonable number of FDs for VFD use. So
2566  * set the maximum to max_safe_fds / 3. (This should certainly be at
2567  * least as large as the initial size, FD_MINFREE / 3, so we aren't
2568  * tightening the restriction here.) Recall that "external" FDs are
2569  * allowed to consume another third of max_safe_fds.
2570  */
2571  newMax = max_safe_fds / 3;
2572  if (newMax > maxAllocatedDescs)
2573  {
2574  newDescs = (AllocateDesc *) realloc(allocatedDescs,
2575  newMax * sizeof(AllocateDesc));
2576  /* Treat out-of-memory as a non-fatal error. */
2577  if (newDescs == NULL)
2578  return false;
2579  allocatedDescs = newDescs;
2580  maxAllocatedDescs = newMax;
2581  return true;
2582  }
2583 
2584  /* Can't enlarge allocatedDescs[] any more. */
2585  return false;
2586 }
2587 
2588 /*
2589  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2590  * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2591  * necessary to open the file. When done, call FreeFile rather than fclose.
2592  *
2593  * Note that files that will be open for any significant length of time
2594  * should NOT be handled this way, since they cannot share kernel file
2595  * descriptors with other files; there is grave risk of running out of FDs
2596  * if anyone locks down too many FDs. Most callers of this routine are
2597  * simply reading a config file that they will read and close immediately.
2598  *
2599  * fd.c will automatically close all files opened with AllocateFile at
2600  * transaction commit or abort; this prevents FD leakage if a routine
2601  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2602  *
2603  * Ideally this should be the *only* direct call of fopen() in the backend.
2604  */
2605 FILE *
2606 AllocateFile(const char *name, const char *mode)
2607 {
2608  FILE *file;
2609 
2610  DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2612 
2613  /* Can we allocate another non-virtual FD? */
2614  if (!reserveAllocatedDesc())
2615  ereport(ERROR,
2616  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2617  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2618  maxAllocatedDescs, name)));
2619 
2620  /* Close excess kernel FDs. */
2621  ReleaseLruFiles();
2622 
2623 TryAgain:
2624  if ((file = fopen(name, mode)) != NULL)
2625  {
2627 
2628  desc->kind = AllocateDescFile;
2629  desc->desc.file = file;
2632  return desc->desc.file;
2633  }
2634 
2635  if (errno == EMFILE || errno == ENFILE)
2636  {
2637  int save_errno = errno;
2638 
2639  ereport(LOG,
2640  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2641  errmsg("out of file descriptors: %m; release and retry")));
2642  errno = 0;
2643  if (ReleaseLruFile())
2644  goto TryAgain;
2645  errno = save_errno;
2646  }
2647 
2648  return NULL;
2649 }
2650 
2651 /*
2652  * Open a file with OpenTransientFilePerm() and pass default file mode for
2653  * the fileMode parameter.
2654  */
2655 int
2656 OpenTransientFile(const char *fileName, int fileFlags)
2657 {
2658  return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2659 }
2660 
2661 /*
2662  * Like AllocateFile, but returns an unbuffered fd like open(2)
2663  */
2664 int
2665 OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2666 {
2667  int fd;
2668 
2669  DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2670  numAllocatedDescs, fileName));
2671 
2672  /* Can we allocate another non-virtual FD? */
2673  if (!reserveAllocatedDesc())
2674  ereport(ERROR,
2675  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2676  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2677  maxAllocatedDescs, fileName)));
2678 
2679  /* Close excess kernel FDs. */
2680  ReleaseLruFiles();
2681 
2682  fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2683 
2684  if (fd >= 0)
2685  {
2687 
2688  desc->kind = AllocateDescRawFD;
2689  desc->desc.fd = fd;
2692 
2693  return fd;
2694  }
2695 
2696  return -1; /* failure */
2697 }
2698 
2699 /*
2700  * Routines that want to initiate a pipe stream should use OpenPipeStream
2701  * rather than plain popen(). This lets fd.c deal with freeing FDs if
2702  * necessary. When done, call ClosePipeStream rather than pclose.
2703  *
2704  * This function also ensures that the popen'd program is run with default
2705  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2706  * uses. This ensures desirable response to, eg, closing a read pipe early.
2707  */
2708 FILE *
2709 OpenPipeStream(const char *command, const char *mode)
2710 {
2711  FILE *file;
2712  int save_errno;
2713 
2714  DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2715  numAllocatedDescs, command));
2716 
2717  /* Can we allocate another non-virtual FD? */
2718  if (!reserveAllocatedDesc())
2719  ereport(ERROR,
2720  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2721  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2722  maxAllocatedDescs, command)));
2723 
2724  /* Close excess kernel FDs. */
2725  ReleaseLruFiles();
2726 
2727 TryAgain:
2728  fflush(NULL);
2730  errno = 0;
2731  file = popen(command, mode);
2732  save_errno = errno;
2734  errno = save_errno;
2735  if (file != NULL)
2736  {
2738 
2739  desc->kind = AllocateDescPipe;
2740  desc->desc.file = file;
2743  return desc->desc.file;
2744  }
2745 
2746  if (errno == EMFILE || errno == ENFILE)
2747  {
2748  ereport(LOG,
2749  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2750  errmsg("out of file descriptors: %m; release and retry")));
2751  if (ReleaseLruFile())
2752  goto TryAgain;
2753  errno = save_errno;
2754  }
2755 
2756  return NULL;
2757 }
2758 
2759 /*
2760  * Free an AllocateDesc of any type.
2761  *
2762  * The argument *must* point into the allocatedDescs[] array.
2763  */
2764 static int
2766 {
2767  int result;
2768 
2769  /* Close the underlying object */
2770  switch (desc->kind)
2771  {
2772  case AllocateDescFile:
2773  result = fclose(desc->desc.file);
2774  break;
2775  case AllocateDescPipe:
2776  result = pclose(desc->desc.file);
2777  break;
2778  case AllocateDescDir:
2779  result = closedir(desc->desc.dir);
2780  break;
2781  case AllocateDescRawFD:
2782  result = close(desc->desc.fd);
2783  break;
2784  default:
2785  elog(ERROR, "AllocateDesc kind not recognized");
2786  result = 0; /* keep compiler quiet */
2787  break;
2788  }
2789 
2790  /* Compact storage in the allocatedDescs array */
2793 
2794  return result;
2795 }
2796 
2797 /*
2798  * Close a file returned by AllocateFile.
2799  *
2800  * Note we do not check fclose's return value --- it is up to the caller
2801  * to handle close errors.
2802  */
2803 int
2804 FreeFile(FILE *file)
2805 {
2806  int i;
2807 
2808  DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2809 
2810  /* Remove file from list of allocated files, if it's present */
2811  for (i = numAllocatedDescs; --i >= 0;)
2812  {
2813  AllocateDesc *desc = &allocatedDescs[i];
2814 
2815  if (desc->kind == AllocateDescFile && desc->desc.file == file)
2816  return FreeDesc(desc);
2817  }
2818 
2819  /* Only get here if someone passes us a file not in allocatedDescs */
2820  elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2821 
2822  return fclose(file);
2823 }
2824 
2825 /*
2826  * Close a file returned by OpenTransientFile.
2827  *
2828  * Note we do not check close's return value --- it is up to the caller
2829  * to handle close errors.
2830  */
2831 int
2833 {
2834  int i;
2835 
2836  DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2837 
2838  /* Remove fd from list of allocated files, if it's present */
2839  for (i = numAllocatedDescs; --i >= 0;)
2840  {
2841  AllocateDesc *desc = &allocatedDescs[i];
2842 
2843  if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2844  return FreeDesc(desc);
2845  }
2846 
2847  /* Only get here if someone passes us a file not in allocatedDescs */
2848  elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2849 
2850  return close(fd);
2851 }
2852 
2853 /*
2854  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2855  * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2856  * necessary to open the directory, and with closing it after an elog.
2857  * When done, call FreeDir rather than closedir.
2858  *
2859  * Returns NULL, with errno set, on failure. Note that failure detection
2860  * is commonly left to the following call of ReadDir or ReadDirExtended;
2861  * see the comments for ReadDir.
2862  *
2863  * Ideally this should be the *only* direct call of opendir() in the backend.
2864  */
2865 DIR *
2866 AllocateDir(const char *dirname)
2867 {
2868  DIR *dir;
2869 
2870  DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2871  numAllocatedDescs, dirname));
2872 
2873  /* Can we allocate another non-virtual FD? */
2874  if (!reserveAllocatedDesc())
2875  ereport(ERROR,
2876  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2877  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2878  maxAllocatedDescs, dirname)));
2879 
2880  /* Close excess kernel FDs. */
2881  ReleaseLruFiles();
2882 
2883 TryAgain:
2884  if ((dir = opendir(dirname)) != NULL)
2885  {
2887 
2888  desc->kind = AllocateDescDir;
2889  desc->desc.dir = dir;
2892  return desc->desc.dir;
2893  }
2894 
2895  if (errno == EMFILE || errno == ENFILE)
2896  {
2897  int save_errno = errno;
2898 
2899  ereport(LOG,
2900  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2901  errmsg("out of file descriptors: %m; release and retry")));
2902  errno = 0;
2903  if (ReleaseLruFile())
2904  goto TryAgain;
2905  errno = save_errno;
2906  }
2907 
2908  return NULL;
2909 }
2910 
2911 /*
2912  * Read a directory opened with AllocateDir, ereport'ing any error.
2913  *
2914  * This is easier to use than raw readdir() since it takes care of some
2915  * otherwise rather tedious and error-prone manipulation of errno. Also,
2916  * if you are happy with a generic error message for AllocateDir failure,
2917  * you can just do
2918  *
2919  * dir = AllocateDir(path);
2920  * while ((dirent = ReadDir(dir, path)) != NULL)
2921  * process dirent;
2922  * FreeDir(dir);
2923  *
2924  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2925  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2926  * use this shortcut.)
2927  *
2928  * The pathname passed to AllocateDir must be passed to this routine too,
2929  * but it is only used for error reporting.
2930  */
2931 struct dirent *
2932 ReadDir(DIR *dir, const char *dirname)
2933 {
2934  return ReadDirExtended(dir, dirname, ERROR);
2935 }
2936 
2937 /*
2938  * Alternate version of ReadDir that allows caller to specify the elevel
2939  * for any error report (whether it's reporting an initial failure of
2940  * AllocateDir or a subsequent directory read failure).
2941  *
2942  * If elevel < ERROR, returns NULL after any error. With the normal coding
2943  * pattern, this will result in falling out of the loop immediately as
2944  * though the directory contained no (more) entries.
2945  */
2946 struct dirent *
2947 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2948 {
2949  struct dirent *dent;
2950 
2951  /* Give a generic message for AllocateDir failure, if caller didn't */
2952  if (dir == NULL)
2953  {
2954  ereport(elevel,
2956  errmsg("could not open directory \"%s\": %m",
2957  dirname)));
2958  return NULL;
2959  }
2960 
2961  errno = 0;
2962  if ((dent = readdir(dir)) != NULL)
2963  return dent;
2964 
2965  if (errno)
2966  ereport(elevel,
2968  errmsg("could not read directory \"%s\": %m",
2969  dirname)));
2970  return NULL;
2971 }
2972 
2973 /*
2974  * Close a directory opened with AllocateDir.
2975  *
2976  * Returns closedir's return value (with errno set if it's not 0).
2977  * Note we do not check the return value --- it is up to the caller
2978  * to handle close errors if wanted.
2979  *
2980  * Does nothing if dir == NULL; we assume that directory open failure was
2981  * already reported if desired.
2982  */
2983 int
2985 {
2986  int i;
2987 
2988  /* Nothing to do if AllocateDir failed */
2989  if (dir == NULL)
2990  return 0;
2991 
2992  DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2993 
2994  /* Remove dir from list of allocated dirs, if it's present */
2995  for (i = numAllocatedDescs; --i >= 0;)
2996  {
2997  AllocateDesc *desc = &allocatedDescs[i];
2998 
2999  if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
3000  return FreeDesc(desc);
3001  }
3002 
3003  /* Only get here if someone passes us a dir not in allocatedDescs */
3004  elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
3005 
3006  return closedir(dir);
3007 }
3008 
3009 
3010 /*
3011  * Close a pipe stream returned by OpenPipeStream.
3012  */
3013 int
3014 ClosePipeStream(FILE *file)
3015 {
3016  int i;
3017 
3018  DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
3019 
3020  /* Remove file from list of allocated files, if it's present */
3021  for (i = numAllocatedDescs; --i >= 0;)
3022  {
3023  AllocateDesc *desc = &allocatedDescs[i];
3024 
3025  if (desc->kind == AllocateDescPipe && desc->desc.file == file)
3026  return FreeDesc(desc);
3027  }
3028 
3029  /* Only get here if someone passes us a file not in allocatedDescs */
3030  elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
3031 
3032  return pclose(file);
3033 }
3034 
3035 /*
3036  * closeAllVfds
3037  *
3038  * Force all VFDs into the physically-closed state, so that the fewest
3039  * possible number of kernel file descriptors are in use. There is no
3040  * change in the logical state of the VFDs.
3041  */
3042 void
3044 {
3045  Index i;
3046 
3047  if (SizeVfdCache > 0)
3048  {
3049  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3050  for (i = 1; i < SizeVfdCache; i++)
3051  {
3052  if (!FileIsNotOpen(i))
3053  LruDelete(i);
3054  }
3055  }
3056 }
3057 
3058 
3059 /*
3060  * SetTempTablespaces
3061  *
3062  * Define a list (actually an array) of OIDs of tablespaces to use for
3063  * temporary files. This list will be used until end of transaction,
3064  * unless this function is called again before then. It is caller's
3065  * responsibility that the passed-in array has adequate lifespan (typically
3066  * it'd be allocated in TopTransactionContext).
3067  *
3068  * Some entries of the array may be InvalidOid, indicating that the current
3069  * database's default tablespace should be used.
3070  */
3071 void
3072 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
3073 {
3074  Assert(numSpaces >= 0);
3075  tempTableSpaces = tableSpaces;
3076  numTempTableSpaces = numSpaces;
3077 
3078  /*
3079  * Select a random starting point in the list. This is to minimize
3080  * conflicts between backends that are most likely sharing the same list
3081  * of temp tablespaces. Note that if we create multiple temp files in the
3082  * same transaction, we'll advance circularly through the list --- this
3083  * ensures that large temporary sort files are nicely spread across all
3084  * available tablespaces.
3085  */
3086  if (numSpaces > 1)
3088  0, numSpaces - 1);
3089  else
3090  nextTempTableSpace = 0;
3091 }
3092 
3093 /*
3094  * TempTablespacesAreSet
3095  *
3096  * Returns true if SetTempTablespaces has been called in current transaction.
3097  * (This is just so that tablespaces.c doesn't need its own per-transaction
3098  * state.)
3099  */
3100 bool
3102 {
3103  return (numTempTableSpaces >= 0);
3104 }
3105 
3106 /*
3107  * GetTempTablespaces
3108  *
3109  * Populate an array with the OIDs of the tablespaces that should be used for
3110  * temporary files. (Some entries may be InvalidOid, indicating that the
3111  * current database's default tablespace should be used.) At most numSpaces
3112  * entries will be filled.
3113  * Returns the number of OIDs that were copied into the output array.
3114  */
3115 int
3116 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
3117 {
3118  int i;
3119 
3121  for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
3122  tableSpaces[i] = tempTableSpaces[i];
3123 
3124  return i;
3125 }
3126 
3127 /*
3128  * GetNextTempTableSpace
3129  *
3130  * Select the next temp tablespace to use. A result of InvalidOid means
3131  * to use the current database's default tablespace.
3132  */
3133 Oid
3135 {
3136  if (numTempTableSpaces > 0)
3137  {
3138  /* Advance nextTempTableSpace counter with wraparound */
3140  nextTempTableSpace = 0;
3142  }
3143  return InvalidOid;
3144 }
3145 
3146 
3147 /*
3148  * AtEOSubXact_Files
3149  *
3150  * Take care of subtransaction commit/abort. At abort, we close temp files
3151  * that the subtransaction may have opened. At commit, we reassign the
3152  * files that were opened to the parent subtransaction.
3153  */
3154 void
3155 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
3156  SubTransactionId parentSubid)
3157 {
3158  Index i;
3159 
3160  for (i = 0; i < numAllocatedDescs; i++)
3161  {
3162  if (allocatedDescs[i].create_subid == mySubid)
3163  {
3164  if (isCommit)
3165  allocatedDescs[i].create_subid = parentSubid;
3166  else
3167  {
3168  /* have to recheck the item after FreeDesc (ugly) */
3169  FreeDesc(&allocatedDescs[i--]);
3170  }
3171  }
3172  }
3173 }
3174 
3175 /*
3176  * AtEOXact_Files
3177  *
3178  * This routine is called during transaction commit or abort. All still-open
3179  * per-transaction temporary file VFDs are closed, which also causes the
3180  * underlying files to be deleted (although they should've been closed already
3181  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
3182  * closed. We also forget any transaction-local temp tablespace list.
3183  *
3184  * The isCommit flag is used only to decide whether to emit warnings about
3185  * unclosed files.
3186  */
3187 void
3188 AtEOXact_Files(bool isCommit)
3189 {
3190  CleanupTempFiles(isCommit, false);
3191  tempTableSpaces = NULL;
3192  numTempTableSpaces = -1;
3193 }
3194 
3195 /*
3196  * BeforeShmemExit_Files
3197  *
3198  * before_shmem_exit hook to clean up temp files during backend shutdown.
3199  * Here, we want to clean up *all* temp files including interXact ones.
3200  */
3201 static void
3203 {
3204  CleanupTempFiles(false, true);
3205 
3206  /* prevent further temp files from being created */
3207 #ifdef USE_ASSERT_CHECKING
3208  temporary_files_allowed = false;
3209 #endif
3210 }
3211 
3212 /*
3213  * Close temporary files and delete their underlying files.
3214  *
3215  * isCommit: if true, this is normal transaction commit, and we don't
3216  * expect any remaining files; warn if there are some.
3217  *
3218  * isProcExit: if true, this is being called as the backend process is
3219  * exiting. If that's the case, we should remove all temporary files; if
3220  * that's not the case, we are being called for transaction commit/abort
3221  * and should only remove transaction-local temp files. In either case,
3222  * also clean up "allocated" stdio files, dirs and fds.
3223  */
3224 static void
3225 CleanupTempFiles(bool isCommit, bool isProcExit)
3226 {
3227  Index i;
3228 
3229  /*
3230  * Careful here: at proc_exit we need extra cleanup, not just
3231  * xact_temporary files.
3232  */
3233  if (isProcExit || have_xact_temporary_files)
3234  {
3235  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3236  for (i = 1; i < SizeVfdCache; i++)
3237  {
3238  unsigned short fdstate = VfdCache[i].fdstate;
3239 
3240  if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3241  VfdCache[i].fileName != NULL)
3242  {
3243  /*
3244  * If we're in the process of exiting a backend process, close
3245  * all temporary files. Otherwise, only close temporary files
3246  * local to the current transaction. They should be closed by
3247  * the ResourceOwner mechanism already, so this is just a
3248  * debugging cross-check.
3249  */
3250  if (isProcExit)
3251  FileClose(i);
3252  else if (fdstate & FD_CLOSE_AT_EOXACT)
3253  {
3254  elog(WARNING,
3255  "temporary file %s not closed at end-of-transaction",
3256  VfdCache[i].fileName);
3257  FileClose(i);
3258  }
3259  }
3260  }
3261 
3262  have_xact_temporary_files = false;
3263  }
3264 
3265  /* Complain if any allocated files remain open at commit. */
3266  if (isCommit && numAllocatedDescs > 0)
3267  elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3269 
3270  /* Clean up "allocated" stdio files, dirs and fds. */
3271  while (numAllocatedDescs > 0)
3272  FreeDesc(&allocatedDescs[0]);
3273 }
3274 
3275 
3276 /*
3277  * Remove temporary and temporary relation files left over from a prior
3278  * postmaster session
3279  *
3280  * This should be called during postmaster startup. It will forcibly
3281  * remove any leftover files created by OpenTemporaryFile and any leftover
3282  * temporary relation files created by mdcreate.
3283  *
3284  * During post-backend-crash restart cycle, this routine is called when
3285  * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3286  * queries are using temp files could result in useless storage usage that can
3287  * only be reclaimed by a service restart. The argument against enabling it is
3288  * that someone might want to examine the temporary files for debugging
3289  * purposes. This does however mean that OpenTemporaryFile had better allow for
3290  * collision with an existing temp file name.
3291  *
3292  * NOTE: this function and its subroutines generally report syscall failures
3293  * with ereport(LOG) and keep going. Removing temp files is not so critical
3294  * that we should fail to start the database when we can't do it.
3295  */
3296 void
3298 {
3299  char temp_path[MAXPGPATH + sizeof(PG_TBLSPC_DIR) + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3300  DIR *spc_dir;
3301  struct dirent *spc_de;
3302 
3303  /*
3304  * First process temp files in pg_default ($PGDATA/base)
3305  */
3306  snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3307  RemovePgTempFilesInDir(temp_path, true, false);
3308  RemovePgTempRelationFiles("base");
3309 
3310  /*
3311  * Cycle through temp directories for all non-default tablespaces.
3312  */
3313  spc_dir = AllocateDir(PG_TBLSPC_DIR);
3314 
3315  while ((spc_de = ReadDirExtended(spc_dir, PG_TBLSPC_DIR, LOG)) != NULL)
3316  {
3317  if (strcmp(spc_de->d_name, ".") == 0 ||
3318  strcmp(spc_de->d_name, "..") == 0)
3319  continue;
3320 
3321  snprintf(temp_path, sizeof(temp_path), "%s/%s/%s/%s",
3324  RemovePgTempFilesInDir(temp_path, true, false);
3325 
3326  snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
3328  RemovePgTempRelationFiles(temp_path);
3329  }
3330 
3331  FreeDir(spc_dir);
3332 
3333  /*
3334  * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3335  * DataDir as well. However, that is *not* cleaned here because doing so
3336  * would create a race condition. It's done separately, earlier in
3337  * postmaster startup.
3338  */
3339 }
3340 
3341 /*
3342  * Process one pgsql_tmp directory for RemovePgTempFiles.
3343  *
3344  * If missing_ok is true, it's all right for the named directory to not exist.
3345  * Any other problem results in a LOG message. (missing_ok should be true at
3346  * the top level, since pgsql_tmp directories are not created until needed.)
3347  *
3348  * At the top level, this should be called with unlink_all = false, so that
3349  * only files matching the temporary name prefix will be unlinked. When
3350  * recursing it will be called with unlink_all = true to unlink everything
3351  * under a top-level temporary directory.
3352  *
3353  * (These two flags could be replaced by one, but it seems clearer to keep
3354  * them separate.)
3355  */
3356 void
3357 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3358 {
3359  DIR *temp_dir;
3360  struct dirent *temp_de;
3361  char rm_path[MAXPGPATH * 2];
3362 
3363  temp_dir = AllocateDir(tmpdirname);
3364 
3365  if (temp_dir == NULL && errno == ENOENT && missing_ok)
3366  return;
3367 
3368  while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3369  {
3370  if (strcmp(temp_de->d_name, ".") == 0 ||
3371  strcmp(temp_de->d_name, "..") == 0)
3372  continue;
3373 
3374  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3375  tmpdirname, temp_de->d_name);
3376 
3377  if (unlink_all ||
3378  strncmp(temp_de->d_name,
3380  strlen(PG_TEMP_FILE_PREFIX)) == 0)
3381  {
3382  PGFileType type = get_dirent_type(rm_path, temp_de, false, LOG);
3383 
3384  if (type == PGFILETYPE_ERROR)
3385  continue;
3386  else if (type == PGFILETYPE_DIR)
3387  {
3388  /* recursively remove contents, then directory itself */
3389  RemovePgTempFilesInDir(rm_path, false, true);
3390 
3391  if (rmdir(rm_path) < 0)
3392  ereport(LOG,
3394  errmsg("could not remove directory \"%s\": %m",
3395  rm_path)));
3396  }
3397  else
3398  {
3399  if (unlink(rm_path) < 0)
3400  ereport(LOG,
3402  errmsg("could not remove file \"%s\": %m",
3403  rm_path)));
3404  }
3405  }
3406  else
3407  ereport(LOG,
3408  (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3409  rm_path)));
3410  }
3411 
3412  FreeDir(temp_dir);
3413 }
3414 
3415 /* Process one tablespace directory, look for per-DB subdirectories */
3416 static void
3417 RemovePgTempRelationFiles(const char *tsdirname)
3418 {
3419  DIR *ts_dir;
3420  struct dirent *de;
3421  char dbspace_path[MAXPGPATH * 2];
3422 
3423  ts_dir = AllocateDir(tsdirname);
3424 
3425  while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3426  {
3427  /*
3428  * We're only interested in the per-database directories, which have
3429  * numeric names. Note that this code will also (properly) ignore "."
3430  * and "..".
3431  */
3432  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3433  continue;
3434 
3435  snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3436  tsdirname, de->d_name);
3437  RemovePgTempRelationFilesInDbspace(dbspace_path);
3438  }
3439 
3440  FreeDir(ts_dir);
3441 }
3442 
3443 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3444 static void
3445 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3446 {
3447  DIR *dbspace_dir;
3448  struct dirent *de;
3449  char rm_path[MAXPGPATH * 2];
3450 
3451  dbspace_dir = AllocateDir(dbspacedirname);
3452 
3453  while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3454  {
3455  if (!looks_like_temp_rel_name(de->d_name))
3456  continue;
3457 
3458  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3459  dbspacedirname, de->d_name);
3460 
3461  if (unlink(rm_path) < 0)
3462  ereport(LOG,
3464  errmsg("could not remove file \"%s\": %m",
3465  rm_path)));
3466  }
3467 
3468  FreeDir(dbspace_dir);
3469 }
3470 
/*
 * Does "name" look like the file name of a temporary relation?
 *
 * Accepted shape: 't', digits, '_', digits, then optionally "_FORKNAME"
 * and/or ".SEGNO" (a dot followed by one or more digits), and nothing else.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	int			pos;
	int			savepos;

	/* Must start with "t". */
	if (name[0] != 't')
		return false;

	/* Followed by a non-empty string of digits and then an underscore. */
	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (pos == 1 || name[pos] != '_')
		return false;

	/* Followed by another nonempty string of digits. */
	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (savepos == pos)
		return false;

	/* We might have _forkname or .segment or both. */
	if (name[pos] == '_')
	{
		int			forkchar = forkname_chars(&name[pos + 1], NULL);

		if (forkchar <= 0)
			return false;
		pos += forkchar + 1;	/* skip the underscore and the fork name */
	}
	if (name[pos] == '.')
	{
		int			segchar;

		/* segchar counts the dot plus the digits that follow it */
		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
			;
		if (segchar <= 1)
			return false;
		pos += segchar;
	}

	/* Now we should be at the end. */
	if (name[pos] != '\0')
		return false;
	return true;
}
3519 
#ifdef HAVE_SYNCFS
/*
 * Call syncfs() on the file system containing "path".
 *
 * Failures to open the path or to sync are reported at LOG level and are
 * otherwise ignored.
 */
static void
do_syncfs(const char *path)
{
	int			fd;

	ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
							 path);

	fd = OpenTransientFile(path, O_RDONLY);
	if (fd < 0)
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
		return;
	}
	if (syncfs(fd) < 0)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not synchronize file system for file \"%s\": %m", path)));

	/* Always release the descriptor, whether or not syncfs() succeeded. */
	CloseTransientFile(fd);
}
#endif
3544 
3545 /*
3546  * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3547  * all potential filesystem, depending on recovery_init_sync_method setting.
3548  *
3549  * We fsync regular files and directories wherever they are, but we
3550  * follow symlinks only for pg_wal and immediately under pg_tblspc.
3551  * Other symlinks are presumed to point at files we're not responsible
3552  * for fsyncing, and might not have privileges to write at all.
3553  *
3554  * Errors are logged but not considered fatal; that's because this is used
3555  * only during database startup, to deal with the possibility that there are
3556  * issued-but-unsynced writes pending against the data directory. We want to
3557  * ensure that such writes reach disk before anything that's done in the new
3558  * run. However, aborting on error would result in failure to start for
3559  * harmless cases such as read-only files in the data directory, and that's
3560  * not good either.
3561  *
3562  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3563  * rewriting all changes again during recovery.
3564  *
3565  * Note we assume we're chdir'd into PGDATA to begin with.
3566  */
3567 void
3569 {
3570  bool xlog_is_symlink;
3571 
3572  /* We can skip this whole thing if fsync is disabled. */
3573  if (!enableFsync)
3574  return;
3575 
3576  /*
3577  * If pg_wal is a symlink, we'll need to recurse into it separately,
3578  * because the first walkdir below will ignore it.
3579  */
3580  xlog_is_symlink = false;
3581 
3582  {
3583  struct stat st;
3584 
3585  if (lstat("pg_wal", &st) < 0)
3586  ereport(LOG,
3588  errmsg("could not stat file \"%s\": %m",
3589  "pg_wal")));
3590  else if (S_ISLNK(st.st_mode))
3591  xlog_is_symlink = true;
3592  }
3593 
3594 #ifdef HAVE_SYNCFS
3596  {
3597  DIR *dir;
3598  struct dirent *de;
3599 
3600  /*
3601  * On Linux, we don't have to open every single file one by one. We
3602  * can use syncfs() to sync whole filesystems. We only expect
3603  * filesystem boundaries to exist where we tolerate symlinks, namely
3604  * pg_wal and the tablespaces, so we call syncfs() for each of those
3605  * directories.
3606  */
3607 
3608  /* Prepare to report progress syncing the data directory via syncfs. */
3610 
3611  /* Sync the top level pgdata directory. */
3612  do_syncfs(".");
3613  /* If any tablespaces are configured, sync each of those. */
3614  dir = AllocateDir(PG_TBLSPC_DIR);
3615  while ((de = ReadDirExtended(dir, PG_TBLSPC_DIR, LOG)))
3616  {
3617  char path[MAXPGPATH];
3618 
3619  if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3620  continue;
3621 
3622  snprintf(path, MAXPGPATH, "%s/%s", PG_TBLSPC_DIR, de->d_name);
3623  do_syncfs(path);
3624  }
3625  FreeDir(dir);
3626  /* If pg_wal is a symlink, process that too. */
3627  if (xlog_is_symlink)
3628  do_syncfs("pg_wal");
3629  return;
3630  }
3631 #endif /* !HAVE_SYNCFS */
3632 
3633 #ifdef PG_FLUSH_DATA_WORKS
3634  /* Prepare to report progress of the pre-fsync phase. */
3636 
3637  /*
3638  * If possible, hint to the kernel that we're soon going to fsync the data
3639  * directory and its contents. Errors in this step are even less
3640  * interesting than normal, so log them only at DEBUG1.
3641  */
3642  walkdir(".", pre_sync_fname, false, DEBUG1);
3643  if (xlog_is_symlink)
3644  walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3645  walkdir(PG_TBLSPC_DIR, pre_sync_fname, true, DEBUG1);
3646 #endif
3647 
3648  /* Prepare to report progress syncing the data directory via fsync. */
3650 
3651  /*
3652  * Now we do the fsync()s in the same order.
3653  *
3654  * The main call ignores symlinks, so in addition to specially processing
3655  * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3656  * process_symlinks = true. Note that if there are any plain directories
3657  * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3658  * so we don't worry about optimizing it.
3659  */
3660  walkdir(".", datadir_fsync_fname, false, LOG);
3661  if (xlog_is_symlink)
3662  walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3664 }
3665 
3666 /*
3667  * walkdir: recursively walk a directory, applying the action to each
3668  * regular file and directory (including the named directory itself).
3669  *
3670  * If process_symlinks is true, the action and recursion are also applied
3671  * to regular files and directories that are pointed to by symlinks in the
3672  * given directory; otherwise symlinks are ignored. Symlinks are always
3673  * ignored in subdirectories, ie we intentionally don't pass down the
3674  * process_symlinks flag to recursive calls.
3675  *
3676  * Errors are reported at level elevel, which might be ERROR or less.
3677  *
3678  * See also walkdir in file_utils.c, which is a frontend version of this
3679  * logic.
3680  */
3681 static void
3682 walkdir(const char *path,
3683  void (*action) (const char *fname, bool isdir, int elevel),
3684  bool process_symlinks,
3685  int elevel)
3686 {
3687  DIR *dir;
3688  struct dirent *de;
3689 
3690  dir = AllocateDir(path);
3691 
3692  while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3693  {
3694  char subpath[MAXPGPATH * 2];
3695 
3697 
3698  if (strcmp(de->d_name, ".") == 0 ||
3699  strcmp(de->d_name, "..") == 0)
3700  continue;
3701 
3702  snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3703 
3704  switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3705  {
3706  case PGFILETYPE_REG:
3707  (*action) (subpath, false, elevel);
3708  break;
3709  case PGFILETYPE_DIR:
3710  walkdir(subpath, action, false, elevel);
3711  break;
3712  default:
3713 
3714  /*
3715  * Errors are already reported directly by get_dirent_type(),
3716  * and any remaining symlinks and unknown file types are
3717  * ignored.
3718  */
3719  break;
3720  }
3721  }
3722 
3723  FreeDir(dir); /* we ignore any error here */
3724 
3725  /*
3726  * It's important to fsync the destination directory itself as individual
3727  * file fsyncs don't guarantee that the directory entry for the file is
3728  * synced. However, skip this if AllocateDir failed; the action function
3729  * might not be robust against that.
3730  */
3731  if (dir)
3732  (*action) (path, true, elevel);
3733 }
3734 
3735 
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Ignores errors trying to open unreadable files, and logs other errors at a
 * caller-specified level.  Directories are skipped entirely.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* unreadable files are silently skipped */
		if (errno == EACCES)
			return;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3781 
/*
 * walkdir action used by SyncDataDirectory: report startup progress for
 * "fname", then fsync it.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	/*
	 * Errors about unreadable files should be ignored silently, so ask
	 * fsync_fname_ext() for that by passing ignore_perm = true.
	 */
	fsync_fname_ext(fname, isdir, true, elevel);
}
3794 
/*
 * Remove "fname" (signature matches walkdir's action callback).
 *
 * For a directory, rmdir() is used and a missing directory (ENOENT) is not
 * reported; other failures are reported at elevel.  For anything else the
 * removal is delegated to PathNameDeleteTemporaryFile() with
 * error_on_failure = false; elevel is not used on that path.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}
3811 
3812 /*
3813  * fsync_fname_ext -- Try to fsync a file or directory
3814  *
3815  * If ignore_perm is true, ignore errors upon trying to open unreadable
3816  * files. Logs other errors at a caller-specified level.
3817  *
3818  * Returns 0 if the operation succeeded, -1 otherwise.
3819  */
3820 int
3821 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3822 {
3823  int fd;
3824  int flags;
3825  int returncode;
3826 
3827  /*
3828  * Some OSs require directories to be opened read-only whereas other
3829  * systems don't allow us to fsync files opened read-only; so we need both
3830  * cases here. Using O_RDWR will cause us to fail to fsync files that are
3831  * not writable by our userid, but we assume that's OK.
3832  */
3833  flags = PG_BINARY;
3834  if (!isdir)
3835  flags |= O_RDWR;
3836  else
3837  flags |= O_RDONLY;
3838 
3839  fd = OpenTransientFile(fname, flags);
3840 
3841  /*
3842  * Some OSs don't allow us to open directories at all (Windows returns
3843  * EACCES), just ignore the error in that case. If desired also silently
3844  * ignoring errors about unreadable files. Log others.
3845  */
3846  if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3847  return 0;
3848  else if (fd < 0 && ignore_perm && errno == EACCES)
3849  return 0;
3850  else if (fd < 0)
3851  {
3852  ereport(elevel,
3854  errmsg("could not open file \"%s\": %m", fname)));
3855  return -1;
3856  }
3857 
3858  returncode = pg_fsync(fd);
3859 
3860  /*
3861  * Some OSes don't allow us to fsync directories at all, so we can ignore
3862  * those errors. Anything else needs to be logged.
3863  */
3864  if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3865  {
3866  int save_errno;
3867 
3868  /* close file upon error, might not be in transaction context */
3869  save_errno = errno;
3870  (void) CloseTransientFile(fd);
3871  errno = save_errno;
3872 
3873  ereport(elevel,
3875  errmsg("could not fsync file \"%s\": %m", fname)));
3876  return -1;
3877  }
3878 
3879  if (CloseTransientFile(fd) != 0)
3880  {
3881  ereport(elevel,
3883  errmsg("could not close file \"%s\": %m", fname)));
3884  return -1;
3885  }
3886 
3887  return 0;
3888 }
3889 
3890 /*
3891  * fsync_parent_path -- fsync the parent path of a file or directory
3892  *
3893  * This is aimed at making file operations persistent on disk in case of
3894  * an OS crash or power failure.
3895  */
3896 static int
3897 fsync_parent_path(const char *fname, int elevel)
3898 {
3899  char parentpath[MAXPGPATH];
3900 
3901  strlcpy(parentpath, fname, MAXPGPATH);
3902  get_parent_directory(parentpath);
3903 
3904  /*
3905  * get_parent_directory() returns an empty string if the input argument is
3906  * just a file name (see comments in path.c), so handle that as being the
3907  * current directory.
3908  */
3909  if (strlen(parentpath) == 0)
3910  strlcpy(parentpath, ".", MAXPGPATH);
3911 
3912  if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3913  return -1;
3914 
3915  return 0;
3916 }
3917 
3918 /*
3919  * Create a PostgreSQL data sub-directory
3920  *
3921  * The data directory itself, and most of its sub-directories, are created at
3922  * initdb time, but we do have some occasions when we create directories in
3923  * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3924  * make sure that those directories are created consistently. Today, that means
3925  * making sure that the created directory has the correct permissions, which is
3926  * what pg_dir_create_mode tracks for us.
3927  *
3928  * Note that we also set the umask() based on what we understand the correct
3929  * permissions to be (see file_perm.c).
3930  *
3931  * For permissions other than the default, mkdir() can be used directly, but
3932  * be sure to consider carefully such cases -- a sub-directory with incorrect
3933  * permissions in a PostgreSQL data directory could cause backups and other
3934  * processes to fail.
3935  */
3936 int
3937 MakePGDirectory(const char *directoryName)
3938 {
3939  return mkdir(directoryName, pg_dir_create_mode);
3940 }
3941 
3942 /*
3943  * Return the passed-in error level, or PANIC if data_sync_retry is off.
3944  *
3945  * Failure to fsync any data file is cause for immediate panic, unless
3946  * data_sync_retry is enabled. Data may have been written to the operating
3947  * system and removed from our buffer pool already, and if we are running on
3948  * an operating system that forgets dirty data on write-back failure, there
3949  * may be only one copy of the data remaining: in the WAL. A later attempt to
3950  * fsync again might falsely report success. Therefore we must not allow any
3951  * further checkpoints to be attempted. data_sync_retry can in theory be
3952  * enabled on systems known not to drop dirty buffered data on write-back
3953  * failure (with the likely outcome that checkpoints will continue to fail
3954  * until the underlying problem is fixed).
3955  *
3956  * Any code that reports a failure from fsync() or related functions should
3957  * filter the error level with this function.
3958  */
3959 int
3960 data_sync_elevel(int elevel)
3961 {
3962  return data_sync_retry ? elevel : PANIC;
3963 }
3964 
3965 bool
3967 {
3968  bool result = true;
3969  int flags;
3970 
3971 #if PG_O_DIRECT == 0
3972  if (strcmp(*newval, "") != 0)
3973  {
3974  GUC_check_errdetail("\"%s\" is not supported on this platform.",
3975  "debug_io_direct");
3976  result = false;
3977  }
3978  flags = 0;
3979 #else
3980  List *elemlist;
3981  ListCell *l;
3982  char *rawstring;
3983 
3984  /* Need a modifiable copy of string */
3985  rawstring = pstrdup(*newval);
3986 
3987  if (!SplitGUCList(rawstring, ',', &elemlist))
3988  {
3989  GUC_check_errdetail("Invalid list syntax in parameter \"%s\"",
3990  "debug_io_direct");
3991  pfree(rawstring);
3992  list_free(elemlist);
3993  return false;
3994  }
3995 
3996  flags = 0;
3997  foreach(l, elemlist)
3998  {
3999  char *item = (char *) lfirst(l);
4000 
4001  if (pg_strcasecmp(item, "data") == 0)
4002  flags |= IO_DIRECT_DATA;
4003  else if (pg_strcasecmp(item, "wal") == 0)
4004  flags |= IO_DIRECT_WAL;
4005  else if (pg_strcasecmp(item, "wal_init") == 0)
4006  flags |= IO_DIRECT_WAL_INIT;
4007  else
4008  {
4009  GUC_check_errdetail("Invalid option \"%s\"", item);
4010  result = false;
4011  break;
4012  }
4013  }
4014 
4015  /*
4016  * It's possible to configure block sizes smaller than our assumed I/O
4017  * alignment size, which could result in invalid I/O requests.
4018  */
4019 #if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
4020  if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
4021  {
4022  GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small",
4023  "debug_io_direct", "XLOG_BLCKSZ");
4024  result = false;
4025  }
4026 #endif
4027 #if BLCKSZ < PG_IO_ALIGN_SIZE
4028  if (result && (flags & IO_DIRECT_DATA))
4029  {
4030  GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small",
4031  "debug_io_direct", "BLCKSZ");
4032  result = false;
4033  }
4034 #endif
4035 
4036  pfree(rawstring);
4037  list_free(elemlist);
4038 #endif
4039 
4040  if (!result)
4041  return result;
4042 
4043  /* Save the flags in *extra, for use by assign_debug_io_direct */
4044  *extra = guc_malloc(ERROR, sizeof(int));
4045  *((int *) *extra) = flags;
4046 
4047  return result;
4048 }
4049 
4050 void
4051 assign_debug_io_direct(const char *newval, void *extra)
4052 {
4053  int *flags = (int *) extra;
4054 
4055  io_direct_flags = *flags;
4056 }
4057 
4058 /* ResourceOwner callbacks */
4059 
4060 static void
4062 {
4063  File file = (File) DatumGetInt32(res);
4064  Vfd *vfdP;
4065 
4066  Assert(FileIsValid(file));
4067 
4068  vfdP = &VfdCache[file];
4069  vfdP->resowner = NULL;
4070 
4071  FileClose(file);
4072 }
4073 
4074 static char *
4076 {
4077  return psprintf("File %d", DatumGetInt32(res));
4078 }
void begin_startup_progress_phase(void)
Definition: startup.c:343
unsigned int uint32
Definition: c.h:506
#define Min(x, y)
Definition: c.h:1004
uint32 SubTransactionId
Definition: c.h:656
#define INT64_FORMAT
Definition: c.h:548
#define Assert(condition)
Definition: c.h:858
#define PG_BINARY
Definition: c.h:1273
unsigned int Index
Definition: c.h:614
#define MemSet(start, val, len)
Definition: c.h:1020
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:938
int fdatasync(int fildes)
#define OidIsValid(objectId)
Definition: c.h:775
size_t Size
Definition: c.h:605
int closedir(DIR *)
Definition: dirent.c:127
struct dirent * readdir(DIR *)
Definition: dirent.c:78
DIR * opendir(const char *)
Definition: dirent.c:33
int errcode_for_file_access(void)
Definition: elog.c:876
int errdetail(const char *fmt,...)
Definition: elog.c:1203
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define LOG
Definition: elog.h:31
#define FATAL
Definition: elog.h:41
#define WARNING
Definition: elog.h:36
#define DEBUG2
Definition: elog.h:29
#define PANIC
Definition: elog.h:42
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2932
static int pg_ftruncate(int fd, off_t length)
Definition: fd.c:703
int max_files_per_process
Definition: fd.c:146
void pg_flush_data(int fd, off_t offset, off_t nbytes)
Definition: fd.c:525
int FileGetRawDesc(File file)
Definition: fd.c:2500
int MakePGDirectory(const char *directoryName)
Definition: fd.c:3937
int FreeDir(DIR *dir)
Definition: fd.c:2984
int recovery_init_sync_method
Definition: fd.c:165
static const ResourceOwnerDesc file_resowner_desc
Definition: fd.c:361
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:2133
int pg_fsync_no_writethrough(int fd)
Definition: fd.c:441
#define FD_MINFREE
Definition: fd.c:138
static int numTempTableSpaces
Definition: fd.c:289
static bool ReleaseLruFile(void)
Definition: fd.c:1382
int io_direct_flags
Definition: fd.c:168
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2606
#define FD_DELETE_AT_CLOSE
Definition: fd.c:192
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1109
static int maxAllocatedDescs
Definition: fd.c:268
static void Delete(File file)
Definition: fd.c:1268
static int FreeDesc(AllocateDesc *desc)
Definition: fd.c:2765
static long tempFileCounter
Definition: fd.c:280
static char * ResOwnerPrintFile(Datum res)
Definition: fd.c:4075
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:782
static void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition: fd.c:377
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:3116
static int numAllocatedDescs
Definition: fd.c:267
File PathNameOpenTemporaryFile(const char *path, int mode)
Definition: fd.c:1901
static void LruDelete(File file)
Definition: fd.c:1287
int pg_fdatasync(int fd)
Definition: fd.c:480
#define FileIsValid(file)
Definition: fd.c:186
void assign_debug_io_direct(const char *newval, void *extra)
Definition: fd.c:4051
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2320
static int nfile
Definition: fd.c:222
int CloseTransientFile(int fd)
Definition: fd.c:2832
#define DO_DB(A)
Definition: fd.c:180
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1087
void closeAllVfds(void)
Definition: fd.c:3043
int max_safe_fds
Definition: fd.c:159
static File AllocateVfd(void)
Definition: fd.c:1414
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1861
void PathNameDeleteTemporaryDir(const char *dirname)
Definition: fd.c:1691
int ClosePipeStream(FILE *file)
Definition: fd.c:3014
void AtEOXact_Files(bool isCommit)
Definition: fd.c:3188
int FileGetRawFlags(File file)
Definition: fd.c:2510
static Size SizeVfdCache
Definition: fd.c:217
static int nextTempTableSpace
Definition: fd.c:290
#define FD_CLOSE_AT_EOXACT
Definition: fd.c:193
int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition: fd.c:3821
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3796
static void ResOwnerReleaseFile(Datum res)
Definition: fd.c:4061
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition: fd.c:3417
int FreeFile(FILE *file)
Definition: fd.c:2804
mode_t FileGetRawMode(File file)
Definition: fd.c:2520
static AllocateDesc * allocatedDescs
Definition: fd.c:269
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition: fd.c:964
static int FileAccess(File file)
Definition: fd.c:1492
static void FreeVfd(File file)
Definition: fd.c:1472
struct vfd Vfd
int pg_fsync_writethrough(int fd)
Definition: fd.c:461
void FileClose(File file)
Definition: fd.c:1978
FILE * OpenPipeStream(const char *command, const char *mode)
Definition: fd.c:2709
void ReleaseExternalFD(void)
Definition: fd.c:1239
#define FD_TEMP_FILE_LIMIT
Definition: fd.c:194
void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition: fd.c:3357
bool pg_file_exists(const char *name)
Definition: fd.c:503
void RemovePgTempFiles(void)
Definition: fd.c:3297
#define FileIsNotOpen(file)
Definition: fd.c:189
bool TempTablespacesAreSet(void)
Definition: fd.c:3101
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:756
int FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2392
int FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2077
int data_sync_elevel(int elevel)
Definition: fd.c:3960
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1575
static void Insert(File file)
Definition: fd.c:1313
AllocateDescKind
Definition: fd.c:248
@ AllocateDescDir
Definition: fd.c:251
@ AllocateDescPipe
Definition: fd.c:250
@ AllocateDescFile
Definition: fd.c:249
@ AllocateDescRawFD
Definition: fd.c:252
Oid GetNextTempTableSpace(void)
Definition: fd.c:3134
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1588
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3783
static void ReportTemporaryFileUsage(const char *path, off_t size)
Definition: fd.c:1528
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition: fd.c:1804
bool AcquireExternalFD(void)
Definition: fd.c:1186
static void RegisterTemporaryFile(File file)
Definition: fd.c:1547
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2947
#define NUM_RESERVED_FDS
Definition: fd.c:129
static Oid * tempTableSpaces
Definition: fd.c:288
static bool reserveAllocatedDesc(void)
Definition: fd.c:2531
void InitFileAccess(void)
Definition: fd.c:903
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition: fd.c:3445
File OpenTemporaryFile(bool interXact)
Definition: fd.c:1724
int durable_unlink(const char *fname, int elevel)
Definition: fd.c:872
static uint64 temporary_files_size
Definition: fd.c:236
void ReserveExternalFD(void)
Definition: fd.c:1221
char * FilePathName(File file)
Definition: fd.c:2484
bool looks_like_temp_rel_name(const char *name)
Definition: fd.c:3473
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1932
void set_max_safe_fds(void)
Definition: fd.c:1044
int pg_fsync(int fd)
Definition: fd.c:386
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition: fd.c:3225
#define VFD_CLOSED
Definition: fd.c:184
static bool have_xact_temporary_files
Definition: fd.c:228
static int LruInsert(File file)
Definition: fd.c:1335
static int numExternalFDs
Definition: fd.c:274
static int fsync_parent_path(const char *fname, int elevel)
Definition: fd.c:3897
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition: fd.c:1660
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition: fd.c:3155
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2656
void InitTemporaryFileAccess(void)
Definition: fd.c:933
static Vfd * VfdCache
Definition: fd.c:216
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:2665
bool data_sync_retry
Definition: fd.c:162
static void ReleaseLruFiles(void)
Definition: fd.c:1404
ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info)
Definition: fd.c:2215
void SyncDataDirectory(void)
Definition: fd.c:3568
int FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
Definition: fd.c:2347
off_t FileSize(File file)
Definition: fd.c:2432
ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, off_t offset, uint32 wait_event_info)
Definition: fd.c:2159
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2449
bool check_debug_io_direct(char **newval, void **extra, GucSource source)
Definition: fd.c:3966
static void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition: fd.c:372
static void BeforeShmemExit_Files(int code, Datum arg)
Definition: fd.c:3202
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition: fd.c:3682
int pg_truncate(const char *path, off_t length)
Definition: fd.c:720
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:3072
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2866
void TempTablespacePath(char *path, Oid tablespace)
Definition: fd.c:1779
#define IO_DIRECT_WAL
Definition: fd.h:55
#define IO_DIRECT_DATA
Definition: fd.h:54
#define IO_DIRECT_WAL_INIT
Definition: fd.h:56
int File
Definition: fd.h:51
#define PG_O_DIRECT
Definition: fd.h:97
int pg_file_create_mode
Definition: file_perm.c:19
int pg_dir_create_mode
Definition: file_perm.c:18
ssize_t pg_pwrite_zeros(int fd, size_t size, off_t offset)
Definition: file_utils.c:688
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:526
#define PG_TEMP_FILES_DIR
Definition: file_utils.h:62
#define PG_TEMP_FILE_PREFIX
Definition: file_utils.h:63
PGFileType
Definition: file_utils.h:19
@ PGFILETYPE_DIR
Definition: file_utils.h:23
@ PGFILETYPE_REG
Definition: file_utils.h:22
@ PGFILETYPE_ERROR
Definition: file_utils.h:20
@ DATA_DIR_SYNC_METHOD_SYNCFS
Definition: file_utils.h:30
@ DATA_DIR_SYNC_METHOD_FSYNC
Definition: file_utils.h:29
int MyProcPid
Definition: globals.c:46
bool enableFsync
Definition: globals.c:128
Oid MyDatabaseTableSpace
Definition: globals.c:95
void * guc_malloc(int elevel, size_t size)
Definition: guc.c:637
#define newval
#define GUC_check_errdetail
Definition: guc.h:476
GucSource
Definition: guc.h:108
int temp_file_limit
Definition: guc_tables.c:533
int log_temp_files
Definition: guc_tables.c:528
#define realloc(a, b)
Definition: header.h:60
#define free(a)
Definition: header.h:65
#define malloc(a)
Definition: header.h:50
#define close(a)
Definition: win32.h:12
void before_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:337
int j
Definition: isn.c:74
int i
Definition: isn.c:73
static void const char fflush(stdout)
void list_free(List *list)
Definition: list.c:1546
Datum subpath(PG_FUNCTION_ARGS)
Definition: ltree_op.c:310
char * pstrdup(const char *in)
Definition: mcxt.c:1696
void pfree(void *pointer)
Definition: mcxt.c:1521
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1541
void * palloc(Size size)
Definition: mcxt.c:1317
#define MAP_FAILED
Definition: mem.h:45
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
void * arg
static char * basedir
static PgChecksumMode mode
Definition: pg_checksums.c:56
#define MAXPGPATH
static ssize_t pg_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
Definition: pg_iovec.h:83
static ssize_t pg_preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset)
Definition: pg_iovec.h:44
#define lfirst(lc)
Definition: pg_list.h:172
uint64 pg_prng_uint64_range(pg_prng_state *state, uint64 rmin, uint64 rmax)
Definition: pg_prng.c:144
pg_prng_state pg_global_prng_state
Definition: pg_prng.c:34
static rewind_source * source
Definition: pg_rewind.c:89
static char * buf
Definition: pg_test_fsync.c:73
static char * tablespace
Definition: pgbench.c:216
void pgstat_report_tempfile(size_t filesize)
int pg_strcasecmp(const char *s1, const char *s2)
Definition: pgstrcasecmp.c:36
void get_parent_directory(char *path)
Definition: path.c:991
pqsigfunc pqsignal(int signo, pqsigfunc func)
#define snprintf
Definition: port.h:238
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
uintptr_t Datum
Definition: postgres.h:64
static Datum Int32GetDatum(int32 X)
Definition: postgres.h:212
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:202
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
static int fd(const char *x, int i)
Definition: preproc-init.c:105
char * psprintf(const char *fmt,...)
Definition: psprintf.c:46
int forkname_chars(const char *str, ForkNumber *fork)
Definition: relpath.c:81
#define PG_TBLSPC_DIR
Definition: relpath.h:41
#define TABLESPACE_VERSION_DIRECTORY
Definition: relpath.h:33
ResourceOwner CurrentResourceOwner
Definition: resowner.c:165
void ResourceOwnerForget(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition: resowner.c:554
void ResourceOwnerRemember(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition: resowner.c:514
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition: resowner.c:442
@ RESOURCE_RELEASE_AFTER_LOCKS
Definition: resowner.h:56
#define RELEASE_PRIO_FILES
Definition: resowner.h:76
void pg_usleep(long microsec)
Definition: signal.c:53
static pg_noinline void Size size
Definition: slab.c:607
static void error(void)
Definition: sql-dyntest.c:147
#define ereport_startup_progress(msg,...)
Definition: startup.h:18
SubTransactionId create_subid
Definition: fd.c:258
DIR * dir
Definition: fd.c:262
FILE * file
Definition: fd.c:261
int fd
Definition: fd.c:263
union AllocateDesc::@22 desc
AllocateDescKind kind
Definition: fd.c:257
Definition: dirent.c:26
Definition: pg_list.h:54
const char * name
Definition: resowner.h:93
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
__int64 st_size
Definition: win32_port.h:273
unsigned short st_mode
Definition: win32_port.h:268
Definition: fd.c:197
int fd
Definition: fd.c:198
int fileFlags
Definition: fd.c:207
File lruLessRecently
Definition: fd.c:203
File lruMoreRecently
Definition: fd.c:202
char * fileName
Definition: fd.c:205
ResourceOwner resowner
Definition: fd.c:200
unsigned short fdstate
Definition: fd.c:199
File nextFree
Definition: fd.c:201
mode_t fileMode
Definition: fd.c:208
off_t fileSize
Definition: fd.c:204
bool SplitGUCList(char *rawstring, char separator, List **namelist)
Definition: varlena.c:3685
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:85
static void pgstat_report_wait_end(void)
Definition: wait_event.h:101
const char * type
const char * name
#define fsync(fd)
Definition: win32_port.h:85
#define stat
Definition: win32_port.h:284
#define SIG_DFL
Definition: win32_port.h:163
#define EINTR
Definition: win32_port.h:374
#define EOPNOTSUPP
Definition: win32_port.h:398
#define SIGPIPE
Definition: win32_port.h:173
#define lstat(path, sb)
Definition: win32_port.h:285
#define S_ISDIR(m)
Definition: win32_port.h:325
void _dosmaperr(unsigned long)
Definition: win32error.c:177
#define S_ISLNK(m)
Definition: win32_port.h:344
#define mkdir(a, b)
Definition: win32_port.h:80
#define fstat
Definition: win32_port.h:283
#define ftruncate(a, b)
Definition: win32_port.h:82
#define SIG_IGN
Definition: win32_port.h:165
#define O_CLOEXEC
Definition: win32_port.h:359
#define O_DSYNC
Definition: win32_port.h:352
SubTransactionId GetCurrentSubTransactionId(void)
Definition: xact.c:790
int wal_sync_method
Definition: xlog.c:129
@ WAL_SYNC_METHOD_FSYNC_WRITETHROUGH
Definition: xlog.h:27
static const char * directory
Definition: zic.c:634