PostgreSQL Source Code  git master
fd.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  * Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have. (This is around 1024 on many modern
20  * operating systems, but may be lower on others.)
21  *
22  * VFDs are managed as an LRU pool, with actual OS file descriptors
23  * being opened and closed as needed. Obviously, if a routine is
24  * opened using these interfaces, all subsequent operations must also
25  * be through these interfaces (the File type is not a real file
26  * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
39  * for a long time, like relation files. It is the caller's responsibility
40  * to close them; there is no automatic mechanism in fd.c for that.
41  *
42  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43  * temporary files that have names so that they can be shared between
44  * backends. Such files are automatically closed and count against the
45  * temporary file limit of the backend that creates them, but unlike anonymous
46  * files they are not automatically deleted. See sharedfileset.c for a shared
47  * ownership mechanism that provides automatic cleanup for shared files when
48  * the last of a group of backends detaches.
49  *
50  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52  * They behave like the corresponding native functions, except that the handle
53  * is registered with the current subtransaction, and will be automatically
54  * closed at abort. These are intended mainly for short operations like
55  * reading a configuration file; there is a limit on the number of files that
56  * can be opened using these functions at any one time.
57  *
58  * Finally, BasicOpenFile is just a thin wrapper around open() that can
59  * release file descriptors in use by the virtual file descriptors if
60  * necessary. There is no automatic cleanup of file descriptors returned by
61  * BasicOpenFile; it is solely the caller's responsibility to close the file
62  * descriptor by calling close(2).
63  *
64  * If a non-virtual file descriptor needs to be held open for any length of
65  * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
66  * (and eventually ReleaseExternalFD), so that we can take it into account
67  * while deciding how many VFDs can be open. This applies to FDs obtained
68  * with BasicOpenFile as well as those obtained without use of any fd.c API.
69  *
70  *-------------------------------------------------------------------------
71  */
72 
73 #include "postgres.h"
74 
75 #include <sys/file.h>
76 #include <sys/param.h>
77 #include <sys/stat.h>
78 #ifndef WIN32
79 #include <sys/mman.h>
80 #endif
81 #include <limits.h>
82 #include <unistd.h>
83 #include <fcntl.h>
84 #ifdef HAVE_SYS_RESOURCE_H
85 #include <sys/resource.h> /* for getrlimit */
86 #endif
87 
88 #include "access/xact.h"
89 #include "access/xlog.h"
90 #include "catalog/pg_tablespace.h"
91 #include "common/file_perm.h"
92 #include "miscadmin.h"
93 #include "pgstat.h"
94 #include "portability/mem.h"
95 #include "storage/fd.h"
96 #include "storage/ipc.h"
97 #include "utils/guc.h"
98 #include "utils/resowner_private.h"
99 
100 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
101 #if defined(HAVE_SYNC_FILE_RANGE)
102 #define PG_FLUSH_DATA_WORKS 1
103 #elif !defined(WIN32) && defined(MS_ASYNC)
104 #define PG_FLUSH_DATA_WORKS 1
105 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
106 #define PG_FLUSH_DATA_WORKS 1
107 #endif
108 
109 /*
110  * We must leave some file descriptors free for system(), the dynamic loader,
111  * and other code that tries to open files without consulting fd.c. This
112  * is the number left free. (While we try fairly hard to prevent EMFILE
113  * errors, there's never any guarantee that we won't get ENFILE due to
114  * other processes chewing up FDs. So it's a bad idea to try to open files
115  * without consulting fd.c. Nonetheless we cannot control all code.)
116  *
117  * Because this is just a fixed setting, we are effectively assuming that
118  * no such code will leave FDs open over the long term; otherwise the slop
119  * is likely to be insufficient. Note in particular that we expect that
120  * loading a shared library does not result in any permanent increase in
121  * the number of open files. (This appears to be true on most if not
122  * all platforms as of Feb 2004.)
123  */
124 #define NUM_RESERVED_FDS 10
125 
126 /*
127  * If we have fewer than this many usable FDs after allowing for the reserved
128  * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
129  * much less than that. Note that this value ensures numExternalFDs can be
130  * at least 16; as of this writing, the contrib/postgres_fdw regression tests
131  * will not pass unless that can grow to at least 14.)
132  */
133 #define FD_MINFREE 48
134 
135 /*
136  * A number of platforms allow individual processes to open many more files
137  * than they can really support when *many* processes do the same thing.
138  * This GUC parameter lets the DBA limit max_safe_fds to something less than
139  * what the postmaster's initial probe suggests will work.
140  */
142 
143 /*
144  * Maximum number of file descriptors to open for operations that fd.c knows
145  * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
146  * to a conservative value, and remains that way indefinitely in bootstrap or
147  * standalone-backend cases. In normal postmaster operation, the postmaster
148  * calls set_max_safe_fds() late in initialization to update the value, and
149  * that value is then inherited by forked subprocesses.
150  *
151  * Note: the value of max_files_per_process is taken into account while
152  * setting this variable, and so need not be tested separately.
153  */
154 int max_safe_fds = FD_MINFREE; /* default if not changed */
155 
156 /* Whether it is safe to continue running after fsync() fails. */
157 bool data_sync_retry = false;
158 
159 /* Debugging.... */
160 
161 #ifdef FDDEBUG
162 #define DO_DB(A) \
163  do { \
164  int _do_db_save_errno = errno; \
165  A; \
166  errno = _do_db_save_errno; \
167  } while (0)
168 #else
169 #define DO_DB(A) \
170  ((void) 0)
171 #endif
172 
173 #define VFD_CLOSED (-1)
174 
175 #define FileIsValid(file) \
176  ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
177 
178 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
179 
180 /* these are the assigned bits in fdstate below: */
181 #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
182 #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
183 #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
184 
185 typedef struct vfd
186 {
187  int fd; /* current FD, or VFD_CLOSED if none */
188  unsigned short fdstate; /* bitflags for VFD's state */
189  ResourceOwner resowner; /* owner, for automatic cleanup */
190  File nextFree; /* link to next free VFD, if in freelist */
191  File lruMoreRecently; /* doubly linked recency-of-use list */
193  off_t fileSize; /* current size of file (0 if not temporary) */
194  char *fileName; /* name of file, or NULL for unused VFD */
195  /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
196  int fileFlags; /* open(2) flags for (re)opening the file */
197  mode_t fileMode; /* mode to pass to open(2) */
198 } Vfd;
199 
200 /*
201  * Virtual File Descriptor array pointer and size. This grows as
202  * needed. 'File' values are indexes into this array.
203  * Note that VfdCache[0] is not a usable VFD, just a list header.
204  */
205 static Vfd *VfdCache;
206 static Size SizeVfdCache = 0;
207 
208 /*
209  * Number of file descriptors known to be in use by VFD entries.
210  */
211 static int nfile = 0;
212 
213 /*
214  * Flag to tell whether it's worth scanning VfdCache looking for temp files
215  * to close
216  */
217 static bool have_xact_temporary_files = false;
218 
219 /*
220  * Tracks the total size of all temporary files. Note: when temp_file_limit
221  * is being enforced, this cannot overflow since the limit cannot be more
222  * than INT_MAX kilobytes. When not enforcing, it could theoretically
223  * overflow, but we don't care.
224  */
225 static uint64 temporary_files_size = 0;
226 
227 /*
228  * List of OS handles opened with AllocateFile, AllocateDir and
229  * OpenTransientFile.
230  */
231 typedef enum
232 {
238 
239 typedef struct
240 {
243  union
244  {
245  FILE *file;
247  int fd;
248  } desc;
249 } AllocateDesc;
250 
251 static int numAllocatedDescs = 0;
252 static int maxAllocatedDescs = 0;
254 
255 /*
256  * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
257  */
258 static int numExternalFDs = 0;
259 
260 /*
261  * Number of temporary files opened during the current session;
262  * this is used in generation of tempfile names.
263  */
264 static long tempFileCounter = 0;
265 
266 /*
267  * Array of OIDs of temp tablespaces. When numTempTableSpaces is -1,
268  * this has not been set in the current transaction.
269  */
270 static Oid *tempTableSpaces = NULL;
271 static int numTempTableSpaces = -1;
272 static int nextTempTableSpace = 0;
273 
274 
275 /*--------------------
276  *
277  * Private Routines
278  *
279  * Delete - delete a file from the Lru ring
280  * LruDelete - remove a file from the Lru ring and close its FD
281  * Insert - put a file at the front of the Lru ring
282  * LruInsert - put a file at the front of the Lru ring and open it
283  * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
284  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
285  * AllocateVfd - grab a free (or new) file record (from VfdCache)
286  * FreeVfd - free a file record
287  *
288  * The Least Recently Used ring is a doubly linked list that begins and
289  * ends on element zero. Element zero is special -- it doesn't represent
290  * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
291  * anchor that shows us the beginning/end of the ring.
292  * Only VFD elements that are currently really open (have an FD assigned) are
293  * in the Lru ring. Elements that are "virtually" open can be recognized
294  * by having a non-null fileName field.
295  *
296  * example:
297  *
298  * /--less----\ /---------\
299  * v \ v \
300  * #0 --more---> LeastRecentlyUsed --more-\ \
301  * ^\ | |
302  * \\less--> MostRecentlyUsedFile <---/ |
303  * \more---/ \--less--/
304  *
305  *--------------------
306  */
307 static void Delete(File file);
308 static void LruDelete(File file);
309 static void Insert(File file);
310 static int LruInsert(File file);
311 static bool ReleaseLruFile(void);
312 static void ReleaseLruFiles(void);
313 static File AllocateVfd(void);
314 static void FreeVfd(File file);
315 
316 static int FileAccess(File file);
317 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
318 static bool reserveAllocatedDesc(void);
319 static int FreeDesc(AllocateDesc *desc);
320 
321 static void AtProcExit_Files(int code, Datum arg);
322 static void CleanupTempFiles(bool isCommit, bool isProcExit);
323 static void RemovePgTempRelationFiles(const char *tsdirname);
324 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
325 
326 static void walkdir(const char *path,
327  void (*action) (const char *fname, bool isdir, int elevel),
328  bool process_symlinks,
329  int elevel);
330 #ifdef PG_FLUSH_DATA_WORKS
331 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
332 #endif
333 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
334 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
335 
336 static int fsync_parent_path(const char *fname, int elevel);
337 
338 
339 /*
340  * pg_fsync --- do fsync with or without writethrough
341  */
342 int
344 {
345 #if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
346  struct stat st;
347 
348  /*
349  * Some operating system implementations of fsync() have requirements
350  * about the file access modes that were used when their file descriptor
351  * argument was opened, and these requirements differ depending on whether
352  * the file descriptor is for a directory.
353  *
354  * For any file descriptor that may eventually be handed to fsync(), we
355  * should have opened it with access modes that are compatible with
356  * fsync() on all supported systems, otherwise the code may not be
357  * portable, even if it runs ok on the current system.
358  *
359  * We assert here that a descriptor for a file was opened with write
360  * permissions (either O_RDWR or O_WRONLY) and for a directory without
361  * write permissions (O_RDONLY).
362  *
363  * Ignore any fstat errors and let the follow-up fsync() do its work.
364  * Doing this sanity check here counts for the case where fsync() is
365  * disabled.
366  */
367  if (fstat(fd, &st) == 0)
368  {
369  int desc_flags = fcntl(fd, F_GETFL);
370 
371  /*
372  * O_RDONLY is historically 0, so just make sure that for directories
373  * no write flags are used.
374  */
375  if (S_ISDIR(st.st_mode))
376  Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
377  else
378  Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
379  }
380  errno = 0;
381 #endif
382 
383  /* #if is to skip the sync_method test if there's no need for it */
384 #if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
386  return pg_fsync_writethrough(fd);
387  else
388 #endif
389  return pg_fsync_no_writethrough(fd);
390 }
391 
392 
393 /*
394  * pg_fsync_no_writethrough --- same as fsync except does nothing if
395  * enableFsync is off
396  */
397 int
399 {
400  if (enableFsync)
401  return fsync(fd);
402  else
403  return 0;
404 }
405 
406 /*
407  * pg_fsync_writethrough
408  */
409 int
411 {
412  if (enableFsync)
413  {
414 #ifdef WIN32
415  return _commit(fd);
416 #elif defined(F_FULLFSYNC)
417  return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
418 #else
419  errno = ENOSYS;
420  return -1;
421 #endif
422  }
423  else
424  return 0;
425 }
426 
427 /*
428  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
429  *
430  * Not all platforms have fdatasync; treat as fsync if not available.
431  */
432 int
434 {
435  if (enableFsync)
436  {
437 #ifdef HAVE_FDATASYNC
438  return fdatasync(fd);
439 #else
440  return fsync(fd);
441 #endif
442  }
443  else
444  return 0;
445 }
446 
447 /*
448  * pg_flush_data --- advise OS that the described dirty data should be flushed
449  *
450  * offset of 0 with nbytes 0 means that the entire file should be flushed
451  */
452 void
453 pg_flush_data(int fd, off_t offset, off_t nbytes)
454 {
455  /*
456  * Right now file flushing is primarily used to make later
457  * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
458  * if fsyncs are disabled - that's a decision we might want to make
459  * configurable at some point.
460  */
461  if (!enableFsync)
462  return;
463 
464  /*
465  * We compile all alternatives that are supported on the current platform,
466  * to find portability problems more easily.
467  */
468 #if defined(HAVE_SYNC_FILE_RANGE)
469  {
470  int rc;
471  static bool not_implemented_by_kernel = false;
472 
473  if (not_implemented_by_kernel)
474  return;
475 
476  /*
477  * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
478  * tells the OS that writeback for the specified blocks should be
479  * started, but that we don't want to wait for completion. Note that
480  * this call might block if too much dirty data exists in the range.
481  * This is the preferable method on OSs supporting it, as it works
482  * reliably when available (contrast to msync()) and doesn't flush out
483  * clean data (like FADV_DONTNEED).
484  */
485  rc = sync_file_range(fd, offset, nbytes,
486  SYNC_FILE_RANGE_WRITE);
487  if (rc != 0)
488  {
489  int elevel;
490 
491  /*
492  * For systems that don't have an implementation of
493  * sync_file_range() such as Windows WSL, generate only one
494  * warning and then suppress all further attempts by this process.
495  */
496  if (errno == ENOSYS)
497  {
498  elevel = WARNING;
499  not_implemented_by_kernel = true;
500  }
501  else
502  elevel = data_sync_elevel(WARNING);
503 
504  ereport(elevel,
506  errmsg("could not flush dirty data: %m")));
507  }
508 
509  return;
510  }
511 #endif
512 #if !defined(WIN32) && defined(MS_ASYNC)
513  {
514  void *p;
515  static int pagesize = 0;
516 
517  /*
518  * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
519  * writeback. On linux it only does so if MS_SYNC is specified, but
520  * then it does the writeback synchronously. Luckily all common linux
521  * systems have sync_file_range(). This is preferable over
522  * FADV_DONTNEED because it doesn't flush out clean data.
523  *
524  * We map the file (mmap()), tell the kernel to sync back the contents
525  * (msync()), and then remove the mapping again (munmap()).
526  */
527 
528  /* mmap() needs actual length if we want to map whole file */
529  if (offset == 0 && nbytes == 0)
530  {
531  nbytes = lseek(fd, 0, SEEK_END);
532  if (nbytes < 0)
533  {
536  errmsg("could not determine dirty data size: %m")));
537  return;
538  }
539  }
540 
541  /*
542  * Some platforms reject partial-page mmap() attempts. To deal with
543  * that, just truncate the request to a page boundary. If any extra
544  * bytes don't get flushed, well, it's only a hint anyway.
545  */
546 
547  /* fetch pagesize only once */
548  if (pagesize == 0)
549  pagesize = sysconf(_SC_PAGESIZE);
550 
551  /* align length to pagesize, dropping any fractional page */
552  if (pagesize > 0)
553  nbytes = (nbytes / pagesize) * pagesize;
554 
555  /* fractional-page request is a no-op */
556  if (nbytes <= 0)
557  return;
558 
559  /*
560  * mmap could well fail, particularly on 32-bit platforms where there
561  * may simply not be enough address space. If so, silently fall
562  * through to the next implementation.
563  */
564  if (nbytes <= (off_t) SSIZE_MAX)
565  p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
566  else
567  p = MAP_FAILED;
568 
569  if (p != MAP_FAILED)
570  {
571  int rc;
572 
573  rc = msync(p, (size_t) nbytes, MS_ASYNC);
574  if (rc != 0)
575  {
578  errmsg("could not flush dirty data: %m")));
579  /* NB: need to fall through to munmap()! */
580  }
581 
582  rc = munmap(p, (size_t) nbytes);
583  if (rc != 0)
584  {
585  /* FATAL error because mapping would remain */
586  ereport(FATAL,
588  errmsg("could not munmap() while flushing data: %m")));
589  }
590 
591  return;
592  }
593  }
594 #endif
595 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
596  {
597  int rc;
598 
599  /*
600  * Signal the kernel that the passed in range should not be cached
601  * anymore. This has the desired side effect of writing out dirty
602  * data, and the undesired side effect of likely discarding useful
603  * clean cached blocks. For the latter reason this is the least
604  * preferable method.
605  */
606 
607  rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
608 
609  if (rc != 0)
610  {
611  /* don't error out, this is just a performance optimization */
614  errmsg("could not flush dirty data: %m")));
615  }
616 
617  return;
618  }
619 #endif
620 }
621 
622 
623 /*
624  * fsync_fname -- fsync a file or directory, handling errors properly
625  *
626  * Try to fsync a file or directory. When doing the latter, ignore errors that
627  * indicate the OS just doesn't allow/require fsyncing directories.
628  */
629 void
630 fsync_fname(const char *fname, bool isdir)
631 {
632  fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
633 }
634 
635 /*
636  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
637  *
638  * This routine ensures that, after returning, the effect of renaming file
639  * persists in case of a crash. A crash while this routine is running will
640  * leave you with either the pre-existing or the moved file in place of the
641  * new file; no mixed state or truncated files are possible.
642  *
643  * It does so by using fsync on the old filename and the possibly existing
644  * target filename before the rename, and the target file and directory after.
645  *
646  * Note that rename() cannot be used across arbitrary directories, as they
647  * might not be on the same filesystem. Therefore this routine does not
648  * support renaming across directories.
649  *
650  * Log errors with the caller specified severity.
651  *
652  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
653  * valid upon return.
654  */
655 int
656 durable_rename(const char *oldfile, const char *newfile, int elevel)
657 {
658  int fd;
659 
660  /*
661  * First fsync the old and target path (if it exists), to ensure that they
662  * are properly persistent on disk. Syncing the target file is not
663  * strictly necessary, but it makes it easier to reason about crashes;
664  * because it's then guaranteed that either source or target file exists
665  * after a crash.
666  */
667  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
668  return -1;
669 
670  fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
671  if (fd < 0)
672  {
673  if (errno != ENOENT)
674  {
675  ereport(elevel,
677  errmsg("could not open file \"%s\": %m", newfile)));
678  return -1;
679  }
680  }
681  else
682  {
683  if (pg_fsync(fd) != 0)
684  {
685  int save_errno;
686 
687  /* close file upon error, might not be in transaction context */
688  save_errno = errno;
689  CloseTransientFile(fd);
690  errno = save_errno;
691 
692  ereport(elevel,
694  errmsg("could not fsync file \"%s\": %m", newfile)));
695  return -1;
696  }
697 
698  if (CloseTransientFile(fd) != 0)
699  {
700  ereport(elevel,
702  errmsg("could not close file \"%s\": %m", newfile)));
703  return -1;
704  }
705  }
706 
707  /* Time to do the real deal... */
708  if (rename(oldfile, newfile) < 0)
709  {
710  ereport(elevel,
712  errmsg("could not rename file \"%s\" to \"%s\": %m",
713  oldfile, newfile)));
714  return -1;
715  }
716 
717  /*
718  * To guarantee renaming the file is persistent, fsync the file with its
719  * new name, and its containing directory.
720  */
721  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
722  return -1;
723 
724  if (fsync_parent_path(newfile, elevel) != 0)
725  return -1;
726 
727  return 0;
728 }
729 
730 /*
731  * durable_unlink -- remove a file in a durable manner
732  *
733  * This routine ensures that, after returning, the effect of removing file
734  * persists in case of a crash. A crash while this routine is running will
735  * leave the system in no mixed state.
736  *
737  * It does so by using fsync on the parent directory of the file after the
738  * actual removal is done.
739  *
740  * Log errors with the severity specified by caller.
741  *
742  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
743  * valid upon return.
744  */
745 int
746 durable_unlink(const char *fname, int elevel)
747 {
748  if (unlink(fname) < 0)
749  {
750  ereport(elevel,
752  errmsg("could not remove file \"%s\": %m",
753  fname)));
754  return -1;
755  }
756 
757  /*
758  * To guarantee that the removal of the file is persistent, fsync its
759  * parent directory.
760  */
761  if (fsync_parent_path(fname, elevel) != 0)
762  return -1;
763 
764  return 0;
765 }
766 
767 /*
768  * durable_rename_excl -- rename a file in a durable manner, without
769  * overwriting an existing target file
770  *
771  * Similar to durable_rename(), except that this routine will fail if the
772  * target file already exists.
773  *
774  * Note that a crash in an unfortunate moment can leave you with two links to
775  * the target file.
776  *
777  * Log errors with the caller specified severity.
778  *
779  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
780  * valid upon return.
781  */
782 int
783 durable_rename_excl(const char *oldfile, const char *newfile, int elevel)
784 {
785  /*
786  * Ensure that, if we crash directly after the rename/link, a file with
787  * valid contents is moved into place.
788  */
789  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
790  return -1;
791 
792  if (link(oldfile, newfile) < 0)
793  {
794  ereport(elevel,
796  errmsg("could not link file \"%s\" to \"%s\": %m",
797  oldfile, newfile)));
798  return -1;
799  }
800  unlink(oldfile);
801 
802  /*
803  * Make change persistent in case of an OS crash, both the new entry and
804  * its parent directory need to be flushed.
805  */
806  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
807  return -1;
808 
809  /* Same for parent directory */
810  if (fsync_parent_path(newfile, elevel) != 0)
811  return -1;
812 
813  return 0;
814 }
815 
816 /*
817  * InitFileAccess --- initialize this module during backend startup
818  *
819  * This is called during either normal or standalone backend start.
820  * It is *not* called in the postmaster.
821  */
822 void
824 {
825  Assert(SizeVfdCache == 0); /* call me only once */
826 
827  /* initialize cache header entry */
828  VfdCache = (Vfd *) malloc(sizeof(Vfd));
829  if (VfdCache == NULL)
830  ereport(FATAL,
831  (errcode(ERRCODE_OUT_OF_MEMORY),
832  errmsg("out of memory")));
833 
834  MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
835  VfdCache->fd = VFD_CLOSED;
836 
837  SizeVfdCache = 1;
838 
839  /* register proc-exit hook to ensure temp files are dropped at exit */
841 }
842 
843 /*
844  * count_usable_fds --- count how many FDs the system will let us open,
845  * and estimate how many are already open.
846  *
847  * We stop counting if usable_fds reaches max_to_probe. Note: a small
848  * value of max_to_probe might result in an underestimate of already_open;
849  * we must fill in any "gaps" in the set of used FDs before the calculation
850  * of already_open will give the right answer. In practice, max_to_probe
851  * of a couple of dozen should be enough to ensure good results.
852  *
853  * We assume stdin (FD 0) is available for dup'ing
854  */
855 static void
856 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
857 {
858  int *fd;
859  int size;
860  int used = 0;
861  int highestfd = 0;
862  int j;
863 
864 #ifdef HAVE_GETRLIMIT
865  struct rlimit rlim;
866  int getrlimit_status;
867 #endif
868 
869  size = 1024;
870  fd = (int *) palloc(size * sizeof(int));
871 
872 #ifdef HAVE_GETRLIMIT
873 #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
874  getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
875 #else /* but BSD doesn't ... */
876  getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
877 #endif /* RLIMIT_NOFILE */
878  if (getrlimit_status != 0)
879  ereport(WARNING, (errmsg("getrlimit failed: %m")));
880 #endif /* HAVE_GETRLIMIT */
881 
882  /* dup until failure or probe limit reached */
883  for (;;)
884  {
885  int thisfd;
886 
887 #ifdef HAVE_GETRLIMIT
888 
889  /*
890  * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
891  * some platforms
892  */
893  if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
894  break;
895 #endif
896 
897  thisfd = dup(0);
898  if (thisfd < 0)
899  {
900  /* Expect EMFILE or ENFILE, else it's fishy */
901  if (errno != EMFILE && errno != ENFILE)
902  elog(WARNING, "dup(0) failed after %d successes: %m", used);
903  break;
904  }
905 
906  if (used >= size)
907  {
908  size *= 2;
909  fd = (int *) repalloc(fd, size * sizeof(int));
910  }
911  fd[used++] = thisfd;
912 
913  if (highestfd < thisfd)
914  highestfd = thisfd;
915 
916  if (used >= max_to_probe)
917  break;
918  }
919 
920  /* release the files we opened */
921  for (j = 0; j < used; j++)
922  close(fd[j]);
923 
924  pfree(fd);
925 
926  /*
927  * Return results. usable_fds is just the number of successful dups. We
928  * assume that the system limit is highestfd+1 (remember 0 is a legal FD
929  * number) and so already_open is highestfd+1 - usable_fds.
930  */
931  *usable_fds = used;
932  *already_open = highestfd + 1 - used;
933 }
934 
935 /*
936  * set_max_safe_fds
937  * Determine number of file descriptors that fd.c is allowed to use
938  */
939 void
941 {
942  int usable_fds;
943  int already_open;
944 
945  /*----------
946  * We want to set max_safe_fds to
947  * MIN(usable_fds, max_files_per_process - already_open)
948  * less the slop factor for files that are opened without consulting
949  * fd.c. This ensures that we won't exceed either max_files_per_process
950  * or the experimentally-determined EMFILE limit.
951  *----------
952  */
954  &usable_fds, &already_open);
955 
956  max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
957 
958  /*
959  * Take off the FDs reserved for system() etc.
960  */
962 
963  /*
964  * Make sure we still have enough to get by.
965  */
966  if (max_safe_fds < FD_MINFREE)
967  ereport(FATAL,
968  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
969  errmsg("insufficient file descriptors available to start server process"),
970  errdetail("System allows %d, we need at least %d.",
973 
974  elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
975  max_safe_fds, usable_fds, already_open);
976 }
977 
978 /*
979  * Open a file with BasicOpenFilePerm() and pass default file mode for the
980  * fileMode parameter.
981  */
982 int
984 {
985  return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
986 }
987 
988 /*
989  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
990  *
991  * This is exported for use by places that really want a plain kernel FD,
992  * but need to be proof against running out of FDs. Once an FD has been
993  * successfully returned, it is the caller's responsibility to ensure that
994  * it will not be leaked on ereport()! Most users should *not* call this
995  * routine directly, but instead use the VFD abstraction level, which
996  * provides protection against descriptor leaks as well as management of
997  * files that need to be open for more than a short period of time.
998  *
999  * Ideally this should be the *only* direct call of open() in the backend.
1000  * In practice, the postmaster calls open() directly, and there are some
1001  * direct open() calls done early in backend startup. Those are OK since
1002  * this module wouldn't have any open files to close at that point anyway.
1003  */
1004 int
1006 {
1007  int fd;
1008 
1009 tryAgain:
1010  fd = open(fileName, fileFlags, fileMode);
1011 
1012  if (fd >= 0)
1013  return fd; /* success! */
1014 
1015  if (errno == EMFILE || errno == ENFILE)
1016  {
1017  int save_errno = errno;
1018 
1019  ereport(LOG,
1020  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1021  errmsg("out of file descriptors: %m; release and retry")));
1022  errno = 0;
1023  if (ReleaseLruFile())
1024  goto tryAgain;
1025  errno = save_errno;
1026  }
1027 
1028  return -1; /* failure */
1029 }
1030 
1031 /*
1032  * AcquireExternalFD - attempt to reserve an external file descriptor
1033  *
1034  * This should be used by callers that need to hold a file descriptor open
1035  * over more than a short interval, but cannot use any of the other facilities
1036  * provided by this module.
1037  *
1038  * The difference between this and the underlying ReserveExternalFD function
1039  * is that this will report failure (by setting errno and returning false)
1040  * if "too many" external FDs are already reserved. This should be used in
1041  * any code where the total number of FDs to be reserved is not predictable
1042  * and small.
1043  */
1044 bool
1046 {
1047  /*
1048  * We don't want more than max_safe_fds / 3 FDs to be consumed for
1049  * "external" FDs.
1050  */
1051  if (numExternalFDs < max_safe_fds / 3)
1052  {
1054  return true;
1055  }
1056  errno = EMFILE;
1057  return false;
1058 }
1059 
1060 /*
1061  * ReserveExternalFD - report external consumption of a file descriptor
1062  *
1063  * This should be used by callers that need to hold a file descriptor open
1064  * over more than a short interval, but cannot use any of the other facilities
1065  * provided by this module. This just tracks the use of the FD and closes
1066  * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1067  *
1068  * Call this directly only in code where failure to reserve the FD would be
1069  * fatal; for example, the WAL-writing code does so, since the alternative is
1070  * session failure. Also, it's very unwise to do so in code that could
1071  * consume more than one FD per process.
1072  *
1073  * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1074  * available, it doesn't matter too much whether this is called before or
1075  * after actually opening the FD; but doing so beforehand reduces the risk of
1076  * an EMFILE failure if not everybody played nice. In any case, it's solely
1077  * caller's responsibility to keep the external-FD count in sync with reality.
1078  */
1079 void
1081 {
1082  /*
1083  * Release VFDs if needed to stay safe. Because we do this before
1084  * incrementing numExternalFDs, the final state will be as desired, i.e.,
1085  * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1086  */
1087  ReleaseLruFiles();
1088 
1089  numExternalFDs++;
1090 }
1091 
1092 /*
1093  * ReleaseExternalFD - report release of an external file descriptor
1094  *
1095  * This is guaranteed not to change errno, so it can be used in failure paths.
1096  */
1097 void
1099 {
1100  Assert(numExternalFDs > 0);
1101  numExternalFDs--;
1102 }
1103 
1104 
1105 #if defined(FDDEBUG)
1106 
1107 static void
1108 _dump_lru(void)
1109 {
1110  int mru = VfdCache[0].lruLessRecently;
1111  Vfd *vfdP = &VfdCache[mru];
1112  char buf[2048];
1113 
1114  snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
1115  while (mru != 0)
1116  {
1117  mru = vfdP->lruLessRecently;
1118  vfdP = &VfdCache[mru];
1119  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
1120  }
1121  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
1122  elog(LOG, "%s", buf);
1123 }
1124 #endif /* FDDEBUG */
1125 
1126 static void
1128 {
1129  Vfd *vfdP;
1130 
1131  Assert(file != 0);
1132 
1133  DO_DB(elog(LOG, "Delete %d (%s)",
1134  file, VfdCache[file].fileName));
1135  DO_DB(_dump_lru());
1136 
1137  vfdP = &VfdCache[file];
1138 
1139  VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1140  VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1141 
1142  DO_DB(_dump_lru());
1143 }
1144 
1145 static void
1147 {
1148  Vfd *vfdP;
1149 
1150  Assert(file != 0);
1151 
1152  DO_DB(elog(LOG, "LruDelete %d (%s)",
1153  file, VfdCache[file].fileName));
1154 
1155  vfdP = &VfdCache[file];
1156 
1157  /*
1158  * Close the file. We aren't expecting this to fail; if it does, better
1159  * to leak the FD than to mess up our internal state.
1160  */
1161  if (close(vfdP->fd) != 0)
1163  "could not close file \"%s\": %m", vfdP->fileName);
1164  vfdP->fd = VFD_CLOSED;
1165  --nfile;
1166 
1167  /* delete the vfd record from the LRU ring */
1168  Delete(file);
1169 }
1170 
1171 static void
1173 {
1174  Vfd *vfdP;
1175 
1176  Assert(file != 0);
1177 
1178  DO_DB(elog(LOG, "Insert %d (%s)",
1179  file, VfdCache[file].fileName));
1180  DO_DB(_dump_lru());
1181 
1182  vfdP = &VfdCache[file];
1183 
1184  vfdP->lruMoreRecently = 0;
1185  vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1186  VfdCache[0].lruLessRecently = file;
1187  VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1188 
1189  DO_DB(_dump_lru());
1190 }
1191 
1192 /* returns 0 on success, -1 on re-open failure (with errno set) */
1193 static int
1195 {
1196  Vfd *vfdP;
1197 
1198  Assert(file != 0);
1199 
1200  DO_DB(elog(LOG, "LruInsert %d (%s)",
1201  file, VfdCache[file].fileName));
1202 
1203  vfdP = &VfdCache[file];
1204 
1205  if (FileIsNotOpen(file))
1206  {
1207  /* Close excess kernel FDs. */
1208  ReleaseLruFiles();
1209 
1210  /*
1211  * The open could still fail for lack of file descriptors, eg due to
1212  * overall system file table being full. So, be prepared to release
1213  * another FD if necessary...
1214  */
1215  vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1216  vfdP->fileMode);
1217  if (vfdP->fd < 0)
1218  {
1219  DO_DB(elog(LOG, "re-open failed: %m"));
1220  return -1;
1221  }
1222  else
1223  {
1224  ++nfile;
1225  }
1226  }
1227 
1228  /*
1229  * put it at the head of the Lru ring
1230  */
1231 
1232  Insert(file);
1233 
1234  return 0;
1235 }
1236 
1237 /*
1238  * Release one kernel FD by closing the least-recently-used VFD.
1239  */
1240 static bool
1242 {
1243  DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1244 
1245  if (nfile > 0)
1246  {
1247  /*
1248  * There are opened files and so there should be at least one used vfd
1249  * in the ring.
1250  */
1251  Assert(VfdCache[0].lruMoreRecently != 0);
1252  LruDelete(VfdCache[0].lruMoreRecently);
1253  return true; /* freed a file */
1254  }
1255  return false; /* no files available to free */
1256 }
1257 
1258 /*
1259  * Release kernel FDs as needed to get under the max_safe_fds limit.
1260  * After calling this, it's OK to try to open another file.
1261  */
1262 static void
1264 {
1266  {
1267  if (!ReleaseLruFile())
1268  break;
1269  }
1270 }
1271 
1272 static File
1274 {
1275  Index i;
1276  File file;
1277 
1278  DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1279 
1280  Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1281 
1282  if (VfdCache[0].nextFree == 0)
1283  {
1284  /*
1285  * The free list is empty so it is time to increase the size of the
1286  * array. We choose to double it each time this happens. However,
1287  * there's not much point in starting *real* small.
1288  */
1289  Size newCacheSize = SizeVfdCache * 2;
1290  Vfd *newVfdCache;
1291 
1292  if (newCacheSize < 32)
1293  newCacheSize = 32;
1294 
1295  /*
1296  * Be careful not to clobber VfdCache ptr if realloc fails.
1297  */
1298  newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1299  if (newVfdCache == NULL)
1300  ereport(ERROR,
1301  (errcode(ERRCODE_OUT_OF_MEMORY),
1302  errmsg("out of memory")));
1303  VfdCache = newVfdCache;
1304 
1305  /*
1306  * Initialize the new entries and link them into the free list.
1307  */
1308  for (i = SizeVfdCache; i < newCacheSize; i++)
1309  {
1310  MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1311  VfdCache[i].nextFree = i + 1;
1312  VfdCache[i].fd = VFD_CLOSED;
1313  }
1314  VfdCache[newCacheSize - 1].nextFree = 0;
1315  VfdCache[0].nextFree = SizeVfdCache;
1316 
1317  /*
1318  * Record the new size
1319  */
1320  SizeVfdCache = newCacheSize;
1321  }
1322 
1323  file = VfdCache[0].nextFree;
1324 
1325  VfdCache[0].nextFree = VfdCache[file].nextFree;
1326 
1327  return file;
1328 }
1329 
1330 static void
1332 {
1333  Vfd *vfdP = &VfdCache[file];
1334 
1335  DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1336  file, vfdP->fileName ? vfdP->fileName : ""));
1337 
1338  if (vfdP->fileName != NULL)
1339  {
1340  free(vfdP->fileName);
1341  vfdP->fileName = NULL;
1342  }
1343  vfdP->fdstate = 0x0;
1344 
1345  vfdP->nextFree = VfdCache[0].nextFree;
1346  VfdCache[0].nextFree = file;
1347 }
1348 
1349 /* returns 0 on success, -1 on re-open failure (with errno set) */
1350 static int
1352 {
1353  int returnValue;
1354 
1355  DO_DB(elog(LOG, "FileAccess %d (%s)",
1356  file, VfdCache[file].fileName));
1357 
1358  /*
1359  * Is the file open? If not, open it and put it at the head of the LRU
1360  * ring (possibly closing the least recently used file to get an FD).
1361  */
1362 
1363  if (FileIsNotOpen(file))
1364  {
1365  returnValue = LruInsert(file);
1366  if (returnValue != 0)
1367  return returnValue;
1368  }
1369  else if (VfdCache[0].lruLessRecently != file)
1370  {
1371  /*
1372  * We now know that the file is open and that it is not the last one
1373  * accessed, so we need to move it to the head of the Lru ring.
1374  */
1375 
1376  Delete(file);
1377  Insert(file);
1378  }
1379 
1380  return 0;
1381 }
1382 
1383 /*
1384  * Called whenever a temporary file is deleted to report its size.
1385  */
1386 static void
1387 ReportTemporaryFileUsage(const char *path, off_t size)
1388 {
1389  pgstat_report_tempfile(size);
1390 
1391  if (log_temp_files >= 0)
1392  {
1393  if ((size / 1024) >= log_temp_files)
1394  ereport(LOG,
1395  (errmsg("temporary file: path \"%s\", size %lu",
1396  path, (unsigned long) size)));
1397  }
1398 }
1399 
1400 /*
1401  * Called to register a temporary file for automatic close.
1402  * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1403  * before the file was opened.
1404  */
1405 static void
1407 {
1409  VfdCache[file].resowner = CurrentResourceOwner;
1410 
1411  /* Backup mechanism for closing at end of xact. */
1412  VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1414 }
1415 
1416 /*
1417  * Called when we get a shared invalidation message on some relation.
1418  */
1419 #ifdef NOT_USED
1420 void
1421 FileInvalidate(File file)
1422 {
1423  Assert(FileIsValid(file));
1424  if (!FileIsNotOpen(file))
1425  LruDelete(file);
1426 }
1427 #endif
1428 
1429 /*
1430  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1431  * fileMode parameter.
1432  */
1433 File
1435 {
1436  return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1437 }
1438 
1439 /*
1440  * open a file in an arbitrary directory
1441  *
1442  * NB: if the passed pathname is relative (which it usually is),
1443  * it will be interpreted relative to the process' working directory
1444  * (which should always be $PGDATA when this code is running).
1445  */
1446 File
1448 {
1449  char *fnamecopy;
1450  File file;
1451  Vfd *vfdP;
1452 
1453  DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1454  fileName, fileFlags, fileMode));
1455 
1456  /*
1457  * We need a malloc'd copy of the file name; fail cleanly if no room.
1458  */
1459  fnamecopy = strdup(fileName);
1460  if (fnamecopy == NULL)
1461  ereport(ERROR,
1462  (errcode(ERRCODE_OUT_OF_MEMORY),
1463  errmsg("out of memory")));
1464 
1465  file = AllocateVfd();
1466  vfdP = &VfdCache[file];
1467 
1468  /* Close excess kernel FDs. */
1469  ReleaseLruFiles();
1470 
1471  vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1472 
1473  if (vfdP->fd < 0)
1474  {
1475  int save_errno = errno;
1476 
1477  FreeVfd(file);
1478  free(fnamecopy);
1479  errno = save_errno;
1480  return -1;
1481  }
1482  ++nfile;
1483  DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1484  vfdP->fd));
1485 
1486  Insert(file);
1487 
1488  vfdP->fileName = fnamecopy;
1489  /* Saved flags are adjusted to be OK for re-opening file */
1490  vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1491  vfdP->fileMode = fileMode;
1492  vfdP->fileSize = 0;
1493  vfdP->fdstate = 0x0;
1494  vfdP->resowner = NULL;
1495 
1496  return file;
1497 }
1498 
1499 /*
1500  * Create directory 'directory'. If necessary, create 'basedir', which must
1501  * be the directory above it. This is designed for creating the top-level
1502  * temporary directory on demand before creating a directory underneath it.
1503  * Do nothing if the directory already exists.
1504  *
1505  * Directories created within the top-level temporary directory should begin
1506  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1507  * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1508  * that do not need any particular prefix.
1509 */
1510 void
1512 {
1513  if (MakePGDirectory(directory) < 0)
1514  {
1515  if (errno == EEXIST)
1516  return;
1517 
1518  /*
1519  * Failed. Try to create basedir first in case it's missing. Tolerate
1520  * EEXIST to close a race against another process following the same
1521  * algorithm.
1522  */
1523  if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1524  ereport(ERROR,
1526  errmsg("cannot create temporary directory \"%s\": %m",
1527  basedir)));
1528 
1529  /* Try again. */
1530  if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1531  ereport(ERROR,
1533  errmsg("cannot create temporary subdirectory \"%s\": %m",
1534  directory)));
1535  }
1536 }
1537 
1538 /*
1539  * Delete a directory and everything in it, if it exists.
1540  */
1541 void
1542 PathNameDeleteTemporaryDir(const char *dirname)
1543 {
1544  struct stat statbuf;
1545 
1546  /* Silently ignore missing directory. */
1547  if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1548  return;
1549 
1550  /*
1551  * Currently, walkdir doesn't offer a way for our passed in function to
1552  * maintain state. Perhaps it should, so that we could tell the caller
1553  * whether this operation succeeded or failed. Since this operation is
1554  * used in a cleanup path, we wouldn't actually behave differently: we'll
1555  * just log failures.
1556  */
1557  walkdir(dirname, unlink_if_exists_fname, false, LOG);
1558 }
1559 
1560 /*
1561  * Open a temporary file that will disappear when we close it.
1562  *
1563  * This routine takes care of generating an appropriate tempfile name.
1564  * There's no need to pass in fileFlags or fileMode either, since only
1565  * one setting makes any sense for a temp file.
1566  *
1567  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1568  * to ensure it's closed and deleted when it's no longer needed, typically at
1569  * the end-of-transaction. In most cases, you don't want temporary files to
1570  * outlive the transaction that created them, so this should be false -- but
1571  * if you need "somewhat" temporary storage, this might be useful. In either
1572  * case, the file is removed when the File is explicitly closed.
1573  */
1574 File
1575 OpenTemporaryFile(bool interXact)
1576 {
1577  File file = 0;
1578 
1579  /*
1580  * Make sure the current resource owner has space for this File before we
1581  * open it, if we'll be registering it below.
1582  */
1583  if (!interXact)
1585 
1586  /*
1587  * If some temp tablespace(s) have been given to us, try to use the next
1588  * one. If a given tablespace can't be found, we silently fall back to
1589  * the database's default tablespace.
1590  *
1591  * BUT: if the temp file is slated to outlive the current transaction,
1592  * force it into the database's default tablespace, so that it will not
1593  * pose a threat to possible tablespace drop attempts.
1594  */
1595  if (numTempTableSpaces > 0 && !interXact)
1596  {
1597  Oid tblspcOid = GetNextTempTableSpace();
1598 
1599  if (OidIsValid(tblspcOid))
1600  file = OpenTemporaryFileInTablespace(tblspcOid, false);
1601  }
1602 
1603  /*
1604  * If not, or if tablespace is bad, create in database's default
1605  * tablespace. MyDatabaseTableSpace should normally be set before we get
1606  * here, but just in case it isn't, fall back to pg_default tablespace.
1607  */
1608  if (file <= 0)
1611  DEFAULTTABLESPACE_OID,
1612  true);
1613 
1614  /* Mark it for deletion at close and temporary file size limit */
1615  VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1616 
1617  /* Register it with the current resource owner */
1618  if (!interXact)
1619  RegisterTemporaryFile(file);
1620 
1621  return file;
1622 }
1623 
1624 /*
1625  * Return the path of the temp directory in a given tablespace.
1626  */
1627 void
1629 {
1630  /*
1631  * Identify the tempfile directory for this tablespace.
1632  *
1633  * If someone tries to specify pg_global, use pg_default instead.
1634  */
1635  if (tablespace == InvalidOid ||
1636  tablespace == DEFAULTTABLESPACE_OID ||
1637  tablespace == GLOBALTABLESPACE_OID)
1638  snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1639  else
1640  {
1641  /* All other tablespaces are accessed via symlinks */
1642  snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1643  tablespace, TABLESPACE_VERSION_DIRECTORY,
1645  }
1646 }
1647 
1648 /*
1649  * Open a temporary file in a specific tablespace.
1650  * Subroutine for OpenTemporaryFile, which see for details.
1651  */
1652 static File
1653 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1654 {
1655  char tempdirpath[MAXPGPATH];
1656  char tempfilepath[MAXPGPATH];
1657  File file;
1658 
1659  TempTablespacePath(tempdirpath, tblspcOid);
1660 
1661  /*
1662  * Generate a tempfile name that should be unique within the current
1663  * database instance.
1664  */
1665  snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1666  tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1667 
1668  /*
1669  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1670  * temp file that can be reused.
1671  */
1672  file = PathNameOpenFile(tempfilepath,
1673  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1674  if (file <= 0)
1675  {
1676  /*
1677  * We might need to create the tablespace's tempfile directory, if no
1678  * one has yet done so.
1679  *
1680  * Don't check for an error from MakePGDirectory; it could fail if
1681  * someone else just did the same thing. If it doesn't work then
1682  * we'll bomb out on the second create attempt, instead.
1683  */
1684  (void) MakePGDirectory(tempdirpath);
1685 
1686  file = PathNameOpenFile(tempfilepath,
1687  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1688  if (file <= 0 && rejectError)
1689  elog(ERROR, "could not create temporary file \"%s\": %m",
1690  tempfilepath);
1691  }
1692 
1693  return file;
1694 }
1695 
1696 
1697 /*
1698  * Create a new file. The directory containing it must already exist. Files
1699  * created this way are subject to temp_file_limit and are automatically
1700  * closed at end of transaction, but are not automatically deleted on close
1701  * because they are intended to be shared between cooperating backends.
1702  *
1703  * If the file is inside the top-level temporary directory, its name should
1704  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1705  * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1706  * inside a directory created with PathNameCreateTemporaryDir(), in which case
1707  * the prefix isn't needed.
1708  */
1709 File
1710 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1711 {
1712  File file;
1713 
1715 
1716  /*
1717  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1718  * temp file that can be reused.
1719  */
1720  file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1721  if (file <= 0)
1722  {
1723  if (error_on_failure)
1724  ereport(ERROR,
1726  errmsg("could not create temporary file \"%s\": %m",
1727  path)));
1728  else
1729  return file;
1730  }
1731 
1732  /* Mark it for temp_file_limit accounting. */
1733  VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1734 
1735  /* Register it for automatic close. */
1736  RegisterTemporaryFile(file);
1737 
1738  return file;
1739 }
1740 
1741 /*
1742  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1743  * another backend. Files opened this way don't count against the
1744  * temp_file_limit of the caller, are read-only and are automatically closed
1745  * at the end of the transaction but are not deleted on close.
1746  */
1747 File
1748 PathNameOpenTemporaryFile(const char *path)
1749 {
1750  File file;
1751 
1753 
1754  /* We open the file read-only. */
1755  file = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
1756 
1757  /* If no such file, then we don't raise an error. */
1758  if (file <= 0 && errno != ENOENT)
1759  ereport(ERROR,
1761  errmsg("could not open temporary file \"%s\": %m",
1762  path)));
1763 
1764  if (file > 0)
1765  {
1766  /* Register it for automatic close. */
1767  RegisterTemporaryFile(file);
1768  }
1769 
1770  return file;
1771 }
1772 
1773 /*
1774  * Delete a file by pathname. Return true if the file existed, false if
1775  * didn't.
1776  */
1777 bool
1778 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1779 {
1780  struct stat filestats;
1781  int stat_errno;
1782 
1783  /* Get the final size for pgstat reporting. */
1784  if (stat(path, &filestats) != 0)
1785  stat_errno = errno;
1786  else
1787  stat_errno = 0;
1788 
1789  /*
1790  * Unlike FileClose's automatic file deletion code, we tolerate
1791  * non-existence to support BufFileDeleteShared which doesn't know how
1792  * many segments it has to delete until it runs out.
1793  */
1794  if (stat_errno == ENOENT)
1795  return false;
1796 
1797  if (unlink(path) < 0)
1798  {
1799  if (errno != ENOENT)
1800  ereport(error_on_failure ? ERROR : LOG,
1802  errmsg("could not unlink temporary file \"%s\": %m",
1803  path)));
1804  return false;
1805  }
1806 
1807  if (stat_errno == 0)
1808  ReportTemporaryFileUsage(path, filestats.st_size);
1809  else
1810  {
1811  errno = stat_errno;
1812  ereport(LOG,
1814  errmsg("could not stat file \"%s\": %m", path)));
1815  }
1816 
1817  return true;
1818 }
1819 
1820 /*
1821  * close a file when done with it
1822  */
1823 void
1825 {
1826  Vfd *vfdP;
1827 
1828  Assert(FileIsValid(file));
1829 
1830  DO_DB(elog(LOG, "FileClose: %d (%s)",
1831  file, VfdCache[file].fileName));
1832 
1833  vfdP = &VfdCache[file];
1834 
1835  if (!FileIsNotOpen(file))
1836  {
1837  /* close the file */
1838  if (close(vfdP->fd) != 0)
1839  {
1840  /*
1841  * We may need to panic on failure to close non-temporary files;
1842  * see LruDelete.
1843  */
1845  "could not close file \"%s\": %m", vfdP->fileName);
1846  }
1847 
1848  --nfile;
1849  vfdP->fd = VFD_CLOSED;
1850 
1851  /* remove the file from the lru ring */
1852  Delete(file);
1853  }
1854 
1855  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1856  {
1857  /* Subtract its size from current usage (do first in case of error) */
1858  temporary_files_size -= vfdP->fileSize;
1859  vfdP->fileSize = 0;
1860  }
1861 
1862  /*
1863  * Delete the file if it was temporary, and make a log entry if wanted
1864  */
1865  if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
1866  {
1867  struct stat filestats;
1868  int stat_errno;
1869 
1870  /*
1871  * If we get an error, as could happen within the ereport/elog calls,
1872  * we'll come right back here during transaction abort. Reset the
1873  * flag to ensure that we can't get into an infinite loop. This code
1874  * is arranged to ensure that the worst-case consequence is failing to
1875  * emit log message(s), not failing to attempt the unlink.
1876  */
1877  vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
1878 
1879 
1880  /* first try the stat() */
1881  if (stat(vfdP->fileName, &filestats))
1882  stat_errno = errno;
1883  else
1884  stat_errno = 0;
1885 
1886  /* in any case do the unlink */
1887  if (unlink(vfdP->fileName))
1888  elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
1889 
1890  /* and last report the stat results */
1891  if (stat_errno == 0)
1892  ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
1893  else
1894  {
1895  errno = stat_errno;
1896  elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
1897  }
1898  }
1899 
1900  /* Unregister it from the resource owner */
1901  if (vfdP->resowner)
1902  ResourceOwnerForgetFile(vfdP->resowner, file);
1903 
1904  /*
1905  * Return the Vfd slot to the free list
1906  */
1907  FreeVfd(file);
1908 }
1909 
1910 /*
1911  * FilePrefetch - initiate asynchronous read of a given range of the file.
1912  *
1913  * Currently the only implementation of this function is using posix_fadvise
1914  * which is the simplest standardized interface that accomplishes this.
1915  * We could add an implementation using libaio in the future; but note that
1916  * this API is inappropriate for libaio, which wants to have a buffer provided
1917  * to read into.
1918  */
1919 int
1920 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1921 {
1922 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1923  int returnCode;
1924 
1925  Assert(FileIsValid(file));
1926 
1927  DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1928  file, VfdCache[file].fileName,
1929  (int64) offset, amount));
1930 
1931  returnCode = FileAccess(file);
1932  if (returnCode < 0)
1933  return returnCode;
1934 
1935  pgstat_report_wait_start(wait_event_info);
1936  returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1937  POSIX_FADV_WILLNEED);
1939 
1940  return returnCode;
1941 #else
1942  Assert(FileIsValid(file));
1943  return 0;
1944 #endif
1945 }
1946 
1947 void
1948 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1949 {
1950  int returnCode;
1951 
1952  Assert(FileIsValid(file));
1953 
1954  DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1955  file, VfdCache[file].fileName,
1956  (int64) offset, (int64) nbytes));
1957 
1958  if (nbytes <= 0)
1959  return;
1960 
1961  returnCode = FileAccess(file);
1962  if (returnCode < 0)
1963  return;
1964 
1965  pgstat_report_wait_start(wait_event_info);
1966  pg_flush_data(VfdCache[file].fd, offset, nbytes);
1968 }
1969 
1970 int
1971 FileRead(File file, char *buffer, int amount, off_t offset,
1972  uint32 wait_event_info)
1973 {
1974  int returnCode;
1975  Vfd *vfdP;
1976 
1977  Assert(FileIsValid(file));
1978 
1979  DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1980  file, VfdCache[file].fileName,
1981  (int64) offset,
1982  amount, buffer));
1983 
1984  returnCode = FileAccess(file);
1985  if (returnCode < 0)
1986  return returnCode;
1987 
1988  vfdP = &VfdCache[file];
1989 
1990 retry:
1991  pgstat_report_wait_start(wait_event_info);
1992  returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
1994 
1995  if (returnCode < 0)
1996  {
1997  /*
1998  * Windows may run out of kernel buffers and return "Insufficient
1999  * system resources" error. Wait a bit and retry to solve it.
2000  *
2001  * It is rumored that EINTR is also possible on some Unix filesystems,
2002  * in which case immediate retry is indicated.
2003  */
2004 #ifdef WIN32
2005  DWORD error = GetLastError();
2006 
2007  switch (error)
2008  {
2009  case ERROR_NO_SYSTEM_RESOURCES:
2010  pg_usleep(1000L);
2011  errno = EINTR;
2012  break;
2013  default:
2014  _dosmaperr(error);
2015  break;
2016  }
2017 #endif
2018  /* OK to retry if interrupted */
2019  if (errno == EINTR)
2020  goto retry;
2021  }
2022 
2023  return returnCode;
2024 }
2025 
2026 int
2027 FileWrite(File file, char *buffer, int amount, off_t offset,
2028  uint32 wait_event_info)
2029 {
2030  int returnCode;
2031  Vfd *vfdP;
2032 
2033  Assert(FileIsValid(file));
2034 
2035  DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
2036  file, VfdCache[file].fileName,
2037  (int64) offset,
2038  amount, buffer));
2039 
2040  returnCode = FileAccess(file);
2041  if (returnCode < 0)
2042  return returnCode;
2043 
2044  vfdP = &VfdCache[file];
2045 
2046  /*
2047  * If enforcing temp_file_limit and it's a temp file, check to see if the
2048  * write would overrun temp_file_limit, and throw error if so. Note: it's
2049  * really a modularity violation to throw error here; we should set errno
2050  * and return -1. However, there's no way to report a suitable error
2051  * message if we do that. All current callers would just throw error
2052  * immediately anyway, so this is safe at present.
2053  */
2054  if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2055  {
2056  off_t past_write = offset + amount;
2057 
2058  if (past_write > vfdP->fileSize)
2059  {
2060  uint64 newTotal = temporary_files_size;
2061 
2062  newTotal += past_write - vfdP->fileSize;
2063  if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2064  ereport(ERROR,
2065  (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2066  errmsg("temporary file size exceeds temp_file_limit (%dkB)",
2067  temp_file_limit)));
2068  }
2069  }
2070 
2071 retry:
2072  errno = 0;
2073  pgstat_report_wait_start(wait_event_info);
2074  returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
2076 
2077  /* if write didn't set errno, assume problem is no disk space */
2078  if (returnCode != amount && errno == 0)
2079  errno = ENOSPC;
2080 
2081  if (returnCode >= 0)
2082  {
2083  /*
2084  * Maintain fileSize and temporary_files_size if it's a temp file.
2085  */
2086  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2087  {
2088  off_t past_write = offset + amount;
2089 
2090  if (past_write > vfdP->fileSize)
2091  {
2092  temporary_files_size += past_write - vfdP->fileSize;
2093  vfdP->fileSize = past_write;
2094  }
2095  }
2096  }
2097  else
2098  {
2099  /*
2100  * See comments in FileRead()
2101  */
2102 #ifdef WIN32
2103  DWORD error = GetLastError();
2104 
2105  switch (error)
2106  {
2107  case ERROR_NO_SYSTEM_RESOURCES:
2108  pg_usleep(1000L);
2109  errno = EINTR;
2110  break;
2111  default:
2112  _dosmaperr(error);
2113  break;
2114  }
2115 #endif
2116  /* OK to retry if interrupted */
2117  if (errno == EINTR)
2118  goto retry;
2119  }
2120 
2121  return returnCode;
2122 }
2123 
2124 int
2125 FileSync(File file, uint32 wait_event_info)
2126 {
2127  int returnCode;
2128 
2129  Assert(FileIsValid(file));
2130 
2131  DO_DB(elog(LOG, "FileSync: %d (%s)",
2132  file, VfdCache[file].fileName));
2133 
2134  returnCode = FileAccess(file);
2135  if (returnCode < 0)
2136  return returnCode;
2137 
2138  pgstat_report_wait_start(wait_event_info);
2139  returnCode = pg_fsync(VfdCache[file].fd);
2141 
2142  return returnCode;
2143 }
2144 
2145 off_t
2147 {
2148  Assert(FileIsValid(file));
2149 
2150  DO_DB(elog(LOG, "FileSize %d (%s)",
2151  file, VfdCache[file].fileName));
2152 
2153  if (FileIsNotOpen(file))
2154  {
2155  if (FileAccess(file) < 0)
2156  return (off_t) -1;
2157  }
2158 
2159  return lseek(VfdCache[file].fd, 0, SEEK_END);
2160 }
2161 
2162 int
2163 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2164 {
2165  int returnCode;
2166 
2167  Assert(FileIsValid(file));
2168 
2169  DO_DB(elog(LOG, "FileTruncate %d (%s)",
2170  file, VfdCache[file].fileName));
2171 
2172  returnCode = FileAccess(file);
2173  if (returnCode < 0)
2174  return returnCode;
2175 
2176  pgstat_report_wait_start(wait_event_info);
2177  returnCode = ftruncate(VfdCache[file].fd, offset);
2179 
2180  if (returnCode == 0 && VfdCache[file].fileSize > offset)
2181  {
2182  /* adjust our state for truncation of a temp file */
2183  Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2184  temporary_files_size -= VfdCache[file].fileSize - offset;
2185  VfdCache[file].fileSize = offset;
2186  }
2187 
2188  return returnCode;
2189 }
2190 
2191 /*
2192  * Return the pathname associated with an open file.
2193  *
2194  * The returned string points to an internal buffer, which is valid until
2195  * the file is closed.
2196  */
2197 char *
2199 {
2200  Assert(FileIsValid(file));
2201 
2202  return VfdCache[file].fileName;
2203 }
2204 
2205 /*
2206  * Return the raw file descriptor of an opened file.
2207  *
2208  * The returned file descriptor will be valid until the file is closed, but
2209  * there are a lot of things that can make that happen. So the caller should
2210  * be careful not to do much of anything else before it finishes using the
2211  * returned file descriptor.
2212  */
2213 int
2215 {
2216  Assert(FileIsValid(file));
2217  return VfdCache[file].fd;
2218 }
2219 
2220 /*
2221  * FileGetRawFlags - returns the file flags on open(2)
2222  */
2223 int
2225 {
2226  Assert(FileIsValid(file));
2227  return VfdCache[file].fileFlags;
2228 }
2229 
2230 /*
2231  * FileGetRawMode - returns the mode bitmask passed to open(2)
2232  */
2233 mode_t
2235 {
2236  Assert(FileIsValid(file));
2237  return VfdCache[file].fileMode;
2238 }
2239 
2240 /*
2241  * Make room for another allocatedDescs[] array entry if needed and possible.
2242  * Returns true if an array element is available.
2243  */
2244 static bool
2246 {
2247  AllocateDesc *newDescs;
2248  int newMax;
2249 
2250  /* Quick out if array already has a free slot. */
2252  return true;
2253 
2254  /*
2255  * If the array hasn't yet been created in the current process, initialize
2256  * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2257  * we will ever need, anyway. We don't want to look at max_safe_fds
2258  * immediately because set_max_safe_fds() may not have run yet.
2259  */
2260  if (allocatedDescs == NULL)
2261  {
2262  newMax = FD_MINFREE / 3;
2263  newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2264  /* Out of memory already? Treat as fatal error. */
2265  if (newDescs == NULL)
2266  ereport(ERROR,
2267  (errcode(ERRCODE_OUT_OF_MEMORY),
2268  errmsg("out of memory")));
2269  allocatedDescs = newDescs;
2270  maxAllocatedDescs = newMax;
2271  return true;
2272  }
2273 
2274  /*
2275  * Consider enlarging the array beyond the initial allocation used above.
2276  * By the time this happens, max_safe_fds should be known accurately.
2277  *
2278  * We mustn't let allocated descriptors hog all the available FDs, and in
2279  * practice we'd better leave a reasonable number of FDs for VFD use. So
2280  * set the maximum to max_safe_fds / 3. (This should certainly be at
2281  * least as large as the initial size, FD_MINFREE / 3, so we aren't
2282  * tightening the restriction here.) Recall that "external" FDs are
2283  * allowed to consume another third of max_safe_fds.
2284  */
2285  newMax = max_safe_fds / 3;
2286  if (newMax > maxAllocatedDescs)
2287  {
2288  newDescs = (AllocateDesc *) realloc(allocatedDescs,
2289  newMax * sizeof(AllocateDesc));
2290  /* Treat out-of-memory as a non-fatal error. */
2291  if (newDescs == NULL)
2292  return false;
2293  allocatedDescs = newDescs;
2294  maxAllocatedDescs = newMax;
2295  return true;
2296  }
2297 
2298  /* Can't enlarge allocatedDescs[] any more. */
2299  return false;
2300 }
2301 
2302 /*
2303  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2304  * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2305  * necessary to open the file. When done, call FreeFile rather than fclose.
2306  *
2307  * Note that files that will be open for any significant length of time
2308  * should NOT be handled this way, since they cannot share kernel file
2309  * descriptors with other files; there is grave risk of running out of FDs
2310  * if anyone locks down too many FDs. Most callers of this routine are
2311  * simply reading a config file that they will read and close immediately.
2312  *
2313  * fd.c will automatically close all files opened with AllocateFile at
2314  * transaction commit or abort; this prevents FD leakage if a routine
2315  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2316  *
2317  * Ideally this should be the *only* direct call of fopen() in the backend.
2318  */
2319 FILE *
2320 AllocateFile(const char *name, const char *mode)
2321 {
2322  FILE *file;
2323 
2324  DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2325  numAllocatedDescs, name));
2326 
2327  /* Can we allocate another non-virtual FD? */
2328  if (!reserveAllocatedDesc())
2329  ereport(ERROR,
2330  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2331  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2332  maxAllocatedDescs, name)));
2333 
2334  /* Close excess kernel FDs. */
2335  ReleaseLruFiles();
2336 
2337 TryAgain:
2338  if ((file = fopen(name, mode)) != NULL)
2339  {
2340  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2341 
2342  desc->kind = AllocateDescFile;
2343  desc->desc.file = file;
2346  return desc->desc.file;
2347  }
2348 
2349  if (errno == EMFILE || errno == ENFILE)
2350  {
2351  int save_errno = errno;
2352 
2353  ereport(LOG,
2354  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2355  errmsg("out of file descriptors: %m; release and retry")));
2356  errno = 0;
2357  if (ReleaseLruFile())
2358  goto TryAgain;
2359  errno = save_errno;
2360  }
2361 
2362  return NULL;
2363 }
2364 
2365 /*
2366  * Open a file with OpenTransientFilePerm() and pass default file mode for
2367  * the fileMode parameter.
2368  */
2369 int
2371 {
2372  return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2373 }
2374 
2375 /*
2376  * Like AllocateFile, but returns an unbuffered fd like open(2)
2377  */
2378 int
2380 {
2381  int fd;
2382 
2383  DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2384  numAllocatedDescs, fileName));
2385 
2386  /* Can we allocate another non-virtual FD? */
2387  if (!reserveAllocatedDesc())
2388  ereport(ERROR,
2389  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2390  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2391  maxAllocatedDescs, fileName)));
2392 
2393  /* Close excess kernel FDs. */
2394  ReleaseLruFiles();
2395 
2396  fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2397 
2398  if (fd >= 0)
2399  {
2400  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2401 
2402  desc->kind = AllocateDescRawFD;
2403  desc->desc.fd = fd;
2406 
2407  return fd;
2408  }
2409 
2410  return -1; /* failure */
2411 }
2412 
2413 /*
2414  * Routines that want to initiate a pipe stream should use OpenPipeStream
2415  * rather than plain popen(). This lets fd.c deal with freeing FDs if
2416  * necessary. When done, call ClosePipeStream rather than pclose.
2417  *
2418  * This function also ensures that the popen'd program is run with default
2419  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2420  * uses. This ensures desirable response to, eg, closing a read pipe early.
2421  */
2422 FILE *
2423 OpenPipeStream(const char *command, const char *mode)
2424 {
2425  FILE *file;
2426  int save_errno;
2427 
2428  DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2429  numAllocatedDescs, command));
2430 
2431  /* Can we allocate another non-virtual FD? */
2432  if (!reserveAllocatedDesc())
2433  ereport(ERROR,
2434  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2435  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2436  maxAllocatedDescs, command)));
2437 
2438  /* Close excess kernel FDs. */
2439  ReleaseLruFiles();
2440 
2441 TryAgain:
2442  fflush(stdout);
2443  fflush(stderr);
2445  errno = 0;
2446  file = popen(command, mode);
2447  save_errno = errno;
2449  errno = save_errno;
2450  if (file != NULL)
2451  {
2452  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2453 
2454  desc->kind = AllocateDescPipe;
2455  desc->desc.file = file;
2458  return desc->desc.file;
2459  }
2460 
2461  if (errno == EMFILE || errno == ENFILE)
2462  {
2463  ereport(LOG,
2464  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2465  errmsg("out of file descriptors: %m; release and retry")));
2466  if (ReleaseLruFile())
2467  goto TryAgain;
2468  errno = save_errno;
2469  }
2470 
2471  return NULL;
2472 }
2473 
2474 /*
2475  * Free an AllocateDesc of any type.
2476  *
2477  * The argument *must* point into the allocatedDescs[] array.
2478  */
2479 static int
2481 {
2482  int result;
2483 
2484  /* Close the underlying object */
2485  switch (desc->kind)
2486  {
2487  case AllocateDescFile:
2488  result = fclose(desc->desc.file);
2489  break;
2490  case AllocateDescPipe:
2491  result = pclose(desc->desc.file);
2492  break;
2493  case AllocateDescDir:
2494  result = closedir(desc->desc.dir);
2495  break;
2496  case AllocateDescRawFD:
2497  result = close(desc->desc.fd);
2498  break;
2499  default:
2500  elog(ERROR, "AllocateDesc kind not recognized");
2501  result = 0; /* keep compiler quiet */
2502  break;
2503  }
2504 
2505  /* Compact storage in the allocatedDescs array */
2507  *desc = allocatedDescs[numAllocatedDescs];
2508 
2509  return result;
2510 }
2511 
2512 /*
2513  * Close a file returned by AllocateFile.
2514  *
2515  * Note we do not check fclose's return value --- it is up to the caller
2516  * to handle close errors.
2517  */
2518 int
2519 FreeFile(FILE *file)
2520 {
2521  int i;
2522 
2523  DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2524 
2525  /* Remove file from list of allocated files, if it's present */
2526  for (i = numAllocatedDescs; --i >= 0;)
2527  {
2528  AllocateDesc *desc = &allocatedDescs[i];
2529 
2530  if (desc->kind == AllocateDescFile && desc->desc.file == file)
2531  return FreeDesc(desc);
2532  }
2533 
2534  /* Only get here if someone passes us a file not in allocatedDescs */
2535  elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2536 
2537  return fclose(file);
2538 }
2539 
2540 /*
2541  * Close a file returned by OpenTransientFile.
2542  *
2543  * Note we do not check close's return value --- it is up to the caller
2544  * to handle close errors.
2545  */
2546 int
2548 {
2549  int i;
2550 
2551  DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2552 
2553  /* Remove fd from list of allocated files, if it's present */
2554  for (i = numAllocatedDescs; --i >= 0;)
2555  {
2556  AllocateDesc *desc = &allocatedDescs[i];
2557 
2558  if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2559  return FreeDesc(desc);
2560  }
2561 
2562  /* Only get here if someone passes us a file not in allocatedDescs */
2563  elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2564 
2565  return close(fd);
2566 }
2567 
2568 /*
2569  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2570  * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2571  * necessary to open the directory, and with closing it after an elog.
2572  * When done, call FreeDir rather than closedir.
2573  *
2574  * Returns NULL, with errno set, on failure. Note that failure detection
2575  * is commonly left to the following call of ReadDir or ReadDirExtended;
2576  * see the comments for ReadDir.
2577  *
2578  * Ideally this should be the *only* direct call of opendir() in the backend.
2579  */
2580 DIR *
2581 AllocateDir(const char *dirname)
2582 {
2583  DIR *dir;
2584 
2585  DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2586  numAllocatedDescs, dirname));
2587 
2588  /* Can we allocate another non-virtual FD? */
2589  if (!reserveAllocatedDesc())
2590  ereport(ERROR,
2591  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2592  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2593  maxAllocatedDescs, dirname)));
2594 
2595  /* Close excess kernel FDs. */
2596  ReleaseLruFiles();
2597 
2598 TryAgain:
2599  if ((dir = opendir(dirname)) != NULL)
2600  {
2601  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2602 
2603  desc->kind = AllocateDescDir;
2604  desc->desc.dir = dir;
2607  return desc->desc.dir;
2608  }
2609 
2610  if (errno == EMFILE || errno == ENFILE)
2611  {
2612  int save_errno = errno;
2613 
2614  ereport(LOG,
2615  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2616  errmsg("out of file descriptors: %m; release and retry")));
2617  errno = 0;
2618  if (ReleaseLruFile())
2619  goto TryAgain;
2620  errno = save_errno;
2621  }
2622 
2623  return NULL;
2624 }
2625 
2626 /*
2627  * Read a directory opened with AllocateDir, ereport'ing any error.
2628  *
2629  * This is easier to use than raw readdir() since it takes care of some
2630  * otherwise rather tedious and error-prone manipulation of errno. Also,
2631  * if you are happy with a generic error message for AllocateDir failure,
2632  * you can just do
2633  *
2634  * dir = AllocateDir(path);
2635  * while ((dirent = ReadDir(dir, path)) != NULL)
2636  * process dirent;
2637  * FreeDir(dir);
2638  *
2639  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2640  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2641  * use this shortcut.)
2642  *
2643  * The pathname passed to AllocateDir must be passed to this routine too,
2644  * but it is only used for error reporting.
2645  */
2646 struct dirent *
2647 ReadDir(DIR *dir, const char *dirname)
2648 {
2649  return ReadDirExtended(dir, dirname, ERROR);
2650 }
2651 
2652 /*
2653  * Alternate version of ReadDir that allows caller to specify the elevel
2654  * for any error report (whether it's reporting an initial failure of
2655  * AllocateDir or a subsequent directory read failure).
2656  *
2657  * If elevel < ERROR, returns NULL after any error. With the normal coding
2658  * pattern, this will result in falling out of the loop immediately as
2659  * though the directory contained no (more) entries.
2660  */
2661 struct dirent *
2662 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2663 {
2664  struct dirent *dent;
2665 
2666  /* Give a generic message for AllocateDir failure, if caller didn't */
2667  if (dir == NULL)
2668  {
2669  ereport(elevel,
2671  errmsg("could not open directory \"%s\": %m",
2672  dirname)));
2673  return NULL;
2674  }
2675 
2676  errno = 0;
2677  if ((dent = readdir(dir)) != NULL)
2678  return dent;
2679 
2680  if (errno)
2681  ereport(elevel,
2683  errmsg("could not read directory \"%s\": %m",
2684  dirname)));
2685  return NULL;
2686 }
2687 
2688 /*
2689  * Close a directory opened with AllocateDir.
2690  *
2691  * Returns closedir's return value (with errno set if it's not 0).
2692  * Note we do not check the return value --- it is up to the caller
2693  * to handle close errors if wanted.
2694  *
2695  * Does nothing if dir == NULL; we assume that directory open failure was
2696  * already reported if desired.
2697  */
2698 int
2700 {
2701  int i;
2702 
2703  /* Nothing to do if AllocateDir failed */
2704  if (dir == NULL)
2705  return 0;
2706 
2707  DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2708 
2709  /* Remove dir from list of allocated dirs, if it's present */
2710  for (i = numAllocatedDescs; --i >= 0;)
2711  {
2712  AllocateDesc *desc = &allocatedDescs[i];
2713 
2714  if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2715  return FreeDesc(desc);
2716  }
2717 
2718  /* Only get here if someone passes us a dir not in allocatedDescs */
2719  elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2720 
2721  return closedir(dir);
2722 }
2723 
2724 
2725 /*
2726  * Close a pipe stream returned by OpenPipeStream.
2727  */
2728 int
2729 ClosePipeStream(FILE *file)
2730 {
2731  int i;
2732 
2733  DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2734 
2735  /* Remove file from list of allocated files, if it's present */
2736  for (i = numAllocatedDescs; --i >= 0;)
2737  {
2738  AllocateDesc *desc = &allocatedDescs[i];
2739 
2740  if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2741  return FreeDesc(desc);
2742  }
2743 
2744  /* Only get here if someone passes us a file not in allocatedDescs */
2745  elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2746 
2747  return pclose(file);
2748 }
2749 
2750 /*
2751  * closeAllVfds
2752  *
2753  * Force all VFDs into the physically-closed state, so that the fewest
2754  * possible number of kernel file descriptors are in use. There is no
2755  * change in the logical state of the VFDs.
2756  */
2757 void
2759 {
2760  Index i;
2761 
2762  if (SizeVfdCache > 0)
2763  {
2764  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2765  for (i = 1; i < SizeVfdCache; i++)
2766  {
2767  if (!FileIsNotOpen(i))
2768  LruDelete(i);
2769  }
2770  }
2771 }
2772 
2773 
2774 /*
2775  * SetTempTablespaces
2776  *
2777  * Define a list (actually an array) of OIDs of tablespaces to use for
2778  * temporary files. This list will be used until end of transaction,
2779  * unless this function is called again before then. It is caller's
2780  * responsibility that the passed-in array has adequate lifespan (typically
2781  * it'd be allocated in TopTransactionContext).
2782  */
2783 void
2784 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2785 {
2786  Assert(numSpaces >= 0);
2787  tempTableSpaces = tableSpaces;
2788  numTempTableSpaces = numSpaces;
2789 
2790  /*
2791  * Select a random starting point in the list. This is to minimize
2792  * conflicts between backends that are most likely sharing the same list
2793  * of temp tablespaces. Note that if we create multiple temp files in the
2794  * same transaction, we'll advance circularly through the list --- this
2795  * ensures that large temporary sort files are nicely spread across all
2796  * available tablespaces.
2797  */
2798  if (numSpaces > 1)
2799  nextTempTableSpace = random() % numSpaces;
2800  else
2801  nextTempTableSpace = 0;
2802 }
2803 
2804 /*
2805  * TempTablespacesAreSet
2806  *
2807  * Returns true if SetTempTablespaces has been called in current transaction.
2808  * (This is just so that tablespaces.c doesn't need its own per-transaction
2809  * state.)
2810  */
2811 bool
2813 {
2814  return (numTempTableSpaces >= 0);
2815 }
2816 
2817 /*
2818  * GetTempTablespaces
2819  *
2820  * Populate an array with the OIDs of the tablespaces that should be used for
2821  * temporary files. Return the number that were copied into the output array.
2822  */
2823 int
2824 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2825 {
2826  int i;
2827 
2829  for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2830  tableSpaces[i] = tempTableSpaces[i];
2831 
2832  return i;
2833 }
2834 
2835 /*
2836  * GetNextTempTableSpace
2837  *
2838  * Select the next temp tablespace to use. A result of InvalidOid means
2839  * to use the current database's default tablespace.
2840  */
2841 Oid
2843 {
2844  if (numTempTableSpaces > 0)
2845  {
2846  /* Advance nextTempTableSpace counter with wraparound */
2848  nextTempTableSpace = 0;
2850  }
2851  return InvalidOid;
2852 }
2853 
2854 
2855 /*
2856  * AtEOSubXact_Files
2857  *
2858  * Take care of subtransaction commit/abort. At abort, we close temp files
2859  * that the subtransaction may have opened. At commit, we reassign the
2860  * files that were opened to the parent subtransaction.
2861  */
2862 void
2863 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2864  SubTransactionId parentSubid)
2865 {
2866  Index i;
2867 
2868  for (i = 0; i < numAllocatedDescs; i++)
2869  {
2870  if (allocatedDescs[i].create_subid == mySubid)
2871  {
2872  if (isCommit)
2873  allocatedDescs[i].create_subid = parentSubid;
2874  else
2875  {
2876  /* have to recheck the item after FreeDesc (ugly) */
2877  FreeDesc(&allocatedDescs[i--]);
2878  }
2879  }
2880  }
2881 }
2882 
2883 /*
2884  * AtEOXact_Files
2885  *
2886  * This routine is called during transaction commit or abort. All still-open
2887  * per-transaction temporary file VFDs are closed, which also causes the
2888  * underlying files to be deleted (although they should've been closed already
2889  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
2890  * closed. We also forget any transaction-local temp tablespace list.
2891  *
2892  * The isCommit flag is used only to decide whether to emit warnings about
2893  * unclosed files.
2894  */
2895 void
2896 AtEOXact_Files(bool isCommit)
2897 {
2898  CleanupTempFiles(isCommit, false);
2899  tempTableSpaces = NULL;
2900  numTempTableSpaces = -1;
2901 }
2902 
2903 /*
2904  * AtProcExit_Files
2905  *
2906  * on_proc_exit hook to clean up temp files during backend shutdown.
2907  * Here, we want to clean up *all* temp files including interXact ones.
2908  */
2909 static void
2911 {
2912  CleanupTempFiles(false, true);
2913 }
2914 
2915 /*
2916  * Close temporary files and delete their underlying files.
2917  *
2918  * isCommit: if true, this is normal transaction commit, and we don't
2919  * expect any remaining files; warn if there are some.
2920  *
2921  * isProcExit: if true, this is being called as the backend process is
2922  * exiting. If that's the case, we should remove all temporary files; if
2923  * that's not the case, we are being called for transaction commit/abort
2924  * and should only remove transaction-local temp files. In either case,
2925  * also clean up "allocated" stdio files, dirs and fds.
2926  */
2927 static void
2928 CleanupTempFiles(bool isCommit, bool isProcExit)
2929 {
2930  Index i;
2931 
2932  /*
2933  * Careful here: at proc_exit we need extra cleanup, not just
2934  * xact_temporary files.
2935  */
2936  if (isProcExit || have_xact_temporary_files)
2937  {
2938  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2939  for (i = 1; i < SizeVfdCache; i++)
2940  {
2941  unsigned short fdstate = VfdCache[i].fdstate;
2942 
2943  if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
2944  VfdCache[i].fileName != NULL)
2945  {
2946  /*
2947  * If we're in the process of exiting a backend process, close
2948  * all temporary files. Otherwise, only close temporary files
2949  * local to the current transaction. They should be closed by
2950  * the ResourceOwner mechanism already, so this is just a
2951  * debugging cross-check.
2952  */
2953  if (isProcExit)
2954  FileClose(i);
2955  else if (fdstate & FD_CLOSE_AT_EOXACT)
2956  {
2957  elog(WARNING,
2958  "temporary file %s not closed at end-of-transaction",
2959  VfdCache[i].fileName);
2960  FileClose(i);
2961  }
2962  }
2963  }
2964 
2965  have_xact_temporary_files = false;
2966  }
2967 
2968  /* Complain if any allocated files remain open at commit. */
2969  if (isCommit && numAllocatedDescs > 0)
2970  elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
2972 
2973  /* Clean up "allocated" stdio files, dirs and fds. */
2974  while (numAllocatedDescs > 0)
2975  FreeDesc(&allocatedDescs[0]);
2976 }
2977 
2978 
2979 /*
2980  * Remove temporary and temporary relation files left over from a prior
2981  * postmaster session
2982  *
2983  * This should be called during postmaster startup. It will forcibly
2984  * remove any leftover files created by OpenTemporaryFile and any leftover
2985  * temporary relation files created by mdcreate.
2986  *
2987  * NOTE: we could, but don't, call this during a post-backend-crash restart
2988  * cycle. The argument for not doing it is that someone might want to examine
2989  * the temp files for debugging purposes. This does however mean that
2990  * OpenTemporaryFile had better allow for collision with an existing temp
2991  * file name.
2992  *
2993  * NOTE: this function and its subroutines generally report syscall failures
2994  * with ereport(LOG) and keep going. Removing temp files is not so critical
2995  * that we should fail to start the database when we can't do it.
2996  */
2997 void
2999 {
3000  char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3001  DIR *spc_dir;
3002  struct dirent *spc_de;
3003 
3004  /*
3005  * First process temp files in pg_default ($PGDATA/base)
3006  */
3007  snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3008  RemovePgTempFilesInDir(temp_path, true, false);
3009  RemovePgTempRelationFiles("base");
3010 
3011  /*
3012  * Cycle through temp directories for all non-default tablespaces.
3013  */
3014  spc_dir = AllocateDir("pg_tblspc");
3015 
3016  while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
3017  {
3018  if (strcmp(spc_de->d_name, ".") == 0 ||
3019  strcmp(spc_de->d_name, "..") == 0)
3020  continue;
3021 
3022  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
3024  RemovePgTempFilesInDir(temp_path, true, false);
3025 
3026  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
3028  RemovePgTempRelationFiles(temp_path);
3029  }
3030 
3031  FreeDir(spc_dir);
3032 
3033  /*
3034  * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3035  * DataDir as well. However, that is *not* cleaned here because doing so
3036  * would create a race condition. It's done separately, earlier in
3037  * postmaster startup.
3038  */
3039 }
3040 
3041 /*
3042  * Process one pgsql_tmp directory for RemovePgTempFiles.
3043  *
3044  * If missing_ok is true, it's all right for the named directory to not exist.
3045  * Any other problem results in a LOG message. (missing_ok should be true at
3046  * the top level, since pgsql_tmp directories are not created until needed.)
3047  *
3048  * At the top level, this should be called with unlink_all = false, so that
3049  * only files matching the temporary name prefix will be unlinked. When
3050  * recursing it will be called with unlink_all = true to unlink everything
3051  * under a top-level temporary directory.
3052  *
3053  * (These two flags could be replaced by one, but it seems clearer to keep
3054  * them separate.)
3055  */
3056 void
3057 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3058 {
3059  DIR *temp_dir;
3060  struct dirent *temp_de;
3061  char rm_path[MAXPGPATH * 2];
3062 
3063  temp_dir = AllocateDir(tmpdirname);
3064 
3065  if (temp_dir == NULL && errno == ENOENT && missing_ok)
3066  return;
3067 
3068  while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3069  {
3070  if (strcmp(temp_de->d_name, ".") == 0 ||
3071  strcmp(temp_de->d_name, "..") == 0)
3072  continue;
3073 
3074  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3075  tmpdirname, temp_de->d_name);
3076 
3077  if (unlink_all ||
3078  strncmp(temp_de->d_name,
3080  strlen(PG_TEMP_FILE_PREFIX)) == 0)
3081  {
3082  struct stat statbuf;
3083 
3084  if (lstat(rm_path, &statbuf) < 0)
3085  {
3086  ereport(LOG,
3088  errmsg("could not stat file \"%s\": %m", rm_path)));
3089  continue;
3090  }
3091 
3092  if (S_ISDIR(statbuf.st_mode))
3093  {
3094  /* recursively remove contents, then directory itself */
3095  RemovePgTempFilesInDir(rm_path, false, true);
3096 
3097  if (rmdir(rm_path) < 0)
3098  ereport(LOG,
3100  errmsg("could not remove directory \"%s\": %m",
3101  rm_path)));
3102  }
3103  else
3104  {
3105  if (unlink(rm_path) < 0)
3106  ereport(LOG,
3108  errmsg("could not remove file \"%s\": %m",
3109  rm_path)));
3110  }
3111  }
3112  else
3113  ereport(LOG,
3114  (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3115  rm_path)));
3116  }
3117 
3118  FreeDir(temp_dir);
3119 }
3120 
3121 /* Process one tablespace directory, look for per-DB subdirectories */
3122 static void
3123 RemovePgTempRelationFiles(const char *tsdirname)
3124 {
3125  DIR *ts_dir;
3126  struct dirent *de;
3127  char dbspace_path[MAXPGPATH * 2];
3128 
3129  ts_dir = AllocateDir(tsdirname);
3130 
3131  while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3132  {
3133  /*
3134  * We're only interested in the per-database directories, which have
3135  * numeric names. Note that this code will also (properly) ignore "."
3136  * and "..".
3137  */
3138  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3139  continue;
3140 
3141  snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3142  tsdirname, de->d_name);
3143  RemovePgTempRelationFilesInDbspace(dbspace_path);
3144  }
3145 
3146  FreeDir(ts_dir);
3147 }
3148 
3149 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3150 static void
3151 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3152 {
3153  DIR *dbspace_dir;
3154  struct dirent *de;
3155  char rm_path[MAXPGPATH * 2];
3156 
3157  dbspace_dir = AllocateDir(dbspacedirname);
3158 
3159  while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3160  {
3161  if (!looks_like_temp_rel_name(de->d_name))
3162  continue;
3163 
3164  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3165  dbspacedirname, de->d_name);
3166 
3167  if (unlink(rm_path) < 0)
3168  ereport(LOG,
3170  errmsg("could not remove file \"%s\": %m",
3171  rm_path)));
3172  }
3173 
3174  FreeDir(dbspace_dir);
3175 }
3176 
/*
 * Does "name" look like a temporary relation file name?
 *
 * Accepted forms are t<digits>_<digits> or t<digits>_<digits>_<forkname>,
 * either of which may be followed by .<digits> (a segment number).
 */
bool
looks_like_temp_rel_name(const char *name)
{
	int			pos;
	int			savepos;

	/* Must start with "t". */
	if (name[0] != 't')
		return false;

	/* Followed by a non-empty string of digits and then an underscore. */
	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (pos == 1 || name[pos] != '_')
		return false;

	/* Followed by another nonempty string of digits. */
	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (savepos == pos)
		return false;

	/* We might have _forkname or .segment or both. */
	if (name[pos] == '_')
	{
		int			forkchar = forkname_chars(&name[pos + 1], NULL);

		if (forkchar <= 0)
			return false;
		/* skip the underscore plus the fork name */
		pos += forkchar + 1;
	}
	if (name[pos] == '.')
	{
		int			segchar;

		/* segchar starts at 1 to step over the dot itself */
		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
			;
		if (segchar <= 1)
			return false;
		pos += segchar;
	}

	/* Now we should be at the end. */
	if (name[pos] != '\0')
		return false;
	return true;
}
3225 
3226 
3227 /*
3228  * Issue fsync recursively on PGDATA and all its contents.
3229  *
3230  * We fsync regular files and directories wherever they are, but we
3231  * follow symlinks only for pg_wal and immediately under pg_tblspc.
3232  * Other symlinks are presumed to point at files we're not responsible
3233  * for fsyncing, and might not have privileges to write at all.
3234  *
3235  * Errors are logged but not considered fatal; that's because this is used
3236  * only during database startup, to deal with the possibility that there are
3237  * issued-but-unsynced writes pending against the data directory. We want to
3238  * ensure that such writes reach disk before anything that's done in the new
3239  * run. However, aborting on error would result in failure to start for
3240  * harmless cases such as read-only files in the data directory, and that's
3241  * not good either.
3242  *
3243  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3244  * rewriting all changes again during recovery.
3245  *
3246  * Note we assume we're chdir'd into PGDATA to begin with.
3247  */
3248 void
3250 {
3251  bool xlog_is_symlink;
3252 
3253  /* We can skip this whole thing if fsync is disabled. */
3254  if (!enableFsync)
3255  return;
3256 
3257  /*
3258  * If pg_wal is a symlink, we'll need to recurse into it separately,
3259  * because the first walkdir below will ignore it.
3260  */
3261  xlog_is_symlink = false;
3262 
3263 #ifndef WIN32
3264  {
3265  struct stat st;
3266 
3267  if (lstat("pg_wal", &st) < 0)
3268  ereport(LOG,
3270  errmsg("could not stat file \"%s\": %m",
3271  "pg_wal")));
3272  else if (S_ISLNK(st.st_mode))
3273  xlog_is_symlink = true;
3274  }
3275 #else
3276  if (pgwin32_is_junction("pg_wal"))
3277  xlog_is_symlink = true;
3278 #endif
3279 
3280  /*
3281  * If possible, hint to the kernel that we're soon going to fsync the data
3282  * directory and its contents. Errors in this step are even less
3283  * interesting than normal, so log them only at DEBUG1.
3284  */
3285 #ifdef PG_FLUSH_DATA_WORKS
3286  walkdir(".", pre_sync_fname, false, DEBUG1);
3287  if (xlog_is_symlink)
3288  walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3289  walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3290 #endif
3291 
3292  /*
3293  * Now we do the fsync()s in the same order.
3294  *
3295  * The main call ignores symlinks, so in addition to specially processing
3296  * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3297  * process_symlinks = true. Note that if there are any plain directories
3298  * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3299  * so we don't worry about optimizing it.
3300  */
3301  walkdir(".", datadir_fsync_fname, false, LOG);
3302  if (xlog_is_symlink)
3303  walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3304  walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3305 }
3306 
3307 /*
3308  * walkdir: recursively walk a directory, applying the action to each
3309  * regular file and directory (including the named directory itself).
3310  *
3311  * If process_symlinks is true, the action and recursion are also applied
3312  * to regular files and directories that are pointed to by symlinks in the
3313  * given directory; otherwise symlinks are ignored. Symlinks are always
3314  * ignored in subdirectories, ie we intentionally don't pass down the
3315  * process_symlinks flag to recursive calls.
3316  *
3317  * Errors are reported at level elevel, which might be ERROR or less.
3318  *
3319  * See also walkdir in file_utils.c, which is a frontend version of this
3320  * logic.
3321  */
3322 static void
3323 walkdir(const char *path,
3324  void (*action) (const char *fname, bool isdir, int elevel),
3325  bool process_symlinks,
3326  int elevel)
3327 {
3328  DIR *dir;
3329  struct dirent *de;
3330 
3331  dir = AllocateDir(path);
3332 
3333  while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3334  {
3335  char subpath[MAXPGPATH * 2];
3336  struct stat fst;
3337  int sret;
3338 
3340 
3341  if (strcmp(de->d_name, ".") == 0 ||
3342  strcmp(de->d_name, "..") == 0)
3343  continue;
3344 
3345  snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3346 
3347  if (process_symlinks)
3348  sret = stat(subpath, &fst);
3349  else
3350  sret = lstat(subpath, &fst);
3351 
3352  if (sret < 0)
3353  {
3354  ereport(elevel,
3356  errmsg("could not stat file \"%s\": %m", subpath)));
3357  continue;
3358  }
3359 
3360  if (S_ISREG(fst.st_mode))
3361  (*action) (subpath, false, elevel);
3362  else if (S_ISDIR(fst.st_mode))
3363  walkdir(subpath, action, false, elevel);
3364  }
3365 
3366  FreeDir(dir); /* we ignore any error here */
3367 
3368  /*
3369  * It's important to fsync the destination directory itself as individual
3370  * file fsyncs don't guarantee that the directory entry for the file is
3371  * synced. However, skip this if AllocateDir failed; the action function
3372  * might not be robust against that.
3373  */
3374  if (dir)
3375  (*action) (path, true, elevel);
3376 }
3377 
3378 
/*
 * pre_sync_fname -- hint to the OS that it should get ready to fsync()
 * this file.
 *
 * Does nothing for directories.  Ignores errors trying to open unreadable
 * files (EACCES), and logs other errors at the caller-specified level
 * elevel.
 *
 * Only compiled when PG_FLUSH_DATA_WORKS is defined.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* Unreadable files are silently skipped; anything else is reported */
		if (errno == EACCES)
			return;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3421 
/*
 * datadir_fsync_fname -- walkdir action that fsyncs one file or directory.
 *
 * Thin adapter around fsync_fname_ext(): errors about unreadable files are
 * to be ignored silently, so ignore_perm is always passed as true.  The
 * result of fsync_fname_ext() is not examined.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	const bool	ignore_perm = true;

	(void) fsync_fname_ext(fname, isdir, ignore_perm, elevel);
}
3431 
/*
 * unlink_if_exists_fname -- action callback that removes one file or
 * directory (same signature as the other walkdir actions).
 *
 * For a directory, rmdir() is attempted; a failure other than ENOENT is
 * reported at elevel.  For a file, removal is delegated to
 * PathNameDeleteTemporaryFile() with error_on_failure = false.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		/* An already-missing directory is not an error */
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}
3448 
3449 /*
3450  * fsync_fname_ext -- Try to fsync a file or directory
3451  *
3452  * If ignore_perm is true, ignore errors upon trying to open unreadable
3453  * files. Logs other errors at a caller-specified level.
3454  *
3455  * Returns 0 if the operation succeeded, -1 otherwise.
3456  */
3457 int
3458 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3459 {
3460  int fd;
3461  int flags;
3462  int returncode;
3463 
3464  /*
3465  * Some OSs require directories to be opened read-only whereas other
3466  * systems don't allow us to fsync files opened read-only; so we need both
3467  * cases here. Using O_RDWR will cause us to fail to fsync files that are
3468  * not writable by our userid, but we assume that's OK.
3469  */
3470  flags = PG_BINARY;
3471  if (!isdir)
3472  flags |= O_RDWR;
3473  else
3474  flags |= O_RDONLY;
3475 
3476  fd = OpenTransientFile(fname, flags);
3477 
3478  /*
3479  * Some OSs don't allow us to open directories at all (Windows returns
3480  * EACCES), just ignore the error in that case. If desired also silently
3481  * ignoring errors about unreadable files. Log others.
3482  */
3483  if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3484  return 0;
3485  else if (fd < 0 && ignore_perm && errno == EACCES)
3486  return 0;
3487  else if (fd < 0)
3488  {
3489  ereport(elevel,
3491  errmsg("could not open file \"%s\": %m", fname)));
3492  return -1;
3493  }
3494 
3495  returncode = pg_fsync(fd);
3496 
3497  /*
3498  * Some OSes don't allow us to fsync directories at all, so we can ignore
3499  * those errors. Anything else needs to be logged.
3500  */
3501  if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3502  {
3503  int save_errno;
3504 
3505  /* close file upon error, might not be in transaction context */
3506  save_errno = errno;
3507  (void) CloseTransientFile(fd);
3508  errno = save_errno;
3509 
3510  ereport(elevel,
3512  errmsg("could not fsync file \"%s\": %m", fname)));
3513  return -1;
3514  }
3515 
3516  if (CloseTransientFile(fd) != 0)
3517  {
3518  ereport(elevel,
3520  errmsg("could not close file \"%s\": %m", fname)));
3521  return -1;
3522  }
3523 
3524  return 0;
3525 }
3526 
3527 /*
3528  * fsync_parent_path -- fsync the parent path of a file or directory
3529  *
3530  * This is aimed at making file operations persistent on disk in case of
3531  * an OS crash or power failure.
3532  */
3533 static int
3534 fsync_parent_path(const char *fname, int elevel)
3535 {
3536  char parentpath[MAXPGPATH];
3537 
3538  strlcpy(parentpath, fname, MAXPGPATH);
3539  get_parent_directory(parentpath);
3540 
3541  /*
3542  * get_parent_directory() returns an empty string if the input argument is
3543  * just a file name (see comments in path.c), so handle that as being the
3544  * current directory.
3545  */
3546  if (strlen(parentpath) == 0)
3547  strlcpy(parentpath, ".", MAXPGPATH);
3548 
3549  if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3550  return -1;
3551 
3552  return 0;
3553 }
3554 
3555 /*
3556  * Create a PostgreSQL data sub-directory
3557  *
3558  * The data directory itself, and most of its sub-directories, are created at
3559  * initdb time, but we do have some occasions when we create directories in
3560  * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3561  * make sure that those directories are created consistently. Today, that means
3562  * making sure that the created directory has the correct permissions, which is
3563  * what pg_dir_create_mode tracks for us.
3564  *
3565  * Note that we also set the umask() based on what we understand the correct
3566  * permissions to be (see file_perm.c).
3567  *
3568  * For permissions other than the default, mkdir() can be used directly, but
3569  * be sure to consider carefully such cases -- a sub-directory with incorrect
3570  * permissions in a PostgreSQL data directory could cause backups and other
3571  * processes to fail.
3572  */
3573 int
3574 MakePGDirectory(const char *directoryName)
3575 {
3576  return mkdir(directoryName, pg_dir_create_mode);
3577 }
3578 
3579 /*
3580  * Return the passed-in error level, or PANIC if data_sync_retry is off.
3581  *
3582  * Failure to fsync any data file is cause for immediate panic, unless
3583  * data_sync_retry is enabled. Data may have been written to the operating
3584  * system and removed from our buffer pool already, and if we are running on
3585  * an operating system that forgets dirty data on write-back failure, there
3586  * may be only one copy of the data remaining: in the WAL. A later attempt to
3587  * fsync again might falsely report success. Therefore we must not allow any
3588  * further checkpoints to be attempted. data_sync_retry can in theory be
3589  * enabled on systems known not to drop dirty buffered data on write-back
3590  * failure (with the likely outcome that checkpoints will continue to fail
3591  * until the underlying problem is fixed).
3592  *
3593  * Any code that reports a failure from fsync() or related functions should
3594  * filter the error level with this function.
3595  */
3596 int
3597 data_sync_elevel(int elevel)
3598 {
3599  return data_sync_retry ? elevel : PANIC;
3600 }
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1447
File lruLessRecently
Definition: fd.c:192
void closeAllVfds(void)
Definition: fd.c:2758
static PgChecksumMode mode
Definition: pg_checksums.c:61
File nextFree
Definition: fd.c:190
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition: fd.c:856
int pg_file_create_mode
Definition: file_perm.c:19
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1778
#define MAP_FAILED
Definition: mem.h:45
#define DEBUG1
Definition: elog.h:25
int MyProcPid
Definition: globals.c:40
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1710
#define NUM_RESERVED_FDS
Definition: fd.c:124
static AllocateDesc * allocatedDescs
Definition: fd.c:253
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1434
int pg_fdatasync(int fd)
Definition: fd.c:433
static void error(void)
Definition: sql-dyntest.c:147
#define SYNC_METHOD_FSYNC_WRITETHROUGH
Definition: xlog.h:28
AllocateDescKind
Definition: fd.c:231
DIR * dir
Definition: fd.c:246
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition: fd.c:1653
static void AtProcExit_Files(int code, Datum arg)
Definition: fd.c:2910
static Size SizeVfdCache
Definition: fd.c:206
#define FD_TEMP_FILE_LIMIT
Definition: fd.c:183
void on_proc_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:305
#define DO_DB(A)
Definition: fd.c:169
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2824
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition: fd.c:3323
long random(void)
Definition: random.c:22
ResourceOwner CurrentResourceOwner
Definition: resowner.c:142
static int numExternalFDs
Definition: fd.c:258
int pg_fsync_writethrough(int fd)
Definition: fd.c:410
int forkname_chars(const char *str, ForkNumber *fork)
Definition: relpath.c:81
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2662
int max_safe_fds
Definition: fd.c:154
#define Min(x, y)
Definition: c.h:920
off_t FileSize(File file)
Definition: fd.c:2146
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:630
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:2379
#define FD_DELETE_AT_CLOSE
Definition: fd.c:181
int log_temp_files
Definition: guc.c:549
mode_t FileGetRawMode(File file)
Definition: fd.c:2234
void _dosmaperr(unsigned long)
Definition: win32error.c:171
static Vfd * VfdCache
Definition: fd.c:205
static void Delete(File file)
Definition: fd.c:1127
int closedir(DIR *)
Definition: dirent.c:113
static int numTempTableSpaces
Definition: fd.c:271
#define PG_TEMP_FILES_DIR
Definition: pg_checksums.c:58
int errcode(int sqlerrcode)
Definition: elog.c:610
#define MemSet(start, val, len)
Definition: c.h:971
void PathNameDeleteTemporaryDir(const char *dirname)
Definition: fd.c:1542
int pg_fsync_no_writethrough(int fd)
Definition: fd.c:398
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition: fd.c:3151
File PathNameOpenTemporaryFile(const char *path)
Definition: fd.c:1748
void pgstat_report_tempfile(size_t filesize)
Definition: pgstat.c:1619
static bool reserveAllocatedDesc(void)
Definition: fd.c:2245
uint32 SubTransactionId
Definition: c.h:517
#define SIGPIPE
Definition: win32_port.h:158
void TempTablespacePath(char *path, Oid tablespace)
Definition: fd.c:1628
#define LOG
Definition: elog.h:26
unsigned int Oid
Definition: postgres_ext.h:31
AllocateDescKind kind
Definition: fd.c:241
char * FilePathName(File file)
Definition: fd.c:2198
Definition: dirent.h:9
#define OidIsValid(objectId)
Definition: c.h:644
#define PANIC
Definition: elog.h:53
#define PG_BINARY
Definition: c.h:1234
static char * basedir
ssize_t pg_pwrite(int fd, const void *buf, size_t nbyte, off_t offset)
Definition: pwrite.c:27
void AtEOXact_Files(bool isCommit)
Definition: fd.c:2896
Oid MyDatabaseTableSpace
Definition: globals.c:87
int ClosePipeStream(FILE *file)
Definition: fd.c:2729
ssize_t pg_pread(int fd, void *buf, size_t nbyte, off_t offset)
Definition: pread.c:27
#define malloc(a)
Definition: header.h:50
static void LruDelete(File file)
Definition: fd.c:1146
void pg_usleep(long microsec)
Definition: signal.c:53
bool TempTablespacesAreSet(void)
Definition: fd.c:2812
#define fsync(fd)
Definition: win32_port.h:62
static int FreeDesc(AllocateDesc *desc)
Definition: fd.c:2480
void pfree(void *pointer)
Definition: mcxt.c:1056
mode_t fileMode
Definition: fd.c:197
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition: fd.c:3123
static bool ReleaseLruFile(void)
Definition: fd.c:1241
Definition: dirent.c:25
int durable_rename_excl(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:783
#define ERROR
Definition: elog.h:43
#define PG_TEMP_FILE_PREFIX
Definition: pg_checksums.c:59
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2370
static int LruInsert(File file)
Definition: fd.c:1194
#define FATAL
Definition: elog.h:52
static bool have_xact_temporary_files
Definition: fd.c:217
#define MAXPGPATH
void ReserveExternalFD(void)
Definition: fd.c:1080
DIR * opendir(const char *)
Definition: dirent.c:33
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2125
#define DEBUG2
Definition: elog.h:24
#define TABLESPACE_VERSION_DIRECTORY
Definition: relpath.h:26
char * fileName
Definition: fd.c:194
static char * buf
Definition: pg_test_fsync.c:67
Oid GetNextTempTableSpace(void)
Definition: fd.c:2842
void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition: resowner.c:1268
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3433
int errdetail(const char *fmt,...)
Definition: elog.c:957
void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition: fd.c:3057
char * tablespace
Definition: pgbench.c:188
int errcode_for_file_access(void)
Definition: elog.c:633
void get_parent_directory(char *path)
Definition: path.c:854
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2320
static int nfile
Definition: fd.c:211
unsigned int uint32
Definition: c.h:367
void SyncDataDirectory(void)
Definition: fd.c:3249
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2581
static int nextTempTableSpace
Definition: fd.c:272
static void pgstat_report_wait_end(void)
Definition: pgstat.h:1380
int max_files_per_process
Definition: fd.c:141
static File AllocateVfd(void)
Definition: fd.c:1273
FILE * OpenPipeStream(const char *command, const char *mode)
Definition: fd.c:2423
unsigned short fdstate
Definition: fd.c:188
Definition: fd.c:185
off_t fileSize
Definition: fd.c:193
int fd
Definition: fd.c:187
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2784
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:656
static void Insert(File file)
Definition: fd.c:1172
ResourceOwner resowner
Definition: fd.c:189
bool data_sync_retry
Definition: fd.c:157
#define S_ISREG(m)
Definition: win32_port.h:299
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3423
int CloseTransientFile(int fd)
Definition: fd.c:2547
#define SIG_IGN
Definition: win32_port.h:150
static void ReportTemporaryFileUsage(const char *path, off_t size)
Definition: fd.c:1387
static void ReleaseLruFiles(void)
Definition: fd.c:1263
#define WARNING
Definition: elog.h:40
#define stat(a, b)
Definition: win32_port.h:255
#define FileIsNotOpen(file)
Definition: fd.c:178
int pg_dir_create_mode
Definition: file_perm.c:18
static int elevel
Definition: vacuumlazy.c:323
int FileWrite(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:2027
struct vfd Vfd
int data_sync_elevel(int elevel)
Definition: fd.c:3597
uintptr_t Datum
Definition: postgres.h:367
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition: fd.c:2863
unsigned int Index
Definition: c.h:475
void pg_flush_data(int fd, off_t offset, off_t nbytes)
Definition: fd.c:453
#define FileIsValid(file)
Definition: fd.c:175
bool AcquireExternalFD(void)
Definition: fd.c:1045
FILE * file
Definition: fd.c:245
#define InvalidOid
Definition: postgres_ext.h:36
#define VFD_CLOSED
Definition: fd.c:173
static uint64 temporary_files_size
Definition: fd.c:225
#define ereport(elevel,...)
Definition: elog.h:144
int MakePGDirectory(const char *directoryName)
Definition: fd.c:3574
pqsigfunc pqsignal(int signum, pqsigfunc handler)
Definition: signal.c:170
#define free(a)
Definition: header.h:65
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static void RegisterTemporaryFile(File file)
Definition: fd.c:1406
void FileClose(File file)
Definition: fd.c:1824
#define SIG_DFL
Definition: win32_port.h:148
int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
Definition: fd.c:1920
static int FileAccess(File file)
Definition: fd.c:1351
#define Assert(condition)
Definition: c.h:738
SubTransactionId GetCurrentSubTransactionId(void)
Definition: xact.c:708
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2647
File lruMoreRecently
Definition: fd.c:191
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:1948
void RemovePgTempFiles(void)
Definition: fd.c:2998
SubTransactionId create_subid
Definition: fd.c:242
File OpenTemporaryFile(bool interXact)
Definition: fd.c:1575
size_t Size
Definition: c.h:466
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: pgstat.h:1356
static const char * directory
Definition: zic.c:622
int sync_method
Definition: xlog.c:105
struct dirent * readdir(DIR *)
Definition: dirent.c:77
#define FD_MINFREE
Definition: fd.c:133
bool looks_like_temp_rel_name(const char *name)
Definition: fd.c:3179
#define realloc(a, b)
Definition: header.h:60
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1069
#define INT64_FORMAT
Definition: c.h:409
const char * name
Definition: encode.c:555
static long tempFileCounter
Definition: fd.c:264
int fd
Definition: fd.c:247
#define S_ISDIR(m)
Definition: win32_port.h:296
#define lstat(path, sb)
Definition: win32_port.h:244
int durable_unlink(const char *fname, int elevel)
Definition: fd.c:746
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:983
int FreeFile(FILE *file)
Definition: fd.c:2519
void set_max_safe_fds(void)
Definition: fd.c:940
union AllocateDesc::@25 desc
bool enableFsync
Definition: globals.c:119
static Oid * tempTableSpaces
Definition: fd.c:270
void ReleaseExternalFD(void)
Definition: fd.c:1098
void * palloc(Size size)
Definition: mcxt.c:949
int errmsg(const char *fmt,...)
Definition: elog.c:824
int FileGetRawFlags(File file)
Definition: fd.c:2224
void ResourceOwnerEnlargeFiles(ResourceOwner owner)
Definition: resowner.c:1257
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1005
#define elog(elevel,...)
Definition: elog.h:214
int i
#define FD_CLOSE_AT_EOXACT
Definition: fd.c:182
void * arg
int FileGetRawDesc(File file)
Definition: fd.c:2214
static void FreeVfd(File file)
Definition: fd.c:1331
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:99
int pg_fsync(int fd)
Definition: fd.c:343
char d_name[MAX_PATH]
Definition: dirent.h:14
#define mkdir(a, b)
Definition: win32_port.h:57
int link(const char *src, const char *dst)
#define close(a)
Definition: win32.h:12
#define EINTR
Definition: win32_port.h:323
int fileFlags
Definition: fd.c:196
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition: fd.c:1511
int FileRead(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:1971
void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition: resowner.c:1277
#define snprintf
Definition: port.h:193
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2163
int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition: fd.c:3458
static int maxAllocatedDescs
Definition: fd.c:252
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition: fd.c:2928
static int fsync_parent_path(const char *fname, int elevel)
Definition: fd.c:3534
int File
Definition: fd.h:49
int FreeDir(DIR *dir)
Definition: fd.c:2699
int temp_file_limit
Definition: guc.c:556
Datum subpath(PG_FUNCTION_ARGS)
Definition: ltree_op.c:241
void InitFileAccess(void)
Definition: fd.c:823
static int numAllocatedDescs
Definition: fd.c:251
bool pgwin32_is_junction(const char *path)
#define ftruncate(a, b)
Definition: win32_port.h:59