PostgreSQL Source Code  git master
fd.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  * Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have. (This is around 1024 on many modern
20  * operating systems, but may be lower on others.)
21  *
22  * VFDs are managed as an LRU pool, with actual OS file descriptors
23  * being opened and closed as needed. Obviously, if a routine is
24  * opened using these interfaces, all subsequent operations must also
25  * be through these interfaces (the File type is not a real file
26  * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
39  * for a long time, like relation files. It is the caller's responsibility
40  * to close them, there is no automatic mechanism in fd.c for that.
41  *
42  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43  * temporary files that have names so that they can be shared between
44  * backends. Such files are automatically closed and count against the
45  * temporary file limit of the backend that creates them, but unlike anonymous
46  * files they are not automatically deleted. See sharedfileset.c for a shared
47  * ownership mechanism that provides automatic cleanup for shared files when
48  * the last of a group of backends detaches.
49  *
50  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52  * They behave like the corresponding native functions, except that the handle
53  * is registered with the current subtransaction, and will be automatically
54  * closed at abort. These are intended mainly for short operations like
55  * reading a configuration file; there is a limit on the number of files that
56  * can be opened using these functions at any one time.
57  *
58  * Finally, BasicOpenFile is just a thin wrapper around open() that can
59  * release file descriptors in use by the virtual file descriptors if
60  * necessary. There is no automatic cleanup of file descriptors returned by
61  * BasicOpenFile, it is solely the caller's responsibility to close the file
62  * descriptor by calling close(2).
63  *
64  *-------------------------------------------------------------------------
65  */
66 
67 #include "postgres.h"
68 
69 #include <sys/file.h>
70 #include <sys/param.h>
71 #include <sys/stat.h>
72 #ifndef WIN32
73 #include <sys/mman.h>
74 #endif
75 #include <limits.h>
76 #include <unistd.h>
77 #include <fcntl.h>
78 #ifdef HAVE_SYS_RESOURCE_H
79 #include <sys/resource.h> /* for getrlimit */
80 #endif
81 
82 #include "miscadmin.h"
83 #include "access/xact.h"
84 #include "access/xlog.h"
85 #include "catalog/pg_tablespace.h"
86 #include "common/file_perm.h"
87 #include "pgstat.h"
88 #include "portability/mem.h"
89 #include "storage/fd.h"
90 #include "storage/ipc.h"
91 #include "utils/guc.h"
92 #include "utils/resowner_private.h"
93 
94 
95 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
96 #if defined(HAVE_SYNC_FILE_RANGE)
97 #define PG_FLUSH_DATA_WORKS 1
98 #elif !defined(WIN32) && defined(MS_ASYNC)
99 #define PG_FLUSH_DATA_WORKS 1
100 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
101 #define PG_FLUSH_DATA_WORKS 1
102 #endif
103 
104 /*
105  * We must leave some file descriptors free for system(), the dynamic loader,
106  * and other code that tries to open files without consulting fd.c. This
107  * is the number left free. (While we can be pretty sure we won't get
108  * EMFILE, there's never any guarantee that we won't get ENFILE due to
109  * other processes chewing up FDs. So it's a bad idea to try to open files
110  * without consulting fd.c. Nonetheless we cannot control all code.)
111  *
112  * Because this is just a fixed setting, we are effectively assuming that
113  * no such code will leave FDs open over the long term; otherwise the slop
114  * is likely to be insufficient. Note in particular that we expect that
115  * loading a shared library does not result in any permanent increase in
116  * the number of open files. (This appears to be true on most if not
117  * all platforms as of Feb 2004.)
118  */
119 #define NUM_RESERVED_FDS 10
120 
121 /*
122  * If we have fewer than this many usable FDs after allowing for the reserved
123  * ones, choke.
124  */
125 #define FD_MINFREE 10
126 
/*
 * A number of platforms allow individual processes to open many more files
 * than they can really support when *many* processes do the same thing.
 * This GUC parameter lets the DBA limit max_safe_fds to something less than
 * what the postmaster's initial probe suggests will work.
 *
 * It is consumed by set_max_safe_fds(), both as the probe limit and as an
 * upper bound on the resulting max_safe_fds.
 */
int			max_files_per_process = 1000;

135 /*
136  * Maximum number of file descriptors to open for either VFD entries or
137  * AllocateFile/AllocateDir/OpenTransientFile operations. This is initialized
138  * to a conservative value, and remains that way indefinitely in bootstrap or
139  * standalone-backend cases. In normal postmaster operation, the postmaster
140  * calls set_max_safe_fds() late in initialization to update the value, and
141  * that value is then inherited by forked subprocesses.
142  *
143  * Note: the value of max_files_per_process is taken into account while
144  * setting this variable, and so need not be tested separately.
145  */
146 int max_safe_fds = 32; /* default if not changed */
147 
148 /* Whether it is safe to continue running after fsync() fails. */
149 bool data_sync_retry = false;
150 
151 /* Debugging.... */
152 
153 #ifdef FDDEBUG
154 #define DO_DB(A) \
155  do { \
156  int _do_db_save_errno = errno; \
157  A; \
158  errno = _do_db_save_errno; \
159  } while (0)
160 #else
161 #define DO_DB(A) \
162  ((void) 0)
163 #endif
164 
165 #define VFD_CLOSED (-1)
166 
167 #define FileIsValid(file) \
168  ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
169 
170 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
171 
172 /* these are the assigned bits in fdstate below: */
173 #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
174 #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
175 #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
176 
177 typedef struct vfd
178 {
179  int fd; /* current FD, or VFD_CLOSED if none */
180  unsigned short fdstate; /* bitflags for VFD's state */
181  ResourceOwner resowner; /* owner, for automatic cleanup */
182  File nextFree; /* link to next free VFD, if in freelist */
183  File lruMoreRecently; /* doubly linked recency-of-use list */
185  off_t fileSize; /* current size of file (0 if not temporary) */
186  char *fileName; /* name of file, or NULL for unused VFD */
187  /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
188  int fileFlags; /* open(2) flags for (re)opening the file */
189  mode_t fileMode; /* mode to pass to open(2) */
190 } Vfd;
191 
192 /*
193  * Virtual File Descriptor array pointer and size. This grows as
194  * needed. 'File' values are indexes into this array.
195  * Note that VfdCache[0] is not a usable VFD, just a list header.
196  */
197 static Vfd *VfdCache;
198 static Size SizeVfdCache = 0;
199 
200 /*
201  * Number of file descriptors known to be in use by VFD entries.
202  */
203 static int nfile = 0;
204 
205 /*
206  * Flag to tell whether it's worth scanning VfdCache looking for temp files
207  * to close
208  */
209 static bool have_xact_temporary_files = false;
210 
211 /*
212  * Tracks the total size of all temporary files. Note: when temp_file_limit
213  * is being enforced, this cannot overflow since the limit cannot be more
214  * than INT_MAX kilobytes. When not enforcing, it could theoretically
215  * overflow, but we don't care.
216  */
217 static uint64 temporary_files_size = 0;
218 
219 /*
220  * List of OS handles opened with AllocateFile, AllocateDir and
221  * OpenTransientFile.
222  */
223 typedef enum
224 {
230 
231 typedef struct
232 {
235  union
236  {
237  FILE *file;
239  int fd;
240  } desc;
241 } AllocateDesc;
242 
243 static int numAllocatedDescs = 0;
244 static int maxAllocatedDescs = 0;
246 
247 /*
248  * Number of temporary files opened during the current session;
249  * this is used in generation of tempfile names.
250  */
251 static long tempFileCounter = 0;
252 
253 /*
254  * Array of OIDs of temp tablespaces. When numTempTableSpaces is -1,
255  * this has not been set in the current transaction.
256  */
257 static Oid *tempTableSpaces = NULL;
258 static int numTempTableSpaces = -1;
259 static int nextTempTableSpace = 0;
260 
261 
262 /*--------------------
263  *
264  * Private Routines
265  *
266  * Delete - delete a file from the Lru ring
267  * LruDelete - remove a file from the Lru ring and close its FD
268  * Insert - put a file at the front of the Lru ring
269  * LruInsert - put a file at the front of the Lru ring and open it
270  * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
271  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
272  * AllocateVfd - grab a free (or new) file record (from VfdArray)
273  * FreeVfd - free a file record
274  *
275  * The Least Recently Used ring is a doubly linked list that begins and
276  * ends on element zero. Element zero is special -- it doesn't represent
277  * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
278  * anchor that shows us the beginning/end of the ring.
279  * Only VFD elements that are currently really open (have an FD assigned) are
280  * in the Lru ring. Elements that are "virtually" open can be recognized
281  * by having a non-null fileName field.
282  *
283  * example:
284  *
285  * /--less----\ /---------\
286  * v \ v \
287  * #0 --more---> LeastRecentlyUsed --more-\ \
288  * ^\ | |
289  * \\less--> MostRecentlyUsedFile <---/ |
290  * \more---/ \--less--/
291  *
292  *--------------------
293  */
294 static void Delete(File file);
295 static void LruDelete(File file);
296 static void Insert(File file);
297 static int LruInsert(File file);
298 static bool ReleaseLruFile(void);
299 static void ReleaseLruFiles(void);
300 static File AllocateVfd(void);
301 static void FreeVfd(File file);
302 
303 static int FileAccess(File file);
304 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
305 static bool reserveAllocatedDesc(void);
306 static int FreeDesc(AllocateDesc *desc);
307 
308 static void AtProcExit_Files(int code, Datum arg);
309 static void CleanupTempFiles(bool isCommit, bool isProcExit);
310 static void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok,
311  bool unlink_all);
312 static void RemovePgTempRelationFiles(const char *tsdirname);
313 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
314 
315 static void walkdir(const char *path,
316  void (*action) (const char *fname, bool isdir, int elevel),
317  bool process_symlinks,
318  int elevel);
319 #ifdef PG_FLUSH_DATA_WORKS
320 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
321 #endif
322 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
323 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
324 
325 static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
326 static int fsync_parent_path(const char *fname, int elevel);
327 
328 
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * Calls pg_fsync_writethrough() when the platform offers a distinct
 * write-through primitive and sync_method selects it; in every other case
 * calls pg_fsync_no_writethrough().  Returns whatever the chosen routine
 * returns.
 */
int
pg_fsync(int fd)
{
	/* #if is to skip the sync_method test if there's no need for it */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
	else
#endif
		return pg_fsync_no_writethrough(fd);
}
343 
344 
345 /*
346  * pg_fsync_no_writethrough --- same as fsync except does nothing if
347  * enableFsync is off
348  */
349 int
351 {
352  if (enableFsync)
353  return fsync(fd);
354  else
355  return 0;
356 }
357 
358 /*
359  * pg_fsync_writethrough
360  */
361 int
363 {
364  if (enableFsync)
365  {
366 #ifdef WIN32
367  return _commit(fd);
368 #elif defined(F_FULLFSYNC)
369  return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
370 #else
371  errno = ENOSYS;
372  return -1;
373 #endif
374  }
375  else
376  return 0;
377 }
378 
379 /*
380  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
381  *
382  * Not all platforms have fdatasync; treat as fsync if not available.
383  */
384 int
386 {
387  if (enableFsync)
388  {
389 #ifdef HAVE_FDATASYNC
390  return fdatasync(fd);
391 #else
392  return fsync(fd);
393 #endif
394  }
395  else
396  return 0;
397 }
398 
399 /*
400  * pg_flush_data --- advise OS that the described dirty data should be flushed
401  *
402  * offset of 0 with nbytes 0 means that the entire file should be flushed
403  */
404 void
405 pg_flush_data(int fd, off_t offset, off_t nbytes)
406 {
407  /*
408  * Right now file flushing is primarily used to avoid making later
409  * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
410  * if fsyncs are disabled - that's a decision we might want to make
411  * configurable at some point.
412  */
413  if (!enableFsync)
414  return;
415 
416  /*
417  * We compile all alternatives that are supported on the current platform,
418  * to find portability problems more easily.
419  */
420 #if defined(HAVE_SYNC_FILE_RANGE)
421  {
422  int rc;
423  static bool not_implemented_by_kernel = false;
424 
425  if (not_implemented_by_kernel)
426  return;
427 
428  /*
429  * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
430  * tells the OS that writeback for the specified blocks should be
431  * started, but that we don't want to wait for completion. Note that
432  * this call might block if too much dirty data exists in the range.
433  * This is the preferable method on OSs supporting it, as it works
434  * reliably when available (contrast to msync()) and doesn't flush out
435  * clean data (like FADV_DONTNEED).
436  */
437  rc = sync_file_range(fd, offset, nbytes,
438  SYNC_FILE_RANGE_WRITE);
439  if (rc != 0)
440  {
441  int elevel;
442 
443  /*
444  * For systems that don't have an implementation of
445  * sync_file_range() such as Windows WSL, generate only one
446  * warning and then suppress all further attempts by this process.
447  */
448  if (errno == ENOSYS)
449  {
450  elevel = WARNING;
451  not_implemented_by_kernel = true;
452  }
453  else
454  elevel = data_sync_elevel(WARNING);
455 
456  ereport(elevel,
458  errmsg("could not flush dirty data: %m")));
459  }
460 
461  return;
462  }
463 #endif
464 #if !defined(WIN32) && defined(MS_ASYNC)
465  {
466  void *p;
467  static int pagesize = 0;
468 
469  /*
470  * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
471  * writeback. On linux it only does so if MS_SYNC is specified, but
472  * then it does the writeback synchronously. Luckily all common linux
473  * systems have sync_file_range(). This is preferable over
474  * FADV_DONTNEED because it doesn't flush out clean data.
475  *
476  * We map the file (mmap()), tell the kernel to sync back the contents
477  * (msync()), and then remove the mapping again (munmap()).
478  */
479 
480  /* mmap() needs actual length if we want to map whole file */
481  if (offset == 0 && nbytes == 0)
482  {
483  nbytes = lseek(fd, 0, SEEK_END);
484  if (nbytes < 0)
485  {
488  errmsg("could not determine dirty data size: %m")));
489  return;
490  }
491  }
492 
493  /*
494  * Some platforms reject partial-page mmap() attempts. To deal with
495  * that, just truncate the request to a page boundary. If any extra
496  * bytes don't get flushed, well, it's only a hint anyway.
497  */
498 
499  /* fetch pagesize only once */
500  if (pagesize == 0)
501  pagesize = sysconf(_SC_PAGESIZE);
502 
503  /* align length to pagesize, dropping any fractional page */
504  if (pagesize > 0)
505  nbytes = (nbytes / pagesize) * pagesize;
506 
507  /* fractional-page request is a no-op */
508  if (nbytes <= 0)
509  return;
510 
511  /*
512  * mmap could well fail, particularly on 32-bit platforms where there
513  * may simply not be enough address space. If so, silently fall
514  * through to the next implementation.
515  */
516  if (nbytes <= (off_t) SSIZE_MAX)
517  p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
518  else
519  p = MAP_FAILED;
520 
521  if (p != MAP_FAILED)
522  {
523  int rc;
524 
525  rc = msync(p, (size_t) nbytes, MS_ASYNC);
526  if (rc != 0)
527  {
530  errmsg("could not flush dirty data: %m")));
531  /* NB: need to fall through to munmap()! */
532  }
533 
534  rc = munmap(p, (size_t) nbytes);
535  if (rc != 0)
536  {
537  /* FATAL error because mapping would remain */
538  ereport(FATAL,
540  errmsg("could not munmap() while flushing data: %m")));
541  }
542 
543  return;
544  }
545  }
546 #endif
547 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
548  {
549  int rc;
550 
551  /*
552  * Signal the kernel that the passed in range should not be cached
553  * anymore. This has the, desired, side effect of writing out dirty
554  * data, and the, undesired, side effect of likely discarding useful
555  * clean cached blocks. For the latter reason this is the least
556  * preferable method.
557  */
558 
559  rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
560 
561  if (rc != 0)
562  {
563  /* don't error out, this is just a performance optimization */
566  errmsg("could not flush dirty data: %m")));
567  }
568 
569  return;
570  }
571 #endif
572 }
573 
574 
575 /*
576  * fsync_fname -- fsync a file or directory, handling errors properly
577  *
578  * Try to fsync a file or directory. When doing the latter, ignore errors that
579  * indicate the OS just doesn't allow/require fsyncing directories.
580  */
581 void
582 fsync_fname(const char *fname, bool isdir)
583 {
584  fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
585 }
586 
587 /*
588  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
589  *
590  * This routine ensures that, after returning, the effect of renaming file
591  * persists in case of a crash. A crash while this routine is running will
592  * leave you with either the pre-existing or the moved file in place of the
593  * new file; no mixed state or truncated files are possible.
594  *
595  * It does so by using fsync on the old filename and the possibly existing
596  * target filename before the rename, and the target file and directory after.
597  *
598  * Note that rename() cannot be used across arbitrary directories, as they
599  * might not be on the same filesystem. Therefore this routine does not
600  * support renaming across directories.
601  *
602  * Log errors with the caller specified severity.
603  *
604  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
605  * valid upon return.
606  */
607 int
608 durable_rename(const char *oldfile, const char *newfile, int elevel)
609 {
610  int fd;
611 
612  /*
613  * First fsync the old and target path (if it exists), to ensure that they
614  * are properly persistent on disk. Syncing the target file is not
615  * strictly necessary, but it makes it easier to reason about crashes;
616  * because it's then guaranteed that either source or target file exists
617  * after a crash.
618  */
619  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
620  return -1;
621 
622  fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
623  if (fd < 0)
624  {
625  if (errno != ENOENT)
626  {
627  ereport(elevel,
629  errmsg("could not open file \"%s\": %m", newfile)));
630  return -1;
631  }
632  }
633  else
634  {
635  if (pg_fsync(fd) != 0)
636  {
637  int save_errno;
638 
639  /* close file upon error, might not be in transaction context */
640  save_errno = errno;
641  CloseTransientFile(fd);
642  errno = save_errno;
643 
644  ereport(elevel,
646  errmsg("could not fsync file \"%s\": %m", newfile)));
647  return -1;
648  }
649 
650  if (CloseTransientFile(fd) != 0)
651  {
652  ereport(elevel,
654  errmsg("could not close file \"%s\": %m", newfile)));
655  return -1;
656  }
657  }
658 
659  /* Time to do the real deal... */
660  if (rename(oldfile, newfile) < 0)
661  {
662  ereport(elevel,
664  errmsg("could not rename file \"%s\" to \"%s\": %m",
665  oldfile, newfile)));
666  return -1;
667  }
668 
669  /*
670  * To guarantee renaming the file is persistent, fsync the file with its
671  * new name, and its containing directory.
672  */
673  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
674  return -1;
675 
676  if (fsync_parent_path(newfile, elevel) != 0)
677  return -1;
678 
679  return 0;
680 }
681 
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * This routine ensures that, after returning, the effect of removing file
 * persists in case of a crash. A crash while this routine is running will
 * leave the system in no mixed state.
 *
 * It does so by using fsync on the parent directory of the file after the
 * actual removal is done.
 *
 * Log errors with the severity specified by caller.
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	if (unlink(fname) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/*
	 * To guarantee that the removal of the file is persistent, fsync its
	 * parent directory.
	 */
	if (fsync_parent_path(fname, elevel) != 0)
		return -1;

	return 0;
}
718 
/*
 * durable_link_or_rename -- rename a file in a durable manner.
 *
 * Similar to durable_rename(), except that this routine tries (but does not
 * guarantee) not to overwrite the target file.
 *
 * Note that a crash in an unfortunate moment can leave you with two links to
 * the target file.
 *
 * Log errors with the caller specified severity.
 *
 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
 * valid upon return.
 */
int
durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * Ensure that, if we crash directly after the rename/link, a file with
	 * valid contents is moved into place.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

#if HAVE_WORKING_LINK
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
	/* the new link exists now; the result of dropping the old name is not checked */
	unlink(oldfile);
#else
	/* XXX: Add racy file existence check? */
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
#endif

	/*
	 * Make change persistent in case of an OS crash, both the new entry and
	 * its parent directory need to be flushed.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0)
		return -1;

	/* Same for parent directory */
	if (fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}
778 
779 /*
780  * InitFileAccess --- initialize this module during backend startup
781  *
782  * This is called during either normal or standalone backend start.
783  * It is *not* called in the postmaster.
784  */
785 void
787 {
788  Assert(SizeVfdCache == 0); /* call me only once */
789 
790  /* initialize cache header entry */
791  VfdCache = (Vfd *) malloc(sizeof(Vfd));
792  if (VfdCache == NULL)
793  ereport(FATAL,
794  (errcode(ERRCODE_OUT_OF_MEMORY),
795  errmsg("out of memory")));
796 
797  MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
798  VfdCache->fd = VFD_CLOSED;
799 
800  SizeVfdCache = 1;
801 
802  /* register proc-exit hook to ensure temp files are dropped at exit */
804 }
805 
806 /*
807  * count_usable_fds --- count how many FDs the system will let us open,
808  * and estimate how many are already open.
809  *
810  * We stop counting if usable_fds reaches max_to_probe. Note: a small
811  * value of max_to_probe might result in an underestimate of already_open;
812  * we must fill in any "gaps" in the set of used FDs before the calculation
813  * of already_open will give the right answer. In practice, max_to_probe
814  * of a couple of dozen should be enough to ensure good results.
815  *
816  * We assume stdin (FD 0) is available for dup'ing
817  */
818 static void
819 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
820 {
821  int *fd;
822  int size;
823  int used = 0;
824  int highestfd = 0;
825  int j;
826 
827 #ifdef HAVE_GETRLIMIT
828  struct rlimit rlim;
829  int getrlimit_status;
830 #endif
831 
832  size = 1024;
833  fd = (int *) palloc(size * sizeof(int));
834 
835 #ifdef HAVE_GETRLIMIT
836 #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
837  getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
838 #else /* but BSD doesn't ... */
839  getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
840 #endif /* RLIMIT_NOFILE */
841  if (getrlimit_status != 0)
842  ereport(WARNING, (errmsg("getrlimit failed: %m")));
843 #endif /* HAVE_GETRLIMIT */
844 
845  /* dup until failure or probe limit reached */
846  for (;;)
847  {
848  int thisfd;
849 
850 #ifdef HAVE_GETRLIMIT
851 
852  /*
853  * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
854  * some platforms
855  */
856  if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
857  break;
858 #endif
859 
860  thisfd = dup(0);
861  if (thisfd < 0)
862  {
863  /* Expect EMFILE or ENFILE, else it's fishy */
864  if (errno != EMFILE && errno != ENFILE)
865  elog(WARNING, "dup(0) failed after %d successes: %m", used);
866  break;
867  }
868 
869  if (used >= size)
870  {
871  size *= 2;
872  fd = (int *) repalloc(fd, size * sizeof(int));
873  }
874  fd[used++] = thisfd;
875 
876  if (highestfd < thisfd)
877  highestfd = thisfd;
878 
879  if (used >= max_to_probe)
880  break;
881  }
882 
883  /* release the files we opened */
884  for (j = 0; j < used; j++)
885  close(fd[j]);
886 
887  pfree(fd);
888 
889  /*
890  * Return results. usable_fds is just the number of successful dups. We
891  * assume that the system limit is highestfd+1 (remember 0 is a legal FD
892  * number) and so already_open is highestfd+1 - usable_fds.
893  */
894  *usable_fds = used;
895  *already_open = highestfd + 1 - used;
896 }
897 
898 /*
899  * set_max_safe_fds
900  * Determine number of file descriptors that fd.c is allowed to use
901  */
902 void
904 {
905  int usable_fds;
906  int already_open;
907 
908  /*----------
909  * We want to set max_safe_fds to
910  * MIN(usable_fds, max_files_per_process - already_open)
911  * less the slop factor for files that are opened without consulting
912  * fd.c. This ensures that we won't exceed either max_files_per_process
913  * or the experimentally-determined EMFILE limit.
914  *----------
915  */
917  &usable_fds, &already_open);
918 
919  max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
920 
921  /*
922  * Take off the FDs reserved for system() etc.
923  */
925 
926  /*
927  * Make sure we still have enough to get by.
928  */
929  if (max_safe_fds < FD_MINFREE)
930  ereport(FATAL,
931  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
932  errmsg("insufficient file descriptors available to start server process"),
933  errdetail("System allows %d, we need at least %d.",
936 
937  elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
938  max_safe_fds, usable_fds, already_open);
939 }
940 
941 /*
942  * Open a file with BasicOpenFilePerm() and pass default file mode for the
943  * fileMode parameter.
944  */
945 int
947 {
948  return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
949 }
950 
951 /*
952  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
953  *
954  * This is exported for use by places that really want a plain kernel FD,
955  * but need to be proof against running out of FDs. Once an FD has been
956  * successfully returned, it is the caller's responsibility to ensure that
957  * it will not be leaked on ereport()! Most users should *not* call this
958  * routine directly, but instead use the VFD abstraction level, which
959  * provides protection against descriptor leaks as well as management of
960  * files that need to be open for more than a short period of time.
961  *
962  * Ideally this should be the *only* direct call of open() in the backend.
963  * In practice, the postmaster calls open() directly, and there are some
964  * direct open() calls done early in backend startup. Those are OK since
965  * this module wouldn't have any open files to close at that point anyway.
966  */
967 int
968 BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
969 {
970  int fd;
971 
972 tryAgain:
973  fd = open(fileName, fileFlags, fileMode);
974 
975  if (fd >= 0)
976  return fd; /* success! */
977 
978  if (errno == EMFILE || errno == ENFILE)
979  {
980  int save_errno = errno;
981 
982  ereport(LOG,
983  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
984  errmsg("out of file descriptors: %m; release and retry")));
985  errno = 0;
986  if (ReleaseLruFile())
987  goto tryAgain;
988  errno = save_errno;
989  }
990 
991  return -1; /* failure */
992 }
993 
#if defined(FDDEBUG)

/*
 * Debugging aid: log the LRU ring as a list of VFD indexes, starting from
 * the entry VfdCache[0].lruLessRecently points at and following
 * lruLessRecently links until the anchor (index 0) is reached.
 */
static void
_dump_lru(void)
{
	char		line[2048];
	int			cur = VfdCache[0].lruLessRecently;
	size_t		used;

	snprintf(line, sizeof(line), "LRU: MOST %d ", cur);
	while (cur != 0)
	{
		cur = VfdCache[cur].lruLessRecently;
		used = strlen(line);
		snprintf(line + used, sizeof(line) - used, "%d ", cur);
	}
	used = strlen(line);
	snprintf(line + used, sizeof(line) - used, "LEAST");
	elog(LOG, "%s", line);
}
#endif							/* FDDEBUG */
1014 
1015 static void
1017 {
1018  Vfd *vfdP;
1019 
1020  Assert(file != 0);
1021 
1022  DO_DB(elog(LOG, "Delete %d (%s)",
1023  file, VfdCache[file].fileName));
1024  DO_DB(_dump_lru());
1025 
1026  vfdP = &VfdCache[file];
1027 
1028  VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1029  VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1030 
1031  DO_DB(_dump_lru());
1032 }
1033 
/*
 * Close the kernel descriptor held by a VFD and take the VFD out of the LRU
 * ring: close() the fd, mark the slot VFD_CLOSED, decrement nfile, then call
 * Delete().  The VFD slot itself stays allocated.
 *
 * NOTE(review): this listing has lost two lines of the function -- the
 * declarator between "static void" and "{" (presumably LruDelete(File file),
 * going by the debug message below and the LruDelete() call sites), and the
 * start of the statement that reports a close() failure.  Restore both from
 * the upstream source before compiling.
 */
1034 static void
1036 {
1037  Vfd *vfdP;
1038 
1039  Assert(file != 0);
1040 
1041  DO_DB(elog(LOG, "LruDelete %d (%s)",
1042  file, VfdCache[file].fileName));
1043 
1044  vfdP = &VfdCache[file];
1045 
1046  /*
1047  * Close the file. We aren't expecting this to fail; if it does, better
1048  * to leak the FD than to mess up our internal state.
1049  */
1050  if (close(vfdP->fd) != 0)
/* NOTE(review): the head of the error-reporting call is missing here. */
1052  "could not close file \"%s\": %m", vfdP->fileName);
1053  vfdP->fd = VFD_CLOSED;
1054  --nfile;
1055 
1056  /* delete the vfd record from the LRU ring */
1057  Delete(file);
1058 }
1059 
1060 static void
1062 {
1063  Vfd *vfdP;
1064 
1065  Assert(file != 0);
1066 
1067  DO_DB(elog(LOG, "Insert %d (%s)",
1068  file, VfdCache[file].fileName));
1069  DO_DB(_dump_lru());
1070 
1071  vfdP = &VfdCache[file];
1072 
1073  vfdP->lruMoreRecently = 0;
1074  vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1075  VfdCache[0].lruLessRecently = file;
1076  VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1077 
1078  DO_DB(_dump_lru());
1079 }
1080 
1081 /* returns 0 on success, -1 on re-open failure (with errno set) */
1082 static int
1084 {
1085  Vfd *vfdP;
1086 
1087  Assert(file != 0);
1088 
1089  DO_DB(elog(LOG, "LruInsert %d (%s)",
1090  file, VfdCache[file].fileName));
1091 
1092  vfdP = &VfdCache[file];
1093 
1094  if (FileIsNotOpen(file))
1095  {
1096  /* Close excess kernel FDs. */
1097  ReleaseLruFiles();
1098 
1099  /*
1100  * The open could still fail for lack of file descriptors, eg due to
1101  * overall system file table being full. So, be prepared to release
1102  * another FD if necessary...
1103  */
1104  vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1105  vfdP->fileMode);
1106  if (vfdP->fd < 0)
1107  {
1108  DO_DB(elog(LOG, "re-open failed: %m"));
1109  return -1;
1110  }
1111  else
1112  {
1113  ++nfile;
1114  }
1115  }
1116 
1117  /*
1118  * put it at the head of the Lru ring
1119  */
1120 
1121  Insert(file);
1122 
1123  return 0;
1124 }
1125 
1126 /*
1127  * Release one kernel FD by closing the least-recently-used VFD.
1128  */
1129 static bool
1131 {
1132  DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1133 
1134  if (nfile > 0)
1135  {
1136  /*
1137  * There are opened files and so there should be at least one used vfd
1138  * in the ring.
1139  */
1140  Assert(VfdCache[0].lruMoreRecently != 0);
1141  LruDelete(VfdCache[0].lruMoreRecently);
1142  return true; /* freed a file */
1143  }
1144  return false; /* no files available to free */
1145 }
1146 
1147 /*
1148  * Release kernel FDs as needed to get under the max_safe_fds limit.
1149  * After calling this, it's OK to try to open another file.
1150  */
1151 static void
1153 {
1154  while (nfile + numAllocatedDescs >= max_safe_fds)
1155  {
1156  if (!ReleaseLruFile())
1157  break;
1158  }
1159 }
1160 
1161 static File
1163 {
1164  Index i;
1165  File file;
1166 
1167  DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1168 
1169  Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1170 
1171  if (VfdCache[0].nextFree == 0)
1172  {
1173  /*
1174  * The free list is empty so it is time to increase the size of the
1175  * array. We choose to double it each time this happens. However,
1176  * there's not much point in starting *real* small.
1177  */
1178  Size newCacheSize = SizeVfdCache * 2;
1179  Vfd *newVfdCache;
1180 
1181  if (newCacheSize < 32)
1182  newCacheSize = 32;
1183 
1184  /*
1185  * Be careful not to clobber VfdCache ptr if realloc fails.
1186  */
1187  newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1188  if (newVfdCache == NULL)
1189  ereport(ERROR,
1190  (errcode(ERRCODE_OUT_OF_MEMORY),
1191  errmsg("out of memory")));
1192  VfdCache = newVfdCache;
1193 
1194  /*
1195  * Initialize the new entries and link them into the free list.
1196  */
1197  for (i = SizeVfdCache; i < newCacheSize; i++)
1198  {
1199  MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1200  VfdCache[i].nextFree = i + 1;
1201  VfdCache[i].fd = VFD_CLOSED;
1202  }
1203  VfdCache[newCacheSize - 1].nextFree = 0;
1204  VfdCache[0].nextFree = SizeVfdCache;
1205 
1206  /*
1207  * Record the new size
1208  */
1209  SizeVfdCache = newCacheSize;
1210  }
1211 
1212  file = VfdCache[0].nextFree;
1213 
1214  VfdCache[0].nextFree = VfdCache[file].nextFree;
1215 
1216  return file;
1217 }
1218 
1219 static void
1221 {
1222  Vfd *vfdP = &VfdCache[file];
1223 
1224  DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1225  file, vfdP->fileName ? vfdP->fileName : ""));
1226 
1227  if (vfdP->fileName != NULL)
1228  {
1229  free(vfdP->fileName);
1230  vfdP->fileName = NULL;
1231  }
1232  vfdP->fdstate = 0x0;
1233 
1234  vfdP->nextFree = VfdCache[0].nextFree;
1235  VfdCache[0].nextFree = file;
1236 }
1237 
1238 /* returns 0 on success, -1 on re-open failure (with errno set) */
1239 static int
1241 {
1242  int returnValue;
1243 
1244  DO_DB(elog(LOG, "FileAccess %d (%s)",
1245  file, VfdCache[file].fileName));
1246 
1247  /*
1248  * Is the file open? If not, open it and put it at the head of the LRU
1249  * ring (possibly closing the least recently used file to get an FD).
1250  */
1251 
1252  if (FileIsNotOpen(file))
1253  {
1254  returnValue = LruInsert(file);
1255  if (returnValue != 0)
1256  return returnValue;
1257  }
1258  else if (VfdCache[0].lruLessRecently != file)
1259  {
1260  /*
1261  * We now know that the file is open and that it is not the last one
1262  * accessed, so we need to move it to the head of the Lru ring.
1263  */
1264 
1265  Delete(file);
1266  Insert(file);
1267  }
1268 
1269  return 0;
1270 }
1271 
1272 /*
1273  * Called whenever a temporary file is deleted to report its size.
1274  */
1275 static void
1276 ReportTemporaryFileUsage(const char *path, off_t size)
1277 {
1278  pgstat_report_tempfile(size);
1279 
1280  if (log_temp_files >= 0)
1281  {
1282  if ((size / 1024) >= log_temp_files)
1283  ereport(LOG,
1284  (errmsg("temporary file: path \"%s\", size %lu",
1285  path, (unsigned long) size)));
1286  }
1287 }
1288 
1289 /*
1290  * Called to register a temporary file for automatic close.
1291  * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1292  * before the file was opened.
1293  */
/*
 * What is visible here: the VFD's resowner is set to CurrentResourceOwner and
 * FD_CLOSE_AT_EOXACT is added to its fdstate.
 *
 * NOTE(review): this listing has dropped three lines of the function -- the
 * declarator (presumably RegisterTemporaryFile(File file), going by the call
 * sites), one statement before the resowner assignment, and one statement
 * after the fdstate update.  Restore them from the upstream source.
 */
1294 static void
1296 {
1298  VfdCache[file].resowner = CurrentResourceOwner;
1299 
1300  /* Backup mechanism for closing at end of xact. */
1301  VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1303 }
1304 
/*
 * FileInvalidate - called when we get a shared invalidation message on some
 * relation.
 *
 * Closes the kernel descriptor behind the VFD, if it has one; the VFD itself
 * remains valid.  Compiled only when NOT_USED is defined.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
    Assert(FileIsValid(file));

    if (FileIsNotOpen(file))
        return;                 /* nothing to close */

    LruDelete(file);
}
#endif
1317 
1318 /*
1319  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1320  * fileMode parameter.
1321  */
1322 File
1324 {
1325  return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1326 }
1327 
1328 /*
1329  * open a file in an arbitrary directory
1330  *
1331  * NB: if the passed pathname is relative (which it usually is),
1332  * it will be interpreted relative to the process' working directory
1333  * (which should always be $PGDATA when this code is running).
1334  */
1335 File
1337 {
1338  char *fnamecopy;
1339  File file;
1340  Vfd *vfdP;
1341 
1342  DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1343  fileName, fileFlags, fileMode));
1344 
1345  /*
1346  * We need a malloc'd copy of the file name; fail cleanly if no room.
1347  */
1348  fnamecopy = strdup(fileName);
1349  if (fnamecopy == NULL)
1350  ereport(ERROR,
1351  (errcode(ERRCODE_OUT_OF_MEMORY),
1352  errmsg("out of memory")));
1353 
1354  file = AllocateVfd();
1355  vfdP = &VfdCache[file];
1356 
1357  /* Close excess kernel FDs. */
1358  ReleaseLruFiles();
1359 
1360  vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1361 
1362  if (vfdP->fd < 0)
1363  {
1364  int save_errno = errno;
1365 
1366  FreeVfd(file);
1367  free(fnamecopy);
1368  errno = save_errno;
1369  return -1;
1370  }
1371  ++nfile;
1372  DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1373  vfdP->fd));
1374 
1375  Insert(file);
1376 
1377  vfdP->fileName = fnamecopy;
1378  /* Saved flags are adjusted to be OK for re-opening file */
1379  vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1380  vfdP->fileMode = fileMode;
1381  vfdP->fileSize = 0;
1382  vfdP->fdstate = 0x0;
1383  vfdP->resowner = NULL;
1384 
1385  return file;
1386 }
1387 
1388 /*
1389  * Create directory 'directory'. If necessary, create 'basedir', which must
1390  * be the directory above it. This is designed for creating the top-level
1391  * temporary directory on demand before creating a directory underneath it.
1392  * Do nothing if the directory already exists.
1393  *
1394  * Directories created within the top-level temporary directory should begin
1395  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1396  * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1397  * that do not need any particular prefix.
1398 */
/*
 * Order of operations, as visible below: try to create 'directory'; if that
 * fails with anything other than EEXIST, try to create 'basedir' (EEXIST
 * tolerated), then try 'directory' once more (EEXIST tolerated).  Any other
 * failure of the last two steps raises ERROR.
 *
 * NOTE(review): this listing has dropped the declarator between "void" and
 * "{" (presumably PathNameCreateTemporaryDir(const char *basedir,
 * const char *directory)), and the first argument line of both ereport()
 * calls.  Restore them from the upstream source.
 */
1399 void
1401 {
1402  if (MakePGDirectory(directory) < 0)
1403  {
1404  if (errno == EEXIST)
1405  return;
1406 
1407  /*
1408  * Failed. Try to create basedir first in case it's missing. Tolerate
1409  * EEXIST to close a race against another process following the same
1410  * algorithm.
1411  */
1412  if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1413  ereport(ERROR,
/* NOTE(review): one ereport() argument line is missing here. */
1415  errmsg("cannot create temporary directory \"%s\": %m",
1416  basedir)));
1417 
1418  /* Try again. */
1419  if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1420  ereport(ERROR,
/* NOTE(review): one ereport() argument line is missing here. */
1422  errmsg("cannot create temporary subdirectory \"%s\": %m",
1423  directory)));
1424  }
1425 }
1426 
1427 /*
1428  * Delete a directory and everything in it, if it exists.
1429  */
1430 void
1431 PathNameDeleteTemporaryDir(const char *dirname)
1432 {
1433  struct stat statbuf;
1434 
1435  /* Silently ignore missing directory. */
1436  if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1437  return;
1438 
1439  /*
1440  * Currently, walkdir doesn't offer a way for our passed in function to
1441  * maintain state. Perhaps it should, so that we could tell the caller
1442  * whether this operation succeeded or failed. Since this operation is
1443  * used in a cleanup path, we wouldn't actually behave differently: we'll
1444  * just log failures.
1445  */
1446  walkdir(dirname, unlink_if_exists_fname, false, LOG);
1447 }
1448 
1449 /*
1450  * Open a temporary file that will disappear when we close it.
1451  *
1452  * This routine takes care of generating an appropriate tempfile name.
1453  * There's no need to pass in fileFlags or fileMode either, since only
1454  * one setting makes any sense for a temp file.
1455  *
1456  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1457  * to ensure it's closed and deleted when it's no longer needed, typically at
1458  * the end-of-transaction. In most cases, you don't want temporary files to
1459  * outlive the transaction that created them, so this should be false -- but
1460  * if you need "somewhat" temporary storage, this might be useful. In either
1461  * case, the file is removed when the File is explicitly closed.
1462  */
/*
 * Flow, as visible below: optionally try the next temp tablespace (only when
 * temp tablespaces are configured and interXact is false), otherwise or on
 * failure create the file elsewhere; then flag the VFD with
 * FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT and, unless interXact, register it
 * via RegisterTemporaryFile().
 *
 * NOTE(review): this listing has dropped the statement controlled by the
 * first "if (!interXact)" and the first two lines of the fallback call under
 * "if (file <= 0)".  Restore them from the upstream source.
 */
1463 File
1464 OpenTemporaryFile(bool interXact)
1465 {
1466  File file = 0;
1467 
1468  /*
1469  * Make sure the current resource owner has space for this File before we
1470  * open it, if we'll be registering it below.
1471  */
1472  if (!interXact)
/* NOTE(review): the statement controlled by this "if" is missing. */
1474 
1475  /*
1476  * If some temp tablespace(s) have been given to us, try to use the next
1477  * one. If a given tablespace can't be found, we silently fall back to
1478  * the database's default tablespace.
1479  *
1480  * BUT: if the temp file is slated to outlive the current transaction,
1481  * force it into the database's default tablespace, so that it will not
1482  * pose a threat to possible tablespace drop attempts.
1483  */
1484  if (numTempTableSpaces > 0 && !interXact)
1485  {
1486  Oid tblspcOid = GetNextTempTableSpace();
1487 
1488  if (OidIsValid(tblspcOid))
1489  file = OpenTemporaryFileInTablespace(tblspcOid, false);
1490  }
1491 
1492  /*
1493  * If not, or if tablespace is bad, create in database's default
1494  * tablespace. MyDatabaseTableSpace should normally be set before we get
1495  * here, but just in case it isn't, fall back to pg_default tablespace.
1496  */
1497  if (file <= 0)
/* NOTE(review): the first two lines of the fallback call are missing. */
1500  DEFAULTTABLESPACE_OID,
1501  true);
1502 
1503  /* Mark it for deletion at close and temporary file size limit */
1504  VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1505 
1506  /* Register it with the current resource owner */
1507  if (!interXact)
1508  RegisterTemporaryFile(file);
1509 
1510  return file;
1511 }
1512 
1513 /*
1514  * Return the path of the temp directory in a given tablespace.
1515  */
/*
 * Writes the result into 'path' with snprintf() bounded by MAXPGPATH, so the
 * caller's buffer must be at least that large.
 *
 * NOTE(review): this listing has dropped the declarator between "void" and
 * "{" (presumably TempTablespacePath(char *path, Oid tablespace), going by
 * the call in OpenTemporaryFileInTablespace), and the final argument line of
 * the second snprintf().  Restore them from the upstream source.
 */
1516 void
1518 {
1519  /*
1520  * Identify the tempfile directory for this tablespace.
1521  *
1522  * If someone tries to specify pg_global, use pg_default instead.
1523  */
1524  if (tablespace == InvalidOid ||
1525  tablespace == DEFAULTTABLESPACE_OID ||
1526  tablespace == GLOBALTABLESPACE_OID)
1527  snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1528  else
1529  {
1530  /* All other tablespaces are accessed via symlinks */
1531  snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1532  tablespace, TABLESPACE_VERSION_DIRECTORY,
/* NOTE(review): the last snprintf() argument line is missing here. */
1534  }
1535 }
1536 
1537 /*
1538  * Open a temporary file in a specific tablespace.
1539  * Subroutine for OpenTemporaryFile, which see for details.
1540  */
1541 static File
1542 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1543 {
1544  char tempdirpath[MAXPGPATH];
1545  char tempfilepath[MAXPGPATH];
1546  File file;
1547 
1548  TempTablespacePath(tempdirpath, tblspcOid);
1549 
1550  /*
1551  * Generate a tempfile name that should be unique within the current
1552  * database instance.
1553  */
1554  snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1555  tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1556 
1557  /*
1558  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1559  * temp file that can be reused.
1560  */
1561  file = PathNameOpenFile(tempfilepath,
1562  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1563  if (file <= 0)
1564  {
1565  /*
1566  * We might need to create the tablespace's tempfile directory, if no
1567  * one has yet done so.
1568  *
1569  * Don't check for an error from MakePGDirectory; it could fail if
1570  * someone else just did the same thing. If it doesn't work then
1571  * we'll bomb out on the second create attempt, instead.
1572  */
1573  (void) MakePGDirectory(tempdirpath);
1574 
1575  file = PathNameOpenFile(tempfilepath,
1576  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1577  if (file <= 0 && rejectError)
1578  elog(ERROR, "could not create temporary file \"%s\": %m",
1579  tempfilepath);
1580  }
1581 
1582  return file;
1583 }
1584 
1585 
1586 /*
1587  * Create a new file. The directory containing it must already exist. Files
1588  * created this way are subject to temp_file_limit and are automatically
1589  * closed at end of transaction, but are not automatically deleted on close
1590  * because they are intended to be shared between cooperating backends.
1591  *
1592  * If the file is inside the top-level temporary directory, its name should
1593  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1594  * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1595  * inside a directory created with PathNameCreateTemporaryDir(), in which case
1596  * the prefix isn't needed.
1597  */
/*
 * path:             file to create (opened O_RDWR | O_CREAT | O_TRUNC).
 * error_on_failure: if true, an open failure raises ERROR; if false, the
 *                   failing result of PathNameOpenFile() is returned.
 *
 * On success the VFD is flagged FD_TEMP_FILE_LIMIT and passed to
 * RegisterTemporaryFile().
 *
 * NOTE(review): this listing has dropped one statement at the top of the
 * body (before the open) and the first argument line of the ereport() call.
 * Restore them from the upstream source.
 */
1598 File
1599 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1600 {
1601  File file;
1602 
/* NOTE(review): a statement is missing here. */
1604 
1605  /*
1606  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1607  * temp file that can be reused.
1608  */
1609  file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1610  if (file <= 0)
1611  {
1612  if (error_on_failure)
1613  ereport(ERROR,
/* NOTE(review): one ereport() argument line is missing here. */
1615  errmsg("could not create temporary file \"%s\": %m",
1616  path)));
1617  else
1618  return file;
1619  }
1620 
1621  /* Mark it for temp_file_limit accounting. */
1622  VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1623 
1624  /* Register it for automatic close. */
1625  RegisterTemporaryFile(file);
1626 
1627  return file;
1628 }
1629 
1630 /*
1631  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1632  * another backend. Files opened this way don't count against the
1633  * temp_file_limit of the caller, are read-only and are automatically closed
1634  * at the end of the transaction but are not deleted on close.
1635  */
/*
 * Returns the File on success.  If the open fails with ENOENT, the failing
 * result of PathNameOpenFile() is returned without raising an error; any
 * other open failure raises ERROR.  Only a successfully opened file is passed
 * to RegisterTemporaryFile().
 *
 * NOTE(review): this listing has dropped one statement at the top of the
 * body (before the open) and the first argument line of the ereport() call.
 * Restore them from the upstream source.
 */
1636 File
1637 PathNameOpenTemporaryFile(const char *path)
1638 {
1639  File file;
1640 
/* NOTE(review): a statement is missing here. */
1642 
1643  /* We open the file read-only. */
1644  file = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
1645 
1646  /* If no such file, then we don't raise an error. */
1647  if (file <= 0 && errno != ENOENT)
1648  ereport(ERROR,
/* NOTE(review): one ereport() argument line is missing here. */
1650  errmsg("could not open temporary file \"%s\": %m",
1651  path)));
1652 
1653  if (file > 0)
1654  {
1655  /* Register it for automatic close. */
1656  RegisterTemporaryFile(file);
1657  }
1658 
1659  return file;
1660 }
1661 
1662 /*
1663  * Delete a file by pathname. Return true if the file existed, false if
1664  * it didn't.
 *
 * More precisely, as the code below shows: false is returned when stat()
 * reports ENOENT, and also whenever unlink() fails.  An unlink() failure
 * other than ENOENT is reported at ERROR if error_on_failure is true, else
 * at LOG.  After a successful unlink the size obtained from stat() is passed
 * to ReportTemporaryFileUsage(); if stat() had failed, that failure is
 * logged instead.
 *
 * NOTE(review): this listing has dropped the first argument line of two
 * ereport() calls in this function.  Restore them from the upstream source.
1665  */
1666 bool
1667 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1668 {
1669  struct stat filestats;
1670  int stat_errno;
1671 
1672  /* Get the final size for pgstat reporting. */
1673  if (stat(path, &filestats) != 0)
1674  stat_errno = errno;
1675  else
1676  stat_errno = 0;
1677 
1678  /*
1679  * Unlike FileClose's automatic file deletion code, we tolerate
1680  * non-existence to support BufFileDeleteShared which doesn't know how
1681  * many segments it has to delete until it runs out.
1682  */
1683  if (stat_errno == ENOENT)
1684  return false;
1685 
1686  if (unlink(path) < 0)
1687  {
1688  if (errno != ENOENT)
1689  ereport(error_on_failure ? ERROR : LOG,
/* NOTE(review): one ereport() argument line is missing here. */
1691  errmsg("cannot unlink temporary file \"%s\": %m",
1692  path)));
1693  return false;
1694  }
1695 
1696  if (stat_errno == 0)
1697  ReportTemporaryFileUsage(path, filestats.st_size);
1698  else
1699  {
/* put the saved stat() error back so that %m reports it */
1700  errno = stat_errno;
1701  ereport(LOG,
/* NOTE(review): one ereport() argument line is missing here. */
1703  errmsg("could not stat file \"%s\": %m", path)));
1704  }
1705 
1706  return true;
1707 }
1708 
1709 /*
1710  * close a file when done with it
1711  */
/*
 * Steps, in the order performed below:
 *   1. if the VFD holds a kernel descriptor: close() it, decrement nfile,
 *      mark the slot VFD_CLOSED and take it out of the LRU ring;
 *   2. if FD_TEMP_FILE_LIMIT is set: subtract the file's recorded size from
 *      temporary_files_size;
 *   3. if FD_DELETE_AT_CLOSE is set: clear the flag, stat() and unlink() the
 *      file, then report its size or log the stat() failure;
 *   4. if a resource owner is recorded: ResourceOwnerForgetFile();
 *   5. FreeVfd().
 *
 * NOTE(review): this listing has dropped the declarator between "void" and
 * "{" (presumably FileClose(File file), going by the debug message), and the
 * head of the statement that reports a close() failure.  Restore both from
 * the upstream source.
 */
1712 void
1714 {
1715  Vfd *vfdP;
1716 
1717  Assert(FileIsValid(file));
1718 
1719  DO_DB(elog(LOG, "FileClose: %d (%s)",
1720  file, VfdCache[file].fileName));
1721 
1722  vfdP = &VfdCache[file];
1723 
1724  if (!FileIsNotOpen(file))
1725  {
1726  /* close the file */
1727  if (close(vfdP->fd) != 0)
1728  {
1729  /*
1730  * We may need to panic on failure to close non-temporary files;
1731  * see LruDelete.
1732  */
/* NOTE(review): the head of the error-reporting call is missing here. */
1734  "could not close file \"%s\": %m", vfdP->fileName);
1735  }
1736 
1737  --nfile;
1738  vfdP->fd = VFD_CLOSED;
1739 
1740  /* remove the file from the lru ring */
1741  Delete(file);
1742  }
1743 
1744  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1745  {
1746  /* Subtract its size from current usage (do first in case of error) */
1747  temporary_files_size -= vfdP->fileSize;
1748  vfdP->fileSize = 0;
1749  }
1750 
1751  /*
1752  * Delete the file if it was temporary, and make a log entry if wanted
1753  */
1754  if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
1755  {
1756  struct stat filestats;
1757  int stat_errno;
1758 
1759  /*
1760  * If we get an error, as could happen within the ereport/elog calls,
1761  * we'll come right back here during transaction abort. Reset the
1762  * flag to ensure that we can't get into an infinite loop. This code
1763  * is arranged to ensure that the worst-case consequence is failing to
1764  * emit log message(s), not failing to attempt the unlink.
1765  */
1766  vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
1767 
1768 
1769  /* first try the stat() */
1770  if (stat(vfdP->fileName, &filestats))
1771  stat_errno = errno;
1772  else
1773  stat_errno = 0;
1774 
1775  /* in any case do the unlink */
1776  if (unlink(vfdP->fileName))
1777  elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
1778 
1779  /* and last report the stat results */
1780  if (stat_errno == 0)
1781  ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
1782  else
1783  {
/* put the saved stat() error back so that %m reports it */
1784  errno = stat_errno;
1785  elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
1786  }
1787  }
1788 
1789  /* Unregister it from the resource owner */
1790  if (vfdP->resowner)
1791  ResourceOwnerForgetFile(vfdP->resowner, file);
1792 
1793  /*
1794  * Return the Vfd slot to the free list
1795  */
1796  FreeVfd(file);
1797 }
1798 
1799 /*
1800  * FilePrefetch - initiate asynchronous read of a given range of the file.
1801  *
1802  * Currently the only implementation of this function is using posix_fadvise
1803  * which is the simplest standardized interface that accomplishes this.
1804  * We could add an implementation using libaio in the future; but note that
1805  * this API is inappropriate for libaio, which wants to have a buffer provided
1806  * to read into.
1807  */
/*
 * Returns the FileAccess() result if the file cannot be made accessible,
 * otherwise the posix_fadvise() result.  When USE_POSIX_FADVISE or
 * POSIX_FADV_WILLNEED is not defined the function does nothing and
 * returns 0.
 *
 * NOTE(review): this listing has dropped one line after the posix_fadvise()
 * call (presumably the counterpart of pgstat_report_wait_start() -- confirm
 * against the upstream source).
 */
1808 int
1809 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1810 {
1811 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1812  int returnCode;
1813 
1814  Assert(FileIsValid(file));
1815 
1816  DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1817  file, VfdCache[file].fileName,
1818  (int64) offset, amount));
1819 
1820  returnCode = FileAccess(file);
1821  if (returnCode < 0)
1822  return returnCode;
1823 
1824  pgstat_report_wait_start(wait_event_info);
1825  returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1826  POSIX_FADV_WILLNEED);
/* NOTE(review): a statement is missing here. */
1828 
1829  return returnCode;
1830 #else
1831  Assert(FileIsValid(file));
1832  return 0;
1833 #endif
1834 }
1835 
/*
 * FileWriteback - pass a byte range of the file to pg_flush_data().
 *
 * Does nothing when nbytes <= 0.  A FileAccess() failure is swallowed
 * silently: the function has no return value and simply returns.
 *
 * NOTE(review): this listing has dropped one line after the pg_flush_data()
 * call (presumably the counterpart of pgstat_report_wait_start() -- confirm
 * against the upstream source).
 */
1836 void
1837 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1838 {
1839  int returnCode;
1840 
1841  Assert(FileIsValid(file));
1842 
1843  DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1844  file, VfdCache[file].fileName,
1845  (int64) offset, (int64) nbytes));
1846 
1847  if (nbytes <= 0)
1848  return;
1849 
1850  returnCode = FileAccess(file);
1851  if (returnCode < 0)
1852  return;
1853 
1854  pgstat_report_wait_start(wait_event_info);
1855  pg_flush_data(VfdCache[file].fd, offset, nbytes);
/* NOTE(review): a statement is missing here. */
1857 }
1858 
/*
 * FileRead - read up to 'amount' bytes at 'offset' into 'buffer' using
 * pg_pread().
 *
 * Returns the FileAccess() result if the file cannot be made accessible,
 * otherwise the pg_pread() result.  A failed read is retried for as long as
 * errno is EINTR; on WIN32, ERROR_NO_SYSTEM_RESOURCES is turned into EINTR
 * after a short sleep so that it is retried as well.
 *
 * NOTE(review): this listing has dropped one line after the pg_pread() call
 * (presumably the counterpart of pgstat_report_wait_start() -- confirm
 * against the upstream source).
 */
1859 int
1860 FileRead(File file, char *buffer, int amount, off_t offset,
1861  uint32 wait_event_info)
1862 {
1863  int returnCode;
1864  Vfd *vfdP;
1865 
1866  Assert(FileIsValid(file));
1867 
1868  DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1869  file, VfdCache[file].fileName,
1870  (int64) offset,
1871  amount, buffer));
1872 
1873  returnCode = FileAccess(file);
1874  if (returnCode < 0)
1875  return returnCode;
1876 
1877  vfdP = &VfdCache[file];
1878 
1879 retry:
1880  pgstat_report_wait_start(wait_event_info);
1881  returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
/* NOTE(review): a statement is missing here. */
1883 
1884  if (returnCode < 0)
1885  {
1886  /*
1887  * Windows may run out of kernel buffers and return "Insufficient
1888  * system resources" error. Wait a bit and retry to solve it.
1889  *
1890  * It is rumored that EINTR is also possible on some Unix filesystems,
1891  * in which case immediate retry is indicated.
1892  */
1893 #ifdef WIN32
1894  DWORD error = GetLastError();
1895 
1896  switch (error)
1897  {
1898  case ERROR_NO_SYSTEM_RESOURCES:
1899  pg_usleep(1000L);
1900  errno = EINTR;
1901  break;
1902  default:
1903  _dosmaperr(error);
1904  break;
1905  }
1906 #endif
1907  /* OK to retry if interrupted */
1908  if (errno == EINTR)
1909  goto retry;
1910  }
1911 
1912  return returnCode;
1913 }
1914 
/*
 * FileWrite - write 'amount' bytes from 'buffer' at 'offset' using
 * pg_pwrite().
 *
 * Returns the FileAccess() result if the file cannot be made accessible,
 * otherwise the pg_pwrite() result.  If the result differs from 'amount' and
 * errno was not set, errno is set to ENOSPC.  A failed write is retried for
 * as long as errno is EINTR (see FileRead for the WIN32 handling).
 *
 * For VFDs flagged FD_TEMP_FILE_LIMIT the function raises ERROR before
 * writing if the write would push total temp file usage past
 * temp_file_limit, and after a non-failing write it updates the VFD's
 * fileSize and temporary_files_size.
 *
 * NOTE(review): this listing has dropped one line after the pg_pwrite() call
 * (presumably the counterpart of pgstat_report_wait_start() -- confirm
 * against the upstream source).
 */
1915 int
1916 FileWrite(File file, char *buffer, int amount, off_t offset,
1917  uint32 wait_event_info)
1918 {
1919  int returnCode;
1920  Vfd *vfdP;
1921 
1922  Assert(FileIsValid(file));
1923 
1924  DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
1925  file, VfdCache[file].fileName,
1926  (int64) offset,
1927  amount, buffer));
1928 
1929  returnCode = FileAccess(file);
1930  if (returnCode < 0)
1931  return returnCode;
1932 
1933  vfdP = &VfdCache[file];
1934 
1935  /*
1936  * If enforcing temp_file_limit and it's a temp file, check to see if the
1937  * write would overrun temp_file_limit, and throw error if so. Note: it's
1938  * really a modularity violation to throw error here; we should set errno
1939  * and return -1. However, there's no way to report a suitable error
1940  * message if we do that. All current callers would just throw error
1941  * immediately anyway, so this is safe at present.
1942  */
1943  if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
1944  {
1945  off_t past_write = offset + amount;
1946 
1947  if (past_write > vfdP->fileSize)
1948  {
1949  uint64 newTotal = temporary_files_size;
1950 
1951  newTotal += past_write - vfdP->fileSize;
/* the limit is multiplied by 1024 before being compared with the byte total */
1952  if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
1953  ereport(ERROR,
1954  (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
1955  errmsg("temporary file size exceeds temp_file_limit (%dkB)",
1956  temp_file_limit)));
1957  }
1958  }
1959 
1960 retry:
/* cleared so that "errno == 0" after the write means errno was not set */
1961  errno = 0;
1962  pgstat_report_wait_start(wait_event_info);
1963  returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
/* NOTE(review): a statement is missing here. */
1965 
1966  /* if write didn't set errno, assume problem is no disk space */
1967  if (returnCode != amount && errno == 0)
1968  errno = ENOSPC;
1969 
1970  if (returnCode >= 0)
1971  {
1972  /*
1973  * Maintain fileSize and temporary_files_size if it's a temp file.
1974  */
1975  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1976  {
1977  off_t past_write = offset + amount;
1978 
1979  if (past_write > vfdP->fileSize)
1980  {
1981  temporary_files_size += past_write - vfdP->fileSize;
1982  vfdP->fileSize = past_write;
1983  }
1984  }
1985  }
1986  else
1987  {
1988  /*
1989  * See comments in FileRead()
1990  */
1991 #ifdef WIN32
1992  DWORD error = GetLastError();
1993 
1994  switch (error)
1995  {
1996  case ERROR_NO_SYSTEM_RESOURCES:
1997  pg_usleep(1000L);
1998  errno = EINTR;
1999  break;
2000  default:
2001  _dosmaperr(error);
2002  break;
2003  }
2004 #endif
2005  /* OK to retry if interrupted */
2006  if (errno == EINTR)
2007  goto retry;
2008  }
2009 
2010  return returnCode;
2011 }
2012 
/*
 * FileSync - call pg_fsync() on the file's kernel descriptor.
 *
 * Returns the FileAccess() result if the file cannot be made accessible,
 * otherwise the pg_fsync() result.
 *
 * NOTE(review): this listing has dropped one line after the pg_fsync() call
 * (presumably the counterpart of pgstat_report_wait_start() -- confirm
 * against the upstream source).
 */
2013 int
2014 FileSync(File file, uint32 wait_event_info)
2015 {
2016  int returnCode;
2017 
2018  Assert(FileIsValid(file));
2019 
2020  DO_DB(elog(LOG, "FileSync: %d (%s)",
2021  file, VfdCache[file].fileName));
2022 
2023  returnCode = FileAccess(file);
2024  if (returnCode < 0)
2025  return returnCode;
2026 
2027  pgstat_report_wait_start(wait_event_info);
2028  returnCode = pg_fsync(VfdCache[file].fd);
/* NOTE(review): a statement is missing here. */
2030 
2031  return returnCode;
2032 }
2033 
2034 off_t
2036 {
2037  Assert(FileIsValid(file));
2038 
2039  DO_DB(elog(LOG, "FileSize %d (%s)",
2040  file, VfdCache[file].fileName));
2041 
2042  if (FileIsNotOpen(file))
2043  {
2044  if (FileAccess(file) < 0)
2045  return (off_t) -1;
2046  }
2047 
2048  return lseek(VfdCache[file].fd, 0, SEEK_END);
2049 }
2050 
/*
 * FileTruncate - ftruncate() the file to 'offset' bytes.
 *
 * Returns the FileAccess() result if the file cannot be made accessible,
 * otherwise the ftruncate() result.  On success, if the VFD's recorded
 * fileSize is larger than 'offset', fileSize and temporary_files_size are
 * reduced to match.
 *
 * NOTE(review): this listing has dropped one line after the ftruncate() call
 * (presumably the counterpart of pgstat_report_wait_start() -- confirm
 * against the upstream source).
 */
2051 int
2052 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2053 {
2054  int returnCode;
2055 
2056  Assert(FileIsValid(file));
2057 
2058  DO_DB(elog(LOG, "FileTruncate %d (%s)",
2059  file, VfdCache[file].fileName));
2060 
2061  returnCode = FileAccess(file);
2062  if (returnCode < 0)
2063  return returnCode;
2064 
2065  pgstat_report_wait_start(wait_event_info);
2066  returnCode = ftruncate(VfdCache[file].fd, offset);
/* NOTE(review): a statement is missing here. */
2068 
2069  if (returnCode == 0 && VfdCache[file].fileSize > offset)
2070  {
2071  /* adjust our state for truncation of a temp file */
2072  Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2073  temporary_files_size -= VfdCache[file].fileSize - offset;
2074  VfdCache[file].fileSize = offset;
2075  }
2076 
2077  return returnCode;
2078 }
2079 
2080 /*
2081  * Return the pathname associated with an open file.
2082  *
2083  * The returned string points to an internal buffer, which is valid until
2084  * the file is closed.
2085  */
/*
 * Returns the VFD's own fileName pointer, not a copy.
 *
 * NOTE(review): this listing has dropped the declarator between "char *" and
 * "{"; presumably FilePathName(File file) -- confirm against the upstream
 * source.
 */
2086 char *
2088 {
2089  Assert(FileIsValid(file));
2090 
2091  return VfdCache[file].fileName;
2092 }
2093 
2094 /*
2095  * Return the raw file descriptor of an opened file.
2096  *
2097  * The returned file descriptor will be valid until the file is closed, but
2098  * there are a lot of things that can make that happen. So the caller should
2099  * be careful not to do much of anything else before it finishes using the
2100  * returned file descriptor.
2101  */
/*
 * Returns the fd field of the VFD as it currently stands; nothing is opened
 * or re-opened here.
 *
 * NOTE(review): this listing has dropped the declarator between "int" and
 * "{"; presumably FileGetRawDesc(File file) -- confirm against the upstream
 * source.
 */
2102 int
2104 {
2105  Assert(FileIsValid(file));
2106  return VfdCache[file].fd;
2107 }
2108 
2109 /*
2110  * FileGetRawFlags - returns the file flags on open(2)
2111  */
2112 int
2114 {
2115  Assert(FileIsValid(file));
2116  return VfdCache[file].fileFlags;
2117 }
2118 
2119 /*
2120  * FileGetRawMode - returns the mode bitmask passed to open(2)
2121  */
2122 mode_t
2124 {
2125  Assert(FileIsValid(file));
2126  return VfdCache[file].fileMode;
2127 }
2128 
2129 /*
2130  * Make room for another allocatedDescs[] array entry if needed and possible.
2131  * Returns true if an array element is available.
2132  */
/*
 * Behaviour visible below: on first use the array is malloc'd with
 * FD_MINFREE / 2 entries (out of memory raises ERROR); later it may be
 * realloc'd up to max_safe_fds / 2 entries (out of memory just returns
 * false).  False is also returned when the array cannot grow any further.
 *
 * NOTE(review): this listing has dropped the declarator between
 * "static bool" and "{" (presumably reserveAllocatedDesc(void), going by the
 * call sites), and the condition guarding the first "return true".  Restore
 * them from the upstream source.
 */
2133 static bool
2135 {
2136  AllocateDesc *newDescs;
2137  int newMax;
2138 
2139  /* Quick out if array already has a free slot. */
/* NOTE(review): the "if" condition for this quick exit is missing. */
2141  return true;
2142 
2143  /*
2144  * If the array hasn't yet been created in the current process, initialize
2145  * it with FD_MINFREE / 2 elements. In many scenarios this is as many as
2146  * we will ever need, anyway. We don't want to look at max_safe_fds
2147  * immediately because set_max_safe_fds() may not have run yet.
2148  */
2149  if (allocatedDescs == NULL)
2150  {
2151  newMax = FD_MINFREE / 2;
2152  newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2153  /* Out of memory already? Treat as fatal error. */
2154  if (newDescs == NULL)
2155  ereport(ERROR,
2156  (errcode(ERRCODE_OUT_OF_MEMORY),
2157  errmsg("out of memory")));
2158  allocatedDescs = newDescs;
2159  maxAllocatedDescs = newMax;
2160  return true;
2161  }
2162 
2163  /*
2164  * Consider enlarging the array beyond the initial allocation used above.
2165  * By the time this happens, max_safe_fds should be known accurately.
2166  *
2167  * We mustn't let allocated descriptors hog all the available FDs, and in
2168  * practice we'd better leave a reasonable number of FDs for VFD use. So
2169  * set the maximum to max_safe_fds / 2. (This should certainly be at
2170  * least as large as the initial size, FD_MINFREE / 2.)
2171  */
2172  newMax = max_safe_fds / 2;
2173  if (newMax > maxAllocatedDescs)
2174  {
2175  newDescs = (AllocateDesc *) realloc(allocatedDescs,
2176  newMax * sizeof(AllocateDesc));
2177  /* Treat out-of-memory as a non-fatal error. */
2178  if (newDescs == NULL)
2179  return false;
2180  allocatedDescs = newDescs;
2181  maxAllocatedDescs = newMax;
2182  return true;
2183  }
2184 
2185  /* Can't enlarge allocatedDescs[] any more. */
2186  return false;
2187 }
2188 
2189 /*
2190  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2191  * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2192  * necessary to open the file. When done, call FreeFile rather than fclose.
2193  *
2194  * Note that files that will be open for any significant length of time
2195  * should NOT be handled this way, since they cannot share kernel file
2196  * descriptors with other files; there is grave risk of running out of FDs
2197  * if anyone locks down too many FDs. Most callers of this routine are
2198  * simply reading a config file that they will read and close immediately.
2199  *
2200  * fd.c will automatically close all files opened with AllocateFile at
2201  * transaction commit or abort; this prevents FD leakage if a routine
2202  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2203  *
2204  * Ideally this should be the *only* direct call of fopen() in the backend.
2205  */
/*
 * name, mode: passed straight to fopen().
 *
 * Returns the FILE pointer on success.  Returns NULL with errno set if
 * fopen() fails; an EMFILE/ENFILE failure is first retried for as long as
 * ReleaseLruFile() manages to close something.  Raises ERROR if no
 * allocatedDescs[] slot can be reserved.
 *
 * NOTE(review): this listing has dropped two lines between the assignment of
 * desc->desc.file and the return in the success path.  Restore them from the
 * upstream source.
 */
2206 FILE *
2207 AllocateFile(const char *name, const char *mode)
2208 {
2209  FILE *file;
2210 
2211  DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2212  numAllocatedDescs, name));
2213 
2214  /* Can we allocate another non-virtual FD? */
2215  if (!reserveAllocatedDesc())
2216  ereport(ERROR,
2217  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2218  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2219  maxAllocatedDescs, name)));
2220 
2221  /* Close excess kernel FDs. */
2222  ReleaseLruFiles();
2223 
2224 TryAgain:
2225  if ((file = fopen(name, mode)) != NULL)
2226  {
2227  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2228 
2229  desc->kind = AllocateDescFile;
2230  desc->desc.file = file;
/* NOTE(review): two statements are missing here. */
2233  return desc->desc.file;
2234  }
2235 
2236  if (errno == EMFILE || errno == ENFILE)
2237  {
/* saved because the calls below may change errno */
2238  int save_errno = errno;
2239 
2240  ereport(LOG,
2241  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2242  errmsg("out of file descriptors: %m; release and retry")));
2243  errno = 0;
2244  if (ReleaseLruFile())
2245  goto TryAgain;
2246  errno = save_errno;
2247  }
2248 
2249  return NULL;
2250 }
2251 
2252 /*
2253  * Open a file with OpenTransientFilePerm() and pass default file mode for
2254  * the fileMode parameter.
2255  */
2256 int
2258 {
2259  return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2260 }
2261 
2262 /*
2263  * Like AllocateFile, but returns an unbuffered fd like open(2)
2264  */
2265 int
2267 {
2268  int fd;
2269 
2270  DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2271  numAllocatedDescs, fileName));
2272 
2273  /* Can we allocate another non-virtual FD? */
2274  if (!reserveAllocatedDesc())
2275  ereport(ERROR,
2276  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2277  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2278  maxAllocatedDescs, fileName)));
2279 
2280  /* Close excess kernel FDs. */
2281  ReleaseLruFiles();
2282 
2283  fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2284 
2285  if (fd >= 0)
2286  {
2287  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2288 
2289  desc->kind = AllocateDescRawFD;
2290  desc->desc.fd = fd;
2293 
2294  return fd;
2295  }
2296 
2297  return -1; /* failure */
2298 }
2299 
2300 /*
2301  * Routines that want to initiate a pipe stream should use OpenPipeStream
2302  * rather than plain popen(). This lets fd.c deal with freeing FDs if
2303  * necessary. When done, call ClosePipeStream rather than pclose.
2304  *
2305  * This function also ensures that the popen'd program is run with default
2306  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2307  * uses. This ensures desirable response to, eg, closing a read pipe early.
2308  */
2309 FILE *
2310 OpenPipeStream(const char *command, const char *mode)
2311 {
2312  FILE *file;
2313  int save_errno;
2314 
2315  DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2316  numAllocatedDescs, command));
2317 
2318  /* Can we allocate another non-virtual FD? */
2319  if (!reserveAllocatedDesc())
2320  ereport(ERROR,
2321  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2322  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2323  maxAllocatedDescs, command)));
2324 
2325  /* Close excess kernel FDs. */
2326  ReleaseLruFiles();
2327 
2328 TryAgain:
2329  fflush(stdout);
2330  fflush(stderr);
2332  errno = 0;
2333  file = popen(command, mode);
2334  save_errno = errno;
2336  errno = save_errno;
2337  if (file != NULL)
2338  {
2339  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2340 
2341  desc->kind = AllocateDescPipe;
2342  desc->desc.file = file;
2345  return desc->desc.file;
2346  }
2347 
2348  if (errno == EMFILE || errno == ENFILE)
2349  {
2350  ereport(LOG,
2351  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2352  errmsg("out of file descriptors: %m; release and retry")));
2353  if (ReleaseLruFile())
2354  goto TryAgain;
2355  errno = save_errno;
2356  }
2357 
2358  return NULL;
2359 }
2360 
2361 /*
2362  * Free an AllocateDesc of any type.
2363  *
2364  * The argument *must* point into the allocatedDescs[] array.
2365  */
2366 static int
2368 {
2369  int result;
2370 
2371  /* Close the underlying object */
2372  switch (desc->kind)
2373  {
2374  case AllocateDescFile:
2375  result = fclose(desc->desc.file);
2376  break;
2377  case AllocateDescPipe:
2378  result = pclose(desc->desc.file);
2379  break;
2380  case AllocateDescDir:
2381  result = closedir(desc->desc.dir);
2382  break;
2383  case AllocateDescRawFD:
2384  result = close(desc->desc.fd);
2385  break;
2386  default:
2387  elog(ERROR, "AllocateDesc kind not recognized");
2388  result = 0; /* keep compiler quiet */
2389  break;
2390  }
2391 
2392  /* Compact storage in the allocatedDescs array */
2394  *desc = allocatedDescs[numAllocatedDescs];
2395 
2396  return result;
2397 }
2398 
2399 /*
2400  * Close a file returned by AllocateFile.
2401  *
2402  * Note we do not check fclose's return value --- it is up to the caller
2403  * to handle close errors.
2404  */
2405 int
2406 FreeFile(FILE *file)
2407 {
2408  int i;
2409 
2410  DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2411 
2412  /* Remove file from list of allocated files, if it's present */
2413  for (i = numAllocatedDescs; --i >= 0;)
2414  {
2415  AllocateDesc *desc = &allocatedDescs[i];
2416 
2417  if (desc->kind == AllocateDescFile && desc->desc.file == file)
2418  return FreeDesc(desc);
2419  }
2420 
2421  /* Only get here if someone passes us a file not in allocatedDescs */
2422  elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2423 
2424  return fclose(file);
2425 }
2426 
2427 /*
2428  * Close a file returned by OpenTransientFile.
2429  *
2430  * Note we do not check close's return value --- it is up to the caller
2431  * to handle close errors.
2432  */
2433 int
2435 {
2436  int i;
2437 
2438  DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2439 
2440  /* Remove fd from list of allocated files, if it's present */
2441  for (i = numAllocatedDescs; --i >= 0;)
2442  {
2443  AllocateDesc *desc = &allocatedDescs[i];
2444 
2445  if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2446  return FreeDesc(desc);
2447  }
2448 
2449  /* Only get here if someone passes us a file not in allocatedDescs */
2450  elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2451 
2452  return close(fd);
2453 }
2454 
2455 /*
2456  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2457  * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2458  * necessary to open the directory, and with closing it after an elog.
2459  * When done, call FreeDir rather than closedir.
2460  *
2461  * Returns NULL, with errno set, on failure. Note that failure detection
2462  * is commonly left to the following call of ReadDir or ReadDirExtended;
2463  * see the comments for ReadDir.
2464  *
2465  * Ideally this should be the *only* direct call of opendir() in the backend.
2466  */
2467 DIR *
2468 AllocateDir(const char *dirname)
2469 {
2470  DIR *dir;
2471 
2472  DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2473  numAllocatedDescs, dirname));
2474 
2475  /* Can we allocate another non-virtual FD? */
2476  if (!reserveAllocatedDesc())
2477  ereport(ERROR,
2478  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2479  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2480  maxAllocatedDescs, dirname)));
2481 
2482  /* Close excess kernel FDs. */
2483  ReleaseLruFiles();
2484 
2485 TryAgain:
2486  if ((dir = opendir(dirname)) != NULL)
2487  {
2488  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2489 
2490  desc->kind = AllocateDescDir;
2491  desc->desc.dir = dir;
2494  return desc->desc.dir;
2495  }
2496 
2497  if (errno == EMFILE || errno == ENFILE)
2498  {
2499  int save_errno = errno;
2500 
2501  ereport(LOG,
2502  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2503  errmsg("out of file descriptors: %m; release and retry")));
2504  errno = 0;
2505  if (ReleaseLruFile())
2506  goto TryAgain;
2507  errno = save_errno;
2508  }
2509 
2510  return NULL;
2511 }
2512 
2513 /*
2514  * Read a directory opened with AllocateDir, ereport'ing any error.
2515  *
2516  * This is easier to use than raw readdir() since it takes care of some
2517  * otherwise rather tedious and error-prone manipulation of errno. Also,
2518  * if you are happy with a generic error message for AllocateDir failure,
2519  * you can just do
2520  *
2521  * dir = AllocateDir(path);
2522  * while ((dirent = ReadDir(dir, path)) != NULL)
2523  * process dirent;
2524  * FreeDir(dir);
2525  *
2526  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2527  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2528  * use this shortcut.)
2529  *
2530  * The pathname passed to AllocateDir must be passed to this routine too,
2531  * but it is only used for error reporting.
2532  */
2533 struct dirent *
2534 ReadDir(DIR *dir, const char *dirname)
2535 {
2536  return ReadDirExtended(dir, dirname, ERROR);
2537 }
2538 
2539 /*
2540  * Alternate version of ReadDir that allows caller to specify the elevel
2541  * for any error report (whether it's reporting an initial failure of
2542  * AllocateDir or a subsequent directory read failure).
2543  *
2544  * If elevel < ERROR, returns NULL after any error. With the normal coding
2545  * pattern, this will result in falling out of the loop immediately as
2546  * though the directory contained no (more) entries.
2547  */
2548 struct dirent *
2549 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2550 {
2551  struct dirent *dent;
2552 
2553  /* Give a generic message for AllocateDir failure, if caller didn't */
2554  if (dir == NULL)
2555  {
2556  ereport(elevel,
2558  errmsg("could not open directory \"%s\": %m",
2559  dirname)));
2560  return NULL;
2561  }
2562 
2563  errno = 0;
2564  if ((dent = readdir(dir)) != NULL)
2565  return dent;
2566 
2567  if (errno)
2568  ereport(elevel,
2570  errmsg("could not read directory \"%s\": %m",
2571  dirname)));
2572  return NULL;
2573 }
2574 
2575 /*
2576  * Close a directory opened with AllocateDir.
2577  *
2578  * Returns closedir's return value (with errno set if it's not 0).
2579  * Note we do not check the return value --- it is up to the caller
2580  * to handle close errors if wanted.
2581  *
2582  * Does nothing if dir == NULL; we assume that directory open failure was
2583  * already reported if desired.
2584  */
2585 int
2587 {
2588  int i;
2589 
2590  /* Nothing to do if AllocateDir failed */
2591  if (dir == NULL)
2592  return 0;
2593 
2594  DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2595 
2596  /* Remove dir from list of allocated dirs, if it's present */
2597  for (i = numAllocatedDescs; --i >= 0;)
2598  {
2599  AllocateDesc *desc = &allocatedDescs[i];
2600 
2601  if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2602  return FreeDesc(desc);
2603  }
2604 
2605  /* Only get here if someone passes us a dir not in allocatedDescs */
2606  elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2607 
2608  return closedir(dir);
2609 }
2610 
2611 
2612 /*
2613  * Close a pipe stream returned by OpenPipeStream.
2614  */
2615 int
2616 ClosePipeStream(FILE *file)
2617 {
2618  int i;
2619 
2620  DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2621 
2622  /* Remove file from list of allocated files, if it's present */
2623  for (i = numAllocatedDescs; --i >= 0;)
2624  {
2625  AllocateDesc *desc = &allocatedDescs[i];
2626 
2627  if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2628  return FreeDesc(desc);
2629  }
2630 
2631  /* Only get here if someone passes us a file not in allocatedDescs */
2632  elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2633 
2634  return pclose(file);
2635 }
2636 
2637 /*
2638  * closeAllVfds
2639  *
2640  * Force all VFDs into the physically-closed state, so that the fewest
2641  * possible number of kernel file descriptors are in use. There is no
2642  * change in the logical state of the VFDs.
2643  */
2644 void
2646 {
2647  Index i;
2648 
2649  if (SizeVfdCache > 0)
2650  {
2651  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2652  for (i = 1; i < SizeVfdCache; i++)
2653  {
2654  if (!FileIsNotOpen(i))
2655  LruDelete(i);
2656  }
2657  }
2658 }
2659 
2660 
2661 /*
2662  * SetTempTablespaces
2663  *
2664  * Define a list (actually an array) of OIDs of tablespaces to use for
2665  * temporary files. This list will be used until end of transaction,
2666  * unless this function is called again before then. It is caller's
2667  * responsibility that the passed-in array has adequate lifespan (typically
2668  * it'd be allocated in TopTransactionContext).
2669  */
2670 void
2671 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2672 {
2673  Assert(numSpaces >= 0);
2674  tempTableSpaces = tableSpaces;
2675  numTempTableSpaces = numSpaces;
2676 
2677  /*
2678  * Select a random starting point in the list. This is to minimize
2679  * conflicts between backends that are most likely sharing the same list
2680  * of temp tablespaces. Note that if we create multiple temp files in the
2681  * same transaction, we'll advance circularly through the list --- this
2682  * ensures that large temporary sort files are nicely spread across all
2683  * available tablespaces.
2684  */
2685  if (numSpaces > 1)
2686  nextTempTableSpace = random() % numSpaces;
2687  else
2688  nextTempTableSpace = 0;
2689 }
2690 
2691 /*
2692  * TempTablespacesAreSet
2693  *
2694  * Returns true if SetTempTablespaces has been called in current transaction.
2695  * (This is just so that tablespaces.c doesn't need its own per-transaction
2696  * state.)
2697  */
2698 bool
2700 {
2701  return (numTempTableSpaces >= 0);
2702 }
2703 
2704 /*
2705  * GetTempTablespaces
2706  *
2707  * Populate an array with the OIDs of the tablespaces that should be used for
2708  * temporary files. Return the number that were copied into the output array.
2709  */
2710 int
2711 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2712 {
2713  int i;
2714 
2716  for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2717  tableSpaces[i] = tempTableSpaces[i];
2718 
2719  return i;
2720 }
2721 
2722 /*
2723  * GetNextTempTableSpace
2724  *
2725  * Select the next temp tablespace to use. A result of InvalidOid means
2726  * to use the current database's default tablespace.
2727  */
2728 Oid
2730 {
2731  if (numTempTableSpaces > 0)
2732  {
2733  /* Advance nextTempTableSpace counter with wraparound */
2735  nextTempTableSpace = 0;
2737  }
2738  return InvalidOid;
2739 }
2740 
2741 
2742 /*
2743  * AtEOSubXact_Files
2744  *
2745  * Take care of subtransaction commit/abort. At abort, we close temp files
2746  * that the subtransaction may have opened. At commit, we reassign the
2747  * files that were opened to the parent subtransaction.
2748  */
2749 void
2750 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2751  SubTransactionId parentSubid)
2752 {
2753  Index i;
2754 
2755  for (i = 0; i < numAllocatedDescs; i++)
2756  {
2757  if (allocatedDescs[i].create_subid == mySubid)
2758  {
2759  if (isCommit)
2760  allocatedDescs[i].create_subid = parentSubid;
2761  else
2762  {
2763  /* have to recheck the item after FreeDesc (ugly) */
2764  FreeDesc(&allocatedDescs[i--]);
2765  }
2766  }
2767  }
2768 }
2769 
2770 /*
2771  * AtEOXact_Files
2772  *
2773  * This routine is called during transaction commit or abort. All still-open
2774  * per-transaction temporary file VFDs are closed, which also causes the
2775  * underlying files to be deleted (although they should've been closed already
2776  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
2777  * closed. We also forget any transaction-local temp tablespace list.
2778  *
2779  * The isCommit flag is used only to decide whether to emit warnings about
2780  * unclosed files.
2781  */
2782 void
2783 AtEOXact_Files(bool isCommit)
2784 {
2785  CleanupTempFiles(isCommit, false);
2786  tempTableSpaces = NULL;
2787  numTempTableSpaces = -1;
2788 }
2789 
2790 /*
2791  * AtProcExit_Files
2792  *
2793  * on_proc_exit hook to clean up temp files during backend shutdown.
2794  * Here, we want to clean up *all* temp files including interXact ones.
2795  */
2796 static void
2798 {
2799  CleanupTempFiles(false, true);
2800 }
2801 
2802 /*
2803  * Close temporary files and delete their underlying files.
2804  *
2805  * isCommit: if true, this is normal transaction commit, and we don't
2806  * expect any remaining files; warn if there are some.
2807  *
2808  * isProcExit: if true, this is being called as the backend process is
2809  * exiting. If that's the case, we should remove all temporary files; if
2810  * that's not the case, we are being called for transaction commit/abort
2811  * and should only remove transaction-local temp files. In either case,
2812  * also clean up "allocated" stdio files, dirs and fds.
2813  */
2814 static void
2815 CleanupTempFiles(bool isCommit, bool isProcExit)
2816 {
2817  Index i;
2818 
2819  /*
2820  * Careful here: at proc_exit we need extra cleanup, not just
2821  * xact_temporary files.
2822  */
2823  if (isProcExit || have_xact_temporary_files)
2824  {
2825  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2826  for (i = 1; i < SizeVfdCache; i++)
2827  {
2828  unsigned short fdstate = VfdCache[i].fdstate;
2829 
2830  if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
2831  VfdCache[i].fileName != NULL)
2832  {
2833  /*
2834  * If we're in the process of exiting a backend process, close
2835  * all temporary files. Otherwise, only close temporary files
2836  * local to the current transaction. They should be closed by
2837  * the ResourceOwner mechanism already, so this is just a
2838  * debugging cross-check.
2839  */
2840  if (isProcExit)
2841  FileClose(i);
2842  else if (fdstate & FD_CLOSE_AT_EOXACT)
2843  {
2844  elog(WARNING,
2845  "temporary file %s not closed at end-of-transaction",
2846  VfdCache[i].fileName);
2847  FileClose(i);
2848  }
2849  }
2850  }
2851 
2852  have_xact_temporary_files = false;
2853  }
2854 
2855  /* Complain if any allocated files remain open at commit. */
2856  if (isCommit && numAllocatedDescs > 0)
2857  elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
2859 
2860  /* Clean up "allocated" stdio files, dirs and fds. */
2861  while (numAllocatedDescs > 0)
2862  FreeDesc(&allocatedDescs[0]);
2863 }
2864 
2865 
2866 /*
2867  * Remove temporary and temporary relation files left over from a prior
2868  * postmaster session
2869  *
2870  * This should be called during postmaster startup. It will forcibly
2871  * remove any leftover files created by OpenTemporaryFile and any leftover
2872  * temporary relation files created by mdcreate.
2873  *
2874  * NOTE: we could, but don't, call this during a post-backend-crash restart
2875  * cycle. The argument for not doing it is that someone might want to examine
2876  * the temp files for debugging purposes. This does however mean that
2877  * OpenTemporaryFile had better allow for collision with an existing temp
2878  * file name.
2879  *
2880  * NOTE: this function and its subroutines generally report syscall failures
2881  * with ereport(LOG) and keep going. Removing temp files is not so critical
2882  * that we should fail to start the database when we can't do it.
2883  */
2884 void
2886 {
2887  char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
2888  DIR *spc_dir;
2889  struct dirent *spc_de;
2890 
2891  /*
2892  * First process temp files in pg_default ($PGDATA/base)
2893  */
2894  snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
2895  RemovePgTempFilesInDir(temp_path, true, false);
2896  RemovePgTempRelationFiles("base");
2897 
2898  /*
2899  * Cycle through temp directories for all non-default tablespaces.
2900  */
2901  spc_dir = AllocateDir("pg_tblspc");
2902 
2903  while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
2904  {
2905  if (strcmp(spc_de->d_name, ".") == 0 ||
2906  strcmp(spc_de->d_name, "..") == 0)
2907  continue;
2908 
2909  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
2911  RemovePgTempFilesInDir(temp_path, true, false);
2912 
2913  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
2915  RemovePgTempRelationFiles(temp_path);
2916  }
2917 
2918  FreeDir(spc_dir);
2919 
2920  /*
2921  * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
2922  * DataDir as well.
2923  */
2924 #ifdef EXEC_BACKEND
2926 #endif
2927 }
2928 
2929 /*
2930  * Process one pgsql_tmp directory for RemovePgTempFiles.
2931  *
2932  * If missing_ok is true, it's all right for the named directory to not exist.
2933  * Any other problem results in a LOG message. (missing_ok should be true at
2934  * the top level, since pgsql_tmp directories are not created until needed.)
2935  *
2936  * At the top level, this should be called with unlink_all = false, so that
2937  * only files matching the temporary name prefix will be unlinked. When
2938  * recursing it will be called with unlink_all = true to unlink everything
2939  * under a top-level temporary directory.
2940  *
2941  * (These two flags could be replaced by one, but it seems clearer to keep
2942  * them separate.)
2943  */
2944 static void
2945 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
2946 {
2947  DIR *temp_dir;
2948  struct dirent *temp_de;
2949  char rm_path[MAXPGPATH * 2];
2950 
2951  temp_dir = AllocateDir(tmpdirname);
2952 
2953  if (temp_dir == NULL && errno == ENOENT && missing_ok)
2954  return;
2955 
2956  while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
2957  {
2958  if (strcmp(temp_de->d_name, ".") == 0 ||
2959  strcmp(temp_de->d_name, "..") == 0)
2960  continue;
2961 
2962  snprintf(rm_path, sizeof(rm_path), "%s/%s",
2963  tmpdirname, temp_de->d_name);
2964 
2965  if (unlink_all ||
2966  strncmp(temp_de->d_name,
2968  strlen(PG_TEMP_FILE_PREFIX)) == 0)
2969  {
2970  struct stat statbuf;
2971 
2972  if (lstat(rm_path, &statbuf) < 0)
2973  {
2974  ereport(LOG,
2976  errmsg("could not stat file \"%s\": %m", rm_path)));
2977  continue;
2978  }
2979 
2980  if (S_ISDIR(statbuf.st_mode))
2981  {
2982  /* recursively remove contents, then directory itself */
2983  RemovePgTempFilesInDir(rm_path, false, true);
2984 
2985  if (rmdir(rm_path) < 0)
2986  ereport(LOG,
2988  errmsg("could not remove directory \"%s\": %m",
2989  rm_path)));
2990  }
2991  else
2992  {
2993  if (unlink(rm_path) < 0)
2994  ereport(LOG,
2996  errmsg("could not remove file \"%s\": %m",
2997  rm_path)));
2998  }
2999  }
3000  else
3001  ereport(LOG,
3002  (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3003  rm_path)));
3004  }
3005 
3006  FreeDir(temp_dir);
3007 }
3008 
3009 /* Process one tablespace directory, look for per-DB subdirectories */
3010 static void
3011 RemovePgTempRelationFiles(const char *tsdirname)
3012 {
3013  DIR *ts_dir;
3014  struct dirent *de;
3015  char dbspace_path[MAXPGPATH * 2];
3016 
3017  ts_dir = AllocateDir(tsdirname);
3018 
3019  while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3020  {
3021  /*
3022  * We're only interested in the per-database directories, which have
3023  * numeric names. Note that this code will also (properly) ignore "."
3024  * and "..".
3025  */
3026  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3027  continue;
3028 
3029  snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3030  tsdirname, de->d_name);
3031  RemovePgTempRelationFilesInDbspace(dbspace_path);
3032  }
3033 
3034  FreeDir(ts_dir);
3035 }
3036 
3037 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3038 static void
3039 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3040 {
3041  DIR *dbspace_dir;
3042  struct dirent *de;
3043  char rm_path[MAXPGPATH * 2];
3044 
3045  dbspace_dir = AllocateDir(dbspacedirname);
3046 
3047  while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3048  {
3049  if (!looks_like_temp_rel_name(de->d_name))
3050  continue;
3051 
3052  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3053  dbspacedirname, de->d_name);
3054 
3055  if (unlink(rm_path) < 0)
3056  ereport(LOG,
3058  errmsg("could not remove file \"%s\": %m",
3059  rm_path)));
3060  }
3061 
3062  FreeDir(dbspace_dir);
3063 }
3064 
/*
 * Does the given file name look like a temporary relation file name?
 *
 * Accepted forms are t<digits>_<digits> or t<digits>_<digits>_<forkname>,
 * either of which may be followed by .<digits> (a segment number).
 */
bool
looks_like_temp_rel_name(const char *name)
{
	int			pos;
	int			savepos;

	/* Must start with "t". */
	if (name[0] != 't')
		return false;

	/* Followed by a non-empty string of digits and then an underscore. */
	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (pos == 1 || name[pos] != '_')
		return false;

	/* Followed by another nonempty string of digits. */
	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (savepos == pos)
		return false;

	/* We might have _forkname or .segment or both. */
	if (name[pos] == '_')
	{
		int			forkchar = forkname_chars(&name[pos + 1], NULL);

		if (forkchar <= 0)
			return false;
		/* skip the underscore plus the fork name itself */
		pos += forkchar + 1;
	}
	if (name[pos] == '.')
	{
		int			segchar;

		/* segchar counts the '.' plus the digits following it */
		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
			;
		if (segchar <= 1)
			return false;
		pos += segchar;
	}

	/* Now we should be at the end. */
	if (name[pos] != '\0')
		return false;
	return true;
}
3113 
3114 
3115 /*
3116  * Issue fsync recursively on PGDATA and all its contents.
3117  *
3118  * We fsync regular files and directories wherever they are, but we
3119  * follow symlinks only for pg_wal and immediately under pg_tblspc.
3120  * Other symlinks are presumed to point at files we're not responsible
3121  * for fsyncing, and might not have privileges to write at all.
3122  *
3123  * Errors are logged but not considered fatal; that's because this is used
3124  * only during database startup, to deal with the possibility that there are
3125  * issued-but-unsynced writes pending against the data directory. We want to
3126  * ensure that such writes reach disk before anything that's done in the new
3127  * run. However, aborting on error would result in failure to start for
3128  * harmless cases such as read-only files in the data directory, and that's
3129  * not good either.
3130  *
3131  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3132  * rewriting all changes again during recovery.
3133  *
3134  * Note we assume we're chdir'd into PGDATA to begin with.
3135  */
3136 void
3138 {
3139  bool xlog_is_symlink;
3140 
3141  /* We can skip this whole thing if fsync is disabled. */
3142  if (!enableFsync)
3143  return;
3144 
3145  /*
3146  * If pg_wal is a symlink, we'll need to recurse into it separately,
3147  * because the first walkdir below will ignore it.
3148  */
3149  xlog_is_symlink = false;
3150 
3151 #ifndef WIN32
3152  {
3153  struct stat st;
3154 
3155  if (lstat("pg_wal", &st) < 0)
3156  ereport(LOG,
3158  errmsg("could not stat file \"%s\": %m",
3159  "pg_wal")));
3160  else if (S_ISLNK(st.st_mode))
3161  xlog_is_symlink = true;
3162  }
3163 #else
3164  if (pgwin32_is_junction("pg_wal"))
3165  xlog_is_symlink = true;
3166 #endif
3167 
3168  /*
3169  * If possible, hint to the kernel that we're soon going to fsync the data
3170  * directory and its contents. Errors in this step are even less
3171  * interesting than normal, so log them only at DEBUG1.
3172  */
3173 #ifdef PG_FLUSH_DATA_WORKS
3174  walkdir(".", pre_sync_fname, false, DEBUG1);
3175  if (xlog_is_symlink)
3176  walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3177  walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3178 #endif
3179 
3180  /*
3181  * Now we do the fsync()s in the same order.
3182  *
3183  * The main call ignores symlinks, so in addition to specially processing
3184  * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3185  * process_symlinks = true. Note that if there are any plain directories
3186  * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3187  * so we don't worry about optimizing it.
3188  */
3189  walkdir(".", datadir_fsync_fname, false, LOG);
3190  if (xlog_is_symlink)
3191  walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3192  walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3193 }
3194 
3195 /*
3196  * walkdir: recursively walk a directory, applying the action to each
3197  * regular file and directory (including the named directory itself).
3198  *
3199  * If process_symlinks is true, the action and recursion are also applied
3200  * to regular files and directories that are pointed to by symlinks in the
3201  * given directory; otherwise symlinks are ignored. Symlinks are always
3202  * ignored in subdirectories, ie we intentionally don't pass down the
3203  * process_symlinks flag to recursive calls.
3204  *
3205  * Errors are reported at level elevel, which might be ERROR or less.
3206  *
3207  * See also walkdir in file_utils.c, which is a frontend version of this
3208  * logic.
3209  */
3210 static void
3211 walkdir(const char *path,
3212  void (*action) (const char *fname, bool isdir, int elevel),
3213  bool process_symlinks,
3214  int elevel)
3215 {
3216  DIR *dir;
3217  struct dirent *de;
3218 
3219  dir = AllocateDir(path);
3220 
3221  while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3222  {
3223  char subpath[MAXPGPATH * 2];
3224  struct stat fst;
3225  int sret;
3226 
3228 
3229  if (strcmp(de->d_name, ".") == 0 ||
3230  strcmp(de->d_name, "..") == 0)
3231  continue;
3232 
3233  snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3234 
3235  if (process_symlinks)
3236  sret = stat(subpath, &fst);
3237  else
3238  sret = lstat(subpath, &fst);
3239 
3240  if (sret < 0)
3241  {
3242  ereport(elevel,
3244  errmsg("could not stat file \"%s\": %m", subpath)));
3245  continue;
3246  }
3247 
3248  if (S_ISREG(fst.st_mode))
3249  (*action) (subpath, false, elevel);
3250  else if (S_ISDIR(fst.st_mode))
3251  walkdir(subpath, action, false, elevel);
3252  }
3253 
3254  FreeDir(dir); /* we ignore any error here */
3255 
3256  /*
3257  * It's important to fsync the destination directory itself as individual
3258  * file fsyncs don't guarantee that the directory entry for the file is
3259  * synced. However, skip this if AllocateDir failed; the action function
3260  * might not be robust against that.
3261  */
3262  if (dir)
3263  (*action) (path, true, elevel);
3264 }
3265 
3266 
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Ignores errors trying to open unreadable files, and logs other errors at a
 * caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* An unreadable file is silently skipped */
		if (errno == EACCES)
			return;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3309 
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * Errors about unreadable files should be silently ignored here, so ask
	 * fsync_fname_ext() to do that.  Its result is deliberately discarded.
	 */
	const bool	ignore_perm = true;

	(void) fsync_fname_ext(fname, isdir, ignore_perm, elevel);
}
3319 
/*
 * unlink_if_exists_fname -- remove a file or directory, tolerating its
 * absence
 *
 * fname:	path to remove
 * isdir:	true to remove a directory with rmdir(), false to remove a file
 * elevel:	error level used when reporting an rmdir() failure
 *
 * A directory that is already gone (ENOENT) is not treated as an error.
 * Files are removed through PathNameDeleteTemporaryFile() with
 * error_on_failure = false, so elevel is not used on that path.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not rmdir directory \"%s\": %m", fname)));
		return;
	}

	/* Use PathNameDeleteTemporaryFile to report filesize */
	(void) PathNameDeleteTemporaryFile(fname, false);
}
3336 
3337 /*
3338  * fsync_fname_ext -- Try to fsync a file or directory
3339  *
3340  * If ignore_perm is true, ignore errors upon trying to open unreadable
3341  * files. Logs other errors at a caller-specified level.
3342  *
3343  * Returns 0 if the operation succeeded, -1 otherwise.
3344  */
3345 static int
3346 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3347 {
3348  int fd;
3349  int flags;
3350  int returncode;
3351 
3352  /*
3353  * Some OSs require directories to be opened read-only whereas other
3354  * systems don't allow us to fsync files opened read-only; so we need both
3355  * cases here. Using O_RDWR will cause us to fail to fsync files that are
3356  * not writable by our userid, but we assume that's OK.
3357  */
3358  flags = PG_BINARY;
3359  if (!isdir)
3360  flags |= O_RDWR;
3361  else
3362  flags |= O_RDONLY;
3363 
3364  fd = OpenTransientFile(fname, flags);
3365 
3366  /*
3367  * Some OSs don't allow us to open directories at all (Windows returns
3368  * EACCES), just ignore the error in that case. If desired also silently
3369  * ignoring errors about unreadable files. Log others.
3370  */
3371  if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3372  return 0;
3373  else if (fd < 0 && ignore_perm && errno == EACCES)
3374  return 0;
3375  else if (fd < 0)
3376  {
3377  ereport(elevel,
3379  errmsg("could not open file \"%s\": %m", fname)));
3380  return -1;
3381  }
3382 
3383  returncode = pg_fsync(fd);
3384 
3385  /*
3386  * Some OSes don't allow us to fsync directories at all, so we can ignore
3387  * those errors. Anything else needs to be logged.
3388  */
3389  if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3390  {
3391  int save_errno;
3392 
3393  /* close file upon error, might not be in transaction context */
3394  save_errno = errno;
3395  (void) CloseTransientFile(fd);
3396  errno = save_errno;
3397 
3398  ereport(elevel,
3400  errmsg("could not fsync file \"%s\": %m", fname)));
3401  return -1;
3402  }
3403 
3404  if (CloseTransientFile(fd) != 0)
3405  {
3406  ereport(elevel,
3408  errmsg("could not close file \"%s\": %m", fname)));
3409  return -1;
3410  }
3411 
3412  return 0;
3413 }
3414 
3415 /*
3416  * fsync_parent_path -- fsync the parent path of a file or directory
3417  *
3418  * This is aimed at making file operations persistent on disk in case of
3419  * an OS crash or power failure.
3420  */
3421 static int
3422 fsync_parent_path(const char *fname, int elevel)
3423 {
3424  char parentpath[MAXPGPATH];
3425 
3426  strlcpy(parentpath, fname, MAXPGPATH);
3427  get_parent_directory(parentpath);
3428 
3429  /*
3430  * get_parent_directory() returns an empty string if the input argument is
3431  * just a file name (see comments in path.c), so handle that as being the
3432  * current directory.
3433  */
3434  if (strlen(parentpath) == 0)
3435  strlcpy(parentpath, ".", MAXPGPATH);
3436 
3437  if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3438  return -1;
3439 
3440  return 0;
3441 }
3442 
3443 /*
3444  * Create a PostgreSQL data sub-directory
3445  *
3446  * The data directory itself, and most of its sub-directories, are created at
3447  * initdb time, but we do have some occasions when we create directories in
3448  * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3449  * make sure that those directories are created consistently. Today, that means
3450  * making sure that the created directory has the correct permissions, which is
3451  * what pg_dir_create_mode tracks for us.
3452  *
3453  * Note that we also set the umask() based on what we understand the correct
3454  * permissions to be (see file_perm.c).
3455  *
3456  * For permissions other than the default, mkdir() can be used directly, but
3457  * be sure to consider carefully such cases -- a sub-directory with incorrect
3458  * permissions in a PostgreSQL data directory could cause backups and other
3459  * processes to fail.
3460  */
3461 int
3462 MakePGDirectory(const char *directoryName)
3463 {
3464  return mkdir(directoryName, pg_dir_create_mode);
3465 }
3466 
3467 /*
3468  * Return the passed-in error level, or PANIC if data_sync_retry is off.
3469  *
3470  * Failure to fsync any data file is cause for immediate panic, unless
3471  * data_sync_retry is enabled. Data may have been written to the operating
3472  * system and removed from our buffer pool already, and if we are running on
3473  * an operating system that forgets dirty data on write-back failure, there
3474  * may be only one copy of the data remaining: in the WAL. A later attempt to
3475  * fsync again might falsely report success. Therefore we must not allow any
3476  * further checkpoints to be attempted. data_sync_retry can in theory be
3477  * enabled on systems known not to drop dirty buffered data on write-back
3478  * failure (with the likely outcome that checkpoints will continue to fail
3479  * until the underlying problem is fixed).
3480  *
3481  * Any code that reports a failure from fsync() or related functions should
3482  * filter the error level with this function.
3483  */
3484 int
3485 data_sync_elevel(int elevel)
3486 {
3487  return data_sync_retry ? elevel : PANIC;
3488 }
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1336
File lruLessRecently
Definition: fd.c:184
void closeAllVfds(void)
Definition: fd.c:2645
static PgChecksumMode mode
Definition: pg_checksums.c:61
File nextFree
Definition: fd.c:182
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition: fd.c:819
int pg_file_create_mode
Definition: file_perm.c:19
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1667
#define MAP_FAILED
Definition: mem.h:45
#define DEBUG1
Definition: elog.h:25
int MyProcPid
Definition: globals.c:40
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1599
#define NUM_RESERVED_FDS
Definition: fd.c:119
static AllocateDesc * allocatedDescs
Definition: fd.c:245
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1323
int pg_fdatasync(int fd)
Definition: fd.c:385
static void error(void)
Definition: sql-dyntest.c:147
union AllocateDesc::@26 desc
#define SYNC_METHOD_FSYNC_WRITETHROUGH
Definition: xlog.h:28
AllocateDescKind
Definition: fd.c:223
DIR * dir
Definition: fd.c:238
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition: fd.c:1542
static void AtProcExit_Files(int code, Datum arg)
Definition: fd.c:2797
static Size SizeVfdCache
Definition: fd.c:198
#define FD_TEMP_FILE_LIMIT
Definition: fd.c:175
void on_proc_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:305
#define DO_DB(A)
Definition: fd.c:161
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2711
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition: fd.c:3211
long random(void)
Definition: random.c:22
ResourceOwner CurrentResourceOwner
Definition: resowner.c:142
int pg_fsync_writethrough(int fd)
Definition: fd.c:362
int forkname_chars(const char *str, ForkNumber *fork)
Definition: relpath.c:78
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2549
int max_safe_fds
Definition: fd.c:146
#define Min(x, y)
Definition: c.h:904
off_t FileSize(File file)
Definition: fd.c:2035
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:582
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:2266
#define FD_DELETE_AT_CLOSE
Definition: fd.c:173
int log_temp_files
Definition: guc.c:513
mode_t FileGetRawMode(File file)
Definition: fd.c:2123
void _dosmaperr(unsigned long)
Definition: win32error.c:171
static Vfd * VfdCache
Definition: fd.c:197
static void Delete(File file)
Definition: fd.c:1016
int closedir(DIR *)
Definition: dirent.c:113
static int numTempTableSpaces
Definition: fd.c:258
#define PG_TEMP_FILES_DIR
Definition: pg_checksums.c:58
int errcode(int sqlerrcode)
Definition: elog.c:570
#define MemSet(start, val, len)
Definition: c.h:955
void PathNameDeleteTemporaryDir(const char *dirname)
Definition: fd.c:1431
int pg_fsync_no_writethrough(int fd)
Definition: fd.c:350
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition: fd.c:3039
File PathNameOpenTemporaryFile(const char *path)
Definition: fd.c:1637
void pgstat_report_tempfile(size_t filesize)
Definition: pgstat.c:1566
static bool reserveAllocatedDesc(void)
Definition: fd.c:2134
uint32 SubTransactionId
Definition: c.h:511
#define SIGPIPE
Definition: win32_port.h:168
void TempTablespacePath(char *path, Oid tablespace)
Definition: fd.c:1517
#define LOG
Definition: elog.h:26
unsigned int Oid
Definition: postgres_ext.h:31
AllocateDescKind kind
Definition: fd.c:233
char * FilePathName(File file)
Definition: fd.c:2087
Definition: dirent.h:9
#define OidIsValid(objectId)
Definition: c.h:638
#define PANIC
Definition: elog.h:53
#define PG_BINARY
Definition: c.h:1191
static char * basedir
Definition: pg_basebackup.c:86
ssize_t pg_pwrite(int fd, const void *buf, size_t nbyte, off_t offset)
Definition: pwrite.c:27
void AtEOXact_Files(bool isCommit)
Definition: fd.c:2783
Oid MyDatabaseTableSpace
Definition: globals.c:87
int ClosePipeStream(FILE *file)
Definition: fd.c:2616
ssize_t pg_pread(int fd, void *buf, size_t nbyte, off_t offset)
Definition: pread.c:27
#define malloc(a)
Definition: header.h:50
static void LruDelete(File file)
Definition: fd.c:1035
void pg_usleep(long microsec)
Definition: signal.c:53
bool TempTablespacesAreSet(void)
Definition: fd.c:2699
#define fsync(fd)
Definition: win32_port.h:63
static int FreeDesc(AllocateDesc *desc)
Definition: fd.c:2367
void pfree(void *pointer)
Definition: mcxt.c:1031
mode_t fileMode
Definition: fd.c:189
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition: fd.c:3011
static bool ReleaseLruFile(void)
Definition: fd.c:1130
Definition: dirent.c:25
#define ERROR
Definition: elog.h:43
#define PG_TEMP_FILE_PREFIX
Definition: pg_checksums.c:59
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2257
static int LruInsert(File file)
Definition: fd.c:1083
#define FATAL
Definition: elog.h:52
static bool have_xact_temporary_files
Definition: fd.c:209
#define MAXPGPATH
DIR * opendir(const char *)
Definition: dirent.c:33
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2014
#define DEBUG2
Definition: elog.h:24
#define TABLESPACE_VERSION_DIRECTORY
Definition: relpath.h:26
char * fileName
Definition: fd.c:186
static char * buf
Definition: pg_test_fsync.c:68
Oid GetNextTempTableSpace(void)
Definition: fd.c:2729
void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition: resowner.c:1244
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3321
int errdetail(const char *fmt,...)
Definition: elog.c:860
char * tablespace
Definition: pgbench.c:186
int errcode_for_file_access(void)
Definition: elog.c:593
void get_parent_directory(char *path)
Definition: path.c:854
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2207
static int nfile
Definition: fd.c:203
unsigned int uint32
Definition: c.h:358
void SyncDataDirectory(void)
Definition: fd.c:3137
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2468
static int nextTempTableSpace
Definition: fd.c:259
static void pgstat_report_wait_end(void)
Definition: pgstat.h:1342
int max_files_per_process
Definition: fd.c:133
static File AllocateVfd(void)
Definition: fd.c:1162
FILE * OpenPipeStream(const char *command, const char *mode)
Definition: fd.c:2310
unsigned short fdstate
Definition: fd.c:180
Definition: fd.c:177
off_t fileSize
Definition: fd.c:185
int fd
Definition: fd.c:179
#define ereport(elevel, rest)
Definition: elog.h:141
int link(const char *fromname, const char *toname)
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2671
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:608
static void Insert(File file)
Definition: fd.c:1061
ResourceOwner resowner
Definition: fd.c:181
bool data_sync_retry
Definition: fd.c:149
#define S_ISREG(m)
Definition: win32_port.h:308
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3311
int CloseTransientFile(int fd)
Definition: fd.c:2434
#define SIG_IGN
Definition: win32_port.h:160
static void ReportTemporaryFileUsage(const char *path, off_t size)
Definition: fd.c:1276
static void ReleaseLruFiles(void)
Definition: fd.c:1152
#define WARNING
Definition: elog.h:40
#define stat(a, b)
Definition: win32_port.h:264
#define FileIsNotOpen(file)
Definition: fd.c:170
int pg_dir_create_mode
Definition: file_perm.c:18
static int elevel
Definition: vacuumlazy.c:143
int FileWrite(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:1916
struct vfd Vfd
int data_sync_elevel(int elevel)
Definition: fd.c:3485
uintptr_t Datum
Definition: postgres.h:367
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition: fd.c:2750
unsigned int Index
Definition: c.h:475
void pg_flush_data(int fd, off_t offset, off_t nbytes)
Definition: fd.c:405
#define FileIsValid(file)
Definition: fd.c:167
FILE * file
Definition: fd.c:237
#define InvalidOid
Definition: postgres_ext.h:36
#define VFD_CLOSED
Definition: fd.c:165
static uint64 temporary_files_size
Definition: fd.c:217
int MakePGDirectory(const char *directoryName)
Definition: fd.c:3462
pqsigfunc pqsignal(int signum, pqsigfunc handler)
Definition: signal.c:170
#define free(a)
Definition: header.h:65
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static void RegisterTemporaryFile(File file)
Definition: fd.c:1295
static void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition: fd.c:2945
void FileClose(File file)
Definition: fd.c:1713
#define SIG_DFL
Definition: win32_port.h:158
int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
Definition: fd.c:1809
static int FileAccess(File file)
Definition: fd.c:1240
#define Assert(condition)
Definition: c.h:732
SubTransactionId GetCurrentSubTransactionId(void)
Definition: xact.c:708
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2534
File lruMoreRecently
Definition: fd.c:183
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:1837
void RemovePgTempFiles(void)
Definition: fd.c:2885
SubTransactionId create_subid
Definition: fd.c:234
File OpenTemporaryFile(bool interXact)
Definition: fd.c:1464
int durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:734
size_t Size
Definition: c.h:466
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: pgstat.h:1318
static const char * directory
Definition: zic.c:622
int sync_method
Definition: xlog.c:102
struct dirent * readdir(DIR *)
Definition: dirent.c:77
#define FD_MINFREE
Definition: fd.c:125
bool looks_like_temp_rel_name(const char *name)
Definition: fd.c:3067
#define realloc(a, b)
Definition: header.h:60
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1044
#define INT64_FORMAT
Definition: c.h:400
const char * name
Definition: encode.c:521
static long tempFileCounter
Definition: fd.c:251
int fd
Definition: fd.c:239
#define S_ISDIR(m)
Definition: win32_port.h:305
#define lstat(path, sb)
Definition: win32_port.h:253
int durable_unlink(const char *fname, int elevel)
Definition: fd.c:698
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:946
int FreeFile(FILE *file)
Definition: fd.c:2406
void set_max_safe_fds(void)
Definition: fd.c:903
bool enableFsync
Definition: globals.c:119
static Oid * tempTableSpaces
Definition: fd.c:257
void * palloc(Size size)
Definition: mcxt.c:924
int errmsg(const char *fmt,...)
Definition: elog.c:784
int FileGetRawFlags(File file)
Definition: fd.c:2113
void ResourceOwnerEnlargeFiles(ResourceOwner owner)
Definition: resowner.c:1233
static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition: fd.c:3346
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:968
#define elog(elevel,...)
Definition: elog.h:226
int i
#define FD_CLOSE_AT_EOXACT
Definition: fd.c:174
void * arg
int FileGetRawDesc(File file)
Definition: fd.c:2103
static void FreeVfd(File file)
Definition: fd.c:1220
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:99
int pg_fsync(int fd)
Definition: fd.c:333
char d_name[MAX_PATH]
Definition: dirent.h:14
#define mkdir(a, b)
Definition: win32_port.h:58
#define close(a)
Definition: win32.h:12
#define EINTR
Definition: win32_port.h:332
int fileFlags
Definition: fd.c:188
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition: fd.c:1400
int FileRead(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:1860
void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition: resowner.c:1253
#define snprintf
Definition: port.h:192
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2052
static int maxAllocatedDescs
Definition: fd.c:244
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition: fd.c:2815
static int fsync_parent_path(const char *fname, int elevel)
Definition: fd.c:3422
int File
Definition: fd.h:45
int FreeDir(DIR *dir)
Definition: fd.c:2586
int temp_file_limit
Definition: guc.c:517
Datum subpath(PG_FUNCTION_ARGS)
Definition: ltree_op.c:241
void InitFileAccess(void)
Definition: fd.c:786
static int numAllocatedDescs
Definition: fd.c:243
bool pgwin32_is_junction(const char *path)
#define ftruncate(a, b)
Definition: win32_port.h:60