PostgreSQL Source Code  git master
fd.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  * Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have. (This is around 1024 on many modern
20  * operating systems, but may be lower on others.)
21  *
22  * VFDs are managed as an LRU pool, with actual OS file descriptors
23  * being opened and closed as needed. Obviously, if a routine is
24  * opened using these interfaces, all subsequent operations must also
25  * be through these interfaces (the File type is not a real file
26  * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
39  * for a long time, like relation files. It is the caller's responsibility
40  * to close them; there is no automatic mechanism in fd.c for that.
41  *
42  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43  * temporary files that have names so that they can be shared between
44  * backends. Such files are automatically closed and count against the
45  * temporary file limit of the backend that creates them, but unlike anonymous
46  * files they are not automatically deleted. See sharedfileset.c for a shared
47  * ownership mechanism that provides automatic cleanup for shared files when
48  * the last of a group of backends detaches.
49  *
50  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52  * They behave like the corresponding native functions, except that the handle
53  * is registered with the current subtransaction, and will be automatically
54  * closed at abort. These are intended mainly for short operations like
55  * reading a configuration file; there is a limit on the number of files that
56  * can be opened using these functions at any one time.
57  *
58  * Finally, BasicOpenFile is just a thin wrapper around open() that can
59  * release file descriptors in use by the virtual file descriptors if
60  * necessary. There is no automatic cleanup of file descriptors returned by
61  * BasicOpenFile; it is solely the caller's responsibility to close the file
62  * descriptor by calling close(2).
63  *
64  *-------------------------------------------------------------------------
65  */
66 
67 #include "postgres.h"
68 
69 #include <sys/file.h>
70 #include <sys/param.h>
71 #include <sys/stat.h>
72 #ifndef WIN32
73 #include <sys/mman.h>
74 #endif
75 #include <limits.h>
76 #include <unistd.h>
77 #include <fcntl.h>
78 #ifdef HAVE_SYS_RESOURCE_H
79 #include <sys/resource.h> /* for getrlimit */
80 #endif
81 
82 #include "miscadmin.h"
83 #include "access/xact.h"
84 #include "access/xlog.h"
85 #include "catalog/pg_tablespace.h"
86 #include "common/file_perm.h"
87 #include "pgstat.h"
88 #include "portability/mem.h"
89 #include "storage/fd.h"
90 #include "storage/ipc.h"
91 #include "utils/guc.h"
92 #include "utils/resowner_private.h"
93 
94 
95 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
96 #if defined(HAVE_SYNC_FILE_RANGE)
97 #define PG_FLUSH_DATA_WORKS 1
98 #elif !defined(WIN32) && defined(MS_ASYNC)
99 #define PG_FLUSH_DATA_WORKS 1
100 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
101 #define PG_FLUSH_DATA_WORKS 1
102 #endif
103 
104 /*
105  * We must leave some file descriptors free for system(), the dynamic loader,
106  * and other code that tries to open files without consulting fd.c. This
107  * is the number left free. (While we can be pretty sure we won't get
108  * EMFILE, there's never any guarantee that we won't get ENFILE due to
109  * other processes chewing up FDs. So it's a bad idea to try to open files
110  * without consulting fd.c. Nonetheless we cannot control all code.)
111  *
112  * Because this is just a fixed setting, we are effectively assuming that
113  * no such code will leave FDs open over the long term; otherwise the slop
114  * is likely to be insufficient. Note in particular that we expect that
115  * loading a shared library does not result in any permanent increase in
116  * the number of open files. (This appears to be true on most if not
117  * all platforms as of Feb 2004.)
118  */
119 #define NUM_RESERVED_FDS 10
120 
121 /*
122  * If we have fewer than this many usable FDs after allowing for the reserved
123  * ones, choke.
124  */
125 #define FD_MINFREE 10
126 
127 /*
128  * A number of platforms allow individual processes to open many more files
129  * than they can really support when *many* processes do the same thing.
130  * This GUC parameter lets the DBA limit max_safe_fds to something less than
131  * what the postmaster's initial probe suggests will work.
132  */
134 
135 /*
136  * Maximum number of file descriptors to open for either VFD entries or
137  * AllocateFile/AllocateDir/OpenTransientFile operations. This is initialized
138  * to a conservative value, and remains that way indefinitely in bootstrap or
139  * standalone-backend cases. In normal postmaster operation, the postmaster
140  * calls set_max_safe_fds() late in initialization to update the value, and
141  * that value is then inherited by forked subprocesses.
142  *
143  * Note: the value of max_files_per_process is taken into account while
144  * setting this variable, and so need not be tested separately.
145  */
146 int max_safe_fds = 32; /* default if not changed */
147 
148 /* Whether it is safe to continue running after fsync() fails. */
149 bool data_sync_retry = false;
150 
151 /* Debugging.... */
152 
153 #ifdef FDDEBUG
154 #define DO_DB(A) \
155  do { \
156  int _do_db_save_errno = errno; \
157  A; \
158  errno = _do_db_save_errno; \
159  } while (0)
160 #else
161 #define DO_DB(A) \
162  ((void) 0)
163 #endif
164 
165 #define VFD_CLOSED (-1)
166 
167 #define FileIsValid(file) \
168  ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
169 
170 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
171 
172 /* these are the assigned bits in fdstate below: */
173 #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
174 #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
175 #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
176 
177 typedef struct vfd
178 {
179  int fd; /* current FD, or VFD_CLOSED if none */
180  unsigned short fdstate; /* bitflags for VFD's state */
181  ResourceOwner resowner; /* owner, for automatic cleanup */
182  File nextFree; /* link to next free VFD, if in freelist */
183  File lruMoreRecently; /* doubly linked recency-of-use list */
185  off_t fileSize; /* current size of file (0 if not temporary) */
186  char *fileName; /* name of file, or NULL for unused VFD */
187  /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
188  int fileFlags; /* open(2) flags for (re)opening the file */
189  mode_t fileMode; /* mode to pass to open(2) */
190 } Vfd;
191 
192 /*
193  * Virtual File Descriptor array pointer and size. This grows as
194  * needed. 'File' values are indexes into this array.
195  * Note that VfdCache[0] is not a usable VFD, just a list header.
196  */
197 static Vfd *VfdCache;
198 static Size SizeVfdCache = 0;
199 
200 /*
201  * Number of file descriptors known to be in use by VFD entries.
202  */
203 static int nfile = 0;
204 
205 /*
206  * Flag to tell whether it's worth scanning VfdCache looking for temp files
207  * to close
208  */
209 static bool have_xact_temporary_files = false;
210 
211 /*
212  * Tracks the total size of all temporary files. Note: when temp_file_limit
213  * is being enforced, this cannot overflow since the limit cannot be more
214  * than INT_MAX kilobytes. When not enforcing, it could theoretically
215  * overflow, but we don't care.
216  */
217 static uint64 temporary_files_size = 0;
218 
219 /*
220  * List of OS handles opened with AllocateFile, AllocateDir and
221  * OpenTransientFile.
222  */
223 typedef enum
224 {
230 
231 typedef struct
232 {
235  union
236  {
237  FILE *file;
239  int fd;
240  } desc;
241 } AllocateDesc;
242 
243 static int numAllocatedDescs = 0;
244 static int maxAllocatedDescs = 0;
246 
247 /*
248  * Number of temporary files opened during the current session;
249  * this is used in generation of tempfile names.
250  */
251 static long tempFileCounter = 0;
252 
253 /*
254  * Array of OIDs of temp tablespaces. When numTempTableSpaces is -1,
255  * this has not been set in the current transaction.
256  */
257 static Oid *tempTableSpaces = NULL;
258 static int numTempTableSpaces = -1;
259 static int nextTempTableSpace = 0;
260 
261 
262 /*--------------------
263  *
264  * Private Routines
265  *
266  * Delete - delete a file from the Lru ring
267  * LruDelete - remove a file from the Lru ring and close its FD
268  * Insert - put a file at the front of the Lru ring
269  * LruInsert - put a file at the front of the Lru ring and open it
270  * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
271  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
272  * AllocateVfd - grab a free (or new) file record (from VfdCache)
273  * FreeVfd - free a file record
274  *
275  * The Least Recently Used ring is a doubly linked list that begins and
276  * ends on element zero. Element zero is special -- it doesn't represent
277  * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
278  * anchor that shows us the beginning/end of the ring.
279  * Only VFD elements that are currently really open (have an FD assigned) are
280  * in the Lru ring. Elements that are "virtually" open can be recognized
281  * by having a non-null fileName field.
282  *
283  * example:
284  *
285  * /--less----\ /---------\
286  * v \ v \
287  * #0 --more---> LeastRecentlyUsed --more-\ \
288  * ^\ | |
289  * \\less--> MostRecentlyUsedFile <---/ |
290  * \more---/ \--less--/
291  *
292  *--------------------
293  */
294 static void Delete(File file);
295 static void LruDelete(File file);
296 static void Insert(File file);
297 static int LruInsert(File file);
298 static bool ReleaseLruFile(void);
299 static void ReleaseLruFiles(void);
300 static File AllocateVfd(void);
301 static void FreeVfd(File file);
302 
303 static int FileAccess(File file);
304 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
305 static bool reserveAllocatedDesc(void);
306 static int FreeDesc(AllocateDesc *desc);
307 
308 static void AtProcExit_Files(int code, Datum arg);
309 static void CleanupTempFiles(bool isCommit, bool isProcExit);
310 static void RemovePgTempRelationFiles(const char *tsdirname);
311 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
312 
313 static void walkdir(const char *path,
314  void (*action) (const char *fname, bool isdir, int elevel),
315  bool process_symlinks,
316  int elevel);
317 #ifdef PG_FLUSH_DATA_WORKS
318 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
319 #endif
320 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
321 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
322 
323 static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
324 static int fsync_parent_path(const char *fname, int elevel);
325 
326 
327 /*
328  * pg_fsync --- do fsync with or without writethrough
329  */
330 int
332 {
333  /* #if is to skip the sync_method test if there's no need for it */
334 #if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
336  return pg_fsync_writethrough(fd);
337  else
338 #endif
339  return pg_fsync_no_writethrough(fd);
340 }
341 
342 
343 /*
344  * pg_fsync_no_writethrough --- same as fsync except does nothing if
345  * enableFsync is off
346  */
347 int
349 {
350  if (enableFsync)
351  return fsync(fd);
352  else
353  return 0;
354 }
355 
356 /*
357  * pg_fsync_writethrough
358  */
359 int
361 {
362  if (enableFsync)
363  {
364 #ifdef WIN32
365  return _commit(fd);
366 #elif defined(F_FULLFSYNC)
367  return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
368 #else
369  errno = ENOSYS;
370  return -1;
371 #endif
372  }
373  else
374  return 0;
375 }
376 
377 /*
378  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
379  *
380  * Not all platforms have fdatasync; treat as fsync if not available.
381  */
382 int
384 {
385  if (enableFsync)
386  {
387 #ifdef HAVE_FDATASYNC
388  return fdatasync(fd);
389 #else
390  return fsync(fd);
391 #endif
392  }
393  else
394  return 0;
395 }
396 
397 /*
398  * pg_flush_data --- advise OS that the described dirty data should be flushed
399  *
400  * offset of 0 with nbytes 0 means that the entire file should be flushed
401  */
402 void
403 pg_flush_data(int fd, off_t offset, off_t nbytes)
404 {
405  /*
406  * Right now file flushing is primarily used to make later
407  * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
408  * if fsyncs are disabled - that's a decision we might want to make
409  * configurable at some point.
410  */
411  if (!enableFsync)
412  return;
413 
414  /*
415  * We compile all alternatives that are supported on the current platform,
416  * to find portability problems more easily.
417  */
418 #if defined(HAVE_SYNC_FILE_RANGE)
419  {
420  int rc;
421  static bool not_implemented_by_kernel = false;
422 
423  if (not_implemented_by_kernel)
424  return;
425 
426  /*
427  * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
428  * tells the OS that writeback for the specified blocks should be
429  * started, but that we don't want to wait for completion. Note that
430  * this call might block if too much dirty data exists in the range.
431  * This is the preferable method on OSs supporting it, as it works
432  * reliably when available (contrast to msync()) and doesn't flush out
433  * clean data (like FADV_DONTNEED).
434  */
435  rc = sync_file_range(fd, offset, nbytes,
436  SYNC_FILE_RANGE_WRITE);
437  if (rc != 0)
438  {
439  int elevel;
440 
441  /*
442  * For systems that don't have an implementation of
443  * sync_file_range() such as Windows WSL, generate only one
444  * warning and then suppress all further attempts by this process.
445  */
446  if (errno == ENOSYS)
447  {
448  elevel = WARNING;
449  not_implemented_by_kernel = true;
450  }
451  else
452  elevel = data_sync_elevel(WARNING);
453 
454  ereport(elevel,
456  errmsg("could not flush dirty data: %m")));
457  }
458 
459  return;
460  }
461 #endif
462 #if !defined(WIN32) && defined(MS_ASYNC)
463  {
464  void *p;
465  static int pagesize = 0;
466 
467  /*
468  * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
469  * writeback. On linux it only does so if MS_SYNC is specified, but
470  * then it does the writeback synchronously. Luckily all common linux
471  * systems have sync_file_range(). This is preferable over
472  * FADV_DONTNEED because it doesn't flush out clean data.
473  *
474  * We map the file (mmap()), tell the kernel to sync back the contents
475  * (msync()), and then remove the mapping again (munmap()).
476  */
477 
478  /* mmap() needs actual length if we want to map whole file */
479  if (offset == 0 && nbytes == 0)
480  {
481  nbytes = lseek(fd, 0, SEEK_END);
482  if (nbytes < 0)
483  {
486  errmsg("could not determine dirty data size: %m")));
487  return;
488  }
489  }
490 
491  /*
492  * Some platforms reject partial-page mmap() attempts. To deal with
493  * that, just truncate the request to a page boundary. If any extra
494  * bytes don't get flushed, well, it's only a hint anyway.
495  */
496 
497  /* fetch pagesize only once */
498  if (pagesize == 0)
499  pagesize = sysconf(_SC_PAGESIZE);
500 
501  /* align length to pagesize, dropping any fractional page */
502  if (pagesize > 0)
503  nbytes = (nbytes / pagesize) * pagesize;
504 
505  /* fractional-page request is a no-op */
506  if (nbytes <= 0)
507  return;
508 
509  /*
510  * mmap could well fail, particularly on 32-bit platforms where there
511  * may simply not be enough address space. If so, silently fall
512  * through to the next implementation.
513  */
514  if (nbytes <= (off_t) SSIZE_MAX)
515  p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
516  else
517  p = MAP_FAILED;
518 
519  if (p != MAP_FAILED)
520  {
521  int rc;
522 
523  rc = msync(p, (size_t) nbytes, MS_ASYNC);
524  if (rc != 0)
525  {
528  errmsg("could not flush dirty data: %m")));
529  /* NB: need to fall through to munmap()! */
530  }
531 
532  rc = munmap(p, (size_t) nbytes);
533  if (rc != 0)
534  {
535  /* FATAL error because mapping would remain */
536  ereport(FATAL,
538  errmsg("could not munmap() while flushing data: %m")));
539  }
540 
541  return;
542  }
543  }
544 #endif
545 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
546  {
547  int rc;
548 
549  /*
550  * Signal the kernel that the passed in range should not be cached
551  * anymore. This has the desired side effect of writing out dirty
552  * data, and the undesired side effect of likely discarding useful
553  * clean cached blocks. For the latter reason this is the least
554  * preferable method.
555  */
556 
557  rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
558 
559  if (rc != 0)
560  {
561  /* don't error out, this is just a performance optimization */
564  errmsg("could not flush dirty data: %m")));
565  }
566 
567  return;
568  }
569 #endif
570 }
571 
572 
573 /*
574  * fsync_fname -- fsync a file or directory, handling errors properly
575  *
576  * Try to fsync a file or directory. When doing the latter, ignore errors that
577  * indicate the OS just doesn't allow/require fsyncing directories.
578  */
579 void
580 fsync_fname(const char *fname, bool isdir)
581 {
582  fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
583 }
584 
585 /*
586  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
587  *
588  * This routine ensures that, after returning, the effect of renaming file
589  * persists in case of a crash. A crash while this routine is running will
590  * leave you with either the pre-existing or the moved file in place of the
591  * new file; no mixed state or truncated files are possible.
592  *
593  * It does so by using fsync on the old filename and the possibly existing
594  * target filename before the rename, and the target file and directory after.
595  *
596  * Note that rename() cannot be used across arbitrary directories, as they
597  * might not be on the same filesystem. Therefore this routine does not
598  * support renaming across directories.
599  *
600  * Log errors with the caller specified severity.
601  *
602  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
603  * valid upon return.
604  */
605 int
606 durable_rename(const char *oldfile, const char *newfile, int elevel)
607 {
608  int fd;
609 
610  /*
611  * First fsync the old and target path (if it exists), to ensure that they
612  * are properly persistent on disk. Syncing the target file is not
613  * strictly necessary, but it makes it easier to reason about crashes;
614  * because it's then guaranteed that either source or target file exists
615  * after a crash.
616  */
617  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
618  return -1;
619 
620  fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
621  if (fd < 0)
622  {
623  if (errno != ENOENT)
624  {
625  ereport(elevel,
627  errmsg("could not open file \"%s\": %m", newfile)));
628  return -1;
629  }
630  }
631  else
632  {
633  if (pg_fsync(fd) != 0)
634  {
635  int save_errno;
636 
637  /* close file upon error, might not be in transaction context */
638  save_errno = errno;
639  CloseTransientFile(fd);
640  errno = save_errno;
641 
642  ereport(elevel,
644  errmsg("could not fsync file \"%s\": %m", newfile)));
645  return -1;
646  }
647 
648  if (CloseTransientFile(fd) != 0)
649  {
650  ereport(elevel,
652  errmsg("could not close file \"%s\": %m", newfile)));
653  return -1;
654  }
655  }
656 
657  /* Time to do the real deal... */
658  if (rename(oldfile, newfile) < 0)
659  {
660  ereport(elevel,
662  errmsg("could not rename file \"%s\" to \"%s\": %m",
663  oldfile, newfile)));
664  return -1;
665  }
666 
667  /*
668  * To guarantee renaming the file is persistent, fsync the file with its
669  * new name, and its containing directory.
670  */
671  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
672  return -1;
673 
674  if (fsync_parent_path(newfile, elevel) != 0)
675  return -1;
676 
677  return 0;
678 }
679 
680 /*
681  * durable_unlink -- remove a file in a durable manner
682  *
683  * This routine ensures that, after returning, the effect of removing the file
684  * persists in case of a crash. A crash while this routine is running will
685  * not leave the system in a mixed state.
686  *
687  * It does so by using fsync on the parent directory of the file after the
688  * actual removal is done.
689  *
690  * Log errors with the severity specified by caller.
691  *
692  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
693  * valid upon return.
694  */
695 int
696 durable_unlink(const char *fname, int elevel)
697 {
698  if (unlink(fname) < 0)
699  {
700  ereport(elevel,
702  errmsg("could not remove file \"%s\": %m",
703  fname)));
704  return -1;
705  }
706 
707  /*
708  * To guarantee that the removal of the file is persistent, fsync its
709  * parent directory.
710  */
711  if (fsync_parent_path(fname, elevel) != 0)
712  return -1;
713 
714  return 0;
715 }
716 
717 /*
718  * durable_link_or_rename -- rename a file in a durable manner.
719  *
720  * Similar to durable_rename(), except that this routine tries (but does not
721  * guarantee) not to overwrite the target file.
722  *
723  * Note that a crash in an unfortunate moment can leave you with two links to
724  * the target file.
725  *
726  * Log errors with the caller specified severity.
727  *
728  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
729  * valid upon return.
730  */
731 int
732 durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
733 {
734  /*
735  * Ensure that, if we crash directly after the rename/link, a file with
736  * valid contents is moved into place.
737  */
738  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
739  return -1;
740 
741 #ifdef HAVE_WORKING_LINK
742  if (link(oldfile, newfile) < 0)
743  {
744  ereport(elevel,
746  errmsg("could not link file \"%s\" to \"%s\": %m",
747  oldfile, newfile)));
748  return -1;
749  }
750  unlink(oldfile);
751 #else
752  /* XXX: Add racy file existence check? */
753  if (rename(oldfile, newfile) < 0)
754  {
755  ereport(elevel,
757  errmsg("could not rename file \"%s\" to \"%s\": %m",
758  oldfile, newfile)));
759  return -1;
760  }
761 #endif
762 
763  /*
764  * Make change persistent in case of an OS crash, both the new entry and
765  * its parent directory need to be flushed.
766  */
767  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
768  return -1;
769 
770  /* Same for parent directory */
771  if (fsync_parent_path(newfile, elevel) != 0)
772  return -1;
773 
774  return 0;
775 }
776 
777 /*
778  * InitFileAccess --- initialize this module during backend startup
779  *
780  * This is called during either normal or standalone backend start.
781  * It is *not* called in the postmaster.
782  */
783 void
785 {
786  Assert(SizeVfdCache == 0); /* call me only once */
787 
788  /* initialize cache header entry */
789  VfdCache = (Vfd *) malloc(sizeof(Vfd));
790  if (VfdCache == NULL)
791  ereport(FATAL,
792  (errcode(ERRCODE_OUT_OF_MEMORY),
793  errmsg("out of memory")));
794 
795  MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
796  VfdCache->fd = VFD_CLOSED;
797 
798  SizeVfdCache = 1;
799 
800  /* register proc-exit hook to ensure temp files are dropped at exit */
802 }
803 
804 /*
805  * count_usable_fds --- count how many FDs the system will let us open,
806  * and estimate how many are already open.
807  *
808  * We stop counting if usable_fds reaches max_to_probe. Note: a small
809  * value of max_to_probe might result in an underestimate of already_open;
810  * we must fill in any "gaps" in the set of used FDs before the calculation
811  * of already_open will give the right answer. In practice, max_to_probe
812  * of a couple of dozen should be enough to ensure good results.
813  *
814  * We assume stdin (FD 0) is available for dup'ing
815  */
816 static void
817 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
818 {
819  int *fd;
820  int size;
821  int used = 0;
822  int highestfd = 0;
823  int j;
824 
825 #ifdef HAVE_GETRLIMIT
826  struct rlimit rlim;
827  int getrlimit_status;
828 #endif
829 
830  size = 1024;
831  fd = (int *) palloc(size * sizeof(int));
832 
833 #ifdef HAVE_GETRLIMIT
834 #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
835  getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
836 #else /* but BSD doesn't ... */
837  getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
838 #endif /* RLIMIT_NOFILE */
839  if (getrlimit_status != 0)
840  ereport(WARNING, (errmsg("getrlimit failed: %m")));
841 #endif /* HAVE_GETRLIMIT */
842 
843  /* dup until failure or probe limit reached */
844  for (;;)
845  {
846  int thisfd;
847 
848 #ifdef HAVE_GETRLIMIT
849 
850  /*
851  * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
852  * some platforms
853  */
854  if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
855  break;
856 #endif
857 
858  thisfd = dup(0);
859  if (thisfd < 0)
860  {
861  /* Expect EMFILE or ENFILE, else it's fishy */
862  if (errno != EMFILE && errno != ENFILE)
863  elog(WARNING, "dup(0) failed after %d successes: %m", used);
864  break;
865  }
866 
867  if (used >= size)
868  {
869  size *= 2;
870  fd = (int *) repalloc(fd, size * sizeof(int));
871  }
872  fd[used++] = thisfd;
873 
874  if (highestfd < thisfd)
875  highestfd = thisfd;
876 
877  if (used >= max_to_probe)
878  break;
879  }
880 
881  /* release the files we opened */
882  for (j = 0; j < used; j++)
883  close(fd[j]);
884 
885  pfree(fd);
886 
887  /*
888  * Return results. usable_fds is just the number of successful dups. We
889  * assume that the system limit is highestfd+1 (remember 0 is a legal FD
890  * number) and so already_open is highestfd+1 - usable_fds.
891  */
892  *usable_fds = used;
893  *already_open = highestfd + 1 - used;
894 }
895 
896 /*
897  * set_max_safe_fds
898  * Determine number of file descriptors that fd.c is allowed to use
899  */
900 void
902 {
903  int usable_fds;
904  int already_open;
905 
906  /*----------
907  * We want to set max_safe_fds to
908  * MIN(usable_fds, max_files_per_process - already_open)
909  * less the slop factor for files that are opened without consulting
910  * fd.c. This ensures that we won't exceed either max_files_per_process
911  * or the experimentally-determined EMFILE limit.
912  *----------
913  */
915  &usable_fds, &already_open);
916 
917  max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
918 
919  /*
920  * Take off the FDs reserved for system() etc.
921  */
923 
924  /*
925  * Make sure we still have enough to get by.
926  */
927  if (max_safe_fds < FD_MINFREE)
928  ereport(FATAL,
929  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
930  errmsg("insufficient file descriptors available to start server process"),
931  errdetail("System allows %d, we need at least %d.",
934 
935  elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
936  max_safe_fds, usable_fds, already_open);
937 }
938 
939 /*
940  * Open a file with BasicOpenFilePerm() and pass default file mode for the
941  * fileMode parameter.
942  */
943 int
945 {
946  return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
947 }
948 
949 /*
950  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
951  *
952  * This is exported for use by places that really want a plain kernel FD,
953  * but need to be proof against running out of FDs. Once an FD has been
954  * successfully returned, it is the caller's responsibility to ensure that
955  * it will not be leaked on ereport()! Most users should *not* call this
956  * routine directly, but instead use the VFD abstraction level, which
957  * provides protection against descriptor leaks as well as management of
958  * files that need to be open for more than a short period of time.
959  *
960  * Ideally this should be the *only* direct call of open() in the backend.
961  * In practice, the postmaster calls open() directly, and there are some
962  * direct open() calls done early in backend startup. Those are OK since
963  * this module wouldn't have any open files to close at that point anyway.
964  */
965 int
966 BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
967 {
968  int fd;
969 
970 tryAgain:
971  fd = open(fileName, fileFlags, fileMode);
972 
973  if (fd >= 0)
974  return fd; /* success! */
975 
976  if (errno == EMFILE || errno == ENFILE)
977  {
978  int save_errno = errno;
979 
980  ereport(LOG,
981  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
982  errmsg("out of file descriptors: %m; release and retry")));
983  errno = 0;
984  if (ReleaseLruFile())
985  goto tryAgain;
986  errno = save_errno;
987  }
988 
989  return -1; /* failure */
990 }
991 
992 #if defined(FDDEBUG)
993 
994 static void
995 _dump_lru(void)
996 {
997  int mru = VfdCache[0].lruLessRecently;
998  Vfd *vfdP = &VfdCache[mru];
999  char buf[2048];
1000 
1001  snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
1002  while (mru != 0)
1003  {
1004  mru = vfdP->lruLessRecently;
1005  vfdP = &VfdCache[mru];
1006  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
1007  }
1008  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
1009  elog(LOG, "%s", buf);
1010 }
1011 #endif /* FDDEBUG */
1012 
1013 static void
1015 {
1016  Vfd *vfdP;
1017 
1018  Assert(file != 0);
1019 
1020  DO_DB(elog(LOG, "Delete %d (%s)",
1021  file, VfdCache[file].fileName));
1022  DO_DB(_dump_lru());
1023 
1024  vfdP = &VfdCache[file];
1025 
1026  VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1027  VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1028 
1029  DO_DB(_dump_lru());
1030 }
1031 
1032 static void
1034 {
1035  Vfd *vfdP;
1036 
1037  Assert(file != 0);
1038 
1039  DO_DB(elog(LOG, "LruDelete %d (%s)",
1040  file, VfdCache[file].fileName));
1041 
1042  vfdP = &VfdCache[file];
1043 
1044  /*
1045  * Close the file. We aren't expecting this to fail; if it does, better
1046  * to leak the FD than to mess up our internal state.
1047  */
1048  if (close(vfdP->fd) != 0)
1050  "could not close file \"%s\": %m", vfdP->fileName);
1051  vfdP->fd = VFD_CLOSED;
1052  --nfile;
1053 
1054  /* delete the vfd record from the LRU ring */
1055  Delete(file);
1056 }
1057 
1058 static void
1060 {
1061  Vfd *vfdP;
1062 
1063  Assert(file != 0);
1064 
1065  DO_DB(elog(LOG, "Insert %d (%s)",
1066  file, VfdCache[file].fileName));
1067  DO_DB(_dump_lru());
1068 
1069  vfdP = &VfdCache[file];
1070 
1071  vfdP->lruMoreRecently = 0;
1072  vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1073  VfdCache[0].lruLessRecently = file;
1074  VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1075 
1076  DO_DB(_dump_lru());
1077 }
1078 
1079 /* returns 0 on success, -1 on re-open failure (with errno set) */
1080 static int
1082 {
1083  Vfd *vfdP;
1084 
1085  Assert(file != 0);
1086 
1087  DO_DB(elog(LOG, "LruInsert %d (%s)",
1088  file, VfdCache[file].fileName));
1089 
1090  vfdP = &VfdCache[file];
1091 
1092  if (FileIsNotOpen(file))
1093  {
1094  /* Close excess kernel FDs. */
1095  ReleaseLruFiles();
1096 
1097  /*
1098  * The open could still fail for lack of file descriptors, eg due to
1099  * overall system file table being full. So, be prepared to release
1100  * another FD if necessary...
1101  */
1102  vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1103  vfdP->fileMode);
1104  if (vfdP->fd < 0)
1105  {
1106  DO_DB(elog(LOG, "re-open failed: %m"));
1107  return -1;
1108  }
1109  else
1110  {
1111  ++nfile;
1112  }
1113  }
1114 
1115  /*
1116  * put it at the head of the Lru ring
1117  */
1118 
1119  Insert(file);
1120 
1121  return 0;
1122 }
1123 
1124 /*
1125  * Release one kernel FD by closing the least-recently-used VFD.
1126  */
1127 static bool
1129 {
1130  DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1131 
1132  if (nfile > 0)
1133  {
1134  /*
1135  * There are opened files and so there should be at least one used vfd
1136  * in the ring.
1137  */
1138  Assert(VfdCache[0].lruMoreRecently != 0);
1139  LruDelete(VfdCache[0].lruMoreRecently);
1140  return true; /* freed a file */
1141  }
1142  return false; /* no files available to free */
1143 }
1144 
1145 /*
1146  * Release kernel FDs as needed to get under the max_safe_fds limit.
1147  * After calling this, it's OK to try to open another file.
1148  */
1149 static void
1151 {
1152  while (nfile + numAllocatedDescs >= max_safe_fds)
1153  {
1154  if (!ReleaseLruFile())
1155  break;
1156  }
1157 }
1158 
1159 static File
1161 {
1162  Index i;
1163  File file;
1164 
1165  DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1166 
1167  Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1168 
1169  if (VfdCache[0].nextFree == 0)
1170  {
1171  /*
1172  * The free list is empty so it is time to increase the size of the
1173  * array. We choose to double it each time this happens. However,
1174  * there's not much point in starting *real* small.
1175  */
1176  Size newCacheSize = SizeVfdCache * 2;
1177  Vfd *newVfdCache;
1178 
1179  if (newCacheSize < 32)
1180  newCacheSize = 32;
1181 
1182  /*
1183  * Be careful not to clobber VfdCache ptr if realloc fails.
1184  */
1185  newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1186  if (newVfdCache == NULL)
1187  ereport(ERROR,
1188  (errcode(ERRCODE_OUT_OF_MEMORY),
1189  errmsg("out of memory")));
1190  VfdCache = newVfdCache;
1191 
1192  /*
1193  * Initialize the new entries and link them into the free list.
1194  */
1195  for (i = SizeVfdCache; i < newCacheSize; i++)
1196  {
1197  MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1198  VfdCache[i].nextFree = i + 1;
1199  VfdCache[i].fd = VFD_CLOSED;
1200  }
1201  VfdCache[newCacheSize - 1].nextFree = 0;
1202  VfdCache[0].nextFree = SizeVfdCache;
1203 
1204  /*
1205  * Record the new size
1206  */
1207  SizeVfdCache = newCacheSize;
1208  }
1209 
1210  file = VfdCache[0].nextFree;
1211 
1212  VfdCache[0].nextFree = VfdCache[file].nextFree;
1213 
1214  return file;
1215 }
1216 
1217 static void
1219 {
1220  Vfd *vfdP = &VfdCache[file];
1221 
1222  DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1223  file, vfdP->fileName ? vfdP->fileName : ""));
1224 
1225  if (vfdP->fileName != NULL)
1226  {
1227  free(vfdP->fileName);
1228  vfdP->fileName = NULL;
1229  }
1230  vfdP->fdstate = 0x0;
1231 
1232  vfdP->nextFree = VfdCache[0].nextFree;
1233  VfdCache[0].nextFree = file;
1234 }
1235 
1236 /* returns 0 on success, -1 on re-open failure (with errno set) */
1237 static int
1239 {
1240  int returnValue;
1241 
1242  DO_DB(elog(LOG, "FileAccess %d (%s)",
1243  file, VfdCache[file].fileName));
1244 
1245  /*
1246  * Is the file open? If not, open it and put it at the head of the LRU
1247  * ring (possibly closing the least recently used file to get an FD).
1248  */
1249 
1250  if (FileIsNotOpen(file))
1251  {
1252  returnValue = LruInsert(file);
1253  if (returnValue != 0)
1254  return returnValue;
1255  }
1256  else if (VfdCache[0].lruLessRecently != file)
1257  {
1258  /*
1259  * We now know that the file is open and that it is not the last one
1260  * accessed, so we need to move it to the head of the Lru ring.
1261  */
1262 
1263  Delete(file);
1264  Insert(file);
1265  }
1266 
1267  return 0;
1268 }
1269 
1270 /*
1271  * Called whenever a temporary file is deleted to report its size.
1272  */
1273 static void
1274 ReportTemporaryFileUsage(const char *path, off_t size)
1275 {
1276  pgstat_report_tempfile(size);
1277 
1278  if (log_temp_files >= 0)
1279  {
1280  if ((size / 1024) >= log_temp_files)
1281  ereport(LOG,
1282  (errmsg("temporary file: path \"%s\", size %lu",
1283  path, (unsigned long) size)));
1284  }
1285 }
1286 
1287 /*
1288  * Called to register a temporary file for automatic close.
1289  * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1290  * before the file was opened.
1291  */
1292 static void
1294 {
1296  VfdCache[file].resowner = CurrentResourceOwner;
1297 
1298  /* Backup mechanism for closing at end of xact. */
1299  VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1301 }
1302 
1303 /*
1304  * Called when we get a shared invalidation message on some relation.
1305  */
1306 #ifdef NOT_USED
1307 void
1308 FileInvalidate(File file)
1309 {
1310  Assert(FileIsValid(file));
1311  if (!FileIsNotOpen(file))
1312  LruDelete(file);
1313 }
1314 #endif
1315 
1316 /*
1317  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1318  * fileMode parameter.
1319  */
1320 File
1322 {
1323  return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1324 }
1325 
1326 /*
1327  * open a file in an arbitrary directory
1328  *
1329  * NB: if the passed pathname is relative (which it usually is),
1330  * it will be interpreted relative to the process' working directory
1331  * (which should always be $PGDATA when this code is running).
1332  */
1333 File
1335 {
1336  char *fnamecopy;
1337  File file;
1338  Vfd *vfdP;
1339 
1340  DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1341  fileName, fileFlags, fileMode));
1342 
1343  /*
1344  * We need a malloc'd copy of the file name; fail cleanly if no room.
1345  */
1346  fnamecopy = strdup(fileName);
1347  if (fnamecopy == NULL)
1348  ereport(ERROR,
1349  (errcode(ERRCODE_OUT_OF_MEMORY),
1350  errmsg("out of memory")));
1351 
1352  file = AllocateVfd();
1353  vfdP = &VfdCache[file];
1354 
1355  /* Close excess kernel FDs. */
1356  ReleaseLruFiles();
1357 
1358  vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1359 
1360  if (vfdP->fd < 0)
1361  {
1362  int save_errno = errno;
1363 
1364  FreeVfd(file);
1365  free(fnamecopy);
1366  errno = save_errno;
1367  return -1;
1368  }
1369  ++nfile;
1370  DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1371  vfdP->fd));
1372 
1373  Insert(file);
1374 
1375  vfdP->fileName = fnamecopy;
1376  /* Saved flags are adjusted to be OK for re-opening file */
1377  vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1378  vfdP->fileMode = fileMode;
1379  vfdP->fileSize = 0;
1380  vfdP->fdstate = 0x0;
1381  vfdP->resowner = NULL;
1382 
1383  return file;
1384 }
1385 
1386 /*
1387  * Create directory 'directory'. If necessary, create 'basedir', which must
1388  * be the directory above it. This is designed for creating the top-level
1389  * temporary directory on demand before creating a directory underneath it.
1390  * Do nothing if the directory already exists.
1391  *
1392  * Directories created within the top-level temporary directory should begin
1393  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1394  * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1395  * that do not need any particular prefix.
1396 */
1397 void
1399 {
1400  if (MakePGDirectory(directory) < 0)
1401  {
1402  if (errno == EEXIST)
1403  return;
1404 
1405  /*
1406  * Failed. Try to create basedir first in case it's missing. Tolerate
1407  * EEXIST to close a race against another process following the same
1408  * algorithm.
1409  */
1410  if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1411  ereport(ERROR,
1413  errmsg("cannot create temporary directory \"%s\": %m",
1414  basedir)));
1415 
1416  /* Try again. */
1417  if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1418  ereport(ERROR,
1420  errmsg("cannot create temporary subdirectory \"%s\": %m",
1421  directory)));
1422  }
1423 }
1424 
1425 /*
1426  * Delete a directory and everything in it, if it exists.
1427  */
1428 void
1429 PathNameDeleteTemporaryDir(const char *dirname)
1430 {
1431  struct stat statbuf;
1432 
1433  /* Silently ignore missing directory. */
1434  if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1435  return;
1436 
1437  /*
1438  * Currently, walkdir doesn't offer a way for our passed in function to
1439  * maintain state. Perhaps it should, so that we could tell the caller
1440  * whether this operation succeeded or failed. Since this operation is
1441  * used in a cleanup path, we wouldn't actually behave differently: we'll
1442  * just log failures.
1443  */
1444  walkdir(dirname, unlink_if_exists_fname, false, LOG);
1445 }
1446 
1447 /*
1448  * Open a temporary file that will disappear when we close it.
1449  *
1450  * This routine takes care of generating an appropriate tempfile name.
1451  * There's no need to pass in fileFlags or fileMode either, since only
1452  * one setting makes any sense for a temp file.
1453  *
1454  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1455  * to ensure it's closed and deleted when it's no longer needed, typically at
1456  * the end-of-transaction. In most cases, you don't want temporary files to
1457  * outlive the transaction that created them, so this should be false -- but
1458  * if you need "somewhat" temporary storage, this might be useful. In either
1459  * case, the file is removed when the File is explicitly closed.
1460  */
1461 File
1462 OpenTemporaryFile(bool interXact)
1463 {
1464  File file = 0;
1465 
1466  /*
1467  * Make sure the current resource owner has space for this File before we
1468  * open it, if we'll be registering it below.
1469  */
1470  if (!interXact)
1472 
1473  /*
1474  * If some temp tablespace(s) have been given to us, try to use the next
1475  * one. If a given tablespace can't be found, we silently fall back to
1476  * the database's default tablespace.
1477  *
1478  * BUT: if the temp file is slated to outlive the current transaction,
1479  * force it into the database's default tablespace, so that it will not
1480  * pose a threat to possible tablespace drop attempts.
1481  */
1482  if (numTempTableSpaces > 0 && !interXact)
1483  {
1484  Oid tblspcOid = GetNextTempTableSpace();
1485 
1486  if (OidIsValid(tblspcOid))
1487  file = OpenTemporaryFileInTablespace(tblspcOid, false);
1488  }
1489 
1490  /*
1491  * If not, or if tablespace is bad, create in database's default
1492  * tablespace. MyDatabaseTableSpace should normally be set before we get
1493  * here, but just in case it isn't, fall back to pg_default tablespace.
1494  */
1495  if (file <= 0)
1498  DEFAULTTABLESPACE_OID,
1499  true);
1500 
1501  /* Mark it for deletion at close and temporary file size limit */
1502  VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1503 
1504  /* Register it with the current resource owner */
1505  if (!interXact)
1506  RegisterTemporaryFile(file);
1507 
1508  return file;
1509 }
1510 
1511 /*
1512  * Return the path of the temp directory in a given tablespace.
1513  */
1514 void
1516 {
1517  /*
1518  * Identify the tempfile directory for this tablespace.
1519  *
1520  * If someone tries to specify pg_global, use pg_default instead.
1521  */
1522  if (tablespace == InvalidOid ||
1523  tablespace == DEFAULTTABLESPACE_OID ||
1524  tablespace == GLOBALTABLESPACE_OID)
1525  snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1526  else
1527  {
1528  /* All other tablespaces are accessed via symlinks */
1529  snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1530  tablespace, TABLESPACE_VERSION_DIRECTORY,
1532  }
1533 }
1534 
1535 /*
1536  * Open a temporary file in a specific tablespace.
1537  * Subroutine for OpenTemporaryFile, which see for details.
1538  */
1539 static File
1540 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1541 {
1542  char tempdirpath[MAXPGPATH];
1543  char tempfilepath[MAXPGPATH];
1544  File file;
1545 
1546  TempTablespacePath(tempdirpath, tblspcOid);
1547 
1548  /*
1549  * Generate a tempfile name that should be unique within the current
1550  * database instance.
1551  */
1552  snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1553  tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1554 
1555  /*
1556  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1557  * temp file that can be reused.
1558  */
1559  file = PathNameOpenFile(tempfilepath,
1560  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1561  if (file <= 0)
1562  {
1563  /*
1564  * We might need to create the tablespace's tempfile directory, if no
1565  * one has yet done so.
1566  *
1567  * Don't check for an error from MakePGDirectory; it could fail if
1568  * someone else just did the same thing. If it doesn't work then
1569  * we'll bomb out on the second create attempt, instead.
1570  */
1571  (void) MakePGDirectory(tempdirpath);
1572 
1573  file = PathNameOpenFile(tempfilepath,
1574  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1575  if (file <= 0 && rejectError)
1576  elog(ERROR, "could not create temporary file \"%s\": %m",
1577  tempfilepath);
1578  }
1579 
1580  return file;
1581 }
1582 
1583 
1584 /*
1585  * Create a new file. The directory containing it must already exist. Files
1586  * created this way are subject to temp_file_limit and are automatically
1587  * closed at end of transaction, but are not automatically deleted on close
1588  * because they are intended to be shared between cooperating backends.
1589  *
1590  * If the file is inside the top-level temporary directory, its name should
1591  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1592  * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1593  * inside a directory created with PathNameCreateTemporaryDir(), in which case
1594  * the prefix isn't needed.
1595  */
1596 File
1597 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1598 {
1599  File file;
1600 
1602 
1603  /*
1604  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1605  * temp file that can be reused.
1606  */
1607  file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1608  if (file <= 0)
1609  {
1610  if (error_on_failure)
1611  ereport(ERROR,
1613  errmsg("could not create temporary file \"%s\": %m",
1614  path)));
1615  else
1616  return file;
1617  }
1618 
1619  /* Mark it for temp_file_limit accounting. */
1620  VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1621 
1622  /* Register it for automatic close. */
1623  RegisterTemporaryFile(file);
1624 
1625  return file;
1626 }
1627 
1628 /*
1629  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1630  * another backend. Files opened this way don't count against the
1631  * temp_file_limit of the caller, are read-only and are automatically closed
1632  * at the end of the transaction but are not deleted on close.
1633  */
1634 File
1635 PathNameOpenTemporaryFile(const char *path)
1636 {
1637  File file;
1638 
1640 
1641  /* We open the file read-only. */
1642  file = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
1643 
1644  /* If no such file, then we don't raise an error. */
1645  if (file <= 0 && errno != ENOENT)
1646  ereport(ERROR,
1648  errmsg("could not open temporary file \"%s\": %m",
1649  path)));
1650 
1651  if (file > 0)
1652  {
1653  /* Register it for automatic close. */
1654  RegisterTemporaryFile(file);
1655  }
1656 
1657  return file;
1658 }
1659 
1660 /*
1661  * Delete a file by pathname. Return true if the file existed, false if
1662  * it didn't.
1663  */
1664 bool
1665 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1666 {
1667  struct stat filestats;
1668  int stat_errno;
1669 
1670  /* Get the final size for pgstat reporting. */
1671  if (stat(path, &filestats) != 0)
1672  stat_errno = errno;
1673  else
1674  stat_errno = 0;
1675 
1676  /*
1677  * Unlike FileClose's automatic file deletion code, we tolerate
1678  * non-existence to support BufFileDeleteShared which doesn't know how
1679  * many segments it has to delete until it runs out.
1680  */
1681  if (stat_errno == ENOENT)
1682  return false;
1683 
1684  if (unlink(path) < 0)
1685  {
1686  if (errno != ENOENT)
1687  ereport(error_on_failure ? ERROR : LOG,
1689  errmsg("could not unlink temporary file \"%s\": %m",
1690  path)));
1691  return false;
1692  }
1693 
1694  if (stat_errno == 0)
1695  ReportTemporaryFileUsage(path, filestats.st_size);
1696  else
1697  {
1698  errno = stat_errno;
1699  ereport(LOG,
1701  errmsg("could not stat file \"%s\": %m", path)));
1702  }
1703 
1704  return true;
1705 }
1706 
1707 /*
1708  * close a file when done with it
1709  */
1710 void
1712 {
1713  Vfd *vfdP;
1714 
1715  Assert(FileIsValid(file));
1716 
1717  DO_DB(elog(LOG, "FileClose: %d (%s)",
1718  file, VfdCache[file].fileName));
1719 
1720  vfdP = &VfdCache[file];
1721 
1722  if (!FileIsNotOpen(file))
1723  {
1724  /* close the file */
1725  if (close(vfdP->fd) != 0)
1726  {
1727  /*
1728  * We may need to panic on failure to close non-temporary files;
1729  * see LruDelete.
1730  */
1732  "could not close file \"%s\": %m", vfdP->fileName);
1733  }
1734 
1735  --nfile;
1736  vfdP->fd = VFD_CLOSED;
1737 
1738  /* remove the file from the lru ring */
1739  Delete(file);
1740  }
1741 
1742  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1743  {
1744  /* Subtract its size from current usage (do first in case of error) */
1745  temporary_files_size -= vfdP->fileSize;
1746  vfdP->fileSize = 0;
1747  }
1748 
1749  /*
1750  * Delete the file if it was temporary, and make a log entry if wanted
1751  */
1752  if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
1753  {
1754  struct stat filestats;
1755  int stat_errno;
1756 
1757  /*
1758  * If we get an error, as could happen within the ereport/elog calls,
1759  * we'll come right back here during transaction abort. Reset the
1760  * flag to ensure that we can't get into an infinite loop. This code
1761  * is arranged to ensure that the worst-case consequence is failing to
1762  * emit log message(s), not failing to attempt the unlink.
1763  */
1764  vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
1765 
1766 
1767  /* first try the stat() */
1768  if (stat(vfdP->fileName, &filestats))
1769  stat_errno = errno;
1770  else
1771  stat_errno = 0;
1772 
1773  /* in any case do the unlink */
1774  if (unlink(vfdP->fileName))
1775  elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
1776 
1777  /* and last report the stat results */
1778  if (stat_errno == 0)
1779  ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
1780  else
1781  {
1782  errno = stat_errno;
1783  elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
1784  }
1785  }
1786 
1787  /* Unregister it from the resource owner */
1788  if (vfdP->resowner)
1789  ResourceOwnerForgetFile(vfdP->resowner, file);
1790 
1791  /*
1792  * Return the Vfd slot to the free list
1793  */
1794  FreeVfd(file);
1795 }
1796 
1797 /*
1798  * FilePrefetch - initiate asynchronous read of a given range of the file.
1799  *
1800  * Currently the only implementation of this function is using posix_fadvise
1801  * which is the simplest standardized interface that accomplishes this.
1802  * We could add an implementation using libaio in the future; but note that
1803  * this API is inappropriate for libaio, which wants to have a buffer provided
1804  * to read into.
1805  */
1806 int
1807 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1808 {
1809 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1810  int returnCode;
1811 
1812  Assert(FileIsValid(file));
1813 
1814  DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1815  file, VfdCache[file].fileName,
1816  (int64) offset, amount));
1817 
1818  returnCode = FileAccess(file);
1819  if (returnCode < 0)
1820  return returnCode;
1821 
1822  pgstat_report_wait_start(wait_event_info);
1823  returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1824  POSIX_FADV_WILLNEED);
1826 
1827  return returnCode;
1828 #else
1829  Assert(FileIsValid(file));
1830  return 0;
1831 #endif
1832 }
1833 
1834 void
1835 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1836 {
1837  int returnCode;
1838 
1839  Assert(FileIsValid(file));
1840 
1841  DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1842  file, VfdCache[file].fileName,
1843  (int64) offset, (int64) nbytes));
1844 
1845  if (nbytes <= 0)
1846  return;
1847 
1848  returnCode = FileAccess(file);
1849  if (returnCode < 0)
1850  return;
1851 
1852  pgstat_report_wait_start(wait_event_info);
1853  pg_flush_data(VfdCache[file].fd, offset, nbytes);
1855 }
1856 
1857 int
1858 FileRead(File file, char *buffer, int amount, off_t offset,
1859  uint32 wait_event_info)
1860 {
1861  int returnCode;
1862  Vfd *vfdP;
1863 
1864  Assert(FileIsValid(file));
1865 
1866  DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1867  file, VfdCache[file].fileName,
1868  (int64) offset,
1869  amount, buffer));
1870 
1871  returnCode = FileAccess(file);
1872  if (returnCode < 0)
1873  return returnCode;
1874 
1875  vfdP = &VfdCache[file];
1876 
1877 retry:
1878  pgstat_report_wait_start(wait_event_info);
1879  returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
1881 
1882  if (returnCode < 0)
1883  {
1884  /*
1885  * Windows may run out of kernel buffers and return "Insufficient
1886  * system resources" error. Wait a bit and retry to solve it.
1887  *
1888  * It is rumored that EINTR is also possible on some Unix filesystems,
1889  * in which case immediate retry is indicated.
1890  */
1891 #ifdef WIN32
1892  DWORD error = GetLastError();
1893 
1894  switch (error)
1895  {
1896  case ERROR_NO_SYSTEM_RESOURCES:
1897  pg_usleep(1000L);
1898  errno = EINTR;
1899  break;
1900  default:
1901  _dosmaperr(error);
1902  break;
1903  }
1904 #endif
1905  /* OK to retry if interrupted */
1906  if (errno == EINTR)
1907  goto retry;
1908  }
1909 
1910  return returnCode;
1911 }
1912 
1913 int
1914 FileWrite(File file, char *buffer, int amount, off_t offset,
1915  uint32 wait_event_info)
1916 {
1917  int returnCode;
1918  Vfd *vfdP;
1919 
1920  Assert(FileIsValid(file));
1921 
1922  DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
1923  file, VfdCache[file].fileName,
1924  (int64) offset,
1925  amount, buffer));
1926 
1927  returnCode = FileAccess(file);
1928  if (returnCode < 0)
1929  return returnCode;
1930 
1931  vfdP = &VfdCache[file];
1932 
1933  /*
1934  * If enforcing temp_file_limit and it's a temp file, check to see if the
1935  * write would overrun temp_file_limit, and throw error if so. Note: it's
1936  * really a modularity violation to throw error here; we should set errno
1937  * and return -1. However, there's no way to report a suitable error
1938  * message if we do that. All current callers would just throw error
1939  * immediately anyway, so this is safe at present.
1940  */
1941  if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
1942  {
1943  off_t past_write = offset + amount;
1944 
1945  if (past_write > vfdP->fileSize)
1946  {
1947  uint64 newTotal = temporary_files_size;
1948 
1949  newTotal += past_write - vfdP->fileSize;
1950  if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
1951  ereport(ERROR,
1952  (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
1953  errmsg("temporary file size exceeds temp_file_limit (%dkB)",
1954  temp_file_limit)));
1955  }
1956  }
1957 
1958 retry:
1959  errno = 0;
1960  pgstat_report_wait_start(wait_event_info);
1961  returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
1963 
1964  /* if write didn't set errno, assume problem is no disk space */
1965  if (returnCode != amount && errno == 0)
1966  errno = ENOSPC;
1967 
1968  if (returnCode >= 0)
1969  {
1970  /*
1971  * Maintain fileSize and temporary_files_size if it's a temp file.
1972  */
1973  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1974  {
1975  off_t past_write = offset + amount;
1976 
1977  if (past_write > vfdP->fileSize)
1978  {
1979  temporary_files_size += past_write - vfdP->fileSize;
1980  vfdP->fileSize = past_write;
1981  }
1982  }
1983  }
1984  else
1985  {
1986  /*
1987  * See comments in FileRead()
1988  */
1989 #ifdef WIN32
1990  DWORD error = GetLastError();
1991 
1992  switch (error)
1993  {
1994  case ERROR_NO_SYSTEM_RESOURCES:
1995  pg_usleep(1000L);
1996  errno = EINTR;
1997  break;
1998  default:
1999  _dosmaperr(error);
2000  break;
2001  }
2002 #endif
2003  /* OK to retry if interrupted */
2004  if (errno == EINTR)
2005  goto retry;
2006  }
2007 
2008  return returnCode;
2009 }
2010 
2011 int
2012 FileSync(File file, uint32 wait_event_info)
2013 {
2014  int returnCode;
2015 
2016  Assert(FileIsValid(file));
2017 
2018  DO_DB(elog(LOG, "FileSync: %d (%s)",
2019  file, VfdCache[file].fileName));
2020 
2021  returnCode = FileAccess(file);
2022  if (returnCode < 0)
2023  return returnCode;
2024 
2025  pgstat_report_wait_start(wait_event_info);
2026  returnCode = pg_fsync(VfdCache[file].fd);
2028 
2029  return returnCode;
2030 }
2031 
2032 off_t
2034 {
2035  Assert(FileIsValid(file));
2036 
2037  DO_DB(elog(LOG, "FileSize %d (%s)",
2038  file, VfdCache[file].fileName));
2039 
2040  if (FileIsNotOpen(file))
2041  {
2042  if (FileAccess(file) < 0)
2043  return (off_t) -1;
2044  }
2045 
2046  return lseek(VfdCache[file].fd, 0, SEEK_END);
2047 }
2048 
2049 int
2050 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2051 {
2052  int returnCode;
2053 
2054  Assert(FileIsValid(file));
2055 
2056  DO_DB(elog(LOG, "FileTruncate %d (%s)",
2057  file, VfdCache[file].fileName));
2058 
2059  returnCode = FileAccess(file);
2060  if (returnCode < 0)
2061  return returnCode;
2062 
2063  pgstat_report_wait_start(wait_event_info);
2064  returnCode = ftruncate(VfdCache[file].fd, offset);
2066 
2067  if (returnCode == 0 && VfdCache[file].fileSize > offset)
2068  {
2069  /* adjust our state for truncation of a temp file */
2070  Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2071  temporary_files_size -= VfdCache[file].fileSize - offset;
2072  VfdCache[file].fileSize = offset;
2073  }
2074 
2075  return returnCode;
2076 }
2077 
2078 /*
2079  * Return the pathname associated with an open file.
2080  *
2081  * The returned string points to an internal buffer, which is valid until
2082  * the file is closed.
2083  */
2084 char *
2086 {
2087  Assert(FileIsValid(file));
2088 
2089  return VfdCache[file].fileName;
2090 }
2091 
2092 /*
2093  * Return the raw file descriptor of an opened file.
2094  *
2095  * The returned file descriptor will be valid until the file is closed, but
2096  * there are a lot of things that can make that happen. So the caller should
2097  * be careful not to do much of anything else before it finishes using the
2098  * returned file descriptor.
2099  */
2100 int
2102 {
2103  Assert(FileIsValid(file));
2104  return VfdCache[file].fd;
2105 }
2106 
2107 /*
2108  * FileGetRawFlags - returns the file flags on open(2)
2109  */
2110 int
2112 {
2113  Assert(FileIsValid(file));
2114  return VfdCache[file].fileFlags;
2115 }
2116 
2117 /*
2118  * FileGetRawMode - returns the mode bitmask passed to open(2)
2119  */
2120 mode_t
2122 {
2123  Assert(FileIsValid(file));
2124  return VfdCache[file].fileMode;
2125 }
2126 
2127 /*
2128  * Make room for another allocatedDescs[] array entry if needed and possible.
2129  * Returns true if an array element is available.
2130  */
2131 static bool
2133 {
2134  AllocateDesc *newDescs;
2135  int newMax;
2136 
2137  /* Quick out if array already has a free slot. */
2139  return true;
2140 
2141  /*
2142  * If the array hasn't yet been created in the current process, initialize
2143  * it with FD_MINFREE / 2 elements. In many scenarios this is as many as
2144  * we will ever need, anyway. We don't want to look at max_safe_fds
2145  * immediately because set_max_safe_fds() may not have run yet.
2146  */
2147  if (allocatedDescs == NULL)
2148  {
2149  newMax = FD_MINFREE / 2;
2150  newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2151  /* Out of memory already? Treat as fatal error. */
2152  if (newDescs == NULL)
2153  ereport(ERROR,
2154  (errcode(ERRCODE_OUT_OF_MEMORY),
2155  errmsg("out of memory")));
2156  allocatedDescs = newDescs;
2157  maxAllocatedDescs = newMax;
2158  return true;
2159  }
2160 
2161  /*
2162  * Consider enlarging the array beyond the initial allocation used above.
2163  * By the time this happens, max_safe_fds should be known accurately.
2164  *
2165  * We mustn't let allocated descriptors hog all the available FDs, and in
2166  * practice we'd better leave a reasonable number of FDs for VFD use. So
2167  * set the maximum to max_safe_fds / 2. (This should certainly be at
2168  * least as large as the initial size, FD_MINFREE / 2.)
2169  */
2170  newMax = max_safe_fds / 2;
2171  if (newMax > maxAllocatedDescs)
2172  {
2173  newDescs = (AllocateDesc *) realloc(allocatedDescs,
2174  newMax * sizeof(AllocateDesc));
2175  /* Treat out-of-memory as a non-fatal error. */
2176  if (newDescs == NULL)
2177  return false;
2178  allocatedDescs = newDescs;
2179  maxAllocatedDescs = newMax;
2180  return true;
2181  }
2182 
2183  /* Can't enlarge allocatedDescs[] any more. */
2184  return false;
2185 }
2186 
2187 /*
2188  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2189  * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2190  * necessary to open the file. When done, call FreeFile rather than fclose.
2191  *
2192  * Note that files that will be open for any significant length of time
2193  * should NOT be handled this way, since they cannot share kernel file
2194  * descriptors with other files; there is grave risk of running out of FDs
2195  * if anyone locks down too many FDs. Most callers of this routine are
2196  * simply reading a config file that they will read and close immediately.
2197  *
2198  * fd.c will automatically close all files opened with AllocateFile at
2199  * transaction commit or abort; this prevents FD leakage if a routine
2200  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2201  *
2202  * Ideally this should be the *only* direct call of fopen() in the backend.
2203  */
2204 FILE *
2205 AllocateFile(const char *name, const char *mode)
2206 {
2207  FILE *file;
2208 
2209  DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2210  numAllocatedDescs, name));
2211 
2212  /* Can we allocate another non-virtual FD? */
2213  if (!reserveAllocatedDesc())
2214  ereport(ERROR,
2215  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2216  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2217  maxAllocatedDescs, name)));
2218 
2219  /* Close excess kernel FDs. */
2220  ReleaseLruFiles();
2221 
2222 TryAgain:
2223  if ((file = fopen(name, mode)) != NULL)
2224  {
2225  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2226 
2227  desc->kind = AllocateDescFile;
2228  desc->desc.file = file;
2231  return desc->desc.file;
2232  }
2233 
2234  if (errno == EMFILE || errno == ENFILE)
2235  {
2236  int save_errno = errno;
2237 
2238  ereport(LOG,
2239  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2240  errmsg("out of file descriptors: %m; release and retry")));
2241  errno = 0;
2242  if (ReleaseLruFile())
2243  goto TryAgain;
2244  errno = save_errno;
2245  }
2246 
2247  return NULL;
2248 }
2249 
2250 /*
2251  * Open a file with OpenTransientFilePerm() and pass default file mode for
2252  * the fileMode parameter.
2253  */
2254 int
2256 {
2257  return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2258 }
2259 
2260 /*
2261  * Like AllocateFile, but returns an unbuffered fd like open(2)
2262  */
2263 int
2265 {
2266  int fd;
2267 
2268  DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2269  numAllocatedDescs, fileName));
2270 
2271  /* Can we allocate another non-virtual FD? */
2272  if (!reserveAllocatedDesc())
2273  ereport(ERROR,
2274  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2275  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2276  maxAllocatedDescs, fileName)));
2277 
2278  /* Close excess kernel FDs. */
2279  ReleaseLruFiles();
2280 
2281  fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2282 
2283  if (fd >= 0)
2284  {
2285  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2286 
2287  desc->kind = AllocateDescRawFD;
2288  desc->desc.fd = fd;
2291 
2292  return fd;
2293  }
2294 
2295  return -1; /* failure */
2296 }
2297 
2298 /*
2299  * Routines that want to initiate a pipe stream should use OpenPipeStream
2300  * rather than plain popen(). This lets fd.c deal with freeing FDs if
2301  * necessary. When done, call ClosePipeStream rather than pclose.
2302  *
2303  * This function also ensures that the popen'd program is run with default
2304  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2305  * uses. This ensures desirable response to, eg, closing a read pipe early.
2306  */
2307 FILE *
2308 OpenPipeStream(const char *command, const char *mode)
2309 {
2310  FILE *file;
2311  int save_errno;
2312 
2313  DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2314  numAllocatedDescs, command));
2315 
2316  /* Can we allocate another non-virtual FD? */
2317  if (!reserveAllocatedDesc())
2318  ereport(ERROR,
2319  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2320  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2321  maxAllocatedDescs, command)));
2322 
2323  /* Close excess kernel FDs. */
2324  ReleaseLruFiles();
2325 
2326 TryAgain:
2327  fflush(stdout);
2328  fflush(stderr);
2330  errno = 0;
2331  file = popen(command, mode);
2332  save_errno = errno;
2334  errno = save_errno;
2335  if (file != NULL)
2336  {
2337  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2338 
2339  desc->kind = AllocateDescPipe;
2340  desc->desc.file = file;
2343  return desc->desc.file;
2344  }
2345 
2346  if (errno == EMFILE || errno == ENFILE)
2347  {
2348  ereport(LOG,
2349  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2350  errmsg("out of file descriptors: %m; release and retry")));
2351  if (ReleaseLruFile())
2352  goto TryAgain;
2353  errno = save_errno;
2354  }
2355 
2356  return NULL;
2357 }
2358 
2359 /*
2360  * Free an AllocateDesc of any type.
2361  *
2362  * The argument *must* point into the allocatedDescs[] array.
2363  */
2364 static int
2366 {
2367  int result;
2368 
2369  /* Close the underlying object */
2370  switch (desc->kind)
2371  {
2372  case AllocateDescFile:
2373  result = fclose(desc->desc.file);
2374  break;
2375  case AllocateDescPipe:
2376  result = pclose(desc->desc.file);
2377  break;
2378  case AllocateDescDir:
2379  result = closedir(desc->desc.dir);
2380  break;
2381  case AllocateDescRawFD:
2382  result = close(desc->desc.fd);
2383  break;
2384  default:
2385  elog(ERROR, "AllocateDesc kind not recognized");
2386  result = 0; /* keep compiler quiet */
2387  break;
2388  }
2389 
2390  /* Compact storage in the allocatedDescs array */
2392  *desc = allocatedDescs[numAllocatedDescs];
2393 
2394  return result;
2395 }
2396 
2397 /*
2398  * Close a file returned by AllocateFile.
2399  *
2400  * Note we do not check fclose's return value --- it is up to the caller
2401  * to handle close errors.
2402  */
2403 int
2404 FreeFile(FILE *file)
2405 {
2406  int i;
2407 
2408  DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2409 
2410  /* Remove file from list of allocated files, if it's present */
2411  for (i = numAllocatedDescs; --i >= 0;)
2412  {
2413  AllocateDesc *desc = &allocatedDescs[i];
2414 
2415  if (desc->kind == AllocateDescFile && desc->desc.file == file)
2416  return FreeDesc(desc);
2417  }
2418 
2419  /* Only get here if someone passes us a file not in allocatedDescs */
2420  elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2421 
2422  return fclose(file);
2423 }
2424 
2425 /*
2426  * Close a file returned by OpenTransientFile.
2427  *
2428  * Note we do not check close's return value --- it is up to the caller
2429  * to handle close errors.
2430  */
2431 int
2433 {
2434  int i;
2435 
2436  DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2437 
2438  /* Remove fd from list of allocated files, if it's present */
2439  for (i = numAllocatedDescs; --i >= 0;)
2440  {
2441  AllocateDesc *desc = &allocatedDescs[i];
2442 
2443  if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2444  return FreeDesc(desc);
2445  }
2446 
2447  /* Only get here if someone passes us a file not in allocatedDescs */
2448  elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2449 
2450  return close(fd);
2451 }
2452 
2453 /*
2454  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2455  * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2456  * necessary to open the directory, and with closing it after an elog.
2457  * When done, call FreeDir rather than closedir.
2458  *
2459  * Returns NULL, with errno set, on failure. Note that failure detection
2460  * is commonly left to the following call of ReadDir or ReadDirExtended;
2461  * see the comments for ReadDir.
2462  *
2463  * Ideally this should be the *only* direct call of opendir() in the backend.
2464  */
2465 DIR *
2466 AllocateDir(const char *dirname)
2467 {
2468  DIR *dir;
2469 
2470  DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2471  numAllocatedDescs, dirname));
2472 
2473  /* Can we allocate another non-virtual FD? */
2474  if (!reserveAllocatedDesc())
2475  ereport(ERROR,
2476  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2477  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2478  maxAllocatedDescs, dirname)));
2479 
2480  /* Close excess kernel FDs. */
2481  ReleaseLruFiles();
2482 
2483 TryAgain:
2484  if ((dir = opendir(dirname)) != NULL)
2485  {
2486  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2487 
2488  desc->kind = AllocateDescDir;
2489  desc->desc.dir = dir;
2492  return desc->desc.dir;
2493  }
2494 
2495  if (errno == EMFILE || errno == ENFILE)
2496  {
2497  int save_errno = errno;
2498 
2499  ereport(LOG,
2500  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2501  errmsg("out of file descriptors: %m; release and retry")));
2502  errno = 0;
2503  if (ReleaseLruFile())
2504  goto TryAgain;
2505  errno = save_errno;
2506  }
2507 
2508  return NULL;
2509 }
2510 
2511 /*
2512  * Read a directory opened with AllocateDir, ereport'ing any error.
2513  *
2514  * This is easier to use than raw readdir() since it takes care of some
2515  * otherwise rather tedious and error-prone manipulation of errno. Also,
2516  * if you are happy with a generic error message for AllocateDir failure,
2517  * you can just do
2518  *
2519  * dir = AllocateDir(path);
2520  * while ((dirent = ReadDir(dir, path)) != NULL)
2521  * process dirent;
2522  * FreeDir(dir);
2523  *
2524  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2525  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2526  * use this shortcut.)
2527  *
2528  * The pathname passed to AllocateDir must be passed to this routine too,
2529  * but it is only used for error reporting.
2530  */
2531 struct dirent *
2532 ReadDir(DIR *dir, const char *dirname)
2533 {
2534  return ReadDirExtended(dir, dirname, ERROR);
2535 }
2536 
2537 /*
2538  * Alternate version of ReadDir that allows caller to specify the elevel
2539  * for any error report (whether it's reporting an initial failure of
2540  * AllocateDir or a subsequent directory read failure).
2541  *
2542  * If elevel < ERROR, returns NULL after any error. With the normal coding
2543  * pattern, this will result in falling out of the loop immediately as
2544  * though the directory contained no (more) entries.
2545  */
2546 struct dirent *
2547 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2548 {
2549  struct dirent *dent;
2550 
2551  /* Give a generic message for AllocateDir failure, if caller didn't */
2552  if (dir == NULL)
2553  {
2554  ereport(elevel,
2556  errmsg("could not open directory \"%s\": %m",
2557  dirname)));
2558  return NULL;
2559  }
2560 
2561  errno = 0;
2562  if ((dent = readdir(dir)) != NULL)
2563  return dent;
2564 
2565  if (errno)
2566  ereport(elevel,
2568  errmsg("could not read directory \"%s\": %m",
2569  dirname)));
2570  return NULL;
2571 }
2572 
2573 /*
2574  * Close a directory opened with AllocateDir.
2575  *
2576  * Returns closedir's return value (with errno set if it's not 0).
2577  * Note we do not check the return value --- it is up to the caller
2578  * to handle close errors if wanted.
2579  *
2580  * Does nothing if dir == NULL; we assume that directory open failure was
2581  * already reported if desired.
2582  */
2583 int
2585 {
2586  int i;
2587 
2588  /* Nothing to do if AllocateDir failed */
2589  if (dir == NULL)
2590  return 0;
2591 
2592  DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2593 
2594  /* Remove dir from list of allocated dirs, if it's present */
2595  for (i = numAllocatedDescs; --i >= 0;)
2596  {
2597  AllocateDesc *desc = &allocatedDescs[i];
2598 
2599  if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2600  return FreeDesc(desc);
2601  }
2602 
2603  /* Only get here if someone passes us a dir not in allocatedDescs */
2604  elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2605 
2606  return closedir(dir);
2607 }
2608 
2609 
2610 /*
2611  * Close a pipe stream returned by OpenPipeStream.
2612  */
2613 int
2614 ClosePipeStream(FILE *file)
2615 {
2616  int i;
2617 
2618  DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2619 
2620  /* Remove file from list of allocated files, if it's present */
2621  for (i = numAllocatedDescs; --i >= 0;)
2622  {
2623  AllocateDesc *desc = &allocatedDescs[i];
2624 
2625  if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2626  return FreeDesc(desc);
2627  }
2628 
2629  /* Only get here if someone passes us a file not in allocatedDescs */
2630  elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2631 
2632  return pclose(file);
2633 }
2634 
2635 /*
2636  * closeAllVfds
2637  *
2638  * Force all VFDs into the physically-closed state, so that the fewest
2639  * possible number of kernel file descriptors are in use. There is no
2640  * change in the logical state of the VFDs.
2641  */
2642 void
2644 {
2645  Index i;
2646 
2647  if (SizeVfdCache > 0)
2648  {
2649  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2650  for (i = 1; i < SizeVfdCache; i++)
2651  {
2652  if (!FileIsNotOpen(i))
2653  LruDelete(i);
2654  }
2655  }
2656 }
2657 
2658 
2659 /*
2660  * SetTempTablespaces
2661  *
2662  * Define a list (actually an array) of OIDs of tablespaces to use for
2663  * temporary files. This list will be used until end of transaction,
2664  * unless this function is called again before then. It is caller's
2665  * responsibility that the passed-in array has adequate lifespan (typically
2666  * it'd be allocated in TopTransactionContext).
2667  */
2668 void
2669 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2670 {
2671  Assert(numSpaces >= 0);
2672  tempTableSpaces = tableSpaces;
2673  numTempTableSpaces = numSpaces;
2674 
2675  /*
2676  * Select a random starting point in the list. This is to minimize
2677  * conflicts between backends that are most likely sharing the same list
2678  * of temp tablespaces. Note that if we create multiple temp files in the
2679  * same transaction, we'll advance circularly through the list --- this
2680  * ensures that large temporary sort files are nicely spread across all
2681  * available tablespaces.
2682  */
2683  if (numSpaces > 1)
2684  nextTempTableSpace = random() % numSpaces;
2685  else
2686  nextTempTableSpace = 0;
2687 }
2688 
2689 /*
2690  * TempTablespacesAreSet
2691  *
2692  * Returns true if SetTempTablespaces has been called in current transaction.
2693  * (This is just so that tablespaces.c doesn't need its own per-transaction
2694  * state.)
2695  */
2696 bool
2698 {
2699  return (numTempTableSpaces >= 0);
2700 }
2701 
2702 /*
2703  * GetTempTablespaces
2704  *
2705  * Populate an array with the OIDs of the tablespaces that should be used for
2706  * temporary files. Return the number that were copied into the output array.
2707  */
2708 int
2709 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2710 {
2711  int i;
2712 
2714  for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2715  tableSpaces[i] = tempTableSpaces[i];
2716 
2717  return i;
2718 }
2719 
2720 /*
2721  * GetNextTempTableSpace
2722  *
2723  * Select the next temp tablespace to use. A result of InvalidOid means
2724  * to use the current database's default tablespace.
2725  */
2726 Oid
2728 {
2729  if (numTempTableSpaces > 0)
2730  {
2731  /* Advance nextTempTableSpace counter with wraparound */
2733  nextTempTableSpace = 0;
2735  }
2736  return InvalidOid;
2737 }
2738 
2739 
2740 /*
2741  * AtEOSubXact_Files
2742  *
2743  * Take care of subtransaction commit/abort. At abort, we close temp files
2744  * that the subtransaction may have opened. At commit, we reassign the
2745  * files that were opened to the parent subtransaction.
2746  */
2747 void
2748 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2749  SubTransactionId parentSubid)
2750 {
2751  Index i;
2752 
2753  for (i = 0; i < numAllocatedDescs; i++)
2754  {
2755  if (allocatedDescs[i].create_subid == mySubid)
2756  {
2757  if (isCommit)
2758  allocatedDescs[i].create_subid = parentSubid;
2759  else
2760  {
2761  /* have to recheck the item after FreeDesc (ugly) */
2762  FreeDesc(&allocatedDescs[i--]);
2763  }
2764  }
2765  }
2766 }
2767 
2768 /*
2769  * AtEOXact_Files
2770  *
2771  * This routine is called during transaction commit or abort. All still-open
2772  * per-transaction temporary file VFDs are closed, which also causes the
2773  * underlying files to be deleted (although they should've been closed already
2774  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
2775  * closed. We also forget any transaction-local temp tablespace list.
2776  *
2777  * The isCommit flag is used only to decide whether to emit warnings about
2778  * unclosed files.
2779  */
2780 void
2781 AtEOXact_Files(bool isCommit)
2782 {
2783  CleanupTempFiles(isCommit, false);
2784  tempTableSpaces = NULL;
2785  numTempTableSpaces = -1;
2786 }
2787 
2788 /*
2789  * AtProcExit_Files
2790  *
2791  * on_proc_exit hook to clean up temp files during backend shutdown.
2792  * Here, we want to clean up *all* temp files including interXact ones.
2793  */
2794 static void
2796 {
2797  CleanupTempFiles(false, true);
2798 }
2799 
2800 /*
2801  * Close temporary files and delete their underlying files.
2802  *
2803  * isCommit: if true, this is normal transaction commit, and we don't
2804  * expect any remaining files; warn if there are some.
2805  *
2806  * isProcExit: if true, this is being called as the backend process is
2807  * exiting. If that's the case, we should remove all temporary files; if
2808  * that's not the case, we are being called for transaction commit/abort
2809  * and should only remove transaction-local temp files. In either case,
2810  * also clean up "allocated" stdio files, dirs and fds.
2811  */
2812 static void
2813 CleanupTempFiles(bool isCommit, bool isProcExit)
2814 {
2815  Index i;
2816 
2817  /*
2818  * Careful here: at proc_exit we need extra cleanup, not just
2819  * xact_temporary files.
2820  */
2821  if (isProcExit || have_xact_temporary_files)
2822  {
2823  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2824  for (i = 1; i < SizeVfdCache; i++)
2825  {
2826  unsigned short fdstate = VfdCache[i].fdstate;
2827 
2828  if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
2829  VfdCache[i].fileName != NULL)
2830  {
2831  /*
2832  * If we're in the process of exiting a backend process, close
2833  * all temporary files. Otherwise, only close temporary files
2834  * local to the current transaction. They should be closed by
2835  * the ResourceOwner mechanism already, so this is just a
2836  * debugging cross-check.
2837  */
2838  if (isProcExit)
2839  FileClose(i);
2840  else if (fdstate & FD_CLOSE_AT_EOXACT)
2841  {
2842  elog(WARNING,
2843  "temporary file %s not closed at end-of-transaction",
2844  VfdCache[i].fileName);
2845  FileClose(i);
2846  }
2847  }
2848  }
2849 
2850  have_xact_temporary_files = false;
2851  }
2852 
2853  /* Complain if any allocated files remain open at commit. */
2854  if (isCommit && numAllocatedDescs > 0)
2855  elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
2857 
2858  /* Clean up "allocated" stdio files, dirs and fds. */
2859  while (numAllocatedDescs > 0)
2860  FreeDesc(&allocatedDescs[0]);
2861 }
2862 
2863 
2864 /*
2865  * Remove temporary and temporary relation files left over from a prior
2866  * postmaster session
2867  *
2868  * This should be called during postmaster startup. It will forcibly
2869  * remove any leftover files created by OpenTemporaryFile and any leftover
2870  * temporary relation files created by mdcreate.
2871  *
2872  * NOTE: we could, but don't, call this during a post-backend-crash restart
2873  * cycle. The argument for not doing it is that someone might want to examine
2874  * the temp files for debugging purposes. This does however mean that
2875  * OpenTemporaryFile had better allow for collision with an existing temp
2876  * file name.
2877  *
2878  * NOTE: this function and its subroutines generally report syscall failures
2879  * with ereport(LOG) and keep going. Removing temp files is not so critical
2880  * that we should fail to start the database when we can't do it.
2881  */
2882 void
2884 {
2885  char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
2886  DIR *spc_dir;
2887  struct dirent *spc_de;
2888 
2889  /*
2890  * First process temp files in pg_default ($PGDATA/base)
2891  */
2892  snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
2893  RemovePgTempFilesInDir(temp_path, true, false);
2894  RemovePgTempRelationFiles("base");
2895 
2896  /*
2897  * Cycle through temp directories for all non-default tablespaces.
2898  */
2899  spc_dir = AllocateDir("pg_tblspc");
2900 
2901  while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
2902  {
2903  if (strcmp(spc_de->d_name, ".") == 0 ||
2904  strcmp(spc_de->d_name, "..") == 0)
2905  continue;
2906 
2907  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
2909  RemovePgTempFilesInDir(temp_path, true, false);
2910 
2911  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
2913  RemovePgTempRelationFiles(temp_path);
2914  }
2915 
2916  FreeDir(spc_dir);
2917 
2918  /*
2919  * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
2920  * DataDir as well. However, that is *not* cleaned here because doing so
2921  * would create a race condition. It's done separately, earlier in
2922  * postmaster startup.
2923  */
2924 }
2925 
2926 /*
2927  * Process one pgsql_tmp directory for RemovePgTempFiles.
2928  *
2929  * If missing_ok is true, it's all right for the named directory to not exist.
2930  * Any other problem results in a LOG message. (missing_ok should be true at
2931  * the top level, since pgsql_tmp directories are not created until needed.)
2932  *
2933  * At the top level, this should be called with unlink_all = false, so that
2934  * only files matching the temporary name prefix will be unlinked. When
2935  * recursing it will be called with unlink_all = true to unlink everything
2936  * under a top-level temporary directory.
2937  *
2938  * (These two flags could be replaced by one, but it seems clearer to keep
2939  * them separate.)
2940  */
2941 void
2942 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
2943 {
2944  DIR *temp_dir;
2945  struct dirent *temp_de;
2946  char rm_path[MAXPGPATH * 2];
2947 
2948  temp_dir = AllocateDir(tmpdirname);
2949 
2950  if (temp_dir == NULL && errno == ENOENT && missing_ok)
2951  return;
2952 
2953  while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
2954  {
2955  if (strcmp(temp_de->d_name, ".") == 0 ||
2956  strcmp(temp_de->d_name, "..") == 0)
2957  continue;
2958 
2959  snprintf(rm_path, sizeof(rm_path), "%s/%s",
2960  tmpdirname, temp_de->d_name);
2961 
2962  if (unlink_all ||
2963  strncmp(temp_de->d_name,
2965  strlen(PG_TEMP_FILE_PREFIX)) == 0)
2966  {
2967  struct stat statbuf;
2968 
2969  if (lstat(rm_path, &statbuf) < 0)
2970  {
2971  ereport(LOG,
2973  errmsg("could not stat file \"%s\": %m", rm_path)));
2974  continue;
2975  }
2976 
2977  if (S_ISDIR(statbuf.st_mode))
2978  {
2979  /* recursively remove contents, then directory itself */
2980  RemovePgTempFilesInDir(rm_path, false, true);
2981 
2982  if (rmdir(rm_path) < 0)
2983  ereport(LOG,
2985  errmsg("could not remove directory \"%s\": %m",
2986  rm_path)));
2987  }
2988  else
2989  {
2990  if (unlink(rm_path) < 0)
2991  ereport(LOG,
2993  errmsg("could not remove file \"%s\": %m",
2994  rm_path)));
2995  }
2996  }
2997  else
2998  ereport(LOG,
2999  (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3000  rm_path)));
3001  }
3002 
3003  FreeDir(temp_dir);
3004 }
3005 
3006 /* Process one tablespace directory, look for per-DB subdirectories */
3007 static void
3008 RemovePgTempRelationFiles(const char *tsdirname)
3009 {
3010  DIR *ts_dir;
3011  struct dirent *de;
3012  char dbspace_path[MAXPGPATH * 2];
3013 
3014  ts_dir = AllocateDir(tsdirname);
3015 
3016  while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3017  {
3018  /*
3019  * We're only interested in the per-database directories, which have
3020  * numeric names. Note that this code will also (properly) ignore "."
3021  * and "..".
3022  */
3023  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3024  continue;
3025 
3026  snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3027  tsdirname, de->d_name);
3028  RemovePgTempRelationFilesInDbspace(dbspace_path);
3029  }
3030 
3031  FreeDir(ts_dir);
3032 }
3033 
3034 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3035 static void
3036 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3037 {
3038  DIR *dbspace_dir;
3039  struct dirent *de;
3040  char rm_path[MAXPGPATH * 2];
3041 
3042  dbspace_dir = AllocateDir(dbspacedirname);
3043 
3044  while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3045  {
3046  if (!looks_like_temp_rel_name(de->d_name))
3047  continue;
3048 
3049  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3050  dbspacedirname, de->d_name);
3051 
3052  if (unlink(rm_path) < 0)
3053  ereport(LOG,
3055  errmsg("could not remove file \"%s\": %m",
3056  rm_path)));
3057  }
3058 
3059  FreeDir(dbspace_dir);
3060 }
3061 
/*
 * Does the given file name look like a temporary relation file?
 *
 * Accepted forms are t<digits>_<digits> or t<digits>_<digits>_<forkname>,
 * each optionally followed by .<digits> (a segment number).
 */
bool
looks_like_temp_rel_name(const char *name)
{
	int			pos;
	int			savepos;

	/* Must start with "t". */
	if (name[0] != 't')
		return false;

	/* Followed by a non-empty string of digits and then an underscore. */
	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (pos == 1 || name[pos] != '_')
		return false;

	/* Followed by another nonempty string of digits. */
	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (savepos == pos)
		return false;

	/* We might have _forkname or .segment or both. */
	if (name[pos] == '_')
	{
		/*
		 * A result <= 0 is treated as "no valid fork name here"; otherwise
		 * it is used as the number of characters to skip (presumably the
		 * fork name's length --- forkname_chars is defined elsewhere).
		 */
		int			forkchar = forkname_chars(&name[pos + 1], NULL);

		if (forkchar <= 0)
			return false;
		pos += forkchar + 1;
	}
	if (name[pos] == '.')
	{
		int			segchar;

		/* segchar counts the '.' plus the digits that follow it */
		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
			;
		if (segchar <= 1)
			return false;
		pos += segchar;
	}

	/* Now we should be at the end. */
	if (name[pos] != '\0')
		return false;
	return true;
}
3110 
3111 
3112 /*
3113  * Issue fsync recursively on PGDATA and all its contents.
3114  *
3115  * We fsync regular files and directories wherever they are, but we
3116  * follow symlinks only for pg_wal and immediately under pg_tblspc.
3117  * Other symlinks are presumed to point at files we're not responsible
3118  * for fsyncing, and might not have privileges to write at all.
3119  *
3120  * Errors are logged but not considered fatal; that's because this is used
3121  * only during database startup, to deal with the possibility that there are
3122  * issued-but-unsynced writes pending against the data directory. We want to
3123  * ensure that such writes reach disk before anything that's done in the new
3124  * run. However, aborting on error would result in failure to start for
3125  * harmless cases such as read-only files in the data directory, and that's
3126  * not good either.
3127  *
3128  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3129  * rewriting all changes again during recovery.
3130  *
3131  * Note we assume we're chdir'd into PGDATA to begin with.
3132  */
3133 void
3135 {
3136  bool xlog_is_symlink;
3137 
3138  /* We can skip this whole thing if fsync is disabled. */
3139  if (!enableFsync)
3140  return;
3141 
3142  /*
3143  * If pg_wal is a symlink, we'll need to recurse into it separately,
3144  * because the first walkdir below will ignore it.
3145  */
3146  xlog_is_symlink = false;
3147 
3148 #ifndef WIN32
3149  {
3150  struct stat st;
3151 
3152  if (lstat("pg_wal", &st) < 0)
3153  ereport(LOG,
3155  errmsg("could not stat file \"%s\": %m",
3156  "pg_wal")));
3157  else if (S_ISLNK(st.st_mode))
3158  xlog_is_symlink = true;
3159  }
3160 #else
3161  if (pgwin32_is_junction("pg_wal"))
3162  xlog_is_symlink = true;
3163 #endif
3164 
3165  /*
3166  * If possible, hint to the kernel that we're soon going to fsync the data
3167  * directory and its contents. Errors in this step are even less
3168  * interesting than normal, so log them only at DEBUG1.
3169  */
3170 #ifdef PG_FLUSH_DATA_WORKS
3171  walkdir(".", pre_sync_fname, false, DEBUG1);
3172  if (xlog_is_symlink)
3173  walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3174  walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3175 #endif
3176 
3177  /*
3178  * Now we do the fsync()s in the same order.
3179  *
3180  * The main call ignores symlinks, so in addition to specially processing
3181  * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3182  * process_symlinks = true. Note that if there are any plain directories
3183  * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3184  * so we don't worry about optimizing it.
3185  */
3186  walkdir(".", datadir_fsync_fname, false, LOG);
3187  if (xlog_is_symlink)
3188  walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3189  walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3190 }
3191 
3192 /*
3193  * walkdir: recursively walk a directory, applying the action to each
3194  * regular file and directory (including the named directory itself).
3195  *
3196  * If process_symlinks is true, the action and recursion are also applied
3197  * to regular files and directories that are pointed to by symlinks in the
3198  * given directory; otherwise symlinks are ignored. Symlinks are always
3199  * ignored in subdirectories, ie we intentionally don't pass down the
3200  * process_symlinks flag to recursive calls.
3201  *
3202  * Errors are reported at level elevel, which might be ERROR or less.
3203  *
3204  * See also walkdir in file_utils.c, which is a frontend version of this
3205  * logic.
3206  */
3207 static void
3208 walkdir(const char *path,
3209  void (*action) (const char *fname, bool isdir, int elevel),
3210  bool process_symlinks,
3211  int elevel)
3212 {
3213  DIR *dir;
3214  struct dirent *de;
3215 
3216  dir = AllocateDir(path);
3217 
3218  while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3219  {
3220  char subpath[MAXPGPATH * 2];
3221  struct stat fst;
3222  int sret;
3223 
3225 
3226  if (strcmp(de->d_name, ".") == 0 ||
3227  strcmp(de->d_name, "..") == 0)
3228  continue;
3229 
3230  snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3231 
3232  if (process_symlinks)
3233  sret = stat(subpath, &fst);
3234  else
3235  sret = lstat(subpath, &fst);
3236 
3237  if (sret < 0)
3238  {
3239  ereport(elevel,
3241  errmsg("could not stat file \"%s\": %m", subpath)));
3242  continue;
3243  }
3244 
3245  if (S_ISREG(fst.st_mode))
3246  (*action) (subpath, false, elevel);
3247  else if (S_ISDIR(fst.st_mode))
3248  walkdir(subpath, action, false, elevel);
3249  }
3250 
3251  FreeDir(dir); /* we ignore any error here */
3252 
3253  /*
3254  * It's important to fsync the destination directory itself as individual
3255  * file fsyncs don't guarantee that the directory entry for the file is
3256  * synced. However, skip this if AllocateDir failed; the action function
3257  * might not be robust against that.
3258  */
3259  if (dir)
3260  (*action) (path, true, elevel);
3261 }
3262 
3263 
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Ignores errors trying to open unreadable files, and logs other errors at a
 * caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* Unreadable files are silently skipped */
		if (errno == EACCES)
			return;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3306 
/*
 * fsync one file or directory via fsync_fname_ext, with errors about
 * unreadable files silently ignored.  The signature matches the action
 * callback type that walkdir takes.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/* Ask fsync_fname_ext to stay quiet about files we cannot read */
	const bool	ignore_perm = true;

	/* The result is deliberately discarded */
	(void) fsync_fname_ext(fname, isdir, ignore_perm, elevel);
}
3316 
/*
 * unlink_if_exists_fname -- remove a file or directory if it is present
 *
 * fname:	path of the entry to remove
 * isdir:	true to remove a directory with rmdir(), false to remove a file
 * elevel:	error level used when reporting a failed rmdir()
 *
 * A directory that is already gone (ENOENT) is not an error.  Plain files
 * are handed to PathNameDeleteTemporaryFile() with error_on_failure = false;
 * elevel is not passed along on that path.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}
3333 
3334 /*
3335  * fsync_fname_ext -- Try to fsync a file or directory
3336  *
3337  * If ignore_perm is true, ignore errors upon trying to open unreadable
3338  * files. Logs other errors at a caller-specified level.
3339  *
3340  * Returns 0 if the operation succeeded, -1 otherwise.
3341  */
3342 static int
3343 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3344 {
3345  int fd;
3346  int flags;
3347  int returncode;
3348 
3349  /*
3350  * Some OSs require directories to be opened read-only whereas other
3351  * systems don't allow us to fsync files opened read-only; so we need both
3352  * cases here. Using O_RDWR will cause us to fail to fsync files that are
3353  * not writable by our userid, but we assume that's OK.
3354  */
3355  flags = PG_BINARY;
3356  if (!isdir)
3357  flags |= O_RDWR;
3358  else
3359  flags |= O_RDONLY;
3360 
3361  fd = OpenTransientFile(fname, flags);
3362 
3363  /*
3364  * Some OSs don't allow us to open directories at all (Windows returns
3365  * EACCES), just ignore the error in that case. If desired also silently
3366  * ignoring errors about unreadable files. Log others.
3367  */
3368  if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3369  return 0;
3370  else if (fd < 0 && ignore_perm && errno == EACCES)
3371  return 0;
3372  else if (fd < 0)
3373  {
3374  ereport(elevel,
3376  errmsg("could not open file \"%s\": %m", fname)));
3377  return -1;
3378  }
3379 
3380  returncode = pg_fsync(fd);
3381 
3382  /*
3383  * Some OSes don't allow us to fsync directories at all, so we can ignore
3384  * those errors. Anything else needs to be logged.
3385  */
3386  if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3387  {
3388  int save_errno;
3389 
3390  /* close file upon error, might not be in transaction context */
3391  save_errno = errno;
3392  (void) CloseTransientFile(fd);
3393  errno = save_errno;
3394 
3395  ereport(elevel,
3397  errmsg("could not fsync file \"%s\": %m", fname)));
3398  return -1;
3399  }
3400 
3401  if (CloseTransientFile(fd) != 0)
3402  {
3403  ereport(elevel,
3405  errmsg("could not close file \"%s\": %m", fname)));
3406  return -1;
3407  }
3408 
3409  return 0;
3410 }
3411 
3412 /*
3413  * fsync_parent_path -- fsync the parent path of a file or directory
3414  *
3415  * This is aimed at making file operations persistent on disk in case of
3416  * an OS crash or power failure.
3417  */
3418 static int
3419 fsync_parent_path(const char *fname, int elevel)
3420 {
3421  char parentpath[MAXPGPATH];
3422 
3423  strlcpy(parentpath, fname, MAXPGPATH);
3424  get_parent_directory(parentpath);
3425 
3426  /*
3427  * get_parent_directory() returns an empty string if the input argument is
3428  * just a file name (see comments in path.c), so handle that as being the
3429  * current directory.
3430  */
3431  if (strlen(parentpath) == 0)
3432  strlcpy(parentpath, ".", MAXPGPATH);
3433 
3434  if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3435  return -1;
3436 
3437  return 0;
3438 }
3439 
3440 /*
3441  * Create a PostgreSQL data sub-directory
3442  *
3443  * The data directory itself, and most of its sub-directories, are created at
3444  * initdb time, but we do have some occasions when we create directories in
3445  * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3446  * make sure that those directories are created consistently. Today, that means
3447  * making sure that the created directory has the correct permissions, which is
3448  * what pg_dir_create_mode tracks for us.
3449  *
3450  * Note that we also set the umask() based on what we understand the correct
3451  * permissions to be (see file_perm.c).
3452  *
3453  * For permissions other than the default, mkdir() can be used directly, but
3454  * be sure to consider carefully such cases -- a sub-directory with incorrect
3455  * permissions in a PostgreSQL data directory could cause backups and other
3456  * processes to fail.
3457  */
3458 int
3459 MakePGDirectory(const char *directoryName)
3460 {
3461  return mkdir(directoryName, pg_dir_create_mode);
3462 }
3463 
3464 /*
3465  * Return the passed-in error level, or PANIC if data_sync_retry is off.
3466  *
3467  * Failure to fsync any data file is cause for immediate panic, unless
3468  * data_sync_retry is enabled. Data may have been written to the operating
3469  * system and removed from our buffer pool already, and if we are running on
3470  * an operating system that forgets dirty data on write-back failure, there
3471  * may be only one copy of the data remaining: in the WAL. A later attempt to
3472  * fsync again might falsely report success. Therefore we must not allow any
3473  * further checkpoints to be attempted. data_sync_retry can in theory be
3474  * enabled on systems known not to drop dirty buffered data on write-back
3475  * failure (with the likely outcome that checkpoints will continue to fail
3476  * until the underlying problem is fixed).
3477  *
3478  * Any code that reports a failure from fsync() or related functions should
3479  * filter the error level with this function.
3480  */
3481 int
3482 data_sync_elevel(int elevel)
3483 {
3484  return data_sync_retry ? elevel : PANIC;
3485 }
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1334
File lruLessRecently
Definition: fd.c:184
void closeAllVfds(void)
Definition: fd.c:2643
static PgChecksumMode mode
Definition: pg_checksums.c:61
File nextFree
Definition: fd.c:182
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition: fd.c:817
int pg_file_create_mode
Definition: file_perm.c:19
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1665
#define MAP_FAILED
Definition: mem.h:45
#define DEBUG1
Definition: elog.h:25
int MyProcPid
Definition: globals.c:40
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1597
#define NUM_RESERVED_FDS
Definition: fd.c:119
static AllocateDesc * allocatedDescs
Definition: fd.c:245
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1321
int pg_fdatasync(int fd)
Definition: fd.c:383
static void error(void)
Definition: sql-dyntest.c:147
union AllocateDesc::@26 desc
#define SYNC_METHOD_FSYNC_WRITETHROUGH
Definition: xlog.h:28
AllocateDescKind
Definition: fd.c:223
DIR * dir
Definition: fd.c:238
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition: fd.c:1540
static void AtProcExit_Files(int code, Datum arg)
Definition: fd.c:2795
static Size SizeVfdCache
Definition: fd.c:198
#define FD_TEMP_FILE_LIMIT
Definition: fd.c:175
void on_proc_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:305
#define DO_DB(A)
Definition: fd.c:161
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2709
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition: fd.c:3208
long random(void)
Definition: random.c:22
ResourceOwner CurrentResourceOwner
Definition: resowner.c:142
int pg_fsync_writethrough(int fd)
Definition: fd.c:360
int forkname_chars(const char *str, ForkNumber *fork)
Definition: relpath.c:78
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2547
int max_safe_fds
Definition: fd.c:146
#define Min(x, y)
Definition: c.h:904
off_t FileSize(File file)
Definition: fd.c:2033
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:580
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:2264
#define FD_DELETE_AT_CLOSE
Definition: fd.c:173
int log_temp_files
Definition: guc.c:513
mode_t FileGetRawMode(File file)
Definition: fd.c:2121
void _dosmaperr(unsigned long)
Definition: win32error.c:171
static Vfd * VfdCache
Definition: fd.c:197
static void Delete(File file)
Definition: fd.c:1014
int closedir(DIR *)
Definition: dirent.c:113
static int numTempTableSpaces
Definition: fd.c:258
#define PG_TEMP_FILES_DIR
Definition: pg_checksums.c:58
int errcode(int sqlerrcode)
Definition: elog.c:570
#define MemSet(start, val, len)
Definition: c.h:955
void PathNameDeleteTemporaryDir(const char *dirname)
Definition: fd.c:1429
int pg_fsync_no_writethrough(int fd)
Definition: fd.c:348
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition: fd.c:3036
File PathNameOpenTemporaryFile(const char *path)
Definition: fd.c:1635
void pgstat_report_tempfile(size_t filesize)
Definition: pgstat.c:1566
static bool reserveAllocatedDesc(void)
Definition: fd.c:2132
uint32 SubTransactionId
Definition: c.h:511
#define SIGPIPE
Definition: win32_port.h:159
void TempTablespacePath(char *path, Oid tablespace)
Definition: fd.c:1515
#define LOG
Definition: elog.h:26
unsigned int Oid
Definition: postgres_ext.h:31
AllocateDescKind kind
Definition: fd.c:233
char * FilePathName(File file)
Definition: fd.c:2085
Definition: dirent.h:9
#define OidIsValid(objectId)
Definition: c.h:638
#define PANIC
Definition: elog.h:53
#define PG_BINARY
Definition: c.h:1191
static char * basedir
Definition: pg_basebackup.c:82
ssize_t pg_pwrite(int fd, const void *buf, size_t nbyte, off_t offset)
Definition: pwrite.c:27
void AtEOXact_Files(bool isCommit)
Definition: fd.c:2781
Oid MyDatabaseTableSpace
Definition: globals.c:87
int ClosePipeStream(FILE *file)
Definition: fd.c:2614
ssize_t pg_pread(int fd, void *buf, size_t nbyte, off_t offset)
Definition: pread.c:27
#define malloc(a)
Definition: header.h:50
static void LruDelete(File file)
Definition: fd.c:1033
void pg_usleep(long microsec)
Definition: signal.c:53
bool TempTablespacesAreSet(void)
Definition: fd.c:2697
#define fsync(fd)
Definition: win32_port.h:63
static int FreeDesc(AllocateDesc *desc)
Definition: fd.c:2365
void pfree(void *pointer)
Definition: mcxt.c:1056
mode_t fileMode
Definition: fd.c:189
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition: fd.c:3008
static bool ReleaseLruFile(void)
Definition: fd.c:1128
Definition: dirent.c:25
#define ERROR
Definition: elog.h:43
#define PG_TEMP_FILE_PREFIX
Definition: pg_checksums.c:59
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2255
static int LruInsert(File file)
Definition: fd.c:1081
#define FATAL
Definition: elog.h:52
static bool have_xact_temporary_files
Definition: fd.c:209
#define MAXPGPATH
DIR * opendir(const char *)
Definition: dirent.c:33
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2012
#define DEBUG2
Definition: elog.h:24
#define TABLESPACE_VERSION_DIRECTORY
Definition: relpath.h:26
char * fileName
Definition: fd.c:186
static char * buf
Definition: pg_test_fsync.c:68
Oid GetNextTempTableSpace(void)
Definition: fd.c:2727
void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition: resowner.c:1244
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3318
int errdetail(const char *fmt,...)
Definition: elog.c:860
void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition: fd.c:2942
char * tablespace
Definition: pgbench.c:186
int errcode_for_file_access(void)
Definition: elog.c:593
void get_parent_directory(char *path)
Definition: path.c:854
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2205
static int nfile
Definition: fd.c:203
unsigned int uint32
Definition: c.h:358
void SyncDataDirectory(void)
Definition: fd.c:3134
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2466
static int nextTempTableSpace
Definition: fd.c:259
static void pgstat_report_wait_end(void)
Definition: pgstat.h:1342
int max_files_per_process
Definition: fd.c:133
static File AllocateVfd(void)
Definition: fd.c:1160
FILE * OpenPipeStream(const char *command, const char *mode)
Definition: fd.c:2308
unsigned short fdstate
Definition: fd.c:180
Definition: fd.c:177
off_t fileSize
Definition: fd.c:185
int fd
Definition: fd.c:179
#define ereport(elevel, rest)
Definition: elog.h:141
int link(const char *fromname, const char *toname)
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2669
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:606
static void Insert(File file)
Definition: fd.c:1059
ResourceOwner resowner
Definition: fd.c:181
bool data_sync_retry
Definition: fd.c:149
#define S_ISREG(m)
Definition: win32_port.h:299
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3308
int CloseTransientFile(int fd)
Definition: fd.c:2432
#define SIG_IGN
Definition: win32_port.h:151
static void ReportTemporaryFileUsage(const char *path, off_t size)
Definition: fd.c:1274
static void ReleaseLruFiles(void)
Definition: fd.c:1150
#define WARNING
Definition: elog.h:40
#define stat(a, b)
Definition: win32_port.h:255
#define FileIsNotOpen(file)
Definition: fd.c:170
int pg_dir_create_mode
Definition: file_perm.c:18
static int elevel
Definition: vacuumlazy.c:143
int FileWrite(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:1914
struct vfd Vfd
int data_sync_elevel(int elevel)
Definition: fd.c:3482
uintptr_t Datum
Definition: postgres.h:367
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition: fd.c:2748
unsigned int Index
Definition: c.h:475
void pg_flush_data(int fd, off_t offset, off_t nbytes)
Definition: fd.c:403
#define FileIsValid(file)
Definition: fd.c:167
FILE * file
Definition: fd.c:237
#define InvalidOid
Definition: postgres_ext.h:36
#define VFD_CLOSED
Definition: fd.c:165
static uint64 temporary_files_size
Definition: fd.c:217
int MakePGDirectory(const char *directoryName)
Definition: fd.c:3459
pqsigfunc pqsignal(int signum, pqsigfunc handler)
Definition: signal.c:170
#define free(a)
Definition: header.h:65
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static void RegisterTemporaryFile(File file)
Definition: fd.c:1293
void FileClose(File file)
Definition: fd.c:1711
#define SIG_DFL
Definition: win32_port.h:149
int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
Definition: fd.c:1807
static int FileAccess(File file)
Definition: fd.c:1238
#define Assert(condition)
Definition: c.h:732
SubTransactionId GetCurrentSubTransactionId(void)
Definition: xact.c:708
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2532
File lruMoreRecently
Definition: fd.c:183
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:1835
void RemovePgTempFiles(void)
Definition: fd.c:2883
SubTransactionId create_subid
Definition: fd.c:234
File OpenTemporaryFile(bool interXact)
Definition: fd.c:1462
int durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:732
size_t Size
Definition: c.h:466
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: pgstat.h:1318
static const char * directory
Definition: zic.c:622
int sync_method
Definition: xlog.c:102
struct dirent * readdir(DIR *)
Definition: dirent.c:77
#define FD_MINFREE
Definition: fd.c:125
bool looks_like_temp_rel_name(const char *name)
Definition: fd.c:3064
#define realloc(a, b)
Definition: header.h:60
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1069
#define INT64_FORMAT
Definition: c.h:400
const char * name
Definition: encode.c:521
static long tempFileCounter
Definition: fd.c:251
int fd
Definition: fd.c:239
#define S_ISDIR(m)
Definition: win32_port.h:296
#define lstat(path, sb)
Definition: win32_port.h:244
int durable_unlink(const char *fname, int elevel)
Definition: fd.c:696
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:944
int FreeFile(FILE *file)
Definition: fd.c:2404
void set_max_safe_fds(void)
Definition: fd.c:901
bool enableFsync
Definition: globals.c:119
static Oid * tempTableSpaces
Definition: fd.c:257
void * palloc(Size size)
Definition: mcxt.c:949
int errmsg(const char *fmt,...)
Definition: elog.c:784
int FileGetRawFlags(File file)
Definition: fd.c:2111
void ResourceOwnerEnlargeFiles(ResourceOwner owner)
Definition: resowner.c:1233
static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition: fd.c:3343
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:966
#define elog(elevel,...)
Definition: elog.h:226
int i
#define FD_CLOSE_AT_EOXACT
Definition: fd.c:174
void * arg
int FileGetRawDesc(File file)
Definition: fd.c:2101
static void FreeVfd(File file)
Definition: fd.c:1218
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:99
int pg_fsync(int fd)
Definition: fd.c:331
char d_name[MAX_PATH]
Definition: dirent.h:14
#define mkdir(a, b)
Definition: win32_port.h:58
#define close(a)
Definition: win32.h:12
#define EINTR
Definition: win32_port.h:323
int fileFlags
Definition: fd.c:188
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition: fd.c:1398
int FileRead(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:1858
void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition: resowner.c:1253
#define snprintf
Definition: port.h:192
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2050
static int maxAllocatedDescs
Definition: fd.c:244
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition: fd.c:2813
static int fsync_parent_path(const char *fname, int elevel)
Definition: fd.c:3419
int File
Definition: fd.h:45
int FreeDir(DIR *dir)
Definition: fd.c:2584
int temp_file_limit
Definition: guc.c:517
Datum subpath(PG_FUNCTION_ARGS)
Definition: ltree_op.c:241
void InitFileAccess(void)
Definition: fd.c:784
static int numAllocatedDescs
Definition: fd.c:243
bool pgwin32_is_junction(const char *path)
#define ftruncate(a, b)
Definition: win32_port.h:60