PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
fd.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  * Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have. (This is around 256 on many modern
20  * operating systems, but can be as low as 32 on others.)
21  *
22  * VFDs are managed as an LRU pool, with actual OS file descriptors
23  * being opened and closed as needed. Obviously, if a file is
24  * opened using these interfaces, all subsequent operations must also
25  * be through these interfaces (the File type is not a real file
26  * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
39  * for a long time, like relation files. It is the caller's responsibility
40  * to close them; there is no automatic mechanism in fd.c for that.
41  *
42  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
43  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
44  * They behave like the corresponding native functions, except that the handle
45  * is registered with the current subtransaction, and will be automatically
46  * closed at abort. These are intended mainly for short operations like
47  * reading a configuration file; there is a limit on the number of files that
48  * can be opened using these functions at any one time.
49  *
50  * Finally, BasicOpenFile is just a thin wrapper around open() that can
51  * release file descriptors in use by the virtual file descriptors if
52  * necessary. There is no automatic cleanup of file descriptors returned by
53  * BasicOpenFile; it is solely the caller's responsibility to close the file
54  * descriptor by calling close(2).
55  *
56  *-------------------------------------------------------------------------
57  */
58 
59 #include "postgres.h"
60 
61 #include <sys/file.h>
62 #include <sys/param.h>
63 #include <sys/stat.h>
64 #ifndef WIN32
65 #include <sys/mman.h>
66 #endif
67 #include <limits.h>
68 #include <unistd.h>
69 #include <fcntl.h>
70 #ifdef HAVE_SYS_RESOURCE_H
71 #include <sys/resource.h> /* for getrlimit */
72 #endif
73 
74 #include "miscadmin.h"
75 #include "access/xact.h"
76 #include "access/xlog.h"
77 #include "catalog/catalog.h"
78 #include "catalog/pg_tablespace.h"
79 #include "pgstat.h"
80 #include "portability/mem.h"
81 #include "storage/fd.h"
82 #include "storage/ipc.h"
83 #include "utils/guc.h"
84 #include "utils/resowner_private.h"
85 
86 
87 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
88 #if defined(HAVE_SYNC_FILE_RANGE)
89 #define PG_FLUSH_DATA_WORKS 1
90 #elif !defined(WIN32) && defined(MS_ASYNC)
91 #define PG_FLUSH_DATA_WORKS 1
92 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
93 #define PG_FLUSH_DATA_WORKS 1
94 #endif
95 
96 /*
97  * We must leave some file descriptors free for system(), the dynamic loader,
98  * and other code that tries to open files without consulting fd.c. This
99  * is the number left free. (While we can be pretty sure we won't get
100  * EMFILE, there's never any guarantee that we won't get ENFILE due to
101  * other processes chewing up FDs. So it's a bad idea to try to open files
102  * without consulting fd.c. Nonetheless we cannot control all code.)
103  *
104  * Because this is just a fixed setting, we are effectively assuming that
105  * no such code will leave FDs open over the long term; otherwise the slop
106  * is likely to be insufficient. Note in particular that we expect that
107  * loading a shared library does not result in any permanent increase in
108  * the number of open files. (This appears to be true on most if not
109  * all platforms as of Feb 2004.)
110  */
111 #define NUM_RESERVED_FDS 10
112 
113 /*
114  * If we have fewer than this many usable FDs after allowing for the reserved
115  * ones, choke.
116  */
117 #define FD_MINFREE 10
118 
119 
120 /*
121  * A number of platforms allow individual processes to open many more files
122  * than they can really support when *many* processes do the same thing.
123  * This GUC parameter lets the DBA limit max_safe_fds to something less than
124  * what the postmaster's initial probe suggests will work.
125  */
127 
128 /*
129  * Maximum number of file descriptors to open for either VFD entries or
130  * AllocateFile/AllocateDir/OpenTransientFile operations. This is initialized
131  * to a conservative value, and remains that way indefinitely in bootstrap or
132  * standalone-backend cases. In normal postmaster operation, the postmaster
133  * calls set_max_safe_fds() late in initialization to update the value, and
134  * that value is then inherited by forked subprocesses.
135  *
136  * Note: the value of max_files_per_process is taken into account while
137  * setting this variable, and so need not be tested separately.
138  */
139 int max_safe_fds = 32; /* default if not changed */
140 
141 
142 /* Debugging.... */
143 
144 #ifdef FDDEBUG
145 #define DO_DB(A) \
146  do { \
147  int _do_db_save_errno = errno; \
148  A; \
149  errno = _do_db_save_errno; \
150  } while (0)
151 #else
152 #define DO_DB(A) \
153  ((void) 0)
154 #endif
155 
156 #define VFD_CLOSED (-1)
157 
158 #define FileIsValid(file) \
159  ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
160 
161 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
162 
163 /*
164  * Note: a VFD's seekPos is normally always valid, but if for some reason
165  * an lseek() fails, it might become set to FileUnknownPos. We can struggle
166  * along without knowing the seek position in many cases, but in some places
167  * we have to fail if we don't have it.
168  */
169 #define FileUnknownPos ((off_t) -1)
170 #define FilePosIsUnknown(pos) ((pos) < 0)
171 
172 /* these are the assigned bits in fdstate below: */
173 #define FD_TEMPORARY (1 << 0) /* T = delete when closed */
174 #define FD_XACT_TEMPORARY (1 << 1) /* T = delete at eoXact */
175 
176 typedef struct vfd
177 {
178  int fd; /* current FD, or VFD_CLOSED if none */
179  unsigned short fdstate; /* bitflags for VFD's state */
180  ResourceOwner resowner; /* owner, for automatic cleanup */
181  File nextFree; /* link to next free VFD, if in freelist */
182  File lruMoreRecently; /* doubly linked recency-of-use list */
184  off_t seekPos; /* current logical file position, or -1 */
185  off_t fileSize; /* current size of file (0 if not temporary) */
186  char *fileName; /* name of file, or NULL for unused VFD */
187  /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
188  int fileFlags; /* open(2) flags for (re)opening the file */
189  int fileMode; /* mode to pass to open(2) */
190 } Vfd;
191 
192 /*
193  * Virtual File Descriptor array pointer and size. This grows as
194  * needed. 'File' values are indexes into this array.
195  * Note that VfdCache[0] is not a usable VFD, just a list header.
196  */
197 static Vfd *VfdCache;
198 static Size SizeVfdCache = 0;
199 
200 /*
201  * Number of file descriptors known to be in use by VFD entries.
202  */
203 static int nfile = 0;
204 
205 /*
206  * Flag to tell whether it's worth scanning VfdCache looking for temp files
207  * to close
208  */
209 static bool have_xact_temporary_files = false;
210 
211 /*
212  * Tracks the total size of all temporary files. Note: when temp_file_limit
213  * is being enforced, this cannot overflow since the limit cannot be more
214  * than INT_MAX kilobytes. When not enforcing, it could theoretically
215  * overflow, but we don't care.
216  */
217 static uint64 temporary_files_size = 0;
218 
219 /*
220  * List of OS handles opened with AllocateFile, AllocateDir and
221  * OpenTransientFile.
222  */
223 typedef enum
224 {
230 
231 typedef struct
232 {
235  union
236  {
237  FILE *file;
239  int fd;
240  } desc;
241 } AllocateDesc;
242 
243 static int numAllocatedDescs = 0;
244 static int maxAllocatedDescs = 0;
246 
247 /*
248  * Number of temporary files opened during the current session;
249  * this is used in generation of tempfile names.
250  */
251 static long tempFileCounter = 0;
252 
253 /*
254  * Array of OIDs of temp tablespaces. When numTempTableSpaces is -1,
255  * this has not been set in the current transaction.
256  */
258 static int numTempTableSpaces = -1;
259 static int nextTempTableSpace = 0;
260 
261 
262 /*--------------------
263  *
264  * Private Routines
265  *
266  * Delete - delete a file from the Lru ring
267  * LruDelete - remove a file from the Lru ring and close its FD
268  * Insert - put a file at the front of the Lru ring
269  * LruInsert - put a file at the front of the Lru ring and open it
270  * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
271  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
272  * AllocateVfd - grab a free (or new) file record (from VfdArray)
273  * FreeVfd - free a file record
274  *
275  * The Least Recently Used ring is a doubly linked list that begins and
276  * ends on element zero. Element zero is special -- it doesn't represent
277  * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
278  * anchor that shows us the beginning/end of the ring.
279  * Only VFD elements that are currently really open (have an FD assigned) are
280  * in the Lru ring. Elements that are "virtually" open can be recognized
281  * by having a non-null fileName field.
282  *
283  * example:
284  *
285  * /--less----\ /---------\
286  * v \ v \
287  * #0 --more---> LeastRecentlyUsed --more-\ \
288  * ^\ | |
289  * \\less--> MostRecentlyUsedFile <---/ |
290  * \more---/ \--less--/
291  *
292  *--------------------
293  */
294 static void Delete(File file);
295 static void LruDelete(File file);
296 static void Insert(File file);
297 static int LruInsert(File file);
298 static bool ReleaseLruFile(void);
299 static void ReleaseLruFiles(void);
300 static File AllocateVfd(void);
301 static void FreeVfd(File file);
302 
303 static int FileAccess(File file);
304 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
305 static bool reserveAllocatedDesc(void);
306 static int FreeDesc(AllocateDesc *desc);
307 static struct dirent *ReadDirExtended(DIR *dir, const char *dirname, int elevel);
308 
309 static void AtProcExit_Files(int code, Datum arg);
310 static void CleanupTempFiles(bool isProcExit);
311 static void RemovePgTempFilesInDir(const char *tmpdirname);
312 static void RemovePgTempRelationFiles(const char *tsdirname);
313 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
314 static bool looks_like_temp_rel_name(const char *name);
315 
316 static void walkdir(const char *path,
317  void (*action) (const char *fname, bool isdir, int elevel),
318  bool process_symlinks,
319  int elevel);
320 #ifdef PG_FLUSH_DATA_WORKS
321 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
322 #endif
323 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
324 
325 static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
326 static int fsync_parent_path(const char *fname, int elevel);
327 
328 
329 /*
330  * pg_fsync --- do fsync with or without writethrough
331  */
332 int
334 {
335  /* #if is to skip the sync_method test if there's no need for it */
336 #if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
338  return pg_fsync_writethrough(fd);
339  else
340 #endif
341  return pg_fsync_no_writethrough(fd);
342 }
343 
344 
345 /*
346  * pg_fsync_no_writethrough --- same as fsync except does nothing if
347  * enableFsync is off
348  */
349 int
351 {
352  if (enableFsync)
353  return fsync(fd);
354  else
355  return 0;
356 }
357 
358 /*
359  * pg_fsync_writethrough
360  */
361 int
363 {
364  if (enableFsync)
365  {
366 #ifdef WIN32
367  return _commit(fd);
368 #elif defined(F_FULLFSYNC)
369  return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
370 #else
371  errno = ENOSYS;
372  return -1;
373 #endif
374  }
375  else
376  return 0;
377 }
378 
379 /*
380  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
381  *
382  * Not all platforms have fdatasync; treat as fsync if not available.
383  */
384 int
386 {
387  if (enableFsync)
388  {
389 #ifdef HAVE_FDATASYNC
390  return fdatasync(fd);
391 #else
392  return fsync(fd);
393 #endif
394  }
395  else
396  return 0;
397 }
398 
399 /*
400  * pg_flush_data --- advise OS that the described dirty data should be flushed
401  *
402  * offset of 0 with nbytes 0 means that the entire file should be flushed;
403  * in this case, this function may have side-effects on the file's
404  * seek position!
405  */
406 void
407 pg_flush_data(int fd, off_t offset, off_t nbytes)
408 {
409  /*
410  * Right now file flushing is primarily used to make later
411  * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
412  * if fsyncs are disabled - that's a decision we might want to make
413  * configurable at some point.
414  */
415  if (!enableFsync)
416  return;
417 
418  /*
419  * We compile all alternatives that are supported on the current platform,
420  * to find portability problems more easily.
421  */
422 #if defined(HAVE_SYNC_FILE_RANGE)
423  {
424  int rc;
425 
426  /*
427  * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
428  * tells the OS that writeback for the specified blocks should be
429  * started, but that we don't want to wait for completion. Note that
430  * this call might block if too much dirty data exists in the range.
431  * This is the preferable method on OSs supporting it, as it works
432  * reliably when available (contrast to msync()) and doesn't flush out
433  * clean data (like FADV_DONTNEED).
434  */
435  rc = sync_file_range(fd, offset, nbytes,
436  SYNC_FILE_RANGE_WRITE);
437 
438  /* don't error out, this is just a performance optimization */
439  if (rc != 0)
440  {
443  errmsg("could not flush dirty data: %m")));
444  }
445 
446  return;
447  }
448 #endif
449 #if !defined(WIN32) && defined(MS_ASYNC)
450  {
451  void *p;
452  static int pagesize = 0;
453 
454  /*
455  * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
456  * writeback. On linux it only does so if MS_SYNC is specified, but
457  * then it does the writeback synchronously. Luckily all common linux
458  * systems have sync_file_range(). This is preferable over
459  * FADV_DONTNEED because it doesn't flush out clean data.
460  *
461  * We map the file (mmap()), tell the kernel to sync back the contents
462  * (msync()), and then remove the mapping again (munmap()).
463  */
464 
465  /* mmap() needs actual length if we want to map whole file */
466  if (offset == 0 && nbytes == 0)
467  {
468  nbytes = lseek(fd, 0, SEEK_END);
469  if (nbytes < 0)
470  {
473  errmsg("could not determine dirty data size: %m")));
474  return;
475  }
476  }
477 
478  /*
479  * Some platforms reject partial-page mmap() attempts. To deal with
480  * that, just truncate the request to a page boundary. If any extra
481  * bytes don't get flushed, well, it's only a hint anyway.
482  */
483 
484  /* fetch pagesize only once */
485  if (pagesize == 0)
486  pagesize = sysconf(_SC_PAGESIZE);
487 
488  /* align length to pagesize, dropping any fractional page */
489  if (pagesize > 0)
490  nbytes = (nbytes / pagesize) * pagesize;
491 
492  /* fractional-page request is a no-op */
493  if (nbytes <= 0)
494  return;
495 
496  /*
497  * mmap could well fail, particularly on 32-bit platforms where there
498  * may simply not be enough address space. If so, silently fall
499  * through to the next implementation.
500  */
501  if (nbytes <= (off_t) SSIZE_MAX)
502  p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
503  else
504  p = MAP_FAILED;
505 
506  if (p != MAP_FAILED)
507  {
508  int rc;
509 
510  rc = msync(p, (size_t) nbytes, MS_ASYNC);
511  if (rc != 0)
512  {
515  errmsg("could not flush dirty data: %m")));
516  /* NB: need to fall through to munmap()! */
517  }
518 
519  rc = munmap(p, (size_t) nbytes);
520  if (rc != 0)
521  {
522  /* FATAL error because mapping would remain */
523  ereport(FATAL,
525  errmsg("could not munmap() while flushing data: %m")));
526  }
527 
528  return;
529  }
530  }
531 #endif
532 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
533  {
534  int rc;
535 
536  /*
537  * Signal the kernel that the passed in range should not be cached
538  * anymore. This has the desired side effect of writing out dirty
539  * data, and the undesired side effect of likely discarding useful
540  * clean cached blocks. For the latter reason this is the least
541  * preferable method.
542  */
543 
544  rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
545 
546  if (rc != 0)
547  {
548  /* don't error out, this is just a performance optimization */
551  errmsg("could not flush dirty data: %m")));
552  }
553 
554  return;
555  }
556 #endif
557 }
558 
559 
560 /*
561  * fsync_fname -- fsync a file or directory, handling errors properly
562  *
563  * Try to fsync a file or directory. When doing the latter, ignore errors that
564  * indicate the OS just doesn't allow/require fsyncing directories.
565  */
566 void
567 fsync_fname(const char *fname, bool isdir)
568 {
569  fsync_fname_ext(fname, isdir, false, ERROR);
570 }
571 
572 /*
573  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
574  *
575  * This routine ensures that, after returning, the effect of renaming file
576  * persists in case of a crash. A crash while this routine is running will
577  * leave you with either the pre-existing or the moved file in place of the
578  * new file; no mixed state or truncated files are possible.
579  *
580  * It does so by using fsync on the old filename and the possibly existing
581  * target filename before the rename, and the target file and directory after.
582  *
583  * Note that rename() cannot be used across arbitrary directories, as they
584  * might not be on the same filesystem. Therefore this routine does not
585  * support renaming across directories.
586  *
587  * Log errors with the caller specified severity.
588  *
589  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
590  * valid upon return.
591  */
592 int
593 durable_rename(const char *oldfile, const char *newfile, int elevel)
594 {
595  int fd;
596 
597  /*
598  * First fsync the old and target path (if it exists), to ensure that they
599  * are properly persistent on disk. Syncing the target file is not
600  * strictly necessary, but it makes it easier to reason about crashes;
601  * because it's then guaranteed that either source or target file exists
602  * after a crash.
603  */
604  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
605  return -1;
606 
607  fd = OpenTransientFile((char *) newfile, PG_BINARY | O_RDWR, 0);
608  if (fd < 0)
609  {
610  if (errno != ENOENT)
611  {
612  ereport(elevel,
614  errmsg("could not open file \"%s\": %m", newfile)));
615  return -1;
616  }
617  }
618  else
619  {
620  if (pg_fsync(fd) != 0)
621  {
622  int save_errno;
623 
624  /* close file upon error, might not be in transaction context */
625  save_errno = errno;
626  CloseTransientFile(fd);
627  errno = save_errno;
628 
629  ereport(elevel,
631  errmsg("could not fsync file \"%s\": %m", newfile)));
632  return -1;
633  }
634  CloseTransientFile(fd);
635  }
636 
637  /* Time to do the real deal... */
638  if (rename(oldfile, newfile) < 0)
639  {
640  ereport(elevel,
642  errmsg("could not rename file \"%s\" to \"%s\": %m",
643  oldfile, newfile)));
644  return -1;
645  }
646 
647  /*
648  * To guarantee renaming the file is persistent, fsync the file with its
649  * new name, and its containing directory.
650  */
651  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
652  return -1;
653 
654  if (fsync_parent_path(newfile, elevel) != 0)
655  return -1;
656 
657  return 0;
658 }
659 
660 /*
661  * durable_unlink -- remove a file in a durable manner
662  *
663  * This routine ensures that, after returning, the effect of removing file
664  * persists in case of a crash. A crash while this routine is running will
665  * leave the system in no mixed state.
666  *
667  * It does so by using fsync on the parent directory of the file after the
668  * actual removal is done.
669  *
670  * Log errors with the severity specified by caller.
671  *
672  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
673  * valid upon return.
674  */
675 int
676 durable_unlink(const char *fname, int elevel)
677 {
678  if (unlink(fname) < 0)
679  {
680  ereport(elevel,
682  errmsg("could not remove file \"%s\": %m",
683  fname)));
684  return -1;
685  }
686 
687  /*
688  * To guarantee that the removal of the file is persistent, fsync its
689  * parent directory.
690  */
691  if (fsync_parent_path(fname, elevel) != 0)
692  return -1;
693 
694  return 0;
695 }
696 
697 /*
698  * durable_link_or_rename -- rename a file in a durable manner.
699  *
700  * Similar to durable_rename(), except that this routine tries (but does not
701  * guarantee) not to overwrite the target file.
702  *
703  * Note that a crash in an unfortunate moment can leave you with two links to
704  * the target file.
705  *
706  * Log errors with the caller specified severity.
707  *
708  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
709  * valid upon return.
710  */
711 int
712 durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
713 {
714  /*
715  * Ensure that, if we crash directly after the rename/link, a file with
716  * valid contents is moved into place.
717  */
718  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
719  return -1;
720 
721 #if HAVE_WORKING_LINK
722  if (link(oldfile, newfile) < 0)
723  {
724  ereport(elevel,
726  errmsg("could not link file \"%s\" to \"%s\": %m",
727  oldfile, newfile)));
728  return -1;
729  }
730  unlink(oldfile);
731 #else
732  /* XXX: Add racy file existence check? */
733  if (rename(oldfile, newfile) < 0)
734  {
735  ereport(elevel,
737  errmsg("could not rename file \"%s\" to \"%s\": %m",
738  oldfile, newfile)));
739  return -1;
740  }
741 #endif
742 
743  /*
744  * Make change persistent in case of an OS crash, both the new entry and
745  * its parent directory need to be flushed.
746  */
747  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
748  return -1;
749 
750  /* Same for parent directory */
751  if (fsync_parent_path(newfile, elevel) != 0)
752  return -1;
753 
754  return 0;
755 }
756 
757 /*
758  * InitFileAccess --- initialize this module during backend startup
759  *
760  * This is called during either normal or standalone backend start.
761  * It is *not* called in the postmaster.
762  */
763 void
765 {
766  Assert(SizeVfdCache == 0); /* call me only once */
767 
768  /* initialize cache header entry */
769  VfdCache = (Vfd *) malloc(sizeof(Vfd));
770  if (VfdCache == NULL)
771  ereport(FATAL,
772  (errcode(ERRCODE_OUT_OF_MEMORY),
773  errmsg("out of memory")));
774 
775  MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
776  VfdCache->fd = VFD_CLOSED;
777 
778  SizeVfdCache = 1;
779 
780  /* register proc-exit hook to ensure temp files are dropped at exit */
782 }
783 
784 /*
785  * count_usable_fds --- count how many FDs the system will let us open,
786  * and estimate how many are already open.
787  *
788  * We stop counting if usable_fds reaches max_to_probe. Note: a small
789  * value of max_to_probe might result in an underestimate of already_open;
790  * we must fill in any "gaps" in the set of used FDs before the calculation
791  * of already_open will give the right answer. In practice, max_to_probe
792  * of a couple of dozen should be enough to ensure good results.
793  *
794  * We assume stdin (FD 0) is available for dup'ing
795  */
796 static void
797 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
798 {
799  int *fd;
800  int size;
801  int used = 0;
802  int highestfd = 0;
803  int j;
804 
805 #ifdef HAVE_GETRLIMIT
806  struct rlimit rlim;
807  int getrlimit_status;
808 #endif
809 
810  size = 1024;
811  fd = (int *) palloc(size * sizeof(int));
812 
813 #ifdef HAVE_GETRLIMIT
814 #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
815  getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
816 #else /* but BSD doesn't ... */
817  getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
818 #endif /* RLIMIT_NOFILE */
819  if (getrlimit_status != 0)
820  ereport(WARNING, (errmsg("getrlimit failed: %m")));
821 #endif /* HAVE_GETRLIMIT */
822 
823  /* dup until failure or probe limit reached */
824  for (;;)
825  {
826  int thisfd;
827 
828 #ifdef HAVE_GETRLIMIT
829 
830  /*
831  * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
832  * some platforms
833  */
834  if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
835  break;
836 #endif
837 
838  thisfd = dup(0);
839  if (thisfd < 0)
840  {
841  /* Expect EMFILE or ENFILE, else it's fishy */
842  if (errno != EMFILE && errno != ENFILE)
843  elog(WARNING, "dup(0) failed after %d successes: %m", used);
844  break;
845  }
846 
847  if (used >= size)
848  {
849  size *= 2;
850  fd = (int *) repalloc(fd, size * sizeof(int));
851  }
852  fd[used++] = thisfd;
853 
854  if (highestfd < thisfd)
855  highestfd = thisfd;
856 
857  if (used >= max_to_probe)
858  break;
859  }
860 
861  /* release the files we opened */
862  for (j = 0; j < used; j++)
863  close(fd[j]);
864 
865  pfree(fd);
866 
867  /*
868  * Return results. usable_fds is just the number of successful dups. We
869  * assume that the system limit is highestfd+1 (remember 0 is a legal FD
870  * number) and so already_open is highestfd+1 - usable_fds.
871  */
872  *usable_fds = used;
873  *already_open = highestfd + 1 - used;
874 }
875 
876 /*
877  * set_max_safe_fds
878  * Determine number of filedescriptors that fd.c is allowed to use
879  */
880 void
882 {
883  int usable_fds;
884  int already_open;
885 
886  /*----------
887  * We want to set max_safe_fds to
888  * MIN(usable_fds, max_files_per_process - already_open)
889  * less the slop factor for files that are opened without consulting
890  * fd.c. This ensures that we won't exceed either max_files_per_process
891  * or the experimentally-determined EMFILE limit.
892  *----------
893  */
895  &usable_fds, &already_open);
896 
897  max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
898 
899  /*
900  * Take off the FDs reserved for system() etc.
901  */
903 
904  /*
905  * Make sure we still have enough to get by.
906  */
907  if (max_safe_fds < FD_MINFREE)
908  ereport(FATAL,
909  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
910  errmsg("insufficient file descriptors available to start server process"),
911  errdetail("System allows %d, we need at least %d.",
914 
915  elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
916  max_safe_fds, usable_fds, already_open);
917 }
918 
919 /*
920  * BasicOpenFile --- same as open(2) except can free other FDs if needed
921  *
922  * This is exported for use by places that really want a plain kernel FD,
923  * but need to be proof against running out of FDs. Once an FD has been
924  * successfully returned, it is the caller's responsibility to ensure that
925  * it will not be leaked on ereport()! Most users should *not* call this
926  * routine directly, but instead use the VFD abstraction level, which
927  * provides protection against descriptor leaks as well as management of
928  * files that need to be open for more than a short period of time.
929  *
930  * Ideally this should be the *only* direct call of open() in the backend.
931  * In practice, the postmaster calls open() directly, and there are some
932  * direct open() calls done early in backend startup. Those are OK since
933  * this module wouldn't have any open files to close at that point anyway.
934  */
935 int
936 BasicOpenFile(FileName fileName, int fileFlags, int fileMode)
937 {
938  int fd;
939 
940 tryAgain:
941  fd = open(fileName, fileFlags, fileMode);
942 
943  if (fd >= 0)
944  return fd; /* success! */
945 
946  if (errno == EMFILE || errno == ENFILE)
947  {
948  int save_errno = errno;
949 
950  ereport(LOG,
951  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
952  errmsg("out of file descriptors: %m; release and retry")));
953  errno = 0;
954  if (ReleaseLruFile())
955  goto tryAgain;
956  errno = save_errno;
957  }
958 
959  return -1; /* failure */
960 }
961 
962 #if defined(FDDEBUG)
963 
964 static void
965 _dump_lru(void)
966 {
967  int mru = VfdCache[0].lruLessRecently;
968  Vfd *vfdP = &VfdCache[mru];
969  char buf[2048];
970 
971  snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
972  while (mru != 0)
973  {
974  mru = vfdP->lruLessRecently;
975  vfdP = &VfdCache[mru];
976  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
977  }
978  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
979  elog(LOG, "%s", buf);
980 }
981 #endif /* FDDEBUG */
982 
983 static void
985 {
986  Vfd *vfdP;
987 
988  Assert(file != 0);
989 
990  DO_DB(elog(LOG, "Delete %d (%s)",
991  file, VfdCache[file].fileName));
992  DO_DB(_dump_lru());
993 
994  vfdP = &VfdCache[file];
995 
996  VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
997  VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
998 
999  DO_DB(_dump_lru());
1000 }
1001 
/*
 * Close the kernel descriptor behind an open VFD and take the VFD out of
 * the LRU ring.  The seek position is captured first if it is not already
 * known.  Failures are only logged; the VFD is marked closed regardless.
 *
 * NOTE(review): the listing line carrying this function's name and
 * parameter list (1003) is absent; the debug message and the callers suggest
 * "LruDelete(File file)" -- confirm against the real source.
 */
1002 static void
1004 {
1005  Vfd *vfdP;
1006 
1007  Assert(file != 0);
1008 
1009  DO_DB(elog(LOG, "LruDelete %d (%s)",
1010  file, VfdCache[file].fileName));
1011 
1012  vfdP = &VfdCache[file];
1013 
1014  /*
1015  * Normally we should know the seek position, but if for some reason we
1016  * have lost track of it, try again to get it. If we still can't get it,
1017  * we have a problem: we will be unable to restore the file seek position
1018  * when and if the file is re-opened. But we can't really throw an error
1019  * and refuse to close the file, or activities such as transaction cleanup
1020  * will be broken.
1021  */
1022  if (FilePosIsUnknown(vfdP->seekPos))
1023  {
1024  vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
1025  if (FilePosIsUnknown(vfdP->seekPos))
1026  elog(LOG, "could not seek file \"%s\" before closing: %m",
1027  vfdP->fileName);
1028  }
1029 
1030  /*
1031  * Close the file. We aren't expecting this to fail; if it does, better
1032  * to leak the FD than to mess up our internal state.
1033  */
1034  if (close(vfdP->fd))
1035  elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
 /* bookkeeping is updated even when close() reported an error */
1036  vfdP->fd = VFD_CLOSED;
1037  --nfile;
1038 
1039  /* delete the vfd record from the LRU ring */
1040  Delete(file);
1041 }
1042 
/*
 * Link a VFD into the LRU ring next to the anchor slot 0, on the side that
 * FileAccess() treats as "most recently accessed".
 *
 * NOTE(review): the listing line carrying this function's name and
 * parameter list (1044) is absent; the debug message and the callers suggest
 * "Insert(File file)" -- confirm against the real source.
 */
1043 static void
1045 {
1046  Vfd *vfdP;
1047 
1048  Assert(file != 0);
1049 
1050  DO_DB(elog(LOG, "Insert %d (%s)",
1051  file, VfdCache[file].fileName));
1052  DO_DB(_dump_lru());
1053 
1054  vfdP = &VfdCache[file];
1055 
 /* new entry sits between the anchor and the previous most-recent entry */
1056  vfdP->lruMoreRecently = 0;
1057  vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1058  VfdCache[0].lruLessRecently = file;
1059  VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1060 
1061  DO_DB(_dump_lru());
1062 }
1063 
/*
 * Make sure a VFD has a kernel descriptor and link it into the LRU ring.
 * A closed VFD is re-opened with its saved name, flags and mode, and its
 * saved seek position is restored.  On any failure the VFD stays closed
 * and is not linked into the ring.
 *
 * NOTE(review): the listing line carrying this function's name and
 * parameter list (1066) is absent; the debug message and the caller suggest
 * "LruInsert(File file)" -- confirm against the real source.
 */
1064 /* returns 0 on success, -1 on re-open failure (with errno set) */
1065 static int
1067 {
1068  Vfd *vfdP;
1069 
1070  Assert(file != 0);
1071 
1072  DO_DB(elog(LOG, "LruInsert %d (%s)",
1073  file, VfdCache[file].fileName));
1074 
1075  vfdP = &VfdCache[file];
1076 
1077  if (FileIsNotOpen(file))
1078  {
1079  /* Close excess kernel FDs. */
1080  ReleaseLruFiles();
1081 
1082  /*
1083  * The open could still fail for lack of file descriptors, eg due to
1084  * overall system file table being full. So, be prepared to release
1085  * another FD if necessary...
1086  */
1087  vfdP->fd = BasicOpenFile(vfdP->fileName, vfdP->fileFlags,
1088  vfdP->fileMode);
1089  if (vfdP->fd < 0)
1090  {
1091  DO_DB(elog(LOG, "re-open failed: %m"));
1092  return -1;
1093  }
1094  else
1095  {
1096  ++nfile;
1097  }
1098 
1099  /*
1100  * Seek to the right position. We need no special case for seekPos
1101  * equal to FileUnknownPos, as lseek() will certainly reject that
1102  * (thus completing the logic noted in LruDelete() that we will fail
1103  * to re-open a file if we couldn't get its seek position before
1104  * closing).
1105  */
1106  if (vfdP->seekPos != (off_t) 0)
1107  {
1108  if (lseek(vfdP->fd, vfdP->seekPos, SEEK_SET) < 0)
1109  {
1110  /*
1111  * If we fail to restore the seek position, treat it like an
1112  * open() failure.
1113  */
 /* errno is saved because the logging and close() below may change it */
1114  int save_errno = errno;
1115 
1116  elog(LOG, "could not seek file \"%s\" after re-opening: %m",
1117  vfdP->fileName);
1118  (void) close(vfdP->fd);
1119  vfdP->fd = VFD_CLOSED;
1120  --nfile;
1121  errno = save_errno;
1122  return -1;
1123  }
1124  }
1125  }
1126 
1127  /*
1128  * put it at the head of the Lru ring
1129  */
1130 
1131  Insert(file);
1132 
1133  return 0;
1134 }
1135 
1136 /*
1137  * Release one kernel FD by closing the least-recently-used VFD.
1138  */
/*
 * Returns true if a descriptor was closed, false if no VFD currently holds
 * one (nfile is zero).
 *
 * NOTE(review): the listing line carrying this function's name (1140) is
 * absent; the debug message and the callers suggest "ReleaseLruFile(void)"
 * -- confirm against the real source.
 */
1139 static bool
1141 {
1142  DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1143 
1144  if (nfile > 0)
1145  {
1146  /*
1147  * There are opened files and so there should be at least one used vfd
1148  * in the ring.
1149  */
1150  Assert(VfdCache[0].lruMoreRecently != 0);
 /* the anchor's lruMoreRecently link names the eviction victim */
1151  LruDelete(VfdCache[0].lruMoreRecently);
1152  return true; /* freed a file */
1153  }
1154  return false; /* no files available to free */
1155 }
1156 
1157 /*
1158  * Release kernel FDs as needed to get under the max_safe_fds limit.
1159  * After calling this, it's OK to try to open another file.
1160  */
/*
 * NOTE(review): the listing line carrying this function's name (1162) is
 * absent; the callers suggest "ReleaseLruFiles(void)" -- confirm against the
 * real source.
 */
1161 static void
1163 {
 /*
  * ">=" rather than ">": the loop ends with the count strictly below the
  * limit, or when nothing is left to close.
  */
1164  while (nfile + numAllocatedDescs >= max_safe_fds)
1165  {
1166  if (!ReleaseLruFile())
1167  break;
1168  }
1169 }
1170 
/*
 * Hand out an unused VFD slot and return its index, enlarging the VfdCache
 * array first when the free list is empty.  Raises ERROR if the array cannot
 * be enlarged.  VfdCache[0].nextFree is the free-list head; 0 means empty.
 *
 * NOTE(review): the listing line carrying this function's name (1172) is
 * absent; the debug message and the caller suggest "AllocateVfd(void)" --
 * confirm against the real source.
 */
1171 static File
1173 {
1174  Index i;
1175  File file;
1176 
1177  DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1178 
1179  Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1180 
1181  if (VfdCache[0].nextFree == 0)
1182  {
1183  /*
1184  * The free list is empty so it is time to increase the size of the
1185  * array. We choose to double it each time this happens. However,
1186  * there's not much point in starting *real* small.
1187  */
1188  Size newCacheSize = SizeVfdCache * 2;
1189  Vfd *newVfdCache;
1190 
1191  if (newCacheSize < 32)
1192  newCacheSize = 32;
1193 
1194  /*
1195  * Be careful not to clobber VfdCache ptr if realloc fails.
1196  */
1197  newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1198  if (newVfdCache == NULL)
1199  ereport(ERROR,
1200  (errcode(ERRCODE_OUT_OF_MEMORY),
1201  errmsg("out of memory")));
 /* the array may have moved: pointers into the old VfdCache are now stale */
1202  VfdCache = newVfdCache;
1203 
1204  /*
1205  * Initialize the new entries and link them into the free list.
1206  */
1207  for (i = SizeVfdCache; i < newCacheSize; i++)
1208  {
1209  MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1210  VfdCache[i].nextFree = i + 1;
1211  VfdCache[i].fd = VFD_CLOSED;
1212  }
 /* terminate the chain, then point the head at the first new slot */
1213  VfdCache[newCacheSize - 1].nextFree = 0;
1214  VfdCache[0].nextFree = SizeVfdCache;
1215 
1216  /*
1217  * Record the new size
1218  */
1219  SizeVfdCache = newCacheSize;
1220  }
1221 
 /* pop the first slot off the free list */
1222  file = VfdCache[0].nextFree;
1223 
1224  VfdCache[0].nextFree = VfdCache[file].nextFree;
1225 
1226  return file;
1227 }
1228 
/*
 * Return a VFD slot to the free list, releasing its malloc'd file name (if
 * any) and clearing its state flags.  The descriptor field is not examined
 * or changed here.
 *
 * NOTE(review): the listing line carrying this function's name and
 * parameter list (1230) is absent; the debug message and the callers suggest
 * "FreeVfd(File file)" -- confirm against the real source.
 */
1229 static void
1231 {
1232  Vfd *vfdP = &VfdCache[file];
1233 
1234  DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1235  file, vfdP->fileName ? vfdP->fileName : ""));
1236 
1237  if (vfdP->fileName != NULL)
1238  {
1239  free(vfdP->fileName);
1240  vfdP->fileName = NULL;
1241  }
1242  vfdP->fdstate = 0x0;
1243 
 /* push the slot onto the front of the free list */
1244  vfdP->nextFree = VfdCache[0].nextFree;
1245  VfdCache[0].nextFree = file;
1246 }
1247 
/*
 * Prepare a VFD for an I/O operation: re-open it if it has no kernel
 * descriptor, otherwise move it to the most-recently-used ring position.
 *
 * NOTE(review): the listing line carrying this function's name and
 * parameter list (1250) is absent; the debug message and the callers suggest
 * "FileAccess(File file)" -- confirm against the real source.
 */
1248 /* returns 0 on success, -1 on re-open failure (with errno set) */
1249 static int
1251 {
1252  int returnValue;
1253 
1254  DO_DB(elog(LOG, "FileAccess %d (%s)",
1255  file, VfdCache[file].fileName));
1256 
1257  /*
1258  * Is the file open? If not, open it and put it at the head of the LRU
1259  * ring (possibly closing the least recently used file to get an FD).
1260  */
1261 
1262  if (FileIsNotOpen(file))
1263  {
1264  returnValue = LruInsert(file);
1265  if (returnValue != 0)
1266  return returnValue;
1267  }
1268  else if (VfdCache[0].lruLessRecently != file)
1269  {
1270  /*
1271  * We now know that the file is open and that it is not the last one
1272  * accessed, so we need to move it to the head of the Lru ring.
1273  */
1274 
1275  Delete(file);
1276  Insert(file);
1277  }
1278 
1279  return 0;
1280 }
1281 
1282 /*
1283  * Called when we get a shared invalidation message on some relation.
1284  */
1285 #ifdef NOT_USED
1286 void
1287 FileInvalidate(File file)
1288 {
1289  Assert(FileIsValid(file));
1290  if (!FileIsNotOpen(file))
1291  LruDelete(file);
1292 }
1293 #endif
1294 
1295 /*
1296  * open a file in an arbitrary directory
1297  *
1298  * NB: if the passed pathname is relative (which it usually is),
1299  * it will be interpreted relative to the process' working directory
1300  * (which should always be $PGDATA when this code is running).
1301  */
1302 File
1303 PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
1304 {
1305  char *fnamecopy;
1306  File file;
1307  Vfd *vfdP;
1308 
1309  DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
1310  fileName, fileFlags, fileMode));
1311 
1312  /*
1313  * We need a malloc'd copy of the file name; fail cleanly if no room.
1314  */
1315  fnamecopy = strdup(fileName);
1316  if (fnamecopy == NULL)
1317  ereport(ERROR,
1318  (errcode(ERRCODE_OUT_OF_MEMORY),
1319  errmsg("out of memory")));
1320 
1321  file = AllocateVfd();
1322  vfdP = &VfdCache[file];
1323 
1324  /* Close excess kernel FDs. */
1325  ReleaseLruFiles();
1326 
1327  vfdP->fd = BasicOpenFile(fileName, fileFlags, fileMode);
1328 
1329  if (vfdP->fd < 0)
1330  {
1331  int save_errno = errno;
1332 
1333  FreeVfd(file);
1334  free(fnamecopy);
1335  errno = save_errno;
1336  return -1;
1337  }
1338  ++nfile;
1339  DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1340  vfdP->fd));
1341 
1342  Insert(file);
1343 
1344  vfdP->fileName = fnamecopy;
1345  /* Saved flags are adjusted to be OK for re-opening file */
1346  vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1347  vfdP->fileMode = fileMode;
1348  vfdP->seekPos = 0;
1349  vfdP->fileSize = 0;
1350  vfdP->fdstate = 0x0;
1351  vfdP->resowner = NULL;
1352 
1353  return file;
1354 }
1355 
1356 /*
1357  * Open a temporary file that will disappear when we close it.
1358  *
1359  * This routine takes care of generating an appropriate tempfile name.
1360  * There's no need to pass in fileFlags or fileMode either, since only
1361  * one setting makes any sense for a temp file.
1362  *
1363  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1364  * to ensure it's closed and deleted when it's no longer needed, typically at
1365  * the end-of-transaction. In most cases, you don't want temporary files to
1366  * outlive the transaction that created them, so this should be false -- but
1367  * if you need "somewhat" temporary storage, this might be useful. In either
1368  * case, the file is removed when the File is explicitly closed.
1369  */
/*
 * Returns the File for the new temporary file.
 *
 * NOTE(review): several listing lines inside this function are absent (see
 * the notes below), so parts of the body cannot be read here.
 */
1370 File
1371 OpenTemporaryFile(bool interXact)
1372 {
1373  File file = 0;
1374 
1375  /*
1376  * If some temp tablespace(s) have been given to us, try to use the next
1377  * one. If a given tablespace can't be found, we silently fall back to
1378  * the database's default tablespace.
1379  *
1380  * BUT: if the temp file is slated to outlive the current transaction,
1381  * force it into the database's default tablespace, so that it will not
1382  * pose a threat to possible tablespace drop attempts.
1383  */
1384  if (numTempTableSpaces > 0 && !interXact)
1385  {
1386  Oid tblspcOid = GetNextTempTableSpace();
1387 
 /* rejectError = false: a failure here just falls through to the default */
1388  if (OidIsValid(tblspcOid))
1389  file = OpenTemporaryFileInTablespace(tblspcOid, false);
1390  }
1391 
1392  /*
1393  * If not, or if tablespace is bad, create in database's default
1394  * tablespace. MyDatabaseTableSpace should normally be set before we get
1395  * here, but just in case it isn't, fall back to pg_default tablespace.
1396  */
1397  if (file <= 0)
 /*
  * NOTE(review): listing lines 1398-1400 are absent; presumably the start of
  * a call that opens the file in the default tablespace, of which only the
  * final "true" argument survives below -- confirm against the real source.
  */
1401  true);
1402 
1403  /* Mark it for deletion at close */
1404  VfdCache[file].fdstate |= FD_TEMPORARY;
1405 
1406  /* Register it with the current resource owner */
1407  if (!interXact)
1408  {
1409  VfdCache[file].fdstate |= FD_XACT_TEMPORARY;
1410 
 /* NOTE(review): listing lines 1411-1412 are absent here -- confirm. */
1413  VfdCache[file].resowner = CurrentResourceOwner;
1414 
1415  /* ensure cleanup happens at eoxact */
 /* NOTE(review): listing line 1416 (the statement this refers to) is absent. */
1417  }
1418 
1419  return file;
1420 }
1421 
1422 /*
1423  * Open a temporary file in a specific tablespace.
1424  * Subroutine for OpenTemporaryFile, which see for details.
1425  */
/*
 * If the first open attempt fails, the temp directory is created and the
 * open is retried once.  When the retry also fails, ERROR is raised if
 * rejectError is true; otherwise the failed (<= 0) result is returned.
 */
1426 static File
1427 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1428 {
1429  char tempdirpath[MAXPGPATH];
1430  char tempfilepath[MAXPGPATH];
1431  File file;
1432 
1433  /*
1434  * Identify the tempfile directory for this tablespace.
1435  *
1436  * If someone tries to specify pg_global, use pg_default instead.
1437  */
1438  if (tblspcOid == DEFAULTTABLESPACE_OID ||
1439  tblspcOid == GLOBALTABLESPACE_OID)
1440  {
1441  /* The default tablespace is {datadir}/base */
1442  snprintf(tempdirpath, sizeof(tempdirpath), "base/%s",
 /* NOTE(review): listing line 1443 (the remaining argument) is absent. */
1444  }
1445  else
1446  {
1447  /* All other tablespaces are accessed via symlinks */
1448  snprintf(tempdirpath, sizeof(tempdirpath), "pg_tblspc/%u/%s/%s",
 /* NOTE(review): listing line 1449 (the remaining arguments) is absent. */
1450  }
1451 
1452  /*
1453  * Generate a tempfile name that should be unique within the current
1454  * database instance.
1455  */
1456  snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1457  tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1458 
1459  /*
1460  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1461  * temp file that can be reused.
1462  */
1463  file = PathNameOpenFile(tempfilepath,
1464  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
1465  0600);
1466  if (file <= 0)
1467  {
1468  /*
1469  * We might need to create the tablespace's tempfile directory, if no
1470  * one has yet done so.
1471  *
1472  * Don't check for error from mkdir; it could fail if someone else
1473  * just did the same thing. If it doesn't work then we'll bomb out on
1474  * the second create attempt, instead.
1475  */
1476  mkdir(tempdirpath, S_IRWXU);
1477 
1478  file = PathNameOpenFile(tempfilepath,
1479  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
1480  0600);
1481  if (file <= 0 && rejectError)
1482  elog(ERROR, "could not create temporary file \"%s\": %m",
1483  tempfilepath);
1484  }
1485 
1486  return file;
1487 }
1488 
1489 /*
1490  * close a file when done with it
1491  */
/*
 * Closes the kernel descriptor if one is held, unlinks the file if it is
 * flagged FD_TEMPORARY, detaches it from its resource owner if it has one,
 * and returns the VFD slot to the free list.  close(), unlink() and stat()
 * failures are only logged.
 *
 * NOTE(review): the listing line carrying this function's name and
 * parameter list (1493) is absent; the debug message suggests
 * "FileClose(File file)" -- confirm against the real source.
 */
1492 void
1494 {
1495  Vfd *vfdP;
1496 
1497  Assert(FileIsValid(file));
1498 
1499  DO_DB(elog(LOG, "FileClose: %d (%s)",
1500  file, VfdCache[file].fileName));
1501 
1502  vfdP = &VfdCache[file];
1503 
1504  if (!FileIsNotOpen(file))
1505  {
1506  /* close the file */
1507  if (close(vfdP->fd))
1508  elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
1509 
1510  --nfile;
1511  vfdP->fd = VFD_CLOSED;
1512 
1513  /* remove the file from the lru ring */
1514  Delete(file);
1515  }
1516 
1517  /*
1518  * Delete the file if it was temporary, and make a log entry if wanted
1519  */
1520  if (vfdP->fdstate & FD_TEMPORARY)
1521  {
1522  struct stat filestats;
1523  int stat_errno;
1524 
1525  /*
1526  * If we get an error, as could happen within the ereport/elog calls,
1527  * we'll come right back here during transaction abort. Reset the
1528  * flag to ensure that we can't get into an infinite loop. This code
1529  * is arranged to ensure that the worst-case consequence is failing to
1530  * emit log message(s), not failing to attempt the unlink.
1531  */
1532  vfdP->fdstate &= ~FD_TEMPORARY;
1533 
1534  /* Subtract its size from current usage (do first in case of error) */
1535  temporary_files_size -= vfdP->fileSize;
1536  vfdP->fileSize = 0;
1537 
1538  /* first try the stat() */
 /* the size must be sampled before the unlink below removes the name */
1539  if (stat(vfdP->fileName, &filestats))
1540  stat_errno = errno;
1541  else
1542  stat_errno = 0;
1543 
1544  /* in any case do the unlink */
1545  if (unlink(vfdP->fileName))
1546  elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
1547 
1548  /* and last report the stat results */
1549  if (stat_errno == 0)
1550  {
1551  pgstat_report_tempfile(filestats.st_size);
1552 
1553  if (log_temp_files >= 0)
1554  {
 /* st_size is in bytes; the comparison treats log_temp_files as kilobytes */
1555  if ((filestats.st_size / 1024) >= log_temp_files)
1556  ereport(LOG,
1557  (errmsg("temporary file: path \"%s\", size %lu",
1558  vfdP->fileName,
1559  (unsigned long) filestats.st_size)));
1560  }
1561  }
1562  else
1563  {
 /* restore stat()'s errno so that %m reports the stat failure */
1564  errno = stat_errno;
1565  elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
1566  }
1567  }
1568 
1569  /* Unregister it from the resource owner */
1570  if (vfdP->resowner)
1571  ResourceOwnerForgetFile(vfdP->resowner, file);
1572 
1573  /*
1574  * Return the Vfd slot to the free list
1575  */
1576  FreeVfd(file);
1577 }
1578 
1579 /*
1580  * FilePrefetch - initiate asynchronous read of a given range of the file.
1581  * The logical seek position is unaffected.
1582  *
1583  * Currently the only implementation of this function is using posix_fadvise
1584  * which is the simplest standardized interface that accomplishes this.
1585  * We could add an implementation using libaio in the future; but note that
1586  * this API is inappropriate for libaio, which wants to have a buffer provided
1587  * to read into.
1588  */
/*
 * Returns FileAccess()'s negative result if the file cannot be made ready,
 * otherwise whatever posix_fadvise() returned.  In builds without
 * USE_POSIX_FADVISE / POSIX_FADV_WILLNEED this does nothing and returns 0.
 */
1589 int
1590 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1591 {
1592 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1593  int returnCode;
1594 
1595  Assert(FileIsValid(file));
1596 
1597  DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1598  file, VfdCache[file].fileName,
1599  (int64) offset, amount));
1600 
1601  returnCode = FileAccess(file);
1602  if (returnCode < 0)
1603  return returnCode;
1604 
1605  pgstat_report_wait_start(wait_event_info);
1606  returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1607  POSIX_FADV_WILLNEED);
 /*
  * NOTE(review): listing line 1608 is absent; presumably the call that ends
  * the wait started above -- confirm against the real source.
  */
1609 
1610  return returnCode;
1611 #else
1612  Assert(FileIsValid(file));
1613  return 0;
1614 #endif
1615 }
1616 
/*
 * FileWriteback - pass a byte range of the file to pg_flush_data().
 *
 * Does nothing when nbytes <= 0.  A failure to make the file ready is
 * silently ignored; no status is reported to the caller.
 */
1617 void
1618 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1619 {
1620  int returnCode;
1621 
1622  Assert(FileIsValid(file));
1623 
1624  DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1625  file, VfdCache[file].fileName,
1626  (int64) offset, (int64) nbytes));
1627 
1628  /*
1629  * Caution: do not call pg_flush_data with nbytes = 0, it could trash the
1630  * file's seek position. We prefer to define that as a no-op here.
1631  */
1632  if (nbytes <= 0)
1633  return;
1634 
1635  returnCode = FileAccess(file);
1636  if (returnCode < 0)
1637  return;
1638 
1639  pgstat_report_wait_start(wait_event_info);
1640  pg_flush_data(VfdCache[file].fd, offset, nbytes);
 /*
  * NOTE(review): listing line 1641 is absent; presumably the call that ends
  * the wait started above -- confirm against the real source.
  */
1642 }
1643 
/*
 * FileRead - read up to "amount" bytes into "buffer" from the file's
 * current position.
 *
 * Returns read()'s result (or FileAccess()'s negative result if the file
 * cannot be made ready).  A successful read advances the tracked seek
 * position when it is known.  A failure other than EINTR marks the position
 * unknown; EINTR causes a retry.
 */
1644 int
1645 FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
1646 {
1647  int returnCode;
1648  Vfd *vfdP;
1649 
1650  Assert(FileIsValid(file));
1651 
1652  DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1653  file, VfdCache[file].fileName,
1654  (int64) VfdCache[file].seekPos,
1655  amount, buffer));
1656 
1657  returnCode = FileAccess(file);
1658  if (returnCode < 0)
1659  return returnCode;
1660 
1661  vfdP = &VfdCache[file];
1662 
1663 retry:
1664  pgstat_report_wait_start(wait_event_info);
1665  returnCode = read(vfdP->fd, buffer, amount);
 /*
  * NOTE(review): listing line 1666 is absent; presumably the call that ends
  * the wait started above -- confirm against the real source.
  */
1667 
1668  if (returnCode >= 0)
1669  {
1670  /* if seekPos is unknown, leave it that way */
1671  if (!FilePosIsUnknown(vfdP->seekPos))
1672  vfdP->seekPos += returnCode;
1673  }
1674  else
1675  {
1676  /*
1677  * Windows may run out of kernel buffers and return "Insufficient
1678  * system resources" error. Wait a bit and retry to solve it.
1679  *
1680  * It is rumored that EINTR is also possible on some Unix filesystems,
1681  * in which case immediate retry is indicated.
1682  */
1683 #ifdef WIN32
1684  DWORD error = GetLastError();
1685 
1686  switch (error)
1687  {
1688  case ERROR_NO_SYSTEM_RESOURCES:
1689  pg_usleep(1000L);
 /* reuse the EINTR retry path below */
1690  errno = EINTR;
1691  break;
1692  default:
1693  _dosmaperr(error);
1694  break;
1695  }
1696 #endif
1697  /* OK to retry if interrupted */
1698  if (errno == EINTR)
1699  goto retry;
1700 
1701  /* Trouble, so assume we don't know the file position anymore */
1702  vfdP->seekPos = FileUnknownPos;
1703  }
1704 
1705  return returnCode;
1706 }
1707 
/*
 * FileWrite - write "amount" bytes from "buffer" at the file's current
 * position.
 *
 * Returns write()'s result (or FileAccess()'s negative result if the file
 * cannot be made ready).  A short write that left errno at zero is reported
 * with errno = ENOSPC.  For temporary files the per-file size and the
 * temporary_files_size total are kept up to date, and temp_file_limit is
 * enforced by raising ERROR before the write is attempted.
 */
1708 int
1709 FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
1710 {
1711  int returnCode;
1712  Vfd *vfdP;
1713 
1714  Assert(FileIsValid(file));
1715 
1716  DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
1717  file, VfdCache[file].fileName,
1718  (int64) VfdCache[file].seekPos,
1719  amount, buffer));
1720 
1721  returnCode = FileAccess(file);
1722  if (returnCode < 0)
1723  return returnCode;
1724 
1725  vfdP = &VfdCache[file];
1726 
1727  /*
1728  * If enforcing temp_file_limit and it's a temp file, check to see if the
1729  * write would overrun temp_file_limit, and throw error if so. Note: it's
1730  * really a modularity violation to throw error here; we should set errno
1731  * and return -1. However, there's no way to report a suitable error
1732  * message if we do that. All current callers would just throw error
1733  * immediately anyway, so this is safe at present.
1734  */
1735  if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMPORARY))
1736  {
1737  off_t newPos;
1738 
1739  /*
1740  * Normally we should know the seek position, but if for some reason
1741  * we have lost track of it, try again to get it. Here, it's fine to
1742  * throw an error if we still can't get it.
1743  */
1744  if (FilePosIsUnknown(vfdP->seekPos))
1745  {
1746  vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
1747  if (FilePosIsUnknown(vfdP->seekPos))
1748  elog(ERROR, "could not seek file \"%s\": %m", vfdP->fileName);
1749  }
1750 
1751  newPos = vfdP->seekPos + amount;
 /* only growth beyond the recorded file size counts against the limit */
1752  if (newPos > vfdP->fileSize)
1753  {
1754  uint64 newTotal = temporary_files_size;
1755 
1756  newTotal += newPos - vfdP->fileSize;
 /* the limit is multiplied by 1024 for comparison with the byte total */
1757  if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
1758  ereport(ERROR,
1759  (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
1760  errmsg("temporary file size exceeds temp_file_limit (%dkB)",
1761  temp_file_limit)));
1762  }
1763  }
1764 
1765 retry:
 /* cleared so that a short write which sets no errno can be detected below */
1766  errno = 0;
1767  pgstat_report_wait_start(wait_event_info);
1768  returnCode = write(vfdP->fd, buffer, amount);
 /*
  * NOTE(review): listing line 1769 is absent; presumably the call that ends
  * the wait started above -- confirm against the real source.
  */
1770 
1771  /* if write didn't set errno, assume problem is no disk space */
1772  if (returnCode != amount && errno == 0)
1773  errno = ENOSPC;
1774 
1775  if (returnCode >= 0)
1776  {
1777  /* if seekPos is unknown, leave it that way */
1778  if (!FilePosIsUnknown(vfdP->seekPos))
1779  vfdP->seekPos += returnCode;
1780 
1781  /*
1782  * Maintain fileSize and temporary_files_size if it's a temp file.
1783  *
1784  * If seekPos is -1 (unknown), this will do nothing; but we could only
1785  * get here in that state if we're not enforcing temporary_files_size,
1786  * so we don't care.
1787  */
1788  if (vfdP->fdstate & FD_TEMPORARY)
1789  {
1790  off_t newPos = vfdP->seekPos;
1791 
1792  if (newPos > vfdP->fileSize)
1793  {
1794  temporary_files_size += newPos - vfdP->fileSize;
1795  vfdP->fileSize = newPos;
1796  }
1797  }
1798  }
1799  else
1800  {
1801  /*
1802  * See comments in FileRead()
1803  */
1804 #ifdef WIN32
1805  DWORD error = GetLastError();
1806 
1807  switch (error)
1808  {
1809  case ERROR_NO_SYSTEM_RESOURCES:
1810  pg_usleep(1000L);
1811  errno = EINTR;
1812  break;
1813  default:
1814  _dosmaperr(error);
1815  break;
1816  }
1817 #endif
1818  /* OK to retry if interrupted */
1819  if (errno == EINTR)
1820  goto retry;
1821 
1822  /* Trouble, so assume we don't know the file position anymore */
1823  vfdP->seekPos = FileUnknownPos;
1824  }
1825 
1826  return returnCode;
1827 }
1828 
/*
 * FileSync - run pg_fsync() on the file's kernel descriptor.
 *
 * Returns FileAccess()'s negative result if the file cannot be made ready,
 * otherwise pg_fsync()'s result.
 */
1829 int
1830 FileSync(File file, uint32 wait_event_info)
1831 {
1832  int returnCode;
1833 
1834  Assert(FileIsValid(file));
1835 
1836  DO_DB(elog(LOG, "FileSync: %d (%s)",
1837  file, VfdCache[file].fileName));
1838 
1839  returnCode = FileAccess(file);
1840  if (returnCode < 0)
1841  return returnCode;
1842 
1843  pgstat_report_wait_start(wait_event_info);
1844  returnCode = pg_fsync(VfdCache[file].fd);
 /*
  * NOTE(review): listing line 1845 is absent; presumably the call that ends
  * the wait started above -- confirm against the real source.
  */
1846 
1847  return returnCode;
1848 }
1849 
1850 off_t
1851 FileSeek(File file, off_t offset, int whence)
1852 {
1853  Vfd *vfdP;
1854 
1855  Assert(FileIsValid(file));
1856 
1857  DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
1858  file, VfdCache[file].fileName,
1859  (int64) VfdCache[file].seekPos,
1860  (int64) offset, whence));
1861 
1862  vfdP = &VfdCache[file];
1863 
1864  if (FileIsNotOpen(file))
1865  {
1866  switch (whence)
1867  {
1868  case SEEK_SET:
1869  if (offset < 0)
1870  {
1871  errno = EINVAL;
1872  return (off_t) -1;
1873  }
1874  vfdP->seekPos = offset;
1875  break;
1876  case SEEK_CUR:
1877  if (FilePosIsUnknown(vfdP->seekPos) ||
1878  vfdP->seekPos + offset < 0)
1879  {
1880  errno = EINVAL;
1881  return (off_t) -1;
1882  }
1883  vfdP->seekPos += offset;
1884  break;
1885  case SEEK_END:
1886  if (FileAccess(file) < 0)
1887  return (off_t) -1;
1888  vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1889  break;
1890  default:
1891  elog(ERROR, "invalid whence: %d", whence);
1892  break;
1893  }
1894  }
1895  else
1896  {
1897  switch (whence)
1898  {
1899  case SEEK_SET:
1900  if (offset < 0)
1901  {
1902  errno = EINVAL;
1903  return (off_t) -1;
1904  }
1905  if (vfdP->seekPos != offset)
1906  vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1907  break;
1908  case SEEK_CUR:
1909  if (offset != 0 || FilePosIsUnknown(vfdP->seekPos))
1910  vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1911  break;
1912  case SEEK_END:
1913  vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1914  break;
1915  default:
1916  elog(ERROR, "invalid whence: %d", whence);
1917  break;
1918  }
1919  }
1920 
1921  return vfdP->seekPos;
1922 }
1923 
1924 /*
1925  * XXX not actually used but here for completeness
1926  */
1927 #ifdef NOT_USED
1928 off_t
1929 FileTell(File file)
1930 {
1931  Assert(FileIsValid(file));
1932  DO_DB(elog(LOG, "FileTell %d (%s)",
1933  file, VfdCache[file].fileName));
1934  return VfdCache[file].seekPos;
1935 }
1936 #endif
1937 
/*
 * FileTruncate - set the file's length to "offset" via ftruncate().
 *
 * Returns FileAccess()'s negative result if the file cannot be made ready,
 * otherwise ftruncate()'s result.  When a successful truncation shrinks a
 * file below its recorded size, the temporary-file accounting is reduced
 * to match.
 */
1938 int
1939 FileTruncate(File file, off_t offset, uint32 wait_event_info)
1940 {
1941  int returnCode;
1942 
1943  Assert(FileIsValid(file));
1944 
1945  DO_DB(elog(LOG, "FileTruncate %d (%s)",
1946  file, VfdCache[file].fileName));
1947 
1948  returnCode = FileAccess(file);
1949  if (returnCode < 0)
1950  return returnCode;
1951 
1952  pgstat_report_wait_start(wait_event_info);
1953  returnCode = ftruncate(VfdCache[file].fd, offset);
 /*
  * NOTE(review): listing line 1954 is absent; presumably the call that ends
  * the wait started above -- confirm against the real source.
  */
1955 
1956  if (returnCode == 0 && VfdCache[file].fileSize > offset)
1957  {
1958  /* adjust our state for truncation of a temp file */
 /* fileSize is only ever raised for temp files (see FileWrite), hence the Assert */
1959  Assert(VfdCache[file].fdstate & FD_TEMPORARY);
1960  temporary_files_size -= VfdCache[file].fileSize - offset;
1961  VfdCache[file].fileSize = offset;
1962  }
1963 
1964  return returnCode;
1965 }
1966 
1967 /*
1968  * Return the pathname associated with an open file.
1969  *
1970  * The returned string points to an internal buffer, which is valid until
1971  * the file is closed.
1972  */
/*
 * The caller must not free or modify the returned string; it is the VFD's
 * own fileName pointer.
 *
 * NOTE(review): the listing line carrying this function's name and
 * parameter list (1974) is absent and nothing else in this listing names it
 * (upstream likely calls it FilePathName(File file)) -- confirm.
 */
1973 char *
1975 {
1976  Assert(FileIsValid(file));
1977 
1978  return VfdCache[file].fileName;
1979 }
1980 
1981 /*
1982  * Return the raw file descriptor of an opened file.
1983  *
1984  * The returned file descriptor will be valid until the file is closed, but
1985  * there are a lot of things that can make that happen. So the caller should
1986  * be careful not to do much of anything else before it finishes using the
1987  * returned file descriptor.
1988  */
/*
 * The stored descriptor is returned as is; no attempt is made here to
 * re-open a VFD whose descriptor has been released.
 *
 * NOTE(review): the listing line carrying this function's name and
 * parameter list (1990) is absent and nothing else in this listing names it
 * (upstream likely calls it FileGetRawDesc(File file)) -- confirm.
 */
1989 int
1991 {
1992  Assert(FileIsValid(file));
1993  return VfdCache[file].fd;
1994 }
1995 
1996 /*
1997  * FileGetRawFlags - returns the file flags on open(2)
1998  */
1999 int
2001 {
2002  Assert(FileIsValid(file));
2003  return VfdCache[file].fileFlags;
2004 }
2005 
2006 /*
2007  * FileGetRawMode - returns the mode bitmask passed to open(2)
2008  */
2009 int
2011 {
2012  Assert(FileIsValid(file));
2013  return VfdCache[file].fileMode;
2014 }
2015 
2016 /*
2017  * Make room for another allocatedDescs[] array entry if needed and possible.
2018  * Returns true if an array element is available.
2019  */
2020 static bool
2022 {
2023  AllocateDesc *newDescs;
2024  int newMax;
2025 
2026  /* Quick out if array already has a free slot. */
2028  return true;
2029 
2030  /*
2031  * If the array hasn't yet been created in the current process, initialize
2032  * it with FD_MINFREE / 2 elements. In many scenarios this is as many as
2033  * we will ever need, anyway. We don't want to look at max_safe_fds
2034  * immediately because set_max_safe_fds() may not have run yet.
2035  */
2036  if (allocatedDescs == NULL)
2037  {
2038  newMax = FD_MINFREE / 2;
2039  newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2040  /* Out of memory already? Treat as fatal error. */
2041  if (newDescs == NULL)
2042  ereport(ERROR,
2043  (errcode(ERRCODE_OUT_OF_MEMORY),
2044  errmsg("out of memory")));
2045  allocatedDescs = newDescs;
2046  maxAllocatedDescs = newMax;
2047  return true;
2048  }
2049 
2050  /*
2051  * Consider enlarging the array beyond the initial allocation used above.
2052  * By the time this happens, max_safe_fds should be known accurately.
2053  *
2054  * We mustn't let allocated descriptors hog all the available FDs, and in
2055  * practice we'd better leave a reasonable number of FDs for VFD use. So
2056  * set the maximum to max_safe_fds / 2. (This should certainly be at
2057  * least as large as the initial size, FD_MINFREE / 2.)
2058  */
2059  newMax = max_safe_fds / 2;
2060  if (newMax > maxAllocatedDescs)
2061  {
2062  newDescs = (AllocateDesc *) realloc(allocatedDescs,
2063  newMax * sizeof(AllocateDesc));
2064  /* Treat out-of-memory as a non-fatal error. */
2065  if (newDescs == NULL)
2066  return false;
2067  allocatedDescs = newDescs;
2068  maxAllocatedDescs = newMax;
2069  return true;
2070  }
2071 
2072  /* Can't enlarge allocatedDescs[] any more. */
2073  return false;
2074 }
2075 
2076 /*
2077  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2078  * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2079  * necessary to open the file. When done, call FreeFile rather than fclose.
2080  *
2081  * Note that files that will be open for any significant length of time
2082  * should NOT be handled this way, since they cannot share kernel file
2083  * descriptors with other files; there is grave risk of running out of FDs
2084  * if anyone locks down too many FDs. Most callers of this routine are
2085  * simply reading a config file that they will read and close immediately.
2086  *
2087  * fd.c will automatically close all files opened with AllocateFile at
2088  * transaction commit or abort; this prevents FD leakage if a routine
2089  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2090  *
2091  * Ideally this should be the *only* direct call of fopen() in the backend.
2092  */
/*
 * Returns the opened stream, or NULL if fopen() failed.  Raises ERROR if no
 * allocatedDescs[] entry can be reserved.
 */
2093 FILE *
2094 AllocateFile(const char *name, const char *mode)
2095 {
2096  FILE *file;
2097 
2098  DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2099  numAllocatedDescs, name));
2100 
2101  /* Can we allocate another non-virtual FD? */
2102  if (!reserveAllocatedDesc())
2103  ereport(ERROR,
2104  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2105  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2106  maxAllocatedDescs, name)));
2107 
2108  /* Close excess kernel FDs. */
2109  ReleaseLruFiles();
2110 
2111 TryAgain:
2112  if ((file = fopen(name, mode)) != NULL)
2113  {
2114  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2115 
2116  desc->kind = AllocateDescFile;
2117  desc->desc.file = file;
 /*
  * NOTE(review): listing lines 2118-2119 are absent; presumably where the
  * entry is completed and numAllocatedDescs is advanced -- confirm against
  * the real source.
  */
2120  return desc->desc.file;
2121  }
2122 
 /* descriptor exhaustion: close one VFD's descriptor and retry the fopen() */
2123  if (errno == EMFILE || errno == ENFILE)
2124  {
2125  int save_errno = errno;
2126 
2127  ereport(LOG,
2128  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2129  errmsg("out of file descriptors: %m; release and retry")));
2130  errno = 0;
2131  if (ReleaseLruFile())
2132  goto TryAgain;
 /* nothing could be released: report the original failure */
2133  errno = save_errno;
2134  }
2135 
2136  return NULL;
2137 }
2138 
2139 
2140 /*
2141  * Like AllocateFile, but returns an unbuffered fd like open(2)
2142  */
/*
 * Returns the descriptor, or -1 if BasicOpenFile() failed.  Raises ERROR if
 * no allocatedDescs[] entry can be reserved.
 */
2143 int
2144 OpenTransientFile(FileName fileName, int fileFlags, int fileMode)
2145 {
2146  int fd;
2147 
2148  DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2149  numAllocatedDescs, fileName));
2150 
2151  /* Can we allocate another non-virtual FD? */
2152  if (!reserveAllocatedDesc())
2153  ereport(ERROR,
2154  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2155  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2156  maxAllocatedDescs, fileName)));
2157 
2158  /* Close excess kernel FDs. */
2159  ReleaseLruFiles();
2160 
2161  fd = BasicOpenFile(fileName, fileFlags, fileMode);
2162 
2163  if (fd >= 0)
2164  {
2165  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2166 
2167  desc->kind = AllocateDescRawFD;
2168  desc->desc.fd = fd;
 /*
  * NOTE(review): listing lines 2169-2170 are absent; presumably where the
  * entry is completed and numAllocatedDescs is advanced -- confirm against
  * the real source.
  */
2171 
2172  return fd;
2173  }
2174 
2175  return -1; /* failure */
2176 }
2177 
2178 /*
2179  * Routines that want to initiate a pipe stream should use OpenPipeStream
2180  * rather than plain popen(). This lets fd.c deal with freeing FDs if
2181  * necessary. When done, call ClosePipeStream rather than pclose.
2182  */
/*
 * Returns the opened stream, or NULL if popen() failed.  Raises ERROR if no
 * allocatedDescs[] entry can be reserved.
 */
2183 FILE *
2184 OpenPipeStream(const char *command, const char *mode)
2185 {
2186  FILE *file;
2187 
2188  DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2189  numAllocatedDescs, command));
2190 
2191  /* Can we allocate another non-virtual FD? */
2192  if (!reserveAllocatedDesc())
2193  ereport(ERROR,
2194  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2195  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2196  maxAllocatedDescs, command)));
2197 
2198  /* Close excess kernel FDs. */
2199  ReleaseLruFiles();
2200 
2201 TryAgain:
 /* flush our own stdio buffers before popen() starts the command */
2202  fflush(stdout);
2203  fflush(stderr);
 /* cleared so a stale errno is not mistaken for a popen() failure cause */
2204  errno = 0;
2205  if ((file = popen(command, mode)) != NULL)
2206  {
2207  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2208 
2209  desc->kind = AllocateDescPipe;
2210  desc->desc.file = file;
 /*
  * NOTE(review): listing lines 2211-2212 are absent; presumably where the
  * entry is completed and numAllocatedDescs is advanced -- confirm against
  * the real source.
  */
2213  return desc->desc.file;
2214  }
2215 
 /* descriptor exhaustion: close one VFD's descriptor and retry the popen() */
2216  if (errno == EMFILE || errno == ENFILE)
2217  {
2218  int save_errno = errno;
2219 
2220  ereport(LOG,
2221  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2222  errmsg("out of file descriptors: %m; release and retry")));
2223  errno = 0;
2224  if (ReleaseLruFile())
2225  goto TryAgain;
2226  errno = save_errno;
2227  }
2228 
2229  return NULL;
2230 }
2231 
2232 /*
2233  * Free an AllocateDesc of any type.
2234  *
2235  * The argument *must* point into the allocatedDescs[] array.
2236  */
2237 static int
2239 {
2240  int result;
2241 
2242  /* Close the underlying object */
2243  switch (desc->kind)
2244  {
2245  case AllocateDescFile:
2246  result = fclose(desc->desc.file);
2247  break;
2248  case AllocateDescPipe:
2249  result = pclose(desc->desc.file);
2250  break;
2251  case AllocateDescDir:
2252  result = closedir(desc->desc.dir);
2253  break;
2254  case AllocateDescRawFD:
2255  result = close(desc->desc.fd);
2256  break;
2257  default:
2258  elog(ERROR, "AllocateDesc kind not recognized");
2259  result = 0; /* keep compiler quiet */
2260  break;
2261  }
2262 
2263  /* Compact storage in the allocatedDescs array */
2265  *desc = allocatedDescs[numAllocatedDescs];
2266 
2267  return result;
2268 }
2269 
2270 /*
2271  * Close a file returned by AllocateFile.
2272  *
2273  * Note we do not check fclose's return value --- it is up to the caller
2274  * to handle close errors.
2275  */
2276 int
2277 FreeFile(FILE *file)
2278 {
2279  int i;
2280 
2281  DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2282 
2283  /* Remove file from list of allocated files, if it's present */
2284  for (i = numAllocatedDescs; --i >= 0;)
2285  {
2286  AllocateDesc *desc = &allocatedDescs[i];
2287 
2288  if (desc->kind == AllocateDescFile && desc->desc.file == file)
2289  return FreeDesc(desc);
2290  }
2291 
2292  /* Only get here if someone passes us a file not in allocatedDescs */
2293  elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2294 
2295  return fclose(file);
2296 }
2297 
2298 /*
2299  * Close a file returned by OpenTransientFile.
2300  *
2301  * Note we do not check close's return value --- it is up to the caller
2302  * to handle close errors.
2303  */
2304 int
2306 {
2307  int i;
2308 
2309  DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2310 
2311  /* Remove fd from list of allocated files, if it's present */
2312  for (i = numAllocatedDescs; --i >= 0;)
2313  {
2314  AllocateDesc *desc = &allocatedDescs[i];
2315 
2316  if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2317  return FreeDesc(desc);
2318  }
2319 
2320  /* Only get here if someone passes us a file not in allocatedDescs */
2321  elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2322 
2323  return close(fd);
2324 }
2325 
2326 /*
2327  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2328  * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2329  * necessary to open the directory, and with closing it after an elog.
2330  * When done, call FreeDir rather than closedir.
2331  *
2332  * Ideally this should be the *only* direct call of opendir() in the backend.
2333  */
2334 DIR *
2335 AllocateDir(const char *dirname)
2336 {
2337  DIR *dir;
2338 
2339  DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2340  numAllocatedDescs, dirname));
2341 
2342  /* Can we allocate another non-virtual FD? */
2343  if (!reserveAllocatedDesc())
2344  ereport(ERROR,
2345  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2346  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2347  maxAllocatedDescs, dirname)));
2348 
2349  /* Close excess kernel FDs. */
2350  ReleaseLruFiles();
2351 
2352 TryAgain:
2353  if ((dir = opendir(dirname)) != NULL)
2354  {
2355  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2356 
2357  desc->kind = AllocateDescDir;
2358  desc->desc.dir = dir;
2361  return desc->desc.dir;
2362  }
2363 
2364  if (errno == EMFILE || errno == ENFILE)
2365  {
2366  int save_errno = errno;
2367 
2368  ereport(LOG,
2369  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2370  errmsg("out of file descriptors: %m; release and retry")));
2371  errno = 0;
2372  if (ReleaseLruFile())
2373  goto TryAgain;
2374  errno = save_errno;
2375  }
2376 
2377  return NULL;
2378 }
2379 
2380 /*
2381  * Read a directory opened with AllocateDir, ereport'ing any error.
2382  *
2383  * This is easier to use than raw readdir() since it takes care of some
2384  * otherwise rather tedious and error-prone manipulation of errno. Also,
2385  * if you are happy with a generic error message for AllocateDir failure,
2386  * you can just do
2387  *
2388  * dir = AllocateDir(path);
2389  * while ((dirent = ReadDir(dir, path)) != NULL)
2390  * process dirent;
2391  * FreeDir(dir);
2392  *
2393  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2394  * (Make sure errno hasn't been changed since AllocateDir if you use this
2395  * shortcut.)
2396  *
2397  * The pathname passed to AllocateDir must be passed to this routine too,
2398  * but it is only used for error reporting.
2399  */
2400 struct dirent *
2401 ReadDir(DIR *dir, const char *dirname)
2402 {
2403  return ReadDirExtended(dir, dirname, ERROR);
2404 }
2405 
2406 /*
2407  * Alternate version that allows caller to specify the elevel for any
2408  * error report. If elevel < ERROR, returns NULL on any error.
2409  */
2410 static struct dirent *
2411 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2412 {
2413  struct dirent *dent;
2414 
2415  /* Give a generic message for AllocateDir failure, if caller didn't */
2416  if (dir == NULL)
2417  {
2418  ereport(elevel,
2420  errmsg("could not open directory \"%s\": %m",
2421  dirname)));
2422  return NULL;
2423  }
2424 
2425  errno = 0;
2426  if ((dent = readdir(dir)) != NULL)
2427  return dent;
2428 
2429  if (errno)
2430  ereport(elevel,
2432  errmsg("could not read directory \"%s\": %m",
2433  dirname)));
2434  return NULL;
2435 }
2436 
2437 /*
2438  * Close a directory opened with AllocateDir.
2439  *
2440  * Note we do not check closedir's return value --- it is up to the caller
2441  * to handle close errors.
2442  */
2443 int
2445 {
2446  int i;
2447 
2448  DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2449 
2450  /* Remove dir from list of allocated dirs, if it's present */
2451  for (i = numAllocatedDescs; --i >= 0;)
2452  {
2453  AllocateDesc *desc = &allocatedDescs[i];
2454 
2455  if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2456  return FreeDesc(desc);
2457  }
2458 
2459  /* Only get here if someone passes us a dir not in allocatedDescs */
2460  elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2461 
2462  return closedir(dir);
2463 }
2464 
2465 
2466 /*
2467  * Close a pipe stream returned by OpenPipeStream.
2468  */
2469 int
2470 ClosePipeStream(FILE *file)
2471 {
2472  int i;
2473 
2474  DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2475 
2476  /* Remove file from list of allocated files, if it's present */
2477  for (i = numAllocatedDescs; --i >= 0;)
2478  {
2479  AllocateDesc *desc = &allocatedDescs[i];
2480 
2481  if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2482  return FreeDesc(desc);
2483  }
2484 
2485  /* Only get here if someone passes us a file not in allocatedDescs */
2486  elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2487 
2488  return pclose(file);
2489 }
2490 
2491 /*
2492  * closeAllVfds
2493  *
2494  * Force all VFDs into the physically-closed state, so that the fewest
2495  * possible number of kernel file descriptors are in use. There is no
2496  * change in the logical state of the VFDs.
2497  */
2498 void
2500 {
2501  Index i;
2502 
2503  if (SizeVfdCache > 0)
2504  {
2505  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2506  for (i = 1; i < SizeVfdCache; i++)
2507  {
2508  if (!FileIsNotOpen(i))
2509  LruDelete(i);
2510  }
2511  }
2512 }
2513 
2514 
2515 /*
2516  * SetTempTablespaces
2517  *
2518  * Define a list (actually an array) of OIDs of tablespaces to use for
2519  * temporary files. This list will be used until end of transaction,
2520  * unless this function is called again before then. It is caller's
2521  * responsibility that the passed-in array has adequate lifespan (typically
2522  * it'd be allocated in TopTransactionContext).
2523  */
2524 void
2525 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2526 {
2527  Assert(numSpaces >= 0);
2528  tempTableSpaces = tableSpaces;
2529  numTempTableSpaces = numSpaces;
2530 
2531  /*
2532  * Select a random starting point in the list. This is to minimize
2533  * conflicts between backends that are most likely sharing the same list
2534  * of temp tablespaces. Note that if we create multiple temp files in the
2535  * same transaction, we'll advance circularly through the list --- this
2536  * ensures that large temporary sort files are nicely spread across all
2537  * available tablespaces.
2538  */
2539  if (numSpaces > 1)
2540  nextTempTableSpace = random() % numSpaces;
2541  else
2542  nextTempTableSpace = 0;
2543 }
2544 
2545 /*
2546  * TempTablespacesAreSet
2547  *
2548  * Returns TRUE if SetTempTablespaces has been called in current transaction.
2549  * (This is just so that tablespaces.c doesn't need its own per-transaction
2550  * state.)
2551  */
2552 bool
2554 {
2555  return (numTempTableSpaces >= 0);
2556 }
2557 
2558 /*
2559  * GetNextTempTableSpace
2560  *
2561  * Select the next temp tablespace to use. A result of InvalidOid means
2562  * to use the current database's default tablespace.
2563  */
2564 Oid
2566 {
2567  if (numTempTableSpaces > 0)
2568  {
2569  /* Advance nextTempTableSpace counter with wraparound */
2571  nextTempTableSpace = 0;
2573  }
2574  return InvalidOid;
2575 }
2576 
2577 
2578 /*
2579  * AtEOSubXact_Files
2580  *
2581  * Take care of subtransaction commit/abort. At abort, we close temp files
2582  * that the subtransaction may have opened. At commit, we reassign the
2583  * files that were opened to the parent subtransaction.
2584  */
2585 void
2586 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2587  SubTransactionId parentSubid)
2588 {
2589  Index i;
2590 
2591  for (i = 0; i < numAllocatedDescs; i++)
2592  {
2593  if (allocatedDescs[i].create_subid == mySubid)
2594  {
2595  if (isCommit)
2596  allocatedDescs[i].create_subid = parentSubid;
2597  else
2598  {
2599  /* have to recheck the item after FreeDesc (ugly) */
2600  FreeDesc(&allocatedDescs[i--]);
2601  }
2602  }
2603  }
2604 }
2605 
2606 /*
2607  * AtEOXact_Files
2608  *
2609  * This routine is called during transaction commit or abort (it doesn't
2610  * particularly care which). All still-open per-transaction temporary file
2611  * VFDs are closed, which also causes the underlying files to be deleted
2612  * (although they should've been closed already by the ResourceOwner
2613  * cleanup). Furthermore, all "allocated" stdio files are closed. We also
2614  * forget any transaction-local temp tablespace list.
2615  */
2616 void
2618 {
2619  CleanupTempFiles(false);
2621  numTempTableSpaces = -1;
2622 }
2623 
2624 /*
2625  * AtProcExit_Files
2626  *
2627  * on_proc_exit hook to clean up temp files during backend shutdown.
2628  * Here, we want to clean up *all* temp files including interXact ones.
2629  */
2630 static void
2632 {
2633  CleanupTempFiles(true);
2634 }
2635 
2636 /*
2637  * Close temporary files and delete their underlying files.
2638  *
2639  * isProcExit: if true, this is being called as the backend process is
2640  * exiting. If that's the case, we should remove all temporary files; if
2641  * that's not the case, we are being called for transaction commit/abort
2642  * and should only remove transaction-local temp files. In either case,
2643  * also clean up "allocated" stdio files, dirs and fds.
2644  */
2645 static void
2646 CleanupTempFiles(bool isProcExit)
2647 {
2648  Index i;
2649 
2650  /*
2651  * Careful here: at proc_exit we need extra cleanup, not just
2652  * xact_temporary files.
2653  */
2654  if (isProcExit || have_xact_temporary_files)
2655  {
2656  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2657  for (i = 1; i < SizeVfdCache; i++)
2658  {
2659  unsigned short fdstate = VfdCache[i].fdstate;
2660 
2661  if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL)
2662  {
2663  /*
2664  * If we're in the process of exiting a backend process, close
2665  * all temporary files. Otherwise, only close temporary files
2666  * local to the current transaction. They should be closed by
2667  * the ResourceOwner mechanism already, so this is just a
2668  * debugging cross-check.
2669  */
2670  if (isProcExit)
2671  FileClose(i);
2672  else if (fdstate & FD_XACT_TEMPORARY)
2673  {
2674  elog(WARNING,
2675  "temporary file %s not closed at end-of-transaction",
2676  VfdCache[i].fileName);
2677  FileClose(i);
2678  }
2679  }
2680  }
2681 
2682  have_xact_temporary_files = false;
2683  }
2684 
2685  /* Clean up "allocated" stdio files, dirs and fds. */
2686  while (numAllocatedDescs > 0)
2687  FreeDesc(&allocatedDescs[0]);
2688 }
2689 
2690 
2691 /*
2692  * Remove temporary and temporary relation files left over from a prior
2693  * postmaster session
2694  *
2695  * This should be called during postmaster startup. It will forcibly
2696  * remove any leftover files created by OpenTemporaryFile and any leftover
2697  * temporary relation files created by mdcreate.
2698  *
2699  * NOTE: we could, but don't, call this during a post-backend-crash restart
2700  * cycle. The argument for not doing it is that someone might want to examine
2701  * the temp files for debugging purposes. This does however mean that
2702  * OpenTemporaryFile had better allow for collision with an existing temp
2703  * file name.
2704  */
2705 void
2707 {
2708  char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
2709  DIR *spc_dir;
2710  struct dirent *spc_de;
2711 
2712  /*
2713  * First process temp files in pg_default ($PGDATA/base)
2714  */
2715  snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
2716  RemovePgTempFilesInDir(temp_path);
2717  RemovePgTempRelationFiles("base");
2718 
2719  /*
2720  * Cycle through temp directories for all non-default tablespaces.
2721  */
2722  spc_dir = AllocateDir("pg_tblspc");
2723 
2724  while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL)
2725  {
2726  if (strcmp(spc_de->d_name, ".") == 0 ||
2727  strcmp(spc_de->d_name, "..") == 0)
2728  continue;
2729 
2730  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
2732  RemovePgTempFilesInDir(temp_path);
2733 
2734  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
2736  RemovePgTempRelationFiles(temp_path);
2737  }
2738 
2739  FreeDir(spc_dir);
2740 
2741  /*
2742  * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
2743  * DataDir as well.
2744  */
2745 #ifdef EXEC_BACKEND
2747 #endif
2748 }
2749 
2750 /* Process one pgsql_tmp directory for RemovePgTempFiles */
2751 static void
2752 RemovePgTempFilesInDir(const char *tmpdirname)
2753 {
2754  DIR *temp_dir;
2755  struct dirent *temp_de;
2756  char rm_path[MAXPGPATH * 2];
2757 
2758  temp_dir = AllocateDir(tmpdirname);
2759  if (temp_dir == NULL)
2760  {
2761  /* anything except ENOENT is fishy */
2762  if (errno != ENOENT)
2763  elog(LOG,
2764  "could not open temporary-files directory \"%s\": %m",
2765  tmpdirname);
2766  return;
2767  }
2768 
2769  while ((temp_de = ReadDir(temp_dir, tmpdirname)) != NULL)
2770  {
2771  if (strcmp(temp_de->d_name, ".") == 0 ||
2772  strcmp(temp_de->d_name, "..") == 0)
2773  continue;
2774 
2775  snprintf(rm_path, sizeof(rm_path), "%s/%s",
2776  tmpdirname, temp_de->d_name);
2777 
2778  if (strncmp(temp_de->d_name,
2780  strlen(PG_TEMP_FILE_PREFIX)) == 0)
2781  unlink(rm_path); /* note we ignore any error */
2782  else
2783  elog(LOG,
2784  "unexpected file found in temporary-files directory: \"%s\"",
2785  rm_path);
2786  }
2787 
2788  FreeDir(temp_dir);
2789 }
2790 
2791 /* Process one tablespace directory, look for per-DB subdirectories */
2792 static void
2793 RemovePgTempRelationFiles(const char *tsdirname)
2794 {
2795  DIR *ts_dir;
2796  struct dirent *de;
2797  char dbspace_path[MAXPGPATH * 2];
2798 
2799  ts_dir = AllocateDir(tsdirname);
2800  if (ts_dir == NULL)
2801  {
2802  /* anything except ENOENT is fishy */
2803  if (errno != ENOENT)
2804  elog(LOG,
2805  "could not open tablespace directory \"%s\": %m",
2806  tsdirname);
2807  return;
2808  }
2809 
2810  while ((de = ReadDir(ts_dir, tsdirname)) != NULL)
2811  {
2812  int i = 0;
2813 
2814  /*
2815  * We're only interested in the per-database directories, which have
2816  * numeric names. Note that this code will also (properly) ignore "."
2817  * and "..".
2818  */
2819  while (isdigit((unsigned char) de->d_name[i]))
2820  ++i;
2821  if (de->d_name[i] != '\0' || i == 0)
2822  continue;
2823 
2824  snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
2825  tsdirname, de->d_name);
2826  RemovePgTempRelationFilesInDbspace(dbspace_path);
2827  }
2828 
2829  FreeDir(ts_dir);
2830 }
2831 
2832 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
2833 static void
2834 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
2835 {
2836  DIR *dbspace_dir;
2837  struct dirent *de;
2838  char rm_path[MAXPGPATH * 2];
2839 
2840  dbspace_dir = AllocateDir(dbspacedirname);
2841  if (dbspace_dir == NULL)
2842  {
2843  /* we just saw this directory, so it really ought to be there */
2844  elog(LOG,
2845  "could not open dbspace directory \"%s\": %m",
2846  dbspacedirname);
2847  return;
2848  }
2849 
2850  while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
2851  {
2852  if (!looks_like_temp_rel_name(de->d_name))
2853  continue;
2854 
2855  snprintf(rm_path, sizeof(rm_path), "%s/%s",
2856  dbspacedirname, de->d_name);
2857 
2858  unlink(rm_path); /* note we ignore any error */
2859  }
2860 
2861  FreeDir(dbspace_dir);
2862 }
2863 
/*
 * Does "name" look like the file name of a temporary relation?
 *
 * Accepted forms are t<digits>_<digits>, optionally followed by
 * _<forkname>, optionally followed by .<digits> (a segment number).
 * Nothing may follow.
 */
static bool
looks_like_temp_rel_name(const char *name)
{
	int			pos;
	int			savepos;

	/* Must start with "t". */
	if (name[0] != 't')
		return false;

	/* Followed by a non-empty string of digits and then an underscore. */
	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (pos == 1 || name[pos] != '_')
		return false;

	/* Followed by another nonempty string of digits. */
	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (savepos == pos)
		return false;

	/* We might have _forkname or .segment or both. */
	if (name[pos] == '_')
	{
		int			forkchar = forkname_chars(&name[pos + 1], NULL);

		if (forkchar <= 0)
			return false;
		/* step over the underscore plus the fork name */
		pos += forkchar + 1;
	}
	if (name[pos] == '.')
	{
		int			segchar;

		/* segchar counts the dot plus the digits that follow it */
		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
			;
		if (segchar <= 1)
			return false;
		pos += segchar;
	}

	/* Now we should be at the end. */
	if (name[pos] != '\0')
		return false;
	return true;
}
2912 
2913 
2914 /*
2915  * Issue fsync recursively on PGDATA and all its contents.
2916  *
2917  * We fsync regular files and directories wherever they are, but we
2918  * follow symlinks only for pg_wal and immediately under pg_tblspc.
2919  * Other symlinks are presumed to point at files we're not responsible
2920  * for fsyncing, and might not have privileges to write at all.
2921  *
2922  * Errors are logged but not considered fatal; that's because this is used
2923  * only during database startup, to deal with the possibility that there are
2924  * issued-but-unsynced writes pending against the data directory. We want to
2925  * ensure that such writes reach disk before anything that's done in the new
2926  * run. However, aborting on error would result in failure to start for
2927  * harmless cases such as read-only files in the data directory, and that's
2928  * not good either.
2929  *
2930  * Note we assume we're chdir'd into PGDATA to begin with.
2931  */
2932 void
2934 {
2935  bool xlog_is_symlink;
2936 
2937  /* We can skip this whole thing if fsync is disabled. */
2938  if (!enableFsync)
2939  return;
2940 
2941  /*
2942  * If pg_wal is a symlink, we'll need to recurse into it separately,
2943  * because the first walkdir below will ignore it.
2944  */
2945  xlog_is_symlink = false;
2946 
2947 #ifndef WIN32
2948  {
2949  struct stat st;
2950 
2951  if (lstat("pg_wal", &st) < 0)
2952  ereport(LOG,
2954  errmsg("could not stat file \"%s\": %m",
2955  "pg_wal")));
2956  else if (S_ISLNK(st.st_mode))
2957  xlog_is_symlink = true;
2958  }
2959 #else
2960  if (pgwin32_is_junction("pg_wal"))
2961  xlog_is_symlink = true;
2962 #endif
2963 
2964  /*
2965  * If possible, hint to the kernel that we're soon going to fsync the data
2966  * directory and its contents. Errors in this step are even less
2967  * interesting than normal, so log them only at DEBUG1.
2968  */
2969 #ifdef PG_FLUSH_DATA_WORKS
2970  walkdir(".", pre_sync_fname, false, DEBUG1);
2971  if (xlog_is_symlink)
2972  walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
2973  walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
2974 #endif
2975 
2976  /*
2977  * Now we do the fsync()s in the same order.
2978  *
2979  * The main call ignores symlinks, so in addition to specially processing
2980  * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
2981  * process_symlinks = true. Note that if there are any plain directories
2982  * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
2983  * so we don't worry about optimizing it.
2984  */
2985  walkdir(".", datadir_fsync_fname, false, LOG);
2986  if (xlog_is_symlink)
2987  walkdir("pg_wal", datadir_fsync_fname, false, LOG);
2988  walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
2989 }
2990 
2991 /*
2992  * walkdir: recursively walk a directory, applying the action to each
2993  * regular file and directory (including the named directory itself).
2994  *
2995  * If process_symlinks is true, the action and recursion are also applied
2996  * to regular files and directories that are pointed to by symlinks in the
2997  * given directory; otherwise symlinks are ignored. Symlinks are always
2998  * ignored in subdirectories, ie we intentionally don't pass down the
2999  * process_symlinks flag to recursive calls.
3000  *
3001  * Errors are reported at level elevel, which might be ERROR or less.
3002  *
3003  * See also walkdir in initdb.c, which is a frontend version of this logic.
3004  */
3005 static void
3006 walkdir(const char *path,
3007  void (*action) (const char *fname, bool isdir, int elevel),
3008  bool process_symlinks,
3009  int elevel)
3010 {
3011  DIR *dir;
3012  struct dirent *de;
3013 
3014  dir = AllocateDir(path);
3015  if (dir == NULL)
3016  {
3017  ereport(elevel,
3019  errmsg("could not open directory \"%s\": %m", path)));
3020  return;
3021  }
3022 
3023  while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3024  {
3025  char subpath[MAXPGPATH * 2];
3026  struct stat fst;
3027  int sret;
3028 
3030 
3031  if (strcmp(de->d_name, ".") == 0 ||
3032  strcmp(de->d_name, "..") == 0)
3033  continue;
3034 
3035  snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3036 
3037  if (process_symlinks)
3038  sret = stat(subpath, &fst);
3039  else
3040  sret = lstat(subpath, &fst);
3041 
3042  if (sret < 0)
3043  {
3044  ereport(elevel,
3046  errmsg("could not stat file \"%s\": %m", subpath)));
3047  continue;
3048  }
3049 
3050  if (S_ISREG(fst.st_mode))
3051  (*action) (subpath, false, elevel);
3052  else if (S_ISDIR(fst.st_mode))
3053  walkdir(subpath, action, false, elevel);
3054  }
3055 
3056  FreeDir(dir); /* we ignore any error here */
3057 
3058  /*
3059  * It's important to fsync the destination directory itself as individual
3060  * file fsyncs don't guarantee that the directory entry for the file is
3061  * synced.
3062  */
3063  (*action) (path, true, elevel);
3064 }
3065 
3066 
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Ignores errors trying to open unreadable files, and logs other errors at a
 * caller-specified level.
 *
 * Has the callback signature expected by walkdir.  Directories (isdir true)
 * are skipped.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	fd = OpenTransientFile((char *) fname, O_RDONLY | PG_BINARY, 0);

	if (fd < 0)
	{
		/* an unreadable file is skipped without complaint */
		if (errno == EACCES)
			return;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	(void) CloseTransientFile(fd);
}

#endif							/* PG_FLUSH_DATA_WORKS */
3106 
/*
 * datadir_fsync_fname --- fsync one file or directory on behalf of walkdir.
 *
 * Errors about unreadable files are to be ignored silently here, so
 * ignore_perm = true is passed to fsync_fname_ext().  The result of
 * fsync_fname_ext() is discarded.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3116 
3117 /*
3118  * fsync_fname_ext -- Try to fsync a file or directory
3119  *
3120  * If ignore_perm is true, ignore errors upon trying to open unreadable
3121  * files. Logs other errors at a caller-specified level.
3122  *
3123  * Returns 0 if the operation succeeded, -1 otherwise.
3124  */
3125 static int
3126 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3127 {
3128  int fd;
3129  int flags;
3130  int returncode;
3131 
3132  /*
3133  * Some OSs require directories to be opened read-only whereas other
3134  * systems don't allow us to fsync files opened read-only; so we need both
3135  * cases here. Using O_RDWR will cause us to fail to fsync files that are
3136  * not writable by our userid, but we assume that's OK.
3137  */
3138  flags = PG_BINARY;
3139  if (!isdir)
3140  flags |= O_RDWR;
3141  else
3142  flags |= O_RDONLY;
3143 
3144  fd = OpenTransientFile((char *) fname, flags, 0);
3145 
3146  /*
3147  * Some OSs don't allow us to open directories at all (Windows returns
3148  * EACCES), just ignore the error in that case. If desired also silently
3149  * ignoring errors about unreadable files. Log others.
3150  */
3151  if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3152  return 0;
3153  else if (fd < 0 && ignore_perm && errno == EACCES)
3154  return 0;
3155  else if (fd < 0)
3156  {
3157  ereport(elevel,
3159  errmsg("could not open file \"%s\": %m", fname)));
3160  return -1;
3161  }
3162 
3163  returncode = pg_fsync(fd);
3164 
3165  /*
3166  * Some OSes don't allow us to fsync directories at all, so we can ignore
3167  * those errors. Anything else needs to be logged.
3168  */
3169  if (returncode != 0 && !(isdir && errno == EBADF))
3170  {
3171  int save_errno;
3172 
3173  /* close file upon error, might not be in transaction context */
3174  save_errno = errno;
3175  (void) CloseTransientFile(fd);
3176  errno = save_errno;
3177 
3178  ereport(elevel,
3180  errmsg("could not fsync file \"%s\": %m", fname)));
3181  return -1;
3182  }
3183 
3184  (void) CloseTransientFile(fd);
3185 
3186  return 0;
3187 }
3188 
3189 /*
3190  * fsync_parent_path -- fsync the parent path of a file or directory
3191  *
3192  * This is aimed at making file operations persistent on disk in case of
3193  * an OS crash or power failure.
3194  */
3195 static int
3196 fsync_parent_path(const char *fname, int elevel)
3197 {
3198  char parentpath[MAXPGPATH];
3199 
3200  strlcpy(parentpath, fname, MAXPGPATH);
3201  get_parent_directory(parentpath);
3202 
3203  /*
3204  * get_parent_directory() returns an empty string if the input argument is
3205  * just a file name (see comments in path.c), so handle that as being the
3206  * current directory.
3207  */
3208  if (strlen(parentpath) == 0)
3209  strlcpy(parentpath, ".", MAXPGPATH);
3210 
3211  if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3212  return -1;
3213 
3214  return 0;
3215 }
File lruLessRecently
Definition: fd.c:183
void closeAllVfds(void)
Definition: fd.c:2499
File nextFree
Definition: fd.c:181
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition: fd.c:797
File PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
Definition: fd.c:1303
#define MAP_FAILED
Definition: mem.h:45
#define DEBUG1
Definition: elog.h:25
int MyProcPid
Definition: globals.c:39
#define NUM_RESERVED_FDS
Definition: fd.c:111
static AllocateDesc * allocatedDescs
Definition: fd.c:245
char * FileName
Definition: fd.h:49
int pg_fdatasync(int fd)
Definition: fd.c:385
static void error(void)
Definition: sql-dyntest.c:147
#define SYNC_METHOD_FSYNC_WRITETHROUGH
Definition: xlog.h:28
AllocateDescKind
Definition: fd.c:223
DIR * dir
Definition: fd.c:238
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition: fd.c:1427
union AllocateDesc::@28 desc
static void AtProcExit_Files(int code, Datum arg)
Definition: fd.c:2631
#define write(a, b, c)
Definition: win32.h:14
static Size SizeVfdCache
Definition: fd.c:198
void on_proc_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:292
#define DO_DB(A)
Definition: fd.c:152
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition: fd.c:3006
long random(void)
Definition: random.c:22
#define mkdir(a, b)
Definition: win32.h:57
ResourceOwner CurrentResourceOwner
Definition: resowner.c:138
int pg_fsync_writethrough(int fd)
Definition: fd.c:362
int forkname_chars(const char *str, ForkNumber *fork)
Definition: relpath.c:79
int max_safe_fds
Definition: fd.c:139
#define Min(x, y)
Definition: c.h:807
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:567
int log_temp_files
Definition: guc.c:454
#define GLOBALTABLESPACE_OID
Definition: pg_tablespace.h:64
static Vfd * VfdCache
Definition: fd.c:197
static void Delete(File file)
Definition: fd.c:984
int closedir(DIR *)
Definition: dirent.c:111
static int numTempTableSpaces
Definition: fd.c:258
int errcode(int sqlerrcode)
Definition: elog.c:575
#define MemSet(start, val, len)
Definition: c.h:858
return result
Definition: formatting.c:1633
int pg_fsync_no_writethrough(int fd)
Definition: fd.c:350
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition: fd.c:2834
int snprintf(char *str, size_t count, const char *fmt,...) pg_attribute_printf(3
#define PG_TEMP_FILE_PREFIX
Definition: fd.h:128
void pgstat_report_tempfile(size_t filesize)
Definition: pgstat.c:1525
static bool reserveAllocatedDesc(void)
Definition: fd.c:2021
uint32 SubTransactionId
Definition: c.h:401
#define LOG
Definition: elog.h:26
static void RemovePgTempFilesInDir(const char *tmpdirname)
Definition: fd.c:2752
unsigned int Oid
Definition: postgres_ext.h:31
#define FilePosIsUnknown(pos)
Definition: fd.c:170
AllocateDescKind kind
Definition: fd.c:233
char * FilePathName(File file)
Definition: fd.c:1974
Definition: dirent.h:9
#define OidIsValid(objectId)
Definition: c.h:538
static int fd(const char *x, int i)
Definition: preproc-init.c:105
#define PG_BINARY
Definition: c.h:1039
Oid MyDatabaseTableSpace
Definition: globals.c:79
int ClosePipeStream(FILE *file)
Definition: fd.c:2470
#define malloc(a)
Definition: header.h:50
static void LruDelete(File file)
Definition: fd.c:1003
int fileMode
Definition: fd.c:189
void pg_usleep(long microsec)
Definition: signal.c:53
bool TempTablespacesAreSet(void)
Definition: fd.c:2553
static int FreeDesc(AllocateDesc *desc)
Definition: fd.c:2238
void pfree(void *pointer)
Definition: mcxt.c:950
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition: fd.c:2793
static bool ReleaseLruFile(void)
Definition: fd.c:1140
Definition: dirent.c:25
#define ERROR
Definition: elog.h:43
static int LruInsert(File file)
Definition: fd.c:1066
void AtEOXact_Files(void)
Definition: fd.c:2617
#define FATAL
Definition: elog.h:52
static bool have_xact_temporary_files
Definition: fd.c:209
#define MAXPGPATH
DIR * opendir(const char *)
Definition: dirent.c:33
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:1830
#define DEBUG2
Definition: elog.h:24
char * fileName
Definition: fd.c:186
static struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2411
static char * buf
Definition: pg_test_fsync.c:66
Oid GetNextTempTableSpace(void)
Definition: fd.c:2565
void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition: resowner.c:1200
static void CleanupTempFiles(bool isProcExit)
Definition: fd.c:2646
#define DEFAULTTABLESPACE_OID
Definition: pg_tablespace.h:63
int OpenTransientFile(FileName fileName, int fileFlags, int fileMode)
Definition: fd.c:2144
int errdetail(const char *fmt,...)
Definition: elog.c:873
int errcode_for_file_access(void)
Definition: elog.c:598
void get_parent_directory(char *path)
Definition: path.c:854
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2094
static int nfile
Definition: fd.c:203
unsigned int uint32
Definition: c.h:268
void SyncDataDirectory(void)
Definition: fd.c:2933
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2335
static int nextTempTableSpace
Definition: fd.c:259
int FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
Definition: fd.c:1709
static void pgstat_report_wait_end(void)
Definition: pgstat.h:1235
int max_files_per_process
Definition: fd.c:126
static File AllocateVfd(void)
Definition: fd.c:1172
FILE * OpenPipeStream(const char *command, const char *mode)
Definition: fd.c:2184
off_t seekPos
Definition: fd.c:184
unsigned short fdstate
Definition: fd.c:179
Definition: fd.c:176
off_t fileSize
Definition: fd.c:185
int fd
Definition: fd.c:178
int unlink(const char *filename)
#define ereport(elevel, rest)
Definition: elog.h:122
int FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
Definition: fd.c:1645
int link(const char *fromname, const char *toname)
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2525
#define fsync(fd)
Definition: win32.h:62
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:593
static void Insert(File file)
Definition: fd.c:1044
ResourceOwner resowner
Definition: fd.c:180
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3108
int CloseTransientFile(int fd)
Definition: fd.c:2305
static void ReleaseLruFiles(void)
Definition: fd.c:1162
#define WARNING
Definition: elog.h:40
#define FileIsNotOpen(file)
Definition: fd.c:161
static int elevel
Definition: vacuumlazy.c:136
#define FD_TEMPORARY
Definition: fd.c:173
#define FD_XACT_TEMPORARY
Definition: fd.c:174
struct vfd Vfd
uintptr_t Datum
Definition: postgres.h:372
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition: fd.c:2586
#define EINTR
Definition: win32.h:285
unsigned int Index
Definition: c.h:365
void pg_flush_data(int fd, off_t offset, off_t nbytes)
Definition: fd.c:407
#define FileIsValid(file)
Definition: fd.c:158
FILE * file
Definition: fd.c:237
static bool looks_like_temp_rel_name(const char *name)
Definition: fd.c:2866
#define InvalidOid
Definition: postgres_ext.h:36
#define VFD_CLOSED
Definition: fd.c:156
static uint64 temporary_files_size
Definition: fd.c:217
#define free(a)
Definition: header.h:65
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
#define ftruncate(a, b)
Definition: win32.h:59
#define PG_TEMP_FILES_DIR
Definition: fd.h:127
void FileClose(File file)
Definition: fd.c:1493
int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
Definition: fd.c:1590
#define NULL
Definition: c.h:229
static int FileAccess(File file)
Definition: fd.c:1250
#define Assert(condition)
Definition: c.h:676
void _dosmaperr(unsigned long)
Definition: win32error.c:171
SubTransactionId GetCurrentSubTransactionId(void)
Definition: xact.c:649
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2401
File lruMoreRecently
Definition: fd.c:182
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:1618
void RemovePgTempFiles(void)
Definition: fd.c:2706
SubTransactionId create_subid
Definition: fd.c:234
WalTimeSample buffer[LAG_TRACKER_BUFFER_SIZE]
Definition: walsender.c:214
File OpenTemporaryFile(bool interXact)
Definition: fd.c:1371
int durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:712
size_t Size
Definition: c.h:356
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: pgstat.h:1211
int sync_method
Definition: xlog.c:103
struct dirent * readdir(DIR *)
Definition: dirent.c:77
#define FD_MINFREE
Definition: fd.c:117
#define TABLESPACE_VERSION_DIRECTORY
Definition: catalog.h:26
#define realloc(a, b)
Definition: header.h:60
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:963
#define INT64_FORMAT
Definition: c.h:315
const char * name
Definition: encode.c:521
static long tempFileCounter
Definition: fd.c:251
int fd
Definition: fd.c:239
int durable_unlink(const char *fname, int elevel)
Definition: fd.c:676
int FreeFile(FILE *file)
Definition: fd.c:2277
void set_max_safe_fds(void)
Definition: fd.c:881
bool enableFsync
Definition: globals.c:111
static Oid * tempTableSpaces
Definition: fd.c:257
void * palloc(Size size)
Definition: mcxt.c:849
int errmsg(const char *fmt,...)
Definition: elog.c:797
int FileGetRawFlags(File file)
Definition: fd.c:2000
void ResourceOwnerEnlargeFiles(ResourceOwner owner)
Definition: resowner.c:1189
int FileGetRawMode(File file)
Definition: fd.c:2010
static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition: fd.c:3126
int i
void * arg
int FileGetRawDesc(File file)
Definition: fd.c:1990
static void FreeVfd(File file)
Definition: fd.c:1230
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:98
int pg_fsync(int fd)
Definition: fd.c:333
char d_name[MAX_PATH]
Definition: dirent.h:14
#define elog
Definition: elog.h:219
#define close(a)
Definition: win32.h:12
int fileFlags
Definition: fd.c:188
off_t FileSeek(File file, off_t offset, int whence)
Definition: fd.c:1851
#define lstat(path, sb)
Definition: win32.h:262
void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition: resowner.c:1209
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:1939
#define FileUnknownPos
Definition: fd.c:169
static int maxAllocatedDescs
Definition: fd.c:244
static int fsync_parent_path(const char *fname, int elevel)
Definition: fd.c:3196
int File
Definition: fd.h:51
#define read(a, b, c)
Definition: win32.h:13
int FreeDir(DIR *dir)
Definition: fd.c:2444
int temp_file_limit
Definition: guc.c:457
Datum subpath(PG_FUNCTION_ARGS)
Definition: ltree_op.c:234
void InitFileAccess(void)
Definition: fd.c:764
static int numAllocatedDescs
Definition: fd.c:243
int BasicOpenFile(FileName fileName, int fileFlags, int fileMode)
Definition: fd.c:936