PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
fd.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  * Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have. (This is around 256 on many modern
20  * operating systems, but can be as low as 32 on others.)
21  *
22  * VFDs are managed as an LRU pool, with actual OS file descriptors
23  * being opened and closed as needed. Obviously, if a file is
24  * opened using these interfaces, all subsequent operations must also
25  * be through these interfaces (the File type is not a real file
26  * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
39  * for a long time, like relation files. It is the caller's responsibility
40  * to close them, there is no automatic mechanism in fd.c for that.
41  *
42  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
43  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
44  * They behave like the corresponding native functions, except that the handle
45  * is registered with the current subtransaction, and will be automatically
46  * closed at abort. These are intended mainly for short operations like
47  * reading a configuration file; there is a limit on the number of files that
48  * can be opened using these functions at any one time.
49  *
50  * Finally, BasicOpenFile is just a thin wrapper around open() that can
51  * release file descriptors in use by the virtual file descriptors if
52  * necessary. There is no automatic cleanup of file descriptors returned by
53  * BasicOpenFile, it is solely the caller's responsibility to close the file
54  * descriptor by calling close(2).
55  *
56  *-------------------------------------------------------------------------
57  */
58 
59 #include "postgres.h"
60 
61 #include <sys/file.h>
62 #include <sys/param.h>
63 #include <sys/stat.h>
64 #ifndef WIN32
65 #include <sys/mman.h>
66 #endif
67 #include <limits.h>
68 #include <unistd.h>
69 #include <fcntl.h>
70 #ifdef HAVE_SYS_RESOURCE_H
71 #include <sys/resource.h> /* for getrlimit */
72 #endif
73 
74 #include "miscadmin.h"
75 #include "access/xact.h"
76 #include "access/xlog.h"
77 #include "catalog/catalog.h"
78 #include "catalog/pg_tablespace.h"
79 #include "pgstat.h"
80 #include "portability/mem.h"
81 #include "storage/fd.h"
82 #include "storage/ipc.h"
83 #include "utils/guc.h"
84 #include "utils/resowner_private.h"
85 
86 
87 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
88 #if defined(HAVE_SYNC_FILE_RANGE)
89 #define PG_FLUSH_DATA_WORKS 1
90 #elif !defined(WIN32) && defined(MS_ASYNC)
91 #define PG_FLUSH_DATA_WORKS 1
92 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
93 #define PG_FLUSH_DATA_WORKS 1
94 #endif
95 
96 /*
97  * We must leave some file descriptors free for system(), the dynamic loader,
98  * and other code that tries to open files without consulting fd.c. This
99  * is the number left free. (While we can be pretty sure we won't get
100  * EMFILE, there's never any guarantee that we won't get ENFILE due to
101  * other processes chewing up FDs. So it's a bad idea to try to open files
102  * without consulting fd.c. Nonetheless we cannot control all code.)
103  *
104  * Because this is just a fixed setting, we are effectively assuming that
105  * no such code will leave FDs open over the long term; otherwise the slop
106  * is likely to be insufficient. Note in particular that we expect that
107  * loading a shared library does not result in any permanent increase in
108  * the number of open files. (This appears to be true on most if not
109  * all platforms as of Feb 2004.)
110  */
111 #define NUM_RESERVED_FDS 10
112 
113 /*
114  * If we have fewer than this many usable FDs after allowing for the reserved
115  * ones, choke.
116  */
117 #define FD_MINFREE 10
118 
119 /*
120  * Default mode for created files, unless something else is specified using
121  * the *Perm() function variants.
122  */
123 #define PG_FILE_MODE_DEFAULT (S_IRUSR | S_IWUSR)
124 
125 /*
126  * A number of platforms allow individual processes to open many more files
127  * than they can really support when *many* processes do the same thing.
128  * This GUC parameter lets the DBA limit max_safe_fds to something less than
129  * what the postmaster's initial probe suggests will work.
130  */
132 
133 /*
134  * Maximum number of file descriptors to open for either VFD entries or
135  * AllocateFile/AllocateDir/OpenTransientFile operations. This is initialized
136  * to a conservative value, and remains that way indefinitely in bootstrap or
137  * standalone-backend cases. In normal postmaster operation, the postmaster
138  * calls set_max_safe_fds() late in initialization to update the value, and
139  * that value is then inherited by forked subprocesses.
140  *
141  * Note: the value of max_files_per_process is taken into account while
142  * setting this variable, and so need not be tested separately.
143  */
144 int max_safe_fds = 32; /* default if not changed */
145 
146 
147 /* Debugging.... */
148 
149 #ifdef FDDEBUG
150 #define DO_DB(A) \
151  do { \
152  int _do_db_save_errno = errno; \
153  A; \
154  errno = _do_db_save_errno; \
155  } while (0)
156 #else
157 #define DO_DB(A) \
158  ((void) 0)
159 #endif
160 
161 #define VFD_CLOSED (-1)
162 
163 #define FileIsValid(file) \
164  ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
165 
166 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
167 
168 /*
169  * Note: a VFD's seekPos is normally always valid, but if for some reason
170  * an lseek() fails, it might become set to FileUnknownPos. We can struggle
171  * along without knowing the seek position in many cases, but in some places
172  * we have to fail if we don't have it.
173  */
174 #define FileUnknownPos ((off_t) -1)
175 #define FilePosIsUnknown(pos) ((pos) < 0)
176 
177 /* these are the assigned bits in fdstate below: */
178 #define FD_TEMPORARY (1 << 0) /* T = delete when closed */
179 #define FD_XACT_TEMPORARY (1 << 1) /* T = delete at eoXact */
180 
181 typedef struct vfd
182 {
183  int fd; /* current FD, or VFD_CLOSED if none */
184  unsigned short fdstate; /* bitflags for VFD's state */
185  ResourceOwner resowner; /* owner, for automatic cleanup */
186  File nextFree; /* link to next free VFD, if in freelist */
187  File lruMoreRecently; /* doubly linked recency-of-use list */
189  off_t seekPos; /* current logical file position, or -1 */
190  off_t fileSize; /* current size of file (0 if not temporary) */
191  char *fileName; /* name of file, or NULL for unused VFD */
192  /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
193  int fileFlags; /* open(2) flags for (re)opening the file */
194  mode_t fileMode; /* mode to pass to open(2) */
195 } Vfd;
196 
197 /*
198  * Virtual File Descriptor array pointer and size. This grows as
199  * needed. 'File' values are indexes into this array.
200  * Note that VfdCache[0] is not a usable VFD, just a list header.
201  */
202 static Vfd *VfdCache;
203 static Size SizeVfdCache = 0;
204 
205 /*
206  * Number of file descriptors known to be in use by VFD entries.
207  */
208 static int nfile = 0;
209 
210 /*
211  * Flag to tell whether it's worth scanning VfdCache looking for temp files
212  * to close
213  */
214 static bool have_xact_temporary_files = false;
215 
216 /*
217  * Tracks the total size of all temporary files. Note: when temp_file_limit
218  * is being enforced, this cannot overflow since the limit cannot be more
219  * than INT_MAX kilobytes. When not enforcing, it could theoretically
220  * overflow, but we don't care.
221  */
222 static uint64 temporary_files_size = 0;
223 
224 /*
225  * List of OS handles opened with AllocateFile, AllocateDir and
226  * OpenTransientFile.
227  */
228 typedef enum
229 {
235 
236 typedef struct
237 {
240  union
241  {
242  FILE *file;
244  int fd;
245  } desc;
246 } AllocateDesc;
247 
248 static int numAllocatedDescs = 0;
249 static int maxAllocatedDescs = 0;
251 
252 /*
253  * Number of temporary files opened during the current session;
254  * this is used in generation of tempfile names.
255  */
256 static long tempFileCounter = 0;
257 
258 /*
259  * Array of OIDs of temp tablespaces. When numTempTableSpaces is -1,
260  * this has not been set in the current transaction.
261  */
262 static Oid *tempTableSpaces = NULL;
263 static int numTempTableSpaces = -1;
264 static int nextTempTableSpace = 0;
265 
266 
267 /*--------------------
268  *
269  * Private Routines
270  *
271  * Delete - delete a file from the Lru ring
272  * LruDelete - remove a file from the Lru ring and close its FD
273  * Insert - put a file at the front of the Lru ring
274  * LruInsert - put a file at the front of the Lru ring and open it
275  * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
276  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
277  * AllocateVfd - grab a free (or new) file record (from VfdCache)
278  * FreeVfd - free a file record
279  *
280  * The Least Recently Used ring is a doubly linked list that begins and
281  * ends on element zero. Element zero is special -- it doesn't represent
282  * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
283  * anchor that shows us the beginning/end of the ring.
284  * Only VFD elements that are currently really open (have an FD assigned) are
285  * in the Lru ring. Elements that are "virtually" open can be recognized
286  * by having a non-null fileName field.
287  *
288  * example:
289  *
290  * /--less----\ /---------\
291  * v \ v \
292  * #0 --more---> LeastRecentlyUsed --more-\ \
293  * ^\ | |
294  * \\less--> MostRecentlyUsedFile <---/ |
295  * \more---/ \--less--/
296  *
297  *--------------------
298  */
299 static void Delete(File file);
300 static void LruDelete(File file);
301 static void Insert(File file);
302 static int LruInsert(File file);
303 static bool ReleaseLruFile(void);
304 static void ReleaseLruFiles(void);
305 static File AllocateVfd(void);
306 static void FreeVfd(File file);
307 
308 static int FileAccess(File file);
309 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
310 static bool reserveAllocatedDesc(void);
311 static int FreeDesc(AllocateDesc *desc);
312 static struct dirent *ReadDirExtended(DIR *dir, const char *dirname, int elevel);
313 
314 static void AtProcExit_Files(int code, Datum arg);
315 static void CleanupTempFiles(bool isProcExit);
316 static void RemovePgTempFilesInDir(const char *tmpdirname);
317 static void RemovePgTempRelationFiles(const char *tsdirname);
318 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
319 static bool looks_like_temp_rel_name(const char *name);
320 
321 static void walkdir(const char *path,
322  void (*action) (const char *fname, bool isdir, int elevel),
323  bool process_symlinks,
324  int elevel);
325 #ifdef PG_FLUSH_DATA_WORKS
326 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
327 #endif
328 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
329 
330 static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
331 static int fsync_parent_path(const char *fname, int elevel);
332 
333 
334 /*
335  * pg_fsync --- do fsync with or without writethrough
336  */
337 int
339 {
340  /* #if is to skip the sync_method test if there's no need for it */
341 #if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
343  return pg_fsync_writethrough(fd);
344  else
345 #endif
346  return pg_fsync_no_writethrough(fd);
347 }
348 
349 
350 /*
351  * pg_fsync_no_writethrough --- same as fsync except does nothing if
352  * enableFsync is off
353  */
354 int
356 {
357  if (enableFsync)
358  return fsync(fd);
359  else
360  return 0;
361 }
362 
363 /*
364  * pg_fsync_writethrough
365  */
366 int
368 {
369  if (enableFsync)
370  {
371 #ifdef WIN32
372  return _commit(fd);
373 #elif defined(F_FULLFSYNC)
374  return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
375 #else
376  errno = ENOSYS;
377  return -1;
378 #endif
379  }
380  else
381  return 0;
382 }
383 
384 /*
385  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
386  *
387  * Not all platforms have fdatasync; treat as fsync if not available.
388  */
389 int
391 {
392  if (enableFsync)
393  {
394 #ifdef HAVE_FDATASYNC
395  return fdatasync(fd);
396 #else
397  return fsync(fd);
398 #endif
399  }
400  else
401  return 0;
402 }
403 
404 /*
405  * pg_flush_data --- advise OS that the described dirty data should be flushed
406  *
407  * offset of 0 with nbytes 0 means that the entire file should be flushed;
408  * in this case, this function may have side-effects on the file's
409  * seek position!
410  */
411 void
412 pg_flush_data(int fd, off_t offset, off_t nbytes)
413 {
414  /*
415  * Right now file flushing is primarily used to make later
416  * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
417  * if fsyncs are disabled - that's a decision we might want to make
418  * configurable at some point.
419  */
420  if (!enableFsync)
421  return;
422 
423  /*
424  * We compile all alternatives that are supported on the current platform,
425  * to find portability problems more easily.
426  */
427 #if defined(HAVE_SYNC_FILE_RANGE)
428  {
429  int rc;
430 
431  /*
432  * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
433  * tells the OS that writeback for the specified blocks should be
434  * started, but that we don't want to wait for completion. Note that
435  * this call might block if too much dirty data exists in the range.
436  * This is the preferable method on OSs supporting it, as it works
437  * reliably when available (contrast to msync()) and doesn't flush out
438  * clean data (like FADV_DONTNEED).
439  */
440  rc = sync_file_range(fd, offset, nbytes,
441  SYNC_FILE_RANGE_WRITE);
442 
443  /* don't error out, this is just a performance optimization */
444  if (rc != 0)
445  {
448  errmsg("could not flush dirty data: %m")));
449  }
450 
451  return;
452  }
453 #endif
454 #if !defined(WIN32) && defined(MS_ASYNC)
455  {
456  void *p;
457  static int pagesize = 0;
458 
459  /*
460  * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
461  * writeback. On linux it only does so if MS_SYNC is specified, but
462  * then it does the writeback synchronously. Luckily all common linux
463  * systems have sync_file_range(). This is preferable over
464  * FADV_DONTNEED because it doesn't flush out clean data.
465  *
466  * We map the file (mmap()), tell the kernel to sync back the contents
467  * (msync()), and then remove the mapping again (munmap()).
468  */
469 
470  /* mmap() needs actual length if we want to map whole file */
471  if (offset == 0 && nbytes == 0)
472  {
473  nbytes = lseek(fd, 0, SEEK_END);
474  if (nbytes < 0)
475  {
478  errmsg("could not determine dirty data size: %m")));
479  return;
480  }
481  }
482 
483  /*
484  * Some platforms reject partial-page mmap() attempts. To deal with
485  * that, just truncate the request to a page boundary. If any extra
486  * bytes don't get flushed, well, it's only a hint anyway.
487  */
488 
489  /* fetch pagesize only once */
490  if (pagesize == 0)
491  pagesize = sysconf(_SC_PAGESIZE);
492 
493  /* align length to pagesize, dropping any fractional page */
494  if (pagesize > 0)
495  nbytes = (nbytes / pagesize) * pagesize;
496 
497  /* fractional-page request is a no-op */
498  if (nbytes <= 0)
499  return;
500 
501  /*
502  * mmap could well fail, particularly on 32-bit platforms where there
503  * may simply not be enough address space. If so, silently fall
504  * through to the next implementation.
505  */
506  if (nbytes <= (off_t) SSIZE_MAX)
507  p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
508  else
509  p = MAP_FAILED;
510 
511  if (p != MAP_FAILED)
512  {
513  int rc;
514 
515  rc = msync(p, (size_t) nbytes, MS_ASYNC);
516  if (rc != 0)
517  {
520  errmsg("could not flush dirty data: %m")));
521  /* NB: need to fall through to munmap()! */
522  }
523 
524  rc = munmap(p, (size_t) nbytes);
525  if (rc != 0)
526  {
527  /* FATAL error because mapping would remain */
528  ereport(FATAL,
530  errmsg("could not munmap() while flushing data: %m")));
531  }
532 
533  return;
534  }
535  }
536 #endif
537 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
538  {
539  int rc;
540 
541  /*
542  * Signal the kernel that the passed in range should not be cached
543  * anymore. This has the, desired, side effect of writing out dirty
544  * data, and the, undesired, side effect of likely discarding useful
545  * clean cached blocks. For the latter reason this is the least
546  * preferable method.
547  */
548 
549  rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
550 
551  if (rc != 0)
552  {
553  /* don't error out, this is just a performance optimization */
556  errmsg("could not flush dirty data: %m")));
557  }
558 
559  return;
560  }
561 #endif
562 }
563 
564 
565 /*
566  * fsync_fname -- fsync a file or directory, handling errors properly
567  *
568  * Try to fsync a file or directory. When doing the latter, ignore errors that
569  * indicate the OS just doesn't allow/require fsyncing directories.
570  */
571 void
572 fsync_fname(const char *fname, bool isdir)
573 {
574  fsync_fname_ext(fname, isdir, false, ERROR);
575 }
576 
577 /*
578  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
579  *
580  * This routine ensures that, after returning, the effect of renaming file
581  * persists in case of a crash. A crash while this routine is running will
582  * leave you with either the pre-existing or the moved file in place of the
583  * new file; no mixed state or truncated files are possible.
584  *
585  * It does so by using fsync on the old filename and the possibly existing
586  * target filename before the rename, and the target file and directory after.
587  *
588  * Note that rename() cannot be used across arbitrary directories, as they
589  * might not be on the same filesystem. Therefore this routine does not
590  * support renaming across directories.
591  *
592  * Log errors with the caller specified severity.
593  *
594  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
595  * valid upon return.
596  */
597 int
598 durable_rename(const char *oldfile, const char *newfile, int elevel)
599 {
600  int fd;
601 
602  /*
603  * First fsync the old and target path (if it exists), to ensure that they
604  * are properly persistent on disk. Syncing the target file is not
605  * strictly necessary, but it makes it easier to reason about crashes;
606  * because it's then guaranteed that either source or target file exists
607  * after a crash.
608  */
609  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
610  return -1;
611 
612  fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
613  if (fd < 0)
614  {
615  if (errno != ENOENT)
616  {
617  ereport(elevel,
619  errmsg("could not open file \"%s\": %m", newfile)));
620  return -1;
621  }
622  }
623  else
624  {
625  if (pg_fsync(fd) != 0)
626  {
627  int save_errno;
628 
629  /* close file upon error, might not be in transaction context */
630  save_errno = errno;
631  CloseTransientFile(fd);
632  errno = save_errno;
633 
634  ereport(elevel,
636  errmsg("could not fsync file \"%s\": %m", newfile)));
637  return -1;
638  }
639  CloseTransientFile(fd);
640  }
641 
642  /* Time to do the real deal... */
643  if (rename(oldfile, newfile) < 0)
644  {
645  ereport(elevel,
647  errmsg("could not rename file \"%s\" to \"%s\": %m",
648  oldfile, newfile)));
649  return -1;
650  }
651 
652  /*
653  * To guarantee renaming the file is persistent, fsync the file with its
654  * new name, and its containing directory.
655  */
656  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
657  return -1;
658 
659  if (fsync_parent_path(newfile, elevel) != 0)
660  return -1;
661 
662  return 0;
663 }
664 
665 /*
666  * durable_unlink -- remove a file in a durable manner
667  *
668  * This routine ensures that, after returning, the effect of removing file
669  * persists in case of a crash. A crash while this routine is running will
670  * leave the system in no mixed state.
671  *
672  * It does so by using fsync on the parent directory of the file after the
673  * actual removal is done.
674  *
675  * Log errors with the severity specified by caller.
676  *
677  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
678  * valid upon return.
679  */
680 int
681 durable_unlink(const char *fname, int elevel)
682 {
683  if (unlink(fname) < 0)
684  {
685  ereport(elevel,
687  errmsg("could not remove file \"%s\": %m",
688  fname)));
689  return -1;
690  }
691 
692  /*
693  * To guarantee that the removal of the file is persistent, fsync its
694  * parent directory.
695  */
696  if (fsync_parent_path(fname, elevel) != 0)
697  return -1;
698 
699  return 0;
700 }
701 
702 /*
703  * durable_link_or_rename -- rename a file in a durable manner.
704  *
705  * Similar to durable_rename(), except that this routine tries (but does not
706  * guarantee) not to overwrite the target file.
707  *
708  * Note that a crash in an unfortunate moment can leave you with two links to
709  * the target file.
710  *
711  * Log errors with the caller specified severity.
712  *
713  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
714  * valid upon return.
715  */
716 int
717 durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
718 {
719  /*
720  * Ensure that, if we crash directly after the rename/link, a file with
721  * valid contents is moved into place.
722  */
723  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
724  return -1;
725 
726 #if HAVE_WORKING_LINK
727  if (link(oldfile, newfile) < 0)
728  {
729  ereport(elevel,
731  errmsg("could not link file \"%s\" to \"%s\": %m",
732  oldfile, newfile)));
733  return -1;
734  }
735  unlink(oldfile);
736 #else
737  /* XXX: Add racy file existence check? */
738  if (rename(oldfile, newfile) < 0)
739  {
740  ereport(elevel,
742  errmsg("could not rename file \"%s\" to \"%s\": %m",
743  oldfile, newfile)));
744  return -1;
745  }
746 #endif
747 
748  /*
749  * Make change persistent in case of an OS crash, both the new entry and
750  * its parent directory need to be flushed.
751  */
752  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
753  return -1;
754 
755  /* Same for parent directory */
756  if (fsync_parent_path(newfile, elevel) != 0)
757  return -1;
758 
759  return 0;
760 }
761 
762 /*
763  * InitFileAccess --- initialize this module during backend startup
764  *
765  * This is called during either normal or standalone backend start.
766  * It is *not* called in the postmaster.
767  */
768 void
770 {
771  Assert(SizeVfdCache == 0); /* call me only once */
772 
773  /* initialize cache header entry */
774  VfdCache = (Vfd *) malloc(sizeof(Vfd));
775  if (VfdCache == NULL)
776  ereport(FATAL,
777  (errcode(ERRCODE_OUT_OF_MEMORY),
778  errmsg("out of memory")));
779 
780  MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
781  VfdCache->fd = VFD_CLOSED;
782 
783  SizeVfdCache = 1;
784 
785  /* register proc-exit hook to ensure temp files are dropped at exit */
787 }
788 
789 /*
790  * count_usable_fds --- count how many FDs the system will let us open,
791  * and estimate how many are already open.
792  *
793  * We stop counting if usable_fds reaches max_to_probe. Note: a small
794  * value of max_to_probe might result in an underestimate of already_open;
795  * we must fill in any "gaps" in the set of used FDs before the calculation
796  * of already_open will give the right answer. In practice, max_to_probe
797  * of a couple of dozen should be enough to ensure good results.
798  *
799  * We assume stdin (FD 0) is available for dup'ing
800  */
801 static void
802 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
803 {
804  int *fd;
805  int size;
806  int used = 0;
807  int highestfd = 0;
808  int j;
809 
810 #ifdef HAVE_GETRLIMIT
811  struct rlimit rlim;
812  int getrlimit_status;
813 #endif
814 
815  size = 1024;
816  fd = (int *) palloc(size * sizeof(int));
817 
818 #ifdef HAVE_GETRLIMIT
819 #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
820  getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
821 #else /* but BSD doesn't ... */
822  getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
823 #endif /* RLIMIT_NOFILE */
824  if (getrlimit_status != 0)
825  ereport(WARNING, (errmsg("getrlimit failed: %m")));
826 #endif /* HAVE_GETRLIMIT */
827 
828  /* dup until failure or probe limit reached */
829  for (;;)
830  {
831  int thisfd;
832 
833 #ifdef HAVE_GETRLIMIT
834 
835  /*
836  * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
837  * some platforms
838  */
839  if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
840  break;
841 #endif
842 
843  thisfd = dup(0);
844  if (thisfd < 0)
845  {
846  /* Expect EMFILE or ENFILE, else it's fishy */
847  if (errno != EMFILE && errno != ENFILE)
848  elog(WARNING, "dup(0) failed after %d successes: %m", used);
849  break;
850  }
851 
852  if (used >= size)
853  {
854  size *= 2;
855  fd = (int *) repalloc(fd, size * sizeof(int));
856  }
857  fd[used++] = thisfd;
858 
859  if (highestfd < thisfd)
860  highestfd = thisfd;
861 
862  if (used >= max_to_probe)
863  break;
864  }
865 
866  /* release the files we opened */
867  for (j = 0; j < used; j++)
868  close(fd[j]);
869 
870  pfree(fd);
871 
872  /*
873  * Return results. usable_fds is just the number of successful dups. We
874  * assume that the system limit is highestfd+1 (remember 0 is a legal FD
875  * number) and so already_open is highestfd+1 - usable_fds.
876  */
877  *usable_fds = used;
878  *already_open = highestfd + 1 - used;
879 }
880 
881 /*
882  * set_max_safe_fds
883  * Determine number of filedescriptors that fd.c is allowed to use
884  */
885 void
887 {
888  int usable_fds;
889  int already_open;
890 
891  /*----------
892  * We want to set max_safe_fds to
893  * MIN(usable_fds, max_files_per_process - already_open)
894  * less the slop factor for files that are opened without consulting
895  * fd.c. This ensures that we won't exceed either max_files_per_process
896  * or the experimentally-determined EMFILE limit.
897  *----------
898  */
900  &usable_fds, &already_open);
901 
902  max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
903 
904  /*
905  * Take off the FDs reserved for system() etc.
906  */
908 
909  /*
910  * Make sure we still have enough to get by.
911  */
912  if (max_safe_fds < FD_MINFREE)
913  ereport(FATAL,
914  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
915  errmsg("insufficient file descriptors available to start server process"),
916  errdetail("System allows %d, we need at least %d.",
919 
920  elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
921  max_safe_fds, usable_fds, already_open);
922 }
923 
924 /*
925  * Open a file with BasicOpenFilePerm() and pass default file mode for the
926  * fileMode parameter.
927  */
928 int
929 BasicOpenFile(const char *fileName, int fileFlags)
930 {
931  return BasicOpenFilePerm(fileName, fileFlags, PG_FILE_MODE_DEFAULT);
932 }
933 
934 /*
935  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
936  *
937  * This is exported for use by places that really want a plain kernel FD,
938  * but need to be proof against running out of FDs. Once an FD has been
939  * successfully returned, it is the caller's responsibility to ensure that
940  * it will not be leaked on ereport()! Most users should *not* call this
941  * routine directly, but instead use the VFD abstraction level, which
942  * provides protection against descriptor leaks as well as management of
943  * files that need to be open for more than a short period of time.
944  *
945  * Ideally this should be the *only* direct call of open() in the backend.
946  * In practice, the postmaster calls open() directly, and there are some
947  * direct open() calls done early in backend startup. Those are OK since
948  * this module wouldn't have any open files to close at that point anyway.
949  */
950 int
951 BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
952 {
953  int fd;
954 
955 tryAgain:
956  fd = open(fileName, fileFlags, fileMode);
957 
958  if (fd >= 0)
959  return fd; /* success! */
960 
961  if (errno == EMFILE || errno == ENFILE)
962  {
963  int save_errno = errno;
964 
965  ereport(LOG,
966  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
967  errmsg("out of file descriptors: %m; release and retry")));
968  errno = 0;
969  if (ReleaseLruFile())
970  goto tryAgain;
971  errno = save_errno;
972  }
973 
974  return -1; /* failure */
975 }
976 
977 #if defined(FDDEBUG)
978 
979 static void
980 _dump_lru(void)
981 {
982  int mru = VfdCache[0].lruLessRecently;
983  Vfd *vfdP = &VfdCache[mru];
984  char buf[2048];
985 
986  snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
987  while (mru != 0)
988  {
989  mru = vfdP->lruLessRecently;
990  vfdP = &VfdCache[mru];
991  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
992  }
993  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
994  elog(LOG, "%s", buf);
995 }
996 #endif /* FDDEBUG */
997 
998 static void
1000 {
1001  Vfd *vfdP;
1002 
1003  Assert(file != 0);
1004 
1005  DO_DB(elog(LOG, "Delete %d (%s)",
1006  file, VfdCache[file].fileName));
1007  DO_DB(_dump_lru());
1008 
1009  vfdP = &VfdCache[file];
1010 
1011  VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1012  VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1013 
1014  DO_DB(_dump_lru());
1015 }
1016 
1017 static void
1019 {
1020  Vfd *vfdP;
1021 
1022  Assert(file != 0);
1023 
1024  DO_DB(elog(LOG, "LruDelete %d (%s)",
1025  file, VfdCache[file].fileName));
1026 
1027  vfdP = &VfdCache[file];
1028 
1029  /*
1030  * Normally we should know the seek position, but if for some reason we
1031  * have lost track of it, try again to get it. If we still can't get it,
1032  * we have a problem: we will be unable to restore the file seek position
1033  * when and if the file is re-opened. But we can't really throw an error
1034  * and refuse to close the file, or activities such as transaction cleanup
1035  * will be broken.
1036  */
1037  if (FilePosIsUnknown(vfdP->seekPos))
1038  {
1039  vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
1040  if (FilePosIsUnknown(vfdP->seekPos))
1041  elog(LOG, "could not seek file \"%s\" before closing: %m",
1042  vfdP->fileName);
1043  }
1044 
1045  /*
1046  * Close the file. We aren't expecting this to fail; if it does, better
1047  * to leak the FD than to mess up our internal state.
1048  */
1049  if (close(vfdP->fd))
1050  elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
1051  vfdP->fd = VFD_CLOSED;
1052  --nfile;
1053 
1054  /* delete the vfd record from the LRU ring */
1055  Delete(file);
1056 }
1057 
1058 static void
1060 {
1061  Vfd *vfdP;
1062 
1063  Assert(file != 0);
1064 
1065  DO_DB(elog(LOG, "Insert %d (%s)",
1066  file, VfdCache[file].fileName));
1067  DO_DB(_dump_lru());
1068 
1069  vfdP = &VfdCache[file];
1070 
1071  vfdP->lruMoreRecently = 0;
1072  vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1073  VfdCache[0].lruLessRecently = file;
1074  VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1075 
1076  DO_DB(_dump_lru());
1077 }
1078 
1079 /* returns 0 on success, -1 on re-open failure (with errno set) */
1080 static int
1082 {
1083  Vfd *vfdP;
1084 
1085  Assert(file != 0);
1086 
1087  DO_DB(elog(LOG, "LruInsert %d (%s)",
1088  file, VfdCache[file].fileName));
1089 
1090  vfdP = &VfdCache[file];
1091 
1092  if (FileIsNotOpen(file))
1093  {
1094  /* Close excess kernel FDs. */
1095  ReleaseLruFiles();
1096 
1097  /*
1098  * The open could still fail for lack of file descriptors, eg due to
1099  * overall system file table being full. So, be prepared to release
1100  * another FD if necessary...
1101  */
1102  vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1103  vfdP->fileMode);
1104  if (vfdP->fd < 0)
1105  {
1106  DO_DB(elog(LOG, "re-open failed: %m"));
1107  return -1;
1108  }
1109  else
1110  {
1111  ++nfile;
1112  }
1113 
1114  /*
1115  * Seek to the right position. We need no special case for seekPos
1116  * equal to FileUnknownPos, as lseek() will certainly reject that
1117  * (thus completing the logic noted in LruDelete() that we will fail
1118  * to re-open a file if we couldn't get its seek position before
1119  * closing).
1120  */
1121  if (vfdP->seekPos != (off_t) 0)
1122  {
1123  if (lseek(vfdP->fd, vfdP->seekPos, SEEK_SET) < 0)
1124  {
1125  /*
1126  * If we fail to restore the seek position, treat it like an
1127  * open() failure.
1128  */
1129  int save_errno = errno;
1130 
1131  elog(LOG, "could not seek file \"%s\" after re-opening: %m",
1132  vfdP->fileName);
1133  (void) close(vfdP->fd);
1134  vfdP->fd = VFD_CLOSED;
1135  --nfile;
1136  errno = save_errno;
1137  return -1;
1138  }
1139  }
1140  }
1141 
1142  /*
1143  * put it at the head of the Lru ring
1144  */
1145 
1146  Insert(file);
1147 
1148  return 0;
1149 }
1150 
1151 /*
1152  * Release one kernel FD by closing the least-recently-used VFD.
1153  */
1154 static bool
1156 {
1157  DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1158 
1159  if (nfile > 0)
1160  {
1161  /*
1162  * There are opened files and so there should be at least one used vfd
1163  * in the ring.
1164  */
1165  Assert(VfdCache[0].lruMoreRecently != 0);
1166  LruDelete(VfdCache[0].lruMoreRecently);
1167  return true; /* freed a file */
1168  }
1169  return false; /* no files available to free */
1170 }
1171 
1172 /*
1173  * Release kernel FDs as needed to get under the max_safe_fds limit.
1174  * After calling this, it's OK to try to open another file.
1175  */
1176 static void
1178 {
1179  while (nfile + numAllocatedDescs >= max_safe_fds)
1180  {
1181  if (!ReleaseLruFile())
1182  break;
1183  }
1184 }
1185 
1186 static File
1188 {
1189  Index i;
1190  File file;
1191 
1192  DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1193 
1194  Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1195 
1196  if (VfdCache[0].nextFree == 0)
1197  {
1198  /*
1199  * The free list is empty so it is time to increase the size of the
1200  * array. We choose to double it each time this happens. However,
1201  * there's not much point in starting *real* small.
1202  */
1203  Size newCacheSize = SizeVfdCache * 2;
1204  Vfd *newVfdCache;
1205 
1206  if (newCacheSize < 32)
1207  newCacheSize = 32;
1208 
1209  /*
1210  * Be careful not to clobber VfdCache ptr if realloc fails.
1211  */
1212  newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1213  if (newVfdCache == NULL)
1214  ereport(ERROR,
1215  (errcode(ERRCODE_OUT_OF_MEMORY),
1216  errmsg("out of memory")));
1217  VfdCache = newVfdCache;
1218 
1219  /*
1220  * Initialize the new entries and link them into the free list.
1221  */
1222  for (i = SizeVfdCache; i < newCacheSize; i++)
1223  {
1224  MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1225  VfdCache[i].nextFree = i + 1;
1226  VfdCache[i].fd = VFD_CLOSED;
1227  }
1228  VfdCache[newCacheSize - 1].nextFree = 0;
1229  VfdCache[0].nextFree = SizeVfdCache;
1230 
1231  /*
1232  * Record the new size
1233  */
1234  SizeVfdCache = newCacheSize;
1235  }
1236 
1237  file = VfdCache[0].nextFree;
1238 
1239  VfdCache[0].nextFree = VfdCache[file].nextFree;
1240 
1241  return file;
1242 }
1243 
1244 static void
1246 {
1247  Vfd *vfdP = &VfdCache[file];
1248 
1249  DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1250  file, vfdP->fileName ? vfdP->fileName : ""));
1251 
1252  if (vfdP->fileName != NULL)
1253  {
1254  free(vfdP->fileName);
1255  vfdP->fileName = NULL;
1256  }
1257  vfdP->fdstate = 0x0;
1258 
1259  vfdP->nextFree = VfdCache[0].nextFree;
1260  VfdCache[0].nextFree = file;
1261 }
1262 
1263 /* returns 0 on success, -1 on re-open failure (with errno set) */
1264 static int
1266 {
1267  int returnValue;
1268 
1269  DO_DB(elog(LOG, "FileAccess %d (%s)",
1270  file, VfdCache[file].fileName));
1271 
1272  /*
1273  * Is the file open? If not, open it and put it at the head of the LRU
1274  * ring (possibly closing the least recently used file to get an FD).
1275  */
1276 
1277  if (FileIsNotOpen(file))
1278  {
1279  returnValue = LruInsert(file);
1280  if (returnValue != 0)
1281  return returnValue;
1282  }
1283  else if (VfdCache[0].lruLessRecently != file)
1284  {
1285  /*
1286  * We now know that the file is open and that it is not the last one
1287  * accessed, so we need to move it to the head of the Lru ring.
1288  */
1289 
1290  Delete(file);
1291  Insert(file);
1292  }
1293 
1294  return 0;
1295 }
1296 
1297 /*
1298  * Called when we get a shared invalidation message on some relation.
1299  */
1300 #ifdef NOT_USED
1301 void
1302 FileInvalidate(File file)
1303 {
1304  Assert(FileIsValid(file));
1305  if (!FileIsNotOpen(file))
1306  LruDelete(file);
1307 }
1308 #endif
1309 
1310 /*
1311  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1312  * fileMode parameter.
1313  */
1314 File
1315 PathNameOpenFile(const char *fileName, int fileFlags)
1316 {
1317  return PathNameOpenFilePerm(fileName, fileFlags, PG_FILE_MODE_DEFAULT);
1318 }
1319 
1320 /*
1321  * open a file in an arbitrary directory
1322  *
1323  * NB: if the passed pathname is relative (which it usually is),
1324  * it will be interpreted relative to the process' working directory
1325  * (which should always be $PGDATA when this code is running).
1326  */
1327 File
1328 PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1329 {
1330  char *fnamecopy;
1331  File file;
1332  Vfd *vfdP;
1333 
1334  DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1335  fileName, fileFlags, fileMode));
1336 
1337  /*
1338  * We need a malloc'd copy of the file name; fail cleanly if no room.
1339  */
1340  fnamecopy = strdup(fileName);
1341  if (fnamecopy == NULL)
1342  ereport(ERROR,
1343  (errcode(ERRCODE_OUT_OF_MEMORY),
1344  errmsg("out of memory")));
1345 
1346  file = AllocateVfd();
1347  vfdP = &VfdCache[file];
1348 
1349  /* Close excess kernel FDs. */
1350  ReleaseLruFiles();
1351 
1352  vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1353 
1354  if (vfdP->fd < 0)
1355  {
1356  int save_errno = errno;
1357 
1358  FreeVfd(file);
1359  free(fnamecopy);
1360  errno = save_errno;
1361  return -1;
1362  }
1363  ++nfile;
1364  DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1365  vfdP->fd));
1366 
1367  Insert(file);
1368 
1369  vfdP->fileName = fnamecopy;
1370  /* Saved flags are adjusted to be OK for re-opening file */
1371  vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1372  vfdP->fileMode = fileMode;
1373  vfdP->seekPos = 0;
1374  vfdP->fileSize = 0;
1375  vfdP->fdstate = 0x0;
1376  vfdP->resowner = NULL;
1377 
1378  return file;
1379 }
1380 
1381 /*
1382  * Open a temporary file that will disappear when we close it.
1383  *
1384  * This routine takes care of generating an appropriate tempfile name.
1385  * There's no need to pass in fileFlags or fileMode either, since only
1386  * one setting makes any sense for a temp file.
1387  *
1388  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1389  * to ensure it's closed and deleted when it's no longer needed, typically at
1390  * the end-of-transaction. In most cases, you don't want temporary files to
1391  * outlive the transaction that created them, so this should be false -- but
1392  * if you need "somewhat" temporary storage, this might be useful. In either
1393  * case, the file is removed when the File is explicitly closed.
1394  */
1395 File
1396 OpenTemporaryFile(bool interXact)
1397 {
1398  File file = 0;
1399 
1400  /*
1401  * If some temp tablespace(s) have been given to us, try to use the next
1402  * one. If a given tablespace can't be found, we silently fall back to
1403  * the database's default tablespace.
1404  *
1405  * BUT: if the temp file is slated to outlive the current transaction,
1406  * force it into the database's default tablespace, so that it will not
1407  * pose a threat to possible tablespace drop attempts.
1408  */
1409  if (numTempTableSpaces > 0 && !interXact)
1410  {
1411  Oid tblspcOid = GetNextTempTableSpace();
1412 
1413  if (OidIsValid(tblspcOid))
1414  file = OpenTemporaryFileInTablespace(tblspcOid, false);
1415  }
1416 
1417  /*
1418  * If not, or if tablespace is bad, create in database's default
1419  * tablespace. MyDatabaseTableSpace should normally be set before we get
1420  * here, but just in case it isn't, fall back to pg_default tablespace.
1421  */
1422  if (file <= 0)
1426  true);
1427 
1428  /* Mark it for deletion at close */
1429  VfdCache[file].fdstate |= FD_TEMPORARY;
1430 
1431  /* Register it with the current resource owner */
1432  if (!interXact)
1433  {
1434  VfdCache[file].fdstate |= FD_XACT_TEMPORARY;
1435 
1438  VfdCache[file].resowner = CurrentResourceOwner;
1439 
1440  /* ensure cleanup happens at eoxact */
1442  }
1443 
1444  return file;
1445 }
1446 
1447 /*
1448  * Open a temporary file in a specific tablespace.
1449  * Subroutine for OpenTemporaryFile, which see for details.
1450  */
1451 static File
1452 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1453 {
1454  char tempdirpath[MAXPGPATH];
1455  char tempfilepath[MAXPGPATH];
1456  File file;
1457 
1458  /*
1459  * Identify the tempfile directory for this tablespace.
1460  *
1461  * If someone tries to specify pg_global, use pg_default instead.
1462  */
1463  if (tblspcOid == DEFAULTTABLESPACE_OID ||
1464  tblspcOid == GLOBALTABLESPACE_OID)
1465  {
1466  /* The default tablespace is {datadir}/base */
1467  snprintf(tempdirpath, sizeof(tempdirpath), "base/%s",
1469  }
1470  else
1471  {
1472  /* All other tablespaces are accessed via symlinks */
1473  snprintf(tempdirpath, sizeof(tempdirpath), "pg_tblspc/%u/%s/%s",
1475  }
1476 
1477  /*
1478  * Generate a tempfile name that should be unique within the current
1479  * database instance.
1480  */
1481  snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1482  tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1483 
1484  /*
1485  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1486  * temp file that can be reused.
1487  */
1488  file = PathNameOpenFile(tempfilepath,
1489  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1490  if (file <= 0)
1491  {
1492  /*
1493  * We might need to create the tablespace's tempfile directory, if no
1494  * one has yet done so.
1495  *
1496  * Don't check for error from mkdir; it could fail if someone else
1497  * just did the same thing. If it doesn't work then we'll bomb out on
1498  * the second create attempt, instead.
1499  */
1500  mkdir(tempdirpath, S_IRWXU);
1501 
1502  file = PathNameOpenFile(tempfilepath,
1503  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1504  if (file <= 0 && rejectError)
1505  elog(ERROR, "could not create temporary file \"%s\": %m",
1506  tempfilepath);
1507  }
1508 
1509  return file;
1510 }
1511 
1512 /*
1513  * close a file when done with it
1514  */
1515 void
1517 {
1518  Vfd *vfdP;
1519 
1520  Assert(FileIsValid(file));
1521 
1522  DO_DB(elog(LOG, "FileClose: %d (%s)",
1523  file, VfdCache[file].fileName));
1524 
1525  vfdP = &VfdCache[file];
1526 
1527  if (!FileIsNotOpen(file))
1528  {
1529  /* close the file */
1530  if (close(vfdP->fd))
1531  elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
1532 
1533  --nfile;
1534  vfdP->fd = VFD_CLOSED;
1535 
1536  /* remove the file from the lru ring */
1537  Delete(file);
1538  }
1539 
1540  /*
1541  * Delete the file if it was temporary, and make a log entry if wanted
1542  */
1543  if (vfdP->fdstate & FD_TEMPORARY)
1544  {
1545  struct stat filestats;
1546  int stat_errno;
1547 
1548  /*
1549  * If we get an error, as could happen within the ereport/elog calls,
1550  * we'll come right back here during transaction abort. Reset the
1551  * flag to ensure that we can't get into an infinite loop. This code
1552  * is arranged to ensure that the worst-case consequence is failing to
1553  * emit log message(s), not failing to attempt the unlink.
1554  */
1555  vfdP->fdstate &= ~FD_TEMPORARY;
1556 
1557  /* Subtract its size from current usage (do first in case of error) */
1558  temporary_files_size -= vfdP->fileSize;
1559  vfdP->fileSize = 0;
1560 
1561  /* first try the stat() */
1562  if (stat(vfdP->fileName, &filestats))
1563  stat_errno = errno;
1564  else
1565  stat_errno = 0;
1566 
1567  /* in any case do the unlink */
1568  if (unlink(vfdP->fileName))
1569  elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
1570 
1571  /* and last report the stat results */
1572  if (stat_errno == 0)
1573  {
1574  pgstat_report_tempfile(filestats.st_size);
1575 
1576  if (log_temp_files >= 0)
1577  {
1578  if ((filestats.st_size / 1024) >= log_temp_files)
1579  ereport(LOG,
1580  (errmsg("temporary file: path \"%s\", size %lu",
1581  vfdP->fileName,
1582  (unsigned long) filestats.st_size)));
1583  }
1584  }
1585  else
1586  {
1587  errno = stat_errno;
1588  elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
1589  }
1590  }
1591 
1592  /* Unregister it from the resource owner */
1593  if (vfdP->resowner)
1594  ResourceOwnerForgetFile(vfdP->resowner, file);
1595 
1596  /*
1597  * Return the Vfd slot to the free list
1598  */
1599  FreeVfd(file);
1600 }
1601 
1602 /*
1603  * FilePrefetch - initiate asynchronous read of a given range of the file.
1604  * The logical seek position is unaffected.
1605  *
1606  * Currently the only implementation of this function is using posix_fadvise
1607  * which is the simplest standardized interface that accomplishes this.
1608  * We could add an implementation using libaio in the future; but note that
1609  * this API is inappropriate for libaio, which wants to have a buffer provided
1610  * to read into.
1611  */
1612 int
1613 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1614 {
1615 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1616  int returnCode;
1617 
1618  Assert(FileIsValid(file));
1619 
1620  DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1621  file, VfdCache[file].fileName,
1622  (int64) offset, amount));
1623 
1624  returnCode = FileAccess(file);
1625  if (returnCode < 0)
1626  return returnCode;
1627 
1628  pgstat_report_wait_start(wait_event_info);
1629  returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1630  POSIX_FADV_WILLNEED);
1632 
1633  return returnCode;
1634 #else
1635  Assert(FileIsValid(file));
1636  return 0;
1637 #endif
1638 }
1639 
1640 void
1641 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1642 {
1643  int returnCode;
1644 
1645  Assert(FileIsValid(file));
1646 
1647  DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1648  file, VfdCache[file].fileName,
1649  (int64) offset, (int64) nbytes));
1650 
1651  /*
1652  * Caution: do not call pg_flush_data with nbytes = 0, it could trash the
1653  * file's seek position. We prefer to define that as a no-op here.
1654  */
1655  if (nbytes <= 0)
1656  return;
1657 
1658  returnCode = FileAccess(file);
1659  if (returnCode < 0)
1660  return;
1661 
1662  pgstat_report_wait_start(wait_event_info);
1663  pg_flush_data(VfdCache[file].fd, offset, nbytes);
1665 }
1666 
1667 int
1668 FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
1669 {
1670  int returnCode;
1671  Vfd *vfdP;
1672 
1673  Assert(FileIsValid(file));
1674 
1675  DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1676  file, VfdCache[file].fileName,
1677  (int64) VfdCache[file].seekPos,
1678  amount, buffer));
1679 
1680  returnCode = FileAccess(file);
1681  if (returnCode < 0)
1682  return returnCode;
1683 
1684  vfdP = &VfdCache[file];
1685 
1686 retry:
1687  pgstat_report_wait_start(wait_event_info);
1688  returnCode = read(vfdP->fd, buffer, amount);
1690 
1691  if (returnCode >= 0)
1692  {
1693  /* if seekPos is unknown, leave it that way */
1694  if (!FilePosIsUnknown(vfdP->seekPos))
1695  vfdP->seekPos += returnCode;
1696  }
1697  else
1698  {
1699  /*
1700  * Windows may run out of kernel buffers and return "Insufficient
1701  * system resources" error. Wait a bit and retry to solve it.
1702  *
1703  * It is rumored that EINTR is also possible on some Unix filesystems,
1704  * in which case immediate retry is indicated.
1705  */
1706 #ifdef WIN32
1707  DWORD error = GetLastError();
1708 
1709  switch (error)
1710  {
1711  case ERROR_NO_SYSTEM_RESOURCES:
1712  pg_usleep(1000L);
1713  errno = EINTR;
1714  break;
1715  default:
1716  _dosmaperr(error);
1717  break;
1718  }
1719 #endif
1720  /* OK to retry if interrupted */
1721  if (errno == EINTR)
1722  goto retry;
1723 
1724  /* Trouble, so assume we don't know the file position anymore */
1725  vfdP->seekPos = FileUnknownPos;
1726  }
1727 
1728  return returnCode;
1729 }
1730 
1731 int
1732 FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
1733 {
1734  int returnCode;
1735  Vfd *vfdP;
1736 
1737  Assert(FileIsValid(file));
1738 
1739  DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
1740  file, VfdCache[file].fileName,
1741  (int64) VfdCache[file].seekPos,
1742  amount, buffer));
1743 
1744  returnCode = FileAccess(file);
1745  if (returnCode < 0)
1746  return returnCode;
1747 
1748  vfdP = &VfdCache[file];
1749 
1750  /*
1751  * If enforcing temp_file_limit and it's a temp file, check to see if the
1752  * write would overrun temp_file_limit, and throw error if so. Note: it's
1753  * really a modularity violation to throw error here; we should set errno
1754  * and return -1. However, there's no way to report a suitable error
1755  * message if we do that. All current callers would just throw error
1756  * immediately anyway, so this is safe at present.
1757  */
1758  if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMPORARY))
1759  {
1760  off_t newPos;
1761 
1762  /*
1763  * Normally we should know the seek position, but if for some reason
1764  * we have lost track of it, try again to get it. Here, it's fine to
1765  * throw an error if we still can't get it.
1766  */
1767  if (FilePosIsUnknown(vfdP->seekPos))
1768  {
1769  vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
1770  if (FilePosIsUnknown(vfdP->seekPos))
1771  elog(ERROR, "could not seek file \"%s\": %m", vfdP->fileName);
1772  }
1773 
1774  newPos = vfdP->seekPos + amount;
1775  if (newPos > vfdP->fileSize)
1776  {
1777  uint64 newTotal = temporary_files_size;
1778 
1779  newTotal += newPos - vfdP->fileSize;
1780  if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
1781  ereport(ERROR,
1782  (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
1783  errmsg("temporary file size exceeds temp_file_limit (%dkB)",
1784  temp_file_limit)));
1785  }
1786  }
1787 
1788 retry:
1789  errno = 0;
1790  pgstat_report_wait_start(wait_event_info);
1791  returnCode = write(vfdP->fd, buffer, amount);
1793 
1794  /* if write didn't set errno, assume problem is no disk space */
1795  if (returnCode != amount && errno == 0)
1796  errno = ENOSPC;
1797 
1798  if (returnCode >= 0)
1799  {
1800  /* if seekPos is unknown, leave it that way */
1801  if (!FilePosIsUnknown(vfdP->seekPos))
1802  vfdP->seekPos += returnCode;
1803 
1804  /*
1805  * Maintain fileSize and temporary_files_size if it's a temp file.
1806  *
1807  * If seekPos is -1 (unknown), this will do nothing; but we could only
1808  * get here in that state if we're not enforcing temporary_files_size,
1809  * so we don't care.
1810  */
1811  if (vfdP->fdstate & FD_TEMPORARY)
1812  {
1813  off_t newPos = vfdP->seekPos;
1814 
1815  if (newPos > vfdP->fileSize)
1816  {
1817  temporary_files_size += newPos - vfdP->fileSize;
1818  vfdP->fileSize = newPos;
1819  }
1820  }
1821  }
1822  else
1823  {
1824  /*
1825  * See comments in FileRead()
1826  */
1827 #ifdef WIN32
1828  DWORD error = GetLastError();
1829 
1830  switch (error)
1831  {
1832  case ERROR_NO_SYSTEM_RESOURCES:
1833  pg_usleep(1000L);
1834  errno = EINTR;
1835  break;
1836  default:
1837  _dosmaperr(error);
1838  break;
1839  }
1840 #endif
1841  /* OK to retry if interrupted */
1842  if (errno == EINTR)
1843  goto retry;
1844 
1845  /* Trouble, so assume we don't know the file position anymore */
1846  vfdP->seekPos = FileUnknownPos;
1847  }
1848 
1849  return returnCode;
1850 }
1851 
1852 int
1853 FileSync(File file, uint32 wait_event_info)
1854 {
1855  int returnCode;
1856 
1857  Assert(FileIsValid(file));
1858 
1859  DO_DB(elog(LOG, "FileSync: %d (%s)",
1860  file, VfdCache[file].fileName));
1861 
1862  returnCode = FileAccess(file);
1863  if (returnCode < 0)
1864  return returnCode;
1865 
1866  pgstat_report_wait_start(wait_event_info);
1867  returnCode = pg_fsync(VfdCache[file].fd);
1869 
1870  return returnCode;
1871 }
1872 
1873 off_t
1874 FileSeek(File file, off_t offset, int whence)
1875 {
1876  Vfd *vfdP;
1877 
1878  Assert(FileIsValid(file));
1879 
1880  DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
1881  file, VfdCache[file].fileName,
1882  (int64) VfdCache[file].seekPos,
1883  (int64) offset, whence));
1884 
1885  vfdP = &VfdCache[file];
1886 
1887  if (FileIsNotOpen(file))
1888  {
1889  switch (whence)
1890  {
1891  case SEEK_SET:
1892  if (offset < 0)
1893  {
1894  errno = EINVAL;
1895  return (off_t) -1;
1896  }
1897  vfdP->seekPos = offset;
1898  break;
1899  case SEEK_CUR:
1900  if (FilePosIsUnknown(vfdP->seekPos) ||
1901  vfdP->seekPos + offset < 0)
1902  {
1903  errno = EINVAL;
1904  return (off_t) -1;
1905  }
1906  vfdP->seekPos += offset;
1907  break;
1908  case SEEK_END:
1909  if (FileAccess(file) < 0)
1910  return (off_t) -1;
1911  vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1912  break;
1913  default:
1914  elog(ERROR, "invalid whence: %d", whence);
1915  break;
1916  }
1917  }
1918  else
1919  {
1920  switch (whence)
1921  {
1922  case SEEK_SET:
1923  if (offset < 0)
1924  {
1925  errno = EINVAL;
1926  return (off_t) -1;
1927  }
1928  if (vfdP->seekPos != offset)
1929  vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1930  break;
1931  case SEEK_CUR:
1932  if (offset != 0 || FilePosIsUnknown(vfdP->seekPos))
1933  vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1934  break;
1935  case SEEK_END:
1936  vfdP->seekPos = lseek(vfdP->fd, offset, whence);
1937  break;
1938  default:
1939  elog(ERROR, "invalid whence: %d", whence);
1940  break;
1941  }
1942  }
1943 
1944  return vfdP->seekPos;
1945 }
1946 
1947 /*
1948  * XXX not actually used but here for completeness
1949  */
1950 #ifdef NOT_USED
1951 off_t
1952 FileTell(File file)
1953 {
1954  Assert(FileIsValid(file));
1955  DO_DB(elog(LOG, "FileTell %d (%s)",
1956  file, VfdCache[file].fileName));
1957  return VfdCache[file].seekPos;
1958 }
1959 #endif
1960 
1961 int
1962 FileTruncate(File file, off_t offset, uint32 wait_event_info)
1963 {
1964  int returnCode;
1965 
1966  Assert(FileIsValid(file));
1967 
1968  DO_DB(elog(LOG, "FileTruncate %d (%s)",
1969  file, VfdCache[file].fileName));
1970 
1971  returnCode = FileAccess(file);
1972  if (returnCode < 0)
1973  return returnCode;
1974 
1975  pgstat_report_wait_start(wait_event_info);
1976  returnCode = ftruncate(VfdCache[file].fd, offset);
1978 
1979  if (returnCode == 0 && VfdCache[file].fileSize > offset)
1980  {
1981  /* adjust our state for truncation of a temp file */
1982  Assert(VfdCache[file].fdstate & FD_TEMPORARY);
1983  temporary_files_size -= VfdCache[file].fileSize - offset;
1984  VfdCache[file].fileSize = offset;
1985  }
1986 
1987  return returnCode;
1988 }
1989 
1990 /*
1991  * Return the pathname associated with an open file.
1992  *
1993  * The returned string points to an internal buffer, which is valid until
1994  * the file is closed.
1995  */
1996 char *
1998 {
1999  Assert(FileIsValid(file));
2000 
2001  return VfdCache[file].fileName;
2002 }
2003 
2004 /*
2005  * Return the raw file descriptor of an opened file.
2006  *
2007  * The returned file descriptor will be valid until the file is closed, but
2008  * there are a lot of things that can make that happen. So the caller should
2009  * be careful not to do much of anything else before it finishes using the
2010  * returned file descriptor.
2011  */
2012 int
2014 {
2015  Assert(FileIsValid(file));
2016  return VfdCache[file].fd;
2017 }
2018 
2019 /*
2020  * FileGetRawFlags - returns the file flags on open(2)
2021  */
2022 int
2024 {
2025  Assert(FileIsValid(file));
2026  return VfdCache[file].fileFlags;
2027 }
2028 
2029 /*
2030  * FileGetRawMode - returns the mode bitmask passed to open(2)
2031  */
2032 mode_t
2034 {
2035  Assert(FileIsValid(file));
2036  return VfdCache[file].fileMode;
2037 }
2038 
2039 /*
2040  * Make room for another allocatedDescs[] array entry if needed and possible.
2041  * Returns true if an array element is available.
2042  */
2043 static bool
2045 {
2046  AllocateDesc *newDescs;
2047  int newMax;
2048 
2049  /* Quick out if array already has a free slot. */
2051  return true;
2052 
2053  /*
2054  * If the array hasn't yet been created in the current process, initialize
2055  * it with FD_MINFREE / 2 elements. In many scenarios this is as many as
2056  * we will ever need, anyway. We don't want to look at max_safe_fds
2057  * immediately because set_max_safe_fds() may not have run yet.
2058  */
2059  if (allocatedDescs == NULL)
2060  {
2061  newMax = FD_MINFREE / 2;
2062  newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2063  /* Out of memory already? Treat as fatal error. */
2064  if (newDescs == NULL)
2065  ereport(ERROR,
2066  (errcode(ERRCODE_OUT_OF_MEMORY),
2067  errmsg("out of memory")));
2068  allocatedDescs = newDescs;
2069  maxAllocatedDescs = newMax;
2070  return true;
2071  }
2072 
2073  /*
2074  * Consider enlarging the array beyond the initial allocation used above.
2075  * By the time this happens, max_safe_fds should be known accurately.
2076  *
2077  * We mustn't let allocated descriptors hog all the available FDs, and in
2078  * practice we'd better leave a reasonable number of FDs for VFD use. So
2079  * set the maximum to max_safe_fds / 2. (This should certainly be at
2080  * least as large as the initial size, FD_MINFREE / 2.)
2081  */
2082  newMax = max_safe_fds / 2;
2083  if (newMax > maxAllocatedDescs)
2084  {
2085  newDescs = (AllocateDesc *) realloc(allocatedDescs,
2086  newMax * sizeof(AllocateDesc));
2087  /* Treat out-of-memory as a non-fatal error. */
2088  if (newDescs == NULL)
2089  return false;
2090  allocatedDescs = newDescs;
2091  maxAllocatedDescs = newMax;
2092  return true;
2093  }
2094 
2095  /* Can't enlarge allocatedDescs[] any more. */
2096  return false;
2097 }
2098 
2099 /*
2100  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2101  * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2102  * necessary to open the file. When done, call FreeFile rather than fclose.
2103  *
2104  * Note that files that will be open for any significant length of time
2105  * should NOT be handled this way, since they cannot share kernel file
2106  * descriptors with other files; there is grave risk of running out of FDs
2107  * if anyone locks down too many FDs. Most callers of this routine are
2108  * simply reading a config file that they will read and close immediately.
2109  *
2110  * fd.c will automatically close all files opened with AllocateFile at
2111  * transaction commit or abort; this prevents FD leakage if a routine
2112  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2113  *
2114  * Ideally this should be the *only* direct call of fopen() in the backend.
2115  */
2116 FILE *
2117 AllocateFile(const char *name, const char *mode)
2118 {
2119  FILE *file;
2120 
2121  DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2122  numAllocatedDescs, name));
2123 
2124  /* Can we allocate another non-virtual FD? */
2125  if (!reserveAllocatedDesc())
2126  ereport(ERROR,
2127  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2128  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2129  maxAllocatedDescs, name)));
2130 
2131  /* Close excess kernel FDs. */
2132  ReleaseLruFiles();
2133 
2134 TryAgain:
2135  if ((file = fopen(name, mode)) != NULL)
2136  {
2137  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2138 
2139  desc->kind = AllocateDescFile;
2140  desc->desc.file = file;
2143  return desc->desc.file;
2144  }
2145 
2146  if (errno == EMFILE || errno == ENFILE)
2147  {
2148  int save_errno = errno;
2149 
2150  ereport(LOG,
2151  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2152  errmsg("out of file descriptors: %m; release and retry")));
2153  errno = 0;
2154  if (ReleaseLruFile())
2155  goto TryAgain;
2156  errno = save_errno;
2157  }
2158 
2159  return NULL;
2160 }
2161 
2162 /*
2163  * Open a file with OpenTransientFilePerm() and pass default file mode for
2164  * the fileMode parameter.
2165  */
2166 int
2167 OpenTransientFile(const char *fileName, int fileFlags)
2168 {
2169  return OpenTransientFilePerm(fileName, fileFlags, PG_FILE_MODE_DEFAULT);
2170 }
2171 
2172 /*
2173  * Like AllocateFile, but returns an unbuffered fd like open(2)
2174  */
2175 int
2176 OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2177 {
2178  int fd;
2179 
2180  DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2181  numAllocatedDescs, fileName));
2182 
2183  /* Can we allocate another non-virtual FD? */
2184  if (!reserveAllocatedDesc())
2185  ereport(ERROR,
2186  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2187  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2188  maxAllocatedDescs, fileName)));
2189 
2190  /* Close excess kernel FDs. */
2191  ReleaseLruFiles();
2192 
2193  fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2194 
2195  if (fd >= 0)
2196  {
2197  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2198 
2199  desc->kind = AllocateDescRawFD;
2200  desc->desc.fd = fd;
2203 
2204  return fd;
2205  }
2206 
2207  return -1; /* failure */
2208 }
2209 
2210 /*
2211  * Routines that want to initiate a pipe stream should use OpenPipeStream
2212  * rather than plain popen(). This lets fd.c deal with freeing FDs if
2213  * necessary. When done, call ClosePipeStream rather than pclose.
      *
      * command and mode are handed to popen() as-is.  Raises ERROR if
      * reserveAllocatedDesc() reports that no further descriptor can be
      * tracked.  Returns the stream on success, or NULL with errno set if
      * popen() failed and no retry was possible.
      *
      * NOTE(review): listing lines 2243-2244 (inside the success branch, after
      * the stream is stored) are absent from this copy; confirm them against
      * the real source.
2214  */
2215 FILE *
2216 OpenPipeStream(const char *command, const char *mode)
2217 {
2218  FILE *file;
2219 
2220  DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2221  numAllocatedDescs, command));
2222 
2223  /* Can we allocate another non-virtual FD? */
2224  if (!reserveAllocatedDesc())
2225  ereport(ERROR,
2226  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2227  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2228  maxAllocatedDescs, command)));
2229 
2230  /* Close excess kernel FDs. */
2231  ReleaseLruFiles();
2232 
2233 TryAgain:
      /*
       * Flush our own stdio buffers before spawning the command, and clear
       * errno so that the EMFILE/ENFILE test below only sees a value set by
       * this popen() attempt.
       */
2234  fflush(stdout);
2235  fflush(stderr);
2236  errno = 0;
2237  if ((file = popen(command, mode)) != NULL)
2238  {
      /* Record the pipe in the next free allocatedDescs[] slot */
2239  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2240 
2241  desc->kind = AllocateDescPipe;
2242  desc->desc.file = file;
2245  return desc->desc.file;
2246  }
2247 
      /* Out of descriptors: try to release one LRU file and retry */
2248  if (errno == EMFILE || errno == ENFILE)
2249  {
      /* Remember the failure code so it can be restored for the caller */
2250  int save_errno = errno;
2251 
2252  ereport(LOG,
2253  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2254  errmsg("out of file descriptors: %m; release and retry")));
2255  errno = 0;
2256  if (ReleaseLruFile())
2257  goto TryAgain;
2258  errno = save_errno;
2259  }
2260 
2261  return NULL;
2262 }
2263 
2264 /*
2265  * Free an AllocateDesc of any type.
2266  *
2267  * The argument *must* point into the allocatedDescs[] array.
      *
      * Closes the underlying object with the routine matching desc->kind
      * (fclose, pclose, closedir or close) and returns that routine's result.
      * An unrecognized kind is reported with elog(ERROR).
      *
      * NOTE(review): this listing omits line 2270 (the function name and
      * parameter list) and line 2296 (inside the compaction step); confirm
      * both against the real source.
2268  */
2269 static int
2271 {
2272  int result;
2273 
2274  /* Close the underlying object */
2275  switch (desc->kind)
2276  {
2277  case AllocateDescFile:
2278  result = fclose(desc->desc.file);
2279  break;
2280  case AllocateDescPipe:
2281  result = pclose(desc->desc.file);
2282  break;
2283  case AllocateDescDir:
2284  result = closedir(desc->desc.dir);
2285  break;
2286  case AllocateDescRawFD:
2287  result = close(desc->desc.fd);
2288  break;
2289  default:
2290  elog(ERROR, "AllocateDesc kind not recognized");
2291  result = 0; /* keep compiler quiet */
2292  break;
2293  }
2294 
2295  /* Compact storage in the allocatedDescs array */
      /* The freed slot is overwritten with the entry at index numAllocatedDescs */
2297  *desc = allocatedDescs[numAllocatedDescs];
2298 
2299  return result;
2300 }
2301 
2302 /*
2303  * Close a file returned by AllocateFile.
2304  *
2305  * Note we do not check fclose's return value --- it is up to the caller
2306  * to handle close errors.
2307  */
2308 int
2309 FreeFile(FILE *file)
2310 {
2311  int i;
2312 
2313  DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2314 
2315  /* Remove file from list of allocated files, if it's present */
2316  for (i = numAllocatedDescs; --i >= 0;)
2317  {
2318  AllocateDesc *desc = &allocatedDescs[i];
2319 
2320  if (desc->kind == AllocateDescFile && desc->desc.file == file)
2321  return FreeDesc(desc);
2322  }
2323 
2324  /* Only get here if someone passes us a file not in allocatedDescs */
2325  elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2326 
2327  return fclose(file);
2328 }
2329 
2330 /*
2331  * Close a file returned by OpenTransientFile.
2332  *
2333  * Note we do not check close's return value --- it is up to the caller
2334  * to handle close errors.
      *
      * Returns the result of FreeDesc() when fd is found in allocatedDescs[],
      * otherwise emits a WARNING and returns the result of close(fd).
      *
      * NOTE(review): this listing omits line 2337 (the function name and
      * parameter list); the body uses a parameter named fd.
2335  */
2336 int
2338 {
2339  int i;
2340 
2341  DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2342 
2343  /* Remove fd from list of allocated files, if it's present */
      /* (the array is scanned from its highest used index downwards) */
2344  for (i = numAllocatedDescs; --i >= 0;)
2345  {
2346  AllocateDesc *desc = &allocatedDescs[i];
2347 
2348  if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2349  return FreeDesc(desc);
2350  }
2351 
2352  /* Only get here if someone passes us a file not in allocatedDescs */
2353  elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2354 
2355  return close(fd);
2356 }
2357 
2358 /*
2359  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2360  * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2361  * necessary to open the directory, and with closing it after an elog.
2362  * When done, call FreeDir rather than closedir.
2363  *
2364  * Ideally this should be the *only* direct call of opendir() in the backend.
      *
      * Raises ERROR if reserveAllocatedDesc() reports that no further
      * descriptor can be tracked.  Returns the DIR handle on success, or NULL
      * with errno set if opendir() failed and no retry was possible.
      *
      * NOTE(review): listing lines 2391-2392 (inside the success branch, after
      * the handle is stored) are absent from this copy; confirm them against
      * the real source.
2365  */
2366 DIR *
2367 AllocateDir(const char *dirname)
2368 {
2369  DIR *dir;
2370 
2371  DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2372  numAllocatedDescs, dirname));
2373 
2374  /* Can we allocate another non-virtual FD? */
2375  if (!reserveAllocatedDesc())
2376  ereport(ERROR,
2377  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2378  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2379  maxAllocatedDescs, dirname)));
2380 
2381  /* Close excess kernel FDs. */
2382  ReleaseLruFiles();
2383 
2384 TryAgain:
2385  if ((dir = opendir(dirname)) != NULL)
2386  {
      /* Record the directory in the next free allocatedDescs[] slot */
2387  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2388 
2389  desc->kind = AllocateDescDir;
2390  desc->desc.dir = dir;
2393  return desc->desc.dir;
2394  }
2395 
      /* Out of descriptors: try to release one LRU file and retry */
2396  if (errno == EMFILE || errno == ENFILE)
2397  {
      /* Remember the failure code so it can be restored for the caller */
2398  int save_errno = errno;
2399 
2400  ereport(LOG,
2401  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2402  errmsg("out of file descriptors: %m; release and retry")));
2403  errno = 0;
2404  if (ReleaseLruFile())
2405  goto TryAgain;
2406  errno = save_errno;
2407  }
2408 
2409  return NULL;
2410 }
2411 
2412 /*
2413  * Read a directory opened with AllocateDir, ereport'ing any error.
2414  *
2415  * This is easier to use than raw readdir() since it takes care of some
2416  * otherwise rather tedious and error-prone manipulation of errno. Also,
2417  * if you are happy with a generic error message for AllocateDir failure,
2418  * you can just do
2419  *
2420  * dir = AllocateDir(path);
2421  * while ((dirent = ReadDir(dir, path)) != NULL)
2422  * process dirent;
2423  * FreeDir(dir);
2424  *
2425  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2426  * (Make sure errno hasn't been changed since AllocateDir if you use this
2427  * shortcut.)
2428  *
2429  * The pathname passed to AllocateDir must be passed to this routine too,
2430  * but it is only used for error reporting.
2431  */
2432 struct dirent *
2433 ReadDir(DIR *dir, const char *dirname)
2434 {
2435  return ReadDirExtended(dir, dirname, ERROR);
2436 }
2437 
2438 /*
2439  * Alternate version that allows caller to specify the elevel for any
2440  * error report. If elevel < ERROR, returns NULL on any error.
      *
      * A NULL dir is reported as a failure to open dirname.  Otherwise the
      * next entry from readdir() is returned; NULL is returned at end of
      * directory (errno left at 0) or after a read error has been reported.
      *
      * NOTE(review): listing lines 2451 and 2463 (the first argument line of
      * each ereport call) are absent from this copy; confirm them against the
      * real source.
2441  */
2442 static struct dirent *
2443 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2444 {
2445  struct dirent *dent;
2446 
2447  /* Give a generic message for AllocateDir failure, if caller didn't */
2448  if (dir == NULL)
2449  {
2450  ereport(elevel,
2452  errmsg("could not open directory \"%s\": %m",
2453  dirname)));
2454  return NULL;
2455  }
2456 
      /* Clear errno so a NULL from readdir() can be told apart from an error */
2457  errno = 0;
2458  if ((dent = readdir(dir)) != NULL)
2459  return dent;
2460 
2461  if (errno)
2462  ereport(elevel,
2464  errmsg("could not read directory \"%s\": %m",
2465  dirname)));
2466  return NULL;
2467 }
2468 
2469 /*
2470  * Close a directory opened with AllocateDir.
2471  *
2472  * Note we do not check closedir's return value --- it is up to the caller
2473  * to handle close errors.
      *
      * Returns the result of FreeDesc() when dir is found in allocatedDescs[],
      * otherwise emits a WARNING and returns the result of closedir(dir).
      *
      * NOTE(review): this listing omits line 2476 (the function name and
      * parameter list); the body uses a parameter named dir.
2474  */
2475 int
2477 {
2478  int i;
2479 
2480  DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2481 
2482  /* Remove dir from list of allocated dirs, if it's present */
      /* (the array is scanned from its highest used index downwards) */
2483  for (i = numAllocatedDescs; --i >= 0;)
2484  {
2485  AllocateDesc *desc = &allocatedDescs[i];
2486 
2487  if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2488  return FreeDesc(desc);
2489  }
2490 
2491  /* Only get here if someone passes us a dir not in allocatedDescs */
2492  elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2493 
2494  return closedir(dir);
2495 }
2496 
2497 
2498 /*
2499  * Close a pipe stream returned by OpenPipeStream.
2500  */
2501 int
2502 ClosePipeStream(FILE *file)
2503 {
2504  int i;
2505 
2506  DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2507 
2508  /* Remove file from list of allocated files, if it's present */
2509  for (i = numAllocatedDescs; --i >= 0;)
2510  {
2511  AllocateDesc *desc = &allocatedDescs[i];
2512 
2513  if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2514  return FreeDesc(desc);
2515  }
2516 
2517  /* Only get here if someone passes us a file not in allocatedDescs */
2518  elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2519 
2520  return pclose(file);
2521 }
2522 
2523 /*
2524  * closeAllVfds
2525  *
2526  * Force all VFDs into the physically-closed state, so that the fewest
2527  * possible number of kernel file descriptors are in use. There is no
2528  * change in the logical state of the VFDs.
      *
      * Does nothing when SizeVfdCache is zero.
      *
      * NOTE(review): this listing omits line 2531 (the function name and
      * parameter list); confirm it against the real source.
2529  */
2530 void
2532 {
2533  Index i;
2534 
2535  if (SizeVfdCache > 0)
2536  {
2537  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
      /* Entry 0 is never open (asserted above), so scanning starts at 1 */
2538  for (i = 1; i < SizeVfdCache; i++)
2539  {
2540  if (!FileIsNotOpen(i))
2541  LruDelete(i);
2542  }
2543  }
2544 }
2545 
2546 
2547 /*
2548  * SetTempTablespaces
2549  *
2550  * Define a list (actually an array) of OIDs of tablespaces to use for
2551  * temporary files. This list will be used until end of transaction,
2552  * unless this function is called again before then. It is caller's
2553  * responsibility that the passed-in array has adequate lifespan (typically
2554  * it'd be allocated in TopTransactionContext).
2555  */
2556 void
2557 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2558 {
2559  Assert(numSpaces >= 0);
2560  tempTableSpaces = tableSpaces;
2561  numTempTableSpaces = numSpaces;
2562 
2563  /*
2564  * Select a random starting point in the list. This is to minimize
2565  * conflicts between backends that are most likely sharing the same list
2566  * of temp tablespaces. Note that if we create multiple temp files in the
2567  * same transaction, we'll advance circularly through the list --- this
2568  * ensures that large temporary sort files are nicely spread across all
2569  * available tablespaces.
2570  */
2571  if (numSpaces > 1)
2572  nextTempTableSpace = random() % numSpaces;
2573  else
2574  nextTempTableSpace = 0;
2575 }
2576 
2577 /*
2578  * TempTablespacesAreSet
2579  *
2580  * Returns TRUE if SetTempTablespaces has been called in current transaction.
2581  * (This is just so that tablespaces.c doesn't need its own per-transaction
2582  * state.)
      *
      * The test is numTempTableSpaces >= 0, so a list set to zero entries
      * still counts as "set".
      *
      * NOTE(review): this listing omits line 2585 (the function name and
      * parameter list); confirm it against the real source.
2583  */
2584 bool
2586 {
2587  return (numTempTableSpaces >= 0);
2588 }
2589 
2590 /*
2591  * GetNextTempTableSpace
2592  *
2593  * Select the next temp tablespace to use. A result of InvalidOid means
2594  * to use the current database's default tablespace.
      *
      * InvalidOid is returned whenever numTempTableSpaces is not positive.
      *
      * NOTE(review): this listing omits line 2597 (the function name and
      * parameter list) and lines 2602 and 2604 inside the if-block, so the
      * wraparound condition and the value returned for a non-empty list are
      * not visible here; confirm them against the real source.
2595  */
2596 Oid
2598 {
2599  if (numTempTableSpaces > 0)
2600  {
2601  /* Advance nextTempTableSpace counter with wraparound */
2603  nextTempTableSpace = 0;
2605  }
2606  return InvalidOid;
2607 }
2608 
2609 
2610 /*
2611  * AtEOSubXact_Files
2612  *
2613  * Take care of subtransaction commit/abort. At abort, we close temp files
2614  * that the subtransaction may have opened. At commit, we reassign the
2615  * files that were opened to the parent subtransaction.
2616  */
2617 void
2618 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2619  SubTransactionId parentSubid)
2620 {
2621  Index i;
2622 
2623  for (i = 0; i < numAllocatedDescs; i++)
2624  {
2625  if (allocatedDescs[i].create_subid == mySubid)
2626  {
2627  if (isCommit)
2628  allocatedDescs[i].create_subid = parentSubid;
2629  else
2630  {
2631  /* have to recheck the item after FreeDesc (ugly) */
2632  FreeDesc(&allocatedDescs[i--]);
2633  }
2634  }
2635  }
2636 }
2637 
2638 /*
2639  * AtEOXact_Files
2640  *
2641  * This routine is called during transaction commit or abort (it doesn't
2642  * particularly care which). All still-open per-transaction temporary file
2643  * VFDs are closed, which also causes the underlying files to be deleted
2644  * (although they should've been closed already by the ResourceOwner
2645  * cleanup). Furthermore, all "allocated" stdio files are closed. We also
2646  * forget any transaction-local temp tablespace list.
      *
      * NOTE(review): this listing omits line 2649 (the function name and
      * parameter list); confirm it against the real source.
2647  */
2648 void
2650 {
      /* false: not process exit, so only transaction-local temp files go */
2651  CleanupTempFiles(false);
      /* Resetting the count to -1 makes TempTablespacesAreSet() return false */
2652  tempTableSpaces = NULL;
2653  numTempTableSpaces = -1;
2654 }
2655 
2656 /*
2657  * AtProcExit_Files
2658  *
2659  * on_proc_exit hook to clean up temp files during backend shutdown.
2660  * Here, we want to clean up *all* temp files including interXact ones.
      *
      * NOTE(review): this listing omits line 2663 (the function name and
      * parameter list); confirm it against the real source.
2661  */
2662 static void
2664 {
      /* true: process exit, so every temporary file is closed */
2665  CleanupTempFiles(true);
2666 }
2667 
2668 /*
2669  * Close temporary files and delete their underlying files.
2670  *
2671  * isProcExit: if true, this is being called as the backend process is
2672  * exiting. If that's the case, we should remove all temporary files; if
2673  * that's not the case, we are being called for transaction commit/abort
2674  * and should only remove transaction-local temp files. In either case,
2675  * also clean up "allocated" stdio files, dirs and fds.
2676  */
2677 static void
2678 CleanupTempFiles(bool isProcExit)
2679 {
2680  Index i;
2681 
2682  /*
2683  * Careful here: at proc_exit we need extra cleanup, not just
2684  * xact_temporary files.
2685  */
2686  if (isProcExit || have_xact_temporary_files)
2687  {
2688  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2689  for (i = 1; i < SizeVfdCache; i++)
2690  {
2691  unsigned short fdstate = VfdCache[i].fdstate;
2692 
2693  if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL)
2694  {
2695  /*
2696  * If we're in the process of exiting a backend process, close
2697  * all temporary files. Otherwise, only close temporary files
2698  * local to the current transaction. They should be closed by
2699  * the ResourceOwner mechanism already, so this is just a
2700  * debugging cross-check.
2701  */
2702  if (isProcExit)
2703  FileClose(i);
2704  else if (fdstate & FD_XACT_TEMPORARY)
2705  {
2706  elog(WARNING,
2707  "temporary file %s not closed at end-of-transaction",
2708  VfdCache[i].fileName);
2709  FileClose(i);
2710  }
2711  }
2712  }
2713 
2714  have_xact_temporary_files = false;
2715  }
2716 
2717  /* Clean up "allocated" stdio files, dirs and fds. */
2718  while (numAllocatedDescs > 0)
2719  FreeDesc(&allocatedDescs[0]);
2720 }
2721 
2722 
2723 /*
2724  * Remove temporary and temporary relation files left over from a prior
2725  * postmaster session
2726  *
2727  * This should be called during postmaster startup. It will forcibly
2728  * remove any leftover files created by OpenTemporaryFile and any leftover
2729  * temporary relation files created by mdcreate.
2730  *
2731  * NOTE: we could, but don't, call this during a post-backend-crash restart
2732  * cycle. The argument for not doing it is that someone might want to examine
2733  * the temp files for debugging purposes. This does however mean that
2734  * OpenTemporaryFile had better allow for collision with an existing temp
2735  * file name.
      *
      * NOTE(review): this listing omits line 2738 (the function name and
      * parameter list), lines 2763 and 2767 (the argument lines completing the
      * two snprintf calls in the loop), and line 2778 (the statement inside
      * the EXEC_BACKEND block); confirm them against the real source.
2736  */
2737 void
2739 {
2740  char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
2741  DIR *spc_dir;
2742  struct dirent *spc_de;
2743 
2744  /*
2745  * First process temp files in pg_default ($PGDATA/base)
2746  */
2747  snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
2748  RemovePgTempFilesInDir(temp_path);
2749  RemovePgTempRelationFiles("base");
2750 
2751  /*
2752  * Cycle through temp directories for all non-default tablespaces.
2753  */
2754  spc_dir = AllocateDir("pg_tblspc");
2755 
      /* ReadDir reports an AllocateDir failure itself if spc_dir is NULL */
2756  while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL)
2757  {
2758  if (strcmp(spc_de->d_name, ".") == 0 ||
2759  strcmp(spc_de->d_name, "..") == 0)
2760  continue;
2761 
2762  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
2764  RemovePgTempFilesInDir(temp_path);
2765 
2766  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
2768  RemovePgTempRelationFiles(temp_path);
2769  }
2770 
2771  FreeDir(spc_dir);
2772 
2773  /*
2774  * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
2775  * DataDir as well.
2776  */
2777 #ifdef EXEC_BACKEND
2779 #endif
2780 }
2781 
2782 /* Process one pgsql_tmp directory for RemovePgTempFiles */
     /*
      * Entries whose names start with PG_TEMP_FILE_PREFIX are unlinked; any
      * other entry is only logged.  A directory that cannot be opened is
      * skipped, with a LOG message unless the failure is ENOENT.
      *
      * NOTE(review): this listing omits line 2811 (the second argument of the
      * strncmp call); confirm it against the real source.
      */
2783 static void
2784 RemovePgTempFilesInDir(const char *tmpdirname)
2785 {
2786  DIR *temp_dir;
2787  struct dirent *temp_de;
2788  char rm_path[MAXPGPATH * 2];
2789 
2790  temp_dir = AllocateDir(tmpdirname);
2791  if (temp_dir == NULL)
2792  {
2793  /* anything except ENOENT is fishy */
2794  if (errno != ENOENT)
2795  elog(LOG,
2796  "could not open temporary-files directory \"%s\": %m",
2797  tmpdirname);
2798  return;
2799  }
2800 
2801  while ((temp_de = ReadDir(temp_dir, tmpdirname)) != NULL)
2802  {
2803  if (strcmp(temp_de->d_name, ".") == 0 ||
2804  strcmp(temp_de->d_name, "..") == 0)
2805  continue;
2806 
2807  snprintf(rm_path, sizeof(rm_path), "%s/%s",
2808  tmpdirname, temp_de->d_name);
2809 
2810  if (strncmp(temp_de->d_name,
2812  strlen(PG_TEMP_FILE_PREFIX)) == 0)
2813  unlink(rm_path); /* note we ignore any error */
2814  else
2815  elog(LOG,
2816  "unexpected file found in temporary-files directory: \"%s\"",
2817  rm_path);
2818  }
2819 
2820  FreeDir(temp_dir);
2821 }
2822 
2823 /* Process one tablespace directory, look for per-DB subdirectories */
2824 static void
2825 RemovePgTempRelationFiles(const char *tsdirname)
2826 {
2827  DIR *ts_dir;
2828  struct dirent *de;
2829  char dbspace_path[MAXPGPATH * 2];
2830 
2831  ts_dir = AllocateDir(tsdirname);
2832  if (ts_dir == NULL)
2833  {
2834  /* anything except ENOENT is fishy */
2835  if (errno != ENOENT)
2836  elog(LOG,
2837  "could not open tablespace directory \"%s\": %m",
2838  tsdirname);
2839  return;
2840  }
2841 
2842  while ((de = ReadDir(ts_dir, tsdirname)) != NULL)
2843  {
2844  int i = 0;
2845 
2846  /*
2847  * We're only interested in the per-database directories, which have
2848  * numeric names. Note that this code will also (properly) ignore "."
2849  * and "..".
2850  */
2851  while (isdigit((unsigned char) de->d_name[i]))
2852  ++i;
2853  if (de->d_name[i] != '\0' || i == 0)
2854  continue;
2855 
2856  snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
2857  tsdirname, de->d_name);
2858  RemovePgTempRelationFilesInDbspace(dbspace_path);
2859  }
2860 
2861  FreeDir(ts_dir);
2862 }
2863 
2864 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
2865 static void
2866 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
2867 {
2868  DIR *dbspace_dir;
2869  struct dirent *de;
2870  char rm_path[MAXPGPATH * 2];
2871 
2872  dbspace_dir = AllocateDir(dbspacedirname);
2873  if (dbspace_dir == NULL)
2874  {
2875  /* we just saw this directory, so it really ought to be there */
2876  elog(LOG,
2877  "could not open dbspace directory \"%s\": %m",
2878  dbspacedirname);
2879  return;
2880  }
2881 
2882  while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
2883  {
2884  if (!looks_like_temp_rel_name(de->d_name))
2885  continue;
2886 
2887  snprintf(rm_path, sizeof(rm_path), "%s/%s",
2888  dbspacedirname, de->d_name);
2889 
2890  unlink(rm_path); /* note we ignore any error */
2891  }
2892 
2893  FreeDir(dbspace_dir);
2894 }
2895 
2896 /* t<digits>_<digits>, or t<digits>_<digits>_<forkname> */
     /*
      * Either form may additionally be followed by ".<digits>" (a segment
      * suffix).  Returns true only if the whole name matches.
      *
      * NOTE(review): this listing omits line 2898 (the function name and
      * parameter list); the body uses a parameter named name.
      */
2897 static bool
2899 {
2900  int pos;
2901  int savepos;
2902 
2903  /* Must start with "t". */
2904  if (name[0] != 't')
2905  return false;
2906 
2907  /* Followed by a non-empty string of digits and then an underscore. */
2908  for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
2909  ;
2910  if (pos == 1 || name[pos] != '_')
2911  return false;
2912 
2913  /* Followed by another nonempty string of digits. */
2914  for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
2915  ;
2916  if (savepos == pos)
2917  return false;
2918 
2919  /* We might have _forkname or .segment or both. */
2920  if (name[pos] == '_')
2921  {
2922  int forkchar = forkname_chars(&name[pos + 1], NULL);
2923 
2924  if (forkchar <= 0)
2925  return false;
      /* skip the underscore plus the fork name characters */
2926  pos += forkchar + 1;
2927  }
2928  if (name[pos] == '.')
2929  {
2930  int segchar;
2931 
      /* segchar starts at 1 to step over the '.', so <= 1 means no digits */
2932  for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
2933  ;
2934  if (segchar <= 1)
2935  return false;
2936  pos += segchar;
2937  }
2938 
2939  /* Now we should be at the end. */
2940  if (name[pos] != '\0')
2941  return false;
2942  return true;
2943 }
2944 
2945 
2946 /*
2947  * Issue fsync recursively on PGDATA and all its contents.
2948  *
2949  * We fsync regular files and directories wherever they are, but we
2950  * follow symlinks only for pg_wal and immediately under pg_tblspc.
2951  * Other symlinks are presumed to point at files we're not responsible
2952  * for fsyncing, and might not have privileges to write at all.
2953  *
2954  * Errors are logged but not considered fatal; that's because this is used
2955  * only during database startup, to deal with the possibility that there are
2956  * issued-but-unsynced writes pending against the data directory. We want to
2957  * ensure that such writes reach disk before anything that's done in the new
2958  * run. However, aborting on error would result in failure to start for
2959  * harmless cases such as read-only files in the data directory, and that's
2960  * not good either.
2961  *
2962  * Note we assume we're chdir'd into PGDATA to begin with.
      *
      * NOTE(review): this listing omits line 2965 (the function name and
      * parameter list) and line 2985 (the first argument line of the ereport
      * call after lstat); confirm them against the real source.
2963  */
2964 void
2966 {
2967  bool xlog_is_symlink;
2968 
2969  /* We can skip this whole thing if fsync is disabled. */
2970  if (!enableFsync)
2971  return;
2972 
2973  /*
2974  * If pg_wal is a symlink, we'll need to recurse into it separately,
2975  * because the first walkdir below will ignore it.
2976  */
2977  xlog_is_symlink = false;
2978 
2979 #ifndef WIN32
2980  {
2981  struct stat st;
2982 
      /* A failed lstat is only logged; pg_wal is then treated as not a symlink */
2983  if (lstat("pg_wal", &st) < 0)
2984  ereport(LOG,
2986  errmsg("could not stat file \"%s\": %m",
2987  "pg_wal")));
2988  else if (S_ISLNK(st.st_mode))
2989  xlog_is_symlink = true;
2990  }
2991 #else
2992  if (pgwin32_is_junction("pg_wal"))
2993  xlog_is_symlink = true;
2994 #endif
2995 
2996  /*
2997  * If possible, hint to the kernel that we're soon going to fsync the data
2998  * directory and its contents. Errors in this step are even less
2999  * interesting than normal, so log them only at DEBUG1.
3000  */
3001 #ifdef PG_FLUSH_DATA_WORKS
3002  walkdir(".", pre_sync_fname, false, DEBUG1);
3003  if (xlog_is_symlink)
3004  walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3005  walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3006 #endif
3007 
3008  /*
3009  * Now we do the fsync()s in the same order.
3010  *
3011  * The main call ignores symlinks, so in addition to specially processing
3012  * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3013  * process_symlinks = true. Note that if there are any plain directories
3014  * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3015  * so we don't worry about optimizing it.
3016  */
3017  walkdir(".", datadir_fsync_fname, false, LOG);
3018  if (xlog_is_symlink)
3019  walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3020  walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3021 }
3022 
3023 /*
3024  * walkdir: recursively walk a directory, applying the action to each
3025  * regular file and directory (including the named directory itself).
3026  *
3027  * If process_symlinks is true, the action and recursion are also applied
3028  * to regular files and directories that are pointed to by symlinks in the
3029  * given directory; otherwise symlinks are ignored. Symlinks are always
3030  * ignored in subdirectories, ie we intentionally don't pass down the
3031  * process_symlinks flag to recursive calls.
3032  *
3033  * Errors are reported at level elevel, which might be ERROR or less.
3034  *
3035  * See also walkdir in initdb.c, which is a frontend version of this logic.
      *
      * NOTE(review): this listing omits line 3050 and line 3077 (the first
      * argument line of two ereport calls) and line 3061 (a statement at the
      * top of the loop body); confirm them against the real source.
3036  */
3037 static void
3038 walkdir(const char *path,
3039  void (*action) (const char *fname, bool isdir, int elevel),
3040  bool process_symlinks,
3041  int elevel)
3042 {
3043  DIR *dir;
3044  struct dirent *de;
3045 
3046  dir = AllocateDir(path);
3047  if (dir == NULL)
3048  {
3049  ereport(elevel,
3051  errmsg("could not open directory \"%s\": %m", path)));
3052  return;
3053  }
3054 
3055  while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3056  {
3057  char subpath[MAXPGPATH * 2];
3058  struct stat fst;
3059  int sret;
3060 
3062 
3063  if (strcmp(de->d_name, ".") == 0 ||
3064  strcmp(de->d_name, "..") == 0)
3065  continue;
3066 
3067  snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3068 
      /* stat() follows a symlink to its target; lstat() examines the link itself */
3069  if (process_symlinks)
3070  sret = stat(subpath, &fst);
3071  else
3072  sret = lstat(subpath, &fst);
3073 
3074  if (sret < 0)
3075  {
3076  ereport(elevel,
3078  errmsg("could not stat file \"%s\": %m", subpath)));
3079  continue;
3080  }
3081 
      /* Anything that is neither a regular file nor a directory is skipped */
3082  if (S_ISREG(fst.st_mode))
3083  (*action) (subpath, false, elevel);
3084  else if (S_ISDIR(fst.st_mode))
3085  walkdir(subpath, action, false, elevel);
3086  }
3087 
3088  FreeDir(dir); /* we ignore any error here */
3089 
3090  /*
3091  * It's important to fsync the destination directory itself as individual
3092  * file fsyncs don't guarantee that the directory entry for the file is
3093  * synced.
3094  */
3095  (*action) (path, true, elevel);
3096 }
3097 
3098 
3099 /*
3100  * Hint to the OS that it should get ready to fsync() this file.
3101  *
3102  * Ignores errors trying to open unreadable files, and logs other errors at a
3103  * caller-specified level.
      *
      * Only compiled when PG_FLUSH_DATA_WORKS is defined.  Directories
      * (isdir true) are skipped without doing anything.
      *
      * NOTE(review): this listing omits line 3123 (the first argument line of
      * the ereport call); confirm it against the real source.
3104  */
3105 #ifdef PG_FLUSH_DATA_WORKS
3106 
3107 static void
3108 pre_sync_fname(const char *fname, bool isdir, int elevel)
3109 {
3110  int fd;
3111 
3112  /* Don't try to flush directories, it'll likely just fail */
3113  if (isdir)
3114  return;
3115 
3116  fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);
3117 
3118  if (fd < 0)
3119  {
      /* EACCES (unreadable file) is silently ignored */
3120  if (errno == EACCES)
3121  return;
3122  ereport(elevel,
3124  errmsg("could not open file \"%s\": %m", fname)));
3125  return;
3126  }
3127 
3128  /*
3129  * pg_flush_data() ignores errors, which is ok because this is only a
3130  * hint.
3131  */
3132  pg_flush_data(fd, 0, 0);
3133 
3134  (void) CloseTransientFile(fd);
3135 }
3136 
3137 #endif /* PG_FLUSH_DATA_WORKS */
3138 
/*
 * datadir_fsync_fname --- fsync one file or directory, in the form of an
 * action callback with the (fname, isdir, elevel) signature.
 *
 * Delegates to fsync_fname_ext() and discards its result.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * Errors about unreadable files should be ignored silently here, so ask
	 * fsync_fname_ext() for that by passing ignore_perm = true.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3148 
3149 /*
3150  * fsync_fname_ext -- Try to fsync a file or directory
3151  *
3152  * If ignore_perm is true, ignore errors upon trying to open unreadable
3153  * files. Logs other errors at a caller-specified level.
3154  *
3155  * Returns 0 if the operation succeeded, -1 otherwise.
      *
      * Failures that are deliberately ignored (see the comments in the body)
      * also yield 0.
      *
      * NOTE(review): this listing omits lines 3190 and 3211 (the first
      * argument line of two ereport calls); confirm them against the real
      * source.
3156  */
3157 static int
3158 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3159 {
3160  int fd;
3161  int flags;
3162  int returncode;
3163 
3164  /*
3165  * Some OSs require directories to be opened read-only whereas other
3166  * systems don't allow us to fsync files opened read-only; so we need both
3167  * cases here. Using O_RDWR will cause us to fail to fsync files that are
3168  * not writable by our userid, but we assume that's OK.
3169  */
3170  flags = PG_BINARY;
3171  if (!isdir)
3172  flags |= O_RDWR;
3173  else
3174  flags |= O_RDONLY;
3175 
3176  fd = OpenTransientFile(fname, flags);
3177 
3178  /*
3179  * Some OSs don't allow us to open directories at all (Windows returns
3180  * EACCES), just ignore the error in that case. If desired, also silently
3181  * ignore errors about unreadable files. Log others.
3182  */
3183  if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3184  return 0;
3185  else if (fd < 0 && ignore_perm && errno == EACCES)
3186  return 0;
3187  else if (fd < 0)
3188  {
3189  ereport(elevel,
3191  errmsg("could not open file \"%s\": %m", fname)));
3192  return -1;
3193  }
3194 
3195  returncode = pg_fsync(fd);
3196 
3197  /*
3198  * Some OSes don't allow us to fsync directories at all, so we can ignore
3199  * those errors. Anything else needs to be logged.
3200  */
3201  if (returncode != 0 && !(isdir && errno == EBADF))
3202  {
3203  int save_errno;
3204 
3205  /* close file upon error, might not be in transaction context */
      /* (errno is saved and restored around the close so %m reports the fsync failure) */
3206  save_errno = errno;
3207  (void) CloseTransientFile(fd);
3208  errno = save_errno;
3209 
3210  ereport(elevel,
3212  errmsg("could not fsync file \"%s\": %m", fname)));
3213  return -1;
3214  }
3215 
3216  (void) CloseTransientFile(fd);
3217 
3218  return 0;
3219 }
3220 
3221 /*
3222  * fsync_parent_path -- fsync the parent path of a file or directory
3223  *
3224  * This is aimed at making file operations persistent on disk in case of
3225  * an OS crash or power failure.
3226  */
3227 static int
3228 fsync_parent_path(const char *fname, int elevel)
3229 {
3230  char parentpath[MAXPGPATH];
3231 
3232  strlcpy(parentpath, fname, MAXPGPATH);
3233  get_parent_directory(parentpath);
3234 
3235  /*
3236  * get_parent_directory() returns an empty string if the input argument is
3237  * just a file name (see comments in path.c), so handle that as being the
3238  * current directory.
3239  */
3240  if (strlen(parentpath) == 0)
3241  strlcpy(parentpath, ".", MAXPGPATH);
3242 
3243  if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3244  return -1;
3245 
3246  return 0;
3247 }
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1328
File lruLessRecently
Definition: fd.c:188
void closeAllVfds(void)
Definition: fd.c:2531
File nextFree
Definition: fd.c:186
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition: fd.c:802
#define MAP_FAILED
Definition: mem.h:45
#define DEBUG1
Definition: elog.h:25
int MyProcPid
Definition: globals.c:39
#define NUM_RESERVED_FDS
Definition: fd.c:111
static AllocateDesc * allocatedDescs
Definition: fd.c:250
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1315
int pg_fdatasync(int fd)
Definition: fd.c:390
static void error(void)
Definition: sql-dyntest.c:147
#define SYNC_METHOD_FSYNC_WRITETHROUGH
Definition: xlog.h:28
AllocateDescKind
Definition: fd.c:228
DIR * dir
Definition: fd.c:243
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition: fd.c:1452
static void AtProcExit_Files(int code, Datum arg)
Definition: fd.c:2663
#define write(a, b, c)
Definition: win32.h:14
static Size SizeVfdCache
Definition: fd.c:203
void on_proc_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:292
#define DO_DB(A)
Definition: fd.c:157
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition: fd.c:3038
long random(void)
Definition: random.c:22
#define mkdir(a, b)
Definition: win32.h:57
ResourceOwner CurrentResourceOwner
Definition: resowner.c:138
int pg_fsync_writethrough(int fd)
Definition: fd.c:367
int forkname_chars(const char *str, ForkNumber *fork)
Definition: relpath.c:79
int max_safe_fds
Definition: fd.c:144
#define Min(x, y)
Definition: c.h:812
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:572
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:2176
int log_temp_files
Definition: guc.c:454
mode_t FileGetRawMode(File file)
Definition: fd.c:2033
#define GLOBALTABLESPACE_OID
Definition: pg_tablespace.h:64
static Vfd * VfdCache
Definition: fd.c:202
static void Delete(File file)
Definition: fd.c:999
int closedir(DIR *)
Definition: dirent.c:111
static int numTempTableSpaces
Definition: fd.c:263
int errcode(int sqlerrcode)
Definition: elog.c:575
#define MemSet(start, val, len)
Definition: c.h:863
int pg_fsync_no_writethrough(int fd)
Definition: fd.c:355
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition: fd.c:2866
int snprintf(char *str, size_t count, const char *fmt,...) pg_attribute_printf(3
#define PG_TEMP_FILE_PREFIX
Definition: fd.h:129
void pgstat_report_tempfile(size_t filesize)
Definition: pgstat.c:1525
static bool reserveAllocatedDesc(void)
Definition: fd.c:2044
uint32 SubTransactionId
Definition: c.h:395
#define LOG
Definition: elog.h:26
static void RemovePgTempFilesInDir(const char *tmpdirname)
Definition: fd.c:2784
unsigned int Oid
Definition: postgres_ext.h:31
#define FilePosIsUnknown(pos)
Definition: fd.c:175
AllocateDescKind kind
Definition: fd.c:238
char * FilePathName(File file)
Definition: fd.c:1997
Definition: dirent.h:9
#define OidIsValid(objectId)
Definition: c.h:532
static int fd(const char *x, int i)
Definition: preproc-init.c:105
#define PG_BINARY
Definition: c.h:1044
Oid MyDatabaseTableSpace
Definition: globals.c:79
int ClosePipeStream(FILE *file)
Definition: fd.c:2502
#define malloc(a)
Definition: header.h:50
static void LruDelete(File file)
Definition: fd.c:1018
void pg_usleep(long microsec)
Definition: signal.c:53
bool TempTablespacesAreSet(void)
Definition: fd.c:2585
static int FreeDesc(AllocateDesc *desc)
Definition: fd.c:2270
void pfree(void *pointer)
Definition: mcxt.c:949
mode_t fileMode
Definition: fd.c:194
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition: fd.c:2825
static bool ReleaseLruFile(void)
Definition: fd.c:1155
Definition: dirent.c:25
#define ERROR
Definition: elog.h:43
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2167
static int LruInsert(File file)
Definition: fd.c:1081
void AtEOXact_Files(void)
Definition: fd.c:2649
#define FATAL
Definition: elog.h:52
static bool have_xact_temporary_files
Definition: fd.c:214
#define MAXPGPATH
DIR * opendir(const char *)
Definition: dirent.c:33
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:1853
#define DEBUG2
Definition: elog.h:24
char * fileName
Definition: fd.c:191
static struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2443
#define PG_FILE_MODE_DEFAULT
Definition: fd.c:123
static char * buf
Definition: pg_test_fsync.c:67
Oid GetNextTempTableSpace(void)
Definition: fd.c:2597
void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition: resowner.c:1186
static void CleanupTempFiles(bool isProcExit)
Definition: fd.c:2678
#define DEFAULTTABLESPACE_OID
Definition: pg_tablespace.h:63
int errdetail(const char *fmt,...)
Definition: elog.c:873
int errcode_for_file_access(void)
Definition: elog.c:598
void get_parent_directory(char *path)
Definition: path.c:854
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2117
static int nfile
Definition: fd.c:208
unsigned int uint32
Definition: c.h:258
void SyncDataDirectory(void)
Definition: fd.c:2965
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2367
static int nextTempTableSpace
Definition: fd.c:264
int FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
Definition: fd.c:1732
static void pgstat_report_wait_end(void)
Definition: pgstat.h:1244
int max_files_per_process
Definition: fd.c:131
static File AllocateVfd(void)
Definition: fd.c:1187
FILE * OpenPipeStream(const char *command, const char *mode)
Definition: fd.c:2216
off_t seekPos
Definition: fd.c:189
unsigned short fdstate
Definition: fd.c:184
Definition: fd.c:181
off_t fileSize
Definition: fd.c:190
int fd
Definition: fd.c:183
#define ereport(elevel, rest)
Definition: elog.h:122
int FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
Definition: fd.c:1668
int link(const char *fromname, const char *toname)
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2557
#define fsync(fd)
Definition: win32.h:62
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:598
static void Insert(File file)
Definition: fd.c:1059
ResourceOwner resowner
Definition: fd.c:185
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3140
int CloseTransientFile(int fd)
Definition: fd.c:2337
static void ReleaseLruFiles(void)
Definition: fd.c:1177
#define WARNING
Definition: elog.h:40
#define FileIsNotOpen(file)
Definition: fd.c:166
static int elevel
Definition: vacuumlazy.c:136
#define FD_TEMPORARY
Definition: fd.c:178
#define FD_XACT_TEMPORARY
Definition: fd.c:179
struct vfd Vfd
uintptr_t Datum
Definition: postgres.h:372
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition: fd.c:2618
#define EINTR
Definition: win32.h:285
unsigned int Index
Definition: c.h:359
void pg_flush_data(int fd, off_t offset, off_t nbytes)
Definition: fd.c:412
#define FileIsValid(file)
Definition: fd.c:163
FILE * file
Definition: fd.c:242
static bool looks_like_temp_rel_name(const char *name)
Definition: fd.c:2898
#define InvalidOid
Definition: postgres_ext.h:36
#define VFD_CLOSED
Definition: fd.c:161
static uint64 temporary_files_size
Definition: fd.c:222
#define free(a)
Definition: header.h:65
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
#define ftruncate(a, b)
Definition: win32.h:59
#define PG_TEMP_FILES_DIR
Definition: fd.h:128
void FileClose(File file)
Definition: fd.c:1516
int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
Definition: fd.c:1613
static int FileAccess(File file)
Definition: fd.c:1265
#define Assert(condition)
Definition: c.h:681
void _dosmaperr(unsigned long)
Definition: win32error.c:171
SubTransactionId GetCurrentSubTransactionId(void)
Definition: xact.c:642
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2433
File lruMoreRecently
Definition: fd.c:187
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:1641
void RemovePgTempFiles(void)
Definition: fd.c:2738
SubTransactionId create_subid
Definition: fd.c:239
WalTimeSample buffer[LAG_TRACKER_BUFFER_SIZE]
Definition: walsender.c:214
File OpenTemporaryFile(bool interXact)
Definition: fd.c:1396
int durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:717
size_t Size
Definition: c.h:350
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: pgstat.h:1220
int sync_method
Definition: xlog.c:103
struct dirent * readdir(DIR *)
Definition: dirent.c:77
#define FD_MINFREE
Definition: fd.c:117
#define TABLESPACE_VERSION_DIRECTORY
Definition: catalog.h:26
#define realloc(a, b)
Definition: header.h:60
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:962
#define INT64_FORMAT
Definition: c.h:300
const char * name
Definition: encode.c:521
static long tempFileCounter
Definition: fd.c:256
int fd
Definition: fd.c:244
int durable_unlink(const char *fname, int elevel)
Definition: fd.c:681
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:929
int FreeFile(FILE *file)
Definition: fd.c:2309
void set_max_safe_fds(void)
Definition: fd.c:886
bool enableFsync
Definition: globals.c:111
static Oid * tempTableSpaces
Definition: fd.c:262
void * palloc(Size size)
Definition: mcxt.c:848
int errmsg(const char *fmt,...)
Definition: elog.c:797
int FileGetRawFlags(File file)
Definition: fd.c:2023
void ResourceOwnerEnlargeFiles(ResourceOwner owner)
Definition: resowner.c:1175
static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition: fd.c:3158
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:951
union AllocateDesc::@27 desc
int i
void * arg
int FileGetRawDesc(File file)
Definition: fd.c:2013
static void FreeVfd(File file)
Definition: fd.c:1245
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:98
int pg_fsync(int fd)
Definition: fd.c:338
char d_name[MAX_PATH]
Definition: dirent.h:14
#define elog
Definition: elog.h:219
#define close(a)
Definition: win32.h:12
int fileFlags
Definition: fd.c:193
off_t FileSeek(File file, off_t offset, int whence)
Definition: fd.c:1874
#define lstat(path, sb)
Definition: win32.h:262
void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition: resowner.c:1195
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:1962
#define FileUnknownPos
Definition: fd.c:174
static int maxAllocatedDescs
Definition: fd.c:249
static int fsync_parent_path(const char *fname, int elevel)
Definition: fd.c:3228
int File
Definition: fd.h:49
#define read(a, b, c)
Definition: win32.h:13
int FreeDir(DIR *dir)
Definition: fd.c:2476
int temp_file_limit
Definition: guc.c:457
Datum subpath(PG_FUNCTION_ARGS)
Definition: ltree_op.c:234
void InitFileAccess(void)
Definition: fd.c:769
static int numAllocatedDescs
Definition: fd.c:248