PostgreSQL Source Code  git master
fd.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  * Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have. (This is around 256 on many modern
20  * operating systems, but can be as low as 32 on others.)
21  *
22  * VFDs are managed as an LRU pool, with actual OS file descriptors
23  * being opened and closed as needed. Obviously, if a file is
24  * opened using these interfaces, all subsequent operations must also
25  * be through these interfaces (the File type is not a real file
26  * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
39  * for a long time, like relation files. It is the caller's responsibility
40  * to close them, there is no automatic mechanism in fd.c for that.
41  *
42  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43  * temporary files that have names so that they can be shared between
44  * backends. Such files are automatically closed and count against the
45  * temporary file limit of the backend that creates them, but unlike anonymous
46  * files they are not automatically deleted. See sharedfileset.c for a shared
47  * ownership mechanism that provides automatic cleanup for shared files when
48  * the last of a group of backends detaches.
49  *
50  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52  * They behave like the corresponding native functions, except that the handle
53  * is registered with the current subtransaction, and will be automatically
54  * closed at abort. These are intended mainly for short operations like
55  * reading a configuration file; there is a limit on the number of files that
56  * can be opened using these functions at any one time.
57  *
58  * Finally, BasicOpenFile is just a thin wrapper around open() that can
59  * release file descriptors in use by the virtual file descriptors if
60  * necessary. There is no automatic cleanup of file descriptors returned by
61  * BasicOpenFile, it is solely the caller's responsibility to close the file
62  * descriptor by calling close(2).
63  *
64  *-------------------------------------------------------------------------
65  */
66 
67 #include "postgres.h"
68 
69 #include <sys/file.h>
70 #include <sys/param.h>
71 #include <sys/stat.h>
72 #ifndef WIN32
73 #include <sys/mman.h>
74 #endif
75 #include <limits.h>
76 #include <unistd.h>
77 #include <fcntl.h>
78 #ifdef HAVE_SYS_RESOURCE_H
79 #include <sys/resource.h> /* for getrlimit */
80 #endif
81 
82 #include "miscadmin.h"
83 #include "access/xact.h"
84 #include "access/xlog.h"
85 #include "catalog/catalog.h"
86 #include "catalog/pg_tablespace.h"
87 #include "pgstat.h"
88 #include "portability/mem.h"
89 #include "storage/fd.h"
90 #include "storage/ipc.h"
91 #include "utils/guc.h"
92 #include "utils/resowner_private.h"
93 
94 
95 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
96 #if defined(HAVE_SYNC_FILE_RANGE)
97 #define PG_FLUSH_DATA_WORKS 1
98 #elif !defined(WIN32) && defined(MS_ASYNC)
99 #define PG_FLUSH_DATA_WORKS 1
100 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
101 #define PG_FLUSH_DATA_WORKS 1
102 #endif
103 
104 /*
105  * We must leave some file descriptors free for system(), the dynamic loader,
106  * and other code that tries to open files without consulting fd.c. This
107  * is the number left free. (While we can be pretty sure we won't get
108  * EMFILE, there's never any guarantee that we won't get ENFILE due to
109  * other processes chewing up FDs. So it's a bad idea to try to open files
110  * without consulting fd.c. Nonetheless we cannot control all code.)
111  *
112  * Because this is just a fixed setting, we are effectively assuming that
113  * no such code will leave FDs open over the long term; otherwise the slop
114  * is likely to be insufficient. Note in particular that we expect that
115  * loading a shared library does not result in any permanent increase in
116  * the number of open files. (This appears to be true on most if not
117  * all platforms as of Feb 2004.)
118  */
119 #define NUM_RESERVED_FDS 10
120 
121 /*
122  * If we have fewer than this many usable FDs after allowing for the reserved
123  * ones, choke.
124  */
125 #define FD_MINFREE 10
126 
127 /*
128  * Default mode for created files, unless something else is specified using
129  * the *Perm() function variants.
130  */
131 #define PG_FILE_MODE_DEFAULT (S_IRUSR | S_IWUSR)
132 
133 /*
134  * A number of platforms allow individual processes to open many more files
135  * than they can really support when *many* processes do the same thing.
136  * This GUC parameter lets the DBA limit max_safe_fds to something less than
137  * what the postmaster's initial probe suggests will work.
138  */
140 
141 /*
142  * Maximum number of file descriptors to open for either VFD entries or
143  * AllocateFile/AllocateDir/OpenTransientFile operations. This is initialized
144  * to a conservative value, and remains that way indefinitely in bootstrap or
145  * standalone-backend cases. In normal postmaster operation, the postmaster
146  * calls set_max_safe_fds() late in initialization to update the value, and
147  * that value is then inherited by forked subprocesses.
148  *
149  * Note: the value of max_files_per_process is taken into account while
150  * setting this variable, and so need not be tested separately.
151  */
152 int max_safe_fds = 32; /* default if not changed */
153 
154 
155 /* Debugging.... */
156 
157 #ifdef FDDEBUG
158 #define DO_DB(A) \
159  do { \
160  int _do_db_save_errno = errno; \
161  A; \
162  errno = _do_db_save_errno; \
163  } while (0)
164 #else
165 #define DO_DB(A) \
166  ((void) 0)
167 #endif
168 
169 #define VFD_CLOSED (-1)
170 
171 #define FileIsValid(file) \
172  ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
173 
174 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
175 
176 /*
177  * Note: a VFD's seekPos is normally always valid, but if for some reason
178  * an lseek() fails, it might become set to FileUnknownPos. We can struggle
179  * along without knowing the seek position in many cases, but in some places
180  * we have to fail if we don't have it.
181  */
182 #define FileUnknownPos ((off_t) -1)
183 #define FilePosIsUnknown(pos) ((pos) < 0)
184 
185 /* these are the assigned bits in fdstate below: */
186 #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
187 #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
188 #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
189 
190 typedef struct vfd
191 {
192  int fd; /* current FD, or VFD_CLOSED if none */
193  unsigned short fdstate; /* bitflags for VFD's state */
194  ResourceOwner resowner; /* owner, for automatic cleanup */
195  File nextFree; /* link to next free VFD, if in freelist */
196  File lruMoreRecently; /* doubly linked recency-of-use list */
198  off_t seekPos; /* current logical file position, or -1 */
199  off_t fileSize; /* current size of file (0 if not temporary) */
200  char *fileName; /* name of file, or NULL for unused VFD */
201  /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
202  int fileFlags; /* open(2) flags for (re)opening the file */
203  mode_t fileMode; /* mode to pass to open(2) */
204 } Vfd;
205 
206 /*
207  * Virtual File Descriptor array pointer and size. This grows as
208  * needed. 'File' values are indexes into this array.
209  * Note that VfdCache[0] is not a usable VFD, just a list header.
210  */
211 static Vfd *VfdCache;
212 static Size SizeVfdCache = 0;
213 
214 /*
215  * Number of file descriptors known to be in use by VFD entries.
216  */
217 static int nfile = 0;
218 
219 /*
220  * Flag to tell whether it's worth scanning VfdCache looking for temp files
221  * to close
222  */
223 static bool have_xact_temporary_files = false;
224 
225 /*
226  * Tracks the total size of all temporary files. Note: when temp_file_limit
227  * is being enforced, this cannot overflow since the limit cannot be more
228  * than INT_MAX kilobytes. When not enforcing, it could theoretically
229  * overflow, but we don't care.
230  */
231 static uint64 temporary_files_size = 0;
232 
233 /*
234  * List of OS handles opened with AllocateFile, AllocateDir and
235  * OpenTransientFile.
236  */
237 typedef enum
238 {
244 
245 typedef struct
246 {
249  union
250  {
251  FILE *file;
253  int fd;
254  } desc;
255 } AllocateDesc;
256 
257 static int numAllocatedDescs = 0;
258 static int maxAllocatedDescs = 0;
260 
261 /*
262  * Number of temporary files opened during the current session;
263  * this is used in generation of tempfile names.
264  */
265 static long tempFileCounter = 0;
266 
267 /*
268  * Array of OIDs of temp tablespaces. When numTempTableSpaces is -1,
269  * this has not been set in the current transaction.
270  */
271 static Oid *tempTableSpaces = NULL;
272 static int numTempTableSpaces = -1;
273 static int nextTempTableSpace = 0;
274 
275 
276 /*--------------------
277  *
278  * Private Routines
279  *
280  * Delete - delete a file from the Lru ring
281  * LruDelete - remove a file from the Lru ring and close its FD
282  * Insert - put a file at the front of the Lru ring
283  * LruInsert - put a file at the front of the Lru ring and open it
284  * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
285  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
286  * AllocateVfd - grab a free (or new) file record (from VfdCache)
287  * FreeVfd - free a file record
288  *
289  * The Least Recently Used ring is a doubly linked list that begins and
290  * ends on element zero. Element zero is special -- it doesn't represent
291  * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
292  * anchor that shows us the beginning/end of the ring.
293  * Only VFD elements that are currently really open (have an FD assigned) are
294  * in the Lru ring. Elements that are "virtually" open can be recognized
295  * by having a non-null fileName field.
296  *
297  * example:
298  *
299  * /--less----\ /---------\
300  * v \ v \
301  * #0 --more---> LeastRecentlyUsed --more-\ \
302  * ^\ | |
303  * \\less--> MostRecentlyUsedFile <---/ |
304  * \more---/ \--less--/
305  *
306  *--------------------
307  */
308 static void Delete(File file);
309 static void LruDelete(File file);
310 static void Insert(File file);
311 static int LruInsert(File file);
312 static bool ReleaseLruFile(void);
313 static void ReleaseLruFiles(void);
314 static File AllocateVfd(void);
315 static void FreeVfd(File file);
316 
317 static int FileAccess(File file);
318 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
319 static bool reserveAllocatedDesc(void);
320 static int FreeDesc(AllocateDesc *desc);
321 
322 static void AtProcExit_Files(int code, Datum arg);
323 static void CleanupTempFiles(bool isProcExit);
324 static void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok,
325  bool unlink_all);
326 static void RemovePgTempRelationFiles(const char *tsdirname);
327 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
328 static bool looks_like_temp_rel_name(const char *name);
329 
330 static void walkdir(const char *path,
331  void (*action) (const char *fname, bool isdir, int elevel),
332  bool process_symlinks,
333  int elevel);
334 #ifdef PG_FLUSH_DATA_WORKS
335 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
336 #endif
337 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
338 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
339 
340 static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
341 static int fsync_parent_path(const char *fname, int elevel);
342 
343 
344 /*
345  * pg_fsync --- do fsync with or without writethrough
346  */
347 int
349 {
350  /* #if is to skip the sync_method test if there's no need for it */
351 #if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
353  return pg_fsync_writethrough(fd);
354  else
355 #endif
356  return pg_fsync_no_writethrough(fd);
357 }
358 
359 
360 /*
361  * pg_fsync_no_writethrough --- same as fsync except does nothing if
362  * enableFsync is off
363  */
364 int
366 {
367  if (enableFsync)
368  return fsync(fd);
369  else
370  return 0;
371 }
372 
373 /*
374  * pg_fsync_writethrough
375  */
376 int
378 {
379  if (enableFsync)
380  {
381 #ifdef WIN32
382  return _commit(fd);
383 #elif defined(F_FULLFSYNC)
384  return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
385 #else
386  errno = ENOSYS;
387  return -1;
388 #endif
389  }
390  else
391  return 0;
392 }
393 
394 /*
395  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
396  *
397  * Not all platforms have fdatasync; treat as fsync if not available.
398  */
399 int
401 {
402  if (enableFsync)
403  {
404 #ifdef HAVE_FDATASYNC
405  return fdatasync(fd);
406 #else
407  return fsync(fd);
408 #endif
409  }
410  else
411  return 0;
412 }
413 
414 /*
415  * pg_flush_data --- advise OS that the described dirty data should be flushed
416  *
417  * offset of 0 with nbytes 0 means that the entire file should be flushed;
418  * in this case, this function may have side-effects on the file's
419  * seek position!
420  */
421 void
422 pg_flush_data(int fd, off_t offset, off_t nbytes)
423 {
424  /*
425  * Right now file flushing is primarily used to make later
426  * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
427  * if fsyncs are disabled - that's a decision we might want to make
428  * configurable at some point.
429  */
430  if (!enableFsync)
431  return;
432 
433  /*
434  * We compile all alternatives that are supported on the current platform,
435  * to find portability problems more easily.
436  */
437 #if defined(HAVE_SYNC_FILE_RANGE)
438  {
439  int rc;
440 
441  /*
442  * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
443  * tells the OS that writeback for the specified blocks should be
444  * started, but that we don't want to wait for completion. Note that
445  * this call might block if too much dirty data exists in the range.
446  * This is the preferable method on OSs supporting it, as it works
447  * reliably when available (contrast to msync()) and doesn't flush out
448  * clean data (like FADV_DONTNEED).
449  */
450  rc = sync_file_range(fd, offset, nbytes,
451  SYNC_FILE_RANGE_WRITE);
452 
453  /* don't error out, this is just a performance optimization */
454  if (rc != 0)
455  {
458  errmsg("could not flush dirty data: %m")));
459  }
460 
461  return;
462  }
463 #endif
464 #if !defined(WIN32) && defined(MS_ASYNC)
465  {
466  void *p;
467  static int pagesize = 0;
468 
469  /*
470  * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
471  * writeback. On linux it only does so if MS_SYNC is specified, but
472  * then it does the writeback synchronously. Luckily all common linux
473  * systems have sync_file_range(). This is preferable over
474  * FADV_DONTNEED because it doesn't flush out clean data.
475  *
476  * We map the file (mmap()), tell the kernel to sync back the contents
477  * (msync()), and then remove the mapping again (munmap()).
478  */
479 
480  /* mmap() needs actual length if we want to map whole file */
481  if (offset == 0 && nbytes == 0)
482  {
483  nbytes = lseek(fd, 0, SEEK_END);
484  if (nbytes < 0)
485  {
488  errmsg("could not determine dirty data size: %m")));
489  return;
490  }
491  }
492 
493  /*
494  * Some platforms reject partial-page mmap() attempts. To deal with
495  * that, just truncate the request to a page boundary. If any extra
496  * bytes don't get flushed, well, it's only a hint anyway.
497  */
498 
499  /* fetch pagesize only once */
500  if (pagesize == 0)
501  pagesize = sysconf(_SC_PAGESIZE);
502 
503  /* align length to pagesize, dropping any fractional page */
504  if (pagesize > 0)
505  nbytes = (nbytes / pagesize) * pagesize;
506 
507  /* fractional-page request is a no-op */
508  if (nbytes <= 0)
509  return;
510 
511  /*
512  * mmap could well fail, particularly on 32-bit platforms where there
513  * may simply not be enough address space. If so, silently fall
514  * through to the next implementation.
515  */
516  if (nbytes <= (off_t) SSIZE_MAX)
517  p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
518  else
519  p = MAP_FAILED;
520 
521  if (p != MAP_FAILED)
522  {
523  int rc;
524 
525  rc = msync(p, (size_t) nbytes, MS_ASYNC);
526  if (rc != 0)
527  {
530  errmsg("could not flush dirty data: %m")));
531  /* NB: need to fall through to munmap()! */
532  }
533 
534  rc = munmap(p, (size_t) nbytes);
535  if (rc != 0)
536  {
537  /* FATAL error because mapping would remain */
538  ereport(FATAL,
540  errmsg("could not munmap() while flushing data: %m")));
541  }
542 
543  return;
544  }
545  }
546 #endif
547 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
548  {
549  int rc;
550 
551  /*
552  * Signal the kernel that the passed in range should not be cached
553  * anymore. This has the desired side effect of writing out dirty
554  * data, and the undesired side effect of likely discarding useful
555  * clean cached blocks. For the latter reason this is the least
556  * preferable method.
557  */
558 
559  rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
560 
561  if (rc != 0)
562  {
563  /* don't error out, this is just a performance optimization */
566  errmsg("could not flush dirty data: %m")));
567  }
568 
569  return;
570  }
571 #endif
572 }
573 
574 
575 /*
576  * fsync_fname -- fsync a file or directory, handling errors properly
577  *
578  * Try to fsync a file or directory. When doing the latter, ignore errors that
579  * indicate the OS just doesn't allow/require fsyncing directories.
580  */
581 void
582 fsync_fname(const char *fname, bool isdir)
583 {
584  fsync_fname_ext(fname, isdir, false, ERROR);
585 }
586 
587 /*
588  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
589  *
590  * This routine ensures that, after returning, the effect of renaming file
591  * persists in case of a crash. A crash while this routine is running will
592  * leave you with either the pre-existing or the moved file in place of the
593  * new file; no mixed state or truncated files are possible.
594  *
595  * It does so by using fsync on the old filename and the possibly existing
596  * target filename before the rename, and the target file and directory after.
597  *
598  * Note that rename() cannot be used across arbitrary directories, as they
599  * might not be on the same filesystem. Therefore this routine does not
600  * support renaming across directories.
601  *
602  * Log errors with the caller specified severity.
603  *
604  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
605  * valid upon return.
606  */
607 int
608 durable_rename(const char *oldfile, const char *newfile, int elevel)
609 {
610  int fd;
611 
612  /*
613  * First fsync the old and target path (if it exists), to ensure that they
614  * are properly persistent on disk. Syncing the target file is not
615  * strictly necessary, but it makes it easier to reason about crashes;
616  * because it's then guaranteed that either source or target file exists
617  * after a crash.
618  */
619  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
620  return -1;
621 
622  fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
623  if (fd < 0)
624  {
625  if (errno != ENOENT)
626  {
627  ereport(elevel,
629  errmsg("could not open file \"%s\": %m", newfile)));
630  return -1;
631  }
632  }
633  else
634  {
635  if (pg_fsync(fd) != 0)
636  {
637  int save_errno;
638 
639  /* close file upon error, might not be in transaction context */
640  save_errno = errno;
641  CloseTransientFile(fd);
642  errno = save_errno;
643 
644  ereport(elevel,
646  errmsg("could not fsync file \"%s\": %m", newfile)));
647  return -1;
648  }
649  CloseTransientFile(fd);
650  }
651 
652  /* Time to do the real deal... */
653  if (rename(oldfile, newfile) < 0)
654  {
655  ereport(elevel,
657  errmsg("could not rename file \"%s\" to \"%s\": %m",
658  oldfile, newfile)));
659  return -1;
660  }
661 
662  /*
663  * To guarantee renaming the file is persistent, fsync the file with its
664  * new name, and its containing directory.
665  */
666  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
667  return -1;
668 
669  if (fsync_parent_path(newfile, elevel) != 0)
670  return -1;
671 
672  return 0;
673 }
674 
675 /*
676  * durable_unlink -- remove a file in a durable manner
677  *
678  * This routine ensures that, after returning, the effect of removing file
679  * persists in case of a crash. A crash while this routine is running will
680  * not leave the system in a mixed state.
681  *
682  * It does so by using fsync on the parent directory of the file after the
683  * actual removal is done.
684  *
685  * Log errors with the severity specified by caller.
686  *
687  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
688  * valid upon return.
689  */
690 int
691 durable_unlink(const char *fname, int elevel)
692 {
693  if (unlink(fname) < 0)
694  {
695  ereport(elevel,
697  errmsg("could not remove file \"%s\": %m",
698  fname)));
699  return -1;
700  }
701 
702  /*
703  * To guarantee that the removal of the file is persistent, fsync its
704  * parent directory.
705  */
706  if (fsync_parent_path(fname, elevel) != 0)
707  return -1;
708 
709  return 0;
710 }
711 
712 /*
713  * durable_link_or_rename -- rename a file in a durable manner.
714  *
715  * Similar to durable_rename(), except that this routine tries (but does not
716  * guarantee) not to overwrite the target file.
717  *
718  * Note that a crash in an unfortunate moment can leave you with two links to
719  * the target file.
720  *
721  * Log errors with the caller specified severity.
722  *
723  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
724  * valid upon return.
725  */
726 int
727 durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
728 {
729  /*
730  * Ensure that, if we crash directly after the rename/link, a file with
731  * valid contents is moved into place.
732  */
733  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
734  return -1;
735 
736 #if HAVE_WORKING_LINK
737  if (link(oldfile, newfile) < 0)
738  {
739  ereport(elevel,
741  errmsg("could not link file \"%s\" to \"%s\": %m",
742  oldfile, newfile)));
743  return -1;
744  }
745  unlink(oldfile);
746 #else
747  /* XXX: Add racy file existence check? */
748  if (rename(oldfile, newfile) < 0)
749  {
750  ereport(elevel,
752  errmsg("could not rename file \"%s\" to \"%s\": %m",
753  oldfile, newfile)));
754  return -1;
755  }
756 #endif
757 
758  /*
759  * Make change persistent in case of an OS crash, both the new entry and
760  * its parent directory need to be flushed.
761  */
762  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
763  return -1;
764 
765  /* Same for parent directory */
766  if (fsync_parent_path(newfile, elevel) != 0)
767  return -1;
768 
769  return 0;
770 }
771 
772 /*
773  * InitFileAccess --- initialize this module during backend startup
774  *
775  * This is called during either normal or standalone backend start.
776  * It is *not* called in the postmaster.
777  */
778 void
780 {
781  Assert(SizeVfdCache == 0); /* call me only once */
782 
783  /* initialize cache header entry */
784  VfdCache = (Vfd *) malloc(sizeof(Vfd));
785  if (VfdCache == NULL)
786  ereport(FATAL,
787  (errcode(ERRCODE_OUT_OF_MEMORY),
788  errmsg("out of memory")));
789 
790  MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
791  VfdCache->fd = VFD_CLOSED;
792 
793  SizeVfdCache = 1;
794 
795  /* register proc-exit hook to ensure temp files are dropped at exit */
797 }
798 
799 /*
800  * count_usable_fds --- count how many FDs the system will let us open,
801  * and estimate how many are already open.
802  *
803  * We stop counting if usable_fds reaches max_to_probe. Note: a small
804  * value of max_to_probe might result in an underestimate of already_open;
805  * we must fill in any "gaps" in the set of used FDs before the calculation
806  * of already_open will give the right answer. In practice, max_to_probe
807  * of a couple of dozen should be enough to ensure good results.
808  *
809  * We assume stdin (FD 0) is available for dup'ing
810  */
811 static void
812 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
813 {
814  int *fd;
815  int size;
816  int used = 0;
817  int highestfd = 0;
818  int j;
819 
820 #ifdef HAVE_GETRLIMIT
821  struct rlimit rlim;
822  int getrlimit_status;
823 #endif
824 
825  size = 1024;
826  fd = (int *) palloc(size * sizeof(int));
827 
828 #ifdef HAVE_GETRLIMIT
829 #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
830  getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
831 #else /* but BSD doesn't ... */
832  getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
833 #endif /* RLIMIT_NOFILE */
834  if (getrlimit_status != 0)
835  ereport(WARNING, (errmsg("getrlimit failed: %m")));
836 #endif /* HAVE_GETRLIMIT */
837 
838  /* dup until failure or probe limit reached */
839  for (;;)
840  {
841  int thisfd;
842 
843 #ifdef HAVE_GETRLIMIT
844 
845  /*
846  * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
847  * some platforms
848  */
849  if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
850  break;
851 #endif
852 
853  thisfd = dup(0);
854  if (thisfd < 0)
855  {
856  /* Expect EMFILE or ENFILE, else it's fishy */
857  if (errno != EMFILE && errno != ENFILE)
858  elog(WARNING, "dup(0) failed after %d successes: %m", used);
859  break;
860  }
861 
862  if (used >= size)
863  {
864  size *= 2;
865  fd = (int *) repalloc(fd, size * sizeof(int));
866  }
867  fd[used++] = thisfd;
868 
869  if (highestfd < thisfd)
870  highestfd = thisfd;
871 
872  if (used >= max_to_probe)
873  break;
874  }
875 
876  /* release the files we opened */
877  for (j = 0; j < used; j++)
878  close(fd[j]);
879 
880  pfree(fd);
881 
882  /*
883  * Return results. usable_fds is just the number of successful dups. We
884  * assume that the system limit is highestfd+1 (remember 0 is a legal FD
885  * number) and so already_open is highestfd+1 - usable_fds.
886  */
887  *usable_fds = used;
888  *already_open = highestfd + 1 - used;
889 }
890 
891 /*
892  * set_max_safe_fds
893  * Determine number of filedescriptors that fd.c is allowed to use
894  */
895 void
897 {
898  int usable_fds;
899  int already_open;
900 
901  /*----------
902  * We want to set max_safe_fds to
903  * MIN(usable_fds, max_files_per_process - already_open)
904  * less the slop factor for files that are opened without consulting
905  * fd.c. This ensures that we won't exceed either max_files_per_process
906  * or the experimentally-determined EMFILE limit.
907  *----------
908  */
910  &usable_fds, &already_open);
911 
912  max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
913 
914  /*
915  * Take off the FDs reserved for system() etc.
916  */
918 
919  /*
920  * Make sure we still have enough to get by.
921  */
922  if (max_safe_fds < FD_MINFREE)
923  ereport(FATAL,
924  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
925  errmsg("insufficient file descriptors available to start server process"),
926  errdetail("System allows %d, we need at least %d.",
929 
930  elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
931  max_safe_fds, usable_fds, already_open);
932 }
933 
934 /*
935  * Open a file with BasicOpenFilePerm() and pass default file mode for the
936  * fileMode parameter.
937  */
938 int
940 {
941  return BasicOpenFilePerm(fileName, fileFlags, PG_FILE_MODE_DEFAULT);
942 }
943 
944 /*
945  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
946  *
947  * This is exported for use by places that really want a plain kernel FD,
948  * but need to be proof against running out of FDs. Once an FD has been
949  * successfully returned, it is the caller's responsibility to ensure that
950  * it will not be leaked on ereport()! Most users should *not* call this
951  * routine directly, but instead use the VFD abstraction level, which
952  * provides protection against descriptor leaks as well as management of
953  * files that need to be open for more than a short period of time.
954  *
955  * Ideally this should be the *only* direct call of open() in the backend.
956  * In practice, the postmaster calls open() directly, and there are some
957  * direct open() calls done early in backend startup. Those are OK since
958  * this module wouldn't have any open files to close at that point anyway.
959  */
960 int
961 BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
962 {
963  int fd;
964 
965 tryAgain:
966  fd = open(fileName, fileFlags, fileMode);
967 
968  if (fd >= 0)
969  return fd; /* success! */
970 
971  if (errno == EMFILE || errno == ENFILE)
972  {
973  int save_errno = errno;
974 
975  ereport(LOG,
976  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
977  errmsg("out of file descriptors: %m; release and retry")));
978  errno = 0;
979  if (ReleaseLruFile())
980  goto tryAgain;
981  errno = save_errno;
982  }
983 
984  return -1; /* failure */
985 }
986 
#if defined(FDDEBUG)

/*
 * _dump_lru --- debugging aid: log the contents of the LRU ring
 *
 * Starting from the anchor entry (VfdCache[0]), follow the lruLessRecently
 * links and emit every index visited, including the final 0 that ends the
 * walk, as a single LOG line of the form "LRU: MOST a b ... 0 LEAST".
 * Output that does not fit into the local buffer is truncated by snprintf.
 */
static void
_dump_lru(void)
{
	char		line[2048];
	size_t		used;
	int			cur = VfdCache[0].lruLessRecently;

	snprintf(line, sizeof(line), "LRU: MOST %d ", cur);

	while (cur != 0)
	{
		cur = VfdCache[cur].lruLessRecently;
		used = strlen(line);
		snprintf(line + used, sizeof(line) - used, "%d ", cur);
	}

	used = strlen(line);
	snprintf(line + used, sizeof(line) - used, "LEAST");

	elog(LOG, "%s", line);
}
#endif							/* FDDEBUG */
1007 
1008 static void
1010 {
1011  Vfd *vfdP;
1012 
1013  Assert(file != 0);
1014 
1015  DO_DB(elog(LOG, "Delete %d (%s)",
1016  file, VfdCache[file].fileName));
1017  DO_DB(_dump_lru());
1018 
1019  vfdP = &VfdCache[file];
1020 
1021  VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1022  VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1023 
1024  DO_DB(_dump_lru());
1025 }
1026 
1027 static void
1029 {
1030  Vfd *vfdP;
1031 
1032  Assert(file != 0);
1033 
1034  DO_DB(elog(LOG, "LruDelete %d (%s)",
1035  file, VfdCache[file].fileName));
1036 
1037  vfdP = &VfdCache[file];
1038 
1039  /*
1040  * Normally we should know the seek position, but if for some reason we
1041  * have lost track of it, try again to get it. If we still can't get it,
1042  * we have a problem: we will be unable to restore the file seek position
1043  * when and if the file is re-opened. But we can't really throw an error
1044  * and refuse to close the file, or activities such as transaction cleanup
1045  * will be broken.
1046  */
1047  if (FilePosIsUnknown(vfdP->seekPos))
1048  {
1049  vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
1050  if (FilePosIsUnknown(vfdP->seekPos))
1051  elog(LOG, "could not seek file \"%s\" before closing: %m",
1052  vfdP->fileName);
1053  }
1054 
1055  /*
1056  * Close the file. We aren't expecting this to fail; if it does, better
1057  * to leak the FD than to mess up our internal state.
1058  */
1059  if (close(vfdP->fd))
1060  elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
1061  vfdP->fd = VFD_CLOSED;
1062  --nfile;
1063 
1064  /* delete the vfd record from the LRU ring */
1065  Delete(file);
1066 }
1067 
1068 static void
1070 {
1071  Vfd *vfdP;
1072 
1073  Assert(file != 0);
1074 
1075  DO_DB(elog(LOG, "Insert %d (%s)",
1076  file, VfdCache[file].fileName));
1077  DO_DB(_dump_lru());
1078 
1079  vfdP = &VfdCache[file];
1080 
1081  vfdP->lruMoreRecently = 0;
1082  vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1083  VfdCache[0].lruLessRecently = file;
1084  VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1085 
1086  DO_DB(_dump_lru());
1087 }
1088 
1089 /* returns 0 on success, -1 on re-open failure (with errno set) */
1090 static int
1092 {
1093  Vfd *vfdP;
1094 
1095  Assert(file != 0);
1096 
1097  DO_DB(elog(LOG, "LruInsert %d (%s)",
1098  file, VfdCache[file].fileName));
1099 
1100  vfdP = &VfdCache[file];
1101 
1102  if (FileIsNotOpen(file))
1103  {
1104  /* Close excess kernel FDs. */
1105  ReleaseLruFiles();
1106 
1107  /*
1108  * The open could still fail for lack of file descriptors, eg due to
1109  * overall system file table being full. So, be prepared to release
1110  * another FD if necessary...
1111  */
1112  vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1113  vfdP->fileMode);
1114  if (vfdP->fd < 0)
1115  {
1116  DO_DB(elog(LOG, "re-open failed: %m"));
1117  return -1;
1118  }
1119  else
1120  {
1121  ++nfile;
1122  }
1123 
1124  /*
1125  * Seek to the right position. We need no special case for seekPos
1126  * equal to FileUnknownPos, as lseek() will certainly reject that
1127  * (thus completing the logic noted in LruDelete() that we will fail
1128  * to re-open a file if we couldn't get its seek position before
1129  * closing).
1130  */
1131  if (vfdP->seekPos != (off_t) 0)
1132  {
1133  if (lseek(vfdP->fd, vfdP->seekPos, SEEK_SET) < 0)
1134  {
1135  /*
1136  * If we fail to restore the seek position, treat it like an
1137  * open() failure.
1138  */
1139  int save_errno = errno;
1140 
1141  elog(LOG, "could not seek file \"%s\" after re-opening: %m",
1142  vfdP->fileName);
1143  (void) close(vfdP->fd);
1144  vfdP->fd = VFD_CLOSED;
1145  --nfile;
1146  errno = save_errno;
1147  return -1;
1148  }
1149  }
1150  }
1151 
1152  /*
1153  * put it at the head of the Lru ring
1154  */
1155 
1156  Insert(file);
1157 
1158  return 0;
1159 }
1160 
1161 /*
1162  * Release one kernel FD by closing the least-recently-used VFD.
1163  */
1164 static bool
1166 {
1167  DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1168 
1169  if (nfile > 0)
1170  {
1171  /*
1172  * There are opened files and so there should be at least one used vfd
1173  * in the ring.
1174  */
1175  Assert(VfdCache[0].lruMoreRecently != 0);
1176  LruDelete(VfdCache[0].lruMoreRecently);
1177  return true; /* freed a file */
1178  }
1179  return false; /* no files available to free */
1180 }
1181 
1182 /*
1183  * Release kernel FDs as needed to get under the max_safe_fds limit.
1184  * After calling this, it's OK to try to open another file.
1185  */
1186 static void
1188 {
1189  while (nfile + numAllocatedDescs >= max_safe_fds)
1190  {
1191  if (!ReleaseLruFile())
1192  break;
1193  }
1194 }
1195 
1196 static File
1198 {
1199  Index i;
1200  File file;
1201 
1202  DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1203 
1204  Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1205 
1206  if (VfdCache[0].nextFree == 0)
1207  {
1208  /*
1209  * The free list is empty so it is time to increase the size of the
1210  * array. We choose to double it each time this happens. However,
1211  * there's not much point in starting *real* small.
1212  */
1213  Size newCacheSize = SizeVfdCache * 2;
1214  Vfd *newVfdCache;
1215 
1216  if (newCacheSize < 32)
1217  newCacheSize = 32;
1218 
1219  /*
1220  * Be careful not to clobber VfdCache ptr if realloc fails.
1221  */
1222  newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1223  if (newVfdCache == NULL)
1224  ereport(ERROR,
1225  (errcode(ERRCODE_OUT_OF_MEMORY),
1226  errmsg("out of memory")));
1227  VfdCache = newVfdCache;
1228 
1229  /*
1230  * Initialize the new entries and link them into the free list.
1231  */
1232  for (i = SizeVfdCache; i < newCacheSize; i++)
1233  {
1234  MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1235  VfdCache[i].nextFree = i + 1;
1236  VfdCache[i].fd = VFD_CLOSED;
1237  }
1238  VfdCache[newCacheSize - 1].nextFree = 0;
1239  VfdCache[0].nextFree = SizeVfdCache;
1240 
1241  /*
1242  * Record the new size
1243  */
1244  SizeVfdCache = newCacheSize;
1245  }
1246 
1247  file = VfdCache[0].nextFree;
1248 
1249  VfdCache[0].nextFree = VfdCache[file].nextFree;
1250 
1251  return file;
1252 }
1253 
1254 static void
1256 {
1257  Vfd *vfdP = &VfdCache[file];
1258 
1259  DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1260  file, vfdP->fileName ? vfdP->fileName : ""));
1261 
1262  if (vfdP->fileName != NULL)
1263  {
1264  free(vfdP->fileName);
1265  vfdP->fileName = NULL;
1266  }
1267  vfdP->fdstate = 0x0;
1268 
1269  vfdP->nextFree = VfdCache[0].nextFree;
1270  VfdCache[0].nextFree = file;
1271 }
1272 
1273 /* returns 0 on success, -1 on re-open failure (with errno set) */
1274 static int
1276 {
1277  int returnValue;
1278 
1279  DO_DB(elog(LOG, "FileAccess %d (%s)",
1280  file, VfdCache[file].fileName));
1281 
1282  /*
1283  * Is the file open? If not, open it and put it at the head of the LRU
1284  * ring (possibly closing the least recently used file to get an FD).
1285  */
1286 
1287  if (FileIsNotOpen(file))
1288  {
1289  returnValue = LruInsert(file);
1290  if (returnValue != 0)
1291  return returnValue;
1292  }
1293  else if (VfdCache[0].lruLessRecently != file)
1294  {
1295  /*
1296  * We now know that the file is open and that it is not the last one
1297  * accessed, so we need to move it to the head of the Lru ring.
1298  */
1299 
1300  Delete(file);
1301  Insert(file);
1302  }
1303 
1304  return 0;
1305 }
1306 
1307 /*
1308  * Called whenever a temporary file is deleted to report its size.
1309  */
1310 static void
1311 ReportTemporaryFileUsage(const char *path, off_t size)
1312 {
1313  pgstat_report_tempfile(size);
1314 
1315  if (log_temp_files >= 0)
1316  {
1317  if ((size / 1024) >= log_temp_files)
1318  ereport(LOG,
1319  (errmsg("temporary file: path \"%s\", size %lu",
1320  path, (unsigned long) size)));
1321  }
1322 }
1323 
1324 /*
1325  * Called to register a temporary file for automatic close.
1326  * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1327  * before the file was opened.
1328  */
1329 static void
1331 {
1333  VfdCache[file].resowner = CurrentResourceOwner;
1334 
1335  /* Backup mechanism for closing at end of xact. */
1336  VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1338 }
1339 
1340 /*
1341  * Called when we get a shared invalidation message on some relation.
1342  */
1343 #ifdef NOT_USED
1344 void
1345 FileInvalidate(File file)
1346 {
1347  Assert(FileIsValid(file));
1348  if (!FileIsNotOpen(file))
1349  LruDelete(file);
1350 }
1351 #endif
1352 
1353 /*
1354  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1355  * fileMode parameter.
1356  */
1357 File
1359 {
1360  return PathNameOpenFilePerm(fileName, fileFlags, PG_FILE_MODE_DEFAULT);
1361 }
1362 
1363 /*
1364  * open a file in an arbitrary directory
1365  *
1366  * NB: if the passed pathname is relative (which it usually is),
1367  * it will be interpreted relative to the process' working directory
1368  * (which should always be $PGDATA when this code is running).
1369  */
1370 File
1372 {
1373  char *fnamecopy;
1374  File file;
1375  Vfd *vfdP;
1376 
1377  DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1378  fileName, fileFlags, fileMode));
1379 
1380  /*
1381  * We need a malloc'd copy of the file name; fail cleanly if no room.
1382  */
1383  fnamecopy = strdup(fileName);
1384  if (fnamecopy == NULL)
1385  ereport(ERROR,
1386  (errcode(ERRCODE_OUT_OF_MEMORY),
1387  errmsg("out of memory")));
1388 
1389  file = AllocateVfd();
1390  vfdP = &VfdCache[file];
1391 
1392  /* Close excess kernel FDs. */
1393  ReleaseLruFiles();
1394 
1395  vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1396 
1397  if (vfdP->fd < 0)
1398  {
1399  int save_errno = errno;
1400 
1401  FreeVfd(file);
1402  free(fnamecopy);
1403  errno = save_errno;
1404  return -1;
1405  }
1406  ++nfile;
1407  DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1408  vfdP->fd));
1409 
1410  Insert(file);
1411 
1412  vfdP->fileName = fnamecopy;
1413  /* Saved flags are adjusted to be OK for re-opening file */
1414  vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1415  vfdP->fileMode = fileMode;
1416  vfdP->seekPos = 0;
1417  vfdP->fileSize = 0;
1418  vfdP->fdstate = 0x0;
1419  vfdP->resowner = NULL;
1420 
1421  return file;
1422 }
1423 
1424 /*
1425  * Create directory 'directory'. If necessary, create 'basedir', which must
1426  * be the directory above it. This is designed for creating the top-level
1427  * temporary directory on demand before creating a directory underneath it.
1428  * Do nothing if the directory already exists.
1429  *
1430  * Directories created within the top-level temporary directory should begin
1431  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1432  * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1433  * that do not need any particular prefix.
1434 */
1435 void
1437 {
1438  if (mkdir(directory, S_IRWXU) < 0)
1439  {
1440  if (errno == EEXIST)
1441  return;
1442 
1443  /*
1444  * Failed. Try to create basedir first in case it's missing. Tolerate
1445  * EEXIST to close a race against another process following the same
1446  * algorithm.
1447  */
1448  if (mkdir(basedir, S_IRWXU) < 0 && errno != EEXIST)
1449  ereport(ERROR,
1451  errmsg("cannot create temporary directory \"%s\": %m",
1452  basedir)));
1453 
1454  /* Try again. */
1455  if (mkdir(directory, S_IRWXU) < 0 && errno != EEXIST)
1456  ereport(ERROR,
1458  errmsg("cannot create temporary subdirectory \"%s\": %m",
1459  directory)));
1460  }
1461 }
1462 
1463 /*
1464  * Delete a directory and everything in it, if it exists.
1465  */
1466 void
1467 PathNameDeleteTemporaryDir(const char *dirname)
1468 {
1469  struct stat statbuf;
1470 
1471  /* Silently ignore missing directory. */
1472  if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1473  return;
1474 
1475  /*
1476  * Currently, walkdir doesn't offer a way for our passed in function to
1477  * maintain state. Perhaps it should, so that we could tell the caller
1478  * whether this operation succeeded or failed. Since this operation is
1479  * used in a cleanup path, we wouldn't actually behave differently: we'll
1480  * just log failures.
1481  */
1482  walkdir(dirname, unlink_if_exists_fname, false, LOG);
1483 }
1484 
1485 /*
1486  * Open a temporary file that will disappear when we close it.
1487  *
1488  * This routine takes care of generating an appropriate tempfile name.
1489  * There's no need to pass in fileFlags or fileMode either, since only
1490  * one setting makes any sense for a temp file.
1491  *
1492  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1493  * to ensure it's closed and deleted when it's no longer needed, typically at
1494  * the end-of-transaction. In most cases, you don't want temporary files to
1495  * outlive the transaction that created them, so this should be false -- but
1496  * if you need "somewhat" temporary storage, this might be useful. In either
1497  * case, the file is removed when the File is explicitly closed.
1498  */
1499 File
1500 OpenTemporaryFile(bool interXact)
1501 {
1502  File file = 0;
1503 
1504  /*
1505  * Make sure the current resource owner has space for this File before we
1506  * open it, if we'll be registering it below.
1507  */
1508  if (!interXact)
1510 
1511  /*
1512  * If some temp tablespace(s) have been given to us, try to use the next
1513  * one. If a given tablespace can't be found, we silently fall back to
1514  * the database's default tablespace.
1515  *
1516  * BUT: if the temp file is slated to outlive the current transaction,
1517  * force it into the database's default tablespace, so that it will not
1518  * pose a threat to possible tablespace drop attempts.
1519  */
1520  if (numTempTableSpaces > 0 && !interXact)
1521  {
1522  Oid tblspcOid = GetNextTempTableSpace();
1523 
1524  if (OidIsValid(tblspcOid))
1525  file = OpenTemporaryFileInTablespace(tblspcOid, false);
1526  }
1527 
1528  /*
1529  * If not, or if tablespace is bad, create in database's default
1530  * tablespace. MyDatabaseTableSpace should normally be set before we get
1531  * here, but just in case it isn't, fall back to pg_default tablespace.
1532  */
1533  if (file <= 0)
1537  true);
1538 
1539  /* Mark it for deletion at close and temporary file size limit */
1540  VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1541 
1542  /* Register it with the current resource owner */
1543  if (!interXact)
1544  RegisterTemporaryFile(file);
1545 
1546  return file;
1547 }
1548 
1549 /*
1550  * Return the path of the temp directory in a given tablespace.
1551  */
1552 void
1554 {
1555  /*
1556  * Identify the tempfile directory for this tablespace.
1557  *
1558  * If someone tries to specify pg_global, use pg_default instead.
1559  */
1560  if (tablespace == InvalidOid ||
1561  tablespace == DEFAULTTABLESPACE_OID ||
1562  tablespace == GLOBALTABLESPACE_OID)
1563  snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1564  else
1565  {
1566  /* All other tablespaces are accessed via symlinks */
1567  snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1568  tablespace, TABLESPACE_VERSION_DIRECTORY,
1570  }
1571 }
1572 
1573 /*
1574  * Open a temporary file in a specific tablespace.
1575  * Subroutine for OpenTemporaryFile, which see for details.
1576  */
1577 static File
1578 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1579 {
1580  char tempdirpath[MAXPGPATH];
1581  char tempfilepath[MAXPGPATH];
1582  File file;
1583 
1584  TempTablespacePath(tempdirpath, tblspcOid);
1585 
1586  /*
1587  * Generate a tempfile name that should be unique within the current
1588  * database instance.
1589  */
1590  snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1591  tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1592 
1593  /*
1594  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1595  * temp file that can be reused.
1596  */
1597  file = PathNameOpenFile(tempfilepath,
1598  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1599  if (file <= 0)
1600  {
1601  /*
1602  * We might need to create the tablespace's tempfile directory, if no
1603  * one has yet done so.
1604  *
1605  * Don't check for error from mkdir; it could fail if someone else
1606  * just did the same thing. If it doesn't work then we'll bomb out on
1607  * the second create attempt, instead.
1608  */
1609  mkdir(tempdirpath, S_IRWXU);
1610 
1611  file = PathNameOpenFile(tempfilepath,
1612  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1613  if (file <= 0 && rejectError)
1614  elog(ERROR, "could not create temporary file \"%s\": %m",
1615  tempfilepath);
1616  }
1617 
1618  return file;
1619 }
1620 
1621 
1622 /*
1623  * Create a new file. The directory containing it must already exist. Files
1624  * created this way are subject to temp_file_limit and are automatically
1625  * closed at end of transaction, but are not automatically deleted on close
1626  * because they are intended to be shared between cooperating backends.
1627  *
1628  * If the file is inside the top-level temporary directory, its name should
1629  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1630  * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1631  * inside a directory created with PathnameCreateTemporaryDir(), in which case
1632  * the prefix isn't needed.
1633  */
1634 File
1635 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1636 {
1637  File file;
1638 
1640 
1641  /*
1642  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1643  * temp file that can be reused.
1644  */
1645  file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1646  if (file <= 0)
1647  {
1648  if (error_on_failure)
1649  ereport(ERROR,
1651  errmsg("could not create temporary file \"%s\": %m",
1652  path)));
1653  else
1654  return file;
1655  }
1656 
1657  /* Mark it for temp_file_limit accounting. */
1658  VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1659 
1660  /* Register it for automatic close. */
1661  RegisterTemporaryFile(file);
1662 
1663  return file;
1664 }
1665 
1666 /*
1667  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1668  * another backend. Files opened this way don't count against the
1669  * temp_file_limit of the caller, are read-only and are automatically closed
1670  * at the end of the transaction but are not deleted on close.
1671  */
1672 File
1673 PathNameOpenTemporaryFile(const char *path)
1674 {
1675  File file;
1676 
1678 
1679  /* We open the file read-only. */
1680  file = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
1681 
1682  /* If no such file, then we don't raise an error. */
1683  if (file <= 0 && errno != ENOENT)
1684  ereport(ERROR,
1686  errmsg("could not open temporary file \"%s\": %m",
1687  path)));
1688 
1689  if (file > 0)
1690  {
1691  /* Register it for automatic close. */
1692  RegisterTemporaryFile(file);
1693  }
1694 
1695  return file;
1696 }
1697 
1698 /*
1699  * Delete a file by pathname. Return true if the file existed, false if
1700  * didn't.
1701  */
1702 bool
1703 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1704 {
1705  struct stat filestats;
1706  int stat_errno;
1707 
1708  /* Get the final size for pgstat reporting. */
1709  if (stat(path, &filestats) != 0)
1710  stat_errno = errno;
1711  else
1712  stat_errno = 0;
1713 
1714  /*
1715  * Unlike FileClose's automatic file deletion code, we tolerate
1716  * non-existence to support BufFileDeleteShared which doesn't know how
1717  * many segments it has to delete until it runs out.
1718  */
1719  if (stat_errno == ENOENT)
1720  return false;
1721 
1722  if (unlink(path) < 0)
1723  {
1724  if (errno != ENOENT)
1725  ereport(error_on_failure ? ERROR : LOG,
1727  errmsg("cannot unlink temporary file \"%s\": %m",
1728  path)));
1729  return false;
1730  }
1731 
1732  if (stat_errno == 0)
1733  ReportTemporaryFileUsage(path, filestats.st_size);
1734  else
1735  {
1736  errno = stat_errno;
1737  ereport(LOG,
1739  errmsg("could not stat file \"%s\": %m", path)));
1740  }
1741 
1742  return true;
1743 }
1744 
1745 /*
1746  * close a file when done with it
1747  */
1748 void
1750 {
1751  Vfd *vfdP;
1752 
1753  Assert(FileIsValid(file));
1754 
1755  DO_DB(elog(LOG, "FileClose: %d (%s)",
1756  file, VfdCache[file].fileName));
1757 
1758  vfdP = &VfdCache[file];
1759 
1760  if (!FileIsNotOpen(file))
1761  {
1762  /* close the file */
1763  if (close(vfdP->fd))
1764  elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
1765 
1766  --nfile;
1767  vfdP->fd = VFD_CLOSED;
1768 
1769  /* remove the file from the lru ring */
1770  Delete(file);
1771  }
1772 
1773  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1774  {
1775  /* Subtract its size from current usage (do first in case of error) */
1776  temporary_files_size -= vfdP->fileSize;
1777  vfdP->fileSize = 0;
1778  }
1779 
1780  /*
1781  * Delete the file if it was temporary, and make a log entry if wanted
1782  */
1783  if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
1784  {
1785  struct stat filestats;
1786  int stat_errno;
1787 
1788  /*
1789  * If we get an error, as could happen within the ereport/elog calls,
1790  * we'll come right back here during transaction abort. Reset the
1791  * flag to ensure that we can't get into an infinite loop. This code
1792  * is arranged to ensure that the worst-case consequence is failing to
1793  * emit log message(s), not failing to attempt the unlink.
1794  */
1795  vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
1796 
1797 
1798  /* first try the stat() */
1799  if (stat(vfdP->fileName, &filestats))
1800  stat_errno = errno;
1801  else
1802  stat_errno = 0;
1803 
1804  /* in any case do the unlink */
1805  if (unlink(vfdP->fileName))
1806  elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
1807 
1808  /* and last report the stat results */
1809  if (stat_errno == 0)
1810  ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
1811  else
1812  {
1813  errno = stat_errno;
1814  elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
1815  }
1816  }
1817 
1818  /* Unregister it from the resource owner */
1819  if (vfdP->resowner)
1820  ResourceOwnerForgetFile(vfdP->resowner, file);
1821 
1822  /*
1823  * Return the Vfd slot to the free list
1824  */
1825  FreeVfd(file);
1826 }
1827 
1828 /*
1829  * FilePrefetch - initiate asynchronous read of a given range of the file.
1830  * The logical seek position is unaffected.
1831  *
1832  * Currently the only implementation of this function is using posix_fadvise
1833  * which is the simplest standardized interface that accomplishes this.
1834  * We could add an implementation using libaio in the future; but note that
1835  * this API is inappropriate for libaio, which wants to have a buffer provided
1836  * to read into.
1837  */
1838 int
1839 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1840 {
1841 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1842  int returnCode;
1843 
1844  Assert(FileIsValid(file));
1845 
1846  DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1847  file, VfdCache[file].fileName,
1848  (int64) offset, amount));
1849 
1850  returnCode = FileAccess(file);
1851  if (returnCode < 0)
1852  return returnCode;
1853 
1854  pgstat_report_wait_start(wait_event_info);
1855  returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1856  POSIX_FADV_WILLNEED);
1858 
1859  return returnCode;
1860 #else
1861  Assert(FileIsValid(file));
1862  return 0;
1863 #endif
1864 }
1865 
1866 void
1867 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1868 {
1869  int returnCode;
1870 
1871  Assert(FileIsValid(file));
1872 
1873  DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1874  file, VfdCache[file].fileName,
1875  (int64) offset, (int64) nbytes));
1876 
1877  /*
1878  * Caution: do not call pg_flush_data with nbytes = 0, it could trash the
1879  * file's seek position. We prefer to define that as a no-op here.
1880  */
1881  if (nbytes <= 0)
1882  return;
1883 
1884  returnCode = FileAccess(file);
1885  if (returnCode < 0)
1886  return;
1887 
1888  pgstat_report_wait_start(wait_event_info);
1889  pg_flush_data(VfdCache[file].fd, offset, nbytes);
1891 }
1892 
1893 int
1894 FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
1895 {
1896  int returnCode;
1897  Vfd *vfdP;
1898 
1899  Assert(FileIsValid(file));
1900 
1901  DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1902  file, VfdCache[file].fileName,
1903  (int64) VfdCache[file].seekPos,
1904  amount, buffer));
1905 
1906  returnCode = FileAccess(file);
1907  if (returnCode < 0)
1908  return returnCode;
1909 
1910  vfdP = &VfdCache[file];
1911 
1912 retry:
1913  pgstat_report_wait_start(wait_event_info);
1914  returnCode = read(vfdP->fd, buffer, amount);
1916 
1917  if (returnCode >= 0)
1918  {
1919  /* if seekPos is unknown, leave it that way */
1920  if (!FilePosIsUnknown(vfdP->seekPos))
1921  vfdP->seekPos += returnCode;
1922  }
1923  else
1924  {
1925  /*
1926  * Windows may run out of kernel buffers and return "Insufficient
1927  * system resources" error. Wait a bit and retry to solve it.
1928  *
1929  * It is rumored that EINTR is also possible on some Unix filesystems,
1930  * in which case immediate retry is indicated.
1931  */
1932 #ifdef WIN32
1933  DWORD error = GetLastError();
1934 
1935  switch (error)
1936  {
1937  case ERROR_NO_SYSTEM_RESOURCES:
1938  pg_usleep(1000L);
1939  errno = EINTR;
1940  break;
1941  default:
1942  _dosmaperr(error);
1943  break;
1944  }
1945 #endif
1946  /* OK to retry if interrupted */
1947  if (errno == EINTR)
1948  goto retry;
1949 
1950  /* Trouble, so assume we don't know the file position anymore */
1951  vfdP->seekPos = FileUnknownPos;
1952  }
1953 
1954  return returnCode;
1955 }
1956 
1957 int
1958 FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
1959 {
1960  int returnCode;
1961  Vfd *vfdP;
1962 
1963  Assert(FileIsValid(file));
1964 
1965  DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
1966  file, VfdCache[file].fileName,
1967  (int64) VfdCache[file].seekPos,
1968  amount, buffer));
1969 
1970  returnCode = FileAccess(file);
1971  if (returnCode < 0)
1972  return returnCode;
1973 
1974  vfdP = &VfdCache[file];
1975 
1976  /*
1977  * If enforcing temp_file_limit and it's a temp file, check to see if the
1978  * write would overrun temp_file_limit, and throw error if so. Note: it's
1979  * really a modularity violation to throw error here; we should set errno
1980  * and return -1. However, there's no way to report a suitable error
1981  * message if we do that. All current callers would just throw error
1982  * immediately anyway, so this is safe at present.
1983  */
1984  if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
1985  {
1986  off_t newPos;
1987 
1988  /*
1989  * Normally we should know the seek position, but if for some reason
1990  * we have lost track of it, try again to get it. Here, it's fine to
1991  * throw an error if we still can't get it.
1992  */
1993  if (FilePosIsUnknown(vfdP->seekPos))
1994  {
1995  vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
1996  if (FilePosIsUnknown(vfdP->seekPos))
1997  elog(ERROR, "could not seek file \"%s\": %m", vfdP->fileName);
1998  }
1999 
2000  newPos = vfdP->seekPos + amount;
2001  if (newPos > vfdP->fileSize)
2002  {
2003  uint64 newTotal = temporary_files_size;
2004 
2005  newTotal += newPos - vfdP->fileSize;
2006  if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2007  ereport(ERROR,
2008  (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2009  errmsg("temporary file size exceeds temp_file_limit (%dkB)",
2010  temp_file_limit)));
2011  }
2012  }
2013 
2014 retry:
2015  errno = 0;
2016  pgstat_report_wait_start(wait_event_info);
2017  returnCode = write(vfdP->fd, buffer, amount);
2019 
2020  /* if write didn't set errno, assume problem is no disk space */
2021  if (returnCode != amount && errno == 0)
2022  errno = ENOSPC;
2023 
2024  if (returnCode >= 0)
2025  {
2026  /* if seekPos is unknown, leave it that way */
2027  if (!FilePosIsUnknown(vfdP->seekPos))
2028  vfdP->seekPos += returnCode;
2029 
2030  /*
2031  * Maintain fileSize and temporary_files_size if it's a temp file.
2032  *
2033  * If seekPos is -1 (unknown), this will do nothing; but we could only
2034  * get here in that state if we're not enforcing temporary_files_size,
2035  * so we don't care.
2036  */
2037  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2038  {
2039  off_t newPos = vfdP->seekPos;
2040 
2041  if (newPos > vfdP->fileSize)
2042  {
2043  temporary_files_size += newPos - vfdP->fileSize;
2044  vfdP->fileSize = newPos;
2045  }
2046  }
2047  }
2048  else
2049  {
2050  /*
2051  * See comments in FileRead()
2052  */
2053 #ifdef WIN32
2054  DWORD error = GetLastError();
2055 
2056  switch (error)
2057  {
2058  case ERROR_NO_SYSTEM_RESOURCES:
2059  pg_usleep(1000L);
2060  errno = EINTR;
2061  break;
2062  default:
2063  _dosmaperr(error);
2064  break;
2065  }
2066 #endif
2067  /* OK to retry if interrupted */
2068  if (errno == EINTR)
2069  goto retry;
2070 
2071  /* Trouble, so assume we don't know the file position anymore */
2072  vfdP->seekPos = FileUnknownPos;
2073  }
2074 
2075  return returnCode;
2076 }
2077 
2078 int
2079 FileSync(File file, uint32 wait_event_info)
2080 {
2081  int returnCode;
2082 
2083  Assert(FileIsValid(file));
2084 
2085  DO_DB(elog(LOG, "FileSync: %d (%s)",
2086  file, VfdCache[file].fileName));
2087 
2088  returnCode = FileAccess(file);
2089  if (returnCode < 0)
2090  return returnCode;
2091 
2092  pgstat_report_wait_start(wait_event_info);
2093  returnCode = pg_fsync(VfdCache[file].fd);
2095 
2096  return returnCode;
2097 }
2098 
2099 off_t
2100 FileSeek(File file, off_t offset, int whence)
2101 {
2102  Vfd *vfdP;
2103 
2104  Assert(FileIsValid(file));
2105 
2106  DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
2107  file, VfdCache[file].fileName,
2108  (int64) VfdCache[file].seekPos,
2109  (int64) offset, whence));
2110 
2111  vfdP = &VfdCache[file];
2112 
2113  if (FileIsNotOpen(file))
2114  {
2115  switch (whence)
2116  {
2117  case SEEK_SET:
2118  if (offset < 0)
2119  {
2120  errno = EINVAL;
2121  return (off_t) -1;
2122  }
2123  vfdP->seekPos = offset;
2124  break;
2125  case SEEK_CUR:
2126  if (FilePosIsUnknown(vfdP->seekPos) ||
2127  vfdP->seekPos + offset < 0)
2128  {
2129  errno = EINVAL;
2130  return (off_t) -1;
2131  }
2132  vfdP->seekPos += offset;
2133  break;
2134  case SEEK_END:
2135  if (FileAccess(file) < 0)
2136  return (off_t) -1;
2137  vfdP->seekPos = lseek(vfdP->fd, offset, whence);
2138  break;
2139  default:
2140  elog(ERROR, "invalid whence: %d", whence);
2141  break;
2142  }
2143  }
2144  else
2145  {
2146  switch (whence)
2147  {
2148  case SEEK_SET:
2149  if (offset < 0)
2150  {
2151  errno = EINVAL;
2152  return (off_t) -1;
2153  }
2154  if (vfdP->seekPos != offset)
2155  vfdP->seekPos = lseek(vfdP->fd, offset, whence);
2156  break;
2157  case SEEK_CUR:
2158  if (offset != 0 || FilePosIsUnknown(vfdP->seekPos))
2159  vfdP->seekPos = lseek(vfdP->fd, offset, whence);
2160  break;
2161  case SEEK_END:
2162  vfdP->seekPos = lseek(vfdP->fd, offset, whence);
2163  break;
2164  default:
2165  elog(ERROR, "invalid whence: %d", whence);
2166  break;
2167  }
2168  }
2169 
2170  return vfdP->seekPos;
2171 }
2172 
2173 /*
2174  * XXX not actually used but here for completeness
2175  */
2176 #ifdef NOT_USED
2177 off_t
2178 FileTell(File file)
2179 {
2180  Assert(FileIsValid(file));
2181  DO_DB(elog(LOG, "FileTell %d (%s)",
2182  file, VfdCache[file].fileName));
2183  return VfdCache[file].seekPos;
2184 }
2185 #endif
2186 
2187 int
2188 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2189 {
2190  int returnCode;
2191 
2192  Assert(FileIsValid(file));
2193 
2194  DO_DB(elog(LOG, "FileTruncate %d (%s)",
2195  file, VfdCache[file].fileName));
2196 
2197  returnCode = FileAccess(file);
2198  if (returnCode < 0)
2199  return returnCode;
2200 
2201  pgstat_report_wait_start(wait_event_info);
2202  returnCode = ftruncate(VfdCache[file].fd, offset);
2204 
2205  if (returnCode == 0 && VfdCache[file].fileSize > offset)
2206  {
2207  /* adjust our state for truncation of a temp file */
2208  Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2209  temporary_files_size -= VfdCache[file].fileSize - offset;
2210  VfdCache[file].fileSize = offset;
2211  }
2212 
2213  return returnCode;
2214 }
2215 
2216 /*
2217  * Return the pathname associated with an open file.
2218  *
2219  * The returned string points to an internal buffer, which is valid until
2220  * the file is closed.
2221  */
2222 char *
2224 {
2225  Assert(FileIsValid(file));
2226 
2227  return VfdCache[file].fileName;
2228 }
2229 
2230 /*
2231  * Return the raw file descriptor of an opened file.
2232  *
2233  * The returned file descriptor will be valid until the file is closed, but
2234  * there are a lot of things that can make that happen. So the caller should
2235  * be careful not to do much of anything else before it finishes using the
2236  * returned file descriptor.
2237  */
2238 int
2240 {
2241  Assert(FileIsValid(file));
2242  return VfdCache[file].fd;
2243 }
2244 
2245 /*
2246  * FileGetRawFlags - returns the file flags on open(2)
2247  */
2248 int
2250 {
2251  Assert(FileIsValid(file));
2252  return VfdCache[file].fileFlags;
2253 }
2254 
2255 /*
2256  * FileGetRawMode - returns the mode bitmask passed to open(2)
2257  */
2258 mode_t
2260 {
2261  Assert(FileIsValid(file));
2262  return VfdCache[file].fileMode;
2263 }
2264 
2265 /*
2266  * Make room for another allocatedDescs[] array entry if needed and possible.
2267  * Returns true if an array element is available.
2268  */
2269 static bool
2271 {
2272  AllocateDesc *newDescs;
2273  int newMax;
2274 
2275  /* Quick out if array already has a free slot. */
2277  return true;
2278 
2279  /*
2280  * If the array hasn't yet been created in the current process, initialize
2281  * it with FD_MINFREE / 2 elements. In many scenarios this is as many as
2282  * we will ever need, anyway. We don't want to look at max_safe_fds
2283  * immediately because set_max_safe_fds() may not have run yet.
2284  */
2285  if (allocatedDescs == NULL)
2286  {
2287  newMax = FD_MINFREE / 2;
2288  newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2289  /* Out of memory already? Treat as fatal error. */
2290  if (newDescs == NULL)
2291  ereport(ERROR,
2292  (errcode(ERRCODE_OUT_OF_MEMORY),
2293  errmsg("out of memory")));
2294  allocatedDescs = newDescs;
2295  maxAllocatedDescs = newMax;
2296  return true;
2297  }
2298 
2299  /*
2300  * Consider enlarging the array beyond the initial allocation used above.
2301  * By the time this happens, max_safe_fds should be known accurately.
2302  *
2303  * We mustn't let allocated descriptors hog all the available FDs, and in
2304  * practice we'd better leave a reasonable number of FDs for VFD use. So
2305  * set the maximum to max_safe_fds / 2. (This should certainly be at
2306  * least as large as the initial size, FD_MINFREE / 2.)
2307  */
2308  newMax = max_safe_fds / 2;
2309  if (newMax > maxAllocatedDescs)
2310  {
2311  newDescs = (AllocateDesc *) realloc(allocatedDescs,
2312  newMax * sizeof(AllocateDesc));
2313  /* Treat out-of-memory as a non-fatal error. */
2314  if (newDescs == NULL)
2315  return false;
2316  allocatedDescs = newDescs;
2317  maxAllocatedDescs = newMax;
2318  return true;
2319  }
2320 
2321  /* Can't enlarge allocatedDescs[] any more. */
2322  return false;
2323 }
2324 
2325 /*
2326  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2327  * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2328  * necessary to open the file. When done, call FreeFile rather than fclose.
2329  *
2330  * Note that files that will be open for any significant length of time
2331  * should NOT be handled this way, since they cannot share kernel file
2332  * descriptors with other files; there is grave risk of running out of FDs
2333  * if anyone locks down too many FDs. Most callers of this routine are
2334  * simply reading a config file that they will read and close immediately.
2335  *
2336  * fd.c will automatically close all files opened with AllocateFile at
2337  * transaction commit or abort; this prevents FD leakage if a routine
2338  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2339  *
2340  * Ideally this should be the *only* direct call of fopen() in the backend.
2341  */
2342 FILE *
2343 AllocateFile(const char *name, const char *mode)
2344 {
2345  FILE *file;
2346 
2347  DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2348  numAllocatedDescs, name));
2349 
2350  /* Can we allocate another non-virtual FD? */
2351  if (!reserveAllocatedDesc())
2352  ereport(ERROR,
2353  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2354  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2355  maxAllocatedDescs, name)));
2356 
2357  /* Close excess kernel FDs. */
2358  ReleaseLruFiles();
2359 
2360 TryAgain:
2361  if ((file = fopen(name, mode)) != NULL)
2362  {
2363  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2364 
2365  desc->kind = AllocateDescFile;
2366  desc->desc.file = file;
2369  return desc->desc.file;
2370  }
2371 
2372  if (errno == EMFILE || errno == ENFILE)
2373  {
2374  int save_errno = errno;
2375 
2376  ereport(LOG,
2377  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2378  errmsg("out of file descriptors: %m; release and retry")));
2379  errno = 0;
2380  if (ReleaseLruFile())
2381  goto TryAgain;
2382  errno = save_errno;
2383  }
2384 
2385  return NULL;
2386 }
2387 
2388 /*
2389  * Open a file with OpenTransientFilePerm() and pass default file mode for
2390  * the fileMode parameter.
2391  */
2392 int
2394 {
2395  return OpenTransientFilePerm(fileName, fileFlags, PG_FILE_MODE_DEFAULT);
2396 }
2397 
2398 /*
2399  * Like AllocateFile, but returns an unbuffered fd like open(2)
2400  */
2401 int
2403 {
2404  int fd;
2405 
2406  DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2407  numAllocatedDescs, fileName));
2408 
2409  /* Can we allocate another non-virtual FD? */
2410  if (!reserveAllocatedDesc())
2411  ereport(ERROR,
2412  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2413  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2414  maxAllocatedDescs, fileName)));
2415 
2416  /* Close excess kernel FDs. */
2417  ReleaseLruFiles();
2418 
2419  fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2420 
2421  if (fd >= 0)
2422  {
2423  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2424 
2425  desc->kind = AllocateDescRawFD;
2426  desc->desc.fd = fd;
2429 
2430  return fd;
2431  }
2432 
2433  return -1; /* failure */
2434 }
2435 
2436 /*
2437  * Routines that want to initiate a pipe stream should use OpenPipeStream
2438  * rather than plain popen(). This lets fd.c deal with freeing FDs if
2439  * necessary. When done, call ClosePipeStream rather than pclose.
2440  */
2441 FILE *
2442 OpenPipeStream(const char *command, const char *mode)
2443 {
2444  FILE *file;
2445 
2446  DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2447  numAllocatedDescs, command));
2448 
2449  /* Can we allocate another non-virtual FD? */
2450  if (!reserveAllocatedDesc())
2451  ereport(ERROR,
2452  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2453  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2454  maxAllocatedDescs, command)));
2455 
2456  /* Close excess kernel FDs. */
2457  ReleaseLruFiles();
2458 
2459 TryAgain:
2460  fflush(stdout);
2461  fflush(stderr);
2462  errno = 0;
2463  if ((file = popen(command, mode)) != NULL)
2464  {
2465  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2466 
2467  desc->kind = AllocateDescPipe;
2468  desc->desc.file = file;
2471  return desc->desc.file;
2472  }
2473 
2474  if (errno == EMFILE || errno == ENFILE)
2475  {
2476  int save_errno = errno;
2477 
2478  ereport(LOG,
2479  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2480  errmsg("out of file descriptors: %m; release and retry")));
2481  errno = 0;
2482  if (ReleaseLruFile())
2483  goto TryAgain;
2484  errno = save_errno;
2485  }
2486 
2487  return NULL;
2488 }
2489 
2490 /*
2491  * Free an AllocateDesc of any type.
2492  *
2493  * The argument *must* point into the allocatedDescs[] array.
2494  */
2495 static int
2497 {
2498  int result;
2499 
2500  /* Close the underlying object */
2501  switch (desc->kind)
2502  {
2503  case AllocateDescFile:
2504  result = fclose(desc->desc.file);
2505  break;
2506  case AllocateDescPipe:
2507  result = pclose(desc->desc.file);
2508  break;
2509  case AllocateDescDir:
2510  result = closedir(desc->desc.dir);
2511  break;
2512  case AllocateDescRawFD:
2513  result = close(desc->desc.fd);
2514  break;
2515  default:
2516  elog(ERROR, "AllocateDesc kind not recognized");
2517  result = 0; /* keep compiler quiet */
2518  break;
2519  }
2520 
2521  /* Compact storage in the allocatedDescs array */
2523  *desc = allocatedDescs[numAllocatedDescs];
2524 
2525  return result;
2526 }
2527 
2528 /*
2529  * Close a file returned by AllocateFile.
2530  *
2531  * Note we do not check fclose's return value --- it is up to the caller
2532  * to handle close errors.
2533  */
2534 int
2535 FreeFile(FILE *file)
2536 {
2537  int i;
2538 
2539  DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2540 
2541  /* Remove file from list of allocated files, if it's present */
2542  for (i = numAllocatedDescs; --i >= 0;)
2543  {
2544  AllocateDesc *desc = &allocatedDescs[i];
2545 
2546  if (desc->kind == AllocateDescFile && desc->desc.file == file)
2547  return FreeDesc(desc);
2548  }
2549 
2550  /* Only get here if someone passes us a file not in allocatedDescs */
2551  elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2552 
2553  return fclose(file);
2554 }
2555 
2556 /*
2557  * Close a file returned by OpenTransientFile.
2558  *
2559  * Note we do not check close's return value --- it is up to the caller
2560  * to handle close errors.
2561  */
2562 int
2564 {
2565  int i;
2566 
2567  DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2568 
2569  /* Remove fd from list of allocated files, if it's present */
2570  for (i = numAllocatedDescs; --i >= 0;)
2571  {
2572  AllocateDesc *desc = &allocatedDescs[i];
2573 
2574  if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2575  return FreeDesc(desc);
2576  }
2577 
2578  /* Only get here if someone passes us a file not in allocatedDescs */
2579  elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2580 
2581  return close(fd);
2582 }
2583 
2584 /*
2585  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2586  * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2587  * necessary to open the directory, and with closing it after an elog.
2588  * When done, call FreeDir rather than closedir.
2589  *
2590  * Returns NULL, with errno set, on failure. Note that failure detection
2591  * is commonly left to the following call of ReadDir or ReadDirExtended;
2592  * see the comments for ReadDir.
2593  *
2594  * Ideally this should be the *only* direct call of opendir() in the backend.
2595  */
2596 DIR *
2597 AllocateDir(const char *dirname)
2598 {
2599  DIR *dir;
2600 
2601  DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2602  numAllocatedDescs, dirname));
2603 
2604  /* Can we allocate another non-virtual FD? */
2605  if (!reserveAllocatedDesc())
2606  ereport(ERROR,
2607  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2608  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2609  maxAllocatedDescs, dirname)));
2610 
2611  /* Close excess kernel FDs. */
2612  ReleaseLruFiles();
2613 
2614 TryAgain:
2615  if ((dir = opendir(dirname)) != NULL)
2616  {
2617  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2618 
2619  desc->kind = AllocateDescDir;
2620  desc->desc.dir = dir;
2623  return desc->desc.dir;
2624  }
2625 
2626  if (errno == EMFILE || errno == ENFILE)
2627  {
2628  int save_errno = errno;
2629 
2630  ereport(LOG,
2631  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2632  errmsg("out of file descriptors: %m; release and retry")));
2633  errno = 0;
2634  if (ReleaseLruFile())
2635  goto TryAgain;
2636  errno = save_errno;
2637  }
2638 
2639  return NULL;
2640 }
2641 
2642 /*
2643  * Read a directory opened with AllocateDir, ereport'ing any error.
2644  *
2645  * This is easier to use than raw readdir() since it takes care of some
2646  * otherwise rather tedious and error-prone manipulation of errno. Also,
2647  * if you are happy with a generic error message for AllocateDir failure,
2648  * you can just do
2649  *
2650  * dir = AllocateDir(path);
2651  * while ((dirent = ReadDir(dir, path)) != NULL)
2652  * process dirent;
2653  * FreeDir(dir);
2654  *
2655  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2656  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2657  * use this shortcut.)
2658  *
2659  * The pathname passed to AllocateDir must be passed to this routine too,
2660  * but it is only used for error reporting.
2661  */
2662 struct dirent *
2663 ReadDir(DIR *dir, const char *dirname)
2664 {
2665  return ReadDirExtended(dir, dirname, ERROR);
2666 }
2667 
2668 /*
2669  * Alternate version of ReadDir that allows caller to specify the elevel
2670  * for any error report (whether it's reporting an initial failure of
2671  * AllocateDir or a subsequent directory read failure).
2672  *
2673  * If elevel < ERROR, returns NULL after any error. With the normal coding
2674  * pattern, this will result in falling out of the loop immediately as
2675  * though the directory contained no (more) entries.
2676  */
2677 struct dirent *
2678 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2679 {
2680  struct dirent *dent;
2681 
2682  /* Give a generic message for AllocateDir failure, if caller didn't */
2683  if (dir == NULL)
2684  {
2685  ereport(elevel,
2687  errmsg("could not open directory \"%s\": %m",
2688  dirname)));
2689  return NULL;
2690  }
2691 
2692  errno = 0;
2693  if ((dent = readdir(dir)) != NULL)
2694  return dent;
2695 
2696  if (errno)
2697  ereport(elevel,
2699  errmsg("could not read directory \"%s\": %m",
2700  dirname)));
2701  return NULL;
2702 }
2703 
2704 /*
2705  * Close a directory opened with AllocateDir.
2706  *
2707  * Returns closedir's return value (with errno set if it's not 0).
2708  * Note we do not check the return value --- it is up to the caller
2709  * to handle close errors if wanted.
2710  *
2711  * Does nothing if dir == NULL; we assume that directory open failure was
2712  * already reported if desired.
2713  */
2714 int
2716 {
2717  int i;
2718 
2719  /* Nothing to do if AllocateDir failed */
2720  if (dir == NULL)
2721  return 0;
2722 
2723  DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2724 
2725  /* Remove dir from list of allocated dirs, if it's present */
2726  for (i = numAllocatedDescs; --i >= 0;)
2727  {
2728  AllocateDesc *desc = &allocatedDescs[i];
2729 
2730  if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2731  return FreeDesc(desc);
2732  }
2733 
2734  /* Only get here if someone passes us a dir not in allocatedDescs */
2735  elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2736 
2737  return closedir(dir);
2738 }
2739 
2740 
2741 /*
2742  * Close a pipe stream returned by OpenPipeStream.
2743  */
2744 int
2745 ClosePipeStream(FILE *file)
2746 {
2747  int i;
2748 
2749  DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2750 
2751  /* Remove file from list of allocated files, if it's present */
2752  for (i = numAllocatedDescs; --i >= 0;)
2753  {
2754  AllocateDesc *desc = &allocatedDescs[i];
2755 
2756  if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2757  return FreeDesc(desc);
2758  }
2759 
2760  /* Only get here if someone passes us a file not in allocatedDescs */
2761  elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2762 
2763  return pclose(file);
2764 }
2765 
2766 /*
2767  * closeAllVfds
2768  *
2769  * Force all VFDs into the physically-closed state, so that the fewest
2770  * possible number of kernel file descriptors are in use. There is no
2771  * change in the logical state of the VFDs.
2772  */
2773 void
2775 {
2776  Index i;
2777 
2778  if (SizeVfdCache > 0)
2779  {
2780  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2781  for (i = 1; i < SizeVfdCache; i++)
2782  {
2783  if (!FileIsNotOpen(i))
2784  LruDelete(i);
2785  }
2786  }
2787 }
2788 
2789 
2790 /*
2791  * SetTempTablespaces
2792  *
2793  * Define a list (actually an array) of OIDs of tablespaces to use for
2794  * temporary files. This list will be used until end of transaction,
2795  * unless this function is called again before then. It is caller's
2796  * responsibility that the passed-in array has adequate lifespan (typically
2797  * it'd be allocated in TopTransactionContext).
2798  */
2799 void
2800 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2801 {
2802  Assert(numSpaces >= 0);
2803  tempTableSpaces = tableSpaces;
2804  numTempTableSpaces = numSpaces;
2805 
2806  /*
2807  * Select a random starting point in the list. This is to minimize
2808  * conflicts between backends that are most likely sharing the same list
2809  * of temp tablespaces. Note that if we create multiple temp files in the
2810  * same transaction, we'll advance circularly through the list --- this
2811  * ensures that large temporary sort files are nicely spread across all
2812  * available tablespaces.
2813  */
2814  if (numSpaces > 1)
2815  nextTempTableSpace = random() % numSpaces;
2816  else
2817  nextTempTableSpace = 0;
2818 }
2819 
2820 /*
2821  * TempTablespacesAreSet
2822  *
2823  * Returns true if SetTempTablespaces has been called in current transaction.
2824  * (This is just so that tablespaces.c doesn't need its own per-transaction
2825  * state.)
2826  */
2827 bool
2829 {
2830  return (numTempTableSpaces >= 0);
2831 }
2832 
2833 /*
2834  * GetTempTablespaces
2835  *
2836  * Populate an array with the OIDs of the tablespaces that should be used for
2837  * temporary files. Return the number that were copied into the output array.
2838  */
2839 int
2840 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2841 {
2842  int i;
2843 
2845  for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2846  tableSpaces[i] = tempTableSpaces[i];
2847 
2848  return i;
2849 }
2850 
2851 /*
2852  * GetNextTempTableSpace
2853  *
2854  * Select the next temp tablespace to use. A result of InvalidOid means
2855  * to use the current database's default tablespace.
2856  */
2857 Oid
2859 {
2860  if (numTempTableSpaces > 0)
2861  {
2862  /* Advance nextTempTableSpace counter with wraparound */
2864  nextTempTableSpace = 0;
2866  }
2867  return InvalidOid;
2868 }
2869 
2870 
2871 /*
2872  * AtEOSubXact_Files
2873  *
2874  * Take care of subtransaction commit/abort. At abort, we close temp files
2875  * that the subtransaction may have opened. At commit, we reassign the
2876  * files that were opened to the parent subtransaction.
2877  */
2878 void
2879 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2880  SubTransactionId parentSubid)
2881 {
2882  Index i;
2883 
2884  for (i = 0; i < numAllocatedDescs; i++)
2885  {
2886  if (allocatedDescs[i].create_subid == mySubid)
2887  {
2888  if (isCommit)
2889  allocatedDescs[i].create_subid = parentSubid;
2890  else
2891  {
2892  /* have to recheck the item after FreeDesc (ugly) */
2893  FreeDesc(&allocatedDescs[i--]);
2894  }
2895  }
2896  }
2897 }
2898 
2899 /*
2900  * AtEOXact_Files
2901  *
2902  * This routine is called during transaction commit or abort (it doesn't
2903  * particularly care which). All still-open per-transaction temporary file
2904  * VFDs are closed, which also causes the underlying files to be deleted
2905  * (although they should've been closed already by the ResourceOwner
2906  * cleanup). Furthermore, all "allocated" stdio files are closed. We also
2907  * forget any transaction-local temp tablespace list.
2908  */
2909 void
2911 {
2912  CleanupTempFiles(false);
2913  tempTableSpaces = NULL;
2914  numTempTableSpaces = -1;
2915 }
2916 
2917 /*
2918  * AtProcExit_Files
2919  *
2920  * on_proc_exit hook to clean up temp files during backend shutdown.
2921  * Here, we want to clean up *all* temp files including interXact ones.
2922  */
2923 static void
2925 {
2926  CleanupTempFiles(true);
2927 }
2928 
2929 /*
2930  * Close temporary files and delete their underlying files.
2931  *
2932  * isProcExit: if true, this is being called as the backend process is
2933  * exiting. If that's the case, we should remove all temporary files; if
2934  * that's not the case, we are being called for transaction commit/abort
2935  * and should only remove transaction-local temp files. In either case,
2936  * also clean up "allocated" stdio files, dirs and fds.
2937  */
2938 static void
2939 CleanupTempFiles(bool isProcExit)
2940 {
2941  Index i;
2942 
2943  /*
2944  * Careful here: at proc_exit we need extra cleanup, not just
2945  * xact_temporary files.
2946  */
2947  if (isProcExit || have_xact_temporary_files)
2948  {
2949  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2950  for (i = 1; i < SizeVfdCache; i++)
2951  {
2952  unsigned short fdstate = VfdCache[i].fdstate;
2953 
2954  if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
2955  VfdCache[i].fileName != NULL)
2956  {
2957  /*
2958  * If we're in the process of exiting a backend process, close
2959  * all temporary files. Otherwise, only close temporary files
2960  * local to the current transaction. They should be closed by
2961  * the ResourceOwner mechanism already, so this is just a
2962  * debugging cross-check.
2963  */
2964  if (isProcExit)
2965  FileClose(i);
2966  else if (fdstate & FD_CLOSE_AT_EOXACT)
2967  {
2968  elog(WARNING,
2969  "temporary file %s not closed at end-of-transaction",
2970  VfdCache[i].fileName);
2971  FileClose(i);
2972  }
2973  }
2974  }
2975 
2976  have_xact_temporary_files = false;
2977  }
2978 
2979  /* Clean up "allocated" stdio files, dirs and fds. */
2980  while (numAllocatedDescs > 0)
2981  FreeDesc(&allocatedDescs[0]);
2982 }
2983 
2984 
2985 /*
2986  * Remove temporary and temporary relation files left over from a prior
2987  * postmaster session
2988  *
2989  * This should be called during postmaster startup. It will forcibly
2990  * remove any leftover files created by OpenTemporaryFile and any leftover
2991  * temporary relation files created by mdcreate.
2992  *
2993  * NOTE: we could, but don't, call this during a post-backend-crash restart
2994  * cycle. The argument for not doing it is that someone might want to examine
2995  * the temp files for debugging purposes. This does however mean that
2996  * OpenTemporaryFile had better allow for collision with an existing temp
2997  * file name.
2998  *
2999  * NOTE: this function and its subroutines generally report syscall failures
3000  * with ereport(LOG) and keep going. Removing temp files is not so critical
3001  * that we should fail to start the database when we can't do it.
3002  */
3003 void
3005 {
3006  char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3007  DIR *spc_dir;
3008  struct dirent *spc_de;
3009 
3010  /*
3011  * First process temp files in pg_default ($PGDATA/base)
3012  */
3013  snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3014  RemovePgTempFilesInDir(temp_path, true, false);
3015  RemovePgTempRelationFiles("base");
3016 
3017  /*
3018  * Cycle through temp directories for all non-default tablespaces.
3019  */
3020  spc_dir = AllocateDir("pg_tblspc");
3021 
3022  while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
3023  {
3024  if (strcmp(spc_de->d_name, ".") == 0 ||
3025  strcmp(spc_de->d_name, "..") == 0)
3026  continue;
3027 
3028  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
3030  RemovePgTempFilesInDir(temp_path, true, false);
3031 
3032  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
3034  RemovePgTempRelationFiles(temp_path);
3035  }
3036 
3037  FreeDir(spc_dir);
3038 
3039  /*
3040  * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3041  * DataDir as well.
3042  */
3043 #ifdef EXEC_BACKEND
3045 #endif
3046 }
3047 
3048 /*
3049  * Process one pgsql_tmp directory for RemovePgTempFiles.
3050  *
3051  * If missing_ok is true, it's all right for the named directory to not exist.
3052  * Any other problem results in a LOG message. (missing_ok should be true at
3053  * the top level, since pgsql_tmp directories are not created until needed.)
3054  *
3055  * At the top level, this should be called with unlink_all = false, so that
3056  * only files matching the temporary name prefix will be unlinked. When
3057  * recursing it will be called with unlink_all = true to unlink everything
3058  * under a top-level temporary directory.
3059  *
3060  * (These two flags could be replaced by one, but it seems clearer to keep
3061  * them separate.)
3062  */
3063 static void
3064 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3065 {
3066  DIR *temp_dir;
3067  struct dirent *temp_de;
3068  char rm_path[MAXPGPATH * 2];
3069 
3070  temp_dir = AllocateDir(tmpdirname);
3071 
3072  if (temp_dir == NULL && errno == ENOENT && missing_ok)
3073  return;
3074 
3075  while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3076  {
3077  if (strcmp(temp_de->d_name, ".") == 0 ||
3078  strcmp(temp_de->d_name, "..") == 0)
3079  continue;
3080 
3081  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3082  tmpdirname, temp_de->d_name);
3083 
3084  if (unlink_all ||
3085  strncmp(temp_de->d_name,
3087  strlen(PG_TEMP_FILE_PREFIX)) == 0)
3088  {
3089  struct stat statbuf;
3090 
3091  if (lstat(rm_path, &statbuf) < 0)
3092  {
3093  ereport(LOG,
3095  errmsg("could not stat file \"%s\": %m", rm_path)));
3096  continue;
3097  }
3098 
3099  if (S_ISDIR(statbuf.st_mode))
3100  {
3101  /* recursively remove contents, then directory itself */
3102  RemovePgTempFilesInDir(rm_path, false, true);
3103 
3104  if (rmdir(rm_path) < 0)
3105  ereport(LOG,
3107  errmsg("could not remove directory \"%s\": %m",
3108  rm_path)));
3109  }
3110  else
3111  {
3112  if (unlink(rm_path) < 0)
3113  ereport(LOG,
3115  errmsg("could not remove file \"%s\": %m",
3116  rm_path)));
3117  }
3118  }
3119  else
3120  ereport(LOG,
3121  (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3122  rm_path)));
3123  }
3124 
3125  FreeDir(temp_dir);
3126 }
3127 
3128 /* Process one tablespace directory, look for per-DB subdirectories */
3129 static void
3130 RemovePgTempRelationFiles(const char *tsdirname)
3131 {
3132  DIR *ts_dir;
3133  struct dirent *de;
3134  char dbspace_path[MAXPGPATH * 2];
3135 
3136  ts_dir = AllocateDir(tsdirname);
3137 
3138  while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3139  {
3140  /*
3141  * We're only interested in the per-database directories, which have
3142  * numeric names. Note that this code will also (properly) ignore "."
3143  * and "..".
3144  */
3145  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3146  continue;
3147 
3148  snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3149  tsdirname, de->d_name);
3150  RemovePgTempRelationFilesInDbspace(dbspace_path);
3151  }
3152 
3153  FreeDir(ts_dir);
3154 }
3155 
3156 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3157 static void
3158 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3159 {
3160  DIR *dbspace_dir;
3161  struct dirent *de;
3162  char rm_path[MAXPGPATH * 2];
3163 
3164  dbspace_dir = AllocateDir(dbspacedirname);
3165 
3166  while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3167  {
3168  if (!looks_like_temp_rel_name(de->d_name))
3169  continue;
3170 
3171  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3172  dbspacedirname, de->d_name);
3173 
3174  if (unlink(rm_path) < 0)
3175  ereport(LOG,
3177  errmsg("could not remove file \"%s\": %m",
3178  rm_path)));
3179  }
3180 
3181  FreeDir(dbspace_dir);
3182 }
3183 
/*
 * Does "name" look like a temp relation file name?
 *
 * Accepted forms are t<digits>_<digits> or t<digits>_<digits>_<forkname>,
 * each optionally followed by .<digits> (a segment suffix).
 */
static bool
looks_like_temp_rel_name(const char *name)
{
	int			pos;
	int			savepos;

	/* Must start with "t". */
	if (name[0] != 't')
		return false;

	/* Followed by a non-empty string of digits and then an underscore. */
	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (pos == 1 || name[pos] != '_')
		return false;

	/* Followed by another nonempty string of digits. */
	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (savepos == pos)
		return false;

	/* We might have _forkname or .segment or both. */
	if (name[pos] == '_')
	{
		int			forkchar = forkname_chars(&name[pos + 1], NULL);

		if (forkchar <= 0)
			return false;
		/* skip the underscore plus the fork name */
		pos += forkchar + 1;
	}
	if (name[pos] == '.')
	{
		int			segchar;

		/* segchar counts the dot plus the digits after it */
		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
			;
		if (segchar <= 1)
			return false;
		pos += segchar;
	}

	/* Now we should be at the end. */
	if (name[pos] != '\0')
		return false;
	return true;
}
3232 
3233 
3234 /*
3235  * Issue fsync recursively on PGDATA and all its contents.
3236  *
3237  * We fsync regular files and directories wherever they are, but we
3238  * follow symlinks only for pg_wal and immediately under pg_tblspc.
3239  * Other symlinks are presumed to point at files we're not responsible
3240  * for fsyncing, and might not have privileges to write at all.
3241  *
3242  * Errors are logged but not considered fatal; that's because this is used
3243  * only during database startup, to deal with the possibility that there are
3244  * issued-but-unsynced writes pending against the data directory. We want to
3245  * ensure that such writes reach disk before anything that's done in the new
3246  * run. However, aborting on error would result in failure to start for
3247  * harmless cases such as read-only files in the data directory, and that's
3248  * not good either.
3249  *
3250  * Note we assume we're chdir'd into PGDATA to begin with.
3251  */
3252 void
3254 {
3255  bool xlog_is_symlink;
3256 
3257  /* We can skip this whole thing if fsync is disabled. */
3258  if (!enableFsync)
3259  return;
3260 
3261  /*
3262  * If pg_wal is a symlink, we'll need to recurse into it separately,
3263  * because the first walkdir below will ignore it.
3264  */
3265  xlog_is_symlink = false;
3266 
3267 #ifndef WIN32
3268  {
3269  struct stat st;
3270 
3271  if (lstat("pg_wal", &st) < 0)
3272  ereport(LOG,
3274  errmsg("could not stat file \"%s\": %m",
3275  "pg_wal")));
3276  else if (S_ISLNK(st.st_mode))
3277  xlog_is_symlink = true;
3278  }
3279 #else
3280  if (pgwin32_is_junction("pg_wal"))
3281  xlog_is_symlink = true;
3282 #endif
3283 
3284  /*
3285  * If possible, hint to the kernel that we're soon going to fsync the data
3286  * directory and its contents. Errors in this step are even less
3287  * interesting than normal, so log them only at DEBUG1.
3288  */
3289 #ifdef PG_FLUSH_DATA_WORKS
3290  walkdir(".", pre_sync_fname, false, DEBUG1);
3291  if (xlog_is_symlink)
3292  walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3293  walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3294 #endif
3295 
3296  /*
3297  * Now we do the fsync()s in the same order.
3298  *
3299  * The main call ignores symlinks, so in addition to specially processing
3300  * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3301  * process_symlinks = true. Note that if there are any plain directories
3302  * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3303  * so we don't worry about optimizing it.
3304  */
3305  walkdir(".", datadir_fsync_fname, false, LOG);
3306  if (xlog_is_symlink)
3307  walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3308  walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3309 }
3310 
3311 /*
3312  * walkdir: recursively walk a directory, applying the action to each
3313  * regular file and directory (including the named directory itself).
3314  *
3315  * If process_symlinks is true, the action and recursion are also applied
3316  * to regular files and directories that are pointed to by symlinks in the
3317  * given directory; otherwise symlinks are ignored. Symlinks are always
3318  * ignored in subdirectories, ie we intentionally don't pass down the
3319  * process_symlinks flag to recursive calls.
3320  *
3321  * Errors are reported at level elevel, which might be ERROR or less.
3322  *
3323  * See also walkdir in initdb.c, which is a frontend version of this logic.
3324  */
3325 static void
3326 walkdir(const char *path,
3327  void (*action) (const char *fname, bool isdir, int elevel),
3328  bool process_symlinks,
3329  int elevel)
3330 {
3331  DIR *dir;
3332  struct dirent *de;
3333 
3334  dir = AllocateDir(path);
3335 
3336  while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3337  {
3338  char subpath[MAXPGPATH * 2];
3339  struct stat fst;
3340  int sret;
3341 
3343 
3344  if (strcmp(de->d_name, ".") == 0 ||
3345  strcmp(de->d_name, "..") == 0)
3346  continue;
3347 
3348  snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3349 
3350  if (process_symlinks)
3351  sret = stat(subpath, &fst);
3352  else
3353  sret = lstat(subpath, &fst);
3354 
3355  if (sret < 0)
3356  {
3357  ereport(elevel,
3359  errmsg("could not stat file \"%s\": %m", subpath)));
3360  continue;
3361  }
3362 
3363  if (S_ISREG(fst.st_mode))
3364  (*action) (subpath, false, elevel);
3365  else if (S_ISDIR(fst.st_mode))
3366  walkdir(subpath, action, false, elevel);
3367  }
3368 
3369  FreeDir(dir); /* we ignore any error here */
3370 
3371  /*
3372  * It's important to fsync the destination directory itself as individual
3373  * file fsyncs don't guarantee that the directory entry for the file is
3374  * synced. However, skip this if AllocateDir failed; the action function
3375  * might not be robust against that.
3376  */
3377  if (dir)
3378  (*action) (path, true, elevel);
3379 }
3380 
3381 
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Directories are skipped outright.  Ignores errors trying to open
 * unreadable files (EACCES), and logs other open errors at a
 * caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* Unreadable files are silently skipped */
		if (errno == EACCES)
			return;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	(void) CloseTransientFile(fd);
}

#endif							/* PG_FLUSH_DATA_WORKS */
3421 
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * Action routine handed to walkdir() when syncing the data directory.
	 * Unreadable files should be skipped silently, so ask fsync_fname_ext()
	 * to ignore permission errors.  Its result is deliberately discarded.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3431 
/*
 * Remove the named file or directory; the signature matches the action
 * callback expected by walkdir().
 *
 * For directories, a missing directory (ENOENT) is not an error; any other
 * rmdir() failure is reported at elevel.  For plain files, elevel is not
 * used: removal is delegated to PathNameDeleteTemporaryFile() with
 * error_on_failure = false.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not rmdir directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}
3448 
3449 /*
3450  * fsync_fname_ext -- Try to fsync a file or directory
3451  *
3452  * If ignore_perm is true, ignore errors upon trying to open unreadable
3453  * files. Logs other errors at a caller-specified level.
3454  *
3455  * Returns 0 if the operation succeeded, -1 otherwise.
3456  */
3457 static int
3458 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3459 {
3460  int fd;
3461  int flags;
3462  int returncode;
3463 
3464  /*
3465  * Some OSs require directories to be opened read-only whereas other
3466  * systems don't allow us to fsync files opened read-only; so we need both
3467  * cases here. Using O_RDWR will cause us to fail to fsync files that are
3468  * not writable by our userid, but we assume that's OK.
3469  */
3470  flags = PG_BINARY;
3471  if (!isdir)
3472  flags |= O_RDWR;
3473  else
3474  flags |= O_RDONLY;
3475 
3476  fd = OpenTransientFile(fname, flags);
3477 
3478  /*
3479  * Some OSs don't allow us to open directories at all (Windows returns
3480  * EACCES), just ignore the error in that case. If desired also silently
3481  * ignoring errors about unreadable files. Log others.
3482  */
3483  if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3484  return 0;
3485  else if (fd < 0 && ignore_perm && errno == EACCES)
3486  return 0;
3487  else if (fd < 0)
3488  {
3489  ereport(elevel,
3491  errmsg("could not open file \"%s\": %m", fname)));
3492  return -1;
3493  }
3494 
3495  returncode = pg_fsync(fd);
3496 
3497  /*
3498  * Some OSes don't allow us to fsync directories at all, so we can ignore
3499  * those errors. Anything else needs to be logged.
3500  */
3501  if (returncode != 0 && !(isdir && errno == EBADF))
3502  {
3503  int save_errno;
3504 
3505  /* close file upon error, might not be in transaction context */
3506  save_errno = errno;
3507  (void) CloseTransientFile(fd);
3508  errno = save_errno;
3509 
3510  ereport(elevel,
3512  errmsg("could not fsync file \"%s\": %m", fname)));
3513  return -1;
3514  }
3515 
3516  (void) CloseTransientFile(fd);
3517 
3518  return 0;
3519 }
3520 
3521 /*
3522  * fsync_parent_path -- fsync the parent path of a file or directory
3523  *
3524  * This is aimed at making file operations persistent on disk in case of
3525  * an OS crash or power failure.
3526  */
3527 static int
3528 fsync_parent_path(const char *fname, int elevel)
3529 {
3530  char parentpath[MAXPGPATH];
3531 
3532  strlcpy(parentpath, fname, MAXPGPATH);
3533  get_parent_directory(parentpath);
3534 
3535  /*
3536  * get_parent_directory() returns an empty string if the input argument is
3537  * just a file name (see comments in path.c), so handle that as being the
3538  * current directory.
3539  */
3540  if (strlen(parentpath) == 0)
3541  strlcpy(parentpath, ".", MAXPGPATH);
3542 
3543  if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3544  return -1;
3545 
3546  return 0;
3547 }
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1371
File lruLessRecently
Definition: fd.c:197
void closeAllVfds(void)
Definition: fd.c:2774
File nextFree
Definition: fd.c:195
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition: fd.c:812
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1703
#define MAP_FAILED
Definition: mem.h:45
#define DEBUG1
Definition: elog.h:25
int MyProcPid
Definition: globals.c:39
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1635
#define NUM_RESERVED_FDS
Definition: fd.c:119
static AllocateDesc * allocatedDescs
Definition: fd.c:259
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1358
int pg_fdatasync(int fd)
Definition: fd.c:400
static void error(void)
Definition: sql-dyntest.c:147
#define SYNC_METHOD_FSYNC_WRITETHROUGH
Definition: xlog.h:28
AllocateDescKind
Definition: fd.c:237
DIR * dir
Definition: fd.c:252
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition: fd.c:1578
static void AtProcExit_Files(int code, Datum arg)
Definition: fd.c:2924
#define write(a, b, c)
Definition: win32.h:14
static Size SizeVfdCache
Definition: fd.c:212
#define FD_TEMP_FILE_LIMIT
Definition: fd.c:188
void on_proc_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:292
#define DO_DB(A)
Definition: fd.c:165
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2840
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition: fd.c:3326
long random(void)
Definition: random.c:22
ResourceOwner CurrentResourceOwner
Definition: resowner.c:138
int pg_fsync_writethrough(int fd)
Definition: fd.c:377
int forkname_chars(const char *str, ForkNumber *fork)
Definition: relpath.c:79
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2678
int max_safe_fds
Definition: fd.c:152
#define Min(x, y)
Definition: c.h:826
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:582
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:2402
#define FD_DELETE_AT_CLOSE
Definition: fd.c:186
int log_temp_files
Definition: guc.c:455
mode_t FileGetRawMode(File file)
Definition: fd.c:2259
#define GLOBALTABLESPACE_OID
Definition: pg_tablespace.h:64
void _dosmaperr(unsigned long)
Definition: win32error.c:171
static Vfd * VfdCache
Definition: fd.c:211
static void Delete(File file)
Definition: fd.c:1009
int closedir(DIR *)
Definition: dirent.c:111
static int numTempTableSpaces
Definition: fd.c:272
int errcode(int sqlerrcode)
Definition: elog.c:575
#define MemSet(start, val, len)
Definition: c.h:877
void PathNameDeleteTemporaryDir(const char *dirname)
Definition: fd.c:1467
int pg_fsync_no_writethrough(int fd)
Definition: fd.c:365
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition: fd.c:3158
int snprintf(char *str, size_t count, const char *fmt,...) pg_attribute_printf(3
#define PG_TEMP_FILE_PREFIX
Definition: fd.h:140
File PathNameOpenTemporaryFile(const char *path)
Definition: fd.c:1673
void pgstat_report_tempfile(size_t filesize)
Definition: pgstat.c:1525
static bool reserveAllocatedDesc(void)
Definition: fd.c:2270
uint32 SubTransactionId
Definition: c.h:459
void TempTablespacePath(char *path, Oid tablespace)
Definition: fd.c:1553
#define LOG
Definition: elog.h:26
unsigned int Oid
Definition: postgres_ext.h:31
#define FilePosIsUnknown(pos)
Definition: fd.c:183
AllocateDescKind kind
Definition: fd.c:247
char * FilePathName(File file)
Definition: fd.c:2223
Definition: dirent.h:9
#define OidIsValid(objectId)
Definition: c.h:586
#define PG_BINARY
Definition: c.h:1049
static char * basedir
Definition: pg_basebackup.c:78
Oid MyDatabaseTableSpace
Definition: globals.c:79
int ClosePipeStream(FILE *file)
Definition: fd.c:2745
#define malloc(a)
Definition: header.h:50
static void LruDelete(File file)
Definition: fd.c:1028
void pg_usleep(long microsec)
Definition: signal.c:53
bool TempTablespacesAreSet(void)
Definition: fd.c:2828
#define fsync(fd)
Definition: win32_port.h:63
static int FreeDesc(AllocateDesc *desc)
Definition: fd.c:2496
void pfree(void *pointer)
Definition: mcxt.c:936
mode_t fileMode
Definition: fd.c:203
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition: fd.c:3130
static bool ReleaseLruFile(void)
Definition: fd.c:1165
Definition: dirent.c:25
#define ERROR
Definition: elog.h:43
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2393
static int LruInsert(File file)
Definition: fd.c:1091
void AtEOXact_Files(void)
Definition: fd.c:2910
#define FATAL
Definition: elog.h:52
static bool have_xact_temporary_files
Definition: fd.c:223
#define MAXPGPATH
DIR * opendir(const char *)
Definition: dirent.c:33
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2079
#define DEBUG2
Definition: elog.h:24
char * fileName
Definition: fd.c:200
#define PG_FILE_MODE_DEFAULT
Definition: fd.c:131
static char * buf
Definition: pg_test_fsync.c:67
Oid GetNextTempTableSpace(void)
Definition: fd.c:2858
void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition: resowner.c:1186
static void CleanupTempFiles(bool isProcExit)
Definition: fd.c:2939
#define DEFAULTTABLESPACE_OID
Definition: pg_tablespace.h:63
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3433
int errdetail(const char *fmt,...)
Definition: elog.c:873
char * tablespace
Definition: pgbench.c:146
int errcode_for_file_access(void)
Definition: elog.c:598
void get_parent_directory(char *path)
Definition: path.c:854
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2343
static int nfile
Definition: fd.c:217
unsigned int uint32
Definition: c.h:306
void SyncDataDirectory(void)
Definition: fd.c:3253
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2597
static int nextTempTableSpace
Definition: fd.c:273
int FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
Definition: fd.c:1958
static void pgstat_report_wait_end(void)
Definition: pgstat.h:1259
int max_files_per_process
Definition: fd.c:139
static File AllocateVfd(void)
Definition: fd.c:1197
FILE * OpenPipeStream(const char *command, const char *mode)
Definition: fd.c:2442
off_t seekPos
Definition: fd.c:198
unsigned short fdstate
Definition: fd.c:193
Definition: fd.c:190
off_t fileSize
Definition: fd.c:199
int fd
Definition: fd.c:192
#define ereport(elevel, rest)
Definition: elog.h:122
int FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
Definition: fd.c:1894
int link(const char *fromname, const char *toname)
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2800
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:608
static void Insert(File file)
Definition: fd.c:1069
ResourceOwner resowner
Definition: fd.c:194
#define S_ISREG(m)
Definition: win32_port.h:310
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3423
int CloseTransientFile(int fd)
Definition: fd.c:2563
static void ReportTemporaryFileUsage(const char *path, off_t size)
Definition: fd.c:1311
static void ReleaseLruFiles(void)
Definition: fd.c:1187
#define WARNING
Definition: elog.h:40
#define stat(a, b)
Definition: win32_port.h:266
#define FileIsNotOpen(file)
Definition: fd.c:174
static int elevel
Definition: vacuumlazy.c:136
struct vfd Vfd
uintptr_t Datum
Definition: postgres.h:372
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition: fd.c:2879
unsigned int Index
Definition: c.h:423
void pg_flush_data(int fd, off_t offset, off_t nbytes)
Definition: fd.c:422
#define FileIsValid(file)
Definition: fd.c:171
FILE * file
Definition: fd.c:251
static bool looks_like_temp_rel_name(const char *name)
Definition: fd.c:3186
#define InvalidOid
Definition: postgres_ext.h:36
#define VFD_CLOSED
Definition: fd.c:169
static uint64 temporary_files_size
Definition: fd.c:231
#define free(a)
Definition: header.h:65
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static void RegisterTemporaryFile(File file)
Definition: fd.c:1330
static void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition: fd.c:3064
#define PG_TEMP_FILES_DIR
Definition: fd.h:139
void FileClose(File file)
Definition: fd.c:1749
int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
Definition: fd.c:1839
static int FileAccess(File file)
Definition: fd.c:1275
#define Assert(condition)
Definition: c.h:680
SubTransactionId GetCurrentSubTransactionId(void)
Definition: xact.c:642
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2663
File lruMoreRecently
Definition: fd.c:196
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:1867
void RemovePgTempFiles(void)
Definition: fd.c:3004
SubTransactionId create_subid
Definition: fd.c:248
WalTimeSample buffer[LAG_TRACKER_BUFFER_SIZE]
Definition: walsender.c:215
File OpenTemporaryFile(bool interXact)
Definition: fd.c:1500
int durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:727
size_t Size
Definition: c.h:414
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: pgstat.h:1235
static const char * directory
Definition: zic.c:567
int sync_method
Definition: xlog.c:103
struct dirent * readdir(DIR *)
Definition: dirent.c:77
#define FD_MINFREE
Definition: fd.c:125
#define TABLESPACE_VERSION_DIRECTORY
Definition: catalog.h:26
#define realloc(a, b)
Definition: header.h:60
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:949
#define INT64_FORMAT
Definition: c.h:348
const char * name
Definition: encode.c:521
static long tempFileCounter
Definition: fd.c:265
int fd
Definition: fd.c:253
#define S_ISDIR(m)
Definition: win32_port.h:307
#define lstat(path, sb)
Definition: win32_port.h:255
int durable_unlink(const char *fname, int elevel)
Definition: fd.c:691
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:939
int FreeFile(FILE *file)
Definition: fd.c:2535
void set_max_safe_fds(void)
Definition: fd.c:896
bool enableFsync
Definition: globals.c:111
static Oid * tempTableSpaces
Definition: fd.c:271
#define S_IRWXU
Definition: win32_port.h:280
void * palloc(Size size)
Definition: mcxt.c:835
int errmsg(const char *fmt,...)
Definition: elog.c:797
int FileGetRawFlags(File file)
Definition: fd.c:2249
void ResourceOwnerEnlargeFiles(ResourceOwner owner)
Definition: resowner.c:1175
static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition: fd.c:3458
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:961
union AllocateDesc::@27 desc
int i
#define FD_CLOSE_AT_EOXACT
Definition: fd.c:187
void * arg
int FileGetRawDesc(File file)
Definition: fd.c:2239
static void FreeVfd(File file)
Definition: fd.c:1255
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:98
int pg_fsync(int fd)
Definition: fd.c:348
char d_name[MAX_PATH]
Definition: dirent.h:14
#define elog
Definition: elog.h:219
#define mkdir(a, b)
Definition: win32_port.h:58
#define close(a)
Definition: win32.h:12
#define EINTR
Definition: win32_port.h:334
int fileFlags
Definition: fd.c:202
off_t FileSeek(File file, off_t offset, int whence)
Definition: fd.c:2100
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition: fd.c:1436
void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition: resowner.c:1195
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2188
#define FileUnknownPos
Definition: fd.c:182
static int maxAllocatedDescs
Definition: fd.c:258
static int fsync_parent_path(const char *fname, int elevel)
Definition: fd.c:3528
int File
Definition: fd.h:49
#define read(a, b, c)
Definition: win32.h:13
int FreeDir(DIR *dir)
Definition: fd.c:2715
int temp_file_limit
Definition: guc.c:458
Datum subpath(PG_FUNCTION_ARGS)
Definition: ltree_op.c:234
void InitFileAccess(void)
Definition: fd.c:779
static int numAllocatedDescs
Definition: fd.c:257
bool pgwin32_is_junction(const char *path)
#define ftruncate(a, b)
Definition: win32_port.h:60