PostgreSQL Source Code  git master
fd.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  * Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have. (This is around 1024 on many modern
20  * operating systems, but may be lower on others.)
21  *
22  * VFDs are managed as an LRU pool, with actual OS file descriptors
23  * being opened and closed as needed. Obviously, if a routine is
24  * opened using these interfaces, all subsequent operations must also
25  * be through these interfaces (the File type is not a real file
26  * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
39  * for a long time, like relation files. It is the caller's responsibility
40  * to close them, there is no automatic mechanism in fd.c for that.
41  *
42  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43  * temporary files that have names so that they can be shared between
44  * backends. Such files are automatically closed and count against the
45  * temporary file limit of the backend that creates them, but unlike anonymous
46  * files they are not automatically deleted. See sharedfileset.c for a shared
47  * ownership mechanism that provides automatic cleanup for shared files when
48  * the last of a group of backends detaches.
49  *
50  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52  * They behave like the corresponding native functions, except that the handle
53  * is registered with the current subtransaction, and will be automatically
54  * closed at abort. These are intended mainly for short operations like
55  * reading a configuration file; there is a limit on the number of files that
56  * can be opened using these functions at any one time.
57  *
58  * Finally, BasicOpenFile is just a thin wrapper around open() that can
59  * release file descriptors in use by the virtual file descriptors if
60  * necessary. There is no automatic cleanup of file descriptors returned by
61  * BasicOpenFile, it is solely the caller's responsibility to close the file
62  * descriptor by calling close(2).
63  *
64  *-------------------------------------------------------------------------
65  */
66 
67 #include "postgres.h"
68 
69 #include <sys/file.h>
70 #include <sys/param.h>
71 #include <sys/stat.h>
72 #ifndef WIN32
73 #include <sys/mman.h>
74 #endif
75 #include <limits.h>
76 #include <unistd.h>
77 #include <fcntl.h>
78 #ifdef HAVE_SYS_RESOURCE_H
79 #include <sys/resource.h> /* for getrlimit */
80 #endif
81 
82 #include "access/xact.h"
83 #include "access/xlog.h"
84 #include "catalog/pg_tablespace.h"
85 #include "common/file_perm.h"
86 #include "miscadmin.h"
87 #include "pgstat.h"
88 #include "portability/mem.h"
89 #include "storage/fd.h"
90 #include "storage/ipc.h"
91 #include "utils/guc.h"
92 #include "utils/resowner_private.h"
93 
94 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
95 #if defined(HAVE_SYNC_FILE_RANGE)
96 #define PG_FLUSH_DATA_WORKS 1
97 #elif !defined(WIN32) && defined(MS_ASYNC)
98 #define PG_FLUSH_DATA_WORKS 1
99 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
100 #define PG_FLUSH_DATA_WORKS 1
101 #endif
102 
103 /*
104  * We must leave some file descriptors free for system(), the dynamic loader,
105  * and other code that tries to open files without consulting fd.c. This
106  * is the number left free. (While we can be pretty sure we won't get
107  * EMFILE, there's never any guarantee that we won't get ENFILE due to
108  * other processes chewing up FDs. So it's a bad idea to try to open files
109  * without consulting fd.c. Nonetheless we cannot control all code.)
110  *
111  * Because this is just a fixed setting, we are effectively assuming that
112  * no such code will leave FDs open over the long term; otherwise the slop
113  * is likely to be insufficient. Note in particular that we expect that
114  * loading a shared library does not result in any permanent increase in
115  * the number of open files. (This appears to be true on most if not
116  * all platforms as of Feb 2004.)
117  */
118 #define NUM_RESERVED_FDS 10
119 
120 /*
121  * If we have fewer than this many usable FDs after allowing for the reserved
122  * ones, choke.
123  */
124 #define FD_MINFREE 10
125 
126 /*
127  * A number of platforms allow individual processes to open many more files
128  * than they can really support when *many* processes do the same thing.
129  * This GUC parameter lets the DBA limit max_safe_fds to something less than
130  * what the postmaster's initial probe suggests will work.
131  */
133 
134 /*
135  * Maximum number of file descriptors to open for either VFD entries or
136  * AllocateFile/AllocateDir/OpenTransientFile operations. This is initialized
137  * to a conservative value, and remains that way indefinitely in bootstrap or
138  * standalone-backend cases. In normal postmaster operation, the postmaster
139  * calls set_max_safe_fds() late in initialization to update the value, and
140  * that value is then inherited by forked subprocesses.
141  *
142  * Note: the value of max_files_per_process is taken into account while
143  * setting this variable, and so need not be tested separately.
144  */
145 int max_safe_fds = 32; /* default if not changed */
146 
147 /* Whether it is safe to continue running after fsync() fails. */
148 bool data_sync_retry = false;
149 
150 /* Debugging.... */
151 
152 #ifdef FDDEBUG
153 #define DO_DB(A) \
154  do { \
155  int _do_db_save_errno = errno; \
156  A; \
157  errno = _do_db_save_errno; \
158  } while (0)
159 #else
160 #define DO_DB(A) \
161  ((void) 0)
162 #endif
163 
164 #define VFD_CLOSED (-1)
165 
166 #define FileIsValid(file) \
167  ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
168 
169 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
170 
171 /* these are the assigned bits in fdstate below: */
172 #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
173 #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
174 #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
175 
176 typedef struct vfd
177 {
178  int fd; /* current FD, or VFD_CLOSED if none */
179  unsigned short fdstate; /* bitflags for VFD's state */
180  ResourceOwner resowner; /* owner, for automatic cleanup */
181  File nextFree; /* link to next free VFD, if in freelist */
182  File lruMoreRecently; /* doubly linked recency-of-use list */
184  off_t fileSize; /* current size of file (0 if not temporary) */
185  char *fileName; /* name of file, or NULL for unused VFD */
186  /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
187  int fileFlags; /* open(2) flags for (re)opening the file */
188  mode_t fileMode; /* mode to pass to open(2) */
189 } Vfd;
190 
191 /*
192  * Virtual File Descriptor array pointer and size. This grows as
193  * needed. 'File' values are indexes into this array.
194  * Note that VfdCache[0] is not a usable VFD, just a list header.
195  */
196 static Vfd *VfdCache;
197 static Size SizeVfdCache = 0;
198 
199 /*
200  * Number of file descriptors known to be in use by VFD entries.
201  */
202 static int nfile = 0;
203 
204 /*
205  * Flag to tell whether it's worth scanning VfdCache looking for temp files
206  * to close
207  */
208 static bool have_xact_temporary_files = false;
209 
210 /*
211  * Tracks the total size of all temporary files. Note: when temp_file_limit
212  * is being enforced, this cannot overflow since the limit cannot be more
213  * than INT_MAX kilobytes. When not enforcing, it could theoretically
214  * overflow, but we don't care.
215  */
216 static uint64 temporary_files_size = 0;
217 
218 /*
219  * List of OS handles opened with AllocateFile, AllocateDir and
220  * OpenTransientFile.
221  */
222 typedef enum
223 {
229 
230 typedef struct
231 {
234  union
235  {
236  FILE *file;
238  int fd;
239  } desc;
240 } AllocateDesc;
241 
242 static int numAllocatedDescs = 0;
243 static int maxAllocatedDescs = 0;
245 
246 /*
247  * Number of temporary files opened during the current session;
248  * this is used in generation of tempfile names.
249  */
250 static long tempFileCounter = 0;
251 
252 /*
253  * Array of OIDs of temp tablespaces. When numTempTableSpaces is -1,
254  * this has not been set in the current transaction.
255  */
256 static Oid *tempTableSpaces = NULL;
257 static int numTempTableSpaces = -1;
258 static int nextTempTableSpace = 0;
259 
260 
261 /*--------------------
262  *
263  * Private Routines
264  *
265  * Delete - delete a file from the Lru ring
266  * LruDelete - remove a file from the Lru ring and close its FD
267  * Insert - put a file at the front of the Lru ring
268  * LruInsert - put a file at the front of the Lru ring and open it
269  * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
270  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
271  * AllocateVfd - grab a free (or new) file record (from VfdCache)
272  * FreeVfd - free a file record
273  *
274  * The Least Recently Used ring is a doubly linked list that begins and
275  * ends on element zero. Element zero is special -- it doesn't represent
276  * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
277  * anchor that shows us the beginning/end of the ring.
278  * Only VFD elements that are currently really open (have an FD assigned) are
279  * in the Lru ring. Elements that are "virtually" open can be recognized
280  * by having a non-null fileName field.
281  *
282  * example:
283  *
284  * /--less----\ /---------\
285  * v \ v \
286  * #0 --more---> LeastRecentlyUsed --more-\ \
287  * ^\ | |
288  * \\less--> MostRecentlyUsedFile <---/ |
289  * \more---/ \--less--/
290  *
291  *--------------------
292  */
293 static void Delete(File file);
294 static void LruDelete(File file);
295 static void Insert(File file);
296 static int LruInsert(File file);
297 static bool ReleaseLruFile(void);
298 static void ReleaseLruFiles(void);
299 static File AllocateVfd(void);
300 static void FreeVfd(File file);
301 
302 static int FileAccess(File file);
303 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
304 static bool reserveAllocatedDesc(void);
305 static int FreeDesc(AllocateDesc *desc);
306 
307 static void AtProcExit_Files(int code, Datum arg);
308 static void CleanupTempFiles(bool isCommit, bool isProcExit);
309 static void RemovePgTempRelationFiles(const char *tsdirname);
310 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
311 
312 static void walkdir(const char *path,
313  void (*action) (const char *fname, bool isdir, int elevel),
314  bool process_symlinks,
315  int elevel);
316 #ifdef PG_FLUSH_DATA_WORKS
317 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
318 #endif
319 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
320 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
321 
322 static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
323 static int fsync_parent_path(const char *fname, int elevel);
324 
325 
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * Hands the descriptor to pg_fsync_writethrough() when sync_method asks for
 * it (only on builds where writethrough is distinct from plain fsync), and
 * to pg_fsync_no_writethrough() otherwise.  The result of the chosen routine
 * is returned unchanged.
 */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
	struct stat st;

	/*
	 * Some operating system implementations of fsync() have requirements
	 * about the file access modes that were used when their file descriptor
	 * argument was opened, and these requirements differ depending on whether
	 * the file descriptor is for a directory.
	 *
	 * For any file descriptor that may eventually be handed to fsync(), we
	 * should have opened it with access modes that are compatible with
	 * fsync() on all supported systems, otherwise the code may not be
	 * portable, even if it runs ok on the current system.
	 *
	 * We assert here that a descriptor for a file was opened with write
	 * permissions (either O_RDWR or O_WRONLY) and for a directory without
	 * write permissions (O_RDONLY).
	 *
	 * Ignore any fstat or fcntl errors and let the follow-up fsync() do its
	 * work.  Doing this sanity check here, rather than further down, means
	 * it also runs when fsync() is disabled.
	 */
	if (fstat(fd, &st) == 0)
	{
		int			desc_flags = fcntl(fd, F_GETFL);

		/* -1 is an error indication, not a set of flags; don't test it */
		if (desc_flags != -1)
		{
			/*
			 * O_RDONLY is historically 0, so just make sure that for
			 * directories no write flags are used.
			 */
			if (S_ISDIR(st.st_mode))
				Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
			else
				Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
		}
	}
	errno = 0;
#endif

	/* #if is to skip the sync_method test if there's no need for it */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
	if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
	else
#endif
		return pg_fsync_no_writethrough(fd);
}
378 
379 
380 /*
381  * pg_fsync_no_writethrough --- same as fsync except does nothing if
382  * enableFsync is off
383  */
384 int
386 {
387  if (enableFsync)
388  return fsync(fd);
389  else
390  return 0;
391 }
392 
393 /*
394  * pg_fsync_writethrough
395  */
396 int
398 {
399  if (enableFsync)
400  {
401 #ifdef WIN32
402  return _commit(fd);
403 #elif defined(F_FULLFSYNC)
404  return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
405 #else
406  errno = ENOSYS;
407  return -1;
408 #endif
409  }
410  else
411  return 0;
412 }
413 
414 /*
415  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
416  *
417  * Not all platforms have fdatasync; treat as fsync if not available.
418  */
419 int
421 {
422  if (enableFsync)
423  {
424 #ifdef HAVE_FDATASYNC
425  return fdatasync(fd);
426 #else
427  return fsync(fd);
428 #endif
429  }
430  else
431  return 0;
432 }
433 
434 /*
435  * pg_flush_data --- advise OS that the described dirty data should be flushed
436  *
437  * offset of 0 with nbytes 0 means that the entire file should be flushed
438  */
439 void
440 pg_flush_data(int fd, off_t offset, off_t nbytes)
441 {
442  /*
443  * Right now file flushing is primarily used to avoid making later
444  * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
445  * if fsyncs are disabled - that's a decision we might want to make
446  * configurable at some point.
447  */
448  if (!enableFsync)
449  return;
450 
451  /*
452  * We compile all alternatives that are supported on the current platform,
453  * to find portability problems more easily.
454  */
455 #if defined(HAVE_SYNC_FILE_RANGE)
456  {
457  int rc;
458  static bool not_implemented_by_kernel = false;
459 
460  if (not_implemented_by_kernel)
461  return;
462 
463  /*
464  * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
465  * tells the OS that writeback for the specified blocks should be
466  * started, but that we don't want to wait for completion. Note that
467  * this call might block if too much dirty data exists in the range.
468  * This is the preferable method on OSs supporting it, as it works
469  * reliably when available (contrast to msync()) and doesn't flush out
470  * clean data (like FADV_DONTNEED).
471  */
472  rc = sync_file_range(fd, offset, nbytes,
473  SYNC_FILE_RANGE_WRITE);
474  if (rc != 0)
475  {
476  int elevel;
477 
478  /*
479  * For systems that don't have an implementation of
480  * sync_file_range() such as Windows WSL, generate only one
481  * warning and then suppress all further attempts by this process.
482  */
483  if (errno == ENOSYS)
484  {
485  elevel = WARNING;
486  not_implemented_by_kernel = true;
487  }
488  else
489  elevel = data_sync_elevel(WARNING);
490 
491  ereport(elevel,
493  errmsg("could not flush dirty data: %m")));
494  }
495 
496  return;
497  }
498 #endif
499 #if !defined(WIN32) && defined(MS_ASYNC)
500  {
501  void *p;
502  static int pagesize = 0;
503 
504  /*
505  * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
506  * writeback. On linux it only does so if MS_SYNC is specified, but
507  * then it does the writeback synchronously. Luckily all common linux
508  * systems have sync_file_range(). This is preferable over
509  * FADV_DONTNEED because it doesn't flush out clean data.
510  *
511  * We map the file (mmap()), tell the kernel to sync back the contents
512  * (msync()), and then remove the mapping again (munmap()).
513  */
514 
515  /* mmap() needs actual length if we want to map whole file */
516  if (offset == 0 && nbytes == 0)
517  {
518  nbytes = lseek(fd, 0, SEEK_END);
519  if (nbytes < 0)
520  {
523  errmsg("could not determine dirty data size: %m")));
524  return;
525  }
526  }
527 
528  /*
529  * Some platforms reject partial-page mmap() attempts. To deal with
530  * that, just truncate the request to a page boundary. If any extra
531  * bytes don't get flushed, well, it's only a hint anyway.
532  */
533 
534  /* fetch pagesize only once */
535  if (pagesize == 0)
536  pagesize = sysconf(_SC_PAGESIZE);
537 
538  /* align length to pagesize, dropping any fractional page */
539  if (pagesize > 0)
540  nbytes = (nbytes / pagesize) * pagesize;
541 
542  /* fractional-page request is a no-op */
543  if (nbytes <= 0)
544  return;
545 
546  /*
547  * mmap could well fail, particularly on 32-bit platforms where there
548  * may simply not be enough address space. If so, silently fall
549  * through to the next implementation.
550  */
551  if (nbytes <= (off_t) SSIZE_MAX)
552  p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
553  else
554  p = MAP_FAILED;
555 
556  if (p != MAP_FAILED)
557  {
558  int rc;
559 
560  rc = msync(p, (size_t) nbytes, MS_ASYNC);
561  if (rc != 0)
562  {
565  errmsg("could not flush dirty data: %m")));
566  /* NB: need to fall through to munmap()! */
567  }
568 
569  rc = munmap(p, (size_t) nbytes);
570  if (rc != 0)
571  {
572  /* FATAL error because mapping would remain */
573  ereport(FATAL,
575  errmsg("could not munmap() while flushing data: %m")));
576  }
577 
578  return;
579  }
580  }
581 #endif
582 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
583  {
584  int rc;
585 
586  /*
587  * Signal the kernel that the passed in range should not be cached
588  * anymore. This has the, desired, side effect of writing out dirty
589  * data, and the, undesired, side effect of likely discarding useful
590  * clean cached blocks. For the latter reason this is the least
591  * preferable method.
592  */
593 
594  rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
595 
596  if (rc != 0)
597  {
598  /* don't error out, this is just a performance optimization */
601  errmsg("could not flush dirty data: %m")));
602  }
603 
604  return;
605  }
606 #endif
607 }
608 
609 
610 /*
611  * fsync_fname -- fsync a file or directory, handling errors properly
612  *
613  * Try to fsync a file or directory. When doing the latter, ignore errors that
614  * indicate the OS just doesn't allow/require fsyncing directories.
615  */
616 void
617 fsync_fname(const char *fname, bool isdir)
618 {
619  fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
620 }
621 
622 /*
623  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
624  *
625  * This routine ensures that, after returning, the effect of renaming file
626  * persists in case of a crash. A crash while this routine is running will
627  * leave you with either the pre-existing or the moved file in place of the
628  * new file; no mixed state or truncated files are possible.
629  *
630  * It does so by using fsync on the old filename and the possibly existing
631  * target filename before the rename, and the target file and directory after.
632  *
633  * Note that rename() cannot be used across arbitrary directories, as they
634  * might not be on the same filesystem. Therefore this routine does not
635  * support renaming across directories.
636  *
637  * Log errors with the caller specified severity.
638  *
639  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
640  * valid upon return.
641  */
642 int
643 durable_rename(const char *oldfile, const char *newfile, int elevel)
644 {
645  int fd;
646 
647  /*
648  * First fsync the old and target path (if it exists), to ensure that they
649  * are properly persistent on disk. Syncing the target file is not
650  * strictly necessary, but it makes it easier to reason about crashes;
651  * because it's then guaranteed that either source or target file exists
652  * after a crash.
653  */
654  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
655  return -1;
656 
657  fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
658  if (fd < 0)
659  {
660  if (errno != ENOENT)
661  {
662  ereport(elevel,
664  errmsg("could not open file \"%s\": %m", newfile)));
665  return -1;
666  }
667  }
668  else
669  {
670  if (pg_fsync(fd) != 0)
671  {
672  int save_errno;
673 
674  /* close file upon error, might not be in transaction context */
675  save_errno = errno;
676  CloseTransientFile(fd);
677  errno = save_errno;
678 
679  ereport(elevel,
681  errmsg("could not fsync file \"%s\": %m", newfile)));
682  return -1;
683  }
684 
685  if (CloseTransientFile(fd) != 0)
686  {
687  ereport(elevel,
689  errmsg("could not close file \"%s\": %m", newfile)));
690  return -1;
691  }
692  }
693 
694  /* Time to do the real deal... */
695  if (rename(oldfile, newfile) < 0)
696  {
697  ereport(elevel,
699  errmsg("could not rename file \"%s\" to \"%s\": %m",
700  oldfile, newfile)));
701  return -1;
702  }
703 
704  /*
705  * To guarantee renaming the file is persistent, fsync the file with its
706  * new name, and its containing directory.
707  */
708  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
709  return -1;
710 
711  if (fsync_parent_path(newfile, elevel) != 0)
712  return -1;
713 
714  return 0;
715 }
716 
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * This routine ensures that, after returning, the effect of removing file
 * persists in case of a crash.  A crash while this routine is running will
 * leave the system in no mixed state.
 *
 * It does so by using fsync on the parent directory of the file after the
 * actual removal is done.
 *
 * Log errors with the severity specified by caller.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
	if (unlink(fname) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not remove file \"%s\": %m",
						fname)));
		return -1;
	}

	/*
	 * To guarantee that the removal of the file is persistent, fsync its
	 * parent directory.
	 */
	if (fsync_parent_path(fname, elevel) != 0)
		return -1;

	return 0;
}
753 
/*
 * durable_link_or_rename -- rename a file in a durable manner.
 *
 * Similar to durable_rename(), except that this routine tries (but does not
 * guarantee) not to overwrite the target file.
 *
 * Note that a crash in an unfortunate moment can leave you with two links to
 * the target file.
 *
 * Log errors with the caller specified severity.
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
{
	/*
	 * Ensure that, if we crash directly after the rename/link, a file with
	 * valid contents is moved into place.
	 */
	if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
		return -1;

#ifdef HAVE_WORKING_LINK
	if (link(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not link file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
	/* the result of removing the old name is not checked */
	unlink(oldfile);
#else
	/* XXX: Add racy file existence check? */
	if (rename(oldfile, newfile) < 0)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not rename file \"%s\" to \"%s\": %m",
						oldfile, newfile)));
		return -1;
	}
#endif

	/*
	 * Make change persistent in case of an OS crash, both the new entry and
	 * its parent directory need to be flushed.
	 */
	if (fsync_fname_ext(newfile, false, false, elevel) != 0)
		return -1;

	/* Same for parent directory */
	if (fsync_parent_path(newfile, elevel) != 0)
		return -1;

	return 0;
}
813 
814 /*
815  * InitFileAccess --- initialize this module during backend startup
816  *
817  * This is called during either normal or standalone backend start.
818  * It is *not* called in the postmaster.
819  */
820 void
822 {
823  Assert(SizeVfdCache == 0); /* call me only once */
824 
825  /* initialize cache header entry */
826  VfdCache = (Vfd *) malloc(sizeof(Vfd));
827  if (VfdCache == NULL)
828  ereport(FATAL,
829  (errcode(ERRCODE_OUT_OF_MEMORY),
830  errmsg("out of memory")));
831 
832  MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
833  VfdCache->fd = VFD_CLOSED;
834 
835  SizeVfdCache = 1;
836 
837  /* register proc-exit hook to ensure temp files are dropped at exit */
839 }
840 
841 /*
842  * count_usable_fds --- count how many FDs the system will let us open,
843  * and estimate how many are already open.
844  *
845  * We stop counting if usable_fds reaches max_to_probe. Note: a small
846  * value of max_to_probe might result in an underestimate of already_open;
847  * we must fill in any "gaps" in the set of used FDs before the calculation
848  * of already_open will give the right answer. In practice, max_to_probe
849  * of a couple of dozen should be enough to ensure good results.
850  *
851  * We assume stdin (FD 0) is available for dup'ing
852  */
853 static void
854 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
855 {
856  int *fd;
857  int size;
858  int used = 0;
859  int highestfd = 0;
860  int j;
861 
862 #ifdef HAVE_GETRLIMIT
863  struct rlimit rlim;
864  int getrlimit_status;
865 #endif
866 
867  size = 1024;
868  fd = (int *) palloc(size * sizeof(int));
869 
870 #ifdef HAVE_GETRLIMIT
871 #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
872  getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
873 #else /* but BSD doesn't ... */
874  getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
875 #endif /* RLIMIT_NOFILE */
876  if (getrlimit_status != 0)
877  ereport(WARNING, (errmsg("getrlimit failed: %m")));
878 #endif /* HAVE_GETRLIMIT */
879 
880  /* dup until failure or probe limit reached */
881  for (;;)
882  {
883  int thisfd;
884 
885 #ifdef HAVE_GETRLIMIT
886 
887  /*
888  * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
889  * some platforms
890  */
891  if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
892  break;
893 #endif
894 
895  thisfd = dup(0);
896  if (thisfd < 0)
897  {
898  /* Expect EMFILE or ENFILE, else it's fishy */
899  if (errno != EMFILE && errno != ENFILE)
900  elog(WARNING, "dup(0) failed after %d successes: %m", used);
901  break;
902  }
903 
904  if (used >= size)
905  {
906  size *= 2;
907  fd = (int *) repalloc(fd, size * sizeof(int));
908  }
909  fd[used++] = thisfd;
910 
911  if (highestfd < thisfd)
912  highestfd = thisfd;
913 
914  if (used >= max_to_probe)
915  break;
916  }
917 
918  /* release the files we opened */
919  for (j = 0; j < used; j++)
920  close(fd[j]);
921 
922  pfree(fd);
923 
924  /*
925  * Return results. usable_fds is just the number of successful dups. We
926  * assume that the system limit is highestfd+1 (remember 0 is a legal FD
927  * number) and so already_open is highestfd+1 - usable_fds.
928  */
929  *usable_fds = used;
930  *already_open = highestfd + 1 - used;
931 }
932 
933 /*
934  * set_max_safe_fds
935  * Determine number of file descriptors that fd.c is allowed to use
936  */
937 void
939 {
940  int usable_fds;
941  int already_open;
942 
943  /*----------
944  * We want to set max_safe_fds to
945  * MIN(usable_fds, max_files_per_process - already_open)
946  * less the slop factor for files that are opened without consulting
947  * fd.c. This ensures that we won't exceed either max_files_per_process
948  * or the experimentally-determined EMFILE limit.
949  *----------
950  */
952  &usable_fds, &already_open);
953 
954  max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
955 
956  /*
957  * Take off the FDs reserved for system() etc.
958  */
960 
961  /*
962  * Make sure we still have enough to get by.
963  */
964  if (max_safe_fds < FD_MINFREE)
965  ereport(FATAL,
966  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
967  errmsg("insufficient file descriptors available to start server process"),
968  errdetail("System allows %d, we need at least %d.",
971 
972  elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
973  max_safe_fds, usable_fds, already_open);
974 }
975 
976 /*
977  * Open a file with BasicOpenFilePerm() and pass default file mode for the
978  * fileMode parameter.
979  */
980 int
982 {
983  return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
984 }
985 
986 /*
987  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
988  *
989  * This is exported for use by places that really want a plain kernel FD,
990  * but need to be proof against running out of FDs. Once an FD has been
991  * successfully returned, it is the caller's responsibility to ensure that
992  * it will not be leaked on ereport()! Most users should *not* call this
993  * routine directly, but instead use the VFD abstraction level, which
994  * provides protection against descriptor leaks as well as management of
995  * files that need to be open for more than a short period of time.
996  *
997  * Ideally this should be the *only* direct call of open() in the backend.
998  * In practice, the postmaster calls open() directly, and there are some
999  * direct open() calls done early in backend startup. Those are OK since
1000  * this module wouldn't have any open files to close at that point anyway.
1001  */
1002 int
1004 {
1005  int fd;
1006 
1007 tryAgain:
1008  fd = open(fileName, fileFlags, fileMode);
1009 
1010  if (fd >= 0)
1011  return fd; /* success! */
1012 
1013  if (errno == EMFILE || errno == ENFILE)
1014  {
1015  int save_errno = errno;
1016 
1017  ereport(LOG,
1018  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
1019  errmsg("out of file descriptors: %m; release and retry")));
1020  errno = 0;
1021  if (ReleaseLruFile())
1022  goto tryAgain;
1023  errno = save_errno;
1024  }
1025 
1026  return -1; /* failure */
1027 }
1028 
1029 #if defined(FDDEBUG)
1030 
1031 static void
1032 _dump_lru(void)
1033 {
1034  int mru = VfdCache[0].lruLessRecently;
1035  Vfd *vfdP = &VfdCache[mru];
1036  char buf[2048];
1037 
1038  snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
1039  while (mru != 0)
1040  {
1041  mru = vfdP->lruLessRecently;
1042  vfdP = &VfdCache[mru];
1043  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
1044  }
1045  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
1046  elog(LOG, "%s", buf);
1047 }
1048 #endif /* FDDEBUG */
1049 
1050 static void
1052 {
1053  Vfd *vfdP;
1054 
1055  Assert(file != 0);
1056 
1057  DO_DB(elog(LOG, "Delete %d (%s)",
1058  file, VfdCache[file].fileName));
1059  DO_DB(_dump_lru());
1060 
1061  vfdP = &VfdCache[file];
1062 
1063  VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1064  VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1065 
1066  DO_DB(_dump_lru());
1067 }
1068 
1069 static void
1071 {
1072  Vfd *vfdP;
1073 
1074  Assert(file != 0);
1075 
1076  DO_DB(elog(LOG, "LruDelete %d (%s)",
1077  file, VfdCache[file].fileName));
1078 
1079  vfdP = &VfdCache[file];
1080 
1081  /*
1082  * Close the file. We aren't expecting this to fail; if it does, better
1083  * to leak the FD than to mess up our internal state.
1084  */
1085  if (close(vfdP->fd) != 0)
1087  "could not close file \"%s\": %m", vfdP->fileName);
1088  vfdP->fd = VFD_CLOSED;
1089  --nfile;
1090 
1091  /* delete the vfd record from the LRU ring */
1092  Delete(file);
1093 }
1094 
1095 static void
1097 {
1098  Vfd *vfdP;
1099 
1100  Assert(file != 0);
1101 
1102  DO_DB(elog(LOG, "Insert %d (%s)",
1103  file, VfdCache[file].fileName));
1104  DO_DB(_dump_lru());
1105 
1106  vfdP = &VfdCache[file];
1107 
1108  vfdP->lruMoreRecently = 0;
1109  vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1110  VfdCache[0].lruLessRecently = file;
1111  VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1112 
1113  DO_DB(_dump_lru());
1114 }
1115 
1116 /* returns 0 on success, -1 on re-open failure (with errno set) */
1117 static int
1119 {
1120  Vfd *vfdP;
1121 
1122  Assert(file != 0);
1123 
1124  DO_DB(elog(LOG, "LruInsert %d (%s)",
1125  file, VfdCache[file].fileName));
1126 
1127  vfdP = &VfdCache[file];
1128 
1129  if (FileIsNotOpen(file))
1130  {
1131  /* Close excess kernel FDs. */
1132  ReleaseLruFiles();
1133 
1134  /*
1135  * The open could still fail for lack of file descriptors, eg due to
1136  * overall system file table being full. So, be prepared to release
1137  * another FD if necessary...
1138  */
1139  vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1140  vfdP->fileMode);
1141  if (vfdP->fd < 0)
1142  {
1143  DO_DB(elog(LOG, "re-open failed: %m"));
1144  return -1;
1145  }
1146  else
1147  {
1148  ++nfile;
1149  }
1150  }
1151 
1152  /*
1153  * put it at the head of the Lru ring
1154  */
1155 
1156  Insert(file);
1157 
1158  return 0;
1159 }
1160 
1161 /*
1162  * Release one kernel FD by closing the least-recently-used VFD.
1163  */
1164 static bool
1166 {
1167  DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1168 
1169  if (nfile > 0)
1170  {
1171  /*
1172  * There are opened files and so there should be at least one used vfd
1173  * in the ring.
1174  */
1175  Assert(VfdCache[0].lruMoreRecently != 0);
1176  LruDelete(VfdCache[0].lruMoreRecently);
1177  return true; /* freed a file */
1178  }
1179  return false; /* no files available to free */
1180 }
1181 
1182 /*
1183  * Release kernel FDs as needed to get under the max_safe_fds limit.
1184  * After calling this, it's OK to try to open another file.
1185  */
1186 static void
1188 {
1189  while (nfile + numAllocatedDescs >= max_safe_fds)
1190  {
1191  if (!ReleaseLruFile())
1192  break;
1193  }
1194 }
1195 
1196 static File
1198 {
1199  Index i;
1200  File file;
1201 
1202  DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1203 
1204  Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1205 
1206  if (VfdCache[0].nextFree == 0)
1207  {
1208  /*
1209  * The free list is empty so it is time to increase the size of the
1210  * array. We choose to double it each time this happens. However,
1211  * there's not much point in starting *real* small.
1212  */
1213  Size newCacheSize = SizeVfdCache * 2;
1214  Vfd *newVfdCache;
1215 
1216  if (newCacheSize < 32)
1217  newCacheSize = 32;
1218 
1219  /*
1220  * Be careful not to clobber VfdCache ptr if realloc fails.
1221  */
1222  newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1223  if (newVfdCache == NULL)
1224  ereport(ERROR,
1225  (errcode(ERRCODE_OUT_OF_MEMORY),
1226  errmsg("out of memory")));
1227  VfdCache = newVfdCache;
1228 
1229  /*
1230  * Initialize the new entries and link them into the free list.
1231  */
1232  for (i = SizeVfdCache; i < newCacheSize; i++)
1233  {
1234  MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1235  VfdCache[i].nextFree = i + 1;
1236  VfdCache[i].fd = VFD_CLOSED;
1237  }
1238  VfdCache[newCacheSize - 1].nextFree = 0;
1239  VfdCache[0].nextFree = SizeVfdCache;
1240 
1241  /*
1242  * Record the new size
1243  */
1244  SizeVfdCache = newCacheSize;
1245  }
1246 
1247  file = VfdCache[0].nextFree;
1248 
1249  VfdCache[0].nextFree = VfdCache[file].nextFree;
1250 
1251  return file;
1252 }
1253 
1254 static void
1256 {
1257  Vfd *vfdP = &VfdCache[file];
1258 
1259  DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1260  file, vfdP->fileName ? vfdP->fileName : ""));
1261 
1262  if (vfdP->fileName != NULL)
1263  {
1264  free(vfdP->fileName);
1265  vfdP->fileName = NULL;
1266  }
1267  vfdP->fdstate = 0x0;
1268 
1269  vfdP->nextFree = VfdCache[0].nextFree;
1270  VfdCache[0].nextFree = file;
1271 }
1272 
1273 /* returns 0 on success, -1 on re-open failure (with errno set) */
1274 static int
1276 {
1277  int returnValue;
1278 
1279  DO_DB(elog(LOG, "FileAccess %d (%s)",
1280  file, VfdCache[file].fileName));
1281 
1282  /*
1283  * Is the file open? If not, open it and put it at the head of the LRU
1284  * ring (possibly closing the least recently used file to get an FD).
1285  */
1286 
1287  if (FileIsNotOpen(file))
1288  {
1289  returnValue = LruInsert(file);
1290  if (returnValue != 0)
1291  return returnValue;
1292  }
1293  else if (VfdCache[0].lruLessRecently != file)
1294  {
1295  /*
1296  * We now know that the file is open and that it is not the last one
1297  * accessed, so we need to move it to the head of the Lru ring.
1298  */
1299 
1300  Delete(file);
1301  Insert(file);
1302  }
1303 
1304  return 0;
1305 }
1306 
1307 /*
1308  * Called whenever a temporary file is deleted to report its size.
1309  */
1310 static void
1311 ReportTemporaryFileUsage(const char *path, off_t size)
1312 {
1313  pgstat_report_tempfile(size);
1314 
1315  if (log_temp_files >= 0)
1316  {
1317  if ((size / 1024) >= log_temp_files)
1318  ereport(LOG,
1319  (errmsg("temporary file: path \"%s\", size %lu",
1320  path, (unsigned long) size)));
1321  }
1322 }
1323 
1324 /*
1325  * Called to register a temporary file for automatic close.
1326  * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1327  * before the file was opened.
1328  */
1329 static void
1331 {
1333  VfdCache[file].resowner = CurrentResourceOwner;
1334 
1335  /* Backup mechanism for closing at end of xact. */
1336  VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1338 }
1339 
1340 /*
1341  * Called when we get a shared invalidation message on some relation.
1342  */
1343 #ifdef NOT_USED
1344 void
1345 FileInvalidate(File file)
1346 {
1347  Assert(FileIsValid(file));
1348  if (!FileIsNotOpen(file))
1349  LruDelete(file);
1350 }
1351 #endif
1352 
1353 /*
1354  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1355  * fileMode parameter.
1356  */
1357 File
1359 {
1360  return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1361 }
1362 
1363 /*
1364  * open a file in an arbitrary directory
1365  *
1366  * NB: if the passed pathname is relative (which it usually is),
1367  * it will be interpreted relative to the process' working directory
1368  * (which should always be $PGDATA when this code is running).
1369  */
1370 File
1372 {
1373  char *fnamecopy;
1374  File file;
1375  Vfd *vfdP;
1376 
1377  DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1378  fileName, fileFlags, fileMode));
1379 
1380  /*
1381  * We need a malloc'd copy of the file name; fail cleanly if no room.
1382  */
1383  fnamecopy = strdup(fileName);
1384  if (fnamecopy == NULL)
1385  ereport(ERROR,
1386  (errcode(ERRCODE_OUT_OF_MEMORY),
1387  errmsg("out of memory")));
1388 
1389  file = AllocateVfd();
1390  vfdP = &VfdCache[file];
1391 
1392  /* Close excess kernel FDs. */
1393  ReleaseLruFiles();
1394 
1395  vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1396 
1397  if (vfdP->fd < 0)
1398  {
1399  int save_errno = errno;
1400 
1401  FreeVfd(file);
1402  free(fnamecopy);
1403  errno = save_errno;
1404  return -1;
1405  }
1406  ++nfile;
1407  DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1408  vfdP->fd));
1409 
1410  Insert(file);
1411 
1412  vfdP->fileName = fnamecopy;
1413  /* Saved flags are adjusted to be OK for re-opening file */
1414  vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1415  vfdP->fileMode = fileMode;
1416  vfdP->fileSize = 0;
1417  vfdP->fdstate = 0x0;
1418  vfdP->resowner = NULL;
1419 
1420  return file;
1421 }
1422 
1423 /*
1424  * Create directory 'directory'. If necessary, create 'basedir', which must
1425  * be the directory above it. This is designed for creating the top-level
1426  * temporary directory on demand before creating a directory underneath it.
1427  * Do nothing if the directory already exists.
1428  *
1429  * Directories created within the top-level temporary directory should begin
1430  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1431  * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1432  * that do not need any particular prefix.
1433 */
1434 void
1436 {
1437  if (MakePGDirectory(directory) < 0)
1438  {
1439  if (errno == EEXIST)
1440  return;
1441 
1442  /*
1443  * Failed. Try to create basedir first in case it's missing. Tolerate
1444  * EEXIST to close a race against another process following the same
1445  * algorithm.
1446  */
1447  if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1448  ereport(ERROR,
1450  errmsg("cannot create temporary directory \"%s\": %m",
1451  basedir)));
1452 
1453  /* Try again. */
1454  if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1455  ereport(ERROR,
1457  errmsg("cannot create temporary subdirectory \"%s\": %m",
1458  directory)));
1459  }
1460 }
1461 
1462 /*
1463  * Delete a directory and everything in it, if it exists.
1464  */
1465 void
1466 PathNameDeleteTemporaryDir(const char *dirname)
1467 {
1468  struct stat statbuf;
1469 
1470  /* Silently ignore missing directory. */
1471  if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1472  return;
1473 
1474  /*
1475  * Currently, walkdir doesn't offer a way for our passed in function to
1476  * maintain state. Perhaps it should, so that we could tell the caller
1477  * whether this operation succeeded or failed. Since this operation is
1478  * used in a cleanup path, we wouldn't actually behave differently: we'll
1479  * just log failures.
1480  */
1481  walkdir(dirname, unlink_if_exists_fname, false, LOG);
1482 }
1483 
1484 /*
1485  * Open a temporary file that will disappear when we close it.
1486  *
1487  * This routine takes care of generating an appropriate tempfile name.
1488  * There's no need to pass in fileFlags or fileMode either, since only
1489  * one setting makes any sense for a temp file.
1490  *
1491  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1492  * to ensure it's closed and deleted when it's no longer needed, typically at
1493  * the end-of-transaction. In most cases, you don't want temporary files to
1494  * outlive the transaction that created them, so this should be false -- but
1495  * if you need "somewhat" temporary storage, this might be useful. In either
1496  * case, the file is removed when the File is explicitly closed.
1497  */
1498 File
1499 OpenTemporaryFile(bool interXact)
1500 {
1501  File file = 0;
1502 
1503  /*
1504  * Make sure the current resource owner has space for this File before we
1505  * open it, if we'll be registering it below.
1506  */
1507  if (!interXact)
1509 
1510  /*
1511  * If some temp tablespace(s) have been given to us, try to use the next
1512  * one. If a given tablespace can't be found, we silently fall back to
1513  * the database's default tablespace.
1514  *
1515  * BUT: if the temp file is slated to outlive the current transaction,
1516  * force it into the database's default tablespace, so that it will not
1517  * pose a threat to possible tablespace drop attempts.
1518  */
1519  if (numTempTableSpaces > 0 && !interXact)
1520  {
1521  Oid tblspcOid = GetNextTempTableSpace();
1522 
1523  if (OidIsValid(tblspcOid))
1524  file = OpenTemporaryFileInTablespace(tblspcOid, false);
1525  }
1526 
1527  /*
1528  * If not, or if tablespace is bad, create in database's default
1529  * tablespace. MyDatabaseTableSpace should normally be set before we get
1530  * here, but just in case it isn't, fall back to pg_default tablespace.
1531  */
1532  if (file <= 0)
1535  DEFAULTTABLESPACE_OID,
1536  true);
1537 
1538  /* Mark it for deletion at close and temporary file size limit */
1539  VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1540 
1541  /* Register it with the current resource owner */
1542  if (!interXact)
1543  RegisterTemporaryFile(file);
1544 
1545  return file;
1546 }
1547 
1548 /*
1549  * Return the path of the temp directory in a given tablespace.
1550  */
1551 void
1553 {
1554  /*
1555  * Identify the tempfile directory for this tablespace.
1556  *
1557  * If someone tries to specify pg_global, use pg_default instead.
1558  */
1559  if (tablespace == InvalidOid ||
1560  tablespace == DEFAULTTABLESPACE_OID ||
1561  tablespace == GLOBALTABLESPACE_OID)
1562  snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1563  else
1564  {
1565  /* All other tablespaces are accessed via symlinks */
1566  snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1567  tablespace, TABLESPACE_VERSION_DIRECTORY,
1569  }
1570 }
1571 
1572 /*
1573  * Open a temporary file in a specific tablespace.
1574  * Subroutine for OpenTemporaryFile, which see for details.
1575  */
1576 static File
1577 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1578 {
1579  char tempdirpath[MAXPGPATH];
1580  char tempfilepath[MAXPGPATH];
1581  File file;
1582 
1583  TempTablespacePath(tempdirpath, tblspcOid);
1584 
1585  /*
1586  * Generate a tempfile name that should be unique within the current
1587  * database instance.
1588  */
1589  snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1590  tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1591 
1592  /*
1593  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1594  * temp file that can be reused.
1595  */
1596  file = PathNameOpenFile(tempfilepath,
1597  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1598  if (file <= 0)
1599  {
1600  /*
1601  * We might need to create the tablespace's tempfile directory, if no
1602  * one has yet done so.
1603  *
1604  * Don't check for an error from MakePGDirectory; it could fail if
1605  * someone else just did the same thing. If it doesn't work then
1606  * we'll bomb out on the second create attempt, instead.
1607  */
1608  (void) MakePGDirectory(tempdirpath);
1609 
1610  file = PathNameOpenFile(tempfilepath,
1611  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1612  if (file <= 0 && rejectError)
1613  elog(ERROR, "could not create temporary file \"%s\": %m",
1614  tempfilepath);
1615  }
1616 
1617  return file;
1618 }
1619 
1620 
1621 /*
1622  * Create a new file. The directory containing it must already exist. Files
1623  * created this way are subject to temp_file_limit and are automatically
1624  * closed at end of transaction, but are not automatically deleted on close
1625  * because they are intended to be shared between cooperating backends.
1626  *
1627  * If the file is inside the top-level temporary directory, its name should
1628  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1629  * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1630  * inside a directory created with PathNameCreateTemporaryDir(), in which case
1631  * the prefix isn't needed.
1632  */
1633 File
1634 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1635 {
1636  File file;
1637 
1639 
1640  /*
1641  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1642  * temp file that can be reused.
1643  */
1644  file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1645  if (file <= 0)
1646  {
1647  if (error_on_failure)
1648  ereport(ERROR,
1650  errmsg("could not create temporary file \"%s\": %m",
1651  path)));
1652  else
1653  return file;
1654  }
1655 
1656  /* Mark it for temp_file_limit accounting. */
1657  VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1658 
1659  /* Register it for automatic close. */
1660  RegisterTemporaryFile(file);
1661 
1662  return file;
1663 }
1664 
1665 /*
1666  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1667  * another backend. Files opened this way don't count against the
1668  * temp_file_limit of the caller, are read-only and are automatically closed
1669  * at the end of the transaction but are not deleted on close.
1670  */
1671 File
1672 PathNameOpenTemporaryFile(const char *path)
1673 {
1674  File file;
1675 
1677 
1678  /* We open the file read-only. */
1679  file = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
1680 
1681  /* If no such file, then we don't raise an error. */
1682  if (file <= 0 && errno != ENOENT)
1683  ereport(ERROR,
1685  errmsg("could not open temporary file \"%s\": %m",
1686  path)));
1687 
1688  if (file > 0)
1689  {
1690  /* Register it for automatic close. */
1691  RegisterTemporaryFile(file);
1692  }
1693 
1694  return file;
1695 }
1696 
1697 /*
1698  * Delete a file by pathname. Return true if the file existed, false if
1699  * didn't.
1700  */
1701 bool
1702 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1703 {
1704  struct stat filestats;
1705  int stat_errno;
1706 
1707  /* Get the final size for pgstat reporting. */
1708  if (stat(path, &filestats) != 0)
1709  stat_errno = errno;
1710  else
1711  stat_errno = 0;
1712 
1713  /*
1714  * Unlike FileClose's automatic file deletion code, we tolerate
1715  * non-existence to support BufFileDeleteShared which doesn't know how
1716  * many segments it has to delete until it runs out.
1717  */
1718  if (stat_errno == ENOENT)
1719  return false;
1720 
1721  if (unlink(path) < 0)
1722  {
1723  if (errno != ENOENT)
1724  ereport(error_on_failure ? ERROR : LOG,
1726  errmsg("could not unlink temporary file \"%s\": %m",
1727  path)));
1728  return false;
1729  }
1730 
1731  if (stat_errno == 0)
1732  ReportTemporaryFileUsage(path, filestats.st_size);
1733  else
1734  {
1735  errno = stat_errno;
1736  ereport(LOG,
1738  errmsg("could not stat file \"%s\": %m", path)));
1739  }
1740 
1741  return true;
1742 }
1743 
1744 /*
1745  * close a file when done with it
1746  */
1747 void
1749 {
1750  Vfd *vfdP;
1751 
1752  Assert(FileIsValid(file));
1753 
1754  DO_DB(elog(LOG, "FileClose: %d (%s)",
1755  file, VfdCache[file].fileName));
1756 
1757  vfdP = &VfdCache[file];
1758 
1759  if (!FileIsNotOpen(file))
1760  {
1761  /* close the file */
1762  if (close(vfdP->fd) != 0)
1763  {
1764  /*
1765  * We may need to panic on failure to close non-temporary files;
1766  * see LruDelete.
1767  */
1769  "could not close file \"%s\": %m", vfdP->fileName);
1770  }
1771 
1772  --nfile;
1773  vfdP->fd = VFD_CLOSED;
1774 
1775  /* remove the file from the lru ring */
1776  Delete(file);
1777  }
1778 
1779  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1780  {
1781  /* Subtract its size from current usage (do first in case of error) */
1782  temporary_files_size -= vfdP->fileSize;
1783  vfdP->fileSize = 0;
1784  }
1785 
1786  /*
1787  * Delete the file if it was temporary, and make a log entry if wanted
1788  */
1789  if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
1790  {
1791  struct stat filestats;
1792  int stat_errno;
1793 
1794  /*
1795  * If we get an error, as could happen within the ereport/elog calls,
1796  * we'll come right back here during transaction abort. Reset the
1797  * flag to ensure that we can't get into an infinite loop. This code
1798  * is arranged to ensure that the worst-case consequence is failing to
1799  * emit log message(s), not failing to attempt the unlink.
1800  */
1801  vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
1802 
1803 
1804  /* first try the stat() */
1805  if (stat(vfdP->fileName, &filestats))
1806  stat_errno = errno;
1807  else
1808  stat_errno = 0;
1809 
1810  /* in any case do the unlink */
1811  if (unlink(vfdP->fileName))
1812  elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
1813 
1814  /* and last report the stat results */
1815  if (stat_errno == 0)
1816  ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
1817  else
1818  {
1819  errno = stat_errno;
1820  elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
1821  }
1822  }
1823 
1824  /* Unregister it from the resource owner */
1825  if (vfdP->resowner)
1826  ResourceOwnerForgetFile(vfdP->resowner, file);
1827 
1828  /*
1829  * Return the Vfd slot to the free list
1830  */
1831  FreeVfd(file);
1832 }
1833 
1834 /*
1835  * FilePrefetch - initiate asynchronous read of a given range of the file.
1836  *
1837  * Currently the only implementation of this function is using posix_fadvise
1838  * which is the simplest standardized interface that accomplishes this.
1839  * We could add an implementation using libaio in the future; but note that
1840  * this API is inappropriate for libaio, which wants to have a buffer provided
1841  * to read into.
1842  */
1843 int
1844 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1845 {
1846 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1847  int returnCode;
1848 
1849  Assert(FileIsValid(file));
1850 
1851  DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1852  file, VfdCache[file].fileName,
1853  (int64) offset, amount));
1854 
1855  returnCode = FileAccess(file);
1856  if (returnCode < 0)
1857  return returnCode;
1858 
1859  pgstat_report_wait_start(wait_event_info);
1860  returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1861  POSIX_FADV_WILLNEED);
1863 
1864  return returnCode;
1865 #else
1866  Assert(FileIsValid(file));
1867  return 0;
1868 #endif
1869 }
1870 
1871 void
1872 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1873 {
1874  int returnCode;
1875 
1876  Assert(FileIsValid(file));
1877 
1878  DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1879  file, VfdCache[file].fileName,
1880  (int64) offset, (int64) nbytes));
1881 
1882  if (nbytes <= 0)
1883  return;
1884 
1885  returnCode = FileAccess(file);
1886  if (returnCode < 0)
1887  return;
1888 
1889  pgstat_report_wait_start(wait_event_info);
1890  pg_flush_data(VfdCache[file].fd, offset, nbytes);
1892 }
1893 
1894 int
1895 FileRead(File file, char *buffer, int amount, off_t offset,
1896  uint32 wait_event_info)
1897 {
1898  int returnCode;
1899  Vfd *vfdP;
1900 
1901  Assert(FileIsValid(file));
1902 
1903  DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1904  file, VfdCache[file].fileName,
1905  (int64) offset,
1906  amount, buffer));
1907 
1908  returnCode = FileAccess(file);
1909  if (returnCode < 0)
1910  return returnCode;
1911 
1912  vfdP = &VfdCache[file];
1913 
1914 retry:
1915  pgstat_report_wait_start(wait_event_info);
1916  returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
1918 
1919  if (returnCode < 0)
1920  {
1921  /*
1922  * Windows may run out of kernel buffers and return "Insufficient
1923  * system resources" error. Wait a bit and retry to solve it.
1924  *
1925  * It is rumored that EINTR is also possible on some Unix filesystems,
1926  * in which case immediate retry is indicated.
1927  */
1928 #ifdef WIN32
1929  DWORD error = GetLastError();
1930 
1931  switch (error)
1932  {
1933  case ERROR_NO_SYSTEM_RESOURCES:
1934  pg_usleep(1000L);
1935  errno = EINTR;
1936  break;
1937  default:
1938  _dosmaperr(error);
1939  break;
1940  }
1941 #endif
1942  /* OK to retry if interrupted */
1943  if (errno == EINTR)
1944  goto retry;
1945  }
1946 
1947  return returnCode;
1948 }
1949 
1950 int
1951 FileWrite(File file, char *buffer, int amount, off_t offset,
1952  uint32 wait_event_info)
1953 {
1954  int returnCode;
1955  Vfd *vfdP;
1956 
1957  Assert(FileIsValid(file));
1958 
1959  DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
1960  file, VfdCache[file].fileName,
1961  (int64) offset,
1962  amount, buffer));
1963 
1964  returnCode = FileAccess(file);
1965  if (returnCode < 0)
1966  return returnCode;
1967 
1968  vfdP = &VfdCache[file];
1969 
1970  /*
1971  * If enforcing temp_file_limit and it's a temp file, check to see if the
1972  * write would overrun temp_file_limit, and throw error if so. Note: it's
1973  * really a modularity violation to throw error here; we should set errno
1974  * and return -1. However, there's no way to report a suitable error
1975  * message if we do that. All current callers would just throw error
1976  * immediately anyway, so this is safe at present.
1977  */
1978  if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
1979  {
1980  off_t past_write = offset + amount;
1981 
1982  if (past_write > vfdP->fileSize)
1983  {
1984  uint64 newTotal = temporary_files_size;
1985 
1986  newTotal += past_write - vfdP->fileSize;
1987  if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
1988  ereport(ERROR,
1989  (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
1990  errmsg("temporary file size exceeds temp_file_limit (%dkB)",
1991  temp_file_limit)));
1992  }
1993  }
1994 
1995 retry:
1996  errno = 0;
1997  pgstat_report_wait_start(wait_event_info);
1998  returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
2000 
2001  /* if write didn't set errno, assume problem is no disk space */
2002  if (returnCode != amount && errno == 0)
2003  errno = ENOSPC;
2004 
2005  if (returnCode >= 0)
2006  {
2007  /*
2008  * Maintain fileSize and temporary_files_size if it's a temp file.
2009  */
2010  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2011  {
2012  off_t past_write = offset + amount;
2013 
2014  if (past_write > vfdP->fileSize)
2015  {
2016  temporary_files_size += past_write - vfdP->fileSize;
2017  vfdP->fileSize = past_write;
2018  }
2019  }
2020  }
2021  else
2022  {
2023  /*
2024  * See comments in FileRead()
2025  */
2026 #ifdef WIN32
2027  DWORD error = GetLastError();
2028 
2029  switch (error)
2030  {
2031  case ERROR_NO_SYSTEM_RESOURCES:
2032  pg_usleep(1000L);
2033  errno = EINTR;
2034  break;
2035  default:
2036  _dosmaperr(error);
2037  break;
2038  }
2039 #endif
2040  /* OK to retry if interrupted */
2041  if (errno == EINTR)
2042  goto retry;
2043  }
2044 
2045  return returnCode;
2046 }
2047 
2048 int
2049 FileSync(File file, uint32 wait_event_info)
2050 {
2051  int returnCode;
2052 
2053  Assert(FileIsValid(file));
2054 
2055  DO_DB(elog(LOG, "FileSync: %d (%s)",
2056  file, VfdCache[file].fileName));
2057 
2058  returnCode = FileAccess(file);
2059  if (returnCode < 0)
2060  return returnCode;
2061 
2062  pgstat_report_wait_start(wait_event_info);
2063  returnCode = pg_fsync(VfdCache[file].fd);
2065 
2066  return returnCode;
2067 }
2068 
2069 off_t
2071 {
2072  Assert(FileIsValid(file));
2073 
2074  DO_DB(elog(LOG, "FileSize %d (%s)",
2075  file, VfdCache[file].fileName));
2076 
2077  if (FileIsNotOpen(file))
2078  {
2079  if (FileAccess(file) < 0)
2080  return (off_t) -1;
2081  }
2082 
2083  return lseek(VfdCache[file].fd, 0, SEEK_END);
2084 }
2085 
2086 int
2087 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2088 {
2089  int returnCode;
2090 
2091  Assert(FileIsValid(file));
2092 
2093  DO_DB(elog(LOG, "FileTruncate %d (%s)",
2094  file, VfdCache[file].fileName));
2095 
2096  returnCode = FileAccess(file);
2097  if (returnCode < 0)
2098  return returnCode;
2099 
2100  pgstat_report_wait_start(wait_event_info);
2101  returnCode = ftruncate(VfdCache[file].fd, offset);
2103 
2104  if (returnCode == 0 && VfdCache[file].fileSize > offset)
2105  {
2106  /* adjust our state for truncation of a temp file */
2107  Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2108  temporary_files_size -= VfdCache[file].fileSize - offset;
2109  VfdCache[file].fileSize = offset;
2110  }
2111 
2112  return returnCode;
2113 }
2114 
2115 /*
2116  * Return the pathname associated with an open file.
2117  *
2118  * The returned string points to an internal buffer, which is valid until
2119  * the file is closed.
2120  */
2121 char *
2123 {
2124  Assert(FileIsValid(file));
2125 
2126  return VfdCache[file].fileName;
2127 }
2128 
2129 /*
2130  * Return the raw file descriptor of an opened file.
2131  *
2132  * The returned file descriptor will be valid until the file is closed, but
2133  * there are a lot of things that can make that happen. So the caller should
2134  * be careful not to do much of anything else before it finishes using the
2135  * returned file descriptor.
2136  */
2137 int
2139 {
2140  Assert(FileIsValid(file));
2141  return VfdCache[file].fd;
2142 }
2143 
2144 /*
2145  * FileGetRawFlags - returns the file flags on open(2)
2146  */
2147 int
2149 {
2150  Assert(FileIsValid(file));
2151  return VfdCache[file].fileFlags;
2152 }
2153 
2154 /*
2155  * FileGetRawMode - returns the mode bitmask passed to open(2)
2156  */
2157 mode_t
2159 {
2160  Assert(FileIsValid(file));
2161  return VfdCache[file].fileMode;
2162 }
2163 
2164 /*
2165  * Make room for another allocatedDescs[] array entry if needed and possible.
2166  * Returns true if an array element is available.
2167  */
2168 static bool
2170 {
2171  AllocateDesc *newDescs;
2172  int newMax;
2173 
2174  /* Quick out if array already has a free slot. */
2176  return true;
2177 
2178  /*
2179  * If the array hasn't yet been created in the current process, initialize
2180  * it with FD_MINFREE / 2 elements. In many scenarios this is as many as
2181  * we will ever need, anyway. We don't want to look at max_safe_fds
2182  * immediately because set_max_safe_fds() may not have run yet.
2183  */
2184  if (allocatedDescs == NULL)
2185  {
2186  newMax = FD_MINFREE / 2;
2187  newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2188  /* Out of memory already? Treat as fatal error. */
2189  if (newDescs == NULL)
2190  ereport(ERROR,
2191  (errcode(ERRCODE_OUT_OF_MEMORY),
2192  errmsg("out of memory")));
2193  allocatedDescs = newDescs;
2194  maxAllocatedDescs = newMax;
2195  return true;
2196  }
2197 
2198  /*
2199  * Consider enlarging the array beyond the initial allocation used above.
2200  * By the time this happens, max_safe_fds should be known accurately.
2201  *
2202  * We mustn't let allocated descriptors hog all the available FDs, and in
2203  * practice we'd better leave a reasonable number of FDs for VFD use. So
2204  * set the maximum to max_safe_fds / 2. (This should certainly be at
2205  * least as large as the initial size, FD_MINFREE / 2.)
2206  */
2207  newMax = max_safe_fds / 2;
2208  if (newMax > maxAllocatedDescs)
2209  {
2210  newDescs = (AllocateDesc *) realloc(allocatedDescs,
2211  newMax * sizeof(AllocateDesc));
2212  /* Treat out-of-memory as a non-fatal error. */
2213  if (newDescs == NULL)
2214  return false;
2215  allocatedDescs = newDescs;
2216  maxAllocatedDescs = newMax;
2217  return true;
2218  }
2219 
2220  /* Can't enlarge allocatedDescs[] any more. */
2221  return false;
2222 }
2223 
2224 /*
2225  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2226  * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2227  * necessary to open the file. When done, call FreeFile rather than fclose.
2228  *
2229  * Note that files that will be open for any significant length of time
2230  * should NOT be handled this way, since they cannot share kernel file
2231  * descriptors with other files; there is grave risk of running out of FDs
2232  * if anyone locks down too many FDs. Most callers of this routine are
2233  * simply reading a config file that they will read and close immediately.
2234  *
2235  * fd.c will automatically close all files opened with AllocateFile at
2236  * transaction commit or abort; this prevents FD leakage if a routine
2237  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2238  *
2239  * Ideally this should be the *only* direct call of fopen() in the backend.
2240  */
2241 FILE *
2242 AllocateFile(const char *name, const char *mode)
2243 {
2244  FILE *file;
2245 
2246  DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2247  numAllocatedDescs, name));
2248 
2249  /* Can we allocate another non-virtual FD? */
2250  if (!reserveAllocatedDesc())
2251  ereport(ERROR,
2252  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2253  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2254  maxAllocatedDescs, name)));
2255 
2256  /* Close excess kernel FDs. */
2257  ReleaseLruFiles();
2258 
2259 TryAgain:
2260  if ((file = fopen(name, mode)) != NULL)
2261  {
2262  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2263 
2264  desc->kind = AllocateDescFile;
2265  desc->desc.file = file;
2268  return desc->desc.file;
2269  }
2270 
2271  if (errno == EMFILE || errno == ENFILE)
2272  {
2273  int save_errno = errno;
2274 
2275  ereport(LOG,
2276  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2277  errmsg("out of file descriptors: %m; release and retry")));
2278  errno = 0;
2279  if (ReleaseLruFile())
2280  goto TryAgain;
2281  errno = save_errno;
2282  }
2283 
2284  return NULL;
2285 }
2286 
2287 /*
2288  * Open a file with OpenTransientFilePerm() and pass default file mode for
2289  * the fileMode parameter.
2290  */
2291 int
2293 {
2294  return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2295 }
2296 
2297 /*
2298  * Like AllocateFile, but returns an unbuffered fd like open(2)
2299  */
2300 int
2302 {
2303  int fd;
2304 
2305  DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2306  numAllocatedDescs, fileName));
2307 
2308  /* Can we allocate another non-virtual FD? */
2309  if (!reserveAllocatedDesc())
2310  ereport(ERROR,
2311  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2312  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2313  maxAllocatedDescs, fileName)));
2314 
2315  /* Close excess kernel FDs. */
2316  ReleaseLruFiles();
2317 
2318  fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2319 
2320  if (fd >= 0)
2321  {
2322  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2323 
2324  desc->kind = AllocateDescRawFD;
2325  desc->desc.fd = fd;
2328 
2329  return fd;
2330  }
2331 
2332  return -1; /* failure */
2333 }
2334 
2335 /*
2336  * Routines that want to initiate a pipe stream should use OpenPipeStream
2337  * rather than plain popen(). This lets fd.c deal with freeing FDs if
2338  * necessary. When done, call ClosePipeStream rather than pclose.
2339  *
2340  * This function also ensures that the popen'd program is run with default
2341  * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2342  * uses. This ensures desirable response to, eg, closing a read pipe early.
2343  */
2344 FILE *
2345 OpenPipeStream(const char *command, const char *mode)
2346 {
2347  FILE *file;
2348  int save_errno;
2349 
2350  DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2351  numAllocatedDescs, command));
2352 
2353  /* Can we allocate another non-virtual FD? */
2354  if (!reserveAllocatedDesc())
2355  ereport(ERROR,
2356  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2357  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2358  maxAllocatedDescs, command)));
2359 
2360  /* Close excess kernel FDs. */
2361  ReleaseLruFiles();
2362 
2363 TryAgain:
2364  fflush(stdout);
2365  fflush(stderr);
2367  errno = 0;
2368  file = popen(command, mode);
2369  save_errno = errno;
2371  errno = save_errno;
2372  if (file != NULL)
2373  {
2374  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2375 
2376  desc->kind = AllocateDescPipe;
2377  desc->desc.file = file;
2380  return desc->desc.file;
2381  }
2382 
2383  if (errno == EMFILE || errno == ENFILE)
2384  {
2385  ereport(LOG,
2386  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2387  errmsg("out of file descriptors: %m; release and retry")));
2388  if (ReleaseLruFile())
2389  goto TryAgain;
2390  errno = save_errno;
2391  }
2392 
2393  return NULL;
2394 }
2395 
2396 /*
2397  * Free an AllocateDesc of any type.
2398  *
2399  * The argument *must* point into the allocatedDescs[] array.
2400  */
2401 static int
2403 {
2404  int result;
2405 
2406  /* Close the underlying object */
2407  switch (desc->kind)
2408  {
2409  case AllocateDescFile:
2410  result = fclose(desc->desc.file);
2411  break;
2412  case AllocateDescPipe:
2413  result = pclose(desc->desc.file);
2414  break;
2415  case AllocateDescDir:
2416  result = closedir(desc->desc.dir);
2417  break;
2418  case AllocateDescRawFD:
2419  result = close(desc->desc.fd);
2420  break;
2421  default:
2422  elog(ERROR, "AllocateDesc kind not recognized");
2423  result = 0; /* keep compiler quiet */
2424  break;
2425  }
2426 
2427  /* Compact storage in the allocatedDescs array */
2429  *desc = allocatedDescs[numAllocatedDescs];
2430 
2431  return result;
2432 }
2433 
2434 /*
2435  * Close a file returned by AllocateFile.
2436  *
2437  * Note we do not check fclose's return value --- it is up to the caller
2438  * to handle close errors.
2439  */
2440 int
2441 FreeFile(FILE *file)
2442 {
2443  int i;
2444 
2445  DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2446 
2447  /* Remove file from list of allocated files, if it's present */
2448  for (i = numAllocatedDescs; --i >= 0;)
2449  {
2450  AllocateDesc *desc = &allocatedDescs[i];
2451 
2452  if (desc->kind == AllocateDescFile && desc->desc.file == file)
2453  return FreeDesc(desc);
2454  }
2455 
2456  /* Only get here if someone passes us a file not in allocatedDescs */
2457  elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2458 
2459  return fclose(file);
2460 }
2461 
2462 /*
2463  * Close a file returned by OpenTransientFile.
2464  *
2465  * Note we do not check close's return value --- it is up to the caller
2466  * to handle close errors.
2467  */
2468 int
2470 {
2471  int i;
2472 
2473  DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2474 
2475  /* Remove fd from list of allocated files, if it's present */
2476  for (i = numAllocatedDescs; --i >= 0;)
2477  {
2478  AllocateDesc *desc = &allocatedDescs[i];
2479 
2480  if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2481  return FreeDesc(desc);
2482  }
2483 
2484  /* Only get here if someone passes us a file not in allocatedDescs */
2485  elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2486 
2487  return close(fd);
2488 }
2489 
2490 /*
2491  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2492  * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2493  * necessary to open the directory, and with closing it after an elog.
2494  * When done, call FreeDir rather than closedir.
2495  *
2496  * Returns NULL, with errno set, on failure. Note that failure detection
2497  * is commonly left to the following call of ReadDir or ReadDirExtended;
2498  * see the comments for ReadDir.
2499  *
2500  * Ideally this should be the *only* direct call of opendir() in the backend.
2501  */
2502 DIR *
2503 AllocateDir(const char *dirname)
2504 {
2505  DIR *dir;
2506 
2507  DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2508  numAllocatedDescs, dirname));
2509 
2510  /* Can we allocate another non-virtual FD? */
2511  if (!reserveAllocatedDesc())
2512  ereport(ERROR,
2513  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2514  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2515  maxAllocatedDescs, dirname)));
2516 
2517  /* Close excess kernel FDs. */
2518  ReleaseLruFiles();
2519 
2520 TryAgain:
2521  if ((dir = opendir(dirname)) != NULL)
2522  {
2523  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2524 
2525  desc->kind = AllocateDescDir;
2526  desc->desc.dir = dir;
2529  return desc->desc.dir;
2530  }
2531 
2532  if (errno == EMFILE || errno == ENFILE)
2533  {
2534  int save_errno = errno;
2535 
2536  ereport(LOG,
2537  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2538  errmsg("out of file descriptors: %m; release and retry")));
2539  errno = 0;
2540  if (ReleaseLruFile())
2541  goto TryAgain;
2542  errno = save_errno;
2543  }
2544 
2545  return NULL;
2546 }
2547 
2548 /*
2549  * Read a directory opened with AllocateDir, ereport'ing any error.
2550  *
2551  * This is easier to use than raw readdir() since it takes care of some
2552  * otherwise rather tedious and error-prone manipulation of errno. Also,
2553  * if you are happy with a generic error message for AllocateDir failure,
2554  * you can just do
2555  *
2556  * dir = AllocateDir(path);
2557  * while ((dirent = ReadDir(dir, path)) != NULL)
2558  * process dirent;
2559  * FreeDir(dir);
2560  *
2561  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2562  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2563  * use this shortcut.)
2564  *
2565  * The pathname passed to AllocateDir must be passed to this routine too,
2566  * but it is only used for error reporting.
2567  */
2568 struct dirent *
2569 ReadDir(DIR *dir, const char *dirname)
2570 {
2571  return ReadDirExtended(dir, dirname, ERROR);
2572 }
2573 
2574 /*
2575  * Alternate version of ReadDir that allows caller to specify the elevel
2576  * for any error report (whether it's reporting an initial failure of
2577  * AllocateDir or a subsequent directory read failure).
2578  *
2579  * If elevel < ERROR, returns NULL after any error. With the normal coding
2580  * pattern, this will result in falling out of the loop immediately as
2581  * though the directory contained no (more) entries.
2582  */
2583 struct dirent *
2584 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2585 {
2586  struct dirent *dent;
2587 
2588  /* Give a generic message for AllocateDir failure, if caller didn't */
2589  if (dir == NULL)
2590  {
2591  ereport(elevel,
2593  errmsg("could not open directory \"%s\": %m",
2594  dirname)));
2595  return NULL;
2596  }
2597 
2598  errno = 0;
2599  if ((dent = readdir(dir)) != NULL)
2600  return dent;
2601 
2602  if (errno)
2603  ereport(elevel,
2605  errmsg("could not read directory \"%s\": %m",
2606  dirname)));
2607  return NULL;
2608 }
2609 
2610 /*
2611  * Close a directory opened with AllocateDir.
2612  *
2613  * Returns closedir's return value (with errno set if it's not 0).
2614  * Note we do not check the return value --- it is up to the caller
2615  * to handle close errors if wanted.
2616  *
2617  * Does nothing if dir == NULL; we assume that directory open failure was
2618  * already reported if desired.
2619  */
2620 int
2622 {
2623  int i;
2624 
2625  /* Nothing to do if AllocateDir failed */
2626  if (dir == NULL)
2627  return 0;
2628 
2629  DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2630 
2631  /* Remove dir from list of allocated dirs, if it's present */
2632  for (i = numAllocatedDescs; --i >= 0;)
2633  {
2634  AllocateDesc *desc = &allocatedDescs[i];
2635 
2636  if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2637  return FreeDesc(desc);
2638  }
2639 
2640  /* Only get here if someone passes us a dir not in allocatedDescs */
2641  elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2642 
2643  return closedir(dir);
2644 }
2645 
2646 
2647 /*
2648  * Close a pipe stream returned by OpenPipeStream.
2649  */
2650 int
2651 ClosePipeStream(FILE *file)
2652 {
2653  int i;
2654 
2655  DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2656 
2657  /* Remove file from list of allocated files, if it's present */
2658  for (i = numAllocatedDescs; --i >= 0;)
2659  {
2660  AllocateDesc *desc = &allocatedDescs[i];
2661 
2662  if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2663  return FreeDesc(desc);
2664  }
2665 
2666  /* Only get here if someone passes us a file not in allocatedDescs */
2667  elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2668 
2669  return pclose(file);
2670 }
2671 
2672 /*
2673  * closeAllVfds
2674  *
2675  * Force all VFDs into the physically-closed state, so that the fewest
2676  * possible number of kernel file descriptors are in use. There is no
2677  * change in the logical state of the VFDs.
2678  */
2679 void
2681 {
2682  Index i;
2683 
2684  if (SizeVfdCache > 0)
2685  {
2686  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2687  for (i = 1; i < SizeVfdCache; i++)
2688  {
2689  if (!FileIsNotOpen(i))
2690  LruDelete(i);
2691  }
2692  }
2693 }
2694 
2695 
2696 /*
2697  * SetTempTablespaces
2698  *
2699  * Define a list (actually an array) of OIDs of tablespaces to use for
2700  * temporary files. This list will be used until end of transaction,
2701  * unless this function is called again before then. It is caller's
2702  * responsibility that the passed-in array has adequate lifespan (typically
2703  * it'd be allocated in TopTransactionContext).
2704  */
2705 void
2706 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2707 {
2708  Assert(numSpaces >= 0);
2709  tempTableSpaces = tableSpaces;
2710  numTempTableSpaces = numSpaces;
2711 
2712  /*
2713  * Select a random starting point in the list. This is to minimize
2714  * conflicts between backends that are most likely sharing the same list
2715  * of temp tablespaces. Note that if we create multiple temp files in the
2716  * same transaction, we'll advance circularly through the list --- this
2717  * ensures that large temporary sort files are nicely spread across all
2718  * available tablespaces.
2719  */
2720  if (numSpaces > 1)
2721  nextTempTableSpace = random() % numSpaces;
2722  else
2723  nextTempTableSpace = 0;
2724 }
2725 
2726 /*
2727  * TempTablespacesAreSet
2728  *
2729  * Returns true if SetTempTablespaces has been called in current transaction.
2730  * (This is just so that tablespaces.c doesn't need its own per-transaction
2731  * state.)
2732  */
2733 bool
2735 {
2736  return (numTempTableSpaces >= 0);
2737 }
2738 
2739 /*
2740  * GetTempTablespaces
2741  *
2742  * Populate an array with the OIDs of the tablespaces that should be used for
2743  * temporary files. Return the number that were copied into the output array.
2744  */
2745 int
2746 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2747 {
2748  int i;
2749 
2751  for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2752  tableSpaces[i] = tempTableSpaces[i];
2753 
2754  return i;
2755 }
2756 
2757 /*
2758  * GetNextTempTableSpace
2759  *
2760  * Select the next temp tablespace to use. A result of InvalidOid means
2761  * to use the current database's default tablespace.
2762  */
2763 Oid
2765 {
2766  if (numTempTableSpaces > 0)
2767  {
2768  /* Advance nextTempTableSpace counter with wraparound */
2770  nextTempTableSpace = 0;
2772  }
2773  return InvalidOid;
2774 }
2775 
2776 
2777 /*
2778  * AtEOSubXact_Files
2779  *
2780  * Take care of subtransaction commit/abort. At abort, we close temp files
2781  * that the subtransaction may have opened. At commit, we reassign the
2782  * files that were opened to the parent subtransaction.
2783  */
2784 void
2785 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2786  SubTransactionId parentSubid)
2787 {
2788  Index i;
2789 
2790  for (i = 0; i < numAllocatedDescs; i++)
2791  {
2792  if (allocatedDescs[i].create_subid == mySubid)
2793  {
2794  if (isCommit)
2795  allocatedDescs[i].create_subid = parentSubid;
2796  else
2797  {
2798  /* have to recheck the item after FreeDesc (ugly) */
2799  FreeDesc(&allocatedDescs[i--]);
2800  }
2801  }
2802  }
2803 }
2804 
2805 /*
2806  * AtEOXact_Files
2807  *
2808  * This routine is called during transaction commit or abort. All still-open
2809  * per-transaction temporary file VFDs are closed, which also causes the
2810  * underlying files to be deleted (although they should've been closed already
2811  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
2812  * closed. We also forget any transaction-local temp tablespace list.
2813  *
2814  * The isCommit flag is used only to decide whether to emit warnings about
2815  * unclosed files.
2816  */
2817 void
2818 AtEOXact_Files(bool isCommit)
2819 {
2820  CleanupTempFiles(isCommit, false);
2821  tempTableSpaces = NULL;
2822  numTempTableSpaces = -1;
2823 }
2824 
2825 /*
2826  * AtProcExit_Files
2827  *
2828  * on_proc_exit hook to clean up temp files during backend shutdown.
2829  * Here, we want to clean up *all* temp files including interXact ones.
2830  */
2831 static void
2833 {
2834  CleanupTempFiles(false, true);
2835 }
2836 
2837 /*
2838  * Close temporary files and delete their underlying files.
2839  *
2840  * isCommit: if true, this is normal transaction commit, and we don't
2841  * expect any remaining files; warn if there are some.
2842  *
2843  * isProcExit: if true, this is being called as the backend process is
2844  * exiting. If that's the case, we should remove all temporary files; if
2845  * that's not the case, we are being called for transaction commit/abort
2846  * and should only remove transaction-local temp files. In either case,
2847  * also clean up "allocated" stdio files, dirs and fds.
2848  */
2849 static void
2850 CleanupTempFiles(bool isCommit, bool isProcExit)
2851 {
2852  Index i;
2853 
2854  /*
2855  * Careful here: at proc_exit we need extra cleanup, not just
2856  * xact_temporary files.
2857  */
2858  if (isProcExit || have_xact_temporary_files)
2859  {
2860  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2861  for (i = 1; i < SizeVfdCache; i++)
2862  {
2863  unsigned short fdstate = VfdCache[i].fdstate;
2864 
2865  if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
2866  VfdCache[i].fileName != NULL)
2867  {
2868  /*
2869  * If we're in the process of exiting a backend process, close
2870  * all temporary files. Otherwise, only close temporary files
2871  * local to the current transaction. They should be closed by
2872  * the ResourceOwner mechanism already, so this is just a
2873  * debugging cross-check.
2874  */
2875  if (isProcExit)
2876  FileClose(i);
2877  else if (fdstate & FD_CLOSE_AT_EOXACT)
2878  {
2879  elog(WARNING,
2880  "temporary file %s not closed at end-of-transaction",
2881  VfdCache[i].fileName);
2882  FileClose(i);
2883  }
2884  }
2885  }
2886 
2887  have_xact_temporary_files = false;
2888  }
2889 
2890  /* Complain if any allocated files remain open at commit. */
2891  if (isCommit && numAllocatedDescs > 0)
2892  elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
2894 
2895  /* Clean up "allocated" stdio files, dirs and fds. */
2896  while (numAllocatedDescs > 0)
2897  FreeDesc(&allocatedDescs[0]);
2898 }
2899 
2900 
2901 /*
2902  * Remove temporary and temporary relation files left over from a prior
2903  * postmaster session
2904  *
2905  * This should be called during postmaster startup. It will forcibly
2906  * remove any leftover files created by OpenTemporaryFile and any leftover
2907  * temporary relation files created by mdcreate.
2908  *
2909  * NOTE: we could, but don't, call this during a post-backend-crash restart
2910  * cycle. The argument for not doing it is that someone might want to examine
2911  * the temp files for debugging purposes. This does however mean that
2912  * OpenTemporaryFile had better allow for collision with an existing temp
2913  * file name.
2914  *
2915  * NOTE: this function and its subroutines generally report syscall failures
2916  * with ereport(LOG) and keep going. Removing temp files is not so critical
2917  * that we should fail to start the database when we can't do it.
2918  */
2919 void
2921 {
2922  char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
2923  DIR *spc_dir;
2924  struct dirent *spc_de;
2925 
2926  /*
2927  * First process temp files in pg_default ($PGDATA/base)
2928  */
2929  snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
2930  RemovePgTempFilesInDir(temp_path, true, false);
2931  RemovePgTempRelationFiles("base");
2932 
2933  /*
2934  * Cycle through temp directories for all non-default tablespaces.
2935  */
2936  spc_dir = AllocateDir("pg_tblspc");
2937 
2938  while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
2939  {
2940  if (strcmp(spc_de->d_name, ".") == 0 ||
2941  strcmp(spc_de->d_name, "..") == 0)
2942  continue;
2943 
2944  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
2946  RemovePgTempFilesInDir(temp_path, true, false);
2947 
2948  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
2950  RemovePgTempRelationFiles(temp_path);
2951  }
2952 
2953  FreeDir(spc_dir);
2954 
2955  /*
2956  * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
2957  * DataDir as well. However, that is *not* cleaned here because doing so
2958  * would create a race condition. It's done separately, earlier in
2959  * postmaster startup.
2960  */
2961 }
2962 
2963 /*
2964  * Process one pgsql_tmp directory for RemovePgTempFiles.
2965  *
2966  * If missing_ok is true, it's all right for the named directory to not exist.
2967  * Any other problem results in a LOG message. (missing_ok should be true at
2968  * the top level, since pgsql_tmp directories are not created until needed.)
2969  *
2970  * At the top level, this should be called with unlink_all = false, so that
2971  * only files matching the temporary name prefix will be unlinked. When
2972  * recursing it will be called with unlink_all = true to unlink everything
2973  * under a top-level temporary directory.
2974  *
2975  * (These two flags could be replaced by one, but it seems clearer to keep
2976  * them separate.)
2977  */
2978 void
2979 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
2980 {
2981  DIR *temp_dir;
2982  struct dirent *temp_de;
2983  char rm_path[MAXPGPATH * 2];
2984 
2985  temp_dir = AllocateDir(tmpdirname);
2986 
2987  if (temp_dir == NULL && errno == ENOENT && missing_ok)
2988  return;
2989 
2990  while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
2991  {
2992  if (strcmp(temp_de->d_name, ".") == 0 ||
2993  strcmp(temp_de->d_name, "..") == 0)
2994  continue;
2995 
2996  snprintf(rm_path, sizeof(rm_path), "%s/%s",
2997  tmpdirname, temp_de->d_name);
2998 
2999  if (unlink_all ||
3000  strncmp(temp_de->d_name,
3002  strlen(PG_TEMP_FILE_PREFIX)) == 0)
3003  {
3004  struct stat statbuf;
3005 
3006  if (lstat(rm_path, &statbuf) < 0)
3007  {
3008  ereport(LOG,
3010  errmsg("could not stat file \"%s\": %m", rm_path)));
3011  continue;
3012  }
3013 
3014  if (S_ISDIR(statbuf.st_mode))
3015  {
3016  /* recursively remove contents, then directory itself */
3017  RemovePgTempFilesInDir(rm_path, false, true);
3018 
3019  if (rmdir(rm_path) < 0)
3020  ereport(LOG,
3022  errmsg("could not remove directory \"%s\": %m",
3023  rm_path)));
3024  }
3025  else
3026  {
3027  if (unlink(rm_path) < 0)
3028  ereport(LOG,
3030  errmsg("could not remove file \"%s\": %m",
3031  rm_path)));
3032  }
3033  }
3034  else
3035  ereport(LOG,
3036  (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3037  rm_path)));
3038  }
3039 
3040  FreeDir(temp_dir);
3041 }
3042 
3043 /* Process one tablespace directory, look for per-DB subdirectories */
3044 static void
3045 RemovePgTempRelationFiles(const char *tsdirname)
3046 {
3047  DIR *ts_dir;
3048  struct dirent *de;
3049  char dbspace_path[MAXPGPATH * 2];
3050 
3051  ts_dir = AllocateDir(tsdirname);
3052 
3053  while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3054  {
3055  /*
3056  * We're only interested in the per-database directories, which have
3057  * numeric names. Note that this code will also (properly) ignore "."
3058  * and "..".
3059  */
3060  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3061  continue;
3062 
3063  snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3064  tsdirname, de->d_name);
3065  RemovePgTempRelationFilesInDbspace(dbspace_path);
3066  }
3067 
3068  FreeDir(ts_dir);
3069 }
3070 
3071 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3072 static void
3073 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3074 {
3075  DIR *dbspace_dir;
3076  struct dirent *de;
3077  char rm_path[MAXPGPATH * 2];
3078 
3079  dbspace_dir = AllocateDir(dbspacedirname);
3080 
3081  while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3082  {
3083  if (!looks_like_temp_rel_name(de->d_name))
3084  continue;
3085 
3086  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3087  dbspacedirname, de->d_name);
3088 
3089  if (unlink(rm_path) < 0)
3090  ereport(LOG,
3092  errmsg("could not remove file \"%s\": %m",
3093  rm_path)));
3094  }
3095 
3096  FreeDir(dbspace_dir);
3097 }
3098 
/*
 * Test whether a file name has the form of a temporary relation file:
 * t<digits>_<digits>, or t<digits>_<digits>_<forkname>, either of which may
 * be followed by .<digits> (a segment suffix).
 *
 * Returns true only if the whole name matches.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	int			pos;
	int			savepos;

	/* Must start with "t". */
	if (name[0] != 't')
		return false;

	/* Followed by a non-empty string of digits and then an underscore. */
	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (pos == 1 || name[pos] != '_')
		return false;

	/* Followed by another nonempty string of digits. */
	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (savepos == pos)
		return false;

	/* We might have _forkname or .segment or both. */
	if (name[pos] == '_')
	{
		int			forkchar = forkname_chars(&name[pos + 1], NULL);

		if (forkchar <= 0)
			return false;
		/* Step over the underscore plus the fork name. */
		pos += forkchar + 1;
	}
	if (name[pos] == '.')
	{
		int			segchar;

		/* segchar counts the '.' itself, so 1 means "no digits". */
		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
			;
		if (segchar <= 1)
			return false;
		pos += segchar;
	}

	/* Now we should be at the end. */
	if (name[pos] != '\0')
		return false;
	return true;
}
3147 
3148 
3149 /*
3150  * Issue fsync recursively on PGDATA and all its contents.
3151  *
3152  * We fsync regular files and directories wherever they are, but we
3153  * follow symlinks only for pg_wal and immediately under pg_tblspc.
3154  * Other symlinks are presumed to point at files we're not responsible
3155  * for fsyncing, and might not have privileges to write at all.
3156  *
3157  * Errors are logged but not considered fatal; that's because this is used
3158  * only during database startup, to deal with the possibility that there are
3159  * issued-but-unsynced writes pending against the data directory. We want to
3160  * ensure that such writes reach disk before anything that's done in the new
3161  * run. However, aborting on error would result in failure to start for
3162  * harmless cases such as read-only files in the data directory, and that's
3163  * not good either.
3164  *
3165  * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3166  * rewriting all changes again during recovery.
3167  *
3168  * Note we assume we're chdir'd into PGDATA to begin with.
3169  */
3170 void
3172 {
3173  bool xlog_is_symlink;
3174 
3175  /* We can skip this whole thing if fsync is disabled. */
3176  if (!enableFsync)
3177  return;
3178 
3179  /*
3180  * If pg_wal is a symlink, we'll need to recurse into it separately,
3181  * because the first walkdir below will ignore it.
3182  */
3183  xlog_is_symlink = false;
3184 
3185 #ifndef WIN32
3186  {
3187  struct stat st;
3188 
3189  if (lstat("pg_wal", &st) < 0)
3190  ereport(LOG,
3192  errmsg("could not stat file \"%s\": %m",
3193  "pg_wal")));
3194  else if (S_ISLNK(st.st_mode))
3195  xlog_is_symlink = true;
3196  }
3197 #else
3198  if (pgwin32_is_junction("pg_wal"))
3199  xlog_is_symlink = true;
3200 #endif
3201 
3202  /*
3203  * If possible, hint to the kernel that we're soon going to fsync the data
3204  * directory and its contents. Errors in this step are even less
3205  * interesting than normal, so log them only at DEBUG1.
3206  */
3207 #ifdef PG_FLUSH_DATA_WORKS
3208  walkdir(".", pre_sync_fname, false, DEBUG1);
3209  if (xlog_is_symlink)
3210  walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3211  walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3212 #endif
3213 
3214  /*
3215  * Now we do the fsync()s in the same order.
3216  *
3217  * The main call ignores symlinks, so in addition to specially processing
3218  * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3219  * process_symlinks = true. Note that if there are any plain directories
3220  * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3221  * so we don't worry about optimizing it.
3222  */
3223  walkdir(".", datadir_fsync_fname, false, LOG);
3224  if (xlog_is_symlink)
3225  walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3226  walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3227 }
3228 
3229 /*
3230  * walkdir: recursively walk a directory, applying the action to each
3231  * regular file and directory (including the named directory itself).
3232  *
3233  * If process_symlinks is true, the action and recursion are also applied
3234  * to regular files and directories that are pointed to by symlinks in the
3235  * given directory; otherwise symlinks are ignored. Symlinks are always
3236  * ignored in subdirectories, ie we intentionally don't pass down the
3237  * process_symlinks flag to recursive calls.
3238  *
3239  * Errors are reported at level elevel, which might be ERROR or less.
3240  *
3241  * See also walkdir in file_utils.c, which is a frontend version of this
3242  * logic.
3243  */
3244 static void
3245 walkdir(const char *path,
3246  void (*action) (const char *fname, bool isdir, int elevel),
3247  bool process_symlinks,
3248  int elevel)
3249 {
3250  DIR *dir;
3251  struct dirent *de;
3252 
3253  dir = AllocateDir(path);
3254 
3255  while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3256  {
3257  char subpath[MAXPGPATH * 2];
3258  struct stat fst;
3259  int sret;
3260 
3262 
3263  if (strcmp(de->d_name, ".") == 0 ||
3264  strcmp(de->d_name, "..") == 0)
3265  continue;
3266 
3267  snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3268 
3269  if (process_symlinks)
3270  sret = stat(subpath, &fst);
3271  else
3272  sret = lstat(subpath, &fst);
3273 
3274  if (sret < 0)
3275  {
3276  ereport(elevel,
3278  errmsg("could not stat file \"%s\": %m", subpath)));
3279  continue;
3280  }
3281 
3282  if (S_ISREG(fst.st_mode))
3283  (*action) (subpath, false, elevel);
3284  else if (S_ISDIR(fst.st_mode))
3285  walkdir(subpath, action, false, elevel);
3286  }
3287 
3288  FreeDir(dir); /* we ignore any error here */
3289 
3290  /*
3291  * It's important to fsync the destination directory itself as individual
3292  * file fsyncs don't guarantee that the directory entry for the file is
3293  * synced. However, skip this if AllocateDir failed; the action function
3294  * might not be robust against that.
3295  */
3296  if (dir)
3297  (*action) (path, true, elevel);
3298 }
3299 
3300 
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * fname is the path to flush, isdir tells whether it is a directory (in
 * which case nothing is done), and elevel is the level at which failures
 * are reported.
 *
 * Ignores errors trying to open unreadable files, and logs other errors at a
 * caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* An unreadable file is not worth complaining about here */
		if (errno == EACCES)
			return;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	if (CloseTransientFile(fd) != 0)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", fname)));
}

#endif							/* PG_FLUSH_DATA_WORKS */
3343 
/*
 * walkdir() action that fsyncs one file or directory through
 * fsync_fname_ext().  The result of fsync_fname_ext() is discarded.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * Errors about unreadable files should be ignored silently, so ask
	 * fsync_fname_ext() for that by passing ignore_perm = true.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3353 
/*
 * Remove the file or directory "fname".
 *
 * For a directory, rmdir() is used; a failure other than ENOENT is
 * reported at elevel.  For anything else the work is delegated to
 * PathNameDeleteTemporaryFile() with error_on_failure = false, so elevel
 * is not consulted on that path.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		/* A directory that is already gone is not an error */
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}
3370 
3371 /*
3372  * fsync_fname_ext -- Try to fsync a file or directory
3373  *
3374  * If ignore_perm is true, ignore errors upon trying to open unreadable
3375  * files. Logs other errors at a caller-specified level.
3376  *
3377  * Returns 0 if the operation succeeded, -1 otherwise.
3378  */
3379 static int
3380 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3381 {
3382  int fd;
3383  int flags;
3384  int returncode;
3385 
3386  /*
3387  * Some OSs require directories to be opened read-only whereas other
3388  * systems don't allow us to fsync files opened read-only; so we need both
3389  * cases here. Using O_RDWR will cause us to fail to fsync files that are
3390  * not writable by our userid, but we assume that's OK.
3391  */
3392  flags = PG_BINARY;
3393  if (!isdir)
3394  flags |= O_RDWR;
3395  else
3396  flags |= O_RDONLY;
3397 
3398  fd = OpenTransientFile(fname, flags);
3399 
3400  /*
3401  * Some OSs don't allow us to open directories at all (Windows returns
3402  * EACCES), just ignore the error in that case. If desired also silently
3403  * ignoring errors about unreadable files. Log others.
3404  */
3405  if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3406  return 0;
3407  else if (fd < 0 && ignore_perm && errno == EACCES)
3408  return 0;
3409  else if (fd < 0)
3410  {
3411  ereport(elevel,
3413  errmsg("could not open file \"%s\": %m", fname)));
3414  return -1;
3415  }
3416 
3417  returncode = pg_fsync(fd);
3418 
3419  /*
3420  * Some OSes don't allow us to fsync directories at all, so we can ignore
3421  * those errors. Anything else needs to be logged.
3422  */
3423  if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3424  {
3425  int save_errno;
3426 
3427  /* close file upon error, might not be in transaction context */
3428  save_errno = errno;
3429  (void) CloseTransientFile(fd);
3430  errno = save_errno;
3431 
3432  ereport(elevel,
3434  errmsg("could not fsync file \"%s\": %m", fname)));
3435  return -1;
3436  }
3437 
3438  if (CloseTransientFile(fd) != 0)
3439  {
3440  ereport(elevel,
3442  errmsg("could not close file \"%s\": %m", fname)));
3443  return -1;
3444  }
3445 
3446  return 0;
3447 }
3448 
3449 /*
3450  * fsync_parent_path -- fsync the parent path of a file or directory
3451  *
3452  * This is aimed at making file operations persistent on disk in case of
3453  * an OS crash or power failure.
3454  */
3455 static int
3456 fsync_parent_path(const char *fname, int elevel)
3457 {
3458  char parentpath[MAXPGPATH];
3459 
3460  strlcpy(parentpath, fname, MAXPGPATH);
3461  get_parent_directory(parentpath);
3462 
3463  /*
3464  * get_parent_directory() returns an empty string if the input argument is
3465  * just a file name (see comments in path.c), so handle that as being the
3466  * current directory.
3467  */
3468  if (strlen(parentpath) == 0)
3469  strlcpy(parentpath, ".", MAXPGPATH);
3470 
3471  if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3472  return -1;
3473 
3474  return 0;
3475 }
3476 
3477 /*
3478  * Create a PostgreSQL data sub-directory
3479  *
3480  * The data directory itself, and most of its sub-directories, are created at
3481  * initdb time, but we do have some occasions when we create directories in
3482  * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3483  * make sure that those directories are created consistently. Today, that means
3484  * making sure that the created directory has the correct permissions, which is
3485  * what pg_dir_create_mode tracks for us.
3486  *
3487  * Note that we also set the umask() based on what we understand the correct
3488  * permissions to be (see file_perm.c).
3489  *
3490  * For permissions other than the default, mkdir() can be used directly, but
3491  * be sure to consider carefully such cases -- a sub-directory with incorrect
3492  * permissions in a PostgreSQL data directory could cause backups and other
3493  * processes to fail.
3494  */
3495 int
3496 MakePGDirectory(const char *directoryName)
3497 {
3498  return mkdir(directoryName, pg_dir_create_mode);
3499 }
3500 
3501 /*
3502  * Return the passed-in error level, or PANIC if data_sync_retry is off.
3503  *
3504  * Failure to fsync any data file is cause for immediate panic, unless
3505  * data_sync_retry is enabled. Data may have been written to the operating
3506  * system and removed from our buffer pool already, and if we are running on
3507  * an operating system that forgets dirty data on write-back failure, there
3508  * may be only one copy of the data remaining: in the WAL. A later attempt to
3509  * fsync again might falsely report success. Therefore we must not allow any
3510  * further checkpoints to be attempted. data_sync_retry can in theory be
3511  * enabled on systems known not to drop dirty buffered data on write-back
3512  * failure (with the likely outcome that checkpoints will continue to fail
3513  * until the underlying problem is fixed).
3514  *
3515  * Any code that reports a failure from fsync() or related functions should
3516  * filter the error level with this function.
3517  */
3518 int
3519 data_sync_elevel(int elevel)
3520 {
3521  return data_sync_retry ? elevel : PANIC;
3522 }
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1371
File lruLessRecently
Definition: fd.c:183
void closeAllVfds(void)
Definition: fd.c:2680
static PgChecksumMode mode
Definition: pg_checksums.c:61
File nextFree
Definition: fd.c:181
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition: fd.c:854
int pg_file_create_mode
Definition: file_perm.c:19
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1702
#define MAP_FAILED
Definition: mem.h:45
#define DEBUG1
Definition: elog.h:25
int MyProcPid
Definition: globals.c:40
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1634
#define NUM_RESERVED_FDS
Definition: fd.c:118
static AllocateDesc * allocatedDescs
Definition: fd.c:244
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1358
int pg_fdatasync(int fd)
Definition: fd.c:420
static void error(void)
Definition: sql-dyntest.c:147
union AllocateDesc::@26 desc
#define SYNC_METHOD_FSYNC_WRITETHROUGH
Definition: xlog.h:28
AllocateDescKind
Definition: fd.c:222
DIR * dir
Definition: fd.c:237
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition: fd.c:1577
static void AtProcExit_Files(int code, Datum arg)
Definition: fd.c:2832
static Size SizeVfdCache
Definition: fd.c:197
#define FD_TEMP_FILE_LIMIT
Definition: fd.c:174
void on_proc_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:305
#define DO_DB(A)
Definition: fd.c:160
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2746
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition: fd.c:3245
long random(void)
Definition: random.c:22
ResourceOwner CurrentResourceOwner
Definition: resowner.c:142
int pg_fsync_writethrough(int fd)
Definition: fd.c:397
int forkname_chars(const char *str, ForkNumber *fork)
Definition: relpath.c:78
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2584
int max_safe_fds
Definition: fd.c:145
#define Min(x, y)
Definition: c.h:911
off_t FileSize(File file)
Definition: fd.c:2070
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:617
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:2301
#define FD_DELETE_AT_CLOSE
Definition: fd.c:172
int log_temp_files
Definition: guc.c:518
mode_t FileGetRawMode(File file)
Definition: fd.c:2158
void _dosmaperr(unsigned long)
Definition: win32error.c:171
static Vfd * VfdCache
Definition: fd.c:196
static void Delete(File file)
Definition: fd.c:1051
int closedir(DIR *)
Definition: dirent.c:113
static int numTempTableSpaces
Definition: fd.c:257
#define PG_TEMP_FILES_DIR
Definition: pg_checksums.c:58
int errcode(int sqlerrcode)
Definition: elog.c:608
#define MemSet(start, val, len)
Definition: c.h:962
void PathNameDeleteTemporaryDir(const char *dirname)
Definition: fd.c:1466
int pg_fsync_no_writethrough(int fd)
Definition: fd.c:385
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition: fd.c:3073
File PathNameOpenTemporaryFile(const char *path)
Definition: fd.c:1672
void pgstat_report_tempfile(size_t filesize)
Definition: pgstat.c:1564
static bool reserveAllocatedDesc(void)
Definition: fd.c:2169
uint32 SubTransactionId
Definition: c.h:518
#define SIGPIPE
Definition: win32_port.h:159
void TempTablespacePath(char *path, Oid tablespace)
Definition: fd.c:1552
#define LOG
Definition: elog.h:26
unsigned int Oid
Definition: postgres_ext.h:31
AllocateDescKind kind
Definition: fd.c:232
char * FilePathName(File file)
Definition: fd.c:2122
Definition: dirent.h:9
#define OidIsValid(objectId)
Definition: c.h:645
#define PANIC
Definition: elog.h:53
#define PG_BINARY
Definition: c.h:1222
static char * basedir
ssize_t pg_pwrite(int fd, const void *buf, size_t nbyte, off_t offset)
Definition: pwrite.c:27
void AtEOXact_Files(bool isCommit)
Definition: fd.c:2818
Oid MyDatabaseTableSpace
Definition: globals.c:87
int ClosePipeStream(FILE *file)
Definition: fd.c:2651
ssize_t pg_pread(int fd, void *buf, size_t nbyte, off_t offset)
Definition: pread.c:27
#define malloc(a)
Definition: header.h:50
static void LruDelete(File file)
Definition: fd.c:1070
void pg_usleep(long microsec)
Definition: signal.c:53
bool TempTablespacesAreSet(void)
Definition: fd.c:2734
#define fsync(fd)
Definition: win32_port.h:63
static int FreeDesc(AllocateDesc *desc)
Definition: fd.c:2402
void pfree(void *pointer)
Definition: mcxt.c:1056
mode_t fileMode
Definition: fd.c:188
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition: fd.c:3045
static bool ReleaseLruFile(void)
Definition: fd.c:1165
Definition: dirent.c:25
#define ERROR
Definition: elog.h:43
#define PG_TEMP_FILE_PREFIX
Definition: pg_checksums.c:59
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2292
static int LruInsert(File file)
Definition: fd.c:1118
#define FATAL
Definition: elog.h:52
static bool have_xact_temporary_files
Definition: fd.c:208
#define MAXPGPATH
DIR * opendir(const char *)
Definition: dirent.c:33
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2049
#define DEBUG2
Definition: elog.h:24
#define TABLESPACE_VERSION_DIRECTORY
Definition: relpath.h:26
char * fileName
Definition: fd.c:185
static char * buf
Definition: pg_test_fsync.c:67
Oid GetNextTempTableSpace(void)
Definition: fd.c:2764
void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition: resowner.c:1244
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3355
int errdetail(const char *fmt,...)
Definition: elog.c:955
void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition: fd.c:2979
char * tablespace
Definition: pgbench.c:189
int errcode_for_file_access(void)
Definition: elog.c:631
void get_parent_directory(char *path)
Definition: path.c:854
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2242
static int nfile
Definition: fd.c:202
unsigned int uint32
Definition: c.h:359
void SyncDataDirectory(void)
Definition: fd.c:3171
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2503
static int nextTempTableSpace
Definition: fd.c:258
static void pgstat_report_wait_end(void)
Definition: pgstat.h:1342
int max_files_per_process
Definition: fd.c:132
static File AllocateVfd(void)
Definition: fd.c:1197
FILE * OpenPipeStream(const char *command, const char *mode)
Definition: fd.c:2345
unsigned short fdstate
Definition: fd.c:179
Definition: fd.c:176
off_t fileSize
Definition: fd.c:184
int fd
Definition: fd.c:178
#define ereport(elevel, rest)
Definition: elog.h:141
int link(const char *fromname, const char *toname)
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2706
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:643
static void Insert(File file)
Definition: fd.c:1096
ResourceOwner resowner
Definition: fd.c:180
bool data_sync_retry
Definition: fd.c:148
#define S_ISREG(m)
Definition: win32_port.h:299
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3345
int CloseTransientFile(int fd)
Definition: fd.c:2469
#define SIG_IGN
Definition: win32_port.h:151
static void ReportTemporaryFileUsage(const char *path, off_t size)
Definition: fd.c:1311
static void ReleaseLruFiles(void)
Definition: fd.c:1187
#define WARNING
Definition: elog.h:40
#define stat(a, b)
Definition: win32_port.h:255
#define FileIsNotOpen(file)
Definition: fd.c:169
int pg_dir_create_mode
Definition: file_perm.c:18
static int elevel
Definition: vacuumlazy.c:143
int FileWrite(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:1951
struct vfd Vfd
int data_sync_elevel(int elevel)
Definition: fd.c:3519
uintptr_t Datum
Definition: postgres.h:367
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition: fd.c:2785
unsigned int Index
Definition: c.h:476
void pg_flush_data(int fd, off_t offset, off_t nbytes)
Definition: fd.c:440
#define FileIsValid(file)
Definition: fd.c:166
FILE * file
Definition: fd.c:236
#define InvalidOid
Definition: postgres_ext.h:36
#define VFD_CLOSED
Definition: fd.c:164
static uint64 temporary_files_size
Definition: fd.c:216
int MakePGDirectory(const char *directoryName)
Definition: fd.c:3496
pqsigfunc pqsignal(int signum, pqsigfunc handler)
Definition: signal.c:170
#define free(a)
Definition: header.h:65
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static void RegisterTemporaryFile(File file)
Definition: fd.c:1330
void FileClose(File file)
Definition: fd.c:1748
#define SIG_DFL
Definition: win32_port.h:149
int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
Definition: fd.c:1844
static int FileAccess(File file)
Definition: fd.c:1275
#define Assert(condition)
Definition: c.h:739
SubTransactionId GetCurrentSubTransactionId(void)
Definition: xact.c:707
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2569
File lruMoreRecently
Definition: fd.c:182
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:1872
void RemovePgTempFiles(void)
Definition: fd.c:2920
SubTransactionId create_subid
Definition: fd.c:233
File OpenTemporaryFile(bool interXact)
Definition: fd.c:1499
int durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:769
size_t Size
Definition: c.h:467
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: pgstat.h:1318
static const char * directory
Definition: zic.c:622
int sync_method
Definition: xlog.c:102
struct dirent * readdir(DIR *)
Definition: dirent.c:77
#define FD_MINFREE
Definition: fd.c:124
bool looks_like_temp_rel_name(const char *name)
Definition: fd.c:3101
#define realloc(a, b)
Definition: header.h:60
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1069
#define INT64_FORMAT
Definition: c.h:401
const char * name
Definition: encode.c:521
static long tempFileCounter
Definition: fd.c:250
int fd
Definition: fd.c:238
#define S_ISDIR(m)
Definition: win32_port.h:296
#define lstat(path, sb)
Definition: win32_port.h:244
int durable_unlink(const char *fname, int elevel)
Definition: fd.c:733
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:981
int FreeFile(FILE *file)
Definition: fd.c:2441
void set_max_safe_fds(void)
Definition: fd.c:938
bool enableFsync
Definition: globals.c:119
static Oid * tempTableSpaces
Definition: fd.c:256
void * palloc(Size size)
Definition: mcxt.c:949
int errmsg(const char *fmt,...)
Definition: elog.c:822
int FileGetRawFlags(File file)
Definition: fd.c:2148
void ResourceOwnerEnlargeFiles(ResourceOwner owner)
Definition: resowner.c:1233
static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition: fd.c:3380
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1003
#define elog(elevel,...)
Definition: elog.h:228
int i
#define FD_CLOSE_AT_EOXACT
Definition: fd.c:173
void * arg
int FileGetRawDesc(File file)
Definition: fd.c:2138
static void FreeVfd(File file)
Definition: fd.c:1255
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:99
int pg_fsync(int fd)
Definition: fd.c:330
char d_name[MAX_PATH]
Definition: dirent.h:14
#define mkdir(a, b)
Definition: win32_port.h:58
#define close(a)
Definition: win32.h:12
#define EINTR
Definition: win32_port.h:323
int fileFlags
Definition: fd.c:187
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition: fd.c:1435
int FileRead(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info)
Definition: fd.c:1895
void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition: resowner.c:1253
#define snprintf
Definition: port.h:192
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2087
static int maxAllocatedDescs
Definition: fd.c:243
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition: fd.c:2850
static int fsync_parent_path(const char *fname, int elevel)
Definition: fd.c:3456
int File
Definition: fd.h:45
int FreeDir(DIR *dir)
Definition: fd.c:2621
int temp_file_limit
Definition: guc.c:525
Datum subpath(PG_FUNCTION_ARGS)
Definition: ltree_op.c:241
void InitFileAccess(void)
Definition: fd.c:821
static int numAllocatedDescs
Definition: fd.c:242
bool pgwin32_is_junction(const char *path)
#define ftruncate(a, b)
Definition: win32_port.h:60