PostgreSQL Source Code  git master
fd.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * fd.c
4  * Virtual file descriptor code.
5  *
6  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/storage/file/fd.c
11  *
12  * NOTES:
13  *
14  * This code manages a cache of 'virtual' file descriptors (VFDs).
15  * The server opens many file descriptors for a variety of reasons,
16  * including base tables, scratch files (e.g., sort and hash spool
17  * files), and random calls to C library routines like system(3); it
18  * is quite easy to exceed system limits on the number of open files a
19  * single process can have. (This is around 256 on many modern
20  * operating systems, but can be as low as 32 on others.)
21  *
22  * VFDs are managed as an LRU pool, with actual OS file descriptors
23  * being opened and closed as needed. Obviously, if a routine is
24  * opened using these interfaces, all subsequent operations must also
25  * be through these interfaces (the File type is not a real file
26  * descriptor).
27  *
28  * For this scheme to work, most (if not all) routines throughout the
29  * server should use these interfaces instead of calling the C library
30  * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
31  * may find ourselves short of real file descriptors anyway.
32  *
33  * INTERFACE ROUTINES
34  *
35  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
36  * A File opened with OpenTemporaryFile is automatically deleted when the
37  * File is closed, either explicitly or implicitly at end of transaction or
38  * process exit. PathNameOpenFile is intended for files that are held open
39  * for a long time, like relation files. It is the caller's responsibility
40  * to close them; there is no automatic mechanism in fd.c for that.
41  *
42  * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
43  * temporary files that have names so that they can be shared between
44  * backends. Such files are automatically closed and count against the
45  * temporary file limit of the backend that creates them, but unlike anonymous
46  * files they are not automatically deleted. See sharedfileset.c for a shared
47  * ownership mechanism that provides automatic cleanup for shared files when
48  * the last of a group of backends detaches.
49  *
50  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
51  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
52  * They behave like the corresponding native functions, except that the handle
53  * is registered with the current subtransaction, and will be automatically
54  * closed at abort. These are intended mainly for short operations like
55  * reading a configuration file; there is a limit on the number of files that
56  * can be opened using these functions at any one time.
57  *
58  * Finally, BasicOpenFile is just a thin wrapper around open() that can
59  * release file descriptors in use by the virtual file descriptors if
60  * necessary. There is no automatic cleanup of file descriptors returned by
61  * BasicOpenFile; it is solely the caller's responsibility to close the file
62  * descriptor by calling close(2).
63  *
64  *-------------------------------------------------------------------------
65  */
66 
67 #include "postgres.h"
68 
69 #include <sys/file.h>
70 #include <sys/param.h>
71 #include <sys/stat.h>
72 #ifndef WIN32
73 #include <sys/mman.h>
74 #endif
75 #include <limits.h>
76 #include <unistd.h>
77 #include <fcntl.h>
78 #ifdef HAVE_SYS_RESOURCE_H
79 #include <sys/resource.h> /* for getrlimit */
80 #endif
81 
82 #include "miscadmin.h"
83 #include "access/xact.h"
84 #include "access/xlog.h"
85 #include "catalog/pg_tablespace.h"
86 #include "common/file_perm.h"
87 #include "pgstat.h"
88 #include "portability/mem.h"
89 #include "storage/fd.h"
90 #include "storage/ipc.h"
91 #include "utils/guc.h"
92 #include "utils/resowner_private.h"
93 
94 
95 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
96 #if defined(HAVE_SYNC_FILE_RANGE)
97 #define PG_FLUSH_DATA_WORKS 1
98 #elif !defined(WIN32) && defined(MS_ASYNC)
99 #define PG_FLUSH_DATA_WORKS 1
100 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
101 #define PG_FLUSH_DATA_WORKS 1
102 #endif
103 
104 /*
105  * We must leave some file descriptors free for system(), the dynamic loader,
106  * and other code that tries to open files without consulting fd.c. This
107  * is the number left free. (While we can be pretty sure we won't get
108  * EMFILE, there's never any guarantee that we won't get ENFILE due to
109  * other processes chewing up FDs. So it's a bad idea to try to open files
110  * without consulting fd.c. Nonetheless we cannot control all code.)
111  *
112  * Because this is just a fixed setting, we are effectively assuming that
113  * no such code will leave FDs open over the long term; otherwise the slop
114  * is likely to be insufficient. Note in particular that we expect that
115  * loading a shared library does not result in any permanent increase in
116  * the number of open files. (This appears to be true on most if not
117  * all platforms as of Feb 2004.)
118  */
119 #define NUM_RESERVED_FDS 10
120 
121 /*
122  * If we have fewer than this many usable FDs after allowing for the reserved
123  * ones, choke.
124  */
125 #define FD_MINFREE 10
126 
127 /*
128  * A number of platforms allow individual processes to open many more files
129  * than they can really support when *many* processes do the same thing.
130  * This GUC parameter lets the DBA limit max_safe_fds to something less than
131  * what the postmaster's initial probe suggests will work.
132  */
134 
135 /*
136  * Maximum number of file descriptors to open for either VFD entries or
137  * AllocateFile/AllocateDir/OpenTransientFile operations. This is initialized
138  * to a conservative value, and remains that way indefinitely in bootstrap or
139  * standalone-backend cases. In normal postmaster operation, the postmaster
140  * calls set_max_safe_fds() late in initialization to update the value, and
141  * that value is then inherited by forked subprocesses.
142  *
143  * Note: the value of max_files_per_process is taken into account while
144  * setting this variable, and so need not be tested separately.
145  */
146 int max_safe_fds = 32; /* default if not changed */
147 
148 
149 /* Debugging.... */
150 
151 #ifdef FDDEBUG
152 #define DO_DB(A) \
153  do { \
154  int _do_db_save_errno = errno; \
155  A; \
156  errno = _do_db_save_errno; \
157  } while (0)
158 #else
159 #define DO_DB(A) \
160  ((void) 0)
161 #endif
162 
163 #define VFD_CLOSED (-1)
164 
165 #define FileIsValid(file) \
166  ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
167 
168 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
169 
170 /*
171  * Note: a VFD's seekPos is normally always valid, but if for some reason
172  * an lseek() fails, it might become set to FileUnknownPos. We can struggle
173  * along without knowing the seek position in many cases, but in some places
174  * we have to fail if we don't have it.
175  */
176 #define FileUnknownPos ((off_t) -1)
177 #define FilePosIsUnknown(pos) ((pos) < 0)
178 
179 /* these are the assigned bits in fdstate below: */
180 #define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
181 #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
182 #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
183 
184 typedef struct vfd
185 {
186  int fd; /* current FD, or VFD_CLOSED if none */
187  unsigned short fdstate; /* bitflags for VFD's state */
188  ResourceOwner resowner; /* owner, for automatic cleanup */
189  File nextFree; /* link to next free VFD, if in freelist */
190  File lruMoreRecently; /* doubly linked recency-of-use list */
192  off_t seekPos; /* current logical file position, or -1 */
193  off_t fileSize; /* current size of file (0 if not temporary) */
194  char *fileName; /* name of file, or NULL for unused VFD */
195  /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
196  int fileFlags; /* open(2) flags for (re)opening the file */
197  mode_t fileMode; /* mode to pass to open(2) */
198 } Vfd;
199 
200 /*
201  * Virtual File Descriptor array pointer and size. This grows as
202  * needed. 'File' values are indexes into this array.
203  * Note that VfdCache[0] is not a usable VFD, just a list header.
204  */
205 static Vfd *VfdCache;
206 static Size SizeVfdCache = 0;
207 
208 /*
209  * Number of file descriptors known to be in use by VFD entries.
210  */
211 static int nfile = 0;
212 
213 /*
214  * Flag to tell whether it's worth scanning VfdCache looking for temp files
215  * to close
216  */
217 static bool have_xact_temporary_files = false;
218 
219 /*
220  * Tracks the total size of all temporary files. Note: when temp_file_limit
221  * is being enforced, this cannot overflow since the limit cannot be more
222  * than INT_MAX kilobytes. When not enforcing, it could theoretically
223  * overflow, but we don't care.
224  */
225 static uint64 temporary_files_size = 0;
226 
227 /*
228  * List of OS handles opened with AllocateFile, AllocateDir and
229  * OpenTransientFile.
230  */
231 typedef enum
232 {
238 
239 typedef struct
240 {
243  union
244  {
245  FILE *file;
247  int fd;
248  } desc;
249 } AllocateDesc;
250 
251 static int numAllocatedDescs = 0;
252 static int maxAllocatedDescs = 0;
254 
255 /*
256  * Number of temporary files opened during the current session;
257  * this is used in generation of tempfile names.
258  */
259 static long tempFileCounter = 0;
260 
261 /*
262  * Array of OIDs of temp tablespaces. When numTempTableSpaces is -1,
263  * this has not been set in the current transaction.
264  */
265 static Oid *tempTableSpaces = NULL;
266 static int numTempTableSpaces = -1;
267 static int nextTempTableSpace = 0;
268 
269 
270 /*--------------------
271  *
272  * Private Routines
273  *
274  * Delete - delete a file from the Lru ring
275  * LruDelete - remove a file from the Lru ring and close its FD
276  * Insert - put a file at the front of the Lru ring
277  * LruInsert - put a file at the front of the Lru ring and open it
278  * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
279  * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
280  * AllocateVfd - grab a free (or new) file record (from VfdArray)
281  * FreeVfd - free a file record
282  *
283  * The Least Recently Used ring is a doubly linked list that begins and
284  * ends on element zero. Element zero is special -- it doesn't represent
285  * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
286  * anchor that shows us the beginning/end of the ring.
287  * Only VFD elements that are currently really open (have an FD assigned) are
288  * in the Lru ring. Elements that are "virtually" open can be recognized
289  * by having a non-null fileName field.
290  *
291  * example:
292  *
293  * /--less----\ /---------\
294  * v \ v \
295  * #0 --more---> LeastRecentlyUsed --more-\ \
296  * ^\ | |
297  * \\less--> MostRecentlyUsedFile <---/ |
298  * \more---/ \--less--/
299  *
300  *--------------------
301  */
302 static void Delete(File file);
303 static void LruDelete(File file);
304 static void Insert(File file);
305 static int LruInsert(File file);
306 static bool ReleaseLruFile(void);
307 static void ReleaseLruFiles(void);
308 static File AllocateVfd(void);
309 static void FreeVfd(File file);
310 
311 static int FileAccess(File file);
312 static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
313 static bool reserveAllocatedDesc(void);
314 static int FreeDesc(AllocateDesc *desc);
315 
316 static void AtProcExit_Files(int code, Datum arg);
317 static void CleanupTempFiles(bool isCommit, bool isProcExit);
318 static void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok,
319  bool unlink_all);
320 static void RemovePgTempRelationFiles(const char *tsdirname);
321 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
322 
323 static void walkdir(const char *path,
324  void (*action) (const char *fname, bool isdir, int elevel),
325  bool process_symlinks,
326  int elevel);
327 #ifdef PG_FLUSH_DATA_WORKS
328 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
329 #endif
330 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
331 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
332 
333 static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
334 static int fsync_parent_path(const char *fname, int elevel);
335 
336 
/*
 * pg_fsync --- do fsync with or without writethrough
 *
 * Chooses between pg_fsync_writethrough() and pg_fsync_no_writethrough()
 * and returns whatever the chosen routine returns.  On builds where
 * write-through is unavailable, or is the same thing as plain fsync, the
 * choice is made at compile time.
 */
int
pg_fsync(int fd)
{
    /* #if is to skip the sync_method test if there's no need for it */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
    if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
        return pg_fsync_writethrough(fd);
    else
#endif
        return pg_fsync_no_writethrough(fd);
}
351 
352 
353 /*
354  * pg_fsync_no_writethrough --- same as fsync except does nothing if
355  * enableFsync is off
356  */
357 int
359 {
360  if (enableFsync)
361  return fsync(fd);
362  else
363  return 0;
364 }
365 
366 /*
367  * pg_fsync_writethrough
368  */
369 int
371 {
372  if (enableFsync)
373  {
374 #ifdef WIN32
375  return _commit(fd);
376 #elif defined(F_FULLFSYNC)
377  return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
378 #else
379  errno = ENOSYS;
380  return -1;
381 #endif
382  }
383  else
384  return 0;
385 }
386 
387 /*
388  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
389  *
390  * Not all platforms have fdatasync; treat as fsync if not available.
391  */
392 int
394 {
395  if (enableFsync)
396  {
397 #ifdef HAVE_FDATASYNC
398  return fdatasync(fd);
399 #else
400  return fsync(fd);
401 #endif
402  }
403  else
404  return 0;
405 }
406 
407 /*
408  * pg_flush_data --- advise OS that the described dirty data should be flushed
409  *
410  * offset of 0 with nbytes 0 means that the entire file should be flushed;
411  * in this case, this function may have side-effects on the file's
412  * seek position!
413  */
414 void
415 pg_flush_data(int fd, off_t offset, off_t nbytes)
416 {
417  /*
418  * Right now file flushing is primarily used to avoid making later
419  * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
420  * if fsyncs are disabled - that's a decision we might want to make
421  * configurable at some point.
422  */
423  if (!enableFsync)
424  return;
425 
426  /*
427  * We compile all alternatives that are supported on the current platform,
428  * to find portability problems more easily.
429  */
430 #if defined(HAVE_SYNC_FILE_RANGE)
431  {
432  int rc;
433 
434  /*
435  * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
436  * tells the OS that writeback for the specified blocks should be
437  * started, but that we don't want to wait for completion. Note that
438  * this call might block if too much dirty data exists in the range.
439  * This is the preferable method on OSs supporting it, as it works
440  * reliably when available (contrast to msync()) and doesn't flush out
441  * clean data (like FADV_DONTNEED).
442  */
443  rc = sync_file_range(fd, offset, nbytes,
444  SYNC_FILE_RANGE_WRITE);
445 
446  /* don't error out, this is just a performance optimization */
447  if (rc != 0)
448  {
451  errmsg("could not flush dirty data: %m")));
452  }
453 
454  return;
455  }
456 #endif
457 #if !defined(WIN32) && defined(MS_ASYNC)
458  {
459  void *p;
460  static int pagesize = 0;
461 
462  /*
463  * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
464  * writeback. On linux it only does so if MS_SYNC is specified, but
465  * then it does the writeback synchronously. Luckily all common linux
466  * systems have sync_file_range(). This is preferable over
467  * FADV_DONTNEED because it doesn't flush out clean data.
468  *
469  * We map the file (mmap()), tell the kernel to sync back the contents
470  * (msync()), and then remove the mapping again (munmap()).
471  */
472 
473  /* mmap() needs actual length if we want to map whole file */
474  if (offset == 0 && nbytes == 0)
475  {
476  nbytes = lseek(fd, 0, SEEK_END);
477  if (nbytes < 0)
478  {
481  errmsg("could not determine dirty data size: %m")));
482  return;
483  }
484  }
485 
486  /*
487  * Some platforms reject partial-page mmap() attempts. To deal with
488  * that, just truncate the request to a page boundary. If any extra
489  * bytes don't get flushed, well, it's only a hint anyway.
490  */
491 
492  /* fetch pagesize only once */
493  if (pagesize == 0)
494  pagesize = sysconf(_SC_PAGESIZE);
495 
496  /* align length to pagesize, dropping any fractional page */
497  if (pagesize > 0)
498  nbytes = (nbytes / pagesize) * pagesize;
499 
500  /* fractional-page request is a no-op */
501  if (nbytes <= 0)
502  return;
503 
504  /*
505  * mmap could well fail, particularly on 32-bit platforms where there
506  * may simply not be enough address space. If so, silently fall
507  * through to the next implementation.
508  */
509  if (nbytes <= (off_t) SSIZE_MAX)
510  p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
511  else
512  p = MAP_FAILED;
513 
514  if (p != MAP_FAILED)
515  {
516  int rc;
517 
518  rc = msync(p, (size_t) nbytes, MS_ASYNC);
519  if (rc != 0)
520  {
523  errmsg("could not flush dirty data: %m")));
524  /* NB: need to fall through to munmap()! */
525  }
526 
527  rc = munmap(p, (size_t) nbytes);
528  if (rc != 0)
529  {
530  /* FATAL error because mapping would remain */
531  ereport(FATAL,
533  errmsg("could not munmap() while flushing data: %m")));
534  }
535 
536  return;
537  }
538  }
539 #endif
540 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
541  {
542  int rc;
543 
544  /*
545  * Signal the kernel that the passed in range should not be cached
546  * anymore. This has the, desired, side effect of writing out dirty
547  * data, and the, undesired, side effect of likely discarding useful
548  * clean cached blocks. For the latter reason this is the least
549  * preferable method.
550  */
551 
552  rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
553 
554  if (rc != 0)
555  {
556  /* don't error out, this is just a performance optimization */
559  errmsg("could not flush dirty data: %m")));
560  }
561 
562  return;
563  }
564 #endif
565 }
566 
567 
568 /*
569  * fsync_fname -- fsync a file or directory, handling errors properly
570  *
571  * Try to fsync a file or directory. When doing the latter, ignore errors that
572  * indicate the OS just doesn't allow/require fsyncing directories.
573  */
574 void
575 fsync_fname(const char *fname, bool isdir)
576 {
577  fsync_fname_ext(fname, isdir, false, ERROR);
578 }
579 
580 /*
581  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
582  *
583  * This routine ensures that, after returning, the effect of renaming file
584  * persists in case of a crash. A crash while this routine is running will
585  * leave you with either the pre-existing or the moved file in place of the
586  * new file; no mixed state or truncated files are possible.
587  *
588  * It does so by using fsync on the old filename and the possibly existing
589  * target filename before the rename, and the target file and directory after.
590  *
591  * Note that rename() cannot be used across arbitrary directories, as they
592  * might not be on the same filesystem. Therefore this routine does not
593  * support renaming across directories.
594  *
595  * Log errors with the caller specified severity.
596  *
597  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
598  * valid upon return.
599  */
600 int
601 durable_rename(const char *oldfile, const char *newfile, int elevel)
602 {
603  int fd;
604 
605  /*
606  * First fsync the old and target path (if it exists), to ensure that they
607  * are properly persistent on disk. Syncing the target file is not
608  * strictly necessary, but it makes it easier to reason about crashes;
609  * because it's then guaranteed that either source or target file exists
610  * after a crash.
611  */
612  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
613  return -1;
614 
615  fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
616  if (fd < 0)
617  {
618  if (errno != ENOENT)
619  {
620  ereport(elevel,
622  errmsg("could not open file \"%s\": %m", newfile)));
623  return -1;
624  }
625  }
626  else
627  {
628  if (pg_fsync(fd) != 0)
629  {
630  int save_errno;
631 
632  /* close file upon error, might not be in transaction context */
633  save_errno = errno;
634  CloseTransientFile(fd);
635  errno = save_errno;
636 
637  ereport(elevel,
639  errmsg("could not fsync file \"%s\": %m", newfile)));
640  return -1;
641  }
642  CloseTransientFile(fd);
643  }
644 
645  /* Time to do the real deal... */
646  if (rename(oldfile, newfile) < 0)
647  {
648  ereport(elevel,
650  errmsg("could not rename file \"%s\" to \"%s\": %m",
651  oldfile, newfile)));
652  return -1;
653  }
654 
655  /*
656  * To guarantee renaming the file is persistent, fsync the file with its
657  * new name, and its containing directory.
658  */
659  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
660  return -1;
661 
662  if (fsync_parent_path(newfile, elevel) != 0)
663  return -1;
664 
665  return 0;
666 }
667 
/*
 * durable_unlink -- remove a file in a durable manner
 *
 * This routine ensures that, after returning, the effect of removing file
 * persists in case of a crash.  A crash while this routine is running will
 * leave the system in no mixed state.
 *
 * It does so by using fsync on the parent directory of the file after the
 * actual removal is done.
 *
 * Log errors with the severity specified by caller (elevel).
 *
 * Returns 0 if the operation succeeded, -1 otherwise.  Note that errno is not
 * valid upon return.
 */
int
durable_unlink(const char *fname, int elevel)
{
    if (unlink(fname) < 0)
    {
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not remove file \"%s\": %m",
                        fname)));
        return -1;
    }

    /*
     * To guarantee that the removal of the file is persistent, fsync its
     * parent directory.
     */
    if (fsync_parent_path(fname, elevel) != 0)
        return -1;

    return 0;
}
704 
705 /*
706  * durable_link_or_rename -- rename a file in a durable manner.
707  *
708  * Similar to durable_rename(), except that this routine tries (but does not
709  * guarantee) not to overwrite the target file.
710  *
711  * Note that a crash in an unfortunate moment can leave you with two links to
712  * the target file.
713  *
714  * Log errors with the caller specified severity.
715  *
716  * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
717  * valid upon return.
718  */
719 int
720 durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
721 {
722  /*
723  * Ensure that, if we crash directly after the rename/link, a file with
724  * valid contents is moved into place.
725  */
726  if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
727  return -1;
728 
729 #if HAVE_WORKING_LINK
730  if (link(oldfile, newfile) < 0)
731  {
732  ereport(elevel,
734  errmsg("could not link file \"%s\" to \"%s\": %m",
735  oldfile, newfile)));
736  return -1;
737  }
738  unlink(oldfile);
739 #else
740  /* XXX: Add racy file existence check? */
741  if (rename(oldfile, newfile) < 0)
742  {
743  ereport(elevel,
745  errmsg("could not rename file \"%s\" to \"%s\": %m",
746  oldfile, newfile)));
747  return -1;
748  }
749 #endif
750 
751  /*
752  * Make change persistent in case of an OS crash, both the new entry and
753  * its parent directory need to be flushed.
754  */
755  if (fsync_fname_ext(newfile, false, false, elevel) != 0)
756  return -1;
757 
758  /* Same for parent directory */
759  if (fsync_parent_path(newfile, elevel) != 0)
760  return -1;
761 
762  return 0;
763 }
764 
765 /*
766  * InitFileAccess --- initialize this module during backend startup
767  *
768  * This is called during either normal or standalone backend start.
769  * It is *not* called in the postmaster.
770  */
771 void
773 {
774  Assert(SizeVfdCache == 0); /* call me only once */
775 
776  /* initialize cache header entry */
777  VfdCache = (Vfd *) malloc(sizeof(Vfd));
778  if (VfdCache == NULL)
779  ereport(FATAL,
780  (errcode(ERRCODE_OUT_OF_MEMORY),
781  errmsg("out of memory")));
782 
783  MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
784  VfdCache->fd = VFD_CLOSED;
785 
786  SizeVfdCache = 1;
787 
788  /* register proc-exit hook to ensure temp files are dropped at exit */
790 }
791 
792 /*
793  * count_usable_fds --- count how many FDs the system will let us open,
794  * and estimate how many are already open.
795  *
796  * We stop counting if usable_fds reaches max_to_probe. Note: a small
797  * value of max_to_probe might result in an underestimate of already_open;
798  * we must fill in any "gaps" in the set of used FDs before the calculation
799  * of already_open will give the right answer. In practice, max_to_probe
800  * of a couple of dozen should be enough to ensure good results.
801  *
802  * We assume stdin (FD 0) is available for dup'ing
803  */
804 static void
805 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
806 {
807  int *fd;
808  int size;
809  int used = 0;
810  int highestfd = 0;
811  int j;
812 
813 #ifdef HAVE_GETRLIMIT
814  struct rlimit rlim;
815  int getrlimit_status;
816 #endif
817 
818  size = 1024;
819  fd = (int *) palloc(size * sizeof(int));
820 
821 #ifdef HAVE_GETRLIMIT
822 #ifdef RLIMIT_NOFILE /* most platforms use RLIMIT_NOFILE */
823  getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
824 #else /* but BSD doesn't ... */
825  getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
826 #endif /* RLIMIT_NOFILE */
827  if (getrlimit_status != 0)
828  ereport(WARNING, (errmsg("getrlimit failed: %m")));
829 #endif /* HAVE_GETRLIMIT */
830 
831  /* dup until failure or probe limit reached */
832  for (;;)
833  {
834  int thisfd;
835 
836 #ifdef HAVE_GETRLIMIT
837 
838  /*
839  * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
840  * some platforms
841  */
842  if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
843  break;
844 #endif
845 
846  thisfd = dup(0);
847  if (thisfd < 0)
848  {
849  /* Expect EMFILE or ENFILE, else it's fishy */
850  if (errno != EMFILE && errno != ENFILE)
851  elog(WARNING, "dup(0) failed after %d successes: %m", used);
852  break;
853  }
854 
855  if (used >= size)
856  {
857  size *= 2;
858  fd = (int *) repalloc(fd, size * sizeof(int));
859  }
860  fd[used++] = thisfd;
861 
862  if (highestfd < thisfd)
863  highestfd = thisfd;
864 
865  if (used >= max_to_probe)
866  break;
867  }
868 
869  /* release the files we opened */
870  for (j = 0; j < used; j++)
871  close(fd[j]);
872 
873  pfree(fd);
874 
875  /*
876  * Return results. usable_fds is just the number of successful dups. We
877  * assume that the system limit is highestfd+1 (remember 0 is a legal FD
878  * number) and so already_open is highestfd+1 - usable_fds.
879  */
880  *usable_fds = used;
881  *already_open = highestfd + 1 - used;
882 }
883 
884 /*
885  * set_max_safe_fds
886  * Determine number of filedescriptors that fd.c is allowed to use
887  */
888 void
890 {
891  int usable_fds;
892  int already_open;
893 
894  /*----------
895  * We want to set max_safe_fds to
896  * MIN(usable_fds, max_files_per_process - already_open)
897  * less the slop factor for files that are opened without consulting
898  * fd.c. This ensures that we won't exceed either max_files_per_process
899  * or the experimentally-determined EMFILE limit.
900  *----------
901  */
903  &usable_fds, &already_open);
904 
905  max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
906 
907  /*
908  * Take off the FDs reserved for system() etc.
909  */
911 
912  /*
913  * Make sure we still have enough to get by.
914  */
915  if (max_safe_fds < FD_MINFREE)
916  ereport(FATAL,
917  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
918  errmsg("insufficient file descriptors available to start server process"),
919  errdetail("System allows %d, we need at least %d.",
922 
923  elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
924  max_safe_fds, usable_fds, already_open);
925 }
926 
927 /*
928  * Open a file with BasicOpenFilePerm() and pass default file mode for the
929  * fileMode parameter.
930  */
931 int
933 {
934  return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
935 }
936 
937 /*
938  * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
939  *
940  * This is exported for use by places that really want a plain kernel FD,
941  * but need to be proof against running out of FDs. Once an FD has been
942  * successfully returned, it is the caller's responsibility to ensure that
943  * it will not be leaked on ereport()! Most users should *not* call this
944  * routine directly, but instead use the VFD abstraction level, which
945  * provides protection against descriptor leaks as well as management of
946  * files that need to be open for more than a short period of time.
947  *
948  * Ideally this should be the *only* direct call of open() in the backend.
949  * In practice, the postmaster calls open() directly, and there are some
950  * direct open() calls done early in backend startup. Those are OK since
951  * this module wouldn't have any open files to close at that point anyway.
952  */
953 int
954 BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
955 {
956  int fd;
957 
958 tryAgain:
959  fd = open(fileName, fileFlags, fileMode);
960 
961  if (fd >= 0)
962  return fd; /* success! */
963 
964  if (errno == EMFILE || errno == ENFILE)
965  {
966  int save_errno = errno;
967 
968  ereport(LOG,
969  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
970  errmsg("out of file descriptors: %m; release and retry")));
971  errno = 0;
972  if (ReleaseLruFile())
973  goto tryAgain;
974  errno = save_errno;
975  }
976 
977  return -1; /* failure */
978 }
979 
980 #if defined(FDDEBUG)
981 
982 static void
983 _dump_lru(void)
984 {
985  int mru = VfdCache[0].lruLessRecently;
986  Vfd *vfdP = &VfdCache[mru];
987  char buf[2048];
988 
989  snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
990  while (mru != 0)
991  {
992  mru = vfdP->lruLessRecently;
993  vfdP = &VfdCache[mru];
994  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
995  }
996  snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
997  elog(LOG, "%s", buf);
998 }
999 #endif /* FDDEBUG */
1000 
1001 static void
1003 {
1004  Vfd *vfdP;
1005 
1006  Assert(file != 0);
1007 
1008  DO_DB(elog(LOG, "Delete %d (%s)",
1009  file, VfdCache[file].fileName));
1010  DO_DB(_dump_lru());
1011 
1012  vfdP = &VfdCache[file];
1013 
1014  VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1015  VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1016 
1017  DO_DB(_dump_lru());
1018 }
1019 
/*
 * Close the kernel descriptor held by VFD 'file' and take the VFD off the LRU
 * ring. The seek position is captured first if it is not already known.
 * Failures of lseek() or close() are only logged; the VFD is marked
 * VFD_CLOSED and nfile is decremented regardless.
 *
 * NOTE(review): listing line 1021 (the declarator -- presumably
 * "LruDelete(File file)", per the debug message) is missing from this copy;
 * confirm against the real file.
 */
1020 static void
1022 {
1023  Vfd *vfdP;
1024 
1025  Assert(file != 0);
1026 
1027  DO_DB(elog(LOG, "LruDelete %d (%s)",
1028  file, VfdCache[file].fileName));
1029 
1030  vfdP = &VfdCache[file];
1031 
1032  /*
1033  * Normally we should know the seek position, but if for some reason we
1034  * have lost track of it, try again to get it. If we still can't get it,
1035  * we have a problem: we will be unable to restore the file seek position
1036  * when and if the file is re-opened. But we can't really throw an error
1037  * and refuse to close the file, or activities such as transaction cleanup
1038  * will be broken.
1039  */
1040  if (FilePosIsUnknown(vfdP->seekPos))
1041  {
1042  vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
1043  if (FilePosIsUnknown(vfdP->seekPos))
1044  elog(LOG, "could not seek file \"%s\" before closing: %m",
1045  vfdP->fileName);
1046  }
1047 
1048  /*
1049  * Close the file. We aren't expecting this to fail; if it does, better
1050  * to leak the FD than to mess up our internal state.
1051  */
1052  if (close(vfdP->fd))
1053  elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
1054  vfdP->fd = VFD_CLOSED;
1055  --nfile;
1056 
1057  /* delete the vfd record from the LRU ring */
1058  Delete(file);
1059 }
1060 
/*
 * Link VFD 'file' into the LRU ring directly next to the anchor slot 0, on
 * the anchor's lruLessRecently side. That is the position _dump_lru() labels
 * "MOST", i.e. the most recently used entry. Slot 0 itself must not be passed.
 *
 * NOTE(review): listing line 1062 (the declarator -- presumably
 * "Insert(File file)", per the debug message and the callers) is missing from
 * this copy; confirm against the real file.
 */
1061 static void
1063 {
1064  Vfd *vfdP;
1065 
1066  Assert(file != 0);
1067 
1068  DO_DB(elog(LOG, "Insert %d (%s)",
1069  file, VfdCache[file].fileName));
1070  DO_DB(_dump_lru());
1071 
1072  vfdP = &VfdCache[file];
1073 
 /* new entry sits between the anchor and the previous head of the ring */
1074  vfdP->lruMoreRecently = 0;
1075  vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1076  VfdCache[0].lruLessRecently = file;
1077  VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1078 
1079  DO_DB(_dump_lru());
1080 }
1081 
1082 /* returns 0 on success, -1 on re-open failure (with errno set) */
/*
 * Make VFD 'file' usable again: if it has no kernel descriptor, re-open it
 * with the saved name/flags/mode and restore the saved seek position, then
 * put it at the head of the LRU ring. On failure the VFD is left closed and
 * is not inserted into the ring.
 *
 * NOTE(review): listing line 1084 (the declarator -- presumably
 * "LruInsert(File file)", per the debug message) is missing from this copy;
 * confirm against the real file.
 */
1083 static int
1085 {
1086  Vfd *vfdP;
1087 
1088  Assert(file != 0);
1089 
1090  DO_DB(elog(LOG, "LruInsert %d (%s)",
1091  file, VfdCache[file].fileName));
1092 
1093  vfdP = &VfdCache[file];
1094 
1095  if (FileIsNotOpen(file))
1096  {
1097  /* Close excess kernel FDs. */
1098  ReleaseLruFiles();
1099 
1100  /*
1101  * The open could still fail for lack of file descriptors, eg due to
1102  * overall system file table being full. So, be prepared to release
1103  * another FD if necessary...
1104  */
1105  vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1106  vfdP->fileMode);
1107  if (vfdP->fd < 0)
1108  {
1109  DO_DB(elog(LOG, "re-open failed: %m"));
1110  return -1;
1111  }
1112  else
1113  {
 /* one more kernel descriptor is now in use by the VFD layer */
1114  ++nfile;
1115  }
1116 
1117  /*
1118  * Seek to the right position. We need no special case for seekPos
1119  * equal to FileUnknownPos, as lseek() will certainly reject that
1120  * (thus completing the logic noted in LruDelete() that we will fail
1121  * to re-open a file if we couldn't get its seek position before
1122  * closing).
1123  */
1124  if (vfdP->seekPos != (off_t) 0)
1125  {
1126  if (lseek(vfdP->fd, vfdP->seekPos, SEEK_SET) < 0)
1127  {
1128  /*
1129  * If we fail to restore the seek position, treat it like an
1130  * open() failure.
1131  */
 /* keep lseek()'s errno across elog() and close() for the caller */
1132  int save_errno = errno;
1133 
1134  elog(LOG, "could not seek file \"%s\" after re-opening: %m",
1135  vfdP->fileName);
1136  (void) close(vfdP->fd);
1137  vfdP->fd = VFD_CLOSED;
1138  --nfile;
1139  errno = save_errno;
1140  return -1;
1141  }
1142  }
1143  }
1144 
1145  /*
1146  * put it at the head of the Lru ring
1147  */
1148 
1149  Insert(file);
1150 
1151  return 0;
1152 }
1153 
1154 /*
1155  * Release one kernel FD by closing the least-recently-used VFD.
1156  */
/*
 * Returns true if a VFD was closed, false if nfile is zero and there was
 * nothing to close. The victim is the entry at VfdCache[0].lruMoreRecently.
 *
 * NOTE(review): listing line 1158 (the declarator -- presumably
 * "ReleaseLruFile(void)", per the debug message and the argument-less calls)
 * is missing from this copy; confirm against the real file.
 */
1157 static bool
1159 {
1160  DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1161 
1162  if (nfile > 0)
1163  {
1164  /*
1165  * There are opened files and so there should be at least one used vfd
1166  * in the ring.
1167  */
1168  Assert(VfdCache[0].lruMoreRecently != 0);
1169  LruDelete(VfdCache[0].lruMoreRecently);
1170  return true; /* freed a file */
1171  }
1172  return false; /* no files available to free */
1173 }
1174 
1175 /*
1176  * Release kernel FDs as needed to get under the max_safe_fds limit.
1177  * After calling this, it's OK to try to open another file.
1178  */
/*
 * NOTE(review): listing line 1180 (the declarator -- presumably
 * "ReleaseLruFiles(void)", going by the argument-less calls) is missing from
 * this copy; confirm against the real file.
 */
1179 static void
1181 {
 /* both VFD-held descriptors (nfile) and numAllocatedDescs count against the limit */
1182  while (nfile + numAllocatedDescs >= max_safe_fds)
1183  {
 /* stop once there is nothing left that can be closed */
1184  if (!ReleaseLruFile())
1185  break;
1186  }
1187 }
1188 
/*
 * Hand out an unused VFD slot. Free slots form a singly linked list threaded
 * through 'nextFree' and headed by VfdCache[0].nextFree, where 0 means "empty".
 * When the list is empty the array is enlarged with realloc() and the new
 * slots become the free list. Returns the index of the slot taken; raises
 * ERROR if the array cannot be enlarged.
 *
 * NOTE(review): listing line 1190 (the declarator -- presumably
 * "AllocateVfd(void)", per the debug message and the call site) is missing
 * from this copy; confirm against the real file.
 */
1189 static File
1191 {
1192  Index i;
1193  File file;
1194 
1195  DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1196 
1197  Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1198 
1199  if (VfdCache[0].nextFree == 0)
1200  {
1201  /*
1202  * The free list is empty so it is time to increase the size of the
1203  * array. We choose to double it each time this happens. However,
1204  * there's not much point in starting *real* small.
1205  */
1206  Size newCacheSize = SizeVfdCache * 2;
1207  Vfd *newVfdCache;
1208 
1209  if (newCacheSize < 32)
1210  newCacheSize = 32;
1211 
1212  /*
1213  * Be careful not to clobber VfdCache ptr if realloc fails.
1214  */
1215  newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1216  if (newVfdCache == NULL)
1217  ereport(ERROR,
1218  (errcode(ERRCODE_OUT_OF_MEMORY),
1219  errmsg("out of memory")));
1220  VfdCache = newVfdCache;
1221 
1222  /*
1223  * Initialize the new entries and link them into the free list.
1224  */
1225  for (i = SizeVfdCache; i < newCacheSize; i++)
1226  {
1227  MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
1228  VfdCache[i].nextFree = i + 1;
1229  VfdCache[i].fd = VFD_CLOSED;
1230  }
 /* terminate the chain at the last new slot and hang it off the anchor */
1231  VfdCache[newCacheSize - 1].nextFree = 0;
1232  VfdCache[0].nextFree = SizeVfdCache;
1233 
1234  /*
1235  * Record the new size
1236  */
1237  SizeVfdCache = newCacheSize;
1238  }
1239 
 /* pop the first slot off the free list */
1240  file = VfdCache[0].nextFree;
1241 
1242  VfdCache[0].nextFree = VfdCache[file].nextFree;
1243 
1244  return file;
1245 }
1246 
/*
 * Return VFD slot 'file' to the free list: free its file-name copy, clear
 * fdstate, and push the slot onto the list headed by VfdCache[0].nextFree.
 * The kernel descriptor is not touched here.
 *
 * NOTE(review): listing line 1248 (the declarator -- presumably
 * "FreeVfd(File file)", per the debug message and the callers) is missing
 * from this copy; confirm against the real file.
 */
1247 static void
1249 {
1250  Vfd *vfdP = &VfdCache[file];
1251 
1252  DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1253  file, vfdP->fileName ? vfdP->fileName : ""));
1254 
1255  if (vfdP->fileName != NULL)
1256  {
1257  free(vfdP->fileName);
1258  vfdP->fileName = NULL;
1259  }
1260  vfdP->fdstate = 0x0;
1261 
 /* push onto the front of the free list */
1262  vfdP->nextFree = VfdCache[0].nextFree;
1263  VfdCache[0].nextFree = file;
1264 }
1265 
1266 /* returns 0 on success, -1 on re-open failure (with errno set) */
/*
 * Ensure VFD 'file' has a kernel descriptor and is the most recently used
 * entry of the LRU ring.
 *
 * NOTE(review): listing line 1268 (the declarator -- presumably
 * "FileAccess(File file)", per the debug message and the callers) is missing
 * from this copy; confirm against the real file.
 */
1267 static int
1269 {
1270  int returnValue;
1271 
1272  DO_DB(elog(LOG, "FileAccess %d (%s)",
1273  file, VfdCache[file].fileName));
1274 
1275  /*
1276  * Is the file open? If not, open it and put it at the head of the LRU
1277  * ring (possibly closing the least recently used file to get an FD).
1278  */
1279 
1280  if (FileIsNotOpen(file))
1281  {
1282  returnValue = LruInsert(file);
1283  if (returnValue != 0)
1284  return returnValue;
1285  }
1286  else if (VfdCache[0].lruLessRecently != file)
1287  {
1288  /*
1289  * We now know that the file is open and that it is not the last one
1290  * accessed, so we need to move it to the head of the Lru ring.
1291  */
1292 
1293  Delete(file);
1294  Insert(file);
1295  }
1296 
1297  return 0;
1298 }
1299 
1300 /*
1301  * Called whenever a temporary file is deleted to report its size.
1302  */
1303 static void
1304 ReportTemporaryFileUsage(const char *path, off_t size)
1305 {
1306  pgstat_report_tempfile(size);
1307 
1308  if (log_temp_files >= 0)
1309  {
1310  if ((size / 1024) >= log_temp_files)
1311  ereport(LOG,
1312  (errmsg("temporary file: path \"%s\", size %lu",
1313  path, (unsigned long) size)));
1314  }
1315 }
1316 
1317 /*
1318  * Called to register a temporary file for automatic close.
1319  * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
1320  * before the file was opened.
1321  */
/*
 * NOTE(review): this copy is missing listing lines 1323 (the declarator --
 * presumably "RegisterTemporaryFile(File file)", going by the callers), 1325
 * and 1330. The two dropped statements cannot be reconstructed from what is
 * visible here; restore them from the real file before relying on this text.
 */
1322 static void
1324 {
 /* remember the owner so that closing the file can unregister it again */
1326  VfdCache[file].resowner = CurrentResourceOwner;
1327 
1328  /* Backup mechanism for closing at end of xact. */
1329  VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
1331 }
1332 
1333 /*
1334  * Called when we get a shared invalidation message on some relation.
1335  */
1336 #ifdef NOT_USED
1337 void
1338 FileInvalidate(File file)
1339 {
1340  Assert(FileIsValid(file));
1341  if (!FileIsNotOpen(file))
1342  LruDelete(file);
1343 }
1344 #endif
1345 
1346 /*
1347  * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1348  * fileMode parameter.
1349  */
/*
 * NOTE(review): listing line 1351 (the declarator -- presumably
 * "PathNameOpenFile(const char *fileName, int fileFlags)", going by the body
 * and the callers) is missing from this copy; confirm against the real file.
 */
1350 File
1352 {
 /* same as PathNameOpenFilePerm(), with pg_file_create_mode as the mode */
1353  return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1354 }
1355 
1356 /*
1357  * open a file in an arbitrary directory
1358  *
1359  * NB: if the passed pathname is relative (which it usually is),
1360  * it will be interpreted relative to the process' working directory
1361  * (which should always be $PGDATA when this code is running).
1362  */
/*
 * Returns the index of the new VFD, or -1 with errno set if the open fails.
 * Raises ERROR if the file name cannot be copied.
 *
 * NOTE(review): listing line 1364 (the declarator -- presumably
 * "PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)",
 * per the debug message and the body) is missing from this copy; confirm
 * against the real file.
 */
1363 File
1365 {
1366  char *fnamecopy;
1367  File file;
1368  Vfd *vfdP;
1369 
1370  DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1371  fileName, fileFlags, fileMode));
1372 
1373  /*
1374  * We need a malloc'd copy of the file name; fail cleanly if no room.
1375  */
1376  fnamecopy = strdup(fileName);
1377  if (fnamecopy == NULL)
1378  ereport(ERROR,
1379  (errcode(ERRCODE_OUT_OF_MEMORY),
1380  errmsg("out of memory")));
1381 
1382  file = AllocateVfd();
1383  vfdP = &VfdCache[file];
1384 
1385  /* Close excess kernel FDs. */
1386  ReleaseLruFiles();
1387 
1388  vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1389 
1390  if (vfdP->fd < 0)
1391  {
 /* preserve the open failure's errno across the cleanup calls */
1392  int save_errno = errno;
1393 
1394  FreeVfd(file);
1395  free(fnamecopy);
1396  errno = save_errno;
1397  return -1;
1398  }
1399  ++nfile;
1400  DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1401  vfdP->fd));
1402 
1403  Insert(file);
1404 
 /* the VFD now owns fnamecopy; FreeVfd() releases it */
1405  vfdP->fileName = fnamecopy;
1406  /* Saved flags are adjusted to be OK for re-opening file */
1407  vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1408  vfdP->fileMode = fileMode;
1409  vfdP->seekPos = 0;
1410  vfdP->fileSize = 0;
1411  vfdP->fdstate = 0x0;
1412  vfdP->resowner = NULL;
1413 
1414  return file;
1415 }
1416 
1417 /*
1418  * Create directory 'directory'. If necessary, create 'basedir', which must
1419  * be the directory above it. This is designed for creating the top-level
1420  * temporary directory on demand before creating a directory underneath it.
1421  * Do nothing if the directory already exists.
1422  *
1423  * Directories created within the top-level temporary directory should begin
1424  * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1425  * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1426  * that do not need any particular prefix.
1427 */
/*
 * NOTE(review): this copy is missing listing lines 1429 (the declarator; the
 * body uses parameters named 'basedir' and 'directory'), 1443 and 1450. The
 * latter two sit at the start of the ereport() argument lists, whose
 * parentheses are unbalanced as shown. Restore them from the real file.
 */
1428 void
1430 {
1431  if (MakePGDirectory(directory) < 0)
1432  {
 /* already present: nothing to do */
1433  if (errno == EEXIST)
1434  return;
1435 
1436  /*
1437  * Failed. Try to create basedir first in case it's missing. Tolerate
1438  * EEXIST to close a race against another process following the same
1439  * algorithm.
1440  */
1441  if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1442  ereport(ERROR,
1444  errmsg("cannot create temporary directory \"%s\": %m",
1445  basedir)));
1446 
1447  /* Try again. */
1448  if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1449  ereport(ERROR,
1451  errmsg("cannot create temporary subdirectory \"%s\": %m",
1452  directory)));
1453  }
1454 }
1455 
1456 /*
1457  * Delete a directory and everything in it, if it exists.
1458  */
1459 void
1460 PathNameDeleteTemporaryDir(const char *dirname)
1461 {
1462  struct stat statbuf;
1463 
1464  /* Silently ignore missing directory. */
1465  if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1466  return;
1467 
1468  /*
1469  * Currently, walkdir doesn't offer a way for our passed in function to
1470  * maintain state. Perhaps it should, so that we could tell the caller
1471  * whether this operation succeeded or failed. Since this operation is
1472  * used in a cleanup path, we wouldn't actually behave differently: we'll
1473  * just log failures.
1474  */
1475  walkdir(dirname, unlink_if_exists_fname, false, LOG);
1476 }
1477 
1478 /*
1479  * Open a temporary file that will disappear when we close it.
1480  *
1481  * This routine takes care of generating an appropriate tempfile name.
1482  * There's no need to pass in fileFlags or fileMode either, since only
1483  * one setting makes any sense for a temp file.
1484  *
1485  * Unless interXact is true, the file is remembered by CurrentResourceOwner
1486  * to ensure it's closed and deleted when it's no longer needed, typically at
1487  * the end-of-transaction. In most cases, you don't want temporary files to
1488  * outlive the transaction that created them, so this should be false -- but
1489  * if you need "somewhat" temporary storage, this might be useful. In either
1490  * case, the file is removed when the File is explicitly closed.
1491  */
/*
 * NOTE(review): this copy is missing listing lines 1502 and 1527-1528. Line
 * 1502 was the statement controlled by the first "if (!interXact)"; lines
 * 1527-1528 were the start of the fallback call whose trailing arguments
 * (DEFAULTTABLESPACE_OID, true) are still visible. As shown the function is
 * not well-formed; restore the lines from the real file.
 */
1492 File
1493 OpenTemporaryFile(bool interXact)
1494 {
 /* 0 doubles as "nothing opened yet" for the file <= 0 test below */
1495  File file = 0;
1496 
1497  /*
1498  * Make sure the current resource owner has space for this File before we
1499  * open it, if we'll be registering it below.
1500  */
1501  if (!interXact)
1503 
1504  /*
1505  * If some temp tablespace(s) have been given to us, try to use the next
1506  * one. If a given tablespace can't be found, we silently fall back to
1507  * the database's default tablespace.
1508  *
1509  * BUT: if the temp file is slated to outlive the current transaction,
1510  * force it into the database's default tablespace, so that it will not
1511  * pose a threat to possible tablespace drop attempts.
1512  */
1513  if (numTempTableSpaces > 0 && !interXact)
1514  {
1515  Oid tblspcOid = GetNextTempTableSpace();
1516 
1517  if (OidIsValid(tblspcOid))
1518  file = OpenTemporaryFileInTablespace(tblspcOid, false);
1519  }
1520 
1521  /*
1522  * If not, or if tablespace is bad, create in database's default
1523  * tablespace. MyDatabaseTableSpace should normally be set before we get
1524  * here, but just in case it isn't, fall back to pg_default tablespace.
1525  */
1526  if (file <= 0)
1529  DEFAULTTABLESPACE_OID,
1530  true);
1531 
1532  /* Mark it for deletion at close and temporary file size limit */
1533  VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1534 
1535  /* Register it with the current resource owner */
1536  if (!interXact)
1537  RegisterTemporaryFile(file);
1538 
1539  return file;
1540 }
1541 
1542 /*
1543  * Return the path of the temp directory in a given tablespace.
1544  */
/*
 * Writes the result into 'path', which is treated as a buffer of MAXPGPATH
 * bytes.
 *
 * NOTE(review): this copy is missing listing lines 1546 (the declarator; the
 * body uses parameters named 'path' and 'tablespace') and 1562 (the last
 * argument and closing of the second snprintf() call). Restore them from the
 * real file.
 */
1545 void
1547 {
1548  /*
1549  * Identify the tempfile directory for this tablespace.
1550  *
1551  * If someone tries to specify pg_global, use pg_default instead.
1552  */
1553  if (tablespace == InvalidOid ||
1554  tablespace == DEFAULTTABLESPACE_OID ||
1555  tablespace == GLOBALTABLESPACE_OID)
1556  snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1557  else
1558  {
1559  /* All other tablespaces are accessed via symlinks */
1560  snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
1561  tablespace, TABLESPACE_VERSION_DIRECTORY,
1563  }
1564 }
1565 
1566 /*
1567  * Open a temporary file in a specific tablespace.
1568  * Subroutine for OpenTemporaryFile, which see for details.
1569  */
1570 static File
1571 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
1572 {
1573  char tempdirpath[MAXPGPATH];
1574  char tempfilepath[MAXPGPATH];
1575  File file;
1576 
1577  TempTablespacePath(tempdirpath, tblspcOid);
1578 
1579  /*
1580  * Generate a tempfile name that should be unique within the current
1581  * database instance.
1582  */
1583  snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1584  tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1585 
1586  /*
1587  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1588  * temp file that can be reused.
1589  */
1590  file = PathNameOpenFile(tempfilepath,
1591  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1592  if (file <= 0)
1593  {
1594  /*
1595  * We might need to create the tablespace's tempfile directory, if no
1596  * one has yet done so.
1597  *
1598  * Don't check for an error from MakePGDirectory; it could fail if
1599  * someone else just did the same thing. If it doesn't work then
1600  * we'll bomb out on the second create attempt, instead.
1601  */
1602  (void) MakePGDirectory(tempdirpath);
1603 
1604  file = PathNameOpenFile(tempfilepath,
1605  O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1606  if (file <= 0 && rejectError)
1607  elog(ERROR, "could not create temporary file \"%s\": %m",
1608  tempfilepath);
1609  }
1610 
1611  return file;
1612 }
1613 
1614 
1615 /*
1616  * Create a new file. The directory containing it must already exist. Files
1617  * created this way are subject to temp_file_limit and are automatically
1618  * closed at end of transaction, but are not automatically deleted on close
1619  * because they are intended to be shared between cooperating backends.
1620  *
1621  * If the file is inside the top-level temporary directory, its name should
1622  * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1623  * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1624  * inside a directory created with PathNameCreateTemporaryDir(), in which case
1625  * the prefix isn't needed.
1626  */
/*
 * NOTE(review): this copy is missing listing lines 1632 (a statement before
 * the open) and 1643 (the start of the ereport() argument list, whose
 * parentheses are unbalanced as shown). Restore them from the real file.
 */
1627 File
1628 PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1629 {
1630  File file;
1631 
1633 
1634  /*
1635  * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1636  * temp file that can be reused.
1637  */
1638  file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1639  if (file <= 0)
1640  {
1641  if (error_on_failure)
1642  ereport(ERROR,
1644  errmsg("could not create temporary file \"%s\": %m",
1645  path)));
1646  else
 /* caller asked for no error: pass the failed result straight back */
1647  return file;
1648  }
1649 
1650  /* Mark it for temp_file_limit accounting. */
1651  VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1652 
1653  /* Register it for automatic close. */
1654  RegisterTemporaryFile(file);
1655 
1656  return file;
1657 }
1658 
1659 /*
1660  * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1661  * another backend. Files opened this way don't count against the
1662  * temp_file_limit of the caller, are read-only and are automatically closed
1663  * at the end of the transaction but are not deleted on close.
1664  */
/*
 * Returns the failed PathNameOpenFile() result, without raising an error,
 * when the file does not exist (ENOENT).
 *
 * NOTE(review): this copy is missing listing lines 1670 (a statement before
 * the open) and 1678 (the start of the ereport() argument list, whose
 * parentheses are unbalanced as shown). Restore them from the real file.
 */
1665 File
1666 PathNameOpenTemporaryFile(const char *path)
1667 {
1668  File file;
1669 
1671 
1672  /* We open the file read-only. */
1673  file = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
1674 
1675  /* If no such file, then we don't raise an error. */
1676  if (file <= 0 && errno != ENOENT)
1677  ereport(ERROR,
1679  errmsg("could not open temporary file \"%s\": %m",
1680  path)));
1681 
1682  if (file > 0)
1683  {
1684  /* Register it for automatic close. */
1685  RegisterTemporaryFile(file);
1686  }
1687 
1688  return file;
1689 }
1690 
1691 /*
1692  * Delete a file by pathname. Return true if the file existed, false if
1693  * it didn't.
1694  */
/*
 * With error_on_failure set, an unlink() failure other than ENOENT is raised
 * as ERROR; otherwise it is logged and false is returned.
 *
 * NOTE(review): this copy is missing listing lines 1719 and 1731 (the start
 * of each ereport() argument list; the parentheses are unbalanced as shown).
 * Restore them from the real file.
 */
1695 bool
1696 PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1697 {
1698  struct stat filestats;
1699  int stat_errno;
1700 
1701  /* Get the final size for pgstat reporting. */
1702  if (stat(path, &filestats) != 0)
1703  stat_errno = errno;
1704  else
1705  stat_errno = 0;
1706 
1707  /*
1708  * Unlike FileClose's automatic file deletion code, we tolerate
1709  * non-existence to support BufFileDeleteShared which doesn't know how
1710  * many segments it has to delete until it runs out.
1711  */
1712  if (stat_errno == ENOENT)
1713  return false;
1714 
1715  if (unlink(path) < 0)
1716  {
 /* a file that vanished between stat() and unlink() is not reported */
1717  if (errno != ENOENT)
1718  ereport(error_on_failure ? ERROR : LOG,
1720  errmsg("cannot unlink temporary file \"%s\": %m",
1721  path)));
1722  return false;
1723  }
1724 
 /* the size is only available if the earlier stat() succeeded */
1725  if (stat_errno == 0)
1726  ReportTemporaryFileUsage(path, filestats.st_size);
1727  else
1728  {
 /* put stat()'s errno back so that %m describes the stat failure */
1729  errno = stat_errno;
1730  ereport(LOG,
1732  errmsg("could not stat file \"%s\": %m", path)));
1733  }
1734 
1735  return true;
1736 }
1737 
1738 /*
1739  * close a file when done with it
1740  */
/*
 * Steps, in order: close the kernel descriptor if one is held; give back the
 * file's share of temporary_files_size; unlink the file if it is flagged
 * FD_DELETE_AT_CLOSE; unregister it from its resource owner; return the VFD
 * slot to the free list.
 *
 * NOTE(review): listing line 1742 (the declarator -- presumably
 * "FileClose(File file)", per the debug message) is missing from this copy;
 * confirm against the real file.
 */
1741 void
1743 {
1744  Vfd *vfdP;
1745 
1746  Assert(FileIsValid(file));
1747 
1748  DO_DB(elog(LOG, "FileClose: %d (%s)",
1749  file, VfdCache[file].fileName));
1750 
1751  vfdP = &VfdCache[file];
1752 
1753  if (!FileIsNotOpen(file))
1754  {
1755  /* close the file */
1756  if (close(vfdP->fd))
1757  elog(LOG, "could not close file \"%s\": %m", vfdP->fileName);
1758 
1759  --nfile;
1760  vfdP->fd = VFD_CLOSED;
1761 
1762  /* remove the file from the lru ring */
1763  Delete(file);
1764  }
1765 
1766  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1767  {
1768  /* Subtract its size from current usage (do first in case of error) */
1769  temporary_files_size -= vfdP->fileSize;
1770  vfdP->fileSize = 0;
1771  }
1772 
1773  /*
1774  * Delete the file if it was temporary, and make a log entry if wanted
1775  */
1776  if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
1777  {
1778  struct stat filestats;
1779  int stat_errno;
1780 
1781  /*
1782  * If we get an error, as could happen within the ereport/elog calls,
1783  * we'll come right back here during transaction abort. Reset the
1784  * flag to ensure that we can't get into an infinite loop. This code
1785  * is arranged to ensure that the worst-case consequence is failing to
1786  * emit log message(s), not failing to attempt the unlink.
1787  */
1788  vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
1789 
1790 
1791  /* first try the stat() */
1792  if (stat(vfdP->fileName, &filestats))
1793  stat_errno = errno;
1794  else
1795  stat_errno = 0;
1796 
1797  /* in any case do the unlink */
1798  if (unlink(vfdP->fileName))
1799  elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);
1800 
1801  /* and last report the stat results */
1802  if (stat_errno == 0)
1803  ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
1804  else
1805  {
 /* put stat()'s errno back so that %m describes the stat failure */
1806  errno = stat_errno;
1807  elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
1808  }
1809  }
1810 
1811  /* Unregister it from the resource owner */
1812  if (vfdP->resowner)
1813  ResourceOwnerForgetFile(vfdP->resowner, file);
1814 
1815  /*
1816  * Return the Vfd slot to the free list
1817  */
1818  FreeVfd(file);
1819 }
1820 
1821 /*
1822  * FilePrefetch - initiate asynchronous read of a given range of the file.
1823  * The logical seek position is unaffected.
1824  *
1825  * Currently the only implementation of this function is using posix_fadvise
1826  * which is the simplest standardized interface that accomplishes this.
1827  * We could add an implementation using libaio in the future; but note that
1828  * this API is inappropriate for libaio, which wants to have a buffer provided
1829  * to read into.
1830  */
/*
 * Returns the negative FileAccess() result if the file cannot be made
 * accessible, otherwise the value returned by posix_fadvise(). Builds without
 * USE_POSIX_FADVISE and POSIX_FADV_WILLNEED do nothing and return 0.
 *
 * NOTE(review): listing line 1850 is missing between the posix_fadvise() call
 * and the return -- presumably the counterpart of pgstat_report_wait_start();
 * confirm against the real file.
 */
1831 int
1832 FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
1833 {
1834 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
1835  int returnCode;
1836 
1837  Assert(FileIsValid(file));
1838 
1839  DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
1840  file, VfdCache[file].fileName,
1841  (int64) offset, amount));
1842 
1843  returnCode = FileAccess(file);
1844  if (returnCode < 0)
1845  return returnCode;
1846 
1847  pgstat_report_wait_start(wait_event_info);
1848  returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
1849  POSIX_FADV_WILLNEED);
1851 
1852  return returnCode;
1853 #else
1854  Assert(FileIsValid(file));
1855  return 0;
1856 #endif
1857 }
1858 
/*
 * FileWriteback - pass the byte range [offset, offset + nbytes) of the file
 * to pg_flush_data().
 *
 * Nothing is reported to the caller: a non-positive nbytes or a FileAccess()
 * failure simply makes this a no-op.
 *
 * NOTE(review): listing line 1883 is missing after the pg_flush_data() call --
 * presumably the counterpart of pgstat_report_wait_start(); confirm against
 * the real file.
 */
1859 void
1860 FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
1861 {
1862  int returnCode;
1863 
1864  Assert(FileIsValid(file));
1865 
1866  DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
1867  file, VfdCache[file].fileName,
1868  (int64) offset, (int64) nbytes));
1869 
1870  /*
1871  * Caution: do not call pg_flush_data with nbytes = 0, it could trash the
1872  * file's seek position. We prefer to define that as a no-op here.
1873  */
1874  if (nbytes <= 0)
1875  return;
1876 
1877  returnCode = FileAccess(file);
1878  if (returnCode < 0)
1879  return;
1880 
1881  pgstat_report_wait_start(wait_event_info);
1882  pg_flush_data(VfdCache[file].fd, offset, nbytes);
1884 }
1885 
/*
 * FileRead - read up to 'amount' bytes into 'buffer' at the file's current
 * position.
 *
 * Returns the negative FileAccess() result if the file cannot be made
 * accessible, otherwise the result of read(). A successful read advances the
 * tracked seek position when it is known; EINTR is retried; any other failure
 * marks the position as FileUnknownPos.
 *
 * NOTE(review): listing line 1908 is missing right after the read() call --
 * presumably the counterpart of pgstat_report_wait_start(); confirm against
 * the real file.
 */
1886 int
1887 FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
1888 {
1889  int returnCode;
1890  Vfd *vfdP;
1891 
1892  Assert(FileIsValid(file));
1893 
1894  DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
1895  file, VfdCache[file].fileName,
1896  (int64) VfdCache[file].seekPos,
1897  amount, buffer));
1898 
1899  returnCode = FileAccess(file);
1900  if (returnCode < 0)
1901  return returnCode;
1902 
1903  vfdP = &VfdCache[file];
1904 
1905 retry:
1906  pgstat_report_wait_start(wait_event_info);
1907  returnCode = read(vfdP->fd, buffer, amount);
1909 
1910  if (returnCode >= 0)
1911  {
1912  /* if seekPos is unknown, leave it that way */
1913  if (!FilePosIsUnknown(vfdP->seekPos))
1914  vfdP->seekPos += returnCode;
1915  }
1916  else
1917  {
1918  /*
1919  * Windows may run out of kernel buffers and return "Insufficient
1920  * system resources" error. Wait a bit and retry to solve it.
1921  *
1922  * It is rumored that EINTR is also possible on some Unix filesystems,
1923  * in which case immediate retry is indicated.
1924  */
1925 #ifdef WIN32
1926  DWORD error = GetLastError();
1927 
1928  switch (error)
1929  {
1930  case ERROR_NO_SYSTEM_RESOURCES:
1931  pg_usleep(1000L);
 /* reuse the EINTR retry path below */
1932  errno = EINTR;
1933  break;
1934  default:
1935  _dosmaperr(error);
1936  break;
1937  }
1938 #endif
1939  /* OK to retry if interrupted */
1940  if (errno == EINTR)
1941  goto retry;
1942 
1943  /* Trouble, so assume we don't know the file position anymore */
1944  vfdP->seekPos = FileUnknownPos;
1945  }
1946 
1947  return returnCode;
1948 }
1949 
/*
 * FileWrite - write 'amount' bytes from 'buffer' at the file's current
 * position.
 *
 * Returns the negative FileAccess() result if the file cannot be made
 * accessible, otherwise the result of write(); a short write is returned
 * as-is with errno set. For files flagged FD_TEMP_FILE_LIMIT this raises
 * ERROR when the write would push total temporary file usage past
 * temp_file_limit, and keeps fileSize and temporary_files_size up to date
 * after a successful write.
 *
 * NOTE(review): listing line 2011 is missing right after the write() call --
 * presumably the counterpart of pgstat_report_wait_start(); confirm against
 * the real file.
 */
1950 int
1951 FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
1952 {
1953  int returnCode;
1954  Vfd *vfdP;
1955 
1956  Assert(FileIsValid(file));
1957 
1958  DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
1959  file, VfdCache[file].fileName,
1960  (int64) VfdCache[file].seekPos,
1961  amount, buffer));
1962 
1963  returnCode = FileAccess(file);
1964  if (returnCode < 0)
1965  return returnCode;
1966 
1967  vfdP = &VfdCache[file];
1968 
1969  /*
1970  * If enforcing temp_file_limit and it's a temp file, check to see if the
1971  * write would overrun temp_file_limit, and throw error if so. Note: it's
1972  * really a modularity violation to throw error here; we should set errno
1973  * and return -1. However, there's no way to report a suitable error
1974  * message if we do that. All current callers would just throw error
1975  * immediately anyway, so this is safe at present.
1976  */
1977  if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
1978  {
1979  off_t newPos;
1980 
1981  /*
1982  * Normally we should know the seek position, but if for some reason
1983  * we have lost track of it, try again to get it. Here, it's fine to
1984  * throw an error if we still can't get it.
1985  */
1986  if (FilePosIsUnknown(vfdP->seekPos))
1987  {
1988  vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
1989  if (FilePosIsUnknown(vfdP->seekPos))
1990  elog(ERROR, "could not seek file \"%s\": %m", vfdP->fileName);
1991  }
1992 
 /* only growth beyond the recorded file size counts against the limit */
1993  newPos = vfdP->seekPos + amount;
1994  if (newPos > vfdP->fileSize)
1995  {
1996  uint64 newTotal = temporary_files_size;
1997 
1998  newTotal += newPos - vfdP->fileSize;
 /* temp_file_limit is multiplied by 1024 to compare with a byte count */
1999  if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2000  ereport(ERROR,
2001  (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2002  errmsg("temporary file size exceeds temp_file_limit (%dkB)",
2003  temp_file_limit)));
2004  }
2005  }
2006 
2007 retry:
 /* cleared so that a write() which leaves errno alone can be detected below */
2008  errno = 0;
2009  pgstat_report_wait_start(wait_event_info);
2010  returnCode = write(vfdP->fd, buffer, amount);
2012 
2013  /* if write didn't set errno, assume problem is no disk space */
2014  if (returnCode != amount && errno == 0)
2015  errno = ENOSPC;
2016 
2017  if (returnCode >= 0)
2018  {
2019  /* if seekPos is unknown, leave it that way */
2020  if (!FilePosIsUnknown(vfdP->seekPos))
2021  vfdP->seekPos += returnCode;
2022 
2023  /*
2024  * Maintain fileSize and temporary_files_size if it's a temp file.
2025  *
2026  * If seekPos is -1 (unknown), this will do nothing; but we could only
2027  * get here in that state if we're not enforcing temporary_files_size,
2028  * so we don't care.
2029  */
2030  if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2031  {
2032  off_t newPos = vfdP->seekPos;
2033 
2034  if (newPos > vfdP->fileSize)
2035  {
2036  temporary_files_size += newPos - vfdP->fileSize;
2037  vfdP->fileSize = newPos;
2038  }
2039  }
2040  }
2041  else
2042  {
2043  /*
2044  * See comments in FileRead()
2045  */
2046 #ifdef WIN32
2047  DWORD error = GetLastError();
2048 
2049  switch (error)
2050  {
2051  case ERROR_NO_SYSTEM_RESOURCES:
2052  pg_usleep(1000L);
2053  errno = EINTR;
2054  break;
2055  default:
2056  _dosmaperr(error);
2057  break;
2058  }
2059 #endif
2060  /* OK to retry if interrupted */
2061  if (errno == EINTR)
2062  goto retry;
2063 
2064  /* Trouble, so assume we don't know the file position anymore */
2065  vfdP->seekPos = FileUnknownPos;
2066  }
2067 
2068  return returnCode;
2069 }
2070 
/*
 * FileSync - call pg_fsync() on the file's kernel descriptor.
 *
 * Returns the negative FileAccess() result if the file cannot be made
 * accessible, otherwise the result of pg_fsync().
 *
 * NOTE(review): listing line 2087 is missing between the pg_fsync() call and
 * the return -- presumably the counterpart of pgstat_report_wait_start();
 * confirm against the real file.
 */
2071 int
2072 FileSync(File file, uint32 wait_event_info)
2073 {
2074  int returnCode;
2075 
2076  Assert(FileIsValid(file));
2077 
2078  DO_DB(elog(LOG, "FileSync: %d (%s)",
2079  file, VfdCache[file].fileName));
2080 
2081  returnCode = FileAccess(file);
2082  if (returnCode < 0)
2083  return returnCode;
2084 
2085  pgstat_report_wait_start(wait_event_info);
2086  returnCode = pg_fsync(VfdCache[file].fd);
2088 
2089  return returnCode;
2090 }
2091 
2092 off_t
2093 FileSeek(File file, off_t offset, int whence)
2094 {
2095  Vfd *vfdP;
2096 
2097  Assert(FileIsValid(file));
2098 
2099  DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
2100  file, VfdCache[file].fileName,
2101  (int64) VfdCache[file].seekPos,
2102  (int64) offset, whence));
2103 
2104  vfdP = &VfdCache[file];
2105 
2106  if (FileIsNotOpen(file))
2107  {
2108  switch (whence)
2109  {
2110  case SEEK_SET:
2111  if (offset < 0)
2112  {
2113  errno = EINVAL;
2114  return (off_t) -1;
2115  }
2116  vfdP->seekPos = offset;
2117  break;
2118  case SEEK_CUR:
2119  if (FilePosIsUnknown(vfdP->seekPos) ||
2120  vfdP->seekPos + offset < 0)
2121  {
2122  errno = EINVAL;
2123  return (off_t) -1;
2124  }
2125  vfdP->seekPos += offset;
2126  break;
2127  case SEEK_END:
2128  if (FileAccess(file) < 0)
2129  return (off_t) -1;
2130  vfdP->seekPos = lseek(vfdP->fd, offset, whence);
2131  break;
2132  default:
2133  elog(ERROR, "invalid whence: %d", whence);
2134  break;
2135  }
2136  }
2137  else
2138  {
2139  switch (whence)
2140  {
2141  case SEEK_SET:
2142  if (offset < 0)
2143  {
2144  errno = EINVAL;
2145  return (off_t) -1;
2146  }
2147  if (vfdP->seekPos != offset)
2148  vfdP->seekPos = lseek(vfdP->fd, offset, whence);
2149  break;
2150  case SEEK_CUR:
2151  if (offset != 0 || FilePosIsUnknown(vfdP->seekPos))
2152  vfdP->seekPos = lseek(vfdP->fd, offset, whence);
2153  break;
2154  case SEEK_END:
2155  vfdP->seekPos = lseek(vfdP->fd, offset, whence);
2156  break;
2157  default:
2158  elog(ERROR, "invalid whence: %d", whence);
2159  break;
2160  }
2161  }
2162 
2163  return vfdP->seekPos;
2164 }
2165 
2166 /*
2167  * XXX not actually used but here for completeness
2168  */
2169 #ifdef NOT_USED
2170 off_t
2171 FileTell(File file)
2172 {
2173  Assert(FileIsValid(file));
2174  DO_DB(elog(LOG, "FileTell %d (%s)",
2175  file, VfdCache[file].fileName));
2176  return VfdCache[file].seekPos;
2177 }
2178 #endif
2179 
/*
 * FileTruncate - call ftruncate() on the file with length 'offset'.
 *
 * Returns the negative FileAccess() result if the file cannot be made
 * accessible, otherwise the result of ftruncate(). When the call succeeds and
 * the recorded fileSize was larger than 'offset', fileSize and
 * temporary_files_size are reduced to match.
 *
 * NOTE(review): listing line 2196 is missing right after the ftruncate()
 * call -- presumably the counterpart of pgstat_report_wait_start(); confirm
 * against the real file.
 */
2180 int
2181 FileTruncate(File file, off_t offset, uint32 wait_event_info)
2182 {
2183  int returnCode;
2184 
2185  Assert(FileIsValid(file));
2186 
2187  DO_DB(elog(LOG, "FileTruncate %d (%s)",
2188  file, VfdCache[file].fileName));
2189 
2190  returnCode = FileAccess(file);
2191  if (returnCode < 0)
2192  return returnCode;
2193 
2194  pgstat_report_wait_start(wait_event_info);
2195  returnCode = ftruncate(VfdCache[file].fd, offset);
2197 
2198  if (returnCode == 0 && VfdCache[file].fileSize > offset)
2199  {
2200  /* adjust our state for truncation of a temp file */
2201  Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2202  temporary_files_size -= VfdCache[file].fileSize - offset;
2203  VfdCache[file].fileSize = offset;
2204  }
2205 
2206  return returnCode;
2207 }
2208 
2209 /*
2210  * Return the pathname associated with an open file.
2211  *
2212  * The returned string points to an internal buffer, which is valid until
2213  * the file is closed.
2214  */
2215 char *
2217 {
2218  Assert(FileIsValid(file));
2219 
2220  return VfdCache[file].fileName;
2221 }
2222 
2223 /*
2224  * Return the raw file descriptor of an opened file.
2225  *
2226  * The returned file descriptor will be valid until the file is closed, but
2227  * there are a lot of things that can make that happen. So the caller should
2228  * be careful not to do much of anything else before it finishes using the
2229  * returned file descriptor.
2230  */
2231 int
2233 {
2234  Assert(FileIsValid(file));
2235  return VfdCache[file].fd;
2236 }
2237 
2238 /*
2239  * FileGetRawFlags - returns the file flags on open(2)
2240  */
2241 int
2243 {
2244  Assert(FileIsValid(file));
2245  return VfdCache[file].fileFlags;
2246 }
2247 
2248 /*
2249  * FileGetRawMode - returns the mode bitmask passed to open(2)
2250  */
2251 mode_t
2253 {
2254  Assert(FileIsValid(file));
2255  return VfdCache[file].fileMode;
2256 }
2257 
2258 /*
2259  * Make room for another allocatedDescs[] array entry if needed and possible.
2260  * Returns true if an array element is available.
2261  */
2262 static bool
2264 {
2265  AllocateDesc *newDescs;
2266  int newMax;
2267 
2268  /* Quick out if array already has a free slot. */
2270  return true;
2271 
2272  /*
2273  * If the array hasn't yet been created in the current process, initialize
2274  * it with FD_MINFREE / 2 elements. In many scenarios this is as many as
2275  * we will ever need, anyway. We don't want to look at max_safe_fds
2276  * immediately because set_max_safe_fds() may not have run yet.
2277  */
2278  if (allocatedDescs == NULL)
2279  {
2280  newMax = FD_MINFREE / 2;
2281  newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
2282  /* Out of memory already? Treat as fatal error. */
2283  if (newDescs == NULL)
2284  ereport(ERROR,
2285  (errcode(ERRCODE_OUT_OF_MEMORY),
2286  errmsg("out of memory")));
2287  allocatedDescs = newDescs;
2288  maxAllocatedDescs = newMax;
2289  return true;
2290  }
2291 
2292  /*
2293  * Consider enlarging the array beyond the initial allocation used above.
2294  * By the time this happens, max_safe_fds should be known accurately.
2295  *
2296  * We mustn't let allocated descriptors hog all the available FDs, and in
2297  * practice we'd better leave a reasonable number of FDs for VFD use. So
2298  * set the maximum to max_safe_fds / 2. (This should certainly be at
2299  * least as large as the initial size, FD_MINFREE / 2.)
2300  */
2301  newMax = max_safe_fds / 2;
2302  if (newMax > maxAllocatedDescs)
2303  {
2304  newDescs = (AllocateDesc *) realloc(allocatedDescs,
2305  newMax * sizeof(AllocateDesc));
2306  /* Treat out-of-memory as a non-fatal error. */
2307  if (newDescs == NULL)
2308  return false;
2309  allocatedDescs = newDescs;
2310  maxAllocatedDescs = newMax;
2311  return true;
2312  }
2313 
2314  /* Can't enlarge allocatedDescs[] any more. */
2315  return false;
2316 }
2317 
2318 /*
2319  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2320  * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2321  * necessary to open the file. When done, call FreeFile rather than fclose.
2322  *
2323  * Note that files that will be open for any significant length of time
2324  * should NOT be handled this way, since they cannot share kernel file
2325  * descriptors with other files; there is grave risk of running out of FDs
2326  * if anyone locks down too many FDs. Most callers of this routine are
2327  * simply reading a config file that they will read and close immediately.
2328  *
2329  * fd.c will automatically close all files opened with AllocateFile at
2330  * transaction commit or abort; this prevents FD leakage if a routine
2331  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2332  *
2333  * Ideally this should be the *only* direct call of fopen() in the backend.
2334  */
2335 FILE *
2336 AllocateFile(const char *name, const char *mode)
2337 {
2338  FILE *file;
2339 
2340  DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2341  numAllocatedDescs, name));
2342 
2343  /* Can we allocate another non-virtual FD? */
2344  if (!reserveAllocatedDesc())
2345  ereport(ERROR,
2346  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2347  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2348  maxAllocatedDescs, name)));
2349 
2350  /* Close excess kernel FDs. */
2351  ReleaseLruFiles();
2352 
2353 TryAgain:
2354  if ((file = fopen(name, mode)) != NULL)
2355  {
2356  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2357 
2358  desc->kind = AllocateDescFile;
2359  desc->desc.file = file;
2362  return desc->desc.file;
2363  }
2364 
2365  if (errno == EMFILE || errno == ENFILE)
2366  {
2367  int save_errno = errno;
2368 
2369  ereport(LOG,
2370  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2371  errmsg("out of file descriptors: %m; release and retry")));
2372  errno = 0;
2373  if (ReleaseLruFile())
2374  goto TryAgain;
2375  errno = save_errno;
2376  }
2377 
2378  return NULL;
2379 }
2380 
2381 /*
2382  * Open a file with OpenTransientFilePerm() and pass default file mode for
2383  * the fileMode parameter.
2384  */
2385 int
2387 {
2388  return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2389 }
2390 
2391 /*
2392  * Like AllocateFile, but returns an unbuffered fd like open(2)
2393  */
2394 int
2396 {
2397  int fd;
2398 
2399  DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2400  numAllocatedDescs, fileName));
2401 
2402  /* Can we allocate another non-virtual FD? */
2403  if (!reserveAllocatedDesc())
2404  ereport(ERROR,
2405  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2406  errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2407  maxAllocatedDescs, fileName)));
2408 
2409  /* Close excess kernel FDs. */
2410  ReleaseLruFiles();
2411 
2412  fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2413 
2414  if (fd >= 0)
2415  {
2416  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2417 
2418  desc->kind = AllocateDescRawFD;
2419  desc->desc.fd = fd;
2422 
2423  return fd;
2424  }
2425 
2426  return -1; /* failure */
2427 }
2428 
2429 /*
2430  * Routines that want to initiate a pipe stream should use OpenPipeStream
2431  * rather than plain popen(). This lets fd.c deal with freeing FDs if
2432  * necessary. When done, call ClosePipeStream rather than pclose.
2433  */
2434 FILE *
2435 OpenPipeStream(const char *command, const char *mode)
2436 {
2437  FILE *file;
2438 
2439  DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2440  numAllocatedDescs, command));
2441 
2442  /* Can we allocate another non-virtual FD? */
2443  if (!reserveAllocatedDesc())
2444  ereport(ERROR,
2445  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2446  errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2447  maxAllocatedDescs, command)));
2448 
2449  /* Close excess kernel FDs. */
2450  ReleaseLruFiles();
2451 
2452 TryAgain:
2453  fflush(stdout);
2454  fflush(stderr);
2455  errno = 0;
2456  if ((file = popen(command, mode)) != NULL)
2457  {
2458  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2459 
2460  desc->kind = AllocateDescPipe;
2461  desc->desc.file = file;
2464  return desc->desc.file;
2465  }
2466 
2467  if (errno == EMFILE || errno == ENFILE)
2468  {
2469  int save_errno = errno;
2470 
2471  ereport(LOG,
2472  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2473  errmsg("out of file descriptors: %m; release and retry")));
2474  errno = 0;
2475  if (ReleaseLruFile())
2476  goto TryAgain;
2477  errno = save_errno;
2478  }
2479 
2480  return NULL;
2481 }
2482 
2483 /*
2484  * Free an AllocateDesc of any type.
2485  *
2486  * The argument *must* point into the allocatedDescs[] array.
2487  */
2488 static int
2490 {
2491  int result;
2492 
2493  /* Close the underlying object */
2494  switch (desc->kind)
2495  {
2496  case AllocateDescFile:
2497  result = fclose(desc->desc.file);
2498  break;
2499  case AllocateDescPipe:
2500  result = pclose(desc->desc.file);
2501  break;
2502  case AllocateDescDir:
2503  result = closedir(desc->desc.dir);
2504  break;
2505  case AllocateDescRawFD:
2506  result = close(desc->desc.fd);
2507  break;
2508  default:
2509  elog(ERROR, "AllocateDesc kind not recognized");
2510  result = 0; /* keep compiler quiet */
2511  break;
2512  }
2513 
2514  /* Compact storage in the allocatedDescs array */
2516  *desc = allocatedDescs[numAllocatedDescs];
2517 
2518  return result;
2519 }
2520 
2521 /*
2522  * Close a file returned by AllocateFile.
2523  *
2524  * Note we do not check fclose's return value --- it is up to the caller
2525  * to handle close errors.
2526  */
2527 int
2528 FreeFile(FILE *file)
2529 {
2530  int i;
2531 
2532  DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2533 
2534  /* Remove file from list of allocated files, if it's present */
2535  for (i = numAllocatedDescs; --i >= 0;)
2536  {
2537  AllocateDesc *desc = &allocatedDescs[i];
2538 
2539  if (desc->kind == AllocateDescFile && desc->desc.file == file)
2540  return FreeDesc(desc);
2541  }
2542 
2543  /* Only get here if someone passes us a file not in allocatedDescs */
2544  elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2545 
2546  return fclose(file);
2547 }
2548 
2549 /*
2550  * Close a file returned by OpenTransientFile.
2551  *
2552  * Note we do not check close's return value --- it is up to the caller
2553  * to handle close errors.
2554  */
2555 int
2557 {
2558  int i;
2559 
2560  DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2561 
2562  /* Remove fd from list of allocated files, if it's present */
2563  for (i = numAllocatedDescs; --i >= 0;)
2564  {
2565  AllocateDesc *desc = &allocatedDescs[i];
2566 
2567  if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2568  return FreeDesc(desc);
2569  }
2570 
2571  /* Only get here if someone passes us a file not in allocatedDescs */
2572  elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2573 
2574  return close(fd);
2575 }
2576 
2577 /*
2578  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2579  * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2580  * necessary to open the directory, and with closing it after an elog.
2581  * When done, call FreeDir rather than closedir.
2582  *
2583  * Returns NULL, with errno set, on failure. Note that failure detection
2584  * is commonly left to the following call of ReadDir or ReadDirExtended;
2585  * see the comments for ReadDir.
2586  *
2587  * Ideally this should be the *only* direct call of opendir() in the backend.
2588  */
2589 DIR *
2590 AllocateDir(const char *dirname)
2591 {
2592  DIR *dir;
2593 
2594  DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2595  numAllocatedDescs, dirname));
2596 
2597  /* Can we allocate another non-virtual FD? */
2598  if (!reserveAllocatedDesc())
2599  ereport(ERROR,
2600  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2601  errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2602  maxAllocatedDescs, dirname)));
2603 
2604  /* Close excess kernel FDs. */
2605  ReleaseLruFiles();
2606 
2607 TryAgain:
2608  if ((dir = opendir(dirname)) != NULL)
2609  {
2610  AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2611 
2612  desc->kind = AllocateDescDir;
2613  desc->desc.dir = dir;
2616  return desc->desc.dir;
2617  }
2618 
2619  if (errno == EMFILE || errno == ENFILE)
2620  {
2621  int save_errno = errno;
2622 
2623  ereport(LOG,
2624  (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2625  errmsg("out of file descriptors: %m; release and retry")));
2626  errno = 0;
2627  if (ReleaseLruFile())
2628  goto TryAgain;
2629  errno = save_errno;
2630  }
2631 
2632  return NULL;
2633 }
2634 
2635 /*
2636  * Read a directory opened with AllocateDir, ereport'ing any error.
2637  *
2638  * This is easier to use than raw readdir() since it takes care of some
2639  * otherwise rather tedious and error-prone manipulation of errno. Also,
2640  * if you are happy with a generic error message for AllocateDir failure,
2641  * you can just do
2642  *
2643  * dir = AllocateDir(path);
2644  * while ((dirent = ReadDir(dir, path)) != NULL)
2645  * process dirent;
2646  * FreeDir(dir);
2647  *
2648  * since a NULL dir parameter is taken as indicating AllocateDir failed.
2649  * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2650  * use this shortcut.)
2651  *
2652  * The pathname passed to AllocateDir must be passed to this routine too,
2653  * but it is only used for error reporting.
2654  */
2655 struct dirent *
2656 ReadDir(DIR *dir, const char *dirname)
2657 {
2658  return ReadDirExtended(dir, dirname, ERROR);
2659 }
2660 
2661 /*
2662  * Alternate version of ReadDir that allows caller to specify the elevel
2663  * for any error report (whether it's reporting an initial failure of
2664  * AllocateDir or a subsequent directory read failure).
2665  *
2666  * If elevel < ERROR, returns NULL after any error. With the normal coding
2667  * pattern, this will result in falling out of the loop immediately as
2668  * though the directory contained no (more) entries.
2669  */
2670 struct dirent *
2671 ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2672 {
2673  struct dirent *dent;
2674 
2675  /* Give a generic message for AllocateDir failure, if caller didn't */
2676  if (dir == NULL)
2677  {
2678  ereport(elevel,
2680  errmsg("could not open directory \"%s\": %m",
2681  dirname)));
2682  return NULL;
2683  }
2684 
2685  errno = 0;
2686  if ((dent = readdir(dir)) != NULL)
2687  return dent;
2688 
2689  if (errno)
2690  ereport(elevel,
2692  errmsg("could not read directory \"%s\": %m",
2693  dirname)));
2694  return NULL;
2695 }
2696 
2697 /*
2698  * Close a directory opened with AllocateDir.
2699  *
2700  * Returns closedir's return value (with errno set if it's not 0).
2701  * Note we do not check the return value --- it is up to the caller
2702  * to handle close errors if wanted.
2703  *
2704  * Does nothing if dir == NULL; we assume that directory open failure was
2705  * already reported if desired.
2706  */
2707 int
2709 {
2710  int i;
2711 
2712  /* Nothing to do if AllocateDir failed */
2713  if (dir == NULL)
2714  return 0;
2715 
2716  DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
2717 
2718  /* Remove dir from list of allocated dirs, if it's present */
2719  for (i = numAllocatedDescs; --i >= 0;)
2720  {
2721  AllocateDesc *desc = &allocatedDescs[i];
2722 
2723  if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
2724  return FreeDesc(desc);
2725  }
2726 
2727  /* Only get here if someone passes us a dir not in allocatedDescs */
2728  elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
2729 
2730  return closedir(dir);
2731 }
2732 
2733 
2734 /*
2735  * Close a pipe stream returned by OpenPipeStream.
2736  */
2737 int
2738 ClosePipeStream(FILE *file)
2739 {
2740  int i;
2741 
2742  DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
2743 
2744  /* Remove file from list of allocated files, if it's present */
2745  for (i = numAllocatedDescs; --i >= 0;)
2746  {
2747  AllocateDesc *desc = &allocatedDescs[i];
2748 
2749  if (desc->kind == AllocateDescPipe && desc->desc.file == file)
2750  return FreeDesc(desc);
2751  }
2752 
2753  /* Only get here if someone passes us a file not in allocatedDescs */
2754  elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
2755 
2756  return pclose(file);
2757 }
2758 
2759 /*
2760  * closeAllVfds
2761  *
2762  * Force all VFDs into the physically-closed state, so that the fewest
2763  * possible number of kernel file descriptors are in use. There is no
2764  * change in the logical state of the VFDs.
2765  */
2766 void
2768 {
2769  Index i;
2770 
2771  if (SizeVfdCache > 0)
2772  {
2773  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2774  for (i = 1; i < SizeVfdCache; i++)
2775  {
2776  if (!FileIsNotOpen(i))
2777  LruDelete(i);
2778  }
2779  }
2780 }
2781 
2782 
2783 /*
2784  * SetTempTablespaces
2785  *
2786  * Define a list (actually an array) of OIDs of tablespaces to use for
2787  * temporary files. This list will be used until end of transaction,
2788  * unless this function is called again before then. It is caller's
2789  * responsibility that the passed-in array has adequate lifespan (typically
2790  * it'd be allocated in TopTransactionContext).
2791  */
2792 void
2793 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
2794 {
2795  Assert(numSpaces >= 0);
2796  tempTableSpaces = tableSpaces;
2797  numTempTableSpaces = numSpaces;
2798 
2799  /*
2800  * Select a random starting point in the list. This is to minimize
2801  * conflicts between backends that are most likely sharing the same list
2802  * of temp tablespaces. Note that if we create multiple temp files in the
2803  * same transaction, we'll advance circularly through the list --- this
2804  * ensures that large temporary sort files are nicely spread across all
2805  * available tablespaces.
2806  */
2807  if (numSpaces > 1)
2808  nextTempTableSpace = random() % numSpaces;
2809  else
2810  nextTempTableSpace = 0;
2811 }
2812 
2813 /*
2814  * TempTablespacesAreSet
2815  *
2816  * Returns true if SetTempTablespaces has been called in current transaction.
2817  * (This is just so that tablespaces.c doesn't need its own per-transaction
2818  * state.)
2819  */
2820 bool
2822 {
2823  return (numTempTableSpaces >= 0);
2824 }
2825 
2826 /*
2827  * GetTempTablespaces
2828  *
2829  * Populate an array with the OIDs of the tablespaces that should be used for
2830  * temporary files. Return the number that were copied into the output array.
2831  */
2832 int
2833 GetTempTablespaces(Oid *tableSpaces, int numSpaces)
2834 {
2835  int i;
2836 
2838  for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
2839  tableSpaces[i] = tempTableSpaces[i];
2840 
2841  return i;
2842 }
2843 
2844 /*
2845  * GetNextTempTableSpace
2846  *
2847  * Select the next temp tablespace to use. A result of InvalidOid means
2848  * to use the current database's default tablespace.
2849  */
2850 Oid
2852 {
2853  if (numTempTableSpaces > 0)
2854  {
2855  /* Advance nextTempTableSpace counter with wraparound */
2857  nextTempTableSpace = 0;
2859  }
2860  return InvalidOid;
2861 }
2862 
2863 
2864 /*
2865  * AtEOSubXact_Files
2866  *
2867  * Take care of subtransaction commit/abort. At abort, we close temp files
2868  * that the subtransaction may have opened. At commit, we reassign the
2869  * files that were opened to the parent subtransaction.
2870  */
2871 void
2872 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
2873  SubTransactionId parentSubid)
2874 {
2875  Index i;
2876 
2877  for (i = 0; i < numAllocatedDescs; i++)
2878  {
2879  if (allocatedDescs[i].create_subid == mySubid)
2880  {
2881  if (isCommit)
2882  allocatedDescs[i].create_subid = parentSubid;
2883  else
2884  {
2885  /* have to recheck the item after FreeDesc (ugly) */
2886  FreeDesc(&allocatedDescs[i--]);
2887  }
2888  }
2889  }
2890 }
2891 
2892 /*
2893  * AtEOXact_Files
2894  *
2895  * This routine is called during transaction commit or abort. All still-open
2896  * per-transaction temporary file VFDs are closed, which also causes the
2897  * underlying files to be deleted (although they should've been closed already
2898  * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
2899  * closed. We also forget any transaction-local temp tablespace list.
2900  *
2901  * The isCommit flag is used only to decide whether to emit warnings about
2902  * unclosed files.
2903  */
2904 void
2905 AtEOXact_Files(bool isCommit)
2906 {
2907  CleanupTempFiles(isCommit, false);
2908  tempTableSpaces = NULL;
2909  numTempTableSpaces = -1;
2910 }
2911 
2912 /*
2913  * AtProcExit_Files
2914  *
2915  * on_proc_exit hook to clean up temp files during backend shutdown.
2916  * Here, we want to clean up *all* temp files including interXact ones.
2917  */
2918 static void
2920 {
2921  CleanupTempFiles(false, true);
2922 }
2923 
2924 /*
2925  * Close temporary files and delete their underlying files.
2926  *
2927  * isCommit: if true, this is normal transaction commit, and we don't
2928  * expect any remaining files; warn if there are some.
2929  *
2930  * isProcExit: if true, this is being called as the backend process is
2931  * exiting. If that's the case, we should remove all temporary files; if
2932  * that's not the case, we are being called for transaction commit/abort
2933  * and should only remove transaction-local temp files. In either case,
2934  * also clean up "allocated" stdio files, dirs and fds.
2935  */
2936 static void
2937 CleanupTempFiles(bool isCommit, bool isProcExit)
2938 {
2939  Index i;
2940 
2941  /*
2942  * Careful here: at proc_exit we need extra cleanup, not just
2943  * xact_temporary files.
2944  */
2945  if (isProcExit || have_xact_temporary_files)
2946  {
2947  Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
2948  for (i = 1; i < SizeVfdCache; i++)
2949  {
2950  unsigned short fdstate = VfdCache[i].fdstate;
2951 
2952  if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
2953  VfdCache[i].fileName != NULL)
2954  {
2955  /*
2956  * If we're in the process of exiting a backend process, close
2957  * all temporary files. Otherwise, only close temporary files
2958  * local to the current transaction. They should be closed by
2959  * the ResourceOwner mechanism already, so this is just a
2960  * debugging cross-check.
2961  */
2962  if (isProcExit)
2963  FileClose(i);
2964  else if (fdstate & FD_CLOSE_AT_EOXACT)
2965  {
2966  elog(WARNING,
2967  "temporary file %s not closed at end-of-transaction",
2968  VfdCache[i].fileName);
2969  FileClose(i);
2970  }
2971  }
2972  }
2973 
2974  have_xact_temporary_files = false;
2975  }
2976 
2977  /* Complain if any allocated files remain open at commit. */
2978  if (isCommit && numAllocatedDescs > 0)
2979  elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
2981 
2982  /* Clean up "allocated" stdio files, dirs and fds. */
2983  while (numAllocatedDescs > 0)
2984  FreeDesc(&allocatedDescs[0]);
2985 }
2986 
2987 
2988 /*
2989  * Remove temporary and temporary relation files left over from a prior
2990  * postmaster session
2991  *
2992  * This should be called during postmaster startup. It will forcibly
2993  * remove any leftover files created by OpenTemporaryFile and any leftover
2994  * temporary relation files created by mdcreate.
2995  *
2996  * NOTE: we could, but don't, call this during a post-backend-crash restart
2997  * cycle. The argument for not doing it is that someone might want to examine
2998  * the temp files for debugging purposes. This does however mean that
2999  * OpenTemporaryFile had better allow for collision with an existing temp
3000  * file name.
3001  *
3002  * NOTE: this function and its subroutines generally report syscall failures
3003  * with ereport(LOG) and keep going. Removing temp files is not so critical
3004  * that we should fail to start the database when we can't do it.
3005  */
3006 void
3008 {
3009  char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
3010  DIR *spc_dir;
3011  struct dirent *spc_de;
3012 
3013  /*
3014  * First process temp files in pg_default ($PGDATA/base)
3015  */
3016  snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3017  RemovePgTempFilesInDir(temp_path, true, false);
3018  RemovePgTempRelationFiles("base");
3019 
3020  /*
3021  * Cycle through temp directories for all non-default tablespaces.
3022  */
3023  spc_dir = AllocateDir("pg_tblspc");
3024 
3025  while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
3026  {
3027  if (strcmp(spc_de->d_name, ".") == 0 ||
3028  strcmp(spc_de->d_name, "..") == 0)
3029  continue;
3030 
3031  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
3033  RemovePgTempFilesInDir(temp_path, true, false);
3034 
3035  snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
3037  RemovePgTempRelationFiles(temp_path);
3038  }
3039 
3040  FreeDir(spc_dir);
3041 
3042  /*
3043  * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3044  * DataDir as well.
3045  */
3046 #ifdef EXEC_BACKEND
3048 #endif
3049 }
3050 
3051 /*
3052  * Process one pgsql_tmp directory for RemovePgTempFiles.
3053  *
3054  * If missing_ok is true, it's all right for the named directory to not exist.
3055  * Any other problem results in a LOG message. (missing_ok should be true at
3056  * the top level, since pgsql_tmp directories are not created until needed.)
3057  *
3058  * At the top level, this should be called with unlink_all = false, so that
3059  * only files matching the temporary name prefix will be unlinked. When
3060  * recursing it will be called with unlink_all = true to unlink everything
3061  * under a top-level temporary directory.
3062  *
3063  * (These two flags could be replaced by one, but it seems clearer to keep
3064  * them separate.)
3065  */
3066 static void
3067 RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3068 {
3069  DIR *temp_dir;
3070  struct dirent *temp_de;
3071  char rm_path[MAXPGPATH * 2];
3072 
3073  temp_dir = AllocateDir(tmpdirname);
3074 
3075  if (temp_dir == NULL && errno == ENOENT && missing_ok)
3076  return;
3077 
3078  while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
3079  {
3080  if (strcmp(temp_de->d_name, ".") == 0 ||
3081  strcmp(temp_de->d_name, "..") == 0)
3082  continue;
3083 
3084  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3085  tmpdirname, temp_de->d_name);
3086 
3087  if (unlink_all ||
3088  strncmp(temp_de->d_name,
3090  strlen(PG_TEMP_FILE_PREFIX)) == 0)
3091  {
3092  struct stat statbuf;
3093 
3094  if (lstat(rm_path, &statbuf) < 0)
3095  {
3096  ereport(LOG,
3098  errmsg("could not stat file \"%s\": %m", rm_path)));
3099  continue;
3100  }
3101 
3102  if (S_ISDIR(statbuf.st_mode))
3103  {
3104  /* recursively remove contents, then directory itself */
3105  RemovePgTempFilesInDir(rm_path, false, true);
3106 
3107  if (rmdir(rm_path) < 0)
3108  ereport(LOG,
3110  errmsg("could not remove directory \"%s\": %m",
3111  rm_path)));
3112  }
3113  else
3114  {
3115  if (unlink(rm_path) < 0)
3116  ereport(LOG,
3118  errmsg("could not remove file \"%s\": %m",
3119  rm_path)));
3120  }
3121  }
3122  else
3123  ereport(LOG,
3124  (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3125  rm_path)));
3126  }
3127 
3128  FreeDir(temp_dir);
3129 }
3130 
3131 /* Process one tablespace directory, look for per-DB subdirectories */
3132 static void
3133 RemovePgTempRelationFiles(const char *tsdirname)
3134 {
3135  DIR *ts_dir;
3136  struct dirent *de;
3137  char dbspace_path[MAXPGPATH * 2];
3138 
3139  ts_dir = AllocateDir(tsdirname);
3140 
3141  while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3142  {
3143  /*
3144  * We're only interested in the per-database directories, which have
3145  * numeric names. Note that this code will also (properly) ignore "."
3146  * and "..".
3147  */
3148  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3149  continue;
3150 
3151  snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3152  tsdirname, de->d_name);
3153  RemovePgTempRelationFilesInDbspace(dbspace_path);
3154  }
3155 
3156  FreeDir(ts_dir);
3157 }
3158 
3159 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
3160 static void
3161 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
3162 {
3163  DIR *dbspace_dir;
3164  struct dirent *de;
3165  char rm_path[MAXPGPATH * 2];
3166 
3167  dbspace_dir = AllocateDir(dbspacedirname);
3168 
3169  while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
3170  {
3171  if (!looks_like_temp_rel_name(de->d_name))
3172  continue;
3173 
3174  snprintf(rm_path, sizeof(rm_path), "%s/%s",
3175  dbspacedirname, de->d_name);
3176 
3177  if (unlink(rm_path) < 0)
3178  ereport(LOG,
3180  errmsg("could not remove file \"%s\": %m",
3181  rm_path)));
3182  }
3183 
3184  FreeDir(dbspace_dir);
3185 }
3186 
/*
 * Does "name" look like a temporary relation file name?
 *
 * Accepted forms are t<digits>_<digits> or t<digits>_<digits>_<forkname>,
 * either of which may be followed by .<digits> (a segment suffix).
 */
bool
looks_like_temp_rel_name(const char *name)
{
	int			pos;
	int			savepos;

	/* Must start with "t". */
	if (name[0] != 't')
		return false;

	/* Followed by a non-empty string of digits and then an underscore. */
	for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (pos == 1 || name[pos] != '_')
		return false;

	/* Followed by another nonempty string of digits. */
	for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
		;
	if (savepos == pos)
		return false;

	/* We might have _forkname or .segment or both. */
	if (name[pos] == '_')
	{
		int			forkchar = forkname_chars(&name[pos + 1], NULL);

		if (forkchar <= 0)
			return false;
		/* skip the underscore plus the fork name itself */
		pos += forkchar + 1;
	}
	if (name[pos] == '.')
	{
		int			segchar;

		/* segchar counts the '.' too, so 1 means "no digits followed" */
		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
			;
		if (segchar <= 1)
			return false;
		pos += segchar;
	}

	/* Now we should be at the end. */
	if (name[pos] != '\0')
		return false;
	return true;
}
3235 
3236 
3237 /*
3238  * Issue fsync recursively on PGDATA and all its contents.
3239  *
3240  * We fsync regular files and directories wherever they are, but we
3241  * follow symlinks only for pg_wal and immediately under pg_tblspc.
3242  * Other symlinks are presumed to point at files we're not responsible
3243  * for fsyncing, and might not have privileges to write at all.
3244  *
3245  * Errors are logged but not considered fatal; that's because this is used
3246  * only during database startup, to deal with the possibility that there are
3247  * issued-but-unsynced writes pending against the data directory. We want to
3248  * ensure that such writes reach disk before anything that's done in the new
3249  * run. However, aborting on error would result in failure to start for
3250  * harmless cases such as read-only files in the data directory, and that's
3251  * not good either.
3252  *
3253  * Note we assume we're chdir'd into PGDATA to begin with.
3254  */
3255 void
3257 {
3258  bool xlog_is_symlink;
3259 
3260  /* We can skip this whole thing if fsync is disabled. */
3261  if (!enableFsync)
3262  return;
3263 
3264  /*
3265  * If pg_wal is a symlink, we'll need to recurse into it separately,
3266  * because the first walkdir below will ignore it.
3267  */
3268  xlog_is_symlink = false;
3269 
3270 #ifndef WIN32
3271  {
3272  struct stat st;
3273 
3274  if (lstat("pg_wal", &st) < 0)
3275  ereport(LOG,
3277  errmsg("could not stat file \"%s\": %m",
3278  "pg_wal")));
3279  else if (S_ISLNK(st.st_mode))
3280  xlog_is_symlink = true;
3281  }
3282 #else
3283  if (pgwin32_is_junction("pg_wal"))
3284  xlog_is_symlink = true;
3285 #endif
3286 
3287  /*
3288  * If possible, hint to the kernel that we're soon going to fsync the data
3289  * directory and its contents. Errors in this step are even less
3290  * interesting than normal, so log them only at DEBUG1.
3291  */
3292 #ifdef PG_FLUSH_DATA_WORKS
3293  walkdir(".", pre_sync_fname, false, DEBUG1);
3294  if (xlog_is_symlink)
3295  walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3296  walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
3297 #endif
3298 
3299  /*
3300  * Now we do the fsync()s in the same order.
3301  *
3302  * The main call ignores symlinks, so in addition to specially processing
3303  * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3304  * process_symlinks = true. Note that if there are any plain directories
3305  * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3306  * so we don't worry about optimizing it.
3307  */
3308  walkdir(".", datadir_fsync_fname, false, LOG);
3309  if (xlog_is_symlink)
3310  walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3311  walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
3312 }
3313 
3314 /*
3315  * walkdir: recursively walk a directory, applying the action to each
3316  * regular file and directory (including the named directory itself).
3317  *
3318  * If process_symlinks is true, the action and recursion are also applied
3319  * to regular files and directories that are pointed to by symlinks in the
3320  * given directory; otherwise symlinks are ignored. Symlinks are always
3321  * ignored in subdirectories, ie we intentionally don't pass down the
3322  * process_symlinks flag to recursive calls.
3323  *
3324  * Errors are reported at level elevel, which might be ERROR or less.
3325  *
3326  * See also walkdir in initdb.c, which is a frontend version of this logic.
3327  */
3328 static void
3329 walkdir(const char *path,
3330  void (*action) (const char *fname, bool isdir, int elevel),
3331  bool process_symlinks,
3332  int elevel)
3333 {
3334  DIR *dir;
3335  struct dirent *de;
3336 
3337  dir = AllocateDir(path);
3338 
3339  while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3340  {
3341  char subpath[MAXPGPATH * 2];
3342  struct stat fst;
3343  int sret;
3344 
3346 
3347  if (strcmp(de->d_name, ".") == 0 ||
3348  strcmp(de->d_name, "..") == 0)
3349  continue;
3350 
3351  snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3352 
3353  if (process_symlinks)
3354  sret = stat(subpath, &fst);
3355  else
3356  sret = lstat(subpath, &fst);
3357 
3358  if (sret < 0)
3359  {
3360  ereport(elevel,
3362  errmsg("could not stat file \"%s\": %m", subpath)));
3363  continue;
3364  }
3365 
3366  if (S_ISREG(fst.st_mode))
3367  (*action) (subpath, false, elevel);
3368  else if (S_ISDIR(fst.st_mode))
3369  walkdir(subpath, action, false, elevel);
3370  }
3371 
3372  FreeDir(dir); /* we ignore any error here */
3373 
3374  /*
3375  * It's important to fsync the destination directory itself as individual
3376  * file fsyncs don't guarantee that the directory entry for the file is
3377  * synced. However, skip this if AllocateDir failed; the action function
3378  * might not be robust against that.
3379  */
3380  if (dir)
3381  (*action) (path, true, elevel);
3382 }
3383 
3384 
/*
 * Hint to the OS that it should get ready to fsync() this file.
 *
 * Ignores errors trying to open unreadable files, and logs other errors at a
 * caller-specified level.
 */
#ifdef PG_FLUSH_DATA_WORKS

static void
pre_sync_fname(const char *fname, bool isdir, int elevel)
{
	int			fd;

	/* Don't try to flush directories, it'll likely just fail */
	if (isdir)
		return;

	fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);

	if (fd < 0)
	{
		/* Unreadable files are skipped silently; anything else is reported. */
		if (errno == EACCES)
			return;
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", fname)));
		return;
	}

	/*
	 * pg_flush_data() ignores errors, which is ok because this is only a
	 * hint.
	 */
	pg_flush_data(fd, 0, 0);

	(void) CloseTransientFile(fd);
}

#endif							/* PG_FLUSH_DATA_WORKS */
3424 
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * Permission failures on unreadable files should be ignored silently, so
	 * ask fsync_fname_ext() to do that (ignore_perm = true).  Its result is
	 * deliberately discarded: this is a walkdir action and has no way to
	 * return a status.
	 */
	(void) fsync_fname_ext(fname, isdir, true, elevel);
}
3434 
/*
 * Remove the named file or directory if it exists.
 *
 * For a directory, a failing rmdir() is reported at elevel unless the
 * directory was already gone (ENOENT).  For a file, removal is delegated to
 * PathNameDeleteTemporaryFile() with error_on_failure = false.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not rmdir directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}
3451 
3452 /*
3453  * fsync_fname_ext -- Try to fsync a file or directory
3454  *
3455  * If ignore_perm is true, ignore errors upon trying to open unreadable
3456  * files. Logs other errors at a caller-specified level.
3457  *
3458  * Returns 0 if the operation succeeded, -1 otherwise.
3459  */
3460 static int
3461 fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3462 {
3463  int fd;
3464  int flags;
3465  int returncode;
3466 
3467  /*
3468  * Some OSs require directories to be opened read-only whereas other
3469  * systems don't allow us to fsync files opened read-only; so we need both
3470  * cases here. Using O_RDWR will cause us to fail to fsync files that are
3471  * not writable by our userid, but we assume that's OK.
3472  */
3473  flags = PG_BINARY;
3474  if (!isdir)
3475  flags |= O_RDWR;
3476  else
3477  flags |= O_RDONLY;
3478 
3479  fd = OpenTransientFile(fname, flags);
3480 
3481  /*
3482  * Some OSs don't allow us to open directories at all (Windows returns
3483  * EACCES), just ignore the error in that case. If desired also silently
3484  * ignoring errors about unreadable files. Log others.
3485  */
3486  if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3487  return 0;
3488  else if (fd < 0 && ignore_perm && errno == EACCES)
3489  return 0;
3490  else if (fd < 0)
3491  {
3492  ereport(elevel,
3494  errmsg("could not open file \"%s\": %m", fname)));
3495  return -1;
3496  }
3497 
3498  returncode = pg_fsync(fd);
3499 
3500  /*
3501  * Some OSes don't allow us to fsync directories at all, so we can ignore
3502  * those errors. Anything else needs to be logged.
3503  */
3504  if (returncode != 0 && !(isdir && errno == EBADF))
3505  {
3506  int save_errno;
3507 
3508  /* close file upon error, might not be in transaction context */
3509  save_errno = errno;
3510  (void) CloseTransientFile(fd);
3511  errno = save_errno;
3512 
3513  ereport(elevel,
3515  errmsg("could not fsync file \"%s\": %m", fname)));
3516  return -1;
3517  }
3518 
3519  (void) CloseTransientFile(fd);
3520 
3521  return 0;
3522 }
3523 
3524 /*
3525  * fsync_parent_path -- fsync the parent path of a file or directory
3526  *
3527  * This is aimed at making file operations persistent on disk in case of
3528  * an OS crash or power failure.
3529  */
3530 static int
3531 fsync_parent_path(const char *fname, int elevel)
3532 {
3533  char parentpath[MAXPGPATH];
3534 
3535  strlcpy(parentpath, fname, MAXPGPATH);
3536  get_parent_directory(parentpath);
3537 
3538  /*
3539  * get_parent_directory() returns an empty string if the input argument is
3540  * just a file name (see comments in path.c), so handle that as being the
3541  * current directory.
3542  */
3543  if (strlen(parentpath) == 0)
3544  strlcpy(parentpath, ".", MAXPGPATH);
3545 
3546  if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3547  return -1;
3548 
3549  return 0;
3550 }
3551 
3552 /*
3553  * Create a PostgreSQL data sub-directory
3554  *
3555  * The data directory itself, along with most other directories, are created at
3556  * initdb-time, but we do have some occations where we create directories from
3557  * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3558  * make sure that those directories are created consistently. Today, that means
3559  * making sure that the created directory has the correct permissions, which is
3560  * what pg_dir_create_mode tracks for us.
3561  *
3562  * Note that we also set the umask() based on what we understand the correct
3563  * permissions to be (see file_perm.c).
3564  *
3565  * For permissions other than the default mkdir() can be used directly, but be
3566  * sure to consider carefully such cases -- a directory with incorrect
3567  * permissions in a PostgreSQL data directory could cause backups and other
3568  * processes to fail.
3569  */
3570 int
3571 MakePGDirectory(const char *directoryName)
3572 {
3573  return mkdir(directoryName, pg_dir_create_mode);
3574 }
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:1364
File lruLessRecently
Definition: fd.c:191
void closeAllVfds(void)
Definition: fd.c:2767
File nextFree
Definition: fd.c:189
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition: fd.c:805
int pg_file_create_mode
Definition: file_perm.c:20
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1696
#define MAP_FAILED
Definition: mem.h:45
#define DEBUG1
Definition: elog.h:25
int MyProcPid
Definition: globals.c:42
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition: fd.c:1628
#define NUM_RESERVED_FDS
Definition: fd.c:119
static AllocateDesc * allocatedDescs
Definition: fd.c:253
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:1351
int pg_fdatasync(int fd)
Definition: fd.c:393
static void error(void)
Definition: sql-dyntest.c:147
#define SYNC_METHOD_FSYNC_WRITETHROUGH
Definition: xlog.h:28
AllocateDescKind
Definition: fd.c:231
DIR * dir
Definition: fd.c:246
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition: fd.c:1571
static void AtProcExit_Files(int code, Datum arg)
Definition: fd.c:2919
#define write(a, b, c)
Definition: win32.h:14
static Size SizeVfdCache
Definition: fd.c:206
#define FD_TEMP_FILE_LIMIT
Definition: fd.c:182
void on_proc_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:303
#define DO_DB(A)
Definition: fd.c:159
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2833
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition: fd.c:3329
long random(void)
Definition: random.c:22
ResourceOwner CurrentResourceOwner
Definition: resowner.c:140
int pg_fsync_writethrough(int fd)
Definition: fd.c:370
int forkname_chars(const char *str, ForkNumber *fork)
Definition: relpath.c:78
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition: fd.c:2671
int max_safe_fds
Definition: fd.c:146
#define Min(x, y)
Definition: c.h:857
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:575
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:2395
#define FD_DELETE_AT_CLOSE
Definition: fd.c:180
int log_temp_files
Definition: guc.c:457
mode_t FileGetRawMode(File file)
Definition: fd.c:2252
void _dosmaperr(unsigned long)
Definition: win32error.c:171
static Vfd * VfdCache
Definition: fd.c:205
static void Delete(File file)
Definition: fd.c:1002
int closedir(DIR *)
Definition: dirent.c:111
static int numTempTableSpaces
Definition: fd.c:266
int errcode(int sqlerrcode)
Definition: elog.c:575
#define MemSet(start, val, len)
Definition: c.h:908
void PathNameDeleteTemporaryDir(const char *dirname)
Definition: fd.c:1460
int pg_fsync_no_writethrough(int fd)
Definition: fd.c:358
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition: fd.c:3161
int snprintf(char *str, size_t count, const char *fmt,...) pg_attribute_printf(3
#define PG_TEMP_FILE_PREFIX
Definition: fd.h:144
File PathNameOpenTemporaryFile(const char *path)
Definition: fd.c:1666
void pgstat_report_tempfile(size_t filesize)
Definition: pgstat.c:1525
static bool reserveAllocatedDesc(void)
Definition: fd.c:2263
uint32 SubTransactionId
Definition: c.h:478
void TempTablespacePath(char *path, Oid tablespace)
Definition: fd.c:1546
#define LOG
Definition: elog.h:26
unsigned int Oid
Definition: postgres_ext.h:31
#define FilePosIsUnknown(pos)
Definition: fd.c:177
AllocateDescKind kind
Definition: fd.c:241
char * FilePathName(File file)
Definition: fd.c:2216
Definition: dirent.h:9
#define OidIsValid(objectId)
Definition: c.h:605
#define PG_BINARY
Definition: c.h:1080
static char * basedir
Definition: pg_basebackup.c:80
void AtEOXact_Files(bool isCommit)
Definition: fd.c:2905
Oid MyDatabaseTableSpace
Definition: globals.c:88
int ClosePipeStream(FILE *file)
Definition: fd.c:2738
#define malloc(a)
Definition: header.h:50
static void LruDelete(File file)
Definition: fd.c:1021
void pg_usleep(long microsec)
Definition: signal.c:53
bool TempTablespacesAreSet(void)
Definition: fd.c:2821
#define fsync(fd)
Definition: win32_port.h:63
static int FreeDesc(AllocateDesc *desc)
Definition: fd.c:2489
void pfree(void *pointer)
Definition: mcxt.c:1031
mode_t fileMode
Definition: fd.c:197
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition: fd.c:3133
static bool ReleaseLruFile(void)
Definition: fd.c:1158
Definition: dirent.c:25
#define ERROR
Definition: elog.h:43
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2386
static int LruInsert(File file)
Definition: fd.c:1084
#define FATAL
Definition: elog.h:52
static bool have_xact_temporary_files
Definition: fd.c:217
#define MAXPGPATH
DIR * opendir(const char *)
Definition: dirent.c:33
int FileSync(File file, uint32 wait_event_info)
Definition: fd.c:2072
#define DEBUG2
Definition: elog.h:24
#define TABLESPACE_VERSION_DIRECTORY
Definition: relpath.h:26
char * fileName
Definition: fd.c:194
static char * buf
Definition: pg_test_fsync.c:67
Oid GetNextTempTableSpace(void)
Definition: fd.c:2851
void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition: resowner.c:1199
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3436
int errdetail(const char *fmt,...)
Definition: elog.c:873
char * tablespace
Definition: pgbench.c:156
int errcode_for_file_access(void)
Definition: elog.c:598
void get_parent_directory(char *path)
Definition: path.c:854
FILE * AllocateFile(const char *name, const char *mode)
Definition: fd.c:2336
static int nfile
Definition: fd.c:211
unsigned int uint32
Definition: c.h:325
void SyncDataDirectory(void)
Definition: fd.c:3256
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2590
static int nextTempTableSpace
Definition: fd.c:267
int FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
Definition: fd.c:1951
static void pgstat_report_wait_end(void)
Definition: pgstat.h:1260
int max_files_per_process
Definition: fd.c:133
static File AllocateVfd(void)
Definition: fd.c:1190
FILE * OpenPipeStream(const char *command, const char *mode)
Definition: fd.c:2435
off_t seekPos
Definition: fd.c:192
unsigned short fdstate
Definition: fd.c:187
Definition: fd.c:184
off_t fileSize
Definition: fd.c:193
int fd
Definition: fd.c:186
#define ereport(elevel, rest)
Definition: elog.h:122
int FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
Definition: fd.c:1887
int link(const char *fromname, const char *toname)
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition: fd.c:2793
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:601
static void Insert(File file)
Definition: fd.c:1062
ResourceOwner resowner
Definition: fd.c:188
#define S_ISREG(m)
Definition: win32_port.h:310
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition: fd.c:3426
int CloseTransientFile(int fd)
Definition: fd.c:2556
static void ReportTemporaryFileUsage(const char *path, off_t size)
Definition: fd.c:1304
static void ReleaseLruFiles(void)
Definition: fd.c:1180
#define WARNING
Definition: elog.h:40
#define stat(a, b)
Definition: win32_port.h:266
#define FileIsNotOpen(file)
Definition: fd.c:168
int pg_dir_create_mode
Definition: file_perm.c:19
static int elevel
Definition: vacuumlazy.c:144
struct vfd Vfd
uintptr_t Datum
Definition: postgres.h:367
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition: fd.c:2872
unsigned int Index
Definition: c.h:442
void pg_flush_data(int fd, off_t offset, off_t nbytes)
Definition: fd.c:415
#define FileIsValid(file)
Definition: fd.c:165
FILE * file
Definition: fd.c:245
#define InvalidOid
Definition: postgres_ext.h:36
#define VFD_CLOSED
Definition: fd.c:163
static uint64 temporary_files_size
Definition: fd.c:225
int MakePGDirectory(const char *directoryName)
Definition: fd.c:3571
#define free(a)
Definition: header.h:65
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static void RegisterTemporaryFile(File file)
Definition: fd.c:1323
static void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition: fd.c:3067
#define PG_TEMP_FILES_DIR
Definition: fd.h:143
void FileClose(File file)
Definition: fd.c:1742
int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info)
Definition: fd.c:1832
static int FileAccess(File file)
Definition: fd.c:1268
#define Assert(condition)
Definition: c.h:699
SubTransactionId GetCurrentSubTransactionId(void)
Definition: xact.c:641
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2656
File lruMoreRecently
Definition: fd.c:190
void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
Definition: fd.c:1860
void RemovePgTempFiles(void)
Definition: fd.c:3007
SubTransactionId create_subid
Definition: fd.c:242
WalTimeSample buffer[LAG_TRACKER_BUFFER_SIZE]
Definition: walsender.c:215
File OpenTemporaryFile(bool interXact)
Definition: fd.c:1493
int durable_link_or_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:720
size_t Size
Definition: c.h:433
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: pgstat.h:1236
static const char * directory
Definition: zic.c:575
int sync_method
Definition: xlog.c:103
struct dirent * readdir(DIR *)
Definition: dirent.c:77
#define FD_MINFREE
Definition: fd.c:125
bool looks_like_temp_rel_name(const char *name)
Definition: fd.c:3189
#define realloc(a, b)
Definition: header.h:60
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1044
#define INT64_FORMAT
Definition: c.h:367
const char * name
Definition: encode.c:521
static long tempFileCounter
Definition: fd.c:259
int fd
Definition: fd.c:247
#define S_ISDIR(m)
Definition: win32_port.h:307
#define lstat(path, sb)
Definition: win32_port.h:255
int durable_unlink(const char *fname, int elevel)
Definition: fd.c:684
int BasicOpenFile(const char *fileName, int fileFlags)
Definition: fd.c:932
int FreeFile(FILE *file)
Definition: fd.c:2528
void set_max_safe_fds(void)
Definition: fd.c:889
bool enableFsync
Definition: globals.c:120
static Oid * tempTableSpaces
Definition: fd.c:265
void * palloc(Size size)
Definition: mcxt.c:924
int errmsg(const char *fmt,...)
Definition: elog.c:797
int FileGetRawFlags(File file)
Definition: fd.c:2242
void ResourceOwnerEnlargeFiles(ResourceOwner owner)
Definition: resowner.c:1188
static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition: fd.c:3461
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition: fd.c:954
union AllocateDesc::@27 desc
int i
#define FD_CLOSE_AT_EOXACT
Definition: fd.c:181
void * arg
int FileGetRawDesc(File file)
Definition: fd.c:2232
static void FreeVfd(File file)
Definition: fd.c:1248
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:98
int pg_fsync(int fd)
Definition: fd.c:341
char d_name[MAX_PATH]
Definition: dirent.h:14
#define elog
Definition: elog.h:219
#define mkdir(a, b)
Definition: win32_port.h:58
#define close(a)
Definition: win32.h:12
#define EINTR
Definition: win32_port.h:334
int fileFlags
Definition: fd.c:196
off_t FileSeek(File file, off_t offset, int whence)
Definition: fd.c:2093
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition: fd.c:1429
void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition: resowner.c:1208
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2181
#define FileUnknownPos
Definition: fd.c:176
static int maxAllocatedDescs
Definition: fd.c:252
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition: fd.c:2937
static int fsync_parent_path(const char *fname, int elevel)
Definition: fd.c:3531
int File
Definition: fd.h:49
#define read(a, b, c)
Definition: win32.h:13
int FreeDir(DIR *dir)
Definition: fd.c:2708
int temp_file_limit
Definition: guc.c:460
Datum subpath(PG_FUNCTION_ARGS)
Definition: ltree_op.c:234
void InitFileAccess(void)
Definition: fd.c:772
static int numAllocatedDescs
Definition: fd.c:251
bool pgwin32_is_junction(const char *path)
#define ftruncate(a, b)
Definition: win32_port.h:60