PostgreSQL Source Code git master
Loading...
Searching...
No Matches
fd.c File Reference
#include "postgres.h"
#include <dirent.h>
#include <sys/file.h>
#include <sys/param.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <limits.h>
#include <unistd.h>
#include <fcntl.h>
#include "access/xact.h"
#include "access/xlog.h"
#include "catalog/pg_tablespace.h"
#include "common/file_perm.h"
#include "common/file_utils.h"
#include "common/pg_prng.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/startup.h"
#include "storage/aio.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "utils/guc.h"
#include "utils/guc_hooks.h"
#include "utils/resowner.h"
#include "utils/varlena.h"
#include "utils/wait_event.h"
Include dependency graph for fd.c:

Go to the source code of this file.

Data Structures

struct  vfd
 
struct  AllocateDesc
 

Macros

#define NUM_RESERVED_FDS   10
 
#define FD_MINFREE   48
 
#define DO_DB(A)    ((void) 0)
 
#define VFD_CLOSED   (-1)
 
#define FileIsValid(file)    ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
 
#define FileIsNotOpen(file)   (VfdCache[file].fd == VFD_CLOSED)
 
#define FD_DELETE_AT_CLOSE   (1 << 0) /* T = delete when closed */
 
#define FD_CLOSE_AT_EOXACT   (1 << 1) /* T = close at eoXact */
 
#define FD_TEMP_FILE_LIMIT   (1 << 2) /* T = respect temp_file_limit */
 

Typedefs

typedef struct vfd Vfd
 

Enumerations

enum  AllocateDescKind { AllocateDescFile , AllocateDescPipe , AllocateDescDir , AllocateDescRawFD }
 

Functions

static void Delete (File file)
 
static void LruDelete (File file)
 
static void Insert (File file)
 
static int LruInsert (File file)
 
static bool ReleaseLruFile (void)
 
static void ReleaseLruFiles (void)
 
static File AllocateVfd (void)
 
static void FreeVfd (File file)
 
static int FileAccess (File file)
 
static File OpenTemporaryFileInTablespace (Oid tblspcOid, bool rejectError)
 
static bool reserveAllocatedDesc (void)
 
static int FreeDesc (AllocateDesc *desc)
 
static void BeforeShmemExit_Files (int code, Datum arg)
 
static void CleanupTempFiles (bool isCommit, bool isProcExit)
 
static void RemovePgTempRelationFiles (const char *tsdirname)
 
static void RemovePgTempRelationFilesInDbspace (const char *dbspacedirname)
 
static void walkdir (const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
 
static void datadir_fsync_fname (const char *fname, bool isdir, int elevel)
 
static void unlink_if_exists_fname (const char *fname, bool isdir, int elevel)
 
static int fsync_parent_path (const char *fname, int elevel)
 
static void ResOwnerReleaseFile (Datum res)
 
static char * ResOwnerPrintFile (Datum res)
 
static void ResourceOwnerRememberFile (ResourceOwner owner, File file)
 
static void ResourceOwnerForgetFile (ResourceOwner owner, File file)
 
int pg_fsync (int fd)
 
int pg_fsync_no_writethrough (int fd)
 
int pg_fsync_writethrough (int fd)
 
int pg_fdatasync (int fd)
 
bool pg_file_exists (const char *name)
 
void pg_flush_data (int fd, pgoff_t offset, pgoff_t nbytes)
 
static int pg_ftruncate (int fd, pgoff_t length)
 
int pg_truncate (const char *path, pgoff_t length)
 
void fsync_fname (const char *fname, bool isdir)
 
int durable_rename (const char *oldfile, const char *newfile, int elevel)
 
int durable_unlink (const char *fname, int elevel)
 
void InitFileAccess (void)
 
void InitTemporaryFileAccess (void)
 
static void count_usable_fds (int max_to_probe, int *usable_fds, int *already_open)
 
void set_max_safe_fds (void)
 
int BasicOpenFile (const char *fileName, int fileFlags)
 
int BasicOpenFilePerm (const char *fileName, int fileFlags, mode_t fileMode)
 
bool AcquireExternalFD (void)
 
void ReserveExternalFD (void)
 
void ReleaseExternalFD (void)
 
static void ReportTemporaryFileUsage (const char *path, pgoff_t size)
 
static void RegisterTemporaryFile (File file)
 
File PathNameOpenFile (const char *fileName, int fileFlags)
 
File PathNameOpenFilePerm (const char *fileName, int fileFlags, mode_t fileMode)
 
void PathNameCreateTemporaryDir (const char *basedir, const char *directory)
 
void PathNameDeleteTemporaryDir (const char *dirname)
 
File OpenTemporaryFile (bool interXact)
 
void TempTablespacePath (char *path, Oid tablespace)
 
File PathNameCreateTemporaryFile (const char *path, bool error_on_failure)
 
File PathNameOpenTemporaryFile (const char *path, int mode)
 
bool PathNameDeleteTemporaryFile (const char *path, bool error_on_failure)
 
void FileClose (File file)
 
int FilePrefetch (File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
 
void FileWriteback (File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
 
ssize_t FileReadV (File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
 
int FileStartReadV (PgAioHandle *ioh, File file, int iovcnt, pgoff_t offset, uint32 wait_event_info)
 
ssize_t FileWriteV (File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
 
int FileSync (File file, uint32 wait_event_info)
 
int FileZero (File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
 
int FileFallocate (File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
 
pgoff_t FileSize (File file)
 
int FileTruncate (File file, pgoff_t offset, uint32 wait_event_info)
 
char * FilePathName (File file)
 
int FileGetRawDesc (File file)
 
int FileGetRawFlags (File file)
 
mode_t FileGetRawMode (File file)
 
FILE * AllocateFile (const char *name, const char *mode)
 
int OpenTransientFile (const char *fileName, int fileFlags)
 
int OpenTransientFilePerm (const char *fileName, int fileFlags, mode_t fileMode)
 
FILE * OpenPipeStream (const char *command, const char *mode)
 
int FreeFile (FILE *file)
 
int CloseTransientFile (int fd)
 
DIR * AllocateDir (const char *dirname)
 
struct dirent * ReadDir (DIR *dir, const char *dirname)
 
struct dirent * ReadDirExtended (DIR *dir, const char *dirname, int elevel)
 
int FreeDir (DIR *dir)
 
int ClosePipeStream (FILE *file)
 
void closeAllVfds (void)
 
void SetTempTablespaces (Oid *tableSpaces, int numSpaces)
 
bool TempTablespacesAreSet (void)
 
int GetTempTablespaces (Oid *tableSpaces, int numSpaces)
 
Oid GetNextTempTableSpace (void)
 
void AtEOSubXact_Files (bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
 
void AtEOXact_Files (bool isCommit)
 
void RemovePgTempFiles (void)
 
void RemovePgTempFilesInDir (const char *tmpdirname, bool missing_ok, bool unlink_all)
 
bool looks_like_temp_rel_name (const char *name)
 
void SyncDataDirectory (void)
 
int fsync_fname_ext (const char *fname, bool isdir, bool ignore_perm, int elevel)
 
int MakePGDirectory (const char *directoryName)
 
int data_sync_elevel (int elevel)
 
bool check_debug_io_direct (char **newval, void **extra, GucSource source)
 
void assign_debug_io_direct (const char *newval, void *extra)
 

Variables

int max_files_per_process = 1000
 
int max_safe_fds = FD_MINFREE
 
bool data_sync_retry = false
 
int recovery_init_sync_method = DATA_DIR_SYNC_METHOD_FSYNC
 
int file_extend_method = DEFAULT_FILE_EXTEND_METHOD
 
int io_direct_flags
 
static Vfd * VfdCache
 
static Size SizeVfdCache = 0
 
static int nfile = 0
 
static bool have_xact_temporary_files = false
 
static uint64 temporary_files_size = 0
 
static int numAllocatedDescs = 0
 
static int maxAllocatedDescs = 0
 
static AllocateDesc * allocatedDescs = NULL
 
static int numExternalFDs = 0
 
static long tempFileCounter = 0
 
static Oid * tempTableSpaces = NULL
 
static int numTempTableSpaces = -1
 
static int nextTempTableSpace = 0
 
static const ResourceOwnerDesc file_resowner_desc
 

Macro Definition Documentation

◆ DO_DB

#define DO_DB (   A)     ((void) 0)

Definition at line 184 of file fd.c.

201{
202 int fd; /* current FD, or VFD_CLOSED if none */
203 unsigned short fdstate; /* bitflags for VFD's state */
204 ResourceOwner resowner; /* owner, for automatic cleanup */
205 File nextFree; /* link to next free VFD, if in freelist */
206 File lruMoreRecently; /* doubly linked recency-of-use list */
207 File lruLessRecently;
208 pgoff_t fileSize; /* current size of file (0 if not temporary) */
209 char *fileName; /* name of file, or NULL for unused VFD */
210 /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
211 int fileFlags; /* open(2) flags for (re)opening the file */
212 mode_t fileMode; /* mode to pass to open(2) */
213} Vfd;
214
215/*
216 * Virtual File Descriptor array pointer and size. This grows as
217 * needed. 'File' values are indexes into this array.
218 * Note that VfdCache[0] is not a usable VFD, just a list header.
219 */
220static Vfd *VfdCache;
221static Size SizeVfdCache = 0;
222
223/*
224 * Number of file descriptors known to be in use by VFD entries.
225 */
226static int nfile = 0;
227
228/*
229 * Flag to tell whether it's worth scanning VfdCache looking for temp files
230 * to close
231 */
232static bool have_xact_temporary_files = false;
233
234/*
235 * Tracks the total size of all temporary files. Note: when temp_file_limit
236 * is being enforced, this cannot overflow since the limit cannot be more
237 * than INT_MAX kilobytes. When not enforcing, it could theoretically
238 * overflow, but we don't care.
239 */
241
242/* Temporary file access initialized and not yet shut down? */
243#ifdef USE_ASSERT_CHECKING
244static bool temporary_files_allowed = false;
245#endif
246
247/*
248 * List of OS handles opened with AllocateFile, AllocateDir and
249 * OpenTransientFile.
250 */
251typedef enum
252{
258
259typedef struct
260{
261 AllocateDescKind kind;
262 SubTransactionId create_subid;
263 union
264 {
265 FILE *file;
266 DIR *dir;
267 int fd;
268 } desc;
270
271static int numAllocatedDescs = 0;
272static int maxAllocatedDescs = 0;
274
275/*
276 * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
277 */
278static int numExternalFDs = 0;
279
280/*
281 * Number of temporary files opened during the current session;
282 * this is used in generation of tempfile names.
283 */
284static long tempFileCounter = 0;
285
286/*
287 * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
288 * indicating that the current database's default tablespace should be used.)
289 * When numTempTableSpaces is -1, this has not been set in the current
290 * transaction.
291 */
292static Oid *tempTableSpaces = NULL;
293static int numTempTableSpaces = -1;
294static int nextTempTableSpace = 0;
295
296
297/*--------------------
298 *
299 * Private Routines
300 *
301 * Delete - delete a file from the Lru ring
302 * LruDelete - remove a file from the Lru ring and close its FD
303 * Insert - put a file at the front of the Lru ring
304 * LruInsert - put a file at the front of the Lru ring and open it
305 * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
306 * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
307 * AllocateVfd - grab a free (or new) file record (from VfdCache)
308 * FreeVfd - free a file record
309 *
310 * The Least Recently Used ring is a doubly linked list that begins and
311 * ends on element zero. Element zero is special -- it doesn't represent
312 * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
313 * anchor that shows us the beginning/end of the ring.
314 * Only VFD elements that are currently really open (have an FD assigned) are
315 * in the Lru ring. Elements that are "virtually" open can be recognized
316 * by having a non-null fileName field.
317 *
318 * example:
319 *
320 * /--less----\ /---------\
321 * v \ v \
322 * #0 --more---> LeastRecentlyUsed --more-\ \
323 * ^\ | |
324 * \\less--> MostRecentlyUsedFile <---/ |
325 * \more---/ \--less--/
326 *
327 *--------------------
328 */
329static void Delete(File file);
330static void LruDelete(File file);
331static void Insert(File file);
332static int LruInsert(File file);
333static bool ReleaseLruFile(void);
334static void ReleaseLruFiles(void);
335static File AllocateVfd(void);
336static void FreeVfd(File file);
337
338static int FileAccess(File file);
340static bool reserveAllocatedDesc(void);
341static int FreeDesc(AllocateDesc *desc);
342
343static void BeforeShmemExit_Files(int code, Datum arg);
344static void CleanupTempFiles(bool isCommit, bool isProcExit);
345static void RemovePgTempRelationFiles(const char *tsdirname);
347
348static void walkdir(const char *path,
349 void (*action) (const char *fname, bool isdir, int elevel),
350 bool process_symlinks,
351 int elevel);
352#ifdef PG_FLUSH_DATA_WORKS
353static void pre_sync_fname(const char *fname, bool isdir, int elevel);
354#endif
355static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
356static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
357
358static int fsync_parent_path(const char *fname, int elevel);
359
360
361/* ResourceOwner callbacks to hold virtual file descriptors */
362static void ResOwnerReleaseFile(Datum res);
363static char *ResOwnerPrintFile(Datum res);
364
366{
367 .name = "File",
368 .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
369 .release_priority = RELEASE_PRIO_FILES,
370 .ReleaseResource = ResOwnerReleaseFile,
371 .DebugPrint = ResOwnerPrintFile
372};
373
374/* Convenience wrappers over ResourceOwnerRemember/Forget */
375static inline void
377{
379}
380static inline void
382{
384}
385
386/*
387 * pg_fsync --- do fsync with or without writethrough
388 */
389int
390pg_fsync(int fd)
391{
392#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
393 struct stat st;
394
395 /*
396 * Some operating system implementations of fsync() have requirements
397 * about the file access modes that were used when their file descriptor
398 * argument was opened, and these requirements differ depending on whether
399 * the file descriptor is for a directory.
400 *
401 * For any file descriptor that may eventually be handed to fsync(), we
402 * should have opened it with access modes that are compatible with
403 * fsync() on all supported systems, otherwise the code may not be
404 * portable, even if it runs ok on the current system.
405 *
406 * We assert here that a descriptor for a file was opened with write
407 * permissions (i.e., not O_RDONLY) and for a directory without write
408 * permissions (O_RDONLY). Notice that the assertion check is made even
409 * if fsync() is disabled.
410 *
411 * If fstat() fails, ignore it and let the follow-up fsync() complain.
412 */
413 if (fstat(fd, &st) == 0)
414 {
415 int desc_flags = fcntl(fd, F_GETFL);
416
418
419 if (S_ISDIR(st.st_mode))
421 else
423 }
424 errno = 0;
425#endif
426
427 /* #if is to skip the wal_sync_method test if there's no need for it */
428#if defined(HAVE_FSYNC_WRITETHROUGH)
431 else
432#endif
434}
435
436
437/*
438 * pg_fsync_no_writethrough --- same as fsync except does nothing if
439 * enableFsync is off
440 */
441int
443{
444 int rc;
445
446 if (!enableFsync)
447 return 0;
448
449retry:
450 rc = fsync(fd);
451
452 if (rc == -1 && errno == EINTR)
453 goto retry;
454
455 return rc;
456}
457
458/*
459 * pg_fsync_writethrough
460 */
461int
463{
464 if (enableFsync)
465 {
466#if defined(F_FULLFSYNC)
467 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
468#else
469 errno = ENOSYS;
470 return -1;
471#endif
472 }
473 else
474 return 0;
475}
476
477/*
478 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
479 */
480int
481pg_fdatasync(int fd)
482{
483 int rc;
484
485 if (!enableFsync)
486 return 0;
487
488retry:
489 rc = fdatasync(fd);
490
491 if (rc == -1 && errno == EINTR)
492 goto retry;
493
494 return rc;
495}
496
497/*
498 * pg_file_exists -- check that a file exists.
499 *
500 * This requires an absolute path to the file. Returns true if the file
501 * exists and is not a directory, false otherwise.
502 */
503bool
504pg_file_exists(const char *name)
505{
506 struct stat st;
507
508 Assert(name != NULL);
509
510 if (stat(name, &st) == 0)
511 return !S_ISDIR(st.st_mode);
512 else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES))
515 errmsg("could not access file \"%s\": %m", name)));
516
517 return false;
518}
519
520/*
521 * pg_flush_data --- advise OS that the described dirty data should be flushed
522 *
523 * offset of 0 with nbytes 0 means that the entire file should be flushed
524 */
525void
526pg_flush_data(int fd, pgoff_t offset, pgoff_t nbytes)
527{
528 /*
529 * Right now file flushing is primarily used to make later
530 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
531 * if fsyncs are disabled - that's a decision we might want to make
532 * configurable at some point.
533 */
534 if (!enableFsync)
535 return;
536
537 /*
538 * We compile all alternatives that are supported on the current platform,
539 * to find portability problems more easily.
540 */
541#if defined(HAVE_SYNC_FILE_RANGE)
542 {
543 int rc;
544 static bool not_implemented_by_kernel = false;
545
547 return;
548
549retry:
550
551 /*
552 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
553 * tells the OS that writeback for the specified blocks should be
554 * started, but that we don't want to wait for completion. Note that
555 * this call might block if too much dirty data exists in the range.
556 * This is the preferable method on OSs supporting it, as it works
557 * reliably when available (contrast to msync()) and doesn't flush out
558 * clean data (like FADV_DONTNEED).
559 */
560 rc = sync_file_range(fd, offset, nbytes,
562 if (rc != 0)
563 {
564 int elevel;
565
566 if (rc == EINTR)
567 goto retry;
568
569 /*
570 * For systems that don't have an implementation of
571 * sync_file_range() such as Windows WSL, generate only one
572 * warning and then suppress all further attempts by this process.
573 */
574 if (errno == ENOSYS)
575 {
576 elevel = WARNING;
578 }
579 else
580 elevel = data_sync_elevel(WARNING);
581
582 ereport(elevel,
584 errmsg("could not flush dirty data: %m")));
585 }
586
587 return;
588 }
589#endif
590#if !defined(WIN32) && defined(MS_ASYNC)
591 {
592 void *p;
593 static int pagesize = 0;
594
595 /*
596 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
597 * writeback. On linux it only does so if MS_SYNC is specified, but
598 * then it does the writeback synchronously. Luckily all common linux
599 * systems have sync_file_range(). This is preferable over
600 * FADV_DONTNEED because it doesn't flush out clean data.
601 *
602 * We map the file (mmap()), tell the kernel to sync back the contents
603 * (msync()), and then remove the mapping again (munmap()).
604 */
605
606 /* mmap() needs actual length if we want to map whole file */
607 if (offset == 0 && nbytes == 0)
608 {
609 nbytes = lseek(fd, 0, SEEK_END);
610 if (nbytes < 0)
611 {
614 errmsg("could not determine dirty data size: %m")));
615 return;
616 }
617 }
618
619 /*
620 * Some platforms reject partial-page mmap() attempts. To deal with
621 * that, just truncate the request to a page boundary. If any extra
622 * bytes don't get flushed, well, it's only a hint anyway.
623 */
624
625 /* fetch pagesize only once */
626 if (pagesize == 0)
628
629 /* align length to pagesize, dropping any fractional page */
630 if (pagesize > 0)
631 nbytes = (nbytes / pagesize) * pagesize;
632
633 /* fractional-page request is a no-op */
634 if (nbytes <= 0)
635 return;
636
637 /*
638 * mmap could well fail, particularly on 32-bit platforms where there
639 * may simply not be enough address space. If so, silently fall
640 * through to the next implementation.
641 */
642 if (nbytes <= (pgoff_t) SSIZE_MAX)
643 p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
644 else
645 p = MAP_FAILED;
646
647 if (p != MAP_FAILED)
648 {
649 int rc;
650
651 rc = msync(p, (size_t) nbytes, MS_ASYNC);
652 if (rc != 0)
653 {
656 errmsg("could not flush dirty data: %m")));
657 /* NB: need to fall through to munmap()! */
658 }
659
660 rc = munmap(p, (size_t) nbytes);
661 if (rc != 0)
662 {
663 /* FATAL error because mapping would remain */
666 errmsg("could not munmap() while flushing data: %m")));
667 }
668
669 return;
670 }
671 }
672#endif
673#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
674 {
675 int rc;
676
677 /*
678 * Signal the kernel that the passed in range should not be cached
679 * anymore. This has the, desired, side effect of writing out dirty
680 * data, and the, undesired, side effect of likely discarding useful
681 * clean cached blocks. For the latter reason this is the least
682 * preferable method.
683 */
684
685 rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
686
687 if (rc != 0)
688 {
689 /* don't error out, this is just a performance optimization */
692 errmsg("could not flush dirty data: %m")));
693 }
694
695 return;
696 }
697#endif
698}
699
700/*
701 * Truncate an open file to a given length.
702 */
703static int
704pg_ftruncate(int fd, pgoff_t length)
705{
706 int ret;
707
708retry:
709 ret = ftruncate(fd, length);
710
711 if (ret == -1 && errno == EINTR)
712 goto retry;
713
714 return ret;
715}
716
717/*
718 * Truncate a file to a given length by name.
719 */
720int
721pg_truncate(const char *path, pgoff_t length)
722{
723 int ret;
724#ifdef WIN32
725 int save_errno;
726 int fd;
727
729 if (fd >= 0)
730 {
731 ret = pg_ftruncate(fd, length);
735 }
736 else
737 ret = -1;
738#else
739
740retry:
741 ret = truncate(path, length);
742
743 if (ret == -1 && errno == EINTR)
744 goto retry;
745#endif
746
747 return ret;
748}
749
750/*
751 * fsync_fname -- fsync a file or directory, handling errors properly
752 *
753 * Try to fsync a file or directory. When doing the latter, ignore errors that
754 * indicate the OS just doesn't allow/require fsyncing directories.
755 */
756void
757fsync_fname(const char *fname, bool isdir)
758{
760}
761
762/*
763 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
764 *
765 * This routine ensures that, after returning, the effect of renaming file
766 * persists in case of a crash. A crash while this routine is running will
767 * leave you with either the pre-existing or the moved file in place of the
768 * new file; no mixed state or truncated files are possible.
769 *
770 * It does so by using fsync on the old filename and the possibly existing
771 * target filename before the rename, and the target file and directory after.
772 *
773 * Note that rename() cannot be used across arbitrary directories, as they
774 * might not be on the same filesystem. Therefore this routine does not
775 * support renaming across directories.
776 *
777 * Log errors with the caller specified severity.
778 *
779 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
780 * valid upon return.
781 */
782int
783durable_rename(const char *oldfile, const char *newfile, int elevel)
784{
785 int fd;
786
787 /*
788 * First fsync the old and target path (if it exists), to ensure that they
789 * are properly persistent on disk. Syncing the target file is not
790 * strictly necessary, but it makes it easier to reason about crashes;
791 * because it's then guaranteed that either source or target file exists
792 * after a crash.
793 */
794 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
795 return -1;
796
798 if (fd < 0)
799 {
800 if (errno != ENOENT)
801 {
802 ereport(elevel,
804 errmsg("could not open file \"%s\": %m", newfile)));
805 return -1;
806 }
807 }
808 else
809 {
810 if (pg_fsync(fd) != 0)
811 {
812 int save_errno;
813
814 /* close file upon error, might not be in transaction context */
818
819 ereport(elevel,
821 errmsg("could not fsync file \"%s\": %m", newfile)));
822 return -1;
823 }
824
825 if (CloseTransientFile(fd) != 0)
826 {
827 ereport(elevel,
829 errmsg("could not close file \"%s\": %m", newfile)));
830 return -1;
831 }
832 }
833
834 /* Time to do the real deal... */
835 if (rename(oldfile, newfile) < 0)
836 {
837 ereport(elevel,
839 errmsg("could not rename file \"%s\" to \"%s\": %m",
840 oldfile, newfile)));
841 return -1;
842 }
843
844 /*
845 * To guarantee renaming the file is persistent, fsync the file with its
846 * new name, and its containing directory.
847 */
848 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
849 return -1;
850
851 if (fsync_parent_path(newfile, elevel) != 0)
852 return -1;
853
854 return 0;
855}
856
857/*
858 * durable_unlink -- remove a file in a durable manner
859 *
860 * This routine ensures that, after returning, the effect of removing file
861 * persists in case of a crash. A crash while this routine is running will
862 * leave the system in no mixed state.
863 *
864 * It does so by using fsync on the parent directory of the file after the
865 * actual removal is done.
866 *
867 * Log errors with the severity specified by caller.
868 *
869 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
870 * valid upon return.
871 */
872int
873durable_unlink(const char *fname, int elevel)
874{
875 if (unlink(fname) < 0)
876 {
877 ereport(elevel,
879 errmsg("could not remove file \"%s\": %m",
880 fname)));
881 return -1;
882 }
883
884 /*
885 * To guarantee that the removal of the file is persistent, fsync its
886 * parent directory.
887 */
888 if (fsync_parent_path(fname, elevel) != 0)
889 return -1;
890
891 return 0;
892}
893
894/*
895 * InitFileAccess --- initialize this module during backend startup
896 *
897 * This is called during either normal or standalone backend start.
898 * It is *not* called in the postmaster.
899 *
900 * Note that this does not initialize temporary file access, that is
901 * separately initialized via InitTemporaryFileAccess().
902 */
903void
904InitFileAccess(void)
905{
906 Assert(SizeVfdCache == 0); /* call me only once */
907
908 /* initialize cache header entry */
909 VfdCache = (Vfd *) malloc(sizeof(Vfd));
910 if (VfdCache == NULL)
913 errmsg("out of memory")));
914
915 MemSet(&(VfdCache[0]), 0, sizeof(Vfd));
917
918 SizeVfdCache = 1;
919}
920
921/*
922 * InitTemporaryFileAccess --- initialize temporary file access during startup
923 *
924 * This is called during either normal or standalone backend start.
925 * It is *not* called in the postmaster.
926 *
927 * This is separate from InitFileAccess() because temporary file cleanup can
928 * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
929 * our reporting has to happen before that. Low level file access should be
930 * available for longer, hence the separate initialization / shutdown of
931 * temporary file handling.
932 */
933void
935{
936 Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
937 Assert(!temporary_files_allowed); /* call me only once */
938
939 /*
940 * Register before-shmem-exit hook to ensure temp files are dropped while
941 * we can still report stats.
942 */
944
945#ifdef USE_ASSERT_CHECKING
947#endif
948}
949
950/*
951 * count_usable_fds --- count how many FDs the system will let us open,
952 * and estimate how many are already open.
953 *
954 * We stop counting if usable_fds reaches max_to_probe. Note: a small
955 * value of max_to_probe might result in an underestimate of already_open;
956 * we must fill in any "gaps" in the set of used FDs before the calculation
957 * of already_open will give the right answer. In practice, max_to_probe
958 * of a couple of dozen should be enough to ensure good results.
959 *
960 * We assume stderr (FD 2) is available for dup'ing. While the calling
961 * script could theoretically close that, it would be a really bad idea,
962 * since then one risks loss of error messages from, e.g., libc.
963 */
964static void
966{
967 int *fd;
968 int size;
969 int used = 0;
970 int highestfd = 0;
971 int j;
972
973#ifdef HAVE_GETRLIMIT
974 struct rlimit rlim;
976#endif
977
978 size = 1024;
979 fd = (int *) palloc(size * sizeof(int));
980
981#ifdef HAVE_GETRLIMIT
983 if (getrlimit_status != 0)
984 ereport(WARNING, (errmsg("getrlimit failed: %m")));
985#endif /* HAVE_GETRLIMIT */
986
987 /* dup until failure or probe limit reached */
988 for (;;)
989 {
990 int thisfd;
991
992#ifdef HAVE_GETRLIMIT
993
994 /*
995 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
996 * some platforms
997 */
998 if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
999 break;
1000#endif
1001
1002 thisfd = dup(2);
1003 if (thisfd < 0)
1004 {
1005 /* Expect EMFILE or ENFILE, else it's fishy */
1006 if (errno != EMFILE && errno != ENFILE)
1007 elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
1008 break;
1009 }
1010
1011 if (used >= size)
1012 {
1013 size *= 2;
1014 fd = (int *) repalloc(fd, size * sizeof(int));
1015 }
1016 fd[used++] = thisfd;
1017
1018 if (highestfd < thisfd)
1019 highestfd = thisfd;
1020
1021 if (used >= max_to_probe)
1022 break;
1023 }
1024
1025 /* release the files we opened */
1026 for (j = 0; j < used; j++)
1027 close(fd[j]);
1028
1029 pfree(fd);
1030
1031 /*
1032 * Return results. usable_fds is just the number of successful dups. We
1033 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
1034 * number) and so already_open is highestfd+1 - usable_fds.
1035 */
1036 *usable_fds = used;
1037 *already_open = highestfd + 1 - used;
1038}
1039
1040/*
1041 * set_max_safe_fds
1042 * Determine number of file descriptors that fd.c is allowed to use
1043 */
1044void
1045set_max_safe_fds(void)
1046{
1047 int usable_fds;
1048 int already_open;
1049
1050 /*----------
1051 * We want to set max_safe_fds to
1052 * MIN(usable_fds, max_files_per_process)
1053 * less the slop factor for files that are opened without consulting
1054 * fd.c. This ensures that we won't allow opening more than
1055 * max_files_per_process, or the experimentally-determined EMFILE limit,
1056 * additional files.
1057 *----------
1058 */
1061
1063
1064 /*
1065 * Take off the FDs reserved for system() etc.
1066 */
1068
1069 /*
1070 * Make sure we still have enough to get by.
1071 */
1073 ereport(FATAL,
1075 errmsg("insufficient file descriptors available to start server process"),
1076 errdetail("System allows %d, server needs at least %d, %d files are already open.",
1079 already_open)));
1080
1081 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1083}
1084
1085/*
1086 * Open a file with BasicOpenFilePerm() and pass default file mode for the
1087 * fileMode parameter.
1088 */
1089int
1090BasicOpenFile(const char *fileName, int fileFlags)
1091{
1092 return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1093}
1094
1095/*
1096 * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1097 *
1098 * This is exported for use by places that really want a plain kernel FD,
1099 * but need to be proof against running out of FDs. Once an FD has been
1100 * successfully returned, it is the caller's responsibility to ensure that
1101 * it will not be leaked on ereport()! Most users should *not* call this
1102 * routine directly, but instead use the VFD abstraction level, which
1103 * provides protection against descriptor leaks as well as management of
1104 * files that need to be open for more than a short period of time.
1105 *
1106 * Ideally this should be the *only* direct call of open() in the backend.
1107 * In practice, the postmaster calls open() directly, and there are some
1108 * direct open() calls done early in backend startup. Those are OK since
1109 * this module wouldn't have any open files to close at that point anyway.
1110 */
1111int
1112BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1113{
1114 int fd;
1115
1116tryAgain:
1117#ifdef PG_O_DIRECT_USE_F_NOCACHE
1118 fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1119#else
1120 fd = open(fileName, fileFlags, fileMode);
1121#endif
1122
1123 if (fd >= 0)
1124 {
1125#ifdef PG_O_DIRECT_USE_F_NOCACHE
1126 if (fileFlags & PG_O_DIRECT)
1127 {
1128 if (fcntl(fd, F_NOCACHE, 1) < 0)
1129 {
1130 int save_errno = errno;
1131
1132 close(fd);
1133 errno = save_errno;
1134 return -1;
1135 }
1136 }
1137#endif
1138
1139 return fd; /* success! */
1140 }
1141
1142 if (errno == EMFILE || errno == ENFILE)
1143 {
1144 int save_errno = errno;
1145
1146 ereport(LOG,
1148 errmsg("out of file descriptors: %m; release and retry")));
1149 errno = 0;
1150 if (ReleaseLruFile())
1151 goto tryAgain;
1152 errno = save_errno;
1153 }
1154
1155 return -1; /* failure */
1156}
1157
1158/*
1159 * AcquireExternalFD - attempt to reserve an external file descriptor
1160 *
1161 * This should be used by callers that need to hold a file descriptor open
1162 * over more than a short interval, but cannot use any of the other facilities
1163 * provided by this module.
1164 *
1165 * The difference between this and the underlying ReserveExternalFD function
1166 * is that this will report failure (by setting errno and returning false)
1167 * if "too many" external FDs are already reserved. This should be used in
1168 * any code where the total number of FDs to be reserved is not predictable
1169 * and small.
1170 */
1171bool
1173{
1174 /*
1175 * We don't want more than max_safe_fds / 3 FDs to be consumed for
1176 * "external" FDs.
1177 */
1178 if (numExternalFDs < max_safe_fds / 3)
1179 {
1181 return true;
1182 }
1183 errno = EMFILE;
1184 return false;
1185}
1186
1187/*
1188 * ReserveExternalFD - report external consumption of a file descriptor
1189 *
1190 * This should be used by callers that need to hold a file descriptor open
1191 * over more than a short interval, but cannot use any of the other facilities
1192 * provided by this module. This just tracks the use of the FD and closes
1193 * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1194 *
1195 * Call this directly only in code where failure to reserve the FD would be
1196 * fatal; for example, the WAL-writing code does so, since the alternative is
1197 * session failure. Also, it's very unwise to do so in code that could
1198 * consume more than one FD per process.
1199 *
1200 * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1201 * available, it doesn't matter too much whether this is called before or
1202 * after actually opening the FD; but doing so beforehand reduces the risk of
1203 * an EMFILE failure if not everybody played nice. In any case, it's solely
1204 * caller's responsibility to keep the external-FD count in sync with reality.
1205 */
1206void
1208{
1209 /*
1210 * Release VFDs if needed to stay safe. Because we do this before
1211 * incrementing numExternalFDs, the final state will be as desired, i.e.,
1212 * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1213 */
1215
1217}
1218
1219/*
1220 * ReleaseExternalFD - report release of an external file descriptor
1221 *
1222 * This is guaranteed not to change errno, so it can be used in failure paths.
1223 */
1224void
1226{
1229}
1230
1231
1232#if defined(FDDEBUG)
1233
1234static void
1235_dump_lru(void)
1236{
1237 int mru = VfdCache[0].lruLessRecently;
1238 Vfd *vfdP = &VfdCache[mru];
1239 char buf[2048];
1240
1241 snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
1242 while (mru != 0)
1243 {
1244 mru = vfdP->lruLessRecently;
1245 vfdP = &VfdCache[mru];
1246 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
1247 }
1248 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
1249 elog(LOG, "%s", buf);
1250}
1251#endif /* FDDEBUG */
1252
1253static void
1254Delete(File file)
1255{
1256 Vfd *vfdP;
1257
1258 Assert(file != 0);
1259
1260 DO_DB(elog(LOG, "Delete %d (%s)",
1261 file, VfdCache[file].fileName));
1262 DO_DB(_dump_lru());
1263
1264 vfdP = &VfdCache[file];
1265
1266 VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1267 VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1268
1269 DO_DB(_dump_lru());
1270}
1271
1272static void
1273LruDelete(File file)
1274{
1275 Vfd *vfdP;
1276
1277 Assert(file != 0);
1278
1279 DO_DB(elog(LOG, "LruDelete %d (%s)",
1280 file, VfdCache[file].fileName));
1281
1282 vfdP = &VfdCache[file];
1283
1285
1286 /*
1287 * Close the file. We aren't expecting this to fail; if it does, better
1288 * to leak the FD than to mess up our internal state.
1289 */
1290 if (close(vfdP->fd) != 0)
1292 "could not close file \"%s\": %m", vfdP->fileName);
1293 vfdP->fd = VFD_CLOSED;
1294 --nfile;
1295
1296 /* delete the vfd record from the LRU ring */
1297 Delete(file);
1298}
1299
1300static void
1301Insert(File file)
1302{
1303 Vfd *vfdP;
1304
1305 Assert(file != 0);
1306
1307 DO_DB(elog(LOG, "Insert %d (%s)",
1308 file, VfdCache[file].fileName));
1309 DO_DB(_dump_lru());
1310
1311 vfdP = &VfdCache[file];
1312
1313 vfdP->lruMoreRecently = 0;
1314 vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1315 VfdCache[0].lruLessRecently = file;
1316 VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1317
1318 DO_DB(_dump_lru());
1319}
1320
1321/* returns 0 on success, -1 on re-open failure (with errno set) */
1322static int
1323LruInsert(File file)
1324{
1325 Vfd *vfdP;
1326
1327 Assert(file != 0);
1328
1329 DO_DB(elog(LOG, "LruInsert %d (%s)",
1330 file, VfdCache[file].fileName));
1331
1332 vfdP = &VfdCache[file];
1333
1334 if (FileIsNotOpen(file))
1335 {
1336 /* Close excess kernel FDs. */
1338
1339 /*
1340 * The open could still fail for lack of file descriptors, eg due to
1341 * overall system file table being full. So, be prepared to release
1342 * another FD if necessary...
1343 */
1344 vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1345 vfdP->fileMode);
1346 if (vfdP->fd < 0)
1347 {
1348 DO_DB(elog(LOG, "re-open failed: %m"));
1349 return -1;
1350 }
1351 else
1352 {
1353 ++nfile;
1354 }
1355 }
1356
1357 /*
1358 * put it at the head of the Lru ring
1359 */
1360
1361 Insert(file);
1362
1363 return 0;
1364}
1365
1366/*
1367 * Release one kernel FD by closing the least-recently-used VFD.
1368 */
1369static bool
1370ReleaseLruFile(void)
1371{
1372 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1373
1374 if (nfile > 0)
1375 {
1376 /*
1377 * There are opened files and so there should be at least one used vfd
1378 * in the ring.
1379 */
1380 Assert(VfdCache[0].lruMoreRecently != 0);
1381 LruDelete(VfdCache[0].lruMoreRecently);
1382 return true; /* freed a file */
1383 }
1384 return false; /* no files available to free */
1385}
1386
1387/*
1388 * Release kernel FDs as needed to get under the max_safe_fds limit.
1389 * After calling this, it's OK to try to open another file.
1390 */
1391static void
1392ReleaseLruFiles(void)
1393{
1395 {
1396 if (!ReleaseLruFile())
1397 break;
1398 }
1399}
1400
1401static File
1402AllocateVfd(void)
1403{
1404 Index i;
1405 File file;
1406
1407 DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1408
1409 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1410
1411 if (VfdCache[0].nextFree == 0)
1412 {
1413 /*
1414 * The free list is empty so it is time to increase the size of the
1415 * array. We choose to double it each time this happens. However,
1416 * there's not much point in starting *real* small.
1417 */
1420
1421 if (newCacheSize < 32)
1422 newCacheSize = 32;
1423
1424 /*
1425 * Be careful not to clobber VfdCache ptr if realloc fails.
1426 */
1427 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1428 if (newVfdCache == NULL)
1429 ereport(ERROR,
1431 errmsg("out of memory")));
1433
1434 /*
1435 * Initialize the new entries and link them into the free list.
1436 */
1437 for (i = SizeVfdCache; i < newCacheSize; i++)
1438 {
1439 MemSet(&(VfdCache[i]), 0, sizeof(Vfd));
1440 VfdCache[i].nextFree = i + 1;
1442 }
1445
1446 /*
1447 * Record the new size
1448 */
1450 }
1451
1452 file = VfdCache[0].nextFree;
1453
1455
1456 return file;
1457}
1458
1459static void
1460FreeVfd(File file)
1461{
1462 Vfd *vfdP = &VfdCache[file];
1463
1464 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1465 file, vfdP->fileName ? vfdP->fileName : ""));
1466
1467 if (vfdP->fileName != NULL)
1468 {
1469 free(vfdP->fileName);
1470 vfdP->fileName = NULL;
1471 }
1472 vfdP->fdstate = 0x0;
1473
1474 vfdP->nextFree = VfdCache[0].nextFree;
1475 VfdCache[0].nextFree = file;
1476}
1477
1478/* returns 0 on success, -1 on re-open failure (with errno set) */
1479static int
1480FileAccess(File file)
1481{
1482 int returnValue;
1483
1484 DO_DB(elog(LOG, "FileAccess %d (%s)",
1485 file, VfdCache[file].fileName));
1486
1487 /*
1488 * Is the file open? If not, open it and put it at the head of the LRU
1489 * ring (possibly closing the least recently used file to get an FD).
1490 */
1491
1492 if (FileIsNotOpen(file))
1493 {
1494 returnValue = LruInsert(file);
1495 if (returnValue != 0)
1496 return returnValue;
1497 }
1498 else if (VfdCache[0].lruLessRecently != file)
1499 {
1500 /*
1501 * We now know that the file is open and that it is not the last one
1502 * accessed, so we need to move it to the head of the Lru ring.
1503 */
1504
1505 Delete(file);
1506 Insert(file);
1507 }
1508
1509 return 0;
1510}
1511
1512/*
1513 * Called whenever a temporary file is deleted to report its size.
1514 */
1515static void
1516ReportTemporaryFileUsage(const char *path, pgoff_t size)
1517{
1519
1520 if (log_temp_files >= 0)
1521 {
1522 if ((size / 1024) >= log_temp_files)
1523 ereport(LOG,
1524 (errmsg("temporary file: path \"%s\", size %lu",
1525 path, (unsigned long) size)));
1526 }
1527}
1528
1529/*
1530 * Called to register a temporary file for automatic close.
1531 * ResourceOwnerEnlarge(CurrentResourceOwner) must have been called
1532 * before the file was opened.
1533 */
1534static void
1536{
1539
1540 /* Backup mechanism for closing at end of xact. */
1543}
1544
/*
 * Called when we get a shared invalidation message on some relation.
 *
 * If the VFD currently has a kernel FD open, it is closed via LruDelete();
 * otherwise nothing happens.  Compiled only when NOT_USED is defined.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
    Assert(FileIsValid(file));

    /* Nothing to do if there is no kernel FD behind this VFD. */
    if (FileIsNotOpen(file))
        return;

    LruDelete(file);
}
#endif
1557
1558/*
1559 * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1560 * fileMode parameter.
1561 */
1562File
1563PathNameOpenFile(const char *fileName, int fileFlags)
1564{
1565 return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1566}
1567
1568/*
1569 * open a file in an arbitrary directory
1570 *
1571 * NB: if the passed pathname is relative (which it usually is),
1572 * it will be interpreted relative to the process' working directory
1573 * (which should always be $PGDATA when this code is running).
1574 */
1575File
1576PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1577{
1578 char *fnamecopy;
1579 File file;
1580 Vfd *vfdP;
1581
1582 DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1583 fileName, fileFlags, fileMode));
1584
1585 /*
1586 * We need a malloc'd copy of the file name; fail cleanly if no room.
1587 */
1588 fnamecopy = strdup(fileName);
1589 if (fnamecopy == NULL)
1590 ereport(ERROR,
1592 errmsg("out of memory")));
1593
1594 file = AllocateVfd();
1595 vfdP = &VfdCache[file];
1596
1597 /* Close excess kernel FDs. */
1599
1600 /*
1601 * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1602 * client shouldn't be expected to know which kernel descriptors are
1603 * currently open, so it wouldn't make sense for them to be inherited by
1604 * executed subprograms.
1605 */
1606 fileFlags |= O_CLOEXEC;
1607
1608 vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1609
1610 if (vfdP->fd < 0)
1611 {
1612 int save_errno = errno;
1613
1614 FreeVfd(file);
1615 free(fnamecopy);
1616 errno = save_errno;
1617 return -1;
1618 }
1619 ++nfile;
1620 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1621 vfdP->fd));
1622
1623 vfdP->fileName = fnamecopy;
1624 /* Saved flags are adjusted to be OK for re-opening file */
1625 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1626 vfdP->fileMode = fileMode;
1627 vfdP->fileSize = 0;
1628 vfdP->fdstate = 0x0;
1629 vfdP->resowner = NULL;
1630
1631 Insert(file);
1632
1633 return file;
1634}
1635
1636/*
1637 * Create directory 'directory'. If necessary, create 'basedir', which must
1638 * be the directory above it. This is designed for creating the top-level
1639 * temporary directory on demand before creating a directory underneath it.
1640 * Do nothing if the directory already exists.
1641 *
1642 * Directories created within the top-level temporary directory should begin
1643 * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1644 * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1645 * that do not need any particular prefix.
1646*/
1647void
1648PathNameCreateTemporaryDir(const char *basedir, const char *directory)
1649{
1650 if (MakePGDirectory(directory) < 0)
1651 {
1652 if (errno == EEXIST)
1653 return;
1654
1655 /*
1656 * Failed. Try to create basedir first in case it's missing. Tolerate
1657 * EEXIST to close a race against another process following the same
1658 * algorithm.
1659 */
1660 if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1661 ereport(ERROR,
1663 errmsg("cannot create temporary directory \"%s\": %m",
1664 basedir)));
1665
1666 /* Try again. */
1667 if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1668 ereport(ERROR,
1670 errmsg("cannot create temporary subdirectory \"%s\": %m",
1671 directory)));
1672 }
1673}
1674
1675/*
1676 * Delete a directory and everything in it, if it exists.
1677 */
1678void
1679PathNameDeleteTemporaryDir(const char *dirname)
1680{
1681 struct stat statbuf;
1682
1683 /* Silently ignore missing directory. */
1684 if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1685 return;
1686
1687 /*
1688 * Currently, walkdir doesn't offer a way for our passed in function to
1689 * maintain state. Perhaps it should, so that we could tell the caller
1690 * whether this operation succeeded or failed. Since this operation is
1691 * used in a cleanup path, we wouldn't actually behave differently: we'll
1692 * just log failures.
1693 */
1694 walkdir(dirname, unlink_if_exists_fname, false, LOG);
1695}
1696
1697/*
1698 * Open a temporary file that will disappear when we close it.
1699 *
1700 * This routine takes care of generating an appropriate tempfile name.
1701 * There's no need to pass in fileFlags or fileMode either, since only
1702 * one setting makes any sense for a temp file.
1703 *
1704 * Unless interXact is true, the file is remembered by CurrentResourceOwner
1705 * to ensure it's closed and deleted when it's no longer needed, typically at
1706 * the end-of-transaction. In most cases, you don't want temporary files to
1707 * outlive the transaction that created them, so this should be false -- but
1708 * if you need "somewhat" temporary storage, this might be useful. In either
1709 * case, the file is removed when the File is explicitly closed.
1710 */
1711File
1712OpenTemporaryFile(bool interXact)
1713{
1714 File file = 0;
1715
1716 Assert(temporary_files_allowed); /* check temp file access is up */
1717
1718 /*
1719 * Make sure the current resource owner has space for this File before we
1720 * open it, if we'll be registering it below.
1721 */
1722 if (!interXact)
1724
1725 /*
1726 * If some temp tablespace(s) have been given to us, try to use the next
1727 * one. If a given tablespace can't be found, we silently fall back to
1728 * the database's default tablespace.
1729 *
1730 * BUT: if the temp file is slated to outlive the current transaction,
1731 * force it into the database's default tablespace, so that it will not
1732 * pose a threat to possible tablespace drop attempts.
1733 */
1734 if (numTempTableSpaces > 0 && !interXact)
1735 {
1737
1738 if (OidIsValid(tblspcOid))
1740 }
1741
1742 /*
1743 * If not, or if tablespace is bad, create in database's default
1744 * tablespace. MyDatabaseTableSpace should normally be set before we get
1745 * here, but just in case it isn't, fall back to pg_default tablespace.
1746 */
1747 if (file <= 0)
1751 true);
1752
1753 /* Mark it for deletion at close and temporary file size limit */
1755
1756 /* Register it with the current resource owner */
1757 if (!interXact)
1759
1760 return file;
1761}
1762
1763/*
1764 * Return the path of the temp directory in a given tablespace.
1765 */
1766void
1768{
1769 /*
1770 * Identify the tempfile directory for this tablespace.
1771 *
1772 * If someone tries to specify pg_global, use pg_default instead.
1773 */
1774 if (tablespace == InvalidOid ||
1777 snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1778 else
1779 {
1780 /* All other tablespaces are accessed via symlinks */
1781 snprintf(path, MAXPGPATH, "%s/%u/%s/%s",
1784 }
1785}
1786
1787/*
1788 * Open a temporary file in a specific tablespace.
1789 * Subroutine for OpenTemporaryFile, which see for details.
1790 */
1791static File
1793{
1794 char tempdirpath[MAXPGPATH];
1795 char tempfilepath[MAXPGPATH];
1796 File file;
1797
1799
1800 /*
1801 * Generate a tempfile name that should be unique within the current
1802 * database instance.
1803 */
1804 snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1806
1807 /*
1808 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1809 * temp file that can be reused.
1810 */
1813 if (file <= 0)
1814 {
1815 /*
1816 * We might need to create the tablespace's tempfile directory, if no
1817 * one has yet done so.
1818 *
1819 * Don't check for an error from MakePGDirectory; it could fail if
1820 * someone else just did the same thing. If it doesn't work then
1821 * we'll bomb out on the second create attempt, instead.
1822 */
1824
1827 if (file <= 0 && rejectError)
1828 elog(ERROR, "could not create temporary file \"%s\": %m",
1829 tempfilepath);
1830 }
1831
1832 return file;
1833}
1834
1835
1836/*
1837 * Create a new file. The directory containing it must already exist. Files
1838 * created this way are subject to temp_file_limit and are automatically
1839 * closed at end of transaction, but are not automatically deleted on close
1840 * because they are intended to be shared between cooperating backends.
1841 *
1842 * If the file is inside the top-level temporary directory, its name should
1843 * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1844 * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1845 * inside a directory created with PathNameCreateTemporaryDir(), in which case
1846 * the prefix isn't needed.
1847 */
1848File
1849PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1850{
1851 File file;
1852
1853 Assert(temporary_files_allowed); /* check temp file access is up */
1854
1856
1857 /*
1858 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1859 * temp file that can be reused.
1860 */
1861 file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1862 if (file <= 0)
1863 {
1864 if (error_on_failure)
1865 ereport(ERROR,
1867 errmsg("could not create temporary file \"%s\": %m",
1868 path)));
1869 else
1870 return file;
1871 }
1872
1873 /* Mark it for temp_file_limit accounting. */
1875
1876 /* Register it for automatic close. */
1878
1879 return file;
1880}
1881
1882/*
1883 * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1884 * another backend. Files opened this way don't count against the
1885 * temp_file_limit of the caller, are automatically closed at the end of the
1886 * transaction but are not deleted on close.
1887 */
1888File
1889PathNameOpenTemporaryFile(const char *path, int mode)
1890{
1891 File file;
1892
1893 Assert(temporary_files_allowed); /* check temp file access is up */
1894
1896
1897 file = PathNameOpenFile(path, mode | PG_BINARY);
1898
1899 /* If no such file, then we don't raise an error. */
1900 if (file <= 0 && errno != ENOENT)
1901 ereport(ERROR,
1903 errmsg("could not open temporary file \"%s\": %m",
1904 path)));
1905
1906 if (file > 0)
1907 {
1908 /* Register it for automatic close. */
1910 }
1911
1912 return file;
1913}
1914
/*
 * Delete a file by pathname.  Return true if the file existed, false if it
 * didn't.
 */
1919bool
1920PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1921{
1922 struct stat filestats;
1923 int stat_errno;
1924
1925 /* Get the final size for pgstat reporting. */
1926 if (stat(path, &filestats) != 0)
1927 stat_errno = errno;
1928 else
1929 stat_errno = 0;
1930
1931 /*
1932 * Unlike FileClose's automatic file deletion code, we tolerate
1933 * non-existence to support BufFileDeleteFileSet which doesn't know how
1934 * many segments it has to delete until it runs out.
1935 */
1936 if (stat_errno == ENOENT)
1937 return false;
1938
1939 if (unlink(path) < 0)
1940 {
1941 if (errno != ENOENT)
1944 errmsg("could not unlink temporary file \"%s\": %m",
1945 path)));
1946 return false;
1947 }
1948
1949 if (stat_errno == 0)
1950 ReportTemporaryFileUsage(path, filestats.st_size);
1951 else
1952 {
1953 errno = stat_errno;
1954 ereport(LOG,
1956 errmsg("could not stat file \"%s\": %m", path)));
1957 }
1958
1959 return true;
1960}
1961
1962/*
1963 * close a file when done with it
1964 */
1965void
1966FileClose(File file)
1967{
1968 Vfd *vfdP;
1969
1970 Assert(FileIsValid(file));
1971
1972 DO_DB(elog(LOG, "FileClose: %d (%s)",
1973 file, VfdCache[file].fileName));
1974
1975 vfdP = &VfdCache[file];
1976
1977 if (!FileIsNotOpen(file))
1978 {
1980
1981 /* close the file */
1982 if (close(vfdP->fd) != 0)
1983 {
1984 /*
1985 * We may need to panic on failure to close non-temporary files;
1986 * see LruDelete.
1987 */
1989 "could not close file \"%s\": %m", vfdP->fileName);
1990 }
1991
1992 --nfile;
1993 vfdP->fd = VFD_CLOSED;
1994
1995 /* remove the file from the lru ring */
1996 Delete(file);
1997 }
1998
1999 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2000 {
2001 /* Subtract its size from current usage (do first in case of error) */
2002 temporary_files_size -= vfdP->fileSize;
2003 vfdP->fileSize = 0;
2004 }
2005
2006 /*
2007 * Delete the file if it was temporary, and make a log entry if wanted
2008 */
2009 if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
2010 {
2011 struct stat filestats;
2012 int stat_errno;
2013
2014 /*
2015 * If we get an error, as could happen within the ereport/elog calls,
2016 * we'll come right back here during transaction abort. Reset the
2017 * flag to ensure that we can't get into an infinite loop. This code
2018 * is arranged to ensure that the worst-case consequence is failing to
2019 * emit log message(s), not failing to attempt the unlink.
2020 */
2021 vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
2022
2023
2024 /* first try the stat() */
2025 if (stat(vfdP->fileName, &filestats))
2026 stat_errno = errno;
2027 else
2028 stat_errno = 0;
2029
2030 /* in any case do the unlink */
2031 if (unlink(vfdP->fileName))
2032 ereport(LOG,
2034 errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
2035
2036 /* and last report the stat results */
2037 if (stat_errno == 0)
2038 ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2039 else
2040 {
2041 errno = stat_errno;
2042 ereport(LOG,
2044 errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2045 }
2046 }
2047
2048 /* Unregister it from the resource owner */
2049 if (vfdP->resowner)
2050 ResourceOwnerForgetFile(vfdP->resowner, file);
2051
2052 /*
2053 * Return the Vfd slot to the free list
2054 */
2055 FreeVfd(file);
2056}
2057
2058/*
2059 * FilePrefetch - initiate asynchronous read of a given range of the file.
2060 *
2061 * Returns 0 on success, otherwise an errno error code (like posix_fadvise()).
2062 *
2063 * posix_fadvise() is the simplest standardized interface that accomplishes
2064 * this.
2065 */
2066int
2067FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2068{
2069 Assert(FileIsValid(file));
2070
2071 DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2072 file, VfdCache[file].fileName,
2073 (int64) offset, (int64) amount));
2074
2075#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2076 {
2077 int returnCode;
2078
2079 returnCode = FileAccess(file);
2080 if (returnCode < 0)
2081 return returnCode;
2082
2083retry:
2084 pgstat_report_wait_start(wait_event_info);
2085 returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2088
2089 if (returnCode == EINTR)
2090 goto retry;
2091
2092 return returnCode;
2093 }
2094#elif defined(__darwin__)
2095 {
2096 struct radvisory
2097 {
2098 off_t ra_offset; /* offset into the file */
2099 int ra_count; /* size of the read */
2100 } ra;
2101 int returnCode;
2102
2103 returnCode = FileAccess(file);
2104 if (returnCode < 0)
2105 return returnCode;
2106
2107 ra.ra_offset = offset;
2108 ra.ra_count = amount;
2109 pgstat_report_wait_start(wait_event_info);
2112 if (returnCode != -1)
2113 return 0;
2114 else
2115 return errno;
2116 }
2117#else
2118 return 0;
2119#endif
2120}
2121
2122void
2123FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
2124{
2125 int returnCode;
2126
2127 Assert(FileIsValid(file));
2128
2129 DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2130 file, VfdCache[file].fileName,
2131 (int64) offset, (int64) nbytes));
2132
2133 if (nbytes <= 0)
2134 return;
2135
2136 if (VfdCache[file].fileFlags & PG_O_DIRECT)
2137 return;
2138
2139 returnCode = FileAccess(file);
2140 if (returnCode < 0)
2141 return;
2142
2143 pgstat_report_wait_start(wait_event_info);
2144 pg_flush_data(VfdCache[file].fd, offset, nbytes);
2146}
2147
2148ssize_t
2149FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset,
2150 uint32 wait_event_info)
2151{
2153 Vfd *vfdP;
2154
2155 Assert(FileIsValid(file));
2156
2157 DO_DB(elog(LOG, "FileReadV: %d (%s) " INT64_FORMAT " %d",
2158 file, VfdCache[file].fileName,
2159 (int64) offset,
2160 iovcnt));
2161
2162 returnCode = FileAccess(file);
2163 if (returnCode < 0)
2164 return returnCode;
2165
2166 vfdP = &VfdCache[file];
2167
2168retry:
2169 pgstat_report_wait_start(wait_event_info);
2170 returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
2172
2173 if (returnCode < 0)
2174 {
2175 /*
2176 * Windows may run out of kernel buffers and return "Insufficient
2177 * system resources" error. Wait a bit and retry to solve it.
2178 *
2179 * It is rumored that EINTR is also possible on some Unix filesystems,
2180 * in which case immediate retry is indicated.
2181 */
2182#ifdef WIN32
2184
2185 switch (error)
2186 {
2188 pg_usleep(1000L);
2189 errno = EINTR;
2190 break;
2191 default:
2193 break;
2194 }
2195#endif
2196 /* OK to retry if interrupted */
2197 if (errno == EINTR)
2198 goto retry;
2199 }
2200
2201 return returnCode;
2202}
2203
2204int
2206 int iovcnt, pgoff_t offset,
2207 uint32 wait_event_info)
2208{
2209 int returnCode;
2210 Vfd *vfdP;
2211
2212 Assert(FileIsValid(file));
2213
2214 DO_DB(elog(LOG, "FileStartReadV: %d (%s) " INT64_FORMAT " %d",
2215 file, VfdCache[file].fileName,
2216 (int64) offset,
2217 iovcnt));
2218
2219 returnCode = FileAccess(file);
2220 if (returnCode < 0)
2221 return returnCode;
2222
2223 vfdP = &VfdCache[file];
2224
2225 pgaio_io_start_readv(ioh, vfdP->fd, iovcnt, offset);
2226
2227 return 0;
2228}
2229
2230ssize_t
2231FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset,
2232 uint32 wait_event_info)
2233{
2235 Vfd *vfdP;
2236
2237 Assert(FileIsValid(file));
2238
2239 DO_DB(elog(LOG, "FileWriteV: %d (%s) " INT64_FORMAT " %d",
2240 file, VfdCache[file].fileName,
2241 (int64) offset,
2242 iovcnt));
2243
2244 returnCode = FileAccess(file);
2245 if (returnCode < 0)
2246 return returnCode;
2247
2248 vfdP = &VfdCache[file];
2249
2250 /*
2251 * If enforcing temp_file_limit and it's a temp file, check to see if the
2252 * write would overrun temp_file_limit, and throw error if so. Note: it's
2253 * really a modularity violation to throw error here; we should set errno
2254 * and return -1. However, there's no way to report a suitable error
2255 * message if we do that. All current callers would just throw error
2256 * immediately anyway, so this is safe at present.
2257 */
2258 if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2259 {
2260 pgoff_t past_write = offset;
2261
2262 for (int i = 0; i < iovcnt; ++i)
2263 past_write += iov[i].iov_len;
2264
2265 if (past_write > vfdP->fileSize)
2266 {
2268
2270 if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2271 ereport(ERROR,
2273 errmsg("temporary file size exceeds \"temp_file_limit\" (%dkB)",
2274 temp_file_limit)));
2275 }
2276 }
2277
2278retry:
2279 pgstat_report_wait_start(wait_event_info);
2280 returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
2282
2283 if (returnCode >= 0)
2284 {
2285 /*
2286 * Some callers expect short writes to set errno, and traditionally we
2287 * have assumed that they imply disk space shortage. We don't want to
2288 * waste CPU cycles adding up the total size here, so we'll just set
2289 * it for all successful writes in case such a caller determines that
2290 * the write was short and ereports "%m".
2291 */
2292 errno = ENOSPC;
2293
2294 /*
2295 * Maintain fileSize and temporary_files_size if it's a temp file.
2296 */
2297 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2298 {
2299 pgoff_t past_write = offset + returnCode;
2300
2301 if (past_write > vfdP->fileSize)
2302 {
2303 temporary_files_size += past_write - vfdP->fileSize;
2304 vfdP->fileSize = past_write;
2305 }
2306 }
2307 }
2308 else
2309 {
2310 /*
2311 * See comments in FileReadV()
2312 */
2313#ifdef WIN32
2315
2316 switch (error)
2317 {
2319 pg_usleep(1000L);
2320 errno = EINTR;
2321 break;
2322 default:
2324 break;
2325 }
2326#endif
2327 /* OK to retry if interrupted */
2328 if (errno == EINTR)
2329 goto retry;
2330 }
2331
2332 return returnCode;
2333}
2334
2335int
2336FileSync(File file, uint32 wait_event_info)
2337{
2338 int returnCode;
2339
2340 Assert(FileIsValid(file));
2341
2342 DO_DB(elog(LOG, "FileSync: %d (%s)",
2343 file, VfdCache[file].fileName));
2344
2345 returnCode = FileAccess(file);
2346 if (returnCode < 0)
2347 return returnCode;
2348
2349 pgstat_report_wait_start(wait_event_info);
2350 returnCode = pg_fsync(VfdCache[file].fd);
2352
2353 return returnCode;
2354}
2355
2356/*
2357 * Zero a region of the file.
2358 *
2359 * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2360 * appropriate error.
2361 */
2362int
2363FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2364{
2365 int returnCode;
2367
2368 Assert(FileIsValid(file));
2369
2370 DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2371 file, VfdCache[file].fileName,
2372 (int64) offset, (int64) amount));
2373
2374 returnCode = FileAccess(file);
2375 if (returnCode < 0)
2376 return returnCode;
2377
2378 pgstat_report_wait_start(wait_event_info);
2379 written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2381
2382 if (written < 0)
2383 return -1;
2384 else if (written != amount)
2385 {
2386 /* if errno is unset, assume problem is no disk space */
2387 if (errno == 0)
2388 errno = ENOSPC;
2389 return -1;
2390 }
2391
2392 return 0;
2393}
2394
/*
 * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
 * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
 * use FileZero() instead.
 *
 * Note that at least glibc implements posix_fallocate() in userspace if it
 * is not implemented by the filesystem. That's not the case for all
 * environments, though.
 *
 * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
 * appropriate error.
 */
2407int
2408FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2409{
2410#ifdef HAVE_POSIX_FALLOCATE
2411 int returnCode;
2412
2413 Assert(FileIsValid(file));
2414
2415 DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2416 file, VfdCache[file].fileName,
2417 (int64) offset, (int64) amount));
2418
2419 returnCode = FileAccess(file);
2420 if (returnCode < 0)
2421 return -1;
2422
2423retry:
2424 pgstat_report_wait_start(wait_event_info);
2425 returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2427
2428 if (returnCode == 0)
2429 return 0;
2430 else if (returnCode == EINTR)
2431 goto retry;
2432
2433 /* for compatibility with %m printing etc */
2434 errno = returnCode;
2435
2436 /*
2437 * Return in cases of a "real" failure, if fallocate is not supported,
2438 * fall through to the FileZero() backed implementation.
2439 */
2441 return -1;
2442#endif
2443
2444 return FileZero(file, offset, amount, wait_event_info);
2445}
2446
2447pgoff_t
2448FileSize(File file)
2449{
2450 Assert(FileIsValid(file));
2451
2452 DO_DB(elog(LOG, "FileSize %d (%s)",
2453 file, VfdCache[file].fileName));
2454
2455 if (FileIsNotOpen(file))
2456 {
2457 if (FileAccess(file) < 0)
2458 return (pgoff_t) -1;
2459 }
2460
2461 return lseek(VfdCache[file].fd, 0, SEEK_END);
2462}
2463
2464int
2465FileTruncate(File file, pgoff_t offset, uint32 wait_event_info)
2466{
2467 int returnCode;
2468
2469 Assert(FileIsValid(file));
2470
2471 DO_DB(elog(LOG, "FileTruncate %d (%s)",
2472 file, VfdCache[file].fileName));
2473
2474 returnCode = FileAccess(file);
2475 if (returnCode < 0)
2476 return returnCode;
2477
2478 pgstat_report_wait_start(wait_event_info);
2479 returnCode = pg_ftruncate(VfdCache[file].fd, offset);
2481
2482 if (returnCode == 0 && VfdCache[file].fileSize > offset)
2483 {
2484 /* adjust our state for truncation of a temp file */
2485 Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2486 temporary_files_size -= VfdCache[file].fileSize - offset;
2487 VfdCache[file].fileSize = offset;
2488 }
2489
2490 return returnCode;
2491}
2492
2493/*
2494 * Return the pathname associated with an open file.
2495 *
2496 * The returned string points to an internal buffer, which is valid until
2497 * the file is closed.
2498 */
2499char *
2500FilePathName(File file)
2501{
2502 Assert(FileIsValid(file));
2503
2504 return VfdCache[file].fileName;
2505}
2506
2507/*
2508 * Return the raw file descriptor of an opened file.
2509 *
2510 * The returned file descriptor will be valid until the file is closed, but
2511 * there are a lot of things that can make that happen. So the caller should
2512 * be careful not to do much of anything else before it finishes using the
2513 * returned file descriptor.
2514 */
2515int
2516FileGetRawDesc(File file)
2517{
2518 int returnCode;
2519
2520 returnCode = FileAccess(file);
2521 if (returnCode < 0)
2522 return returnCode;
2523
2524 Assert(FileIsValid(file));
2525 return VfdCache[file].fd;
2526}
2527
2528/*
2529 * FileGetRawFlags - returns the file flags on open(2)
2530 */
2531int
2533{
2534 Assert(FileIsValid(file));
2535 return VfdCache[file].fileFlags;
2536}
2537
2538/*
2539 * FileGetRawMode - returns the mode bitmask passed to open(2)
2540 */
2541mode_t
2542FileGetRawMode(File file)
2543{
2544 Assert(FileIsValid(file));
2545 return VfdCache[file].fileMode;
2546}
2547
2548/*
2549 * Make room for another allocatedDescs[] array entry if needed and possible.
2550 * Returns true if an array element is available.
2551 */
2552static bool
2554{
2556 int newMax;
2557
2558 /* Quick out if array already has a free slot. */
2560 return true;
2561
2562 /*
2563 * If the array hasn't yet been created in the current process, initialize
2564 * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2565 * we will ever need, anyway. We don't want to look at max_safe_fds
2566 * immediately because set_max_safe_fds() may not have run yet.
2567 */
2568 if (allocatedDescs == NULL)
2569 {
2570 newMax = FD_MINFREE / 3;
2572 /* Out of memory already? Treat as fatal error. */
2573 if (newDescs == NULL)
2574 ereport(ERROR,
2576 errmsg("out of memory")));
2579 return true;
2580 }
2581
2582 /*
2583 * Consider enlarging the array beyond the initial allocation used above.
2584 * By the time this happens, max_safe_fds should be known accurately.
2585 *
2586 * We mustn't let allocated descriptors hog all the available FDs, and in
2587 * practice we'd better leave a reasonable number of FDs for VFD use. So
2588 * set the maximum to max_safe_fds / 3. (This should certainly be at
2589 * least as large as the initial size, FD_MINFREE / 3, so we aren't
2590 * tightening the restriction here.) Recall that "external" FDs are
2591 * allowed to consume another third of max_safe_fds.
2592 */
2593 newMax = max_safe_fds / 3;
2595 {
2597 newMax * sizeof(AllocateDesc));
2598 /* Treat out-of-memory as a non-fatal error. */
2599 if (newDescs == NULL)
2600 return false;
2603 return true;
2604 }
2605
2606 /* Can't enlarge allocatedDescs[] any more. */
2607 return false;
2608}
2609
2610/*
2611 * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2612 * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2613 * necessary to open the file. When done, call FreeFile rather than fclose.
2614 *
2615 * Note that files that will be open for any significant length of time
2616 * should NOT be handled this way, since they cannot share kernel file
2617 * descriptors with other files; there is grave risk of running out of FDs
2618 * if anyone locks down too many FDs. Most callers of this routine are
2619 * simply reading a config file that they will read and close immediately.
2620 *
2621 * fd.c will automatically close all files opened with AllocateFile at
2622 * transaction commit or abort; this prevents FD leakage if a routine
2623 * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2624 *
2625 * Ideally this should be the *only* direct call of fopen() in the backend.
2626 */
2627FILE *
2628AllocateFile(const char *name, const char *mode)
2629{
2630 FILE *file;
2631
2632 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2634
2635 /* Can we allocate another non-virtual FD? */
2636 if (!reserveAllocatedDesc())
2637 ereport(ERROR,
2639 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2641
2642 /* Close excess kernel FDs. */
2644
2645TryAgain:
2646 if ((file = fopen(name, mode)) != NULL)
2647 {
2649
2650 desc->kind = AllocateDescFile;
2651 desc->desc.file = file;
2654 return desc->desc.file;
2655 }
2656
2657 if (errno == EMFILE || errno == ENFILE)
2658 {
2659 int save_errno = errno;
2660
2661 ereport(LOG,
2663 errmsg("out of file descriptors: %m; release and retry")));
2664 errno = 0;
2665 if (ReleaseLruFile())
2666 goto TryAgain;
2667 errno = save_errno;
2668 }
2669
2670 return NULL;
2671}
2672
2673/*
2674 * Open a file with OpenTransientFilePerm() and pass default file mode for
2675 * the fileMode parameter.
2676 */
2677int
2678OpenTransientFile(const char *fileName, int fileFlags)
2679{
2680 return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2681}
2682
2683/*
2684 * Like AllocateFile, but returns an unbuffered fd like open(2)
2685 */
2686int
2687OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2688{
2689 int fd;
2690
2691 DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2692 numAllocatedDescs, fileName));
2693
2694 /* Can we allocate another non-virtual FD? */
2695 if (!reserveAllocatedDesc())
2696 ereport(ERROR,
2698 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2699 maxAllocatedDescs, fileName)));
2700
2701 /* Close excess kernel FDs. */
2703
2704 fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2705
2706 if (fd >= 0)
2707 {
2709
2710 desc->kind = AllocateDescRawFD;
2711 desc->desc.fd = fd;
2714
2715 return fd;
2716 }
2717
2718 return -1; /* failure */
2719}
2720
2721/*
2722 * Routines that want to initiate a pipe stream should use OpenPipeStream
2723 * rather than plain popen(). This lets fd.c deal with freeing FDs if
2724 * necessary. When done, call ClosePipeStream rather than pclose.
2725 *
2726 * This function also ensures that the popen'd program is run with default
2727 * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2728 * uses. This ensures desirable response to, eg, closing a read pipe early.
2729 */
2730FILE *
2731OpenPipeStream(const char *command, const char *mode)
2732{
2733 FILE *file;
2734 int save_errno;
2735
2736 DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2737 numAllocatedDescs, command));
2738
2739 /* Can we allocate another non-virtual FD? */
2740 if (!reserveAllocatedDesc())
2741 ereport(ERROR,
2743 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2744 maxAllocatedDescs, command)));
2745
2746 /* Close excess kernel FDs. */
2748
2749TryAgain:
2750 fflush(NULL);
2752 errno = 0;
2753 file = popen(command, mode);
2754 save_errno = errno;
2756 errno = save_errno;
2757 if (file != NULL)
2758 {
2760
2761 desc->kind = AllocateDescPipe;
2762 desc->desc.file = file;
2765 return desc->desc.file;
2766 }
2767
2768 if (errno == EMFILE || errno == ENFILE)
2769 {
2770 ereport(LOG,
2772 errmsg("out of file descriptors: %m; release and retry")));
2773 if (ReleaseLruFile())
2774 goto TryAgain;
2775 errno = save_errno;
2776 }
2777
2778 return NULL;
2779}
2780
2781/*
2782 * Free an AllocateDesc of any type.
2783 *
2784 * The argument *must* point into the allocatedDescs[] array.
2785 */
2786static int
2788{
2789 int result;
2790
2791 /* Close the underlying object */
2792 switch (desc->kind)
2793 {
2794 case AllocateDescFile:
2795 result = fclose(desc->desc.file);
2796 break;
2797 case AllocateDescPipe:
2798 result = pclose(desc->desc.file);
2799 break;
2800 case AllocateDescDir:
2801 result = closedir(desc->desc.dir);
2802 break;
2803 case AllocateDescRawFD:
2804 pgaio_closing_fd(desc->desc.fd);
2805 result = close(desc->desc.fd);
2806 break;
2807 default:
2808 elog(ERROR, "AllocateDesc kind not recognized");
2809 result = 0; /* keep compiler quiet */
2810 break;
2811 }
2812
2813 /* Compact storage in the allocatedDescs array */
2816
2817 return result;
2818}
2819
2820/*
2821 * Close a file returned by AllocateFile.
2822 *
2823 * Note we do not check fclose's return value --- it is up to the caller
2824 * to handle close errors.
2825 */
2826int
2827FreeFile(FILE *file)
2828{
2829 int i;
2830
2831 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2832
2833 /* Remove file from list of allocated files, if it's present */
2834 for (i = numAllocatedDescs; --i >= 0;)
2835 {
2836 AllocateDesc *desc = &allocatedDescs[i];
2837
2838 if (desc->kind == AllocateDescFile && desc->desc.file == file)
2839 return FreeDesc(desc);
2840 }
2841
2842 /* Only get here if someone passes us a file not in allocatedDescs */
2843 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2844
2845 return fclose(file);
2846}
2847
2848/*
2849 * Close a file returned by OpenTransientFile.
2850 *
2851 * Note we do not check close's return value --- it is up to the caller
2852 * to handle close errors.
2853 */
2854int
2856{
2857 int i;
2858
2859 DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2860
2861 /* Remove fd from list of allocated files, if it's present */
2862 for (i = numAllocatedDescs; --i >= 0;)
2863 {
2864 AllocateDesc *desc = &allocatedDescs[i];
2865
2866 if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2867 return FreeDesc(desc);
2868 }
2869
2870 /* Only get here if someone passes us a file not in allocatedDescs */
2871 elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2872
2874
2875 return close(fd);
2876}
2877
2878/*
2879 * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2880 * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2881 * necessary to open the directory, and with closing it after an elog.
2882 * When done, call FreeDir rather than closedir.
2883 *
2884 * Returns NULL, with errno set, on failure. Note that failure detection
2885 * is commonly left to the following call of ReadDir or ReadDirExtended;
2886 * see the comments for ReadDir.
2887 *
2888 * Ideally this should be the *only* direct call of opendir() in the backend.
2889 */
2890DIR *
2891AllocateDir(const char *dirname)
2892{
2893 DIR *dir;
2894
2895 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2896 numAllocatedDescs, dirname));
2897
2898 /* Can we allocate another non-virtual FD? */
2899 if (!reserveAllocatedDesc())
2900 ereport(ERROR,
2902 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2903 maxAllocatedDescs, dirname)));
2904
2905 /* Close excess kernel FDs. */
2907
2908TryAgain:
2909 if ((dir = opendir(dirname)) != NULL)
2910 {
2912
2913 desc->kind = AllocateDescDir;
2914 desc->desc.dir = dir;
2917 return desc->desc.dir;
2918 }
2919
2920 if (errno == EMFILE || errno == ENFILE)
2921 {
2922 int save_errno = errno;
2923
2924 ereport(LOG,
2926 errmsg("out of file descriptors: %m; release and retry")));
2927 errno = 0;
2928 if (ReleaseLruFile())
2929 goto TryAgain;
2930 errno = save_errno;
2931 }
2932
2933 return NULL;
2934}
2935
2936/*
2937 * Read a directory opened with AllocateDir, ereport'ing any error.
2938 *
2939 * This is easier to use than raw readdir() since it takes care of some
2940 * otherwise rather tedious and error-prone manipulation of errno. Also,
2941 * if you are happy with a generic error message for AllocateDir failure,
2942 * you can just do
2943 *
2944 * dir = AllocateDir(path);
2945 * while ((dirent = ReadDir(dir, path)) != NULL)
2946 * process dirent;
2947 * FreeDir(dir);
2948 *
2949 * since a NULL dir parameter is taken as indicating AllocateDir failed.
2950 * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2951 * use this shortcut.)
2952 *
2953 * The pathname passed to AllocateDir must be passed to this routine too,
2954 * but it is only used for error reporting.
2955 */
2956struct dirent *
2957ReadDir(DIR *dir, const char *dirname)
2958{
2959 return ReadDirExtended(dir, dirname, ERROR);
2960}
2961
2962/*
2963 * Alternate version of ReadDir that allows caller to specify the elevel
2964 * for any error report (whether it's reporting an initial failure of
2965 * AllocateDir or a subsequent directory read failure).
2966 *
2967 * If elevel < ERROR, returns NULL after any error. With the normal coding
2968 * pattern, this will result in falling out of the loop immediately as
2969 * though the directory contained no (more) entries.
2970 */
2971struct dirent *
2972ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2973{
2974 struct dirent *dent;
2975
2976 /* Give a generic message for AllocateDir failure, if caller didn't */
2977 if (dir == NULL)
2978 {
2979 ereport(elevel,
2981 errmsg("could not open directory \"%s\": %m",
2982 dirname)));
2983 return NULL;
2984 }
2985
2986 errno = 0;
2987 if ((dent = readdir(dir)) != NULL)
2988 return dent;
2989
2990 if (errno)
2991 ereport(elevel,
2993 errmsg("could not read directory \"%s\": %m",
2994 dirname)));
2995 return NULL;
2996}
2997
2998/*
2999 * Close a directory opened with AllocateDir.
3000 *
3001 * Returns closedir's return value (with errno set if it's not 0).
3002 * Note we do not check the return value --- it is up to the caller
3003 * to handle close errors if wanted.
3004 *
3005 * Does nothing if dir == NULL; we assume that directory open failure was
3006 * already reported if desired.
3007 */
3008int
3009FreeDir(DIR *dir)
3010{
3011 int i;
3012
3013 /* Nothing to do if AllocateDir failed */
3014 if (dir == NULL)
3015 return 0;
3016
3017 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
3018
3019 /* Remove dir from list of allocated dirs, if it's present */
3020 for (i = numAllocatedDescs; --i >= 0;)
3021 {
3022 AllocateDesc *desc = &allocatedDescs[i];
3023
3024 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
3025 return FreeDesc(desc);
3026 }
3027
3028 /* Only get here if someone passes us a dir not in allocatedDescs */
3029 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
3030
3031 return closedir(dir);
3032}
3033
3034
3035/*
3036 * Close a pipe stream returned by OpenPipeStream.
3037 */
3038int
3039ClosePipeStream(FILE *file)
3040{
3041 int i;
3042
3043 DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
3044
3045 /* Remove file from list of allocated files, if it's present */
3046 for (i = numAllocatedDescs; --i >= 0;)
3047 {
3048 AllocateDesc *desc = &allocatedDescs[i];
3049
3050 if (desc->kind == AllocateDescPipe && desc->desc.file == file)
3051 return FreeDesc(desc);
3052 }
3053
3054 /* Only get here if someone passes us a file not in allocatedDescs */
3055 elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
3056
3057 return pclose(file);
3058}
3059
3060/*
3061 * closeAllVfds
3062 *
3063 * Force all VFDs into the physically-closed state, so that the fewest
3064 * possible number of kernel file descriptors are in use. There is no
3065 * change in the logical state of the VFDs.
3066 */
3067void
3068closeAllVfds(void)
3069{
3070 Index i;
3071
3072 if (SizeVfdCache > 0)
3073 {
3074 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3075 for (i = 1; i < SizeVfdCache; i++)
3076 {
3077 if (!FileIsNotOpen(i))
3078 LruDelete(i);
3079 }
3080 }
3081}
3082
3083
3084/*
3085 * SetTempTablespaces
3086 *
3087 * Define a list (actually an array) of OIDs of tablespaces to use for
3088 * temporary files. This list will be used until end of transaction,
3089 * unless this function is called again before then. It is caller's
3090 * responsibility that the passed-in array has adequate lifespan (typically
3091 * it'd be allocated in TopTransactionContext).
3092 *
3093 * Some entries of the array may be InvalidOid, indicating that the current
3094 * database's default tablespace should be used.
3095 */
3096void
3098{
3099 Assert(numSpaces >= 0);
3102
3103 /*
3104 * Select a random starting point in the list. This is to minimize
3105 * conflicts between backends that are most likely sharing the same list
3106 * of temp tablespaces. Note that if we create multiple temp files in the
3107 * same transaction, we'll advance circularly through the list --- this
3108 * ensures that large temporary sort files are nicely spread across all
3109 * available tablespaces.
3110 */
3111 if (numSpaces > 1)
3113 0, numSpaces - 1);
3114 else
3116}
3117
3118/*
3119 * TempTablespacesAreSet
3120 *
3121 * Returns true if SetTempTablespaces has been called in current transaction.
3122 * (This is just so that tablespaces.c doesn't need its own per-transaction
3123 * state.)
3124 */
3125bool
3127{
3128 return (numTempTableSpaces >= 0);
3129}
3130
3131/*
3132 * GetTempTablespaces
3133 *
3134 * Populate an array with the OIDs of the tablespaces that should be used for
3135 * temporary files. (Some entries may be InvalidOid, indicating that the
3136 * current database's default tablespace should be used.) At most numSpaces
3137 * entries will be filled.
3138 * Returns the number of OIDs that were copied into the output array.
3139 */
3140int
3142{
3143 int i;
3144
3146 for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
3148
3149 return i;
3150}
3151
3152/*
3153 * GetNextTempTableSpace
3154 *
3155 * Select the next temp tablespace to use. A result of InvalidOid means
3156 * to use the current database's default tablespace.
3157 */
3158Oid
3160{
3161 if (numTempTableSpaces > 0)
3162 {
3163 /* Advance nextTempTableSpace counter with wraparound */
3167 }
3168 return InvalidOid;
3169}
3170
3171
3172/*
3173 * AtEOSubXact_Files
3174 *
3175 * Take care of subtransaction commit/abort. At abort, we close AllocateDescs
3176 * that the subtransaction may have opened. At commit, we reassign them to
3177 * the parent subtransaction. (Temporary files are tracked by ResourceOwners
3178 * instead.)
3179 */
3180void
3183{
3184 Index i;
3185
3186 for (i = 0; i < numAllocatedDescs; i++)
3187 {
3188 if (allocatedDescs[i].create_subid == mySubid)
3189 {
3190 if (isCommit)
3192 else
3193 {
3194 /* have to recheck the item after FreeDesc (ugly) */
3196 }
3197 }
3198 }
3199}
3200
3201/*
3202 * AtEOXact_Files
3203 *
3204 * This routine is called during transaction commit or abort. All still-open
3205 * per-transaction temporary file VFDs are closed, which also causes the
3206 * underlying files to be deleted (although they should've been closed already
3207 * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
3208 * closed. We also forget any transaction-local temp tablespace list.
3209 *
3210 * The isCommit flag is used only to decide whether to emit warnings about
3211 * unclosed files.
3212 */
3213void
3215{
3216 CleanupTempFiles(isCommit, false);
3218 numTempTableSpaces = -1;
3219}
3220
3221/*
3222 * BeforeShmemExit_Files
3223 *
3224 * before_shmem_exit hook to clean up temp files during backend shutdown.
3225 * Here, we want to clean up *all* temp files including interXact ones.
3226 */
3227static void
3229{
3230 CleanupTempFiles(false, true);
3231
3232 /* prevent further temp files from being created */
3233#ifdef USE_ASSERT_CHECKING
3235#endif
3236}
3237
3238/*
3239 * Close temporary files and delete their underlying files.
3240 *
3241 * isCommit: if true, this is normal transaction commit, and we don't
3242 * expect any remaining files; warn if there are some.
3243 *
3244 * isProcExit: if true, this is being called as the backend process is
3245 * exiting. If that's the case, we should remove all temporary files; if
3246 * that's not the case, we are being called for transaction commit/abort
3247 * and should only remove transaction-local temp files. In either case,
3248 * also clean up "allocated" stdio files, dirs and fds.
3249 */
3250static void
3252{
3253 Index i;
3254
3255 /*
3256 * Careful here: at proc_exit we need extra cleanup, not just
3257 * xact_temporary files.
3258 */
3260 {
3261 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3262 for (i = 1; i < SizeVfdCache; i++)
3263 {
3264 unsigned short fdstate = VfdCache[i].fdstate;
3265
3266 if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3267 VfdCache[i].fileName != NULL)
3268 {
3269 /*
3270 * If we're in the process of exiting a backend process, close
3271 * all temporary files. Otherwise, only close temporary files
3272 * local to the current transaction. They should be closed by
3273 * the ResourceOwner mechanism already, so this is just a
3274 * debugging cross-check.
3275 */
3276 if (isProcExit)
3277 FileClose(i);
3278 else if (fdstate & FD_CLOSE_AT_EOXACT)
3279 {
3280 elog(WARNING,
3281 "temporary file %s not closed at end-of-transaction",
3282 VfdCache[i].fileName);
3283 FileClose(i);
3284 }
3285 }
3286 }
3287
3289 }
3290
3291 /* Complain if any allocated files remain open at commit. */
3292 if (isCommit && numAllocatedDescs > 0)
3293 elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3295
3296 /* Clean up "allocated" stdio files, dirs and fds. */
3297 while (numAllocatedDescs > 0)
3299}
3300
3301
3302/*
3303 * Remove temporary and temporary relation files left over from a prior
3304 * postmaster session
3305 *
3306 * This should be called during postmaster startup. It will forcibly
3307 * remove any leftover files created by OpenTemporaryFile and any leftover
3308 * temporary relation files created by mdcreate.
3309 *
3310 * During post-backend-crash restart cycle, this routine is called when
3311 * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3312 * queries are using temp files could result in useless storage usage that can
3313 * only be reclaimed by a service restart. The argument against enabling it is
3314 * that someone might want to examine the temporary files for debugging
3315 * purposes. This does however mean that OpenTemporaryFile had better allow for
3316 * collision with an existing temp file name.
3317 *
3318 * NOTE: this function and its subroutines generally report syscall failures
3319 * with ereport(LOG) and keep going. Removing temp files is not so critical
3320 * that we should fail to start the database when we can't do it.
3321 */
3322void
3324{
3326 DIR *spc_dir;
3327 struct dirent *spc_de;
3328
3329 /*
3330 * First process temp files in pg_default ($PGDATA/base)
3331 */
3332 snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3333 RemovePgTempFilesInDir(temp_path, true, false);
3335
3336 /*
3337 * Cycle through temp directories for all non-default tablespaces.
3338 */
3340
3342 {
3343 if (strcmp(spc_de->d_name, ".") == 0 ||
3344 strcmp(spc_de->d_name, "..") == 0)
3345 continue;
3346
3347 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s/%s",
3350 RemovePgTempFilesInDir(temp_path, true, false);
3351
3352 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
3355 }
3356
3358
3359 /*
3360 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3361 * DataDir as well. However, that is *not* cleaned here because doing so
3362 * would create a race condition. It's done separately, earlier in
3363 * postmaster startup.
3364 */
3365}
3366
3367/*
3368 * Process one pgsql_tmp directory for RemovePgTempFiles.
3369 *
3370 * If missing_ok is true, it's all right for the named directory to not exist.
3371 * Any other problem results in a LOG message. (missing_ok should be true at
3372 * the top level, since pgsql_tmp directories are not created until needed.)
3373 *
3374 * At the top level, this should be called with unlink_all = false, so that
3375 * only files matching the temporary name prefix will be unlinked. When
3376 * recursing it will be called with unlink_all = true to unlink everything
3377 * under a top-level temporary directory.
3378 *
3379 * (These two flags could be replaced by one, but it seems clearer to keep
3380 * them separate.)
3381 */
3382void
3383RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3384{
3385 DIR *temp_dir;
3386 struct dirent *temp_de;
3387 char rm_path[MAXPGPATH * 2];
3388
3390
3391 if (temp_dir == NULL && errno == ENOENT && missing_ok)
3392 return;
3393
3395 {
3396 if (strcmp(temp_de->d_name, ".") == 0 ||
3397 strcmp(temp_de->d_name, "..") == 0)
3398 continue;
3399
3400 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3401 tmpdirname, temp_de->d_name);
3402
3403 if (unlink_all ||
3404 strncmp(temp_de->d_name,
3407 {
3409
3410 if (type == PGFILETYPE_ERROR)
3411 continue;
3412 else if (type == PGFILETYPE_DIR)
3413 {
3414 /* recursively remove contents, then directory itself */
3415 RemovePgTempFilesInDir(rm_path, false, true);
3416
3417 if (rmdir(rm_path) < 0)
3418 ereport(LOG,
3420 errmsg("could not remove directory \"%s\": %m",
3421 rm_path)));
3422 }
3423 else
3424 {
3425 if (unlink(rm_path) < 0)
3426 ereport(LOG,
3428 errmsg("could not remove file \"%s\": %m",
3429 rm_path)));
3430 }
3431 }
3432 else
3433 ereport(LOG,
3434 (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3435 rm_path)));
3436 }
3437
3439}
3440
3441/* Process one tablespace directory, look for per-DB subdirectories */
3442static void
3444{
3445 DIR *ts_dir;
3446 struct dirent *de;
3447 char dbspace_path[MAXPGPATH * 2];
3448
3450
3451 while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3452 {
3453 /*
3454 * We're only interested in the per-database directories, which have
3455 * numeric names. Note that this code will also (properly) ignore "."
3456 * and "..".
3457 */
3458 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3459 continue;
3460
3461 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3462 tsdirname, de->d_name);
3464 }
3465
3466 FreeDir(ts_dir);
3467}
3468
3469/* Process one per-dbspace directory for RemovePgTempRelationFiles */
3470static void
3472{
3474 struct dirent *de;
3475 char rm_path[MAXPGPATH * 2];
3476
3478
3480 {
3481 if (!looks_like_temp_rel_name(de->d_name))
3482 continue;
3483
3484 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3485 dbspacedirname, de->d_name);
3486
3487 if (unlink(rm_path) < 0)
3488 ereport(LOG,
3490 errmsg("could not remove file \"%s\": %m",
3491 rm_path)));
3492 }
3493
3495}
3496
/*
 * Decide whether a file name has the shape of a temporary relation file:
 * t<digits>_<digits>, or t<digits>_<digits>_<forkname>, either of which may
 * be followed by .<digits> (a segment suffix).
 */
bool
looks_like_temp_rel_name(const char *name)
{
	const char *cursor = name;
	const char *runStart;

	/* A leading "t" is mandatory. */
	if (*cursor != 't')
		return false;
	cursor++;

	/* Next: one or more digits, terminated by an underscore. */
	runStart = cursor;
	while (isdigit((unsigned char) *cursor))
		cursor++;
	if (cursor == runStart || *cursor != '_')
		return false;
	cursor++;

	/* Next: a second run of one or more digits. */
	runStart = cursor;
	while (isdigit((unsigned char) *cursor))
		cursor++;
	if (cursor == runStart)
		return false;

	/* Optional "_forkname"; forkname_chars() reports how long it is. */
	if (*cursor == '_')
	{
		int			forkLen = forkname_chars(cursor + 1, NULL);

		if (forkLen <= 0)
			return false;
		cursor += forkLen + 1;
	}

	/* Optional ".segment", which needs at least one digit after the dot. */
	if (*cursor == '.')
	{
		runStart = cursor + 1;
		cursor = runStart;
		while (isdigit((unsigned char) *cursor))
			cursor++;
		if (cursor == runStart)
			return false;
	}

	/* Nothing may follow. */
	return *cursor == '\0';
}
3545
3546#ifdef HAVE_SYNCFS
3547static void
3548do_syncfs(const char *path)
3549{
3550 int fd;
3551
3552 ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
3553 path);
3554
3555 fd = OpenTransientFile(path, O_RDONLY);
3556 if (fd < 0)
3557 {
3558 ereport(LOG,
3560 errmsg("could not open file \"%s\": %m", path)));
3561 return;
3562 }
3563 if (syncfs(fd) < 0)
3564 ereport(LOG,
3566 errmsg("could not synchronize file system for file \"%s\": %m", path)));
3568}
3569#endif
3570
3571/*
3572 * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3573 * all potential filesystem, depending on recovery_init_sync_method setting.
3574 *
3575 * We fsync regular files and directories wherever they are, but we
3576 * follow symlinks only for pg_wal and immediately under pg_tblspc.
3577 * Other symlinks are presumed to point at files we're not responsible
3578 * for fsyncing, and might not have privileges to write at all.
3579 *
3580 * Errors are logged but not considered fatal; that's because this is used
3581 * only during database startup, to deal with the possibility that there are
3582 * issued-but-unsynced writes pending against the data directory. We want to
3583 * ensure that such writes reach disk before anything that's done in the new
3584 * run. However, aborting on error would result in failure to start for
3585 * harmless cases such as read-only files in the data directory, and that's
3586 * not good either.
3587 *
3588 * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3589 * rewriting all changes again during recovery.
3590 *
3591 * Note we assume we're chdir'd into PGDATA to begin with.
3592 */
3593void
3595{
3596 bool xlog_is_symlink;
3597
3598 /* We can skip this whole thing if fsync is disabled. */
3599 if (!enableFsync)
3600 return;
3601
3602 /*
3603 * If pg_wal is a symlink, we'll need to recurse into it separately,
3604 * because the first walkdir below will ignore it.
3605 */
3606 xlog_is_symlink = false;
3607
3608 {
3609 struct stat st;
3610
3611 if (lstat("pg_wal", &st) < 0)
3612 ereport(LOG,
3614 errmsg("could not stat file \"%s\": %m",
3615 "pg_wal")));
3616 else if (S_ISLNK(st.st_mode))
3617 xlog_is_symlink = true;
3618 }
3619
3620#ifdef HAVE_SYNCFS
3622 {
3623 DIR *dir;
3624 struct dirent *de;
3625
3626 /*
3627 * On Linux, we don't have to open every single file one by one. We
3628 * can use syncfs() to sync whole filesystems. We only expect
3629 * filesystem boundaries to exist where we tolerate symlinks, namely
3630 * pg_wal and the tablespaces, so we call syncfs() for each of those
3631 * directories.
3632 */
3633
3634 /* Prepare to report progress syncing the data directory via syncfs. */
3636
3637 /* Sync the top level pgdata directory. */
3638 do_syncfs(".");
3639 /* If any tablespaces are configured, sync each of those. */
3641 while ((de = ReadDirExtended(dir, PG_TBLSPC_DIR, LOG)))
3642 {
3643 char path[MAXPGPATH];
3644
3645 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3646 continue;
3647
3648 snprintf(path, MAXPGPATH, "%s/%s", PG_TBLSPC_DIR, de->d_name);
3649 do_syncfs(path);
3650 }
3651 FreeDir(dir);
3652 /* If pg_wal is a symlink, process that too. */
3653 if (xlog_is_symlink)
3654 do_syncfs("pg_wal");
3655 return;
3656 }
3657#endif /* !HAVE_SYNCFS */
3658
3659#ifdef PG_FLUSH_DATA_WORKS
3660 /* Prepare to report progress of the pre-fsync phase. */
3662
3663 /*
3664 * If possible, hint to the kernel that we're soon going to fsync the data
3665 * directory and its contents. Errors in this step are even less
3666 * interesting than normal, so log them only at DEBUG1.
3667 */
3668 walkdir(".", pre_sync_fname, false, DEBUG1);
3669 if (xlog_is_symlink)
3670 walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3672#endif
3673
3674 /* Prepare to report progress syncing the data directory via fsync. */
3676
3677 /*
3678 * Now we do the fsync()s in the same order.
3679 *
3680 * The main call ignores symlinks, so in addition to specially processing
3681 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3682 * process_symlinks = true. Note that if there are any plain directories
3683 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3684 * so we don't worry about optimizing it.
3685 */
3686 walkdir(".", datadir_fsync_fname, false, LOG);
3687 if (xlog_is_symlink)
3688 walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3690}
3691
3692/*
3693 * walkdir: recursively walk a directory, applying the action to each
3694 * regular file and directory (including the named directory itself).
3695 *
3696 * If process_symlinks is true, the action and recursion are also applied
3697 * to regular files and directories that are pointed to by symlinks in the
3698 * given directory; otherwise symlinks are ignored. Symlinks are always
3699 * ignored in subdirectories, ie we intentionally don't pass down the
3700 * process_symlinks flag to recursive calls.
3701 *
3702 * Errors are reported at level elevel, which might be ERROR or less.
3703 *
3704 * See also walkdir in file_utils.c, which is a frontend version of this
3705 * logic.
3706 */
3707static void
3708walkdir(const char *path,
3709 void (*action) (const char *fname, bool isdir, int elevel),
3710 bool process_symlinks,
3711 int elevel)
3712{
3713 DIR *dir;
3714 struct dirent *de;
3715
3716 dir = AllocateDir(path);
3717
3718 while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3719 {
3720 char subpath[MAXPGPATH * 2];
3721
3723
3724 if (strcmp(de->d_name, ".") == 0 ||
3725 strcmp(de->d_name, "..") == 0)
3726 continue;
3727
3728 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3729
3730 switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3731 {
3732 case PGFILETYPE_REG:
3733 (*action) (subpath, false, elevel);
3734 break;
3735 case PGFILETYPE_DIR:
3736 walkdir(subpath, action, false, elevel);
3737 break;
3738 default:
3739
3740 /*
3741 * Errors are already reported directly by get_dirent_type(),
3742 * and any remaining symlinks and unknown file types are
3743 * ignored.
3744 */
3745 break;
3746 }
3747 }
3748
3749 FreeDir(dir); /* we ignore any error here */
3750
3751 /*
3752 * It's important to fsync the destination directory itself as individual
3753 * file fsyncs don't guarantee that the directory entry for the file is
3754 * synced. However, skip this if AllocateDir failed; the action function
3755 * might not be robust against that.
3756 */
3757 if (dir)
3758 (*action) (path, true, elevel);
3759}
3760
3761
3762/*
3763 * Hint to the OS that it should get ready to fsync() this file.
3764 *
3765 * Ignores errors trying to open unreadable files, and logs other errors at a
3766 * caller-specified level.
3767 */
3768#ifdef PG_FLUSH_DATA_WORKS
3769
3770static void
3771pre_sync_fname(const char *fname, bool isdir, int elevel)
3772{
3773 int fd;
3774
3775 /* Don't try to flush directories, it'll likely just fail */
3776 if (isdir)
3777 return;
3778
3779 ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
3780 fname);
3781
3783
3784 if (fd < 0)
3785 {
3786 if (errno == EACCES)
3787 return;
3788 ereport(elevel,
3790 errmsg("could not open file \"%s\": %m", fname)));
3791 return;
3792 }
3793
3794 /*
3795 * pg_flush_data() ignores errors, which is ok because this is only a
3796 * hint.
3797 */
3798 pg_flush_data(fd, 0, 0);
3799
3800 if (CloseTransientFile(fd) != 0)
3801 ereport(elevel,
3803 errmsg("could not close file \"%s\": %m", fname)));
3804}
3805
3806#endif /* PG_FLUSH_DATA_WORKS */
3807
/*
 * walkdir action used when syncing the data directory: report startup
 * progress for the path, then fsync it.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/*
	 * Unreadable files should be skipped silently, so ask fsync_fname_ext()
	 * to ignore permission errors.
	 */
	const bool	ignore_perm = true;

	ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	/* The result is deliberately discarded; failures are reported at elevel. */
	(void) fsync_fname_ext(fname, isdir, ignore_perm, elevel);
}
3820
/*
 * walkdir action that removes a file or directory, tolerating the case
 * where a directory is already gone.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		/* A directory that no longer exists (ENOENT) is not an error. */
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}
3837
3838/*
3839 * fsync_fname_ext -- Try to fsync a file or directory
3840 *
3841 * If ignore_perm is true, ignore errors upon trying to open unreadable
3842 * files. Logs other errors at a caller-specified level.
3843 *
3844 * Returns 0 if the operation succeeded, -1 otherwise.
3845 */
3846int
3847fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3848{
3849 int fd;
3850 int flags;
3851 int returncode;
3852
3853 /*
3854 * Some OSs require directories to be opened read-only whereas other
3855 * systems don't allow us to fsync files opened read-only; so we need both
3856 * cases here. Using O_RDWR will cause us to fail to fsync files that are
3857 * not writable by our userid, but we assume that's OK.
3858 */
3859 flags = PG_BINARY;
3860 if (!isdir)
3861 flags |= O_RDWR;
3862 else
3863 flags |= O_RDONLY;
3864
3865 fd = OpenTransientFile(fname, flags);
3866
3867 /*
3868 * Some OSs don't allow us to open directories at all (Windows returns
3869 * EACCES), just ignore the error in that case. If desired also silently
3870 * ignoring errors about unreadable files. Log others.
3871 */
3872 if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3873 return 0;
3874 else if (fd < 0 && ignore_perm && errno == EACCES)
3875 return 0;
3876 else if (fd < 0)
3877 {
3878 ereport(elevel,
3880 errmsg("could not open file \"%s\": %m", fname)));
3881 return -1;
3882 }
3883
3885
3886 /*
3887 * Some OSes don't allow us to fsync directories at all, so we can ignore
3888 * those errors. Anything else needs to be logged.
3889 */
3890 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3891 {
3892 int save_errno;
3893
3894 /* close file upon error, might not be in transaction context */
3895 save_errno = errno;
3897 errno = save_errno;
3898
3899 ereport(elevel,
3901 errmsg("could not fsync file \"%s\": %m", fname)));
3902 return -1;
3903 }
3904
3905 if (CloseTransientFile(fd) != 0)
3906 {
3907 ereport(elevel,
3909 errmsg("could not close file \"%s\": %m", fname)));
3910 return -1;
3911 }
3912
3913 return 0;
3914}
3915
3916/*
3917 * fsync_parent_path -- fsync the parent path of a file or directory
3918 *
3919 * This is aimed at making file operations persistent on disk in case of
3920 * an OS crash or power failure.
3921 */
3922static int
3923fsync_parent_path(const char *fname, int elevel)
3924{
3925 char parentpath[MAXPGPATH];
3926
3927 strlcpy(parentpath, fname, MAXPGPATH);
3929
3930 /*
3931 * get_parent_directory() returns an empty string if the input argument is
3932 * just a file name (see comments in path.c), so handle that as being the
3933 * current directory.
3934 */
3935 if (strlen(parentpath) == 0)
3937
3938 if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3939 return -1;
3940
3941 return 0;
3942}
3943
3944/*
3945 * Create a PostgreSQL data sub-directory
3946 *
3947 * The data directory itself, and most of its sub-directories, are created at
3948 * initdb time, but we do have some occasions when we create directories in
3949 * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3950 * make sure that those directories are created consistently. Today, that means
3951 * making sure that the created directory has the correct permissions, which is
3952 * what pg_dir_create_mode tracks for us.
3953 *
3954 * Note that we also set the umask() based on what we understand the correct
3955 * permissions to be (see file_perm.c).
3956 *
3957 * For permissions other than the default, mkdir() can be used directly, but
3958 * be sure to consider carefully such cases -- a sub-directory with incorrect
3959 * permissions in a PostgreSQL data directory could cause backups and other
3960 * processes to fail.
3961 */
3962int
3963MakePGDirectory(const char *directoryName)
3964{
3966}
3967
3968/*
3969 * Return the passed-in error level, or PANIC if data_sync_retry is off.
3970 *
3971 * Failure to fsync any data file is cause for immediate panic, unless
3972 * data_sync_retry is enabled. Data may have been written to the operating
3973 * system and removed from our buffer pool already, and if we are running on
3974 * an operating system that forgets dirty data on write-back failure, there
3975 * may be only one copy of the data remaining: in the WAL. A later attempt to
3976 * fsync again might falsely report success. Therefore we must not allow any
3977 * further checkpoints to be attempted. data_sync_retry can in theory be
3978 * enabled on systems known not to drop dirty buffered data on write-back
3979 * failure (with the likely outcome that checkpoints will continue to fail
3980 * until the underlying problem is fixed).
3981 *
3982 * Any code that reports a failure from fsync() or related functions should
3983 * filter the error level with this function.
3984 */
3985int
3986data_sync_elevel(int elevel)
3987{
3988 return data_sync_retry ? elevel : PANIC;
3989}
3990
3991bool
3992check_debug_io_direct(char **newval, void **extra, GucSource source)
3993{
3994 bool result = true;
3995 int flags;
3996
3997#if PG_O_DIRECT == 0
3998 if (strcmp(*newval, "") != 0)
3999 {
4000 GUC_check_errdetail("\"%s\" is not supported on this platform.",
4001 "debug_io_direct");
4002 result = false;
4003 }
4004 flags = 0;
4005#else
4006 List *elemlist;
4007 ListCell *l;
4008 char *rawstring;
4009
4010 /* Need a modifiable copy of string */
4012
4013 if (!SplitGUCList(rawstring, ',', &elemlist))
4014 {
4015 GUC_check_errdetail("Invalid list syntax in parameter \"%s\".",
4016 "debug_io_direct");
4019 return false;
4020 }
4021
4022 flags = 0;
4023 foreach(l, elemlist)
4024 {
4025 char *item = (char *) lfirst(l);
4026
4027 if (pg_strcasecmp(item, "data") == 0)
4028 flags |= IO_DIRECT_DATA;
4029 else if (pg_strcasecmp(item, "wal") == 0)
4030 flags |= IO_DIRECT_WAL;
4031 else if (pg_strcasecmp(item, "wal_init") == 0)
4032 flags |= IO_DIRECT_WAL_INIT;
4033 else
4034 {
4035 GUC_check_errdetail("Invalid option \"%s\".", item);
4036 result = false;
4037 break;
4038 }
4039 }
4040
4041 /*
4042 * It's possible to configure block sizes smaller than our assumed I/O
4043 * alignment size, which could result in invalid I/O requests.
4044 */
4045#if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
4046 if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
4047 {
4048 GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small.",
4049 "debug_io_direct", "XLOG_BLCKSZ");
4050 result = false;
4051 }
4052#endif
4053#if BLCKSZ < PG_IO_ALIGN_SIZE
4054 if (result && (flags & IO_DIRECT_DATA))
4055 {
4056 GUC_check_errdetail("\"%s\" is not supported for data because %s is too small.",
4057 "debug_io_direct", "BLCKSZ");
4058 result = false;
4059 }
4060#endif
4061
4064#endif
4065
4066 if (!result)
4067 return result;
4068
4069 /* Save the flags in *extra, for use by assign_debug_io_direct */
4070 *extra = guc_malloc(LOG, sizeof(int));
4071 if (!*extra)
4072 return false;
4073 *((int *) *extra) = flags;
4074
4075 return result;
4076}
4077
4078void
4079assign_debug_io_direct(const char *newval, void *extra)
4080{
4081 int *flags = (int *) extra;
4082
4083 io_direct_flags = *flags;
4084}
4085
4086/* ResourceOwner callbacks */
4087
4088static void
4090{
4091 File file = (File) DatumGetInt32(res);
4092 Vfd *vfdP;
4093
4094 Assert(FileIsValid(file));
4095
4096 vfdP = &VfdCache[file];
4097 vfdP->resowner = NULL;
4098
4099 FileClose(file);
4100}
4101
4102static char *
4104{
4105 return psprintf("File %d", DatumGetInt32(res));
4106}
void pgaio_closing_fd(int fd)
Definition aio.c:1220
void pgaio_io_start_readv(PgAioHandle *ioh, int fd, int iovcnt, uint64 offset)
Definition aio_io.c:78
void begin_startup_progress_phase(void)
Definition startup.c:342
int fdatasync(int fd)
#define Min(x, y)
Definition c.h:1093
uint32 SubTransactionId
Definition c.h:742
#define INT64_FORMAT
Definition c.h:636
#define Assert(condition)
Definition c.h:945
int64_t int64
Definition c.h:615
#define PG_BINARY
Definition c.h:1376
uint64_t uint64
Definition c.h:619
uint32_t uint32
Definition c.h:618
unsigned int Index
Definition c.h:700
#define MemSet(start, val, len)
Definition c.h:1109
#define OidIsValid(objectId)
Definition c.h:860
size_t Size
Definition c.h:691
int closedir(DIR *)
Definition dirent.c:127
struct dirent * readdir(DIR *)
Definition dirent.c:78
DIR * opendir(const char *)
Definition dirent.c:33
Datum arg
Definition elog.c:1322
int errcode_for_file_access(void)
Definition elog.c:897
int errcode(int sqlerrcode)
Definition elog.c:874
#define LOG
Definition elog.h:31
int errdetail(const char *fmt,...) pg_attribute_printf(1
#define FATAL
Definition elog.h:41
#define WARNING
Definition elog.h:36
#define DEBUG2
Definition elog.h:29
#define PANIC
Definition elog.h:42
#define DEBUG1
Definition elog.h:30
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
int pg_truncate(const char *path, pgoff_t length)
Definition fd.c:721
int max_files_per_process
Definition fd.c:147
int FileGetRawDesc(File file)
Definition fd.c:2516
int MakePGDirectory(const char *directoryName)
Definition fd.c:3963
int FreeDir(DIR *dir)
Definition fd.c:3009
int recovery_init_sync_method
Definition fd.c:166
static const ResourceOwnerDesc file_resowner_desc
Definition fd.c:365
int pg_fsync_no_writethrough(int fd)
Definition fd.c:442
#define FD_MINFREE
Definition fd.c:139
FILE * OpenPipeStream(const char *command, const char *mode)
Definition fd.c:2731
static int numTempTableSpaces
Definition fd.c:293
static bool ReleaseLruFile(void)
Definition fd.c:1370
void FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
Definition fd.c:2123
int io_direct_flags
Definition fd.c:172
#define FD_DELETE_AT_CLOSE
Definition fd.c:196
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition fd.c:1112
static int maxAllocatedDescs
Definition fd.c:272
static void Delete(File file)
Definition fd.c:1254
static int FreeDesc(AllocateDesc *desc)
Definition fd.c:2787
static long tempFileCounter
Definition fd.c:284
static char * ResOwnerPrintFile(Datum res)
Definition fd.c:4103
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition fd.c:783
char * FilePathName(File file)
Definition fd.c:2500
static void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition fd.c:381
static int pg_ftruncate(int fd, pgoff_t length)
Definition fd.c:704
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition fd.c:3141
static int numAllocatedDescs
Definition fd.c:271
File PathNameOpenTemporaryFile(const char *path, int mode)
Definition fd.c:1889
static void LruDelete(File file)
Definition fd.c:1273
int pg_fdatasync(int fd)
Definition fd.c:481
#define FileIsValid(file)
Definition fd.c:190
void assign_debug_io_direct(const char *newval, void *extra)
Definition fd.c:4079
int FileSync(File file, uint32 wait_event_info)
Definition fd.c:2336
int FileStartReadV(PgAioHandle *ioh, File file, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2205
static int nfile
Definition fd.c:226
int CloseTransientFile(int fd)
Definition fd.c:2855
#define DO_DB(A)
Definition fd.c:184
int BasicOpenFile(const char *fileName, int fileFlags)
Definition fd.c:1090
void closeAllVfds(void)
Definition fd.c:3068
int max_safe_fds
Definition fd.c:160
static File AllocateVfd(void)
Definition fd.c:1402
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition fd.c:1849
void PathNameDeleteTemporaryDir(const char *dirname)
Definition fd.c:1679
int ClosePipeStream(FILE *file)
Definition fd.c:3039
void AtEOXact_Files(bool isCommit)
Definition fd.c:3214
int FileGetRawFlags(File file)
Definition fd.c:2532
static Size SizeVfdCache
Definition fd.c:221
static int nextTempTableSpace
Definition fd.c:294
#define FD_CLOSE_AT_EOXACT
Definition fd.c:197
int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition fd.c:3847
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition fd.c:3822
static void ResOwnerReleaseFile(Datum res)
Definition fd.c:4089
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition fd.c:3443
int FreeFile(FILE *file)
Definition fd.c:2827
ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2149
mode_t FileGetRawMode(File file)
Definition fd.c:2542
static AllocateDesc * allocatedDescs
Definition fd.c:273
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition fd.c:2972
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition fd.c:965
int FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2408
static int FileAccess(File file)
Definition fd.c:1480
pgoff_t FileSize(File file)
Definition fd.c:2448
static void FreeVfd(File file)
Definition fd.c:1460
struct vfd Vfd
int pg_fsync_writethrough(int fd)
Definition fd.c:462
void FileClose(File file)
Definition fd.c:1966
void ReleaseExternalFD(void)
Definition fd.c:1225
#define FD_TEMP_FILE_LIMIT
Definition fd.c:198
void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition fd.c:3383
bool pg_file_exists(const char *name)
Definition fd.c:504
void RemovePgTempFiles(void)
Definition fd.c:3323
#define FileIsNotOpen(file)
Definition fd.c:193
bool TempTablespacesAreSet(void)
Definition fd.c:3126
void fsync_fname(const char *fname, bool isdir)
Definition fd.c:757
int data_sync_elevel(int elevel)
Definition fd.c:3986
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition fd.c:1563
static void Insert(File file)
Definition fd.c:1301
AllocateDescKind
Definition fd.c:252
@ AllocateDescDir
Definition fd.c:255
@ AllocateDescPipe
Definition fd.c:254
@ AllocateDescFile
Definition fd.c:253
@ AllocateDescRawFD
Definition fd.c:256
Oid GetNextTempTableSpace(void)
Definition fd.c:3159
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition fd.c:1576
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition fd.c:3809
static void ReportTemporaryFileUsage(const char *path, pgoff_t size)
Definition fd.c:1516
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition fd.c:1792
void pg_flush_data(int fd, pgoff_t offset, pgoff_t nbytes)
Definition fd.c:526
bool AcquireExternalFD(void)
Definition fd.c:1172
static void RegisterTemporaryFile(File file)
Definition fd.c:1535
#define NUM_RESERVED_FDS
Definition fd.c:130
DIR * AllocateDir(const char *dirname)
Definition fd.c:2891
static Oid * tempTableSpaces
Definition fd.c:292
int FileTruncate(File file, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2465
static bool reserveAllocatedDesc(void)
Definition fd.c:2553
void InitFileAccess(void)
Definition fd.c:904
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition fd.c:3471
File OpenTemporaryFile(bool interXact)
Definition fd.c:1712
int durable_unlink(const char *fname, int elevel)
Definition fd.c:873
static uint64 temporary_files_size
Definition fd.c:240
void ReserveExternalFD(void)
Definition fd.c:1207
int FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2363
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition fd.c:2957
bool looks_like_temp_rel_name(const char *name)
Definition fd.c:3499
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition fd.c:1920
void set_max_safe_fds(void)
Definition fd.c:1045
int pg_fsync(int fd)
Definition fd.c:390
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition fd.c:3251
#define VFD_CLOSED
Definition fd.c:188
static bool have_xact_temporary_files
Definition fd.c:232
static int LruInsert(File file)
Definition fd.c:1323
static int numExternalFDs
Definition fd.c:278
static int fsync_parent_path(const char *fname, int elevel)
Definition fd.c:3923
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition fd.c:1648
FILE * AllocateFile(const char *name, const char *mode)
Definition fd.c:2628
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition fd.c:3181
int OpenTransientFile(const char *fileName, int fileFlags)
Definition fd.c:2678
void InitTemporaryFileAccess(void)
Definition fd.c:934
static Vfd * VfdCache
Definition fd.c:220
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition fd.c:2687
bool data_sync_retry
Definition fd.c:163
int FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2067
ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2231
static void ReleaseLruFiles(void)
Definition fd.c:1392
void SyncDataDirectory(void)
Definition fd.c:3594
bool check_debug_io_direct(char **newval, void **extra, GucSource source)
Definition fd.c:3992
static void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition fd.c:376
static void BeforeShmemExit_Files(int code, Datum arg)
Definition fd.c:3228
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition fd.c:3708
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition fd.c:3097
void TempTablespacePath(char *path, Oid tablespace)
Definition fd.c:1767
#define IO_DIRECT_WAL
Definition fd.h:55
#define IO_DIRECT_DATA
Definition fd.h:54
#define IO_DIRECT_WAL_INIT
Definition fd.h:56
int File
Definition fd.h:51
#define PG_O_DIRECT
Definition fd.h:123
int pg_file_create_mode
Definition file_perm.c:19
int pg_dir_create_mode
Definition file_perm.c:18
ssize_t pg_pwrite_zeros(int fd, size_t size, pgoff_t offset)
Definition file_utils.c:709
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition file_utils.c:547
#define PG_TEMP_FILES_DIR
Definition file_utils.h:63
#define PG_TEMP_FILE_PREFIX
Definition file_utils.h:64
PGFileType
Definition file_utils.h:19
@ PGFILETYPE_DIR
Definition file_utils.h:23
@ PGFILETYPE_REG
Definition file_utils.h:22
@ PGFILETYPE_ERROR
Definition file_utils.h:20
@ DATA_DIR_SYNC_METHOD_SYNCFS
Definition file_utils.h:30
int MyProcPid
Definition globals.c:47
bool enableFsync
Definition globals.c:129
Oid MyDatabaseTableSpace
Definition globals.c:96
void * guc_malloc(int elevel, size_t size)
Definition guc.c:637
#define newval
#define GUC_check_errdetail
Definition guc.h:507
GucSource
Definition guc.h:112
int temp_file_limit
Definition guc_tables.c:560
int log_temp_files
Definition guc_tables.c:555
#define close(a)
Definition win32.h:12
void before_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition ipc.c:344
return true
Definition isn.c:130
int j
Definition isn.c:78
int i
Definition isn.c:77
void list_free(List *list)
Definition list.c:1546
Datum subpath(PG_FUNCTION_ARGS)
Definition ltree_op.c:311
char * pstrdup(const char *in)
Definition mcxt.c:1781
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
#define MAP_FAILED
Definition mem.h:43
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:123
static char * errmsg
static char * basedir
static PgChecksumMode mode
#define MAXPGPATH
static ssize_t pg_preadv(int fd, const struct iovec *iov, int iovcnt, pgoff_t offset)
Definition pg_iovec.h:54
static ssize_t pg_pwritev(int fd, const struct iovec *iov, int iovcnt, pgoff_t offset)
Definition pg_iovec.h:93
#define lfirst(lc)
Definition pg_list.h:172
uint64 pg_prng_uint64_range(pg_prng_state *state, uint64 rmin, uint64 rmax)
Definition pg_prng.c:144
pg_prng_state pg_global_prng_state
Definition pg_prng.c:34
static rewind_source * source
Definition pg_rewind.c:89
static char buf[DEFAULT_XLOG_SEG_SIZE]
static char * tablespace
Definition pgbench.c:217
void pgstat_report_tempfile(size_t filesize)
#define pqsignal
Definition port.h:547
int pg_strcasecmp(const char *s1, const char *s2)
void get_parent_directory(char *path)
Definition path.c:1068
#define snprintf
Definition port.h:260
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition strlcpy.c:45
off_t pgoff_t
Definition port.h:421
uint64_t Datum
Definition postgres.h:70
static Datum Int32GetDatum(int32 X)
Definition postgres.h:212
static int32 DatumGetInt32(Datum X)
Definition postgres.h:202
#define InvalidOid
unsigned int Oid
static int fd(const char *x, int i)
static int fb(int x)
char * psprintf(const char *fmt,...)
Definition psprintf.c:43
int forkname_chars(const char *str, ForkNumber *fork)
Definition relpath.c:81
#define PG_TBLSPC_DIR
Definition relpath.h:41
#define TABLESPACE_VERSION_DIRECTORY
Definition relpath.h:33
ResourceOwner CurrentResourceOwner
Definition resowner.c:173
void ResourceOwnerForget(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition resowner.c:561
void ResourceOwnerRemember(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition resowner.c:521
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition resowner.c:449
@ RESOURCE_RELEASE_AFTER_LOCKS
Definition resowner.h:56
#define RELEASE_PRIO_FILES
Definition resowner.h:76
void pg_usleep(long microsec)
Definition signal.c:53
#define realloc(a, b)
#define free(a)
#define malloc(a)
static void error(void)
#define ereport_startup_progress(msg,...)
Definition startup.h:18
SubTransactionId create_subid
Definition fd.c:262
DIR * dir
Definition fd.c:266
FILE * file
Definition fd.c:265
int fd
Definition fd.c:267
union AllocateDesc::@20 desc
AllocateDescKind kind
Definition fd.c:261
Definition dirent.c:26
Definition pg_list.h:54
const char * name
Definition resowner.h:93
Definition fd.c:201
int fd
Definition fd.c:202
int fileFlags
Definition fd.c:211
File lruLessRecently
Definition fd.c:207
File lruMoreRecently
Definition fd.c:206
pgoff_t fileSize
Definition fd.c:208
char * fileName
Definition fd.c:209
ResourceOwner resowner
Definition fd.c:204
unsigned short fdstate
Definition fd.c:203
File nextFree
Definition fd.c:205
mode_t fileMode
Definition fd.c:212
bool SplitGUCList(char *rawstring, char separator, List **namelist)
Definition varlena.c:3025
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:69
static void pgstat_report_wait_end(void)
Definition wait_event.h:85
const char * type
const char * name
#define fsync(fd)
Definition win32_port.h:83
#define stat
Definition win32_port.h:74
#define EINTR
Definition win32_port.h:361
#define EOPNOTSUPP
Definition win32_port.h:385
#define SIGPIPE
Definition win32_port.h:163
#define lstat(path, sb)
Definition win32_port.h:275
#define S_ISDIR(m)
Definition win32_port.h:315
void _dosmaperr(unsigned long)
Definition win32error.c:177
#define S_ISLNK(m)
Definition win32_port.h:334
#define mkdir(a, b)
Definition win32_port.h:80
#define fstat
Definition win32_port.h:73
#define O_CLOEXEC
Definition win32_port.h:344
SubTransactionId GetCurrentSubTransactionId(void)
Definition xact.c:793
int wal_sync_method
Definition xlog.c:134
@ WAL_SYNC_METHOD_FSYNC_WRITETHROUGH
Definition xlog.h:28
static const char * directory
Definition zic.c:648

◆ FD_CLOSE_AT_EOXACT

#define FD_CLOSE_AT_EOXACT   (1 << 1) /* T = close at eoXact */

Definition at line 197 of file fd.c.

◆ FD_DELETE_AT_CLOSE

#define FD_DELETE_AT_CLOSE   (1 << 0) /* T = delete when closed */

Definition at line 196 of file fd.c.

◆ FD_MINFREE

#define FD_MINFREE   48

Definition at line 139 of file fd.c.

◆ FD_TEMP_FILE_LIMIT

#define FD_TEMP_FILE_LIMIT   (1 << 2) /* T = respect temp_file_limit */

Definition at line 198 of file fd.c.

◆ FileIsNotOpen

#define FileIsNotOpen (   file)    (VfdCache[file].fd == VFD_CLOSED)

Definition at line 193 of file fd.c.

◆ FileIsValid

#define FileIsValid (   file)     ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)

Definition at line 190 of file fd.c.

◆ NUM_RESERVED_FDS

#define NUM_RESERVED_FDS   10

Definition at line 130 of file fd.c.

◆ VFD_CLOSED

#define VFD_CLOSED   (-1)

Definition at line 188 of file fd.c.

Typedef Documentation

◆ Vfd

typedef struct vfd Vfd

Enumeration Type Documentation

◆ AllocateDescKind

Enumerator
AllocateDescFile 
AllocateDescPipe 
AllocateDescDir 
AllocateDescRawFD 

Definition at line 251 of file fd.c.

Function Documentation

◆ AcquireExternalFD()

bool AcquireExternalFD ( void  )

Definition at line 1172 of file fd.c.

1173{
1174 /*
1175 * We don't want more than max_safe_fds / 3 FDs to be consumed for
1176 * "external" FDs.
1177 */
1178 if (numExternalFDs < max_safe_fds / 3)
1179 {
1181 return true;
1182 }
1183 errno = EMFILE;
1184 return false;
1185}

References fb(), max_safe_fds, numExternalFDs, and ReserveExternalFD().

Referenced by CreateWaitEventSet(), and libpqsrv_connect_prepare().

◆ AllocateDir()

DIR * AllocateDir ( const char * dirname )

Definition at line 2891 of file fd.c.

2892{
2893 DIR *dir;
2894
2895 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2896 numAllocatedDescs, dirname));
2897
2898 /* Can we allocate another non-virtual FD? */
2899 if (!reserveAllocatedDesc())
2900 ereport(ERROR,
2902 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2903 maxAllocatedDescs, dirname)));
2904
2905 /* Close excess kernel FDs. */
2907
2908TryAgain:
2909 if ((dir = opendir(dirname)) != NULL)
2910 {
2912
2913 desc->kind = AllocateDescDir;
2914 desc->desc.dir = dir;
2917 return desc->desc.dir;
2918 }
2919
2920 if (errno == EMFILE || errno == ENFILE)
2921 {
2922 int save_errno = errno;
2923
2924 ereport(LOG,
2926 errmsg("out of file descriptors: %m; release and retry")));
2927 errno = 0;
2928 if (ReleaseLruFile())
2929 goto TryAgain;
2930 errno = save_errno;
2931 }
2932
2933 return NULL;
2934}

References allocatedDescs, AllocateDescDir, AllocateDesc::create_subid, AllocateDesc::desc, AllocateDesc::dir, DO_DB, elog, ereport, errcode(), errmsg, ERROR, fb(), GetCurrentSubTransactionId(), AllocateDesc::kind, LOG, maxAllocatedDescs, numAllocatedDescs, opendir(), ReleaseLruFile(), ReleaseLruFiles(), and reserveAllocatedDesc().

Referenced by calculate_database_size(), calculate_tablespace_size(), CheckPointLogicalRewriteHeap(), CheckPointSnapBuild(), CheckTablespaceDirectory(), CleanupBackupHistory(), copydir(), db_dir_size(), DeleteAllExportedSnapshotFiles(), destroy_tablespace_directories(), directory_is_empty(), do_pg_backup_start(), dsm_cleanup_for_mmap(), extension_file_exists(), get_ext_ver_list(), GetConfFilesInDir(), getInstallationPaths(), GetWalSummaries(), movedb(), ParseTzFile(), perform_base_backup(), pg_available_extension_versions(), pg_available_extensions(), pg_ls_dir(), pg_ls_dir_files(), pg_tablespace_databases(), pg_tzenumerate_next(), pg_tzenumerate_start(), pgarch_readyXlog(), RelationCacheInitFileRemove(), RelationCacheInitFileRemoveInDir(), RemoveNonParentXlogFiles(), RemoveOldXlogFiles(), RemovePgTempFiles(), RemovePgTempFilesInDir(), RemovePgTempRelationFiles(), RemovePgTempRelationFilesInDbspace(), RemoveTempXlogFiles(), ReorderBufferCleanupSerializedTXNs(), ResetUnloggedRelations(), ResetUnloggedRelationsInDbspaceDir(), ResetUnloggedRelationsInTablespaceDir(), restoreTwoPhaseData(), scan_directory_ci(), sendDir(), SlruScanDirectory(), StartupReorderBuffer(), StartupReplicationSlots(), SyncDataDirectory(), UpdateLogicalMappings(), walkdir(), and XLogGetOldestSegno().

◆ AllocateFile()

FILE * AllocateFile ( const char * name,
const char * mode
)

Definition at line 2628 of file fd.c.

2629{
2630 FILE *file;
2631
2632 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2634
2635 /* Can we allocate another non-virtual FD? */
2636 if (!reserveAllocatedDesc())
2637 ereport(ERROR,
2639 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2641
2642 /* Close excess kernel FDs. */
2644
2645TryAgain:
2646 if ((file = fopen(name, mode)) != NULL)
2647 {
2649
2650 desc->kind = AllocateDescFile;
2651 desc->desc.file = file;
2654 return desc->desc.file;
2655 }
2656
2657 if (errno == EMFILE || errno == ENFILE)
2658 {
2659 int save_errno = errno;
2660
2661 ereport(LOG,
2663 errmsg("out of file descriptors: %m; release and retry")));
2664 errno = 0;
2665 if (ReleaseLruFile())
2666 goto TryAgain;
2667 errno = save_errno;
2668 }
2669
2670 return NULL;
2671}

References allocatedDescs, AllocateDescFile, AllocateDesc::create_subid, AllocateDesc::desc, DO_DB, elog, ereport, errcode(), errmsg, ERROR, fb(), AllocateDesc::file, GetCurrentSubTransactionId(), AllocateDesc::kind, LOG, maxAllocatedDescs, mode, name, numAllocatedDescs, ReleaseLruFile(), ReleaseLruFiles(), and reserveAllocatedDesc().

Referenced by AlterSystemSetConfigFile(), apw_dump_now(), apw_load_buffers(), BeginCopyFrom(), BeginCopyTo(), checkControlFile(), do_pg_backup_stop(), entry_reset(), existsTimeLineHistory(), ExportSnapshot(), gc_qtexts(), GetHugePageSize(), ImportSnapshot(), load_dh_file(), load_relcache_init_file(), open_auth_file(), parse_extension_control_file(), ParseConfigFile(), ParseTzFile(), pg_current_logfile(), pg_promote(), pgss_shmem_shutdown(), pgss_shmem_startup(), pgstat_read_statsfile(), pgstat_write_statsfile(), read_backup_label(), read_binary_file(), read_tablespace_map(), read_whole_file(), readTimeLineHistory(), test_custom_stats_var_from_serialized_data(), test_custom_stats_var_to_serialized_data(), tsearch_readline_begin(), ValidatePgVersion(), write_relcache_init_file(), XLogArchiveForceDone(), and XLogArchiveNotify().

◆ AllocateVfd()

static File AllocateVfd ( void  )
static

Definition at line 1402 of file fd.c.

1403{
1404 Index i;
1405 File file;
1406
1407 DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1408
1409 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1410
1411 if (VfdCache[0].nextFree == 0)
1412 {
1413 /*
1414 * The free list is empty so it is time to increase the size of the
1415 * array. We choose to double it each time this happens. However,
1416 * there's not much point in starting *real* small.
1417 */
1420
1421 if (newCacheSize < 32)
1422 newCacheSize = 32;
1423
1424 /*
1425 * Be careful not to clobber VfdCache ptr if realloc fails.
1426 */
1427 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1428 if (newVfdCache == NULL)
1429 ereport(ERROR,
1431 errmsg("out of memory")));
1433
1434 /*
1435 * Initialize the new entries and link them into the free list.
1436 */
1437 for (i = SizeVfdCache; i < newCacheSize; i++)
1438 {
1439 MemSet(&(VfdCache[i]), 0, sizeof(Vfd));
1440 VfdCache[i].nextFree = i + 1;
1442 }
1445
1446 /*
1447 * Record the new size
1448 */
1450 }
1451
1452 file = VfdCache[0].nextFree;
1453
1455
1456 return file;
1457}

References Assert, DO_DB, elog, ereport, errcode(), errmsg, ERROR, fb(), vfd::fd, i, LOG, MemSet, vfd::nextFree, realloc, SizeVfdCache, VFD_CLOSED, and VfdCache.

Referenced by PathNameOpenFilePerm().

◆ assign_debug_io_direct()

void assign_debug_io_direct ( const char * newval,
void * extra 
)

Definition at line 4079 of file fd.c.

4080{
4081 int *flags = (int *) extra;
4082
4083 io_direct_flags = *flags;
4084}

References io_direct_flags.

◆ AtEOSubXact_Files()

void AtEOSubXact_Files ( bool  isCommit,
SubTransactionId  mySubid,
SubTransactionId  parentSubid 
)

Definition at line 3181 of file fd.c.

3183{
3184 Index i;
3185
3186 for (i = 0; i < numAllocatedDescs; i++)
3187 {
3188 if (allocatedDescs[i].create_subid == mySubid)
3189 {
3190 if (isCommit)
3192 else
3193 {
3194 /* have to recheck the item after FreeDesc (ugly) */
3196 }
3197 }
3198 }
3199}

References allocatedDescs, AllocateDesc::create_subid, fb(), FreeDesc(), i, and numAllocatedDescs.

Referenced by AbortSubTransaction(), and CommitSubTransaction().

◆ AtEOXact_Files()

◆ BasicOpenFile()

int BasicOpenFile ( const char * fileName,
int  fileFlags 
)

◆ BasicOpenFilePerm()

int BasicOpenFilePerm ( const char * fileName,
int  fileFlags,
mode_t  fileMode 
)

Definition at line 1112 of file fd.c.

1113{
1114 int fd;
1115
1116tryAgain:
1117#ifdef PG_O_DIRECT_USE_F_NOCACHE
1118 fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1119#else
1120 fd = open(fileName, fileFlags, fileMode);
1121#endif
1122
1123 if (fd >= 0)
1124 {
1125#ifdef PG_O_DIRECT_USE_F_NOCACHE
1126 if (fileFlags & PG_O_DIRECT)
1127 {
1128 if (fcntl(fd, F_NOCACHE, 1) < 0)
1129 {
1130 int save_errno = errno;
1131
1132 close(fd);
1133 errno = save_errno;
1134 return -1;
1135 }
1136 }
1137#endif
1138
1139 return fd; /* success! */
1140 }
1141
1142 if (errno == EMFILE || errno == ENFILE)
1143 {
1144 int save_errno = errno;
1145
1146 ereport(LOG,
1148 errmsg("out of file descriptors: %m; release and retry")));
1149 errno = 0;
1150 if (ReleaseLruFile())
1151 goto tryAgain;
1152 errno = save_errno;
1153 }
1154
1155 return -1; /* failure */
1156}

References close, ereport, errcode(), errmsg, fb(), fd(), LOG, PG_O_DIRECT, and ReleaseLruFile().

Referenced by BasicOpenFile(), LruInsert(), OpenTransientFilePerm(), PathNameOpenFilePerm(), and readRecoverySignalFile().

◆ BeforeShmemExit_Files()

static void BeforeShmemExit_Files ( int  code,
Datum  arg 
)
static

Definition at line 3228 of file fd.c.

3229{
3230 CleanupTempFiles(false, true);
3231
3232 /* prevent further temp files from being created */
3233#ifdef USE_ASSERT_CHECKING
3235#endif
3236}

References CleanupTempFiles(), and fb().

Referenced by InitTemporaryFileAccess().

◆ check_debug_io_direct()

bool check_debug_io_direct ( char **  newval,
void **  extra,
GucSource  source 
)

Definition at line 3992 of file fd.c.

3993{
3994 bool result = true;
3995 int flags;
3996
3997#if PG_O_DIRECT == 0
3998 if (strcmp(*newval, "") != 0)
3999 {
4000 GUC_check_errdetail("\"%s\" is not supported on this platform.",
4001 "debug_io_direct");
4002 result = false;
4003 }
4004 flags = 0;
4005#else
4006 List *elemlist;
4007 ListCell *l;
4008 char *rawstring;
4009
4010 /* Need a modifiable copy of string */
4012
4013 if (!SplitGUCList(rawstring, ',', &elemlist))
4014 {
4015 GUC_check_errdetail("Invalid list syntax in parameter \"%s\".",
4016 "debug_io_direct");
4019 return false;
4020 }
4021
4022 flags = 0;
4023 foreach(l, elemlist)
4024 {
4025 char *item = (char *) lfirst(l);
4026
4027 if (pg_strcasecmp(item, "data") == 0)
4028 flags |= IO_DIRECT_DATA;
4029 else if (pg_strcasecmp(item, "wal") == 0)
4030 flags |= IO_DIRECT_WAL;
4031 else if (pg_strcasecmp(item, "wal_init") == 0)
4032 flags |= IO_DIRECT_WAL_INIT;
4033 else
4034 {
4035 GUC_check_errdetail("Invalid option \"%s\".", item);
4036 result = false;
4037 break;
4038 }
4039 }
4040
4041 /*
4042 * It's possible to configure block sizes smaller than our assumed I/O
4043 * alignment size, which could result in invalid I/O requests.
4044 */
4045#if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
4046 if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
4047 {
4048 GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small.",
4049 "debug_io_direct", "XLOG_BLCKSZ");
4050 result = false;
4051 }
4052#endif
4053#if BLCKSZ < PG_IO_ALIGN_SIZE
4054 if (result && (flags & IO_DIRECT_DATA))
4055 {
4056 GUC_check_errdetail("\"%s\" is not supported for data because %s is too small.",
4057 "debug_io_direct", "BLCKSZ");
4058 result = false;
4059 }
4060#endif
4061
4064#endif
4065
4066 if (!result)
4067 return result;
4068
4069 /* Save the flags in *extra, for use by assign_debug_io_direct */
4070 *extra = guc_malloc(LOG, sizeof(int));
4071 if (!*extra)
4072 return false;
4073 *((int *) *extra) = flags;
4074
4075 return result;
4076}

References fb(), GUC_check_errdetail, guc_malloc(), IO_DIRECT_DATA, IO_DIRECT_WAL, IO_DIRECT_WAL_INIT, lfirst, list_free(), LOG, newval, pfree(), pg_strcasecmp(), pstrdup(), and SplitGUCList().

◆ CleanupTempFiles()

static void CleanupTempFiles ( bool  isCommit,
bool  isProcExit 
)
static

Definition at line 3251 of file fd.c.

3252{
3253 Index i;
3254
3255 /*
3256 * Careful here: at proc_exit we need extra cleanup, not just
3257 * xact_temporary files.
3258 */
3260 {
3261 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3262 for (i = 1; i < SizeVfdCache; i++)
3263 {
3264 unsigned short fdstate = VfdCache[i].fdstate;
3265
3266 if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3267 VfdCache[i].fileName != NULL)
3268 {
3269 /*
3270 * If we're in the process of exiting a backend process, close
3271 * all temporary files. Otherwise, only close temporary files
3272 * local to the current transaction. They should be closed by
3273 * the ResourceOwner mechanism already, so this is just a
3274 * debugging cross-check.
3275 */
3276 if (isProcExit)
3277 FileClose(i);
3278 else if (fdstate & FD_CLOSE_AT_EOXACT)
3279 {
3280 elog(WARNING,
3281 "temporary file %s not closed at end-of-transaction",
3282 VfdCache[i].fileName);
3283 FileClose(i);
3284 }
3285 }
3286 }
3287
3289 }
3290
3291 /* Complain if any allocated files remain open at commit. */
3292 if (isCommit && numAllocatedDescs > 0)
3293 elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3295
3296 /* Clean up "allocated" stdio files, dirs and fds. */
3297 while (numAllocatedDescs > 0)
3299}

References allocatedDescs, Assert, elog, fb(), FD_CLOSE_AT_EOXACT, FD_DELETE_AT_CLOSE, vfd::fdstate, FileClose(), FileIsNotOpen, FreeDesc(), have_xact_temporary_files, i, numAllocatedDescs, SizeVfdCache, VfdCache, and WARNING.

Referenced by AtEOXact_Files(), and BeforeShmemExit_Files().

◆ closeAllVfds()

void closeAllVfds ( void  )

Definition at line 3068 of file fd.c.

3069{
3070 Index i;
3071
3072 if (SizeVfdCache > 0)
3073 {
3074 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3075 for (i = 1; i < SizeVfdCache; i++)
3076 {
3077 if (!FileIsNotOpen(i))
3078 LruDelete(i);
3079 }
3080 }
3081}

References Assert, FileIsNotOpen, i, LruDelete(), and SizeVfdCache.

Referenced by standard_ProcessUtility().

◆ ClosePipeStream()

int ClosePipeStream ( FILE * file)

Definition at line 3039 of file fd.c.

3040{
3041 int i;
3042
3043 DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
3044
3045 /* Remove file from list of allocated files, if it's present */
3046 for (i = numAllocatedDescs; --i >= 0;)
3047 {
3048 AllocateDesc *desc = &allocatedDescs[i];
3049
3050 if (desc->kind == AllocateDescPipe && desc->desc.file == file)
3051 return FreeDesc(desc);
3052 }
3053
3054 /* Only get here if someone passes us a file not in allocatedDescs */
3055 elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
3056
3057 return pclose(file);
3058}

References allocatedDescs, AllocateDescPipe, AllocateDesc::desc, DO_DB, elog, fb(), AllocateDesc::file, FreeDesc(), i, AllocateDesc::kind, LOG, numAllocatedDescs, and WARNING.

Referenced by ClosePipeFromProgram(), ClosePipeToProgram(), pg_import_system_collations(), run_ssl_passphrase_command(), and shell_finish_command().

◆ CloseTransientFile()

int CloseTransientFile ( int  fd)

Definition at line 2855 of file fd.c.

2856{
2857 int i;
2858
2859 DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2860
2861 /* Remove fd from list of allocated files, if it's present */
2862 for (i = numAllocatedDescs; --i >= 0;)
2863 {
2864 AllocateDesc *desc = &allocatedDescs[i];
2865
2866 if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2867 return FreeDesc(desc);
2868 }
2869
2870 /* Only get here if someone passes us a file not in allocatedDescs */
2871 elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2872
2874
2875 return close(fd);
2876}

References allocatedDescs, AllocateDescRawFD, close, AllocateDesc::desc, DO_DB, elog, AllocateDesc::fd, fd(), FreeDesc(), i, AllocateDesc::kind, LOG, numAllocatedDescs, pgaio_closing_fd(), and WARNING.

Referenced by ApplyLogicalMappingFile(), be_lo_export(), CheckPointLogicalRewriteHeap(), CheckPointReplicationOrigin(), clone_file(), compare_files(), copy_file(), CreateDirAndVersionFile(), dsm_impl_mmap(), durable_rename(), fsync_fname_ext(), get_controlfile_by_exact_path(), heap_xlog_logical_rewrite(), lo_import_internal(), perform_base_backup(), pg_truncate(), qtext_load_file(), qtext_store(), read_relmap_file(), ReadTwoPhaseFile(), RecreateTwoPhaseFile(), ReorderBufferSerializeChange(), ReorderBufferSerializeTXN(), RestoreSlotFromDisk(), SaveSlotToPath(), sendFile(), SendTimeLineHistory(), SimpleLruDoesPhysicalPageExist(), SimpleLruWriteAll(), SlruInternalWritePage(), SlruPhysicalReadPage(), SlruPhysicalWritePage(), SlruSyncFileTag(), SnapBuildRestoreContents(), SnapBuildRestoreSnapshot(), SnapBuildSerialize(), StartupReplicationOrigin(), write_relmap_file(), writeTimeLineHistory(), writeTimeLineHistoryFile(), and XLogFileCopy().

◆ count_usable_fds()

static void count_usable_fds ( int  max_to_probe,
int * usable_fds,
int * already_open 
)
static

Definition at line 965 of file fd.c.

966{
967 int *fd;
968 int size;
969 int used = 0;
970 int highestfd = 0;
971 int j;
972
973#ifdef HAVE_GETRLIMIT
974 struct rlimit rlim;
976#endif
977
978 size = 1024;
979 fd = (int *) palloc(size * sizeof(int));
980
981#ifdef HAVE_GETRLIMIT
983 if (getrlimit_status != 0)
984 ereport(WARNING, (errmsg("getrlimit failed: %m")));
985#endif /* HAVE_GETRLIMIT */
986
987 /* dup until failure or probe limit reached */
988 for (;;)
989 {
990 int thisfd;
991
992#ifdef HAVE_GETRLIMIT
993
994 /*
995 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
996 * some platforms
997 */
998 if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
999 break;
1000#endif
1001
1002 thisfd = dup(2);
1003 if (thisfd < 0)
1004 {
1005 /* Expect EMFILE or ENFILE, else it's fishy */
1006 if (errno != EMFILE && errno != ENFILE)
1007 elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
1008 break;
1009 }
1010
1011 if (used >= size)
1012 {
1013 size *= 2;
1014 fd = (int *) repalloc(fd, size * sizeof(int));
1015 }
1016 fd[used++] = thisfd;
1017
1018 if (highestfd < thisfd)
1019 highestfd = thisfd;
1020
1021 if (used >= max_to_probe)
1022 break;
1023 }
1024
1025 /* release the files we opened */
1026 for (j = 0; j < used; j++)
1027 close(fd[j]);
1028
1029 pfree(fd);
1030
1031 /*
1032 * Return results. usable_fds is just the number of successful dups. We
1033 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
1034 * number) and so already_open is highestfd+1 - usable_fds.
1035 */
1036 *usable_fds = used;
1037 *already_open = highestfd + 1 - used;
1038}

References close, elog, ereport, errmsg, fb(), fd(), j, palloc(), pfree(), repalloc(), and WARNING.

Referenced by set_max_safe_fds().

◆ data_sync_elevel()

◆ datadir_fsync_fname()

static void datadir_fsync_fname ( const char * fname,
bool  isdir,
int  elevel 
)
static

Definition at line 3809 of file fd.c.

3810{
3811 ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
3812 fname);
3813
3814 /*
3815 * We want to silently ignore errors about unreadable files. Pass that
3816 * desire on to fsync_fname_ext().
3817 */
3818 fsync_fname_ext(fname, isdir, true, elevel);
3819}

References ereport_startup_progress, fb(), and fsync_fname_ext().

Referenced by SyncDataDirectory().

◆ Delete()

static void Delete ( File  file)
static

Definition at line 1254 of file fd.c.

1255{
1256 Vfd *vfdP;
1257
1258 Assert(file != 0);
1259
1260 DO_DB(elog(LOG, "Delete %d (%s)",
1261 file, VfdCache[file].fileName));
1262 DO_DB(_dump_lru());
1263
1264 vfdP = &VfdCache[file];
1265
1266 VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1267 VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1268
1269 DO_DB(_dump_lru());
1270}

References Assert, DO_DB, elog, fb(), LOG, vfd::lruLessRecently, vfd::lruMoreRecently, and VfdCache.

Referenced by FileAccess(), FileClose(), and LruDelete().

◆ durable_rename()

int durable_rename ( const char * oldfile,
const char * newfile,
int  elevel 
)

Definition at line 783 of file fd.c.

784{
785 int fd;
786
787 /*
788 * First fsync the old and target path (if it exists), to ensure that they
789 * are properly persistent on disk. Syncing the target file is not
790 * strictly necessary, but it makes it easier to reason about crashes;
791 * because it's then guaranteed that either source or target file exists
792 * after a crash.
793 */
794 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
795 return -1;
796
798 if (fd < 0)
799 {
800 if (errno != ENOENT)
801 {
802 ereport(elevel,
804 errmsg("could not open file \"%s\": %m", newfile)));
805 return -1;
806 }
807 }
808 else
809 {
810 if (pg_fsync(fd) != 0)
811 {
812 int save_errno;
813
814 /* close file upon error, might not be in transaction context */
818
819 ereport(elevel,
821 errmsg("could not fsync file \"%s\": %m", newfile)));
822 return -1;
823 }
824
825 if (CloseTransientFile(fd) != 0)
826 {
827 ereport(elevel,
829 errmsg("could not close file \"%s\": %m", newfile)));
830 return -1;
831 }
832 }
833
834 /* Time to do the real deal... */
835 if (rename(oldfile, newfile) < 0)
836 {
837 ereport(elevel,
839 errmsg("could not rename file \"%s\" to \"%s\": %m",
840 oldfile, newfile)));
841 return -1;
842 }
843
844 /*
845 * To guarantee renaming the file is persistent, fsync the file with its
846 * new name, and its containing directory.
847 */
848 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
849 return -1;
850
851 if (fsync_parent_path(newfile, elevel) != 0)
852 return -1;
853
854 return 0;
855}

References CloseTransientFile(), ereport, errcode_for_file_access(), errmsg, fb(), fd(), fsync_fname_ext(), fsync_parent_path(), OpenTransientFile(), PG_BINARY, and pg_fsync().

Referenced by AlterSystemSetConfigFile(), apw_dump_now(), BaseBackup(), basic_archive_file(), bbsink_server_end_manifest(), CheckPointReplicationOrigin(), cleanup_objects_atexit(), CleanupAfterArchiveRecovery(), dir_close(), InitWalRecovery(), InstallXLogFileSegment(), KeepFileRestoredFromArchive(), pgss_shmem_shutdown(), pgstat_write_statsfile(), StartupXLOG(), SummarizeWAL(), write_relmap_file(), writeTimeLineHistory(), writeTimeLineHistoryFile(), and XLogArchiveForceDone().

◆ durable_unlink()

int durable_unlink ( const char * fname,
int  elevel 
)

Definition at line 873 of file fd.c.

874{
875 if (unlink(fname) < 0)
876 {
877 ereport(elevel,
879 errmsg("could not remove file \"%s\": %m",
880 fname)));
881 return -1;
882 }
883
884 /*
885 * To guarantee that the removal of the file is persistent, fsync its
886 * parent directory.
887 */
888 if (fsync_parent_path(fname, elevel) != 0)
889 return -1;
890
891 return 0;
892}

References ereport, errcode_for_file_access(), errmsg, fb(), and fsync_parent_path().

Referenced by InstallXLogFileSegment(), RemoveXlogFile(), and StartupXLOG().

◆ FileAccess()

static int FileAccess ( File  file)
static

Definition at line 1480 of file fd.c.

1481{
1482 int returnValue;
1483
1484 DO_DB(elog(LOG, "FileAccess %d (%s)",
1485 file, VfdCache[file].fileName));
1486
1487 /*
1488 * Is the file open? If not, open it and put it at the head of the LRU
1489 * ring (possibly closing the least recently used file to get an FD).
1490 */
1491
1492 if (FileIsNotOpen(file))
1493 {
1494 returnValue = LruInsert(file);
1495 if (returnValue != 0)
1496 return returnValue;
1497 }
1498 else if (VfdCache[0].lruLessRecently != file)
1499 {
1500 /*
1501 * We now know that the file is open and that it is not the last one
1502 * accessed, so we need to move it to the head of the Lru ring.
1503 */
1504
1505 Delete(file);
1506 Insert(file);
1507 }
1508
1509 return 0;
1510}

References Delete(), DO_DB, elog, fb(), FileIsNotOpen, Insert(), LOG, LruInsert(), and VfdCache.

Referenced by FileFallocate(), FileGetRawDesc(), FilePrefetch(), FileReadV(), FileSize(), FileStartReadV(), FileSync(), FileTruncate(), FileWriteback(), FileWriteV(), and FileZero().

◆ FileClose()

void FileClose ( File  file)

Definition at line 1966 of file fd.c.

1967{
1968 Vfd *vfdP;
1969
1970 Assert(FileIsValid(file));
1971
1972 DO_DB(elog(LOG, "FileClose: %d (%s)",
1973 file, VfdCache[file].fileName));
1974
1975 vfdP = &VfdCache[file];
1976
1977 if (!FileIsNotOpen(file))
1978 {
1980
1981 /* close the file */
1982 if (close(vfdP->fd) != 0)
1983 {
1984 /*
1985 * We may need to panic on failure to close non-temporary files;
1986 * see LruDelete.
1987 */
1989 "could not close file \"%s\": %m", vfdP->fileName);
1990 }
1991
1992 --nfile;
1993 vfdP->fd = VFD_CLOSED;
1994
1995 /* remove the file from the lru ring */
1996 Delete(file);
1997 }
1998
1999 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2000 {
2001 /* Subtract its size from current usage (do first in case of error) */
2002 temporary_files_size -= vfdP->fileSize;
2003 vfdP->fileSize = 0;
2004 }
2005
2006 /*
2007 * Delete the file if it was temporary, and make a log entry if wanted
2008 */
2009 if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
2010 {
2011 struct stat filestats;
2012 int stat_errno;
2013
2014 /*
2015 * If we get an error, as could happen within the ereport/elog calls,
2016 * we'll come right back here during transaction abort. Reset the
2017 * flag to ensure that we can't get into an infinite loop. This code
2018 * is arranged to ensure that the worst-case consequence is failing to
2019 * emit log message(s), not failing to attempt the unlink.
2020 */
2021 vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
2022
2023
2024 /* first try the stat() */
2025 if (stat(vfdP->fileName, &filestats))
2026 stat_errno = errno;
2027 else
2028 stat_errno = 0;
2029
2030 /* in any case do the unlink */
2031 if (unlink(vfdP->fileName))
2032 ereport(LOG,
2034 errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
2035
2036 /* and last report the stat results */
2037 if (stat_errno == 0)
2038 ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2039 else
2040 {
2041 errno = stat_errno;
2042 ereport(LOG,
2044 errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2045 }
2046 }
2047
2048 /* Unregister it from the resource owner */
2049 if (vfdP->resowner)
2050 ResourceOwnerForgetFile(vfdP->resowner, file);
2051
2052 /*
2053 * Return the Vfd slot to the free list
2054 */
2055 FreeVfd(file);
2056}

References Assert, close, data_sync_elevel(), Delete(), DO_DB, elog, ereport, errcode_for_file_access(), errmsg, fb(), FD_DELETE_AT_CLOSE, FD_TEMP_FILE_LIMIT, FileIsNotOpen, FileIsValid, FreeVfd(), LOG, nfile, pgaio_closing_fd(), ReportTemporaryFileUsage(), ResourceOwnerForgetFile(), stat, temporary_files_size, VFD_CLOSED, and VfdCache.

Referenced by bbsink_server_end_archive(), bbsink_server_end_manifest(), BufFileClose(), BufFileTruncateFileSet(), CleanupTempFiles(), logical_end_heap_rewrite(), mdclose(), mdimmedsync(), mdregistersync(), mdsyncfiletag(), mdtruncate(), pg_wal_summary_contents(), PrepareForIncrementalBackup(), ReorderBufferIterTXNFinish(), ReorderBufferRestoreChanges(), ResOwnerReleaseFile(), and SummarizeWAL().

◆ FileFallocate()

int FileFallocate ( File  file,
pgoff_t  offset,
pgoff_t  amount,
uint32  wait_event_info 
)

Definition at line 2408 of file fd.c.

2409{
2410#ifdef HAVE_POSIX_FALLOCATE
2411 int returnCode;
2412
2413 Assert(FileIsValid(file));
2414
2415 DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2416 file, VfdCache[file].fileName,
2417 (int64) offset, (int64) amount));
2418
2419 returnCode = FileAccess(file);
2420 if (returnCode < 0)
2421 return -1;
2422
2423retry:
2424 pgstat_report_wait_start(wait_event_info);
2425 returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2427
2428 if (returnCode == 0)
2429 return 0;
2430 else if (returnCode == EINTR)
2431 goto retry;
2432
2433 /* for compatibility with %m printing etc */
2434 errno = returnCode;
2435
2436 /*
2437 * Return in cases of a "real" failure, if fallocate is not supported,
2438 * fall through to the FileZero() backed implementation.
2439 */
2441 return -1;
2442#endif
2443
2444 return FileZero(file, offset, amount, wait_event_info);
2445}

References Assert, DO_DB, EINTR, elog, EOPNOTSUPP, fb(), fd(), FileAccess(), FileIsValid, FileZero(), INT64_FORMAT, LOG, pgstat_report_wait_end(), pgstat_report_wait_start(), and VfdCache.

Referenced by mdzeroextend().

◆ FileGetRawDesc()

int FileGetRawDesc ( File  file)

Definition at line 2516 of file fd.c.

2517{
2518 int returnCode;
2519
2520 returnCode = FileAccess(file);
2521 if (returnCode < 0)
2522 return returnCode;
2523
2524 Assert(FileIsValid(file));
2525 return VfdCache[file].fd;
2526}

References Assert, fb(), vfd::fd, FileAccess(), FileIsValid, and VfdCache.

Referenced by mdfd().

◆ FileGetRawFlags()

int FileGetRawFlags ( File  file)

Definition at line 2532 of file fd.c.

2533{
2534 Assert(FileIsValid(file));
2535 return VfdCache[file].fileFlags;
2536}

References Assert, vfd::fileFlags, FileIsValid, and VfdCache.

◆ FileGetRawMode()

mode_t FileGetRawMode ( File  file)

Definition at line 2542 of file fd.c.

2543{
2544 Assert(FileIsValid(file));
2545 return VfdCache[file].fileMode;
2546}

References Assert, FileIsValid, vfd::fileMode, and VfdCache.

◆ FilePathName()

◆ FilePrefetch()

int FilePrefetch ( File  file,
pgoff_t  offset,
pgoff_t  amount,
uint32  wait_event_info 
)

Definition at line 2067 of file fd.c.

2068{
2069 Assert(FileIsValid(file));
2070
2071 DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2072 file, VfdCache[file].fileName,
2073 (int64) offset, (int64) amount));
2074
2075#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2076 {
2077 int returnCode;
2078
2079 returnCode = FileAccess(file);
2080 if (returnCode < 0)
2081 return returnCode;
2082
2083retry:
2084 pgstat_report_wait_start(wait_event_info);
2085 returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2088
2089 if (returnCode == EINTR)
2090 goto retry;
2091
2092 return returnCode;
2093 }
2094#elif defined(__darwin__)
2095 {
2096 struct radvisory
2097 {
2098 off_t ra_offset; /* offset into the file */
2099 int ra_count; /* size of the read */
2100 } ra;
2101 int returnCode;
2102
2103 returnCode = FileAccess(file);
2104 if (returnCode < 0)
2105 return returnCode;
2106
2107 ra.ra_offset = offset;
2108 ra.ra_count = amount;
2109 pgstat_report_wait_start(wait_event_info);
2112 if (returnCode != -1)
2113 return 0;
2114 else
2115 return errno;
2116 }
2117#else
2118 return 0;
2119#endif
2120}

References Assert, DO_DB, EINTR, elog, fb(), fd(), FileAccess(), FileIsValid, INT64_FORMAT, LOG, pgstat_report_wait_end(), pgstat_report_wait_start(), and VfdCache.

Referenced by mdprefetch().

◆ FileReadV()

ssize_t FileReadV ( File  file,
const struct iovec * iov,
int  iovcnt,
pgoff_t  offset,
uint32  wait_event_info 
)

Definition at line 2149 of file fd.c.

2151{
2153 Vfd *vfdP;
2154
2155 Assert(FileIsValid(file));
2156
2157 DO_DB(elog(LOG, "FileReadV: %d (%s) " INT64_FORMAT " %d",
2158 file, VfdCache[file].fileName,
2159 (int64) offset,
2160 iovcnt));
2161
2162 returnCode = FileAccess(file);
2163 if (returnCode < 0)
2164 return returnCode;
2165
2166 vfdP = &VfdCache[file];
2167
2168retry:
2169 pgstat_report_wait_start(wait_event_info);
2170 returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
2172
2173 if (returnCode < 0)
2174 {
2175 /*
2176 * Windows may run out of kernel buffers and return "Insufficient
2177 * system resources" error. Wait a bit and retry to solve it.
2178 *
2179 * It is rumored that EINTR is also possible on some Unix filesystems,
2180 * in which case immediate retry is indicated.
2181 */
2182#ifdef WIN32
2184
2185 switch (error)
2186 {
2188 pg_usleep(1000L);
2189 errno = EINTR;
2190 break;
2191 default:
2193 break;
2194 }
2195#endif
2196 /* OK to retry if interrupted */
2197 if (errno == EINTR)
2198 goto retry;
2199 }
2200
2201 return returnCode;
2202}

References _dosmaperr(), Assert, DO_DB, EINTR, elog, error(), fb(), FileAccess(), FileIsValid, INT64_FORMAT, LOG, pg_preadv(), pg_usleep(), pgstat_report_wait_end(), pgstat_report_wait_start(), and VfdCache.

Referenced by FileRead(), and mdreadv().

◆ FileSize()

pgoff_t FileSize ( File  file)

Definition at line 2448 of file fd.c.

2449{
2450 Assert(FileIsValid(file));
2451
2452 DO_DB(elog(LOG, "FileSize %d (%s)",
2453 file, VfdCache[file].fileName));
2454
2455 if (FileIsNotOpen(file))
2456 {
2457 if (FileAccess(file) < 0)
2458 return (pgoff_t) -1;
2459 }
2460
2461 return lseek(VfdCache[file].fd, 0, SEEK_END);
2462}

References Assert, DO_DB, elog, fb(), fd(), FileAccess(), FileIsNotOpen, FileIsValid, LOG, and VfdCache.

Referenced by _mdnblocks(), BufFileSeek(), and BufFileSize().

◆ FileStartReadV()

int FileStartReadV ( PgAioHandle * ioh,
File  file,
int  iovcnt,
pgoff_t  offset,
uint32  wait_event_info 
)

Definition at line 2205 of file fd.c.

2208{
2209 int returnCode;
2210 Vfd *vfdP;
2211
2212 Assert(FileIsValid(file));
2213
2214 DO_DB(elog(LOG, "FileStartReadV: %d (%s) " INT64_FORMAT " %d",
2215 file, VfdCache[file].fileName,
2216 (int64) offset,
2217 iovcnt));
2218
2219 returnCode = FileAccess(file);
2220 if (returnCode < 0)
2221 return returnCode;
2222
2223 vfdP = &VfdCache[file];
2224
2225 pgaio_io_start_readv(ioh, vfdP->fd, iovcnt, offset);
2226
2227 return 0;
2228}

References Assert, DO_DB, elog, fb(), FileAccess(), FileIsValid, INT64_FORMAT, LOG, pgaio_io_start_readv(), and VfdCache.

Referenced by mdstartreadv().

◆ FileSync()

int FileSync ( File  file,
uint32  wait_event_info 
)

Definition at line 2336 of file fd.c.

2337{
2338 int returnCode;
2339
2340 Assert(FileIsValid(file));
2341
2342 DO_DB(elog(LOG, "FileSync: %d (%s)",
2343 file, VfdCache[file].fileName));
2344
2345 returnCode = FileAccess(file);
2346 if (returnCode < 0)
2347 return returnCode;
2348
2349 pgstat_report_wait_start(wait_event_info);
2350 returnCode = pg_fsync(VfdCache[file].fd);
2352
2353 return returnCode;
2354}

References Assert, DO_DB, elog, fb(), fd(), FileAccess(), FileIsValid, LOG, pg_fsync(), pgstat_report_wait_end(), pgstat_report_wait_start(), and VfdCache.

Referenced by bbsink_server_end_archive(), logical_end_heap_rewrite(), mdimmedsync(), mdsyncfiletag(), and register_dirty_segment().

◆ FileTruncate()

int FileTruncate ( File  file,
pgoff_t  offset,
uint32  wait_event_info 
)

Definition at line 2465 of file fd.c.

2466{
2467 int returnCode;
2468
2469 Assert(FileIsValid(file));
2470
2471 DO_DB(elog(LOG, "FileTruncate %d (%s)",
2472 file, VfdCache[file].fileName));
2473
2474 returnCode = FileAccess(file);
2475 if (returnCode < 0)
2476 return returnCode;
2477
2478 pgstat_report_wait_start(wait_event_info);
2479 returnCode = pg_ftruncate(VfdCache[file].fd, offset);
2481
2482 if (returnCode == 0 && VfdCache[file].fileSize > offset)
2483 {
2484 /* adjust our state for truncation of a temp file */
2485 Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2486 temporary_files_size -= VfdCache[file].fileSize - offset;
2487 VfdCache[file].fileSize = offset;
2488 }
2489
2490 return returnCode;
2491}

References Assert, DO_DB, elog, fb(), fd(), FD_TEMP_FILE_LIMIT, FileAccess(), FileIsValid, vfd::fileSize, LOG, pg_ftruncate(), pgstat_report_wait_end(), pgstat_report_wait_start(), temporary_files_size, and VfdCache.

Referenced by BufFileTruncateFileSet(), and mdtruncate().

◆ FileWriteback()

void FileWriteback ( File  file,
pgoff_t  offset,
pgoff_t  nbytes,
uint32  wait_event_info 
)

Definition at line 2123 of file fd.c.

2124{
2125 int returnCode;
2126
2127 Assert(FileIsValid(file));
2128
2129 DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2130 file, VfdCache[file].fileName,
2131 (int64) offset, (int64) nbytes));
2132
2133 if (nbytes <= 0)
2134 return;
2135
2136 if (VfdCache[file].fileFlags & PG_O_DIRECT)
2137 return;
2138
2139 returnCode = FileAccess(file);
2140 if (returnCode < 0)
2141 return;
2142
2143 pgstat_report_wait_start(wait_event_info);
2144 pg_flush_data(VfdCache[file].fd, offset, nbytes);
2146}

References Assert, DO_DB, elog, fb(), fd(), FileAccess(), FileIsValid, INT64_FORMAT, LOG, pg_flush_data(), PG_O_DIRECT, pgstat_report_wait_end(), pgstat_report_wait_start(), and VfdCache.

Referenced by mdwriteback().

◆ FileWriteV()

ssize_t FileWriteV ( File  file,
const struct iovec * iov,
int  iovcnt,
pgoff_t  offset,
uint32  wait_event_info 
)

Definition at line 2231 of file fd.c.

2233{
2235 Vfd *vfdP;
2236
2237 Assert(FileIsValid(file));
2238
2239 DO_DB(elog(LOG, "FileWriteV: %d (%s) " INT64_FORMAT " %d",
2240 file, VfdCache[file].fileName,
2241 (int64) offset,
2242 iovcnt));
2243
2244 returnCode = FileAccess(file);
2245 if (returnCode < 0)
2246 return returnCode;
2247
2248 vfdP = &VfdCache[file];
2249
2250 /*
2251 * If enforcing temp_file_limit and it's a temp file, check to see if the
2252 * write would overrun temp_file_limit, and throw error if so. Note: it's
2253 * really a modularity violation to throw error here; we should set errno
2254 * and return -1. However, there's no way to report a suitable error
2255 * message if we do that. All current callers would just throw error
2256 * immediately anyway, so this is safe at present.
2257 */
2258 if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2259 {
2260 pgoff_t past_write = offset;
2261
2262 for (int i = 0; i < iovcnt; ++i)
2263 past_write += iov[i].iov_len;
2264
2265 if (past_write > vfdP->fileSize)
2266 {
2268
2270 if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2271 ereport(ERROR,
2273 errmsg("temporary file size exceeds \"temp_file_limit\" (%dkB)",
2274 temp_file_limit)));
2275 }
2276 }
2277
2278retry:
2279 pgstat_report_wait_start(wait_event_info);
2280 returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
2282
2283 if (returnCode >= 0)
2284 {
2285 /*
2286 * Some callers expect short writes to set errno, and traditionally we
2287 * have assumed that they imply disk space shortage. We don't want to
2288 * waste CPU cycles adding up the total size here, so we'll just set
2289 * it for all successful writes in case such a caller determines that
2290 * the write was short and ereports "%m".
2291 */
2292 errno = ENOSPC;
2293
2294 /*
2295 * Maintain fileSize and temporary_files_size if it's a temp file.
2296 */
2297 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2298 {
2299 pgoff_t past_write = offset + returnCode;
2300
2301 if (past_write > vfdP->fileSize)
2302 {
2303 temporary_files_size += past_write - vfdP->fileSize;
2304 vfdP->fileSize = past_write;
2305 }
2306 }
2307 }
2308 else
2309 {
2310 /*
2311 * See comments in FileReadV()
2312 */
2313#ifdef WIN32
2315
2316 switch (error)
2317 {
2319 pg_usleep(1000L);
2320 errno = EINTR;
2321 break;
2322 default:
2324 break;
2325 }
2326#endif
2327 /* OK to retry if interrupted */
2328 if (errno == EINTR)
2329 goto retry;
2330 }
2331
2332 return returnCode;
2333}

References _dosmaperr(), Assert, DO_DB, EINTR, elog, ereport, errcode(), errmsg, ERROR, error(), fb(), FD_TEMP_FILE_LIMIT, FileAccess(), FileIsValid, vfd::fileSize, i, INT64_FORMAT, LOG, pg_pwritev(), pg_usleep(), pgstat_report_wait_end(), pgstat_report_wait_start(), temp_file_limit, temporary_files_size, and VfdCache.

Referenced by FileWrite(), and mdwritev().

◆ FileZero()

int FileZero ( File  file,
pgoff_t  offset,
pgoff_t  amount,
uint32  wait_event_info 
)

Definition at line 2363 of file fd.c.

2364{
2365 int returnCode;
2367
2368 Assert(FileIsValid(file));
2369
2370 DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2371 file, VfdCache[file].fileName,
2372 (int64) offset, (int64) amount));
2373
2374 returnCode = FileAccess(file);
2375 if (returnCode < 0)
2376 return returnCode;
2377
2378 pgstat_report_wait_start(wait_event_info);
2379 written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2381
2382 if (written < 0)
2383 return -1;
2384 else if (written != amount)
2385 {
2386 /* if errno is unset, assume problem is no disk space */
2387 if (errno == 0)
2388 errno = ENOSPC;
2389 return -1;
2390 }
2391
2392 return 0;
2393}

References Assert, DO_DB, elog, fb(), fd(), FileAccess(), FileIsValid, INT64_FORMAT, LOG, pg_pwrite_zeros(), pgstat_report_wait_end(), pgstat_report_wait_start(), and VfdCache.

Referenced by FileFallocate(), and mdzeroextend().

◆ FreeDesc()

static int FreeDesc ( AllocateDesc * desc)
static

Definition at line 2787 of file fd.c.

2788{
2789 int result;
2790
2791 /* Close the underlying object */
2792 switch (desc->kind)
2793 {
2794 case AllocateDescFile:
2795 result = fclose(desc->desc.file);
2796 break;
2797 case AllocateDescPipe:
2798 result = pclose(desc->desc.file);
2799 break;
2800 case AllocateDescDir:
2801 result = closedir(desc->desc.dir);
2802 break;
2803 case AllocateDescRawFD:
2804 pgaio_closing_fd(desc->desc.fd);
2805 result = close(desc->desc.fd);
2806 break;
2807 default:
2808 elog(ERROR, "AllocateDesc kind not recognized");
2809 result = 0; /* keep compiler quiet */
2810 break;
2811 }
2812
2813 /* Compact storage in the allocatedDescs array */
2816
2817 return result;
2818}

References allocatedDescs, AllocateDescDir, AllocateDescFile, AllocateDescPipe, AllocateDescRawFD, close, closedir(), AllocateDesc::desc, AllocateDesc::dir, elog, ERROR, fb(), AllocateDesc::fd, AllocateDesc::file, AllocateDesc::kind, numAllocatedDescs, and pgaio_closing_fd().

Referenced by AtEOSubXact_Files(), CleanupTempFiles(), ClosePipeStream(), CloseTransientFile(), FreeDir(), and FreeFile().

◆ FreeDir()

int FreeDir ( DIR * dir)

Definition at line 3009 of file fd.c.

3010{
3011 int i;
3012
3013 /* Nothing to do if AllocateDir failed */
3014 if (dir == NULL)
3015 return 0;
3016
3017 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
3018
3019 /* Remove dir from list of allocated dirs, if it's present */
3020 for (i = numAllocatedDescs; --i >= 0;)
3021 {
3022 AllocateDesc *desc = &allocatedDescs[i];
3023
3024 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
3025 return FreeDesc(desc);
3026 }
3027
3028 /* Only get here if someone passes us a dir not in allocatedDescs */
3029 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
3030
3031 return closedir(dir);
3032}

References allocatedDescs, AllocateDescDir, closedir(), AllocateDesc::desc, AllocateDesc::dir, DO_DB, elog, fb(), FreeDesc(), i, AllocateDesc::kind, LOG, numAllocatedDescs, and WARNING.

Referenced by calculate_database_size(), calculate_tablespace_size(), CheckPointLogicalRewriteHeap(), CheckPointSnapBuild(), CleanupBackupHistory(), copydir(), db_dir_size(), DeleteAllExportedSnapshotFiles(), destroy_tablespace_directories(), directory_is_empty(), do_pg_backup_start(), dsm_cleanup_for_mmap(), extension_file_exists(), get_ext_ver_list(), GetConfFilesInDir(), getInstallationPaths(), GetWalSummaries(), movedb(), ParseTzFile(), perform_base_backup(), pg_available_extension_versions(), pg_available_extensions(), pg_ls_dir(), pg_ls_dir_files(), pg_tablespace_databases(), pg_tzenumerate_end(), pg_tzenumerate_next(), pgarch_readyXlog(), RelationCacheInitFileRemove(), RelationCacheInitFileRemoveInDir(), RemoveNonParentXlogFiles(), RemoveOldXlogFiles(), RemovePgTempFiles(), RemovePgTempFilesInDir(), RemovePgTempRelationFiles(), RemovePgTempRelationFilesInDbspace(), RemoveTempXlogFiles(), ReorderBufferCleanupSerializedTXNs(), ResetUnloggedRelations(), ResetUnloggedRelationsInDbspaceDir(), ResetUnloggedRelationsInTablespaceDir(), restoreTwoPhaseData(), scan_directory_ci(), sendDir(), SlruScanDirectory(), StartupReorderBuffer(), StartupReplicationSlots(), SyncDataDirectory(), UpdateLogicalMappings(), walkdir(), and XLogGetOldestSegno().

◆ FreeFile()

int FreeFile ( FILE * file)

Definition at line 2827 of file fd.c.

2828{
2829 int i;
2830
2831 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2832
2833 /* Remove file from list of allocated files, if it's present */
2834 for (i = numAllocatedDescs; --i >= 0;)
2835 {
2836 AllocateDesc *desc = &allocatedDescs[i];
2837
2838 if (desc->kind == AllocateDescFile && desc->desc.file == file)
2839 return FreeDesc(desc);
2840 }
2841
2842 /* Only get here if someone passes us a file not in allocatedDescs */
2843 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2844
2845 return fclose(file);
2846}

References allocatedDescs, AllocateDescFile, AllocateDesc::desc, DO_DB, elog, fb(), AllocateDesc::file, FreeDesc(), i, AllocateDesc::kind, LOG, numAllocatedDescs, and WARNING.

Referenced by AlterSystemSetConfigFile(), apw_dump_now(), apw_load_buffers(), checkControlFile(), do_pg_backup_stop(), EndCopy(), EndCopyFrom(), entry_reset(), existsTimeLineHistory(), ExportSnapshot(), free_auth_file(), gc_qtexts(), GetHugePageSize(), ImportSnapshot(), load_dh_file(), load_relcache_init_file(), parse_extension_control_file(), ParseConfigFile(), ParseTzFile(), pg_current_logfile(), pg_promote(), pgss_shmem_shutdown(), pgss_shmem_startup(), pgstat_read_statsfile(), pgstat_write_statsfile(), read_backup_label(), read_binary_file(), read_tablespace_map(), read_whole_file(), readTimeLineHistory(), test_custom_stats_var_finish(), tsearch_readline_end(), ValidatePgVersion(), write_relcache_init_file(), XLogArchiveForceDone(), and XLogArchiveNotify().

◆ FreeVfd()

static void FreeVfd ( File  file)
static

Definition at line 1460 of file fd.c.

1461{
1462 Vfd *vfdP = &VfdCache[file];
1463
1464 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1465 file, vfdP->fileName ? vfdP->fileName : ""));
1466
1467 if (vfdP->fileName != NULL)
1468 {
1469 free(vfdP->fileName);
1470 vfdP->fileName = NULL;
1471 }
1472 vfdP->fdstate = 0x0;
1473
1474 vfdP->nextFree = VfdCache[0].nextFree;
1475 VfdCache[0].nextFree = file;
1476}

References DO_DB, elog, fb(), free, LOG, vfd::nextFree, and VfdCache.

Referenced by FileClose(), and PathNameOpenFilePerm().

◆ fsync_fname()

◆ fsync_fname_ext()

int fsync_fname_ext ( const char * fname,
bool  isdir,
bool  ignore_perm,
int  elevel 
)

Definition at line 3847 of file fd.c.

3848{
3849 int fd;
3850 int flags;
3851 int returncode;
3852
3853 /*
3854 * Some OSs require directories to be opened read-only whereas other
3855 * systems don't allow us to fsync files opened read-only; so we need both
3856 * cases here. Using O_RDWR will cause us to fail to fsync files that are
3857 * not writable by our userid, but we assume that's OK.
3858 */
3859 flags = PG_BINARY;
3860 if (!isdir)
3861 flags |= O_RDWR;
3862 else
3863 flags |= O_RDONLY;
3864
3865 fd = OpenTransientFile(fname, flags);
3866
3867 /*
3868 * Some OSs don't allow us to open directories at all (Windows returns
3869 * EACCES), just ignore the error in that case. If desired also silently
3870 * ignoring errors about unreadable files. Log others.
3871 */
3872 if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3873 return 0;
3874 else if (fd < 0 && ignore_perm && errno == EACCES)
3875 return 0;
3876 else if (fd < 0)
3877 {
3878 ereport(elevel,
3880 errmsg("could not open file \"%s\": %m", fname)));
3881 return -1;
3882 }
3883
3885
3886 /*
3887 * Some OSes don't allow us to fsync directories at all, so we can ignore
3888 * those errors. Anything else needs to be logged.
3889 */
3890 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3891 {
3892 int save_errno;
3893
3894 /* close file upon error, might not be in transaction context */
3895 save_errno = errno;
3897 errno = save_errno;
3898
3899 ereport(elevel,
3901 errmsg("could not fsync file \"%s\": %m", fname)));
3902 return -1;
3903 }
3904
3905 if (CloseTransientFile(fd) != 0)
3906 {
3907 ereport(elevel,
3909 errmsg("could not close file \"%s\": %m", fname)));
3910 return -1;
3911 }
3912
3913 return 0;
3914}

References CloseTransientFile(), ereport, errcode_for_file_access(), errmsg, fb(), fd(), OpenTransientFile(), PG_BINARY, and pg_fsync().

Referenced by datadir_fsync_fname(), durable_rename(), fsync_fname(), and fsync_parent_path().

◆ fsync_parent_path()

static int fsync_parent_path ( const char * fname,
int  elevel 
)
static

Definition at line 3923 of file fd.c.

3924{
3925 char parentpath[MAXPGPATH];
3926
3927 strlcpy(parentpath, fname, MAXPGPATH);
3929
3930 /*
3931 * get_parent_directory() returns an empty string if the input argument is
3932 * just a file name (see comments in path.c), so handle that as being the
3933 * current directory.
3934 */
3935 if (strlen(parentpath) == 0)
3937
3938 if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3939 return -1;
3940
3941 return 0;
3942}

References fb(), fsync_fname_ext(), get_parent_directory(), MAXPGPATH, and strlcpy().

Referenced by dir_close(), dir_open_for_write(), durable_rename(), durable_unlink(), swap_catalog_files(), and tar_finish().

◆ GetNextTempTableSpace()

Oid GetNextTempTableSpace ( void  )

Definition at line 3159 of file fd.c.

3160{
3161 if (numTempTableSpaces > 0)
3162 {
3163 /* Advance nextTempTableSpace counter with wraparound */
3167 }
3168 return InvalidOid;
3169}

References InvalidOid, nextTempTableSpace, numTempTableSpaces, and tempTableSpaces.

Referenced by GetDefaultTablespace(), and OpenTemporaryFile().

◆ GetTempTablespaces()

int GetTempTablespaces ( Oid * tableSpaces,
int  numSpaces 
)

Definition at line 3141 of file fd.c.

3142{
3143 int i;
3144
3146 for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
3148
3149 return i;
3150}

References Assert, fb(), i, numTempTableSpaces, tempTableSpaces, and TempTablespacesAreSet().

Referenced by FileSetInit().

◆ InitFileAccess()

void InitFileAccess ( void  )

Definition at line 904 of file fd.c.

905{
906 Assert(SizeVfdCache == 0); /* call me only once */
907
908 /* initialize cache header entry */
909 VfdCache = (Vfd *) malloc(sizeof(Vfd));
910 if (VfdCache == NULL)
913 errmsg("out of memory")));
914
915 MemSet(&(VfdCache[0]), 0, sizeof(Vfd));
917
918 SizeVfdCache = 1;
919}

References Assert, ereport, errcode(), errmsg, FATAL, fb(), vfd::fd, malloc, MemSet, SizeVfdCache, VFD_CLOSED, and VfdCache.

Referenced by BaseInit().

◆ InitTemporaryFileAccess()

void InitTemporaryFileAccess ( void  )

Definition at line 934 of file fd.c.

935{
936 Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
937 Assert(!temporary_files_allowed); /* call me only once */
938
939 /*
940 * Register before-shmem-exit hook to ensure temp files are dropped while
941 * we can still report stats.
942 */
944
945#ifdef USE_ASSERT_CHECKING
947#endif
948}

References Assert, before_shmem_exit(), BeforeShmemExit_Files(), fb(), and SizeVfdCache.

Referenced by BaseInit().

◆ Insert()

static void Insert ( File  file)
static

Definition at line 1301 of file fd.c.

1302{
1303 Vfd *vfdP;
1304
1305 Assert(file != 0);
1306
1307 DO_DB(elog(LOG, "Insert %d (%s)",
1308 file, VfdCache[file].fileName));
1309 DO_DB(_dump_lru());
1310
1311 vfdP = &VfdCache[file];
1312
1313 vfdP->lruMoreRecently = 0;
1314 vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1315 VfdCache[0].lruLessRecently = file;
1316 VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1317
1318 DO_DB(_dump_lru());
1319}

References Assert, DO_DB, elog, fb(), LOG, vfd::lruLessRecently, vfd::lruMoreRecently, and VfdCache.

Referenced by CreateCheckPoint(), FileAccess(), GetXLogInsertEndRecPtr(), GetXLogInsertRecPtr(), LruInsert(), PathNameOpenFilePerm(), ReserveXLogInsertLocation(), ReserveXLogSwitch(), StartupXLOG(), UpdateFullPageWrites(), WaitXLogInsertionsToFinish(), XLogInsertRecord(), and XLogWrite().

◆ looks_like_temp_rel_name()

bool looks_like_temp_rel_name ( const char * name)

Definition at line 3499 of file fd.c.

/*
 * looks_like_temp_rel_name --- does "name" have the form of a temp relation
 * file name?
 *
 * Accepted form: 't', one or more digits, '_', one or more digits, then
 * optionally '_' plus a fork name recognized by forkname_chars(), then
 * optionally '.' plus one or more digits, and nothing after that.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	const char *cur = name;
	const char *runStart;

	/* Must start with "t". */
	if (*cur != 't')
		return false;
	cur++;

	/* Followed by a non-empty string of digits and then an underscore. */
	runStart = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == runStart || *cur != '_')
		return false;
	cur++;

	/* Followed by another nonempty string of digits. */
	runStart = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == runStart)
		return false;

	/* We might have _forkname or .segment or both. */
	if (*cur == '_')
	{
		int			forkLen = forkname_chars(cur + 1, NULL);

		if (forkLen <= 0)
			return false;
		cur += forkLen + 1;
	}
	if (*cur == '.')
	{
		/* the segment number needs at least one digit after the dot */
		runStart = cur + 1;
		cur = runStart;
		while (isdigit((unsigned char) *cur))
			cur++;
		if (cur == runStart)
			return false;
	}

	/* Now we should be at the end. */
	return *cur == '\0';
}

References fb(), forkname_chars(), and name.

Referenced by RemovePgTempRelationFilesInDbspace(), and sendDir().

◆ LruDelete()

static void LruDelete ( File  file)
static

Definition at line 1273 of file fd.c.

1274{
1275 Vfd *vfdP;
1276
1277 Assert(file != 0);
1278
1279 DO_DB(elog(LOG, "LruDelete %d (%s)",
1280 file, VfdCache[file].fileName));
1281
1282 vfdP = &VfdCache[file];
1283
1285
1286 /*
1287 * Close the file. We aren't expecting this to fail; if it does, better
1288 * to leak the FD than to mess up our internal state.
1289 */
1290 if (close(vfdP->fd) != 0)
1292 "could not close file \"%s\": %m", vfdP->fileName);
1293 vfdP->fd = VFD_CLOSED;
1294 --nfile;
1295
1296 /* delete the vfd record from the LRU ring */
1297 Delete(file);
1298}

References Assert, close, data_sync_elevel(), Delete(), DO_DB, elog, fb(), FD_TEMP_FILE_LIMIT, LOG, nfile, pgaio_closing_fd(), VFD_CLOSED, and VfdCache.

Referenced by closeAllVfds(), and ReleaseLruFile().

◆ LruInsert()

static int LruInsert ( File  file)
static

Definition at line 1323 of file fd.c.

1324{
1325 Vfd *vfdP;
1326
1327 Assert(file != 0);
1328
1329 DO_DB(elog(LOG, "LruInsert %d (%s)",
1330 file, VfdCache[file].fileName));
1331
1332 vfdP = &VfdCache[file];
1333
1334 if (FileIsNotOpen(file))
1335 {
1336 /* Close excess kernel FDs. */
1338
1339 /*
1340 * The open could still fail for lack of file descriptors, eg due to
1341 * overall system file table being full. So, be prepared to release
1342 * another FD if necessary...
1343 */
1344 vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1345 vfdP->fileMode);
1346 if (vfdP->fd < 0)
1347 {
1348 DO_DB(elog(LOG, "re-open failed: %m"));
1349 return -1;
1350 }
1351 else
1352 {
1353 ++nfile;
1354 }
1355 }
1356
1357 /*
1358 * put it at the head of the Lru ring
1359 */
1360
1361 Insert(file);
1362
1363 return 0;
1364}

References Assert, BasicOpenFilePerm(), DO_DB, elog, fb(), FileIsNotOpen, Insert(), LOG, nfile, ReleaseLruFiles(), and VfdCache.

Referenced by FileAccess().

◆ MakePGDirectory()

◆ OpenPipeStream()

FILE * OpenPipeStream ( const char * command,
const char * mode 
)

Definition at line 2731 of file fd.c.

2732{
2733 FILE *file;
2734 int save_errno;
2735
2736 DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2737 numAllocatedDescs, command));
2738
2739 /* Can we allocate another non-virtual FD? */
2740 if (!reserveAllocatedDesc())
2741 ereport(ERROR,
2743 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2744 maxAllocatedDescs, command)));
2745
2746 /* Close excess kernel FDs. */
2748
2749TryAgain:
2750 fflush(NULL);
2752 errno = 0;
2753 file = popen(command, mode);
2754 save_errno = errno;
2756 errno = save_errno;
2757 if (file != NULL)
2758 {
2760
2761 desc->kind = AllocateDescPipe;
2762 desc->desc.file = file;
2765 return desc->desc.file;
2766 }
2767
2768 if (errno == EMFILE || errno == ENFILE)
2769 {
2770 ereport(LOG,
2772 errmsg("out of file descriptors: %m; release and retry")));
2773 if (ReleaseLruFile())
2774 goto TryAgain;
2775 errno = save_errno;
2776 }
2777
2778 return NULL;
2779}

References allocatedDescs, AllocateDescPipe, AllocateDesc::create_subid, AllocateDesc::desc, DO_DB, elog, ereport, errcode(), errmsg, ERROR, fb(), AllocateDesc::file, GetCurrentSubTransactionId(), AllocateDesc::kind, LOG, maxAllocatedDescs, mode, numAllocatedDescs, pqsignal, ReleaseLruFile(), ReleaseLruFiles(), reserveAllocatedDesc(), and SIGPIPE.

Referenced by BeginCopyFrom(), BeginCopyTo(), pg_import_system_collations(), run_ssl_passphrase_command(), and shell_run_command().

◆ OpenTemporaryFile()

File OpenTemporaryFile ( bool  interXact)

Definition at line 1712 of file fd.c.

1713{
1714 File file = 0;
1715
1716 Assert(temporary_files_allowed); /* check temp file access is up */
1717
1718 /*
1719 * Make sure the current resource owner has space for this File before we
1720 * open it, if we'll be registering it below.
1721 */
1722 if (!interXact)
1724
1725 /*
1726 * If some temp tablespace(s) have been given to us, try to use the next
1727 * one. If a given tablespace can't be found, we silently fall back to
1728 * the database's default tablespace.
1729 *
1730 * BUT: if the temp file is slated to outlive the current transaction,
1731 * force it into the database's default tablespace, so that it will not
1732 * pose a threat to possible tablespace drop attempts.
1733 */
1734 if (numTempTableSpaces > 0 && !interXact)
1735 {
1737
1738 if (OidIsValid(tblspcOid))
1740 }
1741
1742 /*
1743 * If not, or if tablespace is bad, create in database's default
1744 * tablespace. MyDatabaseTableSpace should normally be set before we get
1745 * here, but just in case it isn't, fall back to pg_default tablespace.
1746 */
1747 if (file <= 0)
1751 true);
1752
1753 /* Mark it for deletion at close and temporary file size limit */
1755
1756 /* Register it with the current resource owner */
1757 if (!interXact)
1759
1760 return file;
1761}

References Assert, CurrentResourceOwner, fb(), FD_DELETE_AT_CLOSE, FD_TEMP_FILE_LIMIT, vfd::fdstate, GetNextTempTableSpace(), MyDatabaseTableSpace, numTempTableSpaces, OidIsValid, OpenTemporaryFileInTablespace(), RegisterTemporaryFile(), ResourceOwnerEnlarge(), and VfdCache.

Referenced by BufFileCreateTemp(), and extendBufFile().

◆ OpenTemporaryFileInTablespace()

static File OpenTemporaryFileInTablespace ( Oid  tblspcOid,
bool  rejectError 
)
static

Definition at line 1792 of file fd.c.

1793{
1794 char tempdirpath[MAXPGPATH];
1795 char tempfilepath[MAXPGPATH];
1796 File file;
1797
1799
1800 /*
1801 * Generate a tempfile name that should be unique within the current
1802 * database instance.
1803 */
1804 snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1806
1807 /*
1808 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1809 * temp file that can be reused.
1810 */
1813 if (file <= 0)
1814 {
1815 /*
1816 * We might need to create the tablespace's tempfile directory, if no
1817 * one has yet done so.
1818 *
1819 * Don't check for an error from MakePGDirectory; it could fail if
1820 * someone else just did the same thing. If it doesn't work then
1821 * we'll bomb out on the second create attempt, instead.
1822 */
1824
1827 if (file <= 0 && rejectError)
1828 elog(ERROR, "could not create temporary file \"%s\": %m",
1829 tempfilepath);
1830 }
1831
1832 return file;
1833}

References elog, ERROR, fb(), MakePGDirectory(), MAXPGPATH, MyProcPid, PathNameOpenFile(), PG_BINARY, PG_TEMP_FILE_PREFIX, snprintf, tempFileCounter, and TempTablespacePath().

Referenced by OpenTemporaryFile().

◆ OpenTransientFile()

◆ OpenTransientFilePerm()

int OpenTransientFilePerm ( const char * fileName,
int  fileFlags,
mode_t  fileMode 
)

Definition at line 2687 of file fd.c.

2688{
2689 int fd;
2690
2691 DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2692 numAllocatedDescs, fileName));
2693
2694 /* Can we allocate another non-virtual FD? */
2695 if (!reserveAllocatedDesc())
2696 ereport(ERROR,
2698 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2699 maxAllocatedDescs, fileName)));
2700
2701 /* Close excess kernel FDs. */
2703
2704 fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2705
2706 if (fd >= 0)
2707 {
2709
2710 desc->kind = AllocateDescRawFD;
2711 desc->desc.fd = fd;
2714
2715 return fd;
2716 }
2717
2718 return -1; /* failure */
2719}

References allocatedDescs, AllocateDescRawFD, BasicOpenFilePerm(), AllocateDesc::create_subid, AllocateDesc::desc, DO_DB, elog, ereport, errcode(), errmsg, ERROR, fb(), AllocateDesc::fd, fd(), GetCurrentSubTransactionId(), AllocateDesc::kind, LOG, maxAllocatedDescs, numAllocatedDescs, ReleaseLruFiles(), and reserveAllocatedDesc().

Referenced by be_lo_export(), and OpenTransientFile().

◆ PathNameCreateTemporaryDir()

void PathNameCreateTemporaryDir ( const char * basedir,
const char * directory 
)

Definition at line 1648 of file fd.c.

1649{
1650 if (MakePGDirectory(directory) < 0)
1651 {
1652 if (errno == EEXIST)
1653 return;
1654
1655 /*
1656 * Failed. Try to create basedir first in case it's missing. Tolerate
1657 * EEXIST to close a race against another process following the same
1658 * algorithm.
1659 */
1660 if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1661 ereport(ERROR,
1663 errmsg("cannot create temporary directory \"%s\": %m",
1664 basedir)));
1665
1666 /* Try again. */
1667 if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1668 ereport(ERROR,
1670 errmsg("cannot create temporary subdirectory \"%s\": %m",
1671 directory)));
1672 }
1673}

References basedir, directory, ereport, errcode_for_file_access(), errmsg, ERROR, fb(), and MakePGDirectory().

Referenced by FileSetCreate().

◆ PathNameCreateTemporaryFile()

File PathNameCreateTemporaryFile ( const char * path,
bool  error_on_failure 
)

Definition at line 1849 of file fd.c.

1850{
1851 File file;
1852
1853 Assert(temporary_files_allowed); /* check temp file access is up */
1854
1856
1857 /*
1858 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1859 * temp file that can be reused.
1860 */
1861 file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1862 if (file <= 0)
1863 {
1864 if (error_on_failure)
1865 ereport(ERROR,
1867 errmsg("could not create temporary file \"%s\": %m",
1868 path)));
1869 else
1870 return file;
1871 }
1872
1873 /* Mark it for temp_file_limit accounting. */
1875
1876 /* Register it for automatic close. */
1878
1879 return file;
1880}

References Assert, CurrentResourceOwner, ereport, errcode_for_file_access(), errmsg, ERROR, fb(), FD_TEMP_FILE_LIMIT, vfd::fdstate, PathNameOpenFile(), PG_BINARY, RegisterTemporaryFile(), ResourceOwnerEnlarge(), and VfdCache.

Referenced by FileSetCreate().

◆ PathNameDeleteTemporaryDir()

void PathNameDeleteTemporaryDir ( const char * dirname)

Definition at line 1679 of file fd.c.

1680{
1681 struct stat statbuf;
1682
1683 /* Silently ignore missing directory. */
1684 if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1685 return;
1686
1687 /*
1688 * Currently, walkdir doesn't offer a way for our passed in function to
1689 * maintain state. Perhaps it should, so that we could tell the caller
1690 * whether this operation succeeded or failed. Since this operation is
1691 * used in a cleanup path, we wouldn't actually behave differently: we'll
1692 * just log failures.
1693 */
1694 walkdir(dirname, unlink_if_exists_fname, false, LOG);
1695}

References fb(), LOG, stat, unlink_if_exists_fname(), and walkdir().

Referenced by FileSetDeleteAll().

◆ PathNameDeleteTemporaryFile()

bool PathNameDeleteTemporaryFile ( const char * path,
bool  error_on_failure 
)

Definition at line 1920 of file fd.c.

1921{
1922 struct stat filestats;
1923 int stat_errno;
1924
1925 /* Get the final size for pgstat reporting. */
1926 if (stat(path, &filestats) != 0)
1927 stat_errno = errno;
1928 else
1929 stat_errno = 0;
1930
1931 /*
1932 * Unlike FileClose's automatic file deletion code, we tolerate
1933 * non-existence to support BufFileDeleteFileSet which doesn't know how
1934 * many segments it has to delete until it runs out.
1935 */
1936 if (stat_errno == ENOENT)
1937 return false;
1938
1939 if (unlink(path) < 0)
1940 {
1941 if (errno != ENOENT)
1944 errmsg("could not unlink temporary file \"%s\": %m",
1945 path)));
1946 return false;
1947 }
1948
1949 if (stat_errno == 0)
1950 ReportTemporaryFileUsage(path, filestats.st_size);
1951 else
1952 {
1953 errno = stat_errno;
1954 ereport(LOG,
1956 errmsg("could not stat file \"%s\": %m", path)));
1957 }
1958
1959 return true;
1960}

References ereport, errcode_for_file_access(), errmsg, ERROR, fb(), LOG, ReportTemporaryFileUsage(), and stat.

Referenced by FileSetDelete(), and unlink_if_exists_fname().

◆ PathNameOpenFile()

◆ PathNameOpenFilePerm()

File PathNameOpenFilePerm ( const char * fileName,
int  fileFlags,
mode_t  fileMode 
)

Definition at line 1576 of file fd.c.

1577{
1578 char *fnamecopy;
1579 File file;
1580 Vfd *vfdP;
1581
1582 DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1583 fileName, fileFlags, fileMode));
1584
1585 /*
1586 * We need a malloc'd copy of the file name; fail cleanly if no room.
1587 */
1588 fnamecopy = strdup(fileName);
1589 if (fnamecopy == NULL)
1590 ereport(ERROR,
1592 errmsg("out of memory")));
1593
1594 file = AllocateVfd();
1595 vfdP = &VfdCache[file];
1596
1597 /* Close excess kernel FDs. */
1599
1600 /*
1601 * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1602 * client shouldn't be expected to know which kernel descriptors are
1603 * currently open, so it wouldn't make sense for them to be inherited by
1604 * executed subprograms.
1605 */
1606 fileFlags |= O_CLOEXEC;
1607
1608 vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1609
1610 if (vfdP->fd < 0)
1611 {
1612 int save_errno = errno;
1613
1614 FreeVfd(file);
1615 free(fnamecopy);
1616 errno = save_errno;
1617 return -1;
1618 }
1619 ++nfile;
1620 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1621 vfdP->fd));
1622
1623 vfdP->fileName = fnamecopy;
1624 /* Saved flags are adjusted to be OK for re-opening file */
1625 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1626 vfdP->fileMode = fileMode;
1627 vfdP->fileSize = 0;
1628 vfdP->fdstate = 0x0;
1629 vfdP->resowner = NULL;
1630
1631 Insert(file);
1632
1633 return file;
1634}

References AllocateVfd(), BasicOpenFilePerm(), DO_DB, elog, ereport, errcode(), errmsg, ERROR, fb(), free, FreeVfd(), Insert(), LOG, nfile, O_CLOEXEC, ReleaseLruFiles(), and VfdCache.

Referenced by PathNameOpenFile().

◆ PathNameOpenTemporaryFile()

File PathNameOpenTemporaryFile ( const char * path,
int  mode 
)

Definition at line 1889 of file fd.c.

1890{
1891 File file;
1892
1893 Assert(temporary_files_allowed); /* check temp file access is up */
1894
1896
1897 file = PathNameOpenFile(path, mode | PG_BINARY);
1898
1899 /* If no such file, then we don't raise an error. */
1900 if (file <= 0 && errno != ENOENT)
1901 ereport(ERROR,
1903 errmsg("could not open temporary file \"%s\": %m",
1904 path)));
1905
1906 if (file > 0)
1907 {
1908 /* Register it for automatic close. */
1910 }
1911
1912 return file;
1913}

References Assert, CurrentResourceOwner, ereport, errcode_for_file_access(), errmsg, ERROR, fb(), mode, PathNameOpenFile(), PG_BINARY, RegisterTemporaryFile(), and ResourceOwnerEnlarge().

Referenced by FileSetOpen().

◆ pg_fdatasync()

int pg_fdatasync ( int  fd)

Definition at line 481 of file fd.c.

482{
483 int rc;
484
485 if (!enableFsync)
486 return 0;
487
488retry:
489 rc = fdatasync(fd);
490
491 if (rc == -1 && errno == EINTR)
492 goto retry;
493
494 return rc;
495}

References EINTR, enableFsync, fb(), fd(), and fdatasync().

Referenced by issue_xlog_fsync().

◆ pg_file_exists()

bool pg_file_exists ( const char * name)

Definition at line 504 of file fd.c.

505{
506 struct stat st;
507
508 Assert(name != NULL);
509
510 if (stat(name, &st) == 0)
511 return !S_ISDIR(st.st_mode);
512 else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES))
515 errmsg("could not access file \"%s\": %m", name)));
516
517 return false;
518}

References Assert, ereport, errcode_for_file_access(), errmsg, ERROR, fb(), name, S_ISDIR, stat::st_mode, and stat.

Referenced by expand_dynamic_library_name(), find_in_path(), find_in_paths(), and provider_init().

◆ pg_flush_data()

void pg_flush_data ( int  fd,
pgoff_t  offset,
pgoff_t  nbytes 
)

Definition at line 526 of file fd.c.

527{
528 /*
529 * Right now file flushing is primarily used to avoid making later
530 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
531 * if fsyncs are disabled - that's a decision we might want to make
532 * configurable at some point.
533 */
534 if (!enableFsync)
535 return;
536
537 /*
538 * We compile all alternatives that are supported on the current platform,
539 * to find portability problems more easily.
540 */
541#if defined(HAVE_SYNC_FILE_RANGE)
542 {
543 int rc;
544 static bool not_implemented_by_kernel = false;
545
547 return;
548
549retry:
550
551 /*
552 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
553 * tells the OS that writeback for the specified blocks should be
554 * started, but that we don't want to wait for completion. Note that
555 * this call might block if too much dirty data exists in the range.
556 * This is the preferable method on OSs supporting it, as it works
557 * reliably when available (contrast to msync()) and doesn't flush out
558 * clean data (like FADV_DONTNEED).
559 */
560 rc = sync_file_range(fd, offset, nbytes,
562 if (rc != 0)
563 {
564 int elevel;
565
566 if (rc == EINTR)
567 goto retry;
568
569 /*
570 * For systems that don't have an implementation of
571 * sync_file_range() such as Windows WSL, generate only one
572 * warning and then suppress all further attempts by this process.
573 */
574 if (errno == ENOSYS)
575 {
576 elevel = WARNING;
578 }
579 else
580 elevel = data_sync_elevel(WARNING);
581
582 ereport(elevel,
584 errmsg("could not flush dirty data: %m")));
585 }
586
587 return;
588 }
589#endif
590#if !defined(WIN32) && defined(MS_ASYNC)
591 {
592 void *p;
593 static int pagesize = 0;
594
595 /*
596 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
597 * writeback. On linux it only does so if MS_SYNC is specified, but
598 * then it does the writeback synchronously. Luckily all common linux
599 * systems have sync_file_range(). This is preferable over
600 * FADV_DONTNEED because it doesn't flush out clean data.
601 *
602 * We map the file (mmap()), tell the kernel to sync back the contents
603 * (msync()), and then remove the mapping again (munmap()).
604 */
605
606 /* mmap() needs actual length if we want to map whole file */
607 if (offset == 0 && nbytes == 0)
608 {
609 nbytes = lseek(fd, 0, SEEK_END);
610 if (nbytes < 0)
611 {
614 errmsg("could not determine dirty data size: %m")));
615 return;
616 }
617 }
618
619 /*
620 * Some platforms reject partial-page mmap() attempts. To deal with
621 * that, just truncate the request to a page boundary. If any extra
622 * bytes don't get flushed, well, it's only a hint anyway.
623 */
624
625 /* fetch pagesize only once */
626 if (pagesize == 0)
628
629 /* align length to pagesize, dropping any fractional page */
630 if (pagesize > 0)
631 nbytes = (nbytes / pagesize) * pagesize;
632
633 /* fractional-page request is a no-op */
634 if (nbytes <= 0)
635 return;
636
637 /*
638 * mmap could well fail, particularly on 32-bit platforms where there
639 * may simply not be enough address space. If so, silently fall
640 * through to the next implementation.
641 */
642 if (nbytes <= (pgoff_t) SSIZE_MAX)
643 p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
644 else
645 p = MAP_FAILED;
646
647 if (p != MAP_FAILED)
648 {
649 int rc;
650
651 rc = msync(p, (size_t) nbytes, MS_ASYNC);
652 if (rc != 0)
653 {
656 errmsg("could not flush dirty data: %m")));
657 /* NB: need to fall through to munmap()! */
658 }
659
660 rc = munmap(p, (size_t) nbytes);
661 if (rc != 0)
662 {
663 /* FATAL error because mapping would remain */
666 errmsg("could not munmap() while flushing data: %m")));
667 }
668
669 return;
670 }
671 }
672#endif
673#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
674 {
675 int rc;
676
677 /*
678 * Signal the kernel that the passed in range should not be cached
679 * anymore. This has the, desired, side effect of writing out dirty
680 * data, and the, undesired, side effect of likely discarding useful
681 * clean cached blocks. For the latter reason this is the least
682 * preferable method.
683 */
684
685 rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
686
687 if (rc != 0)
688 {
689 /* don't error out, this is just a performance optimization */
692 errmsg("could not flush dirty data: %m")));
693 }
694
695 return;
696 }
697#endif
698}

References data_sync_elevel(), EINTR, enableFsync, ereport, errcode_for_file_access(), errmsg, FATAL, fb(), fd(), MAP_FAILED, and WARNING.

Referenced by copy_file(), and FileWriteback().

◆ pg_fsync()

int pg_fsync ( int  fd)

Definition at line 390 of file fd.c.

/*
 * pg_fsync --- do fsync with or without writethrough.
 *
 * Dispatches to pg_fsync_writethrough() when the platform defines
 * HAVE_FSYNC_WRITETHROUGH and wal_sync_method selects it, otherwise to
 * pg_fsync_no_writethrough(); returns whatever that function returns.
 */
int
pg_fsync(int fd)
{
#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
	struct stat st;

	/*
	 * Some operating system implementations of fsync() have requirements
	 * about the file access modes that were used when their file descriptor
	 * argument was opened, and these requirements differ depending on whether
	 * the file descriptor is for a directory.
	 *
	 * For any file descriptor that may eventually be handed to fsync(), we
	 * should have opened it with access modes that are compatible with
	 * fsync() on all supported systems, otherwise the code may not be
	 * portable, even if it runs ok on the current system.
	 *
	 * We assert here that a descriptor for a file was opened with write
	 * permissions (i.e., not O_RDONLY) and for a directory without write
	 * permissions (O_RDONLY).  Notice that the assertion check is made even
	 * if fsync() is disabled.
	 *
	 * If fstat() fails, ignore it and let the follow-up fsync() complain.
	 */
	if (fstat(fd, &st) == 0)
	{
		int			desc_flags = fcntl(fd, F_GETFL);

		/* Only the access-mode bits matter for the checks below. */
		desc_flags &= O_ACCMODE;

		if (S_ISDIR(st.st_mode))
			Assert(desc_flags == O_RDONLY);
		else
			Assert(desc_flags != O_RDONLY);
	}
	/* Don't let a failed fstat()/fcntl() leak its errno to the caller. */
	errno = 0;
#endif

	/* #if is to skip the wal_sync_method test if there's no need for it */
#if defined(HAVE_FSYNC_WRITETHROUGH)
	if (wal_sync_method == WAL_SYNC_METHOD_FSYNC_WRITETHROUGH)
		return pg_fsync_writethrough(fd);
	else
#endif
		return pg_fsync_no_writethrough(fd);
}

References Assert, fb(), fd(), fstat, pg_fsync_no_writethrough(), pg_fsync_writethrough(), S_ISDIR, stat::st_mode, wal_sync_method, and WAL_SYNC_METHOD_FSYNC_WRITETHROUGH.

Referenced by AddToDataDirLockFile(), assign_wal_sync_method(), BootStrapXLOG(), CheckPointLogicalRewriteHeap(), CreateDirAndVersionFile(), CreateLockFile(), durable_rename(), FileSync(), fsync_fname_ext(), heap_xlog_logical_rewrite(), readRecoverySignalFile(), RecreateTwoPhaseFile(), RestoreSlotFromDisk(), SaveSlotToPath(), SlruPhysicalWritePage(), SlruSyncFileTag(), SnapBuildSerialize(), update_controlfile(), write_auto_conf_file(), WriteControlFile(), writeTimeLineHistory(), writeTimeLineHistoryFile(), XLogFileCopy(), and XLogFileInitInternal().

◆ pg_fsync_no_writethrough()

int pg_fsync_no_writethrough ( int  fd)

Definition at line 442 of file fd.c.

443{
444 int rc;
445
446 if (!enableFsync)
447 return 0;
448
449retry:
450 rc = fsync(fd);
451
452 if (rc == -1 && errno == EINTR)
453 goto retry;
454
455 return rc;
456}

References EINTR, enableFsync, fb(), fd(), and fsync.

Referenced by issue_xlog_fsync(), and pg_fsync().

◆ pg_fsync_writethrough()

int pg_fsync_writethrough ( int  fd)

Definition at line 462 of file fd.c.

463{
464 if (enableFsync)
465 {
466#if defined(F_FULLFSYNC)
467 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
468#else
469 errno = ENOSYS;
470 return -1;
471#endif
472 }
473 else
474 return 0;
475}

References enableFsync, fb(), and fd().

Referenced by issue_xlog_fsync(), pg_fsync(), and test_sync().

◆ pg_ftruncate()

static int pg_ftruncate ( int  fd,
pgoff_t  length 
)
static

Definition at line 704 of file fd.c.

705{
706 int ret;
707
708retry:
709 ret = ftruncate(fd, length);
710
711 if (ret == -1 && errno == EINTR)
712 goto retry;
713
714 return ret;
715}

References EINTR, fb(), and fd().

Referenced by FileTruncate(), and pg_truncate().

◆ pg_truncate()

int pg_truncate ( const char * path,
pgoff_t  length 
)

Definition at line 721 of file fd.c.

722{
723 int ret;
724#ifdef WIN32
725 int save_errno;
726 int fd;
727
729 if (fd >= 0)
730 {
731 ret = pg_ftruncate(fd, length);
735 }
736 else
737 ret = -1;
738#else
739
740retry:
741 ret = truncate(path, length);
742
743 if (ret == -1 && errno == EINTR)
744 goto retry;
745#endif
746
747 return ret;
748}

References CloseTransientFile(), EINTR, fb(), fd(), OpenTransientFile(), PG_BINARY, and pg_ftruncate().

Referenced by do_truncate().

◆ ReadDir()

◆ ReadDirExtended()

struct dirent * ReadDirExtended ( DIR * dir,
const char * dirname,
int  elevel 
)

Definition at line 2972 of file fd.c.

/*
 * ReadDirExtended --- readdir() with error reporting at a chosen level.
 *
 * 'dir' may be NULL (a failed directory open); in that case a "could not
 * open directory" message is reported at 'elevel' and NULL is returned.
 * Otherwise returns the next entry, or NULL at end of directory or on a read
 * error; a read error is also reported at 'elevel'.  'dirname' is used only
 * in the messages.
 */
struct dirent *
ReadDirExtended(DIR *dir, const char *dirname, int elevel)
{
	struct dirent *dent;

	/* Give a generic message for AllocateDir failure, if caller didn't */
	if (dir == NULL)
	{
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not open directory \"%s\": %m",
						dirname)));
		return NULL;
	}

	/* Clear errno so end-of-directory can be told apart from failure. */
	errno = 0;
	if ((dent = readdir(dir)) != NULL)
		return dent;

	if (errno)
		ereport(elevel,
				(errcode_for_file_access(),
				 errmsg("could not read directory \"%s\": %m",
						dirname)));
	return NULL;
}

References ereport, errcode_for_file_access(), errmsg, fb(), and readdir().

Referenced by DeleteAllExportedSnapshotFiles(), ReadDir(), RelationCacheInitFileRemove(), RelationCacheInitFileRemoveInDir(), RemovePgTempFiles(), RemovePgTempFilesInDir(), RemovePgTempRelationFiles(), RemovePgTempRelationFilesInDbspace(), ReorderBufferCleanupSerializedTXNs(), scan_directory_ci(), SyncDataDirectory(), and walkdir().

◆ RegisterTemporaryFile()

static void RegisterTemporaryFile ( File  file)
static

◆ ReleaseExternalFD()

◆ ReleaseLruFile()

static bool ReleaseLruFile ( void  )
static

Definition at line 1370 of file fd.c.

1371{
1372 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1373
1374 if (nfile > 0)
1375 {
1376 /*
1377 * There are opened files and so there should be at least one used vfd
1378 * in the ring.
1379 */
1380 Assert(VfdCache[0].lruMoreRecently != 0);
1381 LruDelete(VfdCache[0].lruMoreRecently);
1382 return true; /* freed a file */
1383 }
1384 return false; /* no files available to free */
1385}

References Assert, DO_DB, elog, LOG, LruDelete(), nfile, and VfdCache.

Referenced by AllocateDir(), AllocateFile(), BasicOpenFilePerm(), OpenPipeStream(), and ReleaseLruFiles().

◆ ReleaseLruFiles()

static void ReleaseLruFiles ( void  )
static

◆ RemovePgTempFiles()

void RemovePgTempFiles ( void  )

Definition at line 3323 of file fd.c.

/*
 * RemovePgTempFiles --- remove temporary files left under the data
 * directory.
 *
 * As far as this listing shows: builds "base/<PG_TEMP_FILES_DIR>" and hands
 * it to RemovePgTempFilesInDir(path, true, false), then repeats the cleanup
 * for every directory entry other than "." and ".." found by the loop.
 *
 * NOTE(review): this listing is incomplete.  Embedded source lines 3325,
 * 3334, 3339, 3341, 3348-3349, 3353-3354 and 3357 are absent (the temp_path
 * declaration, the directory open/loop header, the snprintf() arguments and
 * the closing cleanup call), so consult the real fd.c before relying on it.
 */
3324{
3326 DIR *spc_dir;
3327 struct dirent *spc_de;
3328
3329 /*
3330 * First process temp files in pg_default ($PGDATA/base)
3331 */
3332 snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3333 RemovePgTempFilesInDir(temp_path, true, false);
3335
3336 /*
3337 * Cycle through temp directories for all non-default tablespaces.
3338 */
3340
3342 {
3343 if (strcmp(spc_de->d_name, ".") == 0 ||
3344 strcmp(spc_de->d_name, "..") == 0)
3345 continue;
3346
3347 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s/%s",
3350 RemovePgTempFilesInDir(temp_path, true, false);
3351
3352 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
3355 }
3356
3358
3359 /*
3360 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3361 * DataDir as well. However, that is *not* cleaned here because doing so
3362 * would create a race condition. It's done separately, earlier in
3363 * postmaster startup.
3364 */
3365}

References AllocateDir(), fb(), FreeDir(), LOG, MAXPGPATH, PG_TBLSPC_DIR, PG_TEMP_FILES_DIR, ReadDirExtended(), RemovePgTempFilesInDir(), RemovePgTempRelationFiles(), snprintf, and TABLESPACE_VERSION_DIRECTORY.

Referenced by PostmasterMain(), and PostmasterStateMachine().

◆ RemovePgTempFilesInDir()

void RemovePgTempFilesInDir ( const char * tmpdirname,
bool  missing_ok,
bool  unlink_all 
)

Definition at line 3383 of file fd.c.

/*
 * RemovePgTempFilesInDir --- remove the contents of one temp directory.
 *
 * tmpdirname: directory to clean.
 * missing_ok: if true, return silently when the directory cannot be opened
 *             because it does not exist (ENOENT).
 * unlink_all: if true, remove every entry; otherwise only entries that pass
 *             the strncmp() name test are removed and anything else is
 *             logged as an unexpected file.
 *
 * Directories are emptied recursively (with missing_ok = false and
 * unlink_all = true) and then removed with rmdir(); other entries are
 * unlink()ed.  Removal failures are reported at LOG level and do not stop
 * the scan.
 *
 * NOTE(review): this listing is incomplete.  Embedded source lines 3389,
 * 3394, 3405-3406, 3408, 3419, 3427 and 3438 are absent (directory open,
 * loop header, the prefix being compared, the 'type' declaration, two
 * errcode lines and the final cleanup).
 */
3384{
3385 DIR *temp_dir;
3386 struct dirent *temp_de;
3387 char rm_path[MAXPGPATH * 2];
3388
3390
3391 if (temp_dir == NULL && errno == ENOENT && missing_ok)
3392 return;
3393
3395 {
3396 if (strcmp(temp_de->d_name, ".") == 0 ||
3397 strcmp(temp_de->d_name, "..") == 0)
3398 continue;
3399
3400 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3401 tmpdirname, temp_de->d_name);
3402
3403 if (unlink_all ||
3404 strncmp(temp_de->d_name,
3407 {
3409
3410 if (type == PGFILETYPE_ERROR)
3411 continue;
3412 else if (type == PGFILETYPE_DIR)
3413 {
3414 /* recursively remove contents, then directory itself */
3415 RemovePgTempFilesInDir(rm_path, false, true);
3416
3417 if (rmdir(rm_path) < 0)
3418 ereport(LOG,
3420 errmsg("could not remove directory \"%s\": %m",
3421 rm_path)));
3422 }
3423 else
3424 {
3425 if (unlink(rm_path) < 0)
3426 ereport(LOG,
3428 errmsg("could not remove file \"%s\": %m",
3429 rm_path)));
3430 }
3431 }
3432 else
3433 ereport(LOG,
3434 (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3435 rm_path)));
3436 }
3437
3439}

References AllocateDir(), ereport, errcode_for_file_access(), errmsg, fb(), FreeDir(), get_dirent_type(), LOG, MAXPGPATH, PG_TEMP_FILE_PREFIX, PGFILETYPE_DIR, PGFILETYPE_ERROR, ReadDirExtended(), RemovePgTempFilesInDir(), snprintf, and type.

Referenced by PostmasterMain(), RemovePgTempFiles(), and RemovePgTempFilesInDir().

◆ RemovePgTempRelationFiles()

static void RemovePgTempRelationFiles ( const char * tsdirname)
static

Definition at line 3443 of file fd.c.

3444{
3445 DIR *ts_dir;
3446 struct dirent *de;
3447 char dbspace_path[MAXPGPATH * 2];
3448
3450
3451 while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3452 {
3453 /*
3454 * We're only interested in the per-database directories, which have
3455 * numeric names. Note that this code will also (properly) ignore "."
3456 * and "..".
3457 */
3458 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3459 continue;
3460
3461 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3462 tsdirname, de->d_name);
3464 }
3465
3466 FreeDir(ts_dir);
3467}

References AllocateDir(), fb(), FreeDir(), LOG, MAXPGPATH, ReadDirExtended(), RemovePgTempRelationFilesInDbspace(), and snprintf.

Referenced by RemovePgTempFiles().

◆ RemovePgTempRelationFilesInDbspace()

static void RemovePgTempRelationFilesInDbspace ( const char * dbspacedirname)
static

Definition at line 3471 of file fd.c.

3472{
3474 struct dirent *de;
3475 char rm_path[MAXPGPATH * 2];
3476
3478
3480 {
3481 if (!looks_like_temp_rel_name(de->d_name))
3482 continue;
3483
3484 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3485 dbspacedirname, de->d_name);
3486
3487 if (unlink(rm_path) < 0)
3488 ereport(LOG,
3490 errmsg("could not remove file \"%s\": %m",
3491 rm_path)));
3492 }
3493
3495}

References AllocateDir(), ereport, errcode_for_file_access(), errmsg, fb(), FreeDir(), LOG, looks_like_temp_rel_name(), MAXPGPATH, ReadDirExtended(), and snprintf.

Referenced by RemovePgTempRelationFiles().

◆ ReportTemporaryFileUsage()

static void ReportTemporaryFileUsage ( const char * path,
pgoff_t  size 
)
static

Definition at line 1516 of file fd.c.

1517{
1519
1520 if (log_temp_files >= 0)
1521 {
1522 if ((size / 1024) >= log_temp_files)
1523 ereport(LOG,
1524 (errmsg("temporary file: path \"%s\", size %lu",
1525 path, (unsigned long) size)));
1526 }
1527}

References ereport, errmsg, LOG, log_temp_files, and pgstat_report_tempfile().

Referenced by FileClose(), and PathNameDeleteTemporaryFile().

◆ reserveAllocatedDesc()

static bool reserveAllocatedDesc ( void  )
static

Definition at line 2553 of file fd.c.

/*
 * reserveAllocatedDesc --- make sure allocatedDescs[] has a free slot.
 *
 * Returns true if a slot is available (possibly after creating or enlarging
 * the array) and false if the array cannot be grown any further.  Running
 * out of memory on the very first allocation raises ERROR; running out of
 * memory while enlarging just returns false.
 *
 * NOTE(review): this listing is incomplete.  Embedded source lines 2555,
 * 2559, 2571, 2575, 2577-2578, 2594, 2596 and 2601-2602 are absent (the
 * newDescs declaration, the free-slot test, the malloc/realloc calls and the
 * assignments that publish the new array and its size).
 */
2554{
2556 int newMax;
2557
2558 /* Quick out if array already has a free slot. */
2560 return true;
2561
2562 /*
2563 * If the array hasn't yet been created in the current process, initialize
2564 * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2565 * we will ever need, anyway. We don't want to look at max_safe_fds
2566 * immediately because set_max_safe_fds() may not have run yet.
2567 */
2568 if (allocatedDescs == NULL)
2569 {
2570 newMax = FD_MINFREE / 3;
2572 /* Out of memory already? Treat as fatal error. */
2573 if (newDescs == NULL)
2574 ereport(ERROR,
2576 errmsg("out of memory")));
2579 return true;
2580 }
2581
2582 /*
2583 * Consider enlarging the array beyond the initial allocation used above.
2584 * By the time this happens, max_safe_fds should be known accurately.
2585 *
2586 * We mustn't let allocated descriptors hog all the available FDs, and in
2587 * practice we'd better leave a reasonable number of FDs for VFD use. So
2588 * set the maximum to max_safe_fds / 3. (This should certainly be at
2589 * least as large as the initial size, FD_MINFREE / 3, so we aren't
2590 * tightening the restriction here.) Recall that "external" FDs are
2591 * allowed to consume another third of max_safe_fds.
2592 */
2593 newMax = max_safe_fds / 3;
2595 {
2597 newMax * sizeof(AllocateDesc));
2598 /* Treat out-of-memory as a non-fatal error. */
2599 if (newDescs == NULL)
2600 return false;
2603 return true;
2604 }
2605
2606 /* Can't enlarge allocatedDescs[] any more. */
2607 return false;
2608}

References allocatedDescs, ereport, errcode(), errmsg, ERROR, fb(), FD_MINFREE, malloc, max_safe_fds, maxAllocatedDescs, numAllocatedDescs, and realloc.

Referenced by AllocateDir(), AllocateFile(), OpenPipeStream(), and OpenTransientFilePerm().

◆ ReserveExternalFD()

void ReserveExternalFD ( void  )

Definition at line 1207 of file fd.c.

1208{
1209 /*
1210 * Release VFDs if needed to stay safe. Because we do this before
1211 * incrementing numExternalFDs, the final state will be as desired, i.e.,
1212 * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1213 */
1215
1217}

References numExternalFDs, and ReleaseLruFiles().

Referenced by AcquireExternalFD(), BackendInitialize(), dsm_impl_posix(), InitializeWaitEventSupport(), InitPostmasterDeathWatchHandle(), and XLogWrite().

◆ ResourceOwnerForgetFile()

static void ResourceOwnerForgetFile ( ResourceOwner  owner,
File  file 
)
inlinestatic

Definition at line 381 of file fd.c.

382{
384}

References file_resowner_desc, Int32GetDatum(), and ResourceOwnerForget().

Referenced by FileClose().

◆ ResourceOwnerRememberFile()

static void ResourceOwnerRememberFile ( ResourceOwner  owner,
File  file 
)
inlinestatic

Definition at line 376 of file fd.c.

377{
379}

References file_resowner_desc, Int32GetDatum(), and ResourceOwnerRemember().

Referenced by RegisterTemporaryFile().

◆ ResOwnerPrintFile()

static char * ResOwnerPrintFile ( Datum  res)
static

Definition at line 4103 of file fd.c.

4104{
4105 return psprintf("File %d", DatumGetInt32(res));
4106}

References DatumGetInt32(), and psprintf().

◆ ResOwnerReleaseFile()

static void ResOwnerReleaseFile ( Datum  res)
static

Definition at line 4089 of file fd.c.

4090{
4091 File file = (File) DatumGetInt32(res);
4092 Vfd *vfdP;
4093
4094 Assert(FileIsValid(file));
4095
4096 vfdP = &VfdCache[file];
4097 vfdP->resowner = NULL;
4098
4099 FileClose(file);
4100}

References Assert, DatumGetInt32(), fb(), FileClose(), FileIsValid, vfd::resowner, and VfdCache.

◆ set_max_safe_fds()

void set_max_safe_fds ( void  )

Definition at line 1045 of file fd.c.

/*
 * set_max_safe_fds --- determine how many file descriptors this process may
 * safely use.
 *
 * Per the comments below, the limit is MIN(usable_fds,
 * max_files_per_process) reduced by the FDs reserved for system() etc.  If
 * that is not enough to get by, the process exits with a FATAL "insufficient
 * file descriptors" error.  The outcome is logged at DEBUG2.
 *
 * NOTE(review): this listing is incomplete.  Embedded source lines
 * 1059-1060, 1062, 1067, 1072, 1074, 1077-1078 and 1082 are absent (the
 * probing call, the max_safe_fds computation, the sufficiency test and
 * several message arguments).
 */
1046{
1047 int usable_fds;
1048 int already_open;
1049
1050 /*----------
1051 * We want to set max_safe_fds to
1052 * MIN(usable_fds, max_files_per_process)
1053 * less the slop factor for files that are opened without consulting
1054 * fd.c. This ensures that we won't allow to open more than
1055 * max_files_per_process, or the experimentally-determined EMFILE limit,
1056 * additional files.
1057 *----------
1058 */
1061
1063
1064 /*
1065 * Take off the FDs reserved for system() etc.
1066 */
1068
1069 /*
1070 * Make sure we still have enough to get by.
1071 */
1073 ereport(FATAL,
1075 errmsg("insufficient file descriptors available to start server process"),
1076 errdetail("System allows %d, server needs at least %d, %d files are already open.",
1079 already_open)));
1080
1081 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1083}

References count_usable_fds(), DEBUG2, elog, ereport, errcode(), errdetail(), errmsg, FATAL, fb(), FD_MINFREE, max_files_per_process, max_safe_fds, Min, and NUM_RESERVED_FDS.

Referenced by BootstrapModeMain(), PostgresSingleUserMain(), and PostmasterMain().

◆ SetTempTablespaces()

void SetTempTablespaces ( Oid * tableSpaces,
int  numSpaces 
)

Definition at line 3097 of file fd.c.

3098{
3099 Assert(numSpaces >= 0);
3102
3103 /*
3104 * Select a random starting point in the list. This is to minimize
3105 * conflicts between backends that are most likely sharing the same list
3106 * of temp tablespaces. Note that if we create multiple temp files in the
3107 * same transaction, we'll advance circularly through the list --- this
3108 * ensures that large temporary sort files are nicely spread across all
3109 * available tablespaces.
3110 */
3111 if (numSpaces > 1)
3113 0, numSpaces - 1);
3114 else
3116}

References Assert, fb(), nextTempTableSpace, numTempTableSpaces, pg_global_prng_state, pg_prng_uint64_range(), and tempTableSpaces.

Referenced by assign_temp_tablespaces(), and PrepareTempTablespaces().

◆ SyncDataDirectory()

void SyncDataDirectory ( void  )

Definition at line 3594 of file fd.c.

/*
 * SyncDataDirectory --- flush the whole data directory to disk.
 *
 * Does nothing when enableFsync is off.  Otherwise, as far as this listing
 * shows, it either issues do_syncfs() on the data directory, each entry of
 * PG_TBLSPC_DIR, and pg_wal when that is a symlink (HAVE_SYNCFS builds), or
 * walks the tree with walkdir(): optionally first with pre_sync_fname at
 * DEBUG1 (PG_FLUSH_DATA_WORKS builds), then with datadir_fsync_fname at LOG.
 * Failures are logged rather than raised.
 *
 * NOTE(review): this listing is incomplete.  Embedded source lines 3613,
 * 3621, 3635, 3640, 3661, 3671, 3675 and 3689 are absent (an errcode line,
 * the sync-method test, the progress-reporting calls, the AllocateDir call
 * and the walkdir() calls that cover pg_tblspc).
 */
3595{
3596 bool xlog_is_symlink;
3597
3598 /* We can skip this whole thing if fsync is disabled. */
3599 if (!enableFsync)
3600 return;
3601
3602 /*
3603 * If pg_wal is a symlink, we'll need to recurse into it separately,
3604 * because the first walkdir below will ignore it.
3605 */
3606 xlog_is_symlink = false;
3607
3608 {
3609 struct stat st;
3610
3611 if (lstat("pg_wal", &st) < 0)
3612 ereport(LOG,
3614 errmsg("could not stat file \"%s\": %m",
3615 "pg_wal")));
3616 else if (S_ISLNK(st.st_mode))
3617 xlog_is_symlink = true;
3618 }
3619
3620#ifdef HAVE_SYNCFS
3622 {
3623 DIR *dir;
3624 struct dirent *de;
3625
3626 /*
3627 * On Linux, we don't have to open every single file one by one. We
3628 * can use syncfs() to sync whole filesystems. We only expect
3629 * filesystem boundaries to exist where we tolerate symlinks, namely
3630 * pg_wal and the tablespaces, so we call syncfs() for each of those
3631 * directories.
3632 */
3633
3634 /* Prepare to report progress syncing the data directory via syncfs. */
3636
3637 /* Sync the top level pgdata directory. */
3638 do_syncfs(".");
3639 /* If any tablespaces are configured, sync each of those. */
3641 while ((de = ReadDirExtended(dir, PG_TBLSPC_DIR, LOG)))
3642 {
3643 char path[MAXPGPATH];
3644
3645 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3646 continue;
3647
3648 snprintf(path, MAXPGPATH, "%s/%s", PG_TBLSPC_DIR, de->d_name);
3649 do_syncfs(path);
3650 }
3651 FreeDir(dir);
3652 /* If pg_wal is a symlink, process that too. */
3653 if (xlog_is_symlink)
3654 do_syncfs("pg_wal");
3655 return;
3656 }
3657#endif /* HAVE_SYNCFS */
3658
3659#ifdef PG_FLUSH_DATA_WORKS
3660 /* Prepare to report progress of the pre-fsync phase. */
3662
3663 /*
3664 * If possible, hint to the kernel that we're soon going to fsync the data
3665 * directory and its contents. Errors in this step are even less
3666 * interesting than normal, so log them only at DEBUG1.
3667 */
3668 walkdir(".", pre_sync_fname, false, DEBUG1);
3669 if (xlog_is_symlink)
3670 walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3672#endif
3673
3674 /* Prepare to report progress syncing the data directory via fsync. */
3676
3677 /*
3678 * Now we do the fsync()s in the same order.
3679 *
3680 * The main call ignores symlinks, so in addition to specially processing
3681 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3682 * process_symlinks = true. Note that if there are any plain directories
3683 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3684 * so we don't worry about optimizing it.
3685 */
3686 walkdir(".", datadir_fsync_fname, false, LOG);
3687 if (xlog_is_symlink)
3688 walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3690}

References AllocateDir(), begin_startup_progress_phase(), DATA_DIR_SYNC_METHOD_SYNCFS, datadir_fsync_fname(), DEBUG1, enableFsync, ereport, errcode_for_file_access(), errmsg, fb(), FreeDir(), LOG, lstat, MAXPGPATH, PG_TBLSPC_DIR, ReadDirExtended(), recovery_init_sync_method, S_ISLNK, snprintf, stat::st_mode, and walkdir().

Referenced by StartupXLOG().

◆ TempTablespacePath()

void TempTablespacePath ( char * path,
Oid  tablespace 
)

Definition at line 1767 of file fd.c.

1768{
1769 /*
1770 * Identify the tempfile directory for this tablespace.
1771 *
1772 * If someone tries to specify pg_global, use pg_default instead.
1773 */
1774 if (tablespace == InvalidOid ||
1777 snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1778 else
1779 {
1780 /* All other tablespaces are accessed via symlinks */
1781 snprintf(path, MAXPGPATH, "%s/%u/%s/%s",
1784 }
1785}

References fb(), InvalidOid, MAXPGPATH, PG_TBLSPC_DIR, PG_TEMP_FILES_DIR, snprintf, tablespace, and TABLESPACE_VERSION_DIRECTORY.

Referenced by FileSetCreate(), FileSetPath(), OpenTemporaryFileInTablespace(), and pg_ls_tmpdir().

◆ TempTablespacesAreSet()

bool TempTablespacesAreSet ( void  )

Definition at line 3126 of file fd.c.

3127{
3128 return (numTempTableSpaces >= 0);
3129}

References numTempTableSpaces.

Referenced by GetTempTablespaces(), and PrepareTempTablespaces().

◆ unlink_if_exists_fname()

static void unlink_if_exists_fname ( const char * fname,
bool  isdir,
int  elevel 
)
static

Definition at line 3822 of file fd.c.

/*
 * unlink_if_exists_fname --- walkdir() action that removes one entry.
 *
 * fname:  path of the entry.
 * isdir:  true to rmdir() it; a failure other than ENOENT is reported at
 *         'elevel'.  False to hand it to PathNameDeleteTemporaryFile().
 * elevel: error level for the rmdir() failure report.
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}

References ereport, errcode_for_file_access(), errmsg, fb(), and PathNameDeleteTemporaryFile().

Referenced by PathNameDeleteTemporaryDir().

◆ walkdir()

static void walkdir ( const char * path,
void(*)(const char *fname, bool isdir, int elevel)  action,
bool  process_symlinks,
int  elevel 
)
static

Definition at line 3708 of file fd.c.

3712{
3713 DIR *dir;
3714 struct dirent *de;
3715
3716 dir = AllocateDir(path);
3717
3718 while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3719 {
3720 char subpath[MAXPGPATH * 2];
3721
3723
3724 if (strcmp(de->d_name, ".") == 0 ||
3725 strcmp(de->d_name, "..") == 0)
3726 continue;
3727
3728 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3729
3730 switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3731 {
3732 case PGFILETYPE_REG:
3733 (*action) (subpath, false, elevel);
3734 break;
3735 case PGFILETYPE_DIR:
3736 walkdir(subpath, action, false, elevel);
3737 break;
3738 default:
3739
3740 /*
3741 * Errors are already reported directly by get_dirent_type(),
3742 * and any remaining symlinks and unknown file types are
3743 * ignored.
3744 */
3745 break;
3746 }
3747 }
3748
3749 FreeDir(dir); /* we ignore any error here */
3750
3751 /*
3752 * It's important to fsync the destination directory itself as individual
3753 * file fsyncs don't guarantee that the directory entry for the file is
3754 * synced. However, skip this if AllocateDir failed; the action function
3755 * might not be robust against that.
3756 */
3757 if (dir)
3758 (*action) (path, true, elevel);
3759}

References AllocateDir(), CHECK_FOR_INTERRUPTS, fb(), FreeDir(), get_dirent_type(), MAXPGPATH, PGFILETYPE_DIR, PGFILETYPE_REG, ReadDirExtended(), snprintf, subpath(), and walkdir().

Referenced by PathNameDeleteTemporaryDir(), SyncDataDirectory(), and walkdir().

Variable Documentation

◆ allocatedDescs

◆ data_sync_retry

bool data_sync_retry = false

Definition at line 163 of file fd.c.

Referenced by data_sync_elevel().

◆ file_extend_method

int file_extend_method = DEFAULT_FILE_EXTEND_METHOD

Definition at line 169 of file fd.c.

Referenced by mdzeroextend().

◆ file_resowner_desc

const ResourceOwnerDesc file_resowner_desc
static
Initial value:
=
{
.name = "File",
.release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
.release_priority = RELEASE_PRIO_FILES,
.ReleaseResource = ResOwnerReleaseFile,
.DebugPrint = ResOwnerPrintFile
}

Definition at line 365 of file fd.c.

/*
 * Initializer of file_resowner_desc, the ResourceOwnerDesc used by
 * ResourceOwnerRememberFile() and ResourceOwnerForgetFile().  Resources of
 * this kind are released by ResOwnerReleaseFile() and described for
 * debugging by ResOwnerPrintFile().
 *
 * NOTE(review): the declaration line (embedded source line 365) is absent
 * from this listing.
 */
366{
367 .name = "File",
368 .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
369 .release_priority = RELEASE_PRIO_FILES,
370 .ReleaseResource = ResOwnerReleaseFile,
371 .DebugPrint = ResOwnerPrintFile
372};

Referenced by ResourceOwnerForgetFile(), and ResourceOwnerRememberFile().

◆ have_xact_temporary_files

bool have_xact_temporary_files = false
static

Definition at line 232 of file fd.c.

Referenced by CleanupTempFiles(), and RegisterTemporaryFile().

◆ io_direct_flags

◆ max_files_per_process

int max_files_per_process = 1000

Definition at line 147 of file fd.c.

Referenced by set_max_safe_fds().

◆ max_safe_fds

int max_safe_fds = FD_MINFREE

Definition at line 160 of file fd.c.

Referenced by AcquireExternalFD(), ReleaseLruFiles(), reserveAllocatedDesc(), and set_max_safe_fds().

◆ maxAllocatedDescs

int maxAllocatedDescs = 0
static

◆ nextTempTableSpace

int nextTempTableSpace = 0
static

Definition at line 294 of file fd.c.

Referenced by GetNextTempTableSpace(), and SetTempTablespaces().

◆ nfile

int nfile = 0
static

◆ numAllocatedDescs

◆ numExternalFDs

int numExternalFDs = 0
static

Definition at line 278 of file fd.c.

Referenced by AcquireExternalFD(), ReleaseExternalFD(), ReleaseLruFiles(), and ReserveExternalFD().

◆ numTempTableSpaces

int numTempTableSpaces = -1
static

◆ recovery_init_sync_method

int recovery_init_sync_method = DATA_DIR_SYNC_METHOD_FSYNC

Definition at line 166 of file fd.c.

Referenced by SyncDataDirectory().

◆ SizeVfdCache

Size SizeVfdCache = 0
static

◆ tempFileCounter

long tempFileCounter = 0
static

Definition at line 284 of file fd.c.

Referenced by OpenTemporaryFileInTablespace().

◆ temporary_files_size

uint64 temporary_files_size = 0
static

Definition at line 240 of file fd.c.

Referenced by FileClose(), FileTruncate(), and FileWriteV().

◆ tempTableSpaces

Oid* tempTableSpaces = NULL
static

◆ VfdCache