PostgreSQL Source Code git master
Loading...
Searching...
No Matches
fd.c File Reference
#include "postgres.h"
#include <dirent.h>
#include <sys/file.h>
#include <sys/param.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <limits.h>
#include <unistd.h>
#include <fcntl.h>
#include "access/xact.h"
#include "access/xlog.h"
#include "catalog/pg_tablespace.h"
#include "common/file_perm.h"
#include "common/file_utils.h"
#include "common/pg_prng.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/startup.h"
#include "storage/aio.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "utils/guc.h"
#include "utils/guc_hooks.h"
#include "utils/resowner.h"
#include "utils/varlena.h"
Include dependency graph for fd.c:

Go to the source code of this file.

Data Structures

struct  vfd
 
struct  AllocateDesc
 

Macros

#define NUM_RESERVED_FDS   10
 
#define FD_MINFREE   48
 
#define DO_DB(A)    ((void) 0)
 
#define VFD_CLOSED   (-1)
 
#define FileIsValid(file)    ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
 
#define FileIsNotOpen(file)   (VfdCache[file].fd == VFD_CLOSED)
 
#define FD_DELETE_AT_CLOSE   (1 << 0) /* T = delete when closed */
 
#define FD_CLOSE_AT_EOXACT   (1 << 1) /* T = close at eoXact */
 
#define FD_TEMP_FILE_LIMIT   (1 << 2) /* T = respect temp_file_limit */
 

Typedefs

typedef struct vfd Vfd
 

Enumerations

enum  AllocateDescKind { AllocateDescFile , AllocateDescPipe , AllocateDescDir , AllocateDescRawFD }
 

Functions

static void Delete (File file)
 
static void LruDelete (File file)
 
static void Insert (File file)
 
static int LruInsert (File file)
 
static bool ReleaseLruFile (void)
 
static void ReleaseLruFiles (void)
 
static File AllocateVfd (void)
 
static void FreeVfd (File file)
 
static int FileAccess (File file)
 
static File OpenTemporaryFileInTablespace (Oid tblspcOid, bool rejectError)
 
static bool reserveAllocatedDesc (void)
 
static int FreeDesc (AllocateDesc *desc)
 
static void BeforeShmemExit_Files (int code, Datum arg)
 
static void CleanupTempFiles (bool isCommit, bool isProcExit)
 
static void RemovePgTempRelationFiles (const char *tsdirname)
 
static void RemovePgTempRelationFilesInDbspace (const char *dbspacedirname)
 
static void walkdir (const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
 
static void datadir_fsync_fname (const char *fname, bool isdir, int elevel)
 
static void unlink_if_exists_fname (const char *fname, bool isdir, int elevel)
 
static int fsync_parent_path (const char *fname, int elevel)
 
static void ResOwnerReleaseFile (Datum res)
 
static char * ResOwnerPrintFile (Datum res)
 
static void ResourceOwnerRememberFile (ResourceOwner owner, File file)
 
static void ResourceOwnerForgetFile (ResourceOwner owner, File file)
 
int pg_fsync (int fd)
 
int pg_fsync_no_writethrough (int fd)
 
int pg_fsync_writethrough (int fd)
 
int pg_fdatasync (int fd)
 
bool pg_file_exists (const char *name)
 
void pg_flush_data (int fd, pgoff_t offset, pgoff_t nbytes)
 
static int pg_ftruncate (int fd, pgoff_t length)
 
int pg_truncate (const char *path, pgoff_t length)
 
void fsync_fname (const char *fname, bool isdir)
 
int durable_rename (const char *oldfile, const char *newfile, int elevel)
 
int durable_unlink (const char *fname, int elevel)
 
void InitFileAccess (void)
 
void InitTemporaryFileAccess (void)
 
static void count_usable_fds (int max_to_probe, int *usable_fds, int *already_open)
 
void set_max_safe_fds (void)
 
int BasicOpenFile (const char *fileName, int fileFlags)
 
int BasicOpenFilePerm (const char *fileName, int fileFlags, mode_t fileMode)
 
bool AcquireExternalFD (void)
 
void ReserveExternalFD (void)
 
void ReleaseExternalFD (void)
 
static void ReportTemporaryFileUsage (const char *path, pgoff_t size)
 
static void RegisterTemporaryFile (File file)
 
File PathNameOpenFile (const char *fileName, int fileFlags)
 
File PathNameOpenFilePerm (const char *fileName, int fileFlags, mode_t fileMode)
 
void PathNameCreateTemporaryDir (const char *basedir, const char *directory)
 
void PathNameDeleteTemporaryDir (const char *dirname)
 
File OpenTemporaryFile (bool interXact)
 
void TempTablespacePath (char *path, Oid tablespace)
 
File PathNameCreateTemporaryFile (const char *path, bool error_on_failure)
 
File PathNameOpenTemporaryFile (const char *path, int mode)
 
bool PathNameDeleteTemporaryFile (const char *path, bool error_on_failure)
 
void FileClose (File file)
 
int FilePrefetch (File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
 
void FileWriteback (File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
 
ssize_t FileReadV (File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
 
int FileStartReadV (PgAioHandle *ioh, File file, int iovcnt, pgoff_t offset, uint32 wait_event_info)
 
ssize_t FileWriteV (File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
 
int FileSync (File file, uint32 wait_event_info)
 
int FileZero (File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
 
int FileFallocate (File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
 
pgoff_t FileSize (File file)
 
int FileTruncate (File file, pgoff_t offset, uint32 wait_event_info)
 
char * FilePathName (File file)
 
int FileGetRawDesc (File file)
 
int FileGetRawFlags (File file)
 
mode_t FileGetRawMode (File file)
 
FILE * AllocateFile (const char *name, const char *mode)
 
int OpenTransientFile (const char *fileName, int fileFlags)
 
int OpenTransientFilePerm (const char *fileName, int fileFlags, mode_t fileMode)
 
FILE * OpenPipeStream (const char *command, const char *mode)
 
int FreeFile (FILE *file)
 
int CloseTransientFile (int fd)
 
DIR * AllocateDir (const char *dirname)
 
struct dirent * ReadDir (DIR *dir, const char *dirname)
 
struct dirent * ReadDirExtended (DIR *dir, const char *dirname, int elevel)
 
int FreeDir (DIR *dir)
 
int ClosePipeStream (FILE *file)
 
void closeAllVfds (void)
 
void SetTempTablespaces (Oid *tableSpaces, int numSpaces)
 
bool TempTablespacesAreSet (void)
 
int GetTempTablespaces (Oid *tableSpaces, int numSpaces)
 
Oid GetNextTempTableSpace (void)
 
void AtEOSubXact_Files (bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
 
void AtEOXact_Files (bool isCommit)
 
void RemovePgTempFiles (void)
 
void RemovePgTempFilesInDir (const char *tmpdirname, bool missing_ok, bool unlink_all)
 
bool looks_like_temp_rel_name (const char *name)
 
void SyncDataDirectory (void)
 
int fsync_fname_ext (const char *fname, bool isdir, bool ignore_perm, int elevel)
 
int MakePGDirectory (const char *directoryName)
 
int data_sync_elevel (int elevel)
 
bool check_debug_io_direct (char **newval, void **extra, GucSource source)
 
void assign_debug_io_direct (const char *newval, void *extra)
 

Variables

int max_files_per_process = 1000
 
int max_safe_fds = FD_MINFREE
 
bool data_sync_retry = false
 
int recovery_init_sync_method = DATA_DIR_SYNC_METHOD_FSYNC
 
int io_direct_flags
 
static Vfd * VfdCache
 
static Size SizeVfdCache = 0
 
static int nfile = 0
 
static bool have_xact_temporary_files = false
 
static uint64 temporary_files_size = 0
 
static int numAllocatedDescs = 0
 
static int maxAllocatedDescs = 0
 
static AllocateDesc * allocatedDescs = NULL
 
static int numExternalFDs = 0
 
static long tempFileCounter = 0
 
static Oid * tempTableSpaces = NULL
 
static int numTempTableSpaces = -1
 
static int nextTempTableSpace = 0
 
static const ResourceOwnerDesc file_resowner_desc
 

Macro Definition Documentation

◆ DO_DB

#define DO_DB (   A)     ((void) 0)

Definition at line 180 of file fd.c.

197{
198 int fd; /* current FD, or VFD_CLOSED if none */
199 unsigned short fdstate; /* bitflags for VFD's state */
200 ResourceOwner resowner; /* owner, for automatic cleanup */
201 File nextFree; /* link to next free VFD, if in freelist */
202 File lruMoreRecently; /* doubly linked recency-of-use list */
203 File lruLessRecently;
204 pgoff_t fileSize; /* current size of file (0 if not temporary) */
205 char *fileName; /* name of file, or NULL for unused VFD */
206 /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
207 int fileFlags; /* open(2) flags for (re)opening the file */
208 mode_t fileMode; /* mode to pass to open(2) */
209} Vfd;
210
211/*
212 * Virtual File Descriptor array pointer and size. This grows as
213 * needed. 'File' values are indexes into this array.
214 * Note that VfdCache[0] is not a usable VFD, just a list header.
215 */
216static Vfd *VfdCache;
217static Size SizeVfdCache = 0;
218
219/*
220 * Number of file descriptors known to be in use by VFD entries.
221 */
222static int nfile = 0;
223
224/*
225 * Flag to tell whether it's worth scanning VfdCache looking for temp files
226 * to close
227 */
228static bool have_xact_temporary_files = false;
229
230/*
231 * Tracks the total size of all temporary files. Note: when temp_file_limit
232 * is being enforced, this cannot overflow since the limit cannot be more
233 * than INT_MAX kilobytes. When not enforcing, it could theoretically
234 * overflow, but we don't care.
235 */
237
238/* Temporary file access initialized and not yet shut down? */
239#ifdef USE_ASSERT_CHECKING
240static bool temporary_files_allowed = false;
241#endif
242
243/*
244 * List of OS handles opened with AllocateFile, AllocateDir and
245 * OpenTransientFile.
246 */
247typedef enum
248{
254
255typedef struct
256{
257 AllocateDescKind kind;
258 SubTransactionId create_subid;
259 union
260 {
261 FILE *file;
262 DIR *dir;
263 int fd;
264 } desc;
266
267static int numAllocatedDescs = 0;
268static int maxAllocatedDescs = 0;
270
271/*
272 * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
273 */
274static int numExternalFDs = 0;
275
276/*
277 * Number of temporary files opened during the current session;
278 * this is used in generation of tempfile names.
279 */
280static long tempFileCounter = 0;
281
282/*
283 * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
284 * indicating that the current database's default tablespace should be used.)
285 * When numTempTableSpaces is -1, this has not been set in the current
286 * transaction.
287 */
288static Oid *tempTableSpaces = NULL;
289static int numTempTableSpaces = -1;
290static int nextTempTableSpace = 0;
291
292
293/*--------------------
294 *
295 * Private Routines
296 *
297 * Delete - delete a file from the Lru ring
298 * LruDelete - remove a file from the Lru ring and close its FD
299 * Insert - put a file at the front of the Lru ring
300 * LruInsert - put a file at the front of the Lru ring and open it
301 * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
302 * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
303 * AllocateVfd - grab a free (or new) file record (from VfdCache)
304 * FreeVfd - free a file record
305 *
306 * The Least Recently Used ring is a doubly linked list that begins and
307 * ends on element zero. Element zero is special -- it doesn't represent
308 * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
309 * anchor that shows us the beginning/end of the ring.
310 * Only VFD elements that are currently really open (have an FD assigned) are
311 * in the Lru ring. Elements that are "virtually" open can be recognized
312 * by having a non-null fileName field.
313 *
314 * example:
315 *
316 * /--less----\ /---------\
317 * v \ v \
318 * #0 --more---> LeastRecentlyUsed --more-\ \
319 * ^\ | |
320 * \\less--> MostRecentlyUsedFile <---/ |
321 * \more---/ \--less--/
322 *
323 *--------------------
324 */
325static void Delete(File file);
326static void LruDelete(File file);
327static void Insert(File file);
328static int LruInsert(File file);
329static bool ReleaseLruFile(void);
330static void ReleaseLruFiles(void);
331static File AllocateVfd(void);
332static void FreeVfd(File file);
333
334static int FileAccess(File file);
336static bool reserveAllocatedDesc(void);
337static int FreeDesc(AllocateDesc *desc);
338
339static void BeforeShmemExit_Files(int code, Datum arg);
340static void CleanupTempFiles(bool isCommit, bool isProcExit);
341static void RemovePgTempRelationFiles(const char *tsdirname);
343
344static void walkdir(const char *path,
345 void (*action) (const char *fname, bool isdir, int elevel),
346 bool process_symlinks,
347 int elevel);
348#ifdef PG_FLUSH_DATA_WORKS
349static void pre_sync_fname(const char *fname, bool isdir, int elevel);
350#endif
351static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
352static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
353
354static int fsync_parent_path(const char *fname, int elevel);
355
356
357/* ResourceOwner callbacks to hold virtual file descriptors */
358static void ResOwnerReleaseFile(Datum res);
359static char *ResOwnerPrintFile(Datum res);
360
362{
363 .name = "File",
364 .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
365 .release_priority = RELEASE_PRIO_FILES,
366 .ReleaseResource = ResOwnerReleaseFile,
367 .DebugPrint = ResOwnerPrintFile
368};
369
370/* Convenience wrappers over ResourceOwnerRemember/Forget */
371static inline void
373{
375}
376static inline void
378{
380}
381
382/*
383 * pg_fsync --- do fsync with or without writethrough
384 */
385int
386pg_fsync(int fd)
387{
388#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
389 struct stat st;
390
391 /*
392 * Some operating system implementations of fsync() have requirements
393 * about the file access modes that were used when their file descriptor
394 * argument was opened, and these requirements differ depending on whether
395 * the file descriptor is for a directory.
396 *
397 * For any file descriptor that may eventually be handed to fsync(), we
398 * should have opened it with access modes that are compatible with
399 * fsync() on all supported systems, otherwise the code may not be
400 * portable, even if it runs ok on the current system.
401 *
402 * We assert here that a descriptor for a file was opened with write
403 * permissions (i.e., not O_RDONLY) and for a directory without write
404 * permissions (O_RDONLY). Notice that the assertion check is made even
405 * if fsync() is disabled.
406 *
407 * If fstat() fails, ignore it and let the follow-up fsync() complain.
408 */
409 if (fstat(fd, &st) == 0)
410 {
411 int desc_flags = fcntl(fd, F_GETFL);
412
414
415 if (S_ISDIR(st.st_mode))
417 else
419 }
420 errno = 0;
421#endif
422
423 /* #if is to skip the wal_sync_method test if there's no need for it */
424#if defined(HAVE_FSYNC_WRITETHROUGH)
427 else
428#endif
430}
431
432
433/*
434 * pg_fsync_no_writethrough --- same as fsync except does nothing if
435 * enableFsync is off
436 */
437int
439{
440 int rc;
441
442 if (!enableFsync)
443 return 0;
444
445retry:
446 rc = fsync(fd);
447
448 if (rc == -1 && errno == EINTR)
449 goto retry;
450
451 return rc;
452}
453
454/*
455 * pg_fsync_writethrough
456 */
457int
459{
460 if (enableFsync)
461 {
462#if defined(F_FULLFSYNC)
463 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
464#else
465 errno = ENOSYS;
466 return -1;
467#endif
468 }
469 else
470 return 0;
471}
472
473/*
474 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
475 */
476int
477pg_fdatasync(int fd)
478{
479 int rc;
480
481 if (!enableFsync)
482 return 0;
483
484retry:
485 rc = fdatasync(fd);
486
487 if (rc == -1 && errno == EINTR)
488 goto retry;
489
490 return rc;
491}
492
493/*
494 * pg_file_exists -- check that a file exists.
495 *
496 * This requires an absolute path to the file. Returns true if the file is
497 * not a directory, false otherwise.
498 */
499bool
500pg_file_exists(const char *name)
501{
502 struct stat st;
503
504 Assert(name != NULL);
505
506 if (stat(name, &st) == 0)
507 return !S_ISDIR(st.st_mode);
508 else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES))
511 errmsg("could not access file \"%s\": %m", name)));
512
513 return false;
514}
515
516/*
517 * pg_flush_data --- advise OS that the described dirty data should be flushed
518 *
519 * offset of 0 with nbytes 0 means that the entire file should be flushed
 *
 * fd is the open descriptor; offset and nbytes describe the byte range.
 * Nothing is returned.  The call is a no-op when enableFsync is off.
 *
 * Each method the platform provides is compiled in and they are tried in
 * this order: sync_file_range(), then mmap() + msync(), then
 * posix_fadvise().  The sync_file_range() and posix_fadvise() sections
 * always return once entered; the msync() section returns unless mmap()
 * failed or the request was too large to map, in which case control falls
 * through to the next section.
520 */
521void
522pg_flush_data(int fd, pgoff_t offset, pgoff_t nbytes)
523{
524 /*
525 * Right now file flushing is primarily used to make later
526 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
527 * if fsyncs are disabled - that's a decision we might want to make
528 * configurable at some point.
529 */
530 if (!enableFsync)
531 return;
532
533 /*
534 * We compile all alternatives that are supported on the current platform,
535 * to find portability problems more easily.
536 */
537#if defined(HAVE_SYNC_FILE_RANGE)
538 {
539 int rc;
540 static bool not_implemented_by_kernel = false;
541
543 return;
544
545retry:
546
547 /*
548 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
549 * tells the OS that writeback for the specified blocks should be
550 * started, but that we don't want to wait for completion. Note that
551 * this call might block if too much dirty data exists in the range.
552 * This is the preferable method on OSs supporting it, as it works
553 * reliably when available (contrast to msync()) and doesn't flush out
554 * clean data (like FADV_DONTNEED).
555 */
556 rc = sync_file_range(fd, offset, nbytes,
558 if (rc != 0)
559 {
560 int elevel;
561
 /*
  * NOTE(review): rc is the return value of sync_file_range(), whereas
  * the ENOSYS test just below reads errno.  If the call reports
  * failure as -1 with errno set, rc can never equal EINTR and this
  * retry is dead code; it probably wants "errno == EINTR" -- confirm
  * against the sync_file_range() documentation.
  */
562 if (rc == EINTR)
563 goto retry;
564
565 /*
566 * For systems that don't have an implementation of
567 * sync_file_range() such as Windows WSL, generate only one
568 * warning and then suppress all further attempts by this process.
569 */
570 if (errno == ENOSYS)
571 {
572 elevel = WARNING;
574 }
575 else
576 elevel = data_sync_elevel(WARNING);
577
578 ereport(elevel,
580 errmsg("could not flush dirty data: %m")));
581 }
582
583 return;
584 }
585#endif
586#if !defined(WIN32) && defined(MS_ASYNC)
587 {
588 void *p;
589 static int pagesize = 0;
590
591 /*
592 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
593 * writeback. On linux it only does so if MS_SYNC is specified, but
594 * then it does the writeback synchronously. Luckily all common linux
595 * systems have sync_file_range(). This is preferable over
596 * FADV_DONTNEED because it doesn't flush out clean data.
597 *
598 * We map the file (mmap()), tell the kernel to sync back the contents
599 * (msync()), and then remove the mapping again (munmap()).
600 */
601
602 /* mmap() needs actual length if we want to map whole file */
603 if (offset == 0 && nbytes == 0)
604 {
605 nbytes = lseek(fd, 0, SEEK_END);
606 if (nbytes < 0)
607 {
610 errmsg("could not determine dirty data size: %m")));
611 return;
612 }
613 }
614
615 /*
616 * Some platforms reject partial-page mmap() attempts. To deal with
617 * that, just truncate the request to a page boundary. If any extra
618 * bytes don't get flushed, well, it's only a hint anyway.
619 */
620
621 /* fetch pagesize only once */
622 if (pagesize == 0)
624
625 /* align length to pagesize, dropping any fractional page */
626 if (pagesize > 0)
627 nbytes = (nbytes / pagesize) * pagesize;
628
629 /* fractional-page request is a no-op */
630 if (nbytes <= 0)
631 return;
632
633 /*
634 * mmap could well fail, particularly on 32-bit platforms where there
635 * may simply not be enough address space. If so, silently fall
636 * through to the next implementation.
637 */
 /* requests larger than SSIZE_MAX are treated like a failed mmap() */
638 if (nbytes <= (pgoff_t) SSIZE_MAX)
639 p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
640 else
641 p = MAP_FAILED;
642
643 if (p != MAP_FAILED)
644 {
645 int rc;
646
647 rc = msync(p, (size_t) nbytes, MS_ASYNC);
648 if (rc != 0)
649 {
652 errmsg("could not flush dirty data: %m")));
653 /* NB: need to fall through to munmap()! */
654 }
655
656 rc = munmap(p, (size_t) nbytes);
657 if (rc != 0)
658 {
659 /* FATAL error because mapping would remain */
662 errmsg("could not munmap() while flushing data: %m")));
663 }
664
665 return;
666 }
667 }
668#endif
669#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
670 {
671 int rc;
672
673 /*
674 * Signal the kernel that the passed in range should not be cached
675 * anymore. This has the, desired, side effect of writing out dirty
676 * data, and the, undesired, side effect of likely discarding useful
677 * clean cached blocks. For the latter reason this is the least
678 * preferable method.
679 */
680
681 rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
682
 /*
  * NOTE(review): posix_fadvise() is specified to return the error
  * number itself rather than set errno, so the %m in the message
  * below may not describe this failure -- confirm.
  */
683 if (rc != 0)
684 {
685 /* don't error out, this is just a performance optimization */
688 errmsg("could not flush dirty data: %m")));
689 }
690
691 return;
692 }
693#endif
694}
695
696/*
697 * Truncate an open file to a given length.
698 */
699static int
700pg_ftruncate(int fd, pgoff_t length)
701{
702 int ret;
703
704retry:
705 ret = ftruncate(fd, length);
706
707 if (ret == -1 && errno == EINTR)
708 goto retry;
709
710 return ret;
711}
712
713/*
714 * Truncate a file to a given length by name.
715 */
716int
717pg_truncate(const char *path, pgoff_t length)
718{
719 int ret;
720#ifdef WIN32
721 int save_errno;
722 int fd;
723
725 if (fd >= 0)
726 {
727 ret = pg_ftruncate(fd, length);
731 }
732 else
733 ret = -1;
734#else
735
736retry:
737 ret = truncate(path, length);
738
739 if (ret == -1 && errno == EINTR)
740 goto retry;
741#endif
742
743 return ret;
744}
745
746/*
747 * fsync_fname -- fsync a file or directory, handling errors properly
748 *
749 * Try to fsync a file or directory. When doing the latter, ignore errors that
750 * indicate the OS just doesn't allow/require fsyncing directories.
751 */
752void
753fsync_fname(const char *fname, bool isdir)
754{
756}
757
758/*
759 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
760 *
761 * This routine ensures that, after returning, the effect of renaming file
762 * persists in case of a crash. A crash while this routine is running will
763 * leave you with either the pre-existing or the moved file in place of the
764 * new file; no mixed state or truncated files are possible.
765 *
766 * It does so by using fsync on the old filename and the possibly existing
767 * target filename before the rename, and the target file and directory after.
768 *
769 * Note that rename() cannot be used across arbitrary directories, as they
770 * might not be on the same filesystem. Therefore this routine does not
771 * support renaming across directories.
772 *
773 * Log errors with the caller specified severity.
774 *
775 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
776 * valid upon return.
777 */
778int
779durable_rename(const char *oldfile, const char *newfile, int elevel)
780{
781 int fd;
782
783 /*
784 * First fsync the old and target path (if it exists), to ensure that they
785 * are properly persistent on disk. Syncing the target file is not
786 * strictly necessary, but it makes it easier to reason about crashes;
787 * because it's then guaranteed that either source or target file exists
788 * after a crash.
789 */
790 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
791 return -1;
792
794 if (fd < 0)
795 {
796 if (errno != ENOENT)
797 {
798 ereport(elevel,
800 errmsg("could not open file \"%s\": %m", newfile)));
801 return -1;
802 }
803 }
804 else
805 {
806 if (pg_fsync(fd) != 0)
807 {
808 int save_errno;
809
810 /* close file upon error, might not be in transaction context */
814
815 ereport(elevel,
817 errmsg("could not fsync file \"%s\": %m", newfile)));
818 return -1;
819 }
820
821 if (CloseTransientFile(fd) != 0)
822 {
823 ereport(elevel,
825 errmsg("could not close file \"%s\": %m", newfile)));
826 return -1;
827 }
828 }
829
830 /* Time to do the real deal... */
831 if (rename(oldfile, newfile) < 0)
832 {
833 ereport(elevel,
835 errmsg("could not rename file \"%s\" to \"%s\": %m",
836 oldfile, newfile)));
837 return -1;
838 }
839
840 /*
841 * To guarantee renaming the file is persistent, fsync the file with its
842 * new name, and its containing directory.
843 */
844 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
845 return -1;
846
847 if (fsync_parent_path(newfile, elevel) != 0)
848 return -1;
849
850 return 0;
851}
852
853/*
854 * durable_unlink -- remove a file in a durable manner
855 *
856 * This routine ensures that, after returning, the effect of removing file
857 * persists in case of a crash. A crash while this routine is running will
858 * leave the system in no mixed state.
859 *
860 * It does so by using fsync on the parent directory of the file after the
861 * actual removal is done.
862 *
863 * Log errors with the severity specified by caller.
864 *
865 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
866 * valid upon return.
867 */
868int
869durable_unlink(const char *fname, int elevel)
870{
871 if (unlink(fname) < 0)
872 {
873 ereport(elevel,
875 errmsg("could not remove file \"%s\": %m",
876 fname)));
877 return -1;
878 }
879
880 /*
881 * To guarantee that the removal of the file is persistent, fsync its
882 * parent directory.
883 */
884 if (fsync_parent_path(fname, elevel) != 0)
885 return -1;
886
887 return 0;
888}
889
890/*
891 * InitFileAccess --- initialize this module during backend startup
892 *
893 * This is called during either normal or standalone backend start.
894 * It is *not* called in the postmaster.
895 *
896 * Note that this does not initialize temporary file access, that is
897 * separately initialized via InitTemporaryFileAccess().
898 */
899void
900InitFileAccess(void)
901{
902 Assert(SizeVfdCache == 0); /* call me only once */
903
904 /* initialize cache header entry */
905 VfdCache = (Vfd *) malloc(sizeof(Vfd));
906 if (VfdCache == NULL)
909 errmsg("out of memory")));
910
911 MemSet(&(VfdCache[0]), 0, sizeof(Vfd));
913
914 SizeVfdCache = 1;
915}
916
917/*
918 * InitTemporaryFileAccess --- initialize temporary file access during startup
919 *
920 * This is called during either normal or standalone backend start.
921 * It is *not* called in the postmaster.
922 *
923 * This is separate from InitFileAccess() because temporary file cleanup can
924 * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
925 * our reporting has to happen before that. Low level file access should be
926 * available for longer, hence the separate initialization / shutdown of
927 * temporary file handling.
928 */
929void
931{
932 Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
933 Assert(!temporary_files_allowed); /* call me only once */
934
935 /*
936 * Register before-shmem-exit hook to ensure temp files are dropped while
937 * we can still report stats.
938 */
940
941#ifdef USE_ASSERT_CHECKING
943#endif
944}
945
946/*
947 * count_usable_fds --- count how many FDs the system will let us open,
948 * and estimate how many are already open.
949 *
950 * We stop counting if usable_fds reaches max_to_probe. Note: a small
951 * value of max_to_probe might result in an underestimate of already_open;
952 * we must fill in any "gaps" in the set of used FDs before the calculation
953 * of already_open will give the right answer. In practice, max_to_probe
954 * of a couple of dozen should be enough to ensure good results.
955 *
956 * We assume stderr (FD 2) is available for dup'ing. While the calling
957 * script could theoretically close that, it would be a really bad idea,
958 * since then one risks loss of error messages from, e.g., libc.
959 */
960static void
962{
963 int *fd;
964 int size;
965 int used = 0;
966 int highestfd = 0;
967 int j;
968
969#ifdef HAVE_GETRLIMIT
970 struct rlimit rlim;
972#endif
973
974 size = 1024;
975 fd = (int *) palloc(size * sizeof(int));
976
977#ifdef HAVE_GETRLIMIT
979 if (getrlimit_status != 0)
980 ereport(WARNING, (errmsg("getrlimit failed: %m")));
981#endif /* HAVE_GETRLIMIT */
982
983 /* dup until failure or probe limit reached */
984 for (;;)
985 {
986 int thisfd;
987
988#ifdef HAVE_GETRLIMIT
989
990 /*
991 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
992 * some platforms
993 */
994 if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
995 break;
996#endif
997
998 thisfd = dup(2);
999 if (thisfd < 0)
1000 {
1001 /* Expect EMFILE or ENFILE, else it's fishy */
1002 if (errno != EMFILE && errno != ENFILE)
1003 elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
1004 break;
1005 }
1006
1007 if (used >= size)
1008 {
1009 size *= 2;
1010 fd = (int *) repalloc(fd, size * sizeof(int));
1011 }
1012 fd[used++] = thisfd;
1013
1014 if (highestfd < thisfd)
1015 highestfd = thisfd;
1016
1017 if (used >= max_to_probe)
1018 break;
1019 }
1020
1021 /* release the files we opened */
1022 for (j = 0; j < used; j++)
1023 close(fd[j]);
1024
1025 pfree(fd);
1026
1027 /*
1028 * Return results. usable_fds is just the number of successful dups. We
1029 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
1030 * number) and so already_open is highestfd+1 - usable_fds.
1031 */
1032 *usable_fds = used;
1033 *already_open = highestfd + 1 - used;
1034}
1035
1036/*
1037 * set_max_safe_fds
1038 * Determine number of file descriptors that fd.c is allowed to use
1039 */
1040void
1041set_max_safe_fds(void)
1042{
1043 int usable_fds;
1044 int already_open;
1045
1046 /*----------
1047 * We want to set max_safe_fds to
1048 * MIN(usable_fds, max_files_per_process)
1049 * less the slop factor for files that are opened without consulting
1050 * fd.c. This ensures that we won't allow to open more than
1051 * max_files_per_process, or the experimentally-determined EMFILE limit,
1052 * additional files.
1053 *----------
1054 */
1057
1059
1060 /*
1061 * Take off the FDs reserved for system() etc.
1062 */
1064
1065 /*
1066 * Make sure we still have enough to get by.
1067 */
1069 ereport(FATAL,
1071 errmsg("insufficient file descriptors available to start server process"),
1072 errdetail("System allows %d, server needs at least %d, %d files are already open.",
1075 already_open)));
1076
1077 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1079}
1080
1081/*
1082 * Open a file with BasicOpenFilePerm() and pass default file mode for the
1083 * fileMode parameter.
1084 */
1085int
1086BasicOpenFile(const char *fileName, int fileFlags)
1087{
1088 return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1089}
1090
1091/*
1092 * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1093 *
1094 * This is exported for use by places that really want a plain kernel FD,
1095 * but need to be proof against running out of FDs. Once an FD has been
1096 * successfully returned, it is the caller's responsibility to ensure that
1097 * it will not be leaked on ereport()! Most users should *not* call this
1098 * routine directly, but instead use the VFD abstraction level, which
1099 * provides protection against descriptor leaks as well as management of
1100 * files that need to be open for more than a short period of time.
1101 *
1102 * Ideally this should be the *only* direct call of open() in the backend.
1103 * In practice, the postmaster calls open() directly, and there are some
1104 * direct open() calls done early in backend startup. Those are OK since
1105 * this module wouldn't have any open files to close at that point anyway.
1106 */
1107int
1108BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1109{
1110 int fd;
1111
1112tryAgain:
1113#ifdef PG_O_DIRECT_USE_F_NOCACHE
1114 fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1115#else
1116 fd = open(fileName, fileFlags, fileMode);
1117#endif
1118
1119 if (fd >= 0)
1120 {
1121#ifdef PG_O_DIRECT_USE_F_NOCACHE
1122 if (fileFlags & PG_O_DIRECT)
1123 {
1124 if (fcntl(fd, F_NOCACHE, 1) < 0)
1125 {
1126 int save_errno = errno;
1127
1128 close(fd);
1129 errno = save_errno;
1130 return -1;
1131 }
1132 }
1133#endif
1134
1135 return fd; /* success! */
1136 }
1137
1138 if (errno == EMFILE || errno == ENFILE)
1139 {
1140 int save_errno = errno;
1141
1142 ereport(LOG,
1144 errmsg("out of file descriptors: %m; release and retry")));
1145 errno = 0;
1146 if (ReleaseLruFile())
1147 goto tryAgain;
1148 errno = save_errno;
1149 }
1150
1151 return -1; /* failure */
1152}
1153
1154/*
1155 * AcquireExternalFD - attempt to reserve an external file descriptor
1156 *
1157 * This should be used by callers that need to hold a file descriptor open
1158 * over more than a short interval, but cannot use any of the other facilities
1159 * provided by this module.
1160 *
1161 * The difference between this and the underlying ReserveExternalFD function
1162 * is that this will report failure (by setting errno and returning false)
1163 * if "too many" external FDs are already reserved. This should be used in
1164 * any code where the total number of FDs to be reserved is not predictable
1165 * and small.
1166 */
1167bool
1169{
1170 /*
1171 * We don't want more than max_safe_fds / 3 FDs to be consumed for
1172 * "external" FDs.
1173 */
1174 if (numExternalFDs < max_safe_fds / 3)
1175 {
1177 return true;
1178 }
1179 errno = EMFILE;
1180 return false;
1181}
1182
1183/*
1184 * ReserveExternalFD - report external consumption of a file descriptor
1185 *
1186 * This should be used by callers that need to hold a file descriptor open
1187 * over more than a short interval, but cannot use any of the other facilities
1188 * provided by this module. This just tracks the use of the FD and closes
1189 * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1190 *
1191 * Call this directly only in code where failure to reserve the FD would be
1192 * fatal; for example, the WAL-writing code does so, since the alternative is
1193 * session failure. Also, it's very unwise to do so in code that could
1194 * consume more than one FD per process.
1195 *
1196 * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1197 * available, it doesn't matter too much whether this is called before or
1198 * after actually opening the FD; but doing so beforehand reduces the risk of
1199 * an EMFILE failure if not everybody played nice. In any case, it's solely
1200 * caller's responsibility to keep the external-FD count in sync with reality.
1201 */
1202void
1204{
1205 /*
1206 * Release VFDs if needed to stay safe. Because we do this before
1207 * incrementing numExternalFDs, the final state will be as desired, i.e.,
1208 * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1209 */
1211
1213}
1214
1215/*
1216 * ReleaseExternalFD - report release of an external file descriptor
1217 *
1218 * This is guaranteed not to change errno, so it can be used in failure paths.
1219 */
1220void
1222{
1225}
1226
1227
#if defined(FDDEBUG)

/*
 * Debugging aid: emit one LOG line listing VFD indexes of the LRU ring.
 *
 * The walk starts at VfdCache[0].lruLessRecently and follows the
 * lruLessRecently links until it arrives back at entry 0 (whose index is
 * printed as the last element).
 */
static void
_dump_lru(void)
{
	char		buf[2048];
	size_t		used;
	int			cur = VfdCache[0].lruLessRecently;

	snprintf(buf, sizeof(buf), "LRU: MOST %d ", cur);

	while (cur != 0)
	{
		cur = VfdCache[cur].lruLessRecently;
		used = strlen(buf);
		snprintf(buf + used, sizeof(buf) - used, "%d ", cur);
	}

	used = strlen(buf);
	snprintf(buf + used, sizeof(buf) - used, "LEAST");

	elog(LOG, "%s", buf);
}
#endif							/* FDDEBUG */
1248
1249static void
1250Delete(File file)
1251{
1252 Vfd *vfdP;
1253
1254 Assert(file != 0);
1255
1256 DO_DB(elog(LOG, "Delete %d (%s)",
1257 file, VfdCache[file].fileName));
1258 DO_DB(_dump_lru());
1259
1260 vfdP = &VfdCache[file];
1261
1262 VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1263 VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1264
1265 DO_DB(_dump_lru());
1266}
1267
1268static void
1269LruDelete(File file)
1270{
1271 Vfd *vfdP;
1272
1273 Assert(file != 0);
1274
1275 DO_DB(elog(LOG, "LruDelete %d (%s)",
1276 file, VfdCache[file].fileName));
1277
1278 vfdP = &VfdCache[file];
1279
1281
1282 /*
1283 * Close the file. We aren't expecting this to fail; if it does, better
1284 * to leak the FD than to mess up our internal state.
1285 */
1286 if (close(vfdP->fd) != 0)
1288 "could not close file \"%s\": %m", vfdP->fileName);
1289 vfdP->fd = VFD_CLOSED;
1290 --nfile;
1291
1292 /* delete the vfd record from the LRU ring */
1293 Delete(file);
1294}
1295
1296static void
1297Insert(File file)
1298{
1299 Vfd *vfdP;
1300
1301 Assert(file != 0);
1302
1303 DO_DB(elog(LOG, "Insert %d (%s)",
1304 file, VfdCache[file].fileName));
1305 DO_DB(_dump_lru());
1306
1307 vfdP = &VfdCache[file];
1308
1309 vfdP->lruMoreRecently = 0;
1310 vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1311 VfdCache[0].lruLessRecently = file;
1312 VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1313
1314 DO_DB(_dump_lru());
1315}
1316
1317/* returns 0 on success, -1 on re-open failure (with errno set) */
1318static int
1319LruInsert(File file)
1320{
1321 Vfd *vfdP;
1322
1323 Assert(file != 0);
1324
1325 DO_DB(elog(LOG, "LruInsert %d (%s)",
1326 file, VfdCache[file].fileName));
1327
1328 vfdP = &VfdCache[file];
1329
1330 if (FileIsNotOpen(file))
1331 {
1332 /* Close excess kernel FDs. */
1334
1335 /*
1336 * The open could still fail for lack of file descriptors, eg due to
1337 * overall system file table being full. So, be prepared to release
1338 * another FD if necessary...
1339 */
1340 vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1341 vfdP->fileMode);
1342 if (vfdP->fd < 0)
1343 {
1344 DO_DB(elog(LOG, "re-open failed: %m"));
1345 return -1;
1346 }
1347 else
1348 {
1349 ++nfile;
1350 }
1351 }
1352
1353 /*
1354 * put it at the head of the Lru ring
1355 */
1356
1357 Insert(file);
1358
1359 return 0;
1360}
1361
1362/*
1363 * Release one kernel FD by closing the least-recently-used VFD.
1364 */
1365static bool
1366ReleaseLruFile(void)
1367{
1368 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1369
1370 if (nfile > 0)
1371 {
1372 /*
1373 * There are opened files and so there should be at least one used vfd
1374 * in the ring.
1375 */
1376 Assert(VfdCache[0].lruMoreRecently != 0);
1377 LruDelete(VfdCache[0].lruMoreRecently);
1378 return true; /* freed a file */
1379 }
1380 return false; /* no files available to free */
1381}
1382
1383/*
1384 * Release kernel FDs as needed to get under the max_safe_fds limit.
1385 * After calling this, it's OK to try to open another file.
1386 */
1387static void
1388ReleaseLruFiles(void)
1389{
1391 {
1392 if (!ReleaseLruFile())
1393 break;
1394 }
1395}
1396
1397static File
1398AllocateVfd(void)
1399{
1400 Index i;
1401 File file;
1402
1403 DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1404
1405 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1406
1407 if (VfdCache[0].nextFree == 0)
1408 {
1409 /*
1410 * The free list is empty so it is time to increase the size of the
1411 * array. We choose to double it each time this happens. However,
1412 * there's not much point in starting *real* small.
1413 */
1416
1417 if (newCacheSize < 32)
1418 newCacheSize = 32;
1419
1420 /*
1421 * Be careful not to clobber VfdCache ptr if realloc fails.
1422 */
1423 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1424 if (newVfdCache == NULL)
1425 ereport(ERROR,
1427 errmsg("out of memory")));
1429
1430 /*
1431 * Initialize the new entries and link them into the free list.
1432 */
1433 for (i = SizeVfdCache; i < newCacheSize; i++)
1434 {
1435 MemSet(&(VfdCache[i]), 0, sizeof(Vfd));
1436 VfdCache[i].nextFree = i + 1;
1438 }
1441
1442 /*
1443 * Record the new size
1444 */
1446 }
1447
1448 file = VfdCache[0].nextFree;
1449
1451
1452 return file;
1453}
1454
1455static void
1456FreeVfd(File file)
1457{
1458 Vfd *vfdP = &VfdCache[file];
1459
1460 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1461 file, vfdP->fileName ? vfdP->fileName : ""));
1462
1463 if (vfdP->fileName != NULL)
1464 {
1465 free(vfdP->fileName);
1466 vfdP->fileName = NULL;
1467 }
1468 vfdP->fdstate = 0x0;
1469
1470 vfdP->nextFree = VfdCache[0].nextFree;
1471 VfdCache[0].nextFree = file;
1472}
1473
1474/* returns 0 on success, -1 on re-open failure (with errno set) */
1475static int
1476FileAccess(File file)
1477{
1478 int returnValue;
1479
1480 DO_DB(elog(LOG, "FileAccess %d (%s)",
1481 file, VfdCache[file].fileName));
1482
1483 /*
1484 * Is the file open? If not, open it and put it at the head of the LRU
1485 * ring (possibly closing the least recently used file to get an FD).
1486 */
1487
1488 if (FileIsNotOpen(file))
1489 {
1490 returnValue = LruInsert(file);
1491 if (returnValue != 0)
1492 return returnValue;
1493 }
1494 else if (VfdCache[0].lruLessRecently != file)
1495 {
1496 /*
1497 * We now know that the file is open and that it is not the last one
1498 * accessed, so we need to move it to the head of the Lru ring.
1499 */
1500
1501 Delete(file);
1502 Insert(file);
1503 }
1504
1505 return 0;
1506}
1507
1508/*
1509 * Called whenever a temporary file is deleted to report its size.
1510 */
1511static void
1512ReportTemporaryFileUsage(const char *path, pgoff_t size)
1513{
1515
1516 if (log_temp_files >= 0)
1517 {
1518 if ((size / 1024) >= log_temp_files)
1519 ereport(LOG,
1520 (errmsg("temporary file: path \"%s\", size %lu",
1521 path, (unsigned long) size)));
1522 }
1523}
1524
1525/*
1526 * Called to register a temporary file for automatic close.
1527 * ResourceOwnerEnlarge(CurrentResourceOwner) must have been called
1528 * before the file was opened.
1529 */
1530static void
1532{
1535
1536 /* Backup mechanism for closing at end of xact. */
1539}
1540
/*
 * Called when we get a shared invalidation message on some relation.
 *
 * If the VFD currently holds an open kernel FD, close it via LruDelete();
 * otherwise there is nothing to do.  (Compiled only under NOT_USED.)
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	if (FileIsNotOpen(file))
		return;

	LruDelete(file);
}
#endif
1553
1554/*
1555 * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1556 * fileMode parameter.
1557 */
1558File
1559PathNameOpenFile(const char *fileName, int fileFlags)
1560{
1561 return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1562}
1563
1564/*
1565 * open a file in an arbitrary directory
1566 *
1567 * NB: if the passed pathname is relative (which it usually is),
1568 * it will be interpreted relative to the process' working directory
1569 * (which should always be $PGDATA when this code is running).
1570 */
1571File
1572PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1573{
1574 char *fnamecopy;
1575 File file;
1576 Vfd *vfdP;
1577
1578 DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1579 fileName, fileFlags, fileMode));
1580
1581 /*
1582 * We need a malloc'd copy of the file name; fail cleanly if no room.
1583 */
1584 fnamecopy = strdup(fileName);
1585 if (fnamecopy == NULL)
1586 ereport(ERROR,
1588 errmsg("out of memory")));
1589
1590 file = AllocateVfd();
1591 vfdP = &VfdCache[file];
1592
1593 /* Close excess kernel FDs. */
1595
1596 /*
1597 * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1598 * client shouldn't be expected to know which kernel descriptors are
1599 * currently open, so it wouldn't make sense for them to be inherited by
1600 * executed subprograms.
1601 */
1602 fileFlags |= O_CLOEXEC;
1603
1604 vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1605
1606 if (vfdP->fd < 0)
1607 {
1608 int save_errno = errno;
1609
1610 FreeVfd(file);
1611 free(fnamecopy);
1612 errno = save_errno;
1613 return -1;
1614 }
1615 ++nfile;
1616 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1617 vfdP->fd));
1618
1619 vfdP->fileName = fnamecopy;
1620 /* Saved flags are adjusted to be OK for re-opening file */
1621 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1622 vfdP->fileMode = fileMode;
1623 vfdP->fileSize = 0;
1624 vfdP->fdstate = 0x0;
1625 vfdP->resowner = NULL;
1626
1627 Insert(file);
1628
1629 return file;
1630}
1631
1632/*
1633 * Create directory 'directory'. If necessary, create 'basedir', which must
1634 * be the directory above it. This is designed for creating the top-level
1635 * temporary directory on demand before creating a directory underneath it.
1636 * Do nothing if the directory already exists.
1637 *
1638 * Directories created within the top-level temporary directory should begin
1639 * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1640 * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1641 * that do not need any particular prefix.
1642*/
1643void
1644PathNameCreateTemporaryDir(const char *basedir, const char *directory)
1645{
1646 if (MakePGDirectory(directory) < 0)
1647 {
1648 if (errno == EEXIST)
1649 return;
1650
1651 /*
1652 * Failed. Try to create basedir first in case it's missing. Tolerate
1653 * EEXIST to close a race against another process following the same
1654 * algorithm.
1655 */
1656 if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1657 ereport(ERROR,
1659 errmsg("cannot create temporary directory \"%s\": %m",
1660 basedir)));
1661
1662 /* Try again. */
1663 if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1664 ereport(ERROR,
1666 errmsg("cannot create temporary subdirectory \"%s\": %m",
1667 directory)));
1668 }
1669}
1670
1671/*
1672 * Delete a directory and everything in it, if it exists.
1673 */
1674void
1675PathNameDeleteTemporaryDir(const char *dirname)
1676{
1677 struct stat statbuf;
1678
1679 /* Silently ignore missing directory. */
1680 if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1681 return;
1682
1683 /*
1684 * Currently, walkdir doesn't offer a way for our passed in function to
1685 * maintain state. Perhaps it should, so that we could tell the caller
1686 * whether this operation succeeded or failed. Since this operation is
1687 * used in a cleanup path, we wouldn't actually behave differently: we'll
1688 * just log failures.
1689 */
1690 walkdir(dirname, unlink_if_exists_fname, false, LOG);
1691}
1692
1693/*
1694 * Open a temporary file that will disappear when we close it.
1695 *
1696 * This routine takes care of generating an appropriate tempfile name.
1697 * There's no need to pass in fileFlags or fileMode either, since only
1698 * one setting makes any sense for a temp file.
1699 *
1700 * Unless interXact is true, the file is remembered by CurrentResourceOwner
1701 * to ensure it's closed and deleted when it's no longer needed, typically at
1702 * the end-of-transaction. In most cases, you don't want temporary files to
1703 * outlive the transaction that created them, so this should be false -- but
1704 * if you need "somewhat" temporary storage, this might be useful. In either
1705 * case, the file is removed when the File is explicitly closed.
1706 */
1707File
1708OpenTemporaryFile(bool interXact)
1709{
1710 File file = 0;
1711
1712 Assert(temporary_files_allowed); /* check temp file access is up */
1713
1714 /*
1715 * Make sure the current resource owner has space for this File before we
1716 * open it, if we'll be registering it below.
1717 */
1718 if (!interXact)
1720
1721 /*
1722 * If some temp tablespace(s) have been given to us, try to use the next
1723 * one. If a given tablespace can't be found, we silently fall back to
1724 * the database's default tablespace.
1725 *
1726 * BUT: if the temp file is slated to outlive the current transaction,
1727 * force it into the database's default tablespace, so that it will not
1728 * pose a threat to possible tablespace drop attempts.
1729 */
1730 if (numTempTableSpaces > 0 && !interXact)
1731 {
1733
1734 if (OidIsValid(tblspcOid))
1736 }
1737
1738 /*
1739 * If not, or if tablespace is bad, create in database's default
1740 * tablespace. MyDatabaseTableSpace should normally be set before we get
1741 * here, but just in case it isn't, fall back to pg_default tablespace.
1742 */
1743 if (file <= 0)
1747 true);
1748
1749 /* Mark it for deletion at close and temporary file size limit */
1751
1752 /* Register it with the current resource owner */
1753 if (!interXact)
1755
1756 return file;
1757}
1758
1759/*
1760 * Return the path of the temp directory in a given tablespace.
1761 */
1762void
1764{
1765 /*
1766 * Identify the tempfile directory for this tablespace.
1767 *
1768 * If someone tries to specify pg_global, use pg_default instead.
1769 */
1770 if (tablespace == InvalidOid ||
1773 snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1774 else
1775 {
1776 /* All other tablespaces are accessed via symlinks */
1777 snprintf(path, MAXPGPATH, "%s/%u/%s/%s",
1780 }
1781}
1782
1783/*
1784 * Open a temporary file in a specific tablespace.
1785 * Subroutine for OpenTemporaryFile, which see for details.
1786 */
1787static File
1789{
1790 char tempdirpath[MAXPGPATH];
1791 char tempfilepath[MAXPGPATH];
1792 File file;
1793
1795
1796 /*
1797 * Generate a tempfile name that should be unique within the current
1798 * database instance.
1799 */
1800 snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1802
1803 /*
1804 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1805 * temp file that can be reused.
1806 */
1809 if (file <= 0)
1810 {
1811 /*
1812 * We might need to create the tablespace's tempfile directory, if no
1813 * one has yet done so.
1814 *
1815 * Don't check for an error from MakePGDirectory; it could fail if
1816 * someone else just did the same thing. If it doesn't work then
1817 * we'll bomb out on the second create attempt, instead.
1818 */
1820
1823 if (file <= 0 && rejectError)
1824 elog(ERROR, "could not create temporary file \"%s\": %m",
1825 tempfilepath);
1826 }
1827
1828 return file;
1829}
1830
1831
1832/*
1833 * Create a new file. The directory containing it must already exist. Files
1834 * created this way are subject to temp_file_limit and are automatically
1835 * closed at end of transaction, but are not automatically deleted on close
1836 * because they are intended to be shared between cooperating backends.
1837 *
1838 * If the file is inside the top-level temporary directory, its name should
1839 * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1840 * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1841 * inside a directory created with PathNameCreateTemporaryDir(), in which case
1842 * the prefix isn't needed.
1843 */
1844File
1845PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1846{
1847 File file;
1848
1849 Assert(temporary_files_allowed); /* check temp file access is up */
1850
1852
1853 /*
1854 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1855 * temp file that can be reused.
1856 */
1857 file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1858 if (file <= 0)
1859 {
1860 if (error_on_failure)
1861 ereport(ERROR,
1863 errmsg("could not create temporary file \"%s\": %m",
1864 path)));
1865 else
1866 return file;
1867 }
1868
1869 /* Mark it for temp_file_limit accounting. */
1871
1872 /* Register it for automatic close. */
1874
1875 return file;
1876}
1877
1878/*
1879 * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1880 * another backend. Files opened this way don't count against the
1881 * temp_file_limit of the caller, are automatically closed at the end of the
1882 * transaction but are not deleted on close.
1883 */
1884File
1885PathNameOpenTemporaryFile(const char *path, int mode)
1886{
1887 File file;
1888
1889 Assert(temporary_files_allowed); /* check temp file access is up */
1890
1892
1893 file = PathNameOpenFile(path, mode | PG_BINARY);
1894
1895 /* If no such file, then we don't raise an error. */
1896 if (file <= 0 && errno != ENOENT)
1897 ereport(ERROR,
1899 errmsg("could not open temporary file \"%s\": %m",
1900 path)));
1901
1902 if (file > 0)
1903 {
1904 /* Register it for automatic close. */
1906 }
1907
1908 return file;
1909}
1910
1911/*
1912 * Delete a file by pathname. Return true if the file existed, false if
1913 * didn't.
1914 */
1915bool
1916PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1917{
1918 struct stat filestats;
1919 int stat_errno;
1920
1921 /* Get the final size for pgstat reporting. */
1922 if (stat(path, &filestats) != 0)
1923 stat_errno = errno;
1924 else
1925 stat_errno = 0;
1926
1927 /*
1928 * Unlike FileClose's automatic file deletion code, we tolerate
1929 * non-existence to support BufFileDeleteFileSet which doesn't know how
1930 * many segments it has to delete until it runs out.
1931 */
1932 if (stat_errno == ENOENT)
1933 return false;
1934
1935 if (unlink(path) < 0)
1936 {
1937 if (errno != ENOENT)
1940 errmsg("could not unlink temporary file \"%s\": %m",
1941 path)));
1942 return false;
1943 }
1944
1945 if (stat_errno == 0)
1946 ReportTemporaryFileUsage(path, filestats.st_size);
1947 else
1948 {
1949 errno = stat_errno;
1950 ereport(LOG,
1952 errmsg("could not stat file \"%s\": %m", path)));
1953 }
1954
1955 return true;
1956}
1957
1958/*
1959 * close a file when done with it
1960 */
1961void
1962FileClose(File file)
1963{
1964 Vfd *vfdP;
1965
1966 Assert(FileIsValid(file));
1967
1968 DO_DB(elog(LOG, "FileClose: %d (%s)",
1969 file, VfdCache[file].fileName));
1970
1971 vfdP = &VfdCache[file];
1972
1973 if (!FileIsNotOpen(file))
1974 {
1976
1977 /* close the file */
1978 if (close(vfdP->fd) != 0)
1979 {
1980 /*
1981 * We may need to panic on failure to close non-temporary files;
1982 * see LruDelete.
1983 */
1985 "could not close file \"%s\": %m", vfdP->fileName);
1986 }
1987
1988 --nfile;
1989 vfdP->fd = VFD_CLOSED;
1990
1991 /* remove the file from the lru ring */
1992 Delete(file);
1993 }
1994
1995 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1996 {
1997 /* Subtract its size from current usage (do first in case of error) */
1998 temporary_files_size -= vfdP->fileSize;
1999 vfdP->fileSize = 0;
2000 }
2001
2002 /*
2003 * Delete the file if it was temporary, and make a log entry if wanted
2004 */
2005 if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
2006 {
2007 struct stat filestats;
2008 int stat_errno;
2009
2010 /*
2011 * If we get an error, as could happen within the ereport/elog calls,
2012 * we'll come right back here during transaction abort. Reset the
2013 * flag to ensure that we can't get into an infinite loop. This code
2014 * is arranged to ensure that the worst-case consequence is failing to
2015 * emit log message(s), not failing to attempt the unlink.
2016 */
2017 vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
2018
2019
2020 /* first try the stat() */
2021 if (stat(vfdP->fileName, &filestats))
2022 stat_errno = errno;
2023 else
2024 stat_errno = 0;
2025
2026 /* in any case do the unlink */
2027 if (unlink(vfdP->fileName))
2028 ereport(LOG,
2030 errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
2031
2032 /* and last report the stat results */
2033 if (stat_errno == 0)
2034 ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2035 else
2036 {
2037 errno = stat_errno;
2038 ereport(LOG,
2040 errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2041 }
2042 }
2043
2044 /* Unregister it from the resource owner */
2045 if (vfdP->resowner)
2046 ResourceOwnerForgetFile(vfdP->resowner, file);
2047
2048 /*
2049 * Return the Vfd slot to the free list
2050 */
2051 FreeVfd(file);
2052}
2053
2054/*
2055 * FilePrefetch - initiate asynchronous read of a given range of the file.
2056 *
2057 * Returns 0 on success, otherwise an errno error code (like posix_fadvise()).
2058 *
2059 * posix_fadvise() is the simplest standardized interface that accomplishes
2060 * this.
2061 */
2062int
2063FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2064{
2065 Assert(FileIsValid(file));
2066
2067 DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2068 file, VfdCache[file].fileName,
2069 (int64) offset, (int64) amount));
2070
2071#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2072 {
2073 int returnCode;
2074
2075 returnCode = FileAccess(file);
2076 if (returnCode < 0)
2077 return returnCode;
2078
2079retry:
2080 pgstat_report_wait_start(wait_event_info);
2081 returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2084
2085 if (returnCode == EINTR)
2086 goto retry;
2087
2088 return returnCode;
2089 }
2090#elif defined(__darwin__)
2091 {
2092 struct radvisory
2093 {
2094 off_t ra_offset; /* offset into the file */
2095 int ra_count; /* size of the read */
2096 } ra;
2097 int returnCode;
2098
2099 returnCode = FileAccess(file);
2100 if (returnCode < 0)
2101 return returnCode;
2102
2103 ra.ra_offset = offset;
2104 ra.ra_count = amount;
2105 pgstat_report_wait_start(wait_event_info);
2108 if (returnCode != -1)
2109 return 0;
2110 else
2111 return errno;
2112 }
2113#else
2114 return 0;
2115#endif
2116}
2117
2118void
2119FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
2120{
2121 int returnCode;
2122
2123 Assert(FileIsValid(file));
2124
2125 DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2126 file, VfdCache[file].fileName,
2127 (int64) offset, (int64) nbytes));
2128
2129 if (nbytes <= 0)
2130 return;
2131
2132 if (VfdCache[file].fileFlags & PG_O_DIRECT)
2133 return;
2134
2135 returnCode = FileAccess(file);
2136 if (returnCode < 0)
2137 return;
2138
2139 pgstat_report_wait_start(wait_event_info);
2140 pg_flush_data(VfdCache[file].fd, offset, nbytes);
2142}
2143
2144ssize_t
2145FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset,
2146 uint32 wait_event_info)
2147{
2149 Vfd *vfdP;
2150
2151 Assert(FileIsValid(file));
2152
2153 DO_DB(elog(LOG, "FileReadV: %d (%s) " INT64_FORMAT " %d",
2154 file, VfdCache[file].fileName,
2155 (int64) offset,
2156 iovcnt));
2157
2158 returnCode = FileAccess(file);
2159 if (returnCode < 0)
2160 return returnCode;
2161
2162 vfdP = &VfdCache[file];
2163
2164retry:
2165 pgstat_report_wait_start(wait_event_info);
2166 returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
2168
2169 if (returnCode < 0)
2170 {
2171 /*
2172 * Windows may run out of kernel buffers and return "Insufficient
2173 * system resources" error. Wait a bit and retry to solve it.
2174 *
2175 * It is rumored that EINTR is also possible on some Unix filesystems,
2176 * in which case immediate retry is indicated.
2177 */
2178#ifdef WIN32
2180
2181 switch (error)
2182 {
2184 pg_usleep(1000L);
2185 errno = EINTR;
2186 break;
2187 default:
2189 break;
2190 }
2191#endif
2192 /* OK to retry if interrupted */
2193 if (errno == EINTR)
2194 goto retry;
2195 }
2196
2197 return returnCode;
2198}
2199
2200int
2202 int iovcnt, pgoff_t offset,
2203 uint32 wait_event_info)
2204{
2205 int returnCode;
2206 Vfd *vfdP;
2207
2208 Assert(FileIsValid(file));
2209
2210 DO_DB(elog(LOG, "FileStartReadV: %d (%s) " INT64_FORMAT " %d",
2211 file, VfdCache[file].fileName,
2212 (int64) offset,
2213 iovcnt));
2214
2215 returnCode = FileAccess(file);
2216 if (returnCode < 0)
2217 return returnCode;
2218
2219 vfdP = &VfdCache[file];
2220
2221 pgaio_io_start_readv(ioh, vfdP->fd, iovcnt, offset);
2222
2223 return 0;
2224}
2225
2226ssize_t
2227FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset,
2228 uint32 wait_event_info)
2229{
2231 Vfd *vfdP;
2232
2233 Assert(FileIsValid(file));
2234
2235 DO_DB(elog(LOG, "FileWriteV: %d (%s) " INT64_FORMAT " %d",
2236 file, VfdCache[file].fileName,
2237 (int64) offset,
2238 iovcnt));
2239
2240 returnCode = FileAccess(file);
2241 if (returnCode < 0)
2242 return returnCode;
2243
2244 vfdP = &VfdCache[file];
2245
2246 /*
2247 * If enforcing temp_file_limit and it's a temp file, check to see if the
2248 * write would overrun temp_file_limit, and throw error if so. Note: it's
2249 * really a modularity violation to throw error here; we should set errno
2250 * and return -1. However, there's no way to report a suitable error
2251 * message if we do that. All current callers would just throw error
2252 * immediately anyway, so this is safe at present.
2253 */
2254 if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2255 {
2256 pgoff_t past_write = offset;
2257
2258 for (int i = 0; i < iovcnt; ++i)
2259 past_write += iov[i].iov_len;
2260
2261 if (past_write > vfdP->fileSize)
2262 {
2264
2266 if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2267 ereport(ERROR,
2269 errmsg("temporary file size exceeds \"temp_file_limit\" (%dkB)",
2270 temp_file_limit)));
2271 }
2272 }
2273
2274retry:
2275 pgstat_report_wait_start(wait_event_info);
2276 returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
2278
2279 if (returnCode >= 0)
2280 {
2281 /*
2282 * Some callers expect short writes to set errno, and traditionally we
2283 * have assumed that they imply disk space shortage. We don't want to
2284 * waste CPU cycles adding up the total size here, so we'll just set
2285 * it for all successful writes in case such a caller determines that
2286 * the write was short and ereports "%m".
2287 */
2288 errno = ENOSPC;
2289
2290 /*
2291 * Maintain fileSize and temporary_files_size if it's a temp file.
2292 */
2293 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2294 {
2295 pgoff_t past_write = offset + returnCode;
2296
2297 if (past_write > vfdP->fileSize)
2298 {
2299 temporary_files_size += past_write - vfdP->fileSize;
2300 vfdP->fileSize = past_write;
2301 }
2302 }
2303 }
2304 else
2305 {
2306 /*
2307 * See comments in FileReadV()
2308 */
2309#ifdef WIN32
2311
2312 switch (error)
2313 {
2315 pg_usleep(1000L);
2316 errno = EINTR;
2317 break;
2318 default:
2320 break;
2321 }
2322#endif
2323 /* OK to retry if interrupted */
2324 if (errno == EINTR)
2325 goto retry;
2326 }
2327
2328 return returnCode;
2329}
2330
2331int
2332FileSync(File file, uint32 wait_event_info)
2333{
2334 int returnCode;
2335
2336 Assert(FileIsValid(file));
2337
2338 DO_DB(elog(LOG, "FileSync: %d (%s)",
2339 file, VfdCache[file].fileName));
2340
2341 returnCode = FileAccess(file);
2342 if (returnCode < 0)
2343 return returnCode;
2344
2345 pgstat_report_wait_start(wait_event_info);
2346 returnCode = pg_fsync(VfdCache[file].fd);
2348
2349 return returnCode;
2350}
2351
2352/*
2353 * Zero a region of the file.
2354 *
2355 * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2356 * appropriate error.
2357 */
2358int
2359FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2360{
2361 int returnCode;
2363
2364 Assert(FileIsValid(file));
2365
2366 DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2367 file, VfdCache[file].fileName,
2368 (int64) offset, (int64) amount));
2369
2370 returnCode = FileAccess(file);
2371 if (returnCode < 0)
2372 return returnCode;
2373
2374 pgstat_report_wait_start(wait_event_info);
2375 written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2377
2378 if (written < 0)
2379 return -1;
2380 else if (written != amount)
2381 {
2382 /* if errno is unset, assume problem is no disk space */
2383 if (errno == 0)
2384 errno = ENOSPC;
2385 return -1;
2386 }
2387
2388 return 0;
2389}
2390
2391/*
2392 * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
2393 * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
2394 * use FileZero() instead.
2395 *
2396 * Note that at least glibc() implements posix_fallocate() in userspace if not
2397 * implemented by the filesystem. That's not the case for all environments
2398 * though.
2399 *
2400 * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2401 * appropriate error.
2402 */
2403int
2404FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2405{
2406#ifdef HAVE_POSIX_FALLOCATE
2407 int returnCode;
2408
2409 Assert(FileIsValid(file));
2410
2411 DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2412 file, VfdCache[file].fileName,
2413 (int64) offset, (int64) amount));
2414
2415 returnCode = FileAccess(file);
2416 if (returnCode < 0)
2417 return -1;
2418
2419retry:
2420 pgstat_report_wait_start(wait_event_info);
2421 returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2423
2424 if (returnCode == 0)
2425 return 0;
2426 else if (returnCode == EINTR)
2427 goto retry;
2428
2429 /* for compatibility with %m printing etc */
2430 errno = returnCode;
2431
2432 /*
2433 * Return in cases of a "real" failure, if fallocate is not supported,
2434 * fall through to the FileZero() backed implementation.
2435 */
2437 return -1;
2438#endif
2439
2440 return FileZero(file, offset, amount, wait_event_info);
2441}
2442
2443pgoff_t
2444FileSize(File file)
2445{
2446 Assert(FileIsValid(file));
2447
2448 DO_DB(elog(LOG, "FileSize %d (%s)",
2449 file, VfdCache[file].fileName));
2450
2451 if (FileIsNotOpen(file))
2452 {
2453 if (FileAccess(file) < 0)
2454 return (pgoff_t) -1;
2455 }
2456
2457 return lseek(VfdCache[file].fd, 0, SEEK_END);
2458}
2459
2460int
2461FileTruncate(File file, pgoff_t offset, uint32 wait_event_info)
2462{
2463 int returnCode;
2464
2465 Assert(FileIsValid(file));
2466
2467 DO_DB(elog(LOG, "FileTruncate %d (%s)",
2468 file, VfdCache[file].fileName));
2469
2470 returnCode = FileAccess(file);
2471 if (returnCode < 0)
2472 return returnCode;
2473
2474 pgstat_report_wait_start(wait_event_info);
2475 returnCode = pg_ftruncate(VfdCache[file].fd, offset);
2477
2478 if (returnCode == 0 && VfdCache[file].fileSize > offset)
2479 {
2480 /* adjust our state for truncation of a temp file */
2481 Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2482 temporary_files_size -= VfdCache[file].fileSize - offset;
2483 VfdCache[file].fileSize = offset;
2484 }
2485
2486 return returnCode;
2487}
2488
2489/*
2490 * Return the pathname associated with an open file.
2491 *
2492 * The returned string points to an internal buffer, which is valid until
2493 * the file is closed.
2494 */
2495char *
2496FilePathName(File file)
2497{
2498 Assert(FileIsValid(file));
2499
2500 return VfdCache[file].fileName;
2501}
2502
2503/*
2504 * Return the raw file descriptor of an opened file.
2505 *
2506 * The returned file descriptor will be valid until the file is closed, but
2507 * there are a lot of things that can make that happen. So the caller should
2508 * be careful not to do much of anything else before it finishes using the
2509 * returned file descriptor.
2510 */
2511int
2512FileGetRawDesc(File file)
2513{
2514 int returnCode;
2515
2516 returnCode = FileAccess(file);
2517 if (returnCode < 0)
2518 return returnCode;
2519
2520 Assert(FileIsValid(file));
2521 return VfdCache[file].fd;
2522}
2523
2524/*
2525 * FileGetRawFlags - returns the file flags on open(2)
2526 */
2527int
2529{
2530 Assert(FileIsValid(file));
2531 return VfdCache[file].fileFlags;
2532}
2533
2534/*
2535 * FileGetRawMode - returns the mode bitmask passed to open(2)
2536 */
2537mode_t
2538FileGetRawMode(File file)
2539{
2540 Assert(FileIsValid(file));
2541 return VfdCache[file].fileMode;
2542}
2543
2544/*
2545 * Make room for another allocatedDescs[] array entry if needed and possible.
2546 * Returns true if an array element is available.
2547 */
2548static bool
2550{
2552 int newMax;
2553
2554 /* Quick out if array already has a free slot. */
2556 return true;
2557
2558 /*
2559 * If the array hasn't yet been created in the current process, initialize
2560 * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2561 * we will ever need, anyway. We don't want to look at max_safe_fds
2562 * immediately because set_max_safe_fds() may not have run yet.
2563 */
2564 if (allocatedDescs == NULL)
2565 {
2566 newMax = FD_MINFREE / 3;
2568 /* Out of memory already? Treat as fatal error. */
2569 if (newDescs == NULL)
2570 ereport(ERROR,
2572 errmsg("out of memory")));
2575 return true;
2576 }
2577
2578 /*
2579 * Consider enlarging the array beyond the initial allocation used above.
2580 * By the time this happens, max_safe_fds should be known accurately.
2581 *
2582 * We mustn't let allocated descriptors hog all the available FDs, and in
2583 * practice we'd better leave a reasonable number of FDs for VFD use. So
2584 * set the maximum to max_safe_fds / 3. (This should certainly be at
2585 * least as large as the initial size, FD_MINFREE / 3, so we aren't
2586 * tightening the restriction here.) Recall that "external" FDs are
2587 * allowed to consume another third of max_safe_fds.
2588 */
2589 newMax = max_safe_fds / 3;
2591 {
2593 newMax * sizeof(AllocateDesc));
2594 /* Treat out-of-memory as a non-fatal error. */
2595 if (newDescs == NULL)
2596 return false;
2599 return true;
2600 }
2601
2602 /* Can't enlarge allocatedDescs[] any more. */
2603 return false;
2604}
2605
2606/*
2607 * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2608 * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2609 * necessary to open the file. When done, call FreeFile rather than fclose.
2610 *
2611 * Note that files that will be open for any significant length of time
2612 * should NOT be handled this way, since they cannot share kernel file
2613 * descriptors with other files; there is grave risk of running out of FDs
2614 * if anyone locks down too many FDs. Most callers of this routine are
2615 * simply reading a config file that they will read and close immediately.
2616 *
2617 * fd.c will automatically close all files opened with AllocateFile at
2618 * transaction commit or abort; this prevents FD leakage if a routine
2619 * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2620 *
2621 * Ideally this should be the *only* direct call of fopen() in the backend.
2622 */
2623FILE *
2624AllocateFile(const char *name, const char *mode)
2625{
2626 FILE *file;
2627
2628 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2630
2631 /* Can we allocate another non-virtual FD? */
2632 if (!reserveAllocatedDesc())
2633 ereport(ERROR,
2635 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2637
2638 /* Close excess kernel FDs. */
2640
2641TryAgain:
2642 if ((file = fopen(name, mode)) != NULL)
2643 {
2645
2646 desc->kind = AllocateDescFile;
2647 desc->desc.file = file;
2650 return desc->desc.file;
2651 }
2652
2653 if (errno == EMFILE || errno == ENFILE)
2654 {
2655 int save_errno = errno;
2656
2657 ereport(LOG,
2659 errmsg("out of file descriptors: %m; release and retry")));
2660 errno = 0;
2661 if (ReleaseLruFile())
2662 goto TryAgain;
2663 errno = save_errno;
2664 }
2665
2666 return NULL;
2667}
2668
2669/*
2670 * Open a file with OpenTransientFilePerm() and pass default file mode for
2671 * the fileMode parameter.
2672 */
2673int
2674OpenTransientFile(const char *fileName, int fileFlags)
2675{
2676 return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2677}
2678
2679/*
2680 * Like AllocateFile, but returns an unbuffered fd like open(2)
2681 */
2682int
2683OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2684{
2685 int fd;
2686
2687 DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2688 numAllocatedDescs, fileName));
2689
2690 /* Can we allocate another non-virtual FD? */
2691 if (!reserveAllocatedDesc())
2692 ereport(ERROR,
2694 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2695 maxAllocatedDescs, fileName)));
2696
2697 /* Close excess kernel FDs. */
2699
2700 fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2701
2702 if (fd >= 0)
2703 {
2705
2706 desc->kind = AllocateDescRawFD;
2707 desc->desc.fd = fd;
2710
2711 return fd;
2712 }
2713
2714 return -1; /* failure */
2715}
2716
2717/*
2718 * Routines that want to initiate a pipe stream should use OpenPipeStream
2719 * rather than plain popen(). This lets fd.c deal with freeing FDs if
2720 * necessary. When done, call ClosePipeStream rather than pclose.
2721 *
2722 * This function also ensures that the popen'd program is run with default
2723 * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2724 * uses. This ensures desirable response to, eg, closing a read pipe early.
2725 */
2726FILE *
2727OpenPipeStream(const char *command, const char *mode)
2728{
2729 FILE *file;
2730 int save_errno;
2731
2732 DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2733 numAllocatedDescs, command));
2734
2735 /* Can we allocate another non-virtual FD? */
2736 if (!reserveAllocatedDesc())
2737 ereport(ERROR,
2739 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2740 maxAllocatedDescs, command)));
2741
2742 /* Close excess kernel FDs. */
2744
2745TryAgain:
2746 fflush(NULL);
2748 errno = 0;
2749 file = popen(command, mode);
2750 save_errno = errno;
2752 errno = save_errno;
2753 if (file != NULL)
2754 {
2756
2757 desc->kind = AllocateDescPipe;
2758 desc->desc.file = file;
2761 return desc->desc.file;
2762 }
2763
2764 if (errno == EMFILE || errno == ENFILE)
2765 {
2766 ereport(LOG,
2768 errmsg("out of file descriptors: %m; release and retry")));
2769 if (ReleaseLruFile())
2770 goto TryAgain;
2771 errno = save_errno;
2772 }
2773
2774 return NULL;
2775}
2776
2777/*
2778 * Free an AllocateDesc of any type.
2779 *
2780 * The argument *must* point into the allocatedDescs[] array.
2781 */
2782static int
2784{
2785 int result;
2786
2787 /* Close the underlying object */
2788 switch (desc->kind)
2789 {
2790 case AllocateDescFile:
2791 result = fclose(desc->desc.file);
2792 break;
2793 case AllocateDescPipe:
2794 result = pclose(desc->desc.file);
2795 break;
2796 case AllocateDescDir:
2797 result = closedir(desc->desc.dir);
2798 break;
2799 case AllocateDescRawFD:
2800 pgaio_closing_fd(desc->desc.fd);
2801 result = close(desc->desc.fd);
2802 break;
2803 default:
2804 elog(ERROR, "AllocateDesc kind not recognized");
2805 result = 0; /* keep compiler quiet */
2806 break;
2807 }
2808
2809 /* Compact storage in the allocatedDescs array */
2812
2813 return result;
2814}
2815
2816/*
2817 * Close a file returned by AllocateFile.
2818 *
2819 * Note we do not check fclose's return value --- it is up to the caller
2820 * to handle close errors.
2821 */
2822int
2823FreeFile(FILE *file)
2824{
2825 int i;
2826
2827 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2828
2829 /* Remove file from list of allocated files, if it's present */
2830 for (i = numAllocatedDescs; --i >= 0;)
2831 {
2832 AllocateDesc *desc = &allocatedDescs[i];
2833
2834 if (desc->kind == AllocateDescFile && desc->desc.file == file)
2835 return FreeDesc(desc);
2836 }
2837
2838 /* Only get here if someone passes us a file not in allocatedDescs */
2839 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2840
2841 return fclose(file);
2842}
2843
2844/*
2845 * Close a file returned by OpenTransientFile.
2846 *
2847 * Note we do not check close's return value --- it is up to the caller
2848 * to handle close errors.
2849 */
2850int
2852{
2853 int i;
2854
2855 DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2856
2857 /* Remove fd from list of allocated files, if it's present */
2858 for (i = numAllocatedDescs; --i >= 0;)
2859 {
2860 AllocateDesc *desc = &allocatedDescs[i];
2861
2862 if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2863 return FreeDesc(desc);
2864 }
2865
2866 /* Only get here if someone passes us a file not in allocatedDescs */
2867 elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2868
2870
2871 return close(fd);
2872}
2873
2874/*
2875 * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2876 * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2877 * necessary to open the directory, and with closing it after an elog.
2878 * When done, call FreeDir rather than closedir.
2879 *
2880 * Returns NULL, with errno set, on failure. Note that failure detection
2881 * is commonly left to the following call of ReadDir or ReadDirExtended;
2882 * see the comments for ReadDir.
2883 *
2884 * Ideally this should be the *only* direct call of opendir() in the backend.
2885 */
2886DIR *
2887AllocateDir(const char *dirname)
2888{
2889 DIR *dir;
2890
2891 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2892 numAllocatedDescs, dirname));
2893
2894 /* Can we allocate another non-virtual FD? */
2895 if (!reserveAllocatedDesc())
2896 ereport(ERROR,
2898 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2899 maxAllocatedDescs, dirname)));
2900
2901 /* Close excess kernel FDs. */
2903
2904TryAgain:
2905 if ((dir = opendir(dirname)) != NULL)
2906 {
2908
2909 desc->kind = AllocateDescDir;
2910 desc->desc.dir = dir;
2913 return desc->desc.dir;
2914 }
2915
2916 if (errno == EMFILE || errno == ENFILE)
2917 {
2918 int save_errno = errno;
2919
2920 ereport(LOG,
2922 errmsg("out of file descriptors: %m; release and retry")));
2923 errno = 0;
2924 if (ReleaseLruFile())
2925 goto TryAgain;
2926 errno = save_errno;
2927 }
2928
2929 return NULL;
2930}
2931
2932/*
2933 * Read a directory opened with AllocateDir, ereport'ing any error.
2934 *
2935 * This is easier to use than raw readdir() since it takes care of some
2936 * otherwise rather tedious and error-prone manipulation of errno. Also,
2937 * if you are happy with a generic error message for AllocateDir failure,
2938 * you can just do
2939 *
2940 * dir = AllocateDir(path);
2941 * while ((dirent = ReadDir(dir, path)) != NULL)
2942 * process dirent;
2943 * FreeDir(dir);
2944 *
2945 * since a NULL dir parameter is taken as indicating AllocateDir failed.
2946 * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2947 * use this shortcut.)
2948 *
2949 * The pathname passed to AllocateDir must be passed to this routine too,
2950 * but it is only used for error reporting.
2951 */
2952struct dirent *
2953ReadDir(DIR *dir, const char *dirname)
2954{
2955 return ReadDirExtended(dir, dirname, ERROR);
2956}
2957
/*
 * Alternate version of ReadDir that allows caller to specify the elevel
 * for any error report (whether it's reporting an initial failure of
 * AllocateDir or a subsequent directory read failure).
 *
 * If elevel < ERROR, returns NULL after any error.  With the normal coding
 * pattern, this will result in falling out of the loop immediately as
 * though the directory contained no (more) entries.
 *
 * A NULL return with no report means readdir() returned NULL without
 * setting errno.
 */
struct dirent *
ReadDirExtended(DIR *dir, const char *dirname, int elevel)
{
    struct dirent *dent;

    /* Give a generic message for AllocateDir failure, if caller didn't */
    if (dir == NULL)
    {
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not open directory \"%s\": %m",
                        dirname)));
        return NULL;
    }

    /* clear errno so a NULL result can be checked for an error afterwards */
    errno = 0;
    if ((dent = readdir(dir)) != NULL)
        return dent;

    if (errno)
        ereport(elevel,
                (errcode_for_file_access(),
                 errmsg("could not read directory \"%s\": %m",
                        dirname)));
    return NULL;
}
2993
2994/*
2995 * Close a directory opened with AllocateDir.
2996 *
2997 * Returns closedir's return value (with errno set if it's not 0).
2998 * Note we do not check the return value --- it is up to the caller
2999 * to handle close errors if wanted.
3000 *
3001 * Does nothing if dir == NULL; we assume that directory open failure was
3002 * already reported if desired.
3003 */
3004int
3005FreeDir(DIR *dir)
3006{
3007 int i;
3008
3009 /* Nothing to do if AllocateDir failed */
3010 if (dir == NULL)
3011 return 0;
3012
3013 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
3014
3015 /* Remove dir from list of allocated dirs, if it's present */
3016 for (i = numAllocatedDescs; --i >= 0;)
3017 {
3018 AllocateDesc *desc = &allocatedDescs[i];
3019
3020 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
3021 return FreeDesc(desc);
3022 }
3023
3024 /* Only get here if someone passes us a dir not in allocatedDescs */
3025 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
3026
3027 return closedir(dir);
3028}
3029
3030
3031/*
3032 * Close a pipe stream returned by OpenPipeStream.
3033 */
3034int
3035ClosePipeStream(FILE *file)
3036{
3037 int i;
3038
3039 DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
3040
3041 /* Remove file from list of allocated files, if it's present */
3042 for (i = numAllocatedDescs; --i >= 0;)
3043 {
3044 AllocateDesc *desc = &allocatedDescs[i];
3045
3046 if (desc->kind == AllocateDescPipe && desc->desc.file == file)
3047 return FreeDesc(desc);
3048 }
3049
3050 /* Only get here if someone passes us a file not in allocatedDescs */
3051 elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
3052
3053 return pclose(file);
3054}
3055
3056/*
3057 * closeAllVfds
3058 *
3059 * Force all VFDs into the physically-closed state, so that the fewest
3060 * possible number of kernel file descriptors are in use. There is no
3061 * change in the logical state of the VFDs.
3062 */
3063void
3064closeAllVfds(void)
3065{
3066 Index i;
3067
3068 if (SizeVfdCache > 0)
3069 {
3070 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3071 for (i = 1; i < SizeVfdCache; i++)
3072 {
3073 if (!FileIsNotOpen(i))
3074 LruDelete(i);
3075 }
3076 }
3077}
3078
3079
3080/*
3081 * SetTempTablespaces
3082 *
3083 * Define a list (actually an array) of OIDs of tablespaces to use for
3084 * temporary files. This list will be used until end of transaction,
3085 * unless this function is called again before then. It is caller's
3086 * responsibility that the passed-in array has adequate lifespan (typically
3087 * it'd be allocated in TopTransactionContext).
3088 *
3089 * Some entries of the array may be InvalidOid, indicating that the current
3090 * database's default tablespace should be used.
3091 */
3092void
3094{
3095 Assert(numSpaces >= 0);
3098
3099 /*
3100 * Select a random starting point in the list. This is to minimize
3101 * conflicts between backends that are most likely sharing the same list
3102 * of temp tablespaces. Note that if we create multiple temp files in the
3103 * same transaction, we'll advance circularly through the list --- this
3104 * ensures that large temporary sort files are nicely spread across all
3105 * available tablespaces.
3106 */
3107 if (numSpaces > 1)
3109 0, numSpaces - 1);
3110 else
3112}
3113
3114/*
3115 * TempTablespacesAreSet
3116 *
3117 * Returns true if SetTempTablespaces has been called in current transaction.
3118 * (This is just so that tablespaces.c doesn't need its own per-transaction
3119 * state.)
3120 */
3121bool
3123{
3124 return (numTempTableSpaces >= 0);
3125}
3126
3127/*
3128 * GetTempTablespaces
3129 *
3130 * Populate an array with the OIDs of the tablespaces that should be used for
3131 * temporary files. (Some entries may be InvalidOid, indicating that the
3132 * current database's default tablespace should be used.) At most numSpaces
3133 * entries will be filled.
3134 * Returns the number of OIDs that were copied into the output array.
3135 */
3136int
3138{
3139 int i;
3140
3142 for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
3144
3145 return i;
3146}
3147
3148/*
3149 * GetNextTempTableSpace
3150 *
3151 * Select the next temp tablespace to use. A result of InvalidOid means
3152 * to use the current database's default tablespace.
3153 */
3154Oid
3156{
3157 if (numTempTableSpaces > 0)
3158 {
3159 /* Advance nextTempTableSpace counter with wraparound */
3163 }
3164 return InvalidOid;
3165}
3166
3167
3168/*
3169 * AtEOSubXact_Files
3170 *
3171 * Take care of subtransaction commit/abort. At abort, we close AllocateDescs
3172 * that the subtransaction may have opened. At commit, we reassign them to
3173 * the parent subtransaction. (Temporary files are tracked by ResourceOwners
3174 * instead.)
3175 */
3176void
3179{
3180 Index i;
3181
3182 for (i = 0; i < numAllocatedDescs; i++)
3183 {
3184 if (allocatedDescs[i].create_subid == mySubid)
3185 {
3186 if (isCommit)
3188 else
3189 {
3190 /* have to recheck the item after FreeDesc (ugly) */
3192 }
3193 }
3194 }
3195}
3196
3197/*
3198 * AtEOXact_Files
3199 *
3200 * This routine is called during transaction commit or abort. All still-open
3201 * per-transaction temporary file VFDs are closed, which also causes the
3202 * underlying files to be deleted (although they should've been closed already
3203 * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
3204 * closed. We also forget any transaction-local temp tablespace list.
3205 *
3206 * The isCommit flag is used only to decide whether to emit warnings about
3207 * unclosed files.
3208 */
3209void
3211{
3212 CleanupTempFiles(isCommit, false);
3214 numTempTableSpaces = -1;
3215}
3216
3217/*
3218 * BeforeShmemExit_Files
3219 *
3220 * before_shmem_exit hook to clean up temp files during backend shutdown.
3221 * Here, we want to clean up *all* temp files including interXact ones.
3222 */
3223static void
3225{
3226 CleanupTempFiles(false, true);
3227
3228 /* prevent further temp files from being created */
3229#ifdef USE_ASSERT_CHECKING
3231#endif
3232}
3233
3234/*
3235 * Close temporary files and delete their underlying files.
3236 *
3237 * isCommit: if true, this is normal transaction commit, and we don't
3238 * expect any remaining files; warn if there are some.
3239 *
3240 * isProcExit: if true, this is being called as the backend process is
3241 * exiting. If that's the case, we should remove all temporary files; if
3242 * that's not the case, we are being called for transaction commit/abort
3243 * and should only remove transaction-local temp files. In either case,
3244 * also clean up "allocated" stdio files, dirs and fds.
3245 */
3246static void
3248{
3249 Index i;
3250
3251 /*
3252 * Careful here: at proc_exit we need extra cleanup, not just
3253 * xact_temporary files.
3254 */
3256 {
3257 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3258 for (i = 1; i < SizeVfdCache; i++)
3259 {
3260 unsigned short fdstate = VfdCache[i].fdstate;
3261
3262 if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3263 VfdCache[i].fileName != NULL)
3264 {
3265 /*
3266 * If we're in the process of exiting a backend process, close
3267 * all temporary files. Otherwise, only close temporary files
3268 * local to the current transaction. They should be closed by
3269 * the ResourceOwner mechanism already, so this is just a
3270 * debugging cross-check.
3271 */
3272 if (isProcExit)
3273 FileClose(i);
3274 else if (fdstate & FD_CLOSE_AT_EOXACT)
3275 {
3276 elog(WARNING,
3277 "temporary file %s not closed at end-of-transaction",
3278 VfdCache[i].fileName);
3279 FileClose(i);
3280 }
3281 }
3282 }
3283
3285 }
3286
3287 /* Complain if any allocated files remain open at commit. */
3288 if (isCommit && numAllocatedDescs > 0)
3289 elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3291
3292 /* Clean up "allocated" stdio files, dirs and fds. */
3293 while (numAllocatedDescs > 0)
3295}
3296
3297
3298/*
3299 * Remove temporary and temporary relation files left over from a prior
3300 * postmaster session
3301 *
3302 * This should be called during postmaster startup. It will forcibly
3303 * remove any leftover files created by OpenTemporaryFile and any leftover
3304 * temporary relation files created by mdcreate.
3305 *
3306 * During post-backend-crash restart cycle, this routine is called when
3307 * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3308 * queries are using temp files could result in useless storage usage that can
3309 * only be reclaimed by a service restart. The argument against enabling it is
3310 * that someone might want to examine the temporary files for debugging
3311 * purposes. This does however mean that OpenTemporaryFile had better allow for
3312 * collision with an existing temp file name.
3313 *
3314 * NOTE: this function and its subroutines generally report syscall failures
3315 * with ereport(LOG) and keep going. Removing temp files is not so critical
3316 * that we should fail to start the database when we can't do it.
3317 */
3318void
3320{
3322 DIR *spc_dir;
3323 struct dirent *spc_de;
3324
3325 /*
3326 * First process temp files in pg_default ($PGDATA/base)
3327 */
3328 snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3329 RemovePgTempFilesInDir(temp_path, true, false);
3331
3332 /*
3333 * Cycle through temp directories for all non-default tablespaces.
3334 */
3336
3338 {
3339 if (strcmp(spc_de->d_name, ".") == 0 ||
3340 strcmp(spc_de->d_name, "..") == 0)
3341 continue;
3342
3343 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s/%s",
3346 RemovePgTempFilesInDir(temp_path, true, false);
3347
3348 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
3351 }
3352
3354
3355 /*
3356 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3357 * DataDir as well. However, that is *not* cleaned here because doing so
3358 * would create a race condition. It's done separately, earlier in
3359 * postmaster startup.
3360 */
3361}
3362
3363/*
3364 * Process one pgsql_tmp directory for RemovePgTempFiles.
3365 *
3366 * If missing_ok is true, it's all right for the named directory to not exist.
3367 * Any other problem results in a LOG message. (missing_ok should be true at
3368 * the top level, since pgsql_tmp directories are not created until needed.)
3369 *
3370 * At the top level, this should be called with unlink_all = false, so that
3371 * only files matching the temporary name prefix will be unlinked. When
3372 * recursing it will be called with unlink_all = true to unlink everything
3373 * under a top-level temporary directory.
3374 *
3375 * (These two flags could be replaced by one, but it seems clearer to keep
3376 * them separate.)
3377 */
3378void
3379RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3380{
3381 DIR *temp_dir;
3382 struct dirent *temp_de;
3383 char rm_path[MAXPGPATH * 2];
3384
3386
3387 if (temp_dir == NULL && errno == ENOENT && missing_ok)
3388 return;
3389
3391 {
3392 if (strcmp(temp_de->d_name, ".") == 0 ||
3393 strcmp(temp_de->d_name, "..") == 0)
3394 continue;
3395
3396 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3397 tmpdirname, temp_de->d_name);
3398
3399 if (unlink_all ||
3400 strncmp(temp_de->d_name,
3403 {
3405
3406 if (type == PGFILETYPE_ERROR)
3407 continue;
3408 else if (type == PGFILETYPE_DIR)
3409 {
3410 /* recursively remove contents, then directory itself */
3411 RemovePgTempFilesInDir(rm_path, false, true);
3412
3413 if (rmdir(rm_path) < 0)
3414 ereport(LOG,
3416 errmsg("could not remove directory \"%s\": %m",
3417 rm_path)));
3418 }
3419 else
3420 {
3421 if (unlink(rm_path) < 0)
3422 ereport(LOG,
3424 errmsg("could not remove file \"%s\": %m",
3425 rm_path)));
3426 }
3427 }
3428 else
3429 ereport(LOG,
3430 (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3431 rm_path)));
3432 }
3433
3435}
3436
3437/* Process one tablespace directory, look for per-DB subdirectories */
3438static void
3440{
3441 DIR *ts_dir;
3442 struct dirent *de;
3443 char dbspace_path[MAXPGPATH * 2];
3444
3446
3447 while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3448 {
3449 /*
3450 * We're only interested in the per-database directories, which have
3451 * numeric names. Note that this code will also (properly) ignore "."
3452 * and "..".
3453 */
3454 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3455 continue;
3456
3457 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3458 tsdirname, de->d_name);
3460 }
3461
3462 FreeDir(ts_dir);
3463}
3464
3465/* Process one per-dbspace directory for RemovePgTempRelationFiles */
3466static void
3468{
3470 struct dirent *de;
3471 char rm_path[MAXPGPATH * 2];
3472
3474
3476 {
3477 if (!looks_like_temp_rel_name(de->d_name))
3478 continue;
3479
3480 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3481 dbspacedirname, de->d_name);
3482
3483 if (unlink(rm_path) < 0)
3484 ereport(LOG,
3486 errmsg("could not remove file \"%s\": %m",
3487 rm_path)));
3488 }
3489
3491}
3492
/*
 * looks_like_temp_rel_name --- does "name" have the shape of a temporary
 * relation file name?
 *
 * Accepted forms are t<digits>_<digits>, optionally followed by _<forkname>
 * and/or .<digits> (a segment number), with nothing after that.
 */
bool
looks_like_temp_rel_name(const char *name)
{
    const char *p = name;
    const char *runStart;

    /* Leading "t" is mandatory. */
    if (*p != 't')
        return false;
    p++;

    /* First run of digits: must be non-empty and end at an underscore. */
    runStart = p;
    while (isdigit((unsigned char) *p))
        p++;
    if (p == runStart || *p != '_')
        return false;

    /* Step over the underscore; second run of digits must be non-empty too. */
    p++;
    runStart = p;
    while (isdigit((unsigned char) *p))
        p++;
    if (p == runStart)
        return false;

    /* Optional "_forkname" part, measured by forkname_chars(). */
    if (*p == '_')
    {
        int         forkLen = forkname_chars(p + 1, NULL);

        if (forkLen <= 0)
            return false;
        p += forkLen + 1;
    }

    /* Optional ".segment" part: a dot followed by at least one digit. */
    if (*p == '.')
    {
        const char *segEnd = p + 1;

        while (isdigit((unsigned char) *segEnd))
            segEnd++;
        if (segEnd == p + 1)
            return false;
        p = segEnd;
    }

    /* Anything left over disqualifies the name. */
    return *p == '\0';
}
3541
#ifdef HAVE_SYNCFS
/*
 * do_syncfs --- call syncfs() on a descriptor opened for "path".
 *
 * Reports startup progress first.  Failure to open the path or to sync is
 * only logged (LOG level); the function returns normally either way.  The
 * transient descriptor is closed again before returning.
 */
static void
do_syncfs(const char *path)
{
    int         fd;

    ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
                             path);

    fd = OpenTransientFile(path, O_RDONLY);
    if (fd < 0)
    {
        ereport(LOG,
                (errcode_for_file_access(),
                 errmsg("could not open file \"%s\": %m", path)));
        return;
    }
    if (syncfs(fd) < 0)
        ereport(LOG,
                (errcode_for_file_access(),
                 errmsg("could not synchronize file system for file \"%s\": %m", path)));

    /* release the descriptor obtained from OpenTransientFile() */
    CloseTransientFile(fd);
}
#endif
3566
3567/*
3568 * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3569 * all potential filesystems, depending on the recovery_init_sync_method setting.
3570 *
3571 * We fsync regular files and directories wherever they are, but we
3572 * follow symlinks only for pg_wal and immediately under pg_tblspc.
3573 * Other symlinks are presumed to point at files we're not responsible
3574 * for fsyncing, and might not have privileges to write at all.
3575 *
3576 * Errors are logged but not considered fatal; that's because this is used
3577 * only during database startup, to deal with the possibility that there are
3578 * issued-but-unsynced writes pending against the data directory. We want to
3579 * ensure that such writes reach disk before anything that's done in the new
3580 * run. However, aborting on error would result in failure to start for
3581 * harmless cases such as read-only files in the data directory, and that's
3582 * not good either.
3583 *
3584 * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3585 * rewriting all changes again during recovery.
3586 *
3587 * Note we assume we're chdir'd into PGDATA to begin with.
3588 */
3589void
3591{
3592 bool xlog_is_symlink;
3593
3594 /* We can skip this whole thing if fsync is disabled. */
3595 if (!enableFsync)
3596 return;
3597
3598 /*
3599 * If pg_wal is a symlink, we'll need to recurse into it separately,
3600 * because the first walkdir below will ignore it.
3601 */
3602 xlog_is_symlink = false;
3603
3604 {
3605 struct stat st;
3606
3607 if (lstat("pg_wal", &st) < 0)
3608 ereport(LOG,
3610 errmsg("could not stat file \"%s\": %m",
3611 "pg_wal")));
3612 else if (S_ISLNK(st.st_mode))
3613 xlog_is_symlink = true;
3614 }
3615
3616#ifdef HAVE_SYNCFS
3618 {
3619 DIR *dir;
3620 struct dirent *de;
3621
3622 /*
3623 * On Linux, we don't have to open every single file one by one. We
3624 * can use syncfs() to sync whole filesystems. We only expect
3625 * filesystem boundaries to exist where we tolerate symlinks, namely
3626 * pg_wal and the tablespaces, so we call syncfs() for each of those
3627 * directories.
3628 */
3629
3630 /* Prepare to report progress syncing the data directory via syncfs. */
3632
3633 /* Sync the top level pgdata directory. */
3634 do_syncfs(".");
3635 /* If any tablespaces are configured, sync each of those. */
3637 while ((de = ReadDirExtended(dir, PG_TBLSPC_DIR, LOG)))
3638 {
3639 char path[MAXPGPATH];
3640
3641 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3642 continue;
3643
3644 snprintf(path, MAXPGPATH, "%s/%s", PG_TBLSPC_DIR, de->d_name);
3645 do_syncfs(path);
3646 }
3647 FreeDir(dir);
3648 /* If pg_wal is a symlink, process that too. */
3649 if (xlog_is_symlink)
3650 do_syncfs("pg_wal");
3651 return;
3652 }
3653#endif /* !HAVE_SYNCFS */
3654
3655#ifdef PG_FLUSH_DATA_WORKS
3656 /* Prepare to report progress of the pre-fsync phase. */
3658
3659 /*
3660 * If possible, hint to the kernel that we're soon going to fsync the data
3661 * directory and its contents. Errors in this step are even less
3662 * interesting than normal, so log them only at DEBUG1.
3663 */
3664 walkdir(".", pre_sync_fname, false, DEBUG1);
3665 if (xlog_is_symlink)
3666 walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3668#endif
3669
3670 /* Prepare to report progress syncing the data directory via fsync. */
3672
3673 /*
3674 * Now we do the fsync()s in the same order.
3675 *
3676 * The main call ignores symlinks, so in addition to specially processing
3677 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3678 * process_symlinks = true. Note that if there are any plain directories
3679 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3680 * so we don't worry about optimizing it.
3681 */
3682 walkdir(".", datadir_fsync_fname, false, LOG);
3683 if (xlog_is_symlink)
3684 walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3686}
3687
3688/*
3689 * walkdir: recursively walk a directory, applying the action to each
3690 * regular file and directory (including the named directory itself).
3691 *
3692 * If process_symlinks is true, the action and recursion are also applied
3693 * to regular files and directories that are pointed to by symlinks in the
3694 * given directory; otherwise symlinks are ignored. Symlinks are always
3695 * ignored in subdirectories, ie we intentionally don't pass down the
3696 * process_symlinks flag to recursive calls.
3697 *
3698 * Errors are reported at level elevel, which might be ERROR or less.
3699 *
3700 * See also walkdir in file_utils.c, which is a frontend version of this
3701 * logic.
3702 */
3703static void
3704walkdir(const char *path,
3705 void (*action) (const char *fname, bool isdir, int elevel),
3706 bool process_symlinks,
3707 int elevel)
3708{
3709 DIR *dir;
3710 struct dirent *de;
3711
3712 dir = AllocateDir(path);
3713
3714 while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3715 {
3716 char subpath[MAXPGPATH * 2];
3717
3719
3720 if (strcmp(de->d_name, ".") == 0 ||
3721 strcmp(de->d_name, "..") == 0)
3722 continue;
3723
3724 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3725
3726 switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3727 {
3728 case PGFILETYPE_REG:
3729 (*action) (subpath, false, elevel);
3730 break;
3731 case PGFILETYPE_DIR:
3732 walkdir(subpath, action, false, elevel);
3733 break;
3734 default:
3735
3736 /*
3737 * Errors are already reported directly by get_dirent_type(),
3738 * and any remaining symlinks and unknown file types are
3739 * ignored.
3740 */
3741 break;
3742 }
3743 }
3744
3745 FreeDir(dir); /* we ignore any error here */
3746
3747 /*
3748 * It's important to fsync the destination directory itself as individual
3749 * file fsyncs don't guarantee that the directory entry for the file is
3750 * synced. However, skip this if AllocateDir failed; the action function
3751 * might not be robust against that.
3752 */
3753 if (dir)
3754 (*action) (path, true, elevel);
3755}
3756
3757
3758/*
3759 * Hint to the OS that it should get ready to fsync() this file.
3760 *
3761 * Ignores errors trying to open unreadable files, and logs other errors at a
3762 * caller-specified level.
3763 */
3764#ifdef PG_FLUSH_DATA_WORKS
3765
3766static void
3767pre_sync_fname(const char *fname, bool isdir, int elevel)
3768{
3769 int fd;
3770
3771 /* Don't try to flush directories, it'll likely just fail */
3772 if (isdir)
3773 return;
3774
3775 ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
3776 fname);
3777
3779
3780 if (fd < 0)
3781 {
3782 if (errno == EACCES)
3783 return;
3784 ereport(elevel,
3786 errmsg("could not open file \"%s\": %m", fname)));
3787 return;
3788 }
3789
3790 /*
3791 * pg_flush_data() ignores errors, which is ok because this is only a
3792 * hint.
3793 */
3794 pg_flush_data(fd, 0, 0);
3795
3796 if (CloseTransientFile(fd) != 0)
3797 ereport(elevel,
3799 errmsg("could not close file \"%s\": %m", fname)));
3800}
3801
3802#endif /* PG_FLUSH_DATA_WORKS */
3803
/*
 * walkdir() action used for the fsync pass over the data directory: emit a
 * startup-progress message naming the current path, then fsync that path.
 */
static void
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
{
	/* Errors about unreadable files are to be ignored silently. */
	const bool	ignore_perm = true;

	ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
							 fname);

	/* The result is not needed; fsync_fname_ext() is passed elevel for reporting. */
	(void) fsync_fname_ext(fname, isdir, ignore_perm, elevel);
}
3816
/*
 * walkdir()-style action that removes the named entry.  Directories are
 * removed with rmdir(), tolerating ENOENT; anything else is handed to
 * PathNameDeleteTemporaryFile().
 */
static void
unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
{
	if (isdir)
	{
		/* An already-missing directory is not an error. */
		if (rmdir(fname) != 0 && errno != ENOENT)
			ereport(elevel,
					(errcode_for_file_access(),
					 errmsg("could not remove directory \"%s\": %m", fname)));
	}
	else
	{
		/* Use PathNameDeleteTemporaryFile to report filesize */
		PathNameDeleteTemporaryFile(fname, false);
	}
}
3833
3834/*
3835 * fsync_fname_ext -- Try to fsync a file or directory
3836 *
3837 * If ignore_perm is true, ignore errors upon trying to open unreadable
3838 * files. Logs other errors at a caller-specified level.
3839 *
3840 * Returns 0 if the operation succeeded, -1 otherwise.
3841 */
3842int
3843fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3844{
3845 int fd;
3846 int flags;
3847 int returncode;
3848
3849 /*
3850 * Some OSs require directories to be opened read-only whereas other
3851 * systems don't allow us to fsync files opened read-only; so we need both
3852 * cases here. Using O_RDWR will cause us to fail to fsync files that are
3853 * not writable by our userid, but we assume that's OK.
3854 */
3855 flags = PG_BINARY;
3856 if (!isdir)
3857 flags |= O_RDWR;
3858 else
3859 flags |= O_RDONLY;
3860
3861 fd = OpenTransientFile(fname, flags);
3862
3863 /*
3864 * Some OSs don't allow us to open directories at all (Windows returns
3865 * EACCES), just ignore the error in that case. If desired also silently
3866 * ignoring errors about unreadable files. Log others.
3867 */
3868 if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3869 return 0;
3870 else if (fd < 0 && ignore_perm && errno == EACCES)
3871 return 0;
3872 else if (fd < 0)
3873 {
3874 ereport(elevel,
3876 errmsg("could not open file \"%s\": %m", fname)));
3877 return -1;
3878 }
3879
3881
3882 /*
3883 * Some OSes don't allow us to fsync directories at all, so we can ignore
3884 * those errors. Anything else needs to be logged.
3885 */
3886 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3887 {
3888 int save_errno;
3889
3890 /* close file upon error, might not be in transaction context */
3891 save_errno = errno;
3893 errno = save_errno;
3894
3895 ereport(elevel,
3897 errmsg("could not fsync file \"%s\": %m", fname)));
3898 return -1;
3899 }
3900
3901 if (CloseTransientFile(fd) != 0)
3902 {
3903 ereport(elevel,
3905 errmsg("could not close file \"%s\": %m", fname)));
3906 return -1;
3907 }
3908
3909 return 0;
3910}
3911
3912/*
3913 * fsync_parent_path -- fsync the parent path of a file or directory
3914 *
3915 * This is aimed at making file operations persistent on disk in case of
3916 * an OS crash or power failure.
3917 */
3918static int
3919fsync_parent_path(const char *fname, int elevel)
3920{
3921 char parentpath[MAXPGPATH];
3922
3923 strlcpy(parentpath, fname, MAXPGPATH);
3925
3926 /*
3927 * get_parent_directory() returns an empty string if the input argument is
3928 * just a file name (see comments in path.c), so handle that as being the
3929 * current directory.
3930 */
3931 if (strlen(parentpath) == 0)
3933
3934 if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3935 return -1;
3936
3937 return 0;
3938}
3939
3940/*
3941 * Create a PostgreSQL data sub-directory
3942 *
3943 * The data directory itself, and most of its sub-directories, are created at
3944 * initdb time, but we do have some occasions when we create directories in
3945 * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3946 * make sure that those directories are created consistently. Today, that means
3947 * making sure that the created directory has the correct permissions, which is
3948 * what pg_dir_create_mode tracks for us.
3949 *
3950 * Note that we also set the umask() based on what we understand the correct
3951 * permissions to be (see file_perm.c).
3952 *
3953 * For permissions other than the default, mkdir() can be used directly, but
3954 * be sure to consider carefully such cases -- a sub-directory with incorrect
3955 * permissions in a PostgreSQL data directory could cause backups and other
3956 * processes to fail.
3957 */
3958int
3959MakePGDirectory(const char *directoryName)
3960{
3962}
3963
3964/*
3965 * Return the passed-in error level, or PANIC if data_sync_retry is off.
3966 *
3967 * Failure to fsync any data file is cause for immediate panic, unless
3968 * data_sync_retry is enabled. Data may have been written to the operating
3969 * system and removed from our buffer pool already, and if we are running on
3970 * an operating system that forgets dirty data on write-back failure, there
3971 * may be only one copy of the data remaining: in the WAL. A later attempt to
3972 * fsync again might falsely report success. Therefore we must not allow any
3973 * further checkpoints to be attempted. data_sync_retry can in theory be
3974 * enabled on systems known not to drop dirty buffered data on write-back
3975 * failure (with the likely outcome that checkpoints will continue to fail
3976 * until the underlying problem is fixed).
3977 *
3978 * Any code that reports a failure from fsync() or related functions should
3979 * filter the error level with this function.
3980 */
3981int
3982data_sync_elevel(int elevel)
3983{
3984 return data_sync_retry ? elevel : PANIC;
3985}
3986
3987bool
3988check_debug_io_direct(char **newval, void **extra, GucSource source)
3989{
3990 bool result = true;
3991 int flags;
3992
3993#if PG_O_DIRECT == 0
3994 if (strcmp(*newval, "") != 0)
3995 {
3996 GUC_check_errdetail("\"%s\" is not supported on this platform.",
3997 "debug_io_direct");
3998 result = false;
3999 }
4000 flags = 0;
4001#else
4002 List *elemlist;
4003 ListCell *l;
4004 char *rawstring;
4005
4006 /* Need a modifiable copy of string */
4008
4009 if (!SplitGUCList(rawstring, ',', &elemlist))
4010 {
4011 GUC_check_errdetail("Invalid list syntax in parameter \"%s\".",
4012 "debug_io_direct");
4015 return false;
4016 }
4017
4018 flags = 0;
4019 foreach(l, elemlist)
4020 {
4021 char *item = (char *) lfirst(l);
4022
4023 if (pg_strcasecmp(item, "data") == 0)
4024 flags |= IO_DIRECT_DATA;
4025 else if (pg_strcasecmp(item, "wal") == 0)
4026 flags |= IO_DIRECT_WAL;
4027 else if (pg_strcasecmp(item, "wal_init") == 0)
4028 flags |= IO_DIRECT_WAL_INIT;
4029 else
4030 {
4031 GUC_check_errdetail("Invalid option \"%s\".", item);
4032 result = false;
4033 break;
4034 }
4035 }
4036
4037 /*
4038 * It's possible to configure block sizes smaller than our assumed I/O
4039 * alignment size, which could result in invalid I/O requests.
4040 */
4041#if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
4042 if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
4043 {
4044 GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small.",
4045 "debug_io_direct", "XLOG_BLCKSZ");
4046 result = false;
4047 }
4048#endif
4049#if BLCKSZ < PG_IO_ALIGN_SIZE
4050 if (result && (flags & IO_DIRECT_DATA))
4051 {
4052 GUC_check_errdetail("\"%s\" is not supported for data because %s is too small.",
4053 "debug_io_direct", "BLCKSZ");
4054 result = false;
4055 }
4056#endif
4057
4060#endif
4061
4062 if (!result)
4063 return result;
4064
4065 /* Save the flags in *extra, for use by assign_debug_io_direct */
4066 *extra = guc_malloc(LOG, sizeof(int));
4067 if (!*extra)
4068 return false;
4069 *((int *) *extra) = flags;
4070
4071 return result;
4072}
4073
4074void
4075assign_debug_io_direct(const char *newval, void *extra)
4076{
4077 int *flags = (int *) extra;
4078
4079 io_direct_flags = *flags;
4080}
4081
4082/* ResourceOwner callbacks */
4083
4084static void
4086{
4087 File file = (File) DatumGetInt32(res);
4088 Vfd *vfdP;
4089
4090 Assert(FileIsValid(file));
4091
4092 vfdP = &VfdCache[file];
4093 vfdP->resowner = NULL;
4094
4095 FileClose(file);
4096}
4097
4098static char *
4100{
4101 return psprintf("File %d", DatumGetInt32(res));
4102}
void pgaio_closing_fd(int fd)
Definition aio.c:1220
void pgaio_io_start_readv(PgAioHandle *ioh, int fd, int iovcnt, uint64 offset)
Definition aio_io.c:78
void begin_startup_progress_phase(void)
Definition startup.c:343
int fdatasync(int fd)
#define Min(x, y)
Definition c.h:1007
uint32 SubTransactionId
Definition c.h:680
#define INT64_FORMAT
Definition c.h:574
#define Assert(condition)
Definition c.h:883
int64_t int64
Definition c.h:553
#define PG_BINARY
Definition c.h:1281
uint64_t uint64
Definition c.h:557
uint32_t uint32
Definition c.h:556
unsigned int Index
Definition c.h:638
#define MemSet(start, val, len)
Definition c.h:1023
#define OidIsValid(objectId)
Definition c.h:798
size_t Size
Definition c.h:629
int closedir(DIR *)
Definition dirent.c:127
struct dirent * readdir(DIR *)
Definition dirent.c:78
DIR * opendir(const char *)
Definition dirent.c:33
int errcode_for_file_access(void)
Definition elog.c:886
int errdetail(const char *fmt,...)
Definition elog.c:1216
int errcode(int sqlerrcode)
Definition elog.c:863
int errmsg(const char *fmt,...)
Definition elog.c:1080
#define LOG
Definition elog.h:31
#define FATAL
Definition elog.h:41
#define WARNING
Definition elog.h:36
#define DEBUG2
Definition elog.h:29
#define PANIC
Definition elog.h:42
#define DEBUG1
Definition elog.h:30
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
int pg_truncate(const char *path, pgoff_t length)
Definition fd.c:717
int max_files_per_process
Definition fd.c:146
int FileGetRawDesc(File file)
Definition fd.c:2512
int MakePGDirectory(const char *directoryName)
Definition fd.c:3959
int FreeDir(DIR *dir)
Definition fd.c:3005
int recovery_init_sync_method
Definition fd.c:165
static const ResourceOwnerDesc file_resowner_desc
Definition fd.c:361
int pg_fsync_no_writethrough(int fd)
Definition fd.c:438
#define FD_MINFREE
Definition fd.c:138
FILE * OpenPipeStream(const char *command, const char *mode)
Definition fd.c:2727
static int numTempTableSpaces
Definition fd.c:289
static bool ReleaseLruFile(void)
Definition fd.c:1366
void FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
Definition fd.c:2119
int io_direct_flags
Definition fd.c:168
#define FD_DELETE_AT_CLOSE
Definition fd.c:192
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition fd.c:1108
static int maxAllocatedDescs
Definition fd.c:268
static void Delete(File file)
Definition fd.c:1250
static int FreeDesc(AllocateDesc *desc)
Definition fd.c:2783
static long tempFileCounter
Definition fd.c:280
static char * ResOwnerPrintFile(Datum res)
Definition fd.c:4099
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition fd.c:779
char * FilePathName(File file)
Definition fd.c:2496
static void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition fd.c:377
static int pg_ftruncate(int fd, pgoff_t length)
Definition fd.c:700
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition fd.c:3137
static int numAllocatedDescs
Definition fd.c:267
File PathNameOpenTemporaryFile(const char *path, int mode)
Definition fd.c:1885
static void LruDelete(File file)
Definition fd.c:1269
int pg_fdatasync(int fd)
Definition fd.c:477
#define FileIsValid(file)
Definition fd.c:186
void assign_debug_io_direct(const char *newval, void *extra)
Definition fd.c:4075
int FileSync(File file, uint32 wait_event_info)
Definition fd.c:2332
int FileStartReadV(PgAioHandle *ioh, File file, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2201
static int nfile
Definition fd.c:222
int CloseTransientFile(int fd)
Definition fd.c:2851
#define DO_DB(A)
Definition fd.c:180
int BasicOpenFile(const char *fileName, int fileFlags)
Definition fd.c:1086
void closeAllVfds(void)
Definition fd.c:3064
int max_safe_fds
Definition fd.c:159
static File AllocateVfd(void)
Definition fd.c:1398
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition fd.c:1845
void PathNameDeleteTemporaryDir(const char *dirname)
Definition fd.c:1675
int ClosePipeStream(FILE *file)
Definition fd.c:3035
void AtEOXact_Files(bool isCommit)
Definition fd.c:3210
int FileGetRawFlags(File file)
Definition fd.c:2528
static Size SizeVfdCache
Definition fd.c:217
static int nextTempTableSpace
Definition fd.c:290
#define FD_CLOSE_AT_EOXACT
Definition fd.c:193
int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition fd.c:3843
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition fd.c:3818
static void ResOwnerReleaseFile(Datum res)
Definition fd.c:4085
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition fd.c:3439
int FreeFile(FILE *file)
Definition fd.c:2823
ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2145
mode_t FileGetRawMode(File file)
Definition fd.c:2538
static AllocateDesc * allocatedDescs
Definition fd.c:269
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition fd.c:2968
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition fd.c:961
int FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2404
static int FileAccess(File file)
Definition fd.c:1476
pgoff_t FileSize(File file)
Definition fd.c:2444
static void FreeVfd(File file)
Definition fd.c:1456
struct vfd Vfd
int pg_fsync_writethrough(int fd)
Definition fd.c:458
void FileClose(File file)
Definition fd.c:1962
void ReleaseExternalFD(void)
Definition fd.c:1221
#define FD_TEMP_FILE_LIMIT
Definition fd.c:194
void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition fd.c:3379
bool pg_file_exists(const char *name)
Definition fd.c:500
void RemovePgTempFiles(void)
Definition fd.c:3319
#define FileIsNotOpen(file)
Definition fd.c:189
bool TempTablespacesAreSet(void)
Definition fd.c:3122
void fsync_fname(const char *fname, bool isdir)
Definition fd.c:753
int data_sync_elevel(int elevel)
Definition fd.c:3982
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition fd.c:1559
static void Insert(File file)
Definition fd.c:1297
AllocateDescKind
Definition fd.c:248
@ AllocateDescDir
Definition fd.c:251
@ AllocateDescPipe
Definition fd.c:250
@ AllocateDescFile
Definition fd.c:249
@ AllocateDescRawFD
Definition fd.c:252
Oid GetNextTempTableSpace(void)
Definition fd.c:3155
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition fd.c:1572
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition fd.c:3805
static void ReportTemporaryFileUsage(const char *path, pgoff_t size)
Definition fd.c:1512
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition fd.c:1788
void pg_flush_data(int fd, pgoff_t offset, pgoff_t nbytes)
Definition fd.c:522
bool AcquireExternalFD(void)
Definition fd.c:1168
static void RegisterTemporaryFile(File file)
Definition fd.c:1531
#define NUM_RESERVED_FDS
Definition fd.c:129
DIR * AllocateDir(const char *dirname)
Definition fd.c:2887
static Oid * tempTableSpaces
Definition fd.c:288
int FileTruncate(File file, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2461
static bool reserveAllocatedDesc(void)
Definition fd.c:2549
void InitFileAccess(void)
Definition fd.c:900
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition fd.c:3467
File OpenTemporaryFile(bool interXact)
Definition fd.c:1708
int durable_unlink(const char *fname, int elevel)
Definition fd.c:869
static uint64 temporary_files_size
Definition fd.c:236
void ReserveExternalFD(void)
Definition fd.c:1203
int FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2359
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition fd.c:2953
bool looks_like_temp_rel_name(const char *name)
Definition fd.c:3495
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition fd.c:1916
void set_max_safe_fds(void)
Definition fd.c:1041
int pg_fsync(int fd)
Definition fd.c:386
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition fd.c:3247
#define VFD_CLOSED
Definition fd.c:184
static bool have_xact_temporary_files
Definition fd.c:228
static int LruInsert(File file)
Definition fd.c:1319
static int numExternalFDs
Definition fd.c:274
static int fsync_parent_path(const char *fname, int elevel)
Definition fd.c:3919
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition fd.c:1644
FILE * AllocateFile(const char *name, const char *mode)
Definition fd.c:2624
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition fd.c:3177
int OpenTransientFile(const char *fileName, int fileFlags)
Definition fd.c:2674
void InitTemporaryFileAccess(void)
Definition fd.c:930
static Vfd * VfdCache
Definition fd.c:216
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition fd.c:2683
bool data_sync_retry
Definition fd.c:162
int FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2063
ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2227
static void ReleaseLruFiles(void)
Definition fd.c:1388
void SyncDataDirectory(void)
Definition fd.c:3590
bool check_debug_io_direct(char **newval, void **extra, GucSource source)
Definition fd.c:3988
static void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition fd.c:372
static void BeforeShmemExit_Files(int code, Datum arg)
Definition fd.c:3224
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition fd.c:3704
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition fd.c:3093
void TempTablespacePath(char *path, Oid tablespace)
Definition fd.c:1763
#define IO_DIRECT_WAL
Definition fd.h:55
#define IO_DIRECT_DATA
Definition fd.h:54
#define IO_DIRECT_WAL_INIT
Definition fd.h:56
int File
Definition fd.h:51
#define PG_O_DIRECT
Definition fd.h:112
int pg_file_create_mode
Definition file_perm.c:19
int pg_dir_create_mode
Definition file_perm.c:18
ssize_t pg_pwrite_zeros(int fd, size_t size, pgoff_t offset)
Definition file_utils.c:709
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition file_utils.c:547
#define PG_TEMP_FILES_DIR
Definition file_utils.h:63
#define PG_TEMP_FILE_PREFIX
Definition file_utils.h:64
PGFileType
Definition file_utils.h:19
@ PGFILETYPE_DIR
Definition file_utils.h:23
@ PGFILETYPE_REG
Definition file_utils.h:22
@ PGFILETYPE_ERROR
Definition file_utils.h:20
@ DATA_DIR_SYNC_METHOD_SYNCFS
Definition file_utils.h:30
int MyProcPid
Definition globals.c:47
bool enableFsync
Definition globals.c:129
Oid MyDatabaseTableSpace
Definition globals.c:96
void * guc_malloc(int elevel, size_t size)
Definition guc.c:636
#define newval
#define GUC_check_errdetail
Definition guc.h:505
GucSource
Definition guc.h:112
int temp_file_limit
Definition guc_tables.c:551
int log_temp_files
Definition guc_tables.c:546
#define close(a)
Definition win32.h:12
void before_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition ipc.c:344
return true
Definition isn.c:130
int j
Definition isn.c:78
int i
Definition isn.c:77
void list_free(List *list)
Definition list.c:1546
Datum subpath(PG_FUNCTION_ARGS)
Definition ltree_op.c:311
char * pstrdup(const char *in)
Definition mcxt.c:1781
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
#define MAP_FAILED
Definition mem.h:45
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:123
void * arg
static char * basedir
static PgChecksumMode mode
#define MAXPGPATH
static ssize_t pg_preadv(int fd, const struct iovec *iov, int iovcnt, pgoff_t offset)
Definition pg_iovec.h:54
static ssize_t pg_pwritev(int fd, const struct iovec *iov, int iovcnt, pgoff_t offset)
Definition pg_iovec.h:93
#define lfirst(lc)
Definition pg_list.h:172
uint64 pg_prng_uint64_range(pg_prng_state *state, uint64 rmin, uint64 rmax)
Definition pg_prng.c:144
pg_prng_state pg_global_prng_state
Definition pg_prng.c:34
static rewind_source * source
Definition pg_rewind.c:89
static char buf[DEFAULT_XLOG_SEG_SIZE]
static char * tablespace
Definition pgbench.c:217
void pgstat_report_tempfile(size_t filesize)
#define pqsignal
Definition port.h:547
int pg_strcasecmp(const char *s1, const char *s2)
void get_parent_directory(char *path)
Definition path.c:1068
#define snprintf
Definition port.h:260
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition strlcpy.c:45
off_t pgoff_t
Definition port.h:421
uint64_t Datum
Definition postgres.h:70
static Datum Int32GetDatum(int32 X)
Definition postgres.h:222
static int32 DatumGetInt32(Datum X)
Definition postgres.h:212
#define InvalidOid
unsigned int Oid
static int fd(const char *x, int i)
static int fb(int x)
char * psprintf(const char *fmt,...)
Definition psprintf.c:43
int forkname_chars(const char *str, ForkNumber *fork)
Definition relpath.c:81
#define PG_TBLSPC_DIR
Definition relpath.h:41
#define TABLESPACE_VERSION_DIRECTORY
Definition relpath.h:33
ResourceOwner CurrentResourceOwner
Definition resowner.c:173
void ResourceOwnerForget(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition resowner.c:561
void ResourceOwnerRemember(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition resowner.c:521
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition resowner.c:449
@ RESOURCE_RELEASE_AFTER_LOCKS
Definition resowner.h:56
#define RELEASE_PRIO_FILES
Definition resowner.h:76
void pg_usleep(long microsec)
Definition signal.c:53
#define realloc(a, b)
#define free(a)
#define malloc(a)
static void error(void)
#define ereport_startup_progress(msg,...)
Definition startup.h:18
SubTransactionId create_subid
Definition fd.c:258
DIR * dir
Definition fd.c:262
FILE * file
Definition fd.c:261
int fd
Definition fd.c:263
union AllocateDesc::@20 desc
AllocateDescKind kind
Definition fd.c:257
Definition dirent.c:26
Definition pg_list.h:54
const char * name
Definition resowner.h:93
Definition fd.c:197
int fd
Definition fd.c:198
int fileFlags
Definition fd.c:207
File lruLessRecently
Definition fd.c:203
File lruMoreRecently
Definition fd.c:202
pgoff_t fileSize
Definition fd.c:204
char * fileName
Definition fd.c:205
ResourceOwner resowner
Definition fd.c:200
unsigned short fdstate
Definition fd.c:199
File nextFree
Definition fd.c:201
mode_t fileMode
Definition fd.c:208
bool SplitGUCList(char *rawstring, char separator, List **namelist)
Definition varlena.c:2978
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:69
static void pgstat_report_wait_end(void)
Definition wait_event.h:85
const char * type
const char * name
#define fsync(fd)
Definition win32_port.h:83
#define stat
Definition win32_port.h:74
#define EINTR
Definition win32_port.h:361
#define EOPNOTSUPP
Definition win32_port.h:385
#define SIGPIPE
Definition win32_port.h:163
#define lstat(path, sb)
Definition win32_port.h:275
#define S_ISDIR(m)
Definition win32_port.h:315
void _dosmaperr(unsigned long)
Definition win32error.c:177
#define S_ISLNK(m)
Definition win32_port.h:334
#define mkdir(a, b)
Definition win32_port.h:80
#define fstat
Definition win32_port.h:73
#define O_CLOEXEC
Definition win32_port.h:344
SubTransactionId GetCurrentSubTransactionId(void)
Definition xact.c:792
int wal_sync_method
Definition xlog.c:133
@ WAL_SYNC_METHOD_FSYNC_WRITETHROUGH
Definition xlog.h:28
static const char * directory
Definition zic.c:648

◆ FD_CLOSE_AT_EOXACT

#define FD_CLOSE_AT_EOXACT   (1 << 1) /* T = close at eoXact */

Definition at line 193 of file fd.c.

◆ FD_DELETE_AT_CLOSE

#define FD_DELETE_AT_CLOSE   (1 << 0) /* T = delete when closed */

Definition at line 192 of file fd.c.

◆ FD_MINFREE

#define FD_MINFREE   48

Definition at line 138 of file fd.c.

◆ FD_TEMP_FILE_LIMIT

#define FD_TEMP_FILE_LIMIT   (1 << 2) /* T = respect temp_file_limit */

Definition at line 194 of file fd.c.

◆ FileIsNotOpen

#define FileIsNotOpen (   file)    (VfdCache[file].fd == VFD_CLOSED)

Definition at line 189 of file fd.c.

◆ FileIsValid

#define FileIsValid (   file)     ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)

Definition at line 186 of file fd.c.

◆ NUM_RESERVED_FDS

#define NUM_RESERVED_FDS   10

Definition at line 129 of file fd.c.

◆ VFD_CLOSED

#define VFD_CLOSED   (-1)

Definition at line 184 of file fd.c.

Typedef Documentation

◆ Vfd

Enumeration Type Documentation

◆ AllocateDescKind

Enumerator
AllocateDescFile 
AllocateDescPipe 
AllocateDescDir 
AllocateDescRawFD 

Definition at line 247 of file fd.c.

Function Documentation

◆ AcquireExternalFD()

bool AcquireExternalFD ( void  )

Definition at line 1168 of file fd.c.

1169{
1170 /*
1171 * We don't want more than max_safe_fds / 3 FDs to be consumed for
1172 * "external" FDs.
1173 */
1174 if (numExternalFDs < max_safe_fds / 3)
1175 {
1176 ReserveExternalFD();
1177 return true;
1178 }
1179 errno = EMFILE;
1180 return false;
1181}

References fb(), max_safe_fds, numExternalFDs, and ReserveExternalFD().

Referenced by CreateWaitEventSet(), and libpqsrv_connect_prepare().

◆ AllocateDir()

DIR * AllocateDir ( const char dirname)

Definition at line 2887 of file fd.c.

2888{
2889 DIR *dir;
2890
2891 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2892 numAllocatedDescs, dirname));
2893
2894 /* Can we allocate another non-virtual FD? */
2895 if (!reserveAllocatedDesc())
2896 ereport(ERROR,
2897 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2898 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2899 maxAllocatedDescs, dirname)));
2900
2901 /* Close excess kernel FDs. */
2902 ReleaseLruFiles();
2903
2904TryAgain:
2905 if ((dir = opendir(dirname)) != NULL)
2906 {
2907 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2908
2909 desc->kind = AllocateDescDir;
2910 desc->desc.dir = dir;
2911 desc->create_subid = GetCurrentSubTransactionId();
2912 numAllocatedDescs++;
2913 return desc->desc.dir;
2914 }
2915
2916 if (errno == EMFILE || errno == ENFILE)
2917 {
2918 int save_errno = errno;
2919
2920 ereport(LOG,
2921 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2922 errmsg("out of file descriptors: %m; release and retry")));
2923 errno = 0;
2924 if (ReleaseLruFile())
2925 goto TryAgain;
2926 errno = save_errno;
2927 }
2928
2929 return NULL;
2930}

References allocatedDescs, AllocateDescDir, AllocateDesc::create_subid, AllocateDesc::desc, AllocateDesc::dir, DO_DB, elog, ereport, errcode(), errmsg(), ERROR, fb(), GetCurrentSubTransactionId(), AllocateDesc::kind, LOG, maxAllocatedDescs, numAllocatedDescs, opendir(), ReleaseLruFile(), ReleaseLruFiles(), and reserveAllocatedDesc().

Referenced by calculate_database_size(), calculate_tablespace_size(), CheckPointLogicalRewriteHeap(), CheckPointSnapBuild(), CheckTablespaceDirectory(), CleanupBackupHistory(), copydir(), db_dir_size(), DeleteAllExportedSnapshotFiles(), destroy_tablespace_directories(), directory_is_empty(), do_pg_backup_start(), dsm_cleanup_for_mmap(), extension_file_exists(), get_ext_ver_list(), GetConfFilesInDir(), getInstallationPaths(), GetWalSummaries(), movedb(), ParseTzFile(), perform_base_backup(), pg_available_extension_versions(), pg_available_extensions(), pg_ls_dir(), pg_ls_dir_files(), pg_tablespace_databases(), pg_tzenumerate_next(), pg_tzenumerate_start(), pgarch_readyXlog(), RelationCacheInitFileRemove(), RelationCacheInitFileRemoveInDir(), RemoveNonParentXlogFiles(), RemoveOldXlogFiles(), RemovePgTempFiles(), RemovePgTempFilesInDir(), RemovePgTempRelationFiles(), RemovePgTempRelationFilesInDbspace(), RemoveTempXlogFiles(), ReorderBufferCleanupSerializedTXNs(), ResetUnloggedRelations(), ResetUnloggedRelationsInDbspaceDir(), ResetUnloggedRelationsInTablespaceDir(), restoreTwoPhaseData(), scan_directory_ci(), sendDir(), SlruScanDirectory(), StartupReorderBuffer(), StartupReplicationSlots(), SyncDataDirectory(), UpdateLogicalMappings(), walkdir(), and XLogGetOldestSegno().

◆ AllocateFile()

FILE * AllocateFile ( const char * name,
const char * mode 
)

Definition at line 2624 of file fd.c.

2625{
2626 FILE *file;
2627
2628 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2630
2631 /* Can we allocate another non-virtual FD? */
2632 if (!reserveAllocatedDesc())
2633 ereport(ERROR,
2635 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2637
2638 /* Close excess kernel FDs. */
2640
2641TryAgain:
2642 if ((file = fopen(name, mode)) != NULL)
2643 {
2645
2646 desc->kind = AllocateDescFile;
2647 desc->desc.file = file;
2650 return desc->desc.file;
2651 }
2652
2653 if (errno == EMFILE || errno == ENFILE)
2654 {
2655 int save_errno = errno;
2656
2657 ereport(LOG,
2659 errmsg("out of file descriptors: %m; release and retry")));
2660 errno = 0;
2661 if (ReleaseLruFile())
2662 goto TryAgain;
2663 errno = save_errno;
2664 }
2665
2666 return NULL;
2667}

References allocatedDescs, AllocateDescFile, AllocateDesc::create_subid, AllocateDesc::desc, DO_DB, elog, ereport, errcode(), errmsg(), ERROR, fb(), AllocateDesc::file, GetCurrentSubTransactionId(), AllocateDesc::kind, LOG, maxAllocatedDescs, mode, name, numAllocatedDescs, ReleaseLruFile(), ReleaseLruFiles(), and reserveAllocatedDesc().

Referenced by AlterSystemSetConfigFile(), apw_dump_now(), apw_load_buffers(), BeginCopyFrom(), BeginCopyTo(), checkControlFile(), do_pg_backup_stop(), entry_reset(), existsTimeLineHistory(), ExportSnapshot(), gc_qtexts(), GetHugePageSize(), ImportSnapshot(), load_dh_file(), load_relcache_init_file(), open_auth_file(), parse_extension_control_file(), ParseConfigFile(), ParseTzFile(), pg_current_logfile(), pg_promote(), pgss_shmem_shutdown(), pgss_shmem_startup(), pgstat_read_statsfile(), pgstat_write_statsfile(), read_backup_label(), read_binary_file(), read_tablespace_map(), read_whole_file(), readTimeLineHistory(), test_custom_stats_var_from_serialized_data(), test_custom_stats_var_to_serialized_data(), tsearch_readline_begin(), ValidatePgVersion(), write_relcache_init_file(), XLogArchiveForceDone(), and XLogArchiveNotify().

◆ AllocateVfd()

static File AllocateVfd ( void  )
static

Definition at line 1398 of file fd.c.

1399{
1400 Index i;
1401 File file;
1402
1403 DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1404
1405 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1406
1407 if (VfdCache[0].nextFree == 0)
1408 {
1409 /*
1410 * The free list is empty so it is time to increase the size of the
1411 * array. We choose to double it each time this happens. However,
1412 * there's not much point in starting *real* small.
1413 */
1416
1417 if (newCacheSize < 32)
1418 newCacheSize = 32;
1419
1420 /*
1421 * Be careful not to clobber VfdCache ptr if realloc fails.
1422 */
1423 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1424 if (newVfdCache == NULL)
1425 ereport(ERROR,
1427 errmsg("out of memory")));
1429
1430 /*
1431 * Initialize the new entries and link them into the free list.
1432 */
1433 for (i = SizeVfdCache; i < newCacheSize; i++)
1434 {
1435 MemSet(&(VfdCache[i]), 0, sizeof(Vfd));
1436 VfdCache[i].nextFree = i + 1;
1438 }
1441
1442 /*
1443 * Record the new size
1444 */
1446 }
1447
1448 file = VfdCache[0].nextFree;
1449
1451
1452 return file;
1453}

References Assert, DO_DB, elog, ereport, errcode(), errmsg(), ERROR, fb(), vfd::fd, i, LOG, MemSet, vfd::nextFree, realloc, SizeVfdCache, VFD_CLOSED, and VfdCache.

Referenced by PathNameOpenFilePerm().

◆ assign_debug_io_direct()

void assign_debug_io_direct ( const char * newval,
void * extra 
)

Definition at line 4075 of file fd.c.

4076{
4077 int *flags = (int *) extra;
4078
4079 io_direct_flags = *flags;
4080}

References io_direct_flags.

◆ AtEOSubXact_Files()

void AtEOSubXact_Files ( bool  isCommit,
SubTransactionId  mySubid,
SubTransactionId  parentSubid 
)

Definition at line 3177 of file fd.c.

3179{
3180 Index i;
3181
3182 for (i = 0; i < numAllocatedDescs; i++)
3183 {
3184 if (allocatedDescs[i].create_subid == mySubid)
3185 {
3186 if (isCommit)
3188 else
3189 {
3190 /* have to recheck the item after FreeDesc (ugly) */
3192 }
3193 }
3194 }
3195}

References allocatedDescs, AllocateDesc::create_subid, fb(), FreeDesc(), i, and numAllocatedDescs.

Referenced by AbortSubTransaction(), and CommitSubTransaction().

◆ AtEOXact_Files()

◆ BasicOpenFile()

int BasicOpenFile ( const char * fileName,
int  fileFlags 
)

◆ BasicOpenFilePerm()

int BasicOpenFilePerm ( const char * fileName,
int  fileFlags,
mode_t  fileMode 
)

Definition at line 1108 of file fd.c.

1109{
1110 int fd;
1111
1112tryAgain:
1113#ifdef PG_O_DIRECT_USE_F_NOCACHE
1114 fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1115#else
1116 fd = open(fileName, fileFlags, fileMode);
1117#endif
1118
1119 if (fd >= 0)
1120 {
1121#ifdef PG_O_DIRECT_USE_F_NOCACHE
1122 if (fileFlags & PG_O_DIRECT)
1123 {
1124 if (fcntl(fd, F_NOCACHE, 1) < 0)
1125 {
1126 int save_errno = errno;
1127
1128 close(fd);
1129 errno = save_errno;
1130 return -1;
1131 }
1132 }
1133#endif
1134
1135 return fd; /* success! */
1136 }
1137
1138 if (errno == EMFILE || errno == ENFILE)
1139 {
1140 int save_errno = errno;
1141
1142 ereport(LOG,
1144 errmsg("out of file descriptors: %m; release and retry")));
1145 errno = 0;
1146 if (ReleaseLruFile())
1147 goto tryAgain;
1148 errno = save_errno;
1149 }
1150
1151 return -1; /* failure */
1152}

References close, ereport, errcode(), errmsg(), fb(), fd(), LOG, PG_O_DIRECT, and ReleaseLruFile().

Referenced by BasicOpenFile(), LruInsert(), OpenTransientFilePerm(), PathNameOpenFilePerm(), and readRecoverySignalFile().

◆ BeforeShmemExit_Files()

static void BeforeShmemExit_Files ( int  code,
Datum  arg 
)
static

Definition at line 3224 of file fd.c.

3225{
3226 CleanupTempFiles(false, true);
3227
3228 /* prevent further temp files from being created */
3229#ifdef USE_ASSERT_CHECKING
3231#endif
3232}

References CleanupTempFiles(), and fb().

Referenced by InitTemporaryFileAccess().

◆ check_debug_io_direct()

bool check_debug_io_direct ( char **  newval,
void **  extra,
GucSource  source 
)

Definition at line 3988 of file fd.c.

3989{
3990 bool result = true;
3991 int flags;
3992
3993#if PG_O_DIRECT == 0
3994 if (strcmp(*newval, "") != 0)
3995 {
3996 GUC_check_errdetail("\"%s\" is not supported on this platform.",
3997 "debug_io_direct");
3998 result = false;
3999 }
4000 flags = 0;
4001#else
4002 List *elemlist;
4003 ListCell *l;
4004 char *rawstring;
4005
4006 /* Need a modifiable copy of string */
4008
4009 if (!SplitGUCList(rawstring, ',', &elemlist))
4010 {
4011 GUC_check_errdetail("Invalid list syntax in parameter \"%s\".",
4012 "debug_io_direct");
4015 return false;
4016 }
4017
4018 flags = 0;
4019 foreach(l, elemlist)
4020 {
4021 char *item = (char *) lfirst(l);
4022
4023 if (pg_strcasecmp(item, "data") == 0)
4024 flags |= IO_DIRECT_DATA;
4025 else if (pg_strcasecmp(item, "wal") == 0)
4026 flags |= IO_DIRECT_WAL;
4027 else if (pg_strcasecmp(item, "wal_init") == 0)
4028 flags |= IO_DIRECT_WAL_INIT;
4029 else
4030 {
4031 GUC_check_errdetail("Invalid option \"%s\".", item);
4032 result = false;
4033 break;
4034 }
4035 }
4036
4037 /*
4038 * It's possible to configure block sizes smaller than our assumed I/O
4039 * alignment size, which could result in invalid I/O requests.
4040 */
4041#if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
4042 if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
4043 {
4044 GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small.",
4045 "debug_io_direct", "XLOG_BLCKSZ");
4046 result = false;
4047 }
4048#endif
4049#if BLCKSZ < PG_IO_ALIGN_SIZE
4050 if (result && (flags & IO_DIRECT_DATA))
4051 {
4052 GUC_check_errdetail("\"%s\" is not supported for data because %s is too small.",
4053 "debug_io_direct", "BLCKSZ");
4054 result = false;
4055 }
4056#endif
4057
4060#endif
4061
4062 if (!result)
4063 return result;
4064
4065 /* Save the flags in *extra, for use by assign_debug_io_direct */
4066 *extra = guc_malloc(LOG, sizeof(int));
4067 if (!*extra)
4068 return false;
4069 *((int *) *extra) = flags;
4070
4071 return result;
4072}

References fb(), GUC_check_errdetail, guc_malloc(), IO_DIRECT_DATA, IO_DIRECT_WAL, IO_DIRECT_WAL_INIT, lfirst, list_free(), LOG, newval, pfree(), pg_strcasecmp(), pstrdup(), and SplitGUCList().

◆ CleanupTempFiles()

static void CleanupTempFiles ( bool  isCommit,
bool  isProcExit 
)
static

Definition at line 3247 of file fd.c.

3248{
3249 Index i;
3250
3251 /*
3252 * Careful here: at proc_exit we need extra cleanup, not just
3253 * xact_temporary files.
3254 */
3256 {
3257 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3258 for (i = 1; i < SizeVfdCache; i++)
3259 {
3260 unsigned short fdstate = VfdCache[i].fdstate;
3261
3262 if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3263 VfdCache[i].fileName != NULL)
3264 {
3265 /*
3266 * If we're in the process of exiting a backend process, close
3267 * all temporary files. Otherwise, only close temporary files
3268 * local to the current transaction. They should be closed by
3269 * the ResourceOwner mechanism already, so this is just a
3270 * debugging cross-check.
3271 */
3272 if (isProcExit)
3273 FileClose(i);
3274 else if (fdstate & FD_CLOSE_AT_EOXACT)
3275 {
3276 elog(WARNING,
3277 "temporary file %s not closed at end-of-transaction",
3278 VfdCache[i].fileName);
3279 FileClose(i);
3280 }
3281 }
3282 }
3283
3285 }
3286
3287 /* Complain if any allocated files remain open at commit. */
3288 if (isCommit && numAllocatedDescs > 0)
3289 elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3291
3292 /* Clean up "allocated" stdio files, dirs and fds. */
3293 while (numAllocatedDescs > 0)
3295}

References allocatedDescs, Assert, elog, fb(), FD_CLOSE_AT_EOXACT, FD_DELETE_AT_CLOSE, vfd::fdstate, FileClose(), FileIsNotOpen, FreeDesc(), have_xact_temporary_files, i, numAllocatedDescs, SizeVfdCache, VfdCache, and WARNING.

Referenced by AtEOXact_Files(), and BeforeShmemExit_Files().

◆ closeAllVfds()

void closeAllVfds ( void  )

Definition at line 3064 of file fd.c.

3065{
3066 Index i;
3067
3068 if (SizeVfdCache > 0)
3069 {
3070 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3071 for (i = 1; i < SizeVfdCache; i++)
3072 {
3073 if (!FileIsNotOpen(i))
3074 LruDelete(i);
3075 }
3076 }
3077}

References Assert, FileIsNotOpen, i, LruDelete(), and SizeVfdCache.

Referenced by standard_ProcessUtility().

◆ ClosePipeStream()

int ClosePipeStream ( FILE * file)

Definition at line 3035 of file fd.c.

3036{
3037 int i;
3038
3039 DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
3040
3041 /* Remove file from list of allocated files, if it's present */
3042 for (i = numAllocatedDescs; --i >= 0;)
3043 {
3044 AllocateDesc *desc = &allocatedDescs[i];
3045
3046 if (desc->kind == AllocateDescPipe && desc->desc.file == file)
3047 return FreeDesc(desc);
3048 }
3049
3050 /* Only get here if someone passes us a file not in allocatedDescs */
3051 elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
3052
3053 return pclose(file);
3054}

References allocatedDescs, AllocateDescPipe, AllocateDesc::desc, DO_DB, elog, fb(), AllocateDesc::file, FreeDesc(), i, AllocateDesc::kind, LOG, numAllocatedDescs, and WARNING.

Referenced by ClosePipeFromProgram(), ClosePipeToProgram(), pg_import_system_collations(), run_ssl_passphrase_command(), and shell_finish_command().

◆ CloseTransientFile()

int CloseTransientFile ( int  fd)

Definition at line 2851 of file fd.c.

2852{
2853 int i;
2854
2855 DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2856
2857 /* Remove fd from list of allocated files, if it's present */
2858 for (i = numAllocatedDescs; --i >= 0;)
2859 {
2860 AllocateDesc *desc = &allocatedDescs[i];
2861
2862 if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2863 return FreeDesc(desc);
2864 }
2865
2866 /* Only get here if someone passes us a file not in allocatedDescs */
2867 elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2868
2870
2871 return close(fd);
2872}

References allocatedDescs, AllocateDescRawFD, close, AllocateDesc::desc, DO_DB, elog, AllocateDesc::fd, fd(), FreeDesc(), i, AllocateDesc::kind, LOG, numAllocatedDescs, pgaio_closing_fd(), and WARNING.

Referenced by ApplyLogicalMappingFile(), be_lo_export(), CheckPointLogicalRewriteHeap(), CheckPointReplicationOrigin(), clone_file(), compare_files(), copy_file(), CreateDirAndVersionFile(), dsm_impl_mmap(), durable_rename(), fsync_fname_ext(), get_controlfile_by_exact_path(), heap_xlog_logical_rewrite(), lo_import_internal(), perform_base_backup(), pg_truncate(), qtext_load_file(), qtext_store(), read_relmap_file(), ReadTwoPhaseFile(), RecreateTwoPhaseFile(), ReorderBufferSerializeChange(), ReorderBufferSerializeTXN(), RestoreSlotFromDisk(), SaveSlotToPath(), sendFile(), SendTimeLineHistory(), SimpleLruDoesPhysicalPageExist(), SimpleLruWriteAll(), SlruInternalWritePage(), SlruPhysicalReadPage(), SlruPhysicalWritePage(), SlruSyncFileTag(), SnapBuildRestoreContents(), SnapBuildRestoreSnapshot(), SnapBuildSerialize(), StartupReplicationOrigin(), write_relmap_file(), writeTimeLineHistory(), writeTimeLineHistoryFile(), and XLogFileCopy().

◆ count_usable_fds()

static void count_usable_fds ( int  max_to_probe,
int * usable_fds,
int * already_open 
)
static

Definition at line 961 of file fd.c.

962{
963 int *fd;
964 int size;
965 int used = 0;
966 int highestfd = 0;
967 int j;
968
969#ifdef HAVE_GETRLIMIT
970 struct rlimit rlim;
972#endif
973
974 size = 1024;
975 fd = (int *) palloc(size * sizeof(int));
976
977#ifdef HAVE_GETRLIMIT
979 if (getrlimit_status != 0)
980 ereport(WARNING, (errmsg("getrlimit failed: %m")));
981#endif /* HAVE_GETRLIMIT */
982
983 /* dup until failure or probe limit reached */
984 for (;;)
985 {
986 int thisfd;
987
988#ifdef HAVE_GETRLIMIT
989
990 /*
991 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
992 * some platforms
993 */
994 if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
995 break;
996#endif
997
998 thisfd = dup(2);
999 if (thisfd < 0)
1000 {
1001 /* Expect EMFILE or ENFILE, else it's fishy */
1002 if (errno != EMFILE && errno != ENFILE)
1003 elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
1004 break;
1005 }
1006
1007 if (used >= size)
1008 {
1009 size *= 2;
1010 fd = (int *) repalloc(fd, size * sizeof(int));
1011 }
1012 fd[used++] = thisfd;
1013
1014 if (highestfd < thisfd)
1015 highestfd = thisfd;
1016
1017 if (used >= max_to_probe)
1018 break;
1019 }
1020
1021 /* release the files we opened */
1022 for (j = 0; j < used; j++)
1023 close(fd[j]);
1024
1025 pfree(fd);
1026
1027 /*
1028 * Return results. usable_fds is just the number of successful dups. We
1029 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
1030 * number) and so already_open is highestfd+1 - usable_fds.
1031 */
1032 *usable_fds = used;
1033 *already_open = highestfd + 1 - used;
1034}

References close, elog, ereport, errmsg(), fb(), fd(), j, palloc(), pfree(), repalloc(), and WARNING.

Referenced by set_max_safe_fds().

◆ data_sync_elevel()

◆ datadir_fsync_fname()

static void datadir_fsync_fname ( const char * fname,
bool  isdir,
int  elevel 
)
static

Definition at line 3805 of file fd.c.

3806{
3807 ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
3808 fname);
3809
3810 /*
3811 * We want to silently ignore errors about unreadable files. Pass that
3812 * desire on to fsync_fname_ext().
3813 */
3814 fsync_fname_ext(fname, isdir, true, elevel);
3815}

References ereport_startup_progress, fb(), and fsync_fname_ext().

Referenced by SyncDataDirectory().

◆ Delete()

static void Delete ( File  file)
static

Definition at line 1250 of file fd.c.

1251{
1252 Vfd *vfdP;
1253
1254 Assert(file != 0);
1255
1256 DO_DB(elog(LOG, "Delete %d (%s)",
1257 file, VfdCache[file].fileName));
1258 DO_DB(_dump_lru());
1259
1260 vfdP = &VfdCache[file];
1261
1262 VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1263 VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1264
1265 DO_DB(_dump_lru());
1266}

References Assert, DO_DB, elog, fb(), LOG, vfd::lruLessRecently, vfd::lruMoreRecently, and VfdCache.

Referenced by FileAccess(), FileClose(), and LruDelete().

◆ durable_rename()

int durable_rename ( const char * oldfile,
const char * newfile,
int  elevel 
)

Definition at line 779 of file fd.c.

780{
781 int fd;
782
783 /*
784 * First fsync the old and target path (if it exists), to ensure that they
785 * are properly persistent on disk. Syncing the target file is not
786 * strictly necessary, but it makes it easier to reason about crashes;
787 * because it's then guaranteed that either source or target file exists
788 * after a crash.
789 */
790 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
791 return -1;
792
794 if (fd < 0)
795 {
796 if (errno != ENOENT)
797 {
798 ereport(elevel,
800 errmsg("could not open file \"%s\": %m", newfile)));
801 return -1;
802 }
803 }
804 else
805 {
806 if (pg_fsync(fd) != 0)
807 {
808 int save_errno;
809
810 /* close file upon error, might not be in transaction context */
814
815 ereport(elevel,
817 errmsg("could not fsync file \"%s\": %m", newfile)));
818 return -1;
819 }
820
821 if (CloseTransientFile(fd) != 0)
822 {
823 ereport(elevel,
825 errmsg("could not close file \"%s\": %m", newfile)));
826 return -1;
827 }
828 }
829
830 /* Time to do the real deal... */
831 if (rename(oldfile, newfile) < 0)
832 {
833 ereport(elevel,
835 errmsg("could not rename file \"%s\" to \"%s\": %m",
836 oldfile, newfile)));
837 return -1;
838 }
839
840 /*
841 * To guarantee renaming the file is persistent, fsync the file with its
842 * new name, and its containing directory.
843 */
844 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
845 return -1;
846
847 if (fsync_parent_path(newfile, elevel) != 0)
848 return -1;
849
850 return 0;
851}

References CloseTransientFile(), ereport, errcode_for_file_access(), errmsg(), fb(), fd(), fsync_fname_ext(), fsync_parent_path(), OpenTransientFile(), PG_BINARY, and pg_fsync().

Referenced by AlterSystemSetConfigFile(), apw_dump_now(), BaseBackup(), basic_archive_file(), bbsink_server_end_manifest(), CheckPointReplicationOrigin(), cleanup_objects_atexit(), CleanupAfterArchiveRecovery(), dir_close(), InitWalRecovery(), InstallXLogFileSegment(), KeepFileRestoredFromArchive(), pgss_shmem_shutdown(), pgstat_write_statsfile(), StartupXLOG(), SummarizeWAL(), write_relmap_file(), writeTimeLineHistory(), writeTimeLineHistoryFile(), and XLogArchiveForceDone().

◆ durable_unlink()

int durable_unlink ( const char * fname,
int  elevel 
)

Definition at line 869 of file fd.c.

870{
871 if (unlink(fname) < 0)
872 {
873 ereport(elevel,
875 errmsg("could not remove file \"%s\": %m",
876 fname)));
877 return -1;
878 }
879
880 /*
881 * To guarantee that the removal of the file is persistent, fsync its
882 * parent directory.
883 */
884 if (fsync_parent_path(fname, elevel) != 0)
885 return -1;
886
887 return 0;
888}

References ereport, errcode_for_file_access(), errmsg(), fb(), and fsync_parent_path().

Referenced by InstallXLogFileSegment(), RemoveXlogFile(), and StartupXLOG().

◆ FileAccess()

static int FileAccess ( File  file)
static

Definition at line 1476 of file fd.c.

1477{
1478 int returnValue;
1479
1480 DO_DB(elog(LOG, "FileAccess %d (%s)",
1481 file, VfdCache[file].fileName));
1482
1483 /*
1484 * Is the file open? If not, open it and put it at the head of the LRU
1485 * ring (possibly closing the least recently used file to get an FD).
1486 */
1487
1488 if (FileIsNotOpen(file))
1489 {
1490 returnValue = LruInsert(file);
1491 if (returnValue != 0)
1492 return returnValue;
1493 }
1494 else if (VfdCache[0].lruLessRecently != file)
1495 {
1496 /*
1497 * We now know that the file is open and that it is not the last one
1498 * accessed, so we need to move it to the head of the Lru ring.
1499 */
1500
1501 Delete(file);
1502 Insert(file);
1503 }
1504
1505 return 0;
1506}

References Delete(), DO_DB, elog, fb(), FileIsNotOpen, Insert(), LOG, LruInsert(), and VfdCache.

Referenced by FileFallocate(), FileGetRawDesc(), FilePrefetch(), FileReadV(), FileSize(), FileStartReadV(), FileSync(), FileTruncate(), FileWriteback(), FileWriteV(), and FileZero().

◆ FileClose()

void FileClose ( File  file)

Definition at line 1962 of file fd.c.

1963{
1964 Vfd *vfdP;
1965
1966 Assert(FileIsValid(file));
1967
1968 DO_DB(elog(LOG, "FileClose: %d (%s)",
1969 file, VfdCache[file].fileName));
1970
1971 vfdP = &VfdCache[file];
1972
1973 if (!FileIsNotOpen(file))
1974 {
1976
1977 /* close the file */
1978 if (close(vfdP->fd) != 0)
1979 {
1980 /*
1981 * We may need to panic on failure to close non-temporary files;
1982 * see LruDelete.
1983 */
1985 "could not close file \"%s\": %m", vfdP->fileName);
1986 }
1987
1988 --nfile;
1989 vfdP->fd = VFD_CLOSED;
1990
1991 /* remove the file from the lru ring */
1992 Delete(file);
1993 }
1994
1995 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1996 {
1997 /* Subtract its size from current usage (do first in case of error) */
1998 temporary_files_size -= vfdP->fileSize;
1999 vfdP->fileSize = 0;
2000 }
2001
2002 /*
2003 * Delete the file if it was temporary, and make a log entry if wanted
2004 */
2005 if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
2006 {
2007 struct stat filestats;
2008 int stat_errno;
2009
2010 /*
2011 * If we get an error, as could happen within the ereport/elog calls,
2012 * we'll come right back here during transaction abort. Reset the
2013 * flag to ensure that we can't get into an infinite loop. This code
2014 * is arranged to ensure that the worst-case consequence is failing to
2015 * emit log message(s), not failing to attempt the unlink.
2016 */
2017 vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
2018
2019
2020 /* first try the stat() */
2021 if (stat(vfdP->fileName, &filestats))
2022 stat_errno = errno;
2023 else
2024 stat_errno = 0;
2025
2026 /* in any case do the unlink */
2027 if (unlink(vfdP->fileName))
2028 ereport(LOG,
2030 errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
2031
2032 /* and last report the stat results */
2033 if (stat_errno == 0)
2034 ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2035 else
2036 {
2037 errno = stat_errno;
2038 ereport(LOG,
2040 errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2041 }
2042 }
2043
2044 /* Unregister it from the resource owner */
2045 if (vfdP->resowner)
2046 ResourceOwnerForgetFile(vfdP->resowner, file);
2047
2048 /*
2049 * Return the Vfd slot to the free list
2050 */
2051 FreeVfd(file);
2052}

References Assert, close, data_sync_elevel(), Delete(), DO_DB, elog, ereport, errcode_for_file_access(), errmsg(), fb(), FD_DELETE_AT_CLOSE, FD_TEMP_FILE_LIMIT, FileIsNotOpen, FileIsValid, FreeVfd(), LOG, nfile, pgaio_closing_fd(), ReportTemporaryFileUsage(), ResourceOwnerForgetFile(), stat, temporary_files_size, VFD_CLOSED, and VfdCache.

Referenced by bbsink_server_end_archive(), bbsink_server_end_manifest(), BufFileClose(), BufFileTruncateFileSet(), CleanupTempFiles(), logical_end_heap_rewrite(), mdclose(), mdimmedsync(), mdregistersync(), mdsyncfiletag(), mdtruncate(), pg_wal_summary_contents(), PrepareForIncrementalBackup(), ReorderBufferIterTXNFinish(), ReorderBufferRestoreChanges(), ResOwnerReleaseFile(), and SummarizeWAL().

◆ FileFallocate()

int FileFallocate ( File  file,
pgoff_t  offset,
pgoff_t  amount,
uint32  wait_event_info 
)

Definition at line 2404 of file fd.c.

2405{
2406#ifdef HAVE_POSIX_FALLOCATE
2407 int returnCode;
2408
2409 Assert(FileIsValid(file));
2410
2411 DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2412 file, VfdCache[file].fileName,
2413 (int64) offset, (int64) amount));
2414
2415 returnCode = FileAccess(file);
2416 if (returnCode < 0)
2417 return -1;
2418
2419retry:
2420 pgstat_report_wait_start(wait_event_info);
2421 returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2423
2424 if (returnCode == 0)
2425 return 0;
2426 else if (returnCode == EINTR)
2427 goto retry;
2428
2429 /* for compatibility with %m printing etc */
2430 errno = returnCode;
2431
2432 /*
2433 * Return in cases of a "real" failure; if fallocate is not supported,
2434 * fall through to the FileZero()-backed implementation.
2435 */
2437 return -1;
2438#endif
2439
2440 return FileZero(file, offset, amount, wait_event_info);
2441}

References Assert, DO_DB, EINTR, elog, EOPNOTSUPP, fb(), fd(), FileAccess(), FileIsValid, FileZero(), INT64_FORMAT, LOG, pgstat_report_wait_end(), pgstat_report_wait_start(), and VfdCache.

Referenced by mdzeroextend().

◆ FileGetRawDesc()

int FileGetRawDesc ( File  file)

Definition at line 2512 of file fd.c.

2513{
2514 int returnCode;
2515
2516 returnCode = FileAccess(file);
2517 if (returnCode < 0)
2518 return returnCode;
2519
2520 Assert(FileIsValid(file));
2521 return VfdCache[file].fd;
2522}

References Assert, fb(), vfd::fd, FileAccess(), FileIsValid, and VfdCache.

Referenced by mdfd().

◆ FileGetRawFlags()

int FileGetRawFlags ( File  file)

Definition at line 2528 of file fd.c.

2529{
2530 Assert(FileIsValid(file));
2531 return VfdCache[file].fileFlags;
2532}

References Assert, vfd::fileFlags, FileIsValid, and VfdCache.

◆ FileGetRawMode()

mode_t FileGetRawMode ( File  file)

Definition at line 2538 of file fd.c.

2539{
2540 Assert(FileIsValid(file));
2541 return VfdCache[file].fileMode;
2542}

References Assert, FileIsValid, vfd::fileMode, and VfdCache.

◆ FilePathName()

◆ FilePrefetch()

int FilePrefetch ( File  file,
pgoff_t  offset,
pgoff_t  amount,
uint32  wait_event_info 
)

Definition at line 2063 of file fd.c.

2064{
2065 Assert(FileIsValid(file));
2066
2067 DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2068 file, VfdCache[file].fileName,
2069 (int64) offset, (int64) amount));
2070
2071#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2072 {
2073 int returnCode;
2074
2075 returnCode = FileAccess(file);
2076 if (returnCode < 0)
2077 return returnCode;
2078
2079retry:
2080 pgstat_report_wait_start(wait_event_info);
2081 returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2084
2085 if (returnCode == EINTR)
2086 goto retry;
2087
2088 return returnCode;
2089 }
2090#elif defined(__darwin__)
2091 {
2092 struct radvisory
2093 {
2094 off_t ra_offset; /* offset into the file */
2095 int ra_count; /* size of the read */
2096 } ra;
2097 int returnCode;
2098
2099 returnCode = FileAccess(file);
2100 if (returnCode < 0)
2101 return returnCode;
2102
2103 ra.ra_offset = offset;
2104 ra.ra_count = amount;
2105 pgstat_report_wait_start(wait_event_info);
2108 if (returnCode != -1)
2109 return 0;
2110 else
2111 return errno;
2112 }
2113#else
2114 return 0;
2115#endif
2116}

References Assert, DO_DB, EINTR, elog, fb(), fd(), FileAccess(), FileIsValid, INT64_FORMAT, LOG, pgstat_report_wait_end(), pgstat_report_wait_start(), and VfdCache.

Referenced by mdprefetch().

◆ FileReadV()

ssize_t FileReadV ( File  file,
const struct iovec * iov,
int  iovcnt,
pgoff_t  offset,
uint32  wait_event_info 
)

Definition at line 2145 of file fd.c.

2147{
2149 Vfd *vfdP;
2150
2151 Assert(FileIsValid(file));
2152
2153 DO_DB(elog(LOG, "FileReadV: %d (%s) " INT64_FORMAT " %d",
2154 file, VfdCache[file].fileName,
2155 (int64) offset,
2156 iovcnt));
2157
2158 returnCode = FileAccess(file);
2159 if (returnCode < 0)
2160 return returnCode;
2161
2162 vfdP = &VfdCache[file];
2163
2164retry:
2165 pgstat_report_wait_start(wait_event_info);
2166 returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
2168
2169 if (returnCode < 0)
2170 {
2171 /*
2172 * Windows may run out of kernel buffers and return "Insufficient
2173 * system resources" error. Wait a bit and retry to solve it.
2174 *
2175 * It is rumored that EINTR is also possible on some Unix filesystems,
2176 * in which case immediate retry is indicated.
2177 */
2178#ifdef WIN32
2180
2181 switch (error)
2182 {
2184 pg_usleep(1000L);
2185 errno = EINTR;
2186 break;
2187 default:
2189 break;
2190 }
2191#endif
2192 /* OK to retry if interrupted */
2193 if (errno == EINTR)
2194 goto retry;
2195 }
2196
2197 return returnCode;
2198}

References _dosmaperr(), Assert, DO_DB, EINTR, elog, error(), fb(), FileAccess(), FileIsValid, INT64_FORMAT, LOG, pg_preadv(), pg_usleep(), pgstat_report_wait_end(), pgstat_report_wait_start(), and VfdCache.

Referenced by FileRead(), and mdreadv().

◆ FileSize()

pgoff_t FileSize ( File  file)

Definition at line 2444 of file fd.c.

2445{
2446 Assert(FileIsValid(file));
2447
2448 DO_DB(elog(LOG, "FileSize %d (%s)",
2449 file, VfdCache[file].fileName));
2450
2451 if (FileIsNotOpen(file))
2452 {
2453 if (FileAccess(file) < 0)
2454 return (pgoff_t) -1;
2455 }
2456
2457 return lseek(VfdCache[file].fd, 0, SEEK_END);
2458}

References Assert, DO_DB, elog, fb(), fd(), FileAccess(), FileIsNotOpen, FileIsValid, LOG, and VfdCache.

Referenced by _mdnblocks(), BufFileSeek(), and BufFileSize().

◆ FileStartReadV()

int FileStartReadV ( PgAioHandle * ioh,
File  file,
int  iovcnt,
pgoff_t  offset,
uint32  wait_event_info 
)

Definition at line 2201 of file fd.c.

2204{
2205 int returnCode;
2206 Vfd *vfdP;
2207
2208 Assert(FileIsValid(file));
2209
2210 DO_DB(elog(LOG, "FileStartReadV: %d (%s) " INT64_FORMAT " %d",
2211 file, VfdCache[file].fileName,
2212 (int64) offset,
2213 iovcnt));
2214
2215 returnCode = FileAccess(file);
2216 if (returnCode < 0)
2217 return returnCode;
2218
2219 vfdP = &VfdCache[file];
2220
2221 pgaio_io_start_readv(ioh, vfdP->fd, iovcnt, offset);
2222
2223 return 0;
2224}

References Assert, DO_DB, elog, fb(), FileAccess(), FileIsValid, INT64_FORMAT, LOG, pgaio_io_start_readv(), and VfdCache.

Referenced by mdstartreadv().

◆ FileSync()

int FileSync ( File  file,
uint32  wait_event_info 
)

Definition at line 2332 of file fd.c.

2333{
2334 int returnCode;
2335
2336 Assert(FileIsValid(file));
2337
2338 DO_DB(elog(LOG, "FileSync: %d (%s)",
2339 file, VfdCache[file].fileName));
2340
2341 returnCode = FileAccess(file);
2342 if (returnCode < 0)
2343 return returnCode;
2344
2345 pgstat_report_wait_start(wait_event_info);
2346 returnCode = pg_fsync(VfdCache[file].fd);
2348
2349 return returnCode;
2350}

References Assert, DO_DB, elog, fb(), fd(), FileAccess(), FileIsValid, LOG, pg_fsync(), pgstat_report_wait_end(), pgstat_report_wait_start(), and VfdCache.

Referenced by bbsink_server_end_archive(), logical_end_heap_rewrite(), mdimmedsync(), mdsyncfiletag(), and register_dirty_segment().

◆ FileTruncate()

int FileTruncate ( File  file,
pgoff_t  offset,
uint32  wait_event_info 
)

Definition at line 2461 of file fd.c.

2462{
2463 int returnCode;
2464
2465 Assert(FileIsValid(file));
2466
2467 DO_DB(elog(LOG, "FileTruncate %d (%s)",
2468 file, VfdCache[file].fileName));
2469
2470 returnCode = FileAccess(file);
2471 if (returnCode < 0)
2472 return returnCode;
2473
2474 pgstat_report_wait_start(wait_event_info);
2475 returnCode = pg_ftruncate(VfdCache[file].fd, offset);
2477
2478 if (returnCode == 0 && VfdCache[file].fileSize > offset)
2479 {
2480 /* adjust our state for truncation of a temp file */
2481 Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2482 temporary_files_size -= VfdCache[file].fileSize - offset;
2483 VfdCache[file].fileSize = offset;
2484 }
2485
2486 return returnCode;
2487}

References Assert, DO_DB, elog, fb(), fd(), FD_TEMP_FILE_LIMIT, FileAccess(), FileIsValid, vfd::fileSize, LOG, pg_ftruncate(), pgstat_report_wait_end(), pgstat_report_wait_start(), temporary_files_size, and VfdCache.

Referenced by BufFileTruncateFileSet(), and mdtruncate().

◆ FileWriteback()

void FileWriteback ( File  file,
pgoff_t  offset,
pgoff_t  nbytes,
uint32  wait_event_info 
)

Definition at line 2119 of file fd.c.

2120{
2121 int returnCode;
2122
2123 Assert(FileIsValid(file));
2124
2125 DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2126 file, VfdCache[file].fileName,
2127 (int64) offset, (int64) nbytes));
2128
2129 if (nbytes <= 0)
2130 return;
2131
2132 if (VfdCache[file].fileFlags & PG_O_DIRECT)
2133 return;
2134
2135 returnCode = FileAccess(file);
2136 if (returnCode < 0)
2137 return;
2138
2139 pgstat_report_wait_start(wait_event_info);
2140 pg_flush_data(VfdCache[file].fd, offset, nbytes);
2142}

References Assert, DO_DB, elog, fb(), fd(), FileAccess(), FileIsValid, INT64_FORMAT, LOG, pg_flush_data(), PG_O_DIRECT, pgstat_report_wait_end(), pgstat_report_wait_start(), and VfdCache.

Referenced by mdwriteback().

◆ FileWriteV()

ssize_t FileWriteV ( File  file,
const struct iovec * iov,
int  iovcnt,
pgoff_t  offset,
uint32  wait_event_info 
)

Definition at line 2227 of file fd.c.

2229{
2231 Vfd *vfdP;
2232
2233 Assert(FileIsValid(file));
2234
2235 DO_DB(elog(LOG, "FileWriteV: %d (%s) " INT64_FORMAT " %d",
2236 file, VfdCache[file].fileName,
2237 (int64) offset,
2238 iovcnt));
2239
2240 returnCode = FileAccess(file);
2241 if (returnCode < 0)
2242 return returnCode;
2243
2244 vfdP = &VfdCache[file];
2245
2246 /*
2247 * If enforcing temp_file_limit and it's a temp file, check to see if the
2248 * write would overrun temp_file_limit, and throw error if so. Note: it's
2249 * really a modularity violation to throw error here; we should set errno
2250 * and return -1. However, there's no way to report a suitable error
2251 * message if we do that. All current callers would just throw error
2252 * immediately anyway, so this is safe at present.
2253 */
2254 if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2255 {
2256 pgoff_t past_write = offset;
2257
2258 for (int i = 0; i < iovcnt; ++i)
2259 past_write += iov[i].iov_len;
2260
2261 if (past_write > vfdP->fileSize)
2262 {
2264
2266 if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2267 ereport(ERROR,
2269 errmsg("temporary file size exceeds \"temp_file_limit\" (%dkB)",
2270 temp_file_limit)));
2271 }
2272 }
2273
2274retry:
2275 pgstat_report_wait_start(wait_event_info);
2276 returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
2278
2279 if (returnCode >= 0)
2280 {
2281 /*
2282 * Some callers expect short writes to set errno, and traditionally we
2283 * have assumed that they imply disk space shortage. We don't want to
2284 * waste CPU cycles adding up the total size here, so we'll just set
2285 * it for all successful writes in case such a caller determines that
2286 * the write was short and ereports "%m".
2287 */
2288 errno = ENOSPC;
2289
2290 /*
2291 * Maintain fileSize and temporary_files_size if it's a temp file.
2292 */
2293 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2294 {
2295 pgoff_t past_write = offset + returnCode;
2296
2297 if (past_write > vfdP->fileSize)
2298 {
2299 temporary_files_size += past_write - vfdP->fileSize;
2300 vfdP->fileSize = past_write;
2301 }
2302 }
2303 }
2304 else
2305 {
2306 /*
2307 * See comments in FileReadV()
2308 */
2309#ifdef WIN32
2311
2312 switch (error)
2313 {
2315 pg_usleep(1000L);
2316 errno = EINTR;
2317 break;
2318 default:
2320 break;
2321 }
2322#endif
2323 /* OK to retry if interrupted */
2324 if (errno == EINTR)
2325 goto retry;
2326 }
2327
2328 return returnCode;
2329}

References _dosmaperr(), Assert, DO_DB, EINTR, elog, ereport, errcode(), errmsg(), ERROR, error(), fb(), FD_TEMP_FILE_LIMIT, FileAccess(), FileIsValid, vfd::fileSize, i, INT64_FORMAT, LOG, pg_pwritev(), pg_usleep(), pgstat_report_wait_end(), pgstat_report_wait_start(), temp_file_limit, temporary_files_size, and VfdCache.

Referenced by FileWrite(), and mdwritev().

◆ FileZero()

int FileZero ( File  file,
pgoff_t  offset,
pgoff_t  amount,
uint32  wait_event_info 
)

Definition at line 2359 of file fd.c.

2360{
2361 int returnCode;
2363
2364 Assert(FileIsValid(file));
2365
2366 DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2367 file, VfdCache[file].fileName,
2368 (int64) offset, (int64) amount));
2369
2370 returnCode = FileAccess(file);
2371 if (returnCode < 0)
2372 return returnCode;
2373
2374 pgstat_report_wait_start(wait_event_info);
2375 written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2377
2378 if (written < 0)
2379 return -1;
2380 else if (written != amount)
2381 {
2382 /* if errno is unset, assume problem is no disk space */
2383 if (errno == 0)
2384 errno = ENOSPC;
2385 return -1;
2386 }
2387
2388 return 0;
2389}

References Assert, DO_DB, elog, fb(), fd(), FileAccess(), FileIsValid, INT64_FORMAT, LOG, pg_pwrite_zeros(), pgstat_report_wait_end(), pgstat_report_wait_start(), and VfdCache.

Referenced by FileFallocate(), and mdzeroextend().

◆ FreeDesc()

static int FreeDesc ( AllocateDesc * desc)
static

Definition at line 2783 of file fd.c.

2784{
2785 int result;
2786
2787 /* Close the underlying object */
2788 switch (desc->kind)
2789 {
2790 case AllocateDescFile:
2791 result = fclose(desc->desc.file);
2792 break;
2793 case AllocateDescPipe:
2794 result = pclose(desc->desc.file);
2795 break;
2796 case AllocateDescDir:
2797 result = closedir(desc->desc.dir);
2798 break;
2799 case AllocateDescRawFD:
2800 pgaio_closing_fd(desc->desc.fd);
2801 result = close(desc->desc.fd);
2802 break;
2803 default:
2804 elog(ERROR, "AllocateDesc kind not recognized");
2805 result = 0; /* keep compiler quiet */
2806 break;
2807 }
2808
2809 /* Compact storage in the allocatedDescs array */
2812
2813 return result;
2814}

References allocatedDescs, AllocateDescDir, AllocateDescFile, AllocateDescPipe, AllocateDescRawFD, close, closedir(), AllocateDesc::desc, AllocateDesc::dir, elog, ERROR, fb(), AllocateDesc::fd, AllocateDesc::file, AllocateDesc::kind, numAllocatedDescs, and pgaio_closing_fd().

Referenced by AtEOSubXact_Files(), CleanupTempFiles(), ClosePipeStream(), CloseTransientFile(), FreeDir(), and FreeFile().

◆ FreeDir()

int FreeDir ( DIR * dir)

Definition at line 3005 of file fd.c.

3006{
3007 int i;
3008
3009 /* Nothing to do if AllocateDir failed */
3010 if (dir == NULL)
3011 return 0;
3012
3013 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
3014
3015 /* Remove dir from list of allocated dirs, if it's present */
3016 for (i = numAllocatedDescs; --i >= 0;)
3017 {
3018 AllocateDesc *desc = &allocatedDescs[i];
3019
3020 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
3021 return FreeDesc(desc);
3022 }
3023
3024 /* Only get here if someone passes us a dir not in allocatedDescs */
3025 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
3026
3027 return closedir(dir);
3028}

References allocatedDescs, AllocateDescDir, closedir(), AllocateDesc::desc, AllocateDesc::dir, DO_DB, elog, fb(), FreeDesc(), i, AllocateDesc::kind, LOG, numAllocatedDescs, and WARNING.

Referenced by calculate_database_size(), calculate_tablespace_size(), CheckPointLogicalRewriteHeap(), CheckPointSnapBuild(), CleanupBackupHistory(), copydir(), db_dir_size(), DeleteAllExportedSnapshotFiles(), destroy_tablespace_directories(), directory_is_empty(), do_pg_backup_start(), dsm_cleanup_for_mmap(), extension_file_exists(), get_ext_ver_list(), GetConfFilesInDir(), getInstallationPaths(), GetWalSummaries(), movedb(), ParseTzFile(), perform_base_backup(), pg_available_extension_versions(), pg_available_extensions(), pg_ls_dir(), pg_ls_dir_files(), pg_tablespace_databases(), pg_tzenumerate_end(), pg_tzenumerate_next(), pgarch_readyXlog(), RelationCacheInitFileRemove(), RelationCacheInitFileRemoveInDir(), RemoveNonParentXlogFiles(), RemoveOldXlogFiles(), RemovePgTempFiles(), RemovePgTempFilesInDir(), RemovePgTempRelationFiles(), RemovePgTempRelationFilesInDbspace(), RemoveTempXlogFiles(), ReorderBufferCleanupSerializedTXNs(), ResetUnloggedRelations(), ResetUnloggedRelationsInDbspaceDir(), ResetUnloggedRelationsInTablespaceDir(), restoreTwoPhaseData(), scan_directory_ci(), sendDir(), SlruScanDirectory(), StartupReorderBuffer(), StartupReplicationSlots(), SyncDataDirectory(), UpdateLogicalMappings(), walkdir(), and XLogGetOldestSegno().

◆ FreeFile()

int FreeFile ( FILE * file)

Definition at line 2823 of file fd.c.

2824{
2825 int i;
2826
2827 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2828
2829 /* Remove file from list of allocated files, if it's present */
2830 for (i = numAllocatedDescs; --i >= 0;)
2831 {
2832 AllocateDesc *desc = &allocatedDescs[i];
2833
2834 if (desc->kind == AllocateDescFile && desc->desc.file == file)
2835 return FreeDesc(desc);
2836 }
2837
2838 /* Only get here if someone passes us a file not in allocatedDescs */
2839 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2840
2841 return fclose(file);
2842}

References allocatedDescs, AllocateDescFile, AllocateDesc::desc, DO_DB, elog, fb(), AllocateDesc::file, FreeDesc(), i, AllocateDesc::kind, LOG, numAllocatedDescs, and WARNING.

Referenced by AlterSystemSetConfigFile(), apw_dump_now(), apw_load_buffers(), checkControlFile(), do_pg_backup_stop(), EndCopy(), EndCopyFrom(), entry_reset(), existsTimeLineHistory(), ExportSnapshot(), free_auth_file(), gc_qtexts(), GetHugePageSize(), ImportSnapshot(), load_dh_file(), load_relcache_init_file(), parse_extension_control_file(), ParseConfigFile(), ParseTzFile(), pg_current_logfile(), pg_promote(), pgss_shmem_shutdown(), pgss_shmem_startup(), pgstat_read_statsfile(), pgstat_write_statsfile(), read_backup_label(), read_binary_file(), read_tablespace_map(), read_whole_file(), readTimeLineHistory(), test_custom_stats_var_finish(), tsearch_readline_end(), ValidatePgVersion(), write_relcache_init_file(), XLogArchiveForceDone(), and XLogArchiveNotify().

◆ FreeVfd()

static void FreeVfd ( File  file)
static

Definition at line 1456 of file fd.c.

1457{
1458 Vfd *vfdP = &VfdCache[file];
1459
1460 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1461 file, vfdP->fileName ? vfdP->fileName : ""));
1462
1463 if (vfdP->fileName != NULL)
1464 {
1465 free(vfdP->fileName);
1466 vfdP->fileName = NULL;
1467 }
1468 vfdP->fdstate = 0x0;
1469
1470 vfdP->nextFree = VfdCache[0].nextFree;
1471 VfdCache[0].nextFree = file;
1472}

References DO_DB, elog, fb(), free, LOG, vfd::nextFree, and VfdCache.

Referenced by FileClose(), and PathNameOpenFilePerm().

◆ fsync_fname()

◆ fsync_fname_ext()

int fsync_fname_ext ( const char * fname,
bool  isdir,
bool  ignore_perm,
int  elevel 
)

Definition at line 3843 of file fd.c.

3844{
3845 int fd;
3846 int flags;
3847 int returncode;
3848
3849 /*
3850 * Some OSs require directories to be opened read-only whereas other
3851 * systems don't allow us to fsync files opened read-only; so we need both
3852 * cases here. Using O_RDWR will cause us to fail to fsync files that are
3853 * not writable by our userid, but we assume that's OK.
3854 */
3855 flags = PG_BINARY;
3856 if (!isdir)
3857 flags |= O_RDWR;
3858 else
3859 flags |= O_RDONLY;
3860
3861 fd = OpenTransientFile(fname, flags);
3862
3863 /*
3864 * Some OSs don't allow us to open directories at all (Windows returns
3865 * EACCES), just ignore the error in that case. If desired, also silently
3866 * ignore errors about unreadable files. Log others.
3867 */
3868 if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3869 return 0;
3870 else if (fd < 0 && ignore_perm && errno == EACCES)
3871 return 0;
3872 else if (fd < 0)
3873 {
3874 ereport(elevel,
3876 errmsg("could not open file \"%s\": %m", fname)));
3877 return -1;
3878 }
3879
3881
3882 /*
3883 * Some OSes don't allow us to fsync directories at all, so we can ignore
3884 * those errors. Anything else needs to be logged.
3885 */
3886 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3887 {
3888 int save_errno;
3889
3890 /* close file upon error, might not be in transaction context */
3891 save_errno = errno;
3893 errno = save_errno;
3894
3895 ereport(elevel,
3897 errmsg("could not fsync file \"%s\": %m", fname)));
3898 return -1;
3899 }
3900
3901 if (CloseTransientFile(fd) != 0)
3902 {
3903 ereport(elevel,
3905 errmsg("could not close file \"%s\": %m", fname)));
3906 return -1;
3907 }
3908
3909 return 0;
3910}

References CloseTransientFile(), ereport, errcode_for_file_access(), errmsg(), fb(), fd(), OpenTransientFile(), PG_BINARY, and pg_fsync().

Referenced by datadir_fsync_fname(), durable_rename(), fsync_fname(), and fsync_parent_path().

◆ fsync_parent_path()

static int fsync_parent_path ( const char * fname,
int  elevel 
)
static

Definition at line 3919 of file fd.c.

3920{
3921 char parentpath[MAXPGPATH];
3922
3923 strlcpy(parentpath, fname, MAXPGPATH);
3925
3926 /*
3927 * get_parent_directory() returns an empty string if the input argument is
3928 * just a file name (see comments in path.c), so handle that as being the
3929 * current directory.
3930 */
3931 if (strlen(parentpath) == 0)
3933
3934 if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3935 return -1;
3936
3937 return 0;
3938}

References fb(), fsync_fname_ext(), get_parent_directory(), MAXPGPATH, and strlcpy().

Referenced by dir_close(), dir_open_for_write(), durable_rename(), durable_unlink(), swap_catalog_files(), and tar_finish().

◆ GetNextTempTableSpace()

Oid GetNextTempTableSpace ( void  )

Definition at line 3155 of file fd.c.

3156{
3157 if (numTempTableSpaces > 0)
3158 {
3159 /* Advance nextTempTableSpace counter with wraparound */
3163 }
3164 return InvalidOid;
3165}

References InvalidOid, nextTempTableSpace, numTempTableSpaces, and tempTableSpaces.

Referenced by GetDefaultTablespace(), and OpenTemporaryFile().

◆ GetTempTablespaces()

int GetTempTablespaces ( Oid * tableSpaces,
int  numSpaces 
)

Definition at line 3137 of file fd.c.

3138{
3139 int i;
3140
3142 for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
3144
3145 return i;
3146}

References Assert, fb(), i, numTempTableSpaces, tempTableSpaces, and TempTablespacesAreSet().

Referenced by FileSetInit().

◆ InitFileAccess()

void InitFileAccess ( void  )

Definition at line 900 of file fd.c.

901{
902 Assert(SizeVfdCache == 0); /* call me only once */
903
904 /* initialize cache header entry */
905 VfdCache = (Vfd *) malloc(sizeof(Vfd));
906 if (VfdCache == NULL)
909 errmsg("out of memory")));
910
911 MemSet(&(VfdCache[0]), 0, sizeof(Vfd));
913
914 SizeVfdCache = 1;
915}

References Assert, ereport, errcode(), errmsg(), FATAL, fb(), vfd::fd, malloc, MemSet, SizeVfdCache, VFD_CLOSED, and VfdCache.

Referenced by BaseInit().

◆ InitTemporaryFileAccess()

void InitTemporaryFileAccess ( void  )

Definition at line 930 of file fd.c.

931{
932 Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
933 Assert(!temporary_files_allowed); /* call me only once */
934
935 /*
936 * Register before-shmem-exit hook to ensure temp files are dropped while
937 * we can still report stats.
938 */
940
941#ifdef USE_ASSERT_CHECKING
943#endif
944}

References Assert, before_shmem_exit(), BeforeShmemExit_Files(), fb(), and SizeVfdCache.

Referenced by BaseInit().

◆ Insert()

static void Insert ( File  file)
static

Definition at line 1297 of file fd.c.

1298{
1299 Vfd *vfdP;
1300
1301 Assert(file != 0);
1302
1303 DO_DB(elog(LOG, "Insert %d (%s)",
1304 file, VfdCache[file].fileName));
1305 DO_DB(_dump_lru());
1306
1307 vfdP = &VfdCache[file];
1308
1309 vfdP->lruMoreRecently = 0;
1310 vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1311 VfdCache[0].lruLessRecently = file;
1312 VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1313
1314 DO_DB(_dump_lru());
1315}

References Assert, DO_DB, elog, fb(), LOG, vfd::lruLessRecently, vfd::lruMoreRecently, and VfdCache.

Referenced by AdvanceXLInsertBuffer(), CreateCheckPoint(), FileAccess(), GetXLogInsertRecPtr(), LruInsert(), PathNameOpenFilePerm(), ReserveXLogInsertLocation(), ReserveXLogSwitch(), StartupXLOG(), UpdateFullPageWrites(), WaitXLogInsertionsToFinish(), XLogInsertRecord(), and XLogWrite().

◆ looks_like_temp_rel_name()

bool looks_like_temp_rel_name ( const char * name)

Definition at line 3495 of file fd.c.

3496{
3497 int pos;
3498 int savepos;
3499
3500 /* Must start with "t". */
3501 if (name[0] != 't')
3502 return false;
3503
3504 /* Followed by a non-empty string of digits and then an underscore. */
3505 for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
3506 ;
3507 if (pos == 1 || name[pos] != '_')
3508 return false;
3509
3510 /* Followed by another nonempty string of digits. */
3511 for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
3512 ;
3513 if (savepos == pos)
3514 return false;
3515
3516 /* We might have _forkname or .segment or both. */
3517 if (name[pos] == '_')
3518 {
3519 int forkchar = forkname_chars(&name[pos + 1], NULL);
3520
3521 if (forkchar <= 0)
3522 return false;
3523 pos += forkchar + 1;
3524 }
3525 if (name[pos] == '.')
3526 {
3527 int segchar;
3528
3529 for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
3530 ;
3531 if (segchar <= 1)
3532 return false;
3533 pos += segchar;
3534 }
3535
3536 /* Now we should be at the end. */
3537 if (name[pos] != '\0')
3538 return false;
3539 return true;
3540}

References fb(), forkname_chars(), and name.

Referenced by RemovePgTempRelationFilesInDbspace(), and sendDir().

◆ LruDelete()

static void LruDelete ( File  file)
static

Definition at line 1269 of file fd.c.

1270{
1271 Vfd *vfdP;
1272
1273 Assert(file != 0);
1274
1275 DO_DB(elog(LOG, "LruDelete %d (%s)",
1276 file, VfdCache[file].fileName));
1277
1278 vfdP = &VfdCache[file];
1279
1281
1282 /*
1283 * Close the file. We aren't expecting this to fail; if it does, better
1284 * to leak the FD than to mess up our internal state.
1285 */
1286 if (close(vfdP->fd) != 0)
1288 "could not close file \"%s\": %m", vfdP->fileName);
1289 vfdP->fd = VFD_CLOSED;
1290 --nfile;
1291
1292 /* delete the vfd record from the LRU ring */
1293 Delete(file);
1294}

References Assert, close, data_sync_elevel(), Delete(), DO_DB, elog, fb(), FD_TEMP_FILE_LIMIT, LOG, nfile, pgaio_closing_fd(), VFD_CLOSED, and VfdCache.

Referenced by closeAllVfds(), and ReleaseLruFile().

◆ LruInsert()

static int LruInsert ( File  file)
static

Definition at line 1319 of file fd.c.

1320{
1321 Vfd *vfdP;
1322
1323 Assert(file != 0);
1324
1325 DO_DB(elog(LOG, "LruInsert %d (%s)",
1326 file, VfdCache[file].fileName));
1327
1328 vfdP = &VfdCache[file];
1329
1330 if (FileIsNotOpen(file))
1331 {
1332 /* Close excess kernel FDs. */
1334
1335 /*
1336 * The open could still fail for lack of file descriptors, eg due to
1337 * overall system file table being full. So, be prepared to release
1338 * another FD if necessary...
1339 */
1340 vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1341 vfdP->fileMode);
1342 if (vfdP->fd < 0)
1343 {
1344 DO_DB(elog(LOG, "re-open failed: %m"));
1345 return -1;
1346 }
1347 else
1348 {
1349 ++nfile;
1350 }
1351 }
1352
1353 /*
1354 * put it at the head of the Lru ring
1355 */
1356
1357 Insert(file);
1358
1359 return 0;
1360}

References Assert, BasicOpenFilePerm(), DO_DB, elog, fb(), FileIsNotOpen, Insert(), LOG, nfile, ReleaseLruFiles(), and VfdCache.

Referenced by FileAccess().

◆ MakePGDirectory()

◆ OpenPipeStream()

FILE * OpenPipeStream ( const char * command,
const char * mode 
)

Definition at line 2727 of file fd.c.

2728{
2729 FILE *file;
2730 int save_errno;
2731
2732 DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2733 numAllocatedDescs, command));
2734
2735 /* Can we allocate another non-virtual FD? */
2736 if (!reserveAllocatedDesc())
2737 ereport(ERROR,
2739 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2740 maxAllocatedDescs, command)));
2741
2742 /* Close excess kernel FDs. */
2744
2745TryAgain:
2746 fflush(NULL);
2748 errno = 0;
2749 file = popen(command, mode);
2750 save_errno = errno;
2752 errno = save_errno;
2753 if (file != NULL)
2754 {
2756
2757 desc->kind = AllocateDescPipe;
2758 desc->desc.file = file;
2761 return desc->desc.file;
2762 }
2763
2764 if (errno == EMFILE || errno == ENFILE)
2765 {
2766 ereport(LOG,
2768 errmsg("out of file descriptors: %m; release and retry")));
2769 if (ReleaseLruFile())
2770 goto TryAgain;
2771 errno = save_errno;
2772 }
2773
2774 return NULL;
2775}

References allocatedDescs, AllocateDescPipe, AllocateDesc::create_subid, AllocateDesc::desc, DO_DB, elog, ereport, errcode(), errmsg(), ERROR, fb(), AllocateDesc::file, GetCurrentSubTransactionId(), AllocateDesc::kind, LOG, maxAllocatedDescs, mode, numAllocatedDescs, pqsignal, ReleaseLruFile(), ReleaseLruFiles(), reserveAllocatedDesc(), and SIGPIPE.

Referenced by BeginCopyFrom(), BeginCopyTo(), pg_import_system_collations(), run_ssl_passphrase_command(), and shell_run_command().

◆ OpenTemporaryFile()

File OpenTemporaryFile ( bool  interXact)

Definition at line 1708 of file fd.c.

1709{
1710 File file = 0;
1711
1712 Assert(temporary_files_allowed); /* check temp file access is up */
1713
1714 /*
1715 * Make sure the current resource owner has space for this File before we
1716 * open it, if we'll be registering it below.
1717 */
1718 if (!interXact)
1720
1721 /*
1722 * If some temp tablespace(s) have been given to us, try to use the next
1723 * one. If a given tablespace can't be found, we silently fall back to
1724 * the database's default tablespace.
1725 *
1726 * BUT: if the temp file is slated to outlive the current transaction,
1727 * force it into the database's default tablespace, so that it will not
1728 * pose a threat to possible tablespace drop attempts.
1729 */
1730 if (numTempTableSpaces > 0 && !interXact)
1731 {
1733
1734 if (OidIsValid(tblspcOid))
1736 }
1737
1738 /*
1739 * If not, or if tablespace is bad, create in database's default
1740 * tablespace. MyDatabaseTableSpace should normally be set before we get
1741 * here, but just in case it isn't, fall back to pg_default tablespace.
1742 */
1743 if (file <= 0)
1747 true);
1748
1749 /* Mark it for deletion at close and temporary file size limit */
1751
1752 /* Register it with the current resource owner */
1753 if (!interXact)
1755
1756 return file;
1757}

References Assert, CurrentResourceOwner, fb(), FD_DELETE_AT_CLOSE, FD_TEMP_FILE_LIMIT, vfd::fdstate, GetNextTempTableSpace(), MyDatabaseTableSpace, numTempTableSpaces, OidIsValid, OpenTemporaryFileInTablespace(), RegisterTemporaryFile(), ResourceOwnerEnlarge(), and VfdCache.

Referenced by BufFileCreateTemp(), and extendBufFile().

◆ OpenTemporaryFileInTablespace()

static File OpenTemporaryFileInTablespace ( Oid  tblspcOid,
bool  rejectError 
)
static

Definition at line 1788 of file fd.c.

1789{
1790 char tempdirpath[MAXPGPATH];
1791 char tempfilepath[MAXPGPATH];
1792 File file;
1793
1795
1796 /*
1797 * Generate a tempfile name that should be unique within the current
1798 * database instance.
1799 */
1800 snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1802
1803 /*
1804 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1805 * temp file that can be reused.
1806 */
1809 if (file <= 0)
1810 {
1811 /*
1812 * We might need to create the tablespace's tempfile directory, if no
1813 * one has yet done so.
1814 *
1815 * Don't check for an error from MakePGDirectory; it could fail if
1816 * someone else just did the same thing. If it doesn't work then
1817 * we'll bomb out on the second create attempt, instead.
1818 */
1820
1823 if (file <= 0 && rejectError)
1824 elog(ERROR, "could not create temporary file \"%s\": %m",
1825 tempfilepath);
1826 }
1827
1828 return file;
1829}

References elog, ERROR, fb(), MakePGDirectory(), MAXPGPATH, MyProcPid, PathNameOpenFile(), PG_BINARY, PG_TEMP_FILE_PREFIX, snprintf, tempFileCounter, and TempTablespacePath().

Referenced by OpenTemporaryFile().

◆ OpenTransientFile()

◆ OpenTransientFilePerm()

int OpenTransientFilePerm ( const char * fileName,
int  fileFlags,
mode_t  fileMode 
)

Definition at line 2683 of file fd.c.

2684{
2685 int fd;
2686
2687 DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2688 numAllocatedDescs, fileName));
2689
2690 /* Can we allocate another non-virtual FD? */
2691 if (!reserveAllocatedDesc())
2692 ereport(ERROR,
2694 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2695 maxAllocatedDescs, fileName)));
2696
2697 /* Close excess kernel FDs. */
2699
2700 fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2701
2702 if (fd >= 0)
2703 {
2705
2706 desc->kind = AllocateDescRawFD;
2707 desc->desc.fd = fd;
2710
2711 return fd;
2712 }
2713
2714 return -1; /* failure */
2715}

References allocatedDescs, AllocateDescRawFD, BasicOpenFilePerm(), AllocateDesc::create_subid, AllocateDesc::desc, DO_DB, elog, ereport, errcode(), errmsg(), ERROR, fb(), AllocateDesc::fd, fd(), GetCurrentSubTransactionId(), AllocateDesc::kind, LOG, maxAllocatedDescs, numAllocatedDescs, ReleaseLruFiles(), and reserveAllocatedDesc().

Referenced by be_lo_export(), and OpenTransientFile().

◆ PathNameCreateTemporaryDir()

void PathNameCreateTemporaryDir ( const char * basedir,
const char * directory 
)

Definition at line 1644 of file fd.c.

1645{
1646 if (MakePGDirectory(directory) < 0)
1647 {
1648 if (errno == EEXIST)
1649 return;
1650
1651 /*
1652 * Failed. Try to create basedir first in case it's missing. Tolerate
1653 * EEXIST to close a race against another process following the same
1654 * algorithm.
1655 */
1656 if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1657 ereport(ERROR,
1659 errmsg("cannot create temporary directory \"%s\": %m",
1660 basedir)));
1661
1662 /* Try again. */
1663 if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1664 ereport(ERROR,
1666 errmsg("cannot create temporary subdirectory \"%s\": %m",
1667 directory)));
1668 }
1669}

References basedir, directory, ereport, errcode_for_file_access(), errmsg(), ERROR, fb(), and MakePGDirectory().

Referenced by FileSetCreate().

◆ PathNameCreateTemporaryFile()

File PathNameCreateTemporaryFile ( const char * path,
bool  error_on_failure 
)

Definition at line 1845 of file fd.c.

1846{
1847 File file;
1848
1849 Assert(temporary_files_allowed); /* check temp file access is up */
1850
1852
1853 /*
1854 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1855 * temp file that can be reused.
1856 */
1857 file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1858 if (file <= 0)
1859 {
1860 if (error_on_failure)
1861 ereport(ERROR,
1863 errmsg("could not create temporary file \"%s\": %m",
1864 path)));
1865 else
1866 return file;
1867 }
1868
1869 /* Mark it for temp_file_limit accounting. */
1871
1872 /* Register it for automatic close. */
1874
1875 return file;
1876}

References Assert, CurrentResourceOwner, ereport, errcode_for_file_access(), errmsg(), ERROR, fb(), FD_TEMP_FILE_LIMIT, vfd::fdstate, PathNameOpenFile(), PG_BINARY, RegisterTemporaryFile(), ResourceOwnerEnlarge(), and VfdCache.

Referenced by FileSetCreate().

◆ PathNameDeleteTemporaryDir()

void PathNameDeleteTemporaryDir ( const char * dirname)

Definition at line 1675 of file fd.c.

1676{
1677 struct stat statbuf;
1678
1679 /* Silently ignore missing directory. */
1680 if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1681 return;
1682
1683 /*
1684 * Currently, walkdir doesn't offer a way for our passed in function to
1685 * maintain state. Perhaps it should, so that we could tell the caller
1686 * whether this operation succeeded or failed. Since this operation is
1687 * used in a cleanup path, we wouldn't actually behave differently: we'll
1688 * just log failures.
1689 */
1690 walkdir(dirname, unlink_if_exists_fname, false, LOG);
1691}

References fb(), LOG, stat, unlink_if_exists_fname(), and walkdir().

Referenced by FileSetDeleteAll().

◆ PathNameDeleteTemporaryFile()

bool PathNameDeleteTemporaryFile ( const char * path,
bool  error_on_failure 
)

Definition at line 1916 of file fd.c.

1917{
1918 struct stat filestats;
1919 int stat_errno;
1920
1921 /* Get the final size for pgstat reporting. */
1922 if (stat(path, &filestats) != 0)
1923 stat_errno = errno;
1924 else
1925 stat_errno = 0;
1926
1927 /*
1928 * Unlike FileClose's automatic file deletion code, we tolerate
1929 * non-existence to support BufFileDeleteFileSet which doesn't know how
1930 * many segments it has to delete until it runs out.
1931 */
1932 if (stat_errno == ENOENT)
1933 return false;
1934
1935 if (unlink(path) < 0)
1936 {
1937 if (errno != ENOENT)
1940 errmsg("could not unlink temporary file \"%s\": %m",
1941 path)));
1942 return false;
1943 }
1944
1945 if (stat_errno == 0)
1946 ReportTemporaryFileUsage(path, filestats.st_size);
1947 else
1948 {
1949 errno = stat_errno;
1950 ereport(LOG,
1952 errmsg("could not stat file \"%s\": %m", path)));
1953 }
1954
1955 return true;
1956}

References ereport, errcode_for_file_access(), errmsg(), ERROR, fb(), LOG, ReportTemporaryFileUsage(), and stat.

Referenced by FileSetDelete(), and unlink_if_exists_fname().

◆ PathNameOpenFile()

◆ PathNameOpenFilePerm()

File PathNameOpenFilePerm ( const char * fileName,
int  fileFlags,
mode_t  fileMode 
)

Definition at line 1572 of file fd.c.

1573{
1574 char *fnamecopy;
1575 File file;
1576 Vfd *vfdP;
1577
1578 DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1579 fileName, fileFlags, fileMode));
1580
1581 /*
1582 * We need a malloc'd copy of the file name; fail cleanly if no room.
1583 */
1584 fnamecopy = strdup(fileName);
1585 if (fnamecopy == NULL)
1586 ereport(ERROR,
1588 errmsg("out of memory")));
1589
1590 file = AllocateVfd();
1591 vfdP = &VfdCache[file];
1592
1593 /* Close excess kernel FDs. */
1595
1596 /*
1597 * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1598 * client shouldn't be expected to know which kernel descriptors are
1599 * currently open, so it wouldn't make sense for them to be inherited by
1600 * executed subprograms.
1601 */
1602 fileFlags |= O_CLOEXEC;
1603
1604 vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1605
1606 if (vfdP->fd < 0)
1607 {
1608 int save_errno = errno;
1609
1610 FreeVfd(file);
1611 free(fnamecopy);
1612 errno = save_errno;
1613 return -1;
1614 }
1615 ++nfile;
1616 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1617 vfdP->fd));
1618
1619 vfdP->fileName = fnamecopy;
1620 /* Saved flags are adjusted to be OK for re-opening file */
1621 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1622 vfdP->fileMode = fileMode;
1623 vfdP->fileSize = 0;
1624 vfdP->fdstate = 0x0;
1625 vfdP->resowner = NULL;
1626
1627 Insert(file);
1628
1629 return file;
1630}

References AllocateVfd(), BasicOpenFilePerm(), DO_DB, elog, ereport, errcode(), errmsg(), ERROR, fb(), free, FreeVfd(), Insert(), LOG, nfile, O_CLOEXEC, ReleaseLruFiles(), and VfdCache.

Referenced by PathNameOpenFile().

◆ PathNameOpenTemporaryFile()

File PathNameOpenTemporaryFile ( const char * path,
int  mode 
)

Definition at line 1885 of file fd.c.

1886{
1887 File file;
1888
1889 Assert(temporary_files_allowed); /* check temp file access is up */
1890
1892
1893 file = PathNameOpenFile(path, mode | PG_BINARY);
1894
1895 /* If no such file, then we don't raise an error. */
1896 if (file <= 0 && errno != ENOENT)
1897 ereport(ERROR,
1899 errmsg("could not open temporary file \"%s\": %m",
1900 path)));
1901
1902 if (file > 0)
1903 {
1904 /* Register it for automatic close. */
1906 }
1907
1908 return file;
1909}

References Assert, CurrentResourceOwner, ereport, errcode_for_file_access(), errmsg(), ERROR, fb(), mode, PathNameOpenFile(), PG_BINARY, RegisterTemporaryFile(), and ResourceOwnerEnlarge().

Referenced by FileSetOpen().

◆ pg_fdatasync()

int pg_fdatasync ( int  fd)

Definition at line 477 of file fd.c.

478{
479 int rc;
480
481 if (!enableFsync)
482 return 0;
483
484retry:
485 rc = fdatasync(fd);
486
487 if (rc == -1 && errno == EINTR)
488 goto retry;
489
490 return rc;
491}

References EINTR, enableFsync, fb(), fd(), and fdatasync().

Referenced by issue_xlog_fsync().

◆ pg_file_exists()

bool pg_file_exists ( const char * name)

Definition at line 500 of file fd.c.

501{
502 struct stat st;
503
504 Assert(name != NULL);
505
506 if (stat(name, &st) == 0)
507 return !S_ISDIR(st.st_mode);
508 else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES))
511 errmsg("could not access file \"%s\": %m", name)));
512
513 return false;
514}

References Assert, ereport, errcode_for_file_access(), errmsg(), ERROR, fb(), name, S_ISDIR, stat::st_mode, and stat.

Referenced by expand_dynamic_library_name(), find_in_path(), find_in_paths(), and provider_init().

◆ pg_flush_data()

void pg_flush_data ( int  fd,
pgoff_t  offset,
pgoff_t  nbytes 
)

Definition at line 522 of file fd.c.

523{
524 /*
525 * Right now file flushing is primarily used to avoid making later
526 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
527 * if fsyncs are disabled - that's a decision we might want to make
528 * configurable at some point.
529 */
530 if (!enableFsync)
531 return;
532
533 /*
534 * We compile all alternatives that are supported on the current platform,
535 * to find portability problems more easily.
536 */
537#if defined(HAVE_SYNC_FILE_RANGE)
538 {
539 int rc;
540 static bool not_implemented_by_kernel = false;
541
543 return;
544
545retry:
546
547 /*
548 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
549 * tells the OS that writeback for the specified blocks should be
550 * started, but that we don't want to wait for completion. Note that
551 * this call might block if too much dirty data exists in the range.
552 * This is the preferable method on OSs supporting it, as it works
553 * reliably when available (contrast to msync()) and doesn't flush out
554 * clean data (like FADV_DONTNEED).
555 */
556 rc = sync_file_range(fd, offset, nbytes,
558 if (rc != 0)
559 {
560 int elevel;
561
562 if (rc == EINTR)
563 goto retry;
564
565 /*
566 * For systems that don't have an implementation of
567 * sync_file_range() such as Windows WSL, generate only one
568 * warning and then suppress all further attempts by this process.
569 */
570 if (errno == ENOSYS)
571 {
572 elevel = WARNING;
574 }
575 else
576 elevel = data_sync_elevel(WARNING);
577
578 ereport(elevel,
580 errmsg("could not flush dirty data: %m")));
581 }
582
583 return;
584 }
585#endif
586#if !defined(WIN32) && defined(MS_ASYNC)
587 {
588 void *p;
589 static int pagesize = 0;
590
591 /*
592 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
593 * writeback. On linux it only does so if MS_SYNC is specified, but
594 * then it does the writeback synchronously. Luckily all common linux
595 * systems have sync_file_range(). This is preferable over
596 * FADV_DONTNEED because it doesn't flush out clean data.
597 *
598 * We map the file (mmap()), tell the kernel to sync back the contents
599 * (msync()), and then remove the mapping again (munmap()).
600 */
601
602 /* mmap() needs actual length if we want to map whole file */
603 if (offset == 0 && nbytes == 0)
604 {
605 nbytes = lseek(fd, 0, SEEK_END);
606 if (nbytes < 0)
607 {
610 errmsg("could not determine dirty data size: %m")));
611 return;
612 }
613 }
614
615 /*
616 * Some platforms reject partial-page mmap() attempts. To deal with
617 * that, just truncate the request to a page boundary. If any extra
618 * bytes don't get flushed, well, it's only a hint anyway.
619 */
620
621 /* fetch pagesize only once */
622 if (pagesize == 0)
624
625 /* align length to pagesize, dropping any fractional page */
626 if (pagesize > 0)
627 nbytes = (nbytes / pagesize) * pagesize;
628
629 /* fractional-page request is a no-op */
630 if (nbytes <= 0)
631 return;
632
633 /*
634 * mmap could well fail, particularly on 32-bit platforms where there
635 * may simply not be enough address space. If so, silently fall
636 * through to the next implementation.
637 */
638 if (nbytes <= (pgoff_t) SSIZE_MAX)
639 p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
640 else
641 p = MAP_FAILED;
642
643 if (p != MAP_FAILED)
644 {
645 int rc;
646
647 rc = msync(p, (size_t) nbytes, MS_ASYNC);
648 if (rc != 0)
649 {
652 errmsg("could not flush dirty data: %m")));
653 /* NB: need to fall through to munmap()! */
654 }
655
656 rc = munmap(p, (size_t) nbytes);
657 if (rc != 0)
658 {
659 /* FATAL error because mapping would remain */
662 errmsg("could not munmap() while flushing data: %m")));
663 }
664
665 return;
666 }
667 }
668#endif
669#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
670 {
671 int rc;
672
673 /*
674 * Signal the kernel that the passed in range should not be cached
675 * anymore. This has the, desired, side effect of writing out dirty
676 * data, and the, undesired, side effect of likely discarding useful
677 * clean cached blocks. For the latter reason this is the least
678 * preferable method.
679 */
680
681 rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
682
683 if (rc != 0)
684 {
685 /* don't error out, this is just a performance optimization */
688 errmsg("could not flush dirty data: %m")));
689 }
690
691 return;
692 }
693#endif
694}

References data_sync_elevel(), EINTR, enableFsync, ereport, errcode_for_file_access(), errmsg(), FATAL, fb(), fd(), MAP_FAILED, and WARNING.

Referenced by copy_file(), and FileWriteback().

◆ pg_fsync()

int pg_fsync ( int  fd)

Definition at line 386 of file fd.c.

387{
388#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
389 struct stat st;
390
391 /*
392 * Some operating system implementations of fsync() have requirements
393 * about the file access modes that were used when their file descriptor
394 * argument was opened, and these requirements differ depending on whether
395 * the file descriptor is for a directory.
396 *
397 * For any file descriptor that may eventually be handed to fsync(), we
398 * should have opened it with access modes that are compatible with
399 * fsync() on all supported systems, otherwise the code may not be
400 * portable, even if it runs ok on the current system.
401 *
402 * We assert here that a descriptor for a file was opened with write
403 * permissions (i.e., not O_RDONLY) and for a directory without write
404 * permissions (O_RDONLY). Notice that the assertion check is made even
405 * if fsync() is disabled.
406 *
407 * If fstat() fails, ignore it and let the follow-up fsync() complain.
408 */
409 if (fstat(fd, &st) == 0)
410 {
411 int desc_flags = fcntl(fd, F_GETFL);
412
414
415 if (S_ISDIR(st.st_mode))
417 else
419 }
420 errno = 0;
421#endif
422
423 /* #if is to skip the wal_sync_method test if there's no need for it */
424#if defined(HAVE_FSYNC_WRITETHROUGH)
427 else
428#endif
430}

References Assert, fb(), fd(), fstat, pg_fsync_no_writethrough(), pg_fsync_writethrough(), S_ISDIR, stat::st_mode, wal_sync_method, and WAL_SYNC_METHOD_FSYNC_WRITETHROUGH.

Referenced by AddToDataDirLockFile(), assign_wal_sync_method(), BootStrapXLOG(), CheckPointLogicalRewriteHeap(), CreateDirAndVersionFile(), CreateLockFile(), durable_rename(), FileSync(), fsync_fname_ext(), heap_xlog_logical_rewrite(), readRecoverySignalFile(), RecreateTwoPhaseFile(), RestoreSlotFromDisk(), SaveSlotToPath(), SlruPhysicalWritePage(), SlruSyncFileTag(), SnapBuildSerialize(), update_controlfile(), write_auto_conf_file(), WriteControlFile(), writeTimeLineHistory(), writeTimeLineHistoryFile(), XLogFileCopy(), and XLogFileInitInternal().

◆ pg_fsync_no_writethrough()

int pg_fsync_no_writethrough ( int  fd)

Definition at line 438 of file fd.c.

439{
440 int rc;
441
442 if (!enableFsync)
443 return 0;
444
445retry:
446 rc = fsync(fd);
447
448 if (rc == -1 && errno == EINTR)
449 goto retry;
450
451 return rc;
452}

References EINTR, enableFsync, fb(), fd(), and fsync.

Referenced by issue_xlog_fsync(), and pg_fsync().

◆ pg_fsync_writethrough()

int pg_fsync_writethrough ( int  fd)

Definition at line 458 of file fd.c.

459{
460 if (enableFsync)
461 {
462#if defined(F_FULLFSYNC)
463 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
464#else
465 errno = ENOSYS;
466 return -1;
467#endif
468 }
469 else
470 return 0;
471}

References enableFsync, fb(), and fd().

Referenced by issue_xlog_fsync(), pg_fsync(), and test_sync().

◆ pg_ftruncate()

static int pg_ftruncate ( int  fd,
pgoff_t  length 
)
static

Definition at line 700 of file fd.c.

701{
702 int ret;
703
704retry:
705 ret = ftruncate(fd, length);
706
707 if (ret == -1 && errno == EINTR)
708 goto retry;
709
710 return ret;
711}

References EINTR, fb(), and fd().

Referenced by FileTruncate(), and pg_truncate().

◆ pg_truncate()

int pg_truncate ( const char * path,
pgoff_t  length 
)

Definition at line 717 of file fd.c.

718{
719 int ret;
720#ifdef WIN32
721 int save_errno;
722 int fd;
723
725 if (fd >= 0)
726 {
727 ret = pg_ftruncate(fd, length);
731 }
732 else
733 ret = -1;
734#else
735
736retry:
737 ret = truncate(path, length);
738
739 if (ret == -1 && errno == EINTR)
740 goto retry;
741#endif
742
743 return ret;
744}

References CloseTransientFile(), EINTR, fb(), fd(), OpenTransientFile(), PG_BINARY, and pg_ftruncate().

Referenced by do_truncate().

◆ ReadDir()

◆ ReadDirExtended()

struct dirent * ReadDirExtended ( DIR * dir,
const char * dirname,
int  elevel 
)

Definition at line 2968 of file fd.c.

2969{
2970 struct dirent *dent;
2971
2972 /* Give a generic message for AllocateDir failure, if caller didn't */
2973 if (dir == NULL)
2974 {
2975 ereport(elevel,
2977 errmsg("could not open directory \"%s\": %m",
2978 dirname)));
2979 return NULL;
2980 }
2981
2982 errno = 0;
2983 if ((dent = readdir(dir)) != NULL)
2984 return dent;
2985
2986 if (errno)
2987 ereport(elevel,
2989 errmsg("could not read directory \"%s\": %m",
2990 dirname)));
2991 return NULL;
2992}

References ereport, errcode_for_file_access(), errmsg(), fb(), and readdir().

Referenced by DeleteAllExportedSnapshotFiles(), ReadDir(), RelationCacheInitFileRemove(), RelationCacheInitFileRemoveInDir(), RemovePgTempFiles(), RemovePgTempFilesInDir(), RemovePgTempRelationFiles(), RemovePgTempRelationFilesInDbspace(), ReorderBufferCleanupSerializedTXNs(), scan_directory_ci(), SyncDataDirectory(), and walkdir().

◆ RegisterTemporaryFile()

static void RegisterTemporaryFile ( File  file)
static

◆ ReleaseExternalFD()

◆ ReleaseLruFile()

static bool ReleaseLruFile ( void  )
static

Definition at line 1366 of file fd.c.

1367{
1368 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1369
1370 if (nfile > 0)
1371 {
1372 /*
1373 * There are opened files and so there should be at least one used vfd
1374 * in the ring.
1375 */
1376 Assert(VfdCache[0].lruMoreRecently != 0);
1377 LruDelete(VfdCache[0].lruMoreRecently);
1378 return true; /* freed a file */
1379 }
1380 return false; /* no files available to free */
1381}

References Assert, DO_DB, elog, LOG, LruDelete(), nfile, and VfdCache.

Referenced by AllocateDir(), AllocateFile(), BasicOpenFilePerm(), OpenPipeStream(), and ReleaseLruFiles().

◆ ReleaseLruFiles()

static void ReleaseLruFiles ( void  )
static

◆ RemovePgTempFiles()

void RemovePgTempFiles ( void  )

Definition at line 3319 of file fd.c.

3320{
3322 DIR *spc_dir;
3323 struct dirent *spc_de;
3324
3325 /*
3326 * First process temp files in pg_default ($PGDATA/base)
3327 */
3328 snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3329 RemovePgTempFilesInDir(temp_path, true, false);
3331
3332 /*
3333 * Cycle through temp directories for all non-default tablespaces.
3334 */
3336
3338 {
3339 if (strcmp(spc_de->d_name, ".") == 0 ||
3340 strcmp(spc_de->d_name, "..") == 0)
3341 continue;
3342
3343 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s/%s",
3346 RemovePgTempFilesInDir(temp_path, true, false);
3347
3348 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
3351 }
3352
3354
3355 /*
3356 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3357 * DataDir as well. However, that is *not* cleaned here because doing so
3358 * would create a race condition. It's done separately, earlier in
3359 * postmaster startup.
3360 */
3361}

References AllocateDir(), fb(), FreeDir(), LOG, MAXPGPATH, PG_TBLSPC_DIR, PG_TEMP_FILES_DIR, ReadDirExtended(), RemovePgTempFilesInDir(), RemovePgTempRelationFiles(), snprintf, and TABLESPACE_VERSION_DIRECTORY.

Referenced by PostmasterMain(), and PostmasterStateMachine().

◆ RemovePgTempFilesInDir()

void RemovePgTempFilesInDir ( const char * tmpdirname,
bool  missing_ok,
bool  unlink_all 
)

Definition at line 3379 of file fd.c.

3380{
3381 DIR *temp_dir;
3382 struct dirent *temp_de;
3383 char rm_path[MAXPGPATH * 2];
3384
3386
3387 if (temp_dir == NULL && errno == ENOENT && missing_ok)
3388 return;
3389
3391 {
3392 if (strcmp(temp_de->d_name, ".") == 0 ||
3393 strcmp(temp_de->d_name, "..") == 0)
3394 continue;
3395
3396 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3397 tmpdirname, temp_de->d_name);
3398
3399 if (unlink_all ||
3400 strncmp(temp_de->d_name,
3403 {
3405
3406 if (type == PGFILETYPE_ERROR)
3407 continue;
3408 else if (type == PGFILETYPE_DIR)
3409 {
3410 /* recursively remove contents, then directory itself */
3411 RemovePgTempFilesInDir(rm_path, false, true);
3412
3413 if (rmdir(rm_path) < 0)
3414 ereport(LOG,
3416 errmsg("could not remove directory \"%s\": %m",
3417 rm_path)));
3418 }
3419 else
3420 {
3421 if (unlink(rm_path) < 0)
3422 ereport(LOG,
3424 errmsg("could not remove file \"%s\": %m",
3425 rm_path)));
3426 }
3427 }
3428 else
3429 ereport(LOG,
3430 (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3431 rm_path)));
3432 }
3433
3435}

References AllocateDir(), ereport, errcode_for_file_access(), errmsg(), fb(), FreeDir(), get_dirent_type(), LOG, MAXPGPATH, PG_TEMP_FILE_PREFIX, PGFILETYPE_DIR, PGFILETYPE_ERROR, ReadDirExtended(), RemovePgTempFilesInDir(), snprintf, and type.

Referenced by PostmasterMain(), RemovePgTempFiles(), and RemovePgTempFilesInDir().

◆ RemovePgTempRelationFiles()

static void RemovePgTempRelationFiles ( const char * tsdirname)
static

Definition at line 3439 of file fd.c.

3440{
3441 DIR *ts_dir;
3442 struct dirent *de;
3443 char dbspace_path[MAXPGPATH * 2];
3444
3446
3447 while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3448 {
3449 /*
3450 * We're only interested in the per-database directories, which have
3451 * numeric names. Note that this code will also (properly) ignore "."
3452 * and "..".
3453 */
3454 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3455 continue;
3456
3457 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3458 tsdirname, de->d_name);
3460 }
3461
3462 FreeDir(ts_dir);
3463}

References AllocateDir(), fb(), FreeDir(), LOG, MAXPGPATH, ReadDirExtended(), RemovePgTempRelationFilesInDbspace(), and snprintf.

Referenced by RemovePgTempFiles().

◆ RemovePgTempRelationFilesInDbspace()

static void RemovePgTempRelationFilesInDbspace ( const char * dbspacedirname)
static

Definition at line 3467 of file fd.c.

3468{
3470 struct dirent *de;
3471 char rm_path[MAXPGPATH * 2];
3472
3474
3476 {
3477 if (!looks_like_temp_rel_name(de->d_name))
3478 continue;
3479
3480 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3481 dbspacedirname, de->d_name);
3482
3483 if (unlink(rm_path) < 0)
3484 ereport(LOG,
3486 errmsg("could not remove file \"%s\": %m",
3487 rm_path)));
3488 }
3489
3491}

References AllocateDir(), ereport, errcode_for_file_access(), errmsg(), fb(), FreeDir(), LOG, looks_like_temp_rel_name(), MAXPGPATH, ReadDirExtended(), and snprintf.

Referenced by RemovePgTempRelationFiles().

◆ ReportTemporaryFileUsage()

static void ReportTemporaryFileUsage ( const char * path,
pgoff_t  size 
)
static

Definition at line 1512 of file fd.c.

1513{
1515
1516 if (log_temp_files >= 0)
1517 {
1518 if ((size / 1024) >= log_temp_files)
1519 ereport(LOG,
1520 (errmsg("temporary file: path \"%s\", size %lu",
1521 path, (unsigned long) size)));
1522 }
1523}

References ereport, errmsg(), LOG, log_temp_files, and pgstat_report_tempfile().

Referenced by FileClose(), and PathNameDeleteTemporaryFile().

◆ reserveAllocatedDesc()

static bool reserveAllocatedDesc ( void  )
static

Definition at line 2549 of file fd.c.

2550{
2552 int newMax;
2553
2554 /* Quick out if array already has a free slot. */
2556 return true;
2557
2558 /*
2559 * If the array hasn't yet been created in the current process, initialize
2560 * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2561 * we will ever need, anyway. We don't want to look at max_safe_fds
2562 * immediately because set_max_safe_fds() may not have run yet.
2563 */
2564 if (allocatedDescs == NULL)
2565 {
2566 newMax = FD_MINFREE / 3;
2568 /* Out of memory already? Treat as fatal error. */
2569 if (newDescs == NULL)
2570 ereport(ERROR,
2572 errmsg("out of memory")));
2575 return true;
2576 }
2577
2578 /*
2579 * Consider enlarging the array beyond the initial allocation used above.
2580 * By the time this happens, max_safe_fds should be known accurately.
2581 *
2582 * We mustn't let allocated descriptors hog all the available FDs, and in
2583 * practice we'd better leave a reasonable number of FDs for VFD use. So
2584 * set the maximum to max_safe_fds / 3. (This should certainly be at
2585 * least as large as the initial size, FD_MINFREE / 3, so we aren't
2586 * tightening the restriction here.) Recall that "external" FDs are
2587 * allowed to consume another third of max_safe_fds.
2588 */
2589 newMax = max_safe_fds / 3;
2591 {
2593 newMax * sizeof(AllocateDesc));
2594 /* Treat out-of-memory as a non-fatal error. */
2595 if (newDescs == NULL)
2596 return false;
2599 return true;
2600 }
2601
2602 /* Can't enlarge allocatedDescs[] any more. */
2603 return false;
2604}

References allocatedDescs, ereport, errcode(), errmsg(), ERROR, fb(), FD_MINFREE, malloc, max_safe_fds, maxAllocatedDescs, numAllocatedDescs, and realloc.

Referenced by AllocateDir(), AllocateFile(), OpenPipeStream(), and OpenTransientFilePerm().

◆ ReserveExternalFD()

void ReserveExternalFD ( void  )

Definition at line 1203 of file fd.c.

1204{
1205 /*
1206 * Release VFDs if needed to stay safe. Because we do this before
1207 * incrementing numExternalFDs, the final state will be as desired, i.e.,
1208 * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1209 */
1211
1213}

References numExternalFDs, and ReleaseLruFiles().

Referenced by AcquireExternalFD(), BackendInitialize(), dsm_impl_posix(), InitializeWaitEventSupport(), InitPostmasterDeathWatchHandle(), and XLogWrite().

◆ ResourceOwnerForgetFile()

static void ResourceOwnerForgetFile ( ResourceOwner  owner,
File  file 
)
inline static

Definition at line 377 of file fd.c.

378{
380}

References file_resowner_desc, Int32GetDatum(), and ResourceOwnerForget().

Referenced by FileClose().

◆ ResourceOwnerRememberFile()

static void ResourceOwnerRememberFile ( ResourceOwner  owner,
File  file 
)
inline static

Definition at line 372 of file fd.c.

373{
375}

References file_resowner_desc, Int32GetDatum(), and ResourceOwnerRemember().

Referenced by RegisterTemporaryFile().

◆ ResOwnerPrintFile()

static char * ResOwnerPrintFile ( Datum  res)
static

Definition at line 4099 of file fd.c.

4100{
4101 return psprintf("File %d", DatumGetInt32(res));
4102}

References DatumGetInt32(), and psprintf().

◆ ResOwnerReleaseFile()

static void ResOwnerReleaseFile ( Datum  res)
static

Definition at line 4085 of file fd.c.

4086{
4087 File file = (File) DatumGetInt32(res);
4088 Vfd *vfdP;
4089
4090 Assert(FileIsValid(file));
4091
4092 vfdP = &VfdCache[file];
4093 vfdP->resowner = NULL;
4094
4095 FileClose(file);
4096}

References Assert, DatumGetInt32(), fb(), FileClose(), FileIsValid, vfd::resowner, and VfdCache.

◆ set_max_safe_fds()

void set_max_safe_fds ( void  )

Definition at line 1041 of file fd.c.

1042{
1043 int usable_fds;
1044 int already_open;
1045
1046 /*----------
1047 * We want to set max_safe_fds to
1048 * MIN(usable_fds, max_files_per_process)
1049 * less the slop factor for files that are opened without consulting
1050 * fd.c. This ensures that we won't allow to open more than
1051 * max_files_per_process, or the experimentally-determined EMFILE limit,
1052 * additional files.
1053 *----------
1054 */
1057
1059
1060 /*
1061 * Take off the FDs reserved for system() etc.
1062 */
1064
1065 /*
1066 * Make sure we still have enough to get by.
1067 */
1069 ereport(FATAL,
1071 errmsg("insufficient file descriptors available to start server process"),
1072 errdetail("System allows %d, server needs at least %d, %d files are already open.",
1075 already_open)));
1076
1077 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1079}

References count_usable_fds(), DEBUG2, elog, ereport, errcode(), errdetail(), errmsg(), FATAL, fb(), FD_MINFREE, max_files_per_process, max_safe_fds, Min, and NUM_RESERVED_FDS.

Referenced by BootstrapModeMain(), PostgresSingleUserMain(), and PostmasterMain().

◆ SetTempTablespaces()

void SetTempTablespaces ( Oid * tableSpaces,
int  numSpaces 
)

Definition at line 3093 of file fd.c.

3094{
3095 Assert(numSpaces >= 0);
3098
3099 /*
3100 * Select a random starting point in the list. This is to minimize
3101 * conflicts between backends that are most likely sharing the same list
3102 * of temp tablespaces. Note that if we create multiple temp files in the
3103 * same transaction, we'll advance circularly through the list --- this
3104 * ensures that large temporary sort files are nicely spread across all
3105 * available tablespaces.
3106 */
3107 if (numSpaces > 1)
3109 0, numSpaces - 1);
3110 else
3112}

References Assert, fb(), nextTempTableSpace, numTempTableSpaces, pg_global_prng_state, pg_prng_uint64_range(), and tempTableSpaces.

Referenced by assign_temp_tablespaces(), and PrepareTempTablespaces().

◆ SyncDataDirectory()

void SyncDataDirectory ( void  )

Definition at line 3590 of file fd.c.

3591{
3592 bool xlog_is_symlink;
3593
3594 /* We can skip this whole thing if fsync is disabled. */
3595 if (!enableFsync)
3596 return;
3597
3598 /*
3599 * If pg_wal is a symlink, we'll need to recurse into it separately,
3600 * because the first walkdir below will ignore it.
3601 */
3602 xlog_is_symlink = false;
3603
3604 {
3605 struct stat st;
3606
3607 if (lstat("pg_wal", &st) < 0)
3608 ereport(LOG,
3610 errmsg("could not stat file \"%s\": %m",
3611 "pg_wal")));
3612 else if (S_ISLNK(st.st_mode))
3613 xlog_is_symlink = true;
3614 }
3615
3616#ifdef HAVE_SYNCFS
3618 {
3619 DIR *dir;
3620 struct dirent *de;
3621
3622 /*
3623 * On Linux, we don't have to open every single file one by one. We
3624 * can use syncfs() to sync whole filesystems. We only expect
3625 * filesystem boundaries to exist where we tolerate symlinks, namely
3626 * pg_wal and the tablespaces, so we call syncfs() for each of those
3627 * directories.
3628 */
3629
3630 /* Prepare to report progress syncing the data directory via syncfs. */
3632
3633 /* Sync the top level pgdata directory. */
3634 do_syncfs(".");
3635 /* If any tablespaces are configured, sync each of those. */
3637 while ((de = ReadDirExtended(dir, PG_TBLSPC_DIR, LOG)))
3638 {
3639 char path[MAXPGPATH];
3640
3641 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3642 continue;
3643
3644 snprintf(path, MAXPGPATH, "%s/%s", PG_TBLSPC_DIR, de->d_name);
3645 do_syncfs(path);
3646 }
3647 FreeDir(dir);
3648 /* If pg_wal is a symlink, process that too. */
3649 if (xlog_is_symlink)
3650 do_syncfs("pg_wal");
3651 return;
3652 }
3653#endif /* !HAVE_SYNCFS */
3654
3655#ifdef PG_FLUSH_DATA_WORKS
3656 /* Prepare to report progress of the pre-fsync phase. */
3658
3659 /*
3660 * If possible, hint to the kernel that we're soon going to fsync the data
3661 * directory and its contents. Errors in this step are even less
3662 * interesting than normal, so log them only at DEBUG1.
3663 */
3664 walkdir(".", pre_sync_fname, false, DEBUG1);
3665 if (xlog_is_symlink)
3666 walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3668#endif
3669
3670 /* Prepare to report progress syncing the data directory via fsync. */
3672
3673 /*
3674 * Now we do the fsync()s in the same order.
3675 *
3676 * The main call ignores symlinks, so in addition to specially processing
3677 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3678 * process_symlinks = true. Note that if there are any plain directories
3679 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3680 * so we don't worry about optimizing it.
3681 */
3682 walkdir(".", datadir_fsync_fname, false, LOG);
3683 if (xlog_is_symlink)
3684 walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3686}

References AllocateDir(), begin_startup_progress_phase(), DATA_DIR_SYNC_METHOD_SYNCFS, datadir_fsync_fname(), DEBUG1, enableFsync, ereport, errcode_for_file_access(), errmsg(), fb(), FreeDir(), LOG, lstat, MAXPGPATH, PG_TBLSPC_DIR, ReadDirExtended(), recovery_init_sync_method, S_ISLNK, snprintf, stat::st_mode, and walkdir().

Referenced by StartupXLOG().

◆ TempTablespacePath()

void TempTablespacePath ( char * path,
Oid  tablespace 
)

Definition at line 1763 of file fd.c.

1764{
1765 /*
1766 * Identify the tempfile directory for this tablespace.
1767 *
1768 * If someone tries to specify pg_global, use pg_default instead.
1769 */
1770 if (tablespace == InvalidOid ||
1773 snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1774 else
1775 {
1776 /* All other tablespaces are accessed via symlinks */
1777 snprintf(path, MAXPGPATH, "%s/%u/%s/%s",
1780 }
1781}

References fb(), InvalidOid, MAXPGPATH, PG_TBLSPC_DIR, PG_TEMP_FILES_DIR, snprintf, tablespace, and TABLESPACE_VERSION_DIRECTORY.

Referenced by FileSetCreate(), FileSetPath(), OpenTemporaryFileInTablespace(), and pg_ls_tmpdir().

◆ TempTablespacesAreSet()

bool TempTablespacesAreSet ( void  )

Definition at line 3122 of file fd.c.

3123{
3124 return (numTempTableSpaces >= 0);
3125}

References numTempTableSpaces.

Referenced by GetTempTablespaces(), and PrepareTempTablespaces().

◆ unlink_if_exists_fname()

static void unlink_if_exists_fname ( const char * fname,
bool  isdir,
int  elevel 
)
static

Definition at line 3818 of file fd.c.

3819{
3820 if (isdir)
3821 {
3822 if (rmdir(fname) != 0 && errno != ENOENT)
3823 ereport(elevel,
3825 errmsg("could not remove directory \"%s\": %m", fname)));
3826 }
3827 else
3828 {
3829 /* Use PathNameDeleteTemporaryFile to report filesize */
3830 PathNameDeleteTemporaryFile(fname, false);
3831 }
3832}

References ereport, errcode_for_file_access(), errmsg(), fb(), and PathNameDeleteTemporaryFile().

Referenced by PathNameDeleteTemporaryDir().

◆ walkdir()

static void walkdir ( const char * path,
void(*)(const char *fname, bool isdir, int elevel)  action,
bool  process_symlinks,
int  elevel 
)
static

Definition at line 3704 of file fd.c.

3708{
3709 DIR *dir;
3710 struct dirent *de;
3711
3712 dir = AllocateDir(path);
3713
3714 while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3715 {
3716 char subpath[MAXPGPATH * 2];
3717
3719
3720 if (strcmp(de->d_name, ".") == 0 ||
3721 strcmp(de->d_name, "..") == 0)
3722 continue;
3723
3724 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3725
3726 switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3727 {
3728 case PGFILETYPE_REG:
3729 (*action) (subpath, false, elevel);
3730 break;
3731 case PGFILETYPE_DIR:
3732 walkdir(subpath, action, false, elevel);
3733 break;
3734 default:
3735
3736 /*
3737 * Errors are already reported directly by get_dirent_type(),
3738 * and any remaining symlinks and unknown file types are
3739 * ignored.
3740 */
3741 break;
3742 }
3743 }
3744
3745 FreeDir(dir); /* we ignore any error here */
3746
3747 /*
3748 * It's important to fsync the destination directory itself as individual
3749 * file fsyncs don't guarantee that the directory entry for the file is
3750 * synced. However, skip this if AllocateDir failed; the action function
3751 * might not be robust against that.
3752 */
3753 if (dir)
3754 (*action) (path, true, elevel);
3755}

References AllocateDir(), CHECK_FOR_INTERRUPTS, fb(), FreeDir(), get_dirent_type(), MAXPGPATH, PGFILETYPE_DIR, PGFILETYPE_REG, ReadDirExtended(), snprintf, subpath(), and walkdir().

Referenced by PathNameDeleteTemporaryDir(), SyncDataDirectory(), and walkdir().

Variable Documentation

◆ allocatedDescs

◆ data_sync_retry

bool data_sync_retry = false

Definition at line 162 of file fd.c.

Referenced by data_sync_elevel().

◆ file_resowner_desc

const ResourceOwnerDesc file_resowner_desc
static
Initial value:
=
{
.name = "File",
.release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
.release_priority = RELEASE_PRIO_FILES,
.ReleaseResource = ResOwnerReleaseFile,
.DebugPrint = ResOwnerPrintFile
}

Definition at line 361 of file fd.c.

362{
363 .name = "File",
364 .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
365 .release_priority = RELEASE_PRIO_FILES,
366 .ReleaseResource = ResOwnerReleaseFile,
367 .DebugPrint = ResOwnerPrintFile
368};

Referenced by ResourceOwnerForgetFile(), and ResourceOwnerRememberFile().

◆ have_xact_temporary_files

bool have_xact_temporary_files = false
static

Definition at line 228 of file fd.c.

Referenced by CleanupTempFiles(), and RegisterTemporaryFile().

◆ io_direct_flags

◆ max_files_per_process

int max_files_per_process = 1000

Definition at line 146 of file fd.c.

Referenced by set_max_safe_fds().

◆ max_safe_fds

int max_safe_fds = FD_MINFREE

Definition at line 159 of file fd.c.

Referenced by AcquireExternalFD(), ReleaseLruFiles(), reserveAllocatedDesc(), and set_max_safe_fds().

◆ maxAllocatedDescs

int maxAllocatedDescs = 0
static

◆ nextTempTableSpace

int nextTempTableSpace = 0
static

Definition at line 290 of file fd.c.

Referenced by GetNextTempTableSpace(), and SetTempTablespaces().

◆ nfile

int nfile = 0
static

◆ numAllocatedDescs

◆ numExternalFDs

int numExternalFDs = 0
static

Definition at line 274 of file fd.c.

Referenced by AcquireExternalFD(), ReleaseExternalFD(), ReleaseLruFiles(), and ReserveExternalFD().

◆ numTempTableSpaces

int numTempTableSpaces = -1
static

◆ recovery_init_sync_method

int recovery_init_sync_method = DATA_DIR_SYNC_METHOD_FSYNC

Definition at line 165 of file fd.c.

Referenced by SyncDataDirectory().

◆ SizeVfdCache

Size SizeVfdCache = 0
static

◆ tempFileCounter

long tempFileCounter = 0
static

Definition at line 280 of file fd.c.

Referenced by OpenTemporaryFileInTablespace().

◆ temporary_files_size

uint64 temporary_files_size = 0
static

Definition at line 236 of file fd.c.

Referenced by FileClose(), FileTruncate(), and FileWriteV().

◆ tempTableSpaces

Oid* tempTableSpaces = NULL
static

◆ VfdCache