PostgreSQL Source Code git master
Loading...
Searching...
No Matches
fd.c File Reference
#include "postgres.h"
#include <dirent.h>
#include <sys/file.h>
#include <sys/param.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <limits.h>
#include <unistd.h>
#include <fcntl.h>
#include "access/xact.h"
#include "access/xlog.h"
#include "catalog/pg_tablespace.h"
#include "common/file_perm.h"
#include "common/file_utils.h"
#include "common/pg_prng.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/startup.h"
#include "storage/aio.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "utils/guc.h"
#include "utils/guc_hooks.h"
#include "utils/resowner.h"
#include "utils/varlena.h"
Include dependency graph for fd.c:

Go to the source code of this file.

Data Structures

struct  vfd
 
struct  AllocateDesc
 

Macros

#define NUM_RESERVED_FDS   10
 
#define FD_MINFREE   48
 
#define DO_DB(A)    ((void) 0)
 
#define VFD_CLOSED   (-1)
 
#define FileIsValid(file)    ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
 
#define FileIsNotOpen(file)   (VfdCache[file].fd == VFD_CLOSED)
 
#define FD_DELETE_AT_CLOSE   (1 << 0) /* T = delete when closed */
 
#define FD_CLOSE_AT_EOXACT   (1 << 1) /* T = close at eoXact */
 
#define FD_TEMP_FILE_LIMIT   (1 << 2) /* T = respect temp_file_limit */
 

Typedefs

typedef struct vfd Vfd
 

Enumerations

enum  AllocateDescKind { AllocateDescFile , AllocateDescPipe , AllocateDescDir , AllocateDescRawFD }
 

Functions

static void Delete (File file)
 
static void LruDelete (File file)
 
static void Insert (File file)
 
static int LruInsert (File file)
 
static bool ReleaseLruFile (void)
 
static void ReleaseLruFiles (void)
 
static File AllocateVfd (void)
 
static void FreeVfd (File file)
 
static int FileAccess (File file)
 
static File OpenTemporaryFileInTablespace (Oid tblspcOid, bool rejectError)
 
static bool reserveAllocatedDesc (void)
 
static int FreeDesc (AllocateDesc *desc)
 
static void BeforeShmemExit_Files (int code, Datum arg)
 
static void CleanupTempFiles (bool isCommit, bool isProcExit)
 
static void RemovePgTempRelationFiles (const char *tsdirname)
 
static void RemovePgTempRelationFilesInDbspace (const char *dbspacedirname)
 
static void walkdir (const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
 
static void datadir_fsync_fname (const char *fname, bool isdir, int elevel)
 
static void unlink_if_exists_fname (const char *fname, bool isdir, int elevel)
 
static int fsync_parent_path (const char *fname, int elevel)
 
static void ResOwnerReleaseFile (Datum res)
 
static char * ResOwnerPrintFile (Datum res)
 
static void ResourceOwnerRememberFile (ResourceOwner owner, File file)
 
static void ResourceOwnerForgetFile (ResourceOwner owner, File file)
 
int pg_fsync (int fd)
 
int pg_fsync_no_writethrough (int fd)
 
int pg_fsync_writethrough (int fd)
 
int pg_fdatasync (int fd)
 
bool pg_file_exists (const char *name)
 
void pg_flush_data (int fd, pgoff_t offset, pgoff_t nbytes)
 
static int pg_ftruncate (int fd, pgoff_t length)
 
int pg_truncate (const char *path, pgoff_t length)
 
void fsync_fname (const char *fname, bool isdir)
 
int durable_rename (const char *oldfile, const char *newfile, int elevel)
 
int durable_unlink (const char *fname, int elevel)
 
void InitFileAccess (void)
 
void InitTemporaryFileAccess (void)
 
static void count_usable_fds (int max_to_probe, int *usable_fds, int *already_open)
 
void set_max_safe_fds (void)
 
int BasicOpenFile (const char *fileName, int fileFlags)
 
int BasicOpenFilePerm (const char *fileName, int fileFlags, mode_t fileMode)
 
bool AcquireExternalFD (void)
 
void ReserveExternalFD (void)
 
void ReleaseExternalFD (void)
 
static void ReportTemporaryFileUsage (const char *path, pgoff_t size)
 
static void RegisterTemporaryFile (File file)
 
File PathNameOpenFile (const char *fileName, int fileFlags)
 
File PathNameOpenFilePerm (const char *fileName, int fileFlags, mode_t fileMode)
 
void PathNameCreateTemporaryDir (const char *basedir, const char *directory)
 
void PathNameDeleteTemporaryDir (const char *dirname)
 
File OpenTemporaryFile (bool interXact)
 
void TempTablespacePath (char *path, Oid tablespace)
 
File PathNameCreateTemporaryFile (const char *path, bool error_on_failure)
 
File PathNameOpenTemporaryFile (const char *path, int mode)
 
bool PathNameDeleteTemporaryFile (const char *path, bool error_on_failure)
 
void FileClose (File file)
 
int FilePrefetch (File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
 
void FileWriteback (File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
 
ssize_t FileReadV (File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
 
int FileStartReadV (PgAioHandle *ioh, File file, int iovcnt, pgoff_t offset, uint32 wait_event_info)
 
ssize_t FileWriteV (File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
 
int FileSync (File file, uint32 wait_event_info)
 
int FileZero (File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
 
int FileFallocate (File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
 
pgoff_t FileSize (File file)
 
int FileTruncate (File file, pgoff_t offset, uint32 wait_event_info)
 
char * FilePathName (File file)
 
int FileGetRawDesc (File file)
 
int FileGetRawFlags (File file)
 
mode_t FileGetRawMode (File file)
 
FILE * AllocateFile (const char *name, const char *mode)
 
int OpenTransientFile (const char *fileName, int fileFlags)
 
int OpenTransientFilePerm (const char *fileName, int fileFlags, mode_t fileMode)
 
FILE * OpenPipeStream (const char *command, const char *mode)
 
int FreeFile (FILE *file)
 
int CloseTransientFile (int fd)
 
DIR * AllocateDir (const char *dirname)
 
struct dirent * ReadDir (DIR *dir, const char *dirname)
 
struct dirent * ReadDirExtended (DIR *dir, const char *dirname, int elevel)
 
int FreeDir (DIR *dir)
 
int ClosePipeStream (FILE *file)
 
void closeAllVfds (void)
 
void SetTempTablespaces (Oid *tableSpaces, int numSpaces)
 
bool TempTablespacesAreSet (void)
 
int GetTempTablespaces (Oid *tableSpaces, int numSpaces)
 
Oid GetNextTempTableSpace (void)
 
void AtEOSubXact_Files (bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
 
void AtEOXact_Files (bool isCommit)
 
void RemovePgTempFiles (void)
 
void RemovePgTempFilesInDir (const char *tmpdirname, bool missing_ok, bool unlink_all)
 
bool looks_like_temp_rel_name (const char *name)
 
void SyncDataDirectory (void)
 
int fsync_fname_ext (const char *fname, bool isdir, bool ignore_perm, int elevel)
 
int MakePGDirectory (const char *directoryName)
 
int data_sync_elevel (int elevel)
 
bool check_debug_io_direct (char **newval, void **extra, GucSource source)
 
void assign_debug_io_direct (const char *newval, void *extra)
 

Variables

int max_files_per_process = 1000
 
int max_safe_fds = FD_MINFREE
 
bool data_sync_retry = false
 
int recovery_init_sync_method = DATA_DIR_SYNC_METHOD_FSYNC
 
int file_extend_method = DEFAULT_FILE_EXTEND_METHOD
 
int io_direct_flags
 
static Vfd * VfdCache
 
static Size SizeVfdCache = 0
 
static int nfile = 0
 
static bool have_xact_temporary_files = false
 
static uint64 temporary_files_size = 0
 
static int numAllocatedDescs = 0
 
static int maxAllocatedDescs = 0
 
static AllocateDesc * allocatedDescs = NULL
 
static int numExternalFDs = 0
 
static long tempFileCounter = 0
 
static Oid * tempTableSpaces = NULL
 
static int numTempTableSpaces = -1
 
static int nextTempTableSpace = 0
 
static const ResourceOwnerDesc file_resowner_desc
 

Macro Definition Documentation

◆ DO_DB

#define DO_DB (   A)     ((void) 0)

Definition at line 183 of file fd.c.

200{
201 int fd; /* current FD, or VFD_CLOSED if none */
202 unsigned short fdstate; /* bitflags for VFD's state */
203 ResourceOwner resowner; /* owner, for automatic cleanup */
204 File nextFree; /* link to next free VFD, if in freelist */
205 File lruMoreRecently; /* doubly linked recency-of-use list */
206 File lruLessRecently;
207 pgoff_t fileSize; /* current size of file (0 if not temporary) */
208 char *fileName; /* name of file, or NULL for unused VFD */
209 /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
210 int fileFlags; /* open(2) flags for (re)opening the file */
211 mode_t fileMode; /* mode to pass to open(2) */
212} Vfd;
213
214/*
215 * Virtual File Descriptor array pointer and size. This grows as
216 * needed. 'File' values are indexes into this array.
217 * Note that VfdCache[0] is not a usable VFD, just a list header.
218 */
219static Vfd *VfdCache;
220static Size SizeVfdCache = 0;
221
222/*
223 * Number of file descriptors known to be in use by VFD entries.
224 */
225static int nfile = 0;
226
227/*
228 * Flag to tell whether it's worth scanning VfdCache looking for temp files
229 * to close
230 */
231static bool have_xact_temporary_files = false;
232
233/*
234 * Tracks the total size of all temporary files. Note: when temp_file_limit
235 * is being enforced, this cannot overflow since the limit cannot be more
236 * than INT_MAX kilobytes. When not enforcing, it could theoretically
237 * overflow, but we don't care.
238 */
240
241/* Temporary file access initialized and not yet shut down? */
242#ifdef USE_ASSERT_CHECKING
243static bool temporary_files_allowed = false;
244#endif
245
246/*
247 * List of OS handles opened with AllocateFile, AllocateDir and
248 * OpenTransientFile.
249 */
250typedef enum
251{
257
258typedef struct
259{
260 AllocateDescKind kind;
261 SubTransactionId create_subid;
262 union
263 {
264 FILE *file;
265 DIR *dir;
266 int fd;
267 } desc;
269
270static int numAllocatedDescs = 0;
271static int maxAllocatedDescs = 0;
273
274/*
275 * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
276 */
277static int numExternalFDs = 0;
278
279/*
280 * Number of temporary files opened during the current session;
281 * this is used in generation of tempfile names.
282 */
283static long tempFileCounter = 0;
284
285/*
286 * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
287 * indicating that the current database's default tablespace should be used.)
288 * When numTempTableSpaces is -1, this has not been set in the current
289 * transaction.
290 */
291static Oid *tempTableSpaces = NULL;
292static int numTempTableSpaces = -1;
293static int nextTempTableSpace = 0;
294
295
296/*--------------------
297 *
298 * Private Routines
299 *
300 * Delete - delete a file from the Lru ring
301 * LruDelete - remove a file from the Lru ring and close its FD
302 * Insert - put a file at the front of the Lru ring
303 * LruInsert - put a file at the front of the Lru ring and open it
304 * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
305 * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
306 * AllocateVfd - grab a free (or new) file record (from VfdCache)
307 * FreeVfd - free a file record
308 *
309 * The Least Recently Used ring is a doubly linked list that begins and
310 * ends on element zero. Element zero is special -- it doesn't represent
311 * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
312 * anchor that shows us the beginning/end of the ring.
313 * Only VFD elements that are currently really open (have an FD assigned) are
314 * in the Lru ring. Elements that are "virtually" open can be recognized
315 * by having a non-null fileName field.
316 *
317 * example:
318 *
319 * /--less----\ /---------\
320 * v \ v \
321 * #0 --more---> LeastRecentlyUsed --more-\ \
322 * ^\ | |
323 * \\less--> MostRecentlyUsedFile <---/ |
324 * \more---/ \--less--/
325 *
326 *--------------------
327 */
328static void Delete(File file);
329static void LruDelete(File file);
330static void Insert(File file);
331static int LruInsert(File file);
332static bool ReleaseLruFile(void);
333static void ReleaseLruFiles(void);
334static File AllocateVfd(void);
335static void FreeVfd(File file);
336
337static int FileAccess(File file);
339static bool reserveAllocatedDesc(void);
340static int FreeDesc(AllocateDesc *desc);
341
342static void BeforeShmemExit_Files(int code, Datum arg);
343static void CleanupTempFiles(bool isCommit, bool isProcExit);
344static void RemovePgTempRelationFiles(const char *tsdirname);
346
347static void walkdir(const char *path,
348 void (*action) (const char *fname, bool isdir, int elevel),
349 bool process_symlinks,
350 int elevel);
351#ifdef PG_FLUSH_DATA_WORKS
352static void pre_sync_fname(const char *fname, bool isdir, int elevel);
353#endif
354static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
355static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
356
357static int fsync_parent_path(const char *fname, int elevel);
358
359
360/* ResourceOwner callbacks to hold virtual file descriptors */
361static void ResOwnerReleaseFile(Datum res);
362static char *ResOwnerPrintFile(Datum res);
363
365{
366 .name = "File",
367 .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
368 .release_priority = RELEASE_PRIO_FILES,
369 .ReleaseResource = ResOwnerReleaseFile,
370 .DebugPrint = ResOwnerPrintFile
371};
372
373/* Convenience wrappers over ResourceOwnerRemember/Forget */
374static inline void
376{
378}
379static inline void
381{
383}
384
385/*
386 * pg_fsync --- do fsync with or without writethrough
387 */
388int
389pg_fsync(int fd)
390{
391#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
392 struct stat st;
393
394 /*
395 * Some operating system implementations of fsync() have requirements
396 * about the file access modes that were used when their file descriptor
397 * argument was opened, and these requirements differ depending on whether
398 * the file descriptor is for a directory.
399 *
400 * For any file descriptor that may eventually be handed to fsync(), we
401 * should have opened it with access modes that are compatible with
402 * fsync() on all supported systems, otherwise the code may not be
403 * portable, even if it runs ok on the current system.
404 *
405 * We assert here that a descriptor for a file was opened with write
406 * permissions (i.e., not O_RDONLY) and for a directory without write
407 * permissions (O_RDONLY). Notice that the assertion check is made even
408 * if fsync() is disabled.
409 *
410 * If fstat() fails, ignore it and let the follow-up fsync() complain.
411 */
412 if (fstat(fd, &st) == 0)
413 {
414 int desc_flags = fcntl(fd, F_GETFL);
415
417
418 if (S_ISDIR(st.st_mode))
420 else
422 }
423 errno = 0;
424#endif
425
426 /* #if is to skip the wal_sync_method test if there's no need for it */
427#if defined(HAVE_FSYNC_WRITETHROUGH)
430 else
431#endif
433}
434
435
436/*
437 * pg_fsync_no_writethrough --- same as fsync except does nothing if
438 * enableFsync is off
439 */
440int
442{
443 int rc;
444
445 if (!enableFsync)
446 return 0;
447
448retry:
449 rc = fsync(fd);
450
451 if (rc == -1 && errno == EINTR)
452 goto retry;
453
454 return rc;
455}
456
457/*
458 * pg_fsync_writethrough
459 */
460int
462{
463 if (enableFsync)
464 {
465#if defined(F_FULLFSYNC)
466 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
467#else
468 errno = ENOSYS;
469 return -1;
470#endif
471 }
472 else
473 return 0;
474}
475
476/*
477 * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
478 */
479int
480pg_fdatasync(int fd)
481{
482 int rc;
483
484 if (!enableFsync)
485 return 0;
486
487retry:
488 rc = fdatasync(fd);
489
490 if (rc == -1 && errno == EINTR)
491 goto retry;
492
493 return rc;
494}
495
496/*
497 * pg_file_exists -- check that a file exists.
498 *
499 * This requires an absolute path to the file. Returns true if the file
500 * exists and is not a directory, false otherwise.
501 */
502bool
503pg_file_exists(const char *name)
504{
505 struct stat st;
506
507 Assert(name != NULL);
508
509 if (stat(name, &st) == 0)
510 return !S_ISDIR(st.st_mode);
511 else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES))
514 errmsg("could not access file \"%s\": %m", name)));
515
516 return false;
517}
518
519/*
520 * pg_flush_data --- advise OS that the described dirty data should be flushed
521 *
522 * offset of 0 with nbytes 0 means that the entire file should be flushed
523 */
524void
525pg_flush_data(int fd, pgoff_t offset, pgoff_t nbytes)
526{
527 /*
528 * Right now file flushing is primarily used to make later
529 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
530 * if fsyncs are disabled - that's a decision we might want to make
531 * configurable at some point.
532 */
533 if (!enableFsync)
534 return;
535
536 /*
537 * We compile all alternatives that are supported on the current platform,
538 * to find portability problems more easily.
539 */
540#if defined(HAVE_SYNC_FILE_RANGE)
541 {
542 int rc;
543 static bool not_implemented_by_kernel = false;
544
546 return;
547
548retry:
549
550 /*
551 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
552 * tells the OS that writeback for the specified blocks should be
553 * started, but that we don't want to wait for completion. Note that
554 * this call might block if too much dirty data exists in the range.
555 * This is the preferable method on OSs supporting it, as it works
556 * reliably when available (contrast to msync()) and doesn't flush out
557 * clean data (like FADV_DONTNEED).
558 */
559 rc = sync_file_range(fd, offset, nbytes,
561 if (rc != 0)
562 {
563 int elevel;
564
565 if (rc == EINTR)
566 goto retry;
567
568 /*
569 * For systems that don't have an implementation of
570 * sync_file_range() such as Windows WSL, generate only one
571 * warning and then suppress all further attempts by this process.
572 */
573 if (errno == ENOSYS)
574 {
575 elevel = WARNING;
577 }
578 else
579 elevel = data_sync_elevel(WARNING);
580
581 ereport(elevel,
583 errmsg("could not flush dirty data: %m")));
584 }
585
586 return;
587 }
588#endif
589#if !defined(WIN32) && defined(MS_ASYNC)
590 {
591 void *p;
592 static int pagesize = 0;
593
594 /*
595 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
596 * writeback. On linux it only does so if MS_SYNC is specified, but
597 * then it does the writeback synchronously. Luckily all common linux
598 * systems have sync_file_range(). This is preferable over
599 * FADV_DONTNEED because it doesn't flush out clean data.
600 *
601 * We map the file (mmap()), tell the kernel to sync back the contents
602 * (msync()), and then remove the mapping again (munmap()).
603 */
604
605 /* mmap() needs actual length if we want to map whole file */
606 if (offset == 0 && nbytes == 0)
607 {
608 nbytes = lseek(fd, 0, SEEK_END);
609 if (nbytes < 0)
610 {
613 errmsg("could not determine dirty data size: %m")));
614 return;
615 }
616 }
617
618 /*
619 * Some platforms reject partial-page mmap() attempts. To deal with
620 * that, just truncate the request to a page boundary. If any extra
621 * bytes don't get flushed, well, it's only a hint anyway.
622 */
623
624 /* fetch pagesize only once */
625 if (pagesize == 0)
627
628 /* align length to pagesize, dropping any fractional page */
629 if (pagesize > 0)
630 nbytes = (nbytes / pagesize) * pagesize;
631
632 /* fractional-page request is a no-op */
633 if (nbytes <= 0)
634 return;
635
636 /*
637 * mmap could well fail, particularly on 32-bit platforms where there
638 * may simply not be enough address space. If so, silently fall
639 * through to the next implementation.
640 */
641 if (nbytes <= (pgoff_t) SSIZE_MAX)
642 p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
643 else
644 p = MAP_FAILED;
645
646 if (p != MAP_FAILED)
647 {
648 int rc;
649
650 rc = msync(p, (size_t) nbytes, MS_ASYNC);
651 if (rc != 0)
652 {
655 errmsg("could not flush dirty data: %m")));
656 /* NB: need to fall through to munmap()! */
657 }
658
659 rc = munmap(p, (size_t) nbytes);
660 if (rc != 0)
661 {
662 /* FATAL error because mapping would remain */
665 errmsg("could not munmap() while flushing data: %m")));
666 }
667
668 return;
669 }
670 }
671#endif
672#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
673 {
674 int rc;
675
676 /*
677 * Signal the kernel that the passed in range should not be cached
678 * anymore. This has the, desired, side effect of writing out dirty
679 * data, and the, undesired, side effect of likely discarding useful
680 * clean cached blocks. For the latter reason this is the least
681 * preferable method.
682 */
683
684 rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
685
686 if (rc != 0)
687 {
688 /* don't error out, this is just a performance optimization */
691 errmsg("could not flush dirty data: %m")));
692 }
693
694 return;
695 }
696#endif
697}
698
699/*
700 * Truncate an open file to a given length.
701 */
702static int
703pg_ftruncate(int fd, pgoff_t length)
704{
705 int ret;
706
707retry:
708 ret = ftruncate(fd, length);
709
710 if (ret == -1 && errno == EINTR)
711 goto retry;
712
713 return ret;
714}
715
716/*
717 * Truncate a file to a given length by name.
718 */
719int
720pg_truncate(const char *path, pgoff_t length)
721{
722 int ret;
723#ifdef WIN32
724 int save_errno;
725 int fd;
726
728 if (fd >= 0)
729 {
730 ret = pg_ftruncate(fd, length);
734 }
735 else
736 ret = -1;
737#else
738
739retry:
740 ret = truncate(path, length);
741
742 if (ret == -1 && errno == EINTR)
743 goto retry;
744#endif
745
746 return ret;
747}
748
749/*
750 * fsync_fname -- fsync a file or directory, handling errors properly
751 *
752 * Try to fsync a file or directory. When doing the latter, ignore errors that
753 * indicate the OS just doesn't allow/require fsyncing directories.
754 */
755void
756fsync_fname(const char *fname, bool isdir)
757{
759}
760
761/*
762 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
763 *
764 * This routine ensures that, after returning, the effect of renaming file
765 * persists in case of a crash. A crash while this routine is running will
766 * leave you with either the pre-existing or the moved file in place of the
767 * new file; no mixed state or truncated files are possible.
768 *
769 * It does so by using fsync on the old filename and the possibly existing
770 * target filename before the rename, and the target file and directory after.
771 *
772 * Note that rename() cannot be used across arbitrary directories, as they
773 * might not be on the same filesystem. Therefore this routine does not
774 * support renaming across directories.
775 *
776 * Log errors with the caller specified severity.
777 *
778 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
779 * valid upon return.
780 */
781int
782durable_rename(const char *oldfile, const char *newfile, int elevel)
783{
784 int fd;
785
786 /*
787 * First fsync the old and target path (if it exists), to ensure that they
788 * are properly persistent on disk. Syncing the target file is not
789 * strictly necessary, but it makes it easier to reason about crashes;
790 * because it's then guaranteed that either source or target file exists
791 * after a crash.
792 */
793 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
794 return -1;
795
797 if (fd < 0)
798 {
799 if (errno != ENOENT)
800 {
801 ereport(elevel,
803 errmsg("could not open file \"%s\": %m", newfile)));
804 return -1;
805 }
806 }
807 else
808 {
809 if (pg_fsync(fd) != 0)
810 {
811 int save_errno;
812
813 /* close file upon error, might not be in transaction context */
817
818 ereport(elevel,
820 errmsg("could not fsync file \"%s\": %m", newfile)));
821 return -1;
822 }
823
824 if (CloseTransientFile(fd) != 0)
825 {
826 ereport(elevel,
828 errmsg("could not close file \"%s\": %m", newfile)));
829 return -1;
830 }
831 }
832
833 /* Time to do the real deal... */
834 if (rename(oldfile, newfile) < 0)
835 {
836 ereport(elevel,
838 errmsg("could not rename file \"%s\" to \"%s\": %m",
839 oldfile, newfile)));
840 return -1;
841 }
842
843 /*
844 * To guarantee renaming the file is persistent, fsync the file with its
845 * new name, and its containing directory.
846 */
847 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
848 return -1;
849
850 if (fsync_parent_path(newfile, elevel) != 0)
851 return -1;
852
853 return 0;
854}
855
856/*
857 * durable_unlink -- remove a file in a durable manner
858 *
859 * This routine ensures that, after returning, the effect of removing file
860 * persists in case of a crash. A crash while this routine is running will
861 * not leave the system in a mixed state.
862 *
863 * It does so by using fsync on the parent directory of the file after the
864 * actual removal is done.
865 *
866 * Log errors with the severity specified by caller.
867 *
868 * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
869 * valid upon return.
870 */
871int
872durable_unlink(const char *fname, int elevel)
873{
874 if (unlink(fname) < 0)
875 {
876 ereport(elevel,
878 errmsg("could not remove file \"%s\": %m",
879 fname)));
880 return -1;
881 }
882
883 /*
884 * To guarantee that the removal of the file is persistent, fsync its
885 * parent directory.
886 */
887 if (fsync_parent_path(fname, elevel) != 0)
888 return -1;
889
890 return 0;
891}
892
893/*
894 * InitFileAccess --- initialize this module during backend startup
895 *
896 * This is called during either normal or standalone backend start.
897 * It is *not* called in the postmaster.
898 *
899 * Note that this does not initialize temporary file access, that is
900 * separately initialized via InitTemporaryFileAccess().
901 */
902void
903InitFileAccess(void)
904{
905 Assert(SizeVfdCache == 0); /* call me only once */
906
907 /* initialize cache header entry */
908 VfdCache = (Vfd *) malloc(sizeof(Vfd));
909 if (VfdCache == NULL)
912 errmsg("out of memory")));
913
914 MemSet(&(VfdCache[0]), 0, sizeof(Vfd));
916
917 SizeVfdCache = 1;
918}
919
920/*
921 * InitTemporaryFileAccess --- initialize temporary file access during startup
922 *
923 * This is called during either normal or standalone backend start.
924 * It is *not* called in the postmaster.
925 *
926 * This is separate from InitFileAccess() because temporary file cleanup can
927 * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
928 * our reporting has to happen before that. Low level file access should be
929 * available for longer, hence the separate initialization / shutdown of
930 * temporary file handling.
931 */
932void
934{
935 Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
936 Assert(!temporary_files_allowed); /* call me only once */
937
938 /*
939 * Register before-shmem-exit hook to ensure temp files are dropped while
940 * we can still report stats.
941 */
943
944#ifdef USE_ASSERT_CHECKING
946#endif
947}
948
949/*
950 * count_usable_fds --- count how many FDs the system will let us open,
951 * and estimate how many are already open.
952 *
953 * We stop counting if usable_fds reaches max_to_probe. Note: a small
954 * value of max_to_probe might result in an underestimate of already_open;
955 * we must fill in any "gaps" in the set of used FDs before the calculation
956 * of already_open will give the right answer. In practice, max_to_probe
957 * of a couple of dozen should be enough to ensure good results.
958 *
959 * We assume stderr (FD 2) is available for dup'ing. While the calling
960 * script could theoretically close that, it would be a really bad idea,
961 * since then one risks loss of error messages from, e.g., libc.
962 */
963static void
965{
966 int *fd;
967 int size;
968 int used = 0;
969 int highestfd = 0;
970 int j;
971
972#ifdef HAVE_GETRLIMIT
973 struct rlimit rlim;
975#endif
976
977 size = 1024;
978 fd = (int *) palloc(size * sizeof(int));
979
980#ifdef HAVE_GETRLIMIT
982 if (getrlimit_status != 0)
983 ereport(WARNING, (errmsg("getrlimit failed: %m")));
984#endif /* HAVE_GETRLIMIT */
985
986 /* dup until failure or probe limit reached */
987 for (;;)
988 {
989 int thisfd;
990
991#ifdef HAVE_GETRLIMIT
992
993 /*
994 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
995 * some platforms
996 */
997 if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
998 break;
999#endif
1000
1001 thisfd = dup(2);
1002 if (thisfd < 0)
1003 {
1004 /* Expect EMFILE or ENFILE, else it's fishy */
1005 if (errno != EMFILE && errno != ENFILE)
1006 elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
1007 break;
1008 }
1009
1010 if (used >= size)
1011 {
1012 size *= 2;
1013 fd = (int *) repalloc(fd, size * sizeof(int));
1014 }
1015 fd[used++] = thisfd;
1016
1017 if (highestfd < thisfd)
1018 highestfd = thisfd;
1019
1020 if (used >= max_to_probe)
1021 break;
1022 }
1023
1024 /* release the files we opened */
1025 for (j = 0; j < used; j++)
1026 close(fd[j]);
1027
1028 pfree(fd);
1029
1030 /*
1031 * Return results. usable_fds is just the number of successful dups. We
1032 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
1033 * number) and so already_open is highestfd+1 - usable_fds.
1034 */
1035 *usable_fds = used;
1036 *already_open = highestfd + 1 - used;
1037}
1038
1039/*
1040 * set_max_safe_fds
1041 * Determine number of file descriptors that fd.c is allowed to use
1042 */
1043void
1044set_max_safe_fds(void)
1045{
1046 int usable_fds;
1047 int already_open;
1048
1049 /*----------
1050 * We want to set max_safe_fds to
1051 * MIN(usable_fds, max_files_per_process)
1052 * less the slop factor for files that are opened without consulting
1053 * fd.c. This ensures that we won't allow opening more than
1054 * max_files_per_process, or the experimentally-determined EMFILE limit,
1055 * additional files.
1056 *----------
1057 */
1060
1062
1063 /*
1064 * Take off the FDs reserved for system() etc.
1065 */
1067
1068 /*
1069 * Make sure we still have enough to get by.
1070 */
1072 ereport(FATAL,
1074 errmsg("insufficient file descriptors available to start server process"),
1075 errdetail("System allows %d, server needs at least %d, %d files are already open.",
1078 already_open)));
1079
1080 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1082}
1083
1084/*
1085 * Open a file with BasicOpenFilePerm() and pass default file mode for the
1086 * fileMode parameter.
1087 */
1088int
1089BasicOpenFile(const char *fileName, int fileFlags)
1090{
1091 return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1092}
1093
1094/*
1095 * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
1096 *
1097 * This is exported for use by places that really want a plain kernel FD,
1098 * but need to be proof against running out of FDs. Once an FD has been
1099 * successfully returned, it is the caller's responsibility to ensure that
1100 * it will not be leaked on ereport()! Most users should *not* call this
1101 * routine directly, but instead use the VFD abstraction level, which
1102 * provides protection against descriptor leaks as well as management of
1103 * files that need to be open for more than a short period of time.
1104 *
1105 * Ideally this should be the *only* direct call of open() in the backend.
1106 * In practice, the postmaster calls open() directly, and there are some
1107 * direct open() calls done early in backend startup. Those are OK since
1108 * this module wouldn't have any open files to close at that point anyway.
1109 */
1110int
1111BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1112{
1113 int fd;
1114
1115tryAgain:
1116#ifdef PG_O_DIRECT_USE_F_NOCACHE
1117 fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1118#else
1119 fd = open(fileName, fileFlags, fileMode);
1120#endif
1121
1122 if (fd >= 0)
1123 {
1124#ifdef PG_O_DIRECT_USE_F_NOCACHE
1125 if (fileFlags & PG_O_DIRECT)
1126 {
1127 if (fcntl(fd, F_NOCACHE, 1) < 0)
1128 {
1129 int save_errno = errno;
1130
1131 close(fd);
1132 errno = save_errno;
1133 return -1;
1134 }
1135 }
1136#endif
1137
1138 return fd; /* success! */
1139 }
1140
1141 if (errno == EMFILE || errno == ENFILE)
1142 {
1143 int save_errno = errno;
1144
1145 ereport(LOG,
1147 errmsg("out of file descriptors: %m; release and retry")));
1148 errno = 0;
1149 if (ReleaseLruFile())
1150 goto tryAgain;
1151 errno = save_errno;
1152 }
1153
1154 return -1; /* failure */
1155}
1156
1157/*
1158 * AcquireExternalFD - attempt to reserve an external file descriptor
1159 *
1160 * This should be used by callers that need to hold a file descriptor open
1161 * over more than a short interval, but cannot use any of the other facilities
1162 * provided by this module.
1163 *
1164 * The difference between this and the underlying ReserveExternalFD function
1165 * is that this will report failure (by setting errno and returning false)
1166 * if "too many" external FDs are already reserved. This should be used in
1167 * any code where the total number of FDs to be reserved is not predictable
1168 * and small.
1169 */
1170bool
1172{
1173 /*
1174 * We don't want more than max_safe_fds / 3 FDs to be consumed for
1175 * "external" FDs.
1176 */
1177 if (numExternalFDs < max_safe_fds / 3)
1178 {
1180 return true;
1181 }
1182 errno = EMFILE;
1183 return false;
1184}
1185
1186/*
1187 * ReserveExternalFD - report external consumption of a file descriptor
1188 *
1189 * This should be used by callers that need to hold a file descriptor open
1190 * over more than a short interval, but cannot use any of the other facilities
1191 * provided by this module. This just tracks the use of the FD and closes
1192 * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
1193 *
1194 * Call this directly only in code where failure to reserve the FD would be
1195 * fatal; for example, the WAL-writing code does so, since the alternative is
1196 * session failure. Also, it's very unwise to do so in code that could
1197 * consume more than one FD per process.
1198 *
1199 * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
1200 * available, it doesn't matter too much whether this is called before or
1201 * after actually opening the FD; but doing so beforehand reduces the risk of
1202 * an EMFILE failure if not everybody played nice. In any case, it's solely
1203 * caller's responsibility to keep the external-FD count in sync with reality.
1204 */
1205void
1207{
1208 /*
1209 * Release VFDs if needed to stay safe. Because we do this before
1210 * incrementing numExternalFDs, the final state will be as desired, i.e.,
1211 * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1212 */
1214
1216}
1217
1218/*
1219 * ReleaseExternalFD - report release of an external file descriptor
1220 *
1221 * This is guaranteed not to change errno, so it can be used in failure paths.
1222 */
1223void
1225{
1228}
1229
1230
#if defined(FDDEBUG)

/*
 * _dump_lru --- debugging aid (FDDEBUG builds only)
 *
 * Emits one LOG line listing the VFD indexes found by following the
 * lruLessRecently links from the ring header (entry 0) until the walk
 * arrives back at entry 0.  Output is limited by the size of the local
 * buffer; snprintf truncates anything beyond it.
 */
static void
_dump_lru(void)
{
	char		buf[2048];
	int			cur = VfdCache[0].lruLessRecently;
	size_t		used;

	snprintf(buf, sizeof(buf), "LRU: MOST %d ", cur);

	/* the terminating 0 (ring header) is printed too, as the last index */
	while (cur != 0)
	{
		cur = VfdCache[cur].lruLessRecently;
		used = strlen(buf);
		snprintf(buf + used, sizeof(buf) - used, "%d ", cur);
	}

	used = strlen(buf);
	snprintf(buf + used, sizeof(buf) - used, "LEAST");
	elog(LOG, "%s", buf);
}
#endif							/* FDDEBUG */
1251
1252static void
1253Delete(File file)
1254{
1255 Vfd *vfdP;
1256
1257 Assert(file != 0);
1258
1259 DO_DB(elog(LOG, "Delete %d (%s)",
1260 file, VfdCache[file].fileName));
1261 DO_DB(_dump_lru());
1262
1263 vfdP = &VfdCache[file];
1264
1265 VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1266 VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1267
1268 DO_DB(_dump_lru());
1269}
1270
1271static void
1272LruDelete(File file)
1273{
1274 Vfd *vfdP;
1275
1276 Assert(file != 0);
1277
1278 DO_DB(elog(LOG, "LruDelete %d (%s)",
1279 file, VfdCache[file].fileName));
1280
1281 vfdP = &VfdCache[file];
1282
1284
1285 /*
1286 * Close the file. We aren't expecting this to fail; if it does, better
1287 * to leak the FD than to mess up our internal state.
1288 */
1289 if (close(vfdP->fd) != 0)
1291 "could not close file \"%s\": %m", vfdP->fileName);
1292 vfdP->fd = VFD_CLOSED;
1293 --nfile;
1294
1295 /* delete the vfd record from the LRU ring */
1296 Delete(file);
1297}
1298
1299static void
1300Insert(File file)
1301{
1302 Vfd *vfdP;
1303
1304 Assert(file != 0);
1305
1306 DO_DB(elog(LOG, "Insert %d (%s)",
1307 file, VfdCache[file].fileName));
1308 DO_DB(_dump_lru());
1309
1310 vfdP = &VfdCache[file];
1311
1312 vfdP->lruMoreRecently = 0;
1313 vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1314 VfdCache[0].lruLessRecently = file;
1315 VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1316
1317 DO_DB(_dump_lru());
1318}
1319
1320/* returns 0 on success, -1 on re-open failure (with errno set) */
1321static int
1322LruInsert(File file)
1323{
1324 Vfd *vfdP;
1325
1326 Assert(file != 0);
1327
1328 DO_DB(elog(LOG, "LruInsert %d (%s)",
1329 file, VfdCache[file].fileName));
1330
1331 vfdP = &VfdCache[file];
1332
1333 if (FileIsNotOpen(file))
1334 {
1335 /* Close excess kernel FDs. */
1337
1338 /*
1339 * The open could still fail for lack of file descriptors, eg due to
1340 * overall system file table being full. So, be prepared to release
1341 * another FD if necessary...
1342 */
1343 vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1344 vfdP->fileMode);
1345 if (vfdP->fd < 0)
1346 {
1347 DO_DB(elog(LOG, "re-open failed: %m"));
1348 return -1;
1349 }
1350 else
1351 {
1352 ++nfile;
1353 }
1354 }
1355
1356 /*
1357 * put it at the head of the Lru ring
1358 */
1359
1360 Insert(file);
1361
1362 return 0;
1363}
1364
1365/*
1366 * Release one kernel FD by closing the least-recently-used VFD.
1367 */
1368static bool
1369ReleaseLruFile(void)
1370{
1371 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1372
1373 if (nfile > 0)
1374 {
1375 /*
1376 * There are opened files and so there should be at least one used vfd
1377 * in the ring.
1378 */
1379 Assert(VfdCache[0].lruMoreRecently != 0);
1380 LruDelete(VfdCache[0].lruMoreRecently);
1381 return true; /* freed a file */
1382 }
1383 return false; /* no files available to free */
1384}
1385
1386/*
1387 * Release kernel FDs as needed to get under the max_safe_fds limit.
1388 * After calling this, it's OK to try to open another file.
1389 */
1390static void
1391ReleaseLruFiles(void)
1392{
1394 {
1395 if (!ReleaseLruFile())
1396 break;
1397 }
1398}
1399
1400static File
1401AllocateVfd(void)
1402{
1403 Index i;
1404 File file;
1405
1406 DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1407
1408 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1409
1410 if (VfdCache[0].nextFree == 0)
1411 {
1412 /*
1413 * The free list is empty so it is time to increase the size of the
1414 * array. We choose to double it each time this happens. However,
1415 * there's not much point in starting *real* small.
1416 */
1419
1420 if (newCacheSize < 32)
1421 newCacheSize = 32;
1422
1423 /*
1424 * Be careful not to clobber VfdCache ptr if realloc fails.
1425 */
1426 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1427 if (newVfdCache == NULL)
1428 ereport(ERROR,
1430 errmsg("out of memory")));
1432
1433 /*
1434 * Initialize the new entries and link them into the free list.
1435 */
1436 for (i = SizeVfdCache; i < newCacheSize; i++)
1437 {
1438 MemSet(&(VfdCache[i]), 0, sizeof(Vfd));
1439 VfdCache[i].nextFree = i + 1;
1441 }
1444
1445 /*
1446 * Record the new size
1447 */
1449 }
1450
1451 file = VfdCache[0].nextFree;
1452
1454
1455 return file;
1456}
1457
1458static void
1459FreeVfd(File file)
1460{
1461 Vfd *vfdP = &VfdCache[file];
1462
1463 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1464 file, vfdP->fileName ? vfdP->fileName : ""));
1465
1466 if (vfdP->fileName != NULL)
1467 {
1468 free(vfdP->fileName);
1469 vfdP->fileName = NULL;
1470 }
1471 vfdP->fdstate = 0x0;
1472
1473 vfdP->nextFree = VfdCache[0].nextFree;
1474 VfdCache[0].nextFree = file;
1475}
1476
1477/* returns 0 on success, -1 on re-open failure (with errno set) */
1478static int
1479FileAccess(File file)
1480{
1481 int returnValue;
1482
1483 DO_DB(elog(LOG, "FileAccess %d (%s)",
1484 file, VfdCache[file].fileName));
1485
1486 /*
1487 * Is the file open? If not, open it and put it at the head of the LRU
1488 * ring (possibly closing the least recently used file to get an FD).
1489 */
1490
1491 if (FileIsNotOpen(file))
1492 {
1493 returnValue = LruInsert(file);
1494 if (returnValue != 0)
1495 return returnValue;
1496 }
1497 else if (VfdCache[0].lruLessRecently != file)
1498 {
1499 /*
1500 * We now know that the file is open and that it is not the last one
1501 * accessed, so we need to move it to the head of the Lru ring.
1502 */
1503
1504 Delete(file);
1505 Insert(file);
1506 }
1507
1508 return 0;
1509}
1510
1511/*
1512 * Called whenever a temporary file is deleted to report its size.
1513 */
1514static void
1515ReportTemporaryFileUsage(const char *path, pgoff_t size)
1516{
1518
1519 if (log_temp_files >= 0)
1520 {
1521 if ((size / 1024) >= log_temp_files)
1522 ereport(LOG,
1523 (errmsg("temporary file: path \"%s\", size %lu",
1524 path, (unsigned long) size)));
1525 }
1526}
1527
1528/*
1529 * Called to register a temporary file for automatic close.
1530 * ResourceOwnerEnlarge(CurrentResourceOwner) must have been called
1531 * before the file was opened.
1532 */
1533static void
1535{
1538
1539 /* Backup mechanism for closing at end of xact. */
1542}
1543
/*
 * Called when we get a shared invalidation message on some relation.
 *
 * Closes the VFD's kernel FD, if it has one, while leaving the VFD itself
 * allocated.  Compiled only when NOT_USED is defined.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
	Assert(FileIsValid(file));

	/* nothing to do if there is no kernel FD behind this VFD */
	if (FileIsNotOpen(file))
		return;

	LruDelete(file);
}
#endif
1556
1557/*
1558 * Open a file with PathNameOpenFilePerm() and pass default file mode for the
1559 * fileMode parameter.
1560 */
1561File
1562PathNameOpenFile(const char *fileName, int fileFlags)
1563{
1564 return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
1565}
1566
1567/*
1568 * open a file in an arbitrary directory
1569 *
1570 * NB: if the passed pathname is relative (which it usually is),
1571 * it will be interpreted relative to the process' working directory
1572 * (which should always be $PGDATA when this code is running).
1573 */
1574File
1575PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
1576{
1577 char *fnamecopy;
1578 File file;
1579 Vfd *vfdP;
1580
1581 DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1582 fileName, fileFlags, fileMode));
1583
1584 /*
1585 * We need a malloc'd copy of the file name; fail cleanly if no room.
1586 */
1587 fnamecopy = strdup(fileName);
1588 if (fnamecopy == NULL)
1589 ereport(ERROR,
1591 errmsg("out of memory")));
1592
1593 file = AllocateVfd();
1594 vfdP = &VfdCache[file];
1595
1596 /* Close excess kernel FDs. */
1598
1599 /*
1600 * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1601 * client shouldn't be expected to know which kernel descriptors are
1602 * currently open, so it wouldn't make sense for them to be inherited by
1603 * executed subprograms.
1604 */
1605 fileFlags |= O_CLOEXEC;
1606
1607 vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1608
1609 if (vfdP->fd < 0)
1610 {
1611 int save_errno = errno;
1612
1613 FreeVfd(file);
1614 free(fnamecopy);
1615 errno = save_errno;
1616 return -1;
1617 }
1618 ++nfile;
1619 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1620 vfdP->fd));
1621
1622 vfdP->fileName = fnamecopy;
1623 /* Saved flags are adjusted to be OK for re-opening file */
1624 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1625 vfdP->fileMode = fileMode;
1626 vfdP->fileSize = 0;
1627 vfdP->fdstate = 0x0;
1628 vfdP->resowner = NULL;
1629
1630 Insert(file);
1631
1632 return file;
1633}
1634
1635/*
1636 * Create directory 'directory'. If necessary, create 'basedir', which must
1637 * be the directory above it. This is designed for creating the top-level
1638 * temporary directory on demand before creating a directory underneath it.
1639 * Do nothing if the directory already exists.
1640 *
1641 * Directories created within the top-level temporary directory should begin
1642 * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
1643 * deleted at startup by RemovePgTempFiles(). Further subdirectories below
1644 * that do not need any particular prefix.
1645*/
1646void
1647PathNameCreateTemporaryDir(const char *basedir, const char *directory)
1648{
1649 if (MakePGDirectory(directory) < 0)
1650 {
1651 if (errno == EEXIST)
1652 return;
1653
1654 /*
1655 * Failed. Try to create basedir first in case it's missing. Tolerate
1656 * EEXIST to close a race against another process following the same
1657 * algorithm.
1658 */
1659 if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1660 ereport(ERROR,
1662 errmsg("cannot create temporary directory \"%s\": %m",
1663 basedir)));
1664
1665 /* Try again. */
1666 if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1667 ereport(ERROR,
1669 errmsg("cannot create temporary subdirectory \"%s\": %m",
1670 directory)));
1671 }
1672}
1673
1674/*
1675 * Delete a directory and everything in it, if it exists.
1676 */
1677void
1678PathNameDeleteTemporaryDir(const char *dirname)
1679{
1680 struct stat statbuf;
1681
1682 /* Silently ignore missing directory. */
1683 if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1684 return;
1685
1686 /*
1687 * Currently, walkdir doesn't offer a way for our passed in function to
1688 * maintain state. Perhaps it should, so that we could tell the caller
1689 * whether this operation succeeded or failed. Since this operation is
1690 * used in a cleanup path, we wouldn't actually behave differently: we'll
1691 * just log failures.
1692 */
1693 walkdir(dirname, unlink_if_exists_fname, false, LOG);
1694}
1695
1696/*
1697 * Open a temporary file that will disappear when we close it.
1698 *
1699 * This routine takes care of generating an appropriate tempfile name.
1700 * There's no need to pass in fileFlags or fileMode either, since only
1701 * one setting makes any sense for a temp file.
1702 *
1703 * Unless interXact is true, the file is remembered by CurrentResourceOwner
1704 * to ensure it's closed and deleted when it's no longer needed, typically at
1705 * the end-of-transaction. In most cases, you don't want temporary files to
1706 * outlive the transaction that created them, so this should be false -- but
1707 * if you need "somewhat" temporary storage, this might be useful. In either
1708 * case, the file is removed when the File is explicitly closed.
1709 */
1710File
1711OpenTemporaryFile(bool interXact)
1712{
1713 File file = 0;
1714
1715 Assert(temporary_files_allowed); /* check temp file access is up */
1716
1717 /*
1718 * Make sure the current resource owner has space for this File before we
1719 * open it, if we'll be registering it below.
1720 */
1721 if (!interXact)
1723
1724 /*
1725 * If some temp tablespace(s) have been given to us, try to use the next
1726 * one. If a given tablespace can't be found, we silently fall back to
1727 * the database's default tablespace.
1728 *
1729 * BUT: if the temp file is slated to outlive the current transaction,
1730 * force it into the database's default tablespace, so that it will not
1731 * pose a threat to possible tablespace drop attempts.
1732 */
1733 if (numTempTableSpaces > 0 && !interXact)
1734 {
1736
1737 if (OidIsValid(tblspcOid))
1739 }
1740
1741 /*
1742 * If not, or if tablespace is bad, create in database's default
1743 * tablespace. MyDatabaseTableSpace should normally be set before we get
1744 * here, but just in case it isn't, fall back to pg_default tablespace.
1745 */
1746 if (file <= 0)
1750 true);
1751
1752 /* Mark it for deletion at close and temporary file size limit */
1754
1755 /* Register it with the current resource owner */
1756 if (!interXact)
1758
1759 return file;
1760}
1761
1762/*
1763 * Return the path of the temp directory in a given tablespace.
1764 */
1765void
1767{
1768 /*
1769 * Identify the tempfile directory for this tablespace.
1770 *
1771 * If someone tries to specify pg_global, use pg_default instead.
1772 */
1773 if (tablespace == InvalidOid ||
1776 snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1777 else
1778 {
1779 /* All other tablespaces are accessed via symlinks */
1780 snprintf(path, MAXPGPATH, "%s/%u/%s/%s",
1783 }
1784}
1785
1786/*
1787 * Open a temporary file in a specific tablespace.
1788 * Subroutine for OpenTemporaryFile, which see for details.
1789 */
1790static File
1792{
1793 char tempdirpath[MAXPGPATH];
1794 char tempfilepath[MAXPGPATH];
1795 File file;
1796
1798
1799 /*
1800 * Generate a tempfile name that should be unique within the current
1801 * database instance.
1802 */
1803 snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1805
1806 /*
1807 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1808 * temp file that can be reused.
1809 */
1812 if (file <= 0)
1813 {
1814 /*
1815 * We might need to create the tablespace's tempfile directory, if no
1816 * one has yet done so.
1817 *
1818 * Don't check for an error from MakePGDirectory; it could fail if
1819 * someone else just did the same thing. If it doesn't work then
1820 * we'll bomb out on the second create attempt, instead.
1821 */
1823
1826 if (file <= 0 && rejectError)
1827 elog(ERROR, "could not create temporary file \"%s\": %m",
1828 tempfilepath);
1829 }
1830
1831 return file;
1832}
1833
1834
1835/*
1836 * Create a new file. The directory containing it must already exist. Files
1837 * created this way are subject to temp_file_limit and are automatically
1838 * closed at end of transaction, but are not automatically deleted on close
1839 * because they are intended to be shared between cooperating backends.
1840 *
1841 * If the file is inside the top-level temporary directory, its name should
1842 * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
1843 * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
1844 * inside a directory created with PathNameCreateTemporaryDir(), in which case
1845 * the prefix isn't needed.
1846 */
1847File
1848PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
1849{
1850 File file;
1851
1852 Assert(temporary_files_allowed); /* check temp file access is up */
1853
1855
1856 /*
1857 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1858 * temp file that can be reused.
1859 */
1860 file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1861 if (file <= 0)
1862 {
1863 if (error_on_failure)
1864 ereport(ERROR,
1866 errmsg("could not create temporary file \"%s\": %m",
1867 path)));
1868 else
1869 return file;
1870 }
1871
1872 /* Mark it for temp_file_limit accounting. */
1874
1875 /* Register it for automatic close. */
1877
1878 return file;
1879}
1880
1881/*
1882 * Open a file that was created with PathNameCreateTemporaryFile, possibly in
1883 * another backend. Files opened this way don't count against the
1884 * temp_file_limit of the caller, are automatically closed at the end of the
1885 * transaction but are not deleted on close.
1886 */
1887File
1888PathNameOpenTemporaryFile(const char *path, int mode)
1889{
1890 File file;
1891
1892 Assert(temporary_files_allowed); /* check temp file access is up */
1893
1895
1896 file = PathNameOpenFile(path, mode | PG_BINARY);
1897
1898 /* If no such file, then we don't raise an error. */
1899 if (file <= 0 && errno != ENOENT)
1900 ereport(ERROR,
1902 errmsg("could not open temporary file \"%s\": %m",
1903 path)));
1904
1905 if (file > 0)
1906 {
1907 /* Register it for automatic close. */
1909 }
1910
1911 return file;
1912}
1913
1914/*
1915 * Delete a file by pathname. Return true if the file existed, false if
1916 * didn't.
1917 */
1918bool
1919PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
1920{
1921 struct stat filestats;
1922 int stat_errno;
1923
1924 /* Get the final size for pgstat reporting. */
1925 if (stat(path, &filestats) != 0)
1926 stat_errno = errno;
1927 else
1928 stat_errno = 0;
1929
1930 /*
1931 * Unlike FileClose's automatic file deletion code, we tolerate
1932 * non-existence to support BufFileDeleteFileSet which doesn't know how
1933 * many segments it has to delete until it runs out.
1934 */
1935 if (stat_errno == ENOENT)
1936 return false;
1937
1938 if (unlink(path) < 0)
1939 {
1940 if (errno != ENOENT)
1943 errmsg("could not unlink temporary file \"%s\": %m",
1944 path)));
1945 return false;
1946 }
1947
1948 if (stat_errno == 0)
1949 ReportTemporaryFileUsage(path, filestats.st_size);
1950 else
1951 {
1952 errno = stat_errno;
1953 ereport(LOG,
1955 errmsg("could not stat file \"%s\": %m", path)));
1956 }
1957
1958 return true;
1959}
1960
1961/*
1962 * close a file when done with it
1963 */
1964void
1965FileClose(File file)
1966{
1967 Vfd *vfdP;
1968
1969 Assert(FileIsValid(file));
1970
1971 DO_DB(elog(LOG, "FileClose: %d (%s)",
1972 file, VfdCache[file].fileName));
1973
1974 vfdP = &VfdCache[file];
1975
1976 if (!FileIsNotOpen(file))
1977 {
1979
1980 /* close the file */
1981 if (close(vfdP->fd) != 0)
1982 {
1983 /*
1984 * We may need to panic on failure to close non-temporary files;
1985 * see LruDelete.
1986 */
1988 "could not close file \"%s\": %m", vfdP->fileName);
1989 }
1990
1991 --nfile;
1992 vfdP->fd = VFD_CLOSED;
1993
1994 /* remove the file from the lru ring */
1995 Delete(file);
1996 }
1997
1998 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1999 {
2000 /* Subtract its size from current usage (do first in case of error) */
2001 temporary_files_size -= vfdP->fileSize;
2002 vfdP->fileSize = 0;
2003 }
2004
2005 /*
2006 * Delete the file if it was temporary, and make a log entry if wanted
2007 */
2008 if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
2009 {
2010 struct stat filestats;
2011 int stat_errno;
2012
2013 /*
2014 * If we get an error, as could happen within the ereport/elog calls,
2015 * we'll come right back here during transaction abort. Reset the
2016 * flag to ensure that we can't get into an infinite loop. This code
2017 * is arranged to ensure that the worst-case consequence is failing to
2018 * emit log message(s), not failing to attempt the unlink.
2019 */
2020 vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
2021
2022
2023 /* first try the stat() */
2024 if (stat(vfdP->fileName, &filestats))
2025 stat_errno = errno;
2026 else
2027 stat_errno = 0;
2028
2029 /* in any case do the unlink */
2030 if (unlink(vfdP->fileName))
2031 ereport(LOG,
2033 errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
2034
2035 /* and last report the stat results */
2036 if (stat_errno == 0)
2037 ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2038 else
2039 {
2040 errno = stat_errno;
2041 ereport(LOG,
2043 errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2044 }
2045 }
2046
2047 /* Unregister it from the resource owner */
2048 if (vfdP->resowner)
2049 ResourceOwnerForgetFile(vfdP->resowner, file);
2050
2051 /*
2052 * Return the Vfd slot to the free list
2053 */
2054 FreeVfd(file);
2055}
2056
2057/*
2058 * FilePrefetch - initiate asynchronous read of a given range of the file.
2059 *
2060 * Returns 0 on success, otherwise an errno error code (like posix_fadvise()).
2061 *
2062 * posix_fadvise() is the simplest standardized interface that accomplishes
2063 * this.
2064 */
2065int
2066FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2067{
2068 Assert(FileIsValid(file));
2069
2070 DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2071 file, VfdCache[file].fileName,
2072 (int64) offset, (int64) amount));
2073
2074#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2075 {
2076 int returnCode;
2077
2078 returnCode = FileAccess(file);
2079 if (returnCode < 0)
2080 return returnCode;
2081
2082retry:
2083 pgstat_report_wait_start(wait_event_info);
2084 returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2087
2088 if (returnCode == EINTR)
2089 goto retry;
2090
2091 return returnCode;
2092 }
2093#elif defined(__darwin__)
2094 {
2095 struct radvisory
2096 {
2097 off_t ra_offset; /* offset into the file */
2098 int ra_count; /* size of the read */
2099 } ra;
2100 int returnCode;
2101
2102 returnCode = FileAccess(file);
2103 if (returnCode < 0)
2104 return returnCode;
2105
2106 ra.ra_offset = offset;
2107 ra.ra_count = amount;
2108 pgstat_report_wait_start(wait_event_info);
2111 if (returnCode != -1)
2112 return 0;
2113 else
2114 return errno;
2115 }
2116#else
2117 return 0;
2118#endif
2119}
2120
2121void
2122FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
2123{
2124 int returnCode;
2125
2126 Assert(FileIsValid(file));
2127
2128 DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2129 file, VfdCache[file].fileName,
2130 (int64) offset, (int64) nbytes));
2131
2132 if (nbytes <= 0)
2133 return;
2134
2135 if (VfdCache[file].fileFlags & PG_O_DIRECT)
2136 return;
2137
2138 returnCode = FileAccess(file);
2139 if (returnCode < 0)
2140 return;
2141
2142 pgstat_report_wait_start(wait_event_info);
2143 pg_flush_data(VfdCache[file].fd, offset, nbytes);
2145}
2146
2147ssize_t
2148FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset,
2149 uint32 wait_event_info)
2150{
2152 Vfd *vfdP;
2153
2154 Assert(FileIsValid(file));
2155
2156 DO_DB(elog(LOG, "FileReadV: %d (%s) " INT64_FORMAT " %d",
2157 file, VfdCache[file].fileName,
2158 (int64) offset,
2159 iovcnt));
2160
2161 returnCode = FileAccess(file);
2162 if (returnCode < 0)
2163 return returnCode;
2164
2165 vfdP = &VfdCache[file];
2166
2167retry:
2168 pgstat_report_wait_start(wait_event_info);
2169 returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
2171
2172 if (returnCode < 0)
2173 {
2174 /*
2175 * Windows may run out of kernel buffers and return "Insufficient
2176 * system resources" error. Wait a bit and retry to solve it.
2177 *
2178 * It is rumored that EINTR is also possible on some Unix filesystems,
2179 * in which case immediate retry is indicated.
2180 */
2181#ifdef WIN32
2183
2184 switch (error)
2185 {
2187 pg_usleep(1000L);
2188 errno = EINTR;
2189 break;
2190 default:
2192 break;
2193 }
2194#endif
2195 /* OK to retry if interrupted */
2196 if (errno == EINTR)
2197 goto retry;
2198 }
2199
2200 return returnCode;
2201}
2202
2203int
2205 int iovcnt, pgoff_t offset,
2206 uint32 wait_event_info)
2207{
2208 int returnCode;
2209 Vfd *vfdP;
2210
2211 Assert(FileIsValid(file));
2212
2213 DO_DB(elog(LOG, "FileStartReadV: %d (%s) " INT64_FORMAT " %d",
2214 file, VfdCache[file].fileName,
2215 (int64) offset,
2216 iovcnt));
2217
2218 returnCode = FileAccess(file);
2219 if (returnCode < 0)
2220 return returnCode;
2221
2222 vfdP = &VfdCache[file];
2223
2224 pgaio_io_start_readv(ioh, vfdP->fd, iovcnt, offset);
2225
2226 return 0;
2227}
2228
2229ssize_t
2230FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset,
2231 uint32 wait_event_info)
2232{
2234 Vfd *vfdP;
2235
2236 Assert(FileIsValid(file));
2237
2238 DO_DB(elog(LOG, "FileWriteV: %d (%s) " INT64_FORMAT " %d",
2239 file, VfdCache[file].fileName,
2240 (int64) offset,
2241 iovcnt));
2242
2243 returnCode = FileAccess(file);
2244 if (returnCode < 0)
2245 return returnCode;
2246
2247 vfdP = &VfdCache[file];
2248
2249 /*
2250 * If enforcing temp_file_limit and it's a temp file, check to see if the
2251 * write would overrun temp_file_limit, and throw error if so. Note: it's
2252 * really a modularity violation to throw error here; we should set errno
2253 * and return -1. However, there's no way to report a suitable error
2254 * message if we do that. All current callers would just throw error
2255 * immediately anyway, so this is safe at present.
2256 */
2257 if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2258 {
2259 pgoff_t past_write = offset;
2260
2261 for (int i = 0; i < iovcnt; ++i)
2262 past_write += iov[i].iov_len;
2263
2264 if (past_write > vfdP->fileSize)
2265 {
2267
2269 if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2270 ereport(ERROR,
2272 errmsg("temporary file size exceeds \"temp_file_limit\" (%dkB)",
2273 temp_file_limit)));
2274 }
2275 }
2276
2277retry:
2278 pgstat_report_wait_start(wait_event_info);
2279 returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
2281
2282 if (returnCode >= 0)
2283 {
2284 /*
2285 * Some callers expect short writes to set errno, and traditionally we
2286 * have assumed that they imply disk space shortage. We don't want to
2287 * waste CPU cycles adding up the total size here, so we'll just set
2288 * it for all successful writes in case such a caller determines that
2289 * the write was short and ereports "%m".
2290 */
2291 errno = ENOSPC;
2292
2293 /*
2294 * Maintain fileSize and temporary_files_size if it's a temp file.
2295 */
2296 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2297 {
2298 pgoff_t past_write = offset + returnCode;
2299
2300 if (past_write > vfdP->fileSize)
2301 {
2302 temporary_files_size += past_write - vfdP->fileSize;
2303 vfdP->fileSize = past_write;
2304 }
2305 }
2306 }
2307 else
2308 {
2309 /*
2310 * See comments in FileReadV()
2311 */
2312#ifdef WIN32
2314
2315 switch (error)
2316 {
2318 pg_usleep(1000L);
2319 errno = EINTR;
2320 break;
2321 default:
2323 break;
2324 }
2325#endif
2326 /* OK to retry if interrupted */
2327 if (errno == EINTR)
2328 goto retry;
2329 }
2330
2331 return returnCode;
2332}
2333
2334int
2335FileSync(File file, uint32 wait_event_info)
2336{
2337 int returnCode;
2338
2339 Assert(FileIsValid(file));
2340
2341 DO_DB(elog(LOG, "FileSync: %d (%s)",
2342 file, VfdCache[file].fileName));
2343
2344 returnCode = FileAccess(file);
2345 if (returnCode < 0)
2346 return returnCode;
2347
2348 pgstat_report_wait_start(wait_event_info);
2349 returnCode = pg_fsync(VfdCache[file].fd);
2351
2352 return returnCode;
2353}
2354
2355/*
2356 * Zero a region of the file.
2357 *
2358 * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2359 * appropriate error.
2360 */
2361int
2362FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2363{
2364 int returnCode;
2366
2367 Assert(FileIsValid(file));
2368
2369 DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2370 file, VfdCache[file].fileName,
2371 (int64) offset, (int64) amount));
2372
2373 returnCode = FileAccess(file);
2374 if (returnCode < 0)
2375 return returnCode;
2376
2377 pgstat_report_wait_start(wait_event_info);
2378 written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2380
2381 if (written < 0)
2382 return -1;
2383 else if (written != amount)
2384 {
2385 /* if errno is unset, assume problem is no disk space */
2386 if (errno == 0)
2387 errno = ENOSPC;
2388 return -1;
2389 }
2390
2391 return 0;
2392}
2393
2394/*
2395 * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
2396 * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
2397 * use FileZero() instead.
2398 *
2399 * Note that at least glibc() implements posix_fallocate() in userspace if not
2400 * implemented by the filesystem. That's not the case for all environments
2401 * though.
2402 *
2403 * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
2404 * appropriate error.
2405 */
2406int
2407FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
2408{
2409#ifdef HAVE_POSIX_FALLOCATE
2410 int returnCode;
2411
2412 Assert(FileIsValid(file));
2413
2414 DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2415 file, VfdCache[file].fileName,
2416 (int64) offset, (int64) amount));
2417
2418 returnCode = FileAccess(file);
2419 if (returnCode < 0)
2420 return -1;
2421
2422retry:
2423 pgstat_report_wait_start(wait_event_info);
2424 returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2426
2427 if (returnCode == 0)
2428 return 0;
2429 else if (returnCode == EINTR)
2430 goto retry;
2431
2432 /* for compatibility with %m printing etc */
2433 errno = returnCode;
2434
2435 /*
2436 * Return in cases of a "real" failure, if fallocate is not supported,
2437 * fall through to the FileZero() backed implementation.
2438 */
2440 return -1;
2441#endif
2442
2443 return FileZero(file, offset, amount, wait_event_info);
2444}
2445
2446pgoff_t
2447FileSize(File file)
2448{
2449 Assert(FileIsValid(file));
2450
2451 DO_DB(elog(LOG, "FileSize %d (%s)",
2452 file, VfdCache[file].fileName));
2453
2454 if (FileIsNotOpen(file))
2455 {
2456 if (FileAccess(file) < 0)
2457 return (pgoff_t) -1;
2458 }
2459
2460 return lseek(VfdCache[file].fd, 0, SEEK_END);
2461}
2462
2463int
2464FileTruncate(File file, pgoff_t offset, uint32 wait_event_info)
2465{
2466 int returnCode;
2467
2468 Assert(FileIsValid(file));
2469
2470 DO_DB(elog(LOG, "FileTruncate %d (%s)",
2471 file, VfdCache[file].fileName));
2472
2473 returnCode = FileAccess(file);
2474 if (returnCode < 0)
2475 return returnCode;
2476
2477 pgstat_report_wait_start(wait_event_info);
2478 returnCode = pg_ftruncate(VfdCache[file].fd, offset);
2480
2481 if (returnCode == 0 && VfdCache[file].fileSize > offset)
2482 {
2483 /* adjust our state for truncation of a temp file */
2484 Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2485 temporary_files_size -= VfdCache[file].fileSize - offset;
2486 VfdCache[file].fileSize = offset;
2487 }
2488
2489 return returnCode;
2490}
2491
2492/*
2493 * Return the pathname associated with an open file.
2494 *
2495 * The returned string points to an internal buffer, which is valid until
2496 * the file is closed.
2497 */
2498char *
2499FilePathName(File file)
2500{
2501 Assert(FileIsValid(file));
2502
2503 return VfdCache[file].fileName;
2504}
2505
2506/*
2507 * Return the raw file descriptor of an opened file.
2508 *
2509 * The returned file descriptor will be valid until the file is closed, but
2510 * there are a lot of things that can make that happen. So the caller should
2511 * be careful not to do much of anything else before it finishes using the
2512 * returned file descriptor.
2513 */
2514int
2515FileGetRawDesc(File file)
2516{
2517 int returnCode;
2518
2519 returnCode = FileAccess(file);
2520 if (returnCode < 0)
2521 return returnCode;
2522
2523 Assert(FileIsValid(file));
2524 return VfdCache[file].fd;
2525}
2526
2527/*
2528 * FileGetRawFlags - returns the file flags on open(2)
2529 */
2530int
2532{
2533 Assert(FileIsValid(file));
2534 return VfdCache[file].fileFlags;
2535}
2536
2537/*
2538 * FileGetRawMode - returns the mode bitmask passed to open(2)
2539 */
2540mode_t
2541FileGetRawMode(File file)
2542{
2543 Assert(FileIsValid(file));
2544 return VfdCache[file].fileMode;
2545}
2546
2547/*
2548 * Make room for another allocatedDescs[] array entry if needed and possible.
2549 * Returns true if an array element is available.
2550 */
2551static bool
2553{
2555 int newMax;
2556
2557 /* Quick out if array already has a free slot. */
2559 return true;
2560
2561 /*
2562 * If the array hasn't yet been created in the current process, initialize
2563 * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2564 * we will ever need, anyway. We don't want to look at max_safe_fds
2565 * immediately because set_max_safe_fds() may not have run yet.
2566 */
2567 if (allocatedDescs == NULL)
2568 {
2569 newMax = FD_MINFREE / 3;
2571 /* Out of memory already? Treat as fatal error. */
2572 if (newDescs == NULL)
2573 ereport(ERROR,
2575 errmsg("out of memory")));
2578 return true;
2579 }
2580
2581 /*
2582 * Consider enlarging the array beyond the initial allocation used above.
2583 * By the time this happens, max_safe_fds should be known accurately.
2584 *
2585 * We mustn't let allocated descriptors hog all the available FDs, and in
2586 * practice we'd better leave a reasonable number of FDs for VFD use. So
2587 * set the maximum to max_safe_fds / 3. (This should certainly be at
2588 * least as large as the initial size, FD_MINFREE / 3, so we aren't
2589 * tightening the restriction here.) Recall that "external" FDs are
2590 * allowed to consume another third of max_safe_fds.
2591 */
2592 newMax = max_safe_fds / 3;
2594 {
2596 newMax * sizeof(AllocateDesc));
2597 /* Treat out-of-memory as a non-fatal error. */
2598 if (newDescs == NULL)
2599 return false;
2602 return true;
2603 }
2604
2605 /* Can't enlarge allocatedDescs[] any more. */
2606 return false;
2607}
2608
2609/*
2610 * Routines that want to use stdio (ie, FILE*) should use AllocateFile
2611 * rather than plain fopen(). This lets fd.c deal with freeing FDs if
2612 * necessary to open the file. When done, call FreeFile rather than fclose.
2613 *
2614 * Note that files that will be open for any significant length of time
2615 * should NOT be handled this way, since they cannot share kernel file
2616 * descriptors with other files; there is grave risk of running out of FDs
2617 * if anyone locks down too many FDs. Most callers of this routine are
2618 * simply reading a config file that they will read and close immediately.
2619 *
2620 * fd.c will automatically close all files opened with AllocateFile at
2621 * transaction commit or abort; this prevents FD leakage if a routine
2622 * that calls AllocateFile is terminated prematurely by ereport(ERROR).
2623 *
2624 * Ideally this should be the *only* direct call of fopen() in the backend.
2625 */
2626FILE *
2627AllocateFile(const char *name, const char *mode)
2628{
2629 FILE *file;
2630
2631 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2633
2634 /* Can we allocate another non-virtual FD? */
2635 if (!reserveAllocatedDesc())
2636 ereport(ERROR,
2638 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2640
2641 /* Close excess kernel FDs. */
2643
2644TryAgain:
2645 if ((file = fopen(name, mode)) != NULL)
2646 {
2648
2649 desc->kind = AllocateDescFile;
2650 desc->desc.file = file;
2653 return desc->desc.file;
2654 }
2655
2656 if (errno == EMFILE || errno == ENFILE)
2657 {
2658 int save_errno = errno;
2659
2660 ereport(LOG,
2662 errmsg("out of file descriptors: %m; release and retry")));
2663 errno = 0;
2664 if (ReleaseLruFile())
2665 goto TryAgain;
2666 errno = save_errno;
2667 }
2668
2669 return NULL;
2670}
2671
2672/*
2673 * Open a file with OpenTransientFilePerm() and pass default file mode for
2674 * the fileMode parameter.
2675 */
2676int
2677OpenTransientFile(const char *fileName, int fileFlags)
2678{
2679 return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
2680}
2681
2682/*
2683 * Like AllocateFile, but returns an unbuffered fd like open(2)
2684 */
2685int
2686OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
2687{
2688 int fd;
2689
2690 DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2691 numAllocatedDescs, fileName));
2692
2693 /* Can we allocate another non-virtual FD? */
2694 if (!reserveAllocatedDesc())
2695 ereport(ERROR,
2697 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2698 maxAllocatedDescs, fileName)));
2699
2700 /* Close excess kernel FDs. */
2702
2703 fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2704
2705 if (fd >= 0)
2706 {
2708
2709 desc->kind = AllocateDescRawFD;
2710 desc->desc.fd = fd;
2713
2714 return fd;
2715 }
2716
2717 return -1; /* failure */
2718}
2719
2720/*
2721 * Routines that want to initiate a pipe stream should use OpenPipeStream
2722 * rather than plain popen(). This lets fd.c deal with freeing FDs if
2723 * necessary. When done, call ClosePipeStream rather than pclose.
2724 *
2725 * This function also ensures that the popen'd program is run with default
2726 * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
2727 * uses. This ensures desirable response to, eg, closing a read pipe early.
2728 */
2729FILE *
2730OpenPipeStream(const char *command, const char *mode)
2731{
2732 FILE *file;
2733 int save_errno;
2734
2735 DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2736 numAllocatedDescs, command));
2737
2738 /* Can we allocate another non-virtual FD? */
2739 if (!reserveAllocatedDesc())
2740 ereport(ERROR,
2742 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2743 maxAllocatedDescs, command)));
2744
2745 /* Close excess kernel FDs. */
2747
2748TryAgain:
2749 fflush(NULL);
2751 errno = 0;
2752 file = popen(command, mode);
2753 save_errno = errno;
2755 errno = save_errno;
2756 if (file != NULL)
2757 {
2759
2760 desc->kind = AllocateDescPipe;
2761 desc->desc.file = file;
2764 return desc->desc.file;
2765 }
2766
2767 if (errno == EMFILE || errno == ENFILE)
2768 {
2769 ereport(LOG,
2771 errmsg("out of file descriptors: %m; release and retry")));
2772 if (ReleaseLruFile())
2773 goto TryAgain;
2774 errno = save_errno;
2775 }
2776
2777 return NULL;
2778}
2779
2780/*
2781 * Free an AllocateDesc of any type.
2782 *
2783 * The argument *must* point into the allocatedDescs[] array.
2784 */
2785static int
2787{
2788 int result;
2789
2790 /* Close the underlying object */
2791 switch (desc->kind)
2792 {
2793 case AllocateDescFile:
2794 result = fclose(desc->desc.file);
2795 break;
2796 case AllocateDescPipe:
2797 result = pclose(desc->desc.file);
2798 break;
2799 case AllocateDescDir:
2800 result = closedir(desc->desc.dir);
2801 break;
2802 case AllocateDescRawFD:
2803 pgaio_closing_fd(desc->desc.fd);
2804 result = close(desc->desc.fd);
2805 break;
2806 default:
2807 elog(ERROR, "AllocateDesc kind not recognized");
2808 result = 0; /* keep compiler quiet */
2809 break;
2810 }
2811
2812 /* Compact storage in the allocatedDescs array */
2815
2816 return result;
2817}
2818
2819/*
2820 * Close a file returned by AllocateFile.
2821 *
2822 * Note we do not check fclose's return value --- it is up to the caller
2823 * to handle close errors.
2824 */
2825int
2826FreeFile(FILE *file)
2827{
2828 int i;
2829
2830 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2831
2832 /* Remove file from list of allocated files, if it's present */
2833 for (i = numAllocatedDescs; --i >= 0;)
2834 {
2835 AllocateDesc *desc = &allocatedDescs[i];
2836
2837 if (desc->kind == AllocateDescFile && desc->desc.file == file)
2838 return FreeDesc(desc);
2839 }
2840
2841 /* Only get here if someone passes us a file not in allocatedDescs */
2842 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2843
2844 return fclose(file);
2845}
2846
2847/*
2848 * Close a file returned by OpenTransientFile.
2849 *
2850 * Note we do not check close's return value --- it is up to the caller
2851 * to handle close errors.
2852 */
2853int
2855{
2856 int i;
2857
2858 DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2859
2860 /* Remove fd from list of allocated files, if it's present */
2861 for (i = numAllocatedDescs; --i >= 0;)
2862 {
2863 AllocateDesc *desc = &allocatedDescs[i];
2864
2865 if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2866 return FreeDesc(desc);
2867 }
2868
2869 /* Only get here if someone passes us a file not in allocatedDescs */
2870 elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2871
2873
2874 return close(fd);
2875}
2876
2877/*
2878 * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
2879 * rather than plain opendir(). This lets fd.c deal with freeing FDs if
2880 * necessary to open the directory, and with closing it after an elog.
2881 * When done, call FreeDir rather than closedir.
2882 *
2883 * Returns NULL, with errno set, on failure. Note that failure detection
2884 * is commonly left to the following call of ReadDir or ReadDirExtended;
2885 * see the comments for ReadDir.
2886 *
2887 * Ideally this should be the *only* direct call of opendir() in the backend.
2888 */
2889DIR *
2890AllocateDir(const char *dirname)
2891{
2892 DIR *dir;
2893
2894 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2895 numAllocatedDescs, dirname));
2896
2897 /* Can we allocate another non-virtual FD? */
2898 if (!reserveAllocatedDesc())
2899 ereport(ERROR,
2901 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2902 maxAllocatedDescs, dirname)));
2903
2904 /* Close excess kernel FDs. */
2906
2907TryAgain:
2908 if ((dir = opendir(dirname)) != NULL)
2909 {
2911
2912 desc->kind = AllocateDescDir;
2913 desc->desc.dir = dir;
2916 return desc->desc.dir;
2917 }
2918
2919 if (errno == EMFILE || errno == ENFILE)
2920 {
2921 int save_errno = errno;
2922
2923 ereport(LOG,
2925 errmsg("out of file descriptors: %m; release and retry")));
2926 errno = 0;
2927 if (ReleaseLruFile())
2928 goto TryAgain;
2929 errno = save_errno;
2930 }
2931
2932 return NULL;
2933}
2934
2935/*
2936 * Read a directory opened with AllocateDir, ereport'ing any error.
2937 *
2938 * This is easier to use than raw readdir() since it takes care of some
2939 * otherwise rather tedious and error-prone manipulation of errno. Also,
2940 * if you are happy with a generic error message for AllocateDir failure,
2941 * you can just do
2942 *
2943 * dir = AllocateDir(path);
2944 * while ((dirent = ReadDir(dir, path)) != NULL)
2945 * process dirent;
2946 * FreeDir(dir);
2947 *
2948 * since a NULL dir parameter is taken as indicating AllocateDir failed.
2949 * (Make sure errno isn't changed between AllocateDir and ReadDir if you
2950 * use this shortcut.)
2951 *
2952 * The pathname passed to AllocateDir must be passed to this routine too,
2953 * but it is only used for error reporting.
2954 */
2955struct dirent *
2956ReadDir(DIR *dir, const char *dirname)
2957{
2958 return ReadDirExtended(dir, dirname, ERROR);
2959}
2960
2961/*
2962 * Alternate version of ReadDir that allows caller to specify the elevel
2963 * for any error report (whether it's reporting an initial failure of
2964 * AllocateDir or a subsequent directory read failure).
2965 *
2966 * If elevel < ERROR, returns NULL after any error. With the normal coding
2967 * pattern, this will result in falling out of the loop immediately as
2968 * though the directory contained no (more) entries.
2969 */
2970struct dirent *
2971ReadDirExtended(DIR *dir, const char *dirname, int elevel)
2972{
2973 struct dirent *dent;
2974
2975 /* Give a generic message for AllocateDir failure, if caller didn't */
2976 if (dir == NULL)
2977 {
2978 ereport(elevel,
2980 errmsg("could not open directory \"%s\": %m",
2981 dirname)));
2982 return NULL;
2983 }
2984
2985 errno = 0;
2986 if ((dent = readdir(dir)) != NULL)
2987 return dent;
2988
2989 if (errno)
2990 ereport(elevel,
2992 errmsg("could not read directory \"%s\": %m",
2993 dirname)));
2994 return NULL;
2995}
2996
2997/*
2998 * Close a directory opened with AllocateDir.
2999 *
3000 * Returns closedir's return value (with errno set if it's not 0).
3001 * Note we do not check the return value --- it is up to the caller
3002 * to handle close errors if wanted.
3003 *
3004 * Does nothing if dir == NULL; we assume that directory open failure was
3005 * already reported if desired.
3006 */
3007int
3008FreeDir(DIR *dir)
3009{
3010 int i;
3011
3012 /* Nothing to do if AllocateDir failed */
3013 if (dir == NULL)
3014 return 0;
3015
3016 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
3017
3018 /* Remove dir from list of allocated dirs, if it's present */
3019 for (i = numAllocatedDescs; --i >= 0;)
3020 {
3021 AllocateDesc *desc = &allocatedDescs[i];
3022
3023 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
3024 return FreeDesc(desc);
3025 }
3026
3027 /* Only get here if someone passes us a dir not in allocatedDescs */
3028 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
3029
3030 return closedir(dir);
3031}
3032
3033
3034/*
3035 * Close a pipe stream returned by OpenPipeStream.
3036 */
3037int
3038ClosePipeStream(FILE *file)
3039{
3040 int i;
3041
3042 DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
3043
3044 /* Remove file from list of allocated files, if it's present */
3045 for (i = numAllocatedDescs; --i >= 0;)
3046 {
3047 AllocateDesc *desc = &allocatedDescs[i];
3048
3049 if (desc->kind == AllocateDescPipe && desc->desc.file == file)
3050 return FreeDesc(desc);
3051 }
3052
3053 /* Only get here if someone passes us a file not in allocatedDescs */
3054 elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
3055
3056 return pclose(file);
3057}
3058
3059/*
3060 * closeAllVfds
3061 *
3062 * Force all VFDs into the physically-closed state, so that the fewest
3063 * possible number of kernel file descriptors are in use. There is no
3064 * change in the logical state of the VFDs.
3065 */
3066void
3067closeAllVfds(void)
3068{
3069 Index i;
3070
3071 if (SizeVfdCache > 0)
3072 {
3073 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3074 for (i = 1; i < SizeVfdCache; i++)
3075 {
3076 if (!FileIsNotOpen(i))
3077 LruDelete(i);
3078 }
3079 }
3080}
3081
3082
3083/*
3084 * SetTempTablespaces
3085 *
3086 * Define a list (actually an array) of OIDs of tablespaces to use for
3087 * temporary files. This list will be used until end of transaction,
3088 * unless this function is called again before then. It is caller's
3089 * responsibility that the passed-in array has adequate lifespan (typically
3090 * it'd be allocated in TopTransactionContext).
3091 *
3092 * Some entries of the array may be InvalidOid, indicating that the current
3093 * database's default tablespace should be used.
3094 */
3095void
3097{
3098 Assert(numSpaces >= 0);
3101
3102 /*
3103 * Select a random starting point in the list. This is to minimize
3104 * conflicts between backends that are most likely sharing the same list
3105 * of temp tablespaces. Note that if we create multiple temp files in the
3106 * same transaction, we'll advance circularly through the list --- this
3107 * ensures that large temporary sort files are nicely spread across all
3108 * available tablespaces.
3109 */
3110 if (numSpaces > 1)
3112 0, numSpaces - 1);
3113 else
3115}
3116
3117/*
3118 * TempTablespacesAreSet
3119 *
3120 * Returns true if SetTempTablespaces has been called in current transaction.
3121 * (This is just so that tablespaces.c doesn't need its own per-transaction
3122 * state.)
3123 */
3124bool
3126{
3127 return (numTempTableSpaces >= 0);
3128}
3129
3130/*
3131 * GetTempTablespaces
3132 *
3133 * Populate an array with the OIDs of the tablespaces that should be used for
3134 * temporary files. (Some entries may be InvalidOid, indicating that the
3135 * current database's default tablespace should be used.) At most numSpaces
3136 * entries will be filled.
3137 * Returns the number of OIDs that were copied into the output array.
3138 */
3139int
3141{
3142 int i;
3143
3145 for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
3147
3148 return i;
3149}
3150
3151/*
3152 * GetNextTempTableSpace
3153 *
3154 * Select the next temp tablespace to use. A result of InvalidOid means
3155 * to use the current database's default tablespace.
3156 */
3157Oid
3159{
3160 if (numTempTableSpaces > 0)
3161 {
3162 /* Advance nextTempTableSpace counter with wraparound */
3166 }
3167 return InvalidOid;
3168}
3169
3170
3171/*
3172 * AtEOSubXact_Files
3173 *
3174 * Take care of subtransaction commit/abort. At abort, we close AllocateDescs
3175 * that the subtransaction may have opened. At commit, we reassign them to
3176 * the parent subtransaction. (Temporary files are tracked by ResourceOwners
3177 * instead.)
3178 */
3179void
3182{
3183 Index i;
3184
3185 for (i = 0; i < numAllocatedDescs; i++)
3186 {
3187 if (allocatedDescs[i].create_subid == mySubid)
3188 {
3189 if (isCommit)
3191 else
3192 {
3193 /* have to recheck the item after FreeDesc (ugly) */
3195 }
3196 }
3197 }
3198}
3199
3200/*
3201 * AtEOXact_Files
3202 *
3203 * This routine is called during transaction commit or abort. All still-open
3204 * per-transaction temporary file VFDs are closed, which also causes the
3205 * underlying files to be deleted (although they should've been closed already
3206 * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
3207 * closed. We also forget any transaction-local temp tablespace list.
3208 *
3209 * The isCommit flag is used only to decide whether to emit warnings about
3210 * unclosed files.
3211 */
3212void
3214{
3215 CleanupTempFiles(isCommit, false);
3217 numTempTableSpaces = -1;
3218}
3219
3220/*
3221 * BeforeShmemExit_Files
3222 *
3223 * before_shmem_exit hook to clean up temp files during backend shutdown.
3224 * Here, we want to clean up *all* temp files including interXact ones.
3225 */
3226static void
3228{
3229 CleanupTempFiles(false, true);
3230
3231 /* prevent further temp files from being created */
3232#ifdef USE_ASSERT_CHECKING
3234#endif
3235}
3236
3237/*
3238 * Close temporary files and delete their underlying files.
3239 *
3240 * isCommit: if true, this is normal transaction commit, and we don't
3241 * expect any remaining files; warn if there are some.
3242 *
3243 * isProcExit: if true, this is being called as the backend process is
3244 * exiting. If that's the case, we should remove all temporary files; if
3245 * that's not the case, we are being called for transaction commit/abort
3246 * and should only remove transaction-local temp files. In either case,
3247 * also clean up "allocated" stdio files, dirs and fds.
3248 */
3249static void
3251{
3252 Index i;
3253
3254 /*
3255 * Careful here: at proc_exit we need extra cleanup, not just
3256 * xact_temporary files.
3257 */
3259 {
3260 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3261 for (i = 1; i < SizeVfdCache; i++)
3262 {
3263 unsigned short fdstate = VfdCache[i].fdstate;
3264
3265 if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3266 VfdCache[i].fileName != NULL)
3267 {
3268 /*
3269 * If we're in the process of exiting a backend process, close
3270 * all temporary files. Otherwise, only close temporary files
3271 * local to the current transaction. They should be closed by
3272 * the ResourceOwner mechanism already, so this is just a
3273 * debugging cross-check.
3274 */
3275 if (isProcExit)
3276 FileClose(i);
3277 else if (fdstate & FD_CLOSE_AT_EOXACT)
3278 {
3279 elog(WARNING,
3280 "temporary file %s not closed at end-of-transaction",
3281 VfdCache[i].fileName);
3282 FileClose(i);
3283 }
3284 }
3285 }
3286
3288 }
3289
3290 /* Complain if any allocated files remain open at commit. */
3291 if (isCommit && numAllocatedDescs > 0)
3292 elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3294
3295 /* Clean up "allocated" stdio files, dirs and fds. */
3296 while (numAllocatedDescs > 0)
3298}
3299
3300
3301/*
3302 * Remove temporary and temporary relation files left over from a prior
3303 * postmaster session
3304 *
3305 * This should be called during postmaster startup. It will forcibly
3306 * remove any leftover files created by OpenTemporaryFile and any leftover
3307 * temporary relation files created by mdcreate.
3308 *
3309 * During post-backend-crash restart cycle, this routine is called when
3310 * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
3311 * queries are using temp files could result in useless storage usage that can
3312 * only be reclaimed by a service restart. The argument against enabling it is
3313 * that someone might want to examine the temporary files for debugging
3314 * purposes. This does however mean that OpenTemporaryFile had better allow for
3315 * collision with an existing temp file name.
3316 *
3317 * NOTE: this function and its subroutines generally report syscall failures
3318 * with ereport(LOG) and keep going. Removing temp files is not so critical
3319 * that we should fail to start the database when we can't do it.
3320 */
3321void
3323{
3325 DIR *spc_dir;
3326 struct dirent *spc_de;
3327
3328 /*
3329 * First process temp files in pg_default ($PGDATA/base)
3330 */
3331 snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3332 RemovePgTempFilesInDir(temp_path, true, false);
3334
3335 /*
3336 * Cycle through temp directories for all non-default tablespaces.
3337 */
3339
3341 {
3342 if (strcmp(spc_de->d_name, ".") == 0 ||
3343 strcmp(spc_de->d_name, "..") == 0)
3344 continue;
3345
3346 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s/%s",
3349 RemovePgTempFilesInDir(temp_path, true, false);
3350
3351 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
3354 }
3355
3357
3358 /*
3359 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3360 * DataDir as well. However, that is *not* cleaned here because doing so
3361 * would create a race condition. It's done separately, earlier in
3362 * postmaster startup.
3363 */
3364}
3365
3366/*
3367 * Process one pgsql_tmp directory for RemovePgTempFiles.
3368 *
3369 * If missing_ok is true, it's all right for the named directory to not exist.
3370 * Any other problem results in a LOG message. (missing_ok should be true at
3371 * the top level, since pgsql_tmp directories are not created until needed.)
3372 *
3373 * At the top level, this should be called with unlink_all = false, so that
3374 * only files matching the temporary name prefix will be unlinked. When
3375 * recursing it will be called with unlink_all = true to unlink everything
3376 * under a top-level temporary directory.
3377 *
3378 * (These two flags could be replaced by one, but it seems clearer to keep
3379 * them separate.)
3380 */
3381void
3382RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
3383{
3384 DIR *temp_dir;
3385 struct dirent *temp_de;
3386 char rm_path[MAXPGPATH * 2];
3387
3389
3390 if (temp_dir == NULL && errno == ENOENT && missing_ok)
3391 return;
3392
3394 {
3395 if (strcmp(temp_de->d_name, ".") == 0 ||
3396 strcmp(temp_de->d_name, "..") == 0)
3397 continue;
3398
3399 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3400 tmpdirname, temp_de->d_name);
3401
3402 if (unlink_all ||
3403 strncmp(temp_de->d_name,
3406 {
3408
3409 if (type == PGFILETYPE_ERROR)
3410 continue;
3411 else if (type == PGFILETYPE_DIR)
3412 {
3413 /* recursively remove contents, then directory itself */
3414 RemovePgTempFilesInDir(rm_path, false, true);
3415
3416 if (rmdir(rm_path) < 0)
3417 ereport(LOG,
3419 errmsg("could not remove directory \"%s\": %m",
3420 rm_path)));
3421 }
3422 else
3423 {
3424 if (unlink(rm_path) < 0)
3425 ereport(LOG,
3427 errmsg("could not remove file \"%s\": %m",
3428 rm_path)));
3429 }
3430 }
3431 else
3432 ereport(LOG,
3433 (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3434 rm_path)));
3435 }
3436
3438}
3439
3440/* Process one tablespace directory, look for per-DB subdirectories */
3441static void
3443{
3444 DIR *ts_dir;
3445 struct dirent *de;
3446 char dbspace_path[MAXPGPATH * 2];
3447
3449
3450 while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3451 {
3452 /*
3453 * We're only interested in the per-database directories, which have
3454 * numeric names. Note that this code will also (properly) ignore "."
3455 * and "..".
3456 */
3457 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3458 continue;
3459
3460 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3461 tsdirname, de->d_name);
3463 }
3464
3465 FreeDir(ts_dir);
3466}
3467
3468/* Process one per-dbspace directory for RemovePgTempRelationFiles */
3469static void
3471{
3473 struct dirent *de;
3474 char rm_path[MAXPGPATH * 2];
3475
3477
3479 {
3480 if (!looks_like_temp_rel_name(de->d_name))
3481 continue;
3482
3483 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3484 dbspacedirname, de->d_name);
3485
3486 if (unlink(rm_path) < 0)
3487 ereport(LOG,
3489 errmsg("could not remove file \"%s\": %m",
3490 rm_path)));
3491 }
3492
3494}
3495
/*
 * Decide whether a file name has the shape of a temporary relation file:
 * t<digits>_<digits>, or t<digits>_<digits>_<forkname>, either of which may
 * be followed by .<digits>.
 */
bool
looks_like_temp_rel_name(const char *name)
{
	const char *cur = name;
	const char *run;

	/* Leading "t" is mandatory. */
	if (*cur != 't')
		return false;
	cur++;

	/* First digit run: at least one digit, ended by an underscore. */
	run = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == run || *cur != '_')
		return false;
	cur++;

	/* Second digit run: at least one digit. */
	run = cur;
	while (isdigit((unsigned char) *cur))
		cur++;
	if (cur == run)
		return false;

	/* Optional "_forkname"; forkname_chars() tells how many chars it spans. */
	if (*cur == '_')
	{
		int			forklen = forkname_chars(cur + 1, NULL);

		if (forklen <= 0)
			return false;
		cur += forklen + 1;
	}

	/* Optional ".segment": a dot followed by at least one digit. */
	if (*cur == '.')
	{
		run = cur + 1;
		cur = run;
		while (isdigit((unsigned char) *cur))
			cur++;
		if (cur == run)
			return false;
	}

	/* Anything left over disqualifies the name. */
	return *cur == '\0';
}
3544
#ifdef HAVE_SYNCFS
/*
 * Call syncfs() on the file system containing the given path.
 *
 * Failure to open the path or to sync is reported at LOG level only.  The
 * descriptor opened here is closed before returning.
 */
static void
do_syncfs(const char *path)
{
	int			fd;

	ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
							 path);

	fd = OpenTransientFile(path, O_RDONLY);
	if (fd < 0)
	{
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
		return;
	}
	if (syncfs(fd) < 0)
		ereport(LOG,
				(errcode_for_file_access(),
				 errmsg("could not synchronize file system for file \"%s\": %m", path)));
	CloseTransientFile(fd);
}
#endif
3569
3570/*
3571 * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
3572 * all potential filesystem, depending on recovery_init_sync_method setting.
3573 *
3574 * We fsync regular files and directories wherever they are, but we
3575 * follow symlinks only for pg_wal and immediately under pg_tblspc.
3576 * Other symlinks are presumed to point at files we're not responsible
3577 * for fsyncing, and might not have privileges to write at all.
3578 *
3579 * Errors are logged but not considered fatal; that's because this is used
3580 * only during database startup, to deal with the possibility that there are
3581 * issued-but-unsynced writes pending against the data directory. We want to
3582 * ensure that such writes reach disk before anything that's done in the new
3583 * run. However, aborting on error would result in failure to start for
3584 * harmless cases such as read-only files in the data directory, and that's
3585 * not good either.
3586 *
3587 * Note that if we previously crashed due to a PANIC on fsync(), we'll be
3588 * rewriting all changes again during recovery.
3589 *
3590 * Note we assume we're chdir'd into PGDATA to begin with.
3591 */
3592void
3594{
3595 bool xlog_is_symlink;
3596
3597 /* We can skip this whole thing if fsync is disabled. */
3598 if (!enableFsync)
3599 return;
3600
3601 /*
3602 * If pg_wal is a symlink, we'll need to recurse into it separately,
3603 * because the first walkdir below will ignore it.
3604 */
3605 xlog_is_symlink = false;
3606
3607 {
3608 struct stat st;
3609
3610 if (lstat("pg_wal", &st) < 0)
3611 ereport(LOG,
3613 errmsg("could not stat file \"%s\": %m",
3614 "pg_wal")));
3615 else if (S_ISLNK(st.st_mode))
3616 xlog_is_symlink = true;
3617 }
3618
3619#ifdef HAVE_SYNCFS
3621 {
3622 DIR *dir;
3623 struct dirent *de;
3624
3625 /*
3626 * On Linux, we don't have to open every single file one by one. We
3627 * can use syncfs() to sync whole filesystems. We only expect
3628 * filesystem boundaries to exist where we tolerate symlinks, namely
3629 * pg_wal and the tablespaces, so we call syncfs() for each of those
3630 * directories.
3631 */
3632
3633 /* Prepare to report progress syncing the data directory via syncfs. */
3635
3636 /* Sync the top level pgdata directory. */
3637 do_syncfs(".");
3638 /* If any tablespaces are configured, sync each of those. */
3640 while ((de = ReadDirExtended(dir, PG_TBLSPC_DIR, LOG)))
3641 {
3642 char path[MAXPGPATH];
3643
3644 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3645 continue;
3646
3647 snprintf(path, MAXPGPATH, "%s/%s", PG_TBLSPC_DIR, de->d_name);
3648 do_syncfs(path);
3649 }
3650 FreeDir(dir);
3651 /* If pg_wal is a symlink, process that too. */
3652 if (xlog_is_symlink)
3653 do_syncfs("pg_wal");
3654 return;
3655 }
3656#endif /* HAVE_SYNCFS */
3657
3658#ifdef PG_FLUSH_DATA_WORKS
3659 /* Prepare to report progress of the pre-fsync phase. */
3661
3662 /*
3663 * If possible, hint to the kernel that we're soon going to fsync the data
3664 * directory and its contents. Errors in this step are even less
3665 * interesting than normal, so log them only at DEBUG1.
3666 */
3667 walkdir(".", pre_sync_fname, false, DEBUG1);
3668 if (xlog_is_symlink)
3669 walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3671#endif
3672
3673 /* Prepare to report progress syncing the data directory via fsync. */
3675
3676 /*
3677 * Now we do the fsync()s in the same order.
3678 *
3679 * The main call ignores symlinks, so in addition to specially processing
3680 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3681 * process_symlinks = true. Note that if there are any plain directories
3682 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3683 * so we don't worry about optimizing it.
3684 */
3685 walkdir(".", datadir_fsync_fname, false, LOG);
3686 if (xlog_is_symlink)
3687 walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3689}
3690
3691/*
3692 * walkdir: recursively walk a directory, applying the action to each
3693 * regular file and directory (including the named directory itself).
3694 *
3695 * If process_symlinks is true, the action and recursion are also applied
3696 * to regular files and directories that are pointed to by symlinks in the
3697 * given directory; otherwise symlinks are ignored. Symlinks are always
3698 * ignored in subdirectories, ie we intentionally don't pass down the
3699 * process_symlinks flag to recursive calls.
3700 *
3701 * Errors are reported at level elevel, which might be ERROR or less.
3702 *
3703 * See also walkdir in file_utils.c, which is a frontend version of this
3704 * logic.
3705 */
3706static void
3707walkdir(const char *path,
3708 void (*action) (const char *fname, bool isdir, int elevel),
3709 bool process_symlinks,
3710 int elevel)
3711{
3712 DIR *dir;
3713 struct dirent *de;
3714
3715 dir = AllocateDir(path);
3716
3717 while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3718 {
3719 char subpath[MAXPGPATH * 2];
3720
3722
3723 if (strcmp(de->d_name, ".") == 0 ||
3724 strcmp(de->d_name, "..") == 0)
3725 continue;
3726
3727 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3728
3729 switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3730 {
3731 case PGFILETYPE_REG:
3732 (*action) (subpath, false, elevel);
3733 break;
3734 case PGFILETYPE_DIR:
3735 walkdir(subpath, action, false, elevel);
3736 break;
3737 default:
3738
3739 /*
3740 * Errors are already reported directly by get_dirent_type(),
3741 * and any remaining symlinks and unknown file types are
3742 * ignored.
3743 */
3744 break;
3745 }
3746 }
3747
3748 FreeDir(dir); /* we ignore any error here */
3749
3750 /*
3751 * It's important to fsync the destination directory itself as individual
3752 * file fsyncs don't guarantee that the directory entry for the file is
3753 * synced. However, skip this if AllocateDir failed; the action function
3754 * might not be robust against that.
3755 */
3756 if (dir)
3757 (*action) (path, true, elevel);
3758}
3759
3760
3761/*
3762 * Hint to the OS that it should get ready to fsync() this file.
3763 *
3764 * Ignores errors trying to open unreadable files, and logs other errors at a
3765 * caller-specified level.
3766 */
3767#ifdef PG_FLUSH_DATA_WORKS
3768
3769static void
3770pre_sync_fname(const char *fname, bool isdir, int elevel)
3771{
3772 int fd;
3773
3774 /* Don't try to flush directories, it'll likely just fail */
3775 if (isdir)
3776 return;
3777
3778 ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
3779 fname);
3780
3782
3783 if (fd < 0)
3784 {
3785 if (errno == EACCES)
3786 return;
3787 ereport(elevel,
3789 errmsg("could not open file \"%s\": %m", fname)));
3790 return;
3791 }
3792
3793 /*
3794 * pg_flush_data() ignores errors, which is ok because this is only a
3795 * hint.
3796 */
3797 pg_flush_data(fd, 0, 0);
3798
3799 if (CloseTransientFile(fd) != 0)
3800 ereport(elevel,
3802 errmsg("could not close file \"%s\": %m", fname)));
3803}
3804
3805#endif /* PG_FLUSH_DATA_WORKS */
3806
3807static void
3808datadir_fsync_fname(const char *fname, bool isdir, int elevel)
3809{
3810 ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
3811 fname);
3812
3813 /*
3814 * We want to silently ignore errors about unreadable files. Pass that
3815 * desire on to fsync_fname_ext().
3816 */
3817 fsync_fname_ext(fname, isdir, true, elevel);
3818}
3819
3820static void
3821unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
3822{
3823 if (isdir)
3824 {
3825 if (rmdir(fname) != 0 && errno != ENOENT)
3826 ereport(elevel,
3828 errmsg("could not remove directory \"%s\": %m", fname)));
3829 }
3830 else
3831 {
3832 /* Use PathNameDeleteTemporaryFile to report filesize */
3833 PathNameDeleteTemporaryFile(fname, false);
3834 }
3835}
3836
3837/*
3838 * fsync_fname_ext -- Try to fsync a file or directory
3839 *
3840 * If ignore_perm is true, ignore errors upon trying to open unreadable
3841 * files. Logs other errors at a caller-specified level.
3842 *
3843 * Returns 0 if the operation succeeded, -1 otherwise.
3844 */
3845int
3846fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
3847{
3848 int fd;
3849 int flags;
3850 int returncode;
3851
3852 /*
3853 * Some OSs require directories to be opened read-only whereas other
3854 * systems don't allow us to fsync files opened read-only; so we need both
3855 * cases here. Using O_RDWR will cause us to fail to fsync files that are
3856 * not writable by our userid, but we assume that's OK.
3857 */
3858 flags = PG_BINARY;
3859 if (!isdir)
3860 flags |= O_RDWR;
3861 else
3862 flags |= O_RDONLY;
3863
3864 fd = OpenTransientFile(fname, flags);
3865
3866 /*
3867 * Some OSs don't allow us to open directories at all (Windows returns
3868 * EACCES), just ignore the error in that case. If desired, also silently
3869 * ignore errors about unreadable files. Log others.
3870 */
3871 if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3872 return 0;
3873 else if (fd < 0 && ignore_perm && errno == EACCES)
3874 return 0;
3875 else if (fd < 0)
3876 {
3877 ereport(elevel,
3879 errmsg("could not open file \"%s\": %m", fname)));
3880 return -1;
3881 }
3882
3884
3885 /*
3886 * Some OSes don't allow us to fsync directories at all, so we can ignore
3887 * those errors. Anything else needs to be logged.
3888 */
3889 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3890 {
3891 int save_errno;
3892
3893 /* close file upon error, might not be in transaction context */
3894 save_errno = errno;
3896 errno = save_errno;
3897
3898 ereport(elevel,
3900 errmsg("could not fsync file \"%s\": %m", fname)));
3901 return -1;
3902 }
3903
3904 if (CloseTransientFile(fd) != 0)
3905 {
3906 ereport(elevel,
3908 errmsg("could not close file \"%s\": %m", fname)));
3909 return -1;
3910 }
3911
3912 return 0;
3913}
3914
3915/*
3916 * fsync_parent_path -- fsync the parent path of a file or directory
3917 *
3918 * This is aimed at making file operations persistent on disk in case of
3919 * an OS crash or power failure.
3920 */
3921static int
3922fsync_parent_path(const char *fname, int elevel)
3923{
3924 char parentpath[MAXPGPATH];
3925
3926 strlcpy(parentpath, fname, MAXPGPATH);
3928
3929 /*
3930 * get_parent_directory() returns an empty string if the input argument is
3931 * just a file name (see comments in path.c), so handle that as being the
3932 * current directory.
3933 */
3934 if (strlen(parentpath) == 0)
3936
3937 if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3938 return -1;
3939
3940 return 0;
3941}
3942
3943/*
3944 * Create a PostgreSQL data sub-directory
3945 *
3946 * The data directory itself, and most of its sub-directories, are created at
3947 * initdb time, but we do have some occasions when we create directories in
3948 * the backend (CREATE TABLESPACE, for example). In those cases, we want to
3949 * make sure that those directories are created consistently. Today, that means
3950 * making sure that the created directory has the correct permissions, which is
3951 * what pg_dir_create_mode tracks for us.
3952 *
3953 * Note that we also set the umask() based on what we understand the correct
3954 * permissions to be (see file_perm.c).
3955 *
3956 * For permissions other than the default, mkdir() can be used directly, but
3957 * be sure to consider carefully such cases -- a sub-directory with incorrect
3958 * permissions in a PostgreSQL data directory could cause backups and other
3959 * processes to fail.
3960 */
3961int
3962MakePGDirectory(const char *directoryName)
3963{
3965}
3966
3967/*
3968 * Return the passed-in error level, or PANIC if data_sync_retry is off.
3969 *
3970 * Failure to fsync any data file is cause for immediate panic, unless
3971 * data_sync_retry is enabled. Data may have been written to the operating
3972 * system and removed from our buffer pool already, and if we are running on
3973 * an operating system that forgets dirty data on write-back failure, there
3974 * may be only one copy of the data remaining: in the WAL. A later attempt to
3975 * fsync again might falsely report success. Therefore we must not allow any
3976 * further checkpoints to be attempted. data_sync_retry can in theory be
3977 * enabled on systems known not to drop dirty buffered data on write-back
3978 * failure (with the likely outcome that checkpoints will continue to fail
3979 * until the underlying problem is fixed).
3980 *
3981 * Any code that reports a failure from fsync() or related functions should
3982 * filter the error level with this function.
3983 */
3984int
3985data_sync_elevel(int elevel)
3986{
3987 return data_sync_retry ? elevel : PANIC;
3988}
3989
3990bool
3991check_debug_io_direct(char **newval, void **extra, GucSource source)
3992{
3993 bool result = true;
3994 int flags;
3995
3996#if PG_O_DIRECT == 0
3997 if (strcmp(*newval, "") != 0)
3998 {
3999 GUC_check_errdetail("\"%s\" is not supported on this platform.",
4000 "debug_io_direct");
4001 result = false;
4002 }
4003 flags = 0;
4004#else
4005 List *elemlist;
4006 ListCell *l;
4007 char *rawstring;
4008
4009 /* Need a modifiable copy of string */
4011
4012 if (!SplitGUCList(rawstring, ',', &elemlist))
4013 {
4014 GUC_check_errdetail("Invalid list syntax in parameter \"%s\".",
4015 "debug_io_direct");
4018 return false;
4019 }
4020
4021 flags = 0;
4022 foreach(l, elemlist)
4023 {
4024 char *item = (char *) lfirst(l);
4025
4026 if (pg_strcasecmp(item, "data") == 0)
4027 flags |= IO_DIRECT_DATA;
4028 else if (pg_strcasecmp(item, "wal") == 0)
4029 flags |= IO_DIRECT_WAL;
4030 else if (pg_strcasecmp(item, "wal_init") == 0)
4031 flags |= IO_DIRECT_WAL_INIT;
4032 else
4033 {
4034 GUC_check_errdetail("Invalid option \"%s\".", item);
4035 result = false;
4036 break;
4037 }
4038 }
4039
4040 /*
4041 * It's possible to configure block sizes smaller than our assumed I/O
4042 * alignment size, which could result in invalid I/O requests.
4043 */
4044#if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
4045 if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
4046 {
4047 GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small.",
4048 "debug_io_direct", "XLOG_BLCKSZ");
4049 result = false;
4050 }
4051#endif
4052#if BLCKSZ < PG_IO_ALIGN_SIZE
4053 if (result && (flags & IO_DIRECT_DATA))
4054 {
4055 GUC_check_errdetail("\"%s\" is not supported for data because %s is too small.",
4056 "debug_io_direct", "BLCKSZ");
4057 result = false;
4058 }
4059#endif
4060
4063#endif
4064
4065 if (!result)
4066 return result;
4067
4068 /* Save the flags in *extra, for use by assign_debug_io_direct */
4069 *extra = guc_malloc(LOG, sizeof(int));
4070 if (!*extra)
4071 return false;
4072 *((int *) *extra) = flags;
4073
4074 return result;
4075}
4076
4077void
4078assign_debug_io_direct(const char *newval, void *extra)
4079{
4080 int *flags = (int *) extra;
4081
4082 io_direct_flags = *flags;
4083}
4084
4085/* ResourceOwner callbacks */
4086
4087static void
4089{
4090 File file = (File) DatumGetInt32(res);
4091 Vfd *vfdP;
4092
4093 Assert(FileIsValid(file));
4094
4095 vfdP = &VfdCache[file];
4096 vfdP->resowner = NULL;
4097
4098 FileClose(file);
4099}
4100
4101static char *
4103{
4104 return psprintf("File %d", DatumGetInt32(res));
4105}
void pgaio_closing_fd(int fd)
Definition aio.c:1220
void pgaio_io_start_readv(PgAioHandle *ioh, int fd, int iovcnt, uint64 offset)
Definition aio_io.c:78
void begin_startup_progress_phase(void)
Definition startup.c:342
int fdatasync(int fd)
#define Min(x, y)
Definition c.h:997
uint32 SubTransactionId
Definition c.h:670
#define INT64_FORMAT
Definition c.h:564
#define Assert(condition)
Definition c.h:873
int64_t int64
Definition c.h:543
#define PG_BINARY
Definition c.h:1287
uint64_t uint64
Definition c.h:547
uint32_t uint32
Definition c.h:546
unsigned int Index
Definition c.h:628
#define MemSet(start, val, len)
Definition c.h:1013
#define OidIsValid(objectId)
Definition c.h:788
size_t Size
Definition c.h:619
int closedir(DIR *)
Definition dirent.c:127
struct dirent * readdir(DIR *)
Definition dirent.c:78
DIR * opendir(const char *)
Definition dirent.c:33
int errcode_for_file_access(void)
Definition elog.c:886
int errdetail(const char *fmt,...)
Definition elog.c:1216
int errcode(int sqlerrcode)
Definition elog.c:863
int errmsg(const char *fmt,...)
Definition elog.c:1080
#define LOG
Definition elog.h:31
#define FATAL
Definition elog.h:41
#define WARNING
Definition elog.h:36
#define DEBUG2
Definition elog.h:29
#define PANIC
Definition elog.h:42
#define DEBUG1
Definition elog.h:30
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
int pg_truncate(const char *path, pgoff_t length)
Definition fd.c:720
int max_files_per_process
Definition fd.c:146
int FileGetRawDesc(File file)
Definition fd.c:2515
int MakePGDirectory(const char *directoryName)
Definition fd.c:3962
int FreeDir(DIR *dir)
Definition fd.c:3008
int recovery_init_sync_method
Definition fd.c:165
static const ResourceOwnerDesc file_resowner_desc
Definition fd.c:364
int pg_fsync_no_writethrough(int fd)
Definition fd.c:441
#define FD_MINFREE
Definition fd.c:138
FILE * OpenPipeStream(const char *command, const char *mode)
Definition fd.c:2730
static int numTempTableSpaces
Definition fd.c:292
static bool ReleaseLruFile(void)
Definition fd.c:1369
void FileWriteback(File file, pgoff_t offset, pgoff_t nbytes, uint32 wait_event_info)
Definition fd.c:2122
int io_direct_flags
Definition fd.c:171
#define FD_DELETE_AT_CLOSE
Definition fd.c:195
int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition fd.c:1111
static int maxAllocatedDescs
Definition fd.c:271
static void Delete(File file)
Definition fd.c:1253
static int FreeDesc(AllocateDesc *desc)
Definition fd.c:2786
static long tempFileCounter
Definition fd.c:283
static char * ResOwnerPrintFile(Datum res)
Definition fd.c:4102
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition fd.c:782
char * FilePathName(File file)
Definition fd.c:2499
static void ResourceOwnerForgetFile(ResourceOwner owner, File file)
Definition fd.c:380
static int pg_ftruncate(int fd, pgoff_t length)
Definition fd.c:703
int GetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition fd.c:3140
static int numAllocatedDescs
Definition fd.c:270
File PathNameOpenTemporaryFile(const char *path, int mode)
Definition fd.c:1888
static void LruDelete(File file)
Definition fd.c:1272
int pg_fdatasync(int fd)
Definition fd.c:480
#define FileIsValid(file)
Definition fd.c:189
void assign_debug_io_direct(const char *newval, void *extra)
Definition fd.c:4078
int FileSync(File file, uint32 wait_event_info)
Definition fd.c:2335
int FileStartReadV(PgAioHandle *ioh, File file, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2204
static int nfile
Definition fd.c:225
int CloseTransientFile(int fd)
Definition fd.c:2854
#define DO_DB(A)
Definition fd.c:183
int BasicOpenFile(const char *fileName, int fileFlags)
Definition fd.c:1089
void closeAllVfds(void)
Definition fd.c:3067
int max_safe_fds
Definition fd.c:159
static File AllocateVfd(void)
Definition fd.c:1401
File PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
Definition fd.c:1848
void PathNameDeleteTemporaryDir(const char *dirname)
Definition fd.c:1678
int ClosePipeStream(FILE *file)
Definition fd.c:3038
void AtEOXact_Files(bool isCommit)
Definition fd.c:3213
int FileGetRawFlags(File file)
Definition fd.c:2531
static Size SizeVfdCache
Definition fd.c:220
static int nextTempTableSpace
Definition fd.c:293
#define FD_CLOSE_AT_EOXACT
Definition fd.c:196
int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
Definition fd.c:3846
static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
Definition fd.c:3821
static void ResOwnerReleaseFile(Datum res)
Definition fd.c:4088
static void RemovePgTempRelationFiles(const char *tsdirname)
Definition fd.c:3442
int FreeFile(FILE *file)
Definition fd.c:2826
ssize_t FileReadV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2148
mode_t FileGetRawMode(File file)
Definition fd.c:2541
static AllocateDesc * allocatedDescs
Definition fd.c:272
struct dirent * ReadDirExtended(DIR *dir, const char *dirname, int elevel)
Definition fd.c:2971
static void count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
Definition fd.c:964
int FileFallocate(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2407
static int FileAccess(File file)
Definition fd.c:1479
pgoff_t FileSize(File file)
Definition fd.c:2447
static void FreeVfd(File file)
Definition fd.c:1459
struct vfd Vfd
int pg_fsync_writethrough(int fd)
Definition fd.c:461
void FileClose(File file)
Definition fd.c:1965
void ReleaseExternalFD(void)
Definition fd.c:1224
#define FD_TEMP_FILE_LIMIT
Definition fd.c:197
void RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
Definition fd.c:3382
bool pg_file_exists(const char *name)
Definition fd.c:503
void RemovePgTempFiles(void)
Definition fd.c:3322
#define FileIsNotOpen(file)
Definition fd.c:192
bool TempTablespacesAreSet(void)
Definition fd.c:3125
void fsync_fname(const char *fname, bool isdir)
Definition fd.c:756
int data_sync_elevel(int elevel)
Definition fd.c:3985
File PathNameOpenFile(const char *fileName, int fileFlags)
Definition fd.c:1562
static void Insert(File file)
Definition fd.c:1300
AllocateDescKind
Definition fd.c:251
@ AllocateDescDir
Definition fd.c:254
@ AllocateDescPipe
Definition fd.c:253
@ AllocateDescFile
Definition fd.c:252
@ AllocateDescRawFD
Definition fd.c:255
Oid GetNextTempTableSpace(void)
Definition fd.c:3158
File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition fd.c:1575
static void datadir_fsync_fname(const char *fname, bool isdir, int elevel)
Definition fd.c:3808
static void ReportTemporaryFileUsage(const char *path, pgoff_t size)
Definition fd.c:1515
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
Definition fd.c:1791
void pg_flush_data(int fd, pgoff_t offset, pgoff_t nbytes)
Definition fd.c:525
bool AcquireExternalFD(void)
Definition fd.c:1171
static void RegisterTemporaryFile(File file)
Definition fd.c:1534
#define NUM_RESERVED_FDS
Definition fd.c:129
DIR * AllocateDir(const char *dirname)
Definition fd.c:2890
static Oid * tempTableSpaces
Definition fd.c:291
int FileTruncate(File file, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2464
static bool reserveAllocatedDesc(void)
Definition fd.c:2552
void InitFileAccess(void)
Definition fd.c:903
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
Definition fd.c:3470
File OpenTemporaryFile(bool interXact)
Definition fd.c:1711
int durable_unlink(const char *fname, int elevel)
Definition fd.c:872
static uint64 temporary_files_size
Definition fd.c:239
void ReserveExternalFD(void)
Definition fd.c:1206
int FileZero(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2362
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition fd.c:2956
bool looks_like_temp_rel_name(const char *name)
Definition fd.c:3498
bool PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
Definition fd.c:1919
void set_max_safe_fds(void)
Definition fd.c:1044
int pg_fsync(int fd)
Definition fd.c:389
static void CleanupTempFiles(bool isCommit, bool isProcExit)
Definition fd.c:3250
#define VFD_CLOSED
Definition fd.c:187
static bool have_xact_temporary_files
Definition fd.c:231
static int LruInsert(File file)
Definition fd.c:1322
static int numExternalFDs
Definition fd.c:277
static int fsync_parent_path(const char *fname, int elevel)
Definition fd.c:3922
void PathNameCreateTemporaryDir(const char *basedir, const char *directory)
Definition fd.c:1647
FILE * AllocateFile(const char *name, const char *mode)
Definition fd.c:2627
void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid)
Definition fd.c:3180
int OpenTransientFile(const char *fileName, int fileFlags)
Definition fd.c:2677
void InitTemporaryFileAccess(void)
Definition fd.c:933
static Vfd * VfdCache
Definition fd.c:219
int OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
Definition fd.c:2686
bool data_sync_retry
Definition fd.c:162
int FilePrefetch(File file, pgoff_t offset, pgoff_t amount, uint32 wait_event_info)
Definition fd.c:2066
ssize_t FileWriteV(File file, const struct iovec *iov, int iovcnt, pgoff_t offset, uint32 wait_event_info)
Definition fd.c:2230
static void ReleaseLruFiles(void)
Definition fd.c:1391
void SyncDataDirectory(void)
Definition fd.c:3593
bool check_debug_io_direct(char **newval, void **extra, GucSource source)
Definition fd.c:3991
static void ResourceOwnerRememberFile(ResourceOwner owner, File file)
Definition fd.c:375
static void BeforeShmemExit_Files(int code, Datum arg)
Definition fd.c:3227
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition fd.c:3707
void SetTempTablespaces(Oid *tableSpaces, int numSpaces)
Definition fd.c:3096
void TempTablespacePath(char *path, Oid tablespace)
Definition fd.c:1766
#define IO_DIRECT_WAL
Definition fd.h:55
#define IO_DIRECT_DATA
Definition fd.h:54
#define IO_DIRECT_WAL_INIT
Definition fd.h:56
int File
Definition fd.h:51
#define PG_O_DIRECT
Definition fd.h:123
int pg_file_create_mode
Definition file_perm.c:19
int pg_dir_create_mode
Definition file_perm.c:18
ssize_t pg_pwrite_zeros(int fd, size_t size, pgoff_t offset)
Definition file_utils.c:709
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition file_utils.c:547
#define PG_TEMP_FILES_DIR
Definition file_utils.h:63
#define PG_TEMP_FILE_PREFIX
Definition file_utils.h:64
PGFileType
Definition file_utils.h:19
@ PGFILETYPE_DIR
Definition file_utils.h:23
@ PGFILETYPE_REG
Definition file_utils.h:22
@ PGFILETYPE_ERROR
Definition file_utils.h:20
@ DATA_DIR_SYNC_METHOD_SYNCFS
Definition file_utils.h:30
int MyProcPid
Definition globals.c:47
bool enableFsync
Definition globals.c:129
Oid MyDatabaseTableSpace
Definition globals.c:96
void * guc_malloc(int elevel, size_t size)
Definition guc.c:636
#define newval
#define GUC_check_errdetail
Definition guc.h:505
GucSource
Definition guc.h:112
int temp_file_limit
Definition guc_tables.c:560
int log_temp_files
Definition guc_tables.c:555
#define close(a)
Definition win32.h:12
void before_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition ipc.c:344
return true
Definition isn.c:130
int j
Definition isn.c:78
int i
Definition isn.c:77
void list_free(List *list)
Definition list.c:1546
Datum subpath(PG_FUNCTION_ARGS)
Definition ltree_op.c:311
char * pstrdup(const char *in)
Definition mcxt.c:1781
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
#define MAP_FAILED
Definition mem.h:43
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:123
void * arg
static char * basedir
static PgChecksumMode mode
#define MAXPGPATH
static ssize_t pg_preadv(int fd, const struct iovec *iov, int iovcnt, pgoff_t offset)
Definition pg_iovec.h:54
static ssize_t pg_pwritev(int fd, const struct iovec *iov, int iovcnt, pgoff_t offset)
Definition pg_iovec.h:93
#define lfirst(lc)
Definition pg_list.h:172
uint64 pg_prng_uint64_range(pg_prng_state *state, uint64 rmin, uint64 rmax)
Definition pg_prng.c:144
pg_prng_state pg_global_prng_state
Definition pg_prng.c:34
static rewind_source * source
Definition pg_rewind.c:89
static char buf[DEFAULT_XLOG_SEG_SIZE]
static char * tablespace
Definition pgbench.c:217
void pgstat_report_tempfile(size_t filesize)
#define pqsignal
Definition port.h:547
int pg_strcasecmp(const char *s1, const char *s2)
void get_parent_directory(char *path)
Definition path.c:1068
#define snprintf
Definition port.h:260
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition strlcpy.c:45
off_t pgoff_t
Definition port.h:421
uint64_t Datum
Definition postgres.h:70
static Datum Int32GetDatum(int32 X)
Definition postgres.h:222
static int32 DatumGetInt32(Datum X)
Definition postgres.h:212
#define InvalidOid
unsigned int Oid
static int fd(const char *x, int i)
static int fb(int x)
char * psprintf(const char *fmt,...)
Definition psprintf.c:43
int forkname_chars(const char *str, ForkNumber *fork)
Definition relpath.c:81
#define PG_TBLSPC_DIR
Definition relpath.h:41
#define TABLESPACE_VERSION_DIRECTORY
Definition relpath.h:33
ResourceOwner CurrentResourceOwner
Definition resowner.c:173
void ResourceOwnerForget(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition resowner.c:561
void ResourceOwnerRemember(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
Definition resowner.c:521
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition resowner.c:449
@ RESOURCE_RELEASE_AFTER_LOCKS
Definition resowner.h:56
#define RELEASE_PRIO_FILES
Definition resowner.h:76
void pg_usleep(long microsec)
Definition signal.c:53
#define realloc(a, b)
#define free(a)
#define malloc(a)
static void error(void)
#define ereport_startup_progress(msg,...)
Definition startup.h:18
SubTransactionId create_subid
Definition fd.c:261
DIR * dir
Definition fd.c:265
FILE * file
Definition fd.c:264
int fd
Definition fd.c:266
union AllocateDesc::@20 desc
AllocateDescKind kind
Definition fd.c:260
Definition dirent.c:26
Definition pg_list.h:54
const char * name
Definition resowner.h:93
Definition fd.c:200
int fd
Definition fd.c:201
int fileFlags
Definition fd.c:210
File lruLessRecently
Definition fd.c:206
File lruMoreRecently
Definition fd.c:205
pgoff_t fileSize
Definition fd.c:207
char * fileName
Definition fd.c:208
ResourceOwner resowner
Definition fd.c:203
unsigned short fdstate
Definition fd.c:202
File nextFree
Definition fd.c:204
mode_t fileMode
Definition fd.c:211
bool SplitGUCList(char *rawstring, char separator, List **namelist)
Definition varlena.c:2978
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:69
static void pgstat_report_wait_end(void)
Definition wait_event.h:85
const char * type
const char * name
#define fsync(fd)
Definition win32_port.h:83
#define stat
Definition win32_port.h:74
#define EINTR
Definition win32_port.h:361
#define EOPNOTSUPP
Definition win32_port.h:385
#define SIGPIPE
Definition win32_port.h:163
#define lstat(path, sb)
Definition win32_port.h:275
#define S_ISDIR(m)
Definition win32_port.h:315
void _dosmaperr(unsigned long)
Definition win32error.c:177
#define S_ISLNK(m)
Definition win32_port.h:334
#define mkdir(a, b)
Definition win32_port.h:80
#define fstat
Definition win32_port.h:73
#define O_CLOEXEC
Definition win32_port.h:344
SubTransactionId GetCurrentSubTransactionId(void)
Definition xact.c:792
int wal_sync_method
Definition xlog.c:133
@ WAL_SYNC_METHOD_FSYNC_WRITETHROUGH
Definition xlog.h:28
static const char * directory
Definition zic.c:648

◆ FD_CLOSE_AT_EOXACT

#define FD_CLOSE_AT_EOXACT   (1 << 1) /* T = close at eoXact */

Definition at line 196 of file fd.c.

◆ FD_DELETE_AT_CLOSE

#define FD_DELETE_AT_CLOSE   (1 << 0) /* T = delete when closed */

Definition at line 195 of file fd.c.

◆ FD_MINFREE

#define FD_MINFREE   48

Definition at line 138 of file fd.c.

◆ FD_TEMP_FILE_LIMIT

#define FD_TEMP_FILE_LIMIT   (1 << 2) /* T = respect temp_file_limit */

Definition at line 197 of file fd.c.

◆ FileIsNotOpen

#define FileIsNotOpen (   file)    (VfdCache[file].fd == VFD_CLOSED)

Definition at line 192 of file fd.c.

◆ FileIsValid

#define FileIsValid (   file)     ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)

Definition at line 189 of file fd.c.

◆ NUM_RESERVED_FDS

#define NUM_RESERVED_FDS   10

Definition at line 129 of file fd.c.

◆ VFD_CLOSED

#define VFD_CLOSED   (-1)

Definition at line 187 of file fd.c.

Typedef Documentation

◆ Vfd

Enumeration Type Documentation

◆ AllocateDescKind

Enumerator
AllocateDescFile 
AllocateDescPipe 
AllocateDescDir 
AllocateDescRawFD 

Definition at line 250 of file fd.c.

Function Documentation

◆ AcquireExternalFD()

bool AcquireExternalFD ( void  )

Definition at line 1171 of file fd.c.

1172{
1173 /*
1174 * We don't want more than max_safe_fds / 3 FDs to be consumed for
1175 * "external" FDs.
1176 */
1177 if (numExternalFDs < max_safe_fds / 3)
1178 {
1180 return true;
1181 }
1182 errno = EMFILE;
1183 return false;
1184}

References fb(), max_safe_fds, numExternalFDs, and ReserveExternalFD().

Referenced by CreateWaitEventSet(), and libpqsrv_connect_prepare().

◆ AllocateDir()

DIR * AllocateDir ( const char * dirname)

Definition at line 2890 of file fd.c.

2891{
2892 DIR *dir;
2893
2894 DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
2895 numAllocatedDescs, dirname));
2896
2897 /* Can we allocate another non-virtual FD? */
2898 if (!reserveAllocatedDesc())
2899 ereport(ERROR,
2901 errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
2902 maxAllocatedDescs, dirname)));
2903
2904 /* Close excess kernel FDs. */
2906
2907TryAgain:
2908 if ((dir = opendir(dirname)) != NULL)
2909 {
2911
2912 desc->kind = AllocateDescDir;
2913 desc->desc.dir = dir;
2916 return desc->desc.dir;
2917 }
2918
2919 if (errno == EMFILE || errno == ENFILE)
2920 {
2921 int save_errno = errno;
2922
2923 ereport(LOG,
2925 errmsg("out of file descriptors: %m; release and retry")));
2926 errno = 0;
2927 if (ReleaseLruFile())
2928 goto TryAgain;
2929 errno = save_errno;
2930 }
2931
2932 return NULL;
2933}

References allocatedDescs, AllocateDescDir, AllocateDesc::create_subid, AllocateDesc::desc, AllocateDesc::dir, DO_DB, elog, ereport, errcode(), errmsg(), ERROR, fb(), GetCurrentSubTransactionId(), AllocateDesc::kind, LOG, maxAllocatedDescs, numAllocatedDescs, opendir(), ReleaseLruFile(), ReleaseLruFiles(), and reserveAllocatedDesc().

Referenced by calculate_database_size(), calculate_tablespace_size(), CheckPointLogicalRewriteHeap(), CheckPointSnapBuild(), CheckTablespaceDirectory(), CleanupBackupHistory(), copydir(), db_dir_size(), DeleteAllExportedSnapshotFiles(), destroy_tablespace_directories(), directory_is_empty(), do_pg_backup_start(), dsm_cleanup_for_mmap(), extension_file_exists(), get_ext_ver_list(), GetConfFilesInDir(), getInstallationPaths(), GetWalSummaries(), movedb(), ParseTzFile(), perform_base_backup(), pg_available_extension_versions(), pg_available_extensions(), pg_ls_dir(), pg_ls_dir_files(), pg_tablespace_databases(), pg_tzenumerate_next(), pg_tzenumerate_start(), pgarch_readyXlog(), RelationCacheInitFileRemove(), RelationCacheInitFileRemoveInDir(), RemoveNonParentXlogFiles(), RemoveOldXlogFiles(), RemovePgTempFiles(), RemovePgTempFilesInDir(), RemovePgTempRelationFiles(), RemovePgTempRelationFilesInDbspace(), RemoveTempXlogFiles(), ReorderBufferCleanupSerializedTXNs(), ResetUnloggedRelations(), ResetUnloggedRelationsInDbspaceDir(), ResetUnloggedRelationsInTablespaceDir(), restoreTwoPhaseData(), scan_directory_ci(), sendDir(), SlruScanDirectory(), StartupReorderBuffer(), StartupReplicationSlots(), SyncDataDirectory(), UpdateLogicalMappings(), walkdir(), and XLogGetOldestSegno().

◆ AllocateFile()

FILE * AllocateFile ( const char * name,
const char * mode 
)

Definition at line 2627 of file fd.c.

2628{
2629 FILE *file;
2630
2631 DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
2633
2634 /* Can we allocate another non-virtual FD? */
2635 if (!reserveAllocatedDesc())
2636 ereport(ERROR,
2638 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2640
2641 /* Close excess kernel FDs. */
2643
2644TryAgain:
2645 if ((file = fopen(name, mode)) != NULL)
2646 {
2648
2649 desc->kind = AllocateDescFile;
2650 desc->desc.file = file;
2653 return desc->desc.file;
2654 }
2655
2656 if (errno == EMFILE || errno == ENFILE)
2657 {
2658 int save_errno = errno;
2659
2660 ereport(LOG,
2662 errmsg("out of file descriptors: %m; release and retry")));
2663 errno = 0;
2664 if (ReleaseLruFile())
2665 goto TryAgain;
2666 errno = save_errno;
2667 }
2668
2669 return NULL;
2670}

References allocatedDescs, AllocateDescFile, AllocateDesc::create_subid, AllocateDesc::desc, DO_DB, elog, ereport, errcode(), errmsg(), ERROR, fb(), AllocateDesc::file, GetCurrentSubTransactionId(), AllocateDesc::kind, LOG, maxAllocatedDescs, mode, name, numAllocatedDescs, ReleaseLruFile(), ReleaseLruFiles(), and reserveAllocatedDesc().

Referenced by AlterSystemSetConfigFile(), apw_dump_now(), apw_load_buffers(), BeginCopyFrom(), BeginCopyTo(), checkControlFile(), do_pg_backup_stop(), entry_reset(), existsTimeLineHistory(), ExportSnapshot(), gc_qtexts(), GetHugePageSize(), ImportSnapshot(), load_dh_file(), load_relcache_init_file(), open_auth_file(), parse_extension_control_file(), ParseConfigFile(), ParseTzFile(), pg_current_logfile(), pg_promote(), pgss_shmem_shutdown(), pgss_shmem_startup(), pgstat_read_statsfile(), pgstat_write_statsfile(), read_backup_label(), read_binary_file(), read_tablespace_map(), read_whole_file(), readTimeLineHistory(), test_custom_stats_var_from_serialized_data(), test_custom_stats_var_to_serialized_data(), tsearch_readline_begin(), ValidatePgVersion(), write_relcache_init_file(), XLogArchiveForceDone(), and XLogArchiveNotify().

◆ AllocateVfd()

static File AllocateVfd ( void  )
static

Definition at line 1401 of file fd.c.

1402{
1403 Index i;
1404 File file;
1405
1406 DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
1407
1408 Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
1409
1410 if (VfdCache[0].nextFree == 0)
1411 {
1412 /*
1413 * The free list is empty so it is time to increase the size of the
1414 * array. We choose to double it each time this happens. However,
1415 * there's not much point in starting *real* small.
1416 */
1419
1420 if (newCacheSize < 32)
1421 newCacheSize = 32;
1422
1423 /*
1424 * Be careful not to clobber VfdCache ptr if realloc fails.
1425 */
1426 newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
1427 if (newVfdCache == NULL)
1428 ereport(ERROR,
1430 errmsg("out of memory")));
1432
1433 /*
1434 * Initialize the new entries and link them into the free list.
1435 */
1436 for (i = SizeVfdCache; i < newCacheSize; i++)
1437 {
1438 MemSet(&(VfdCache[i]), 0, sizeof(Vfd));
1439 VfdCache[i].nextFree = i + 1;
1441 }
1444
1445 /*
1446 * Record the new size
1447 */
1449 }
1450
1451 file = VfdCache[0].nextFree;
1452
1454
1455 return file;
1456}

References Assert, DO_DB, elog, ereport, errcode(), errmsg(), ERROR, fb(), vfd::fd, i, LOG, MemSet, vfd::nextFree, realloc, SizeVfdCache, VFD_CLOSED, and VfdCache.

Referenced by PathNameOpenFilePerm().

◆ assign_debug_io_direct()

void assign_debug_io_direct ( const char * newval,
void * extra 
)

Definition at line 4078 of file fd.c.

4079{
4080 int *flags = (int *) extra;
4081
4082 io_direct_flags = *flags;
4083}

References io_direct_flags.

◆ AtEOSubXact_Files()

void AtEOSubXact_Files ( bool  isCommit,
SubTransactionId  mySubid,
SubTransactionId  parentSubid 
)

Definition at line 3180 of file fd.c.

3182{
3183 Index i;
3184
3185 for (i = 0; i < numAllocatedDescs; i++)
3186 {
3187 if (allocatedDescs[i].create_subid == mySubid)
3188 {
3189 if (isCommit)
3191 else
3192 {
3193 /* have to recheck the item after FreeDesc (ugly) */
3195 }
3196 }
3197 }
3198}

References allocatedDescs, AllocateDesc::create_subid, fb(), FreeDesc(), i, and numAllocatedDescs.

Referenced by AbortSubTransaction(), and CommitSubTransaction().

◆ AtEOXact_Files()

◆ BasicOpenFile()

int BasicOpenFile ( const char * fileName,
int  fileFlags 
)

◆ BasicOpenFilePerm()

int BasicOpenFilePerm ( const char * fileName,
int  fileFlags,
mode_t  fileMode 
)

Definition at line 1111 of file fd.c.

1112{
1113 int fd;
1114
1115tryAgain:
1116#ifdef PG_O_DIRECT_USE_F_NOCACHE
1117 fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
1118#else
1119 fd = open(fileName, fileFlags, fileMode);
1120#endif
1121
1122 if (fd >= 0)
1123 {
1124#ifdef PG_O_DIRECT_USE_F_NOCACHE
1125 if (fileFlags & PG_O_DIRECT)
1126 {
1127 if (fcntl(fd, F_NOCACHE, 1) < 0)
1128 {
1129 int save_errno = errno;
1130
1131 close(fd);
1132 errno = save_errno;
1133 return -1;
1134 }
1135 }
1136#endif
1137
1138 return fd; /* success! */
1139 }
1140
1141 if (errno == EMFILE || errno == ENFILE)
1142 {
1143 int save_errno = errno;
1144
1145 ereport(LOG,
1147 errmsg("out of file descriptors: %m; release and retry")));
1148 errno = 0;
1149 if (ReleaseLruFile())
1150 goto tryAgain;
1151 errno = save_errno;
1152 }
1153
1154 return -1; /* failure */
1155}

References close, ereport, errcode(), errmsg(), fb(), fd(), LOG, PG_O_DIRECT, and ReleaseLruFile().

Referenced by BasicOpenFile(), LruInsert(), OpenTransientFilePerm(), PathNameOpenFilePerm(), and readRecoverySignalFile().

◆ BeforeShmemExit_Files()

static void BeforeShmemExit_Files ( int  code,
Datum  arg 
)
static

Definition at line 3227 of file fd.c.

3228{
3229 CleanupTempFiles(false, true);
3230
3231 /* prevent further temp files from being created */
3232#ifdef USE_ASSERT_CHECKING
3234#endif
3235}

References CleanupTempFiles(), and fb().

Referenced by InitTemporaryFileAccess().

◆ check_debug_io_direct()

bool check_debug_io_direct ( char **  newval,
void **  extra,
GucSource  source 
)

Definition at line 3991 of file fd.c.

3992{
3993 bool result = true;
3994 int flags;
3995
3996#if PG_O_DIRECT == 0
3997 if (strcmp(*newval, "") != 0)
3998 {
3999 GUC_check_errdetail("\"%s\" is not supported on this platform.",
4000 "debug_io_direct");
4001 result = false;
4002 }
4003 flags = 0;
4004#else
4005 List *elemlist;
4006 ListCell *l;
4007 char *rawstring;
4008
4009 /* Need a modifiable copy of string */
4011
4012 if (!SplitGUCList(rawstring, ',', &elemlist))
4013 {
4014 GUC_check_errdetail("Invalid list syntax in parameter \"%s\".",
4015 "debug_io_direct");
4018 return false;
4019 }
4020
4021 flags = 0;
4022 foreach(l, elemlist)
4023 {
4024 char *item = (char *) lfirst(l);
4025
4026 if (pg_strcasecmp(item, "data") == 0)
4027 flags |= IO_DIRECT_DATA;
4028 else if (pg_strcasecmp(item, "wal") == 0)
4029 flags |= IO_DIRECT_WAL;
4030 else if (pg_strcasecmp(item, "wal_init") == 0)
4031 flags |= IO_DIRECT_WAL_INIT;
4032 else
4033 {
4034 GUC_check_errdetail("Invalid option \"%s\".", item);
4035 result = false;
4036 break;
4037 }
4038 }
4039
4040 /*
4041 * It's possible to configure block sizes smaller than our assumed I/O
4042 * alignment size, which could result in invalid I/O requests.
4043 */
4044#if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
4045 if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
4046 {
4047 GUC_check_errdetail("\"%s\" is not supported for WAL because %s is too small.",
4048 "debug_io_direct", "XLOG_BLCKSZ");
4049 result = false;
4050 }
4051#endif
4052#if BLCKSZ < PG_IO_ALIGN_SIZE
4053 if (result && (flags & IO_DIRECT_DATA))
4054 {
4055 GUC_check_errdetail("\"%s\" is not supported for data because %s is too small.",
4056 "debug_io_direct", "BLCKSZ");
4057 result = false;
4058 }
4059#endif
4060
4063#endif
4064
4065 if (!result)
4066 return result;
4067
4068 /* Save the flags in *extra, for use by assign_debug_io_direct */
4069 *extra = guc_malloc(LOG, sizeof(int));
4070 if (!*extra)
4071 return false;
4072 *((int *) *extra) = flags;
4073
4074 return result;
4075}

References fb(), GUC_check_errdetail, guc_malloc(), IO_DIRECT_DATA, IO_DIRECT_WAL, IO_DIRECT_WAL_INIT, lfirst, list_free(), LOG, newval, pfree(), pg_strcasecmp(), pstrdup(), and SplitGUCList().

◆ CleanupTempFiles()

static void CleanupTempFiles ( bool  isCommit,
bool  isProcExit 
)
static

Definition at line 3250 of file fd.c.

3251{
3252 Index i;
3253
3254 /*
3255 * Careful here: at proc_exit we need extra cleanup, not just
3256 * xact_temporary files.
3257 */
3259 {
3260 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3261 for (i = 1; i < SizeVfdCache; i++)
3262 {
3263 unsigned short fdstate = VfdCache[i].fdstate;
3264
3265 if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
3266 VfdCache[i].fileName != NULL)
3267 {
3268 /*
3269 * If we're in the process of exiting a backend process, close
3270 * all temporary files. Otherwise, only close temporary files
3271 * local to the current transaction. They should be closed by
3272 * the ResourceOwner mechanism already, so this is just a
3273 * debugging cross-check.
3274 */
3275 if (isProcExit)
3276 FileClose(i);
3277 else if (fdstate & FD_CLOSE_AT_EOXACT)
3278 {
3279 elog(WARNING,
3280 "temporary file %s not closed at end-of-transaction",
3281 VfdCache[i].fileName);
3282 FileClose(i);
3283 }
3284 }
3285 }
3286
3288 }
3289
3290 /* Complain if any allocated files remain open at commit. */
3291 if (isCommit && numAllocatedDescs > 0)
3292 elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
3294
3295 /* Clean up "allocated" stdio files, dirs and fds. */
3296 while (numAllocatedDescs > 0)
3298}

References allocatedDescs, Assert, elog, fb(), FD_CLOSE_AT_EOXACT, FD_DELETE_AT_CLOSE, vfd::fdstate, FileClose(), FileIsNotOpen, FreeDesc(), have_xact_temporary_files, i, numAllocatedDescs, SizeVfdCache, VfdCache, and WARNING.

Referenced by AtEOXact_Files(), and BeforeShmemExit_Files().

◆ closeAllVfds()

void closeAllVfds ( void  )

Definition at line 3067 of file fd.c.

3068{
3069 Index i;
3070
3071 if (SizeVfdCache > 0)
3072 {
3073 Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
3074 for (i = 1; i < SizeVfdCache; i++)
3075 {
3076 if (!FileIsNotOpen(i))
3077 LruDelete(i);
3078 }
3079 }
3080}

References Assert, FileIsNotOpen, i, LruDelete(), and SizeVfdCache.

Referenced by standard_ProcessUtility().

◆ ClosePipeStream()

int ClosePipeStream ( FILE * file)

Definition at line 3038 of file fd.c.

3039{
3040 int i;
3041
3042 DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
3043
3044 /* Remove file from list of allocated files, if it's present */
3045 for (i = numAllocatedDescs; --i >= 0;)
3046 {
3047 AllocateDesc *desc = &allocatedDescs[i];
3048
3049 if (desc->kind == AllocateDescPipe && desc->desc.file == file)
3050 return FreeDesc(desc);
3051 }
3052
3053 /* Only get here if someone passes us a file not in allocatedDescs */
3054 elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
3055
3056 return pclose(file);
3057}

References allocatedDescs, AllocateDescPipe, AllocateDesc::desc, DO_DB, elog, fb(), AllocateDesc::file, FreeDesc(), i, AllocateDesc::kind, LOG, numAllocatedDescs, and WARNING.

Referenced by ClosePipeFromProgram(), ClosePipeToProgram(), pg_import_system_collations(), run_ssl_passphrase_command(), and shell_finish_command().

◆ CloseTransientFile()

int CloseTransientFile ( int  fd)

Definition at line 2854 of file fd.c.

2855{
2856 int i;
2857
2858 DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
2859
2860 /* Remove fd from list of allocated files, if it's present */
2861 for (i = numAllocatedDescs; --i >= 0;)
2862 {
2863 AllocateDesc *desc = &allocatedDescs[i];
2864
2865 if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
2866 return FreeDesc(desc);
2867 }
2868
2869 /* Only get here if someone passes us a file not in allocatedDescs */
2870 elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
2871
2873
2874 return close(fd);
2875}

References allocatedDescs, AllocateDescRawFD, close, AllocateDesc::desc, DO_DB, elog, AllocateDesc::fd, fd(), FreeDesc(), i, AllocateDesc::kind, LOG, numAllocatedDescs, pgaio_closing_fd(), and WARNING.

Referenced by ApplyLogicalMappingFile(), be_lo_export(), CheckPointLogicalRewriteHeap(), CheckPointReplicationOrigin(), clone_file(), compare_files(), copy_file(), CreateDirAndVersionFile(), dsm_impl_mmap(), durable_rename(), fsync_fname_ext(), get_controlfile_by_exact_path(), heap_xlog_logical_rewrite(), lo_import_internal(), perform_base_backup(), pg_truncate(), qtext_load_file(), qtext_store(), read_relmap_file(), ReadTwoPhaseFile(), RecreateTwoPhaseFile(), ReorderBufferSerializeChange(), ReorderBufferSerializeTXN(), RestoreSlotFromDisk(), SaveSlotToPath(), sendFile(), SendTimeLineHistory(), SimpleLruDoesPhysicalPageExist(), SimpleLruWriteAll(), SlruInternalWritePage(), SlruPhysicalReadPage(), SlruPhysicalWritePage(), SlruSyncFileTag(), SnapBuildRestoreContents(), SnapBuildRestoreSnapshot(), SnapBuildSerialize(), StartupReplicationOrigin(), write_relmap_file(), writeTimeLineHistory(), writeTimeLineHistoryFile(), and XLogFileCopy().

◆ count_usable_fds()

static void count_usable_fds ( int  max_to_probe,
int * usable_fds,
int * already_open 
)
static

Definition at line 964 of file fd.c.

965{
966 int *fd;
967 int size;
968 int used = 0;
969 int highestfd = 0;
970 int j;
971
972#ifdef HAVE_GETRLIMIT
973 struct rlimit rlim;
975#endif
976
977 size = 1024;
978 fd = (int *) palloc(size * sizeof(int));
979
980#ifdef HAVE_GETRLIMIT
982 if (getrlimit_status != 0)
983 ereport(WARNING, (errmsg("getrlimit failed: %m")));
984#endif /* HAVE_GETRLIMIT */
985
986 /* dup until failure or probe limit reached */
987 for (;;)
988 {
989 int thisfd;
990
991#ifdef HAVE_GETRLIMIT
992
993 /*
994 * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
995 * some platforms
996 */
997 if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
998 break;
999#endif
1000
1001 thisfd = dup(2);
1002 if (thisfd < 0)
1003 {
1004 /* Expect EMFILE or ENFILE, else it's fishy */
1005 if (errno != EMFILE && errno != ENFILE)
1006 elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
1007 break;
1008 }
1009
1010 if (used >= size)
1011 {
1012 size *= 2;
1013 fd = (int *) repalloc(fd, size * sizeof(int));
1014 }
1015 fd[used++] = thisfd;
1016
1017 if (highestfd < thisfd)
1018 highestfd = thisfd;
1019
1020 if (used >= max_to_probe)
1021 break;
1022 }
1023
1024 /* release the files we opened */
1025 for (j = 0; j < used; j++)
1026 close(fd[j]);
1027
1028 pfree(fd);
1029
1030 /*
1031 * Return results. usable_fds is just the number of successful dups. We
1032 * assume that the system limit is highestfd+1 (remember 0 is a legal FD
1033 * number) and so already_open is highestfd+1 - usable_fds.
1034 */
1035 *usable_fds = used;
1036 *already_open = highestfd + 1 - used;
1037}

References close, elog, ereport, errmsg(), fb(), fd(), j, palloc(), pfree(), repalloc(), and WARNING.

Referenced by set_max_safe_fds().

◆ data_sync_elevel()

◆ datadir_fsync_fname()

static void datadir_fsync_fname ( const char * fname,
bool  isdir,
int  elevel 
)
static

Definition at line 3808 of file fd.c.

3809{
3810 ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
3811 fname);
3812
3813 /*
3814 * We want to silently ignore errors about unreadable files. Pass that
3815 * desire on to fsync_fname_ext().
3816 */
3817 fsync_fname_ext(fname, isdir, true, elevel);
3818}

References ereport_startup_progress, fb(), and fsync_fname_ext().

Referenced by SyncDataDirectory().

◆ Delete()

static void Delete ( File  file)
static

Definition at line 1253 of file fd.c.

1254{
1255 Vfd *vfdP;
1256
1257 Assert(file != 0);
1258
1259 DO_DB(elog(LOG, "Delete %d (%s)",
1260 file, VfdCache[file].fileName));
1261 DO_DB(_dump_lru());
1262
1263 vfdP = &VfdCache[file];
1264
1265 VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
1266 VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
1267
1268 DO_DB(_dump_lru());
1269}

References Assert, DO_DB, elog, fb(), LOG, vfd::lruLessRecently, vfd::lruMoreRecently, and VfdCache.

Referenced by FileAccess(), FileClose(), and LruDelete().

◆ durable_rename()

int durable_rename ( const char * oldfile,
const char * newfile,
int  elevel 
)

Definition at line 782 of file fd.c.

783{
784 int fd;
785
786 /*
787 * First fsync the old and target path (if it exists), to ensure that they
788 * are properly persistent on disk. Syncing the target file is not
789 * strictly necessary, but it makes it easier to reason about crashes;
790 * because it's then guaranteed that either source or target file exists
791 * after a crash.
792 */
793 if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
794 return -1;
795
797 if (fd < 0)
798 {
799 if (errno != ENOENT)
800 {
801 ereport(elevel,
803 errmsg("could not open file \"%s\": %m", newfile)));
804 return -1;
805 }
806 }
807 else
808 {
809 if (pg_fsync(fd) != 0)
810 {
811 int save_errno;
812
813 /* close file upon error, might not be in transaction context */
817
818 ereport(elevel,
820 errmsg("could not fsync file \"%s\": %m", newfile)));
821 return -1;
822 }
823
824 if (CloseTransientFile(fd) != 0)
825 {
826 ereport(elevel,
828 errmsg("could not close file \"%s\": %m", newfile)));
829 return -1;
830 }
831 }
832
833 /* Time to do the real deal... */
834 if (rename(oldfile, newfile) < 0)
835 {
836 ereport(elevel,
838 errmsg("could not rename file \"%s\" to \"%s\": %m",
839 oldfile, newfile)));
840 return -1;
841 }
842
843 /*
844 * To guarantee renaming the file is persistent, fsync the file with its
845 * new name, and its containing directory.
846 */
847 if (fsync_fname_ext(newfile, false, false, elevel) != 0)
848 return -1;
849
850 if (fsync_parent_path(newfile, elevel) != 0)
851 return -1;
852
853 return 0;
854}

References CloseTransientFile(), ereport, errcode_for_file_access(), errmsg(), fb(), fd(), fsync_fname_ext(), fsync_parent_path(), OpenTransientFile(), PG_BINARY, and pg_fsync().

Referenced by AlterSystemSetConfigFile(), apw_dump_now(), BaseBackup(), basic_archive_file(), bbsink_server_end_manifest(), CheckPointReplicationOrigin(), cleanup_objects_atexit(), CleanupAfterArchiveRecovery(), dir_close(), InitWalRecovery(), InstallXLogFileSegment(), KeepFileRestoredFromArchive(), pgss_shmem_shutdown(), pgstat_write_statsfile(), StartupXLOG(), SummarizeWAL(), write_relmap_file(), writeTimeLineHistory(), writeTimeLineHistoryFile(), and XLogArchiveForceDone().

◆ durable_unlink()

int durable_unlink ( const char * fname,
int  elevel 
)

Definition at line 872 of file fd.c.

873{
874 if (unlink(fname) < 0)
875 {
876 ereport(elevel,
878 errmsg("could not remove file \"%s\": %m",
879 fname)));
880 return -1;
881 }
882
883 /*
884 * To guarantee that the removal of the file is persistent, fsync its
885 * parent directory.
886 */
887 if (fsync_parent_path(fname, elevel) != 0)
888 return -1;
889
890 return 0;
891}

References ereport, errcode_for_file_access(), errmsg(), fb(), and fsync_parent_path().

Referenced by InstallXLogFileSegment(), RemoveXlogFile(), and StartupXLOG().

◆ FileAccess()

static int FileAccess ( File  file)
static

Definition at line 1479 of file fd.c.

1480{
1481 int returnValue;
1482
1483 DO_DB(elog(LOG, "FileAccess %d (%s)",
1484 file, VfdCache[file].fileName));
1485
1486 /*
1487 * Is the file open? If not, open it and put it at the head of the LRU
1488 * ring (possibly closing the least recently used file to get an FD).
1489 */
1490
1491 if (FileIsNotOpen(file))
1492 {
1493 returnValue = LruInsert(file);
1494 if (returnValue != 0)
1495 return returnValue;
1496 }
1497 else if (VfdCache[0].lruLessRecently != file)
1498 {
1499 /*
1500 * We now know that the file is open and that it is not the last one
1501 * accessed, so we need to move it to the head of the Lru ring.
1502 */
1503
1504 Delete(file);
1505 Insert(file);
1506 }
1507
1508 return 0;
1509}

References Delete(), DO_DB, elog, fb(), FileIsNotOpen, Insert(), LOG, LruInsert(), and VfdCache.

Referenced by FileFallocate(), FileGetRawDesc(), FilePrefetch(), FileReadV(), FileSize(), FileStartReadV(), FileSync(), FileTruncate(), FileWriteback(), FileWriteV(), and FileZero().

◆ FileClose()

void FileClose ( File  file)

Definition at line 1965 of file fd.c.

1966{
1967 Vfd *vfdP;
1968
1969 Assert(FileIsValid(file));
1970
1971 DO_DB(elog(LOG, "FileClose: %d (%s)",
1972 file, VfdCache[file].fileName));
1973
1974 vfdP = &VfdCache[file];
1975
1976 if (!FileIsNotOpen(file))
1977 {
1979
1980 /* close the file */
1981 if (close(vfdP->fd) != 0)
1982 {
1983 /*
1984 * We may need to panic on failure to close non-temporary files;
1985 * see LruDelete.
1986 */
1988 "could not close file \"%s\": %m", vfdP->fileName);
1989 }
1990
1991 --nfile;
1992 vfdP->fd = VFD_CLOSED;
1993
1994 /* remove the file from the lru ring */
1995 Delete(file);
1996 }
1997
1998 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
1999 {
2000 /* Subtract its size from current usage (do first in case of error) */
2001 temporary_files_size -= vfdP->fileSize;
2002 vfdP->fileSize = 0;
2003 }
2004
2005 /*
2006 * Delete the file if it was temporary, and make a log entry if wanted
2007 */
2008 if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
2009 {
2010 struct stat filestats;
2011 int stat_errno;
2012
2013 /*
2014 * If we get an error, as could happen within the ereport/elog calls,
2015 * we'll come right back here during transaction abort. Reset the
2016 * flag to ensure that we can't get into an infinite loop. This code
2017 * is arranged to ensure that the worst-case consequence is failing to
2018 * emit log message(s), not failing to attempt the unlink.
2019 */
2020 vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
2021
2022
2023 /* first try the stat() */
2024 if (stat(vfdP->fileName, &filestats))
2025 stat_errno = errno;
2026 else
2027 stat_errno = 0;
2028
2029 /* in any case do the unlink */
2030 if (unlink(vfdP->fileName))
2031 ereport(LOG,
2033 errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
2034
2035 /* and last report the stat results */
2036 if (stat_errno == 0)
2037 ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
2038 else
2039 {
2040 errno = stat_errno;
2041 ereport(LOG,
2043 errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
2044 }
2045 }
2046
2047 /* Unregister it from the resource owner */
2048 if (vfdP->resowner)
2049 ResourceOwnerForgetFile(vfdP->resowner, file);
2050
2051 /*
2052 * Return the Vfd slot to the free list
2053 */
2054 FreeVfd(file);
2055}

References Assert, close, data_sync_elevel(), Delete(), DO_DB, elog, ereport, errcode_for_file_access(), errmsg(), fb(), FD_DELETE_AT_CLOSE, FD_TEMP_FILE_LIMIT, FileIsNotOpen, FileIsValid, FreeVfd(), LOG, nfile, pgaio_closing_fd(), ReportTemporaryFileUsage(), ResourceOwnerForgetFile(), stat, temporary_files_size, VFD_CLOSED, and VfdCache.

Referenced by bbsink_server_end_archive(), bbsink_server_end_manifest(), BufFileClose(), BufFileTruncateFileSet(), CleanupTempFiles(), logical_end_heap_rewrite(), mdclose(), mdimmedsync(), mdregistersync(), mdsyncfiletag(), mdtruncate(), pg_wal_summary_contents(), PrepareForIncrementalBackup(), ReorderBufferIterTXNFinish(), ReorderBufferRestoreChanges(), ResOwnerReleaseFile(), and SummarizeWAL().

◆ FileFallocate()

int FileFallocate ( File  file,
pgoff_t  offset,
pgoff_t  amount,
uint32  wait_event_info 
)

Definition at line 2407 of file fd.c.

2408{
2409#ifdef HAVE_POSIX_FALLOCATE
2410 int returnCode;
2411
2412 Assert(FileIsValid(file));
2413
2414 DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2415 file, VfdCache[file].fileName,
2416 (int64) offset, (int64) amount));
2417
2418 returnCode = FileAccess(file);
2419 if (returnCode < 0)
2420 return -1;
2421
2422retry:
2423 pgstat_report_wait_start(wait_event_info);
2424 returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
2426
2427 if (returnCode == 0)
2428 return 0;
2429 else if (returnCode == EINTR)
2430 goto retry;
2431
2432 /* for compatibility with %m printing etc */
2433 errno = returnCode;
2434
2435 /*
2436 * Return in cases of a "real" failure, if fallocate is not supported,
2437 * fall through to the FileZero() backed implementation.
2438 */
2440 return -1;
2441#endif
2442
2443 return FileZero(file, offset, amount, wait_event_info);
2444}

References Assert, DO_DB, EINTR, elog, EOPNOTSUPP, fb(), fd(), FileAccess(), FileIsValid, FileZero(), INT64_FORMAT, LOG, pgstat_report_wait_end(), pgstat_report_wait_start(), and VfdCache.

Referenced by mdzeroextend().

◆ FileGetRawDesc()

int FileGetRawDesc ( File  file)

Definition at line 2515 of file fd.c.

2516{
2517 int returnCode;
2518
2519 returnCode = FileAccess(file);
2520 if (returnCode < 0)
2521 return returnCode;
2522
2523 Assert(FileIsValid(file));
2524 return VfdCache[file].fd;
2525}

References Assert, fb(), vfd::fd, FileAccess(), FileIsValid, and VfdCache.

Referenced by mdfd().

◆ FileGetRawFlags()

int FileGetRawFlags ( File  file)

Definition at line 2531 of file fd.c.

2532{
2533 Assert(FileIsValid(file));
2534 return VfdCache[file].fileFlags;
2535}

References Assert, vfd::fileFlags, FileIsValid, and VfdCache.

◆ FileGetRawMode()

mode_t FileGetRawMode ( File  file)

Definition at line 2541 of file fd.c.

2542{
2543 Assert(FileIsValid(file));
2544 return VfdCache[file].fileMode;
2545}

References Assert, FileIsValid, vfd::fileMode, and VfdCache.

◆ FilePathName()

◆ FilePrefetch()

int FilePrefetch ( File  file,
pgoff_t  offset,
pgoff_t  amount,
uint32  wait_event_info 
)

Definition at line 2066 of file fd.c.

2067{
2068 Assert(FileIsValid(file));
2069
2070 DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2071 file, VfdCache[file].fileName,
2072 (int64) offset, (int64) amount));
2073
2074#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
2075 {
2076 int returnCode;
2077
2078 returnCode = FileAccess(file);
2079 if (returnCode < 0)
2080 return returnCode;
2081
2082retry:
2083 pgstat_report_wait_start(wait_event_info);
2084 returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
2087
2088 if (returnCode == EINTR)
2089 goto retry;
2090
2091 return returnCode;
2092 }
2093#elif defined(__darwin__)
2094 {
2095 struct radvisory
2096 {
2097 off_t ra_offset; /* offset into the file */
2098 int ra_count; /* size of the read */
2099 } ra;
2100 int returnCode;
2101
2102 returnCode = FileAccess(file);
2103 if (returnCode < 0)
2104 return returnCode;
2105
2106 ra.ra_offset = offset;
2107 ra.ra_count = amount;
2108 pgstat_report_wait_start(wait_event_info);
2111 if (returnCode != -1)
2112 return 0;
2113 else
2114 return errno;
2115 }
2116#else
2117 return 0;
2118#endif
2119}

References Assert, DO_DB, EINTR, elog, fb(), fd(), FileAccess(), FileIsValid, INT64_FORMAT, LOG, pgstat_report_wait_end(), pgstat_report_wait_start(), and VfdCache.

Referenced by mdprefetch().

◆ FileReadV()

ssize_t FileReadV ( File  file,
const struct iovec * iov,
int  iovcnt,
pgoff_t  offset,
uint32  wait_event_info 
)

Definition at line 2148 of file fd.c.

2150{
2152 Vfd *vfdP;
2153
2154 Assert(FileIsValid(file));
2155
2156 DO_DB(elog(LOG, "FileReadV: %d (%s) " INT64_FORMAT " %d",
2157 file, VfdCache[file].fileName,
2158 (int64) offset,
2159 iovcnt));
2160
2161 returnCode = FileAccess(file);
2162 if (returnCode < 0)
2163 return returnCode;
2164
2165 vfdP = &VfdCache[file];
2166
2167retry:
2168 pgstat_report_wait_start(wait_event_info);
2169 returnCode = pg_preadv(vfdP->fd, iov, iovcnt, offset);
2171
2172 if (returnCode < 0)
2173 {
2174 /*
2175 * Windows may run out of kernel buffers and return "Insufficient
2176 * system resources" error. Wait a bit and retry to solve it.
2177 *
2178 * It is rumored that EINTR is also possible on some Unix filesystems,
2179 * in which case immediate retry is indicated.
2180 */
2181#ifdef WIN32
2183
2184 switch (error)
2185 {
2187 pg_usleep(1000L);
2188 errno = EINTR;
2189 break;
2190 default:
2192 break;
2193 }
2194#endif
2195 /* OK to retry if interrupted */
2196 if (errno == EINTR)
2197 goto retry;
2198 }
2199
2200 return returnCode;
2201}

References _dosmaperr(), Assert, DO_DB, EINTR, elog, error(), fb(), FileAccess(), FileIsValid, INT64_FORMAT, LOG, pg_preadv(), pg_usleep(), pgstat_report_wait_end(), pgstat_report_wait_start(), and VfdCache.

Referenced by FileRead(), and mdreadv().

◆ FileSize()

pgoff_t FileSize ( File  file)

Definition at line 2447 of file fd.c.

2448{
2449 Assert(FileIsValid(file));
2450
2451 DO_DB(elog(LOG, "FileSize %d (%s)",
2452 file, VfdCache[file].fileName));
2453
2454 if (FileIsNotOpen(file))
2455 {
2456 if (FileAccess(file) < 0)
2457 return (pgoff_t) -1;
2458 }
2459
2460 return lseek(VfdCache[file].fd, 0, SEEK_END);
2461}

References Assert, DO_DB, elog, fb(), fd(), FileAccess(), FileIsNotOpen, FileIsValid, LOG, and VfdCache.

Referenced by _mdnblocks(), BufFileSeek(), and BufFileSize().

◆ FileStartReadV()

int FileStartReadV ( PgAioHandle * ioh,
File  file,
int  iovcnt,
pgoff_t  offset,
uint32  wait_event_info 
)

Definition at line 2204 of file fd.c.

2207{
2208 int returnCode;
2209 Vfd *vfdP;
2210
2211 Assert(FileIsValid(file));
2212
2213 DO_DB(elog(LOG, "FileStartReadV: %d (%s) " INT64_FORMAT " %d",
2214 file, VfdCache[file].fileName,
2215 (int64) offset,
2216 iovcnt));
2217
2218 returnCode = FileAccess(file);
2219 if (returnCode < 0)
2220 return returnCode;
2221
2222 vfdP = &VfdCache[file];
2223
2224 pgaio_io_start_readv(ioh, vfdP->fd, iovcnt, offset);
2225
2226 return 0;
2227}

References Assert, DO_DB, elog, fb(), FileAccess(), FileIsValid, INT64_FORMAT, LOG, pgaio_io_start_readv(), and VfdCache.

Referenced by mdstartreadv().

◆ FileSync()

int FileSync ( File  file,
uint32  wait_event_info 
)

Definition at line 2335 of file fd.c.

2336{
2337 int returnCode;
2338
2339 Assert(FileIsValid(file));
2340
2341 DO_DB(elog(LOG, "FileSync: %d (%s)",
2342 file, VfdCache[file].fileName));
2343
2344 returnCode = FileAccess(file);
2345 if (returnCode < 0)
2346 return returnCode;
2347
2348 pgstat_report_wait_start(wait_event_info);
2349 returnCode = pg_fsync(VfdCache[file].fd);
2351
2352 return returnCode;
2353}

References Assert, DO_DB, elog, fb(), fd(), FileAccess(), FileIsValid, LOG, pg_fsync(), pgstat_report_wait_end(), pgstat_report_wait_start(), and VfdCache.

Referenced by bbsink_server_end_archive(), logical_end_heap_rewrite(), mdimmedsync(), mdsyncfiletag(), and register_dirty_segment().

◆ FileTruncate()

int FileTruncate ( File  file,
pgoff_t  offset,
uint32  wait_event_info 
)

Definition at line 2464 of file fd.c.

2465{
2466 int returnCode;
2467
2468 Assert(FileIsValid(file));
2469
2470 DO_DB(elog(LOG, "FileTruncate %d (%s)",
2471 file, VfdCache[file].fileName));
2472
2473 returnCode = FileAccess(file);
2474 if (returnCode < 0)
2475 return returnCode;
2476
2477 pgstat_report_wait_start(wait_event_info);
2478 returnCode = pg_ftruncate(VfdCache[file].fd, offset);
2480
2481 if (returnCode == 0 && VfdCache[file].fileSize > offset)
2482 {
2483 /* adjust our state for truncation of a temp file */
2484 Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
2485 temporary_files_size -= VfdCache[file].fileSize - offset;
2486 VfdCache[file].fileSize = offset;
2487 }
2488
2489 return returnCode;
2490}

References Assert, DO_DB, elog, fb(), fd(), FD_TEMP_FILE_LIMIT, FileAccess(), FileIsValid, vfd::fileSize, LOG, pg_ftruncate(), pgstat_report_wait_end(), pgstat_report_wait_start(), temporary_files_size, and VfdCache.

Referenced by BufFileTruncateFileSet(), and mdtruncate().

◆ FileWriteback()

void FileWriteback ( File  file,
pgoff_t  offset,
pgoff_t  nbytes,
uint32  wait_event_info 
)

Definition at line 2122 of file fd.c.

2123{
2124 int returnCode;
2125
2126 Assert(FileIsValid(file));
2127
2128 DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2129 file, VfdCache[file].fileName,
2130 (int64) offset, (int64) nbytes));
2131
2132 if (nbytes <= 0)
2133 return;
2134
2135 if (VfdCache[file].fileFlags & PG_O_DIRECT)
2136 return;
2137
2138 returnCode = FileAccess(file);
2139 if (returnCode < 0)
2140 return;
2141
2142 pgstat_report_wait_start(wait_event_info);
2143 pg_flush_data(VfdCache[file].fd, offset, nbytes);
2144 pgstat_report_wait_end();
2145}

References Assert, DO_DB, elog, fb(), fd(), FileAccess(), FileIsValid, INT64_FORMAT, LOG, pg_flush_data(), PG_O_DIRECT, pgstat_report_wait_end(), pgstat_report_wait_start(), and VfdCache.

Referenced by mdwriteback().

◆ FileWriteV()

ssize_t FileWriteV ( File  file,
const struct iovec *iov,
int  iovcnt,
pgoff_t  offset,
uint32  wait_event_info 
)

Definition at line 2230 of file fd.c.

2232{
2233 ssize_t returnCode;
2234 Vfd *vfdP;
2235
2236 Assert(FileIsValid(file));
2237
2238 DO_DB(elog(LOG, "FileWriteV: %d (%s) " INT64_FORMAT " %d",
2239 file, VfdCache[file].fileName,
2240 (int64) offset,
2241 iovcnt));
2242
2243 returnCode = FileAccess(file);
2244 if (returnCode < 0)
2245 return returnCode;
2246
2247 vfdP = &VfdCache[file];
2248
2249 /*
2250 * If enforcing temp_file_limit and it's a temp file, check to see if the
2251 * write would overrun temp_file_limit, and throw error if so. Note: it's
2252 * really a modularity violation to throw error here; we should set errno
2253 * and return -1. However, there's no way to report a suitable error
2254 * message if we do that. All current callers would just throw error
2255 * immediately anyway, so this is safe at present.
2256 */
2257 if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
2258 {
2259 pgoff_t past_write = offset;
2260
2261 for (int i = 0; i < iovcnt; ++i)
2262 past_write += iov[i].iov_len;
2263
2264 if (past_write > vfdP->fileSize)
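2265 {
2266 uint64 newTotal = temporary_files_size;
2267
2268 newTotal += past_write - vfdP->fileSize;
2269 if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
2270 ereport(ERROR,
2271 (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
2272 errmsg("temporary file size exceeds \"temp_file_limit\" (%dkB)",
2273 temp_file_limit)));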
2274 }
2275 }
2276
2277retry:
2278 pgstat_report_wait_start(wait_event_info);
2279 returnCode = pg_pwritev(vfdP->fd, iov, iovcnt, offset);
2280 pgstat_report_wait_end();
2281
2282 if (returnCode >= 0)
2283 {
2284 /*
2285 * Some callers expect short writes to set errno, and traditionally we
2286 * have assumed that they imply disk space shortage. We don't want to
2287 * waste CPU cycles adding up the total size here, so we'll just set
2288 * it for all successful writes in case such a caller determines that
2289 * the write was short and ereports "%m".
2290 */
2291 errno = ENOSPC;
2292
2293 /*
2294 * Maintain fileSize and temporary_files_size if it's a temp file.
2295 */
2296 if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
2297 {
2298 pgoff_t past_write = offset + returnCode;
2299
2300 if (past_write > vfdP->fileSize)
2301 {
2302 temporary_files_size += past_write - vfdP->fileSize;
2303 vfdP->fileSize = past_write;
2304 }
2305 }
2306 }
2307 else
2308 {
2309 /*
2310 * See comments in FileReadV()
2311 */
2312#ifdef WIN32
2313 DWORD error = GetLastError();
2314
2315 switch (error)
2316 {
2317 case ERROR_NO_SYSTEM_RESOURCES:
2318 pg_usleep(1000L);
2319 errno = EINTR;
2320 break;
2321 default:
2322 _dosmaperr(error);
2323 break;
2324 }
2325#endif
2326 /* OK to retry if interrupted */
2327 if (errno == EINTR)
2328 goto retry;
2329 }
2330
2331 return returnCode;
2332}

References _dosmaperr(), Assert, DO_DB, EINTR, elog, ereport, errcode(), errmsg(), ERROR, error(), fb(), FD_TEMP_FILE_LIMIT, FileAccess(), FileIsValid, vfd::fileSize, i, INT64_FORMAT, LOG, pg_pwritev(), pg_usleep(), pgstat_report_wait_end(), pgstat_report_wait_start(), temp_file_limit, temporary_files_size, and VfdCache.

Referenced by FileWrite(), and mdwritev().

◆ FileZero()

int FileZero ( File  file,
pgoff_t  offset,
pgoff_t  amount,
uint32  wait_event_info 
)

Definition at line 2362 of file fd.c.

2363{
2364 int returnCode;
2365 ssize_t written;
2366
2367 Assert(FileIsValid(file));
2368
2369 DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
2370 file, VfdCache[file].fileName,
2371 (int64) offset, (int64) amount));
2372
2373 returnCode = FileAccess(file);
2374 if (returnCode < 0)
2375 return returnCode;
2376
2377 pgstat_report_wait_start(wait_event_info);
2378 written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
2379 pgstat_report_wait_end();
2380
2381 if (written < 0)
2382 return -1;
2383 else if (written != amount)
2384 {
2385 /* if errno is unset, assume problem is no disk space */
2386 if (errno == 0)
2387 errno = ENOSPC;
2388 return -1;
2389 }
2390
2391 return 0;
2392}

References Assert, DO_DB, elog, fb(), fd(), FileAccess(), FileIsValid, INT64_FORMAT, LOG, pg_pwrite_zeros(), pgstat_report_wait_end(), pgstat_report_wait_start(), and VfdCache.

Referenced by FileFallocate(), and mdzeroextend().

◆ FreeDesc()

static int FreeDesc ( AllocateDesc *desc)
static

Definition at line 2786 of file fd.c.

2787{
2788 int result;
2789
2790 /* Close the underlying object */
2791 switch (desc->kind)
2792 {
2793 case AllocateDescFile:
2794 result = fclose(desc->desc.file);
2795 break;
2796 case AllocateDescPipe:
2797 result = pclose(desc->desc.file);
2798 break;
2799 case AllocateDescDir:
2800 result = closedir(desc->desc.dir);
2801 break;
2802 case AllocateDescRawFD:
2803 pgaio_closing_fd(desc->desc.fd);
2804 result = close(desc->desc.fd);
2805 break;
2806 default:
2807 elog(ERROR, "AllocateDesc kind not recognized");
2808 result = 0; /* keep compiler quiet */
2809 break;
2810 }
2811
2812 /* Compact storage in the allocatedDescs array */
2813 numAllocatedDescs--;
2814 *desc = allocatedDescs[numAllocatedDescs];
2815
2816 return result;
2817}

References allocatedDescs, AllocateDescDir, AllocateDescFile, AllocateDescPipe, AllocateDescRawFD, close, closedir(), AllocateDesc::desc, AllocateDesc::dir, elog, ERROR, fb(), AllocateDesc::fd, AllocateDesc::file, AllocateDesc::kind, numAllocatedDescs, and pgaio_closing_fd().

Referenced by AtEOSubXact_Files(), CleanupTempFiles(), ClosePipeStream(), CloseTransientFile(), FreeDir(), and FreeFile().

◆ FreeDir()

int FreeDir ( DIR *dir)

Definition at line 3008 of file fd.c.

3009{
3010 int i;
3011
3012 /* Nothing to do if AllocateDir failed */
3013 if (dir == NULL)
3014 return 0;
3015
3016 DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
3017
3018 /* Remove dir from list of allocated dirs, if it's present */
3019 for (i = numAllocatedDescs; --i >= 0;)
3020 {
3021 AllocateDesc *desc = &allocatedDescs[i];
3022
3023 if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
3024 return FreeDesc(desc);
3025 }
3026
3027 /* Only get here if someone passes us a dir not in allocatedDescs */
3028 elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
3029
3030 return closedir(dir);
3031}

References allocatedDescs, AllocateDescDir, closedir(), AllocateDesc::desc, AllocateDesc::dir, DO_DB, elog, fb(), FreeDesc(), i, AllocateDesc::kind, LOG, numAllocatedDescs, and WARNING.

Referenced by calculate_database_size(), calculate_tablespace_size(), CheckPointLogicalRewriteHeap(), CheckPointSnapBuild(), CleanupBackupHistory(), copydir(), db_dir_size(), DeleteAllExportedSnapshotFiles(), destroy_tablespace_directories(), directory_is_empty(), do_pg_backup_start(), dsm_cleanup_for_mmap(), extension_file_exists(), get_ext_ver_list(), GetConfFilesInDir(), getInstallationPaths(), GetWalSummaries(), movedb(), ParseTzFile(), perform_base_backup(), pg_available_extension_versions(), pg_available_extensions(), pg_ls_dir(), pg_ls_dir_files(), pg_tablespace_databases(), pg_tzenumerate_end(), pg_tzenumerate_next(), pgarch_readyXlog(), RelationCacheInitFileRemove(), RelationCacheInitFileRemoveInDir(), RemoveNonParentXlogFiles(), RemoveOldXlogFiles(), RemovePgTempFiles(), RemovePgTempFilesInDir(), RemovePgTempRelationFiles(), RemovePgTempRelationFilesInDbspace(), RemoveTempXlogFiles(), ReorderBufferCleanupSerializedTXNs(), ResetUnloggedRelations(), ResetUnloggedRelationsInDbspaceDir(), ResetUnloggedRelationsInTablespaceDir(), restoreTwoPhaseData(), scan_directory_ci(), sendDir(), SlruScanDirectory(), StartupReorderBuffer(), StartupReplicationSlots(), SyncDataDirectory(), UpdateLogicalMappings(), walkdir(), and XLogGetOldestSegno().

◆ FreeFile()

int FreeFile ( FILE *file)

Definition at line 2826 of file fd.c.

2827{
2828 int i;
2829
2830 DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
2831
2832 /* Remove file from list of allocated files, if it's present */
2833 for (i = numAllocatedDescs; --i >= 0;)
2834 {
2835 AllocateDesc *desc = &allocatedDescs[i];
2836
2837 if (desc->kind == AllocateDescFile && desc->desc.file == file)
2838 return FreeDesc(desc);
2839 }
2840
2841 /* Only get here if someone passes us a file not in allocatedDescs */
2842 elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
2843
2844 return fclose(file);
2845}

References allocatedDescs, AllocateDescFile, AllocateDesc::desc, DO_DB, elog, fb(), AllocateDesc::file, FreeDesc(), i, AllocateDesc::kind, LOG, numAllocatedDescs, and WARNING.

Referenced by AlterSystemSetConfigFile(), apw_dump_now(), apw_load_buffers(), checkControlFile(), do_pg_backup_stop(), EndCopy(), EndCopyFrom(), entry_reset(), existsTimeLineHistory(), ExportSnapshot(), free_auth_file(), gc_qtexts(), GetHugePageSize(), ImportSnapshot(), load_dh_file(), load_relcache_init_file(), parse_extension_control_file(), ParseConfigFile(), ParseTzFile(), pg_current_logfile(), pg_promote(), pgss_shmem_shutdown(), pgss_shmem_startup(), pgstat_read_statsfile(), pgstat_write_statsfile(), read_backup_label(), read_binary_file(), read_tablespace_map(), read_whole_file(), readTimeLineHistory(), test_custom_stats_var_finish(), tsearch_readline_end(), ValidatePgVersion(), write_relcache_init_file(), XLogArchiveForceDone(), and XLogArchiveNotify().

◆ FreeVfd()

static void FreeVfd ( File  file)
static

Definition at line 1459 of file fd.c.

1460{
1461 Vfd *vfdP = &VfdCache[file];
1462
1463 DO_DB(elog(LOG, "FreeVfd: %d (%s)",
1464 file, vfdP->fileName ? vfdP->fileName : ""));
1465
1466 if (vfdP->fileName != NULL)
1467 {
1468 free(vfdP->fileName);
1469 vfdP->fileName = NULL;
1470 }
1471 vfdP->fdstate = 0x0;
1472
1473 vfdP->nextFree = VfdCache[0].nextFree;
1474 VfdCache[0].nextFree = file;
1475}

References DO_DB, elog, fb(), free, LOG, vfd::nextFree, and VfdCache.

Referenced by FileClose(), and PathNameOpenFilePerm().

◆ fsync_fname()

◆ fsync_fname_ext()

int fsync_fname_ext ( const char *fname,
bool  isdir,
bool  ignore_perm,
int  elevel 
)

Definition at line 3846 of file fd.c.

3847{
3848 int fd;
3849 int flags;
3850 int returncode;
3851
3852 /*
3853 * Some OSs require directories to be opened read-only whereas other
3854 * systems don't allow us to fsync files opened read-only; so we need both
3855 * cases here. Using O_RDWR will cause us to fail to fsync files that are
3856 * not writable by our userid, but we assume that's OK.
3857 */
3858 flags = PG_BINARY;
3859 if (!isdir)
3860 flags |= O_RDWR;
3861 else
3862 flags |= O_RDONLY;
3863
3864 fd = OpenTransientFile(fname, flags);
3865
3866 /*
3867 * Some OSs don't allow us to open directories at all (Windows returns
3868 * EACCES), just ignore the error in that case. If desired also silently
3869 * ignoring errors about unreadable files. Log others.
3870 */
3871 if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
3872 return 0;
3873 else if (fd < 0 && ignore_perm && errno == EACCES)
3874 return 0;
3875 else if (fd < 0)
3876 {
3877 ereport(elevel,
3878 (errcode_for_file_access(),
3879 errmsg("could not open file \"%s\": %m", fname)));
3880 return -1;
3881 }
3882
3883 returncode = pg_fsync(fd);
3884
3885 /*
3886 * Some OSes don't allow us to fsync directories at all, so we can ignore
3887 * those errors. Anything else needs to be logged.
3888 */
3889 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
3890 {
3891 int save_errno;
3892
3893 /* close file upon error, might not be in transaction context */
3894 save_errno = errno;
3895 (void) CloseTransientFile(fd);
3896 errno = save_errno;
3897
3898 ereport(elevel,
3899 (errcode_for_file_access(),
3900 errmsg("could not fsync file \"%s\": %m", fname)));
3901 return -1;
3902 }
3903
3904 if (CloseTransientFile(fd) != 0)
3905 {
3906 ereport(elevel,
3907 (errcode_for_file_access(),
3908 errmsg("could not close file \"%s\": %m", fname)));
3909 return -1;
3910 }
3911
3912 return 0;
3913}

References CloseTransientFile(), ereport, errcode_for_file_access(), errmsg(), fb(), fd(), OpenTransientFile(), PG_BINARY, and pg_fsync().

Referenced by datadir_fsync_fname(), durable_rename(), fsync_fname(), and fsync_parent_path().

◆ fsync_parent_path()

static int fsync_parent_path ( const char *fname,
int  elevel 
)
static

Definition at line 3922 of file fd.c.

3923{
3924 char parentpath[MAXPGPATH];
3925
3926 strlcpy(parentpath, fname, MAXPGPATH);
3927 get_parent_directory(parentpath);
3928
3929 /*
3930 * get_parent_directory() returns an empty string if the input argument is
3931 * just a file name (see comments in path.c), so handle that as being the
3932 * current directory.
3933 */
3934 if (strlen(parentpath) == 0)
3935 strlcpy(parentpath, ".", MAXPGPATH);
3936
3937 if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
3938 return -1;
3939
3940 return 0;
3941}

References fb(), fsync_fname_ext(), get_parent_directory(), MAXPGPATH, and strlcpy().

Referenced by dir_close(), dir_open_for_write(), durable_rename(), durable_unlink(), swap_catalog_files(), and tar_finish().

◆ GetNextTempTableSpace()

Oid GetNextTempTableSpace ( void  )

Definition at line 3158 of file fd.c.

3159{
3160 if (numTempTableSpaces > 0)
3161 {
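3162 /* Advance nextTempTableSpace counter with wraparound */
3163 if (++nextTempTableSpace >= numTempTableSpaces)
3164 nextTempTableSpace = 0;
3165 return tempTableSpaces[nextTempTableSpace];
3166 }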
3167 return InvalidOid;
3168}

References InvalidOid, nextTempTableSpace, numTempTableSpaces, and tempTableSpaces.

Referenced by GetDefaultTablespace(), and OpenTemporaryFile().

◆ GetTempTablespaces()

int GetTempTablespaces ( Oid *tableSpaces,
int  numSpaces 
)

Definition at line 3140 of file fd.c.

3141{
3142 int i;
3143
3144 Assert(TempTablespacesAreSet());
3145 for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
3146 tableSpaces[i] = tempTableSpaces[i];
3147
3148 return i;
3149}

References Assert, fb(), i, numTempTableSpaces, tempTableSpaces, and TempTablespacesAreSet().

Referenced by FileSetInit().

◆ InitFileAccess()

void InitFileAccess ( void  )

Definition at line 903 of file fd.c.

904{
905 Assert(SizeVfdCache == 0); /* call me only once */
906
907 /* initialize cache header entry */
908 VfdCache = (Vfd *) malloc(sizeof(Vfd));
909 if (VfdCache == NULL)
910 ereport(FATAL,
911 (errcode(ERRCODE_OUT_OF_MEMORY),
912 errmsg("out of memory")));
913
914 MemSet(&(VfdCache[0]), 0, sizeof(Vfd));
915 VfdCache->fd = VFD_CLOSED;
916
917 SizeVfdCache = 1;
918}

References Assert, ereport, errcode(), errmsg(), FATAL, fb(), vfd::fd, malloc, MemSet, SizeVfdCache, VFD_CLOSED, and VfdCache.

Referenced by BaseInit().

◆ InitTemporaryFileAccess()

void InitTemporaryFileAccess ( void  )

Definition at line 933 of file fd.c.

934{
935 Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
936 Assert(!temporary_files_allowed); /* call me only once */
937
938 /*
939 * Register before-shmem-exit hook to ensure temp files are dropped while
940 * we can still report stats.
941 */
942 before_shmem_exit(BeforeShmemExit_Files, 0);
943
944#ifdef USE_ASSERT_CHECKING
945 temporary_files_allowed = true;
946#endif
947}

References Assert, before_shmem_exit(), BeforeShmemExit_Files(), fb(), and SizeVfdCache.

Referenced by BaseInit().

◆ Insert()

static void Insert ( File  file)
static

Definition at line 1300 of file fd.c.

1301{
1302 Vfd *vfdP;
1303
1304 Assert(file != 0);
1305
1306 DO_DB(elog(LOG, "Insert %d (%s)",
1307 file, VfdCache[file].fileName));
1308 DO_DB(_dump_lru());
1309
1310 vfdP = &VfdCache[file];
1311
1312 vfdP->lruMoreRecently = 0;
1313 vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
1314 VfdCache[0].lruLessRecently = file;
1315 VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
1316
1317 DO_DB(_dump_lru());
1318}

References Assert, DO_DB, elog, fb(), LOG, vfd::lruLessRecently, vfd::lruMoreRecently, and VfdCache.

Referenced by AdvanceXLInsertBuffer(), CreateCheckPoint(), FileAccess(), GetXLogInsertRecPtr(), LruInsert(), PathNameOpenFilePerm(), ReserveXLogInsertLocation(), ReserveXLogSwitch(), StartupXLOG(), UpdateFullPageWrites(), WaitXLogInsertionsToFinish(), XLogInsertRecord(), and XLogWrite().

◆ looks_like_temp_rel_name()

bool looks_like_temp_rel_name ( const char *name)

Definition at line 3498 of file fd.c.

3499{
3500 int pos;
3501 int savepos;
3502
3503 /* Must start with "t". */
3504 if (name[0] != 't')
3505 return false;
3506
3507 /* Followed by a non-empty string of digits and then an underscore. */
3508 for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
3509 ;
3510 if (pos == 1 || name[pos] != '_')
3511 return false;
3512
3513 /* Followed by another nonempty string of digits. */
3514 for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
3515 ;
3516 if (savepos == pos)
3517 return false;
3518
3519 /* We might have _forkname or .segment or both. */
3520 if (name[pos] == '_')
3521 {
3522 int forkchar = forkname_chars(&name[pos + 1], NULL);
3523
3524 if (forkchar <= 0)
3525 return false;
3526 pos += forkchar + 1;
3527 }
3528 if (name[pos] == '.')
3529 {
3530 int segchar;
3531
3532 for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
3533 ;
3534 if (segchar <= 1)
3535 return false;
3536 pos += segchar;
3537 }
3538
3539 /* Now we should be at the end. */
3540 if (name[pos] != '\0')
3541 return false;
3542 return true;
3543}

References fb(), forkname_chars(), and name.

Referenced by RemovePgTempRelationFilesInDbspace(), and sendDir().

◆ LruDelete()

static void LruDelete ( File  file)
static

Definition at line 1272 of file fd.c.

1273{
1274 Vfd *vfdP;
1275
1276 Assert(file != 0);
1277
1278 DO_DB(elog(LOG, "LruDelete %d (%s)",
1279 file, VfdCache[file].fileName));
1280
1281 vfdP = &VfdCache[file];
1282
1283 pgaio_closing_fd(vfdP->fd);
1284
1285 /*
1286 * Close the file. We aren't expecting this to fail; if it does, better
1287 * to leak the FD than to mess up our internal state.
1288 */
1289 if (close(vfdP->fd) != 0)
1290 elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
1291 "could not close file \"%s\": %m", vfdP->fileName);
1292 vfdP->fd = VFD_CLOSED;
1293 --nfile;
1294
1295 /* delete the vfd record from the LRU ring */
1296 Delete(file);
1297}

References Assert, close, data_sync_elevel(), Delete(), DO_DB, elog, fb(), FD_TEMP_FILE_LIMIT, LOG, nfile, pgaio_closing_fd(), VFD_CLOSED, and VfdCache.

Referenced by closeAllVfds(), and ReleaseLruFile().

◆ LruInsert()

static int LruInsert ( File  file)
static

Definition at line 1322 of file fd.c.

1323{
1324 Vfd *vfdP;
1325
1326 Assert(file != 0);
1327
1328 DO_DB(elog(LOG, "LruInsert %d (%s)",
1329 file, VfdCache[file].fileName));
1330
1331 vfdP = &VfdCache[file];
1332
1333 if (FileIsNotOpen(file))
1334 {
1335 /* Close excess kernel FDs. */
1336 ReleaseLruFiles();
1337
1338 /*
1339 * The open could still fail for lack of file descriptors, eg due to
1340 * overall system file table being full. So, be prepared to release
1341 * another FD if necessary...
1342 */
1343 vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
1344 vfdP->fileMode);
1345 if (vfdP->fd < 0)
1346 {
1347 DO_DB(elog(LOG, "re-open failed: %m"));
1348 return -1;
1349 }
1350 else
1351 {
1352 ++nfile;
1353 }
1354 }
1355
1356 /*
1357 * put it at the head of the Lru ring
1358 */
1359
1360 Insert(file);
1361
1362 return 0;
1363}

References Assert, BasicOpenFilePerm(), DO_DB, elog, fb(), FileIsNotOpen, Insert(), LOG, nfile, ReleaseLruFiles(), and VfdCache.

Referenced by FileAccess().

◆ MakePGDirectory()

◆ OpenPipeStream()

FILE * OpenPipeStream ( const char *command,
const char *mode 
)

Definition at line 2730 of file fd.c.

2731{
2732 FILE *file;
2733 int save_errno;
2734
2735 DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
2736 numAllocatedDescs, command));
2737
2738 /* Can we allocate another non-virtual FD? */
2739 if (!reserveAllocatedDesc())
2740 ereport(ERROR,
2741 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2742 errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
2743 maxAllocatedDescs, command)));
2744
2745 /* Close excess kernel FDs. */
2746 ReleaseLruFiles();
2747
2748TryAgain:
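2749 fflush(NULL);
2750 pqsignal(SIGPIPE, SIG_DFL);
2751 errno = 0;
2752 file = popen(command, mode);
2753 save_errno = errno;
2754 pqsignal(SIGPIPE, SIG_IGN);
2755 errno = save_errno;
2756 if (file != NULL)
2757 {
2758 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2759
2760 desc->kind = AllocateDescPipe;
2761 desc->desc.file = file;
2762 desc->create_subid = GetCurrentSubTransactionId();
2763 numAllocatedDescs++;
2764 return desc->desc.file;
2765 }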
2766
2767 if (errno == EMFILE || errno == ENFILE)
2768 {
2769 ereport(LOG,
2770 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2771 errmsg("out of file descriptors: %m; release and retry")));
2772 if (ReleaseLruFile())
2773 goto TryAgain;
2774 errno = save_errno;
2775 }
2776
2777 return NULL;
2778}

References allocatedDescs, AllocateDescPipe, AllocateDesc::create_subid, AllocateDesc::desc, DO_DB, elog, ereport, errcode(), errmsg(), ERROR, fb(), AllocateDesc::file, GetCurrentSubTransactionId(), AllocateDesc::kind, LOG, maxAllocatedDescs, mode, numAllocatedDescs, pqsignal, ReleaseLruFile(), ReleaseLruFiles(), reserveAllocatedDesc(), and SIGPIPE.

Referenced by BeginCopyFrom(), BeginCopyTo(), pg_import_system_collations(), run_ssl_passphrase_command(), and shell_run_command().

◆ OpenTemporaryFile()

File OpenTemporaryFile ( bool  interXact)

Definition at line 1711 of file fd.c.

1712{
1713 File file = 0;
1714
1715 Assert(temporary_files_allowed); /* check temp file access is up */
1716
1717 /*
1718 * Make sure the current resource owner has space for this File before we
1719 * open it, if we'll be registering it below.
1720 */
1721 if (!interXact)
1722 ResourceOwnerEnlarge(CurrentResourceOwner);
1723
1724 /*
1725 * If some temp tablespace(s) have been given to us, try to use the next
1726 * one. If a given tablespace can't be found, we silently fall back to
1727 * the database's default tablespace.
1728 *
1729 * BUT: if the temp file is slated to outlive the current transaction,
1730 * force it into the database's default tablespace, so that it will not
1731 * pose a threat to possible tablespace drop attempts.
1732 */
1733 if (numTempTableSpaces > 0 && !interXact)
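1734 {
1735 Oid tblspcOid = GetNextTempTableSpace();
1736
1737 if (OidIsValid(tblspcOid))
1738 file = OpenTemporaryFileInTablespace(tblspcOid, false);
1739 }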
1740
1741 /*
1742 * If not, or if tablespace is bad, create in database's default
1743 * tablespace. MyDatabaseTableSpace should normally be set before we get
1744 * here, but just in case it isn't, fall back to pg_default tablespace.
1745 */
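1746 if (file <= 0)
1747 file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
1748 MyDatabaseTableSpace :
1749 DEFAULTTABLESPACE_OID,
1750 true);
1751
1752 /* Mark it for deletion at close and temporary file size limit */
1753 VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
1754
1755 /* Register it with the current resource owner */
1756 if (!interXact)
1757 RegisterTemporaryFile(file);
1758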
1759 return file;
1760}

References Assert, CurrentResourceOwner, fb(), FD_DELETE_AT_CLOSE, FD_TEMP_FILE_LIMIT, vfd::fdstate, GetNextTempTableSpace(), MyDatabaseTableSpace, numTempTableSpaces, OidIsValid, OpenTemporaryFileInTablespace(), RegisterTemporaryFile(), ResourceOwnerEnlarge(), and VfdCache.

Referenced by BufFileCreateTemp(), and extendBufFile().

◆ OpenTemporaryFileInTablespace()

static File OpenTemporaryFileInTablespace ( Oid  tblspcOid,
bool  rejectError 
)
static

Definition at line 1791 of file fd.c.

1792{
1793 char tempdirpath[MAXPGPATH];
1794 char tempfilepath[MAXPGPATH];
1795 File file;
1796
1797 TempTablespacePath(tempdirpath, tblspcOid);
1798
1799 /*
1800 * Generate a tempfile name that should be unique within the current
1801 * database instance.
1802 */
1803 snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
1804 tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
1805
1806 /*
1807 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1808 * temp file that can be reused.
1809 */
1810 file = PathNameOpenFile(tempfilepath,
1811 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1812 if (file <= 0)
1813 {
1814 /*
1815 * We might need to create the tablespace's tempfile directory, if no
1816 * one has yet done so.
1817 *
1818 * Don't check for an error from MakePGDirectory; it could fail if
1819 * someone else just did the same thing. If it doesn't work then
1820 * we'll bomb out on the second create attempt, instead.
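1821 */
1822 (void) MakePGDirectory(tempdirpath);
1823
1824 file = PathNameOpenFile(tempfilepath,
1825 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1826 if (file <= 0 && rejectError)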
1827 elog(ERROR, "could not create temporary file \"%s\": %m",
1828 tempfilepath);
1829 }
1830
1831 return file;
1832}

References elog, ERROR, fb(), MakePGDirectory(), MAXPGPATH, MyProcPid, PathNameOpenFile(), PG_BINARY, PG_TEMP_FILE_PREFIX, snprintf, tempFileCounter, and TempTablespacePath().

Referenced by OpenTemporaryFile().

◆ OpenTransientFile()

◆ OpenTransientFilePerm()

int OpenTransientFilePerm ( const char *fileName,
int  fileFlags,
mode_t  fileMode 
)

Definition at line 2686 of file fd.c.

2687{
2688 int fd;
2689
2690 DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
2691 numAllocatedDescs, fileName));
2692
2693 /* Can we allocate another non-virtual FD? */
2694 if (!reserveAllocatedDesc())
2695 ereport(ERROR,
2696 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
2697 errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
2698 maxAllocatedDescs, fileName)));
2699
2700 /* Close excess kernel FDs. */
2701 ReleaseLruFiles();
2702
2703 fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
2704
2705 if (fd >= 0)
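2706 {
2707 AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
2708
2709 desc->kind = AllocateDescRawFD;
2710 desc->desc.fd = fd;
2711 desc->create_subid = GetCurrentSubTransactionId();
2712 numAllocatedDescs++;
2713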
2714 return fd;
2715 }
2716
2717 return -1; /* failure */
2718}

References allocatedDescs, AllocateDescRawFD, BasicOpenFilePerm(), AllocateDesc::create_subid, AllocateDesc::desc, DO_DB, elog, ereport, errcode(), errmsg(), ERROR, fb(), AllocateDesc::fd, fd(), GetCurrentSubTransactionId(), AllocateDesc::kind, LOG, maxAllocatedDescs, numAllocatedDescs, ReleaseLruFiles(), and reserveAllocatedDesc().

Referenced by be_lo_export(), and OpenTransientFile().

◆ PathNameCreateTemporaryDir()

void PathNameCreateTemporaryDir ( const char *basedir,
const char *directory 
)

Definition at line 1647 of file fd.c.

1648{
1649 if (MakePGDirectory(directory) < 0)
1650 {
1651 if (errno == EEXIST)
1652 return;
1653
1654 /*
1655 * Failed. Try to create basedir first in case it's missing. Tolerate
1656 * EEXIST to close a race against another process following the same
1657 * algorithm.
1658 */
1659 if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
1660 ereport(ERROR,
1661 (errcode_for_file_access(),
1662 errmsg("cannot create temporary directory \"%s\": %m",
1663 basedir)));
1664
1665 /* Try again. */
1666 if (MakePGDirectory(directory) < 0 && errno != EEXIST)
1667 ereport(ERROR,
1668 (errcode_for_file_access(),
1669 errmsg("cannot create temporary subdirectory \"%s\": %m",
1670 directory)));
1671 }
1672}

References basedir, directory, ereport, errcode_for_file_access(), errmsg(), ERROR, fb(), and MakePGDirectory().

Referenced by FileSetCreate().

◆ PathNameCreateTemporaryFile()

File PathNameCreateTemporaryFile ( const char *path,
bool  error_on_failure 
)

Definition at line 1848 of file fd.c.

1849{
1850 File file;
1851
1852 Assert(temporary_files_allowed); /* check temp file access is up */
1853
1854 ResourceOwnerEnlarge(CurrentResourceOwner);
1855
1856 /*
1857 * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
1858 * temp file that can be reused.
1859 */
1860 file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
1861 if (file <= 0)
1862 {
1863 if (error_on_failure)
1864 ereport(ERROR,
1865 (errcode_for_file_access(),
1866 errmsg("could not create temporary file \"%s\": %m",
1867 path)));
1868 else
1869 return file;
1870 }
1871
1872 /* Mark it for temp_file_limit accounting. */
1873 VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
1874
1875 /* Register it for automatic close. */
1876 RegisterTemporaryFile(file);
1877
1878 return file;
1879}

References Assert, CurrentResourceOwner, ereport, errcode_for_file_access(), errmsg(), ERROR, fb(), FD_TEMP_FILE_LIMIT, vfd::fdstate, PathNameOpenFile(), PG_BINARY, RegisterTemporaryFile(), ResourceOwnerEnlarge(), and VfdCache.

Referenced by FileSetCreate().

◆ PathNameDeleteTemporaryDir()

void PathNameDeleteTemporaryDir ( const char *dirname)

Definition at line 1678 of file fd.c.

1679{
1680 struct stat statbuf;
1681
1682 /* Silently ignore missing directory. */
1683 if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
1684 return;
1685
1686 /*
1687 * Currently, walkdir doesn't offer a way for our passed in function to
1688 * maintain state. Perhaps it should, so that we could tell the caller
1689 * whether this operation succeeded or failed. Since this operation is
1690 * used in a cleanup path, we wouldn't actually behave differently: we'll
1691 * just log failures.
1692 */
1693 walkdir(dirname, unlink_if_exists_fname, false, LOG);
1694}

References fb(), LOG, stat, unlink_if_exists_fname(), and walkdir().

Referenced by FileSetDeleteAll().

◆ PathNameDeleteTemporaryFile()

bool PathNameDeleteTemporaryFile ( const char *path,
bool  error_on_failure 
)

Definition at line 1919 of file fd.c.

1920{
1921 struct stat filestats;
1922 int stat_errno;
1923
1924 /* Get the final size for pgstat reporting. */
1925 if (stat(path, &filestats) != 0)
1926 stat_errno = errno;
1927 else
1928 stat_errno = 0;
1929
1930 /*
1931 * Unlike FileClose's automatic file deletion code, we tolerate
1932 * non-existence to support BufFileDeleteFileSet which doesn't know how
1933 * many segments it has to delete until it runs out.
1934 */
1935 if (stat_errno == ENOENT)
1936 return false;
1937
1938 if (unlink(path) < 0)
1939 {
1940 if (errno != ENOENT)
1941 ereport(error_on_failure ? ERROR : LOG,
1942 (errcode_for_file_access(),
1943 errmsg("could not unlink temporary file \"%s\": %m",
1944 path)));
1945 return false;
1946 }
1947
1948 if (stat_errno == 0)
1949 ReportTemporaryFileUsage(path, filestats.st_size);
1950 else
1951 {
1952 errno = stat_errno;
1953 ereport(LOG,
1954 (errcode_for_file_access(),
1955 errmsg("could not stat file \"%s\": %m", path)));
1956 }
1957
1958 return true;
1959}

References ereport, errcode_for_file_access(), errmsg(), ERROR, fb(), LOG, ReportTemporaryFileUsage(), and stat.

Referenced by FileSetDelete(), and unlink_if_exists_fname().

◆ PathNameOpenFile()

◆ PathNameOpenFilePerm()

File PathNameOpenFilePerm ( const char *fileName,
int  fileFlags,
mode_t  fileMode 
)

Definition at line 1575 of file fd.c.

1576{
1577 char *fnamecopy;
1578 File file;
1579 Vfd *vfdP;
1580
1581 DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
1582 fileName, fileFlags, fileMode));
1583
1584 /*
1585 * We need a malloc'd copy of the file name; fail cleanly if no room.
1586 */
1587 fnamecopy = strdup(fileName);
1588 if (fnamecopy == NULL)
1589 ereport(ERROR,
1590 (errcode(ERRCODE_OUT_OF_MEMORY),
1591 errmsg("out of memory")));
1592
1593 file = AllocateVfd();
1594 vfdP = &VfdCache[file];
1595
1596 /* Close excess kernel FDs. */
1597 ReleaseLruFiles();
1598
1599 /*
1600 * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
1601 * client shouldn't be expected to know which kernel descriptors are
1602 * currently open, so it wouldn't make sense for them to be inherited by
1603 * executed subprograms.
1604 */
1605 fileFlags |= O_CLOEXEC;
1606
1607 vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
1608
1609 if (vfdP->fd < 0)
1610 {
1611 int save_errno = errno;
1612
1613 FreeVfd(file);
1614 free(fnamecopy);
1615 errno = save_errno;
1616 return -1;
1617 }
1618 ++nfile;
1619 DO_DB(elog(LOG, "PathNameOpenFile: success %d",
1620 vfdP->fd));
1621
1622 vfdP->fileName = fnamecopy;
1623 /* Saved flags are adjusted to be OK for re-opening file */
1624 vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
1625 vfdP->fileMode = fileMode;
1626 vfdP->fileSize = 0;
1627 vfdP->fdstate = 0x0;
1628 vfdP->resowner = NULL;
1629
1630 Insert(file);
1631
1632 return file;
1633}

References AllocateVfd(), BasicOpenFilePerm(), DO_DB, elog, ereport, errcode(), errmsg(), ERROR, fb(), free, FreeVfd(), Insert(), LOG, nfile, O_CLOEXEC, ReleaseLruFiles(), and VfdCache.

Referenced by PathNameOpenFile().

◆ PathNameOpenTemporaryFile()

File PathNameOpenTemporaryFile ( const char *path,
int  mode 
)

Definition at line 1888 of file fd.c.

1889{
1890 File file;
1891
1892 Assert(temporary_files_allowed); /* check temp file access is up */
1893
1894 ResourceOwnerEnlarge(CurrentResourceOwner);
1895
1896 file = PathNameOpenFile(path, mode | PG_BINARY);
1897
1898 /* If no such file, then we don't raise an error. */
1899 if (file <= 0 && errno != ENOENT)
1900 ereport(ERROR,
1901 (errcode_for_file_access(),
1902 errmsg("could not open temporary file \"%s\": %m",
1903 path)));
1904
1905 if (file > 0)
1906 {
1907 /* Register it for automatic close. */
1908 RegisterTemporaryFile(file);
1909 }
1910
1911 return file;
1912}

References Assert, CurrentResourceOwner, ereport, errcode_for_file_access(), errmsg(), ERROR, fb(), mode, PathNameOpenFile(), PG_BINARY, RegisterTemporaryFile(), and ResourceOwnerEnlarge().

Referenced by FileSetOpen().

◆ pg_fdatasync()

int pg_fdatasync ( int  fd)

Definition at line 480 of file fd.c.

481{
482 int rc;
483
484 if (!enableFsync)
485 return 0;
486
487retry:
488 rc = fdatasync(fd);
489
490 if (rc == -1 && errno == EINTR)
491 goto retry;
492
493 return rc;
494}

References EINTR, enableFsync, fb(), fd(), and fdatasync().

Referenced by issue_xlog_fsync().

◆ pg_file_exists()

bool pg_file_exists ( const char * name)

Definition at line 503 of file fd.c.

504{
505 struct stat st;
506
507 Assert(name != NULL);
508
509 if (stat(name, &st) == 0)
510 return !S_ISDIR(st.st_mode);
511 else if (!(errno == ENOENT || errno == ENOTDIR || errno == EACCES))
514 errmsg("could not access file \"%s\": %m", name)));
515
516 return false;
517}

References Assert, ereport, errcode_for_file_access(), errmsg(), ERROR, fb(), name, S_ISDIR, stat::st_mode, and stat.

Referenced by expand_dynamic_library_name(), find_in_path(), find_in_paths(), and provider_init().

◆ pg_flush_data()

void pg_flush_data ( int  fd,
pgoff_t  offset,
pgoff_t  nbytes 
)

Definition at line 525 of file fd.c.

526{
527 /*
528 * Right now file flushing is primarily used to make later
529 * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes
530 * if fsyncs are disabled - that's a decision we might want to make
531 * configurable at some point.
532 */
533 if (!enableFsync)
534 return;
535
536 /*
537 * We compile all alternatives that are supported on the current platform,
538 * to find portability problems more easily.
539 */
540#if defined(HAVE_SYNC_FILE_RANGE)
541 {
542 int rc;
543 static bool not_implemented_by_kernel = false;
544
546 return;
547
548retry:
549
550 /*
551 * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
552 * tells the OS that writeback for the specified blocks should be
553 * started, but that we don't want to wait for completion. Note that
554 * this call might block if too much dirty data exists in the range.
555 * This is the preferable method on OSs supporting it, as it works
556 * reliably when available (contrast to msync()) and doesn't flush out
557 * clean data (like FADV_DONTNEED).
558 */
559 rc = sync_file_range(fd, offset, nbytes,
561 if (rc != 0)
562 {
563 int elevel;
564
565 if (rc == EINTR)
566 goto retry;
567
568 /*
569 * For systems that don't have an implementation of
570 * sync_file_range() such as Windows WSL, generate only one
571 * warning and then suppress all further attempts by this process.
572 */
573 if (errno == ENOSYS)
574 {
575 elevel = WARNING;
577 }
578 else
579 elevel = data_sync_elevel(WARNING);
580
581 ereport(elevel,
583 errmsg("could not flush dirty data: %m")));
584 }
585
586 return;
587 }
588#endif
589#if !defined(WIN32) && defined(MS_ASYNC)
590 {
591 void *p;
592 static int pagesize = 0;
593
594 /*
595 * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
596 * writeback. On linux it only does so if MS_SYNC is specified, but
597 * then it does the writeback synchronously. Luckily all common linux
598 * systems have sync_file_range(). This is preferable over
599 * FADV_DONTNEED because it doesn't flush out clean data.
600 *
601 * We map the file (mmap()), tell the kernel to sync back the contents
602 * (msync()), and then remove the mapping again (munmap()).
603 */
604
605 /* mmap() needs actual length if we want to map whole file */
606 if (offset == 0 && nbytes == 0)
607 {
608 nbytes = lseek(fd, 0, SEEK_END);
609 if (nbytes < 0)
610 {
613 errmsg("could not determine dirty data size: %m")));
614 return;
615 }
616 }
617
618 /*
619 * Some platforms reject partial-page mmap() attempts. To deal with
620 * that, just truncate the request to a page boundary. If any extra
621 * bytes don't get flushed, well, it's only a hint anyway.
622 */
623
624 /* fetch pagesize only once */
625 if (pagesize == 0)
627
628 /* align length to pagesize, dropping any fractional page */
629 if (pagesize > 0)
630 nbytes = (nbytes / pagesize) * pagesize;
631
632 /* fractional-page request is a no-op */
633 if (nbytes <= 0)
634 return;
635
636 /*
637 * mmap could well fail, particularly on 32-bit platforms where there
638 * may simply not be enough address space. If so, silently fall
639 * through to the next implementation.
640 */
641 if (nbytes <= (pgoff_t) SSIZE_MAX)
642 p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
643 else
644 p = MAP_FAILED;
645
646 if (p != MAP_FAILED)
647 {
648 int rc;
649
650 rc = msync(p, (size_t) nbytes, MS_ASYNC);
651 if (rc != 0)
652 {
655 errmsg("could not flush dirty data: %m")));
656 /* NB: need to fall through to munmap()! */
657 }
658
659 rc = munmap(p, (size_t) nbytes);
660 if (rc != 0)
661 {
662 /* FATAL error because mapping would remain */
665 errmsg("could not munmap() while flushing data: %m")));
666 }
667
668 return;
669 }
670 }
671#endif
672#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
673 {
674 int rc;
675
676 /*
677 * Signal the kernel that the passed in range should not be cached
678 * anymore. This has the, desired, side effect of writing out dirty
679 * data, and the, undesired, side effect of likely discarding useful
680 * clean cached blocks. For the latter reason this is the least
681 * preferable method.
682 */
683
684 rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
685
686 if (rc != 0)
687 {
688 /* don't error out, this is just a performance optimization */
691 errmsg("could not flush dirty data: %m")));
692 }
693
694 return;
695 }
696#endif
697}

References data_sync_elevel(), EINTR, enableFsync, ereport, errcode_for_file_access(), errmsg(), FATAL, fb(), fd(), MAP_FAILED, and WARNING.

Referenced by copy_file(), and FileWriteback().

◆ pg_fsync()

int pg_fsync ( int  fd)

Definition at line 389 of file fd.c.

390{
391#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
392 struct stat st;
393
394 /*
395 * Some operating system implementations of fsync() have requirements
396 * about the file access modes that were used when their file descriptor
397 * argument was opened, and these requirements differ depending on whether
398 * the file descriptor is for a directory.
399 *
400 * For any file descriptor that may eventually be handed to fsync(), we
401 * should have opened it with access modes that are compatible with
402 * fsync() on all supported systems, otherwise the code may not be
403 * portable, even if it runs ok on the current system.
404 *
405 * We assert here that a descriptor for a file was opened with write
406 * permissions (i.e., not O_RDONLY) and for a directory without write
407 * permissions (O_RDONLY). Notice that the assertion check is made even
408 * if fsync() is disabled.
409 *
410 * If fstat() fails, ignore it and let the follow-up fsync() complain.
411 */
412 if (fstat(fd, &st) == 0)
413 {
414 int desc_flags = fcntl(fd, F_GETFL);
415
417
418 if (S_ISDIR(st.st_mode))
420 else
422 }
423 errno = 0;
424#endif
425
426 /* #if is to skip the wal_sync_method test if there's no need for it */
427#if defined(HAVE_FSYNC_WRITETHROUGH)
430 else
431#endif
433}

References Assert, fb(), fd(), fstat, pg_fsync_no_writethrough(), pg_fsync_writethrough(), S_ISDIR, stat::st_mode, wal_sync_method, and WAL_SYNC_METHOD_FSYNC_WRITETHROUGH.

Referenced by AddToDataDirLockFile(), assign_wal_sync_method(), BootStrapXLOG(), CheckPointLogicalRewriteHeap(), CreateDirAndVersionFile(), CreateLockFile(), durable_rename(), FileSync(), fsync_fname_ext(), heap_xlog_logical_rewrite(), readRecoverySignalFile(), RecreateTwoPhaseFile(), RestoreSlotFromDisk(), SaveSlotToPath(), SlruPhysicalWritePage(), SlruSyncFileTag(), SnapBuildSerialize(), update_controlfile(), write_auto_conf_file(), WriteControlFile(), writeTimeLineHistory(), writeTimeLineHistoryFile(), XLogFileCopy(), and XLogFileInitInternal().

◆ pg_fsync_no_writethrough()

int pg_fsync_no_writethrough ( int  fd)

Definition at line 441 of file fd.c.

442{
443 int rc;
444
445 if (!enableFsync)
446 return 0;
447
448retry:
449 rc = fsync(fd);
450
451 if (rc == -1 && errno == EINTR)
452 goto retry;
453
454 return rc;
455}

References EINTR, enableFsync, fb(), fd(), and fsync.

Referenced by issue_xlog_fsync(), and pg_fsync().

◆ pg_fsync_writethrough()

int pg_fsync_writethrough ( int  fd)

Definition at line 461 of file fd.c.

462{
463 if (enableFsync)
464 {
465#if defined(F_FULLFSYNC)
466 return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
467#else
468 errno = ENOSYS;
469 return -1;
470#endif
471 }
472 else
473 return 0;
474}

References enableFsync, fb(), and fd().

Referenced by issue_xlog_fsync(), pg_fsync(), and test_sync().

◆ pg_ftruncate()

static int pg_ftruncate ( int  fd,
pgoff_t  length 
)
static

Definition at line 703 of file fd.c.

704{
705 int ret;
706
707retry:
708 ret = ftruncate(fd, length);
709
710 if (ret == -1 && errno == EINTR)
711 goto retry;
712
713 return ret;
714}

References EINTR, fb(), and fd().

Referenced by FileTruncate(), and pg_truncate().

◆ pg_truncate()

int pg_truncate ( const char * path,
pgoff_t  length 
)

Definition at line 720 of file fd.c.

721{
722 int ret;
723#ifdef WIN32
724 int save_errno;
725 int fd;
726
728 if (fd >= 0)
729 {
730 ret = pg_ftruncate(fd, length);
734 }
735 else
736 ret = -1;
737#else
738
739retry:
740 ret = truncate(path, length);
741
742 if (ret == -1 && errno == EINTR)
743 goto retry;
744#endif
745
746 return ret;
747}

References CloseTransientFile(), EINTR, fb(), fd(), OpenTransientFile(), PG_BINARY, and pg_ftruncate().

Referenced by do_truncate().

◆ ReadDir()

◆ ReadDirExtended()

struct dirent * ReadDirExtended ( DIR * dir,
const char * dirname,
int  elevel 
)

Definition at line 2971 of file fd.c.

2972{
2973 struct dirent *dent;
2974
2975 /* Give a generic message for AllocateDir failure, if caller didn't */
2976 if (dir == NULL)
2977 {
2978 ereport(elevel,
2980 errmsg("could not open directory \"%s\": %m",
2981 dirname)));
2982 return NULL;
2983 }
2984
2985 errno = 0;
2986 if ((dent = readdir(dir)) != NULL)
2987 return dent;
2988
2989 if (errno)
2990 ereport(elevel,
2992 errmsg("could not read directory \"%s\": %m",
2993 dirname)));
2994 return NULL;
2995}

References ereport, errcode_for_file_access(), errmsg(), fb(), and readdir().

Referenced by DeleteAllExportedSnapshotFiles(), ReadDir(), RelationCacheInitFileRemove(), RelationCacheInitFileRemoveInDir(), RemovePgTempFiles(), RemovePgTempFilesInDir(), RemovePgTempRelationFiles(), RemovePgTempRelationFilesInDbspace(), ReorderBufferCleanupSerializedTXNs(), scan_directory_ci(), SyncDataDirectory(), and walkdir().

◆ RegisterTemporaryFile()

static void RegisterTemporaryFile ( File  file)
static

◆ ReleaseExternalFD()

◆ ReleaseLruFile()

static bool ReleaseLruFile ( void  )
static

Definition at line 1369 of file fd.c.

1370{
1371 DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
1372
1373 if (nfile > 0)
1374 {
1375 /*
1376 * There are opened files and so there should be at least one used vfd
1377 * in the ring.
1378 */
1379 Assert(VfdCache[0].lruMoreRecently != 0);
1380 LruDelete(VfdCache[0].lruMoreRecently);
1381 return true; /* freed a file */
1382 }
1383 return false; /* no files available to free */
1384}

References Assert, DO_DB, elog, LOG, LruDelete(), nfile, and VfdCache.

Referenced by AllocateDir(), AllocateFile(), BasicOpenFilePerm(), OpenPipeStream(), and ReleaseLruFiles().

◆ ReleaseLruFiles()

static void ReleaseLruFiles ( void  )
static

◆ RemovePgTempFiles()

void RemovePgTempFiles ( void  )

Definition at line 3322 of file fd.c.

3323{
3325 DIR *spc_dir;
3326 struct dirent *spc_de;
3327
3328 /*
3329 * First process temp files in pg_default ($PGDATA/base)
3330 */
3331 snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
3332 RemovePgTempFilesInDir(temp_path, true, false);
3334
3335 /*
3336 * Cycle through temp directories for all non-default tablespaces.
3337 */
3339
3341 {
3342 if (strcmp(spc_de->d_name, ".") == 0 ||
3343 strcmp(spc_de->d_name, "..") == 0)
3344 continue;
3345
3346 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s/%s",
3349 RemovePgTempFilesInDir(temp_path, true, false);
3350
3351 snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
3354 }
3355
3357
3358 /*
3359 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
3360 * DataDir as well. However, that is *not* cleaned here because doing so
3361 * would create a race condition. It's done separately, earlier in
3362 * postmaster startup.
3363 */
3364}

References AllocateDir(), fb(), FreeDir(), LOG, MAXPGPATH, PG_TBLSPC_DIR, PG_TEMP_FILES_DIR, ReadDirExtended(), RemovePgTempFilesInDir(), RemovePgTempRelationFiles(), snprintf, and TABLESPACE_VERSION_DIRECTORY.

Referenced by PostmasterMain(), and PostmasterStateMachine().

◆ RemovePgTempFilesInDir()

void RemovePgTempFilesInDir ( const char * tmpdirname,
bool  missing_ok,
bool  unlink_all 
)

Definition at line 3382 of file fd.c.

3383{
3384 DIR *temp_dir;
3385 struct dirent *temp_de;
3386 char rm_path[MAXPGPATH * 2];
3387
3389
3390 if (temp_dir == NULL && errno == ENOENT && missing_ok)
3391 return;
3392
3394 {
3395 if (strcmp(temp_de->d_name, ".") == 0 ||
3396 strcmp(temp_de->d_name, "..") == 0)
3397 continue;
3398
3399 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3400 tmpdirname, temp_de->d_name);
3401
3402 if (unlink_all ||
3403 strncmp(temp_de->d_name,
3406 {
3408
3409 if (type == PGFILETYPE_ERROR)
3410 continue;
3411 else if (type == PGFILETYPE_DIR)
3412 {
3413 /* recursively remove contents, then directory itself */
3414 RemovePgTempFilesInDir(rm_path, false, true);
3415
3416 if (rmdir(rm_path) < 0)
3417 ereport(LOG,
3419 errmsg("could not remove directory \"%s\": %m",
3420 rm_path)));
3421 }
3422 else
3423 {
3424 if (unlink(rm_path) < 0)
3425 ereport(LOG,
3427 errmsg("could not remove file \"%s\": %m",
3428 rm_path)));
3429 }
3430 }
3431 else
3432 ereport(LOG,
3433 (errmsg("unexpected file found in temporary-files directory: \"%s\"",
3434 rm_path)));
3435 }
3436
3438}

References AllocateDir(), ereport, errcode_for_file_access(), errmsg(), fb(), FreeDir(), get_dirent_type(), LOG, MAXPGPATH, PG_TEMP_FILE_PREFIX, PGFILETYPE_DIR, PGFILETYPE_ERROR, ReadDirExtended(), RemovePgTempFilesInDir(), snprintf, and type.

Referenced by PostmasterMain(), RemovePgTempFiles(), and RemovePgTempFilesInDir().

◆ RemovePgTempRelationFiles()

static void RemovePgTempRelationFiles ( const char * tsdirname)
static

Definition at line 3442 of file fd.c.

3443{
3444 DIR *ts_dir;
3445 struct dirent *de;
3446 char dbspace_path[MAXPGPATH * 2];
3447
3449
3450 while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
3451 {
3452 /*
3453 * We're only interested in the per-database directories, which have
3454 * numeric names. Note that this code will also (properly) ignore "."
3455 * and "..".
3456 */
3457 if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
3458 continue;
3459
3460 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
3461 tsdirname, de->d_name);
3463 }
3464
3465 FreeDir(ts_dir);
3466}

References AllocateDir(), fb(), FreeDir(), LOG, MAXPGPATH, ReadDirExtended(), RemovePgTempRelationFilesInDbspace(), and snprintf.

Referenced by RemovePgTempFiles().

◆ RemovePgTempRelationFilesInDbspace()

static void RemovePgTempRelationFilesInDbspace ( const char * dbspacedirname)
static

Definition at line 3470 of file fd.c.

3471{
3473 struct dirent *de;
3474 char rm_path[MAXPGPATH * 2];
3475
3477
3479 {
3480 if (!looks_like_temp_rel_name(de->d_name))
3481 continue;
3482
3483 snprintf(rm_path, sizeof(rm_path), "%s/%s",
3484 dbspacedirname, de->d_name);
3485
3486 if (unlink(rm_path) < 0)
3487 ereport(LOG,
3489 errmsg("could not remove file \"%s\": %m",
3490 rm_path)));
3491 }
3492
3494}

References AllocateDir(), ereport, errcode_for_file_access(), errmsg(), fb(), FreeDir(), LOG, looks_like_temp_rel_name(), MAXPGPATH, ReadDirExtended(), and snprintf.

Referenced by RemovePgTempRelationFiles().

◆ ReportTemporaryFileUsage()

static void ReportTemporaryFileUsage ( const char * path,
pgoff_t  size 
)
static

Definition at line 1515 of file fd.c.

1516{
1518
1519 if (log_temp_files >= 0)
1520 {
1521 if ((size / 1024) >= log_temp_files)
1522 ereport(LOG,
1523 (errmsg("temporary file: path \"%s\", size %lu",
1524 path, (unsigned long) size)));
1525 }
1526}

References ereport, errmsg(), LOG, log_temp_files, and pgstat_report_tempfile().

Referenced by FileClose(), and PathNameDeleteTemporaryFile().

◆ reserveAllocatedDesc()

static bool reserveAllocatedDesc ( void  )
static

Definition at line 2552 of file fd.c.

2553{
2555 int newMax;
2556
2557 /* Quick out if array already has a free slot. */
2559 return true;
2560
2561 /*
2562 * If the array hasn't yet been created in the current process, initialize
2563 * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
2564 * we will ever need, anyway. We don't want to look at max_safe_fds
2565 * immediately because set_max_safe_fds() may not have run yet.
2566 */
2567 if (allocatedDescs == NULL)
2568 {
2569 newMax = FD_MINFREE / 3;
2571 /* Out of memory already? Treat as fatal error. */
2572 if (newDescs == NULL)
2573 ereport(ERROR,
2575 errmsg("out of memory")));
2578 return true;
2579 }
2580
2581 /*
2582 * Consider enlarging the array beyond the initial allocation used above.
2583 * By the time this happens, max_safe_fds should be known accurately.
2584 *
2585 * We mustn't let allocated descriptors hog all the available FDs, and in
2586 * practice we'd better leave a reasonable number of FDs for VFD use. So
2587 * set the maximum to max_safe_fds / 3. (This should certainly be at
2588 * least as large as the initial size, FD_MINFREE / 3, so we aren't
2589 * tightening the restriction here.) Recall that "external" FDs are
2590 * allowed to consume another third of max_safe_fds.
2591 */
2592 newMax = max_safe_fds / 3;
2594 {
2596 newMax * sizeof(AllocateDesc));
2597 /* Treat out-of-memory as a non-fatal error. */
2598 if (newDescs == NULL)
2599 return false;
2602 return true;
2603 }
2604
2605 /* Can't enlarge allocatedDescs[] any more. */
2606 return false;
2607}

References allocatedDescs, ereport, errcode(), errmsg(), ERROR, fb(), FD_MINFREE, malloc, max_safe_fds, maxAllocatedDescs, numAllocatedDescs, and realloc.

Referenced by AllocateDir(), AllocateFile(), OpenPipeStream(), and OpenTransientFilePerm().

◆ ReserveExternalFD()

void ReserveExternalFD ( void  )

Definition at line 1206 of file fd.c.

1207{
1208 /*
1209 * Release VFDs if needed to stay safe. Because we do this before
1210 * incrementing numExternalFDs, the final state will be as desired, i.e.,
1211 * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
1212 */
1214
1216}

References numExternalFDs, and ReleaseLruFiles().

Referenced by AcquireExternalFD(), BackendInitialize(), dsm_impl_posix(), InitializeWaitEventSupport(), InitPostmasterDeathWatchHandle(), and XLogWrite().

◆ ResourceOwnerForgetFile()

static void ResourceOwnerForgetFile ( ResourceOwner  owner,
File  file 
)
inlinestatic

Definition at line 380 of file fd.c.

381{
383}

References file_resowner_desc, Int32GetDatum(), and ResourceOwnerForget().

Referenced by FileClose().

◆ ResourceOwnerRememberFile()

static void ResourceOwnerRememberFile ( ResourceOwner  owner,
File  file 
)
inlinestatic

Definition at line 375 of file fd.c.

376{
378}

References file_resowner_desc, Int32GetDatum(), and ResourceOwnerRemember().

Referenced by RegisterTemporaryFile().

◆ ResOwnerPrintFile()

static char * ResOwnerPrintFile ( Datum  res)
static

Definition at line 4102 of file fd.c.

4103{
4104 return psprintf("File %d", DatumGetInt32(res));
4105}

References DatumGetInt32(), and psprintf().

◆ ResOwnerReleaseFile()

static void ResOwnerReleaseFile ( Datum  res)
static

Definition at line 4088 of file fd.c.

4089{
4090 File file = (File) DatumGetInt32(res);
4091 Vfd *vfdP;
4092
4093 Assert(FileIsValid(file));
4094
4095 vfdP = &VfdCache[file];
4096 vfdP->resowner = NULL;
4097
4098 FileClose(file);
4099}

References Assert, DatumGetInt32(), fb(), FileClose(), FileIsValid, vfd::resowner, and VfdCache.

◆ set_max_safe_fds()

void set_max_safe_fds ( void  )

Definition at line 1044 of file fd.c.

1045{
1046 int usable_fds;
1047 int already_open;
1048
1049 /*----------
1050 * We want to set max_safe_fds to
1051 * MIN(usable_fds, max_files_per_process)
1052 * less the slop factor for files that are opened without consulting
1053 * fd.c. This ensures that we won't allow opening more than
1054 * max_files_per_process, or the experimentally-determined EMFILE limit,
1055 * additional files.
1056 *----------
1057 */
1060
1062
1063 /*
1064 * Take off the FDs reserved for system() etc.
1065 */
1067
1068 /*
1069 * Make sure we still have enough to get by.
1070 */
1072 ereport(FATAL,
1074 errmsg("insufficient file descriptors available to start server process"),
1075 errdetail("System allows %d, server needs at least %d, %d files are already open.",
1078 already_open)));
1079
1080 elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
1082}

References count_usable_fds(), DEBUG2, elog, ereport, errcode(), errdetail(), errmsg(), FATAL, fb(), FD_MINFREE, max_files_per_process, max_safe_fds, Min, and NUM_RESERVED_FDS.

Referenced by BootstrapModeMain(), PostgresSingleUserMain(), and PostmasterMain().

◆ SetTempTablespaces()

void SetTempTablespaces ( Oid * tableSpaces,
int  numSpaces 
)

Definition at line 3096 of file fd.c.

3097{
3098 Assert(numSpaces >= 0);
3101
3102 /*
3103 * Select a random starting point in the list. This is to minimize
3104 * conflicts between backends that are most likely sharing the same list
3105 * of temp tablespaces. Note that if we create multiple temp files in the
3106 * same transaction, we'll advance circularly through the list --- this
3107 * ensures that large temporary sort files are nicely spread across all
3108 * available tablespaces.
3109 */
3110 if (numSpaces > 1)
3112 0, numSpaces - 1);
3113 else
3115}

References Assert, fb(), nextTempTableSpace, numTempTableSpaces, pg_global_prng_state, pg_prng_uint64_range(), and tempTableSpaces.

Referenced by assign_temp_tablespaces(), and PrepareTempTablespaces().

◆ SyncDataDirectory()

void SyncDataDirectory ( void  )

Definition at line 3593 of file fd.c.

3594{
3595 bool xlog_is_symlink;
3596
3597 /* We can skip this whole thing if fsync is disabled. */
3598 if (!enableFsync)
3599 return;
3600
3601 /*
3602 * If pg_wal is a symlink, we'll need to recurse into it separately,
3603 * because the first walkdir below will ignore it.
3604 */
3605 xlog_is_symlink = false;
3606
3607 {
3608 struct stat st;
3609
3610 if (lstat("pg_wal", &st) < 0)
3611 ereport(LOG,
3613 errmsg("could not stat file \"%s\": %m",
3614 "pg_wal")));
3615 else if (S_ISLNK(st.st_mode))
3616 xlog_is_symlink = true;
3617 }
3618
3619#ifdef HAVE_SYNCFS
3621 {
3622 DIR *dir;
3623 struct dirent *de;
3624
3625 /*
3626 * On Linux, we don't have to open every single file one by one. We
3627 * can use syncfs() to sync whole filesystems. We only expect
3628 * filesystem boundaries to exist where we tolerate symlinks, namely
3629 * pg_wal and the tablespaces, so we call syncfs() for each of those
3630 * directories.
3631 */
3632
3633 /* Prepare to report progress syncing the data directory via syncfs. */
3635
3636 /* Sync the top level pgdata directory. */
3637 do_syncfs(".");
3638 /* If any tablespaces are configured, sync each of those. */
3640 while ((de = ReadDirExtended(dir, PG_TBLSPC_DIR, LOG)))
3641 {
3642 char path[MAXPGPATH];
3643
3644 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
3645 continue;
3646
3647 snprintf(path, MAXPGPATH, "%s/%s", PG_TBLSPC_DIR, de->d_name);
3648 do_syncfs(path);
3649 }
3650 FreeDir(dir);
3651 /* If pg_wal is a symlink, process that too. */
3652 if (xlog_is_symlink)
3653 do_syncfs("pg_wal");
3654 return;
3655 }
3656#endif /* HAVE_SYNCFS */
3657
3658#ifdef PG_FLUSH_DATA_WORKS
3659 /* Prepare to report progress of the pre-fsync phase. */
3661
3662 /*
3663 * If possible, hint to the kernel that we're soon going to fsync the data
3664 * directory and its contents. Errors in this step are even less
3665 * interesting than normal, so log them only at DEBUG1.
3666 */
3667 walkdir(".", pre_sync_fname, false, DEBUG1);
3668 if (xlog_is_symlink)
3669 walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
3671#endif
3672
3673 /* Prepare to report progress syncing the data directory via fsync. */
3675
3676 /*
3677 * Now we do the fsync()s in the same order.
3678 *
3679 * The main call ignores symlinks, so in addition to specially processing
3680 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
3681 * process_symlinks = true. Note that if there are any plain directories
3682 * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
3683 * so we don't worry about optimizing it.
3684 */
3685 walkdir(".", datadir_fsync_fname, false, LOG);
3686 if (xlog_is_symlink)
3687 walkdir("pg_wal", datadir_fsync_fname, false, LOG);
3689}

References AllocateDir(), begin_startup_progress_phase(), DATA_DIR_SYNC_METHOD_SYNCFS, datadir_fsync_fname(), DEBUG1, enableFsync, ereport, errcode_for_file_access(), errmsg(), fb(), FreeDir(), LOG, lstat, MAXPGPATH, PG_TBLSPC_DIR, ReadDirExtended(), recovery_init_sync_method, S_ISLNK, snprintf, stat::st_mode, and walkdir().

Referenced by StartupXLOG().

◆ TempTablespacePath()

void TempTablespacePath ( char * path,
Oid  tablespace 
)

Definition at line 1766 of file fd.c.

1767{
1768 /*
1769 * Identify the tempfile directory for this tablespace.
1770 *
1771 * If someone tries to specify pg_global, use pg_default instead.
1772 */
1773 if (tablespace == InvalidOid ||
1776 snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
1777 else
1778 {
1779 /* All other tablespaces are accessed via symlinks */
1780 snprintf(path, MAXPGPATH, "%s/%u/%s/%s",
1783 }
1784}

References fb(), InvalidOid, MAXPGPATH, PG_TBLSPC_DIR, PG_TEMP_FILES_DIR, snprintf, tablespace, and TABLESPACE_VERSION_DIRECTORY.

Referenced by FileSetCreate(), FileSetPath(), OpenTemporaryFileInTablespace(), and pg_ls_tmpdir().

◆ TempTablespacesAreSet()

bool TempTablespacesAreSet ( void  )

Definition at line 3125 of file fd.c.

3126{
3127 return (numTempTableSpaces >= 0);
3128}

References numTempTableSpaces.

Referenced by GetTempTablespaces(), and PrepareTempTablespaces().

◆ unlink_if_exists_fname()

static void unlink_if_exists_fname ( const char * fname,
bool  isdir,
int  elevel 
)
static

Definition at line 3821 of file fd.c.

3822{
3823 if (isdir)
3824 {
3825 if (rmdir(fname) != 0 && errno != ENOENT)
3826 ereport(elevel,
3828 errmsg("could not remove directory \"%s\": %m", fname)));
3829 }
3830 else
3831 {
3832 /* Use PathNameDeleteTemporaryFile to report filesize */
3833 PathNameDeleteTemporaryFile(fname, false);
3834 }
3835}

References ereport, errcode_for_file_access(), errmsg(), fb(), and PathNameDeleteTemporaryFile().

Referenced by PathNameDeleteTemporaryDir().

◆ walkdir()

static void walkdir ( const char * path,
void(*)(const char *fname, bool isdir, int elevel)  action,
bool  process_symlinks,
int  elevel 
)
static

Definition at line 3707 of file fd.c.

3711{
3712 DIR *dir;
3713 struct dirent *de;
3714
3715 dir = AllocateDir(path);
3716
3717 while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
3718 {
3719 char subpath[MAXPGPATH * 2];
3720
3722
3723 if (strcmp(de->d_name, ".") == 0 ||
3724 strcmp(de->d_name, "..") == 0)
3725 continue;
3726
3727 snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
3728
3729 switch (get_dirent_type(subpath, de, process_symlinks, elevel))
3730 {
3731 case PGFILETYPE_REG:
3732 (*action) (subpath, false, elevel);
3733 break;
3734 case PGFILETYPE_DIR:
3735 walkdir(subpath, action, false, elevel);
3736 break;
3737 default:
3738
3739 /*
3740 * Errors are already reported directly by get_dirent_type(),
3741 * and any remaining symlinks and unknown file types are
3742 * ignored.
3743 */
3744 break;
3745 }
3746 }
3747
3748 FreeDir(dir); /* we ignore any error here */
3749
3750 /*
3751 * It's important to fsync the destination directory itself as individual
3752 * file fsyncs don't guarantee that the directory entry for the file is
3753 * synced. However, skip this if AllocateDir failed; the action function
3754 * might not be robust against that.
3755 */
3756 if (dir)
3757 (*action) (path, true, elevel);
3758}

References AllocateDir(), CHECK_FOR_INTERRUPTS, fb(), FreeDir(), get_dirent_type(), MAXPGPATH, PGFILETYPE_DIR, PGFILETYPE_REG, ReadDirExtended(), snprintf, subpath(), and walkdir().

Referenced by PathNameDeleteTemporaryDir(), SyncDataDirectory(), and walkdir().

Variable Documentation

◆ allocatedDescs

◆ data_sync_retry

bool data_sync_retry = false

Definition at line 162 of file fd.c.

Referenced by data_sync_elevel().

◆ file_extend_method

int file_extend_method = DEFAULT_FILE_EXTEND_METHOD

Definition at line 168 of file fd.c.

Referenced by mdzeroextend().

◆ file_resowner_desc

const ResourceOwnerDesc file_resowner_desc
static
Initial value:
=
{
.name = "File",
.release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
.release_priority = RELEASE_PRIO_FILES,
.ReleaseResource = ResOwnerReleaseFile,
.DebugPrint = ResOwnerPrintFile
}

Definition at line 364 of file fd.c.

365{
366 .name = "File",
367 .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
368 .release_priority = RELEASE_PRIO_FILES,
369 .ReleaseResource = ResOwnerReleaseFile,
370 .DebugPrint = ResOwnerPrintFile
371};

Referenced by ResourceOwnerForgetFile(), and ResourceOwnerRememberFile().

◆ have_xact_temporary_files

bool have_xact_temporary_files = false
static

Definition at line 231 of file fd.c.

Referenced by CleanupTempFiles(), and RegisterTemporaryFile().

◆ io_direct_flags

◆ max_files_per_process

int max_files_per_process = 1000

Definition at line 146 of file fd.c.

Referenced by set_max_safe_fds().

◆ max_safe_fds

int max_safe_fds = FD_MINFREE

Definition at line 159 of file fd.c.

Referenced by AcquireExternalFD(), ReleaseLruFiles(), reserveAllocatedDesc(), and set_max_safe_fds().

◆ maxAllocatedDescs

int maxAllocatedDescs = 0
static

◆ nextTempTableSpace

int nextTempTableSpace = 0
static

Definition at line 293 of file fd.c.

Referenced by GetNextTempTableSpace(), and SetTempTablespaces().

◆ nfile

int nfile = 0
static

◆ numAllocatedDescs

◆ numExternalFDs

int numExternalFDs = 0
static

Definition at line 277 of file fd.c.

Referenced by AcquireExternalFD(), ReleaseExternalFD(), ReleaseLruFiles(), and ReserveExternalFD().

◆ numTempTableSpaces

int numTempTableSpaces = -1
static

◆ recovery_init_sync_method

int recovery_init_sync_method = DATA_DIR_SYNC_METHOD_FSYNC

Definition at line 165 of file fd.c.

Referenced by SyncDataDirectory().

◆ SizeVfdCache

Size SizeVfdCache = 0
static

◆ tempFileCounter

long tempFileCounter = 0
static

Definition at line 283 of file fd.c.

Referenced by OpenTemporaryFileInTablespace().

◆ temporary_files_size

uint64 temporary_files_size = 0
static

Definition at line 239 of file fd.c.

Referenced by FileClose(), FileTruncate(), and FileWriteV().

◆ tempTableSpaces

Oid* tempTableSpaces = NULL
static

◆ VfdCache