PostgreSQL Source Code  git master
file_utils.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * File-processing utility routines.
4  *
5  * Assorted utility functions to work on files.
6  *
7  *
8  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
9  * Portions Copyright (c) 1994, Regents of the University of California
10  *
11  * src/common/file_utils.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #ifndef FRONTEND
17 #include "postgres.h"
18 #else
19 #include "postgres_fe.h"
20 #endif
21 
22 #include <dirent.h>
23 #include <fcntl.h>
24 #include <sys/stat.h>
25 #include <unistd.h>
26 
27 #include "common/file_utils.h"
28 #ifdef FRONTEND
29 #include "common/logging.h"
30 #endif
31 #include "port/pg_iovec.h"
32 
33 #ifdef FRONTEND
34 
/*
 * PG_FLUSH_DATA_WORKS is defined whenever we have a way to implement the
 * pre-fsync writeback hint: either sync_file_range(), or posix_fadvise()
 * with POSIX_FADV_DONTNEED.
 */
#if defined(HAVE_SYNC_FILE_RANGE) || \
	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
#define PG_FLUSH_DATA_WORKS 1
#endif

/*
 * Version 10 renamed pg_xlog to pg_wal; server versions below this number
 * still use the old directory name.
 */
#define MINIMUM_VERSION_FOR_PG_WAL 100000

/* Prototypes for file-local helpers defined further down. */
#ifdef PG_FLUSH_DATA_WORKS
static int	pre_sync_fname(const char *fname, bool isdir);
#endif
static void walkdir(const char *path,
					int (*action) (const char *fname, bool isdir),
					bool process_symlinks);
53 
54 /*
55  * Issue fsync recursively on PGDATA and all its contents.
56  *
57  * We fsync regular files and directories wherever they are, but we follow
58  * symlinks only for pg_wal (or pg_xlog) and immediately under pg_tblspc.
59  * Other symlinks are presumed to point at files we're not responsible for
60  * fsyncing, and might not have privileges to write at all.
61  *
62  * serverVersion indicates the version of the server to be fsync'd.
63  */
64 void
65 fsync_pgdata(const char *pg_data,
66  int serverVersion)
67 {
68  bool xlog_is_symlink;
69  char pg_wal[MAXPGPATH];
70  char pg_tblspc[MAXPGPATH];
71 
72  /* handle renaming of pg_xlog to pg_wal in post-10 clusters */
73  snprintf(pg_wal, MAXPGPATH, "%s/%s", pg_data,
74  serverVersion < MINIMUM_VERSION_FOR_PG_WAL ? "pg_xlog" : "pg_wal");
75  snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc", pg_data);
76 
77  /*
78  * If pg_wal is a symlink, we'll need to recurse into it separately,
79  * because the first walkdir below will ignore it.
80  */
81  xlog_is_symlink = false;
82 
83  {
84  struct stat st;
85 
86  if (lstat(pg_wal, &st) < 0)
87  pg_log_error("could not stat file \"%s\": %m", pg_wal);
88  else if (S_ISLNK(st.st_mode))
89  xlog_is_symlink = true;
90  }
91 
92  /*
93  * If possible, hint to the kernel that we're soon going to fsync the data
94  * directory and its contents.
95  */
96 #ifdef PG_FLUSH_DATA_WORKS
97  walkdir(pg_data, pre_sync_fname, false);
98  if (xlog_is_symlink)
99  walkdir(pg_wal, pre_sync_fname, false);
100  walkdir(pg_tblspc, pre_sync_fname, true);
101 #endif
102 
103  /*
104  * Now we do the fsync()s in the same order.
105  *
106  * The main call ignores symlinks, so in addition to specially processing
107  * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
108  * process_symlinks = true. Note that if there are any plain directories
109  * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
110  * so we don't worry about optimizing it.
111  */
112  walkdir(pg_data, fsync_fname, false);
113  if (xlog_is_symlink)
114  walkdir(pg_wal, fsync_fname, false);
115  walkdir(pg_tblspc, fsync_fname, true);
116 }
117 
118 /*
119  * Issue fsync recursively on the given directory and all its contents.
120  *
121  * This is a convenient wrapper on top of walkdir().
122  */
123 void
124 fsync_dir_recurse(const char *dir)
125 {
126  /*
127  * If possible, hint to the kernel that we're soon going to fsync the data
128  * directory and its contents.
129  */
130 #ifdef PG_FLUSH_DATA_WORKS
131  walkdir(dir, pre_sync_fname, false);
132 #endif
133 
134  walkdir(dir, fsync_fname, false);
135 }
136 
137 /*
138  * walkdir: recursively walk a directory, applying the action to each
139  * regular file and directory (including the named directory itself).
140  *
141  * If process_symlinks is true, the action and recursion are also applied
142  * to regular files and directories that are pointed to by symlinks in the
143  * given directory; otherwise symlinks are ignored. Symlinks are always
144  * ignored in subdirectories, ie we intentionally don't pass down the
145  * process_symlinks flag to recursive calls.
146  *
147  * Errors are reported but not considered fatal.
148  *
149  * See also walkdir in fd.c, which is a backend version of this logic.
150  */
151 static void
152 walkdir(const char *path,
153  int (*action) (const char *fname, bool isdir),
154  bool process_symlinks)
155 {
156  DIR *dir;
157  struct dirent *de;
158 
159  dir = opendir(path);
160  if (dir == NULL)
161  {
162  pg_log_error("could not open directory \"%s\": %m", path);
163  return;
164  }
165 
166  while (errno = 0, (de = readdir(dir)) != NULL)
167  {
168  char subpath[MAXPGPATH * 2];
169 
170  if (strcmp(de->d_name, ".") == 0 ||
171  strcmp(de->d_name, "..") == 0)
172  continue;
173 
174  snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
175 
176  switch (get_dirent_type(subpath, de, process_symlinks, PG_LOG_ERROR))
177  {
178  case PGFILETYPE_REG:
179  (*action) (subpath, false);
180  break;
181  case PGFILETYPE_DIR:
182  walkdir(subpath, action, false);
183  break;
184  default:
185 
186  /*
187  * Errors are already reported directly by get_dirent_type(),
188  * and any remaining symlinks and unknown file types are
189  * ignored.
190  */
191  break;
192  }
193  }
194 
195  if (errno)
196  pg_log_error("could not read directory \"%s\": %m", path);
197 
198  (void) closedir(dir);
199 
200  /*
201  * It's important to fsync the destination directory itself as individual
202  * file fsyncs don't guarantee that the directory entry for the file is
203  * synced. Recent versions of ext4 have made the window much wider but
204  * it's been an issue for ext3 and other filesystems in the past.
205  */
206  (*action) (path, true);
207 }
208 
/*
 * pre_sync_fname -- hint to the OS that this file is about to be fsync'd.
 *
 * Returns 0 when the hint was issued or the file could not be opened for a
 * reason we tolerate (EACCES, or EISDIR for a directory).  Any other open
 * failure is logged and reported as -1.  Failures of the hint call itself
 * are ignored.
 */
#ifdef PG_FLUSH_DATA_WORKS

static int
pre_sync_fname(const char *fname, bool isdir)
{
	int			fd = open(fname, O_RDONLY | PG_BINARY, 0);

	if (fd < 0)
	{
		bool		tolerated;

		/* Unreadable files, and unopenable directories, are not errors. */
		tolerated = (errno == EACCES || (isdir && errno == EISDIR));
		if (!tolerated)
		{
			pg_log_error("could not open file \"%s\": %m", fname);
			return -1;
		}
		return 0;
	}

	/*
	 * Mirror what pg_flush_data() does in the backend: use sync_file_range
	 * when available, otherwise posix_fadvise.  The result is discarded
	 * because this is only a hint.
	 */
#if defined(HAVE_SYNC_FILE_RANGE)
	(void) sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
	(void) posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
#else
#error PG_FLUSH_DATA_WORKS should not have been defined
#endif

	(void) close(fd);
	return 0;
}

#endif							/* PG_FLUSH_DATA_WORKS */
250 
251 /*
252  * fsync_fname -- Try to fsync a file or directory
253  *
254  * Ignores errors trying to open unreadable files, or trying to fsync
255  * directories on systems where that isn't allowed/required. All other errors
256  * are fatal.
257  */
258 int
259 fsync_fname(const char *fname, bool isdir)
260 {
261  int fd;
262  int flags;
263  int returncode;
264 
265  /*
266  * Some OSs require directories to be opened read-only whereas other
267  * systems don't allow us to fsync files opened read-only; so we need both
268  * cases here. Using O_RDWR will cause us to fail to fsync files that are
269  * not writable by our userid, but we assume that's OK.
270  */
271  flags = PG_BINARY;
272  if (!isdir)
273  flags |= O_RDWR;
274  else
275  flags |= O_RDONLY;
276 
277  /*
278  * Open the file, silently ignoring errors about unreadable files (or
279  * unsupported operations, e.g. opening a directory under Windows), and
280  * logging others.
281  */
282  fd = open(fname, flags, 0);
283  if (fd < 0)
284  {
285  if (errno == EACCES || (isdir && errno == EISDIR))
286  return 0;
287  pg_log_error("could not open file \"%s\": %m", fname);
288  return -1;
289  }
290 
291  returncode = fsync(fd);
292 
293  /*
294  * Some OSes don't allow us to fsync directories at all, so we can ignore
295  * those errors. Anything else needs to be reported.
296  */
297  if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
298  {
299  pg_log_error("could not fsync file \"%s\": %m", fname);
300  (void) close(fd);
302  }
303 
304  (void) close(fd);
305  return 0;
306 }
307 
308 /*
309  * fsync_parent_path -- fsync the parent path of a file or directory
310  *
311  * This is aimed at making file operations persistent on disk in case of
312  * an OS crash or power failure.
313  */
314 int
315 fsync_parent_path(const char *fname)
316 {
317  char parentpath[MAXPGPATH];
318 
319  strlcpy(parentpath, fname, MAXPGPATH);
320  get_parent_directory(parentpath);
321 
322  /*
323  * get_parent_directory() returns an empty string if the input argument is
324  * just a file name (see comments in path.c), so handle that as being the
325  * current directory.
326  */
327  if (strlen(parentpath) == 0)
328  strlcpy(parentpath, ".", MAXPGPATH);
329 
330  if (fsync_fname(parentpath, true) != 0)
331  return -1;
332 
333  return 0;
334 }
335 
336 /*
337  * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
338  *
339  * Wrapper around rename, similar to the backend version.
340  */
341 int
342 durable_rename(const char *oldfile, const char *newfile)
343 {
344  int fd;
345 
346  /*
347  * First fsync the old and target path (if it exists), to ensure that they
348  * are properly persistent on disk. Syncing the target file is not
349  * strictly necessary, but it makes it easier to reason about crashes;
350  * because it's then guaranteed that either source or target file exists
351  * after a crash.
352  */
353  if (fsync_fname(oldfile, false) != 0)
354  return -1;
355 
356  fd = open(newfile, PG_BINARY | O_RDWR, 0);
357  if (fd < 0)
358  {
359  if (errno != ENOENT)
360  {
361  pg_log_error("could not open file \"%s\": %m", newfile);
362  return -1;
363  }
364  }
365  else
366  {
367  if (fsync(fd) != 0)
368  {
369  pg_log_error("could not fsync file \"%s\": %m", newfile);
370  close(fd);
372  }
373  close(fd);
374  }
375 
376  /* Time to do the real deal... */
377  if (rename(oldfile, newfile) != 0)
378  {
379  pg_log_error("could not rename file \"%s\" to \"%s\": %m",
380  oldfile, newfile);
381  return -1;
382  }
383 
384  /*
385  * To guarantee renaming the file is persistent, fsync the file with its
386  * new name, and its containing directory.
387  */
388  if (fsync_fname(newfile, false) != 0)
389  return -1;
390 
391  if (fsync_parent_path(newfile) != 0)
392  return -1;
393 
394  return 0;
395 }
396 
397 #endif /* FRONTEND */
398 
399 /*
400  * Return the type of a directory entry.
401  *
402  * In frontend code, elevel should be a level from logging.h; in backend code
403  * it should be a level from elog.h.
404  */
406 get_dirent_type(const char *path,
407  const struct dirent *de,
408  bool look_through_symlinks,
409  int elevel)
410 {
411  PGFileType result;
412 
413  /*
414  * Some systems tell us the type directly in the dirent struct, but that's
415  * a BSD and Linux extension not required by POSIX. Even when the
416  * interface is present, sometimes the type is unknown, depending on the
417  * filesystem.
418  */
419 #if defined(DT_REG) && defined(DT_DIR) && defined(DT_LNK)
420  if (de->d_type == DT_REG)
421  result = PGFILETYPE_REG;
422  else if (de->d_type == DT_DIR)
423  result = PGFILETYPE_DIR;
424  else if (de->d_type == DT_LNK && !look_through_symlinks)
425  result = PGFILETYPE_LNK;
426  else
427  result = PGFILETYPE_UNKNOWN;
428 #else
429  result = PGFILETYPE_UNKNOWN;
430 #endif
431 
432  if (result == PGFILETYPE_UNKNOWN)
433  {
434  struct stat fst;
435  int sret;
436 
437 
438  if (look_through_symlinks)
439  sret = stat(path, &fst);
440  else
441  sret = lstat(path, &fst);
442 
443  if (sret < 0)
444  {
445  result = PGFILETYPE_ERROR;
446 #ifdef FRONTEND
447  pg_log_generic(elevel, PG_LOG_PRIMARY, "could not stat file \"%s\": %m", path);
448 #else
449  ereport(elevel,
451  errmsg("could not stat file \"%s\": %m", path)));
452 #endif
453  }
454  else if (S_ISREG(fst.st_mode))
455  result = PGFILETYPE_REG;
456  else if (S_ISDIR(fst.st_mode))
457  result = PGFILETYPE_DIR;
458  else if (S_ISLNK(fst.st_mode))
459  result = PGFILETYPE_LNK;
460  }
461 
462  return result;
463 }
464 
465 /*
466  * pg_pwritev_with_retry
467  *
468  * Convenience wrapper for pg_pwritev() that retries on partial write. If an
469  * error is returned, it is unspecified how much has been written.
470  */
471 ssize_t
472 pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
473 {
474  struct iovec iov_copy[PG_IOV_MAX];
475  ssize_t sum = 0;
476  ssize_t part;
477 
478  /* We'd better have space to make a copy, in case we need to retry. */
479  if (iovcnt > PG_IOV_MAX)
480  {
481  errno = EINVAL;
482  return -1;
483  }
484 
485  for (;;)
486  {
487  /* Write as much as we can. */
488  part = pg_pwritev(fd, iov, iovcnt, offset);
489  if (part < 0)
490  return -1;
491 
492 #ifdef SIMULATE_SHORT_WRITE
493  part = Min(part, 4096);
494 #endif
495 
496  /* Count our progress. */
497  sum += part;
498  offset += part;
499 
500  /* Step over iovecs that are done. */
501  while (iovcnt > 0 && iov->iov_len <= part)
502  {
503  part -= iov->iov_len;
504  ++iov;
505  --iovcnt;
506  }
507 
508  /* Are they all done? */
509  if (iovcnt == 0)
510  {
511  /* We don't expect the kernel to write more than requested. */
512  Assert(part == 0);
513  break;
514  }
515 
516  /*
517  * Move whatever's left to the front of our mutable copy and adjust
518  * the leading iovec.
519  */
520  Assert(iovcnt > 0);
521  memmove(iov_copy, iov, sizeof(*iov) * iovcnt);
522  Assert(iov->iov_len > part);
523  iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part;
524  iov_copy[0].iov_len -= part;
525  iov = iov_copy;
526  }
527 
528  return sum;
529 }
530 
531 /*
532  * pg_pwrite_zeros
533  *
534  * Writes zeros to file worth "size" bytes, using vectored I/O.
535  *
536  * Returns the total amount of data written. On failure, a negative value
537  * is returned with errno set.
538  */
539 ssize_t
540 pg_pwrite_zeros(int fd, size_t size)
541 {
542  PGAlignedBlock zbuffer; /* worth BLCKSZ */
543  size_t zbuffer_sz;
544  struct iovec iov[PG_IOV_MAX];
545  int blocks;
546  size_t remaining_size = 0;
547  int i;
548  ssize_t written;
549  ssize_t total_written = 0;
550 
551  zbuffer_sz = sizeof(zbuffer.data);
552 
553  /* Zero-fill the buffer. */
554  memset(zbuffer.data, 0, zbuffer_sz);
555 
556  /* Prepare to write out a lot of copies of our zero buffer at once. */
557  for (i = 0; i < lengthof(iov); ++i)
558  {
559  iov[i].iov_base = zbuffer.data;
560  iov[i].iov_len = zbuffer_sz;
561  }
562 
563  /* Loop, writing as many blocks as we can for each system call. */
564  blocks = size / zbuffer_sz;
565  remaining_size = size % zbuffer_sz;
566  for (i = 0; i < blocks;)
567  {
568  int iovcnt = Min(blocks - i, lengthof(iov));
569  off_t offset = i * zbuffer_sz;
570 
571  written = pg_pwritev_with_retry(fd, iov, iovcnt, offset);
572 
573  if (written < 0)
574  return written;
575 
576  i += iovcnt;
577  total_written += written;
578  }
579 
580  /* Now, write the remaining size, if any, of the file with zeros. */
581  if (remaining_size > 0)
582  {
583  /* We'll never write more than one block here */
584  int iovcnt = 1;
585 
586  /* Jump on to the end of previously written blocks */
587  off_t offset = i * zbuffer_sz;
588 
589  iov[0].iov_len = remaining_size;
590 
591  written = pg_pwritev_with_retry(fd, iov, iovcnt, offset);
592 
593  if (written < 0)
594  return written;
595 
596  total_written += written;
597  }
598 
599  Assert(total_written == size);
600 
601  return total_written;
602 }
#define Min(x, y)
Definition: c.h:988
#define PG_BINARY
Definition: c.h:1260
#define lengthof(array)
Definition: c.h:772
int closedir(DIR *)
Definition: dirent.c:127
#define DT_DIR
Definition: dirent.h:28
struct dirent * readdir(DIR *)
Definition: dirent.c:78
#define DT_REG
Definition: dirent.h:30
#define DT_LNK
Definition: dirent.h:31
DIR * opendir(const char *)
Definition: dirent.c:33
int errcode_for_file_access(void)
Definition: elog.c:881
int errmsg(const char *fmt,...)
Definition: elog.c:1069
#define ereport(elevel,...)
Definition: elog.h:149
int durable_rename(const char *oldfile, const char *newfile, int elevel)
Definition: fd.c:688
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:662
static int fsync_parent_path(const char *fname, int elevel)
Definition: fd.c:3673
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
Definition: fd.c:3458
ssize_t pg_pwrite_zeros(int fd, size_t size)
Definition: file_utils.c:540
ssize_t pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
Definition: file_utils.c:472
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
Definition: file_utils.c:406
PGFileType
Definition: file_utils.h:19
@ PGFILETYPE_LNK
Definition: file_utils.h:24
@ PGFILETYPE_UNKNOWN
Definition: file_utils.h:21
@ PGFILETYPE_DIR
Definition: file_utils.h:23
@ PGFILETYPE_REG
Definition: file_utils.h:22
@ PGFILETYPE_ERROR
Definition: file_utils.h:20
static char * pg_data
Definition: initdb.c:127
#define close(a)
Definition: win32.h:12
int i
Definition: isn.c:73
Assert(fmt[strlen(fmt) - 1] !='\n')
exit(1)
void pg_log_generic(enum pg_log_level level, enum pg_log_part part, const char *pg_restrict fmt,...)
Definition: logging.c:205
#define pg_log_error(...)
Definition: logging.h:106
@ PG_LOG_PRIMARY
Definition: logging.h:67
@ PG_LOG_ERROR
Definition: logging.h:43
Datum subpath(PG_FUNCTION_ARGS)
Definition: ltree_op.c:241
#define MINIMUM_VERSION_FOR_PG_WAL
Definition: pg_basebackup.c:86
#define MAXPGPATH
ssize_t pg_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
Definition: pwritev.c:22
#define PG_IOV_MAX
Definition: pg_iovec.h:36
void get_parent_directory(char *path)
Definition: path.c:977
#define snprintf
Definition: port.h:238
size_t strlcpy(char *dst, const char *src, size_t siz)
Definition: strlcpy.c:45
static int fd(const char *x, int i)
Definition: preproc-init.c:105
#define EXIT_FAILURE
Definition: settings.h:166
Definition: dirent.c:26
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
unsigned char d_type
Definition: dirent.h:13
unsigned short st_mode
Definition: win32_port.h:270
char data[BLCKSZ]
Definition: c.h:1130
#define fsync(fd)
Definition: win32_port.h:85
#define stat
Definition: win32_port.h:286
#define lstat(path, sb)
Definition: win32_port.h:287
#define S_ISDIR(m)
Definition: win32_port.h:327
#define S_ISLNK(m)
Definition: win32_port.h:346
#define S_ISREG(m)
Definition: win32_port.h:330