PostgreSQL Source Code  git master
buffile.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * buffile.c
4  * Management of large buffered temporary files.
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/storage/file/buffile.c
11  *
12  * NOTES:
13  *
14  * BufFiles provide a very incomplete emulation of stdio atop virtual Files
15  * (as managed by fd.c). Currently, we only support the buffered-I/O
16  * aspect of stdio: a read or write of the low-level File occurs only
17  * when the buffer is filled or emptied. This is an even bigger win
18  * for virtual Files than for ordinary kernel files, since reducing the
19  * frequency with which a virtual File is touched reduces "thrashing"
20  * of opening/closing file descriptors.
21  *
22  * Note that BufFile structs are allocated with palloc(), and therefore
23  * will go away automatically at query/transaction end. Since the underlying
24  * virtual Files are made with OpenTemporaryFile, all resources for
25  * the file are certain to be cleaned up even if processing is aborted
26  * by ereport(ERROR). The data structures required are made in the
27  * palloc context that was current when the BufFile was created, and
28  * any external resources such as temp files are owned by the ResourceOwner
29  * that was current at that time.
30  *
31  * BufFile also supports temporary files that exceed the OS file size limit
32  * (by opening multiple fd.c temporary files). This is an essential feature
33  * for sorts and hashjoins on large amounts of data.
34  *
35  * BufFile supports temporary files that can be shared with other backends, as
36  * infrastructure for parallel execution. Such files need to be created as a
37  * member of a SharedFileSet that all participants are attached to.
38  *
39  * BufFile also supports temporary files that are used by a single backend
40  * when the files need to survive across transactions and need to be opened
41  * and closed multiple times. Such files need to be created as a member of
42  * a FileSet.
43  *-------------------------------------------------------------------------
44  */
45 
46 #include "postgres.h"
47 
48 #include "commands/tablespace.h"
49 #include "executor/instrument.h"
50 #include "miscadmin.h"
51 #include "pgstat.h"
52 #include "storage/buffile.h"
53 #include "storage/bufmgr.h"
54 #include "storage/fd.h"
55 #include "utils/resowner.h"
56 
57 /*
58  * We break BufFiles into gigabyte-sized segments, regardless of RELSEG_SIZE.
59  * The reason is that we'd like large BufFiles to be spread across multiple
60  * tablespaces when available.
61  */
62 #define MAX_PHYSICAL_FILESIZE 0x40000000
63 #define BUFFILE_SEG_SIZE (MAX_PHYSICAL_FILESIZE / BLCKSZ)
64 
65 /*
66  * This data structure represents a buffered file that consists of one or
67  * more physical files (each accessed through a virtual file descriptor
68  * managed by fd.c).
69  */
70 struct BufFile
71 {
72  int numFiles; /* number of physical files in set */
73  /* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
74  File *files; /* palloc'd array with numFiles entries */
75 
76  bool isInterXact; /* keep open over transactions? */
77  bool dirty; /* does buffer need to be written? */
78  bool readOnly; /* has the file been set to read only? */
79 
80  FileSet *fileset; /* space for fileset based segment files */
81  const char *name; /* name of fileset based BufFile */
82 
83  /*
84  * resowner is the ResourceOwner to use for underlying temp files. (We
85  * don't need to remember the memory context we're using explicitly,
86  * because after creation we only repalloc our arrays larger.)
87  */
89 
90  /*
91  * "current pos" is position of start of buffer within the logical file.
92  * Position as seen by user of BufFile is (curFile, curOffset + pos).
93  */
94  int curFile; /* file index (0..n) part of current pos */
95  off_t curOffset; /* offset part of current pos */
96  int pos; /* next read/write position in buffer */
97  int nbytes; /* total # of valid bytes in buffer */
98 
99  /*
100  * XXX Should ideally us PGIOAlignedBlock, but might need a way to avoid
101  * wasting per-file alignment padding when some users create many files.
102  */
104 };
105 
106 static BufFile *makeBufFileCommon(int nfiles);
107 static BufFile *makeBufFile(File firstfile);
108 static void extendBufFile(BufFile *file);
109 static void BufFileLoadBuffer(BufFile *file);
110 static void BufFileDumpBuffer(BufFile *file);
111 static void BufFileFlush(BufFile *file);
112 static File MakeNewFileSetSegment(BufFile *buffile, int segment);
113 
114 /*
115  * Create BufFile and perform the common initialization.
116  */
117 static BufFile *
118 makeBufFileCommon(int nfiles)
119 {
120  BufFile *file = (BufFile *) palloc(sizeof(BufFile));
121 
122  file->numFiles = nfiles;
123  file->isInterXact = false;
124  file->dirty = false;
126  file->curFile = 0;
127  file->curOffset = 0;
128  file->pos = 0;
129  file->nbytes = 0;
130 
131  return file;
132 }
133 
134 /*
135  * Create a BufFile given the first underlying physical file.
136  * NOTE: caller must set isInterXact if appropriate.
137  */
138 static BufFile *
139 makeBufFile(File firstfile)
140 {
141  BufFile *file = makeBufFileCommon(1);
142 
143  file->files = (File *) palloc(sizeof(File));
144  file->files[0] = firstfile;
145  file->readOnly = false;
146  file->fileset = NULL;
147  file->name = NULL;
148 
149  return file;
150 }
151 
152 /*
153  * Add another component temp file.
154  */
155 static void
157 {
158  File pfile;
159  ResourceOwner oldowner;
160 
161  /* Be sure to associate the file with the BufFile's resource owner */
162  oldowner = CurrentResourceOwner;
164 
165  if (file->fileset == NULL)
166  pfile = OpenTemporaryFile(file->isInterXact);
167  else
168  pfile = MakeNewFileSetSegment(file, file->numFiles);
169 
170  Assert(pfile >= 0);
171 
172  CurrentResourceOwner = oldowner;
173 
174  file->files = (File *) repalloc(file->files,
175  (file->numFiles + 1) * sizeof(File));
176  file->files[file->numFiles] = pfile;
177  file->numFiles++;
178 }
179 
180 /*
181  * Create a BufFile for a new temporary file (which will expand to become
182  * multiple temporary files if more than MAX_PHYSICAL_FILESIZE bytes are
183  * written to it).
184  *
185  * If interXact is true, the temp file will not be automatically deleted
186  * at end of transaction.
187  *
188  * Note: if interXact is true, the caller had better be calling us in a
189  * memory context, and with a resource owner, that will survive across
190  * transaction boundaries.
191  */
192 BufFile *
193 BufFileCreateTemp(bool interXact)
194 {
195  BufFile *file;
196  File pfile;
197 
198  /*
199  * Ensure that temp tablespaces are set up for OpenTemporaryFile to use.
200  * Possibly the caller will have done this already, but it seems useful to
201  * double-check here. Failure to do this at all would result in the temp
202  * files always getting placed in the default tablespace, which is a
203  * pretty hard-to-detect bug. Callers may prefer to do it earlier if they
204  * want to be sure that any required catalog access is done in some other
205  * resource context.
206  */
208 
209  pfile = OpenTemporaryFile(interXact);
210  Assert(pfile >= 0);
211 
212  file = makeBufFile(pfile);
213  file->isInterXact = interXact;
214 
215  return file;
216 }
217 
218 /*
219  * Build the name for a given segment of a given BufFile.
220  */
221 static void
222 FileSetSegmentName(char *name, const char *buffile_name, int segment)
223 {
224  snprintf(name, MAXPGPATH, "%s.%d", buffile_name, segment);
225 }
226 
227 /*
228  * Create a new segment file backing a fileset based BufFile.
229  */
230 static File
231 MakeNewFileSetSegment(BufFile *buffile, int segment)
232 {
233  char name[MAXPGPATH];
234  File file;
235 
236  /*
237  * It is possible that there are files left over from before a crash
238  * restart with the same name. In order for BufFileOpenFileSet() not to
239  * get confused about how many segments there are, we'll unlink the next
240  * segment number if it already exists.
241  */
242  FileSetSegmentName(name, buffile->name, segment + 1);
243  FileSetDelete(buffile->fileset, name, true);
244 
245  /* Create the new segment. */
246  FileSetSegmentName(name, buffile->name, segment);
247  file = FileSetCreate(buffile->fileset, name);
248 
249  /* FileSetCreate would've errored out */
250  Assert(file > 0);
251 
252  return file;
253 }
254 
255 /*
256  * Create a BufFile that can be discovered and opened read-only by other
257  * backends that are attached to the same SharedFileSet using the same name.
258  *
259  * The naming scheme for fileset based BufFiles is left up to the calling code.
260  * The name will appear as part of one or more filenames on disk, and might
261  * provide clues to administrators about which subsystem is generating
262  * temporary file data. Since each SharedFileSet object is backed by one or
263  * more uniquely named temporary directory, names don't conflict with
264  * unrelated SharedFileSet objects.
265  */
266 BufFile *
267 BufFileCreateFileSet(FileSet *fileset, const char *name)
268 {
269  BufFile *file;
270 
271  file = makeBufFileCommon(1);
272  file->fileset = fileset;
273  file->name = pstrdup(name);
274  file->files = (File *) palloc(sizeof(File));
275  file->files[0] = MakeNewFileSetSegment(file, 0);
276  file->readOnly = false;
277 
278  return file;
279 }
280 
281 /*
282  * Open a file that was previously created in another backend (or this one)
283  * with BufFileCreateFileSet in the same FileSet using the same name.
284  * The backend that created the file must have called BufFileClose() or
285  * BufFileExportFileSet() to make sure that it is ready to be opened by other
286  * backends and render it read-only. If missing_ok is true, which indicates
287  * that missing files can be safely ignored, then return NULL if the BufFile
288  * with the given name is not found, otherwise, throw an error.
289  */
290 BufFile *
291 BufFileOpenFileSet(FileSet *fileset, const char *name, int mode,
292  bool missing_ok)
293 {
294  BufFile *file;
295  char segment_name[MAXPGPATH];
296  Size capacity = 16;
297  File *files;
298  int nfiles = 0;
299 
300  files = palloc(sizeof(File) * capacity);
301 
302  /*
303  * We don't know how many segments there are, so we'll probe the
304  * filesystem to find out.
305  */
306  for (;;)
307  {
308  /* See if we need to expand our file segment array. */
309  if (nfiles + 1 > capacity)
310  {
311  capacity *= 2;
312  files = repalloc(files, sizeof(File) * capacity);
313  }
314  /* Try to load a segment. */
315  FileSetSegmentName(segment_name, name, nfiles);
316  files[nfiles] = FileSetOpen(fileset, segment_name, mode);
317  if (files[nfiles] <= 0)
318  break;
319  ++nfiles;
320 
322  }
323 
324  /*
325  * If we didn't find any files at all, then no BufFile exists with this
326  * name.
327  */
328  if (nfiles == 0)
329  {
330  /* free the memory */
331  pfree(files);
332 
333  if (missing_ok)
334  return NULL;
335 
336  ereport(ERROR,
338  errmsg("could not open temporary file \"%s\" from BufFile \"%s\": %m",
339  segment_name, name)));
340  }
341 
342  file = makeBufFileCommon(nfiles);
343  file->files = files;
344  file->readOnly = (mode == O_RDONLY);
345  file->fileset = fileset;
346  file->name = pstrdup(name);
347 
348  return file;
349 }
350 
351 /*
352  * Delete a BufFile that was created by BufFileCreateFileSet in the given
353  * FileSet using the given name.
354  *
355  * It is not necessary to delete files explicitly with this function. It is
356  * provided only as a way to delete files proactively, rather than waiting for
357  * the FileSet to be cleaned up.
358  *
359  * Only one backend should attempt to delete a given name, and should know
360  * that it exists and has been exported or closed otherwise missing_ok should
361  * be passed true.
362  */
363 void
364 BufFileDeleteFileSet(FileSet *fileset, const char *name, bool missing_ok)
365 {
366  char segment_name[MAXPGPATH];
367  int segment = 0;
368  bool found = false;
369 
370  /*
371  * We don't know how many segments the file has. We'll keep deleting
372  * until we run out. If we don't manage to find even an initial segment,
373  * raise an error.
374  */
375  for (;;)
376  {
377  FileSetSegmentName(segment_name, name, segment);
378  if (!FileSetDelete(fileset, segment_name, true))
379  break;
380  found = true;
381  ++segment;
382 
384  }
385 
386  if (!found && !missing_ok)
387  elog(ERROR, "could not delete unknown BufFile \"%s\"", name);
388 }
389 
390 /*
391  * BufFileExportFileSet --- flush and make read-only, in preparation for sharing.
392  */
393 void
395 {
396  /* Must be a file belonging to a FileSet. */
397  Assert(file->fileset != NULL);
398 
399  /* It's probably a bug if someone calls this twice. */
400  Assert(!file->readOnly);
401 
402  BufFileFlush(file);
403  file->readOnly = true;
404 }
405 
406 /*
407  * Close a BufFile
408  *
409  * Like fclose(), this also implicitly FileCloses the underlying File.
410  */
411 void
413 {
414  int i;
415 
416  /* flush any unwritten data */
417  BufFileFlush(file);
418  /* close and delete the underlying file(s) */
419  for (i = 0; i < file->numFiles; i++)
420  FileClose(file->files[i]);
421  /* release the buffer space */
422  pfree(file->files);
423  pfree(file);
424 }
425 
426 /*
427  * BufFileLoadBuffer
428  *
429  * Load some data into buffer, if possible, starting from curOffset.
430  * At call, must have dirty = false, pos and nbytes = 0.
431  * On exit, nbytes is number of bytes loaded.
432  */
433 static void
435 {
436  File thisfile;
437  instr_time io_start;
438  instr_time io_time;
439 
440  /*
441  * Advance to next component file if necessary and possible.
442  */
443  if (file->curOffset >= MAX_PHYSICAL_FILESIZE &&
444  file->curFile + 1 < file->numFiles)
445  {
446  file->curFile++;
447  file->curOffset = 0;
448  }
449 
450  thisfile = file->files[file->curFile];
451 
452  if (track_io_timing)
453  INSTR_TIME_SET_CURRENT(io_start);
454  else
455  INSTR_TIME_SET_ZERO(io_start);
456 
457  /*
458  * Read whatever we can get, up to a full bufferload.
459  */
460  file->nbytes = FileRead(thisfile,
461  file->buffer.data,
462  sizeof(file->buffer),
463  file->curOffset,
464  WAIT_EVENT_BUFFILE_READ);
465  if (file->nbytes < 0)
466  {
467  file->nbytes = 0;
468  ereport(ERROR,
470  errmsg("could not read file \"%s\": %m",
471  FilePathName(thisfile))));
472  }
473 
474  if (track_io_timing)
475  {
476  INSTR_TIME_SET_CURRENT(io_time);
478  }
479 
480  /* we choose not to advance curOffset here */
481 
482  if (file->nbytes > 0)
484 }
485 
486 /*
487  * BufFileDumpBuffer
488  *
489  * Dump buffer contents starting at curOffset.
490  * At call, should have dirty = true, nbytes > 0.
491  * On exit, dirty is cleared if successful write, and curOffset is advanced.
492  */
493 static void
495 {
496  int wpos = 0;
497  int bytestowrite;
498  File thisfile;
499 
500  /*
501  * Unlike BufFileLoadBuffer, we must dump the whole buffer even if it
502  * crosses a component-file boundary; so we need a loop.
503  */
504  while (wpos < file->nbytes)
505  {
506  off_t availbytes;
507  instr_time io_start;
508  instr_time io_time;
509 
510  /*
511  * Advance to next component file if necessary and possible.
512  */
513  if (file->curOffset >= MAX_PHYSICAL_FILESIZE)
514  {
515  while (file->curFile + 1 >= file->numFiles)
516  extendBufFile(file);
517  file->curFile++;
518  file->curOffset = 0;
519  }
520 
521  /*
522  * Determine how much we need to write into this file.
523  */
524  bytestowrite = file->nbytes - wpos;
525  availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset;
526 
527  if ((off_t) bytestowrite > availbytes)
528  bytestowrite = (int) availbytes;
529 
530  thisfile = file->files[file->curFile];
531 
532  if (track_io_timing)
533  INSTR_TIME_SET_CURRENT(io_start);
534  else
535  INSTR_TIME_SET_ZERO(io_start);
536 
537  bytestowrite = FileWrite(thisfile,
538  file->buffer.data + wpos,
539  bytestowrite,
540  file->curOffset,
541  WAIT_EVENT_BUFFILE_WRITE);
542  if (bytestowrite <= 0)
543  ereport(ERROR,
545  errmsg("could not write to file \"%s\": %m",
546  FilePathName(thisfile))));
547 
548  if (track_io_timing)
549  {
550  INSTR_TIME_SET_CURRENT(io_time);
552  }
553 
554  file->curOffset += bytestowrite;
555  wpos += bytestowrite;
556 
558  }
559  file->dirty = false;
560 
561  /*
562  * At this point, curOffset has been advanced to the end of the buffer,
563  * ie, its original value + nbytes. We need to make it point to the
564  * logical file position, ie, original value + pos, in case that is less
565  * (as could happen due to a small backwards seek in a dirty buffer!)
566  */
567  file->curOffset -= (file->nbytes - file->pos);
568  if (file->curOffset < 0) /* handle possible segment crossing */
569  {
570  file->curFile--;
571  Assert(file->curFile >= 0);
573  }
574 
575  /*
576  * Now we can set the buffer empty without changing the logical position
577  */
578  file->pos = 0;
579  file->nbytes = 0;
580 }
581 
582 /*
583  * BufFileRead variants
584  *
585  * Like fread() except we assume 1-byte element size and report I/O errors via
586  * ereport().
587  *
588  * If 'exact' is true, then an error is also raised if the number of bytes
589  * read is not exactly 'size' (no short reads). If 'exact' and 'eofOK' are
590  * true, then reading zero bytes is ok.
591  */
592 static size_t
593 BufFileReadCommon(BufFile *file, void *ptr, size_t size, bool exact, bool eofOK)
594 {
595  size_t start_size = size;
596  size_t nread = 0;
597  size_t nthistime;
598 
599  BufFileFlush(file);
600 
601  while (size > 0)
602  {
603  if (file->pos >= file->nbytes)
604  {
605  /* Try to load more data into buffer. */
606  file->curOffset += file->pos;
607  file->pos = 0;
608  file->nbytes = 0;
609  BufFileLoadBuffer(file);
610  if (file->nbytes <= 0)
611  break; /* no more data available */
612  }
613 
614  nthistime = file->nbytes - file->pos;
615  if (nthistime > size)
616  nthistime = size;
617  Assert(nthistime > 0);
618 
619  memcpy(ptr, file->buffer.data + file->pos, nthistime);
620 
621  file->pos += nthistime;
622  ptr = (char *) ptr + nthistime;
623  size -= nthistime;
624  nread += nthistime;
625  }
626 
627  if (exact &&
628  (nread != start_size && !(nread == 0 && eofOK)))
629  ereport(ERROR,
631  file->name ?
632  errmsg("could not read from file set \"%s\": read only %zu of %zu bytes",
633  file->name, nread, start_size) :
634  errmsg("could not read from temporary file: read only %zu of %zu bytes",
635  nread, start_size));
636 
637  return nread;
638 }
639 
640 /*
641  * Legacy interface where the caller needs to check for end of file or short
642  * reads.
643  */
644 size_t
645 BufFileRead(BufFile *file, void *ptr, size_t size)
646 {
647  return BufFileReadCommon(file, ptr, size, false, false);
648 }
649 
650 /*
651  * Require read of exactly the specified size.
652  */
653 void
654 BufFileReadExact(BufFile *file, void *ptr, size_t size)
655 {
656  BufFileReadCommon(file, ptr, size, true, false);
657 }
658 
659 /*
660  * Require read of exactly the specified size, but optionally allow end of
661  * file (in which case 0 is returned).
662  */
663 size_t
664 BufFileReadMaybeEOF(BufFile *file, void *ptr, size_t size, bool eofOK)
665 {
666  return BufFileReadCommon(file, ptr, size, true, eofOK);
667 }
668 
669 /*
670  * BufFileWrite
671  *
672  * Like fwrite() except we assume 1-byte element size and report errors via
673  * ereport().
674  */
675 void
676 BufFileWrite(BufFile *file, const void *ptr, size_t size)
677 {
678  size_t nthistime;
679 
680  Assert(!file->readOnly);
681 
682  while (size > 0)
683  {
684  if (file->pos >= BLCKSZ)
685  {
686  /* Buffer full, dump it out */
687  if (file->dirty)
688  BufFileDumpBuffer(file);
689  else
690  {
691  /* Hmm, went directly from reading to writing? */
692  file->curOffset += file->pos;
693  file->pos = 0;
694  file->nbytes = 0;
695  }
696  }
697 
698  nthistime = BLCKSZ - file->pos;
699  if (nthistime > size)
700  nthistime = size;
701  Assert(nthistime > 0);
702 
703  memcpy(file->buffer.data + file->pos, ptr, nthistime);
704 
705  file->dirty = true;
706  file->pos += nthistime;
707  if (file->nbytes < file->pos)
708  file->nbytes = file->pos;
709  ptr = (const char *) ptr + nthistime;
710  size -= nthistime;
711  }
712 }
713 
714 /*
715  * BufFileFlush
716  *
717  * Like fflush(), except that I/O errors are reported with ereport().
718  */
719 static void
721 {
722  if (file->dirty)
723  BufFileDumpBuffer(file);
724 
725  Assert(!file->dirty);
726 }
727 
728 /*
729  * BufFileSeek
730  *
731  * Like fseek(), except that target position needs two values in order to
732  * work when logical filesize exceeds maximum value representable by off_t.
733  * We do not support relative seeks across more than that, however.
734  * I/O errors are reported by ereport().
735  *
736  * Result is 0 if OK, EOF if not. Logical position is not moved if an
737  * impossible seek is attempted.
738  */
739 int
740 BufFileSeek(BufFile *file, int fileno, off_t offset, int whence)
741 {
742  int newFile;
743  off_t newOffset;
744 
745  switch (whence)
746  {
747  case SEEK_SET:
748  if (fileno < 0)
749  return EOF;
750  newFile = fileno;
751  newOffset = offset;
752  break;
753  case SEEK_CUR:
754 
755  /*
756  * Relative seek considers only the signed offset, ignoring
757  * fileno. Note that large offsets (> 1 GB) risk overflow in this
758  * add, unless we have 64-bit off_t.
759  */
760  newFile = file->curFile;
761  newOffset = (file->curOffset + file->pos) + offset;
762  break;
763  case SEEK_END:
764 
765  /*
766  * The file size of the last file gives us the end offset of that
767  * file.
768  */
769  newFile = file->numFiles - 1;
770  newOffset = FileSize(file->files[file->numFiles - 1]);
771  if (newOffset < 0)
772  ereport(ERROR,
774  errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m",
775  FilePathName(file->files[file->numFiles - 1]),
776  file->name)));
777  break;
778  default:
779  elog(ERROR, "invalid whence: %d", whence);
780  return EOF;
781  }
782  while (newOffset < 0)
783  {
784  if (--newFile < 0)
785  return EOF;
786  newOffset += MAX_PHYSICAL_FILESIZE;
787  }
788  if (newFile == file->curFile &&
789  newOffset >= file->curOffset &&
790  newOffset <= file->curOffset + file->nbytes)
791  {
792  /*
793  * Seek is to a point within existing buffer; we can just adjust
794  * pos-within-buffer, without flushing buffer. Note this is OK
795  * whether reading or writing, but buffer remains dirty if we were
796  * writing.
797  */
798  file->pos = (int) (newOffset - file->curOffset);
799  return 0;
800  }
801  /* Otherwise, must reposition buffer, so flush any dirty data */
802  BufFileFlush(file);
803 
804  /*
805  * At this point and no sooner, check for seek past last segment. The
806  * above flush could have created a new segment, so checking sooner would
807  * not work (at least not with this code).
808  */
809 
810  /* convert seek to "start of next seg" to "end of last seg" */
811  if (newFile == file->numFiles && newOffset == 0)
812  {
813  newFile--;
814  newOffset = MAX_PHYSICAL_FILESIZE;
815  }
816  while (newOffset > MAX_PHYSICAL_FILESIZE)
817  {
818  if (++newFile >= file->numFiles)
819  return EOF;
820  newOffset -= MAX_PHYSICAL_FILESIZE;
821  }
822  if (newFile >= file->numFiles)
823  return EOF;
824  /* Seek is OK! */
825  file->curFile = newFile;
826  file->curOffset = newOffset;
827  file->pos = 0;
828  file->nbytes = 0;
829  return 0;
830 }
831 
832 void
833 BufFileTell(BufFile *file, int *fileno, off_t *offset)
834 {
835  *fileno = file->curFile;
836  *offset = file->curOffset + file->pos;
837 }
838 
839 /*
840  * BufFileSeekBlock --- block-oriented seek
841  *
842  * Performs absolute seek to the start of the n'th BLCKSZ-sized block of
843  * the file. Note that users of this interface will fail if their files
844  * exceed BLCKSZ * PG_INT64_MAX bytes, but that is quite a lot; we don't
845  * work with tables bigger than that, either...
846  *
847  * Result is 0 if OK, EOF if not. Logical position is not moved if an
848  * impossible seek is attempted.
849  */
850 int
851 BufFileSeekBlock(BufFile *file, int64 blknum)
852 {
853  return BufFileSeek(file,
854  (int) (blknum / BUFFILE_SEG_SIZE),
855  (off_t) (blknum % BUFFILE_SEG_SIZE) * BLCKSZ,
856  SEEK_SET);
857 }
858 
859 /*
860  * Returns the amount of data in the given BufFile, in bytes.
861  *
862  * Returned value includes the size of any holes left behind by BufFileAppend.
863  * ereport()s on failure.
864  */
865 int64
867 {
868  int64 lastFileSize;
869 
870  /* Get the size of the last physical file. */
871  lastFileSize = FileSize(file->files[file->numFiles - 1]);
872  if (lastFileSize < 0)
873  ereport(ERROR,
875  errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m",
876  FilePathName(file->files[file->numFiles - 1]),
877  file->name)));
878 
879  return ((file->numFiles - 1) * (int64) MAX_PHYSICAL_FILESIZE) +
880  lastFileSize;
881 }
882 
883 /*
884  * Append the contents of the source file to the end of the target file.
885  *
886  * Note that operation subsumes ownership of underlying resources from
887  * "source". Caller should never call BufFileClose against source having
888  * called here first. Resource owners for source and target must match,
889  * too.
890  *
891  * This operation works by manipulating lists of segment files, so the
892  * file content is always appended at a MAX_PHYSICAL_FILESIZE-aligned
893  * boundary, typically creating empty holes before the boundary. These
894  * areas do not contain any interesting data, and cannot be read from by
895  * caller.
896  *
897  * Returns the block number within target where the contents of source
898  * begins. Caller should apply this as an offset when working off block
899  * positions that are in terms of the original BufFile space.
900  */
901 int64
903 {
904  int64 startBlock = (int64) target->numFiles * BUFFILE_SEG_SIZE;
905  int newNumFiles = target->numFiles + source->numFiles;
906  int i;
907 
908  Assert(source->readOnly);
909  Assert(!source->dirty);
910 
911  if (target->resowner != source->resowner)
912  elog(ERROR, "could not append BufFile with non-matching resource owner");
913 
914  target->files = (File *)
915  repalloc(target->files, sizeof(File) * newNumFiles);
916  for (i = target->numFiles; i < newNumFiles; i++)
917  target->files[i] = source->files[i - target->numFiles];
918  target->numFiles = newNumFiles;
919 
920  return startBlock;
921 }
922 
923 /*
924  * Truncate a BufFile created by BufFileCreateFileSet up to the given fileno
925  * and the offset.
926  */
927 void
928 BufFileTruncateFileSet(BufFile *file, int fileno, off_t offset)
929 {
930  int numFiles = file->numFiles;
931  int newFile = fileno;
932  off_t newOffset = file->curOffset;
933  char segment_name[MAXPGPATH];
934  int i;
935 
936  /*
937  * Loop over all the files up to the given fileno and remove the files
938  * that are greater than the fileno and truncate the given file up to the
939  * offset. Note that we also remove the given fileno if the offset is 0
940  * provided it is not the first file in which we truncate it.
941  */
942  for (i = file->numFiles - 1; i >= fileno; i--)
943  {
944  if ((i != fileno || offset == 0) && i != 0)
945  {
946  FileSetSegmentName(segment_name, file->name, i);
947  FileClose(file->files[i]);
948  if (!FileSetDelete(file->fileset, segment_name, true))
949  ereport(ERROR,
951  errmsg("could not delete fileset \"%s\": %m",
952  segment_name)));
953  numFiles--;
954  newOffset = MAX_PHYSICAL_FILESIZE;
955 
956  /*
957  * This is required to indicate that we have deleted the given
958  * fileno.
959  */
960  if (i == fileno)
961  newFile--;
962  }
963  else
964  {
965  if (FileTruncate(file->files[i], offset,
966  WAIT_EVENT_BUFFILE_TRUNCATE) < 0)
967  ereport(ERROR,
969  errmsg("could not truncate file \"%s\": %m",
970  FilePathName(file->files[i]))));
971  newOffset = offset;
972  }
973  }
974 
975  file->numFiles = numFiles;
976 
977  /*
978  * If the truncate point is within existing buffer then we can just adjust
979  * pos within buffer.
980  */
981  if (newFile == file->curFile &&
982  newOffset >= file->curOffset &&
983  newOffset <= file->curOffset + file->nbytes)
984  {
985  /* No need to reset the current pos if the new pos is greater. */
986  if (newOffset <= file->curOffset + file->pos)
987  file->pos = (int) (newOffset - file->curOffset);
988 
989  /* Adjust the nbytes for the current buffer. */
990  file->nbytes = (int) (newOffset - file->curOffset);
991  }
992  else if (newFile == file->curFile &&
993  newOffset < file->curOffset)
994  {
995  /*
996  * The truncate point is within the existing file but prior to the
997  * current position, so we can forget the current buffer and reset the
998  * current position.
999  */
1000  file->curOffset = newOffset;
1001  file->pos = 0;
1002  file->nbytes = 0;
1003  }
1004  else if (newFile < file->curFile)
1005  {
1006  /*
1007  * The truncate point is prior to the current file, so need to reset
1008  * the current position accordingly.
1009  */
1010  file->curFile = newFile;
1011  file->curOffset = newOffset;
1012  file->pos = 0;
1013  file->nbytes = 0;
1014  }
1015  /* Nothing to do, if the truncate point is beyond current file. */
1016 }
void PrepareTempTablespaces(void)
Definition: tablespace.c:1331
int BufFileSeekBlock(BufFile *file, int64 blknum)
Definition: buffile.c:851
void BufFileExportFileSet(BufFile *file)
Definition: buffile.c:394
size_t BufFileRead(BufFile *file, void *ptr, size_t size)
Definition: buffile.c:645
void BufFileReadExact(BufFile *file, void *ptr, size_t size)
Definition: buffile.c:654
static void FileSetSegmentName(char *name, const char *buffile_name, int segment)
Definition: buffile.c:222
BufFile * BufFileOpenFileSet(FileSet *fileset, const char *name, int mode, bool missing_ok)
Definition: buffile.c:291
static BufFile * makeBufFileCommon(int nfiles)
Definition: buffile.c:118
#define BUFFILE_SEG_SIZE
Definition: buffile.c:63
static void BufFileLoadBuffer(BufFile *file)
Definition: buffile.c:434
static File MakeNewFileSetSegment(BufFile *buffile, int segment)
Definition: buffile.c:231
void BufFileTell(BufFile *file, int *fileno, off_t *offset)
Definition: buffile.c:833
BufFile * BufFileCreateTemp(bool interXact)
Definition: buffile.c:193
static void extendBufFile(BufFile *file)
Definition: buffile.c:156
#define MAX_PHYSICAL_FILESIZE
Definition: buffile.c:62
static void BufFileFlush(BufFile *file)
Definition: buffile.c:720
void BufFileWrite(BufFile *file, const void *ptr, size_t size)
Definition: buffile.c:676
size_t BufFileReadMaybeEOF(BufFile *file, void *ptr, size_t size, bool eofOK)
Definition: buffile.c:664
void BufFileTruncateFileSet(BufFile *file, int fileno, off_t offset)
Definition: buffile.c:928
int BufFileSeek(BufFile *file, int fileno, off_t offset, int whence)
Definition: buffile.c:740
int64 BufFileSize(BufFile *file)
Definition: buffile.c:866
BufFile * BufFileCreateFileSet(FileSet *fileset, const char *name)
Definition: buffile.c:267
static BufFile * makeBufFile(File firstfile)
Definition: buffile.c:139
static size_t BufFileReadCommon(BufFile *file, void *ptr, size_t size, bool exact, bool eofOK)
Definition: buffile.c:593
void BufFileClose(BufFile *file)
Definition: buffile.c:412
int64 BufFileAppend(BufFile *target, BufFile *source)
Definition: buffile.c:902
void BufFileDeleteFileSet(FileSet *fileset, const char *name, bool missing_ok)
Definition: buffile.c:364
static void BufFileDumpBuffer(BufFile *file)
Definition: buffile.c:494
bool track_io_timing
Definition: bufmgr.c:143
#define Assert(condition)
Definition: c.h:861
size_t Size
Definition: c.h:608
int errcode_for_file_access(void)
Definition: elog.c:876
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
void FileClose(File file)
Definition: fd.c:1978
File OpenTemporaryFile(bool interXact)
Definition: fd.c:1724
char * FilePathName(File file)
Definition: fd.c:2484
off_t FileSize(File file)
Definition: fd.c:2432
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
Definition: fd.c:2449
static ssize_t FileRead(File file, void *buffer, size_t amount, off_t offset, uint32 wait_event_info)
Definition: fd.h:196
int File
Definition: fd.h:51
static ssize_t FileWrite(File file, const void *buffer, size_t amount, off_t offset, uint32 wait_event_info)
Definition: fd.h:208
File FileSetOpen(FileSet *fileset, const char *name, int mode)
Definition: fileset.c:119
bool FileSetDelete(FileSet *fileset, const char *name, bool error_on_failure)
Definition: fileset.c:136
File FileSetCreate(FileSet *fileset, const char *name)
Definition: fileset.c:92
#define INSTR_TIME_SET_CURRENT(t)
Definition: instr_time.h:122
#define INSTR_TIME_SET_ZERO(t)
Definition: instr_time.h:172
#define INSTR_TIME_ACCUM_DIFF(x, y, z)
Definition: instr_time.h:184
BufferUsage pgBufferUsage
Definition: instrument.c:20
int i
Definition: isn.c:73
char * pstrdup(const char *in)
Definition: mcxt.c:1696
void pfree(void *pointer)
Definition: mcxt.c:1521
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1541
void * palloc(Size size)
Definition: mcxt.c:1317
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
static PgChecksumMode mode
Definition: pg_checksums.c:56
#define MAXPGPATH
static rewind_source * source
Definition: pg_rewind.c:89
#define snprintf
Definition: port.h:238
ResourceOwner CurrentResourceOwner
Definition: resowner.c:165
static pg_noinline void Size size
Definition: slab.c:607
int nbytes
Definition: buffile.c:97
PGAlignedBlock buffer
Definition: buffile.c:103
int numFiles
Definition: buffile.c:72
bool dirty
Definition: buffile.c:77
FileSet * fileset
Definition: buffile.c:80
File * files
Definition: buffile.c:74
ResourceOwner resowner
Definition: buffile.c:88
bool isInterXact
Definition: buffile.c:76
bool readOnly
Definition: buffile.c:78
int pos
Definition: buffile.c:96
int curFile
Definition: buffile.c:94
off_t curOffset
Definition: buffile.c:95
const char * name
Definition: buffile.c:81
instr_time temp_blk_write_time
Definition: instrument.h:41
instr_time temp_blk_read_time
Definition: instrument.h:40
int64 temp_blks_read
Definition: instrument.h:34
int64 temp_blks_written
Definition: instrument.h:35
#define wpos(wep)
Definition: tsrank.c:27
char data[BLCKSZ]
Definition: c.h:1122
const char * name