PostgreSQL Source Code  git master
reinit.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * reinit.c
4  * Reinitialization of unlogged relations
5  *
6  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  * src/backend/storage/file/reinit.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include <unistd.h>
18 
19 #include "common/relpath.h"
20 #include "postmaster/startup.h"
21 #include "storage/copydir.h"
22 #include "storage/fd.h"
23 #include "storage/reinit.h"
24 #include "utils/hsearch.h"
25 #include "utils/memutils.h"
26 
/*
 * Forward declarations of the per-directory workers used by
 * ResetUnloggedRelations(); both are defined later in this file.
 */
static void ResetUnloggedRelationsInTablespaceDir(const char *tsdirname,
												  int op);
static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname,
											   int op);
31 
32 typedef struct
33 {
34  RelFileNumber relnumber; /* hash key */
36 
37 /*
38  * Reset unlogged relations from before the last restart.
39  *
40  * If op includes UNLOGGED_RELATION_CLEANUP, we remove all forks of any
41  * relation with an "init" fork, except for the "init" fork itself.
42  *
43  * If op includes UNLOGGED_RELATION_INIT, we copy the "init" fork to the main
44  * fork.
45  */
46 void
48 {
49  char temp_path[MAXPGPATH + sizeof(PG_TBLSPC_DIR) + sizeof(TABLESPACE_VERSION_DIRECTORY)];
50  DIR *spc_dir;
51  struct dirent *spc_de;
52  MemoryContext tmpctx,
53  oldctx;
54 
55  /* Log it. */
56  elog(DEBUG1, "resetting unlogged relations: cleanup %d init %d",
57  (op & UNLOGGED_RELATION_CLEANUP) != 0,
58  (op & UNLOGGED_RELATION_INIT) != 0);
59 
60  /*
61  * Just to be sure we don't leak any memory, let's create a temporary
62  * memory context for this operation.
63  */
65  "ResetUnloggedRelations",
67  oldctx = MemoryContextSwitchTo(tmpctx);
68 
69  /* Prepare to report progress resetting unlogged relations. */
71 
72  /*
73  * First process unlogged files in pg_default ($PGDATA/base)
74  */
76 
77  /*
78  * Cycle through directories for all non-default tablespaces.
79  */
80  spc_dir = AllocateDir(PG_TBLSPC_DIR);
81 
82  while ((spc_de = ReadDir(spc_dir, PG_TBLSPC_DIR)) != NULL)
83  {
84  if (strcmp(spc_de->d_name, ".") == 0 ||
85  strcmp(spc_de->d_name, "..") == 0)
86  continue;
87 
88  snprintf(temp_path, sizeof(temp_path), "%s/%s/%s",
91  }
92 
93  FreeDir(spc_dir);
94 
95  /*
96  * Restore memory context.
97  */
98  MemoryContextSwitchTo(oldctx);
99  MemoryContextDelete(tmpctx);
100 }
101 
102 /*
103  * Process one tablespace directory for ResetUnloggedRelations
104  */
105 static void
106 ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op)
107 {
108  DIR *ts_dir;
109  struct dirent *de;
110  char dbspace_path[MAXPGPATH * 2];
111 
112  ts_dir = AllocateDir(tsdirname);
113 
114  /*
115  * If we get ENOENT on a tablespace directory, log it and return. This
116  * can happen if a previous DROP TABLESPACE crashed between removing the
117  * tablespace directory and removing the symlink in pg_tblspc. We don't
118  * really want to prevent database startup in that scenario, so let it
119  * pass instead. Any other type of error will be reported by ReadDir
120  * (causing a startup failure).
121  */
122  if (ts_dir == NULL && errno == ENOENT)
123  {
124  ereport(LOG,
126  errmsg("could not open directory \"%s\": %m",
127  tsdirname)));
128  return;
129  }
130 
131  while ((de = ReadDir(ts_dir, tsdirname)) != NULL)
132  {
133  /*
134  * We're only interested in the per-database directories, which have
135  * numeric names. Note that this code will also (properly) ignore "."
136  * and "..".
137  */
138  if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
139  continue;
140 
141  snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
142  tsdirname, de->d_name);
143 
144  if (op & UNLOGGED_RELATION_INIT)
145  ereport_startup_progress("resetting unlogged relations (init), elapsed time: %ld.%02d s, current path: %s",
146  dbspace_path);
147  else if (op & UNLOGGED_RELATION_CLEANUP)
148  ereport_startup_progress("resetting unlogged relations (cleanup), elapsed time: %ld.%02d s, current path: %s",
149  dbspace_path);
150 
151  ResetUnloggedRelationsInDbspaceDir(dbspace_path, op);
152  }
153 
154  FreeDir(ts_dir);
155 }
156 
157 /*
158  * Process one per-dbspace directory for ResetUnloggedRelations
159  */
160 static void
161 ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
162 {
163  DIR *dbspace_dir;
164  struct dirent *de;
165  char rm_path[MAXPGPATH * 2];
166 
167  /* Caller must specify at least one operation. */
169 
170  /*
171  * Cleanup is a two-pass operation. First, we go through and identify all
172  * the files with init forks. Then, we go through again and nuke
173  * everything with the same OID except the init fork.
174  */
175  if ((op & UNLOGGED_RELATION_CLEANUP) != 0)
176  {
177  HTAB *hash;
178  HASHCTL ctl;
179 
180  /*
181  * It's possible that someone could create a ton of unlogged relations
182  * in the same database & tablespace, so we'd better use a hash table
183  * rather than an array or linked list to keep track of which files
184  * need to be reset. Otherwise, this cleanup operation would be
185  * O(n^2).
186  */
187  ctl.keysize = sizeof(Oid);
188  ctl.entrysize = sizeof(unlogged_relation_entry);
189  ctl.hcxt = CurrentMemoryContext;
190  hash = hash_create("unlogged relation OIDs", 32, &ctl,
192 
193  /* Scan the directory. */
194  dbspace_dir = AllocateDir(dbspacedirname);
195  while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
196  {
197  ForkNumber forkNum;
198  unsigned segno;
200 
201  /* Skip anything that doesn't look like a relation data file. */
203  &ent.relnumber,
204  &forkNum, &segno))
205  continue;
206 
207  /* Also skip it unless this is the init fork. */
208  if (forkNum != INIT_FORKNUM)
209  continue;
210 
211  /*
212  * Put the RelFileNumber into the hash table, if it isn't already.
213  */
214  (void) hash_search(hash, &ent, HASH_ENTER, NULL);
215  }
216 
217  /* Done with the first pass. */
218  FreeDir(dbspace_dir);
219 
220  /*
221  * If we didn't find any init forks, there's no point in continuing;
222  * we can bail out now.
223  */
224  if (hash_get_num_entries(hash) == 0)
225  {
227  return;
228  }
229 
230  /*
231  * Now, make a second pass and remove anything that matches.
232  */
233  dbspace_dir = AllocateDir(dbspacedirname);
234  while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
235  {
236  ForkNumber forkNum;
237  unsigned segno;
239 
240  /* Skip anything that doesn't look like a relation data file. */
242  &ent.relnumber,
243  &forkNum, &segno))
244  continue;
245 
246  /* We never remove the init fork. */
247  if (forkNum == INIT_FORKNUM)
248  continue;
249 
250  /*
251  * See whether the OID portion of the name shows up in the hash
252  * table. If so, nuke it!
253  */
254  if (hash_search(hash, &ent, HASH_FIND, NULL))
255  {
256  snprintf(rm_path, sizeof(rm_path), "%s/%s",
257  dbspacedirname, de->d_name);
258  if (unlink(rm_path) < 0)
259  ereport(ERROR,
261  errmsg("could not remove file \"%s\": %m",
262  rm_path)));
263  else
264  elog(DEBUG2, "unlinked file \"%s\"", rm_path);
265  }
266  }
267 
268  /* Cleanup is complete. */
269  FreeDir(dbspace_dir);
271  }
272 
273  /*
274  * Initialization happens after cleanup is complete: we copy each init
275  * fork file to the corresponding main fork file. Note that if we are
276  * asked to do both cleanup and init, we may never get here: if the
277  * cleanup code determines that there are no init forks in this dbspace,
278  * it will return before we get to this point.
279  */
280  if ((op & UNLOGGED_RELATION_INIT) != 0)
281  {
282  /* Scan the directory. */
283  dbspace_dir = AllocateDir(dbspacedirname);
284  while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
285  {
286  ForkNumber forkNum;
287  RelFileNumber relNumber;
288  unsigned segno;
289  char srcpath[MAXPGPATH * 2];
290  char dstpath[MAXPGPATH];
291 
292  /* Skip anything that doesn't look like a relation data file. */
293  if (!parse_filename_for_nontemp_relation(de->d_name, &relNumber,
294  &forkNum, &segno))
295  continue;
296 
297  /* Also skip it unless this is the init fork. */
298  if (forkNum != INIT_FORKNUM)
299  continue;
300 
301  /* Construct source pathname. */
302  snprintf(srcpath, sizeof(srcpath), "%s/%s",
303  dbspacedirname, de->d_name);
304 
305  /* Construct destination pathname. */
306  if (segno == 0)
307  snprintf(dstpath, sizeof(dstpath), "%s/%u",
308  dbspacedirname, relNumber);
309  else
310  snprintf(dstpath, sizeof(dstpath), "%s/%u.%u",
311  dbspacedirname, relNumber, segno);
312 
313  /* OK, we're ready to perform the actual copy. */
314  elog(DEBUG2, "copying %s to %s", srcpath, dstpath);
315  copy_file(srcpath, dstpath);
316  }
317 
318  FreeDir(dbspace_dir);
319 
320  /*
321  * copy_file() above has already called pg_flush_data() on the files
322  * it created. Now we need to fsync those files, because a checkpoint
323  * won't do it for us while we're in recovery. We do this in a
324  * separate pass to allow the kernel to perform all the flushes
325  * (especially the metadata ones) at once.
326  */
327  dbspace_dir = AllocateDir(dbspacedirname);
328  while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
329  {
330  RelFileNumber relNumber;
331  ForkNumber forkNum;
332  unsigned segno;
333  char mainpath[MAXPGPATH];
334 
335  /* Skip anything that doesn't look like a relation data file. */
336  if (!parse_filename_for_nontemp_relation(de->d_name, &relNumber,
337  &forkNum, &segno))
338  continue;
339 
340  /* Also skip it unless this is the init fork. */
341  if (forkNum != INIT_FORKNUM)
342  continue;
343 
344  /* Construct main fork pathname. */
345  if (segno == 0)
346  snprintf(mainpath, sizeof(mainpath), "%s/%u",
347  dbspacedirname, relNumber);
348  else
349  snprintf(mainpath, sizeof(mainpath), "%s/%u.%u",
350  dbspacedirname, relNumber, segno);
351 
352  fsync_fname(mainpath, false);
353  }
354 
355  FreeDir(dbspace_dir);
356 
357  /*
358  * Lastly, fsync the database directory itself, ensuring the
359  * filesystem remembers the file creations and deletions we've done.
360  * We don't bother with this during a call that does only
361  * UNLOGGED_RELATION_CLEANUP, because if recovery crashes before we
362  * get to doing UNLOGGED_RELATION_INIT, we'll redo the cleanup step
363  * too at the next startup attempt.
364  */
365  fsync_fname(dbspacedirname, true);
366  }
367 }
368 
369 /*
370  * Basic parsing of putative relation filenames.
371  *
372  * This function returns true if the file appears to be in the correct format
373  * for a non-temporary relation and false otherwise.
374  *
375  * If it returns true, it sets *relnumber, *fork, and *segno to the values
376  * extracted from the filename. If it returns false, these values are set to
377  * InvalidRelFileNumber, InvalidForkNumber, and 0, respectively.
378  */
379 bool
381  ForkNumber *fork, unsigned *segno)
382 {
383  unsigned long n,
384  s;
385  ForkNumber f;
386  char *endp;
387 
388  *relnumber = InvalidRelFileNumber;
389  *fork = InvalidForkNumber;
390  *segno = 0;
391 
392  /*
393  * Relation filenames should begin with a digit that is not a zero. By
394  * rejecting cases involving leading zeroes, the caller can assume that
395  * there's only one possible string of characters that could have produced
396  * any given value for *relnumber.
397  *
398  * (To be clear, we don't expect files with names like 0017.3 to exist at
399  * all -- but if 0017.3 does exist, it's a non-relation file, not part of
400  * the main fork for relfilenode 17.)
401  */
402  if (name[0] < '1' || name[0] > '9')
403  return false;
404 
405  /*
406  * Parse the leading digit string. If the value is out of range, we
407  * conclude that this isn't a relation file at all.
408  */
409  errno = 0;
410  n = strtoul(name, &endp, 10);
411  if (errno || name == endp || n <= 0 || n > PG_UINT32_MAX)
412  return false;
413  name = endp;
414 
415  /* Check for a fork name. */
416  if (*name != '_')
417  f = MAIN_FORKNUM;
418  else
419  {
420  int forkchar;
421 
422  forkchar = forkname_chars(name + 1, &f);
423  if (forkchar <= 0)
424  return false;
425  name += forkchar + 1;
426  }
427 
428  /* Check for a segment number. */
429  if (*name != '.')
430  s = 0;
431  else
432  {
433  /* Reject leading zeroes, just like we do for RelFileNumber. */
434  if (name[1] < '1' || name[1] > '9')
435  return false;
436 
437  errno = 0;
438  s = strtoul(name + 1, &endp, 10);
439  if (errno || name + 1 == endp || s <= 0 || s > PG_UINT32_MAX)
440  return false;
441  name = endp;
442  }
443 
444  /* Now we should be at the end. */
445  if (*name != '\0')
446  return false;
447 
448  /* Set out parameters and return. */
449  *relnumber = (RelFileNumber) n;
450  *fork = f;
451  *segno = (unsigned) s;
452  return true;
453 }
void begin_startup_progress_phase(void)
Definition: startup.c:343
#define PG_UINT32_MAX
Definition: c.h:544
#define Assert(condition)
Definition: c.h:812
void copy_file(const char *fromfile, const char *tofile)
Definition: copydir.c:117
void hash_destroy(HTAB *hashp)
Definition: dynahash.c:865
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:955
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:352
long hash_get_num_entries(HTAB *hashp)
Definition: dynahash.c:1341
int errcode_for_file_access(void)
Definition: elog.c:876
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define LOG
Definition: elog.h:31
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
struct dirent * ReadDir(DIR *dir, const char *dirname)
Definition: fd.c:2931
int FreeDir(DIR *dir)
Definition: fd.c:2983
void fsync_fname(const char *fname, bool isdir)
Definition: fd.c:755
DIR * AllocateDir(const char *dirname)
Definition: fd.c:2865
static char dstpath[MAXPGPATH]
Definition: file_ops.c:32
@ HASH_FIND
Definition: hsearch.h:113
@ HASH_ENTER
Definition: hsearch.h:114
#define HASH_CONTEXT
Definition: hsearch.h:102
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
MemoryContext CurrentMemoryContext
Definition: mcxt.c:143
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:454
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:160
#define MAXPGPATH
#define snprintf
Definition: port.h:238
unsigned int Oid
Definition: postgres_ext.h:31
MemoryContextSwitchTo(old_ctx)
tree ctl
Definition: radixtree.h:1855
static unsigned hash(unsigned *uv, int n)
Definition: rege_dfa.c:715
static void ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op)
Definition: reinit.c:106
void ResetUnloggedRelations(int op)
Definition: reinit.c:47
bool parse_filename_for_nontemp_relation(const char *name, RelFileNumber *relnumber, ForkNumber *fork, unsigned *segno)
Definition: reinit.c:380
static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
Definition: reinit.c:161
#define UNLOGGED_RELATION_INIT
Definition: reinit.h:28
#define UNLOGGED_RELATION_CLEANUP
Definition: reinit.h:27
int forkname_chars(const char *str, ForkNumber *fork)
Definition: relpath.c:81
Oid RelFileNumber
Definition: relpath.h:25
ForkNumber
Definition: relpath.h:56
@ MAIN_FORKNUM
Definition: relpath.h:58
@ InvalidForkNumber
Definition: relpath.h:57
@ INIT_FORKNUM
Definition: relpath.h:61
#define InvalidRelFileNumber
Definition: relpath.h:26
#define PG_TBLSPC_DIR
Definition: relpath.h:41
#define TABLESPACE_VERSION_DIRECTORY
Definition: relpath.h:33
#define ereport_startup_progress(msg,...)
Definition: startup.h:18
Definition: dirent.c:26
Definition: dynahash.c:220
Definition: dirent.h:10
char d_name[MAX_PATH]
Definition: dirent.h:15
Definition: reinit.c:33
RelFileNumber relnumber
Definition: reinit.c:34
const char * name