PostgreSQL Source Code  git master
relmapper.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * relmapper.c
4  * Catalog-to-filenode mapping
5  *
6  * For most tables, the physical file underlying the table is specified by
7  * pg_class.relfilenode. However, that obviously won't work for pg_class
8  * itself, nor for the other "nailed" catalogs for which we have to be able
9  * to set up working Relation entries without access to pg_class. It also
10  * does not work for shared catalogs, since there is no practical way to
11  * update other databases' pg_class entries when relocating a shared catalog.
12  * Therefore, for these special catalogs (henceforth referred to as "mapped
13  * catalogs") we rely on a separately maintained file that shows the mapping
14  * from catalog OIDs to filenode numbers. Each database has a map file for
15  * its local mapped catalogs, and there is a separate map file for shared
16  * catalogs. Mapped catalogs have zero in their pg_class.relfilenode entries.
17  *
18  * Relocation of a normal table is committed (ie, the new physical file becomes
19  * authoritative) when the pg_class row update commits. For mapped catalogs,
20  * the act of updating the map file is effectively commit of the relocation.
21  * We postpone the file update till just before commit of the transaction
22  * doing the rewrite, but there is necessarily a window between. Therefore
23  * mapped catalogs can only be relocated by operations such as VACUUM FULL
24  * and CLUSTER, which make no transactionally-significant changes: it must be
25  * safe for the new file to replace the old, even if the transaction itself
26  * aborts. An important factor here is that the indexes and toast table of
27  * a mapped catalog must also be mapped, so that the rewrites/relocations of
28  * all these files commit in a single map file update rather than being tied
29  * to transaction commit.
30  *
31  * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group
32  * Portions Copyright (c) 1994, Regents of the University of California
33  *
34  *
35  * IDENTIFICATION
36  * src/backend/utils/cache/relmapper.c
37  *
38  *-------------------------------------------------------------------------
39  */
40 #include "postgres.h"
41 
42 #include <fcntl.h>
43 #include <sys/stat.h>
44 #include <unistd.h>
45 
46 #include "access/xact.h"
47 #include "access/xlog.h"
48 #include "access/xloginsert.h"
49 #include "catalog/catalog.h"
50 #include "catalog/pg_tablespace.h"
51 #include "catalog/storage.h"
52 #include "miscadmin.h"
53 #include "pgstat.h"
54 #include "storage/fd.h"
55 #include "storage/lwlock.h"
56 #include "utils/inval.h"
57 #include "utils/relmapper.h"
58 
59 
60 /*
61  * The map file is critical data: we have no automatic method for recovering
62  * from loss or corruption of it. We use a CRC so that we can detect
63  * corruption. To minimize the risk of failed updates, the map file should
64  * be kept to no more than one standard-size disk sector (ie 512 bytes),
65  * and we use overwrite-in-place rather than playing renaming games.
66  * The struct layout below is designed to occupy exactly 512 bytes, which
67  * might make filesystem updates a bit more efficient.
68  *
69  * Entries in the mappings[] array are in no particular order. We could
70  * speed searching by insisting on OID order, but it really shouldn't be
71  * worth the trouble given the intended size of the mapping sets.
72  */
73 #define RELMAPPER_FILENAME "pg_filenode.map"
74 
75 #define RELMAPPER_FILEMAGIC 0x592717 /* version ID value */
76 
77 #define MAX_MAPPINGS 62 /* 62 * 8 + 16 = 512 */
78 
79 typedef struct RelMapping
80 {
81  Oid mapoid; /* OID of a catalog */
82  Oid mapfilenode; /* its filenode number */
84 
85 typedef struct RelMapFile
86 {
87  int32 magic; /* always RELMAPPER_FILEMAGIC */
88  int32 num_mappings; /* number of valid RelMapping entries */
90  pg_crc32c crc; /* CRC of all above */
91  int32 pad; /* to make the struct size be 512 exactly */
93 
94 /*
95  * State for serializing local and shared relmappings for parallel workers
96  * (active states only). See notes on active_* and pending_* updates state.
97  */
99 {
103 
104 /*
105  * The currently known contents of the shared map file and our database's
106  * local map file are stored here. These can be reloaded from disk
107  * immediately whenever we receive an update sinval message.
108  */
111 
112 /*
113  * We use the same RelMapFile data structure to track uncommitted local
114  * changes in the mappings (but note the magic and crc fields are not made
115  * valid in these variables). Currently, map updates are not allowed within
116  * subtransactions, so one set of transaction-level changes is sufficient.
117  *
118  * The active_xxx variables contain updates that are valid in our transaction
119  * and should be honored by RelationMapOidToFilenode. The pending_xxx
120  * variables contain updates we have been told about that aren't active yet;
121  * they will become active at the next CommandCounterIncrement. This setup
122  * lets map updates act similarly to updates of pg_class rows, ie, they
123  * become visible only at the next CommandCounterIncrement boundary.
124  *
125  * Active shared and active local updates are serialized by the parallel
126  * infrastructure, and deserialized within parallel workers.
127  */
132 
133 
134 /* non-export function prototypes */
135 static void apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode,
136  bool add_okay);
137 static void merge_map_updates(RelMapFile *map, const RelMapFile *updates,
138  bool add_okay);
139 static void load_relmap_file(bool shared, bool lock_held);
140 static void read_relmap_file(RelMapFile *map, char *dbpath, bool lock_held,
141  int elevel);
142 static void write_relmap_file(RelMapFile *newmap, bool write_wal,
143  bool send_sinval, bool preserve_files,
144  Oid dbid, Oid tsid, const char *dbpath);
145 static void perform_relmap_update(bool shared, const RelMapFile *updates);
146 
147 
148 /*
149  * RelationMapOidToFilenode
150  *
151  * The raison d' etre ... given a relation OID, look up its filenode.
152  *
153  * Although shared and local relation OIDs should never overlap, the caller
154  * always knows which we need --- so pass that information to avoid useless
155  * searching.
156  *
157  * Returns InvalidOid if the OID is not known (which should never happen,
158  * but the caller is in a better position to report a meaningful error).
159  */
160 Oid
161 RelationMapOidToFilenode(Oid relationId, bool shared)
162 {
163  const RelMapFile *map;
164  int32 i;
165 
166  /* If there are active updates, believe those over the main maps */
167  if (shared)
168  {
169  map = &active_shared_updates;
170  for (i = 0; i < map->num_mappings; i++)
171  {
172  if (relationId == map->mappings[i].mapoid)
173  return map->mappings[i].mapfilenode;
174  }
175  map = &shared_map;
176  for (i = 0; i < map->num_mappings; i++)
177  {
178  if (relationId == map->mappings[i].mapoid)
179  return map->mappings[i].mapfilenode;
180  }
181  }
182  else
183  {
184  map = &active_local_updates;
185  for (i = 0; i < map->num_mappings; i++)
186  {
187  if (relationId == map->mappings[i].mapoid)
188  return map->mappings[i].mapfilenode;
189  }
190  map = &local_map;
191  for (i = 0; i < map->num_mappings; i++)
192  {
193  if (relationId == map->mappings[i].mapoid)
194  return map->mappings[i].mapfilenode;
195  }
196  }
197 
198  return InvalidOid;
199 }
200 
201 /*
202  * RelationMapFilenodeToOid
203  *
204  * Do the reverse of the normal direction of mapping done in
205  * RelationMapOidToFilenode.
206  *
207  * This is not supposed to be used during normal running but rather for
208  * information purposes when looking at the filesystem or xlog.
209  *
210  * Returns InvalidOid if the OID is not known; this can easily happen if the
211  * relfilenode doesn't pertain to a mapped relation.
212  */
213 Oid
214 RelationMapFilenodeToOid(Oid filenode, bool shared)
215 {
216  const RelMapFile *map;
217  int32 i;
218 
219  /* If there are active updates, believe those over the main maps */
220  if (shared)
221  {
222  map = &active_shared_updates;
223  for (i = 0; i < map->num_mappings; i++)
224  {
225  if (filenode == map->mappings[i].mapfilenode)
226  return map->mappings[i].mapoid;
227  }
228  map = &shared_map;
229  for (i = 0; i < map->num_mappings; i++)
230  {
231  if (filenode == map->mappings[i].mapfilenode)
232  return map->mappings[i].mapoid;
233  }
234  }
235  else
236  {
237  map = &active_local_updates;
238  for (i = 0; i < map->num_mappings; i++)
239  {
240  if (filenode == map->mappings[i].mapfilenode)
241  return map->mappings[i].mapoid;
242  }
243  map = &local_map;
244  for (i = 0; i < map->num_mappings; i++)
245  {
246  if (filenode == map->mappings[i].mapfilenode)
247  return map->mappings[i].mapoid;
248  }
249  }
250 
251  return InvalidOid;
252 }
253 
254 /*
255  * RelationMapOidToFilenodeForDatabase
256  *
257  * Like RelationMapOidToFilenode, but reads the mapping from the indicated
258  * path instead of using the one for the current database.
259  */
260 Oid
261 RelationMapOidToFilenodeForDatabase(char *dbpath, Oid relationId)
262 {
263  RelMapFile map;
264  int i;
265 
266  /* Read the relmap file from the source database. */
267  read_relmap_file(&map, dbpath, false, ERROR);
268 
269  /* Iterate over the relmap entries to find the input relation OID. */
270  for (i = 0; i < map.num_mappings; i++)
271  {
272  if (relationId == map.mappings[i].mapoid)
273  return map.mappings[i].mapfilenode;
274  }
275 
276  return InvalidOid;
277 }
278 
279 /*
280  * RelationMapCopy
281  *
282  * Copy relmapfile from source db path to the destination db path and WAL log
283  * the operation. This is intended for use in creating a new relmap file
284  * for a database that doesn't have one yet, not for replacing an existing
285  * relmap file.
286  */
287 void
288 RelationMapCopy(Oid dbid, Oid tsid, char *srcdbpath, char *dstdbpath)
289 {
290  RelMapFile map;
291 
292  /*
293  * Read the relmap file from the source database.
294  */
295  read_relmap_file(&map, srcdbpath, false, ERROR);
296 
297  /*
298  * Write the same data into the destination database's relmap file.
299  *
300  * No sinval is needed because no one can be connected to the destination
301  * database yet. For the same reason, there is no need to acquire
302  * RelationMappingLock.
303  *
304  * There's no point in trying to preserve files here. The new database
305  * isn't usable yet anyway, and won't ever be if we can't install a relmap
306  * file.
307  */
308  write_relmap_file(&map, true, false, false, dbid, tsid, dstdbpath);
309 }
310 
311 /*
312  * RelationMapUpdateMap
313  *
314  * Install a new relfilenode mapping for the specified relation.
315  *
316  * If immediate is true (or we're bootstrapping), the mapping is activated
317  * immediately. Otherwise it is made pending until CommandCounterIncrement.
318  */
319 void
320 RelationMapUpdateMap(Oid relationId, Oid fileNode, bool shared,
321  bool immediate)
322 {
323  RelMapFile *map;
324 
326  {
327  /*
328  * In bootstrap mode, the mapping gets installed in permanent map.
329  */
330  if (shared)
331  map = &shared_map;
332  else
333  map = &local_map;
334  }
335  else
336  {
337  /*
338  * We don't currently support map changes within subtransactions, or
339  * when in parallel mode. This could be done with more bookkeeping
340  * infrastructure, but it doesn't presently seem worth it.
341  */
343  elog(ERROR, "cannot change relation mapping within subtransaction");
344 
345  if (IsInParallelMode())
346  elog(ERROR, "cannot change relation mapping in parallel mode");
347 
348  if (immediate)
349  {
350  /* Make it active, but only locally */
351  if (shared)
352  map = &active_shared_updates;
353  else
354  map = &active_local_updates;
355  }
356  else
357  {
358  /* Make it pending */
359  if (shared)
360  map = &pending_shared_updates;
361  else
362  map = &pending_local_updates;
363  }
364  }
365  apply_map_update(map, relationId, fileNode, true);
366 }
367 
368 /*
369  * apply_map_update
370  *
371  * Insert a new mapping into the given map variable, replacing any existing
372  * mapping for the same relation.
373  *
374  * In some cases the caller knows there must be an existing mapping; pass
375  * add_okay = false to draw an error if not.
376  */
377 static void
378 apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode, bool add_okay)
379 {
380  int32 i;
381 
382  /* Replace any existing mapping */
383  for (i = 0; i < map->num_mappings; i++)
384  {
385  if (relationId == map->mappings[i].mapoid)
386  {
387  map->mappings[i].mapfilenode = fileNode;
388  return;
389  }
390  }
391 
392  /* Nope, need to add a new mapping */
393  if (!add_okay)
394  elog(ERROR, "attempt to apply a mapping to unmapped relation %u",
395  relationId);
396  if (map->num_mappings >= MAX_MAPPINGS)
397  elog(ERROR, "ran out of space in relation map");
398  map->mappings[map->num_mappings].mapoid = relationId;
399  map->mappings[map->num_mappings].mapfilenode = fileNode;
400  map->num_mappings++;
401 }
402 
403 /*
404  * merge_map_updates
405  *
406  * Merge all the updates in the given pending-update map into the target map.
407  * This is just a bulk form of apply_map_update.
408  */
409 static void
410 merge_map_updates(RelMapFile *map, const RelMapFile *updates, bool add_okay)
411 {
412  int32 i;
413 
414  for (i = 0; i < updates->num_mappings; i++)
415  {
416  apply_map_update(map,
417  updates->mappings[i].mapoid,
418  updates->mappings[i].mapfilenode,
419  add_okay);
420  }
421 }
422 
423 /*
424  * RelationMapRemoveMapping
425  *
426  * Remove a relation's entry in the map. This is only allowed for "active"
427  * (but not committed) local mappings. We need it so we can back out the
428  * entry for the transient target file when doing VACUUM FULL/CLUSTER on
429  * a mapped relation.
430  */
431 void
433 {
435  int32 i;
436 
437  for (i = 0; i < map->num_mappings; i++)
438  {
439  if (relationId == map->mappings[i].mapoid)
440  {
441  /* Found it, collapse it out */
442  map->mappings[i] = map->mappings[map->num_mappings - 1];
443  map->num_mappings--;
444  return;
445  }
446  }
447  elog(ERROR, "could not find temporary mapping for relation %u",
448  relationId);
449 }
450 
451 /*
452  * RelationMapInvalidate
453  *
454  * This routine is invoked for SI cache flush messages. We must re-read
455  * the indicated map file. However, we might receive a SI message in a
456  * process that hasn't yet, and might never, load the mapping files;
457  * for example the autovacuum launcher, which *must not* try to read
458  * a local map since it is attached to no particular database.
459  * So, re-read only if the map is valid now.
460  */
461 void
463 {
464  if (shared)
465  {
467  load_relmap_file(true, false);
468  }
469  else
470  {
472  load_relmap_file(false, false);
473  }
474 }
475 
476 /*
477  * RelationMapInvalidateAll
478  *
479  * Reload all map files. This is used to recover from SI message buffer
480  * overflow: we can't be sure if we missed an inval message.
481  * Again, reload only currently-valid maps.
482  */
483 void
485 {
487  load_relmap_file(true, false);
489  load_relmap_file(false, false);
490 }
491 
492 /*
493  * AtCCI_RelationMap
494  *
495  * Activate any "pending" relation map updates at CommandCounterIncrement time.
496  */
497 void
499 {
501  {
504  true);
506  }
508  {
511  true);
513  }
514 }
515 
516 /*
517  * AtEOXact_RelationMap
518  *
519  * Handle relation mapping at main-transaction commit or abort.
520  *
521  * During commit, this must be called as late as possible before the actual
522  * transaction commit, so as to minimize the window where the transaction
523  * could still roll back after committing map changes. Although nothing
524  * critically bad happens in such a case, we still would prefer that it
525  * not happen, since we'd possibly be losing useful updates to the relations'
526  * pg_class row(s).
527  *
528  * During abort, we just have to throw away any pending map changes.
529  * Normal post-abort cleanup will take care of fixing relcache entries.
530  * Parallel worker commit/abort is handled by resetting active mappings
531  * that may have been received from the leader process. (There should be
532  * no pending updates in parallel workers.)
533  */
534 void
535 AtEOXact_RelationMap(bool isCommit, bool isParallelWorker)
536 {
537  if (isCommit && !isParallelWorker)
538  {
539  /*
540  * We should not get here with any "pending" updates. (We could
541  * logically choose to treat such as committed, but in the current
542  * code this should never happen.)
543  */
546 
547  /*
548  * Write any active updates to the actual map files, then reset them.
549  */
551  {
554  }
556  {
559  }
560  }
561  else
562  {
563  /* Abort or parallel worker --- drop all local and pending updates */
564  Assert(!isParallelWorker || pending_shared_updates.num_mappings == 0);
565  Assert(!isParallelWorker || pending_local_updates.num_mappings == 0);
566 
571  }
572 }
573 
574 /*
575  * AtPrepare_RelationMap
576  *
577  * Handle relation mapping at PREPARE.
578  *
579  * Currently, we don't support preparing any transaction that changes the map.
580  */
581 void
583 {
588  ereport(ERROR,
589  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
590  errmsg("cannot PREPARE a transaction that modified relation mapping")));
591 }
592 
593 /*
594  * CheckPointRelationMap
595  *
596  * This is called during a checkpoint. It must ensure that any relation map
597  * updates that were WAL-logged before the start of the checkpoint are
598  * securely flushed to disk and will not need to be replayed later. This
599  * seems unlikely to be a performance-critical issue, so we use a simple
600  * method: we just take and release the RelationMappingLock. This ensures
601  * that any already-logged map update is complete, because write_relmap_file
602  * will fsync the map file before the lock is released.
603  */
604 void
606 {
607  LWLockAcquire(RelationMappingLock, LW_SHARED);
608  LWLockRelease(RelationMappingLock);
609 }
610 
611 /*
612  * RelationMapFinishBootstrap
613  *
614  * Write out the initial relation mapping files at the completion of
615  * bootstrap. All the mapped files should have been made known to us
616  * via RelationMapUpdateMap calls.
617  */
618 void
620 {
622 
623  /* Shouldn't be anything "pending" ... */
628 
629  /* Write the files; no WAL or sinval needed */
630  write_relmap_file(&shared_map, false, false, false,
631  InvalidOid, GLOBALTABLESPACE_OID, "global");
632  write_relmap_file(&local_map, false, false, false,
634 }
635 
636 /*
637  * RelationMapInitialize
638  *
639  * This initializes the mapper module at process startup. We can't access the
640  * database yet, so just make sure the maps are empty.
641  */
642 void
644 {
645  /* The static variables should initialize to zeroes, but let's be sure */
646  shared_map.magic = 0; /* mark it not loaded */
647  local_map.magic = 0;
654 }
655 
656 /*
657  * RelationMapInitializePhase2
658  *
659  * This is called to prepare for access to pg_database during startup.
660  * We should be able to read the shared map file now.
661  */
662 void
664 {
665  /*
666  * In bootstrap mode, the map file isn't there yet, so do nothing.
667  */
669  return;
670 
671  /*
672  * Load the shared map file, die on error.
673  */
674  load_relmap_file(true, false);
675 }
676 
677 /*
678  * RelationMapInitializePhase3
679  *
680  * This is called as soon as we have determined MyDatabaseId and set up
681  * DatabasePath. At this point we should be able to read the local map file.
682  */
683 void
685 {
686  /*
687  * In bootstrap mode, the map file isn't there yet, so do nothing.
688  */
690  return;
691 
692  /*
693  * Load the local map file, die on error.
694  */
695  load_relmap_file(false, false);
696 }
697 
698 /*
699  * EstimateRelationMapSpace
700  *
701  * Estimate space needed to pass active shared and local relmaps to parallel
702  * workers.
703  */
704 Size
706 {
707  return sizeof(SerializedActiveRelMaps);
708 }
709 
710 /*
711  * SerializeRelationMap
712  *
713  * Serialize active shared and local relmap state for parallel workers.
714  */
715 void
716 SerializeRelationMap(Size maxSize, char *startAddress)
717 {
718  SerializedActiveRelMaps *relmaps;
719 
720  Assert(maxSize >= EstimateRelationMapSpace());
721 
722  relmaps = (SerializedActiveRelMaps *) startAddress;
725 }
726 
727 /*
728  * RestoreRelationMap
729  *
730  * Restore active shared and local relmap state within a parallel worker.
731  */
732 void
733 RestoreRelationMap(char *startAddress)
734 {
735  SerializedActiveRelMaps *relmaps;
736 
741  elog(ERROR, "parallel worker has existing mappings");
742 
743  relmaps = (SerializedActiveRelMaps *) startAddress;
746 }
747 
748 /*
749  * load_relmap_file -- load the shared or local map file
750  *
751  * Because these files are essential for access to core system catalogs,
752  * failure to load either of them is a fatal error.
753  *
754  * Note that the local case requires DatabasePath to be set up.
755  */
756 static void
757 load_relmap_file(bool shared, bool lock_held)
758 {
759  if (shared)
760  read_relmap_file(&shared_map, "global", lock_held, FATAL);
761  else
763 }
764 
765 /*
766  * read_relmap_file -- load data from any relation mapper file
767  *
768  * dbpath must be the relevant database path, or "global" for shared relations.
769  *
770  * RelationMappingLock will be acquired and released unless lock_held = true.
771  *
772  * Errors will be reported at the indicated elevel, which should be at least
773  * ERROR.
774  */
775 static void
776 read_relmap_file(RelMapFile *map, char *dbpath, bool lock_held, int elevel)
777 {
778  char mapfilename[MAXPGPATH];
779  pg_crc32c crc;
780  int fd;
781  int r;
782 
783  Assert(elevel >= ERROR);
784 
785  /* Open the target file. */
786  snprintf(mapfilename, sizeof(mapfilename), "%s/%s", dbpath,
788  fd = OpenTransientFile(mapfilename, O_RDONLY | PG_BINARY);
789  if (fd < 0)
790  ereport(elevel,
792  errmsg("could not open file \"%s\": %m",
793  mapfilename)));
794 
795  /*
796  * Grab the lock to prevent the file from being updated while we read it,
797  * unless the caller is already holding the lock. If the file is updated
798  * shortly after we look, the sinval signaling mechanism will make us
799  * re-read it before we are able to access any relation that's affected by
800  * the change.
801  */
802  if (!lock_held)
803  LWLockAcquire(RelationMappingLock, LW_SHARED);
804 
805  /* Now read the data. */
807  r = read(fd, map, sizeof(RelMapFile));
808  if (r != sizeof(RelMapFile))
809  {
810  if (r < 0)
811  ereport(elevel,
813  errmsg("could not read file \"%s\": %m", mapfilename)));
814  else
815  ereport(elevel,
817  errmsg("could not read file \"%s\": read %d of %zu",
818  mapfilename, r, sizeof(RelMapFile))));
819  }
821 
822  if (!lock_held)
823  LWLockRelease(RelationMappingLock);
824 
825  if (CloseTransientFile(fd) != 0)
826  ereport(elevel,
828  errmsg("could not close file \"%s\": %m",
829  mapfilename)));
830 
831  /* check for correct magic number, etc */
832  if (map->magic != RELMAPPER_FILEMAGIC ||
833  map->num_mappings < 0 ||
834  map->num_mappings > MAX_MAPPINGS)
835  ereport(elevel,
836  (errmsg("relation mapping file \"%s\" contains invalid data",
837  mapfilename)));
838 
839  /* verify the CRC */
840  INIT_CRC32C(crc);
841  COMP_CRC32C(crc, (char *) map, offsetof(RelMapFile, crc));
842  FIN_CRC32C(crc);
843 
844  if (!EQ_CRC32C(crc, map->crc))
845  ereport(elevel,
846  (errmsg("relation mapping file \"%s\" contains incorrect checksum",
847  mapfilename)));
848 }
849 
850 /*
851  * Write out a new shared or local map file with the given contents.
852  *
853  * The magic number and CRC are automatically updated in *newmap. On
854  * success, we copy the data to the appropriate permanent static variable.
855  *
856  * If write_wal is true then an appropriate WAL message is emitted.
857  * (It will be false for bootstrap and WAL replay cases.)
858  *
859  * If send_sinval is true then a SI invalidation message is sent.
860  * (This should be true except in bootstrap case.)
861  *
862  * If preserve_files is true then the storage manager is warned not to
863  * delete the files listed in the map.
864  *
865  * Because this may be called during WAL replay when MyDatabaseId,
866  * DatabasePath, etc aren't valid, we require the caller to pass in suitable
867  * values. Pass dbpath as "global" for the shared map.
868  *
869  * The caller is also responsible for being sure no concurrent map update
870  * could be happening.
871  */
872 static void
873 write_relmap_file(RelMapFile *newmap, bool write_wal, bool send_sinval,
874  bool preserve_files, Oid dbid, Oid tsid, const char *dbpath)
875 {
876  int fd;
877  char mapfilename[MAXPGPATH];
878 
879  /*
880  * Fill in the overhead fields and update CRC.
881  */
882  newmap->magic = RELMAPPER_FILEMAGIC;
883  if (newmap->num_mappings < 0 || newmap->num_mappings > MAX_MAPPINGS)
884  elog(ERROR, "attempt to write bogus relation mapping");
885 
886  INIT_CRC32C(newmap->crc);
887  COMP_CRC32C(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc));
888  FIN_CRC32C(newmap->crc);
889 
890  /*
891  * Open the target file. We prefer to do this before entering the
892  * critical section, so that an open() failure need not force PANIC.
893  */
894  snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
895  dbpath, RELMAPPER_FILENAME);
896  fd = OpenTransientFile(mapfilename, O_WRONLY | O_CREAT | PG_BINARY);
897  if (fd < 0)
898  ereport(ERROR,
900  errmsg("could not open file \"%s\": %m",
901  mapfilename)));
902 
903  if (write_wal)
904  {
905  xl_relmap_update xlrec;
906  XLogRecPtr lsn;
907 
908  /* now errors are fatal ... */
910 
911  xlrec.dbid = dbid;
912  xlrec.tsid = tsid;
913  xlrec.nbytes = sizeof(RelMapFile);
914 
915  XLogBeginInsert();
916  XLogRegisterData((char *) (&xlrec), MinSizeOfRelmapUpdate);
917  XLogRegisterData((char *) newmap, sizeof(RelMapFile));
918 
919  lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE);
920 
921  /* As always, WAL must hit the disk before the data update does */
922  XLogFlush(lsn);
923  }
924 
925  errno = 0;
927  if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
928  {
929  /* if write didn't set errno, assume problem is no disk space */
930  if (errno == 0)
931  errno = ENOSPC;
932  ereport(ERROR,
934  errmsg("could not write file \"%s\": %m",
935  mapfilename)));
936  }
938 
939  /*
940  * We choose to fsync the data to disk before considering the task done.
941  * It would be possible to relax this if it turns out to be a performance
942  * issue, but it would complicate checkpointing --- see notes for
943  * CheckPointRelationMap.
944  */
946  if (pg_fsync(fd) != 0)
949  errmsg("could not fsync file \"%s\": %m",
950  mapfilename)));
952 
953  if (CloseTransientFile(fd) != 0)
954  ereport(ERROR,
956  errmsg("could not close file \"%s\": %m",
957  mapfilename)));
958 
959  /*
960  * Now that the file is safely on disk, send sinval message to let other
961  * backends know to re-read it. We must do this inside the critical
962  * section: if for some reason we fail to send the message, we have to
963  * force a database-wide PANIC. Otherwise other backends might continue
964  * execution with stale mapping information, which would be catastrophic
965  * as soon as others began to use the now-committed data.
966  */
967  if (send_sinval)
968  CacheInvalidateRelmap(dbid);
969 
970  /*
971  * Make sure that the files listed in the map are not deleted if the outer
972  * transaction aborts. This had better be within the critical section
973  * too: it's not likely to fail, but if it did, we'd arrive at transaction
974  * abort with the files still vulnerable. PANICing will leave things in a
975  * good state on-disk.
976  *
977  * Note: we're cheating a little bit here by assuming that mapped files
978  * are either in pg_global or the database's default tablespace.
979  */
980  if (preserve_files)
981  {
982  int32 i;
983 
984  for (i = 0; i < newmap->num_mappings; i++)
985  {
986  RelFileNode rnode;
987 
988  rnode.spcNode = tsid;
989  rnode.dbNode = dbid;
990  rnode.relNode = newmap->mappings[i].mapfilenode;
991  RelationPreserveStorage(rnode, false);
992  }
993  }
994 
995  /* Critical section done */
996  if (write_wal)
998 }
999 
1000 /*
1001  * Merge the specified updates into the appropriate "real" map,
1002  * and write out the changes. This function must be used for committing
1003  * updates during normal multiuser operation.
1004  */
1005 static void
1006 perform_relmap_update(bool shared, const RelMapFile *updates)
1007 {
1008  RelMapFile newmap;
1009 
1010  /*
1011  * Anyone updating a relation's mapping info should take exclusive lock on
1012  * that rel and hold it until commit. This ensures that there will not be
1013  * concurrent updates on the same mapping value; but there could easily be
1014  * concurrent updates on different values in the same file. We cover that
1015  * by acquiring the RelationMappingLock, re-reading the target file to
1016  * ensure it's up to date, applying the updates, and writing the data
1017  * before releasing RelationMappingLock.
1018  *
1019  * There is only one RelationMappingLock. In principle we could try to
1020  * have one per mapping file, but it seems unlikely to be worth the
1021  * trouble.
1022  */
1023  LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
1024 
1025  /* Be certain we see any other updates just made */
1026  load_relmap_file(shared, true);
1027 
1028  /* Prepare updated data in a local variable */
1029  if (shared)
1030  memcpy(&newmap, &shared_map, sizeof(RelMapFile));
1031  else
1032  memcpy(&newmap, &local_map, sizeof(RelMapFile));
1033 
1034  /*
1035  * Apply the updates to newmap. No new mappings should appear, unless
1036  * somebody is adding indexes to system catalogs.
1037  */
1038  merge_map_updates(&newmap, updates, allowSystemTableMods);
1039 
1040  /* Write out the updated map and do other necessary tasks */
1041  write_relmap_file(&newmap, true, true, true,
1042  (shared ? InvalidOid : MyDatabaseId),
1043  (shared ? GLOBALTABLESPACE_OID : MyDatabaseTableSpace),
1044  (shared ? "global" : DatabasePath));
1045 
1046  /*
1047  * We successfully wrote the updated file, so it's now safe to rely on the
1048  * new values in this process, too.
1049  */
1050  if (shared)
1051  memcpy(&shared_map, &newmap, sizeof(RelMapFile));
1052  else
1053  memcpy(&local_map, &newmap, sizeof(RelMapFile));
1054 
1055  /* Now we can release the lock */
1056  LWLockRelease(RelationMappingLock);
1057 }
1058 
1059 /*
1060  * RELMAP resource manager's routines
1061  */
1062 void
1064 {
1065  uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
1066 
1067  /* Backup blocks are not used in relmap records */
1068  Assert(!XLogRecHasAnyBlockRefs(record));
1069 
1070  if (info == XLOG_RELMAP_UPDATE)
1071  {
1072  xl_relmap_update *xlrec = (xl_relmap_update *) XLogRecGetData(record);
1073  RelMapFile newmap;
1074  char *dbpath;
1075 
1076  if (xlrec->nbytes != sizeof(RelMapFile))
1077  elog(PANIC, "relmap_redo: wrong size %u in relmap update record",
1078  xlrec->nbytes);
1079  memcpy(&newmap, xlrec->data, sizeof(newmap));
1080 
1081  /* We need to construct the pathname for this database */
1082  dbpath = GetDatabasePath(xlrec->dbid, xlrec->tsid);
1083 
1084  /*
1085  * Write out the new map and send sinval, but of course don't write a
1086  * new WAL entry. There's no surrounding transaction to tell to
1087  * preserve files, either.
1088  *
1089  * There shouldn't be anyone else updating relmaps during WAL replay,
1090  * but grab the lock to interlock against load_relmap_file().
1091  *
1092  * Note that we use the same WAL record for updating the relmap of an
1093  * existing database as we do for creating a new database. In the
1094  * latter case, taking the relmap lock and sending sinval messages is
1095  * unnecessary, but harmless. If we wanted to avoid it, we could add a
1096  * flag to the WAL record to indicate which operation is being
1097  * performed.
1098  */
1099  LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
1100  write_relmap_file(&newmap, false, true, false,
1101  xlrec->dbid, xlrec->tsid, dbpath);
1102  LWLockRelease(RelationMappingLock);
1103 
1104  pfree(dbpath);
1105  }
1106  else
1107  elog(PANIC, "relmap_redo: unknown op code %u", info);
1108 }
#define offsetof(type, field)
Definition: c.h:727
signed int int32
Definition: c.h:429
#define PG_BINARY
Definition: c.h:1268
unsigned char uint8
Definition: c.h:439
size_t Size
Definition: c.h:540
int errcode_for_file_access(void)
Definition: elog.c:716
int errcode(int sqlerrcode)
Definition: elog.c:693
int errmsg(const char *fmt,...)
Definition: elog.c:904
#define FATAL
Definition: elog.h:35
#define PANIC
Definition: elog.h:36
#define ERROR
Definition: elog.h:33
#define elog(elevel,...)
Definition: elog.h:218
#define ereport(elevel,...)
Definition: elog.h:143
int CloseTransientFile(int fd)
Definition: fd.c:2688
int data_sync_elevel(int elevel)
Definition: fd.c:3826
int pg_fsync(int fd)
Definition: fd.c:359
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2511
bool allowSystemTableMods
Definition: globals.c:124
Oid MyDatabaseTableSpace
Definition: globals.c:91
char * DatabasePath
Definition: globals.c:97
Oid MyDatabaseId
Definition: globals.c:89
#define write(a, b, c)
Definition: win32.h:14
#define read(a, b, c)
Definition: win32.h:13
void CacheInvalidateRelmap(Oid databaseId)
Definition: inval.c:1492
int i
Definition: isn.c:73
Assert(fmt[strlen(fmt) - 1] !='\n')
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1196
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1800
@ LW_SHARED
Definition: lwlock.h:105
@ LW_EXCLUSIVE
Definition: lwlock.h:104
void pfree(void *pointer)
Definition: mcxt.c:1175
#define IsBootstrapProcessingMode()
Definition: miscadmin.h:406
#define START_CRIT_SECTION()
Definition: miscadmin.h:148
#define END_CRIT_SECTION()
Definition: miscadmin.h:150
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:43
#define MAXPGPATH
uint32 pg_crc32c
Definition: pg_crc32c.h:38
#define COMP_CRC32C(crc, data, len)
Definition: pg_crc32c.h:89
#define EQ_CRC32C(c1, c2)
Definition: pg_crc32c.h:42
#define INIT_CRC32C(crc)
Definition: pg_crc32c.h:41
#define FIN_CRC32C(crc)
Definition: pg_crc32c.h:94
return crc
#define snprintf
Definition: port.h:225
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
static int fd(const char *x, int i)
Definition: preproc-init.c:105
static RelMapFile pending_local_updates
Definition: relmapper.c:131
Size EstimateRelationMapSpace(void)
Definition: relmapper.c:705
void RelationMapRemoveMapping(Oid relationId)
Definition: relmapper.c:432
void RelationMapUpdateMap(Oid relationId, Oid fileNode, bool shared, bool immediate)
Definition: relmapper.c:320
struct RelMapping RelMapping
void SerializeRelationMap(Size maxSize, char *startAddress)
Definition: relmapper.c:716
#define RELMAPPER_FILEMAGIC
Definition: relmapper.c:75
void RelationMapCopy(Oid dbid, Oid tsid, char *srcdbpath, char *dstdbpath)
Definition: relmapper.c:288
static void write_relmap_file(RelMapFile *newmap, bool write_wal, bool send_sinval, bool preserve_files, Oid dbid, Oid tsid, const char *dbpath)
Definition: relmapper.c:873
Oid RelationMapFilenodeToOid(Oid filenode, bool shared)
Definition: relmapper.c:214
void RelationMapInvalidateAll(void)
Definition: relmapper.c:484
void RestoreRelationMap(char *startAddress)
Definition: relmapper.c:733
#define MAX_MAPPINGS
Definition: relmapper.c:77
static RelMapFile shared_map
Definition: relmapper.c:109
static RelMapFile active_local_updates
Definition: relmapper.c:129
static void perform_relmap_update(bool shared, const RelMapFile *updates)
Definition: relmapper.c:1006
static void read_relmap_file(RelMapFile *map, char *dbpath, bool lock_held, int elevel)
Definition: relmapper.c:776
Oid RelationMapOidToFilenode(Oid relationId, bool shared)
Definition: relmapper.c:161
void RelationMapInitialize(void)
Definition: relmapper.c:643
void AtPrepare_RelationMap(void)
Definition: relmapper.c:582
static RelMapFile local_map
Definition: relmapper.c:110
void relmap_redo(XLogReaderState *record)
Definition: relmapper.c:1063
#define RELMAPPER_FILENAME
Definition: relmapper.c:73
void AtEOXact_RelationMap(bool isCommit, bool isParallelWorker)
Definition: relmapper.c:535
void RelationMapInvalidate(bool shared)
Definition: relmapper.c:462
static RelMapFile pending_shared_updates
Definition: relmapper.c:130
void RelationMapInitializePhase2(void)
Definition: relmapper.c:663
void RelationMapFinishBootstrap(void)
Definition: relmapper.c:619
struct SerializedActiveRelMaps SerializedActiveRelMaps
Oid RelationMapOidToFilenodeForDatabase(char *dbpath, Oid relationId)
Definition: relmapper.c:261
static void apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode, bool add_okay)
Definition: relmapper.c:378
void RelationMapInitializePhase3(void)
Definition: relmapper.c:684
struct RelMapFile RelMapFile
void AtCCI_RelationMap(void)
Definition: relmapper.c:498
static void merge_map_updates(RelMapFile *map, const RelMapFile *updates, bool add_okay)
Definition: relmapper.c:410
void CheckPointRelationMap(void)
Definition: relmapper.c:605
static RelMapFile active_shared_updates
Definition: relmapper.c:128
static void load_relmap_file(bool shared, bool lock_held)
Definition: relmapper.c:757
#define XLOG_RELMAP_UPDATE
Definition: relmapper.h:25
#define MinSizeOfRelmapUpdate
Definition: relmapper.h:35
char * GetDatabasePath(Oid dbNode, Oid spcNode)
Definition: relpath.c:110
void RelationPreserveStorage(RelFileNode rnode, bool atCommit)
Definition: storage.c:250
RelMapping mappings[MAX_MAPPINGS]
Definition: relmapper.c:89
int32 magic
Definition: relmapper.c:87
pg_crc32c crc
Definition: relmapper.c:90
int32 pad
Definition: relmapper.c:91
int32 num_mappings
Definition: relmapper.c:88
Oid mapoid
Definition: relmapper.c:81
Oid mapfilenode
Definition: relmapper.c:82
RelMapFile active_local_updates
Definition: relmapper.c:101
RelMapFile active_shared_updates
Definition: relmapper.c:100
char data[FLEXIBLE_ARRAY_MEMBER]
Definition: relmapper.h:32
@ WAIT_EVENT_RELATION_MAP_WRITE
Definition: wait_event.h:197
@ WAIT_EVENT_RELATION_MAP_READ
Definition: wait_event.h:195
@ WAIT_EVENT_RELATION_MAP_SYNC
Definition: wait_event.h:196
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:266
static void pgstat_report_wait_end(void)
Definition: wait_event.h:282
int GetCurrentTransactionNestLevel(void)
Definition: xact.c:910
bool IsInParallelMode(void)
Definition: xact.c:1065
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2509
uint64 XLogRecPtr
Definition: xlogdefs.h:21
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition: xloginsert.c:443
void XLogBeginInsert(void)
Definition: xloginsert.c:150
void XLogRegisterData(char *data, int len)
Definition: xloginsert.c:351
#define XLogRecGetInfo(decoder)
Definition: xlogreader.h:408
#define XLogRecGetData(decoder)
Definition: xlogreader.h:413
#define XLogRecHasAnyBlockRefs(decoder)
Definition: xlogreader.h:415
#define XLR_INFO_MASK
Definition: xlogrecord.h:62