sysv_shmem.c
1 /*-------------------------------------------------------------------------
2  *
3  * sysv_shmem.c
4  * Implement shared memory using SysV facilities
5  *
6  * These routines used to be a fairly thin layer on top of SysV shared
7  * memory functionality. With the addition of anonymous-shmem logic,
8  * they're a bit fatter now. We still require a SysV shmem block to
9  * exist, though, because mmap'd shmem provides no way to find out how
10  * many processes are attached, which we need for interlocking purposes.
11  *
12  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
13  * Portions Copyright (c) 1994, Regents of the University of California
14  *
15  * IDENTIFICATION
16  * src/backend/port/sysv_shmem.c
17  *
18  *-------------------------------------------------------------------------
19  */
20 #include "postgres.h"
21 
22 #include <signal.h>
23 #include <unistd.h>
24 #include <sys/file.h>
25 #include <sys/ipc.h>
26 #include <sys/mman.h>
27 #include <sys/shm.h>
28 #include <sys/stat.h>
29 
30 #include "miscadmin.h"
31 #include "port/pg_bitutils.h"
32 #include "portability/mem.h"
33 #include "storage/dsm.h"
34 #include "storage/fd.h"
35 #include "storage/ipc.h"
36 #include "storage/pg_shmem.h"
37 #include "utils/guc_hooks.h"
38 #include "utils/pidfile.h"
39 
40 
41 /*
42  * As of PostgreSQL 9.3, we normally allocate only a very small amount of
43  * System V shared memory, and only for the purposes of providing an
44  * interlock to protect the data directory. The real shared memory block
45  * is allocated using mmap(). This works around the problem that many
46  * systems have very low limits on the amount of System V shared memory
47  * that can be allocated. Even a limit of a few megabytes will be enough
48  * to run many copies of PostgreSQL without needing to adjust system settings.
49  *
50  * We assume that no one will attempt to run PostgreSQL 9.3 or later on
51  * systems that are ancient enough that anonymous shared memory is not
52  * supported, such as pre-2.4 versions of Linux. If that turns out to be
53  * false, we might need to add compile and/or run-time tests here and do this
54  * only if the running kernel supports it.
55  *
56  * However, we must always disable this logic in the EXEC_BACKEND case, and
57  * fall back to the old method of allocating the entire segment using System V
58  * shared memory, because there's no way to attach an anonymous mmap'd segment
59  * to a process after exec(). Since EXEC_BACKEND is intended only for
60  * developer use, this shouldn't be a big problem. Because of this, we do
61  * not worry about supporting anonymous shmem in the EXEC_BACKEND cases below.
62  *
63  * As of PostgreSQL 12, we regained the ability to use a large System V shared
64  * memory region even in non-EXEC_BACKEND builds, if shared_memory_type is set
65  * to sysv (though this is not the default).
66  */
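
A minimal standalone sketch (not part of this file) of the hybrid scheme described above, assuming a POSIX system with MAP_ANONYMOUS: a tiny System V segment is kept purely so shm_nattch can later show whether anyone is still attached, while the real shared memory comes from an anonymous mmap(). The key and sizes below are made up for illustration.

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/mman.h>
#include <sys/shm.h>

int
main(void)
{
    key_t       demo_key = 5432001;             /* made-up key */
    size_t      real_size = 16 * 1024 * 1024;   /* the "real" shared block */

    /* Tiny SysV segment: kept only as an interlock (shm_nattch is checkable). */
    int         shmid = shmget(demo_key, 64, IPC_CREAT | IPC_EXCL | 0600);

    if (shmid < 0)
    {
        perror("shmget");
        return 1;
    }

    /* The large block comes from anonymous mmap(), sidestepping SHMMAX limits. */
    void       *mem = mmap(NULL, real_size, PROT_READ | PROT_WRITE,
                           MAP_SHARED | MAP_ANONYMOUS, -1, 0);

    if (mem == MAP_FAILED)
    {
        perror("mmap");
        shmctl(shmid, IPC_RMID, NULL);
        return 1;
    }

    /* ... fork() children here; they inherit both mappings ... */

    munmap(mem, real_size);
    shmctl(shmid, IPC_RMID, NULL);
    return 0;
}
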
67 
68 
69 typedef key_t IpcMemoryKey; /* shared memory key passed to shmget(2) */
70 typedef int IpcMemoryId; /* shared memory ID returned by shmget(2) */
71 
72 /*
73  * How does a given IpcMemoryId relate to this PostgreSQL process?
74  *
75  * One could recycle unattached segments of different data directories if we
76  * distinguished that case from other SHMSTATE_FOREIGN cases. Doing so would
77  * cause us to visit less of the key space, making us less likely to detect a
78  * SHMSTATE_ATTACHED key. It would also complicate the concurrency analysis,
79  * in that postmasters of different data directories could simultaneously
80  * attempt to recycle a given key. We'll waste keys longer in some cases, but
81  * avoiding the problems of the alternative justifies that loss.
82  */
83 typedef enum
84 {
85  SHMSTATE_ANALYSIS_FAILURE, /* unexpected failure to analyze the ID */
86  SHMSTATE_ATTACHED, /* pertinent to DataDir, has attached PIDs */
87  SHMSTATE_ENOENT, /* no segment of that ID */
88  SHMSTATE_FOREIGN, /* exists, but not pertinent to DataDir */
89  SHMSTATE_UNATTACHED /* pertinent to DataDir, no attached PIDs */
90 } IpcMemoryState;
91 
92 
93 unsigned long UsedShmemSegID = 0;
94 void *UsedShmemSegAddr = NULL;
95 
96 static Size AnonymousShmemSize;
97 static void *AnonymousShmem = NULL;
98 
99 static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
100 static void IpcMemoryDetach(int status, Datum shmaddr);
101 static void IpcMemoryDelete(int status, Datum shmId);
102 static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId,
103  void *attachAt,
104  PGShmemHeader **addr);
105 
106 
107 /*
108  * InternalIpcMemoryCreate(memKey, size)
109  *
110  * Attempt to create a new shared memory segment with the specified key.
111  * Will fail (return NULL) if such a segment already exists. If successful,
112  * attach the segment to the current process and return its attached address.
113  * On success, callbacks are registered with on_shmem_exit to detach and
114  * delete the segment when on_shmem_exit is called.
115  *
116  * If we fail with a failure code other than collision-with-existing-segment,
117  * print out an error and abort. Other types of errors are not recoverable.
118  */
119 static void *
120 InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size)
121 {
122  IpcMemoryId shmid;
123  void *requestedAddress = NULL;
124  void *memAddress;
125 
126  /*
127  * Normally we just pass requestedAddress = NULL to shmat(), allowing the
128  * system to choose where the segment gets mapped. But in an EXEC_BACKEND
129  * build, it's possible for whatever is chosen in the postmaster to not
130  * work for backends, due to variations in address space layout. As a
131  * rather klugy workaround, allow the user to specify the address to use
132  * via setting the environment variable PG_SHMEM_ADDR. (If this were of
133  * interest for anything except debugging, we'd probably create a cleaner
134  * and better-documented way to set it, such as a GUC.)
135  */
136 #ifdef EXEC_BACKEND
137  {
138  char *pg_shmem_addr = getenv("PG_SHMEM_ADDR");
139 
140  if (pg_shmem_addr)
141  requestedAddress = (void *) strtoul(pg_shmem_addr, NULL, 0);
142  else
143  {
144 #if defined(__darwin__) && SIZEOF_VOID_P == 8
145  /*
146  * Provide a default value that is believed to avoid problems with
147  * ASLR on the current macOS release.
148  */
149  requestedAddress = (void *) 0x80000000000;
150 #endif
151  }
152  }
153 #endif
154 
155  shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection);
156 
157  if (shmid < 0)
158  {
159  int shmget_errno = errno;
160 
161  /*
162  * Fail quietly if error indicates a collision with existing segment.
163  * One would expect EEXIST, given that we said IPC_EXCL, but perhaps
164  * we could get a permission violation instead? Also, EIDRM might
165  * occur if an old seg is slated for destruction but not gone yet.
166  */
167  if (shmget_errno == EEXIST || shmget_errno == EACCES
168 #ifdef EIDRM
169  || shmget_errno == EIDRM
170 #endif
171  )
172  return NULL;
173 
174  /*
175  * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if
176  * there is an existing segment but it's smaller than "size" (this is
177  * a result of poorly-thought-out ordering of error tests). To
178  * distinguish between collision and invalid size in such cases, we
179  * make a second try with size = 0. These kernels do not test size
180  * against SHMMIN in the preexisting-segment case, so we will not get
181  * EINVAL a second time if there is such a segment.
182  */
183  if (shmget_errno == EINVAL)
184  {
185  shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection);
186 
187  if (shmid < 0)
188  {
189  /* As above, fail quietly if we verify a collision */
190  if (errno == EEXIST || errno == EACCES
191 #ifdef EIDRM
192  || errno == EIDRM
193 #endif
194  )
195  return NULL;
196  /* Otherwise, fall through to report the original error */
197  }
198  else
199  {
200  /*
201  * On most platforms we cannot get here because SHMMIN is
202  * greater than zero. However, if we do succeed in creating a
203  * zero-size segment, free it and then fall through to report
204  * the original error.
205  */
206  if (shmctl(shmid, IPC_RMID, NULL) < 0)
207  elog(LOG, "shmctl(%d, %d, 0) failed: %m",
208  (int) shmid, IPC_RMID);
209  }
210  }
211 
212  /*
213  * Else complain and abort.
214  *
215  * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX
216  * is violated. SHMALL violation might be reported as either ENOMEM
217  * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which
218  * it should be. SHMMNI violation is ENOSPC, per spec. Just plain
219  * not-enough-RAM is ENOMEM.
220  */
221  errno = shmget_errno;
222  ereport(FATAL,
223  (errmsg("could not create shared memory segment: %m"),
224  errdetail("Failed system call was shmget(key=%lu, size=%zu, 0%o).",
225  (unsigned long) memKey, size,
226  IPC_CREAT | IPC_EXCL | IPCProtection),
227  (shmget_errno == EINVAL) ?
228  errhint("This error usually means that PostgreSQL's request for a shared memory "
229  "segment exceeded your kernel's SHMMAX parameter, or possibly that "
230  "it is less than "
231  "your kernel's SHMMIN parameter.\n"
232  "The PostgreSQL documentation contains more information about shared "
233  "memory configuration.") : 0,
234  (shmget_errno == ENOMEM) ?
235  errhint("This error usually means that PostgreSQL's request for a shared "
236  "memory segment exceeded your kernel's SHMALL parameter. You might need "
237  "to reconfigure the kernel with larger SHMALL.\n"
238  "The PostgreSQL documentation contains more information about shared "
239  "memory configuration.") : 0,
240  (shmget_errno == ENOSPC) ?
241  errhint("This error does *not* mean that you have run out of disk space. "
242  "It occurs either if all available shared memory IDs have been taken, "
243  "in which case you need to raise the SHMMNI parameter in your kernel, "
244  "or because the system's overall limit for shared memory has been "
245  "reached.\n"
246  "The PostgreSQL documentation contains more information about shared "
247  "memory configuration.") : 0));
248  }
249 
250  /* Register on-exit routine to delete the new segment */
251  on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid));
252 
253  /* OK, should be able to attach to the segment */
254  memAddress = shmat(shmid, requestedAddress, PG_SHMAT_FLAGS);
255 
256  if (memAddress == (void *) -1)
257  elog(FATAL, "shmat(id=%d, addr=%p, flags=0x%x) failed: %m",
258  shmid, requestedAddress, PG_SHMAT_FLAGS);
259 
260  /* Register on-exit routine to detach new segment before deleting */
261  on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress));
262 
263  /*
264  * Store shmem key and ID in data directory lockfile. Format to try to
265  * keep it the same length always (trailing junk in the lockfile won't
266  * hurt, but might confuse humans).
267  */
268  {
269  char line[64];
270 
271  sprintf(line, "%9lu %9lu",
272  (unsigned long) memKey, (unsigned long) shmid);
273  AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line);
274  }
275 
276  return memAddress;
277 }
278 
279 /****************************************************************************/
280 /* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */
281 /* from process' address space */
282 /* (called as an on_shmem_exit callback, hence funny argument list) */
283 /****************************************************************************/
284 static void
285 IpcMemoryDetach(int status, Datum shmaddr)
286 {
287  /* Detach System V shared memory block. */
288  if (shmdt((void *) DatumGetPointer(shmaddr)) < 0)
289  elog(LOG, "shmdt(%p) failed: %m", DatumGetPointer(shmaddr));
290 }
291 
292 /****************************************************************************/
293 /* IpcMemoryDelete(status, shmId) deletes a shared memory segment */
294 /* (called as an on_shmem_exit callback, hence funny argument list) */
295 /****************************************************************************/
296 static void
297 IpcMemoryDelete(int status, Datum shmId)
298 {
299  if (shmctl(DatumGetInt32(shmId), IPC_RMID, NULL) < 0)
300  elog(LOG, "shmctl(%d, %d, 0) failed: %m",
301  DatumGetInt32(shmId), IPC_RMID);
302 }
303 
304 /*
305  * PGSharedMemoryIsInUse
306  *
307  * Is a previously-existing shmem segment still existing and in use?
308  *
309  * The point of this exercise is to detect the case where a prior postmaster
310  * crashed, but it left child backends that are still running. Therefore
311  * we only care about shmem segments that are associated with the intended
312  * DataDir. This is an important consideration since accidental matches of
313  * shmem segment IDs are reasonably common.
314  */
315 bool
316 PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
317 {
318  PGShmemHeader *memAddress;
319  IpcMemoryState state;
320 
321  state = PGSharedMemoryAttach((IpcMemoryId) id2, NULL, &memAddress);
322  if (memAddress && shmdt((void *) memAddress) < 0)
323  elog(LOG, "shmdt(%p) failed: %m", memAddress);
324  switch (state)
325  {
326  case SHMSTATE_ENOENT:
327  case SHMSTATE_FOREIGN:
328  case SHMSTATE_UNATTACHED:
329  return false;
330  case SHMSTATE_ANALYSIS_FAILURE:
331  case SHMSTATE_ATTACHED:
332  return true;
333  }
334  return true;
335 }
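
Hedged usage sketch (not part of this file): the key/ID pair that InternalIpcMemoryCreate() records in the data directory lock file is what a later postmaster feeds back into PGSharedMemoryIsInUse() to decide whether orphaned backends still hold the old segment. The real caller lives in the lock-file handling code elsewhere in the server; the small parser below is invented purely for illustration.

#ifdef NOT_USED
static bool
old_segment_still_in_use(const char *lockfile_shmem_line)
{
    unsigned long key;
    unsigned long id;

    /* Parse the "%9lu %9lu" line written by InternalIpcMemoryCreate(). */
    if (sscanf(lockfile_shmem_line, "%lu %lu", &key, &id) != 2)
        return false;           /* no shared memory key recorded */

    return PGSharedMemoryIsInUse(key, id);
}
#endif                          /* NOT_USED */
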
336 
337 /*
338  * Test for a segment with id shmId; see comment at IpcMemoryState.
339  *
340  * If the segment exists, we'll attempt to attach to it, using attachAt
341  * if that's not NULL (but it's best to pass NULL if possible).
342  *
343  * *addr is set to the segment memory address if we attached to it, else NULL.
344  */
345 static IpcMemoryState
346 PGSharedMemoryAttach(IpcMemoryId shmId,
347  void *attachAt,
348  PGShmemHeader **addr)
349 {
350  struct shmid_ds shmStat;
351  struct stat statbuf;
352  PGShmemHeader *hdr;
353 
354  *addr = NULL;
355 
356  /*
357  * First, try to stat the shm segment ID, to see if it exists at all.
358  */
359  if (shmctl(shmId, IPC_STAT, &shmStat) < 0)
360  {
361  /*
362  * EINVAL actually has multiple possible causes documented in the
363  * shmctl man page, but we assume it must mean the segment no longer
364  * exists.
365  */
366  if (errno == EINVAL)
367  return SHMSTATE_ENOENT;
368 
369  /*
370  * EACCES implies we have no read permission, which means it is not a
371  * Postgres shmem segment (or at least, not one that is relevant to
372  * our data directory).
373  */
374  if (errno == EACCES)
375  return SHMSTATE_FOREIGN;
376 
377  /*
378  * Some Linux kernel versions (in fact, all of them as of July 2007)
379  * sometimes return EIDRM when EINVAL is correct. The Linux kernel
380  * actually does not have any internal state that would justify
381  * returning EIDRM, so we can get away with assuming that EIDRM is
382  * equivalent to EINVAL on that platform.
383  */
384 #ifdef HAVE_LINUX_EIDRM_BUG
385  if (errno == EIDRM)
386  return SHMSTATE_ENOENT;
387 #endif
388 
389  /*
390  * Otherwise, we had better assume that the segment is in use. The
391  * only likely case is (non-Linux, assumed spec-compliant) EIDRM,
392  * which implies that the segment has been IPC_RMID'd but there are
393  * still processes attached to it.
394  */
395  return SHMSTATE_ANALYSIS_FAILURE;
396  }
397 
398  /*
399  * Try to attach to the segment and see if it matches our data directory.
400  * This avoids any risk of duplicate-shmem-key conflicts on machines that
401  * are running several postmasters under the same userid.
402  *
403  * (When we're called from PGSharedMemoryCreate, this stat call is
404  * duplicative; but since this isn't a high-traffic case it's not worth
405  * trying to optimize.)
406  */
407  if (stat(DataDir, &statbuf) < 0)
408  return SHMSTATE_ANALYSIS_FAILURE; /* can't stat; be conservative */
409 
410  hdr = (PGShmemHeader *) shmat(shmId, attachAt, PG_SHMAT_FLAGS);
411  if (hdr == (PGShmemHeader *) -1)
412  {
413  /*
414  * Attachment failed. The cases we're interested in are the same as
415  * for the shmctl() call above. In particular, note that the owning
416  * postmaster could have terminated and removed the segment between
417  * shmctl() and shmat().
418  *
419  * If attachAt isn't NULL, it's possible that EINVAL reflects a
420  * problem with that address not a vanished segment, so it's best to
421  * pass NULL when probing for conflicting segments.
422  */
423  if (errno == EINVAL)
424  return SHMSTATE_ENOENT; /* segment disappeared */
425  if (errno == EACCES)
426  return SHMSTATE_FOREIGN; /* must be non-Postgres */
427 #ifdef HAVE_LINUX_EIDRM_BUG
428  if (errno == EIDRM)
429  return SHMSTATE_ENOENT; /* segment disappeared */
430 #endif
431  /* Otherwise, be conservative. */
432  return SHMSTATE_ANALYSIS_FAILURE;
433  }
434  *addr = hdr;
435 
436  if (hdr->magic != PGShmemMagic ||
437  hdr->device != statbuf.st_dev ||
438  hdr->inode != statbuf.st_ino)
439  {
440  /*
441  * It's either not a Postgres segment, or not one for my data
442  * directory.
443  */
444  return SHMSTATE_FOREIGN;
445  }
446 
447  /*
448  * It does match our data directory, so now test whether any processes are
449  * still attached to it. (We are, now, but the shm_nattch result is from
450  * before we attached to it.)
451  */
452  return shmStat.shm_nattch == 0 ? SHMSTATE_UNATTACHED : SHMSTATE_ATTACHED;
453 }
454 
455 /*
456  * Identify the huge page size to use, and compute the related mmap flags.
457  *
458  * Some Linux kernel versions have a bug causing mmap() to fail on requests
459  * that are not a multiple of the hugepage size. Versions without that bug
460  * instead silently round the request up to the next hugepage multiple ---
461  * and then munmap() fails when we give it a size different from that.
462  * So we have to round our request up to a multiple of the actual hugepage
463  * size to avoid trouble.
464  *
465  * Doing the round-up ourselves also lets us make use of the extra memory,
466  * rather than just wasting it. Currently, we just increase the available
467  * space recorded in the shmem header, which will make the extra usable for
468  * purposes such as additional locktable entries. Someday, for very large
469  * hugepage sizes, we might want to think about more invasive strategies,
470  * such as increasing shared_buffers to absorb the extra space.
471  *
472  * Returns the (real, assumed or config provided) page size into
473  * *hugepagesize, and the hugepage-related mmap flags to use into
474  * *mmap_flags if requested by the caller. If huge pages are not supported,
475  * *hugepagesize and *mmap_flags are set to 0.
476  */
477 void
478 GetHugePageSize(Size *hugepagesize, int *mmap_flags)
479 {
480 #ifdef MAP_HUGETLB
481 
482  Size default_hugepagesize = 0;
483  Size hugepagesize_local = 0;
484  int mmap_flags_local = 0;
485 
486  /*
487  * System-dependent code to find out the default huge page size.
488  *
489  * On Linux, read /proc/meminfo looking for a line like "Hugepagesize:
490  * nnnn kB". Ignore any failures, falling back to the preset default.
491  */
492 #ifdef __linux__
493 
494  {
495  FILE *fp = AllocateFile("/proc/meminfo", "r");
496  char buf[128];
497  unsigned int sz;
498  char ch;
499 
500  if (fp)
501  {
502  while (fgets(buf, sizeof(buf), fp))
503  {
504  if (sscanf(buf, "Hugepagesize: %u %c", &sz, &ch) == 2)
505  {
506  if (ch == 'k')
507  {
508  default_hugepagesize = sz * (Size) 1024;
509  break;
510  }
511  /* We could accept other units besides kB, if needed */
512  }
513  }
514  FreeFile(fp);
515  }
516  }
517 #endif /* __linux__ */
518 
519  if (huge_page_size != 0)
520  {
521  /* If huge page size is requested explicitly, use that. */
522  hugepagesize_local = (Size) huge_page_size * 1024;
523  }
524  else if (default_hugepagesize != 0)
525  {
526  /* Otherwise use the system default, if we have it. */
527  hugepagesize_local = default_hugepagesize;
528  }
529  else
530  {
531  /*
532  * If we fail to find out the system's default huge page size, or no
533  * huge page size is requested explicitly, assume it is 2MB. This will
534  * work fine when the actual size is less. If it's more, we might get
535  * mmap() or munmap() failures due to unaligned requests; but at this
536  * writing, there are no reports of any non-Linux systems being picky
537  * about that.
538  */
539  hugepagesize_local = 2 * 1024 * 1024;
540  }
541 
542  mmap_flags_local = MAP_HUGETLB;
543 
544  /*
545  * On recent enough Linux, also include the explicit page size, if
546  * necessary.
547  */
548 #if defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT)
549  if (hugepagesize_local != default_hugepagesize)
550  {
551  int shift = pg_ceil_log2_64(hugepagesize_local);
552 
553  mmap_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
554  }
555 #endif
556 
557  /* assign the results found */
558  if (mmap_flags)
559  *mmap_flags = mmap_flags_local;
560  if (hugepagesize)
561  *hugepagesize = hugepagesize_local;
562 
563 #else
564 
565  if (hugepagesize)
566  *hugepagesize = 0;
567  if (mmap_flags)
568  *mmap_flags = 0;
569 
570 #endif /* MAP_HUGETLB */
571 }
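
Hedged sketch (not part of this file) of how a caller combines GetHugePageSize() with the round-up the comment above calls for; for instance, a 137 MB request with 2 MB huge pages becomes 138 MB. CreateAnonymousSegment() below is the real consumer; this helper exists only to make the arithmetic explicit.

#ifdef NOT_USED
static Size
round_request_to_hugepages(Size request, int *mmap_flags)
{
    Size        hugepagesize;

    GetHugePageSize(&hugepagesize, mmap_flags);

    if (hugepagesize == 0)
        return request;         /* huge pages not supported on this platform */

    /* Round up to the next multiple of the huge page size. */
    if (request % hugepagesize != 0)
        request += hugepagesize - (request % hugepagesize);

    /* e.g. 137 MB with 2 MB pages -> 138 MB; *mmap_flags is passed to mmap(). */
    return request;
}
#endif                          /* NOT_USED */
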
572 
573 /*
574  * GUC check_hook for huge_page_size
575  */
576 bool
577 check_huge_page_size(int *newval, void **extra, GucSource source)
578 {
579 #if !(defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT))
580  /* Recent enough Linux only, for now. See GetHugePageSize(). */
581  if (*newval != 0)
582  {
583  GUC_check_errdetail("huge_page_size must be 0 on this platform.");
584  return false;
585  }
586 #endif
587  return true;
588 }
589 
590 /*
591  * Creates an anonymous mmap()ed shared memory segment.
592  *
593  * Pass the requested size in *size. This function will modify *size to the
594  * actual size of the allocation, if it ends up allocating a segment that is
595  * larger than requested.
596  */
597 static void *
598 CreateAnonymousSegment(Size *size)
599 {
600  Size allocsize = *size;
601  void *ptr = MAP_FAILED;
602  int mmap_errno = 0;
603 
604 #ifndef MAP_HUGETLB
605  /* PGSharedMemoryCreate should have dealt with this case */
606  Assert(huge_pages != HUGE_PAGES_ON);
607 #else
608  if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
609  {
610  /*
611  * Round up the request size to a suitable large value.
612  */
613  Size hugepagesize;
614  int mmap_flags;
615 
616  GetHugePageSize(&hugepagesize, &mmap_flags);
617 
618  if (allocsize % hugepagesize != 0)
619  allocsize += hugepagesize - (allocsize % hugepagesize);
620 
621  ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
622  PG_MMAP_FLAGS | mmap_flags, -1, 0);
623  mmap_errno = errno;
624  if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
625  elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m",
626  allocsize);
627  }
628 #endif
629 
630  /*
631  * Report whether huge pages are in use. This needs to be tracked before
632  * the second mmap() call if attempting to use huge pages failed
633  * previously.
634  */
635  SetConfigOption("huge_pages_status", (ptr == MAP_FAILED) ? "off" : "on",
636  PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
637 
638  if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON)
639  {
640  /*
641  * Use the original size, not the rounded-up value, when falling back
642  * to non-huge pages.
643  */
644  allocsize = *size;
645  ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
646  PG_MMAP_FLAGS, -1, 0);
647  mmap_errno = errno;
648  }
649 
650  if (ptr == MAP_FAILED)
651  {
652  errno = mmap_errno;
653  ereport(FATAL,
654  (errmsg("could not map anonymous shared memory: %m"),
655  (mmap_errno == ENOMEM) ?
656  errhint("This error usually means that PostgreSQL's request "
657  "for a shared memory segment exceeded available memory, "
658  "swap space, or huge pages. To reduce the request size "
659  "(currently %zu bytes), reduce PostgreSQL's shared "
660  "memory usage, perhaps by reducing shared_buffers or "
661  "max_connections.",
662  allocsize) : 0));
663  }
664 
665  *size = allocsize;
666  return ptr;
667 }
668 
669 /*
670  * AnonymousShmemDetach --- detach from an anonymous mmap'd block
671  * (called as an on_shmem_exit callback, hence funny argument list)
672  */
673 static void
674 AnonymousShmemDetach(int status, Datum arg)
675 {
676  /* Release anonymous shared memory block, if any. */
677  if (AnonymousShmem != NULL)
678  {
679  if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
680  elog(LOG, "munmap(%p, %zu) failed: %m",
681  AnonymousShmem, AnonymousShmemSize);
682  AnonymousShmem = NULL;
683  }
684 }
685 
686 /*
687  * PGSharedMemoryCreate
688  *
689  * Create a shared memory segment of the given size and initialize its
690  * standard header. Also, register an on_shmem_exit callback to release
691  * the storage.
692  *
693  * Dead Postgres segments pertinent to this DataDir are recycled if found, but
694  * we do not fail upon collision with foreign shmem segments. The idea here
695  * is to detect and re-use keys that may have been assigned by a crashed
696  * postmaster or backend.
697  */
698 PGShmemHeader *
699 PGSharedMemoryCreate(Size size,
700  PGShmemHeader **shim)
701 {
702  IpcMemoryKey NextShmemSegID;
703  void *memAddress;
704  PGShmemHeader *hdr;
705  struct stat statbuf;
706  Size sysvsize;
707 
708  /*
709  * We use the data directory's ID info (inode and device numbers) to
710  * positively identify shmem segments associated with this data dir, and
711  * also as seeds for searching for a free shmem key.
712  */
713  if (stat(DataDir, &statbuf) < 0)
714  ereport(FATAL,
715  (errcode_for_file_access(),
716  errmsg("could not stat data directory \"%s\": %m",
717  DataDir)));
718 
719  /* Complain if hugepages demanded but we can't possibly support them */
720 #if !defined(MAP_HUGETLB)
721  if (huge_pages == HUGE_PAGES_ON)
722  ereport(ERROR,
723  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
724  errmsg("huge pages not supported on this platform")));
725 #endif
726 
727  /* For now, we don't support huge pages in SysV memory */
728  if (huge_pages == HUGE_PAGES_ON && shared_memory_type != SHMEM_TYPE_MMAP)
729  ereport(ERROR,
730  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
731  errmsg("huge pages not supported with the current shared_memory_type setting")));
732 
733  /* Room for a header? */
734  Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
735 
736  if (shared_memory_type == SHMEM_TYPE_MMAP)
737  {
738  AnonymousShmem = CreateAnonymousSegment(&size);
739  AnonymousShmemSize = size;
740 
741  /* Register on-exit routine to unmap the anonymous segment */
742  on_shmem_exit(AnonymousShmemDetach, (Datum) 0);
743 
744  /* Now we need only allocate a minimal-sized SysV shmem block. */
745  sysvsize = sizeof(PGShmemHeader);
746  }
747  else
748  {
749  sysvsize = size;
750 
751  /* huge pages are only available with mmap */
752  SetConfigOption("huge_pages_status", "off",
753  PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
754  }
755 
756  /*
757  * Loop till we find a free IPC key. Trust CreateDataDirLockFile() to
758  * ensure no more than one postmaster per data directory can enter this
759  * loop simultaneously. (CreateDataDirLockFile() does not entirely ensure
760  * that, but prefer fixing it over coping here.)
761  */
762  NextShmemSegID = statbuf.st_ino;
763 
764  for (;;)
765  {
766  IpcMemoryId shmid;
767  PGShmemHeader *oldhdr;
768  IpcMemoryState state;
769 
770  /* Try to create new segment */
771  memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize);
772  if (memAddress)
773  break; /* successful create and attach */
774 
775  /* Check shared memory and possibly remove and recreate */
776 
777  /*
778  * shmget() failure is typically EACCES, hence SHMSTATE_FOREIGN.
779  * ENOENT, a narrow possibility, implies SHMSTATE_ENOENT, but one can
780  * safely treat SHMSTATE_ENOENT like SHMSTATE_FOREIGN.
781  */
782  shmid = shmget(NextShmemSegID, sizeof(PGShmemHeader), 0);
783  if (shmid < 0)
784  {
785  oldhdr = NULL;
786  state = SHMSTATE_FOREIGN;
787  }
788  else
789  state = PGSharedMemoryAttach(shmid, NULL, &oldhdr);
790 
791  switch (state)
792  {
793  case SHMSTATE_ANALYSIS_FAILURE:
794  case SHMSTATE_ATTACHED:
795  ereport(FATAL,
796  (errcode(ERRCODE_LOCK_FILE_EXISTS),
797  errmsg("pre-existing shared memory block (key %lu, ID %lu) is still in use",
798  (unsigned long) NextShmemSegID,
799  (unsigned long) shmid),
800  errhint("Terminate any old server processes associated with data directory \"%s\".",
801  DataDir)));
802  break;
803  case SHMSTATE_ENOENT:
804 
805  /*
806  * To our surprise, some other process deleted since our last
807  * InternalIpcMemoryCreate(). Moments earlier, we would have
808  * seen SHMSTATE_FOREIGN. Try that same ID again.
809  */
810  elog(LOG,
811  "shared memory block (key %lu, ID %lu) deleted during startup",
812  (unsigned long) NextShmemSegID,
813  (unsigned long) shmid);
814  break;
815  case SHMSTATE_FOREIGN:
816  NextShmemSegID++;
817  break;
818  case SHMSTATE_UNATTACHED:
819 
820  /*
821  * The segment pertains to DataDir, and every process that had
822  * used it has died or detached. Zap it, if possible, and any
823  * associated dynamic shared memory segments, as well. This
824  * shouldn't fail, but if it does, assume the segment belongs
825  * to someone else after all, and try the next candidate.
826  * Otherwise, try again to create the segment. That may fail
827  * if some other process creates the same shmem key before we
828  * do, in which case we'll try the next key.
829  */
830  if (oldhdr->dsm_control != 0)
831  dsm_cleanup_using_control_segment(oldhdr->dsm_control);
832  if (shmctl(shmid, IPC_RMID, NULL) < 0)
833  NextShmemSegID++;
834  break;
835  }
836 
837  if (oldhdr && shmdt((void *) oldhdr) < 0)
838  elog(LOG, "shmdt(%p) failed: %m", oldhdr);
839  }
840 
841  /* Initialize new segment. */
842  hdr = (PGShmemHeader *) memAddress;
843  hdr->creatorPID = getpid();
844  hdr->magic = PGShmemMagic;
845  hdr->dsm_control = 0;
846 
847  /* Fill in the data directory ID info, too */
848  hdr->device = statbuf.st_dev;
849  hdr->inode = statbuf.st_ino;
850 
851  /*
852  * Initialize space allocation status for segment.
853  */
854  hdr->totalsize = size;
855  hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
856  *shim = hdr;
857 
858  /* Save info for possible future use */
859  UsedShmemSegAddr = memAddress;
860  UsedShmemSegID = (unsigned long) NextShmemSegID;
861 
862  /*
863  * If AnonymousShmem is NULL here, then we're not using anonymous shared
864  * memory, and should return a pointer to the System V shared memory
865  * block. Otherwise, the System V shared memory block is only a shim, and
866  * we must return a pointer to the real block.
867  */
868  if (AnonymousShmem == NULL)
869  return hdr;
870  memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader));
871  return (PGShmemHeader *) AnonymousShmem;
872 }
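
Hedged sketch (not part of this file): roughly how the totalsize and freeoffset fields initialized by PGSharedMemoryCreate() get consumed later. The real allocator lives in shmem.c and adds locking and stricter alignment; this simplified version only shows that allocations are handed out by advancing freeoffset within the block returned above.

#ifdef NOT_USED
static void *
sketch_shmem_alloc(PGShmemHeader *hdr, Size size)
{
    Size        offset = hdr->freeoffset;
    void       *result;

    size = MAXALIGN(size);
    if (offset + size > hdr->totalsize)
        return NULL;            /* out of shared memory */

    result = (char *) hdr + offset;
    hdr->freeoffset = offset + size;
    return result;
}
#endif                          /* NOT_USED */
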
873 
874 #ifdef EXEC_BACKEND
875 
876 /*
877  * PGSharedMemoryReAttach
878  *
879  * This is called during startup of a postmaster child process to re-attach to
880  * an already existing shared memory segment. This is needed only in the
881  * EXEC_BACKEND case; otherwise postmaster children inherit the shared memory
882  * segment attachment via fork().
883  *
884  * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
885  * routine. The caller must have already restored them to the postmaster's
886  * values.
887  */
888 void
889 PGSharedMemoryReAttach(void)
890 {
891  IpcMemoryId shmid;
892  PGShmemHeader *hdr;
893  IpcMemoryState state;
894  void *origUsedShmemSegAddr = UsedShmemSegAddr;
895 
896  Assert(UsedShmemSegAddr != NULL);
897  Assert(IsUnderPostmaster);
898 
899 #ifdef __CYGWIN__
900  /* cygipc (currently) appears to not detach on exec. */
901  PGSharedMemoryDetach();
902  UsedShmemSegAddr = origUsedShmemSegAddr;
903 #endif
904 
905  elog(DEBUG3, "attaching to %p", UsedShmemSegAddr);
906  shmid = shmget(UsedShmemSegID, sizeof(PGShmemHeader), 0);
907  if (shmid < 0)
908  state = SHMSTATE_FOREIGN;
909  else
910  state = PGSharedMemoryAttach(shmid, UsedShmemSegAddr, &hdr);
911  if (state != SHMSTATE_ATTACHED)
912  elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m",
913  (int) UsedShmemSegID, UsedShmemSegAddr);
914  if (hdr != origUsedShmemSegAddr)
915  elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)",
916  hdr, origUsedShmemSegAddr);
917  dsm_set_control_handle(hdr->dsm_control);
918 
919  UsedShmemSegAddr = hdr; /* probably redundant */
920 }
921 
922 /*
923  * PGSharedMemoryNoReAttach
924  *
925  * This is called during startup of a postmaster child process when we choose
926  * *not* to re-attach to the existing shared memory segment. We must clean up
927  * to leave things in the appropriate state. This is not used in the non
928  * EXEC_BACKEND case, either.
929  *
930  * The child process startup logic might or might not call PGSharedMemoryDetach
931  * after this; make sure that it will be a no-op if called.
932  *
933  * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
934  * routine. The caller must have already restored them to the postmaster's
935  * values.
936  */
937 void
938 PGSharedMemoryNoReAttach(void)
939 {
940  Assert(UsedShmemSegAddr != NULL);
941  Assert(IsUnderPostmaster);
942 
943 #ifdef __CYGWIN__
944  /* cygipc (currently) appears to not detach on exec. */
945  PGSharedMemoryDetach();
946 #endif
947 
948  /* For cleanliness, reset UsedShmemSegAddr to show we're not attached. */
949  UsedShmemSegAddr = NULL;
950  /* And the same for UsedShmemSegID. */
951  UsedShmemSegID = 0;
952 }
953 
954 #endif /* EXEC_BACKEND */
955 
956 /*
957  * PGSharedMemoryDetach
958  *
959  * Detach from the shared memory segment, if still attached. This is not
960  * intended to be called explicitly by the process that originally created the
961  * segment (it will have on_shmem_exit callback(s) registered to do that).
962  * Rather, this is for subprocesses that have inherited an attachment and want
963  * to get rid of it.
964  *
965  * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
966  * routine, also AnonymousShmem and AnonymousShmemSize.
967  */
968 void
969 PGSharedMemoryDetach(void)
970 {
971  if (UsedShmemSegAddr != NULL)
972  {
973  if ((shmdt(UsedShmemSegAddr) < 0)
974 #if defined(EXEC_BACKEND) && defined(__CYGWIN__)
975  /* Work-around for cygipc exec bug */
976  && shmdt(NULL) < 0
977 #endif
978  )
979  elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr);
980  UsedShmemSegAddr = NULL;
981  }
982 
983  if (AnonymousShmem != NULL)
984  {
985  if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
986  elog(LOG, "munmap(%p, %zu) failed: %m",
987  AnonymousShmem, AnonymousShmemSize);
988  AnonymousShmem = NULL;
989  }
990 }