sysv_shmem.c
1 /*-------------------------------------------------------------------------
2  *
3  * sysv_shmem.c
4  * Implement shared memory using SysV facilities
5  *
6  * These routines used to be a fairly thin layer on top of SysV shared
7  * memory functionality. With the addition of anonymous-shmem logic,
8  * they're a bit fatter now. We still require a SysV shmem block to
9  * exist, though, because mmap'd shmem provides no way to find out how
10  * many processes are attached, which we need for interlocking purposes.
11  *
12  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
13  * Portions Copyright (c) 1994, Regents of the University of California
14  *
15  * IDENTIFICATION
16  * src/backend/port/sysv_shmem.c
17  *
18  *-------------------------------------------------------------------------
19  */
20 #include "postgres.h"
21 
22 #include <signal.h>
23 #include <unistd.h>
24 #include <sys/file.h>
25 #include <sys/ipc.h>
26 #include <sys/mman.h>
27 #include <sys/shm.h>
28 #include <sys/stat.h>
29 
30 #include "miscadmin.h"
31 #include "port/pg_bitutils.h"
32 #include "portability/mem.h"
33 #include "storage/dsm.h"
34 #include "storage/fd.h"
35 #include "storage/ipc.h"
36 #include "storage/pg_shmem.h"
37 #include "utils/guc.h"
38 #include "utils/guc_hooks.h"
39 #include "utils/pidfile.h"
40 
41 
42 /*
43  * As of PostgreSQL 9.3, we normally allocate only a very small amount of
44  * System V shared memory, and only for the purposes of providing an
45  * interlock to protect the data directory. The real shared memory block
46  * is allocated using mmap(). This works around the problem that many
47  * systems have very low limits on the amount of System V shared memory
48  * that can be allocated. Even a limit of a few megabytes will be enough
49  * to run many copies of PostgreSQL without needing to adjust system settings.
50  *
51  * We assume that no one will attempt to run PostgreSQL 9.3 or later on
52  * systems that are ancient enough that anonymous shared memory is not
53  * supported, such as pre-2.4 versions of Linux. If that turns out to be
54  * false, we might need to add compile and/or run-time tests here and do this
55  * only if the running kernel supports it.
56  *
57  * However, we must always disable this logic in the EXEC_BACKEND case, and
58  * fall back to the old method of allocating the entire segment using System V
59  * shared memory, because there's no way to attach an anonymous mmap'd segment
60  * to a process after exec(). Since EXEC_BACKEND is intended only for
61  * developer use, this shouldn't be a big problem. Because of this, we do
62  * not worry about supporting anonymous shmem in the EXEC_BACKEND cases below.
63  *
64  * As of PostgreSQL 12, we regained the ability to use a large System V shared
65  * memory region even in non-EXEC_BACKEND builds, if shared_memory_type is set
66  * to sysv (though this is not the default).
67  */
68 
69 
70 typedef key_t IpcMemoryKey; /* shared memory key passed to shmget(2) */
71 typedef int IpcMemoryId; /* shared memory ID returned by shmget(2) */
72 
73 /*
74  * How does a given IpcMemoryId relate to this PostgreSQL process?
75  *
76  * One could recycle unattached segments of different data directories if we
77  * distinguished that case from other SHMSTATE_FOREIGN cases. Doing so would
78  * cause us to visit less of the key space, making us less likely to detect a
79  * SHMSTATE_ATTACHED key. It would also complicate the concurrency analysis,
80  * in that postmasters of different data directories could simultaneously
81  * attempt to recycle a given key. We'll waste keys longer in some cases, but
82  * avoiding the problems of the alternative justifies that loss.
83  */
84 typedef enum
85 {
86  SHMSTATE_ANALYSIS_FAILURE, /* unexpected failure to analyze the ID */
87  SHMSTATE_ATTACHED, /* pertinent to DataDir, has attached PIDs */
88  SHMSTATE_ENOENT, /* no segment of that ID */
89  SHMSTATE_FOREIGN, /* exists, but not pertinent to DataDir */
90  SHMSTATE_UNATTACHED, /* pertinent to DataDir, no attached PIDs */
91 } IpcMemoryState;
92 
93 
94 unsigned long UsedShmemSegID = 0;
95 void *UsedShmemSegAddr = NULL;
96 
97 static Size AnonymousShmemSize;
98 static void *AnonymousShmem = NULL;
99 
100 static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
101 static void IpcMemoryDetach(int status, Datum shmaddr);
102 static void IpcMemoryDelete(int status, Datum shmId);
103 static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId,
104  void *attachAt,
105  PGShmemHeader **addr);
106 
107 
108 /*
109  * InternalIpcMemoryCreate(memKey, size)
110  *
111  * Attempt to create a new shared memory segment with the specified key.
112  * Will fail (return NULL) if such a segment already exists. If successful,
113  * attach the segment to the current process and return its attached address.
114  * On success, callbacks are registered with on_shmem_exit to detach and
115  * delete the segment when on_shmem_exit is called.
116  *
117  * If we fail with a failure code other than collision-with-existing-segment,
118  * print out an error and abort. Other types of errors are not recoverable.
119  */
120 static void *
121 InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size)
122 {
123  IpcMemoryId shmid;
124  void *requestedAddress = NULL;
125  void *memAddress;
126 
127  /*
128  * Normally we just pass requestedAddress = NULL to shmat(), allowing the
129  * system to choose where the segment gets mapped. But in an EXEC_BACKEND
130  * build, it's possible for whatever is chosen in the postmaster to not
131  * work for backends, due to variations in address space layout. As a
132  * rather klugy workaround, allow the user to specify the address to use
133  * via setting the environment variable PG_SHMEM_ADDR. (If this were of
134  * interest for anything except debugging, we'd probably create a cleaner
135  * and better-documented way to set it, such as a GUC.)
136  */
137 #ifdef EXEC_BACKEND
138  {
139  char *pg_shmem_addr = getenv("PG_SHMEM_ADDR");
140 
141  if (pg_shmem_addr)
142  requestedAddress = (void *) strtoul(pg_shmem_addr, NULL, 0);
143  else
144  {
145 #if defined(__darwin__) && SIZEOF_VOID_P == 8
146  /*
147  * Provide a default value that is believed to avoid problems with
148  * ASLR on the current macOS release.
149  */
150  requestedAddress = (void *) 0x80000000000;
151 #endif
152  }
153  }
154 #endif
155 
156  shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection);
157 
158  if (shmid < 0)
159  {
160  int shmget_errno = errno;
161 
162  /*
163  * Fail quietly if error indicates a collision with existing segment.
164  * One would expect EEXIST, given that we said IPC_EXCL, but perhaps
165  * we could get a permission violation instead? Also, EIDRM might
166  * occur if an old seg is slated for destruction but not gone yet.
167  */
168  if (shmget_errno == EEXIST || shmget_errno == EACCES
169 #ifdef EIDRM
170  || shmget_errno == EIDRM
171 #endif
172  )
173  return NULL;
174 
175  /*
176  * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if
177  * there is an existing segment but it's smaller than "size" (this is
178  * a result of poorly-thought-out ordering of error tests). To
179  * distinguish between collision and invalid size in such cases, we
180  * make a second try with size = 0. These kernels do not test size
181  * against SHMMIN in the preexisting-segment case, so we will not get
182  * EINVAL a second time if there is such a segment.
183  */
184  if (shmget_errno == EINVAL)
185  {
186  shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection);
187 
188  if (shmid < 0)
189  {
190  /* As above, fail quietly if we verify a collision */
191  if (errno == EEXIST || errno == EACCES
192 #ifdef EIDRM
193  || errno == EIDRM
194 #endif
195  )
196  return NULL;
197  /* Otherwise, fall through to report the original error */
198  }
199  else
200  {
201  /*
202  * On most platforms we cannot get here because SHMMIN is
203  * greater than zero. However, if we do succeed in creating a
204  * zero-size segment, free it and then fall through to report
205  * the original error.
206  */
207  if (shmctl(shmid, IPC_RMID, NULL) < 0)
208  elog(LOG, "shmctl(%d, %d, 0) failed: %m",
209  (int) shmid, IPC_RMID);
210  }
211  }
212 
213  /*
214  * Else complain and abort.
215  *
216  * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX
217  * is violated. SHMALL violation might be reported as either ENOMEM
218  * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which
219  * it should be. SHMMNI violation is ENOSPC, per spec. Just plain
220  * not-enough-RAM is ENOMEM.
221  */
222  errno = shmget_errno;
223  ereport(FATAL,
224  (errmsg("could not create shared memory segment: %m"),
225  errdetail("Failed system call was shmget(key=%lu, size=%zu, 0%o).",
226  (unsigned long) memKey, size,
227  IPC_CREAT | IPC_EXCL | IPCProtection),
228  (shmget_errno == EINVAL) ?
229  errhint("This error usually means that PostgreSQL's request for a shared memory "
230  "segment exceeded your kernel's SHMMAX parameter, or possibly that "
231  "it is less than "
232  "your kernel's SHMMIN parameter.\n"
233  "The PostgreSQL documentation contains more information about shared "
234  "memory configuration.") : 0,
235  (shmget_errno == ENOMEM) ?
236  errhint("This error usually means that PostgreSQL's request for a shared "
237  "memory segment exceeded your kernel's SHMALL parameter. You might need "
238  "to reconfigure the kernel with larger SHMALL.\n"
239  "The PostgreSQL documentation contains more information about shared "
240  "memory configuration.") : 0,
241  (shmget_errno == ENOSPC) ?
242  errhint("This error does *not* mean that you have run out of disk space. "
243  "It occurs either if all available shared memory IDs have been taken, "
244  "in which case you need to raise the SHMMNI parameter in your kernel, "
245  "or because the system's overall limit for shared memory has been "
246  "reached.\n"
247  "The PostgreSQL documentation contains more information about shared "
248  "memory configuration.") : 0));
249  }
250 
251  /* Register on-exit routine to delete the new segment */
252  on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid));
253 
254  /* OK, should be able to attach to the segment */
255  memAddress = shmat(shmid, requestedAddress, PG_SHMAT_FLAGS);
256 
257  if (memAddress == (void *) -1)
258  elog(FATAL, "shmat(id=%d, addr=%p, flags=0x%x) failed: %m",
259  shmid, requestedAddress, PG_SHMAT_FLAGS);
260 
261  /* Register on-exit routine to detach new segment before deleting */
262  on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress));
263 
264  /*
265  * Store shmem key and ID in data directory lockfile. Format to try to
266  * keep it the same length always (trailing junk in the lockfile won't
267  * hurt, but might confuse humans).
268  */
269  {
270  char line[64];
271 
272  sprintf(line, "%9lu %9lu",
273  (unsigned long) memKey, (unsigned long) shmid);
274  AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line);
275  }
276 
277  return memAddress;
278 }
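For reference, here is a minimal standalone sketch (illustration only, not taken from this file; the key and size values are arbitrary) of the same System V lifecycle the function above implements: exclusive create, attach, detach, and removal, with IPC_EXCL turning a key collision into a detectable EEXIST.

/*
 * Hypothetical demo program, not part of PostgreSQL.  It mirrors the
 * shmget/shmat/shmdt/shmctl sequence used by InternalIpcMemoryCreate
 * and its on_shmem_exit callbacks.
 */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int
main(void)
{
	key_t		key = 0x520000;		/* arbitrary demo key */
	size_t		size = 1024 * 1024; /* arbitrary demo size */
	int			shmid;
	void	   *addr;

	/* IPC_EXCL makes a key collision fail with EEXIST instead of reusing it */
	shmid = shmget(key, size, IPC_CREAT | IPC_EXCL | 0600);
	if (shmid < 0)
	{
		fprintf(stderr, "shmget: %s\n", strerror(errno));
		return 1;
	}

	/* NULL lets the kernel choose the attach address */
	addr = shmat(shmid, NULL, 0);
	if (addr == (void *) -1)
	{
		fprintf(stderr, "shmat: %s\n", strerror(errno));
		shmctl(shmid, IPC_RMID, NULL);
		return 1;
	}

	memset(addr, 0, size);			/* the segment is now usable */

	shmdt(addr);					/* cf. IpcMemoryDetach */
	shmctl(shmid, IPC_RMID, NULL);	/* cf. IpcMemoryDelete */
	return 0;
}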
279 
280 /****************************************************************************/
281 /* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */
282 /* from process' address space */
283 /* (called as an on_shmem_exit callback, hence funny argument list) */
284 /****************************************************************************/
285 static void
286 IpcMemoryDetach(int status, Datum shmaddr)
287 {
288  /* Detach System V shared memory block. */
289  if (shmdt((void *) DatumGetPointer(shmaddr)) < 0)
290  elog(LOG, "shmdt(%p) failed: %m", DatumGetPointer(shmaddr));
291 }
292 
293 /****************************************************************************/
294 /* IpcMemoryDelete(status, shmId) deletes a shared memory segment */
295 /* (called as an on_shmem_exit callback, hence funny argument list) */
296 /****************************************************************************/
297 static void
298 IpcMemoryDelete(int status, Datum shmId)
299 {
300  if (shmctl(DatumGetInt32(shmId), IPC_RMID, NULL) < 0)
301  elog(LOG, "shmctl(%d, %d, 0) failed: %m",
302  DatumGetInt32(shmId), IPC_RMID);
303 }
304 
305 /*
306  * PGSharedMemoryIsInUse
307  *
308  * Is a previously-existing shmem segment still existing and in use?
309  *
310  * The point of this exercise is to detect the case where a prior postmaster
311  * crashed, but it left child backends that are still running. Therefore
312  * we only care about shmem segments that are associated with the intended
313  * DataDir. This is an important consideration since accidental matches of
314  * shmem segment IDs are reasonably common.
315  */
316 bool
317 PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
318 {
319  PGShmemHeader *memAddress;
320  IpcMemoryState state;
321 
322  state = PGSharedMemoryAttach((IpcMemoryId) id2, NULL, &memAddress);
323  if (memAddress && shmdt((void *) memAddress) < 0)
324  elog(LOG, "shmdt(%p) failed: %m", memAddress);
325  switch (state)
326  {
327  case SHMSTATE_ENOENT:
328  case SHMSTATE_FOREIGN:
329  case SHMSTATE_UNATTACHED:
330  return false;
331  case SHMSTATE_ANALYSIS_FAILURE:
332  case SHMSTATE_ATTACHED:
333  return true;
334  }
335  return true;
336 }
337 
338 /*
339  * Test for a segment with id shmId; see comment at IpcMemoryState.
340  *
341  * If the segment exists, we'll attempt to attach to it, using attachAt
342  * if that's not NULL (but it's best to pass NULL if possible).
343  *
344  * *addr is set to the segment memory address if we attached to it, else NULL.
345  */
346 static IpcMemoryState
347 PGSharedMemoryAttach(IpcMemoryId shmId,
348  void *attachAt,
349  PGShmemHeader **addr)
350 {
351  struct shmid_ds shmStat;
352  struct stat statbuf;
353  PGShmemHeader *hdr;
354 
355  *addr = NULL;
356 
357  /*
358  * First, try to stat the shm segment ID, to see if it exists at all.
359  */
360  if (shmctl(shmId, IPC_STAT, &shmStat) < 0)
361  {
362  /*
363  * EINVAL actually has multiple possible causes documented in the
364  * shmctl man page, but we assume it must mean the segment no longer
365  * exists.
366  */
367  if (errno == EINVAL)
368  return SHMSTATE_ENOENT;
369 
370  /*
371  * EACCES implies we have no read permission, which means it is not a
372  * Postgres shmem segment (or at least, not one that is relevant to
373  * our data directory).
374  */
375  if (errno == EACCES)
376  return SHMSTATE_FOREIGN;
377 
378  /*
379  * Some Linux kernel versions (in fact, all of them as of July 2007)
380  * sometimes return EIDRM when EINVAL is correct. The Linux kernel
381  * actually does not have any internal state that would justify
382  * returning EIDRM, so we can get away with assuming that EIDRM is
383  * equivalent to EINVAL on that platform.
384  */
385 #ifdef HAVE_LINUX_EIDRM_BUG
386  if (errno == EIDRM)
387  return SHMSTATE_ENOENT;
388 #endif
389 
390  /*
391  * Otherwise, we had better assume that the segment is in use. The
392  * only likely case is (non-Linux, assumed spec-compliant) EIDRM,
393  * which implies that the segment has been IPC_RMID'd but there are
394  * still processes attached to it.
395  */
396  return SHMSTATE_ANALYSIS_FAILURE;
397  }
398 
399  /*
400  * Try to attach to the segment and see if it matches our data directory.
401  * This avoids any risk of duplicate-shmem-key conflicts on machines that
402  * are running several postmasters under the same userid.
403  *
404  * (When we're called from PGSharedMemoryCreate, this stat call is
405  * duplicative; but since this isn't a high-traffic case it's not worth
406  * trying to optimize.)
407  */
408  if (stat(DataDir, &statbuf) < 0)
409  return SHMSTATE_ANALYSIS_FAILURE; /* can't stat; be conservative */
410 
411  hdr = (PGShmemHeader *) shmat(shmId, attachAt, PG_SHMAT_FLAGS);
412  if (hdr == (PGShmemHeader *) -1)
413  {
414  /*
415  * Attachment failed. The cases we're interested in are the same as
416  * for the shmctl() call above. In particular, note that the owning
417  * postmaster could have terminated and removed the segment between
418  * shmctl() and shmat().
419  *
420  * If attachAt isn't NULL, it's possible that EINVAL reflects a
421  * problem with that address not a vanished segment, so it's best to
422  * pass NULL when probing for conflicting segments.
423  */
424  if (errno == EINVAL)
425  return SHMSTATE_ENOENT; /* segment disappeared */
426  if (errno == EACCES)
427  return SHMSTATE_FOREIGN; /* must be non-Postgres */
428 #ifdef HAVE_LINUX_EIDRM_BUG
429  if (errno == EIDRM)
430  return SHMSTATE_ENOENT; /* segment disappeared */
431 #endif
432  /* Otherwise, be conservative. */
433  return SHMSTATE_ANALYSIS_FAILURE;
434  }
435  *addr = hdr;
436 
437  if (hdr->magic != PGShmemMagic ||
438  hdr->device != statbuf.st_dev ||
439  hdr->inode != statbuf.st_ino)
440  {
441  /*
442  * It's either not a Postgres segment, or not one for my data
443  * directory.
444  */
445  return SHMSTATE_FOREIGN;
446  }
447 
448  /*
449  * It does match our data directory, so now test whether any processes are
450  * still attached to it. (We are, now, but the shm_nattch result is from
451  * before we attached to it.)
452  */
453  return shmStat.shm_nattch == 0 ? SHMSTATE_UNATTACHED : SHMSTATE_ATTACHED;
454 }
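The UNATTACHED-versus-ATTACHED verdict above ultimately rests on the shm_nattch counter reported by IPC_STAT. A minimal sketch of that query (segment_attach_count is a hypothetical helper name, not part of PostgreSQL):

#include <sys/ipc.h>
#include <sys/shm.h>

/*
 * Illustration only: return the number of processes attached to an existing
 * System V segment, or -1 if it no longer exists or cannot be inspected.
 */
long
segment_attach_count(int shmid)
{
	struct shmid_ds shmStat;

	if (shmctl(shmid, IPC_STAT, &shmStat) < 0)
		return -1;				/* EINVAL/EIDRM: gone; EACCES: not ours */
	return (long) shmStat.shm_nattch;
}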
455 
456 /*
457  * Identify the huge page size to use, and compute the related mmap flags.
458  *
459  * Some Linux kernel versions have a bug causing mmap() to fail on requests
460  * that are not a multiple of the hugepage size. Versions without that bug
461  * instead silently round the request up to the next hugepage multiple ---
462  * and then munmap() fails when we give it a size different from that.
463  * So we have to round our request up to a multiple of the actual hugepage
464  * size to avoid trouble.
465  *
466  * Doing the round-up ourselves also lets us make use of the extra memory,
467  * rather than just wasting it. Currently, we just increase the available
468  * space recorded in the shmem header, which will make the extra usable for
469  * purposes such as additional locktable entries. Someday, for very large
470  * hugepage sizes, we might want to think about more invasive strategies,
471  * such as increasing shared_buffers to absorb the extra space.
472  *
473  * Returns the (real, assumed or config provided) page size into
474  * *hugepagesize, and the hugepage-related mmap flags to use into
475  * *mmap_flags if requested by the caller. If huge pages are not supported,
476  * *hugepagesize and *mmap_flags are set to 0.
477  */
478 void
479 GetHugePageSize(Size *hugepagesize, int *mmap_flags)
480 {
481 #ifdef MAP_HUGETLB
482 
483  Size default_hugepagesize = 0;
484  Size hugepagesize_local = 0;
485  int mmap_flags_local = 0;
486 
487  /*
488  * System-dependent code to find out the default huge page size.
489  *
490  * On Linux, read /proc/meminfo looking for a line like "Hugepagesize:
491  * nnnn kB". Ignore any failures, falling back to the preset default.
492  */
493 #ifdef __linux__
494 
495  {
496  FILE *fp = AllocateFile("/proc/meminfo", "r");
497  char buf[128];
498  unsigned int sz;
499  char ch;
500 
501  if (fp)
502  {
503  while (fgets(buf, sizeof(buf), fp))
504  {
505  if (sscanf(buf, "Hugepagesize: %u %c", &sz, &ch) == 2)
506  {
507  if (ch == 'k')
508  {
509  default_hugepagesize = sz * (Size) 1024;
510  break;
511  }
512  /* We could accept other units besides kB, if needed */
513  }
514  }
515  FreeFile(fp);
516  }
517  }
518 #endif /* __linux__ */
519 
520  if (huge_page_size != 0)
521  {
522  /* If huge page size is requested explicitly, use that. */
523  hugepagesize_local = (Size) huge_page_size * 1024;
524  }
525  else if (default_hugepagesize != 0)
526  {
527  /* Otherwise use the system default, if we have it. */
528  hugepagesize_local = default_hugepagesize;
529  }
530  else
531  {
532  /*
533  * If we fail to find out the system's default huge page size, or no
534  * huge page size is requested explicitly, assume it is 2MB. This will
535  * work fine when the actual size is less. If it's more, we might get
536  * mmap() or munmap() failures due to unaligned requests; but at this
537  * writing, there are no reports of any non-Linux systems being picky
538  * about that.
539  */
540  hugepagesize_local = 2 * 1024 * 1024;
541  }
542 
543  mmap_flags_local = MAP_HUGETLB;
544 
545  /*
546  * On recent enough Linux, also include the explicit page size, if
547  * necessary.
548  */
549 #if defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT)
550  if (hugepagesize_local != default_hugepagesize)
551  {
552  int shift = pg_ceil_log2_64(hugepagesize_local);
553 
554  mmap_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
555  }
556 #endif
557 
558  /* assign the results found */
559  if (mmap_flags)
560  *mmap_flags = mmap_flags_local;
561  if (hugepagesize)
562  *hugepagesize = hugepagesize_local;
563 
564 #else
565 
566  if (hugepagesize)
567  *hugepagesize = 0;
568  if (mmap_flags)
569  *mmap_flags = 0;
570 
571 #endif /* MAP_HUGETLB */
572 }
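As a worked example of the flag computation above (values assumed for illustration, not taken from this file): with huge_page_size set to 1048576 (the GUC is measured in kB, so this requests 1 GB pages) on a system whose default hugepage size is the usual 2 MB, the explicit-size branch applies and the code computes

    hugepagesize = 1048576 * 1024 = 1073741824 bytes
    shift        = pg_ceil_log2_64(1073741824) = 30
    mmap_flags   = MAP_HUGETLB | ((30 & MAP_HUGE_MASK) << MAP_HUGE_SHIFT)

which matches the encoding Linux itself uses for its MAP_HUGE_1GB constant.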
573 
574 /*
575  * GUC check_hook for huge_page_size
576  */
577 bool
578 check_huge_page_size(int *newval, void **extra, GucSource source)
579 {
580 #if !(defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT))
581  /* Recent enough Linux only, for now. See GetHugePageSize(). */
582  if (*newval != 0)
583  {
584  GUC_check_errdetail("huge_page_size must be 0 on this platform.");
585  return false;
586  }
587 #endif
588  return true;
589 }
590 
591 /*
592  * Creates an anonymous mmap()ed shared memory segment.
593  *
594  * Pass the requested size in *size. This function will modify *size to the
595  * actual size of the allocation, if it ends up allocating a segment that is
596  * larger than requested.
597  */
598 static void *
599 CreateAnonymousSegment(Size *size)
600 {
601  Size allocsize = *size;
602  void *ptr = MAP_FAILED;
603  int mmap_errno = 0;
604 
605 #ifndef MAP_HUGETLB
606  /* PGSharedMemoryCreate should have dealt with this case */
607  Assert(huge_pages != HUGE_PAGES_ON);
608 #else
609  if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
610  {
611  /*
612  * Round up the request size to a suitable large value.
613  */
614  Size hugepagesize;
615  int mmap_flags;
616 
617  GetHugePageSize(&hugepagesize, &mmap_flags);
618 
619  if (allocsize % hugepagesize != 0)
620  allocsize += hugepagesize - (allocsize % hugepagesize);
621 
622  ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
623  PG_MMAP_FLAGS | mmap_flags, -1, 0);
624  mmap_errno = errno;
625  if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
626  elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m",
627  allocsize);
628  }
629 #endif
630 
631  /*
632  * Report whether huge pages are in use. This needs to be tracked before
633  * the second mmap() call if attempting to use huge pages failed
634  * previously.
635  */
636  SetConfigOption("huge_pages_status", (ptr == MAP_FAILED) ? "off" : "on",
637  PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
638 
639  if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON)
640  {
641  /*
642  * Use the original size, not the rounded-up value, when falling back
643  * to non-huge pages.
644  */
645  allocsize = *size;
646  ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
647  PG_MMAP_FLAGS, -1, 0);
648  mmap_errno = errno;
649  }
650 
651  if (ptr == MAP_FAILED)
652  {
653  errno = mmap_errno;
654  ereport(FATAL,
655  (errmsg("could not map anonymous shared memory: %m"),
656  (mmap_errno == ENOMEM) ?
657  errhint("This error usually means that PostgreSQL's request "
658  "for a shared memory segment exceeded available memory, "
659  "swap space, or huge pages. To reduce the request size "
660  "(currently %zu bytes), reduce PostgreSQL's shared "
661  "memory usage, perhaps by reducing shared_buffers or "
662  "max_connections.",
663  allocsize) : 0));
664  }
665 
666  *size = allocsize;
667  return ptr;
668 }
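A condensed standalone sketch (illustration only, not part of this file; map_shared_anon is a hypothetical name) of the try-huge-pages-then-fall-back pattern implemented above. It assumes MAP_ANONYMOUS is available; the real code goes through the PG_MMAP_FLAGS abstraction from portability/mem.h.

#include <stddef.h>
#include <sys/mman.h>

void *
map_shared_anon(size_t *size, size_t hugepagesize)
{
	void	   *ptr = MAP_FAILED;
	size_t		allocsize = *size;

#ifdef MAP_HUGETLB
	/* round the request up to a hugepage multiple, as the code above does */
	if (allocsize % hugepagesize != 0)
		allocsize += hugepagesize - (allocsize % hugepagesize);

	ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
			   MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
#endif

	if (ptr == MAP_FAILED)
	{
		/* fall back to ordinary pages, using the original request size */
		allocsize = *size;
		ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
				   MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	}

	if (ptr != MAP_FAILED)
		*size = allocsize;		/* caller must munmap() exactly this length */
	return ptr;
}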
669 
670 /*
671  * AnonymousShmemDetach --- detach from an anonymous mmap'd block
672  * (called as an on_shmem_exit callback, hence funny argument list)
673  */
674 static void
675 AnonymousShmemDetach(int status, Datum arg)
676 {
677  /* Release anonymous shared memory block, if any. */
678  if (AnonymousShmem != NULL)
679  {
680  if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
681  elog(LOG, "munmap(%p, %zu) failed: %m",
682  AnonymousShmem, AnonymousShmemSize);
683  AnonymousShmem = NULL;
684  }
685 }
686 
687 /*
688  * PGSharedMemoryCreate
689  *
690  * Create a shared memory segment of the given size and initialize its
691  * standard header. Also, register an on_shmem_exit callback to release
692  * the storage.
693  *
694  * Dead Postgres segments pertinent to this DataDir are recycled if found, but
695  * we do not fail upon collision with foreign shmem segments. The idea here
696  * is to detect and re-use keys that may have been assigned by a crashed
697  * postmaster or backend.
698  */
699 PGShmemHeader *
700 PGSharedMemoryCreate(Size size,
701  PGShmemHeader **shim)
702 {
703  IpcMemoryKey NextShmemSegID;
704  void *memAddress;
705  PGShmemHeader *hdr;
706  struct stat statbuf;
707  Size sysvsize;
708 
709  /*
710  * We use the data directory's ID info (inode and device numbers) to
711  * positively identify shmem segments associated with this data dir, and
712  * also as seeds for searching for a free shmem key.
713  */
714  if (stat(DataDir, &statbuf) < 0)
715  ereport(FATAL,
716  (errcode_for_file_access(),
717  errmsg("could not stat data directory \"%s\": %m",
718  DataDir)));
719 
720  /* Complain if hugepages demanded but we can't possibly support them */
721 #if !defined(MAP_HUGETLB)
722  if (huge_pages == HUGE_PAGES_ON)
723  ereport(ERROR,
724  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
725  errmsg("huge pages not supported on this platform")));
726 #endif
727 
728  /* For now, we don't support huge pages in SysV memory */
729  if (huge_pages == HUGE_PAGES_ON && shared_memory_type != SHMEM_TYPE_MMAP)
730  ereport(ERROR,
731  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
732  errmsg("huge pages not supported with the current shared_memory_type setting")));
733 
734  /* Room for a header? */
735  Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
736 
737  if (shared_memory_type == SHMEM_TYPE_MMAP)
738  {
739  AnonymousShmem = CreateAnonymousSegment(&size);
740  AnonymousShmemSize = size;
741 
742  /* Register on-exit routine to unmap the anonymous segment */
743  on_shmem_exit(AnonymousShmemDetach, (Datum) 0);
744 
745  /* Now we need only allocate a minimal-sized SysV shmem block. */
746  sysvsize = sizeof(PGShmemHeader);
747  }
748  else
749  {
750  sysvsize = size;
751 
752  /* huge pages are only available with mmap */
753  SetConfigOption("huge_pages_status", "off",
754  PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
755  }
756 
757  /*
758  * Loop till we find a free IPC key. Trust CreateDataDirLockFile() to
759  * ensure no more than one postmaster per data directory can enter this
760  * loop simultaneously. (CreateDataDirLockFile() does not entirely ensure
761  * that, but prefer fixing it over coping here.)
762  */
763  NextShmemSegID = statbuf.st_ino;
764 
765  for (;;)
766  {
767  IpcMemoryId shmid;
768  PGShmemHeader *oldhdr;
769  IpcMemoryState state;
770 
771  /* Try to create new segment */
772  memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize);
773  if (memAddress)
774  break; /* successful create and attach */
775 
776  /* Check shared memory and possibly remove and recreate */
777 
778  /*
779  * shmget() failure is typically EACCES, hence SHMSTATE_FOREIGN.
780  * ENOENT, a narrow possibility, implies SHMSTATE_ENOENT, but one can
781  * safely treat SHMSTATE_ENOENT like SHMSTATE_FOREIGN.
782  */
783  shmid = shmget(NextShmemSegID, sizeof(PGShmemHeader), 0);
784  if (shmid < 0)
785  {
786  oldhdr = NULL;
787  state = SHMSTATE_FOREIGN;
788  }
789  else
790  state = PGSharedMemoryAttach(shmid, NULL, &oldhdr);
791 
792  switch (state)
793  {
794  case SHMSTATE_ANALYSIS_FAILURE:
795  case SHMSTATE_ATTACHED:
796  ereport(FATAL,
797  (errcode(ERRCODE_LOCK_FILE_EXISTS),
798  errmsg("pre-existing shared memory block (key %lu, ID %lu) is still in use",
799  (unsigned long) NextShmemSegID,
800  (unsigned long) shmid),
801  errhint("Terminate any old server processes associated with data directory \"%s\".",
802  DataDir)));
803  break;
804  case SHMSTATE_ENOENT:
805 
806  /*
807  * To our surprise, some other process deleted since our last
808  * InternalIpcMemoryCreate(). Moments earlier, we would have
809  * seen SHMSTATE_FOREIGN. Try that same ID again.
810  */
811  elog(LOG,
812  "shared memory block (key %lu, ID %lu) deleted during startup",
813  (unsigned long) NextShmemSegID,
814  (unsigned long) shmid);
815  break;
816  case SHMSTATE_FOREIGN:
817  NextShmemSegID++;
818  break;
819  case SHMSTATE_UNATTACHED:
820 
821  /*
822  * The segment pertains to DataDir, and every process that had
823  * used it has died or detached. Zap it, if possible, and any
824  * associated dynamic shared memory segments, as well. This
825  * shouldn't fail, but if it does, assume the segment belongs
826  * to someone else after all, and try the next candidate.
827  * Otherwise, try again to create the segment. That may fail
828  * if some other process creates the same shmem key before we
829  * do, in which case we'll try the next key.
830  */
831  if (oldhdr->dsm_control != 0)
832  dsm_cleanup_using_control_segment(oldhdr->dsm_control);
833  if (shmctl(shmid, IPC_RMID, NULL) < 0)
834  NextShmemSegID++;
835  break;
836  }
837 
838  if (oldhdr && shmdt((void *) oldhdr) < 0)
839  elog(LOG, "shmdt(%p) failed: %m", oldhdr);
840  }
841 
842  /* Initialize new segment. */
843  hdr = (PGShmemHeader *) memAddress;
844  hdr->creatorPID = getpid();
845  hdr->magic = PGShmemMagic;
846  hdr->dsm_control = 0;
847 
848  /* Fill in the data directory ID info, too */
849  hdr->device = statbuf.st_dev;
850  hdr->inode = statbuf.st_ino;
851 
852  /*
853  * Initialize space allocation status for segment.
854  */
855  hdr->totalsize = size;
856  hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
857  *shim = hdr;
858 
859  /* Save info for possible future use */
860  UsedShmemSegAddr = memAddress;
861  UsedShmemSegID = (unsigned long) NextShmemSegID;
862 
863  /*
864  * If AnonymousShmem is NULL here, then we're not using anonymous shared
865  * memory, and should return a pointer to the System V shared memory
866  * block. Otherwise, the System V shared memory block is only a shim, and
867  * we must return a pointer to the real block.
868  */
869  if (AnonymousShmem == NULL)
870  return hdr;
871  memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader));
872  return (PGShmemHeader *) AnonymousShmem;
873 }
874 
875 #ifdef EXEC_BACKEND
876 
877 /*
878  * PGSharedMemoryReAttach
879  *
880  * This is called during startup of a postmaster child process to re-attach to
881  * an already existing shared memory segment. This is needed only in the
882  * EXEC_BACKEND case; otherwise postmaster children inherit the shared memory
883  * segment attachment via fork().
884  *
885  * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
886  * routine. The caller must have already restored them to the postmaster's
887  * values.
888  */
889 void
890 PGSharedMemoryReAttach(void)
891 {
892  IpcMemoryId shmid;
893  PGShmemHeader *hdr;
894  IpcMemoryState state;
895  void *origUsedShmemSegAddr = UsedShmemSegAddr;
896 
897  Assert(UsedShmemSegAddr != NULL);
898  Assert(IsUnderPostmaster);
899 
900 #ifdef __CYGWIN__
901  /* cygipc (currently) appears to not detach on exec. */
902  shmdt(NULL);
903  UsedShmemSegAddr = origUsedShmemSegAddr;
904 #endif
905 
906  elog(DEBUG3, "attaching to %p", UsedShmemSegAddr);
907  shmid = shmget(UsedShmemSegID, sizeof(PGShmemHeader), 0);
908  if (shmid < 0)
909  state = SHMSTATE_ENOENT;
910  else
911  state = PGSharedMemoryAttach(shmid, UsedShmemSegAddr, &hdr);
912  if (state != SHMSTATE_ATTACHED)
913  elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m",
914  (int) UsedShmemSegID, UsedShmemSegAddr);
915  if (hdr != origUsedShmemSegAddr)
916  elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)",
917  hdr, origUsedShmemSegAddr);
918  dsm_set_control_handle(hdr->dsm_control);
919 
920  UsedShmemSegAddr = hdr; /* probably redundant */
921 }
922 
923 /*
924  * PGSharedMemoryNoReAttach
925  *
926  * This is called during startup of a postmaster child process when we choose
927  * *not* to re-attach to the existing shared memory segment. We must clean up
928  * to leave things in the appropriate state. This is not used in the non
929  * EXEC_BACKEND case, either.
930  *
931  * The child process startup logic might or might not call PGSharedMemoryDetach
932  * after this; make sure that it will be a no-op if called.
933  *
934  * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
935  * routine. The caller must have already restored them to the postmaster's
936  * values.
937  */
938 void
939 PGSharedMemoryNoReAttach(void)
940 {
941  Assert(UsedShmemSegAddr != NULL);
942  Assert(IsUnderPostmaster);
943 
944 #ifdef __CYGWIN__
945  /* cygipc (currently) appears to not detach on exec. */
946  shmdt(NULL);
947 #endif
948 
949  /* For cleanliness, reset UsedShmemSegAddr to show we're not attached. */
950  UsedShmemSegAddr = NULL;
951  /* And the same for UsedShmemSegID. */
952  UsedShmemSegID = 0;
953 }
954 
955 #endif /* EXEC_BACKEND */
956 
957 /*
958  * PGSharedMemoryDetach
959  *
960  * Detach from the shared memory segment, if still attached. This is not
961  * intended to be called explicitly by the process that originally created the
962  * segment (it will have on_shmem_exit callback(s) registered to do that).
963  * Rather, this is for subprocesses that have inherited an attachment and want
964  * to get rid of it.
965  *
966  * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
967  * routine, also AnonymousShmem and AnonymousShmemSize.
968  */
969 void
970 PGSharedMemoryDetach(void)
971 {
972  if (UsedShmemSegAddr != NULL)
973  {
974  if ((shmdt(UsedShmemSegAddr) < 0)
975 #if defined(EXEC_BACKEND) && defined(__CYGWIN__)
976  /* Work-around for cygipc exec bug */
977  && shmdt(NULL) < 0
978 #endif
979  )
980  elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr);
981  UsedShmemSegAddr = NULL;
982  }
983 
984  if (AnonymousShmem != NULL)
985  {
986  if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
987  elog(LOG, "munmap(%p, %zu) failed: %m",
988  AnonymousShmem, AnonymousShmemSize);
989  AnonymousShmem = NULL;
990  }
991 }