PostgreSQL Source Code git master
sysv_shmem.c
1/*-------------------------------------------------------------------------
2 *
3 * sysv_shmem.c
4 * Implement shared memory using SysV facilities
5 *
6 * These routines used to be a fairly thin layer on top of SysV shared
7 * memory functionality. With the addition of anonymous-shmem logic,
8 * they're a bit fatter now. We still require a SysV shmem block to
9 * exist, though, because mmap'd shmem provides no way to find out how
10 * many processes are attached, which we need for interlocking purposes.
11 *
12 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
13 * Portions Copyright (c) 1994, Regents of the University of California
14 *
15 * IDENTIFICATION
16 * src/backend/port/sysv_shmem.c
17 *
18 *-------------------------------------------------------------------------
19 */
20#include "postgres.h"
21
22#include <signal.h>
23#include <unistd.h>
24#include <sys/file.h>
25#include <sys/ipc.h>
26#include <sys/mman.h>
27#include <sys/shm.h>
28#include <sys/stat.h>
29
30#include "miscadmin.h"
31#include "port/pg_bitutils.h"
32#include "portability/mem.h"
33#include "storage/dsm.h"
34#include "storage/fd.h"
35#include "storage/ipc.h"
36#include "storage/pg_shmem.h"
37#include "utils/guc.h"
38#include "utils/guc_hooks.h"
39#include "utils/pidfile.h"
40
41
42/*
43 * As of PostgreSQL 9.3, we normally allocate only a very small amount of
44 * System V shared memory, and only for the purposes of providing an
45 * interlock to protect the data directory. The real shared memory block
46 * is allocated using mmap(). This works around the problem that many
47 * systems have very low limits on the amount of System V shared memory
48 * that can be allocated. Even a limit of a few megabytes will be enough
49 * to run many copies of PostgreSQL without needing to adjust system settings.
50 *
51 * We assume that no one will attempt to run PostgreSQL 9.3 or later on
52 * systems that are ancient enough that anonymous shared memory is not
53 * supported, such as pre-2.4 versions of Linux. If that turns out to be
54 * false, we might need to add compile and/or run-time tests here and do this
55 * only if the running kernel supports it.
56 *
57 * However, we must always disable this logic in the EXEC_BACKEND case, and
58 * fall back to the old method of allocating the entire segment using System V
59 * shared memory, because there's no way to attach an anonymous mmap'd segment
60 * to a process after exec(). Since EXEC_BACKEND is intended only for
61 * developer use, this shouldn't be a big problem. Because of this, we do
62 * not worry about supporting anonymous shmem in the EXEC_BACKEND cases below.
63 *
64 * As of PostgreSQL 12, we regained the ability to use a large System V shared
65 * memory region even in non-EXEC_BACKEND builds, if shared_memory_type is set
66 * to sysv (though this is not the default).
67 */
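/*
 * [Editor's note: illustrative sketch, not part of sysv_shmem.c.]  The
 * property the comment above relies on is that a System V segment, unlike an
 * anonymous mmap() region, can report how many processes are attached to it.
 * A minimal standalone check of that counter might look like the sketch
 * below; PGSharedMemoryAttach() later in this file uses the same shm_nattch
 * field to pick between SHMSTATE_ATTACHED and SHMSTATE_UNATTACHED.
 */
#if 0							/* illustration only */
#include <sys/ipc.h>
#include <sys/shm.h>

static long
sysv_attach_count(int shmid)
{
	struct shmid_ds shmStat;

	if (shmctl(shmid, IPC_STAT, &shmStat) < 0)
		return -1;				/* segment gone, or not ours to inspect */
	return (long) shmStat.shm_nattch;	/* processes currently attached */
}
#endif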
68
69
70typedef key_t IpcMemoryKey; /* shared memory key passed to shmget(2) */
71typedef int IpcMemoryId; /* shared memory ID returned by shmget(2) */
72
73/*
74 * How does a given IpcMemoryId relate to this PostgreSQL process?
75 *
76 * One could recycle unattached segments of different data directories if we
77 * distinguished that case from other SHMSTATE_FOREIGN cases. Doing so would
78 * cause us to visit less of the key space, making us less likely to detect a
79 * SHMSTATE_ATTACHED key. It would also complicate the concurrency analysis,
80 * in that postmasters of different data directories could simultaneously
81 * attempt to recycle a given key. We'll waste keys longer in some cases, but
82 * avoiding the problems of the alternative justifies that loss.
83 */
84typedef enum
85{
86 SHMSTATE_ANALYSIS_FAILURE, /* unexpected failure to analyze the ID */
87 SHMSTATE_ATTACHED, /* pertinent to DataDir, has attached PIDs */
88 SHMSTATE_ENOENT, /* no segment of that ID */
89 SHMSTATE_FOREIGN, /* exists, but not pertinent to DataDir */
90 SHMSTATE_UNATTACHED, /* pertinent to DataDir, no attached PIDs */
91} IpcMemoryState;
92
93
94unsigned long UsedShmemSegID = 0;
95void *UsedShmemSegAddr = NULL;
96
97static Size AnonymousShmemSize;
98static void *AnonymousShmem = NULL;
99
100static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
101static void IpcMemoryDetach(int status, Datum shmaddr);
102static void IpcMemoryDelete(int status, Datum shmId);
103static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId,
104 void *attachAt,
105 PGShmemHeader **addr);
106
107
108/*
109 * InternalIpcMemoryCreate(memKey, size)
110 *
111 * Attempt to create a new shared memory segment with the specified key.
112 * Will fail (return NULL) if such a segment already exists. If successful,
113 * attach the segment to the current process and return its attached address.
114 * On success, callbacks are registered with on_shmem_exit to detach and
115 * delete the segment when on_shmem_exit is called.
116 *
117 * If we fail with a failure code other than collision-with-existing-segment,
118 * print out an error and abort. Other types of errors are not recoverable.
119 */
120static void *
121InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size)
122{
123 IpcMemoryId shmid;
124 void *requestedAddress = NULL;
125 void *memAddress;
126
127 /*
128 * Normally we just pass requestedAddress = NULL to shmat(), allowing the
129 * system to choose where the segment gets mapped. But in an EXEC_BACKEND
130 * build, it's possible for whatever is chosen in the postmaster to not
131 * work for backends, due to variations in address space layout. As a
132 * rather klugy workaround, allow the user to specify the address to use
133 * via setting the environment variable PG_SHMEM_ADDR. (If this were of
134 * interest for anything except debugging, we'd probably create a cleaner
135 * and better-documented way to set it, such as a GUC.)
136 */
137#ifdef EXEC_BACKEND
138 {
139 char *pg_shmem_addr = getenv("PG_SHMEM_ADDR");
140
141 if (pg_shmem_addr)
142 requestedAddress = (void *) strtoul(pg_shmem_addr, NULL, 0);
143 else
144 {
145#if defined(__darwin__) && SIZEOF_VOID_P == 8
146 /*
147 * Provide a default value that is believed to avoid problems with
148 * ASLR on the current macOS release.
149 */
150 requestedAddress = (void *) 0x80000000000;
151#endif
152 }
153 }
154#endif
155
156 shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection);
157
158 if (shmid < 0)
159 {
160 int shmget_errno = errno;
161
162 /*
163 * Fail quietly if error indicates a collision with existing segment.
164 * One would expect EEXIST, given that we said IPC_EXCL, but perhaps
165 * we could get a permission violation instead? Also, EIDRM might
166 * occur if an old seg is slated for destruction but not gone yet.
167 */
168 if (shmget_errno == EEXIST || shmget_errno == EACCES
169#ifdef EIDRM
170 || shmget_errno == EIDRM
171#endif
172 )
173 return NULL;
174
175 /*
176 * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if
177 * there is an existing segment but it's smaller than "size" (this is
178 * a result of poorly-thought-out ordering of error tests). To
179 * distinguish between collision and invalid size in such cases, we
180 * make a second try with size = 0. These kernels do not test size
181 * against SHMMIN in the preexisting-segment case, so we will not get
182 * EINVAL a second time if there is such a segment.
183 */
184 if (shmget_errno == EINVAL)
185 {
186 shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection);
187
188 if (shmid < 0)
189 {
190 /* As above, fail quietly if we verify a collision */
191 if (errno == EEXIST || errno == EACCES
192#ifdef EIDRM
193 || errno == EIDRM
194#endif
195 )
196 return NULL;
197 /* Otherwise, fall through to report the original error */
198 }
199 else
200 {
201 /*
202 * On most platforms we cannot get here because SHMMIN is
203 * greater than zero. However, if we do succeed in creating a
204 * zero-size segment, free it and then fall through to report
205 * the original error.
206 */
207 if (shmctl(shmid, IPC_RMID, NULL) < 0)
208 elog(LOG, "shmctl(%d, %d, 0) failed: %m",
209 (int) shmid, IPC_RMID);
210 }
211 }
212
213 /*
214 * Else complain and abort.
215 *
216 * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX
217 * is violated. SHMALL violation might be reported as either ENOMEM
218 * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which
219 * it should be. SHMMNI violation is ENOSPC, per spec. Just plain
220 * not-enough-RAM is ENOMEM.
221 */
222 errno = shmget_errno;
223 ereport(FATAL,
224 (errmsg("could not create shared memory segment: %m"),
225 errdetail("Failed system call was shmget(key=%lu, size=%zu, 0%o).",
226 (unsigned long) memKey, size,
227 IPC_CREAT | IPC_EXCL | IPCProtection),
228 (shmget_errno == EINVAL) ?
229 errhint("This error usually means that PostgreSQL's request for a shared memory "
230 "segment exceeded your kernel's SHMMAX parameter, or possibly that "
231 "it is less than "
232 "your kernel's SHMMIN parameter.\n"
233 "The PostgreSQL documentation contains more information about shared "
234 "memory configuration.") : 0,
235 (shmget_errno == ENOMEM) ?
236 errhint("This error usually means that PostgreSQL's request for a shared "
237 "memory segment exceeded your kernel's SHMALL parameter. You might need "
238 "to reconfigure the kernel with larger SHMALL.\n"
239 "The PostgreSQL documentation contains more information about shared "
240 "memory configuration.") : 0,
241 (shmget_errno == ENOSPC) ?
242 errhint("This error does *not* mean that you have run out of disk space. "
243 "It occurs either if all available shared memory IDs have been taken, "
244 "in which case you need to raise the SHMMNI parameter in your kernel, "
245 "or because the system's overall limit for shared memory has been "
246 "reached.\n"
247 "The PostgreSQL documentation contains more information about shared "
248 "memory configuration.") : 0));
249 }
250
251 /* Register on-exit routine to delete the new segment */
252 on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid));
253
254 /* OK, should be able to attach to the segment */
255 memAddress = shmat(shmid, requestedAddress, PG_SHMAT_FLAGS);
256
257 if (memAddress == (void *) -1)
258 elog(FATAL, "shmat(id=%d, addr=%p, flags=0x%x) failed: %m",
259 shmid, requestedAddress, PG_SHMAT_FLAGS);
260
261 /* Register on-exit routine to detach new segment before deleting */
262 on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress));
263
264 /*
265 * Store shmem key and ID in data directory lockfile. Format to try to
266 * keep it the same length always (trailing junk in the lockfile won't
267 * hurt, but might confuse humans).
268 */
269 {
270 char line[64];
271
272 sprintf(line, "%9lu %9lu",
273 (unsigned long) memKey, (unsigned long) shmid);
274 AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line);
275 }
276
277 return memAddress;
278}
279
280/****************************************************************************/
281/* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */
282/* from process' address space */
283/* (called as an on_shmem_exit callback, hence funny argument list) */
284/****************************************************************************/
285static void
286IpcMemoryDetach(int status, Datum shmaddr)
287{
288 /* Detach System V shared memory block. */
289 if (shmdt(DatumGetPointer(shmaddr)) < 0)
290 elog(LOG, "shmdt(%p) failed: %m", DatumGetPointer(shmaddr));
291}
292
293/****************************************************************************/
294/* IpcMemoryDelete(status, shmId) deletes a shared memory segment */
295/* (called as an on_shmem_exit callback, hence funny argument list) */
296/****************************************************************************/
297static void
298IpcMemoryDelete(int status, Datum shmId)
299{
300 if (shmctl(DatumGetInt32(shmId), IPC_RMID, NULL) < 0)
301 elog(LOG, "shmctl(%d, %d, 0) failed: %m",
302 DatumGetInt32(shmId), IPC_RMID);
303}
304
305/*
306 * PGSharedMemoryIsInUse
307 *
308 * Is a previously-existing shmem segment still existing and in use?
309 *
310 * The point of this exercise is to detect the case where a prior postmaster
311 * crashed, but it left child backends that are still running. Therefore
312 * we only care about shmem segments that are associated with the intended
313 * DataDir. This is an important consideration since accidental matches of
314 * shmem segment IDs are reasonably common.
315 */
316bool
317PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
318{
319 PGShmemHeader *memAddress;
320 IpcMemoryState state;
321
322 state = PGSharedMemoryAttach((IpcMemoryId) id2, NULL, &memAddress);
323 if (memAddress && shmdt(memAddress) < 0)
324 elog(LOG, "shmdt(%p) failed: %m", memAddress);
325 switch (state)
326 {
327 case SHMSTATE_ENOENT:
328 case SHMSTATE_FOREIGN:
329 case SHMSTATE_UNATTACHED:
330 return false;
331 case SHMSTATE_ANALYSIS_FAILURE:
332 case SHMSTATE_ATTACHED:
333 return true;
334 }
335 return true;
336}
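/*
 * [Editor's note: illustrative sketch, not part of sysv_shmem.c.]  The
 * id1/id2 arguments above come from the shared-memory-key line that
 * InternalIpcMemoryCreate() writes into the data directory lock file using
 * "%9lu %9lu".  The real parsing lives in the lock-file code elsewhere in
 * the tree; this only sketches the shape of the round trip, assuming "line"
 * holds that lock-file line and that the helper name is hypothetical.
 */
#if 0							/* illustration only */
static bool
shmem_line_in_use(const char *line)
{
	unsigned long key;
	unsigned long id;

	/* two decimal fields, as written by InternalIpcMemoryCreate() above */
	if (sscanf(line, "%lu %lu", &key, &id) != 2)
		return false;			/* no shmem key line => nothing to check */
	return PGSharedMemoryIsInUse(key, id);
}
#endif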
337
338/*
339 * Test for a segment with id shmId; see comment at IpcMemoryState.
340 *
341 * If the segment exists, we'll attempt to attach to it, using attachAt
342 * if that's not NULL (but it's best to pass NULL if possible).
343 *
344 * *addr is set to the segment memory address if we attached to it, else NULL.
345 */
346static IpcMemoryState
347PGSharedMemoryAttach(IpcMemoryId shmId,
348 void *attachAt,
349 PGShmemHeader **addr)
350{
351 struct shmid_ds shmStat;
352 struct stat statbuf;
353 PGShmemHeader *hdr;
354
355 *addr = NULL;
356
357 /*
358 * First, try to stat the shm segment ID, to see if it exists at all.
359 */
360 if (shmctl(shmId, IPC_STAT, &shmStat) < 0)
361 {
362 /*
363 * EINVAL actually has multiple possible causes documented in the
364 * shmctl man page, but we assume it must mean the segment no longer
365 * exists.
366 */
367 if (errno == EINVAL)
368 return SHMSTATE_ENOENT;
369
370 /*
371 * EACCES implies we have no read permission, which means it is not a
372 * Postgres shmem segment (or at least, not one that is relevant to
373 * our data directory).
374 */
375 if (errno == EACCES)
376 return SHMSTATE_FOREIGN;
377
378 /*
379 * Some Linux kernel versions (in fact, all of them as of July 2007)
380 * sometimes return EIDRM when EINVAL is correct. The Linux kernel
381 * actually does not have any internal state that would justify
382 * returning EIDRM, so we can get away with assuming that EIDRM is
383 * equivalent to EINVAL on that platform.
384 */
385#ifdef HAVE_LINUX_EIDRM_BUG
386 if (errno == EIDRM)
387 return SHMSTATE_ENOENT;
388#endif
389
390 /*
391 * Otherwise, we had better assume that the segment is in use. The
392 * only likely case is (non-Linux, assumed spec-compliant) EIDRM,
393 * which implies that the segment has been IPC_RMID'd but there are
394 * still processes attached to it.
395 */
396 return SHMSTATE_ANALYSIS_FAILURE;
397 }
398
399 /*
400 * Try to attach to the segment and see if it matches our data directory.
401 * This avoids any risk of duplicate-shmem-key conflicts on machines that
402 * are running several postmasters under the same userid.
403 *
404 * (When we're called from PGSharedMemoryCreate, this stat call is
405 * duplicative; but since this isn't a high-traffic case it's not worth
406 * trying to optimize.)
407 */
408 if (stat(DataDir, &statbuf) < 0)
409 return SHMSTATE_ANALYSIS_FAILURE; /* can't stat; be conservative */
410
411 hdr = (PGShmemHeader *) shmat(shmId, attachAt, PG_SHMAT_FLAGS);
412 if (hdr == (PGShmemHeader *) -1)
413 {
414 /*
415 * Attachment failed. The cases we're interested in are the same as
416 * for the shmctl() call above. In particular, note that the owning
417 * postmaster could have terminated and removed the segment between
418 * shmctl() and shmat().
419 *
420 * If attachAt isn't NULL, it's possible that EINVAL reflects a
421 * problem with that address not a vanished segment, so it's best to
422 * pass NULL when probing for conflicting segments.
423 */
424 if (errno == EINVAL)
425 return SHMSTATE_ENOENT; /* segment disappeared */
426 if (errno == EACCES)
427 return SHMSTATE_FOREIGN; /* must be non-Postgres */
428#ifdef HAVE_LINUX_EIDRM_BUG
429 if (errno == EIDRM)
430 return SHMSTATE_ENOENT; /* segment disappeared */
431#endif
432 /* Otherwise, be conservative. */
433 return SHMSTATE_ANALYSIS_FAILURE;
434 }
435 *addr = hdr;
436
437 if (hdr->magic != PGShmemMagic ||
438 hdr->device != statbuf.st_dev ||
439 hdr->inode != statbuf.st_ino)
440 {
441 /*
442 * It's either not a Postgres segment, or not one for my data
443 * directory.
444 */
445 return SHMSTATE_FOREIGN;
446 }
447
448 /*
449 * It does match our data directory, so now test whether any processes are
450 * still attached to it. (We are, now, but the shm_nattch result is from
451 * before we attached to it.)
452 */
453 return shmStat.shm_nattch == 0 ? SHMSTATE_UNATTACHED : SHMSTATE_ATTACHED;
454}
455
456/*
457 * Identify the huge page size to use, and compute the related mmap flags.
458 *
459 * Some Linux kernel versions have a bug causing mmap() to fail on requests
460 * that are not a multiple of the hugepage size. Versions without that bug
461 * instead silently round the request up to the next hugepage multiple ---
462 * and then munmap() fails when we give it a size different from that.
463 * So we have to round our request up to a multiple of the actual hugepage
464 * size to avoid trouble.
465 *
466 * Doing the round-up ourselves also lets us make use of the extra memory,
467 * rather than just wasting it. Currently, we just increase the available
468 * space recorded in the shmem header, which will make the extra usable for
469 * purposes such as additional locktable entries. Someday, for very large
470 * hugepage sizes, we might want to think about more invasive strategies,
471 * such as increasing shared_buffers to absorb the extra space.
472 *
473 * Returns the (real, assumed or config provided) page size into
474 * *hugepagesize, and the hugepage-related mmap flags to use into
475 * *mmap_flags if requested by the caller. If huge pages are not supported,
476 * *hugepagesize and *mmap_flags are set to 0.
477 */
478void
479GetHugePageSize(Size *hugepagesize, int *mmap_flags)
480{
481#ifdef MAP_HUGETLB
482
483 Size default_hugepagesize = 0;
484 Size hugepagesize_local = 0;
485 int mmap_flags_local = 0;
486
487 /*
488 * System-dependent code to find out the default huge page size.
489 *
490 * On Linux, read /proc/meminfo looking for a line like "Hugepagesize:
491 * nnnn kB". Ignore any failures, falling back to the preset default.
492 */
493#ifdef __linux__
494
495 {
496 FILE *fp = AllocateFile("/proc/meminfo", "r");
497 char buf[128];
498 unsigned int sz;
499 char ch;
500
501 if (fp)
502 {
503 while (fgets(buf, sizeof(buf), fp))
504 {
505 if (sscanf(buf, "Hugepagesize: %u %c", &sz, &ch) == 2)
506 {
507 if (ch == 'k')
508 {
509 default_hugepagesize = sz * (Size) 1024;
510 break;
511 }
512 /* We could accept other units besides kB, if needed */
513 }
514 }
515 FreeFile(fp);
516 }
517 }
518#endif /* __linux__ */
519
520 if (huge_page_size != 0)
521 {
522 /* If huge page size is requested explicitly, use that. */
523 hugepagesize_local = (Size) huge_page_size * 1024;
524 }
525 else if (default_hugepagesize != 0)
526 {
527 /* Otherwise use the system default, if we have it. */
528 hugepagesize_local = default_hugepagesize;
529 }
530 else
531 {
532 /*
533 * If we fail to find out the system's default huge page size, or no
534 * huge page size is requested explicitly, assume it is 2MB. This will
535 * work fine when the actual size is less. If it's more, we might get
536 * mmap() or munmap() failures due to unaligned requests; but at this
537 * writing, there are no reports of any non-Linux systems being picky
538 * about that.
539 */
540 hugepagesize_local = 2 * 1024 * 1024;
541 }
542
543 mmap_flags_local = MAP_HUGETLB;
544
545 /*
546 * On recent enough Linux, also include the explicit page size, if
547 * necessary.
548 */
549#if defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT)
550 if (hugepagesize_local != default_hugepagesize)
551 {
552 int shift = pg_ceil_log2_64(hugepagesize_local);
553
554 mmap_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
555 }
556#endif
557
558 /* assign the results found */
559 if (mmap_flags)
560 *mmap_flags = mmap_flags_local;
561 if (hugepagesize)
562 *hugepagesize = hugepagesize_local;
563
564#else
565
566 if (hugepagesize)
567 *hugepagesize = 0;
568 if (mmap_flags)
569 *mmap_flags = 0;
570
571#endif /* MAP_HUGETLB */
572}
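/*
 * [Editor's note: illustrative sketch, not part of sysv_shmem.c.]  A worked
 * example of the MAP_HUGE_SHIFT encoding built above: Linux expects
 * log2(pagesize) in the MAP_HUGE_MASK bit field of the mmap() flags, so an
 * explicitly requested 1GB page yields a shift of 30 (the value the kernel
 * headers call MAP_HUGE_1GB), while the common 2MB default would yield 21.
 * The function name and the 1GB figure are assumptions for illustration.
 */
#if 0							/* illustration only */
static int
huge_mmap_flags_for_1gb(void)
{
	Size		hugepagesize = (Size) 1024 * 1024 * 1024;	/* assumed 1GB */
	int			mmap_flags = MAP_HUGETLB;
	int			shift = pg_ceil_log2_64(hugepagesize);	/* = 30 */

	mmap_flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
	return mmap_flags;			/* asks mmap() for explicitly sized 1GB pages */
}
#endif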
573
574/*
575 * GUC check_hook for huge_page_size
576 */
577bool
578check_huge_page_size(int *newval, void **extra, GucSource source)
579{
580#if !(defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT))
581 /* Recent enough Linux only, for now. See GetHugePageSize(). */
582 if (*newval != 0)
583 {
584 GUC_check_errdetail("\"huge_page_size\" must be 0 on this platform.");
585 return false;
586 }
587#endif
588 return true;
589}
590
591/*
592 * Creates an anonymous mmap()ed shared memory segment.
593 *
594 * Pass the requested size in *size. This function will modify *size to the
595 * actual size of the allocation, if it ends up allocating a segment that is
596 * larger than requested.
597 */
598static void *
599CreateAnonymousSegment(Size *size)
600{
601 Size allocsize = *size;
602 void *ptr = MAP_FAILED;
603 int mmap_errno = 0;
604
605#ifndef MAP_HUGETLB
606 /* PGSharedMemoryCreate should have dealt with this case */
607 Assert(huge_pages != HUGE_PAGES_ON);
608#else
609 if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
610 {
611 /*
612 * Round up the request size to a suitable large value.
613 */
614 Size hugepagesize;
615 int mmap_flags;
616
617 GetHugePageSize(&hugepagesize, &mmap_flags);
618
619 if (allocsize % hugepagesize != 0)
620 allocsize += hugepagesize - (allocsize % hugepagesize);
621
622 ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
623 PG_MMAP_FLAGS | mmap_flags, -1, 0);
624 mmap_errno = errno;
625 if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
626 elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m",
627 allocsize);
628 }
629#endif
630
631 /*
632 * Report whether huge pages are in use. This needs to be tracked before
633 * the second mmap() call if attempting to use huge pages failed
634 * previously.
635 */
636 SetConfigOption("huge_pages_status", (ptr == MAP_FAILED) ? "off" : "on",
637 PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
638
639 if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON)
640 {
641 /*
642 * Use the original size, not the rounded-up value, when falling back
643 * to non-huge pages.
644 */
645 allocsize = *size;
646 ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
647 PG_MMAP_FLAGS, -1, 0);
648 mmap_errno = errno;
649 }
650
651 if (ptr == MAP_FAILED)
652 {
653 errno = mmap_errno;
654 ereport(FATAL,
655 (errmsg("could not map anonymous shared memory: %m"),
656 (mmap_errno == ENOMEM) ?
657 errhint("This error usually means that PostgreSQL's request "
658 "for a shared memory segment exceeded available memory, "
659 "swap space, or huge pages. To reduce the request size "
660 "(currently %zu bytes), reduce PostgreSQL's shared "
661 "memory usage, perhaps by reducing \"shared_buffers\" or "
662 "\"max_connections\".",
663 allocsize) : 0));
664 }
665
666 *size = allocsize;
667 return ptr;
668}
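/*
 * [Editor's note: illustrative sketch, not part of sysv_shmem.c.]  The
 * round-up performed above, shown with concrete (assumed) numbers: a 145MB
 * request against the common 2MB huge page size is 72.5 pages, so it is
 * rounded up to 73 pages, i.e. 146MB, and the caller sees the larger value
 * through *size.  The helper name is hypothetical.
 */
#if 0							/* illustration only */
static Size
round_up_to_hugepage(Size allocsize, Size hugepagesize)
{
	if (allocsize % hugepagesize != 0)
		allocsize += hugepagesize - (allocsize % hugepagesize);
	return allocsize;			/* e.g. 145MB with 2MB pages -> 146MB */
}
#endif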
669
670/*
671 * AnonymousShmemDetach --- detach from an anonymous mmap'd block
672 * (called as an on_shmem_exit callback, hence funny argument list)
673 */
674static void
675AnonymousShmemDetach(int status, Datum arg)
676{
677 /* Release anonymous shared memory block, if any. */
678 if (AnonymousShmem != NULL)
679 {
680 if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
681 elog(LOG, "munmap(%p, %zu) failed: %m",
682 AnonymousShmem, AnonymousShmemSize);
683 AnonymousShmem = NULL;
684 }
685}
686
687/*
688 * PGSharedMemoryCreate
689 *
690 * Create a shared memory segment of the given size and initialize its
691 * standard header. Also, register an on_shmem_exit callback to release
692 * the storage.
693 *
694 * Dead Postgres segments pertinent to this DataDir are recycled if found, but
695 * we do not fail upon collision with foreign shmem segments. The idea here
696 * is to detect and re-use keys that may have been assigned by a crashed
697 * postmaster or backend.
698 */
699PGShmemHeader *
700PGSharedMemoryCreate(Size size,
701 PGShmemHeader **shim)
702{
703 IpcMemoryKey NextShmemSegID;
704 void *memAddress;
705 PGShmemHeader *hdr;
706 struct stat statbuf;
707 Size sysvsize;
708
709 /*
710 * We use the data directory's ID info (inode and device numbers) to
711 * positively identify shmem segments associated with this data dir, and
712 * also as seeds for searching for a free shmem key.
713 */
714 if (stat(DataDir, &statbuf) < 0)
715 ereport(FATAL,
716 (errcode_for_file_access(),
717 errmsg("could not stat data directory \"%s\": %m",
718 DataDir)));
719
720 /* Complain if hugepages demanded but we can't possibly support them */
721#if !defined(MAP_HUGETLB)
722 if (huge_pages == HUGE_PAGES_ON)
723 ereport(ERROR,
724 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
725 errmsg("huge pages not supported on this platform")));
726#endif
727
728 /* For now, we don't support huge pages in SysV memory */
729 if (huge_pages == HUGE_PAGES_ON && shared_memory_type != SHMEM_TYPE_MMAP)
730 ereport(ERROR,
731 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
732 errmsg("huge pages not supported with the current \"shared_memory_type\" setting")));
733
734 /* Room for a header? */
735 Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
736
737 if (shared_memory_type == SHMEM_TYPE_MMAP)
738 {
739 AnonymousShmem = CreateAnonymousSegment(&size);
740 AnonymousShmemSize = size;
741
742 /* Register on-exit routine to unmap the anonymous segment */
743 on_shmem_exit(AnonymousShmemDetach, (Datum) 0);
744
745 /* Now we need only allocate a minimal-sized SysV shmem block. */
746 sysvsize = sizeof(PGShmemHeader);
747 }
748 else
749 {
750 sysvsize = size;
751
752 /* huge pages are only available with mmap */
753 SetConfigOption("huge_pages_status", "off",
754 PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
755 }
756
757 /*
758 * Loop till we find a free IPC key. Trust CreateDataDirLockFile() to
759 * ensure no more than one postmaster per data directory can enter this
760 * loop simultaneously. (CreateDataDirLockFile() does not entirely ensure
761 * that, but prefer fixing it over coping here.)
762 */
763 NextShmemSegID = statbuf.st_ino;
764
765 for (;;)
766 {
767 IpcMemoryId shmid;
768 PGShmemHeader *oldhdr;
769 IpcMemoryState state;
770
771 /* Try to create new segment */
772 memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize);
773 if (memAddress)
774 break; /* successful create and attach */
775
776 /* Check shared memory and possibly remove and recreate */
777
778 /*
779 * shmget() failure is typically EACCES, hence SHMSTATE_FOREIGN.
780 * ENOENT, a narrow possibility, implies SHMSTATE_ENOENT, but one can
781 * safely treat SHMSTATE_ENOENT like SHMSTATE_FOREIGN.
782 */
783 shmid = shmget(NextShmemSegID, sizeof(PGShmemHeader), 0);
784 if (shmid < 0)
785 {
786 oldhdr = NULL;
787 state = SHMSTATE_FOREIGN;
788 }
789 else
790 state = PGSharedMemoryAttach(shmid, NULL, &oldhdr);
791
792 switch (state)
793 {
794 case SHMSTATE_ANALYSIS_FAILURE:
795 case SHMSTATE_ATTACHED:
796 ereport(FATAL,
797 (errcode(ERRCODE_LOCK_FILE_EXISTS),
798 errmsg("pre-existing shared memory block (key %lu, ID %lu) is still in use",
799 (unsigned long) NextShmemSegID,
800 (unsigned long) shmid),
801 errhint("Terminate any old server processes associated with data directory \"%s\".",
802 DataDir)));
803 break;
804 case SHMSTATE_ENOENT:
805
806 /*
807 * To our surprise, some other process deleted since our last
808 * InternalIpcMemoryCreate(). Moments earlier, we would have
809 * seen SHMSTATE_FOREIGN. Try that same ID again.
810 */
811 elog(LOG,
812 "shared memory block (key %lu, ID %lu) deleted during startup",
813 (unsigned long) NextShmemSegID,
814 (unsigned long) shmid);
815 break;
816 case SHMSTATE_FOREIGN:
817 NextShmemSegID++;
818 break;
819 case SHMSTATE_UNATTACHED:
820
821 /*
822 * The segment pertains to DataDir, and every process that had
823 * used it has died or detached. Zap it, if possible, and any
824 * associated dynamic shared memory segments, as well. This
825 * shouldn't fail, but if it does, assume the segment belongs
826 * to someone else after all, and try the next candidate.
827 * Otherwise, try again to create the segment. That may fail
828 * if some other process creates the same shmem key before we
829 * do, in which case we'll try the next key.
830 */
831 if (oldhdr->dsm_control != 0)
832 dsm_cleanup_using_control_segment(oldhdr->dsm_control);
833 if (shmctl(shmid, IPC_RMID, NULL) < 0)
834 NextShmemSegID++;
835 break;
836 }
837
838 if (oldhdr && shmdt(oldhdr) < 0)
839 elog(LOG, "shmdt(%p) failed: %m", oldhdr);
840 }
841
842 /* Initialize new segment. */
843 hdr = (PGShmemHeader *) memAddress;
844 hdr->creatorPID = getpid();
845 hdr->magic = PGShmemMagic;
846 hdr->dsm_control = 0;
847
848 /* Fill in the data directory ID info, too */
849 hdr->device = statbuf.st_dev;
850 hdr->inode = statbuf.st_ino;
851
852 /*
853 * Initialize space allocation status for segment.
854 */
855 hdr->totalsize = size;
856 hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
857 *shim = hdr;
858
859 /* Save info for possible future use */
860 UsedShmemSegAddr = memAddress;
861 UsedShmemSegID = (unsigned long) NextShmemSegID;
862
863 /*
864 * If AnonymousShmem is NULL here, then we're not using anonymous shared
865 * memory, and should return a pointer to the System V shared memory
866 * block. Otherwise, the System V shared memory block is only a shim, and
867 * we must return a pointer to the real block.
868 */
869 if (AnonymousShmem == NULL)
870 return hdr;
871 memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader));
872 return (PGShmemHeader *) AnonymousShmem;
873}
874
875#ifdef EXEC_BACKEND
876
877/*
878 * PGSharedMemoryReAttach
879 *
880 * This is called during startup of a postmaster child process to re-attach to
881 * an already existing shared memory segment. This is needed only in the
882 * EXEC_BACKEND case; otherwise postmaster children inherit the shared memory
883 * segment attachment via fork().
884 *
885 * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
886 * routine. The caller must have already restored them to the postmaster's
887 * values.
888 */
889void
890PGSharedMemoryReAttach(void)
891{
892 IpcMemoryId shmid;
893 PGShmemHeader *hdr;
894 IpcMemoryState state;
895 void *origUsedShmemSegAddr = UsedShmemSegAddr;
896
897 Assert(UsedShmemSegAddr != NULL);
898 Assert(IsUnderPostmaster);
899
900#ifdef __CYGWIN__
901 /* cygipc (currently) appears to not detach on exec. */
902 PGSharedMemoryDetach();
903 UsedShmemSegAddr = origUsedShmemSegAddr;
904#endif
905
906 elog(DEBUG3, "attaching to %p", UsedShmemSegAddr);
907 shmid = shmget(UsedShmemSegID, sizeof(PGShmemHeader), 0);
908 if (shmid < 0)
909 state = SHMSTATE_FOREIGN;
910 else
911 state = PGSharedMemoryAttach(shmid, UsedShmemSegAddr, &hdr);
912 if (state != SHMSTATE_ATTACHED)
913 elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m",
914 (int) UsedShmemSegID, UsedShmemSegAddr);
915 if (hdr != origUsedShmemSegAddr)
916 elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)",
917 hdr, origUsedShmemSegAddr);
918 dsm_set_control_handle(hdr->dsm_control);
919
920 UsedShmemSegAddr = hdr; /* probably redundant */
921}
922
923/*
924 * PGSharedMemoryNoReAttach
925 *
926 * This is called during startup of a postmaster child process when we choose
927 * *not* to re-attach to the existing shared memory segment. We must clean up
928 * to leave things in the appropriate state. This is not used in the non
929 * EXEC_BACKEND case, either.
930 *
931 * The child process startup logic might or might not call PGSharedMemoryDetach
932 * after this; make sure that it will be a no-op if called.
933 *
934 * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
935 * routine. The caller must have already restored them to the postmaster's
936 * values.
937 */
938void
939PGSharedMemoryNoReAttach(void)
940{
941 Assert(UsedShmemSegAddr != NULL);
942 Assert(IsUnderPostmaster);
943
944#ifdef __CYGWIN__
945 /* cygipc (currently) appears to not detach on exec. */
946 PGSharedMemoryDetach();
947#endif
948
949 /* For cleanliness, reset UsedShmemSegAddr to show we're not attached. */
950 UsedShmemSegAddr = NULL;
951 /* And the same for UsedShmemSegID. */
952 UsedShmemSegID = 0;
953}
954
955#endif /* EXEC_BACKEND */
956
957/*
958 * PGSharedMemoryDetach
959 *
960 * Detach from the shared memory segment, if still attached. This is not
961 * intended to be called explicitly by the process that originally created the
962 * segment (it will have on_shmem_exit callback(s) registered to do that).
963 * Rather, this is for subprocesses that have inherited an attachment and want
964 * to get rid of it.
965 *
966 * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
967 * routine, also AnonymousShmem and AnonymousShmemSize.
968 */
969void
970PGSharedMemoryDetach(void)
971{
972 if (UsedShmemSegAddr != NULL)
973 {
974 if ((shmdt(UsedShmemSegAddr) < 0)
975#if defined(EXEC_BACKEND) && defined(__CYGWIN__)
976 /* Work-around for cygipc exec bug */
977 && shmdt(NULL) < 0
978#endif
979 )
980 elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr);
981 UsedShmemSegAddr = NULL;
982 }
983
984 if (AnonymousShmem != NULL)
985 {
986 if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
987 elog(LOG, "munmap(%p, %zu) failed: %m",
988 AnonymousShmem, AnonymousShmemSize);
989 AnonymousShmem = NULL;
990 }
991}