PostgreSQL Source Code git master
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
dsm_impl.c
Go to the documentation of this file.
/*-------------------------------------------------------------------------
 *
 * dsm_impl.c
 *	  manage dynamic shared memory segments
 *
 * This file provides low-level APIs for creating and destroying shared
 * memory segments using several different possible techniques.  We refer
 * to these segments as dynamic because they can be created, altered, and
 * destroyed at any point during the server life cycle.  This is unlike
 * the main shared memory segment, of which there is always exactly one
 * and which is always mapped at a fixed address in every PostgreSQL
 * background process.
 *
 * Because not all systems provide the same primitives in this area, nor
 * do all primitives behave the same way on all systems, we provide
 * several implementations of this facility.  Many systems implement
 * POSIX shared memory (shm_open etc.), which is well-suited to our needs
 * in this area, with the exception that shared memory identifiers live
 * in a flat system-wide namespace, raising the uncomfortable prospect of
 * name collisions with other processes (including other copies of
 * PostgreSQL) running on the same system.  Some systems only support
 * the older System V shared memory interface (shmget etc.) which is
 * also usable; however, the default allocation limits are often quite
 * small, and the namespace is even more restricted.
 *
 * We also provide an mmap-based shared memory implementation.  This may
 * be useful on systems that provide shared memory via a special-purpose
 * filesystem; by opting for this implementation, the user can even
 * control precisely where their shared memory segments are placed.  It
 * can also be used as a fallback for systems where shm_open and shmget
 * are not available or can't be used for some reason.  Of course,
 * mapping a file residing on an actual spinning disk is a fairly poor
 * approximation for shared memory because writeback may hurt performance
 * substantially, but there should be few systems where we must make do
 * with such poor tools.
 *
 * As ever, Windows requires its own implementation.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/storage/ipc/dsm_impl.c
 *
 *-------------------------------------------------------------------------
 */
48
#include "postgres.h"

#include <fcntl.h>
#include <signal.h>
#include <unistd.h>
#ifndef WIN32
#include <sys/mman.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/stat.h>
#endif

#include "common/file_perm.h"
#include "libpq/pqsignal.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "portability/mem.h"
#include "postmaster/postmaster.h"
#include "storage/dsm_impl.h"
#include "storage/fd.h"
#include "utils/guc.h"
#include "utils/memutils.h"
71
/*
 * Forward declarations for the per-implementation entry points.  Each one
 * is compiled only when the corresponding USE_DSM_* symbol is defined.
 */
#ifdef USE_DSM_POSIX
static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
						   void **impl_private, void **mapped_address,
						   Size *mapped_size, int elevel);
static int	dsm_impl_posix_resize(int fd, off_t size);
#endif
#ifdef USE_DSM_SYSV
static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
						  void **impl_private, void **mapped_address,
						  Size *mapped_size, int elevel);
#endif
#ifdef USE_DSM_WINDOWS
static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
							 void **impl_private, void **mapped_address,
							 Size *mapped_size, int elevel);
#endif
#ifdef USE_DSM_MMAP
static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
						  void **impl_private, void **mapped_address,
						  Size *mapped_size, int elevel);
#endif
/* Defined at the bottom of the file but used by every implementation. */
static int	errcode_for_dynamic_shared_memory(void);
94
96#ifdef USE_DSM_POSIX
97 {"posix", DSM_IMPL_POSIX, false},
98#endif
99#ifdef USE_DSM_SYSV
100 {"sysv", DSM_IMPL_SYSV, false},
101#endif
102#ifdef USE_DSM_WINDOWS
103 {"windows", DSM_IMPL_WINDOWS, false},
104#endif
105#ifdef USE_DSM_MMAP
106 {"mmap", DSM_IMPL_MMAP, false},
107#endif
108 {NULL, 0, false}
109};
110
111/* Implementation selector. */
113
114/* Amount of space reserved for DSM segments in the main area. */
116
117/* Size of buffer to be used for zero-filling. */
118#define ZBUFFER_SIZE 8192
119
120#define SEGMENT_NAME_PREFIX "Global/PostgreSQL"
121
122/*------
123 * Perform a low-level shared memory operation in a platform-specific way,
124 * as dictated by the selected implementation. Each implementation is
125 * required to implement the following primitives.
126 *
127 * DSM_OP_CREATE. Create a segment whose size is the request_size and
128 * map it.
129 *
130 * DSM_OP_ATTACH. Map the segment, whose size must be the request_size.
131 *
132 * DSM_OP_DETACH. Unmap the segment.
133 *
134 * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the
135 * segment.
136 *
137 * Arguments:
138 * op: The operation to be performed.
139 * handle: The handle of an existing object, or for DSM_OP_CREATE, the
140 * identifier for the new handle the caller wants created.
141 * request_size: For DSM_OP_CREATE, the requested size. Otherwise, 0.
142 * impl_private: Private, implementation-specific data. Will be a pointer
143 * to NULL for the first operation on a shared memory segment within this
144 * backend; thereafter, it will point to the value to which it was set
145 * on the previous call.
146 * mapped_address: Pointer to start of current mapping; pointer to NULL
147 * if none. Updated with new mapping address.
148 * mapped_size: Pointer to size of current mapping; pointer to 0 if none.
149 * Updated with new mapped size.
150 * elevel: Level at which to log errors.
151 *
152 * Return value: true on success, false on failure. When false is returned,
153 * a message should first be logged at the specified elevel, except in the
154 * case where DSM_OP_CREATE experiences a name collision, which should
155 * silently return false.
156 *-----
157 */
158bool
159dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
160 void **impl_private, void **mapped_address, Size *mapped_size,
161 int elevel)
162{
163 Assert(op == DSM_OP_CREATE || request_size == 0);
164 Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) ||
165 (*mapped_address == NULL && *mapped_size == 0));
166
168 {
169#ifdef USE_DSM_POSIX
170 case DSM_IMPL_POSIX:
171 return dsm_impl_posix(op, handle, request_size, impl_private,
172 mapped_address, mapped_size, elevel);
173#endif
174#ifdef USE_DSM_SYSV
175 case DSM_IMPL_SYSV:
176 return dsm_impl_sysv(op, handle, request_size, impl_private,
177 mapped_address, mapped_size, elevel);
178#endif
179#ifdef USE_DSM_WINDOWS
180 case DSM_IMPL_WINDOWS:
181 return dsm_impl_windows(op, handle, request_size, impl_private,
182 mapped_address, mapped_size, elevel);
183#endif
184#ifdef USE_DSM_MMAP
185 case DSM_IMPL_MMAP:
186 return dsm_impl_mmap(op, handle, request_size, impl_private,
187 mapped_address, mapped_size, elevel);
188#endif
189 default:
190 elog(ERROR, "unexpected dynamic shared memory type: %d",
192 return false;
193 }
194}
195
196#ifdef USE_DSM_POSIX
197/*
198 * Operating system primitives to support POSIX shared memory.
199 *
200 * POSIX shared memory segments are created and attached using shm_open()
201 * and shm_unlink(); other operations, such as sizing or mapping the
202 * segment, are performed as if the shared memory segments were files.
203 *
204 * Indeed, on some platforms, they may be implemented that way. While
205 * POSIX shared memory segments seem intended to exist in a flat namespace,
206 * some operating systems may implement them as files, even going so far
207 * to treat a request for /xyz as a request to create a file by that name
208 * in the root directory. Users of such broken platforms should select
209 * a different shared memory implementation.
210 */
211static bool
212dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
213 void **impl_private, void **mapped_address, Size *mapped_size,
214 int elevel)
215{
216 char name[64];
217 int flags;
218 int fd;
219 char *address;
220
221 snprintf(name, 64, "/PostgreSQL.%u", handle);
222
223 /* Handle teardown cases. */
224 if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
225 {
226 if (*mapped_address != NULL
227 && munmap(*mapped_address, *mapped_size) != 0)
228 {
229 ereport(elevel,
231 errmsg("could not unmap shared memory segment \"%s\": %m",
232 name)));
233 return false;
234 }
235 *mapped_address = NULL;
236 *mapped_size = 0;
237 if (op == DSM_OP_DESTROY && shm_unlink(name) != 0)
238 {
239 ereport(elevel,
241 errmsg("could not remove shared memory segment \"%s\": %m",
242 name)));
243 return false;
244 }
245 return true;
246 }
247
248 /*
249 * Create new segment or open an existing one for attach.
250 *
251 * Even though we will close the FD before returning, it seems desirable
252 * to use Reserve/ReleaseExternalFD, to reduce the probability of EMFILE
253 * failure. The fact that we won't hold the FD open long justifies using
254 * ReserveExternalFD rather than AcquireExternalFD, though.
255 */
257
258 flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
259 if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
260 {
262 if (op == DSM_OP_ATTACH || errno != EEXIST)
263 ereport(elevel,
265 errmsg("could not open shared memory segment \"%s\": %m",
266 name)));
267 return false;
268 }
269
270 /*
271 * If we're attaching the segment, determine the current size; if we are
272 * creating the segment, set the size to the requested value.
273 */
274 if (op == DSM_OP_ATTACH)
275 {
276 struct stat st;
277
278 if (fstat(fd, &st) != 0)
279 {
280 int save_errno;
281
282 /* Back out what's already been done. */
283 save_errno = errno;
284 close(fd);
286 errno = save_errno;
287
288 ereport(elevel,
290 errmsg("could not stat shared memory segment \"%s\": %m",
291 name)));
292 return false;
293 }
294 request_size = st.st_size;
295 }
296 else if (dsm_impl_posix_resize(fd, request_size) != 0)
297 {
298 int save_errno;
299
300 /* Back out what's already been done. */
301 save_errno = errno;
302 close(fd);
304 shm_unlink(name);
305 errno = save_errno;
306
307 ereport(elevel,
309 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
310 name, request_size)));
311 return false;
312 }
313
314 /* Map it. */
315 address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
316 MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
317 if (address == MAP_FAILED)
318 {
319 int save_errno;
320
321 /* Back out what's already been done. */
322 save_errno = errno;
323 close(fd);
325 if (op == DSM_OP_CREATE)
326 shm_unlink(name);
327 errno = save_errno;
328
329 ereport(elevel,
331 errmsg("could not map shared memory segment \"%s\": %m",
332 name)));
333 return false;
334 }
335 *mapped_address = address;
336 *mapped_size = request_size;
337 close(fd);
339
340 return true;
341}
342
343/*
344 * Set the size of a virtual memory region associated with a file descriptor.
345 * If necessary, also ensure that virtual memory is actually allocated by the
346 * operating system, to avoid nasty surprises later.
347 *
348 * Returns non-zero if either truncation or allocation fails, and sets errno.
349 */
350static int
351dsm_impl_posix_resize(int fd, off_t size)
352{
353 int rc;
354 int save_errno;
355 sigset_t save_sigmask;
356
357 /*
358 * Block all blockable signals, except SIGQUIT. posix_fallocate() can run
359 * for quite a long time, and is an all-or-nothing operation. If we
360 * allowed SIGUSR1 to interrupt us repeatedly (for example, due to
361 * recovery conflicts), the retry loop might never succeed.
362 */
364 sigprocmask(SIG_SETMASK, &BlockSig, &save_sigmask);
365
366 pgstat_report_wait_start(WAIT_EVENT_DSM_ALLOCATE);
367#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)
368
369 /*
370 * On Linux, a shm_open fd is backed by a tmpfs file. If we were to use
371 * ftruncate, the file would contain a hole. Accessing memory backed by a
372 * hole causes tmpfs to allocate pages, which fails with SIGBUS if there
373 * is no more tmpfs space available. So we ask tmpfs to allocate pages
374 * here, so we can fail gracefully with ENOSPC now rather than risking
375 * SIGBUS later.
376 *
377 * We still use a traditional EINTR retry loop to handle SIGCONT.
378 * posix_fallocate() doesn't restart automatically, and we don't want this
379 * to fail if you attach a debugger.
380 */
381 do
382 {
383 rc = posix_fallocate(fd, 0, size);
384 } while (rc == EINTR);
385
386 /*
387 * The caller expects errno to be set, but posix_fallocate() doesn't set
388 * it. Instead it returns error numbers directly. So set errno, even
389 * though we'll also return rc to indicate success or failure.
390 */
391 errno = rc;
392#else
393 /* Extend the file to the requested size. */
394 do
395 {
396 rc = ftruncate(fd, size);
397 } while (rc < 0 && errno == EINTR);
398#endif
400
402 {
403 save_errno = errno;
404 sigprocmask(SIG_SETMASK, &save_sigmask, NULL);
405 errno = save_errno;
406 }
407
408 return rc;
409}
410
411#endif /* USE_DSM_POSIX */
412
413#ifdef USE_DSM_SYSV
414/*
415 * Operating system primitives to support System V shared memory.
416 *
417 * System V shared memory segments are manipulated using shmget(), shmat(),
418 * shmdt(), and shmctl(). As the default allocation limits for System V
419 * shared memory are usually quite low, the POSIX facilities may be
420 * preferable; but those are not supported everywhere.
421 */
422static bool
423dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
424 void **impl_private, void **mapped_address, Size *mapped_size,
425 int elevel)
426{
427 key_t key;
428 int ident;
429 char *address;
430 char name[64];
431 int *ident_cache;
432
433 /*
434 * POSIX shared memory and mmap-based shared memory identify segments with
435 * names. To avoid needless error message variation, we use the handle as
436 * the name.
437 */
438 snprintf(name, 64, "%u", handle);
439
440 /*
441 * The System V shared memory namespace is very restricted; names are of
442 * type key_t, which is expected to be some sort of integer data type, but
443 * not necessarily the same one as dsm_handle. Since we use dsm_handle to
444 * identify shared memory segments across processes, this might seem like
445 * a problem, but it's really not. If dsm_handle is bigger than key_t,
446 * the cast below might truncate away some bits from the handle the
447 * user-provided, but it'll truncate exactly the same bits away in exactly
448 * the same fashion every time we use that handle, which is all that
449 * really matters. Conversely, if dsm_handle is smaller than key_t, we
450 * won't use the full range of available key space, but that's no big deal
451 * either.
452 *
453 * We do make sure that the key isn't negative, because that might not be
454 * portable.
455 */
456 key = (key_t) handle;
457 if (key < 1) /* avoid compiler warning if type is unsigned */
458 key = -key;
459
460 /*
461 * There's one special key, IPC_PRIVATE, which can't be used. If we end
462 * up with that value by chance during a create operation, just pretend it
463 * already exists, so that caller will retry. If we run into it anywhere
464 * else, the caller has passed a handle that doesn't correspond to
465 * anything we ever created, which should not happen.
466 */
467 if (key == IPC_PRIVATE)
468 {
469 if (op != DSM_OP_CREATE)
470 elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
471 errno = EEXIST;
472 return false;
473 }
474
475 /*
476 * Before we can do anything with a shared memory segment, we have to map
477 * the shared memory key to a shared memory identifier using shmget(). To
478 * avoid repeated lookups, we store the key using impl_private.
479 */
480 if (*impl_private != NULL)
481 {
482 ident_cache = *impl_private;
483 ident = *ident_cache;
484 }
485 else
486 {
487 int flags = IPCProtection;
488 size_t segsize;
489
490 /*
491 * Allocate the memory BEFORE acquiring the resource, so that we don't
492 * leak the resource if memory allocation fails.
493 */
494 ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));
495
496 /*
497 * When using shmget to find an existing segment, we must pass the
498 * size as 0. Passing a non-zero size which is greater than the
499 * actual size will result in EINVAL.
500 */
501 segsize = 0;
502
503 if (op == DSM_OP_CREATE)
504 {
505 flags |= IPC_CREAT | IPC_EXCL;
506 segsize = request_size;
507 }
508
509 if ((ident = shmget(key, segsize, flags)) == -1)
510 {
511 if (op == DSM_OP_ATTACH || errno != EEXIST)
512 {
513 int save_errno = errno;
514
515 pfree(ident_cache);
516 errno = save_errno;
517 ereport(elevel,
519 errmsg("could not get shared memory segment: %m")));
520 }
521 return false;
522 }
523
524 *ident_cache = ident;
525 *impl_private = ident_cache;
526 }
527
528 /* Handle teardown cases. */
529 if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
530 {
531 pfree(ident_cache);
532 *impl_private = NULL;
533 if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
534 {
535 ereport(elevel,
537 errmsg("could not unmap shared memory segment \"%s\": %m",
538 name)));
539 return false;
540 }
541 *mapped_address = NULL;
542 *mapped_size = 0;
543 if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
544 {
545 ereport(elevel,
547 errmsg("could not remove shared memory segment \"%s\": %m",
548 name)));
549 return false;
550 }
551 return true;
552 }
553
554 /* If we're attaching it, we must use IPC_STAT to determine the size. */
555 if (op == DSM_OP_ATTACH)
556 {
557 struct shmid_ds shm;
558
559 if (shmctl(ident, IPC_STAT, &shm) != 0)
560 {
561 ereport(elevel,
563 errmsg("could not stat shared memory segment \"%s\": %m",
564 name)));
565 return false;
566 }
567 request_size = shm.shm_segsz;
568 }
569
570 /* Map it. */
571 address = shmat(ident, NULL, PG_SHMAT_FLAGS);
572 if (address == (void *) -1)
573 {
574 int save_errno;
575
576 /* Back out what's already been done. */
577 save_errno = errno;
578 if (op == DSM_OP_CREATE)
579 shmctl(ident, IPC_RMID, NULL);
580 errno = save_errno;
581
582 ereport(elevel,
584 errmsg("could not map shared memory segment \"%s\": %m",
585 name)));
586 return false;
587 }
588 *mapped_address = address;
589 *mapped_size = request_size;
590
591 return true;
592}
593#endif
594
595#ifdef USE_DSM_WINDOWS
596/*
597 * Operating system primitives to support Windows shared memory.
598 *
599 * Windows shared memory implementation is done using file mapping
600 * which can be backed by either physical file or system paging file.
601 * Current implementation uses system paging file as other effects
602 * like performance are not clear for physical file and it is used in similar
603 * way for main shared memory in windows.
604 *
605 * A memory mapping object is a kernel object - they always get deleted when
606 * the last reference to them goes away, either explicitly via a CloseHandle or
607 * when the process containing the reference exits.
608 */
609static bool
610dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
611 void **impl_private, void **mapped_address,
612 Size *mapped_size, int elevel)
613{
614 char *address;
615 HANDLE hmap;
616 char name[64];
617 MEMORY_BASIC_INFORMATION info;
618
619 /*
620 * Storing the shared memory segment in the Global\ namespace, can allow
621 * any process running in any session to access that file mapping object
622 * provided that the caller has the required access rights. But to avoid
623 * issues faced in main shared memory, we are using the naming convention
624 * similar to main shared memory. We can change here once issue mentioned
625 * in GetSharedMemName is resolved.
626 */
627 snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
628
629 /*
630 * Handle teardown cases. Since Windows automatically destroys the object
631 * when no references remain, we can treat it the same as detach.
632 */
633 if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
634 {
635 if (*mapped_address != NULL
636 && UnmapViewOfFile(*mapped_address) == 0)
637 {
638 _dosmaperr(GetLastError());
639 ereport(elevel,
641 errmsg("could not unmap shared memory segment \"%s\": %m",
642 name)));
643 return false;
644 }
645 if (*impl_private != NULL
646 && CloseHandle(*impl_private) == 0)
647 {
648 _dosmaperr(GetLastError());
649 ereport(elevel,
651 errmsg("could not remove shared memory segment \"%s\": %m",
652 name)));
653 return false;
654 }
655
656 *impl_private = NULL;
657 *mapped_address = NULL;
658 *mapped_size = 0;
659 return true;
660 }
661
662 /* Create new segment or open an existing one for attach. */
663 if (op == DSM_OP_CREATE)
664 {
665 DWORD size_high;
666 DWORD size_low;
667 DWORD errcode;
668
669 /* Shifts >= the width of the type are undefined. */
670#ifdef _WIN64
671 size_high = request_size >> 32;
672#else
673 size_high = 0;
674#endif
675 size_low = (DWORD) request_size;
676
677 /* CreateFileMapping might not clear the error code on success */
678 SetLastError(0);
679
680 hmap = CreateFileMapping(INVALID_HANDLE_VALUE, /* Use the pagefile */
681 NULL, /* Default security attrs */
682 PAGE_READWRITE, /* Memory is read/write */
683 size_high, /* Upper 32 bits of size */
684 size_low, /* Lower 32 bits of size */
685 name);
686
687 errcode = GetLastError();
688 if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
689 {
690 /*
691 * On Windows, when the segment already exists, a handle for the
692 * existing segment is returned. We must close it before
693 * returning. However, if the existing segment is created by a
694 * service, then it returns ERROR_ACCESS_DENIED. We don't do
695 * _dosmaperr here, so errno won't be modified.
696 */
697 if (hmap)
698 CloseHandle(hmap);
699 return false;
700 }
701
702 if (!hmap)
703 {
705 ereport(elevel,
707 errmsg("could not create shared memory segment \"%s\": %m",
708 name)));
709 return false;
710 }
711 }
712 else
713 {
714 hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
715 FALSE, /* do not inherit the name */
716 name); /* name of mapping object */
717 if (!hmap)
718 {
719 _dosmaperr(GetLastError());
720 ereport(elevel,
722 errmsg("could not open shared memory segment \"%s\": %m",
723 name)));
724 return false;
725 }
726 }
727
728 /* Map it. */
729 address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
730 0, 0, 0);
731 if (!address)
732 {
733 int save_errno;
734
735 _dosmaperr(GetLastError());
736 /* Back out what's already been done. */
737 save_errno = errno;
738 CloseHandle(hmap);
739 errno = save_errno;
740
741 ereport(elevel,
743 errmsg("could not map shared memory segment \"%s\": %m",
744 name)));
745 return false;
746 }
747
748 /*
749 * VirtualQuery gives size in page_size units, which is 4K for Windows. We
750 * need size only when we are attaching, but it's better to get the size
751 * when creating new segment to keep size consistent both for
752 * DSM_OP_CREATE and DSM_OP_ATTACH.
753 */
754 if (VirtualQuery(address, &info, sizeof(info)) == 0)
755 {
756 int save_errno;
757
758 _dosmaperr(GetLastError());
759 /* Back out what's already been done. */
760 save_errno = errno;
761 UnmapViewOfFile(address);
762 CloseHandle(hmap);
763 errno = save_errno;
764
765 ereport(elevel,
767 errmsg("could not stat shared memory segment \"%s\": %m",
768 name)));
769 return false;
770 }
771
772 *mapped_address = address;
773 *mapped_size = info.RegionSize;
774 *impl_private = hmap;
775
776 return true;
777}
778#endif
779
780#ifdef USE_DSM_MMAP
781/*
782 * Operating system primitives to support mmap-based shared memory.
783 *
784 * Calling this "shared memory" is somewhat of a misnomer, because what
785 * we're really doing is creating a bunch of files and mapping them into
786 * our address space. The operating system may feel obliged to
787 * synchronize the contents to disk even if nothing is being paged out,
788 * which will not serve us well. The user can relocate the pg_dynshmem
789 * directory to a ramdisk to avoid this problem, if available.
790 */
791static bool
792dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
793 void **impl_private, void **mapped_address, Size *mapped_size,
794 int elevel)
795{
796 char name[64];
797 int flags;
798 int fd;
799 char *address;
800
802 handle);
803
804 /* Handle teardown cases. */
805 if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
806 {
807 if (*mapped_address != NULL
808 && munmap(*mapped_address, *mapped_size) != 0)
809 {
810 ereport(elevel,
812 errmsg("could not unmap shared memory segment \"%s\": %m",
813 name)));
814 return false;
815 }
816 *mapped_address = NULL;
817 *mapped_size = 0;
818 if (op == DSM_OP_DESTROY && unlink(name) != 0)
819 {
820 ereport(elevel,
822 errmsg("could not remove shared memory segment \"%s\": %m",
823 name)));
824 return false;
825 }
826 return true;
827 }
828
829 /* Create new segment or open an existing one for attach. */
830 flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
831 if ((fd = OpenTransientFile(name, flags)) == -1)
832 {
833 if (op == DSM_OP_ATTACH || errno != EEXIST)
834 ereport(elevel,
836 errmsg("could not open shared memory segment \"%s\": %m",
837 name)));
838 return false;
839 }
840
841 /*
842 * If we're attaching the segment, determine the current size; if we are
843 * creating the segment, set the size to the requested value.
844 */
845 if (op == DSM_OP_ATTACH)
846 {
847 struct stat st;
848
849 if (fstat(fd, &st) != 0)
850 {
851 int save_errno;
852
853 /* Back out what's already been done. */
854 save_errno = errno;
856 errno = save_errno;
857
858 ereport(elevel,
860 errmsg("could not stat shared memory segment \"%s\": %m",
861 name)));
862 return false;
863 }
864 request_size = st.st_size;
865 }
866 else
867 {
868 /*
869 * Allocate a buffer full of zeros.
870 *
871 * Note: palloc zbuffer, instead of just using a local char array, to
872 * ensure it is reasonably well-aligned; this may save a few cycles
873 * transferring data to the kernel.
874 */
875 char *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
876 Size remaining = request_size;
877 bool success = true;
878
879 /*
880 * Zero-fill the file. We have to do this the hard way to ensure that
881 * all the file space has really been allocated, so that we don't
882 * later seg fault when accessing the memory mapping. This is pretty
883 * pessimal.
884 */
885 while (success && remaining > 0)
886 {
887 Size goal = remaining;
888
889 if (goal > ZBUFFER_SIZE)
890 goal = ZBUFFER_SIZE;
891 pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
892 if (write(fd, zbuffer, goal) == goal)
893 remaining -= goal;
894 else
895 success = false;
897 }
898
899 if (!success)
900 {
901 int save_errno;
902
903 /* Back out what's already been done. */
904 save_errno = errno;
906 unlink(name);
907 errno = save_errno ? save_errno : ENOSPC;
908
909 ereport(elevel,
911 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
912 name, request_size)));
913 return false;
914 }
915 }
916
917 /* Map it. */
918 address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
919 MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
920 if (address == MAP_FAILED)
921 {
922 int save_errno;
923
924 /* Back out what's already been done. */
925 save_errno = errno;
927 if (op == DSM_OP_CREATE)
928 unlink(name);
929 errno = save_errno;
930
931 ereport(elevel,
933 errmsg("could not map shared memory segment \"%s\": %m",
934 name)));
935 return false;
936 }
937 *mapped_address = address;
938 *mapped_size = request_size;
939
940 if (CloseTransientFile(fd) != 0)
941 {
942 ereport(elevel,
944 errmsg("could not close shared memory segment \"%s\": %m",
945 name)));
946 return false;
947 }
948
949 return true;
950}
951#endif
952
953/*
954 * Implementation-specific actions that must be performed when a segment is to
955 * be preserved even when no backend has it attached.
956 *
957 * Except on Windows, we don't need to do anything at all. But since Windows
958 * cleans up segments automatically when no references remain, we duplicate
959 * the segment handle into the postmaster process. The postmaster needn't
960 * do anything to receive the handle; Windows transfers it automatically.
961 */
962void
963dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
964 void **impl_private_pm_handle)
965{
967 {
968#ifdef USE_DSM_WINDOWS
969 case DSM_IMPL_WINDOWS:
971 {
972 HANDLE hmap;
973
974 if (!DuplicateHandle(GetCurrentProcess(), impl_private,
975 PostmasterHandle, &hmap, 0, FALSE,
976 DUPLICATE_SAME_ACCESS))
977 {
978 char name[64];
979
980 snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
981 _dosmaperr(GetLastError());
984 errmsg("could not duplicate handle for \"%s\": %m",
985 name)));
986 }
987
988 /*
989 * Here, we remember the handle that we created in the
990 * postmaster process. This handle isn't actually usable in
991 * any process other than the postmaster, but that doesn't
992 * matter. We're just holding onto it so that, if the segment
993 * is unpinned, dsm_impl_unpin_segment can close it.
994 */
995 *impl_private_pm_handle = hmap;
996 }
997 break;
998#endif
999 default:
1000 break;
1001 }
1002}
1003
1004/*
1005 * Implementation-specific actions that must be performed when a segment is no
1006 * longer to be preserved, so that it will be cleaned up when all backends
1007 * have detached from it.
1008 *
1009 * Except on Windows, we don't need to do anything at all. For Windows, we
1010 * close the extra handle that dsm_impl_pin_segment created in the
1011 * postmaster's process space.
1012 */
1013void
1014dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
1015{
1017 {
1018#ifdef USE_DSM_WINDOWS
1019 case DSM_IMPL_WINDOWS:
1021 {
1022 if (*impl_private &&
1023 !DuplicateHandle(PostmasterHandle, *impl_private,
1024 NULL, NULL, 0, FALSE,
1025 DUPLICATE_CLOSE_SOURCE))
1026 {
1027 char name[64];
1028
1029 snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
1030 _dosmaperr(GetLastError());
1031 ereport(ERROR,
1033 errmsg("could not duplicate handle for \"%s\": %m",
1034 name)));
1035 }
1036
1037 *impl_private = NULL;
1038 }
1039 break;
1040#endif
1041 default:
1042 break;
1043 }
1044}
1045
1046static int
1048{
1049 if (errno == EFBIG || errno == ENOMEM)
1050 return errcode(ERRCODE_OUT_OF_MEMORY);
1051 else
1052 return errcode_for_file_access();
1053}
sigset_t BlockSig
Definition: pqsignal.c:23
size_t Size
Definition: c.h:576
void dsm_impl_pin_segment(dsm_handle handle, void *impl_private, void **impl_private_pm_handle)
Definition: dsm_impl.c:963
int min_dynamic_shared_memory
Definition: dsm_impl.c:115
static int errcode_for_dynamic_shared_memory(void)
Definition: dsm_impl.c:1047
#define SEGMENT_NAME_PREFIX
Definition: dsm_impl.c:120
void dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
Definition: dsm_impl.c:1014
static int dsm_impl_posix_resize(int fd, off_t size)
Definition: dsm_impl.c:351
bool dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
Definition: dsm_impl.c:159
int dynamic_shared_memory_type
Definition: dsm_impl.c:112
static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
Definition: dsm_impl.c:423
const struct config_enum_entry dynamic_shared_memory_options[]
Definition: dsm_impl.c:95
static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
Definition: dsm_impl.c:212
#define ZBUFFER_SIZE
Definition: dsm_impl.c:118
static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
Definition: dsm_impl.c:792
uint32 dsm_handle
Definition: dsm_impl.h:55
dsm_op
Definition: dsm_impl.h:62
@ DSM_OP_DETACH
Definition: dsm_impl.h:65
@ DSM_OP_CREATE
Definition: dsm_impl.h:63
@ DSM_OP_DESTROY
Definition: dsm_impl.h:66
@ DSM_OP_ATTACH
Definition: dsm_impl.h:64
#define DSM_IMPL_WINDOWS
Definition: dsm_impl.h:19
#define DSM_IMPL_POSIX
Definition: dsm_impl.h:17
#define DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE
Definition: dsm_impl.h:36
#define PG_DYNSHMEM_MMAP_FILE_PREFIX
Definition: dsm_impl.h:52
#define PG_DYNSHMEM_DIR
Definition: dsm_impl.h:51
#define DSM_IMPL_SYSV
Definition: dsm_impl.h:18
#define DSM_IMPL_MMAP
Definition: dsm_impl.h:20
int errcode_for_file_access(void)
Definition: elog.c:877
int errcode(int sqlerrcode)
Definition: elog.c:854
int errmsg(const char *fmt,...)
Definition: elog.c:1071
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:149
#define DEBUG4
Definition: elog.h:27
int CloseTransientFile(int fd)
Definition: fd.c:2871
void ReleaseExternalFD(void)
Definition: fd.c:1241
void ReserveExternalFD(void)
Definition: fd.c:1223
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2694
#define PG_FILE_MODE_OWNER
Definition: file_perm.h:38
bool IsUnderPostmaster
Definition: globals.c:121
Assert(PointerIsAligned(start, uint64))
#define ident
Definition: indent_codes.h:47
int remaining
Definition: informix.c:692
static bool success
Definition: initdb.c:187
#define close(a)
Definition: win32.h:12
#define write(a, b, c)
Definition: win32.h:14
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1260
void pfree(void *pointer)
Definition: mcxt.c:2150
void * palloc0(Size size)
Definition: mcxt.c:1973
MemoryContext TopMemoryContext
Definition: mcxt.c:165
#define PG_SHMAT_FLAGS
Definition: mem.h:20
#define MAP_FAILED
Definition: mem.h:45
#define MAP_HASSEMAPHORE
Definition: mem.h:30
#define MAP_NOSYNC
Definition: mem.h:38
#define snprintf
Definition: port.h:239
#define IPCProtection
Definition: posix_sema.c:59
static int fd(const char *x, int i)
Definition: preproc-init.c:105
Definition: guc.h:174
__int64 st_size
Definition: win32_port.h:263
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:85
static void pgstat_report_wait_end(void)
Definition: wait_event.h:101
const char * name
#define IPC_STAT
Definition: win32_port.h:98
#define EINTR
Definition: win32_port.h:364
#define IPC_RMID
Definition: win32_port.h:93
void _dosmaperr(unsigned long)
Definition: win32error.c:177
long key_t
Definition: win32_port.h:237
#define fstat
Definition: win32_port.h:273
#define IPC_EXCL
Definition: win32_port.h:95
#define IPC_CREAT
Definition: win32_port.h:94
#define IPC_PRIVATE
Definition: win32_port.h:96