PostgreSQL Source Code  git master
dsm_impl.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * dsm_impl.c
4  * manage dynamic shared memory segments
5  *
6  * This file provides low-level APIs for creating and destroying shared
7  * memory segments using several different possible techniques. We refer
8  * to these segments as dynamic because they can be created, altered, and
9  * destroyed at any point during the server life cycle. This is unlike
10  * the main shared memory segment, of which there is always exactly one
11  * and which is always mapped at a fixed address in every PostgreSQL
12  * background process.
13  *
14  * Because not all systems provide the same primitives in this area, nor
15  * do all primitives behave the same way on all systems, we provide
16  * several implementations of this facility. Many systems implement
17  * POSIX shared memory (shm_open etc.), which is well-suited to our needs
18  * in this area, with the exception that shared memory identifiers live
19  * in a flat system-wide namespace, raising the uncomfortable prospect of
20  * name collisions with other processes (including other copies of
21  * PostgreSQL) running on the same system. Some systems only support
22  * the older System V shared memory interface (shmget etc.) which is
23  * also usable; however, the default allocation limits are often quite
24  * small, and the namespace is even more restricted.
25  *
26  * We also provide an mmap-based shared memory implementation. This may
27  * be useful on systems that provide shared memory via a special-purpose
28  * filesystem; by opting for this implementation, the user can even
29  * control precisely where their shared memory segments are placed. It
30  * can also be used as a fallback for systems where shm_open and shmget
31  * are not available or can't be used for some reason. Of course,
32  * mapping a file residing on an actual spinning disk is a fairly poor
33  * approximation for shared memory because writeback may hurt performance
34  * substantially, but there should be few systems where we must make do
35  * with such poor tools.
36  *
37  * As ever, Windows requires its own implementation.
38  *
39  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
40  * Portions Copyright (c) 1994, Regents of the University of California
41  *
42  *
43  * IDENTIFICATION
44  * src/backend/storage/ipc/dsm_impl.c
45  *
46  *-------------------------------------------------------------------------
47  */
48 
49 #include "postgres.h"
50 
51 #include <fcntl.h>
52 #include <signal.h>
53 #include <unistd.h>
54 #ifndef WIN32
55 #include <sys/mman.h>
56 #include <sys/ipc.h>
57 #include <sys/shm.h>
58 #include <sys/stat.h>
59 #endif
60 
61 #include "common/file_perm.h"
62 #include "libpq/pqsignal.h"
63 #include "miscadmin.h"
64 #include "pgstat.h"
65 #include "portability/mem.h"
66 #include "postmaster/postmaster.h"
67 #include "storage/dsm_impl.h"
68 #include "storage/fd.h"
69 #include "utils/guc.h"
70 #include "utils/memutils.h"
71 
/*
 * Forward declarations for the per-implementation entry points.  Each one
 * is compiled only when the matching USE_DSM_* symbol is defined, and all
 * of them share the argument list of dsm_impl_op().
 */
#ifdef USE_DSM_POSIX
static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
						   void **impl_private, void **mapped_address,
						   Size *mapped_size, int elevel);
static int	dsm_impl_posix_resize(int fd, off_t size);
#endif
#ifdef USE_DSM_SYSV
static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
						  void **impl_private, void **mapped_address,
						  Size *mapped_size, int elevel);
#endif
#ifdef USE_DSM_WINDOWS
static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
							 void **impl_private, void **mapped_address,
							 Size *mapped_size, int elevel);
#endif
#ifdef USE_DSM_MMAP
static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
						  void **impl_private, void **mapped_address,
						  Size *mapped_size, int elevel);
#endif
/* Picks the SQLSTATE for a failed operation; defined at the end of the file. */
static int	errcode_for_dynamic_shared_memory(void);
94 
96 #ifdef USE_DSM_POSIX
97  {"posix", DSM_IMPL_POSIX, false},
98 #endif
99 #ifdef USE_DSM_SYSV
100  {"sysv", DSM_IMPL_SYSV, false},
101 #endif
102 #ifdef USE_DSM_WINDOWS
103  {"windows", DSM_IMPL_WINDOWS, false},
104 #endif
105 #ifdef USE_DSM_MMAP
106  {"mmap", DSM_IMPL_MMAP, false},
107 #endif
108  {NULL, 0, false}
109 };
110 
111 /* Implementation selector. */
113 
114 /* Amount of space reserved for DSM segments in the main area. */
116 
117 /* Size of buffer to be used for zero-filling. */
118 #define ZBUFFER_SIZE 8192
119 
120 #define SEGMENT_NAME_PREFIX "Global/PostgreSQL"
121 
122 /*------
123  * Perform a low-level shared memory operation in a platform-specific way,
124  * as dictated by the selected implementation. Each implementation is
125  * required to implement the following primitives.
126  *
127  * DSM_OP_CREATE. Create a segment whose size is the request_size and
128  * map it.
129  *
130  * DSM_OP_ATTACH. Map the segment, whose size must be the request_size.
131  *
132  * DSM_OP_DETACH. Unmap the segment.
133  *
134  * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the
135  * segment.
136  *
137  * Arguments:
138  * op: The operation to be performed.
139  * handle: The handle of an existing object, or for DSM_OP_CREATE, the
140  * identifier for the new handle the caller wants created.
141  * request_size: For DSM_OP_CREATE, the requested size. Otherwise, 0.
142  * impl_private: Private, implementation-specific data. Will be a pointer
143  * to NULL for the first operation on a shared memory segment within this
144  * backend; thereafter, it will point to the value to which it was set
145  * on the previous call.
146  * mapped_address: Pointer to start of current mapping; pointer to NULL
147  * if none. Updated with new mapping address.
148  * mapped_size: Pointer to size of current mapping; pointer to 0 if none.
149  * Updated with new mapped size.
150  * elevel: Level at which to log errors.
151  *
152  * Return value: true on success, false on failure. When false is returned,
153  * a message should first be logged at the specified elevel, except in the
154  * case where DSM_OP_CREATE experiences a name collision, which should
155  * silently return false.
156  *-----
157  */
158 bool
159 dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
160  void **impl_private, void **mapped_address, Size *mapped_size,
161  int elevel)
162 {
163  Assert(op == DSM_OP_CREATE || request_size == 0);
164  Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) ||
165  (*mapped_address == NULL && *mapped_size == 0));
166 
168  {
169 #ifdef USE_DSM_POSIX
170  case DSM_IMPL_POSIX:
171  return dsm_impl_posix(op, handle, request_size, impl_private,
172  mapped_address, mapped_size, elevel);
173 #endif
174 #ifdef USE_DSM_SYSV
175  case DSM_IMPL_SYSV:
176  return dsm_impl_sysv(op, handle, request_size, impl_private,
177  mapped_address, mapped_size, elevel);
178 #endif
179 #ifdef USE_DSM_WINDOWS
180  case DSM_IMPL_WINDOWS:
181  return dsm_impl_windows(op, handle, request_size, impl_private,
182  mapped_address, mapped_size, elevel);
183 #endif
184 #ifdef USE_DSM_MMAP
185  case DSM_IMPL_MMAP:
186  return dsm_impl_mmap(op, handle, request_size, impl_private,
187  mapped_address, mapped_size, elevel);
188 #endif
189  default:
190  elog(ERROR, "unexpected dynamic shared memory type: %d",
192  return false;
193  }
194 }
195 
196 #ifdef USE_DSM_POSIX
197 /*
198  * Operating system primitives to support POSIX shared memory.
199  *
200  * POSIX shared memory segments are created and attached using shm_open()
201  * and shm_unlink(); other operations, such as sizing or mapping the
202  * segment, are performed as if the shared memory segments were files.
203  *
204  * Indeed, on some platforms, they may be implemented that way. While
205  * POSIX shared memory segments seem intended to exist in a flat namespace,
206  * some operating systems may implement them as files, even going so far
207  * to treat a request for /xyz as a request to create a file by that name
208  * in the root directory. Users of such broken platforms should select
209  * a different shared memory implementation.
210  */
211 static bool
212 dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
213  void **impl_private, void **mapped_address, Size *mapped_size,
214  int elevel)
215 {
216  char name[64];
217  int flags;
218  int fd;
219  char *address;
220 
221  snprintf(name, 64, "/PostgreSQL.%u", handle);
222 
223  /* Handle teardown cases. */
224  if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
225  {
226  if (*mapped_address != NULL
227  && munmap(*mapped_address, *mapped_size) != 0)
228  {
229  ereport(elevel,
231  errmsg("could not unmap shared memory segment \"%s\": %m",
232  name)));
233  return false;
234  }
235  *mapped_address = NULL;
236  *mapped_size = 0;
237  if (op == DSM_OP_DESTROY && shm_unlink(name) != 0)
238  {
239  ereport(elevel,
241  errmsg("could not remove shared memory segment \"%s\": %m",
242  name)));
243  return false;
244  }
245  return true;
246  }
247 
248  /*
249  * Create new segment or open an existing one for attach.
250  *
251  * Even though we will close the FD before returning, it seems desirable
252  * to use Reserve/ReleaseExternalFD, to reduce the probability of EMFILE
253  * failure. The fact that we won't hold the FD open long justifies using
254  * ReserveExternalFD rather than AcquireExternalFD, though.
255  */
257 
258  flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
259  if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
260  {
262  if (op == DSM_OP_ATTACH || errno != EEXIST)
263  ereport(elevel,
265  errmsg("could not open shared memory segment \"%s\": %m",
266  name)));
267  return false;
268  }
269 
270  /*
271  * If we're attaching the segment, determine the current size; if we are
272  * creating the segment, set the size to the requested value.
273  */
274  if (op == DSM_OP_ATTACH)
275  {
276  struct stat st;
277 
278  if (fstat(fd, &st) != 0)
279  {
280  int save_errno;
281 
282  /* Back out what's already been done. */
283  save_errno = errno;
284  close(fd);
286  errno = save_errno;
287 
288  ereport(elevel,
290  errmsg("could not stat shared memory segment \"%s\": %m",
291  name)));
292  return false;
293  }
294  request_size = st.st_size;
295  }
296  else if (dsm_impl_posix_resize(fd, request_size) != 0)
297  {
298  int save_errno;
299 
300  /* Back out what's already been done. */
301  save_errno = errno;
302  close(fd);
304  shm_unlink(name);
305  errno = save_errno;
306 
307  ereport(elevel,
309  errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
310  name, request_size)));
311  return false;
312  }
313 
314  /* Map it. */
315  address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
316  MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
317  if (address == MAP_FAILED)
318  {
319  int save_errno;
320 
321  /* Back out what's already been done. */
322  save_errno = errno;
323  close(fd);
325  if (op == DSM_OP_CREATE)
326  shm_unlink(name);
327  errno = save_errno;
328 
329  ereport(elevel,
331  errmsg("could not map shared memory segment \"%s\": %m",
332  name)));
333  return false;
334  }
335  *mapped_address = address;
336  *mapped_size = request_size;
337  close(fd);
339 
340  return true;
341 }
342 
343 /*
344  * Set the size of a virtual memory region associated with a file descriptor.
345  * If necessary, also ensure that virtual memory is actually allocated by the
346  * operating system, to avoid nasty surprises later.
347  *
348  * Returns non-zero if either truncation or allocation fails, and sets errno.
349  */
350 static int
352 {
353  int rc;
354  int save_errno;
355  sigset_t save_sigmask;
356 
357  /*
358  * Block all blockable signals, except SIGQUIT. posix_fallocate() can run
359  * for quite a long time, and is an all-or-nothing operation. If we
360  * allowed SIGUSR1 to interrupt us repeatedly (for example, due to
361  * recovery conflicts), the retry loop might never succeed.
362  */
363  if (IsUnderPostmaster)
364  sigprocmask(SIG_SETMASK, &BlockSig, &save_sigmask);
365 
366  pgstat_report_wait_start(WAIT_EVENT_DSM_ALLOCATE);
367 #if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)
368 
369  /*
370  * On Linux, a shm_open fd is backed by a tmpfs file. If we were to use
371  * ftruncate, the file would contain a hole. Accessing memory backed by a
372  * hole causes tmpfs to allocate pages, which fails with SIGBUS if there
373  * is no more tmpfs space available. So we ask tmpfs to allocate pages
374  * here, so we can fail gracefully with ENOSPC now rather than risking
375  * SIGBUS later.
376  *
377  * We still use a traditional EINTR retry loop to handle SIGCONT.
378  * posix_fallocate() doesn't restart automatically, and we don't want this
379  * to fail if you attach a debugger.
380  */
381  do
382  {
383  rc = posix_fallocate(fd, 0, size);
384  } while (rc == EINTR);
385 
386  /*
387  * The caller expects errno to be set, but posix_fallocate() doesn't set
388  * it. Instead it returns error numbers directly. So set errno, even
389  * though we'll also return rc to indicate success or failure.
390  */
391  errno = rc;
392 #else
393  /* Extend the file to the requested size. */
394  do
395  {
396  rc = ftruncate(fd, size);
397  } while (rc < 0 && errno == EINTR);
398 #endif
400 
401  if (IsUnderPostmaster)
402  {
403  save_errno = errno;
404  sigprocmask(SIG_SETMASK, &save_sigmask, NULL);
405  errno = save_errno;
406  }
407 
408  return rc;
409 }
410 
411 #endif /* USE_DSM_POSIX */
412 
#ifdef USE_DSM_SYSV
/*
 * Operating system primitives to support System V shared memory.
 *
 * System V shared memory segments are manipulated using shmget(), shmat(),
 * shmdt(), and shmctl().  As the default allocation limits for System V
 * shared memory are usually quite low, the POSIX facilities may be
 * preferable; but those are not supported everywhere.
 *
 * Arguments and return value are as described for dsm_impl_op().  Here
 * *impl_private points to a single int, allocated in TopMemoryContext, that
 * caches the shmget() identifier; it is freed and reset to NULL on
 * detach/destroy.
 */
static bool
dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
			  void **impl_private, void **mapped_address, Size *mapped_size,
			  int elevel)
{
	key_t		key;
	int			ident;
	char	   *address;
	char		name[64];
	int		   *ident_cache;

	/*
	 * POSIX shared memory and mmap-based shared memory identify segments with
	 * names.  To avoid needless error message variation, we use the handle as
	 * the name.
	 */
	snprintf(name, sizeof(name), "%u", handle);

	/*
	 * The System V shared memory namespace is very restricted; names are of
	 * type key_t, which is expected to be some sort of integer data type, but
	 * not necessarily the same one as dsm_handle.  Since we use dsm_handle to
	 * identify shared memory segments across processes, this might seem like
	 * a problem, but it's really not.  If dsm_handle is bigger than key_t,
	 * the cast below might truncate away some bits from the handle the user
	 * provided, but it'll truncate exactly the same bits away in exactly the
	 * same fashion every time we use that handle, which is all that really
	 * matters.  Conversely, if dsm_handle is smaller than key_t, we won't use
	 * the full range of available key space, but that's no big deal either.
	 *
	 * We do make sure that the key isn't negative, because that might not be
	 * portable.
	 *
	 * NOTE(review): if key_t is signed and key holds its minimum value, the
	 * negation below overflows -- confirm whether that case needs handling.
	 */
	key = (key_t) handle;
	if (key < 1)				/* avoid compiler warning if type is unsigned */
		key = -key;

	/*
	 * There's one special key, IPC_PRIVATE, which can't be used.  If we end
	 * up with that value by chance during a create operation, just pretend it
	 * already exists, so that caller will retry.  If we run into it anywhere
	 * else, the caller has passed a handle that doesn't correspond to
	 * anything we ever created, which should not happen.
	 */
	if (key == IPC_PRIVATE)
	{
		if (op != DSM_OP_CREATE)
			elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
		errno = EEXIST;
		return false;
	}

	/*
	 * Before we can do anything with a shared memory segment, we have to map
	 * the shared memory key to a shared memory identifier using shmget().  To
	 * avoid repeated lookups, we cache the identifier via impl_private.
	 */
	if (*impl_private != NULL)
	{
		ident_cache = *impl_private;
		ident = *ident_cache;
	}
	else
	{
		int			flags = IPCProtection;
		size_t		segsize;

		/*
		 * Allocate the memory BEFORE acquiring the resource, so that we don't
		 * leak the resource if memory allocation fails.
		 */
		ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));

		/*
		 * When using shmget to find an existing segment, we must pass the
		 * size as 0.  Passing a non-zero size which is greater than the
		 * actual size will result in EINVAL.
		 */
		segsize = 0;

		if (op == DSM_OP_CREATE)
		{
			flags |= IPC_CREAT | IPC_EXCL;
			segsize = request_size;
		}

		if ((ident = shmget(key, segsize, flags)) == -1)
		{
			int			save_errno = errno;

			/*
			 * Release the cache cell on every failure path, including the
			 * silent key-collision case, so that repeated collisions don't
			 * accumulate garbage in TopMemoryContext.
			 */
			pfree(ident_cache);
			errno = save_errno;

			/* A key collision during create is reported silently. */
			if (op == DSM_OP_ATTACH || errno != EEXIST)
				ereport(elevel,
						(errcode_for_dynamic_shared_memory(),
						 errmsg("could not get shared memory segment: %m")));
			return false;
		}

		*ident_cache = ident;
		*impl_private = ident_cache;
	}

	/* Handle teardown cases. */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		/* "ident" already holds the identifier, so the cache can go now. */
		pfree(ident_cache);
		*impl_private = NULL;
		if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/* If we're attaching it, we must use IPC_STAT to determine the size. */
	if (op == DSM_OP_ATTACH)
	{
		struct shmid_ds shm;

		if (shmctl(ident, IPC_STAT, &shm) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = shm.shm_segsz;
	}

	/* Map it. */
	address = shmat(ident, NULL, PG_SHMAT_FLAGS);
	if (address == (void *) -1)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		if (op == DSM_OP_CREATE)
			shmctl(ident, IPC_RMID, NULL);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;

	return true;
}
#endif
594 
#ifdef USE_DSM_WINDOWS
/*
 * Operating system primitives to support Windows shared memory.
 *
 * Windows shared memory implementation is done using file mapping
 * which can be backed by either physical file or system paging file.
 * Current implementation uses system paging file as other effects
 * like performance are not clear for physical file and it is used in similar
 * way for main shared memory in windows.
 *
 * A memory mapping object is a kernel object - they always get deleted when
 * the last reference to them goes away, either explicitly via a CloseHandle or
 * when the process containing the reference exits.
 *
 * Arguments and return value are as described for dsm_impl_op().  Here
 * *impl_private holds the HANDLE of the file-mapping object; it is closed
 * and reset to NULL on detach/destroy.
 */
static bool
dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
				 void **impl_private, void **mapped_address,
				 Size *mapped_size, int elevel)
{
	char	   *address;
	HANDLE		hmap;
	char		name[64];
	MEMORY_BASIC_INFORMATION info;

	/*
	 * Storing the shared memory segment in the Global\ namespace can allow
	 * any process running in any session to access that file mapping object
	 * provided that the caller has the required access rights.  But to avoid
	 * issues faced in main shared memory, we are using the naming convention
	 * similar to main shared memory.  We can change here once issue mentioned
	 * in GetSharedMemName is resolved.
	 */
	snprintf(name, sizeof(name), "%s.%u", SEGMENT_NAME_PREFIX, handle);

	/*
	 * Handle teardown cases.  Since Windows automatically destroys the object
	 * when no references remain, we can treat it the same as detach.
	 */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		if (*mapped_address != NULL
			&& UnmapViewOfFile(*mapped_address) == 0)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		if (*impl_private != NULL
			&& CloseHandle(*impl_private) == 0)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}

		*impl_private = NULL;
		*mapped_address = NULL;
		*mapped_size = 0;
		return true;
	}

	/* Create new segment or open an existing one for attach. */
	if (op == DSM_OP_CREATE)
	{
		DWORD		size_high;
		DWORD		size_low;
		DWORD		errcode;

		/* Shifts >= the width of the type are undefined. */
#ifdef _WIN64
		size_high = request_size >> 32;
#else
		size_high = 0;
#endif
		size_low = (DWORD) request_size;

		/* CreateFileMapping might not clear the error code on success */
		SetLastError(0);

		hmap = CreateFileMapping(INVALID_HANDLE_VALUE,	/* Use the pagefile */
								 NULL,	/* Default security attrs */
								 PAGE_READWRITE,	/* Memory is read/write */
								 size_high, /* Upper 32 bits of size */
								 size_low,	/* Lower 32 bits of size */
								 name);

		errcode = GetLastError();
		if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
		{
			/*
			 * On Windows, when the segment already exists, a handle for the
			 * existing segment is returned.  We must close it before
			 * returning.  However, if the existing segment is created by a
			 * service, then it returns ERROR_ACCESS_DENIED. We don't do
			 * _dosmaperr here, so errno won't be modified.
			 */
			if (hmap)
				CloseHandle(hmap);
			return false;
		}

		if (!hmap)
		{
			/* Translate the error captured above into errno for %m. */
			_dosmaperr(errcode);
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not create shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}
	else
	{
		hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
							   FALSE,	/* do not inherit the name */
							   name);	/* name of mapping object */
		if (!hmap)
		{
			_dosmaperr(GetLastError());
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
			return false;
		}
	}

	/* Map it. */
	address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
							0, 0, 0);
	if (!address)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	/*
	 * VirtualQuery gives size in page_size units, which is 4K for Windows. We
	 * need size only when we are attaching, but it's better to get the size
	 * when creating new segment to keep size consistent both for
	 * DSM_OP_CREATE and DSM_OP_ATTACH.
	 */
	if (VirtualQuery(address, &info, sizeof(info)) == 0)
	{
		int			save_errno;

		_dosmaperr(GetLastError());
		/* Back out what's already been done. */
		save_errno = errno;
		UnmapViewOfFile(address);
		CloseHandle(hmap);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not stat shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	*mapped_address = address;
	*mapped_size = info.RegionSize;
	*impl_private = hmap;

	return true;
}
#endif
779 
#ifdef USE_DSM_MMAP
/*
 * Operating system primitives to support mmap-based shared memory.
 *
 * Calling this "shared memory" is somewhat of a misnomer, because what
 * we're really doing is creating a bunch of files and mapping them into
 * our address space.  The operating system may feel obliged to
 * synchronize the contents to disk even if nothing is being paged out,
 * which will not serve us well.  The user can relocate the pg_dynshmem
 * directory to a ramdisk to avoid this problem, if available.
 *
 * Arguments and return value are as described for dsm_impl_op().  This
 * implementation never reads or writes impl_private.  The backing file is
 * PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX followed by the handle.
 */
static bool
dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
			  void **impl_private, void **mapped_address, Size *mapped_size,
			  int elevel)
{
	char		name[64];
	int			flags;
	int			fd;
	char	   *address;

	snprintf(name, sizeof(name),
			 PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u",
			 handle);

	/* Handle teardown cases. */
	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
	{
		if (*mapped_address != NULL
			&& munmap(*mapped_address, *mapped_size) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not unmap shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		*mapped_address = NULL;
		*mapped_size = 0;
		if (op == DSM_OP_DESTROY && unlink(name) != 0)
		{
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not remove shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		return true;
	}

	/* Create new segment or open an existing one for attach. */
	flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
	if ((fd = OpenTransientFile(name, flags)) == -1)
	{
		/* A name collision during create is reported silently. */
		if (op == DSM_OP_ATTACH || errno != EEXIST)
			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not open shared memory segment \"%s\": %m",
							name)));
		return false;
	}

	/*
	 * If we're attaching the segment, determine the current size; if we are
	 * creating the segment, set the size to the requested value.
	 */
	if (op == DSM_OP_ATTACH)
	{
		struct stat st;

		if (fstat(fd, &st) != 0)
		{
			int			save_errno;

			/* Back out what's already been done. */
			save_errno = errno;
			CloseTransientFile(fd);
			errno = save_errno;

			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not stat shared memory segment \"%s\": %m",
							name)));
			return false;
		}
		request_size = st.st_size;
	}
	else
	{
		/*
		 * Allocate a buffer full of zeros.
		 *
		 * Note: palloc zbuffer, instead of just using a local char array, to
		 * ensure it is reasonably well-aligned; this may save a few cycles
		 * transferring data to the kernel.
		 */
		char	   *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
		Size		remaining = request_size;
		bool		success = true;
		int			save_errno;

		/*
		 * Zero-fill the file.  We have to do this the hard way to ensure that
		 * all the file space has really been allocated, so that we don't
		 * later seg fault when accessing the memory mapping.  This is pretty
		 * pessimal.
		 */
		while (success && remaining > 0)
		{
			Size		goal = remaining;

			if (goal > ZBUFFER_SIZE)
				goal = ZBUFFER_SIZE;
			pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);

			/*
			 * Clear errno first: a short write leaves it untouched, and the
			 * ENOSPC fallback below only works if no stale value remains.
			 */
			errno = 0;
			if (write(fd, zbuffer, goal) == goal)
				remaining -= goal;
			else
				success = false;
			pgstat_report_wait_end();
		}

		/* Done with the zero buffer; release it without disturbing errno. */
		save_errno = errno;
		pfree(zbuffer);
		errno = save_errno;

		if (!success)
		{
			/* Back out what's already been done. */
			save_errno = errno;
			CloseTransientFile(fd);
			unlink(name);
			/* A short write sets no errno; assume the disk is full. */
			errno = save_errno ? save_errno : ENOSPC;

			ereport(elevel,
					(errcode_for_dynamic_shared_memory(),
					 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
							name, request_size)));
			return false;
		}
	}

	/* Map it. */
	address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
				   MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
	if (address == MAP_FAILED)
	{
		int			save_errno;

		/* Back out what's already been done. */
		save_errno = errno;
		CloseTransientFile(fd);
		/* Only remove the file if this call is the one that created it. */
		if (op == DSM_OP_CREATE)
			unlink(name);
		errno = save_errno;

		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not map shared memory segment \"%s\": %m",
						name)));
		return false;
	}
	*mapped_address = address;
	*mapped_size = request_size;

	if (CloseTransientFile(fd) != 0)
	{
		ereport(elevel,
				(errcode_for_dynamic_shared_memory(),
				 errmsg("could not close shared memory segment \"%s\": %m",
						name)));
		return false;
	}

	return true;
}
#endif
952 
953 /*
954  * Implementation-specific actions that must be performed when a segment is to
955  * be preserved even when no backend has it attached.
956  *
957  * Except on Windows, we don't need to do anything at all. But since Windows
958  * cleans up segments automatically when no references remain, we duplicate
959  * the segment handle into the postmaster process. The postmaster needn't
960  * do anything to receive the handle; Windows transfers it automatically.
961  */
962 void
963 dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
964  void **impl_private_pm_handle)
965 {
967  {
968 #ifdef USE_DSM_WINDOWS
969  case DSM_IMPL_WINDOWS:
970  if (IsUnderPostmaster)
971  {
972  HANDLE hmap;
973 
974  if (!DuplicateHandle(GetCurrentProcess(), impl_private,
975  PostmasterHandle, &hmap, 0, FALSE,
976  DUPLICATE_SAME_ACCESS))
977  {
978  char name[64];
979 
980  snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
981  _dosmaperr(GetLastError());
982  ereport(ERROR,
984  errmsg("could not duplicate handle for \"%s\": %m",
985  name)));
986  }
987 
988  /*
989  * Here, we remember the handle that we created in the
990  * postmaster process. This handle isn't actually usable in
991  * any process other than the postmaster, but that doesn't
992  * matter. We're just holding onto it so that, if the segment
993  * is unpinned, dsm_impl_unpin_segment can close it.
994  */
995  *impl_private_pm_handle = hmap;
996  }
997  break;
998 #endif
999  default:
1000  break;
1001  }
1002 }
1003 
1004 /*
1005  * Implementation-specific actions that must be performed when a segment is no
1006  * longer to be preserved, so that it will be cleaned up when all backends
1007  * have detached from it.
1008  *
1009  * Except on Windows, we don't need to do anything at all. For Windows, we
1010  * close the extra handle that dsm_impl_pin_segment created in the
1011  * postmaster's process space.
1012  */
1013 void
1014 dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
1015 {
1017  {
1018 #ifdef USE_DSM_WINDOWS
1019  case DSM_IMPL_WINDOWS:
1020  if (IsUnderPostmaster)
1021  {
1022  if (*impl_private &&
1023  !DuplicateHandle(PostmasterHandle, *impl_private,
1024  NULL, NULL, 0, FALSE,
1025  DUPLICATE_CLOSE_SOURCE))
1026  {
1027  char name[64];
1028 
1029  snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
1030  _dosmaperr(GetLastError());
1031  ereport(ERROR,
1033  errmsg("could not duplicate handle for \"%s\": %m",
1034  name)));
1035  }
1036 
1037  *impl_private = NULL;
1038  }
1039  break;
1040 #endif
1041  default:
1042  break;
1043  }
1044 }
1045 
1046 static int
1048 {
1049  if (errno == EFBIG || errno == ENOMEM)
1050  return errcode(ERRCODE_OUT_OF_MEMORY);
1051  else
1052  return errcode_for_file_access();
1053 }
sigset_t BlockSig
Definition: pqsignal.c:23
#define Assert(condition)
Definition: c.h:837
size_t Size
Definition: c.h:584
void dsm_impl_pin_segment(dsm_handle handle, void *impl_private, void **impl_private_pm_handle)
Definition: dsm_impl.c:963
int min_dynamic_shared_memory
Definition: dsm_impl.c:115
static int errcode_for_dynamic_shared_memory(void)
Definition: dsm_impl.c:1047
#define SEGMENT_NAME_PREFIX
Definition: dsm_impl.c:120
void dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
Definition: dsm_impl.c:1014
static int dsm_impl_posix_resize(int fd, off_t size)
Definition: dsm_impl.c:351
bool dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
Definition: dsm_impl.c:159
int dynamic_shared_memory_type
Definition: dsm_impl.c:112
static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
Definition: dsm_impl.c:423
const struct config_enum_entry dynamic_shared_memory_options[]
Definition: dsm_impl.c:95
static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
Definition: dsm_impl.c:212
#define ZBUFFER_SIZE
Definition: dsm_impl.c:118
static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
Definition: dsm_impl.c:792
uint32 dsm_handle
Definition: dsm_impl.h:55
dsm_op
Definition: dsm_impl.h:62
@ DSM_OP_DETACH
Definition: dsm_impl.h:65
@ DSM_OP_CREATE
Definition: dsm_impl.h:63
@ DSM_OP_DESTROY
Definition: dsm_impl.h:66
@ DSM_OP_ATTACH
Definition: dsm_impl.h:64
#define DSM_IMPL_WINDOWS
Definition: dsm_impl.h:19
#define DSM_IMPL_POSIX
Definition: dsm_impl.h:17
#define DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE
Definition: dsm_impl.h:36
#define PG_DYNSHMEM_MMAP_FILE_PREFIX
Definition: dsm_impl.h:52
#define PG_DYNSHMEM_DIR
Definition: dsm_impl.h:51
#define DSM_IMPL_SYSV
Definition: dsm_impl.h:18
#define DSM_IMPL_MMAP
Definition: dsm_impl.h:20
int errcode_for_file_access(void)
Definition: elog.c:876
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:225
#define ereport(elevel,...)
Definition: elog.h:149
#define DEBUG4
Definition: elog.h:27
int CloseTransientFile(int fd)
Definition: fd.c:2831
void ReleaseExternalFD(void)
Definition: fd.c:1238
void ReserveExternalFD(void)
Definition: fd.c:1220
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2655
#define PG_FILE_MODE_OWNER
Definition: file_perm.h:38
bool IsUnderPostmaster
Definition: globals.c:119
#define ident
Definition: indent_codes.h:47
int remaining
Definition: informix.c:692
static bool success
Definition: initdb.c:186
#define close(a)
Definition: win32.h:12
#define write(a, b, c)
Definition: win32.h:14
void pfree(void *pointer)
Definition: mcxt.c:1521
MemoryContext TopMemoryContext
Definition: mcxt.c:149
void * palloc0(Size size)
Definition: mcxt.c:1347
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1181
#define PG_SHMAT_FLAGS
Definition: mem.h:20
#define MAP_FAILED
Definition: mem.h:45
#define MAP_HASSEMAPHORE
Definition: mem.h:30
#define MAP_NOSYNC
Definition: mem.h:38
#define snprintf
Definition: port.h:238
#define IPCProtection
Definition: posix_sema.c:59
static int fd(const char *x, int i)
Definition: preproc-init.c:105
static pg_noinline void Size size
Definition: slab.c:607
Definition: guc.h:170
__int64 st_size
Definition: win32_port.h:273
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: wait_event.h:85
static void pgstat_report_wait_end(void)
Definition: wait_event.h:101
const char * name
#define IPC_STAT
Definition: win32_port.h:100
#define EINTR
Definition: win32_port.h:374
#define IPC_RMID
Definition: win32_port.h:95
void _dosmaperr(unsigned long)
Definition: win32error.c:177
long key_t
Definition: win32_port.h:247
#define fstat
Definition: win32_port.h:283
#define ftruncate(a, b)
Definition: win32_port.h:82
#define IPC_EXCL
Definition: win32_port.h:97
#define IPC_CREAT
Definition: win32_port.h:96
#define IPC_PRIVATE
Definition: win32_port.h:98