/* PostgreSQL source file: src/backend/storage/ipc/dsm_impl.c */
1 /*-------------------------------------------------------------------------
2  *
3  * dsm_impl.c
4  * manage dynamic shared memory segments
5  *
6  * This file provides low-level APIs for creating and destroying shared
7  * memory segments using several different possible techniques. We refer
8  * to these segments as dynamic because they can be created, altered, and
9  * destroyed at any point during the server life cycle. This is unlike
10  * the main shared memory segment, of which there is always exactly one
11  * and which is always mapped at a fixed address in every PostgreSQL
12  * background process.
13  *
14  * Because not all systems provide the same primitives in this area, nor
15  * do all primitives behave the same way on all systems, we provide
16  * several implementations of this facility. Many systems implement
17  * POSIX shared memory (shm_open etc.), which is well-suited to our needs
18  * in this area, with the exception that shared memory identifiers live
19  * in a flat system-wide namespace, raising the uncomfortable prospect of
20  * name collisions with other processes (including other copies of
21  * PostgreSQL) running on the same system. Some systems only support
22  * the older System V shared memory interface (shmget etc.) which is
23  * also usable; however, the default allocation limits are often quite
24  * small, and the namespace is even more restricted.
25  *
26  * We also provide an mmap-based shared memory implementation. This may
27  * be useful on systems that provide shared memory via a special-purpose
28  * filesystem; by opting for this implementation, the user can even
29  * control precisely where their shared memory segments are placed. It
30  * can also be used as a fallback for systems where shm_open and shmget
31  * are not available or can't be used for some reason. Of course,
32  * mapping a file residing on an actual spinning disk is a fairly poor
33  * approximation for shared memory because writeback may hurt performance
34  * substantially, but there should be few systems where we must make do
35  * with such poor tools.
36  *
37  * As ever, Windows requires its own implementation.
38  *
39  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
40  * Portions Copyright (c) 1994, Regents of the University of California
41  *
42  *
43  * IDENTIFICATION
44  * src/backend/storage/ipc/dsm_impl.c
45  *
46  *-------------------------------------------------------------------------
47  */
48 
49 #include "postgres.h"
50 
51 #include <fcntl.h>
52 #include <unistd.h>
53 #ifndef WIN32
54 #include <sys/mman.h>
55 #endif
56 #include <sys/stat.h>
57 #ifdef HAVE_SYS_IPC_H
58 #include <sys/ipc.h>
59 #endif
60 #ifdef HAVE_SYS_SHM_H
61 #include <sys/shm.h>
62 #endif
63 #include "common/file_perm.h"
64 #include "pgstat.h"
65 
66 #include "portability/mem.h"
67 #include "storage/dsm_impl.h"
68 #include "storage/fd.h"
69 #include "utils/guc.h"
70 #include "utils/memutils.h"
71 #include "postmaster/postmaster.h"
72 
73 #ifdef USE_DSM_POSIX
74 static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
75  void **impl_private, void **mapped_address,
76  Size *mapped_size, int elevel);
77 static int dsm_impl_posix_resize(int fd, off_t size);
78 #endif
79 #ifdef USE_DSM_SYSV
80 static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
81  void **impl_private, void **mapped_address,
82  Size *mapped_size, int elevel);
83 #endif
84 #ifdef USE_DSM_WINDOWS
85 static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
86  void **impl_private, void **mapped_address,
87  Size *mapped_size, int elevel);
88 #endif
89 #ifdef USE_DSM_MMAP
90 static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
91  void **impl_private, void **mapped_address,
92  Size *mapped_size, int elevel);
93 #endif
94 static int errcode_for_dynamic_shared_memory(void);
95 
97 #ifdef USE_DSM_POSIX
98  {"posix", DSM_IMPL_POSIX, false},
99 #endif
100 #ifdef USE_DSM_SYSV
101  {"sysv", DSM_IMPL_SYSV, false},
102 #endif
103 #ifdef USE_DSM_WINDOWS
104  {"windows", DSM_IMPL_WINDOWS, false},
105 #endif
106 #ifdef USE_DSM_MMAP
107  {"mmap", DSM_IMPL_MMAP, false},
108 #endif
109  {"none", DSM_IMPL_NONE, false},
110  {NULL, 0, false}
111 };
112 
/* Implementation selector (set from the dynamic_shared_memory_type GUC). */
int			dynamic_shared_memory_type;

116 /* Size of buffer to be used for zero-filling. */
117 #define ZBUFFER_SIZE 8192
118 
119 #define SEGMENT_NAME_PREFIX "Global/PostgreSQL"
120 
121 /*------
122  * Perform a low-level shared memory operation in a platform-specific way,
123  * as dictated by the selected implementation. Each implementation is
124  * required to implement the following primitives.
125  *
126  * DSM_OP_CREATE. Create a segment whose size is the request_size and
127  * map it.
128  *
129  * DSM_OP_ATTACH. Map the segment, whose size must be the request_size.
130  * The segment may already be mapped; any existing mapping should be removed
131  * before creating a new one.
132  *
133  * DSM_OP_DETACH. Unmap the segment.
134  *
135  * DSM_OP_RESIZE. Resize the segment to the given request_size and
136  * remap the segment at that new size.
137  *
138  * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the
139  * segment.
140  *
141  * Arguments:
142  * op: The operation to be performed.
143  * handle: The handle of an existing object, or for DSM_OP_CREATE, the
144  * a new handle the caller wants created.
145  * request_size: For DSM_OP_CREATE, the requested size. For DSM_OP_RESIZE,
146  * the new size. Otherwise, 0.
147  * impl_private: Private, implementation-specific data. Will be a pointer
148  * to NULL for the first operation on a shared memory segment within this
149  * backend; thereafter, it will point to the value to which it was set
150  * on the previous call.
151  * mapped_address: Pointer to start of current mapping; pointer to NULL
152  * if none. Updated with new mapping address.
153  * mapped_size: Pointer to size of current mapping; pointer to 0 if none.
154  * Updated with new mapped size.
155  * elevel: Level at which to log errors.
156  *
157  * Return value: true on success, false on failure. When false is returned,
158  * a message should first be logged at the specified elevel, except in the
159  * case where DSM_OP_CREATE experiences a name collision, which should
160  * silently return false.
161  *-----
162  */
163 bool
164 dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
165  void **impl_private, void **mapped_address, Size *mapped_size,
166  int elevel)
167 {
168  Assert(op == DSM_OP_CREATE || op == DSM_OP_RESIZE || request_size == 0);
169  Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) ||
170  (*mapped_address == NULL && *mapped_size == 0));
171 
173  {
174 #ifdef USE_DSM_POSIX
175  case DSM_IMPL_POSIX:
176  return dsm_impl_posix(op, handle, request_size, impl_private,
177  mapped_address, mapped_size, elevel);
178 #endif
179 #ifdef USE_DSM_SYSV
180  case DSM_IMPL_SYSV:
181  return dsm_impl_sysv(op, handle, request_size, impl_private,
182  mapped_address, mapped_size, elevel);
183 #endif
184 #ifdef USE_DSM_WINDOWS
185  case DSM_IMPL_WINDOWS:
186  return dsm_impl_windows(op, handle, request_size, impl_private,
187  mapped_address, mapped_size, elevel);
188 #endif
189 #ifdef USE_DSM_MMAP
190  case DSM_IMPL_MMAP:
191  return dsm_impl_mmap(op, handle, request_size, impl_private,
192  mapped_address, mapped_size, elevel);
193 #endif
194  default:
195  elog(ERROR, "unexpected dynamic shared memory type: %d",
197  return false;
198  }
199 }
200 
201 /*
202  * Does the current dynamic shared memory implementation support resizing
203  * segments? (The answer here could be platform-dependent in the future,
204  * since AIX allows shmctl(shmid, SHM_RESIZE, &buffer), though you apparently
205  * can't resize segments to anything larger than 256MB that way. For now,
206  * we keep it simple.)
207  */
208 bool
210 {
212  {
213  case DSM_IMPL_NONE:
214  return false;
215  case DSM_IMPL_POSIX:
216  return true;
217  case DSM_IMPL_SYSV:
218  return false;
219  case DSM_IMPL_WINDOWS:
220  return false;
221  case DSM_IMPL_MMAP:
222  return true;
223  default:
224  return false; /* should not happen */
225  }
226 }
227 
228 #ifdef USE_DSM_POSIX
229 /*
230  * Operating system primitives to support POSIX shared memory.
231  *
232  * POSIX shared memory segments are created and attached using shm_open()
233  * and shm_unlink(); other operations, such as sizing or mapping the
234  * segment, are performed as if the shared memory segments were files.
235  *
236  * Indeed, on some platforms, they may be implemented that way. While
237  * POSIX shared memory segments seem intended to exist in a flat namespace,
238  * some operating systems may implement them as files, even going so far
239  * to treat a request for /xyz as a request to create a file by that name
240  * in the root directory. Users of such broken platforms should select
241  * a different shared memory implementation.
242  */
243 static bool
244 dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
245  void **impl_private, void **mapped_address, Size *mapped_size,
246  int elevel)
247 {
248  char name[64];
249  int flags;
250  int fd;
251  char *address;
252 
253  snprintf(name, 64, "/PostgreSQL.%u", handle);
254 
255  /* Handle teardown cases. */
256  if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
257  {
258  if (*mapped_address != NULL
259  && munmap(*mapped_address, *mapped_size) != 0)
260  {
261  ereport(elevel,
263  errmsg("could not unmap shared memory segment \"%s\": %m",
264  name)));
265  return false;
266  }
267  *mapped_address = NULL;
268  *mapped_size = 0;
269  if (op == DSM_OP_DESTROY && shm_unlink(name) != 0)
270  {
271  ereport(elevel,
273  errmsg("could not remove shared memory segment \"%s\": %m",
274  name)));
275  return false;
276  }
277  return true;
278  }
279 
280  /*
281  * Create new segment or open an existing one for attach or resize.
282  *
283  * Even though we're not going through fd.c, we should be safe against
284  * running out of file descriptors, because of NUM_RESERVED_FDS. We're
285  * only opening one extra descriptor here, and we'll close it before
286  * returning.
287  */
288  flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
289  if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
290  {
291  if (errno != EEXIST)
292  ereport(elevel,
294  errmsg("could not open shared memory segment \"%s\": %m",
295  name)));
296  return false;
297  }
298 
299  /*
300  * If we're attaching the segment, determine the current size; if we are
301  * creating or resizing the segment, set the size to the requested value.
302  */
303  if (op == DSM_OP_ATTACH)
304  {
305  struct stat st;
306 
307  if (fstat(fd, &st) != 0)
308  {
309  int save_errno;
310 
311  /* Back out what's already been done. */
312  save_errno = errno;
313  close(fd);
314  errno = save_errno;
315 
316  ereport(elevel,
318  errmsg("could not stat shared memory segment \"%s\": %m",
319  name)));
320  return false;
321  }
322  request_size = st.st_size;
323  }
324  else if (*mapped_size != request_size &&
325  dsm_impl_posix_resize(fd, request_size) != 0)
326  {
327  int save_errno;
328 
329  /* Back out what's already been done. */
330  save_errno = errno;
331  close(fd);
332  if (op == DSM_OP_CREATE)
333  shm_unlink(name);
334  errno = save_errno;
335 
336  ereport(elevel,
338  errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
339  name, request_size)));
340  return false;
341  }
342 
343  /*
344  * If we're reattaching or resizing, we must remove any existing mapping,
345  * unless we've already got the right thing mapped.
346  */
347  if (*mapped_address != NULL)
348  {
349  if (*mapped_size == request_size)
350  return true;
351  if (munmap(*mapped_address, *mapped_size) != 0)
352  {
353  int save_errno;
354 
355  /* Back out what's already been done. */
356  save_errno = errno;
357  close(fd);
358  if (op == DSM_OP_CREATE)
359  shm_unlink(name);
360  errno = save_errno;
361 
362  ereport(elevel,
364  errmsg("could not unmap shared memory segment \"%s\": %m",
365  name)));
366  return false;
367  }
368  *mapped_address = NULL;
369  *mapped_size = 0;
370  }
371 
372  /* Map it. */
373  address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
374  MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
375  if (address == MAP_FAILED)
376  {
377  int save_errno;
378 
379  /* Back out what's already been done. */
380  save_errno = errno;
381  close(fd);
382  if (op == DSM_OP_CREATE)
383  shm_unlink(name);
384  errno = save_errno;
385 
386  ereport(elevel,
388  errmsg("could not map shared memory segment \"%s\": %m",
389  name)));
390  return false;
391  }
392  *mapped_address = address;
393  *mapped_size = request_size;
394  close(fd);
395 
396  return true;
397 }
398 
399 /*
400  * Set the size of a virtual memory region associated with a file descriptor.
401  * If necessary, also ensure that virtual memory is actually allocated by the
402  * operating system, to avoid nasty surprises later.
403  *
404  * Returns non-zero if either truncation or allocation fails, and sets errno.
405  */
static int
dsm_impl_posix_resize(int fd, off_t size)
{
	int			result;

	/* Truncate (or extend) the file to the requested size. */
	result = ftruncate(fd, size);
	if (result != 0)
		return result;

#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)

	/*
	 * On Linux, a shm_open fd is backed by a tmpfs file.  After resizing
	 * with ftruncate, the file may contain a hole.  Accessing memory backed
	 * by a hole causes tmpfs to allocate pages, which fails with SIGBUS if
	 * there is no more tmpfs space available.  So we ask tmpfs to allocate
	 * pages here, so we can fail gracefully with ENOSPC now rather than
	 * risking SIGBUS later.
	 *
	 * posix_fallocate() can be interrupted by a signal; simply retry.
	 */
	do
	{
		result = posix_fallocate(fd, 0, size);
	} while (result == EINTR);

	/*
	 * The caller expects errno to be set, but posix_fallocate() doesn't set
	 * it; it returns error numbers directly.  So set errno here, even though
	 * we'll also return the code to indicate success or failure.
	 */
	errno = result;
#endif							/* HAVE_POSIX_FALLOCATE && __linux__ */

	return result;
}
442 
443 #endif /* USE_DSM_POSIX */
444 
445 #ifdef USE_DSM_SYSV
446 /*
447  * Operating system primitives to support System V shared memory.
448  *
449  * System V shared memory segments are manipulated using shmget(), shmat(),
450  * shmdt(), and shmctl(). There's no portable way to resize such
451  * segments. As the default allocation limits for System V shared memory
452  * are usually quite low, the POSIX facilities may be preferable; but
453  * those are not supported everywhere.
454  */
455 static bool
456 dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
457  void **impl_private, void **mapped_address, Size *mapped_size,
458  int elevel)
459 {
460  key_t key;
461  int ident;
462  char *address;
463  char name[64];
464  int *ident_cache;
465 
466  /* Resize is not supported for System V shared memory. */
467  if (op == DSM_OP_RESIZE)
468  {
469  elog(elevel, "System V shared memory segments cannot be resized");
470  return false;
471  }
472 
473  /* Since resize isn't supported, reattach is a no-op. */
474  if (op == DSM_OP_ATTACH && *mapped_address != NULL)
475  return true;
476 
477  /*
478  * POSIX shared memory and mmap-based shared memory identify segments with
479  * names. To avoid needless error message variation, we use the handle as
480  * the name.
481  */
482  snprintf(name, 64, "%u", handle);
483 
484  /*
485  * The System V shared memory namespace is very restricted; names are of
486  * type key_t, which is expected to be some sort of integer data type, but
487  * not necessarily the same one as dsm_handle. Since we use dsm_handle to
488  * identify shared memory segments across processes, this might seem like
489  * a problem, but it's really not. If dsm_handle is bigger than key_t,
490  * the cast below might truncate away some bits from the handle the
491  * user-provided, but it'll truncate exactly the same bits away in exactly
492  * the same fashion every time we use that handle, which is all that
493  * really matters. Conversely, if dsm_handle is smaller than key_t, we
494  * won't use the full range of available key space, but that's no big deal
495  * either.
496  *
497  * We do make sure that the key isn't negative, because that might not be
498  * portable.
499  */
500  key = (key_t) handle;
501  if (key < 1) /* avoid compiler warning if type is unsigned */
502  key = -key;
503 
504  /*
505  * There's one special key, IPC_PRIVATE, which can't be used. If we end
506  * up with that value by chance during a create operation, just pretend it
507  * already exists, so that caller will retry. If we run into it anywhere
508  * else, the caller has passed a handle that doesn't correspond to
509  * anything we ever created, which should not happen.
510  */
511  if (key == IPC_PRIVATE)
512  {
513  if (op != DSM_OP_CREATE)
514  elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
515  errno = EEXIST;
516  return false;
517  }
518 
519  /*
520  * Before we can do anything with a shared memory segment, we have to map
521  * the shared memory key to a shared memory identifier using shmget(). To
522  * avoid repeated lookups, we store the key using impl_private.
523  */
524  if (*impl_private != NULL)
525  {
526  ident_cache = *impl_private;
527  ident = *ident_cache;
528  }
529  else
530  {
531  int flags = IPCProtection;
532  size_t segsize;
533 
534  /*
535  * Allocate the memory BEFORE acquiring the resource, so that we don't
536  * leak the resource if memory allocation fails.
537  */
538  ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));
539 
540  /*
541  * When using shmget to find an existing segment, we must pass the
542  * size as 0. Passing a non-zero size which is greater than the
543  * actual size will result in EINVAL.
544  */
545  segsize = 0;
546 
547  if (op == DSM_OP_CREATE)
548  {
549  flags |= IPC_CREAT | IPC_EXCL;
550  segsize = request_size;
551  }
552 
553  if ((ident = shmget(key, segsize, flags)) == -1)
554  {
555  if (errno != EEXIST)
556  {
557  int save_errno = errno;
558 
559  pfree(ident_cache);
560  errno = save_errno;
561  ereport(elevel,
563  errmsg("could not get shared memory segment: %m")));
564  }
565  return false;
566  }
567 
568  *ident_cache = ident;
569  *impl_private = ident_cache;
570  }
571 
572  /* Handle teardown cases. */
573  if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
574  {
575  pfree(ident_cache);
576  *impl_private = NULL;
577  if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
578  {
579  ereport(elevel,
581  errmsg("could not unmap shared memory segment \"%s\": %m",
582  name)));
583  return false;
584  }
585  *mapped_address = NULL;
586  *mapped_size = 0;
587  if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
588  {
589  ereport(elevel,
591  errmsg("could not remove shared memory segment \"%s\": %m",
592  name)));
593  return false;
594  }
595  return true;
596  }
597 
598  /* If we're attaching it, we must use IPC_STAT to determine the size. */
599  if (op == DSM_OP_ATTACH)
600  {
601  struct shmid_ds shm;
602 
603  if (shmctl(ident, IPC_STAT, &shm) != 0)
604  {
605  ereport(elevel,
607  errmsg("could not stat shared memory segment \"%s\": %m",
608  name)));
609  return false;
610  }
611  request_size = shm.shm_segsz;
612  }
613 
614  /* Map it. */
615  address = shmat(ident, NULL, PG_SHMAT_FLAGS);
616  if (address == (void *) -1)
617  {
618  int save_errno;
619 
620  /* Back out what's already been done. */
621  save_errno = errno;
622  if (op == DSM_OP_CREATE)
623  shmctl(ident, IPC_RMID, NULL);
624  errno = save_errno;
625 
626  ereport(elevel,
628  errmsg("could not map shared memory segment \"%s\": %m",
629  name)));
630  return false;
631  }
632  *mapped_address = address;
633  *mapped_size = request_size;
634 
635  return true;
636 }
637 #endif
638 
639 #ifdef USE_DSM_WINDOWS
640 /*
641  * Operating system primitives to support Windows shared memory.
642  *
643  * Windows shared memory implementation is done using file mapping
644  * which can be backed by either physical file or system paging file.
645  * Current implementation uses system paging file as other effects
646  * like performance are not clear for physical file and it is used in similar
647  * way for main shared memory in windows.
648  *
649  * A memory mapping object is a kernel object - they always get deleted when
650  * the last reference to them goes away, either explicitly via a CloseHandle or
651  * when the process containing the reference exits.
652  */
653 static bool
654 dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
655  void **impl_private, void **mapped_address,
656  Size *mapped_size, int elevel)
657 {
658  char *address;
659  HANDLE hmap;
660  char name[64];
661  MEMORY_BASIC_INFORMATION info;
662 
663  /* Resize is not supported for Windows shared memory. */
664  if (op == DSM_OP_RESIZE)
665  {
666  elog(elevel, "Windows shared memory segments cannot be resized");
667  return false;
668  }
669 
670  /* Since resize isn't supported, reattach is a no-op. */
671  if (op == DSM_OP_ATTACH && *mapped_address != NULL)
672  return true;
673 
674  /*
675  * Storing the shared memory segment in the Global\ namespace, can allow
676  * any process running in any session to access that file mapping object
677  * provided that the caller has the required access rights. But to avoid
678  * issues faced in main shared memory, we are using the naming convention
679  * similar to main shared memory. We can change here once issue mentioned
680  * in GetSharedMemName is resolved.
681  */
682  snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
683 
684  /*
685  * Handle teardown cases. Since Windows automatically destroys the object
686  * when no references remain, we can treat it the same as detach.
687  */
688  if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
689  {
690  if (*mapped_address != NULL
691  && UnmapViewOfFile(*mapped_address) == 0)
692  {
693  _dosmaperr(GetLastError());
694  ereport(elevel,
696  errmsg("could not unmap shared memory segment \"%s\": %m",
697  name)));
698  return false;
699  }
700  if (*impl_private != NULL
701  && CloseHandle(*impl_private) == 0)
702  {
703  _dosmaperr(GetLastError());
704  ereport(elevel,
706  errmsg("could not remove shared memory segment \"%s\": %m",
707  name)));
708  return false;
709  }
710 
711  *impl_private = NULL;
712  *mapped_address = NULL;
713  *mapped_size = 0;
714  return true;
715  }
716 
717  /* Create new segment or open an existing one for attach. */
718  if (op == DSM_OP_CREATE)
719  {
720  DWORD size_high;
721  DWORD size_low;
722  DWORD errcode;
723 
724  /* Shifts >= the width of the type are undefined. */
725 #ifdef _WIN64
726  size_high = request_size >> 32;
727 #else
728  size_high = 0;
729 #endif
730  size_low = (DWORD) request_size;
731 
732  /* CreateFileMapping might not clear the error code on success */
733  SetLastError(0);
734 
735  hmap = CreateFileMapping(INVALID_HANDLE_VALUE, /* Use the pagefile */
736  NULL, /* Default security attrs */
737  PAGE_READWRITE, /* Memory is read/write */
738  size_high, /* Upper 32 bits of size */
739  size_low, /* Lower 32 bits of size */
740  name);
741 
742  errcode = GetLastError();
743  if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
744  {
745  /*
746  * On Windows, when the segment already exists, a handle for the
747  * existing segment is returned. We must close it before
748  * returning. However, if the existing segment is created by a
749  * service, then it returns ERROR_ACCESS_DENIED. We don't do
750  * _dosmaperr here, so errno won't be modified.
751  */
752  if (hmap)
753  CloseHandle(hmap);
754  return false;
755  }
756 
757  if (!hmap)
758  {
759  _dosmaperr(errcode);
760  ereport(elevel,
762  errmsg("could not create shared memory segment \"%s\": %m",
763  name)));
764  return false;
765  }
766  }
767  else
768  {
769  hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
770  FALSE, /* do not inherit the name */
771  name); /* name of mapping object */
772  if (!hmap)
773  {
774  _dosmaperr(GetLastError());
775  ereport(elevel,
777  errmsg("could not open shared memory segment \"%s\": %m",
778  name)));
779  return false;
780  }
781  }
782 
783  /* Map it. */
784  address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
785  0, 0, 0);
786  if (!address)
787  {
788  int save_errno;
789 
790  _dosmaperr(GetLastError());
791  /* Back out what's already been done. */
792  save_errno = errno;
793  CloseHandle(hmap);
794  errno = save_errno;
795 
796  ereport(elevel,
798  errmsg("could not map shared memory segment \"%s\": %m",
799  name)));
800  return false;
801  }
802 
803  /*
804  * VirtualQuery gives size in page_size units, which is 4K for Windows. We
805  * need size only when we are attaching, but it's better to get the size
806  * when creating new segment to keep size consistent both for
807  * DSM_OP_CREATE and DSM_OP_ATTACH.
808  */
809  if (VirtualQuery(address, &info, sizeof(info)) == 0)
810  {
811  int save_errno;
812 
813  _dosmaperr(GetLastError());
814  /* Back out what's already been done. */
815  save_errno = errno;
816  UnmapViewOfFile(address);
817  CloseHandle(hmap);
818  errno = save_errno;
819 
820  ereport(elevel,
822  errmsg("could not stat shared memory segment \"%s\": %m",
823  name)));
824  return false;
825  }
826 
827  *mapped_address = address;
828  *mapped_size = info.RegionSize;
829  *impl_private = hmap;
830 
831  return true;
832 }
833 #endif
834 
835 #ifdef USE_DSM_MMAP
836 /*
837  * Operating system primitives to support mmap-based shared memory.
838  *
839  * Calling this "shared memory" is somewhat of a misnomer, because what
840  * we're really doing is creating a bunch of files and mapping them into
841  * our address space. The operating system may feel obliged to
842  * synchronize the contents to disk even if nothing is being paged out,
843  * which will not serve us well. The user can relocate the pg_dynshmem
844  * directory to a ramdisk to avoid this problem, if available.
845  */
846 static bool
847 dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
848  void **impl_private, void **mapped_address, Size *mapped_size,
849  int elevel)
850 {
851  char name[64];
852  int flags;
853  int fd;
854  char *address;
855 
857  handle);
858 
859  /* Handle teardown cases. */
860  if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
861  {
862  if (*mapped_address != NULL
863  && munmap(*mapped_address, *mapped_size) != 0)
864  {
865  ereport(elevel,
867  errmsg("could not unmap shared memory segment \"%s\": %m",
868  name)));
869  return false;
870  }
871  *mapped_address = NULL;
872  *mapped_size = 0;
873  if (op == DSM_OP_DESTROY && unlink(name) != 0)
874  {
875  ereport(elevel,
877  errmsg("could not remove shared memory segment \"%s\": %m",
878  name)));
879  return false;
880  }
881  return true;
882  }
883 
884  /* Create new segment or open an existing one for attach or resize. */
885  flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
886  if ((fd = OpenTransientFile(name, flags)) == -1)
887  {
888  if (errno != EEXIST)
889  ereport(elevel,
891  errmsg("could not open shared memory segment \"%s\": %m",
892  name)));
893  return false;
894  }
895 
896  /*
897  * If we're attaching the segment, determine the current size; if we are
898  * creating or resizing the segment, set the size to the requested value.
899  */
900  if (op == DSM_OP_ATTACH)
901  {
902  struct stat st;
903 
904  if (fstat(fd, &st) != 0)
905  {
906  int save_errno;
907 
908  /* Back out what's already been done. */
909  save_errno = errno;
910  CloseTransientFile(fd);
911  errno = save_errno;
912 
913  ereport(elevel,
915  errmsg("could not stat shared memory segment \"%s\": %m",
916  name)));
917  return false;
918  }
919  request_size = st.st_size;
920  }
921  else if (*mapped_size > request_size && ftruncate(fd, request_size))
922  {
923  int save_errno;
924 
925  /* Back out what's already been done. */
926  save_errno = errno;
927  CloseTransientFile(fd);
928  if (op == DSM_OP_CREATE)
929  unlink(name);
930  errno = save_errno;
931 
932  ereport(elevel,
934  errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
935  name, request_size)));
936  return false;
937  }
938  else if (*mapped_size < request_size)
939  {
940  /*
941  * Allocate a buffer full of zeros.
942  *
943  * Note: palloc zbuffer, instead of just using a local char array, to
944  * ensure it is reasonably well-aligned; this may save a few cycles
945  * transferring data to the kernel.
946  */
947  char *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
948  uint32 remaining = request_size;
949  bool success = true;
950 
951  /*
952  * Zero-fill the file. We have to do this the hard way to ensure that
953  * all the file space has really been allocated, so that we don't
954  * later seg fault when accessing the memory mapping. This is pretty
955  * pessimal.
956  */
957  while (success && remaining > 0)
958  {
959  Size goal = remaining;
960 
961  if (goal > ZBUFFER_SIZE)
962  goal = ZBUFFER_SIZE;
964  if (write(fd, zbuffer, goal) == goal)
965  remaining -= goal;
966  else
967  success = false;
969  }
970 
971  if (!success)
972  {
973  int save_errno;
974 
975  /* Back out what's already been done. */
976  save_errno = errno;
977  CloseTransientFile(fd);
978  if (op == DSM_OP_CREATE)
979  unlink(name);
980  errno = save_errno ? save_errno : ENOSPC;
981 
982  ereport(elevel,
984  errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
985  name, request_size)));
986  return false;
987  }
988  }
989 
990  /*
991  * If we're reattaching or resizing, we must remove any existing mapping,
992  * unless we've already got the right thing mapped.
993  */
994  if (*mapped_address != NULL)
995  {
996  if (*mapped_size == request_size)
997  return true;
998  if (munmap(*mapped_address, *mapped_size) != 0)
999  {
1000  int save_errno;
1001 
1002  /* Back out what's already been done. */
1003  save_errno = errno;
1004  CloseTransientFile(fd);
1005  if (op == DSM_OP_CREATE)
1006  unlink(name);
1007  errno = save_errno;
1008 
1009  ereport(elevel,
1011  errmsg("could not unmap shared memory segment \"%s\": %m",
1012  name)));
1013  return false;
1014  }
1015  *mapped_address = NULL;
1016  *mapped_size = 0;
1017  }
1018 
1019  /* Map it. */
1020  address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
1021  MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
1022  if (address == MAP_FAILED)
1023  {
1024  int save_errno;
1025 
1026  /* Back out what's already been done. */
1027  save_errno = errno;
1028  CloseTransientFile(fd);
1029  if (op == DSM_OP_CREATE)
1030  unlink(name);
1031  errno = save_errno;
1032 
1033  ereport(elevel,
1035  errmsg("could not map shared memory segment \"%s\": %m",
1036  name)));
1037  return false;
1038  }
1039  *mapped_address = address;
1040  *mapped_size = request_size;
1041  CloseTransientFile(fd);
1042 
1043  return true;
1044 }
1045 #endif
1046 
1047 /*
1048  * Implementation-specific actions that must be performed when a segment is to
1049  * be preserved even when no backend has it attached.
1050  *
1051  * Except on Windows, we don't need to do anything at all. But since Windows
1052  * cleans up segments automatically when no references remain, we duplicate
1053  * the segment handle into the postmaster process. The postmaster needn't
1054  * do anything to receive the handle; Windows transfers it automatically.
1055  */
1056 void
1057 dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
1058  void **impl_private_pm_handle)
1059 {
1061  {
1062 #ifdef USE_DSM_WINDOWS
1063  case DSM_IMPL_WINDOWS:
1064  {
1065  HANDLE hmap;
1066 
1067  if (!DuplicateHandle(GetCurrentProcess(), impl_private,
1068  PostmasterHandle, &hmap, 0, FALSE,
1069  DUPLICATE_SAME_ACCESS))
1070  {
1071  char name[64];
1072 
1073  snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
1074  _dosmaperr(GetLastError());
1075  ereport(ERROR,
1077  errmsg("could not duplicate handle for \"%s\": %m",
1078  name)));
1079  }
1080 
1081  /*
1082  * Here, we remember the handle that we created in the
1083  * postmaster process. This handle isn't actually usable in
1084  * any process other than the postmaster, but that doesn't
1085  * matter. We're just holding onto it so that, if the segment
1086  * is unpinned, dsm_impl_unpin_segment can close it.
1087  */
1088  *impl_private_pm_handle = hmap;
1089  break;
1090  }
1091 #endif
1092  default:
1093  break;
1094  }
1095 }
1096 
1097 /*
1098  * Implementation-specific actions that must be performed when a segment is no
1099  * longer to be preserved, so that it will be cleaned up when all backends
1100  * have detached from it.
1101  *
1102  * Except on Windows, we don't need to do anything at all. For Windows, we
1103  * close the extra handle that dsm_impl_pin_segment created in the
1104  * postmaster's process space.
1105  */
1106 void
1107 dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
1108 {
1110  {
1111 #ifdef USE_DSM_WINDOWS
1112  case DSM_IMPL_WINDOWS:
1113  {
1114  if (*impl_private &&
1115  !DuplicateHandle(PostmasterHandle, *impl_private,
1116  NULL, NULL, 0, FALSE,
1117  DUPLICATE_CLOSE_SOURCE))
1118  {
1119  char name[64];
1120 
1121  snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
1122  _dosmaperr(GetLastError());
1123  ereport(ERROR,
1125  errmsg("could not duplicate handle for \"%s\": %m",
1126  name)));
1127  }
1128 
1129  *impl_private = NULL;
1130  break;
1131  }
1132 #endif
1133  default:
1134  break;
1135  }
1136 }
1137 
1138 static int
1140 {
1141  if (errno == EFBIG || errno == ENOMEM)
1142  return errcode(ERRCODE_OUT_OF_MEMORY);
1143  else
1144  return errcode_for_file_access();
1145 }
int remaining
Definition: informix.c:692
#define DSM_IMPL_MMAP
Definition: dsm_impl.h:21
#define DSM_IMPL_SYSV
Definition: dsm_impl.h:19
#define MAP_HASSEMAPHORE
Definition: mem.h:30
#define MAP_FAILED
Definition: mem.h:45
void dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
Definition: dsm_impl.c:1107
uint32 dsm_handle
Definition: dsm_impl.h:55
#define PG_DYNSHMEM_DIR
Definition: dsm_impl.h:51
#define IPC_CREAT
Definition: win32_port.h:82
Definition: guc.h:164
#define write(a, b, c)
Definition: win32.h:14
#define IPCProtection
Definition: posix_sema.c:52
#define PG_SHMAT_FLAGS
Definition: mem.h:20
#define FALSE
Definition: ecpglib.h:39
#define MAP_NOSYNC
Definition: mem.h:38
void _dosmaperr(unsigned long)
Definition: win32error.c:171
int errcode(int sqlerrcode)
Definition: elog.c:575
int snprintf(char *str, size_t count, const char *fmt,...) pg_attribute_printf(3
#define DSM_IMPL_WINDOWS
Definition: dsm_impl.h:20
static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
Definition: dsm_impl.c:847
#define DEBUG4
Definition: elog.h:22
static int fd(const char *x, int i)
Definition: preproc-init.c:105
static int errcode_for_dynamic_shared_memory(void)
Definition: dsm_impl.c:1139
void pfree(void *pointer)
Definition: mcxt.c:1031
const struct config_enum_entry dynamic_shared_memory_options[]
Definition: dsm_impl.c:96
#define ERROR
Definition: elog.h:43
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2396
#define DSM_IMPL_POSIX
Definition: dsm_impl.h:18
static bool success
#define SEGMENT_NAME_PREFIX
Definition: dsm_impl.c:119
int errcode_for_file_access(void)
Definition: elog.c:598
int dynamic_shared_memory_type
Definition: dsm_impl.c:114
unsigned int uint32
Definition: c.h:325
static void pgstat_report_wait_end(void)
Definition: pgstat.h:1260
#define ereport(elevel, rest)
Definition: elog.h:122
#define IPC_PRIVATE
Definition: win32_port.h:84
MemoryContext TopMemoryContext
Definition: mcxt.c:44
int CloseTransientFile(int fd)
Definition: fd.c:2566
#define stat(a, b)
Definition: win32_port.h:266
static int elevel
Definition: vacuumlazy.c:144
void * palloc0(Size size)
Definition: mcxt.c:955
#define IPC_RMID
Definition: win32_port.h:81
static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
Definition: dsm_impl.c:456
#define PG_FILE_MODE_OWNER
Definition: file_perm.h:36
#define Assert(condition)
Definition: c.h:699
long key_t
Definition: win32_port.h:244
#define DSM_IMPL_NONE
Definition: dsm_impl.h:17
size_t Size
Definition: c.h:433
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: pgstat.h:1236
#define IPC_EXCL
Definition: win32_port.h:83
const char * name
Definition: encode.c:521
void dsm_impl_pin_segment(dsm_handle handle, void *impl_private, void **impl_private_pm_handle)
Definition: dsm_impl.c:1057
int errmsg(const char *fmt,...)
Definition: elog.c:797
dsm_op
Definition: dsm_impl.h:58
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:771
#define PG_DYNSHMEM_MMAP_FILE_PREFIX
Definition: dsm_impl.h:52
#define ZBUFFER_SIZE
Definition: dsm_impl.c:117
bool dsm_impl_can_resize(void)
Definition: dsm_impl.c:209
#define elog
Definition: elog.h:219
#define close(a)
Definition: win32.h:12
#define EINTR
Definition: win32_port.h:334
bool dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
Definition: dsm_impl.c:164
#define IPC_STAT
Definition: win32_port.h:86
#define ftruncate(a, b)
Definition: win32_port.h:60