/*
 * PostgreSQL Source Code (git master) — Doxygen listing header for dsm_impl.c.
 */
1 /*-------------------------------------------------------------------------
2  *
3  * dsm_impl.c
4  * manage dynamic shared memory segments
5  *
6  * This file provides low-level APIs for creating and destroying shared
7  * memory segments using several different possible techniques. We refer
8  * to these segments as dynamic because they can be created, altered, and
9  * destroyed at any point during the server life cycle. This is unlike
10  * the main shared memory segment, of which there is always exactly one
11  * and which is always mapped at a fixed address in every PostgreSQL
12  * background process.
13  *
14  * Because not all systems provide the same primitives in this area, nor
15  * do all primitives behave the same way on all systems, we provide
16  * several implementations of this facility. Many systems implement
17  * POSIX shared memory (shm_open etc.), which is well-suited to our needs
18  * in this area, with the exception that shared memory identifiers live
19  * in a flat system-wide namespace, raising the uncomfortable prospect of
20  * name collisions with other processes (including other copies of
21  * PostgreSQL) running on the same system. Some systems only support
22  * the older System V shared memory interface (shmget etc.) which is
23  * also usable; however, the default allocation limits are often quite
24  * small, and the namespace is even more restricted.
25  *
26  * We also provide an mmap-based shared memory implementation. This may
27  * be useful on systems that provide shared memory via a special-purpose
28  * filesystem; by opting for this implementation, the user can even
29  * control precisely where their shared memory segments are placed. It
30  * can also be used as a fallback for systems where shm_open and shmget
31  * are not available or can't be used for some reason. Of course,
32  * mapping a file residing on an actual spinning disk is a fairly poor
33  * approximation for shared memory because writeback may hurt performance
34  * substantially, but there should be few systems where we must make do
35  * with such poor tools.
36  *
37  * As ever, Windows requires its own implementation.
38  *
39  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
40  * Portions Copyright (c) 1994, Regents of the University of California
41  *
42  *
43  * IDENTIFICATION
44  * src/backend/storage/ipc/dsm_impl.c
45  *
46  *-------------------------------------------------------------------------
47  */
48 
49 #include "postgres.h"
50 
51 #include <fcntl.h>
52 #include <unistd.h>
53 #ifndef WIN32
54 #include <sys/mman.h>
55 #endif
56 #include <sys/stat.h>
57 #ifdef HAVE_SYS_IPC_H
58 #include <sys/ipc.h>
59 #endif
60 #ifdef HAVE_SYS_SHM_H
61 #include <sys/shm.h>
62 #endif
63 #include "pgstat.h"
64 
65 #include "portability/mem.h"
66 #include "storage/dsm_impl.h"
67 #include "storage/fd.h"
68 #include "utils/guc.h"
69 #include "utils/memutils.h"
70 #include "postmaster/postmaster.h"
71 
72 #ifdef USE_DSM_POSIX
73 static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
74  void **impl_private, void **mapped_address,
75  Size *mapped_size, int elevel);
76 static int dsm_impl_posix_resize(int fd, off_t size);
77 #endif
78 #ifdef USE_DSM_SYSV
79 static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
80  void **impl_private, void **mapped_address,
81  Size *mapped_size, int elevel);
82 #endif
83 #ifdef USE_DSM_WINDOWS
84 static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
85  void **impl_private, void **mapped_address,
86  Size *mapped_size, int elevel);
87 #endif
88 #ifdef USE_DSM_MMAP
89 static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
90  void **impl_private, void **mapped_address,
91  Size *mapped_size, int elevel);
92 #endif
93 static int errcode_for_dynamic_shared_memory(void);
94 
96 #ifdef USE_DSM_POSIX
97  {"posix", DSM_IMPL_POSIX, false},
98 #endif
99 #ifdef USE_DSM_SYSV
100  {"sysv", DSM_IMPL_SYSV, false},
101 #endif
102 #ifdef USE_DSM_WINDOWS
103  {"windows", DSM_IMPL_WINDOWS, false},
104 #endif
105 #ifdef USE_DSM_MMAP
106  {"mmap", DSM_IMPL_MMAP, false},
107 #endif
108  {"none", DSM_IMPL_NONE, false},
109  {NULL, 0, false}
110 };
111 
/* Implementation selector (set from the dynamic_shared_memory_type GUC). */
int			dynamic_shared_memory_type;
115 /* Size of buffer to be used for zero-filling. */
116 #define ZBUFFER_SIZE 8192
117 
118 #define SEGMENT_NAME_PREFIX "Global/PostgreSQL"
119 
120 /*------
121  * Perform a low-level shared memory operation in a platform-specific way,
122  * as dictated by the selected implementation. Each implementation is
123  * required to implement the following primitives.
124  *
125  * DSM_OP_CREATE. Create a segment whose size is the request_size and
126  * map it.
127  *
128  * DSM_OP_ATTACH. Map the segment, whose size must be the request_size.
129  * The segment may already be mapped; any existing mapping should be removed
130  * before creating a new one.
131  *
132  * DSM_OP_DETACH. Unmap the segment.
133  *
134  * DSM_OP_RESIZE. Resize the segment to the given request_size and
135  * remap the segment at that new size.
136  *
137  * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the
138  * segment.
139  *
140  * Arguments:
141  * op: The operation to be performed.
142  * handle: The handle of an existing object, or for DSM_OP_CREATE, the
143  * a new handle the caller wants created.
144  * request_size: For DSM_OP_CREATE, the requested size. For DSM_OP_RESIZE,
145  * the new size. Otherwise, 0.
146  * impl_private: Private, implementation-specific data. Will be a pointer
147  * to NULL for the first operation on a shared memory segment within this
148  * backend; thereafter, it will point to the value to which it was set
149  * on the previous call.
150  * mapped_address: Pointer to start of current mapping; pointer to NULL
151  * if none. Updated with new mapping address.
152  * mapped_size: Pointer to size of current mapping; pointer to 0 if none.
153  * Updated with new mapped size.
154  * elevel: Level at which to log errors.
155  *
156  * Return value: true on success, false on failure. When false is returned,
157  * a message should first be logged at the specified elevel, except in the
158  * case where DSM_OP_CREATE experiences a name collision, which should
159  * silently return false.
160  *-----
161  */
162 bool
163 dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
164  void **impl_private, void **mapped_address, Size *mapped_size,
165  int elevel)
166 {
167  Assert(op == DSM_OP_CREATE || op == DSM_OP_RESIZE || request_size == 0);
168  Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) ||
169  (*mapped_address == NULL && *mapped_size == 0));
170 
172  {
173 #ifdef USE_DSM_POSIX
174  case DSM_IMPL_POSIX:
175  return dsm_impl_posix(op, handle, request_size, impl_private,
176  mapped_address, mapped_size, elevel);
177 #endif
178 #ifdef USE_DSM_SYSV
179  case DSM_IMPL_SYSV:
180  return dsm_impl_sysv(op, handle, request_size, impl_private,
181  mapped_address, mapped_size, elevel);
182 #endif
183 #ifdef USE_DSM_WINDOWS
184  case DSM_IMPL_WINDOWS:
185  return dsm_impl_windows(op, handle, request_size, impl_private,
186  mapped_address, mapped_size, elevel);
187 #endif
188 #ifdef USE_DSM_MMAP
189  case DSM_IMPL_MMAP:
190  return dsm_impl_mmap(op, handle, request_size, impl_private,
191  mapped_address, mapped_size, elevel);
192 #endif
193  default:
194  elog(ERROR, "unexpected dynamic shared memory type: %d",
196  return false;
197  }
198 }
199 
200 /*
201  * Does the current dynamic shared memory implementation support resizing
202  * segments? (The answer here could be platform-dependent in the future,
203  * since AIX allows shmctl(shmid, SHM_RESIZE, &buffer), though you apparently
204  * can't resize segments to anything larger than 256MB that way. For now,
205  * we keep it simple.)
206  */
207 bool
209 {
211  {
212  case DSM_IMPL_NONE:
213  return false;
214  case DSM_IMPL_POSIX:
215  return true;
216  case DSM_IMPL_SYSV:
217  return false;
218  case DSM_IMPL_WINDOWS:
219  return false;
220  case DSM_IMPL_MMAP:
221  return true;
222  default:
223  return false; /* should not happen */
224  }
225 }
226 
227 #ifdef USE_DSM_POSIX
228 /*
229  * Operating system primitives to support POSIX shared memory.
230  *
231  * POSIX shared memory segments are created and attached using shm_open()
232  * and shm_unlink(); other operations, such as sizing or mapping the
233  * segment, are performed as if the shared memory segments were files.
234  *
235  * Indeed, on some platforms, they may be implemented that way. While
236  * POSIX shared memory segments seem intended to exist in a flat namespace,
237  * some operating systems may implement them as files, even going so far
238  * to treat a request for /xyz as a request to create a file by that name
239  * in the root directory. Users of such broken platforms should select
240  * a different shared memory implementation.
241  */
242 static bool
243 dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
244  void **impl_private, void **mapped_address, Size *mapped_size,
245  int elevel)
246 {
247  char name[64];
248  int flags;
249  int fd;
250  char *address;
251 
252  snprintf(name, 64, "/PostgreSQL.%u", handle);
253 
254  /* Handle teardown cases. */
255  if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
256  {
257  if (*mapped_address != NULL
258  && munmap(*mapped_address, *mapped_size) != 0)
259  {
260  ereport(elevel,
262  errmsg("could not unmap shared memory segment \"%s\": %m",
263  name)));
264  return false;
265  }
266  *mapped_address = NULL;
267  *mapped_size = 0;
268  if (op == DSM_OP_DESTROY && shm_unlink(name) != 0)
269  {
270  ereport(elevel,
272  errmsg("could not remove shared memory segment \"%s\": %m",
273  name)));
274  return false;
275  }
276  return true;
277  }
278 
279  /*
280  * Create new segment or open an existing one for attach or resize.
281  *
282  * Even though we're not going through fd.c, we should be safe against
283  * running out of file descriptors, because of NUM_RESERVED_FDS. We're
284  * only opening one extra descriptor here, and we'll close it before
285  * returning.
286  */
287  flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
288  if ((fd = shm_open(name, flags, 0600)) == -1)
289  {
290  if (errno != EEXIST)
291  ereport(elevel,
293  errmsg("could not open shared memory segment \"%s\": %m",
294  name)));
295  return false;
296  }
297 
298  /*
299  * If we're attaching the segment, determine the current size; if we are
300  * creating or resizing the segment, set the size to the requested value.
301  */
302  if (op == DSM_OP_ATTACH)
303  {
304  struct stat st;
305 
306  if (fstat(fd, &st) != 0)
307  {
308  int save_errno;
309 
310  /* Back out what's already been done. */
311  save_errno = errno;
312  close(fd);
313  errno = save_errno;
314 
315  ereport(elevel,
317  errmsg("could not stat shared memory segment \"%s\": %m",
318  name)));
319  return false;
320  }
321  request_size = st.st_size;
322  }
323  else if (*mapped_size != request_size &&
324  dsm_impl_posix_resize(fd, request_size) != 0)
325  {
326  int save_errno;
327 
328  /* Back out what's already been done. */
329  save_errno = errno;
330  close(fd);
331  if (op == DSM_OP_CREATE)
332  shm_unlink(name);
333  errno = save_errno;
334 
335  ereport(elevel,
337  errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
338  name, request_size)));
339  return false;
340  }
341 
342  /*
343  * If we're reattaching or resizing, we must remove any existing mapping,
344  * unless we've already got the right thing mapped.
345  */
346  if (*mapped_address != NULL)
347  {
348  if (*mapped_size == request_size)
349  return true;
350  if (munmap(*mapped_address, *mapped_size) != 0)
351  {
352  int save_errno;
353 
354  /* Back out what's already been done. */
355  save_errno = errno;
356  close(fd);
357  if (op == DSM_OP_CREATE)
358  shm_unlink(name);
359  errno = save_errno;
360 
361  ereport(elevel,
363  errmsg("could not unmap shared memory segment \"%s\": %m",
364  name)));
365  return false;
366  }
367  *mapped_address = NULL;
368  *mapped_size = 0;
369  }
370 
371  /* Map it. */
372  address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
373  MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
374  if (address == MAP_FAILED)
375  {
376  int save_errno;
377 
378  /* Back out what's already been done. */
379  save_errno = errno;
380  close(fd);
381  if (op == DSM_OP_CREATE)
382  shm_unlink(name);
383  errno = save_errno;
384 
385  ereport(elevel,
387  errmsg("could not map shared memory segment \"%s\": %m",
388  name)));
389  return false;
390  }
391  *mapped_address = address;
392  *mapped_size = request_size;
393  close(fd);
394 
395  return true;
396 }
397 
398 /*
399  * Set the size of a virtual memory region associated with a file descriptor.
400  * If necessary, also ensure that virtual memory is actually allocated by the
401  * operating system, to avoid nasty surprises later.
402  *
403  * Returns non-zero if either truncation or allocation fails, and sets errno.
404  */
/*
 * Set the size of a virtual memory region associated with a file descriptor.
 * If necessary, also ensure that virtual memory is actually allocated by the
 * operating system, to avoid nasty surprises later.
 *
 * Returns non-zero if either truncation or allocation fails, and sets errno.
 */
static int
dsm_impl_posix_resize(int fd, off_t size)
{
	int			rc;

	/* First, adjust the file's length to the requested size. */
	rc = ftruncate(fd, size);
	if (rc != 0)
		return rc;

	/*
	 * On Linux, a shm_open fd is backed by a tmpfs file.  After resizing
	 * with ftruncate, the file may contain a hole.  Accessing memory backed
	 * by a hole causes tmpfs to allocate pages, which fails with SIGBUS if
	 * there is no more tmpfs space available.  So we ask tmpfs to allocate
	 * pages here, so we can fail gracefully with ENOSPC now rather than
	 * risking SIGBUS later.
	 */
#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)
	/* Retry as long as the call is interrupted by a signal. */
	do
	{
		rc = posix_fallocate(fd, 0, size);
	} while (rc == EINTR);

	/*
	 * The caller expects errno to be set, but posix_fallocate() doesn't set
	 * it.  Instead it returns error numbers directly.  So set errno, even
	 * though we'll also return rc to indicate success or failure.
	 */
	errno = rc;
#endif							/* HAVE_POSIX_FALLOCATE && __linux__ */

	return rc;
}
441 
442 #endif /* USE_DSM_POSIX */
443 
444 #ifdef USE_DSM_SYSV
445 /*
446  * Operating system primitives to support System V shared memory.
447  *
448  * System V shared memory segments are manipulated using shmget(), shmat(),
449  * shmdt(), and shmctl(). There's no portable way to resize such
450  * segments. As the default allocation limits for System V shared memory
451  * are usually quite low, the POSIX facilities may be preferable; but
452  * those are not supported everywhere.
453  */
454 static bool
455 dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
456  void **impl_private, void **mapped_address, Size *mapped_size,
457  int elevel)
458 {
459  key_t key;
460  int ident;
461  char *address;
462  char name[64];
463  int *ident_cache;
464 
465  /* Resize is not supported for System V shared memory. */
466  if (op == DSM_OP_RESIZE)
467  {
468  elog(elevel, "System V shared memory segments cannot be resized");
469  return false;
470  }
471 
472  /* Since resize isn't supported, reattach is a no-op. */
473  if (op == DSM_OP_ATTACH && *mapped_address != NULL)
474  return true;
475 
476  /*
477  * POSIX shared memory and mmap-based shared memory identify segments with
478  * names. To avoid needless error message variation, we use the handle as
479  * the name.
480  */
481  snprintf(name, 64, "%u", handle);
482 
483  /*
484  * The System V shared memory namespace is very restricted; names are of
485  * type key_t, which is expected to be some sort of integer data type, but
486  * not necessarily the same one as dsm_handle. Since we use dsm_handle to
487  * identify shared memory segments across processes, this might seem like
488  * a problem, but it's really not. If dsm_handle is bigger than key_t,
489  * the cast below might truncate away some bits from the handle the
490  * user-provided, but it'll truncate exactly the same bits away in exactly
491  * the same fashion every time we use that handle, which is all that
492  * really matters. Conversely, if dsm_handle is smaller than key_t, we
493  * won't use the full range of available key space, but that's no big deal
494  * either.
495  *
496  * We do make sure that the key isn't negative, because that might not be
497  * portable.
498  */
499  key = (key_t) handle;
500  if (key < 1) /* avoid compiler warning if type is unsigned */
501  key = -key;
502 
503  /*
504  * There's one special key, IPC_PRIVATE, which can't be used. If we end
505  * up with that value by chance during a create operation, just pretend it
506  * already exists, so that caller will retry. If we run into it anywhere
507  * else, the caller has passed a handle that doesn't correspond to
508  * anything we ever created, which should not happen.
509  */
510  if (key == IPC_PRIVATE)
511  {
512  if (op != DSM_OP_CREATE)
513  elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
514  errno = EEXIST;
515  return false;
516  }
517 
518  /*
519  * Before we can do anything with a shared memory segment, we have to map
520  * the shared memory key to a shared memory identifier using shmget(). To
521  * avoid repeated lookups, we store the key using impl_private.
522  */
523  if (*impl_private != NULL)
524  {
525  ident_cache = *impl_private;
526  ident = *ident_cache;
527  }
528  else
529  {
530  int flags = IPCProtection;
531  size_t segsize;
532 
533  /*
534  * Allocate the memory BEFORE acquiring the resource, so that we don't
535  * leak the resource if memory allocation fails.
536  */
537  ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));
538 
539  /*
540  * When using shmget to find an existing segment, we must pass the
541  * size as 0. Passing a non-zero size which is greater than the
542  * actual size will result in EINVAL.
543  */
544  segsize = 0;
545 
546  if (op == DSM_OP_CREATE)
547  {
548  flags |= IPC_CREAT | IPC_EXCL;
549  segsize = request_size;
550  }
551 
552  if ((ident = shmget(key, segsize, flags)) == -1)
553  {
554  if (errno != EEXIST)
555  {
556  int save_errno = errno;
557 
558  pfree(ident_cache);
559  errno = save_errno;
560  ereport(elevel,
562  errmsg("could not get shared memory segment: %m")));
563  }
564  return false;
565  }
566 
567  *ident_cache = ident;
568  *impl_private = ident_cache;
569  }
570 
571  /* Handle teardown cases. */
572  if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
573  {
574  pfree(ident_cache);
575  *impl_private = NULL;
576  if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
577  {
578  ereport(elevel,
580  errmsg("could not unmap shared memory segment \"%s\": %m",
581  name)));
582  return false;
583  }
584  *mapped_address = NULL;
585  *mapped_size = 0;
586  if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
587  {
588  ereport(elevel,
590  errmsg("could not remove shared memory segment \"%s\": %m",
591  name)));
592  return false;
593  }
594  return true;
595  }
596 
597  /* If we're attaching it, we must use IPC_STAT to determine the size. */
598  if (op == DSM_OP_ATTACH)
599  {
600  struct shmid_ds shm;
601 
602  if (shmctl(ident, IPC_STAT, &shm) != 0)
603  {
604  ereport(elevel,
606  errmsg("could not stat shared memory segment \"%s\": %m",
607  name)));
608  return false;
609  }
610  request_size = shm.shm_segsz;
611  }
612 
613  /* Map it. */
614  address = shmat(ident, NULL, PG_SHMAT_FLAGS);
615  if (address == (void *) -1)
616  {
617  int save_errno;
618 
619  /* Back out what's already been done. */
620  save_errno = errno;
621  if (op == DSM_OP_CREATE)
622  shmctl(ident, IPC_RMID, NULL);
623  errno = save_errno;
624 
625  ereport(elevel,
627  errmsg("could not map shared memory segment \"%s\": %m",
628  name)));
629  return false;
630  }
631  *mapped_address = address;
632  *mapped_size = request_size;
633 
634  return true;
635 }
636 #endif
637 
638 #ifdef USE_DSM_WINDOWS
639 /*
640  * Operating system primitives to support Windows shared memory.
641  *
642  * Windows shared memory implementation is done using file mapping
643  * which can be backed by either physical file or system paging file.
644  * Current implementation uses system paging file as other effects
645  * like performance are not clear for physical file and it is used in similar
646  * way for main shared memory in windows.
647  *
648  * A memory mapping object is a kernel object - they always get deleted when
649  * the last reference to them goes away, either explicitly via a CloseHandle or
650  * when the process containing the reference exits.
651  */
652 static bool
653 dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
654  void **impl_private, void **mapped_address,
655  Size *mapped_size, int elevel)
656 {
657  char *address;
658  HANDLE hmap;
659  char name[64];
660  MEMORY_BASIC_INFORMATION info;
661 
662  /* Resize is not supported for Windows shared memory. */
663  if (op == DSM_OP_RESIZE)
664  {
665  elog(elevel, "Windows shared memory segments cannot be resized");
666  return false;
667  }
668 
669  /* Since resize isn't supported, reattach is a no-op. */
670  if (op == DSM_OP_ATTACH && *mapped_address != NULL)
671  return true;
672 
673  /*
674  * Storing the shared memory segment in the Global\ namespace, can allow
675  * any process running in any session to access that file mapping object
676  * provided that the caller has the required access rights. But to avoid
677  * issues faced in main shared memory, we are using the naming convention
678  * similar to main shared memory. We can change here once issue mentioned
679  * in GetSharedMemName is resolved.
680  */
681  snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
682 
683  /*
684  * Handle teardown cases. Since Windows automatically destroys the object
685  * when no references remain, we can treat it the same as detach.
686  */
687  if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
688  {
689  if (*mapped_address != NULL
690  && UnmapViewOfFile(*mapped_address) == 0)
691  {
692  _dosmaperr(GetLastError());
693  ereport(elevel,
695  errmsg("could not unmap shared memory segment \"%s\": %m",
696  name)));
697  return false;
698  }
699  if (*impl_private != NULL
700  && CloseHandle(*impl_private) == 0)
701  {
702  _dosmaperr(GetLastError());
703  ereport(elevel,
705  errmsg("could not remove shared memory segment \"%s\": %m",
706  name)));
707  return false;
708  }
709 
710  *impl_private = NULL;
711  *mapped_address = NULL;
712  *mapped_size = 0;
713  return true;
714  }
715 
716  /* Create new segment or open an existing one for attach. */
717  if (op == DSM_OP_CREATE)
718  {
719  DWORD size_high;
720  DWORD size_low;
721  DWORD errcode;
722 
723  /* Shifts >= the width of the type are undefined. */
724 #ifdef _WIN64
725  size_high = request_size >> 32;
726 #else
727  size_high = 0;
728 #endif
729  size_low = (DWORD) request_size;
730 
731  /* CreateFileMapping might not clear the error code on success */
732  SetLastError(0);
733 
734  hmap = CreateFileMapping(INVALID_HANDLE_VALUE, /* Use the pagefile */
735  NULL, /* Default security attrs */
736  PAGE_READWRITE, /* Memory is read/write */
737  size_high, /* Upper 32 bits of size */
738  size_low, /* Lower 32 bits of size */
739  name);
740 
741  errcode = GetLastError();
742  if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
743  {
744  /*
745  * On Windows, when the segment already exists, a handle for the
746  * existing segment is returned. We must close it before
747  * returning. However, if the existing segment is created by a
748  * service, then it returns ERROR_ACCESS_DENIED. We don't do
749  * _dosmaperr here, so errno won't be modified.
750  */
751  if (hmap)
752  CloseHandle(hmap);
753  return false;
754  }
755 
756  if (!hmap)
757  {
758  _dosmaperr(errcode);
759  ereport(elevel,
761  errmsg("could not create shared memory segment \"%s\": %m",
762  name)));
763  return false;
764  }
765  }
766  else
767  {
768  hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
769  FALSE, /* do not inherit the name */
770  name); /* name of mapping object */
771  if (!hmap)
772  {
773  _dosmaperr(GetLastError());
774  ereport(elevel,
776  errmsg("could not open shared memory segment \"%s\": %m",
777  name)));
778  return false;
779  }
780  }
781 
782  /* Map it. */
783  address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
784  0, 0, 0);
785  if (!address)
786  {
787  int save_errno;
788 
789  _dosmaperr(GetLastError());
790  /* Back out what's already been done. */
791  save_errno = errno;
792  CloseHandle(hmap);
793  errno = save_errno;
794 
795  ereport(elevel,
797  errmsg("could not map shared memory segment \"%s\": %m",
798  name)));
799  return false;
800  }
801 
802  /*
803  * VirtualQuery gives size in page_size units, which is 4K for Windows. We
804  * need size only when we are attaching, but it's better to get the size
805  * when creating new segment to keep size consistent both for
806  * DSM_OP_CREATE and DSM_OP_ATTACH.
807  */
808  if (VirtualQuery(address, &info, sizeof(info)) == 0)
809  {
810  int save_errno;
811 
812  _dosmaperr(GetLastError());
813  /* Back out what's already been done. */
814  save_errno = errno;
815  UnmapViewOfFile(address);
816  CloseHandle(hmap);
817  errno = save_errno;
818 
819  ereport(elevel,
821  errmsg("could not stat shared memory segment \"%s\": %m",
822  name)));
823  return false;
824  }
825 
826  *mapped_address = address;
827  *mapped_size = info.RegionSize;
828  *impl_private = hmap;
829 
830  return true;
831 }
832 #endif
833 
834 #ifdef USE_DSM_MMAP
835 /*
836  * Operating system primitives to support mmap-based shared memory.
837  *
838  * Calling this "shared memory" is somewhat of a misnomer, because what
839  * we're really doing is creating a bunch of files and mapping them into
840  * our address space. The operating system may feel obliged to
841  * synchronize the contents to disk even if nothing is being paged out,
842  * which will not serve us well. The user can relocate the pg_dynshmem
843  * directory to a ramdisk to avoid this problem, if available.
844  */
845 static bool
846 dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
847  void **impl_private, void **mapped_address, Size *mapped_size,
848  int elevel)
849 {
850  char name[64];
851  int flags;
852  int fd;
853  char *address;
854 
856  handle);
857 
858  /* Handle teardown cases. */
859  if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
860  {
861  if (*mapped_address != NULL
862  && munmap(*mapped_address, *mapped_size) != 0)
863  {
864  ereport(elevel,
866  errmsg("could not unmap shared memory segment \"%s\": %m",
867  name)));
868  return false;
869  }
870  *mapped_address = NULL;
871  *mapped_size = 0;
872  if (op == DSM_OP_DESTROY && unlink(name) != 0)
873  {
874  ereport(elevel,
876  errmsg("could not remove shared memory segment \"%s\": %m",
877  name)));
878  return false;
879  }
880  return true;
881  }
882 
883  /* Create new segment or open an existing one for attach or resize. */
884  flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
885  if ((fd = OpenTransientFile(name, flags)) == -1)
886  {
887  if (errno != EEXIST)
888  ereport(elevel,
890  errmsg("could not open shared memory segment \"%s\": %m",
891  name)));
892  return false;
893  }
894 
895  /*
896  * If we're attaching the segment, determine the current size; if we are
897  * creating or resizing the segment, set the size to the requested value.
898  */
899  if (op == DSM_OP_ATTACH)
900  {
901  struct stat st;
902 
903  if (fstat(fd, &st) != 0)
904  {
905  int save_errno;
906 
907  /* Back out what's already been done. */
908  save_errno = errno;
909  CloseTransientFile(fd);
910  errno = save_errno;
911 
912  ereport(elevel,
914  errmsg("could not stat shared memory segment \"%s\": %m",
915  name)));
916  return false;
917  }
918  request_size = st.st_size;
919  }
920  else if (*mapped_size > request_size && ftruncate(fd, request_size))
921  {
922  int save_errno;
923 
924  /* Back out what's already been done. */
925  save_errno = errno;
926  close(fd);
927  if (op == DSM_OP_CREATE)
928  unlink(name);
929  errno = save_errno;
930 
931  ereport(elevel,
933  errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
934  name, request_size)));
935  return false;
936  }
937  else if (*mapped_size < request_size)
938  {
939  /*
940  * Allocate a buffer full of zeros.
941  *
942  * Note: palloc zbuffer, instead of just using a local char array, to
943  * ensure it is reasonably well-aligned; this may save a few cycles
944  * transferring data to the kernel.
945  */
946  char *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
947  uint32 remaining = request_size;
948  bool success = true;
949 
950  /*
951  * Zero-fill the file. We have to do this the hard way to ensure that
952  * all the file space has really been allocated, so that we don't
953  * later seg fault when accessing the memory mapping. This is pretty
954  * pessimal.
955  */
956  while (success && remaining > 0)
957  {
958  Size goal = remaining;
959 
960  if (goal > ZBUFFER_SIZE)
961  goal = ZBUFFER_SIZE;
963  if (write(fd, zbuffer, goal) == goal)
964  remaining -= goal;
965  else
966  success = false;
968  }
969 
970  if (!success)
971  {
972  int save_errno;
973 
974  /* Back out what's already been done. */
975  save_errno = errno;
976  CloseTransientFile(fd);
977  if (op == DSM_OP_CREATE)
978  unlink(name);
979  errno = save_errno ? save_errno : ENOSPC;
980 
981  ereport(elevel,
983  errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
984  name, request_size)));
985  return false;
986  }
987  }
988 
989  /*
990  * If we're reattaching or resizing, we must remove any existing mapping,
991  * unless we've already got the right thing mapped.
992  */
993  if (*mapped_address != NULL)
994  {
995  if (*mapped_size == request_size)
996  return true;
997  if (munmap(*mapped_address, *mapped_size) != 0)
998  {
999  int save_errno;
1000 
1001  /* Back out what's already been done. */
1002  save_errno = errno;
1003  CloseTransientFile(fd);
1004  if (op == DSM_OP_CREATE)
1005  unlink(name);
1006  errno = save_errno;
1007 
1008  ereport(elevel,
1010  errmsg("could not unmap shared memory segment \"%s\": %m",
1011  name)));
1012  return false;
1013  }
1014  *mapped_address = NULL;
1015  *mapped_size = 0;
1016  }
1017 
1018  /* Map it. */
1019  address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
1020  MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
1021  if (address == MAP_FAILED)
1022  {
1023  int save_errno;
1024 
1025  /* Back out what's already been done. */
1026  save_errno = errno;
1027  CloseTransientFile(fd);
1028  if (op == DSM_OP_CREATE)
1029  unlink(name);
1030  errno = save_errno;
1031 
1032  ereport(elevel,
1034  errmsg("could not map shared memory segment \"%s\": %m",
1035  name)));
1036  return false;
1037  }
1038  *mapped_address = address;
1039  *mapped_size = request_size;
1040  CloseTransientFile(fd);
1041 
1042  return true;
1043 }
1044 #endif
1045 
1046 /*
1047  * Implementation-specific actions that must be performed when a segment is to
1048  * be preserved even when no backend has it attached.
1049  *
1050  * Except on Windows, we don't need to do anything at all. But since Windows
1051  * cleans up segments automatically when no references remain, we duplicate
1052  * the segment handle into the postmaster process. The postmaster needn't
1053  * do anything to receive the handle; Windows transfers it automatically.
1054  */
1055 void
1056 dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
1057  void **impl_private_pm_handle)
1058 {
1060  {
1061 #ifdef USE_DSM_WINDOWS
1062  case DSM_IMPL_WINDOWS:
1063  {
1064  HANDLE hmap;
1065 
1066  if (!DuplicateHandle(GetCurrentProcess(), impl_private,
1067  PostmasterHandle, &hmap, 0, FALSE,
1068  DUPLICATE_SAME_ACCESS))
1069  {
1070  char name[64];
1071 
1072  snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
1073  _dosmaperr(GetLastError());
1074  ereport(ERROR,
1076  errmsg("could not duplicate handle for \"%s\": %m",
1077  name)));
1078  }
1079 
1080  /*
1081  * Here, we remember the handle that we created in the
1082  * postmaster process. This handle isn't actually usable in
1083  * any process other than the postmaster, but that doesn't
1084  * matter. We're just holding onto it so that, if the segment
1085  * is unpinned, dsm_impl_unpin_segment can close it.
1086  */
1087  *impl_private_pm_handle = hmap;
1088  break;
1089  }
1090 #endif
1091  default:
1092  break;
1093  }
1094 }
1095 
1096 /*
1097  * Implementation-specific actions that must be performed when a segment is no
1098  * longer to be preserved, so that it will be cleaned up when all backends
1099  * have detached from it.
1100  *
1101  * Except on Windows, we don't need to do anything at all. For Windows, we
1102  * close the extra handle that dsm_impl_pin_segment created in the
1103  * postmaster's process space.
1104  */
1105 void
1106 dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
1107 {
1109  {
1110 #ifdef USE_DSM_WINDOWS
1111  case DSM_IMPL_WINDOWS:
1112  {
1113  if (*impl_private &&
1114  !DuplicateHandle(PostmasterHandle, *impl_private,
1115  NULL, NULL, 0, FALSE,
1116  DUPLICATE_CLOSE_SOURCE))
1117  {
1118  char name[64];
1119 
1120  snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
1121  _dosmaperr(GetLastError());
1122  ereport(ERROR,
1124  errmsg("could not duplicate handle for \"%s\": %m",
1125  name)));
1126  }
1127 
1128  *impl_private = NULL;
1129  break;
1130  }
1131 #endif
1132  default:
1133  break;
1134  }
1135 }
1136 
1137 static int
1139 {
1140  if (errno == EFBIG || errno == ENOMEM)
1141  return errcode(ERRCODE_OUT_OF_MEMORY);
1142  else
1143  return errcode_for_file_access();
1144 }
int remaining
Definition: informix.c:692
#define DSM_IMPL_MMAP
Definition: dsm_impl.h:21
#define DSM_IMPL_SYSV
Definition: dsm_impl.h:19
#define MAP_HASSEMAPHORE
Definition: mem.h:30
#define MAP_FAILED
Definition: mem.h:45
void dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
Definition: dsm_impl.c:1106
uint32 dsm_handle
Definition: dsm_impl.h:55
#define PG_DYNSHMEM_DIR
Definition: dsm_impl.h:51
#define IPC_CREAT
Definition: win32_port.h:82
Definition: guc.h:164
#define write(a, b, c)
Definition: win32.h:14
#define IPCProtection
Definition: posix_sema.c:52
#define PG_SHMAT_FLAGS
Definition: mem.h:20
#define FALSE
Definition: ecpglib.h:39
#define MAP_NOSYNC
Definition: mem.h:38
void _dosmaperr(unsigned long)
Definition: win32error.c:171
int errcode(int sqlerrcode)
Definition: elog.c:575
int snprintf(char *str, size_t count, const char *fmt,...) pg_attribute_printf(3
#define DSM_IMPL_WINDOWS
Definition: dsm_impl.h:20
static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
Definition: dsm_impl.c:846
#define DEBUG4
Definition: elog.h:22
static int fd(const char *x, int i)
Definition: preproc-init.c:105
static int errcode_for_dynamic_shared_memory(void)
Definition: dsm_impl.c:1138
void pfree(void *pointer)
Definition: mcxt.c:949
const struct config_enum_entry dynamic_shared_memory_options[]
Definition: dsm_impl.c:95
#define ERROR
Definition: elog.h:43
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2173
#define DSM_IMPL_POSIX
Definition: dsm_impl.h:18
static bool success
Definition: pg_basebackup.c:99
#define SEGMENT_NAME_PREFIX
Definition: dsm_impl.c:118
int errcode_for_file_access(void)
Definition: elog.c:598
int dynamic_shared_memory_type
Definition: dsm_impl.c:113
unsigned int uint32
Definition: c.h:296
static void pgstat_report_wait_end(void)
Definition: pgstat.h:1244
#define ereport(elevel, rest)
Definition: elog.h:122
#define IPC_PRIVATE
Definition: win32_port.h:84
MemoryContext TopMemoryContext
Definition: mcxt.c:43
int CloseTransientFile(int fd)
Definition: fd.c:2343
#define stat(a, b)
Definition: win32_port.h:266
static int elevel
Definition: vacuumlazy.c:136
void * palloc0(Size size)
Definition: mcxt.c:877
#define IPC_RMID
Definition: win32_port.h:81
static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
Definition: dsm_impl.c:455
#define Assert(condition)
Definition: c.h:670
long key_t
Definition: win32_port.h:244
#define DSM_IMPL_NONE
Definition: dsm_impl.h:17
size_t Size
Definition: c.h:404
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: pgstat.h:1220
#define IPC_EXCL
Definition: win32_port.h:83
const char * name
Definition: encode.c:521
void dsm_impl_pin_segment(dsm_handle handle, void *impl_private, void **impl_private_pm_handle)
Definition: dsm_impl.c:1056
int errmsg(const char *fmt,...)
Definition: elog.c:797
dsm_op
Definition: dsm_impl.h:58
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:706
#define PG_DYNSHMEM_MMAP_FILE_PREFIX
Definition: dsm_impl.h:52
#define ZBUFFER_SIZE
Definition: dsm_impl.c:116
bool dsm_impl_can_resize(void)
Definition: dsm_impl.c:208
#define elog
Definition: elog.h:219
#define close(a)
Definition: win32.h:12
#define EINTR
Definition: win32_port.h:334
bool dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
Definition: dsm_impl.c:163
#define IPC_STAT
Definition: win32_port.h:86
#define ftruncate(a, b)
Definition: win32_port.h:60