/*-------------------------------------------------------------------------
 *
 * dsm_impl.c
 *	  manage dynamic shared memory segments
 *
 * This file provides low-level APIs for creating and destroying shared
 * memory segments using several different possible techniques.  We refer
 * to these segments as dynamic because they can be created, altered, and
 * destroyed at any point during the server life cycle.  This is unlike
 * the main shared memory segment, of which there is always exactly one
 * and which is always mapped at a fixed address in every PostgreSQL
 * background process.
 *
 * Because not all systems provide the same primitives in this area, nor
 * do all primitives behave the same way on all systems, we provide
 * several implementations of this facility.  Many systems implement
 * POSIX shared memory (shm_open etc.), which is well-suited to our needs
 * in this area, with the exception that shared memory identifiers live
 * in a flat system-wide namespace, raising the uncomfortable prospect of
 * name collisions with other processes (including other copies of
 * PostgreSQL) running on the same system.  Some systems only support
 * the older System V shared memory interface (shmget etc.) which is
 * also usable; however, the default allocation limits are often quite
 * small, and the namespace is even more restricted.
 *
 * We also provide an mmap-based shared memory implementation.  This may
 * be useful on systems that provide shared memory via a special-purpose
 * filesystem; by opting for this implementation, the user can even
 * control precisely where their shared memory segments are placed.  It
 * can also be used as a fallback for systems where shm_open and shmget
 * are not available or can't be used for some reason.  Of course,
 * mapping a file residing on an actual spinning disk is a fairly poor
 * approximation for shared memory because writeback may hurt performance
 * substantially, but there should be few systems where we must make do
 * with such poor tools.
 *
 * As ever, Windows requires its own implementation.
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/storage/ipc/dsm_impl.c
 *
 *-------------------------------------------------------------------------
 */

49 #include "postgres.h"
50 
51 #include <fcntl.h>
52 #include <unistd.h>
53 #ifndef WIN32
54 #include <sys/mman.h>
55 #endif
56 #include <sys/stat.h>
57 #ifdef HAVE_SYS_IPC_H
58 #include <sys/ipc.h>
59 #endif
60 #ifdef HAVE_SYS_SHM_H
61 #include <sys/shm.h>
62 #endif
63 
64 #include "common/file_perm.h"
65 #include "miscadmin.h"
66 #include "pgstat.h"
67 #include "portability/mem.h"
68 #include "postmaster/postmaster.h"
69 #include "storage/dsm_impl.h"
70 #include "storage/fd.h"
71 #include "utils/guc.h"
72 #include "utils/memutils.h"
73 
74 #ifdef USE_DSM_POSIX
75 static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
76  void **impl_private, void **mapped_address,
77  Size *mapped_size, int elevel);
78 static int dsm_impl_posix_resize(int fd, off_t size);
79 #endif
80 #ifdef USE_DSM_SYSV
81 static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
82  void **impl_private, void **mapped_address,
83  Size *mapped_size, int elevel);
84 #endif
85 #ifdef USE_DSM_WINDOWS
86 static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
87  void **impl_private, void **mapped_address,
88  Size *mapped_size, int elevel);
89 #endif
90 #ifdef USE_DSM_MMAP
91 static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
92  void **impl_private, void **mapped_address,
93  Size *mapped_size, int elevel);
94 #endif
95 static int errcode_for_dynamic_shared_memory(void);
96 
98 #ifdef USE_DSM_POSIX
99  {"posix", DSM_IMPL_POSIX, false},
100 #endif
101 #ifdef USE_DSM_SYSV
102  {"sysv", DSM_IMPL_SYSV, false},
103 #endif
104 #ifdef USE_DSM_WINDOWS
105  {"windows", DSM_IMPL_WINDOWS, false},
106 #endif
107 #ifdef USE_DSM_MMAP
108  {"mmap", DSM_IMPL_MMAP, false},
109 #endif
110  {NULL, 0, false}
111 };
112 
113 /* Implementation selector. */
115 
116 /* Size of buffer to be used for zero-filling. */
117 #define ZBUFFER_SIZE 8192
118 
119 #define SEGMENT_NAME_PREFIX "Global/PostgreSQL"
120 
121 /*------
122  * Perform a low-level shared memory operation in a platform-specific way,
123  * as dictated by the selected implementation. Each implementation is
124  * required to implement the following primitives.
125  *
126  * DSM_OP_CREATE. Create a segment whose size is the request_size and
127  * map it.
128  *
129  * DSM_OP_ATTACH. Map the segment, whose size must be the request_size.
130  *
131  * DSM_OP_DETACH. Unmap the segment.
132  *
133  * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the
134  * segment.
135  *
136  * Arguments:
137  * op: The operation to be performed.
138  * handle: The handle of an existing object, or for DSM_OP_CREATE, the
139  * a new handle the caller wants created.
140  * request_size: For DSM_OP_CREATE, the requested size. Otherwise, 0.
141  * impl_private: Private, implementation-specific data. Will be a pointer
142  * to NULL for the first operation on a shared memory segment within this
143  * backend; thereafter, it will point to the value to which it was set
144  * on the previous call.
145  * mapped_address: Pointer to start of current mapping; pointer to NULL
146  * if none. Updated with new mapping address.
147  * mapped_size: Pointer to size of current mapping; pointer to 0 if none.
148  * Updated with new mapped size.
149  * elevel: Level at which to log errors.
150  *
151  * Return value: true on success, false on failure. When false is returned,
152  * a message should first be logged at the specified elevel, except in the
153  * case where DSM_OP_CREATE experiences a name collision, which should
154  * silently return false.
155  *-----
156  */
157 bool
158 dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
159  void **impl_private, void **mapped_address, Size *mapped_size,
160  int elevel)
161 {
162  Assert(op == DSM_OP_CREATE || request_size == 0);
163  Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) ||
164  (*mapped_address == NULL && *mapped_size == 0));
165 
167  {
168 #ifdef USE_DSM_POSIX
169  case DSM_IMPL_POSIX:
170  return dsm_impl_posix(op, handle, request_size, impl_private,
171  mapped_address, mapped_size, elevel);
172 #endif
173 #ifdef USE_DSM_SYSV
174  case DSM_IMPL_SYSV:
175  return dsm_impl_sysv(op, handle, request_size, impl_private,
176  mapped_address, mapped_size, elevel);
177 #endif
178 #ifdef USE_DSM_WINDOWS
179  case DSM_IMPL_WINDOWS:
180  return dsm_impl_windows(op, handle, request_size, impl_private,
181  mapped_address, mapped_size, elevel);
182 #endif
183 #ifdef USE_DSM_MMAP
184  case DSM_IMPL_MMAP:
185  return dsm_impl_mmap(op, handle, request_size, impl_private,
186  mapped_address, mapped_size, elevel);
187 #endif
188  default:
189  elog(ERROR, "unexpected dynamic shared memory type: %d",
191  return false;
192  }
193 }
194 
195 #ifdef USE_DSM_POSIX
196 /*
197  * Operating system primitives to support POSIX shared memory.
198  *
199  * POSIX shared memory segments are created and attached using shm_open()
200  * and shm_unlink(); other operations, such as sizing or mapping the
201  * segment, are performed as if the shared memory segments were files.
202  *
203  * Indeed, on some platforms, they may be implemented that way. While
204  * POSIX shared memory segments seem intended to exist in a flat namespace,
205  * some operating systems may implement them as files, even going so far
206  * to treat a request for /xyz as a request to create a file by that name
207  * in the root directory. Users of such broken platforms should select
208  * a different shared memory implementation.
209  */
210 static bool
211 dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
212  void **impl_private, void **mapped_address, Size *mapped_size,
213  int elevel)
214 {
215  char name[64];
216  int flags;
217  int fd;
218  char *address;
219 
220  snprintf(name, 64, "/PostgreSQL.%u", handle);
221 
222  /* Handle teardown cases. */
223  if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
224  {
225  if (*mapped_address != NULL
226  && munmap(*mapped_address, *mapped_size) != 0)
227  {
228  ereport(elevel,
230  errmsg("could not unmap shared memory segment \"%s\": %m",
231  name)));
232  return false;
233  }
234  *mapped_address = NULL;
235  *mapped_size = 0;
236  if (op == DSM_OP_DESTROY && shm_unlink(name) != 0)
237  {
238  ereport(elevel,
240  errmsg("could not remove shared memory segment \"%s\": %m",
241  name)));
242  return false;
243  }
244  return true;
245  }
246 
247  /*
248  * Create new segment or open an existing one for attach.
249  *
250  * Even though we will close the FD before returning, it seems desirable
251  * to use Reserve/ReleaseExternalFD, to reduce the probability of EMFILE
252  * failure. The fact that we won't hold the FD open long justifies using
253  * ReserveExternalFD rather than AcquireExternalFD, though.
254  */
256 
257  flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
258  if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
259  {
261  if (errno != EEXIST)
262  ereport(elevel,
264  errmsg("could not open shared memory segment \"%s\": %m",
265  name)));
266  return false;
267  }
268 
269  /*
270  * If we're attaching the segment, determine the current size; if we are
271  * creating the segment, set the size to the requested value.
272  */
273  if (op == DSM_OP_ATTACH)
274  {
275  struct stat st;
276 
277  if (fstat(fd, &st) != 0)
278  {
279  int save_errno;
280 
281  /* Back out what's already been done. */
282  save_errno = errno;
283  close(fd);
285  errno = save_errno;
286 
287  ereport(elevel,
289  errmsg("could not stat shared memory segment \"%s\": %m",
290  name)));
291  return false;
292  }
293  request_size = st.st_size;
294  }
295  else if (dsm_impl_posix_resize(fd, request_size) != 0)
296  {
297  int save_errno;
298 
299  /* Back out what's already been done. */
300  save_errno = errno;
301  close(fd);
303  shm_unlink(name);
304  errno = save_errno;
305 
306  /*
307  * If we received a query cancel or termination signal, we will have
308  * EINTR set here. If the caller said that errors are OK here, check
309  * for interrupts immediately.
310  */
311  if (errno == EINTR && elevel >= ERROR)
313 
314  ereport(elevel,
316  errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
317  name, request_size)));
318  return false;
319  }
320 
321  /* Map it. */
322  address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
323  MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
324  if (address == MAP_FAILED)
325  {
326  int save_errno;
327 
328  /* Back out what's already been done. */
329  save_errno = errno;
330  close(fd);
332  if (op == DSM_OP_CREATE)
333  shm_unlink(name);
334  errno = save_errno;
335 
336  ereport(elevel,
338  errmsg("could not map shared memory segment \"%s\": %m",
339  name)));
340  return false;
341  }
342  *mapped_address = address;
343  *mapped_size = request_size;
344  close(fd);
346 
347  return true;
348 }
349 
350 /*
351  * Set the size of a virtual memory region associated with a file descriptor.
352  * If necessary, also ensure that virtual memory is actually allocated by the
353  * operating system, to avoid nasty surprises later.
354  *
355  * Returns non-zero if either truncation or allocation fails, and sets errno.
356  */
357 static int
358 dsm_impl_posix_resize(int fd, off_t size)
359 {
360  int rc;
361 
362  /* Truncate (or extend) the file to the requested size. */
363  rc = ftruncate(fd, size);
364 
365  /*
366  * On Linux, a shm_open fd is backed by a tmpfs file. After resizing with
367  * ftruncate, the file may contain a hole. Accessing memory backed by a
368  * hole causes tmpfs to allocate pages, which fails with SIGBUS if there
369  * is no more tmpfs space available. So we ask tmpfs to allocate pages
370  * here, so we can fail gracefully with ENOSPC now rather than risking
371  * SIGBUS later.
372  */
373 #if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)
374  if (rc == 0)
375  {
376  /*
377  * We may get interrupted. If so, just retry unless there is an
378  * interrupt pending. This avoids the possibility of looping forever
379  * if another backend is repeatedly trying to interrupt us.
380  */
382  do
383  {
384  rc = posix_fallocate(fd, 0, size);
385  } while (rc == EINTR && !(ProcDiePending || QueryCancelPending));
387 
388  /*
389  * The caller expects errno to be set, but posix_fallocate() doesn't
390  * set it. Instead it returns error numbers directly. So set errno,
391  * even though we'll also return rc to indicate success or failure.
392  */
393  errno = rc;
394  }
395 #endif /* HAVE_POSIX_FALLOCATE && __linux__ */
396 
397  return rc;
398 }
399 
400 #endif /* USE_DSM_POSIX */
401 
402 #ifdef USE_DSM_SYSV
403 /*
404  * Operating system primitives to support System V shared memory.
405  *
406  * System V shared memory segments are manipulated using shmget(), shmat(),
407  * shmdt(), and shmctl(). As the default allocation limits for System V
408  * shared memory are usually quite low, the POSIX facilities may be
409  * preferable; but those are not supported everywhere.
410  */
411 static bool
412 dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
413  void **impl_private, void **mapped_address, Size *mapped_size,
414  int elevel)
415 {
416  key_t key;
417  int ident;
418  char *address;
419  char name[64];
420  int *ident_cache;
421 
422  /*
423  * POSIX shared memory and mmap-based shared memory identify segments with
424  * names. To avoid needless error message variation, we use the handle as
425  * the name.
426  */
427  snprintf(name, 64, "%u", handle);
428 
429  /*
430  * The System V shared memory namespace is very restricted; names are of
431  * type key_t, which is expected to be some sort of integer data type, but
432  * not necessarily the same one as dsm_handle. Since we use dsm_handle to
433  * identify shared memory segments across processes, this might seem like
434  * a problem, but it's really not. If dsm_handle is bigger than key_t,
435  * the cast below might truncate away some bits from the handle the
436  * user-provided, but it'll truncate exactly the same bits away in exactly
437  * the same fashion every time we use that handle, which is all that
438  * really matters. Conversely, if dsm_handle is smaller than key_t, we
439  * won't use the full range of available key space, but that's no big deal
440  * either.
441  *
442  * We do make sure that the key isn't negative, because that might not be
443  * portable.
444  */
445  key = (key_t) handle;
446  if (key < 1) /* avoid compiler warning if type is unsigned */
447  key = -key;
448 
449  /*
450  * There's one special key, IPC_PRIVATE, which can't be used. If we end
451  * up with that value by chance during a create operation, just pretend it
452  * already exists, so that caller will retry. If we run into it anywhere
453  * else, the caller has passed a handle that doesn't correspond to
454  * anything we ever created, which should not happen.
455  */
456  if (key == IPC_PRIVATE)
457  {
458  if (op != DSM_OP_CREATE)
459  elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
460  errno = EEXIST;
461  return false;
462  }
463 
464  /*
465  * Before we can do anything with a shared memory segment, we have to map
466  * the shared memory key to a shared memory identifier using shmget(). To
467  * avoid repeated lookups, we store the key using impl_private.
468  */
469  if (*impl_private != NULL)
470  {
471  ident_cache = *impl_private;
472  ident = *ident_cache;
473  }
474  else
475  {
476  int flags = IPCProtection;
477  size_t segsize;
478 
479  /*
480  * Allocate the memory BEFORE acquiring the resource, so that we don't
481  * leak the resource if memory allocation fails.
482  */
483  ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));
484 
485  /*
486  * When using shmget to find an existing segment, we must pass the
487  * size as 0. Passing a non-zero size which is greater than the
488  * actual size will result in EINVAL.
489  */
490  segsize = 0;
491 
492  if (op == DSM_OP_CREATE)
493  {
494  flags |= IPC_CREAT | IPC_EXCL;
495  segsize = request_size;
496  }
497 
498  if ((ident = shmget(key, segsize, flags)) == -1)
499  {
500  if (errno != EEXIST)
501  {
502  int save_errno = errno;
503 
504  pfree(ident_cache);
505  errno = save_errno;
506  ereport(elevel,
508  errmsg("could not get shared memory segment: %m")));
509  }
510  return false;
511  }
512 
513  *ident_cache = ident;
514  *impl_private = ident_cache;
515  }
516 
517  /* Handle teardown cases. */
518  if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
519  {
520  pfree(ident_cache);
521  *impl_private = NULL;
522  if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
523  {
524  ereport(elevel,
526  errmsg("could not unmap shared memory segment \"%s\": %m",
527  name)));
528  return false;
529  }
530  *mapped_address = NULL;
531  *mapped_size = 0;
532  if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
533  {
534  ereport(elevel,
536  errmsg("could not remove shared memory segment \"%s\": %m",
537  name)));
538  return false;
539  }
540  return true;
541  }
542 
543  /* If we're attaching it, we must use IPC_STAT to determine the size. */
544  if (op == DSM_OP_ATTACH)
545  {
546  struct shmid_ds shm;
547 
548  if (shmctl(ident, IPC_STAT, &shm) != 0)
549  {
550  ereport(elevel,
552  errmsg("could not stat shared memory segment \"%s\": %m",
553  name)));
554  return false;
555  }
556  request_size = shm.shm_segsz;
557  }
558 
559  /* Map it. */
560  address = shmat(ident, NULL, PG_SHMAT_FLAGS);
561  if (address == (void *) -1)
562  {
563  int save_errno;
564 
565  /* Back out what's already been done. */
566  save_errno = errno;
567  if (op == DSM_OP_CREATE)
568  shmctl(ident, IPC_RMID, NULL);
569  errno = save_errno;
570 
571  ereport(elevel,
573  errmsg("could not map shared memory segment \"%s\": %m",
574  name)));
575  return false;
576  }
577  *mapped_address = address;
578  *mapped_size = request_size;
579 
580  return true;
581 }
582 #endif
583 
584 #ifdef USE_DSM_WINDOWS
585 /*
586  * Operating system primitives to support Windows shared memory.
587  *
588  * Windows shared memory implementation is done using file mapping
589  * which can be backed by either physical file or system paging file.
590  * Current implementation uses system paging file as other effects
591  * like performance are not clear for physical file and it is used in similar
592  * way for main shared memory in windows.
593  *
594  * A memory mapping object is a kernel object - they always get deleted when
595  * the last reference to them goes away, either explicitly via a CloseHandle or
596  * when the process containing the reference exits.
597  */
598 static bool
599 dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
600  void **impl_private, void **mapped_address,
601  Size *mapped_size, int elevel)
602 {
603  char *address;
604  HANDLE hmap;
605  char name[64];
606  MEMORY_BASIC_INFORMATION info;
607 
608  /*
609  * Storing the shared memory segment in the Global\ namespace, can allow
610  * any process running in any session to access that file mapping object
611  * provided that the caller has the required access rights. But to avoid
612  * issues faced in main shared memory, we are using the naming convention
613  * similar to main shared memory. We can change here once issue mentioned
614  * in GetSharedMemName is resolved.
615  */
616  snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
617 
618  /*
619  * Handle teardown cases. Since Windows automatically destroys the object
620  * when no references remain, we can treat it the same as detach.
621  */
622  if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
623  {
624  if (*mapped_address != NULL
625  && UnmapViewOfFile(*mapped_address) == 0)
626  {
627  _dosmaperr(GetLastError());
628  ereport(elevel,
630  errmsg("could not unmap shared memory segment \"%s\": %m",
631  name)));
632  return false;
633  }
634  if (*impl_private != NULL
635  && CloseHandle(*impl_private) == 0)
636  {
637  _dosmaperr(GetLastError());
638  ereport(elevel,
640  errmsg("could not remove shared memory segment \"%s\": %m",
641  name)));
642  return false;
643  }
644 
645  *impl_private = NULL;
646  *mapped_address = NULL;
647  *mapped_size = 0;
648  return true;
649  }
650 
651  /* Create new segment or open an existing one for attach. */
652  if (op == DSM_OP_CREATE)
653  {
654  DWORD size_high;
655  DWORD size_low;
656  DWORD errcode;
657 
658  /* Shifts >= the width of the type are undefined. */
659 #ifdef _WIN64
660  size_high = request_size >> 32;
661 #else
662  size_high = 0;
663 #endif
664  size_low = (DWORD) request_size;
665 
666  /* CreateFileMapping might not clear the error code on success */
667  SetLastError(0);
668 
669  hmap = CreateFileMapping(INVALID_HANDLE_VALUE, /* Use the pagefile */
670  NULL, /* Default security attrs */
671  PAGE_READWRITE, /* Memory is read/write */
672  size_high, /* Upper 32 bits of size */
673  size_low, /* Lower 32 bits of size */
674  name);
675 
676  errcode = GetLastError();
677  if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
678  {
679  /*
680  * On Windows, when the segment already exists, a handle for the
681  * existing segment is returned. We must close it before
682  * returning. However, if the existing segment is created by a
683  * service, then it returns ERROR_ACCESS_DENIED. We don't do
684  * _dosmaperr here, so errno won't be modified.
685  */
686  if (hmap)
687  CloseHandle(hmap);
688  return false;
689  }
690 
691  if (!hmap)
692  {
693  _dosmaperr(errcode);
694  ereport(elevel,
696  errmsg("could not create shared memory segment \"%s\": %m",
697  name)));
698  return false;
699  }
700  }
701  else
702  {
703  hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
704  FALSE, /* do not inherit the name */
705  name); /* name of mapping object */
706  if (!hmap)
707  {
708  _dosmaperr(GetLastError());
709  ereport(elevel,
711  errmsg("could not open shared memory segment \"%s\": %m",
712  name)));
713  return false;
714  }
715  }
716 
717  /* Map it. */
718  address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
719  0, 0, 0);
720  if (!address)
721  {
722  int save_errno;
723 
724  _dosmaperr(GetLastError());
725  /* Back out what's already been done. */
726  save_errno = errno;
727  CloseHandle(hmap);
728  errno = save_errno;
729 
730  ereport(elevel,
732  errmsg("could not map shared memory segment \"%s\": %m",
733  name)));
734  return false;
735  }
736 
737  /*
738  * VirtualQuery gives size in page_size units, which is 4K for Windows. We
739  * need size only when we are attaching, but it's better to get the size
740  * when creating new segment to keep size consistent both for
741  * DSM_OP_CREATE and DSM_OP_ATTACH.
742  */
743  if (VirtualQuery(address, &info, sizeof(info)) == 0)
744  {
745  int save_errno;
746 
747  _dosmaperr(GetLastError());
748  /* Back out what's already been done. */
749  save_errno = errno;
750  UnmapViewOfFile(address);
751  CloseHandle(hmap);
752  errno = save_errno;
753 
754  ereport(elevel,
756  errmsg("could not stat shared memory segment \"%s\": %m",
757  name)));
758  return false;
759  }
760 
761  *mapped_address = address;
762  *mapped_size = info.RegionSize;
763  *impl_private = hmap;
764 
765  return true;
766 }
767 #endif
768 
769 #ifdef USE_DSM_MMAP
770 /*
771  * Operating system primitives to support mmap-based shared memory.
772  *
773  * Calling this "shared memory" is somewhat of a misnomer, because what
774  * we're really doing is creating a bunch of files and mapping them into
775  * our address space. The operating system may feel obliged to
776  * synchronize the contents to disk even if nothing is being paged out,
777  * which will not serve us well. The user can relocate the pg_dynshmem
778  * directory to a ramdisk to avoid this problem, if available.
779  */
780 static bool
781 dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
782  void **impl_private, void **mapped_address, Size *mapped_size,
783  int elevel)
784 {
785  char name[64];
786  int flags;
787  int fd;
788  char *address;
789 
791  handle);
792 
793  /* Handle teardown cases. */
794  if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
795  {
796  if (*mapped_address != NULL
797  && munmap(*mapped_address, *mapped_size) != 0)
798  {
799  ereport(elevel,
801  errmsg("could not unmap shared memory segment \"%s\": %m",
802  name)));
803  return false;
804  }
805  *mapped_address = NULL;
806  *mapped_size = 0;
807  if (op == DSM_OP_DESTROY && unlink(name) != 0)
808  {
809  ereport(elevel,
811  errmsg("could not remove shared memory segment \"%s\": %m",
812  name)));
813  return false;
814  }
815  return true;
816  }
817 
818  /* Create new segment or open an existing one for attach. */
819  flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
820  if ((fd = OpenTransientFile(name, flags)) == -1)
821  {
822  if (errno != EEXIST)
823  ereport(elevel,
825  errmsg("could not open shared memory segment \"%s\": %m",
826  name)));
827  return false;
828  }
829 
830  /*
831  * If we're attaching the segment, determine the current size; if we are
832  * creating the segment, set the size to the requested value.
833  */
834  if (op == DSM_OP_ATTACH)
835  {
836  struct stat st;
837 
838  if (fstat(fd, &st) != 0)
839  {
840  int save_errno;
841 
842  /* Back out what's already been done. */
843  save_errno = errno;
844  CloseTransientFile(fd);
845  errno = save_errno;
846 
847  ereport(elevel,
849  errmsg("could not stat shared memory segment \"%s\": %m",
850  name)));
851  return false;
852  }
853  request_size = st.st_size;
854  }
855  else
856  {
857  /*
858  * Allocate a buffer full of zeros.
859  *
860  * Note: palloc zbuffer, instead of just using a local char array, to
861  * ensure it is reasonably well-aligned; this may save a few cycles
862  * transferring data to the kernel.
863  */
864  char *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
865  uint32 remaining = request_size;
866  bool success = true;
867 
868  /*
869  * Zero-fill the file. We have to do this the hard way to ensure that
870  * all the file space has really been allocated, so that we don't
871  * later seg fault when accessing the memory mapping. This is pretty
872  * pessimal.
873  */
874  while (success && remaining > 0)
875  {
876  Size goal = remaining;
877 
878  if (goal > ZBUFFER_SIZE)
879  goal = ZBUFFER_SIZE;
881  if (write(fd, zbuffer, goal) == goal)
882  remaining -= goal;
883  else
884  success = false;
886  }
887 
888  if (!success)
889  {
890  int save_errno;
891 
892  /* Back out what's already been done. */
893  save_errno = errno;
894  CloseTransientFile(fd);
895  unlink(name);
896  errno = save_errno ? save_errno : ENOSPC;
897 
898  ereport(elevel,
900  errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
901  name, request_size)));
902  return false;
903  }
904  }
905 
906  /* Map it. */
907  address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
908  MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
909  if (address == MAP_FAILED)
910  {
911  int save_errno;
912 
913  /* Back out what's already been done. */
914  save_errno = errno;
915  CloseTransientFile(fd);
916  if (op == DSM_OP_CREATE)
917  unlink(name);
918  errno = save_errno;
919 
920  ereport(elevel,
922  errmsg("could not map shared memory segment \"%s\": %m",
923  name)));
924  return false;
925  }
926  *mapped_address = address;
927  *mapped_size = request_size;
928 
929  if (CloseTransientFile(fd) != 0)
930  {
931  ereport(elevel,
933  errmsg("could not close shared memory segment \"%s\": %m",
934  name)));
935  return false;
936  }
937 
938  return true;
939 }
940 #endif
941 
942 /*
943  * Implementation-specific actions that must be performed when a segment is to
944  * be preserved even when no backend has it attached.
945  *
946  * Except on Windows, we don't need to do anything at all. But since Windows
947  * cleans up segments automatically when no references remain, we duplicate
948  * the segment handle into the postmaster process. The postmaster needn't
949  * do anything to receive the handle; Windows transfers it automatically.
950  */
951 void
952 dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
953  void **impl_private_pm_handle)
954 {
956  {
957 #ifdef USE_DSM_WINDOWS
958  case DSM_IMPL_WINDOWS:
959  {
960  HANDLE hmap;
961 
962  if (!DuplicateHandle(GetCurrentProcess(), impl_private,
963  PostmasterHandle, &hmap, 0, FALSE,
964  DUPLICATE_SAME_ACCESS))
965  {
966  char name[64];
967 
968  snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
969  _dosmaperr(GetLastError());
970  ereport(ERROR,
972  errmsg("could not duplicate handle for \"%s\": %m",
973  name)));
974  }
975 
976  /*
977  * Here, we remember the handle that we created in the
978  * postmaster process. This handle isn't actually usable in
979  * any process other than the postmaster, but that doesn't
980  * matter. We're just holding onto it so that, if the segment
981  * is unpinned, dsm_impl_unpin_segment can close it.
982  */
983  *impl_private_pm_handle = hmap;
984  break;
985  }
986 #endif
987  default:
988  break;
989  }
990 }
991 
992 /*
993  * Implementation-specific actions that must be performed when a segment is no
994  * longer to be preserved, so that it will be cleaned up when all backends
995  * have detached from it.
996  *
997  * Except on Windows, we don't need to do anything at all. For Windows, we
998  * close the extra handle that dsm_impl_pin_segment created in the
999  * postmaster's process space.
1000  */
1001 void
1002 dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
1003 {
1005  {
1006 #ifdef USE_DSM_WINDOWS
1007  case DSM_IMPL_WINDOWS:
1008  {
1009  if (*impl_private &&
1010  !DuplicateHandle(PostmasterHandle, *impl_private,
1011  NULL, NULL, 0, FALSE,
1012  DUPLICATE_CLOSE_SOURCE))
1013  {
1014  char name[64];
1015 
1016  snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
1017  _dosmaperr(GetLastError());
1018  ereport(ERROR,
1020  errmsg("could not duplicate handle for \"%s\": %m",
1021  name)));
1022  }
1023 
1024  *impl_private = NULL;
1025  break;
1026  }
1027 #endif
1028  default:
1029  break;
1030  }
1031 }
1032 
1033 static int
1035 {
1036  if (errno == EFBIG || errno == ENOMEM)
1037  return errcode(ERRCODE_OUT_OF_MEMORY);
1038  else
1039  return errcode_for_file_access();
1040 }
/*
 * (Doxygen HTML-export cross-reference index removed: the trailing lines were
 * hyperlink annotations for symbols defined in other files, not source code.)
 */