PostgreSQL Source Code  git master
dsm_impl.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * dsm_impl.c
4  * manage dynamic shared memory segments
5  *
6  * This file provides low-level APIs for creating and destroying shared
7  * memory segments using several different possible techniques. We refer
8  * to these segments as dynamic because they can be created, altered, and
9  * destroyed at any point during the server life cycle. This is unlike
10  * the main shared memory segment, of which there is always exactly one
11  * and which is always mapped at a fixed address in every PostgreSQL
12  * background process.
13  *
14  * Because not all systems provide the same primitives in this area, nor
15  * do all primitives behave the same way on all systems, we provide
16  * several implementations of this facility. Many systems implement
17  * POSIX shared memory (shm_open etc.), which is well-suited to our needs
18  * in this area, with the exception that shared memory identifiers live
19  * in a flat system-wide namespace, raising the uncomfortable prospect of
20  * name collisions with other processes (including other copies of
21  * PostgreSQL) running on the same system. Some systems only support
22  * the older System V shared memory interface (shmget etc.) which is
23  * also usable; however, the default allocation limits are often quite
24  * small, and the namespace is even more restricted.
25  *
26  * We also provide an mmap-based shared memory implementation. This may
27  * be useful on systems that provide shared memory via a special-purpose
28  * filesystem; by opting for this implementation, the user can even
29  * control precisely where their shared memory segments are placed. It
30  * can also be used as a fallback for systems where shm_open and shmget
31  * are not available or can't be used for some reason. Of course,
32  * mapping a file residing on an actual spinning disk is a fairly poor
33  * approximation for shared memory because writeback may hurt performance
34  * substantially, but there should be few systems where we must make do
35  * with such poor tools.
36  *
37  * As ever, Windows requires its own implementation.
38  *
39  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
40  * Portions Copyright (c) 1994, Regents of the University of California
41  *
42  *
43  * IDENTIFICATION
44  * src/backend/storage/ipc/dsm_impl.c
45  *
46  *-------------------------------------------------------------------------
47  */
48 
49 #include "postgres.h"
50 
51 #include <fcntl.h>
52 #include <unistd.h>
53 #ifndef WIN32
54 #include <sys/mman.h>
55 #endif
56 #include <sys/stat.h>
57 #ifdef HAVE_SYS_IPC_H
58 #include <sys/ipc.h>
59 #endif
60 #ifdef HAVE_SYS_SHM_H
61 #include <sys/shm.h>
62 #endif
63 
64 #include "common/file_perm.h"
65 #include "miscadmin.h"
66 #include "pgstat.h"
67 #include "portability/mem.h"
68 #include "postmaster/postmaster.h"
69 #include "storage/dsm_impl.h"
70 #include "storage/fd.h"
71 #include "utils/guc.h"
72 #include "utils/memutils.h"
73 
74 #ifdef USE_DSM_POSIX
75 static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
76  void **impl_private, void **mapped_address,
77  Size *mapped_size, int elevel);
78 static int dsm_impl_posix_resize(int fd, off_t size);
79 #endif
80 #ifdef USE_DSM_SYSV
81 static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
82  void **impl_private, void **mapped_address,
83  Size *mapped_size, int elevel);
84 #endif
85 #ifdef USE_DSM_WINDOWS
86 static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
87  void **impl_private, void **mapped_address,
88  Size *mapped_size, int elevel);
89 #endif
90 #ifdef USE_DSM_MMAP
91 static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
92  void **impl_private, void **mapped_address,
93  Size *mapped_size, int elevel);
94 #endif
95 static int errcode_for_dynamic_shared_memory(void);
96 
98 #ifdef USE_DSM_POSIX
99  {"posix", DSM_IMPL_POSIX, false},
100 #endif
101 #ifdef USE_DSM_SYSV
102  {"sysv", DSM_IMPL_SYSV, false},
103 #endif
104 #ifdef USE_DSM_WINDOWS
105  {"windows", DSM_IMPL_WINDOWS, false},
106 #endif
107 #ifdef USE_DSM_MMAP
108  {"mmap", DSM_IMPL_MMAP, false},
109 #endif
110  {NULL, 0, false}
111 };
112 
/* Implementation selector. */
int			dynamic_shared_memory_type;

/* Amount of space reserved for DSM segments in the main area. */
int			min_dynamic_shared_memory;
119 /* Size of buffer to be used for zero-filling. */
120 #define ZBUFFER_SIZE 8192
121 
122 #define SEGMENT_NAME_PREFIX "Global/PostgreSQL"
123 
124 /*------
125  * Perform a low-level shared memory operation in a platform-specific way,
126  * as dictated by the selected implementation. Each implementation is
127  * required to implement the following primitives.
128  *
129  * DSM_OP_CREATE. Create a segment whose size is the request_size and
130  * map it.
131  *
132  * DSM_OP_ATTACH. Map the segment, whose size must be the request_size.
133  *
134  * DSM_OP_DETACH. Unmap the segment.
135  *
136  * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the
137  * segment.
138  *
139  * Arguments:
140  * op: The operation to be performed.
141  * handle: The handle of an existing object, or for DSM_OP_CREATE, the
142  * a new handle the caller wants created.
143  * request_size: For DSM_OP_CREATE, the requested size. Otherwise, 0.
144  * impl_private: Private, implementation-specific data. Will be a pointer
145  * to NULL for the first operation on a shared memory segment within this
146  * backend; thereafter, it will point to the value to which it was set
147  * on the previous call.
148  * mapped_address: Pointer to start of current mapping; pointer to NULL
149  * if none. Updated with new mapping address.
150  * mapped_size: Pointer to size of current mapping; pointer to 0 if none.
151  * Updated with new mapped size.
152  * elevel: Level at which to log errors.
153  *
154  * Return value: true on success, false on failure. When false is returned,
155  * a message should first be logged at the specified elevel, except in the
156  * case where DSM_OP_CREATE experiences a name collision, which should
157  * silently return false.
158  *-----
159  */
160 bool
161 dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
162  void **impl_private, void **mapped_address, Size *mapped_size,
163  int elevel)
164 {
165  Assert(op == DSM_OP_CREATE || request_size == 0);
166  Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) ||
167  (*mapped_address == NULL && *mapped_size == 0));
168 
170  {
171 #ifdef USE_DSM_POSIX
172  case DSM_IMPL_POSIX:
173  return dsm_impl_posix(op, handle, request_size, impl_private,
174  mapped_address, mapped_size, elevel);
175 #endif
176 #ifdef USE_DSM_SYSV
177  case DSM_IMPL_SYSV:
178  return dsm_impl_sysv(op, handle, request_size, impl_private,
179  mapped_address, mapped_size, elevel);
180 #endif
181 #ifdef USE_DSM_WINDOWS
182  case DSM_IMPL_WINDOWS:
183  return dsm_impl_windows(op, handle, request_size, impl_private,
184  mapped_address, mapped_size, elevel);
185 #endif
186 #ifdef USE_DSM_MMAP
187  case DSM_IMPL_MMAP:
188  return dsm_impl_mmap(op, handle, request_size, impl_private,
189  mapped_address, mapped_size, elevel);
190 #endif
191  default:
192  elog(ERROR, "unexpected dynamic shared memory type: %d",
194  return false;
195  }
196 }
197 
198 #ifdef USE_DSM_POSIX
199 /*
200  * Operating system primitives to support POSIX shared memory.
201  *
202  * POSIX shared memory segments are created and attached using shm_open()
203  * and shm_unlink(); other operations, such as sizing or mapping the
204  * segment, are performed as if the shared memory segments were files.
205  *
206  * Indeed, on some platforms, they may be implemented that way. While
207  * POSIX shared memory segments seem intended to exist in a flat namespace,
208  * some operating systems may implement them as files, even going so far
209  * to treat a request for /xyz as a request to create a file by that name
210  * in the root directory. Users of such broken platforms should select
211  * a different shared memory implementation.
212  */
213 static bool
214 dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
215  void **impl_private, void **mapped_address, Size *mapped_size,
216  int elevel)
217 {
218  char name[64];
219  int flags;
220  int fd;
221  char *address;
222 
223  snprintf(name, 64, "/PostgreSQL.%u", handle);
224 
225  /* Handle teardown cases. */
226  if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
227  {
228  if (*mapped_address != NULL
229  && munmap(*mapped_address, *mapped_size) != 0)
230  {
231  ereport(elevel,
233  errmsg("could not unmap shared memory segment \"%s\": %m",
234  name)));
235  return false;
236  }
237  *mapped_address = NULL;
238  *mapped_size = 0;
239  if (op == DSM_OP_DESTROY && shm_unlink(name) != 0)
240  {
241  ereport(elevel,
243  errmsg("could not remove shared memory segment \"%s\": %m",
244  name)));
245  return false;
246  }
247  return true;
248  }
249 
250  /*
251  * Create new segment or open an existing one for attach.
252  *
253  * Even though we will close the FD before returning, it seems desirable
254  * to use Reserve/ReleaseExternalFD, to reduce the probability of EMFILE
255  * failure. The fact that we won't hold the FD open long justifies using
256  * ReserveExternalFD rather than AcquireExternalFD, though.
257  */
259 
260  flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
261  if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
262  {
264  if (errno != EEXIST)
265  ereport(elevel,
267  errmsg("could not open shared memory segment \"%s\": %m",
268  name)));
269  return false;
270  }
271 
272  /*
273  * If we're attaching the segment, determine the current size; if we are
274  * creating the segment, set the size to the requested value.
275  */
276  if (op == DSM_OP_ATTACH)
277  {
278  struct stat st;
279 
280  if (fstat(fd, &st) != 0)
281  {
282  int save_errno;
283 
284  /* Back out what's already been done. */
285  save_errno = errno;
286  close(fd);
288  errno = save_errno;
289 
290  ereport(elevel,
292  errmsg("could not stat shared memory segment \"%s\": %m",
293  name)));
294  return false;
295  }
296  request_size = st.st_size;
297  }
298  else if (dsm_impl_posix_resize(fd, request_size) != 0)
299  {
300  int save_errno;
301 
302  /* Back out what's already been done. */
303  save_errno = errno;
304  close(fd);
306  shm_unlink(name);
307  errno = save_errno;
308 
309  /*
310  * If we received a query cancel or termination signal, we will have
311  * EINTR set here. If the caller said that errors are OK here, check
312  * for interrupts immediately.
313  */
314  if (errno == EINTR && elevel >= ERROR)
316 
317  ereport(elevel,
319  errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
320  name, request_size)));
321  return false;
322  }
323 
324  /* Map it. */
325  address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
326  MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
327  if (address == MAP_FAILED)
328  {
329  int save_errno;
330 
331  /* Back out what's already been done. */
332  save_errno = errno;
333  close(fd);
335  if (op == DSM_OP_CREATE)
336  shm_unlink(name);
337  errno = save_errno;
338 
339  ereport(elevel,
341  errmsg("could not map shared memory segment \"%s\": %m",
342  name)));
343  return false;
344  }
345  *mapped_address = address;
346  *mapped_size = request_size;
347  close(fd);
349 
350  return true;
351 }
352 
353 /*
354  * Set the size of a virtual memory region associated with a file descriptor.
355  * If necessary, also ensure that virtual memory is actually allocated by the
356  * operating system, to avoid nasty surprises later.
357  *
358  * Returns non-zero if either truncation or allocation fails, and sets errno.
359  */
/*
 * Set the size of a virtual memory region associated with a file descriptor.
 * If necessary, also ensure that virtual memory is actually allocated by the
 * operating system, to avoid nasty surprises later.
 *
 * Returns non-zero if either truncation or allocation fails, and sets errno.
 */
static int
dsm_impl_posix_resize(int fd, off_t size)
{
	int			rc;

	/* Truncate (or extend) the file to the requested size. */
	rc = ftruncate(fd, size);

	/*
	 * On Linux, a shm_open fd is backed by a tmpfs file.  After resizing
	 * with ftruncate, the file may contain a hole.  Accessing memory backed
	 * by a hole causes tmpfs to allocate pages, which fails with SIGBUS if
	 * there is no more tmpfs space available.  So we ask tmpfs to allocate
	 * pages here, so we can fail gracefully with ENOSPC now rather than
	 * risking SIGBUS later.
	 */
#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)
	if (rc == 0)
	{
		/*
		 * We may get interrupted.  If so, just retry unless there is an
		 * interrupt pending.  This avoids the possibility of looping forever
		 * if another backend is repeatedly trying to interrupt us.
		 */
		pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
		do
		{
			rc = posix_fallocate(fd, 0, size);
		} while (rc == EINTR && !(ProcDiePending || QueryCancelPending));
		pgstat_report_wait_end();

		/*
		 * The caller expects errno to be set, but posix_fallocate() doesn't
		 * set it.  Instead it returns error numbers directly.  So set errno,
		 * even though we'll also return rc to indicate success or failure.
		 */
		errno = rc;
	}
#endif							/* HAVE_POSIX_FALLOCATE && __linux__ */

	return rc;
}
402 
403 #endif /* USE_DSM_POSIX */
404 
405 #ifdef USE_DSM_SYSV
406 /*
407  * Operating system primitives to support System V shared memory.
408  *
409  * System V shared memory segments are manipulated using shmget(), shmat(),
410  * shmdt(), and shmctl(). As the default allocation limits for System V
411  * shared memory are usually quite low, the POSIX facilities may be
412  * preferable; but those are not supported everywhere.
413  */
414 static bool
415 dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
416  void **impl_private, void **mapped_address, Size *mapped_size,
417  int elevel)
418 {
419  key_t key;
420  int ident;
421  char *address;
422  char name[64];
423  int *ident_cache;
424 
425  /*
426  * POSIX shared memory and mmap-based shared memory identify segments with
427  * names. To avoid needless error message variation, we use the handle as
428  * the name.
429  */
430  snprintf(name, 64, "%u", handle);
431 
432  /*
433  * The System V shared memory namespace is very restricted; names are of
434  * type key_t, which is expected to be some sort of integer data type, but
435  * not necessarily the same one as dsm_handle. Since we use dsm_handle to
436  * identify shared memory segments across processes, this might seem like
437  * a problem, but it's really not. If dsm_handle is bigger than key_t,
438  * the cast below might truncate away some bits from the handle the
439  * user-provided, but it'll truncate exactly the same bits away in exactly
440  * the same fashion every time we use that handle, which is all that
441  * really matters. Conversely, if dsm_handle is smaller than key_t, we
442  * won't use the full range of available key space, but that's no big deal
443  * either.
444  *
445  * We do make sure that the key isn't negative, because that might not be
446  * portable.
447  */
448  key = (key_t) handle;
449  if (key < 1) /* avoid compiler warning if type is unsigned */
450  key = -key;
451 
452  /*
453  * There's one special key, IPC_PRIVATE, which can't be used. If we end
454  * up with that value by chance during a create operation, just pretend it
455  * already exists, so that caller will retry. If we run into it anywhere
456  * else, the caller has passed a handle that doesn't correspond to
457  * anything we ever created, which should not happen.
458  */
459  if (key == IPC_PRIVATE)
460  {
461  if (op != DSM_OP_CREATE)
462  elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
463  errno = EEXIST;
464  return false;
465  }
466 
467  /*
468  * Before we can do anything with a shared memory segment, we have to map
469  * the shared memory key to a shared memory identifier using shmget(). To
470  * avoid repeated lookups, we store the key using impl_private.
471  */
472  if (*impl_private != NULL)
473  {
474  ident_cache = *impl_private;
475  ident = *ident_cache;
476  }
477  else
478  {
479  int flags = IPCProtection;
480  size_t segsize;
481 
482  /*
483  * Allocate the memory BEFORE acquiring the resource, so that we don't
484  * leak the resource if memory allocation fails.
485  */
486  ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));
487 
488  /*
489  * When using shmget to find an existing segment, we must pass the
490  * size as 0. Passing a non-zero size which is greater than the
491  * actual size will result in EINVAL.
492  */
493  segsize = 0;
494 
495  if (op == DSM_OP_CREATE)
496  {
497  flags |= IPC_CREAT | IPC_EXCL;
498  segsize = request_size;
499  }
500 
501  if ((ident = shmget(key, segsize, flags)) == -1)
502  {
503  if (errno != EEXIST)
504  {
505  int save_errno = errno;
506 
507  pfree(ident_cache);
508  errno = save_errno;
509  ereport(elevel,
511  errmsg("could not get shared memory segment: %m")));
512  }
513  return false;
514  }
515 
516  *ident_cache = ident;
517  *impl_private = ident_cache;
518  }
519 
520  /* Handle teardown cases. */
521  if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
522  {
523  pfree(ident_cache);
524  *impl_private = NULL;
525  if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
526  {
527  ereport(elevel,
529  errmsg("could not unmap shared memory segment \"%s\": %m",
530  name)));
531  return false;
532  }
533  *mapped_address = NULL;
534  *mapped_size = 0;
535  if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
536  {
537  ereport(elevel,
539  errmsg("could not remove shared memory segment \"%s\": %m",
540  name)));
541  return false;
542  }
543  return true;
544  }
545 
546  /* If we're attaching it, we must use IPC_STAT to determine the size. */
547  if (op == DSM_OP_ATTACH)
548  {
549  struct shmid_ds shm;
550 
551  if (shmctl(ident, IPC_STAT, &shm) != 0)
552  {
553  ereport(elevel,
555  errmsg("could not stat shared memory segment \"%s\": %m",
556  name)));
557  return false;
558  }
559  request_size = shm.shm_segsz;
560  }
561 
562  /* Map it. */
563  address = shmat(ident, NULL, PG_SHMAT_FLAGS);
564  if (address == (void *) -1)
565  {
566  int save_errno;
567 
568  /* Back out what's already been done. */
569  save_errno = errno;
570  if (op == DSM_OP_CREATE)
571  shmctl(ident, IPC_RMID, NULL);
572  errno = save_errno;
573 
574  ereport(elevel,
576  errmsg("could not map shared memory segment \"%s\": %m",
577  name)));
578  return false;
579  }
580  *mapped_address = address;
581  *mapped_size = request_size;
582 
583  return true;
584 }
585 #endif
586 
587 #ifdef USE_DSM_WINDOWS
588 /*
589  * Operating system primitives to support Windows shared memory.
590  *
591  * Windows shared memory implementation is done using file mapping
592  * which can be backed by either physical file or system paging file.
593  * Current implementation uses system paging file as other effects
594  * like performance are not clear for physical file and it is used in similar
595  * way for main shared memory in windows.
596  *
597  * A memory mapping object is a kernel object - they always get deleted when
598  * the last reference to them goes away, either explicitly via a CloseHandle or
599  * when the process containing the reference exits.
600  */
601 static bool
602 dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
603  void **impl_private, void **mapped_address,
604  Size *mapped_size, int elevel)
605 {
606  char *address;
607  HANDLE hmap;
608  char name[64];
609  MEMORY_BASIC_INFORMATION info;
610 
611  /*
612  * Storing the shared memory segment in the Global\ namespace, can allow
613  * any process running in any session to access that file mapping object
614  * provided that the caller has the required access rights. But to avoid
615  * issues faced in main shared memory, we are using the naming convention
616  * similar to main shared memory. We can change here once issue mentioned
617  * in GetSharedMemName is resolved.
618  */
619  snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
620 
621  /*
622  * Handle teardown cases. Since Windows automatically destroys the object
623  * when no references remain, we can treat it the same as detach.
624  */
625  if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
626  {
627  if (*mapped_address != NULL
628  && UnmapViewOfFile(*mapped_address) == 0)
629  {
630  _dosmaperr(GetLastError());
631  ereport(elevel,
633  errmsg("could not unmap shared memory segment \"%s\": %m",
634  name)));
635  return false;
636  }
637  if (*impl_private != NULL
638  && CloseHandle(*impl_private) == 0)
639  {
640  _dosmaperr(GetLastError());
641  ereport(elevel,
643  errmsg("could not remove shared memory segment \"%s\": %m",
644  name)));
645  return false;
646  }
647 
648  *impl_private = NULL;
649  *mapped_address = NULL;
650  *mapped_size = 0;
651  return true;
652  }
653 
654  /* Create new segment or open an existing one for attach. */
655  if (op == DSM_OP_CREATE)
656  {
657  DWORD size_high;
658  DWORD size_low;
659  DWORD errcode;
660 
661  /* Shifts >= the width of the type are undefined. */
662 #ifdef _WIN64
663  size_high = request_size >> 32;
664 #else
665  size_high = 0;
666 #endif
667  size_low = (DWORD) request_size;
668 
669  /* CreateFileMapping might not clear the error code on success */
670  SetLastError(0);
671 
672  hmap = CreateFileMapping(INVALID_HANDLE_VALUE, /* Use the pagefile */
673  NULL, /* Default security attrs */
674  PAGE_READWRITE, /* Memory is read/write */
675  size_high, /* Upper 32 bits of size */
676  size_low, /* Lower 32 bits of size */
677  name);
678 
679  errcode = GetLastError();
680  if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
681  {
682  /*
683  * On Windows, when the segment already exists, a handle for the
684  * existing segment is returned. We must close it before
685  * returning. However, if the existing segment is created by a
686  * service, then it returns ERROR_ACCESS_DENIED. We don't do
687  * _dosmaperr here, so errno won't be modified.
688  */
689  if (hmap)
690  CloseHandle(hmap);
691  return false;
692  }
693 
694  if (!hmap)
695  {
696  _dosmaperr(errcode);
697  ereport(elevel,
699  errmsg("could not create shared memory segment \"%s\": %m",
700  name)));
701  return false;
702  }
703  }
704  else
705  {
706  hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
707  FALSE, /* do not inherit the name */
708  name); /* name of mapping object */
709  if (!hmap)
710  {
711  _dosmaperr(GetLastError());
712  ereport(elevel,
714  errmsg("could not open shared memory segment \"%s\": %m",
715  name)));
716  return false;
717  }
718  }
719 
720  /* Map it. */
721  address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
722  0, 0, 0);
723  if (!address)
724  {
725  int save_errno;
726 
727  _dosmaperr(GetLastError());
728  /* Back out what's already been done. */
729  save_errno = errno;
730  CloseHandle(hmap);
731  errno = save_errno;
732 
733  ereport(elevel,
735  errmsg("could not map shared memory segment \"%s\": %m",
736  name)));
737  return false;
738  }
739 
740  /*
741  * VirtualQuery gives size in page_size units, which is 4K for Windows. We
742  * need size only when we are attaching, but it's better to get the size
743  * when creating new segment to keep size consistent both for
744  * DSM_OP_CREATE and DSM_OP_ATTACH.
745  */
746  if (VirtualQuery(address, &info, sizeof(info)) == 0)
747  {
748  int save_errno;
749 
750  _dosmaperr(GetLastError());
751  /* Back out what's already been done. */
752  save_errno = errno;
753  UnmapViewOfFile(address);
754  CloseHandle(hmap);
755  errno = save_errno;
756 
757  ereport(elevel,
759  errmsg("could not stat shared memory segment \"%s\": %m",
760  name)));
761  return false;
762  }
763 
764  *mapped_address = address;
765  *mapped_size = info.RegionSize;
766  *impl_private = hmap;
767 
768  return true;
769 }
770 #endif
771 
772 #ifdef USE_DSM_MMAP
773 /*
774  * Operating system primitives to support mmap-based shared memory.
775  *
776  * Calling this "shared memory" is somewhat of a misnomer, because what
777  * we're really doing is creating a bunch of files and mapping them into
778  * our address space. The operating system may feel obliged to
779  * synchronize the contents to disk even if nothing is being paged out,
780  * which will not serve us well. The user can relocate the pg_dynshmem
781  * directory to a ramdisk to avoid this problem, if available.
782  */
783 static bool
784 dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
785  void **impl_private, void **mapped_address, Size *mapped_size,
786  int elevel)
787 {
788  char name[64];
789  int flags;
790  int fd;
791  char *address;
792 
794  handle);
795 
796  /* Handle teardown cases. */
797  if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
798  {
799  if (*mapped_address != NULL
800  && munmap(*mapped_address, *mapped_size) != 0)
801  {
802  ereport(elevel,
804  errmsg("could not unmap shared memory segment \"%s\": %m",
805  name)));
806  return false;
807  }
808  *mapped_address = NULL;
809  *mapped_size = 0;
810  if (op == DSM_OP_DESTROY && unlink(name) != 0)
811  {
812  ereport(elevel,
814  errmsg("could not remove shared memory segment \"%s\": %m",
815  name)));
816  return false;
817  }
818  return true;
819  }
820 
821  /* Create new segment or open an existing one for attach. */
822  flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
823  if ((fd = OpenTransientFile(name, flags)) == -1)
824  {
825  if (errno != EEXIST)
826  ereport(elevel,
828  errmsg("could not open shared memory segment \"%s\": %m",
829  name)));
830  return false;
831  }
832 
833  /*
834  * If we're attaching the segment, determine the current size; if we are
835  * creating the segment, set the size to the requested value.
836  */
837  if (op == DSM_OP_ATTACH)
838  {
839  struct stat st;
840 
841  if (fstat(fd, &st) != 0)
842  {
843  int save_errno;
844 
845  /* Back out what's already been done. */
846  save_errno = errno;
847  CloseTransientFile(fd);
848  errno = save_errno;
849 
850  ereport(elevel,
852  errmsg("could not stat shared memory segment \"%s\": %m",
853  name)));
854  return false;
855  }
856  request_size = st.st_size;
857  }
858  else
859  {
860  /*
861  * Allocate a buffer full of zeros.
862  *
863  * Note: palloc zbuffer, instead of just using a local char array, to
864  * ensure it is reasonably well-aligned; this may save a few cycles
865  * transferring data to the kernel.
866  */
867  char *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
868  uint32 remaining = request_size;
869  bool success = true;
870 
871  /*
872  * Zero-fill the file. We have to do this the hard way to ensure that
873  * all the file space has really been allocated, so that we don't
874  * later seg fault when accessing the memory mapping. This is pretty
875  * pessimal.
876  */
877  while (success && remaining > 0)
878  {
879  Size goal = remaining;
880 
881  if (goal > ZBUFFER_SIZE)
882  goal = ZBUFFER_SIZE;
884  if (write(fd, zbuffer, goal) == goal)
885  remaining -= goal;
886  else
887  success = false;
889  }
890 
891  if (!success)
892  {
893  int save_errno;
894 
895  /* Back out what's already been done. */
896  save_errno = errno;
897  CloseTransientFile(fd);
898  unlink(name);
899  errno = save_errno ? save_errno : ENOSPC;
900 
901  ereport(elevel,
903  errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
904  name, request_size)));
905  return false;
906  }
907  }
908 
909  /* Map it. */
910  address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
911  MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
912  if (address == MAP_FAILED)
913  {
914  int save_errno;
915 
916  /* Back out what's already been done. */
917  save_errno = errno;
918  CloseTransientFile(fd);
919  if (op == DSM_OP_CREATE)
920  unlink(name);
921  errno = save_errno;
922 
923  ereport(elevel,
925  errmsg("could not map shared memory segment \"%s\": %m",
926  name)));
927  return false;
928  }
929  *mapped_address = address;
930  *mapped_size = request_size;
931 
932  if (CloseTransientFile(fd) != 0)
933  {
934  ereport(elevel,
936  errmsg("could not close shared memory segment \"%s\": %m",
937  name)));
938  return false;
939  }
940 
941  return true;
942 }
943 #endif
944 
945 /*
946  * Implementation-specific actions that must be performed when a segment is to
947  * be preserved even when no backend has it attached.
948  *
949  * Except on Windows, we don't need to do anything at all. But since Windows
950  * cleans up segments automatically when no references remain, we duplicate
951  * the segment handle into the postmaster process. The postmaster needn't
952  * do anything to receive the handle; Windows transfers it automatically.
953  */
954 void
955 dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
956  void **impl_private_pm_handle)
957 {
959  {
960 #ifdef USE_DSM_WINDOWS
961  case DSM_IMPL_WINDOWS:
962  {
963  HANDLE hmap;
964 
965  if (!DuplicateHandle(GetCurrentProcess(), impl_private,
966  PostmasterHandle, &hmap, 0, FALSE,
967  DUPLICATE_SAME_ACCESS))
968  {
969  char name[64];
970 
971  snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
972  _dosmaperr(GetLastError());
973  ereport(ERROR,
975  errmsg("could not duplicate handle for \"%s\": %m",
976  name)));
977  }
978 
979  /*
980  * Here, we remember the handle that we created in the
981  * postmaster process. This handle isn't actually usable in
982  * any process other than the postmaster, but that doesn't
983  * matter. We're just holding onto it so that, if the segment
984  * is unpinned, dsm_impl_unpin_segment can close it.
985  */
986  *impl_private_pm_handle = hmap;
987  break;
988  }
989 #endif
990  default:
991  break;
992  }
993 }
994 
995 /*
996  * Implementation-specific actions that must be performed when a segment is no
997  * longer to be preserved, so that it will be cleaned up when all backends
998  * have detached from it.
999  *
1000  * Except on Windows, we don't need to do anything at all. For Windows, we
1001  * close the extra handle that dsm_impl_pin_segment created in the
1002  * postmaster's process space.
1003  */
1004 void
1005 dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
1006 {
1008  {
1009 #ifdef USE_DSM_WINDOWS
1010  case DSM_IMPL_WINDOWS:
1011  {
1012  if (*impl_private &&
1013  !DuplicateHandle(PostmasterHandle, *impl_private,
1014  NULL, NULL, 0, FALSE,
1015  DUPLICATE_CLOSE_SOURCE))
1016  {
1017  char name[64];
1018 
1019  snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
1020  _dosmaperr(GetLastError());
1021  ereport(ERROR,
1023  errmsg("could not duplicate handle for \"%s\": %m",
1024  name)));
1025  }
1026 
1027  *impl_private = NULL;
1028  break;
1029  }
1030 #endif
1031  default:
1032  break;
1033  }
1034 }
1035 
1036 static int
1038 {
1039  if (errno == EFBIG || errno == ENOMEM)
1040  return errcode(ERRCODE_OUT_OF_MEMORY);
1041  else
1042  return errcode_for_file_access();
1043 }
int remaining
Definition: informix.c:667
#define DSM_IMPL_MMAP
Definition: dsm_impl.h:20
#define DSM_IMPL_SYSV
Definition: dsm_impl.h:18
#define MAP_HASSEMAPHORE
Definition: mem.h:30
#define MAP_FAILED
Definition: mem.h:45
volatile sig_atomic_t QueryCancelPending
Definition: globals.c:31
void dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
Definition: dsm_impl.c:1005
uint32 dsm_handle
Definition: dsm_impl.h:55
#define PG_DYNSHMEM_DIR
Definition: dsm_impl.h:51
#define IPC_CREAT
Definition: win32_port.h:87
Definition: guc.h:165
#define write(a, b, c)
Definition: win32.h:14
#define IPCProtection
Definition: posix_sema.c:59
#define PG_SHMAT_FLAGS
Definition: mem.h:20
#define MAP_NOSYNC
Definition: mem.h:38
void _dosmaperr(unsigned long)
Definition: win32error.c:171
int errcode(int sqlerrcode)
Definition: elog.c:691
#define DSM_IMPL_WINDOWS
Definition: dsm_impl.h:19
static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
Definition: dsm_impl.c:784
#define DEBUG4
Definition: elog.h:22
static int fd(const char *x, int i)
Definition: preproc-init.c:105
static int errcode_for_dynamic_shared_memory(void)
Definition: dsm_impl.c:1037
#define fstat
Definition: win32_port.h:274
void pfree(void *pointer)
Definition: mcxt.c:1057
const struct config_enum_entry dynamic_shared_memory_options[]
Definition: dsm_impl.c:97
#define ERROR
Definition: elog.h:43
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2403
#define DSM_IMPL_POSIX
Definition: dsm_impl.h:17
void ReserveExternalFD(void)
Definition: fd.c:1110
#define SEGMENT_NAME_PREFIX
Definition: dsm_impl.c:122
int errcode_for_file_access(void)
Definition: elog.c:714
int dynamic_shared_memory_type
Definition: dsm_impl.c:114
unsigned int uint32
Definition: c.h:429
static void pgstat_report_wait_end(void)
Definition: pgstat.h:1472
__int64 st_size
Definition: win32_port.h:265
#define IPC_PRIVATE
Definition: win32_port.h:89
MemoryContext TopMemoryContext
Definition: mcxt.c:44
int CloseTransientFile(int fd)
Definition: fd.c:2580
static int elevel
Definition: vacuumlazy.c:333
void * palloc0(Size size)
Definition: mcxt.c:981
#define IPC_RMID
Definition: win32_port.h:86
static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
Definition: dsm_impl.c:415
#define ereport(elevel,...)
Definition: elog.h:155
#define PG_FILE_MODE_OWNER
Definition: file_perm.h:38
#define Assert(condition)
Definition: c.h:800
volatile sig_atomic_t ProcDiePending
Definition: globals.c:32
long key_t
Definition: win32_port.h:239
size_t Size
Definition: c.h:528
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: pgstat.h:1448
#define IPC_EXCL
Definition: win32_port.h:88
const char * name
Definition: encode.c:561
void dsm_impl_pin_segment(dsm_handle handle, void *impl_private, void **impl_private_pm_handle)
Definition: dsm_impl.c:955
void ReleaseExternalFD(void)
Definition: fd.c:1128
int errmsg(const char *fmt,...)
Definition: elog.c:902
dsm_op
Definition: dsm_impl.h:58
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:797
#define elog(elevel,...)
Definition: elog.h:228
#define PG_DYNSHMEM_MMAP_FILE_PREFIX
Definition: dsm_impl.h:52
#define ZBUFFER_SIZE
Definition: dsm_impl.c:120
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:99
#define close(a)
Definition: win32.h:12
#define EINTR
Definition: win32_port.h:343
static bool success
Definition: initdb.c:162
bool dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
Definition: dsm_impl.c:161
int min_dynamic_shared_memory
Definition: dsm_impl.c:117
#define snprintf
Definition: port.h:215
#define IPC_STAT
Definition: win32_port.h:91
#define ftruncate(a, b)
Definition: win32_port.h:65