PostgreSQL Source Code  git master
dsm_impl.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * dsm_impl.c
4  * manage dynamic shared memory segments
5  *
6  * This file provides low-level APIs for creating and destroying shared
7  * memory segments using several different possible techniques. We refer
8  * to these segments as dynamic because they can be created, altered, and
9  * destroyed at any point during the server life cycle. This is unlike
10  * the main shared memory segment, of which there is always exactly one
11  * and which is always mapped at a fixed address in every PostgreSQL
12  * background process.
13  *
14  * Because not all systems provide the same primitives in this area, nor
15  * do all primitives behave the same way on all systems, we provide
16  * several implementations of this facility. Many systems implement
17  * POSIX shared memory (shm_open etc.), which is well-suited to our needs
18  * in this area, with the exception that shared memory identifiers live
19  * in a flat system-wide namespace, raising the uncomfortable prospect of
20  * name collisions with other processes (including other copies of
21  * PostgreSQL) running on the same system. Some systems only support
22  * the older System V shared memory interface (shmget etc.) which is
23  * also usable; however, the default allocation limits are often quite
24  * small, and the namespace is even more restricted.
25  *
26  * We also provide an mmap-based shared memory implementation. This may
27  * be useful on systems that provide shared memory via a special-purpose
28  * filesystem; by opting for this implementation, the user can even
29  * control precisely where their shared memory segments are placed. It
30  * can also be used as a fallback for systems where shm_open and shmget
31  * are not available or can't be used for some reason. Of course,
32  * mapping a file residing on an actual spinning disk is a fairly poor
33  * approximation for shared memory because writeback may hurt performance
34  * substantially, but there should be few systems where we must make do
35  * with such poor tools.
36  *
37  * As ever, Windows requires its own implementation.
38  *
39  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
40  * Portions Copyright (c) 1994, Regents of the University of California
41  *
42  *
43  * IDENTIFICATION
44  * src/backend/storage/ipc/dsm_impl.c
45  *
46  *-------------------------------------------------------------------------
47  */
48 
49 #include "postgres.h"
50 #include "miscadmin.h"
51 
52 #include <fcntl.h>
53 #include <unistd.h>
54 #ifndef WIN32
55 #include <sys/mman.h>
56 #endif
57 #include <sys/stat.h>
58 #ifdef HAVE_SYS_IPC_H
59 #include <sys/ipc.h>
60 #endif
61 #ifdef HAVE_SYS_SHM_H
62 #include <sys/shm.h>
63 #endif
64 #include "common/file_perm.h"
65 #include "pgstat.h"
66 
67 #include "portability/mem.h"
68 #include "storage/dsm_impl.h"
69 #include "storage/fd.h"
70 #include "utils/guc.h"
71 #include "utils/memutils.h"
72 #include "postmaster/postmaster.h"
73 
74 #ifdef USE_DSM_POSIX
75 static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
76  void **impl_private, void **mapped_address,
77  Size *mapped_size, int elevel);
78 static int dsm_impl_posix_resize(int fd, off_t size);
79 #endif
80 #ifdef USE_DSM_SYSV
81 static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
82  void **impl_private, void **mapped_address,
83  Size *mapped_size, int elevel);
84 #endif
85 #ifdef USE_DSM_WINDOWS
86 static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
87  void **impl_private, void **mapped_address,
88  Size *mapped_size, int elevel);
89 #endif
90 #ifdef USE_DSM_MMAP
91 static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
92  void **impl_private, void **mapped_address,
93  Size *mapped_size, int elevel);
94 #endif
95 static int errcode_for_dynamic_shared_memory(void);
96 
98 #ifdef USE_DSM_POSIX
99  {"posix", DSM_IMPL_POSIX, false},
100 #endif
101 #ifdef USE_DSM_SYSV
102  {"sysv", DSM_IMPL_SYSV, false},
103 #endif
104 #ifdef USE_DSM_WINDOWS
105  {"windows", DSM_IMPL_WINDOWS, false},
106 #endif
107 #ifdef USE_DSM_MMAP
108  {"mmap", DSM_IMPL_MMAP, false},
109 #endif
110  {NULL, 0, false}
111 };
112 
/* Implementation selector (set from the dynamic_shared_memory_type GUC). */
int			dynamic_shared_memory_type;
116 /* Size of buffer to be used for zero-filling. */
117 #define ZBUFFER_SIZE 8192
118 
119 #define SEGMENT_NAME_PREFIX "Global/PostgreSQL"
120 
121 /*------
122  * Perform a low-level shared memory operation in a platform-specific way,
123  * as dictated by the selected implementation. Each implementation is
124  * required to implement the following primitives.
125  *
126  * DSM_OP_CREATE. Create a segment whose size is the request_size and
127  * map it.
128  *
129  * DSM_OP_ATTACH. Map the segment, whose size must be the request_size.
130  *
131  * DSM_OP_DETACH. Unmap the segment.
132  *
133  * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the
134  * segment.
135  *
136  * Arguments:
137  * op: The operation to be performed.
138  * handle: The handle of an existing object, or for DSM_OP_CREATE, the
139  * a new handle the caller wants created.
140  * request_size: For DSM_OP_CREATE, the requested size. Otherwise, 0.
141  * impl_private: Private, implementation-specific data. Will be a pointer
142  * to NULL for the first operation on a shared memory segment within this
143  * backend; thereafter, it will point to the value to which it was set
144  * on the previous call.
145  * mapped_address: Pointer to start of current mapping; pointer to NULL
146  * if none. Updated with new mapping address.
147  * mapped_size: Pointer to size of current mapping; pointer to 0 if none.
148  * Updated with new mapped size.
149  * elevel: Level at which to log errors.
150  *
151  * Return value: true on success, false on failure. When false is returned,
152  * a message should first be logged at the specified elevel, except in the
153  * case where DSM_OP_CREATE experiences a name collision, which should
154  * silently return false.
155  *-----
156  */
157 bool
158 dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
159  void **impl_private, void **mapped_address, Size *mapped_size,
160  int elevel)
161 {
162  Assert(op == DSM_OP_CREATE || request_size == 0);
163  Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) ||
164  (*mapped_address == NULL && *mapped_size == 0));
165 
167  {
168 #ifdef USE_DSM_POSIX
169  case DSM_IMPL_POSIX:
170  return dsm_impl_posix(op, handle, request_size, impl_private,
171  mapped_address, mapped_size, elevel);
172 #endif
173 #ifdef USE_DSM_SYSV
174  case DSM_IMPL_SYSV:
175  return dsm_impl_sysv(op, handle, request_size, impl_private,
176  mapped_address, mapped_size, elevel);
177 #endif
178 #ifdef USE_DSM_WINDOWS
179  case DSM_IMPL_WINDOWS:
180  return dsm_impl_windows(op, handle, request_size, impl_private,
181  mapped_address, mapped_size, elevel);
182 #endif
183 #ifdef USE_DSM_MMAP
184  case DSM_IMPL_MMAP:
185  return dsm_impl_mmap(op, handle, request_size, impl_private,
186  mapped_address, mapped_size, elevel);
187 #endif
188  default:
189  elog(ERROR, "unexpected dynamic shared memory type: %d",
191  return false;
192  }
193 }
194 
195 #ifdef USE_DSM_POSIX
196 /*
197  * Operating system primitives to support POSIX shared memory.
198  *
199  * POSIX shared memory segments are created and attached using shm_open()
200  * and shm_unlink(); other operations, such as sizing or mapping the
201  * segment, are performed as if the shared memory segments were files.
202  *
203  * Indeed, on some platforms, they may be implemented that way. While
204  * POSIX shared memory segments seem intended to exist in a flat namespace,
205  * some operating systems may implement them as files, even going so far
206  * to treat a request for /xyz as a request to create a file by that name
207  * in the root directory. Users of such broken platforms should select
208  * a different shared memory implementation.
209  */
210 static bool
211 dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
212  void **impl_private, void **mapped_address, Size *mapped_size,
213  int elevel)
214 {
215  char name[64];
216  int flags;
217  int fd;
218  char *address;
219 
220  snprintf(name, 64, "/PostgreSQL.%u", handle);
221 
222  /* Handle teardown cases. */
223  if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
224  {
225  if (*mapped_address != NULL
226  && munmap(*mapped_address, *mapped_size) != 0)
227  {
228  ereport(elevel,
230  errmsg("could not unmap shared memory segment \"%s\": %m",
231  name)));
232  return false;
233  }
234  *mapped_address = NULL;
235  *mapped_size = 0;
236  if (op == DSM_OP_DESTROY && shm_unlink(name) != 0)
237  {
238  ereport(elevel,
240  errmsg("could not remove shared memory segment \"%s\": %m",
241  name)));
242  return false;
243  }
244  return true;
245  }
246 
247  /*
248  * Create new segment or open an existing one for attach.
249  *
250  * Even though we're not going through fd.c, we should be safe against
251  * running out of file descriptors, because of NUM_RESERVED_FDS. We're
252  * only opening one extra descriptor here, and we'll close it before
253  * returning.
254  */
255  flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
256  if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
257  {
258  if (errno != EEXIST)
259  ereport(elevel,
261  errmsg("could not open shared memory segment \"%s\": %m",
262  name)));
263  return false;
264  }
265 
266  /*
267  * If we're attaching the segment, determine the current size; if we are
268  * creating the segment, set the size to the requested value.
269  */
270  if (op == DSM_OP_ATTACH)
271  {
272  struct stat st;
273 
274  if (fstat(fd, &st) != 0)
275  {
276  int save_errno;
277 
278  /* Back out what's already been done. */
279  save_errno = errno;
280  close(fd);
281  errno = save_errno;
282 
283  ereport(elevel,
285  errmsg("could not stat shared memory segment \"%s\": %m",
286  name)));
287  return false;
288  }
289  request_size = st.st_size;
290  }
291  else if (dsm_impl_posix_resize(fd, request_size) != 0)
292  {
293  int save_errno;
294 
295  /* Back out what's already been done. */
296  save_errno = errno;
297  close(fd);
298  shm_unlink(name);
299  errno = save_errno;
300 
301  /*
302  * If we received a query cancel or termination signal, we will have
303  * EINTR set here. If the caller said that errors are OK here, check
304  * for interrupts immediately.
305  */
306  if (errno == EINTR && elevel >= ERROR)
308 
309  ereport(elevel,
311  errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
312  name, request_size)));
313  return false;
314  }
315 
316  /* Map it. */
317  address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
318  MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
319  if (address == MAP_FAILED)
320  {
321  int save_errno;
322 
323  /* Back out what's already been done. */
324  save_errno = errno;
325  close(fd);
326  if (op == DSM_OP_CREATE)
327  shm_unlink(name);
328  errno = save_errno;
329 
330  ereport(elevel,
332  errmsg("could not map shared memory segment \"%s\": %m",
333  name)));
334  return false;
335  }
336  *mapped_address = address;
337  *mapped_size = request_size;
338  close(fd);
339 
340  return true;
341 }
342 
343 /*
344  * Set the size of a virtual memory region associated with a file descriptor.
345  * If necessary, also ensure that virtual memory is actually allocated by the
346  * operating system, to avoid nasty surprises later.
347  *
348  * Returns non-zero if either truncation or allocation fails, and sets errno.
349  */
static int
dsm_impl_posix_resize(int fd, off_t size)
{
	int			rc;

	/* First, truncate (or extend) the backing object to the requested size. */
	rc = ftruncate(fd, size);

	/*
	 * On Linux, a shm_open fd is backed by a tmpfs file.  After resizing with
	 * ftruncate, the file may contain a hole.  Accessing memory backed by a
	 * hole causes tmpfs to allocate pages, which fails with SIGBUS if there
	 * is no more tmpfs space available.  So we ask tmpfs to allocate pages
	 * here, so we can fail gracefully with ENOSPC now rather than risking
	 * SIGBUS later.
	 */
#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)
	if (rc == 0)
	{
		/*
		 * posix_fallocate() can be interrupted; keep retrying unless an
		 * interrupt is actually pending, so another backend repeatedly
		 * signaling us cannot make us loop forever.
		 */
		while ((rc = posix_fallocate(fd, 0, size)) == EINTR &&
			   !(ProcDiePending || QueryCancelPending))
			;

		/*
		 * posix_fallocate() reports errors via its return value rather than
		 * errno, but our caller expects errno to be set; copy it over while
		 * still returning rc as the success/failure indicator.
		 */
		errno = rc;
	}
#endif							/* HAVE_POSIX_FALLOCATE && __linux__ */

	return rc;
}
390 
391 #endif /* USE_DSM_POSIX */
392 
393 #ifdef USE_DSM_SYSV
394 /*
395  * Operating system primitives to support System V shared memory.
396  *
397  * System V shared memory segments are manipulated using shmget(), shmat(),
398  * shmdt(), and shmctl(). As the default allocation limits for System V
399  * shared memory are usually quite low, the POSIX facilities may be
400  * preferable; but those are not supported everywhere.
401  */
402 static bool
403 dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
404  void **impl_private, void **mapped_address, Size *mapped_size,
405  int elevel)
406 {
407  key_t key;
408  int ident;
409  char *address;
410  char name[64];
411  int *ident_cache;
412 
413  /*
414  * POSIX shared memory and mmap-based shared memory identify segments with
415  * names. To avoid needless error message variation, we use the handle as
416  * the name.
417  */
418  snprintf(name, 64, "%u", handle);
419 
420  /*
421  * The System V shared memory namespace is very restricted; names are of
422  * type key_t, which is expected to be some sort of integer data type, but
423  * not necessarily the same one as dsm_handle. Since we use dsm_handle to
424  * identify shared memory segments across processes, this might seem like
425  * a problem, but it's really not. If dsm_handle is bigger than key_t,
426  * the cast below might truncate away some bits from the handle the
427  * user-provided, but it'll truncate exactly the same bits away in exactly
428  * the same fashion every time we use that handle, which is all that
429  * really matters. Conversely, if dsm_handle is smaller than key_t, we
430  * won't use the full range of available key space, but that's no big deal
431  * either.
432  *
433  * We do make sure that the key isn't negative, because that might not be
434  * portable.
435  */
436  key = (key_t) handle;
437  if (key < 1) /* avoid compiler warning if type is unsigned */
438  key = -key;
439 
440  /*
441  * There's one special key, IPC_PRIVATE, which can't be used. If we end
442  * up with that value by chance during a create operation, just pretend it
443  * already exists, so that caller will retry. If we run into it anywhere
444  * else, the caller has passed a handle that doesn't correspond to
445  * anything we ever created, which should not happen.
446  */
447  if (key == IPC_PRIVATE)
448  {
449  if (op != DSM_OP_CREATE)
450  elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
451  errno = EEXIST;
452  return false;
453  }
454 
455  /*
456  * Before we can do anything with a shared memory segment, we have to map
457  * the shared memory key to a shared memory identifier using shmget(). To
458  * avoid repeated lookups, we store the key using impl_private.
459  */
460  if (*impl_private != NULL)
461  {
462  ident_cache = *impl_private;
463  ident = *ident_cache;
464  }
465  else
466  {
467  int flags = IPCProtection;
468  size_t segsize;
469 
470  /*
471  * Allocate the memory BEFORE acquiring the resource, so that we don't
472  * leak the resource if memory allocation fails.
473  */
474  ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));
475 
476  /*
477  * When using shmget to find an existing segment, we must pass the
478  * size as 0. Passing a non-zero size which is greater than the
479  * actual size will result in EINVAL.
480  */
481  segsize = 0;
482 
483  if (op == DSM_OP_CREATE)
484  {
485  flags |= IPC_CREAT | IPC_EXCL;
486  segsize = request_size;
487  }
488 
489  if ((ident = shmget(key, segsize, flags)) == -1)
490  {
491  if (errno != EEXIST)
492  {
493  int save_errno = errno;
494 
495  pfree(ident_cache);
496  errno = save_errno;
497  ereport(elevel,
499  errmsg("could not get shared memory segment: %m")));
500  }
501  return false;
502  }
503 
504  *ident_cache = ident;
505  *impl_private = ident_cache;
506  }
507 
508  /* Handle teardown cases. */
509  if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
510  {
511  pfree(ident_cache);
512  *impl_private = NULL;
513  if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
514  {
515  ereport(elevel,
517  errmsg("could not unmap shared memory segment \"%s\": %m",
518  name)));
519  return false;
520  }
521  *mapped_address = NULL;
522  *mapped_size = 0;
523  if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
524  {
525  ereport(elevel,
527  errmsg("could not remove shared memory segment \"%s\": %m",
528  name)));
529  return false;
530  }
531  return true;
532  }
533 
534  /* If we're attaching it, we must use IPC_STAT to determine the size. */
535  if (op == DSM_OP_ATTACH)
536  {
537  struct shmid_ds shm;
538 
539  if (shmctl(ident, IPC_STAT, &shm) != 0)
540  {
541  ereport(elevel,
543  errmsg("could not stat shared memory segment \"%s\": %m",
544  name)));
545  return false;
546  }
547  request_size = shm.shm_segsz;
548  }
549 
550  /* Map it. */
551  address = shmat(ident, NULL, PG_SHMAT_FLAGS);
552  if (address == (void *) -1)
553  {
554  int save_errno;
555 
556  /* Back out what's already been done. */
557  save_errno = errno;
558  if (op == DSM_OP_CREATE)
559  shmctl(ident, IPC_RMID, NULL);
560  errno = save_errno;
561 
562  ereport(elevel,
564  errmsg("could not map shared memory segment \"%s\": %m",
565  name)));
566  return false;
567  }
568  *mapped_address = address;
569  *mapped_size = request_size;
570 
571  return true;
572 }
573 #endif
574 
575 #ifdef USE_DSM_WINDOWS
576 /*
577  * Operating system primitives to support Windows shared memory.
578  *
579  * Windows shared memory implementation is done using file mapping
580  * which can be backed by either physical file or system paging file.
581  * Current implementation uses system paging file as other effects
582  * like performance are not clear for physical file and it is used in similar
583  * way for main shared memory in windows.
584  *
585  * A memory mapping object is a kernel object - they always get deleted when
586  * the last reference to them goes away, either explicitly via a CloseHandle or
587  * when the process containing the reference exits.
588  */
589 static bool
590 dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
591  void **impl_private, void **mapped_address,
592  Size *mapped_size, int elevel)
593 {
594  char *address;
595  HANDLE hmap;
596  char name[64];
597  MEMORY_BASIC_INFORMATION info;
598 
599  /*
600  * Storing the shared memory segment in the Global\ namespace, can allow
601  * any process running in any session to access that file mapping object
602  * provided that the caller has the required access rights. But to avoid
603  * issues faced in main shared memory, we are using the naming convention
604  * similar to main shared memory. We can change here once issue mentioned
605  * in GetSharedMemName is resolved.
606  */
607  snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
608 
609  /*
610  * Handle teardown cases. Since Windows automatically destroys the object
611  * when no references remain, we can treat it the same as detach.
612  */
613  if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
614  {
615  if (*mapped_address != NULL
616  && UnmapViewOfFile(*mapped_address) == 0)
617  {
618  _dosmaperr(GetLastError());
619  ereport(elevel,
621  errmsg("could not unmap shared memory segment \"%s\": %m",
622  name)));
623  return false;
624  }
625  if (*impl_private != NULL
626  && CloseHandle(*impl_private) == 0)
627  {
628  _dosmaperr(GetLastError());
629  ereport(elevel,
631  errmsg("could not remove shared memory segment \"%s\": %m",
632  name)));
633  return false;
634  }
635 
636  *impl_private = NULL;
637  *mapped_address = NULL;
638  *mapped_size = 0;
639  return true;
640  }
641 
642  /* Create new segment or open an existing one for attach. */
643  if (op == DSM_OP_CREATE)
644  {
645  DWORD size_high;
646  DWORD size_low;
647  DWORD errcode;
648 
649  /* Shifts >= the width of the type are undefined. */
650 #ifdef _WIN64
651  size_high = request_size >> 32;
652 #else
653  size_high = 0;
654 #endif
655  size_low = (DWORD) request_size;
656 
657  /* CreateFileMapping might not clear the error code on success */
658  SetLastError(0);
659 
660  hmap = CreateFileMapping(INVALID_HANDLE_VALUE, /* Use the pagefile */
661  NULL, /* Default security attrs */
662  PAGE_READWRITE, /* Memory is read/write */
663  size_high, /* Upper 32 bits of size */
664  size_low, /* Lower 32 bits of size */
665  name);
666 
667  errcode = GetLastError();
668  if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
669  {
670  /*
671  * On Windows, when the segment already exists, a handle for the
672  * existing segment is returned. We must close it before
673  * returning. However, if the existing segment is created by a
674  * service, then it returns ERROR_ACCESS_DENIED. We don't do
675  * _dosmaperr here, so errno won't be modified.
676  */
677  if (hmap)
678  CloseHandle(hmap);
679  return false;
680  }
681 
682  if (!hmap)
683  {
684  _dosmaperr(errcode);
685  ereport(elevel,
687  errmsg("could not create shared memory segment \"%s\": %m",
688  name)));
689  return false;
690  }
691  }
692  else
693  {
694  hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
695  FALSE, /* do not inherit the name */
696  name); /* name of mapping object */
697  if (!hmap)
698  {
699  _dosmaperr(GetLastError());
700  ereport(elevel,
702  errmsg("could not open shared memory segment \"%s\": %m",
703  name)));
704  return false;
705  }
706  }
707 
708  /* Map it. */
709  address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
710  0, 0, 0);
711  if (!address)
712  {
713  int save_errno;
714 
715  _dosmaperr(GetLastError());
716  /* Back out what's already been done. */
717  save_errno = errno;
718  CloseHandle(hmap);
719  errno = save_errno;
720 
721  ereport(elevel,
723  errmsg("could not map shared memory segment \"%s\": %m",
724  name)));
725  return false;
726  }
727 
728  /*
729  * VirtualQuery gives size in page_size units, which is 4K for Windows. We
730  * need size only when we are attaching, but it's better to get the size
731  * when creating new segment to keep size consistent both for
732  * DSM_OP_CREATE and DSM_OP_ATTACH.
733  */
734  if (VirtualQuery(address, &info, sizeof(info)) == 0)
735  {
736  int save_errno;
737 
738  _dosmaperr(GetLastError());
739  /* Back out what's already been done. */
740  save_errno = errno;
741  UnmapViewOfFile(address);
742  CloseHandle(hmap);
743  errno = save_errno;
744 
745  ereport(elevel,
747  errmsg("could not stat shared memory segment \"%s\": %m",
748  name)));
749  return false;
750  }
751 
752  *mapped_address = address;
753  *mapped_size = info.RegionSize;
754  *impl_private = hmap;
755 
756  return true;
757 }
758 #endif
759 
760 #ifdef USE_DSM_MMAP
761 /*
762  * Operating system primitives to support mmap-based shared memory.
763  *
764  * Calling this "shared memory" is somewhat of a misnomer, because what
765  * we're really doing is creating a bunch of files and mapping them into
766  * our address space. The operating system may feel obliged to
767  * synchronize the contents to disk even if nothing is being paged out,
768  * which will not serve us well. The user can relocate the pg_dynshmem
769  * directory to a ramdisk to avoid this problem, if available.
770  */
771 static bool
772 dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
773  void **impl_private, void **mapped_address, Size *mapped_size,
774  int elevel)
775 {
776  char name[64];
777  int flags;
778  int fd;
779  char *address;
780 
782  handle);
783 
784  /* Handle teardown cases. */
785  if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
786  {
787  if (*mapped_address != NULL
788  && munmap(*mapped_address, *mapped_size) != 0)
789  {
790  ereport(elevel,
792  errmsg("could not unmap shared memory segment \"%s\": %m",
793  name)));
794  return false;
795  }
796  *mapped_address = NULL;
797  *mapped_size = 0;
798  if (op == DSM_OP_DESTROY && unlink(name) != 0)
799  {
800  ereport(elevel,
802  errmsg("could not remove shared memory segment \"%s\": %m",
803  name)));
804  return false;
805  }
806  return true;
807  }
808 
809  /* Create new segment or open an existing one for attach. */
810  flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
811  if ((fd = OpenTransientFile(name, flags)) == -1)
812  {
813  if (errno != EEXIST)
814  ereport(elevel,
816  errmsg("could not open shared memory segment \"%s\": %m",
817  name)));
818  return false;
819  }
820 
821  /*
822  * If we're attaching the segment, determine the current size; if we are
823  * creating the segment, set the size to the requested value.
824  */
825  if (op == DSM_OP_ATTACH)
826  {
827  struct stat st;
828 
829  if (fstat(fd, &st) != 0)
830  {
831  int save_errno;
832 
833  /* Back out what's already been done. */
834  save_errno = errno;
835  CloseTransientFile(fd);
836  errno = save_errno;
837 
838  ereport(elevel,
840  errmsg("could not stat shared memory segment \"%s\": %m",
841  name)));
842  return false;
843  }
844  request_size = st.st_size;
845  }
846  else
847  {
848  /*
849  * Allocate a buffer full of zeros.
850  *
851  * Note: palloc zbuffer, instead of just using a local char array, to
852  * ensure it is reasonably well-aligned; this may save a few cycles
853  * transferring data to the kernel.
854  */
855  char *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
856  uint32 remaining = request_size;
857  bool success = true;
858 
859  /*
860  * Zero-fill the file. We have to do this the hard way to ensure that
861  * all the file space has really been allocated, so that we don't
862  * later seg fault when accessing the memory mapping. This is pretty
863  * pessimal.
864  */
865  while (success && remaining > 0)
866  {
867  Size goal = remaining;
868 
869  if (goal > ZBUFFER_SIZE)
870  goal = ZBUFFER_SIZE;
872  if (write(fd, zbuffer, goal) == goal)
873  remaining -= goal;
874  else
875  success = false;
877  }
878 
879  if (!success)
880  {
881  int save_errno;
882 
883  /* Back out what's already been done. */
884  save_errno = errno;
885  CloseTransientFile(fd);
886  unlink(name);
887  errno = save_errno ? save_errno : ENOSPC;
888 
889  ereport(elevel,
891  errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
892  name, request_size)));
893  return false;
894  }
895  }
896 
897  /* Map it. */
898  address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
899  MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
900  if (address == MAP_FAILED)
901  {
902  int save_errno;
903 
904  /* Back out what's already been done. */
905  save_errno = errno;
906  CloseTransientFile(fd);
907  if (op == DSM_OP_CREATE)
908  unlink(name);
909  errno = save_errno;
910 
911  ereport(elevel,
913  errmsg("could not map shared memory segment \"%s\": %m",
914  name)));
915  return false;
916  }
917  *mapped_address = address;
918  *mapped_size = request_size;
919 
920  if (CloseTransientFile(fd) != 0)
921  {
922  ereport(elevel,
924  errmsg("could not close shared memory segment \"%s\": %m",
925  name)));
926  return false;
927  }
928 
929  return true;
930 }
931 #endif
932 
933 /*
934  * Implementation-specific actions that must be performed when a segment is to
935  * be preserved even when no backend has it attached.
936  *
937  * Except on Windows, we don't need to do anything at all. But since Windows
938  * cleans up segments automatically when no references remain, we duplicate
939  * the segment handle into the postmaster process. The postmaster needn't
940  * do anything to receive the handle; Windows transfers it automatically.
941  */
942 void
943 dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
944  void **impl_private_pm_handle)
945 {
947  {
948 #ifdef USE_DSM_WINDOWS
949  case DSM_IMPL_WINDOWS:
950  {
951  HANDLE hmap;
952 
953  if (!DuplicateHandle(GetCurrentProcess(), impl_private,
954  PostmasterHandle, &hmap, 0, FALSE,
955  DUPLICATE_SAME_ACCESS))
956  {
957  char name[64];
958 
959  snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
960  _dosmaperr(GetLastError());
961  ereport(ERROR,
963  errmsg("could not duplicate handle for \"%s\": %m",
964  name)));
965  }
966 
967  /*
968  * Here, we remember the handle that we created in the
969  * postmaster process. This handle isn't actually usable in
970  * any process other than the postmaster, but that doesn't
971  * matter. We're just holding onto it so that, if the segment
972  * is unpinned, dsm_impl_unpin_segment can close it.
973  */
974  *impl_private_pm_handle = hmap;
975  break;
976  }
977 #endif
978  default:
979  break;
980  }
981 }
982 
983 /*
984  * Implementation-specific actions that must be performed when a segment is no
985  * longer to be preserved, so that it will be cleaned up when all backends
986  * have detached from it.
987  *
988  * Except on Windows, we don't need to do anything at all. For Windows, we
989  * close the extra handle that dsm_impl_pin_segment created in the
990  * postmaster's process space.
991  */
992 void
993 dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
994 {
996  {
997 #ifdef USE_DSM_WINDOWS
998  case DSM_IMPL_WINDOWS:
999  {
1000  if (*impl_private &&
1001  !DuplicateHandle(PostmasterHandle, *impl_private,
1002  NULL, NULL, 0, FALSE,
1003  DUPLICATE_CLOSE_SOURCE))
1004  {
1005  char name[64];
1006 
1007  snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
1008  _dosmaperr(GetLastError());
1009  ereport(ERROR,
1011  errmsg("could not duplicate handle for \"%s\": %m",
1012  name)));
1013  }
1014 
1015  *impl_private = NULL;
1016  break;
1017  }
1018 #endif
1019  default:
1020  break;
1021  }
1022 }
1023 
1024 static int
1026 {
1027  if (errno == EFBIG || errno == ENOMEM)
1028  return errcode(ERRCODE_OUT_OF_MEMORY);
1029  else
1030  return errcode_for_file_access();
1031 }
int remaining
Definition: informix.c:687
#define DSM_IMPL_MMAP
Definition: dsm_impl.h:20
#define DSM_IMPL_SYSV
Definition: dsm_impl.h:18
#define MAP_HASSEMAPHORE
Definition: mem.h:30
#define MAP_FAILED
Definition: mem.h:45
volatile sig_atomic_t QueryCancelPending
Definition: globals.c:31
void dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
Definition: dsm_impl.c:993
uint32 dsm_handle
Definition: dsm_impl.h:54
#define PG_DYNSHMEM_DIR
Definition: dsm_impl.h:50
#define IPC_CREAT
Definition: win32_port.h:82
Definition: guc.h:164
#define write(a, b, c)
Definition: win32.h:14
#define IPCProtection
Definition: posix_sema.c:59
#define PG_SHMAT_FLAGS
Definition: mem.h:20
#define FALSE
Definition: ecpglib.h:39
#define MAP_NOSYNC
Definition: mem.h:38
void _dosmaperr(unsigned long)
Definition: win32error.c:171
int errcode(int sqlerrcode)
Definition: elog.c:570
#define DSM_IMPL_WINDOWS
Definition: dsm_impl.h:19
static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
Definition: dsm_impl.c:772
#define DEBUG4
Definition: elog.h:22
static int fd(const char *x, int i)
Definition: preproc-init.c:105
static int errcode_for_dynamic_shared_memory(void)
Definition: dsm_impl.c:1025
void pfree(void *pointer)
Definition: mcxt.c:1056
const struct config_enum_entry dynamic_shared_memory_options[]
Definition: dsm_impl.c:97
#define ERROR
Definition: elog.h:43
int OpenTransientFile(const char *fileName, int fileFlags)
Definition: fd.c:2255
#define DSM_IMPL_POSIX
Definition: dsm_impl.h:17
#define SEGMENT_NAME_PREFIX
Definition: dsm_impl.c:119
int errcode_for_file_access(void)
Definition: elog.c:593
int dynamic_shared_memory_type
Definition: dsm_impl.c:114
unsigned int uint32
Definition: c.h:358
static void pgstat_report_wait_end(void)
Definition: pgstat.h:1342
#define ereport(elevel, rest)
Definition: elog.h:141
#define IPC_PRIVATE
Definition: win32_port.h:84
MemoryContext TopMemoryContext
Definition: mcxt.c:44
int CloseTransientFile(int fd)
Definition: fd.c:2432
#define stat(a, b)
Definition: win32_port.h:255
static int elevel
Definition: vacuumlazy.c:143
void * palloc0(Size size)
Definition: mcxt.c:980
#define IPC_RMID
Definition: win32_port.h:81
static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
Definition: dsm_impl.c:403
#define PG_FILE_MODE_OWNER
Definition: file_perm.h:38
#define Assert(condition)
Definition: c.h:732
volatile sig_atomic_t ProcDiePending
Definition: globals.c:32
long key_t
Definition: win32_port.h:233
size_t Size
Definition: c.h:466
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition: pgstat.h:1318
#define IPC_EXCL
Definition: win32_port.h:83
const char * name
Definition: encode.c:521
void dsm_impl_pin_segment(dsm_handle handle, void *impl_private, void **impl_private_pm_handle)
Definition: dsm_impl.c:943
int errmsg(const char *fmt,...)
Definition: elog.c:784
dsm_op
Definition: dsm_impl.h:57
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:796
#define elog(elevel,...)
Definition: elog.h:226
#define PG_DYNSHMEM_MMAP_FILE_PREFIX
Definition: dsm_impl.h:51
#define ZBUFFER_SIZE
Definition: dsm_impl.c:117
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:99
#define close(a)
Definition: win32.h:12
#define EINTR
Definition: win32_port.h:323
static bool success
Definition: initdb.c:163
bool dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
Definition: dsm_impl.c:158
#define snprintf
Definition: port.h:192
#define IPC_STAT
Definition: win32_port.h:86
#define ftruncate(a, b)
Definition: win32_port.h:60