PostgreSQL Source Code git master
Loading...
Searching...
No Matches
shmem.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * shmem.c
4 * create shared memory and initialize shared memory data structures.
5 *
6 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/storage/ipc/shmem.c
12 *
13 *-------------------------------------------------------------------------
14 */
15/*
16 * POSTGRES processes share one or more regions of shared memory.
17 * The shared memory is created by a postmaster and is inherited
18 * by each backend via fork() (or, in some ports, via other OS-specific
19 * methods). The routines in this file are used for allocating and
20 * binding to shared memory data structures.
21 *
22 * This module provides facilities to allocate fixed-size structures in shared
23 * memory, for things like variables shared between all backend processes.
24 * Each such structure has a string name to identify it, specified when it is
25 * requested. shmem_hash.c provides a shared hash table implementation on top
26 * of that.
27 *
28 * Shared memory areas should usually not be allocated after postmaster
29 * startup, although we do allow small allocations later for the benefit of
30 * extension modules that are loaded after startup. Despite that allowance,
31 * extensions that need shared memory should be added in
32 * shared_preload_libraries, because the allowance is quite small and there is
33 * no guarantee that any memory is available after startup.
34 *
35 * Nowadays, there is also another way to allocate shared memory called
36 * Dynamic Shared Memory. See dsm.c for that facility. One big difference
37 * between traditional shared memory handled by shmem.c and dynamic shared
38 * memory is that traditional shared memory areas are mapped to the same
39 * address in all processes, so you can use normal pointers in shared memory
40 * structs. With Dynamic Shared Memory, you must use offsets or DSA pointers
41 * instead.
42 *
43 * Shared memory managed by shmem.c can never be freed, once allocated. Each
44 * hash table has its own free list, so hash buckets can be reused when an
45 * item is deleted.
46 *
47 * Usage
48 * -----
49 *
50 * To allocate shared memory, you need to register a set of callback functions
51 * which handle the lifecycle of the allocation. In the request_fn callback,
52 * call ShmemRequestStruct() with the desired name and size. When the area is
53 * later allocated or attached to, the global variable pointed to by the .ptr
54 * option is set to the shared memory location of the allocation. The init_fn
55 * callback can perform additional initialization.
56 *
57 * typedef struct MyShmemData {
58 * ...
59 * } MyShmemData;
60 *
61 * static MyShmemData *MyShmem;
62 *
63 * static void my_shmem_request(void *arg);
64 * static void my_shmem_init(void *arg);
65 *
66 * const ShmemCallbacks MyShmemCallbacks = {
67 * .request_fn = my_shmem_request,
68 * .init_fn = my_shmem_init,
69 * };
70 *
71 * static void
72 * my_shmem_request(void *arg)
73 * {
74 * ShmemRequestStruct(.name = "My shmem area",
75 * .size = sizeof(MyShmemData),
76 * .ptr = (void **) &MyShmem,
77 * );
78 * }
79 *
80 * In builtin PostgreSQL code, add the callbacks to the list in
81 * src/include/storage/subsystemlist.h. In an add-in module, you can register
82 * the callbacks by calling RegisterShmemCallbacks(&MyShmemCallbacks) in the
83 * extension's _PG_init() function.
84 *
85 * Lifecycle
86 * ---------
87 *
88 * Initializing shared memory happens in multiple phases. In the first phase,
89 * during postmaster startup, all the request_fn callbacks are called. Only
90 * after all the request_fn callbacks have been called and all the shmem areas
91 * have been requested by the ShmemRequestStruct() calls do we know how much
92 * shared memory we need in total. After that, the postmaster allocates the
93 * global shared memory segment, and calls all the init_fn callbacks to
94 * initialize all the requested shmem areas.
95 *
96 * In standard Unix-ish environments, individual backends do not need to
97 * re-establish their local pointers into shared memory, because they inherit
98 * correct values of those variables via fork() from the postmaster. However,
99 * this does not work in the EXEC_BACKEND case. In ports using EXEC_BACKEND,
100 * backend startup also calls the shmem_request callbacks to re-establish the
101 * knowledge about each shared memory area, sets the pointer variables
102 * (*options->ptr), and calls the attach_fn callback, if any, for additional
103 * per-backend setup.
104 *
105 * Legacy ShmemInitStruct()/ShmemInitHash() functions
106 * --------------------------------------------------
107 *
108 * ShmemInitStruct()/ShmemInitHash() is another way of registering shmem
109 * areas. It pre-dates the ShmemRequestStruct()/ShmemRequestHash() functions,
110 * and should not be used in new code, but as of this writing it is still
111 * widely used in extensions.
112 *
113 * To allocate a shmem area with ShmemInitStruct(), you need to separately
114 * register the size needed for the area by calling RequestAddinShmemSpace()
115 * from the extension's shmem_request_hook, and allocate the area by calling
116 * ShmemInitStruct() from the extension's shmem_startup_hook. There are no
117 * init/attach callbacks. Instead, the caller of ShmemInitStruct() must check
118 * the return status of ShmemInitStruct() and initialize the struct if it was
119 * not previously initialized.
120 *
121 * Calling ShmemAlloc() directly
122 * -----------------------------
123 *
124 * There's a more low-level way of allocating shared memory too: you can call
125 * ShmemAlloc() directly. It's used to implement the higher level mechanisms,
126 * and should generally not be called directly.
127 */
128
129#include "postgres.h"
130
131#include <unistd.h>
132
133#include "access/slru.h"
134#include "common/int.h"
135#include "fmgr.h"
136#include "funcapi.h"
137#include "miscadmin.h"
138#include "port/pg_bitutils.h"
139#include "port/pg_numa.h"
140#include "storage/lwlock.h"
141#include "storage/pg_shmem.h"
142#include "storage/shmem.h"
144#include "storage/spin.h"
145#include "utils/builtins.h"
146#include "utils/tuplestore.h"
147
148/*
149 * Registered callbacks.
150 *
151 * During postmaster startup, we accumulate the callbacks from all subsystems
152 * in this list.
153 *
154 * This is in process private memory, although on Unix-like systems, we expect
155 * all the registrations to happen at postmaster startup time and be inherited
156 * by all the child processes via fork().
157 */
159
160/*
161 * In the shmem request phase, all the shmem areas requested with the
162 * ShmemRequest*() functions are accumulated here.
163 */
169
171
172/*
173 * Per-process state machine, for sanity checking that we do things in the
174 * right order.
175 *
176 * Postmaster:
177 * INITIAL -> REQUESTING -> INITIALIZING -> DONE
178 *
179 * Backends in EXEC_BACKEND mode:
180 * INITIAL -> REQUESTING -> ATTACHING -> DONE
181 *
182 * Late request:
183 * DONE -> REQUESTING -> AFTER_STARTUP_ATTACH_OR_INIT -> DONE
184 */
186{
187 /* Initial state */
189
190 /*
191 * When we start calling the shmem_request callbacks, we enter the
192 * SRS_REQUESTING phase. All ShmemRequestStruct calls happen in this
193 * state.
194 */
196
197 /*
198 * Postmaster has finished all shmem requests, and is now initializing the
199 * shared memory segment. init_fn callbacks are called in this state.
200 */
202
203 /*
204 * A postmaster child process is starting up. attach_fn callbacks are
205 * called in this state.
206 */
208
209 /* An after-startup allocation or attachment is in progress */
211
212 /* Normal state after shmem initialization / attachment */
214};
216
217/*
218 * This is the first data structure stored in the shared memory segment, at
219 * the offset that PGShmemHeader->content_offset points to. Allocations by
220 * ShmemAlloc() are carved out of the space after this.
221 *
222 * For the base pointer and the total size of the shmem segment, we rely on
223 * the PGShmemHeader.
224 */
225typedef struct ShmemAllocatorData
226{
227 Size free_offset; /* offset to first free space from ShmemBase */
228
229 /* protects 'free_offset' */
231
232 HASHHDR *index; /* location of ShmemIndex */
233 size_t index_size; /* size of shmem region holding ShmemIndex */
234 LWLock index_lock; /* protects ShmemIndex */
236
237#define ShmemIndexLock (&ShmemAllocator->index_lock)
238
239static void *ShmemAllocRaw(Size size, Size alignment, Size *allocated_size);
240
241/* shared memory global variables */
242
243static PGShmemHeader *ShmemSegHdr; /* shared mem segment header */
244static void *ShmemBase; /* start address of shared memory */
245static void *ShmemEnd; /* end+1 address of shared memory */
246
248
249/*
250 * ShmemIndex is a global directory of shmem areas, itself also stored in the
251 * shared memory.
252 */
254
255 /* max size of data structure string name */
256#define SHMEM_INDEX_KEYSIZE (48)
257
258/*
259 * # of additional entries to reserve in the shmem index table, for
260 * allocations after postmaster startup. (This is not a hard limit, the hash
261 * table can grow larger than that if there is shared memory available)
262 */
263#define SHMEM_INDEX_ADDITIONAL_SIZE (128)
264
265/* this is a hash bucket in the shmem index table */
266typedef struct
267{
268 char key[SHMEM_INDEX_KEYSIZE]; /* string name */
269 void *location; /* location in shared mem */
270 Size size; /* # bytes requested for the structure */
271 Size allocated_size; /* # bytes actually allocated */
273
274/* To get reliable results for NUMA inquiry we need to "touch pages" once */
275static bool firstNumaTouch = true;
276
277static void CallShmemCallbacksAfterStartup(const ShmemCallbacks *callbacks);
279static bool AttachShmemIndexEntry(ShmemRequest *request, bool missing_ok);
280
282
283/*
284 * ShmemRequestStruct() --- request a named shared memory area
285 *
286 * Subsystems call this to register their shared memory needs. This is
287 * usually done early in postmaster startup, before the shared memory segment
288 * has been created, so that the size can be included in the estimate for
289 * total amount of shared memory needed. We set aside a small amount of
290 * memory for allocations that happen later, for the benefit of non-preloaded
291 * extensions, but that should not be relied upon.
292 *
293 * This does not yet allocate the memory, but merely registers the need for
294 * it. The actual allocation happens later in the postmaster startup
295 * sequence.
296 *
297 * This must be called from a shmem_request callback function, registered with
298 * RegisterShmemCallbacks(). This enforces a coding pattern that works the
299 * same in normal Unix systems and with EXEC_BACKEND. On Unix systems, the
300 * shmem_request callbacks are called once, early in postmaster startup, and
301 * the child processes inherit the struct descriptors and any other
302 * per-process state from the postmaster. In EXEC_BACKEND mode, shmem_request
303 * callbacks are *also* called in each backend, at backend startup, to
304 * re-establish the struct descriptors. By calling the same function in both
305 * cases, we ensure that all the shmem areas are registered the same way in
306 * all processes.
307 *
308 * 'options' defines the name and size of the area, and any other optional
309 * features. Leave unused options as zeros. The options are copied to
310 * longer-lived memory, so it doesn't need to live after the
311 * ShmemRequestStruct() call and can point to a local variable in the calling
312 * function. The 'name' must point to a long-lived string though, only the
313 * pointer to it is copied.
314 */
315void
326
327/*
328 * Internal workhorse of ShmemRequestStruct() and ShmemRequestHash().
329 *
330 * Note: Unlike in the public ShmemRequestStruct() and ShmemRequestHash()
331 * functions, 'options' is *not* copied. It must be allocated in
332 * TopMemoryContext by the caller, and will be freed after the init/attach
333 * callbacks have been called. This allows ShmemRequestHash() to pass a
334 * pointer to the extended ShmemHashOpts struct instead.
335 */
336void
338{
340
341 /* Check the options */
342 if (options->name == NULL)
343 elog(ERROR, "shared memory request is missing 'name' option");
344
346 {
347 if (options->size <= 0 && options->size != SHMEM_ATTACH_UNKNOWN_SIZE)
348 elog(ERROR, "invalid size %zd for shared memory request for \"%s\"",
349 options->size, options->name);
350 }
351 else
352 {
354 elog(ERROR, "SHMEM_ATTACH_UNKNOWN_SIZE cannot be used during startup");
355 if (options->size <= 0)
356 elog(ERROR, "invalid size %zd for shared memory request for \"%s\"",
357 options->size, options->name);
358 }
359
360 if (options->alignment != 0 && pg_nextpower2_size_t(options->alignment) != options->alignment)
361 elog(ERROR, "invalid alignment %zu for shared memory request for \"%s\"",
362 options->alignment, options->name);
363
364 /* Check that we're in the right state */
366 elog(ERROR, "ShmemRequestStruct can only be called from a shmem_request callback");
367
368 /* Check that it's not already registered in this process */
370 {
371 if (strcmp(existing->options->name, options->name) == 0)
373 (errmsg("shared memory struct \"%s\" is already registered",
374 options->name)));
375 }
376
377 /* Request looks valid, remember it */
378 request = palloc(sizeof(ShmemRequest));
379 request->options = options;
380 request->kind = kind;
382}
383
384/*
385 * ShmemGetRequestedSize() --- estimate the total size of all registered shared
386 * memory structures.
387 *
388 * This is called at postmaster startup, before the shared memory segment has
389 * been created.
390 */
391size_t
393{
394 size_t size;
395
396 /* memory needed for the ShmemIndex */
398 sizeof(ShmemIndexEnt));
399 size = CACHELINEALIGN(size);
400
401 /* memory needed for all the requested areas */
403 {
404 size_t alignment = request->options->alignment;
405
406 /* pad the start address for alignment like ShmemAllocRaw() does */
407 if (alignment < PG_CACHE_LINE_SIZE)
408 alignment = PG_CACHE_LINE_SIZE;
409 size = TYPEALIGN(alignment, size);
410
411 size = add_size(size, request->options->size);
412 }
413
414 return size;
415}
416
417/*
418 * ShmemInitRequested() --- allocate and initialize requested shared memory
419 * structures.
420 *
421 * This is called once at postmaster startup, after the shared memory segment
422 * has been created.
423 */
424void
426{
427 /* should be called only by the postmaster or a standalone backend */
430
431 /*
432 * Initialize the ShmemIndex entries and perform basic initialization of
433 * all the requested memory areas. There are no concurrent processes yet,
434 * so no need for locking.
435 */
437 {
439 pfree(request->options);
440 }
443
444 /*
445 * Call the subsystem-specific init callbacks to finish initialization of
446 * all the areas.
447 */
449 {
450 if (callbacks->init_fn)
451 callbacks->init_fn(callbacks->opaque_arg);
452 }
453
455}
456
457/*
458 * Re-establish process private state related to shmem areas.
459 *
460 * This is called at backend startup in EXEC_BACKEND mode, in every backend.
461 */
462#ifdef EXEC_BACKEND
463void
465{
466 ListCell *lc;
467
468 /* Must be initializing a (non-standalone) backend */
473
475
476 /*
477 * Attach to all the requested memory areas.
478 */
480 {
482 pfree(request->options);
483 }
486
487 /* Call attach callbacks */
489 {
490 const ShmemCallbacks *callbacks = (const ShmemCallbacks *) lfirst(lc);
491
492 if (callbacks->attach_fn)
493 callbacks->attach_fn(callbacks->opaque_arg);
494 }
495
497
499}
500#endif
501
502/*
503 * Insert requested shmem area into the shared memory index and initialize it.
504 *
505 * Note that this only performs basic initialization depending on
506 * ShmemRequestKind, like setting the global pointer variable to the area for
507 * SHMEM_KIND_STRUCT or setting up the backend-private HTAB control struct.
508 * This does *not* call the subsystem-specific init callbacks. That's done
509 * later after all the shmem areas have been initialized or attached to.
510 */
511static void
513{
514 const char *name = request->options->name;
516 bool found;
517 size_t allocated_size;
518 void *structPtr;
519
520 /* look it up in the shmem index */
523 if (found)
524 elog(ERROR, "shared memory struct \"%s\" is already initialized", name);
525 if (!index_entry)
526 {
527 /* tried to add it to the hash table, but there was no space */
530 errmsg("could not create ShmemIndex entry for data structure \"%s\"",
531 name)));
532 }
533
534 /*
535 * We inserted the entry to the shared memory index. Allocate requested
536 * amount of shared memory for it, and initialize the index entry.
537 */
538 structPtr = ShmemAllocRaw(request->options->size,
539 request->options->alignment,
540 &allocated_size);
541 if (structPtr == NULL)
542 {
543 /* out of memory; remove the failed ShmemIndex entry */
547 errmsg("not enough shared memory for data structure"
548 " \"%s\" (%zu bytes requested)",
549 name, request->options->size)));
550 }
551 index_entry->size = request->options->size;
552 index_entry->allocated_size = allocated_size;
553 index_entry->location = structPtr;
554
555 /* Initialize depending on the kind of shmem area it is */
556 switch (request->kind)
557 {
559 if (request->options->ptr)
560 *(request->options->ptr) = index_entry->location;
561 break;
562 case SHMEM_KIND_HASH:
564 break;
565 case SHMEM_KIND_SLRU:
567 break;
568 }
569}
570
571/*
572 * Look up a named shmem area in the shared memory index and attach to it.
573 *
574 * Note that this only performs the basic attachment actions depending on
575 * ShmemRequestKind, like setting the global pointer variable to the area for
576 * SHMEM_KIND_STRUCT or setting up the backend-private HTAB control struct.
577 * This does *not* call the subsystem-specific attach callbacks. That's done
578 * later after all the shmem areas have been initialized or attached to.
579 */
580static bool
582{
583 const char *name = request->options->name;
585
586 /* Look it up in the shmem index */
589 if (!index_entry)
590 {
591 if (!missing_ok)
593 (errmsg("could not find ShmemIndex entry for data structure \"%s\"",
594 request->options->name)));
595 return false;
596 }
597
598 /* Check that the size in the index matches the request */
599 if (index_entry->size != request->options->size &&
600 request->options->size != SHMEM_ATTACH_UNKNOWN_SIZE)
601 {
603 (errmsg("shared memory struct \"%s\" was created with"
604 " different size: existing %zu, requested %zu",
605 name, index_entry->size, request->options->size)));
606 }
607
608 /*
609 * Re-establish the caller's pointer variable, or do other actions to
610 * attach depending on the kind of shmem area it is.
611 */
612 switch (request->kind)
613 {
615 if (request->options->ptr)
616 *(request->options->ptr) = index_entry->location;
617 break;
618 case SHMEM_KIND_HASH:
619 shmem_hash_attach(index_entry->location, request->options);
620 break;
621 case SHMEM_KIND_SLRU:
622 shmem_slru_attach(index_entry->location, request->options);
623 break;
624 }
625
626 return true;
627}
628
629/*
630 * InitShmemAllocator() --- set up basic pointers to shared memory.
631 *
632 * Called at postmaster or stand-alone backend startup, to initialize the
633 * allocator's data structure in the shared memory segment. In EXEC_BACKEND,
634 * this is also called at backend startup, to set up pointers to the
635 * already-initialized data structure.
636 */
637void
639{
640 Size offset;
642 HASHCTL info;
643 int hash_flags;
644
645#ifndef EXEC_BACKEND
647#endif
648 Assert(seghdr != NULL);
649
651 {
653 }
654 else
655 {
658 }
659
660 /*
661 * We assume the pointer and offset are MAXALIGN. Not a hard requirement,
662 * but it's true today and keeps the math below simpler.
663 */
664 Assert(seghdr == (void *) MAXALIGN(seghdr));
665 Assert(seghdr->content_offset == MAXALIGN(seghdr->content_offset));
666
667 /*
668 * Allocations after this point should go through ShmemAlloc, which
669 * expects to allocate everything on cache line boundaries. Make sure the
670 * first allocation begins on a cache line boundary.
671 */
672 offset = CACHELINEALIGN(seghdr->content_offset + sizeof(ShmemAllocatorData));
673 if (offset > seghdr->totalsize)
676 errmsg("out of shared memory (%zu bytes requested)",
677 offset)));
678
679 /*
680 * In postmaster or stand-alone backend, initialize the shared memory
681 * allocator so that we can allocate shared memory for ShmemIndex using
682 * ShmemAlloc(). In a regular backend just set up the pointers required
683 * by ShmemAlloc().
684 */
685 ShmemAllocator = (ShmemAllocatorData *) ((char *) seghdr + seghdr->content_offset);
687 {
689 ShmemAllocator->free_offset = offset;
691 }
692
695 ShmemEnd = (char *) ShmemBase + seghdr->totalsize;
696
697 /*
698 * Create (or attach to) the shared memory index of shmem areas.
699 *
700 * This is the same initialization as ShmemInitHash() does, but we cannot
701 * use ShmemInitHash() here because it relies on ShmemIndex being already
702 * initialized.
703 */
705
707 info.entrysize = sizeof(ShmemIndexEnt);
708 hash_flags = HASH_ELEM | HASH_STRINGS | HASH_FIXED_SIZE;
709
711 {
714 }
718 "ShmemIndex", hash_nelems,
719 &info, hash_flags);
721
722 /*
723 * Add an entry for ShmemIndex itself into ShmemIndex, so that it's
724 * visible in the pg_shmem_allocations view
725 */
727 {
728 bool found;
730 hash_search(ShmemIndex, "ShmemIndex", HASH_ENTER, &found);
731
732 Assert(!found);
734 result->allocated_size = ShmemAllocator->index_size;
735 result->location = ShmemAllocator->index;
736 }
737}
738
739/*
740 * Reset state on postmaster crash restart.
741 */
742void
744{
747
749
750 /*
751 * Note that we don't clear the registered callbacks. We will need to
752 * call them again as we restart
753 */
754}
755
756/*
757 * ShmemAlloc -- allocate max-aligned chunk from shared memory
758 *
759 * Throws error if request cannot be satisfied.
760 *
761 * Assumes ShmemSegHdr is initialized.
762 */
763void *
765{
766 void *newSpace;
767 Size allocated_size;
768
769 newSpace = ShmemAllocRaw(size, 0, &allocated_size);
770 if (!newSpace)
773 errmsg("out of shared memory (%zu bytes requested)",
774 size)));
775 return newSpace;
776}
777
778/*
779 * ShmemAllocNoError -- allocate max-aligned chunk from shared memory
780 *
781 * As ShmemAlloc, but returns NULL if out of space, rather than erroring.
782 */
783void *
785{
786 Size allocated_size;
787
788 return ShmemAllocRaw(size, 0, &allocated_size);
789}
790
791/*
792 * ShmemAllocRaw -- allocate aligned chunk and return allocated size
793 *
794 * Also sets *allocated_size to the number of bytes allocated, which will
795 * be equal to the number requested plus any padding we choose to add.
796 */
797static void *
798ShmemAllocRaw(Size size, Size alignment, Size *allocated_size)
799{
803 void *newSpace;
804
805 /*
806 * Ensure all space is adequately aligned. We used to only MAXALIGN this
807 * space but experience has proved that on modern systems that is not good
808 * enough. Many parts of the system are very sensitive to critical data
809 * structures getting split across cache line boundaries. To avoid that,
810 * attempt to align the beginning of the allocation to a cache line
811 * boundary. The calling code will still need to be careful about how it
812 * uses the allocated space - e.g. by padding each element in an array of
813 * structures out to a power-of-two size - but without this, even that
814 * won't be sufficient.
815 */
816 if (alignment < PG_CACHE_LINE_SIZE)
817 alignment = PG_CACHE_LINE_SIZE;
818
820
822
824 newStart = TYPEALIGN(alignment, rawStart);
825
826 newFree = newStart + size;
827 if (newFree <= ShmemSegHdr->totalsize)
828 {
829 newSpace = (char *) ShmemBase + newStart;
831 }
832 else
833 newSpace = NULL;
834
836
837 /* note this assert is okay with newSpace == NULL */
838 Assert(newSpace == (void *) TYPEALIGN(alignment, newSpace));
839
840 *allocated_size = newFree - rawStart;
841 return newSpace;
842}
843
844/*
845 * ShmemAddrIsValid -- test if an address refers to shared memory
846 *
847 * Returns true if the pointer points within the shared memory segment.
848 */
849bool
850ShmemAddrIsValid(const void *addr)
851{
852 return (addr >= ShmemBase) && (addr < ShmemEnd);
853}
854
855/*
856 * Register callbacks that define a shared memory area (or multiple areas).
857 *
858 * The system will call the callbacks at different stages of postmaster or
859 * backend startup, to allocate and initialize the area.
860 *
861 * This is normally called early during postmaster startup, but if the
862 * SHMEM_CALLBACKS_ALLOW_AFTER_STARTUP flag is set, this can also be used after
863 * startup, although after startup there's no guarantee that there's enough
864 * shared memory available. When called after startup, this immediately calls
865 * the right callbacks depending on whether another backend had already
866 * initialized the area.
867 *
868 * Note: In EXEC_BACKEND mode, this needs to be called in every backend
869 * process. That's needed because we cannot pass down the callback function
870 * pointers from the postmaster process, because different processes may have
871 * loaded libraries to different addresses.
872 */
873void
875{
877 {
878 /*
879 * After-startup initialization or attachment. Call the appropriate
880 * callbacks immediately.
881 */
882 if ((callbacks->flags & SHMEM_CALLBACKS_ALLOW_AFTER_STARTUP) == 0)
883 elog(ERROR, "cannot request shared memory at this time");
884
886 }
887 else
888 {
889 /* Remember the callbacks for later */
891 (void *) callbacks);
892 }
893}
894
895/*
896 * Register a shmem area (or multiple areas) after startup.
897 */
898static void
900{
901 bool found_any;
902 bool notfound_any;
903
906
907 /*
908 * Call the request callback first. The callback makes ShmemRequest*()
909 * calls for each shmem area, adding them to pending_shmem_requests.
910 */
912 if (callbacks->request_fn)
913 callbacks->request_fn(callbacks->opaque_arg);
915
917 {
919 return;
920 }
921
922 /* Hold ShmemIndexLock while we allocate all the shmem entries */
924
925 /*
926 * Check if the requested shared memory areas have already been
927 * initialized. We assume all the areas requested by the request callback
928 * to form a coherent unit such that they're all already initialized or
929 * none. Otherwise it would be ambiguous which callback, init or attach,
930 * to call afterwards.
931 */
932 found_any = notfound_any = false;
934 {
935 if (hash_search(ShmemIndex, request->options->name, HASH_FIND, NULL))
936 found_any = true;
937 else
938 notfound_any = true;
939 }
940 if (found_any && notfound_any)
941 elog(ERROR, "found some but not all");
942
943 /*
944 * Allocate or attach all the shmem areas requested by the request_fn
945 * callback.
946 */
948 {
949 if (found_any)
951 else
953
954 pfree(request->options);
955 }
958
959 /* Finish by calling the appropriate subsystem-specific callback */
960 if (found_any)
961 {
962 if (callbacks->attach_fn)
963 callbacks->attach_fn(callbacks->opaque_arg);
964 }
965 else
966 {
967 if (callbacks->init_fn)
968 callbacks->init_fn(callbacks->opaque_arg);
969 }
970
973}
974
975/*
976 * Call all shmem request callbacks.
977 */
978void
980{
981 ListCell *lc;
982
985
987 {
988 const ShmemCallbacks *callbacks = (const ShmemCallbacks *) lfirst(lc);
989
990 if (callbacks->request_fn)
991 callbacks->request_fn(callbacks->opaque_arg);
992 }
993}
994
995/*
996 * ShmemInitStruct -- Create/attach to a structure in shared memory.
997 *
998 * This is called during initialization to find or allocate
999 * a data structure in shared memory. If no other process
1000 * has created the structure, this routine allocates space
1001 * for it. If it exists already, a pointer to the existing
1002 * structure is returned.
1003 *
1004 * Returns: pointer to the object. *foundPtr is set true if the object was
1005 * already in the shmem index (hence, already initialized).
1006 *
1007 * Note: This is a legacy interface, kept for backwards compatibility with
1008 * extensions. Use ShmemRequestStruct() in new code!
1009 */
1010void *
1011ShmemInitStruct(const char *name, Size size, bool *foundPtr)
1012{
1013 void *ptr = NULL;
1015 .name = name,
1016 .size = size,
1017 .ptr = &ptr,
1018 };
1020
1024
1026
1027 /*
1028 * During postmaster startup, look up the existing entry if any.
1029 */
1030 *foundPtr = false;
1033
1034 /* Initialize it if not found */
1035 if (!*foundPtr)
1037
1039
1040 Assert(ptr != NULL);
1041 return ptr;
1042}
1043
1044/*
1045 * Add two Size values, checking for overflow
1046 */
1047Size
1049{
1050 Size result;
1051
1053 ereport(ERROR,
1055 errmsg("requested shared memory size overflows size_t")));
1056 return result;
1057}
1058
1059/*
1060 * Multiply two Size values, checking for overflow
1061 */
1062Size
1064{
1065 Size result;
1066
1068 ereport(ERROR,
1070 errmsg("requested shared memory size overflows size_t")));
1071 return result;
1072}
1073
1074/* SQL SRF showing allocated shared memory */
1075Datum
1077{
1078#define PG_GET_SHMEM_SIZES_COLS 4
1079 ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
1084 bool nulls[PG_GET_SHMEM_SIZES_COLS];
1085
1086 InitMaterializedSRF(fcinfo, 0);
1087
1089
1091
1092 /* output all allocated entries */
1093 memset(nulls, 0, sizeof(nulls));
1094 while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
1095 {
1096 values[0] = CStringGetTextDatum(ent->key);
1097 values[1] = Int64GetDatum((char *) ent->location - (char *) ShmemSegHdr);
1098 values[2] = Int64GetDatum(ent->size);
1099 values[3] = Int64GetDatum(ent->allocated_size);
1100 named_allocated += ent->allocated_size;
1101
1102 tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
1103 values, nulls);
1104 }
1105
1106 /* output shared memory allocated but not counted via the shmem index */
1107 values[0] = CStringGetTextDatum("<anonymous>");
1108 nulls[1] = true;
1110 values[3] = values[2];
1111 tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
1112
1113 /* output as-of-yet unused shared memory */
1114 nulls[0] = true;
1116 nulls[1] = false;
1118 values[3] = values[2];
1119 tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
1120
1122
1123 return (Datum) 0;
1124}
1125
1126/*
1127 * SQL SRF showing NUMA memory nodes for allocated shared memory
1128 *
1129 * Compared to pg_get_shmem_allocations(), this function does not return
1130 * information about shared anonymous allocations and unused shared memory.
1131 */
1132Datum
1134{
1135#define PG_GET_SHMEM_NUMA_SIZES_COLS 3
1136 ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
1140 bool nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
1142 void **page_ptrs;
1143 int *pages_status;
1146 max_nodes;
1147 Size *nodes;
1148
1149 if (pg_numa_init() == -1)
1150 elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
1151
1152 InitMaterializedSRF(fcinfo, 0);
1153
1156
1157 /*
1158 * Shared memory allocations can vary in size and may not align with OS
1159 * memory page boundaries, while NUMA queries work on pages.
1160 *
1161 * To correctly map each allocation to NUMA nodes, we need to: 1.
1162 * Determine the OS memory page size. 2. Align each allocation's start/end
1163 * addresses to page boundaries. 3. Query NUMA node information for all
1164 * pages spanning the allocation.
1165 */
1167
1168 /*
1169 * Allocate memory for page pointers and status based on total shared
1170 * memory size. This simplified approach allocates enough space for all
1171 * pages in shared memory rather than calculating the exact requirements
1172 * for each segment.
1173 *
1174 * Add 1, because we don't know how exactly the segments align to OS
1175 * pages, so the allocation might use one more memory page. In practice
1176 * this is not very likely, and moreover we have more entries, each of
1177 * them using only a fraction of the total pages.
1178 */
1182
1183 if (firstNumaTouch)
1184 elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts");
1185
1187
1189
1190 /* output all allocated entries */
1191 while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
1192 {
1193 int i;
1194 char *startptr,
1195 *endptr;
1196 Size total_len;
1197
1198 /*
1199 * Calculate the range of OS pages used by this segment. The segment
1200 * may start / end half-way through a page, we want to count these
1201 * pages too. So we align the start/end pointers down/up, and then
1202 * calculate the number of pages from that.
1203 */
1204 startptr = (char *) TYPEALIGN_DOWN(os_page_size, ent->location);
1205 endptr = (char *) TYPEALIGN(os_page_size,
1206 (char *) ent->location + ent->allocated_size);
1207 total_len = (endptr - startptr);
1208
1209 shm_ent_page_count = total_len / os_page_size;
1210
1211 /*
1212 * If we ever get 0xff (-1) back from kernel inquiry, then we probably
1213 * have a bug in mapping buffers to OS pages.
1214 */
1215 memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count);
1216
1217 /*
1218 * Setup page_ptrs[] with pointers to all OS pages for this segment,
1219 * and get the NUMA status using pg_numa_query_pages.
1220 *
1221 * In order to get reliable results we also need to touch memory
1222 * pages, so that inquiry about NUMA memory node doesn't return -2
1223 * (ENOENT, which indicates unmapped/unallocated pages).
1224 */
1225 for (i = 0; i < shm_ent_page_count; i++)
1226 {
1227 page_ptrs[i] = startptr + (i * os_page_size);
1228
1229 if (firstNumaTouch)
1231
1233 }
1234
1236 elog(ERROR, "failed NUMA pages inquiry status: %m");
1237
1238 /* Count number of NUMA nodes used for this shared memory entry */
1239 memset(nodes, 0, sizeof(Size) * (max_nodes + 2));
1240
1241 for (i = 0; i < shm_ent_page_count; i++)
1242 {
1243 int s = pages_status[i];
1244
1245 /* Ensure we are adding only valid index to the array */
1246 if (s >= 0 && s <= max_nodes)
1247 {
1248 /* valid NUMA node */
1249 nodes[s]++;
1250 continue;
1251 }
1252 else if (s == -2)
1253 {
1254 /* -2 means ENOENT (e.g. page was moved to swap) */
1255 nodes[max_nodes + 1]++;
1256 continue;
1257 }
1258
1259 elog(ERROR, "invalid NUMA node id outside of allowed range "
1260 "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
1261 }
1262
1263 /* no NULLs for regular nodes */
1264 memset(nulls, 0, sizeof(nulls));
1265
1266 /*
1267 * Add one entry for each NUMA node, including those without allocated
1268 * memory for this segment.
1269 */
1270 for (i = 0; i <= max_nodes; i++)
1271 {
1272 values[0] = CStringGetTextDatum(ent->key);
1273 values[1] = Int32GetDatum(i);
1275
1276 tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
1277 values, nulls);
1278 }
1279
1280 /* The last entry is used for pages without a NUMA node. */
1281 nulls[1] = true;
1282 values[0] = CStringGetTextDatum(ent->key);
1284
1285 tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
1286 values, nulls);
1287 }
1288
1290 firstNumaTouch = false;
1291
1292 return (Datum) 0;
1293}
1294
1295/*
1296 * Determine the memory page size used for the shared memory segment.
1297 *
1298 * If the shared segment was allocated using huge pages, returns the size of
1299 * a huge page. Otherwise returns the size of regular memory page.
1300 *
1301 * This should be used only after the server is started.
1302 */
1303Size
1305{
1307#ifdef WIN32
1309
1311 os_page_size = sysinfo.dwPageSize;
1312#else
1314#endif
1315
1318
1321
1322 return os_page_size;
1323}
1324
1325Datum
static Datum values[MAXATTR]
Definition bootstrap.c:190
#define CStringGetTextDatum(s)
Definition builtins.h:98
#define CACHELINEALIGN(LEN)
Definition c.h:899
#define MAXALIGN(LEN)
Definition c.h:896
#define TYPEALIGN(ALIGNVAL, LEN)
Definition c.h:889
#define Assert(condition)
Definition c.h:943
int64_t int64
Definition c.h:621
#define UINT64_FORMAT
Definition c.h:635
uint64_t uint64
Definition c.h:625
size_t Size
Definition c.h:689
#define TYPEALIGN_DOWN(ALIGNVAL, LEN)
Definition c.h:901
uint32 result
memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets))
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition dynahash.c:889
Size hash_estimate_size(int64 num_entries, Size entrysize)
Definition dynahash.c:763
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition dynahash.c:1352
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition dynahash.c:1317
int errcode(int sqlerrcode)
Definition elog.c:874
#define DEBUG1
Definition elog.h:31
#define ERROR
Definition elog.h:40
#define elog(elevel,...)
Definition elog.h:228
#define ereport(elevel,...)
Definition elog.h:152
#define palloc_array(type, count)
Definition fe_memutils.h:76
#define palloc0_array(type, count)
Definition fe_memutils.h:77
#define PG_FUNCTION_ARGS
Definition fmgr.h:193
#define PG_RETURN_BOOL(x)
Definition fmgr.h:360
void InitMaterializedSRF(FunctionCallInfo fcinfo, uint32 flags)
Definition funcapi.c:76
bool IsUnderPostmaster
Definition globals.c:122
int huge_pages_status
Definition guc_tables.c:610
#define HASH_STRINGS
Definition hsearch.h:91
@ HASH_FIND
Definition hsearch.h:108
@ HASH_REMOVE
Definition hsearch.h:110
@ HASH_ENTER
Definition hsearch.h:109
@ HASH_ENTER_NULL
Definition hsearch.h:111
#define HASH_ELEM
Definition hsearch.h:90
#define HASH_FIXED_SIZE
Definition hsearch.h:100
static bool pg_mul_size_overflow(size_t a, size_t b, size_t *result)
Definition int.h:642
static bool pg_add_size_overflow(size_t a, size_t b, size_t *result)
Definition int.h:608
int i
Definition isn.c:77
List * lappend(List *list, void *datum)
Definition list.c:339
void list_free_deep(List *list)
Definition list.c:1560
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1150
void LWLockRelease(LWLock *lock)
Definition lwlock.c:1767
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition lwlock.c:670
@ LW_SHARED
Definition lwlock.h:105
@ LW_EXCLUSIVE
Definition lwlock.h:104
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition mcxt.c:1232
void pfree(void *pointer)
Definition mcxt.c:1616
MemoryContext TopMemoryContext
Definition mcxt.c:166
void * palloc(Size size)
Definition mcxt.c:1387
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:125
static char * errmsg
#define pg_nextpower2_size_t
#define PG_CACHE_LINE_SIZE
#define lfirst(lc)
Definition pg_list.h:172
static int list_length(const List *l)
Definition pg_list.h:152
#define NIL
Definition pg_list.h:68
#define foreach_ptr(type, var, lst)
Definition pg_list.h:501
PGDLLIMPORT int pg_numa_get_max_node(void)
Definition pg_numa.c:138
#define pg_numa_touch_mem_if_required(ptr)
Definition pg_numa.h:37
PGDLLIMPORT int pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status)
Definition pg_numa.c:132
PGDLLIMPORT int pg_numa_init(void)
Definition pg_numa.c:125
@ HUGE_PAGES_UNKNOWN
Definition pg_shmem.h:56
@ HUGE_PAGES_ON
Definition pg_shmem.h:54
static Datum Int64GetDatum(int64 X)
Definition postgres.h:413
uint64_t Datum
Definition postgres.h:70
static Datum Int32GetDatum(int32 X)
Definition postgres.h:212
static int fb(int x)
char * s1
char * s2
bool ShmemAddrIsValid(const void *addr)
Definition shmem.c:850
Datum pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
Definition shmem.c:1133
Datum pg_numa_available(PG_FUNCTION_ARGS)
Definition shmem.c:1326
void InitShmemAllocator(PGShmemHeader *seghdr)
Definition shmem.c:638
void ShmemRequestStructWithOpts(const ShmemStructOpts *options)
Definition shmem.c:316
static void * ShmemBase
Definition shmem.c:244
Datum pg_get_shmem_allocations(PG_FUNCTION_ARGS)
Definition shmem.c:1076
static void * ShmemEnd
Definition shmem.c:245
Size add_size(Size s1, Size s2)
Definition shmem.c:1048
Size pg_get_shmem_pagesize(void)
Definition shmem.c:1304
void RegisterShmemCallbacks(const ShmemCallbacks *callbacks)
Definition shmem.c:874
static List * registered_shmem_callbacks
Definition shmem.c:158
#define PG_GET_SHMEM_NUMA_SIZES_COLS
void * ShmemAllocNoError(Size size)
Definition shmem.c:784
Size mul_size(Size s1, Size s2)
Definition shmem.c:1063
void * ShmemAlloc(Size size)
Definition shmem.c:764
#define PG_GET_SHMEM_SIZES_COLS
void ShmemCallRequestCallbacks(void)
Definition shmem.c:979
static void * ShmemAllocRaw(Size size, Size alignment, Size *allocated_size)
Definition shmem.c:798
static void InitShmemIndexEntry(ShmemRequest *request)
Definition shmem.c:512
static void CallShmemCallbacksAfterStartup(const ShmemCallbacks *callbacks)
Definition shmem.c:899
void ShmemInitRequested(void)
Definition shmem.c:425
static PGShmemHeader * ShmemSegHdr
Definition shmem.c:243
void ShmemRequestInternal(ShmemStructOpts *options, ShmemRequestKind kind)
Definition shmem.c:337
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition shmem.c:1011
shmem_request_state
Definition shmem.c:186
@ SRS_INITIALIZING
Definition shmem.c:201
@ SRS_DONE
Definition shmem.c:213
@ SRS_ATTACHING
Definition shmem.c:207
@ SRS_INITIAL
Definition shmem.c:188
@ SRS_AFTER_STARTUP_ATTACH_OR_INIT
Definition shmem.c:210
@ SRS_REQUESTING
Definition shmem.c:195
static HTAB * ShmemIndex
Definition shmem.c:253
static List * pending_shmem_requests
Definition shmem.c:170
#define SHMEM_INDEX_ADDITIONAL_SIZE
Definition shmem.c:263
static ShmemAllocatorData * ShmemAllocator
Definition shmem.c:247
#define ShmemIndexLock
Definition shmem.c:237
static bool AttachShmemIndexEntry(ShmemRequest *request, bool missing_ok)
Definition shmem.c:581
size_t ShmemGetRequestedSize(void)
Definition shmem.c:392
static bool firstNumaTouch
Definition shmem.c:275
void ResetShmemAllocator(void)
Definition shmem.c:743
#define SHMEM_INDEX_KEYSIZE
Definition shmem.c:256
#define SHMEM_ATTACH_UNKNOWN_SIZE
Definition shmem.h:69
#define SHMEM_CALLBACKS_ALLOW_AFTER_STARTUP
Definition shmem.h:167
HTAB * shmem_hash_create(void *location, size_t size, bool found, const char *name, int64 nelems, HASHCTL *infoP, int hash_flags)
Definition shmem_hash.c:149
void shmem_hash_attach(void *location, ShmemStructOpts *base_options)
Definition shmem_hash.c:79
void shmem_hash_init(void *location, ShmemStructOpts *base_options)
Definition shmem_hash.c:63
ShmemRequestKind
@ SHMEM_KIND_SLRU
@ SHMEM_KIND_HASH
@ SHMEM_KIND_STRUCT
void shmem_slru_init(void *location, ShmemStructOpts *base_options)
Definition slru.c:267
void shmem_slru_attach(void *location, ShmemStructOpts *base_options)
Definition slru.c:359
static void SpinLockRelease(volatile slock_t *lock)
Definition spin.h:62
static void SpinLockAcquire(volatile slock_t *lock)
Definition spin.h:56
static void SpinLockInit(volatile slock_t *lock)
Definition spin.h:50
Size keysize
Definition hsearch.h:69
Size entrysize
Definition hsearch.h:70
Definition pg_list.h:54
Size totalsize
Definition pg_shmem.h:34
HASHHDR * index
Definition shmem.c:232
LWLock index_lock
Definition shmem.c:234
slock_t shmem_lock
Definition shmem.c:230
size_t index_size
Definition shmem.c:233
ShmemRequestCallback request_fn
Definition shmem.h:133
ShmemInitCallback init_fn
Definition shmem.h:139
void * opaque_arg
Definition shmem.h:153
ShmemAttachCallback attach_fn
Definition shmem.h:147
void * location
Definition shmem.c:269
Size size
Definition shmem.c:270
Size allocated_size
Definition shmem.c:271
ShmemRequestKind kind
Definition shmem.c:167
ShmemStructOpts * options
Definition shmem.c:166
void GetHugePageSize(Size *hugepagesize, int *mmap_flags)
Definition sysv_shmem.c:480
void tuplestore_putvalues(Tuplestorestate *state, TupleDesc tdesc, const Datum *values, const bool *isnull)
Definition tuplestore.c:785
const char * name