PostgreSQL Source Code git master
Loading...
Searching...
No Matches
shmem.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * shmem.c
4 * create shared memory and initialize shared memory data structures.
5 *
6 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/storage/ipc/shmem.c
12 *
13 *-------------------------------------------------------------------------
14 */
15/*
16 * POSTGRES processes share one or more regions of shared memory.
17 * The shared memory is created by a postmaster and is inherited
18 * by each backend via fork() (or, in some ports, via other OS-specific
19 * methods). The routines in this file are used for allocating and
20 * binding to shared memory data structures.
21 *
22 * This module provides facilities to allocate fixed-size structures in shared
23 * memory, for things like variables shared between all backend processes.
24 * Each such structure has a string name to identify it, specified when it is
25 * requested. shmem_hash.c provides a shared hash table implementation on top
26 * of that.
27 *
28 * Shared memory areas should usually not be allocated after postmaster
29 * startup, although we do allow small allocations later for the benefit of
30 * extension modules that are loaded after startup. Despite that allowance,
31 * extensions that need shared memory should be added in
32 * shared_preload_libraries, because the allowance is quite small and there is
33 * no guarantee that any memory is available after startup.
34 *
35 * Nowadays, there is also another way to allocate shared memory called
36 * Dynamic Shared Memory. See dsm.c for that facility. One big difference
37 * between traditional shared memory handled by shmem.c and dynamic shared
38 * memory is that traditional shared memory areas are mapped to the same
39 * address in all processes, so you can use normal pointers in shared memory
40 * structs. With Dynamic Shared Memory, you must use offsets or DSA pointers
41 * instead.
42 *
43 * Shared memory managed by shmem.c can never be freed, once allocated. Each
44 * hash table has its own free list, so hash buckets can be reused when an
45 * item is deleted.
46 *
47 * Usage
48 * -----
49 *
50 * To allocate shared memory, you need to register a set of callback functions
51 * which handle the lifecycle of the allocation. In the request_fn callback,
52 * call ShmemRequestStruct() with the desired name and size. When the area is
53 * later allocated or attached to, the global variable pointed to by the .ptr
54 * option is set to the shared memory location of the allocation. The init_fn
55 * callback can perform additional initialization.
56 *
57 * typedef struct MyShmemData {
58 * ...
59 * } MyShmemData;
60 *
61 * static MyShmemData *MyShmem;
62 *
63 * static void my_shmem_request(void *arg);
64 * static void my_shmem_init(void *arg);
65 *
66 * const ShmemCallbacks MyShmemCallbacks = {
67 * .request_fn = my_shmem_request,
68 * .init_fn = my_shmem_init,
69 * };
70 *
71 * static void
72 * my_shmem_request(void *arg)
73 * {
74 * ShmemRequestStruct(.name = "My shmem area",
75 * .size = sizeof(MyShmemData),
76 * .ptr = (void **) &MyShmem,
77 * );
78 * }
79 *
80 * In builtin PostgreSQL code, add the callbacks to the list in
81 * src/include/storage/subsystemlist.h. In an add-in module, you can register
82 * the callbacks by calling RegisterShmemCallbacks(&MyShmemCallbacks) in the
83 * extension's _PG_init() function.
84 *
85 * Lifecycle
86 * ---------
87 *
88 * Initializing shared memory happens in multiple phases. In the first phase,
89 * during postmaster startup, all the request_fn callbacks are called. Only
90 * after all the request_fn callbacks have been called, and all the shmem areas
91 * have been requested by the ShmemRequestStruct() calls, do we know how much
92 * shared memory we need in total. After that, the postmaster allocates the
93 * global shared memory segment, and calls all the init_fn callbacks to
94 * initialize all the requested shmem areas.
95 *
96 * In standard Unix-ish environments, individual backends do not need to
97 * re-establish their local pointers into shared memory, because they inherit
98 * correct values of those variables via fork() from the postmaster. However,
99 * this does not work in the EXEC_BACKEND case. In ports using EXEC_BACKEND,
100 * backend startup also calls the shmem_request callbacks to re-establish the
101 * knowledge about each shared memory area, sets the pointer variables
102 * (*options->ptr), and calls the attach_fn callback, if any, for additional
103 * per-backend setup.
104 *
105 * Legacy ShmemInitStruct()/ShmemInitHash() functions
106 * --------------------------------------------------
107 *
108 * ShmemInitStruct()/ShmemInitHash() is another way of registering shmem
109 * areas. It pre-dates the ShmemRequestStruct()/ShmemRequestHash() functions,
110 * and should not be used in new code, but as of this writing it is still
111 * widely used in extensions.
112 *
113 * To allocate a shmem area with ShmemInitStruct(), you need to separately
114 * register the size needed for the area by calling RequestAddinShmemSpace()
115 * from the extension's shmem_request_hook, and allocate the area by calling
116 * ShmemInitStruct() from the extension's shmem_startup_hook. There are no
117 * init/attach callbacks. Instead, the caller of ShmemInitStruct() must check
118 * the return status of ShmemInitStruct() and initialize the struct if it was
119 * not previously initialized.
120 *
121 * Calling ShmemAlloc() directly
122 * -----------------------------
123 *
124 * There's a more low-level way of allocating shared memory too: you can call
125 * ShmemAlloc() directly. It's used to implement the higher level mechanisms,
126 * and should generally not be called directly.
127 */
128
129#include "postgres.h"
130
131#include <unistd.h>
132
133#include "access/slru.h"
134#include "fmgr.h"
135#include "funcapi.h"
136#include "miscadmin.h"
137#include "port/pg_bitutils.h"
138#include "port/pg_numa.h"
139#include "storage/lwlock.h"
140#include "storage/pg_shmem.h"
141#include "storage/shmem.h"
143#include "storage/spin.h"
144#include "utils/builtins.h"
145#include "utils/tuplestore.h"
146
147/*
148 * Registered callbacks.
149 *
150 * During postmaster startup, we accumulate the callbacks from all subsystems
151 * in this list.
152 *
153 * This is in process private memory, although on Unix-like systems, we expect
154 * all the registrations to happen at postmaster startup time and be inherited
155 * by all the child processes via fork().
156 */
158
159/*
160 * In the shmem request phase, all the shmem areas requested with the
161 * ShmemRequest*() functions are accumulated here.
162 */
168
170
171/*
172 * Per-process state machine, for sanity checking that we do things in the
173 * right order.
174 *
175 * Postmaster:
176 * INITIAL -> REQUESTING -> INITIALIZING -> DONE
177 *
178 * Backends in EXEC_BACKEND mode:
179 * INITIAL -> REQUESTING -> ATTACHING -> DONE
180 *
181 * Late request:
182 * DONE -> REQUESTING -> AFTER_STARTUP_ATTACH_OR_INIT -> DONE
183 */
185{
186 /* Initial state */
188
189 /*
190 * When we start calling the shmem_request callbacks, we enter the
191 * SRS_REQUESTING phase. All ShmemRequestStruct calls happen in this
192 * state.
193 */
195
196 /*
197 * Postmaster has finished all shmem requests, and is now initializing the
198 * shared memory segment. init_fn callbacks are called in this state.
199 */
201
202 /*
203 * A postmaster child process is starting up. attach_fn callbacks are
204 * called in this state.
205 */
207
208 /* An after-startup allocation or attachment is in progress */
210
211 /* Normal state after shmem initialization / attachment */
213};
215
216/*
217 * This is the first data structure stored in the shared memory segment, at
218 * the offset that PGShmemHeader->content_offset points to. Allocations by
219 * ShmemAlloc() are carved out of the space after this.
220 *
221 * For the base pointer and the total size of the shmem segment, we rely on
222 * the PGShmemHeader.
223 */
224typedef struct ShmemAllocatorData
225{
226 Size free_offset; /* offset to first free space from ShmemBase */
227
228 /* protects 'free_offset' */
230
231 HASHHDR *index; /* location of ShmemIndex */
232 size_t index_size; /* size of shmem region holding ShmemIndex */
233 LWLock index_lock; /* protects ShmemIndex */
235
236#define ShmemIndexLock (&ShmemAllocator->index_lock)
237
238static void *ShmemAllocRaw(Size size, Size alignment, Size *allocated_size);
239
240/* shared memory global variables */
241
242static PGShmemHeader *ShmemSegHdr; /* shared mem segment header */
243static void *ShmemBase; /* start address of shared memory */
244static void *ShmemEnd; /* end+1 address of shared memory */
245
247
248/*
249 * ShmemIndex is a global directory of shmem areas, itself also stored in the
250 * shared memory.
251 */
253
254 /* max size of data structure string name */
255#define SHMEM_INDEX_KEYSIZE (48)
256
257/*
258 * # of additional entries to reserve in the shmem index table, for
259 * allocations after postmaster startup. (This is not a hard limit, the hash
260 * table can grow larger than that if there is shared memory available)
261 */
262#define SHMEM_INDEX_ADDITIONAL_SIZE (128)
263
264/* this is a hash bucket in the shmem index table */
265typedef struct
266{
267 char key[SHMEM_INDEX_KEYSIZE]; /* string name */
268 void *location; /* location in shared mem */
269 Size size; /* # bytes requested for the structure */
270 Size allocated_size; /* # bytes actually allocated */
272
273/* To get reliable results for NUMA inquiry we need to "touch pages" once */
274static bool firstNumaTouch = true;
275
276static void CallShmemCallbacksAfterStartup(const ShmemCallbacks *callbacks);
278static bool AttachShmemIndexEntry(ShmemRequest *request, bool missing_ok);
279
281
282/*
283 * ShmemRequestStruct() --- request a named shared memory area
284 *
285 * Subsystems call this to register their shared memory needs. This is
286 * usually done early in postmaster startup, before the shared memory segment
287 * has been created, so that the size can be included in the estimate for
288 * total amount of shared memory needed. We set aside a small amount of
289 * memory for allocations that happen later, for the benefit of non-preloaded
290 * extensions, but that should not be relied upon.
291 *
292 * This does not yet allocate the memory, but merely registers the need for
293 * it. The actual allocation happens later in the postmaster startup
294 * sequence.
295 *
296 * This must be called from a shmem_request callback function, registered with
297 * RegisterShmemCallbacks(). This enforces a coding pattern that works the
298 * same in normal Unix systems and with EXEC_BACKEND. On Unix systems, the
299 * shmem_request callbacks are called once, early in postmaster startup, and
300 * the child processes inherit the struct descriptors and any other
301 * per-process state from the postmaster. In EXEC_BACKEND mode, shmem_request
302 * callbacks are *also* called in each backend, at backend startup, to
303 * re-establish the struct descriptors. By calling the same function in both
304 * cases, we ensure that all the shmem areas are registered the same way in
305 * all processes.
306 *
307 * 'options' defines the name and size of the area, and any other optional
308 * features. Leave unused options as zeros. The options are copied to
309 * longer-lived memory, so it doesn't need to live after the
310 * ShmemRequestStruct() call and can point to a local variable in the calling
311 * function. The 'name' must point to a long-lived string though, only the
312 * pointer to it is copied.
313 */
314void
325
326/*
327 * Internal workhorse of ShmemRequestStruct() and ShmemRequestHash().
328 *
329 * Note: Unlike in the public ShmemRequestStruct() and ShmemRequestHash()
330 * functions, 'options' is *not* copied. It must be allocated in
331 * TopMemoryContext by the caller, and will be freed after the init/attach
332 * callbacks have been called. This allows ShmemRequestHash() to pass a
333 * pointer to the extended ShmemHashOpts struct instead.
334 */
335void
337{
339
340 /* Check the options */
341 if (options->name == NULL)
342 elog(ERROR, "shared memory request is missing 'name' option");
343
345 {
346 if (options->size <= 0 && options->size != SHMEM_ATTACH_UNKNOWN_SIZE)
347 elog(ERROR, "invalid size %zd for shared memory request for \"%s\"",
348 options->size, options->name);
349 }
350 else
351 {
353 elog(ERROR, "SHMEM_ATTACH_UNKNOWN_SIZE cannot be used during startup");
354 if (options->size <= 0)
355 elog(ERROR, "invalid size %zd for shared memory request for \"%s\"",
356 options->size, options->name);
357 }
358
359 if (options->alignment != 0 && pg_nextpower2_size_t(options->alignment) != options->alignment)
360 elog(ERROR, "invalid alignment %zu for shared memory request for \"%s\"",
361 options->alignment, options->name);
362
363 /* Check that we're in the right state */
365 elog(ERROR, "ShmemRequestStruct can only be called from a shmem_request callback");
366
367 /* Check that it's not already registered in this process */
369 {
370 if (strcmp(existing->options->name, options->name) == 0)
372 (errmsg("shared memory struct \"%s\" is already registered",
373 options->name)));
374 }
375
376 /* Request looks valid, remember it */
377 request = palloc(sizeof(ShmemRequest));
378 request->options = options;
379 request->kind = kind;
381}
382
383/*
384 * ShmemGetRequestedSize() --- estimate the total size of all registered shared
385 * memory structures.
386 *
387 * This is called at postmaster startup, before the shared memory segment has
388 * been created.
389 */
390size_t
392{
393 size_t size;
394
395 /* memory needed for the ShmemIndex */
397 sizeof(ShmemIndexEnt));
398 size = CACHELINEALIGN(size);
399
400 /* memory needed for all the requested areas */
402 {
403 size_t alignment = request->options->alignment;
404
405 /* pad the start address for alignment like ShmemAllocRaw() does */
406 if (alignment < PG_CACHE_LINE_SIZE)
407 alignment = PG_CACHE_LINE_SIZE;
408 size = TYPEALIGN(alignment, size);
409
410 size = add_size(size, request->options->size);
411 }
412
413 return size;
414}
415
416/*
417 * ShmemInitRequested() --- allocate and initialize requested shared memory
418 * structures.
419 *
420 * This is called once at postmaster startup, after the shared memory segment
421 * has been created.
422 */
423void
425{
426 /* should be called only by the postmaster or a standalone backend */
429
430 /*
431 * Initialize the ShmemIndex entries and perform basic initialization of
432 * all the requested memory areas. There are no concurrent processes yet,
433 * so no need for locking.
434 */
436 {
438 pfree(request->options);
439 }
442
443 /*
444 * Call the subsystem-specific init callbacks to finish initialization of
445 * all the areas.
446 */
448 {
449 if (callbacks->init_fn)
450 callbacks->init_fn(callbacks->opaque_arg);
451 }
452
454}
455
456/*
457 * Re-establish process private state related to shmem areas.
458 *
459 * This is called at backend startup in EXEC_BACKEND mode, in every backend.
460 */
461#ifdef EXEC_BACKEND
462void
464{
465 ListCell *lc;
466
467 /* Must be initializing a (non-standalone) backend */
472
474
475 /*
476 * Attach to all the requested memory areas.
477 */
479 {
481 pfree(request->options);
482 }
485
486 /* Call attach callbacks */
488 {
489 const ShmemCallbacks *callbacks = (const ShmemCallbacks *) lfirst(lc);
490
491 if (callbacks->attach_fn)
492 callbacks->attach_fn(callbacks->opaque_arg);
493 }
494
496
498}
499#endif
500
501/*
502 * Insert requested shmem area into the shared memory index and initialize it.
503 *
504 * Note that this only performs basic initialization depending on
505 * ShmemRequestKind, like setting the global pointer variable to the area for
506 * SHMEM_KIND_STRUCT or setting up the backend-private HTAB control struct.
507 * This does *not* call the subsystem-specific init callbacks. That's done
508 * later after all the shmem areas have been initialized or attached to.
509 */
510static void
512{
513 const char *name = request->options->name;
515 bool found;
516 size_t allocated_size;
517 void *structPtr;
518
519 /* look it up in the shmem index */
522 if (found)
523 elog(ERROR, "shared memory struct \"%s\" is already initialized", name);
524 if (!index_entry)
525 {
526 /* tried to add it to the hash table, but there was no space */
529 errmsg("could not create ShmemIndex entry for data structure \"%s\"",
530 name)));
531 }
532
533 /*
534 * We inserted the entry to the shared memory index. Allocate requested
535 * amount of shared memory for it, and initialize the index entry.
536 */
537 structPtr = ShmemAllocRaw(request->options->size,
538 request->options->alignment,
539 &allocated_size);
540 if (structPtr == NULL)
541 {
542 /* out of memory; remove the failed ShmemIndex entry */
546 errmsg("not enough shared memory for data structure"
547 " \"%s\" (%zd bytes requested)",
548 name, request->options->size)));
549 }
550 index_entry->size = request->options->size;
551 index_entry->allocated_size = allocated_size;
552 index_entry->location = structPtr;
553
554 /* Initialize depending on the kind of shmem area it is */
555 switch (request->kind)
556 {
558 if (request->options->ptr)
559 *(request->options->ptr) = index_entry->location;
560 break;
561 case SHMEM_KIND_HASH:
563 break;
564 case SHMEM_KIND_SLRU:
566 break;
567 }
568}
569
570/*
571 * Look up a named shmem area in the shared memory index and attach to it.
572 *
573 * Note that this only performs the basic attachment actions depending on
574 * ShmemRequestKind, like setting the global pointer variable to the area for
575 * SHMEM_KIND_STRUCT or setting up the backend-private HTAB control struct.
576 * This does *not* call the subsystem-specific attach callbacks. That's done
577 * later after all the shmem areas have been initialized or attached to.
578 */
579static bool
581{
582 const char *name = request->options->name;
584
585 /* Look it up in the shmem index */
588 if (!index_entry)
589 {
590 if (!missing_ok)
592 (errmsg("could not find ShmemIndex entry for data structure \"%s\"",
593 request->options->name)));
594 return false;
595 }
596
597 /* Check that the size in the index matches the request */
598 if (index_entry->size != request->options->size &&
599 request->options->size != SHMEM_ATTACH_UNKNOWN_SIZE)
600 {
602 (errmsg("shared memory struct \"%s\" was created with"
603 " different size: existing %zu, requested %zd",
604 name, index_entry->size, request->options->size)));
605 }
606
607 /*
608 * Re-establish the caller's pointer variable, or do other actions to
609 * attach depending on the kind of shmem area it is.
610 */
611 switch (request->kind)
612 {
614 if (request->options->ptr)
615 *(request->options->ptr) = index_entry->location;
616 break;
617 case SHMEM_KIND_HASH:
618 shmem_hash_attach(index_entry->location, request->options);
619 break;
620 case SHMEM_KIND_SLRU:
621 shmem_slru_attach(index_entry->location, request->options);
622 break;
623 }
624
625 return true;
626}
627
628/*
629 * InitShmemAllocator() --- set up basic pointers to shared memory.
630 *
631 * Called at postmaster or stand-alone backend startup, to initialize the
632 * allocator's data structure in the shared memory segment. In EXEC_BACKEND,
633 * this is also called at backend startup, to set up pointers to the
634 * already-initialized data structure.
635 */
636void
638{
639 Size offset;
641 HASHCTL info;
642 int hash_flags;
643
644#ifndef EXEC_BACKEND
646#endif
647 Assert(seghdr != NULL);
648
650 {
652 }
653 else
654 {
657 }
658
659 /*
660 * We assume the pointer and offset are MAXALIGN. Not a hard requirement,
661 * but it's true today and keeps the math below simpler.
662 */
663 Assert(seghdr == (void *) MAXALIGN(seghdr));
664 Assert(seghdr->content_offset == MAXALIGN(seghdr->content_offset));
665
666 /*
667 * Allocations after this point should go through ShmemAlloc, which
668 * expects to allocate everything on cache line boundaries. Make sure the
669 * first allocation begins on a cache line boundary.
670 */
671 offset = CACHELINEALIGN(seghdr->content_offset + sizeof(ShmemAllocatorData));
672 if (offset > seghdr->totalsize)
675 errmsg("out of shared memory (%zu bytes requested)",
676 offset)));
677
678 /*
679 * In postmaster or stand-alone backend, initialize the shared memory
680 * allocator so that we can allocate shared memory for ShmemIndex using
681 * ShmemAlloc(). In a regular backend just set up the pointers required
682 * by ShmemAlloc().
683 */
684 ShmemAllocator = (ShmemAllocatorData *) ((char *) seghdr + seghdr->content_offset);
686 {
688 ShmemAllocator->free_offset = offset;
690 }
691
694 ShmemEnd = (char *) ShmemBase + seghdr->totalsize;
695
696 /*
697 * Create (or attach to) the shared memory index of shmem areas.
698 *
699 * This is the same initialization as ShmemInitHash() does, but we cannot
700 * use ShmemInitHash() here because it relies on ShmemIndex being already
701 * initialized.
702 */
704
706 info.entrysize = sizeof(ShmemIndexEnt);
707 hash_flags = HASH_ELEM | HASH_STRINGS | HASH_FIXED_SIZE;
708
710 {
713 }
717 "ShmemIndex", hash_nelems,
718 &info, hash_flags);
720
721 /*
722 * Add an entry for ShmemIndex itself into ShmemIndex, so that it's
723 * visible in the pg_shmem_allocations view
724 */
726 {
727 bool found;
729 hash_search(ShmemIndex, "ShmemIndex", HASH_ENTER, &found);
730
731 Assert(!found);
733 result->allocated_size = ShmemAllocator->index_size;
734 result->location = ShmemAllocator->index;
735 }
736}
737
738/*
739 * Reset state on postmaster crash restart.
740 */
741void
743{
746
748
749 /*
750 * Note that we don't clear the registered callbacks. We will need to
751 * call them again as we restart.
752 */
753}
754
755/*
756 * ShmemAlloc -- allocate max-aligned chunk from shared memory
757 *
758 * Throws error if request cannot be satisfied.
759 *
760 * Assumes ShmemSegHdr is initialized.
761 */
762void *
764{
765 void *newSpace;
766 Size allocated_size;
767
768 newSpace = ShmemAllocRaw(size, 0, &allocated_size);
769 if (!newSpace)
772 errmsg("out of shared memory (%zu bytes requested)",
773 size)));
774 return newSpace;
775}
776
777/*
778 * ShmemAllocNoError -- allocate max-aligned chunk from shared memory
779 *
780 * As ShmemAlloc, but returns NULL if out of space, rather than erroring.
781 */
782void *
784{
785 Size allocated_size;
786
787 return ShmemAllocRaw(size, 0, &allocated_size);
788}
789
790/*
791 * ShmemAllocRaw -- allocate aligned chunk and return allocated size
792 *
793 * Also sets *allocated_size to the number of bytes allocated, which will
794 * be equal to the number requested plus any padding we choose to add.
795 */
796static void *
797ShmemAllocRaw(Size size, Size alignment, Size *allocated_size)
798{
802 void *newSpace;
803
804 /*
805 * Ensure all space is adequately aligned. We used to only MAXALIGN this
806 * space but experience has proved that on modern systems that is not good
807 * enough. Many parts of the system are very sensitive to critical data
808 * structures getting split across cache line boundaries. To avoid that,
809 * attempt to align the beginning of the allocation to a cache line
810 * boundary. The calling code will still need to be careful about how it
811 * uses the allocated space - e.g. by padding each element in an array of
812 * structures out to a power-of-two size - but without this, even that
813 * won't be sufficient.
814 */
815 if (alignment < PG_CACHE_LINE_SIZE)
816 alignment = PG_CACHE_LINE_SIZE;
817
819
821
823 newStart = TYPEALIGN(alignment, rawStart);
824
825 newFree = newStart + size;
826 if (newFree <= ShmemSegHdr->totalsize)
827 {
828 newSpace = (char *) ShmemBase + newStart;
830 }
831 else
832 newSpace = NULL;
833
835
836 /* note this assert is okay with newSpace == NULL */
837 Assert(newSpace == (void *) TYPEALIGN(alignment, newSpace));
838
839 *allocated_size = newFree - rawStart;
840 return newSpace;
841}
842
843/*
844 * ShmemAddrIsValid -- test if an address refers to shared memory
845 *
846 * Returns true if the pointer points within the shared memory segment.
847 */
848bool
849ShmemAddrIsValid(const void *addr)
850{
851 return (addr >= ShmemBase) && (addr < ShmemEnd);
852}
853
854/*
855 * Register callbacks that define a shared memory area (or multiple areas).
856 *
857 * The system will call the callbacks at different stages of postmaster or
858 * backend startup, to allocate and initialize the area.
859 *
860 * This is normally called early during postmaster startup, but if the
861 * SHMEM_CALLBACKS_ALLOW_AFTER_STARTUP flag is set, this can also be used
862 * after startup, although after startup there's no guarantee that there's
863 * enough shared memory available. When called after startup, this
864 * immediately calls the right callbacks depending on whether another backend
865 * had already initialized the area.
866 *
867 * Note: In EXEC_BACKEND mode, this needs to be called in every backend
868 * process. That's needed because we cannot pass down the callback function
869 * pointers from the postmaster process, because different processes may have
870 * loaded libraries to different addresses.
871 */
872void
874{
876 {
877 /*
878 * After-startup initialization or attachment. Call the appropriate
879 * callbacks immediately.
880 */
881 if ((callbacks->flags & SHMEM_CALLBACKS_ALLOW_AFTER_STARTUP) == 0)
882 elog(ERROR, "cannot request shared memory at this time");
883
885 }
886 else
887 {
888 /* Remember the callbacks for later */
890 (void *) callbacks);
891 }
892}
893
894/*
895 * Register a shmem area (or multiple areas) after startup.
896 */
897static void
899{
900 bool found_any;
901 bool notfound_any;
902
905
906 /*
907 * Call the request callback first. The callback makes ShmemRequest*()
908 * calls for each shmem area, adding them to pending_shmem_requests.
909 */
911 if (callbacks->request_fn)
912 callbacks->request_fn(callbacks->opaque_arg);
914
916 {
918 return;
919 }
920
921 /* Hold ShmemIndexLock while we allocate all the shmem entries */
923
924 /*
925 * Check if the requested shared memory areas have already been
926 * initialized. We assume that all the areas requested by the request
927 * callback form a coherent unit, such that either all of them are already
928 * initialized or none are. Otherwise it would be ambiguous which callback,
929 * init or attach, to call afterwards.
930 */
931 found_any = notfound_any = false;
933 {
934 if (hash_search(ShmemIndex, request->options->name, HASH_FIND, NULL))
935 found_any = true;
936 else
937 notfound_any = true;
938 }
939 if (found_any && notfound_any)
940 elog(ERROR, "found some but not all");
941
942 /*
943 * Allocate or attach all the shmem areas requested by the request_fn
944 * callback.
945 */
947 {
948 if (found_any)
950 else
952
953 pfree(request->options);
954 }
957
958 /* Finish by calling the appropriate subsystem-specific callback */
959 if (found_any)
960 {
961 if (callbacks->attach_fn)
962 callbacks->attach_fn(callbacks->opaque_arg);
963 }
964 else
965 {
966 if (callbacks->init_fn)
967 callbacks->init_fn(callbacks->opaque_arg);
968 }
969
972}
973
974/*
975 * Call all shmem request callbacks.
976 */
977void
979{
980 ListCell *lc;
981
984
986 {
987 const ShmemCallbacks *callbacks = (const ShmemCallbacks *) lfirst(lc);
988
989 if (callbacks->request_fn)
990 callbacks->request_fn(callbacks->opaque_arg);
991 }
992}
993
994/*
995 * ShmemInitStruct -- Create/attach to a structure in shared memory.
996 *
997 * This is called during initialization to find or allocate
998 * a data structure in shared memory. If no other process
999 * has created the structure, this routine allocates space
1000 * for it. If it exists already, a pointer to the existing
1001 * structure is returned.
1002 *
1003 * Returns: pointer to the object. *foundPtr is set true if the object was
1004 * already in the shmem index (hence, already initialized).
1005 *
1006 * Note: This is a legacy interface, kept for backwards compatibility with
1007 * extensions. Use ShmemRequestStruct() in new code!
1008 */
1009void *
1010ShmemInitStruct(const char *name, Size size, bool *foundPtr)
1011{
1012 void *ptr = NULL;
1014 .name = name,
1015 .size = size,
1016 .ptr = &ptr,
1017 };
1019
1023
1025
1026 /*
1027 * During postmaster startup, look up the existing entry if any.
1028 */
1029 *foundPtr = false;
1032
1033 /* Initialize it if not found */
1034 if (!*foundPtr)
1036
1038
1039 Assert(ptr != NULL);
1040 return ptr;
1041}
1042
1043/* SQL SRF showing allocated shared memory */
1044Datum
1046{
1047#define PG_GET_SHMEM_SIZES_COLS 4
1048 ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
1053 bool nulls[PG_GET_SHMEM_SIZES_COLS];
1054
1055 InitMaterializedSRF(fcinfo, 0);
1056
1058
1060
1061 /* output all allocated entries */
1062 memset(nulls, 0, sizeof(nulls));
1063 while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
1064 {
1065 values[0] = CStringGetTextDatum(ent->key);
1066 values[1] = Int64GetDatum((char *) ent->location - (char *) ShmemSegHdr);
1067 values[2] = Int64GetDatum(ent->size);
1068 values[3] = Int64GetDatum(ent->allocated_size);
1069 named_allocated += ent->allocated_size;
1070
1071 tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
1072 values, nulls);
1073 }
1074
1075 /* output shared memory allocated but not counted via the shmem index */
1076 values[0] = CStringGetTextDatum("<anonymous>");
1077 nulls[1] = true;
1079 values[3] = values[2];
1080 tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
1081
1082 /* output as-of-yet unused shared memory */
1083 nulls[0] = true;
1085 nulls[1] = false;
1087 values[3] = values[2];
1088 tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
1089
1091
1092 return (Datum) 0;
1093}
1094
1095/*
1096 * SQL SRF showing NUMA memory nodes for allocated shared memory
1097 *
1098 * Compared to pg_get_shmem_allocations(), this function does not return
1099 * information about shared anonymous allocations and unused shared memory.
1100 */
1101Datum
1103{
1104#define PG_GET_SHMEM_NUMA_SIZES_COLS 3
1105 ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
1109 bool nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
1111 void **page_ptrs;
1112 int *pages_status;
1115 max_nodes;
1116 Size *nodes;
1117
1118 if (pg_numa_init() == -1)
1119 elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
1120
1121 InitMaterializedSRF(fcinfo, 0);
1122
1125
1126 /*
1127 * Shared memory allocations can vary in size and may not align with OS
1128 * memory page boundaries, while NUMA queries work on pages.
1129 *
1130 * To correctly map each allocation to NUMA nodes, we need to: 1.
1131 * Determine the OS memory page size. 2. Align each allocation's start/end
1132 * addresses to page boundaries. 3. Query NUMA node information for all
1133 * pages spanning the allocation.
1134 */
1136
1137 /*
1138 * Allocate memory for page pointers and status based on total shared
1139 * memory size. This simplified approach allocates enough space for all
1140 * pages in shared memory rather than calculating the exact requirements
1141 * for each segment.
1142 *
1143 * Add 1, because we don't know how exactly the segments align to OS
1144 * pages, so the allocation might use one more memory page. In practice
1145 * this is not very likely, and moreover we have more entries, each of
1146 * them using only a fraction of the total pages.
1147 */
1151
1152 if (firstNumaTouch)
1153 elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts");
1154
1156
1158
1159 /* output all allocated entries */
1160 while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
1161 {
1162 int i;
1163 char *startptr,
1164 *endptr;
1165 Size total_len;
1166
1167 /*
1168 * Calculate the range of OS pages used by this segment. The segment
1169 * may start / end half-way through a page, we want to count these
1170 * pages too. So we align the start/end pointers down/up, and then
1171 * calculate the number of pages from that.
1172 */
1173 startptr = (char *) TYPEALIGN_DOWN(os_page_size, ent->location);
1174 endptr = (char *) TYPEALIGN(os_page_size,
1175 (char *) ent->location + ent->allocated_size);
1176 total_len = (endptr - startptr);
1177
1178 shm_ent_page_count = total_len / os_page_size;
1179
1180 /*
1181 * If we ever get 0xff (-1) back from kernel inquiry, then we probably
1182 * have a bug in mapping buffers to OS pages.
1183 */
1184 memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count);
1185
1186 /*
1187 * Setup page_ptrs[] with pointers to all OS pages for this segment,
1188 * and get the NUMA status using pg_numa_query_pages.
1189 *
1190 * In order to get reliable results we also need to touch memory
1191 * pages, so that inquiry about NUMA memory node doesn't return -2
1192 * (ENOENT, which indicates unmapped/unallocated pages).
1193 */
1194 for (i = 0; i < shm_ent_page_count; i++)
1195 {
1196 page_ptrs[i] = startptr + (i * os_page_size);
1197
1198 if (firstNumaTouch)
1200
1202 }
1203
1205 elog(ERROR, "failed NUMA pages inquiry status: %m");
1206
1207 /* Count number of NUMA nodes used for this shared memory entry */
1208 memset(nodes, 0, sizeof(Size) * (max_nodes + 2));
1209
1210 for (i = 0; i < shm_ent_page_count; i++)
1211 {
1212 int s = pages_status[i];
1213
1214 /* Ensure we are adding only valid index to the array */
1215 if (s >= 0 && s <= max_nodes)
1216 {
1217 /* valid NUMA node */
1218 nodes[s]++;
1219 continue;
1220 }
1221 else if (s == -2)
1222 {
1223 /* -2 means ENOENT (e.g. page was moved to swap) */
1224 nodes[max_nodes + 1]++;
1225 continue;
1226 }
1227
1228 elog(ERROR, "invalid NUMA node id outside of allowed range "
1229 "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
1230 }
1231
1232 /* no NULLs for regular nodes */
1233 memset(nulls, 0, sizeof(nulls));
1234
1235 /*
1236 * Add one entry for each NUMA node, including those without allocated
1237 * memory for this segment.
1238 */
1239 for (i = 0; i <= max_nodes; i++)
1240 {
1241 values[0] = CStringGetTextDatum(ent->key);
1242 values[1] = Int32GetDatum(i);
1244
1245 tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
1246 values, nulls);
1247 }
1248
1249 /* The last entry is used for pages without a NUMA node. */
1250 nulls[1] = true;
1251 values[0] = CStringGetTextDatum(ent->key);
1253
1254 tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
1255 values, nulls);
1256 }
1257
1259 firstNumaTouch = false;
1260
1261 return (Datum) 0;
1262}
1263
1264/*
1265 * Determine the memory page size used for the shared memory segment.
1266 *
1267 * If the shared segment was allocated using huge pages, returns the size of
1268 * a huge page. Otherwise returns the size of regular memory page.
1269 *
1270 * This should be used only after the server is started.
1271 */
1272Size
1274{
1276#ifdef WIN32
1278
1280 os_page_size = sysinfo.dwPageSize;
1281#else
1283#endif
1284
1287
1290
1291 return os_page_size;
1292}
1293
1294Datum
static Datum values[MAXATTR]
Definition bootstrap.c:190
#define CStringGetTextDatum(s)
Definition builtins.h:98
#define CACHELINEALIGN(LEN)
Definition c.h:899
#define MAXALIGN(LEN)
Definition c.h:896
#define TYPEALIGN(ALIGNVAL, LEN)
Definition c.h:889
#define Assert(condition)
Definition c.h:943
int64_t int64
Definition c.h:621
#define UINT64_FORMAT
Definition c.h:635
uint64_t uint64
Definition c.h:625
size_t Size
Definition c.h:689
#define TYPEALIGN_DOWN(ALIGNVAL, LEN)
Definition c.h:901
uint32 result
memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets))
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition dynahash.c:889
Size hash_estimate_size(int64 num_entries, Size entrysize)
Definition dynahash.c:763
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition dynahash.c:1352
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition dynahash.c:1317
int errcode(int sqlerrcode)
Definition elog.c:875
#define DEBUG1
Definition elog.h:31
#define ERROR
Definition elog.h:40
#define elog(elevel,...)
Definition elog.h:228
#define ereport(elevel,...)
Definition elog.h:152
#define palloc_array(type, count)
Definition fe_memutils.h:91
#define palloc0_array(type, count)
Definition fe_memutils.h:92
#define PG_FUNCTION_ARGS
Definition fmgr.h:193
#define PG_RETURN_BOOL(x)
Definition fmgr.h:360
void InitMaterializedSRF(FunctionCallInfo fcinfo, uint32 flags)
Definition funcapi.c:76
bool IsUnderPostmaster
Definition globals.c:122
int huge_pages_status
Definition guc_tables.c:610
#define HASH_STRINGS
Definition hsearch.h:91
@ HASH_FIND
Definition hsearch.h:108
@ HASH_REMOVE
Definition hsearch.h:110
@ HASH_ENTER
Definition hsearch.h:109
@ HASH_ENTER_NULL
Definition hsearch.h:111
#define HASH_ELEM
Definition hsearch.h:90
#define HASH_FIXED_SIZE
Definition hsearch.h:100
int i
Definition isn.c:77
List * lappend(List *list, void *datum)
Definition list.c:339
void list_free_deep(List *list)
Definition list.c:1560
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1150
void LWLockRelease(LWLock *lock)
Definition lwlock.c:1767
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition lwlock.c:670
@ LW_SHARED
Definition lwlock.h:105
@ LW_EXCLUSIVE
Definition lwlock.h:104
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition mcxt.c:1235
Size add_size(Size s1, Size s2)
Definition mcxt.c:1733
void pfree(void *pointer)
Definition mcxt.c:1619
MemoryContext TopMemoryContext
Definition mcxt.c:167
void * palloc(Size size)
Definition mcxt.c:1390
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:125
static char * errmsg
#define pg_nextpower2_size_t
#define PG_CACHE_LINE_SIZE
#define lfirst(lc)
Definition pg_list.h:172
static int list_length(const List *l)
Definition pg_list.h:152
#define NIL
Definition pg_list.h:68
#define foreach_ptr(type, var, lst)
Definition pg_list.h:501
PGDLLIMPORT int pg_numa_get_max_node(void)
Definition pg_numa.c:138
#define pg_numa_touch_mem_if_required(ptr)
Definition pg_numa.h:37
PGDLLIMPORT int pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status)
Definition pg_numa.c:132
PGDLLIMPORT int pg_numa_init(void)
Definition pg_numa.c:125
@ HUGE_PAGES_UNKNOWN
Definition pg_shmem.h:56
@ HUGE_PAGES_ON
Definition pg_shmem.h:54
static Datum Int64GetDatum(int64 X)
Definition postgres.h:426
uint64_t Datum
Definition postgres.h:70
static Datum Int32GetDatum(int32 X)
Definition postgres.h:212
static int fb(int x)
bool ShmemAddrIsValid(const void *addr)
Definition shmem.c:849
Datum pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
Definition shmem.c:1102
Datum pg_numa_available(PG_FUNCTION_ARGS)
Definition shmem.c:1295
void InitShmemAllocator(PGShmemHeader *seghdr)
Definition shmem.c:637
void ShmemRequestStructWithOpts(const ShmemStructOpts *options)
Definition shmem.c:315
static void * ShmemBase
Definition shmem.c:243
Datum pg_get_shmem_allocations(PG_FUNCTION_ARGS)
Definition shmem.c:1045
static void * ShmemEnd
Definition shmem.c:244
Size pg_get_shmem_pagesize(void)
Definition shmem.c:1273
void RegisterShmemCallbacks(const ShmemCallbacks *callbacks)
Definition shmem.c:873
static List * registered_shmem_callbacks
Definition shmem.c:157
#define PG_GET_SHMEM_NUMA_SIZES_COLS
void * ShmemAllocNoError(Size size)
Definition shmem.c:783
void * ShmemAlloc(Size size)
Definition shmem.c:763
#define PG_GET_SHMEM_SIZES_COLS
void ShmemCallRequestCallbacks(void)
Definition shmem.c:978
static void * ShmemAllocRaw(Size size, Size alignment, Size *allocated_size)
Definition shmem.c:797
static void InitShmemIndexEntry(ShmemRequest *request)
Definition shmem.c:511
static void CallShmemCallbacksAfterStartup(const ShmemCallbacks *callbacks)
Definition shmem.c:898
void ShmemInitRequested(void)
Definition shmem.c:424
static PGShmemHeader * ShmemSegHdr
Definition shmem.c:242
void ShmemRequestInternal(ShmemStructOpts *options, ShmemRequestKind kind)
Definition shmem.c:336
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition shmem.c:1010
shmem_request_state
Definition shmem.c:185
@ SRS_INITIALIZING
Definition shmem.c:200
@ SRS_DONE
Definition shmem.c:212
@ SRS_ATTACHING
Definition shmem.c:206
@ SRS_INITIAL
Definition shmem.c:187
@ SRS_AFTER_STARTUP_ATTACH_OR_INIT
Definition shmem.c:209
@ SRS_REQUESTING
Definition shmem.c:194
static HTAB * ShmemIndex
Definition shmem.c:252
static List * pending_shmem_requests
Definition shmem.c:169
#define SHMEM_INDEX_ADDITIONAL_SIZE
Definition shmem.c:262
static ShmemAllocatorData * ShmemAllocator
Definition shmem.c:246
#define ShmemIndexLock
Definition shmem.c:236
static bool AttachShmemIndexEntry(ShmemRequest *request, bool missing_ok)
Definition shmem.c:580
size_t ShmemGetRequestedSize(void)
Definition shmem.c:391
static bool firstNumaTouch
Definition shmem.c:274
void ResetShmemAllocator(void)
Definition shmem.c:742
#define SHMEM_INDEX_KEYSIZE
Definition shmem.c:255
#define SHMEM_ATTACH_UNKNOWN_SIZE
Definition shmem.h:69
#define SHMEM_CALLBACKS_ALLOW_AFTER_STARTUP
Definition shmem.h:167
HTAB * shmem_hash_create(void *location, size_t size, bool found, const char *name, int64 nelems, HASHCTL *infoP, int hash_flags)
Definition shmem_hash.c:149
void shmem_hash_attach(void *location, ShmemStructOpts *base_options)
Definition shmem_hash.c:79
void shmem_hash_init(void *location, ShmemStructOpts *base_options)
Definition shmem_hash.c:63
ShmemRequestKind
@ SHMEM_KIND_SLRU
@ SHMEM_KIND_HASH
@ SHMEM_KIND_STRUCT
void shmem_slru_init(void *location, ShmemStructOpts *base_options)
Definition slru.c:267
void shmem_slru_attach(void *location, ShmemStructOpts *base_options)
Definition slru.c:359
static void SpinLockRelease(volatile slock_t *lock)
Definition spin.h:62
static void SpinLockAcquire(volatile slock_t *lock)
Definition spin.h:56
static void SpinLockInit(volatile slock_t *lock)
Definition spin.h:50
Size keysize
Definition hsearch.h:69
Size entrysize
Definition hsearch.h:70
Definition pg_list.h:54
Size totalsize
Definition pg_shmem.h:34
HASHHDR * index
Definition shmem.c:231
LWLock index_lock
Definition shmem.c:233
slock_t shmem_lock
Definition shmem.c:229
size_t index_size
Definition shmem.c:232
ShmemRequestCallback request_fn
Definition shmem.h:133
ShmemInitCallback init_fn
Definition shmem.h:139
void * opaque_arg
Definition shmem.h:153
ShmemAttachCallback attach_fn
Definition shmem.h:147
void * location
Definition shmem.c:268
Size size
Definition shmem.c:269
Size allocated_size
Definition shmem.c:270
ShmemRequestKind kind
Definition shmem.c:166
ShmemStructOpts * options
Definition shmem.c:165
void GetHugePageSize(Size *hugepagesize, int *mmap_flags)
Definition sysv_shmem.c:480
void tuplestore_putvalues(Tuplestorestate *state, TupleDesc tdesc, const Datum *values, const bool *isnull)
Definition tuplestore.c:785
const char * name