dsa.c
1/*-------------------------------------------------------------------------
2 *
3 * dsa.c
4 * Dynamic shared memory areas.
5 *
6 * This module provides dynamic shared memory areas which are built on top of
7 * DSM segments. While dsm.c allows segments of shared memory to be
8 * created and shared between backends, it isn't designed to deal with small
9 * objects. A DSA area is a shared memory heap usually backed by one or more
10 * DSM segments which can allocate memory using dsa_allocate() and dsa_free().
11 * Alternatively, it can be created in pre-existing shared memory, including a
12 * DSM segment, and then create extra DSM segments as required. Unlike the
13 * regular system heap, it deals in pseudo-pointers which must be converted to
14 * backend-local pointers before they are dereferenced. These pseudo-pointers
15 * can however be shared with other backends, and can be used to construct
16 * shared data structures.
17 *
18 * Each DSA area manages a set of DSM segments, adding new segments as
19 * required and detaching them when they are no longer needed. Each segment
20 * contains a number of 4KB pages, a free page manager for tracking
21 * consecutive runs of free pages, and a page map for tracking the source of
22 * objects allocated on each page. Allocation requests above 8KB are handled
23 * by choosing a segment and finding consecutive free pages in its free page
24 * manager. Allocation requests for smaller sizes are handled using pools of
25 * objects of a selection of sizes. Each pool consists of a number of 16 page
26 * (64KB) superblocks allocated in the same way as large objects. Allocation
27 * of large objects and new superblocks is serialized by a single LWLock, but
28 * allocation of small objects from pre-existing superblocks uses one LWLock
29 * per pool. Currently there is one pool, and therefore one lock, per size
30 * class. Per-core pools to increase concurrency and strategies for reducing
31 * the resulting fragmentation are areas for future research. Each superblock
32 * is managed with a 'span', which tracks the superblock's freelist. Free
33 * requests are handled by looking in the page map to find which span an
34 * address was allocated from, so that small objects can be returned to the
35 * appropriate free list, and large object pages can be returned directly to
36 * the free page map. When allocating, simple heuristics for selecting
37 * segments and superblocks try to encourage occupied memory to be
38 * concentrated, increasing the likelihood that whole superblocks can become
39 * empty and be returned to the free page manager, and whole segments can
40 * become empty and be returned to the operating system.
41 *
42 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
43 * Portions Copyright (c) 1994, Regents of the University of California
44 *
45 * IDENTIFICATION
46 * src/backend/utils/mmgr/dsa.c
47 *
48 *-------------------------------------------------------------------------
49 */
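/*
 * For illustration, a minimal usage sketch (assuming a caller-supplied
 * LWLock tranche id 'my_tranche_id'): the dsa_pointer, not the local
 * address, is what may be stored in shared data structures.
 *
 *		dsa_area   *area = dsa_create(my_tranche_id);
 *		dsa_pointer dp = dsa_allocate(area, 100);
 *		char	   *p = (char *) dsa_get_address(area, dp);
 *
 *		memset(p, 0, 100);
 *		dsa_free(area, dp);
 *		dsa_detach(area);
 */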
50
51#include "postgres.h"
52
53#include "port/atomics.h"
54#include "port/pg_bitutils.h"
55#include "storage/dsm.h"
56#include "storage/lwlock.h"
57#include "utils/dsa.h"
58#include "utils/freepage.h"
59#include "utils/memutils.h"
60#include "utils/resowner.h"
61
62/*
63 * How many segments to create before we double the segment size. If this is
64 * low, then there is likely to be a lot of wasted space in the largest
65 * segment. If it is high, then we risk running out of segment slots (see
66 * dsm.c's limits on total number of segments), or limiting the total size
67 * an area can manage when using small pointers.
68 */
69#define DSA_NUM_SEGMENTS_AT_EACH_SIZE 2
70
71/*
72 * The maximum number of DSM segments that an area can own, determined by
73 * the number of bits remaining (but capped at 1024).
74 */
75#define DSA_MAX_SEGMENTS \
76 Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH)))
77
78/* The bitmask for extracting the offset from a dsa_pointer. */
79#define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1)
80
81/* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */
82#define DSA_PAGES_PER_SUPERBLOCK 16
83
84/*
85 * A magic number used as a sanity check for following DSM segments belonging
86 * to a DSA area (this number will be XORed with the area handle and
87 * the segment index).
88 */
89#define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608
90
91/* Build a dsa_pointer given a segment number and offset. */
92#define DSA_MAKE_POINTER(segment_number, offset) \
93 (((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset))
94
95/* Extract the segment number from a dsa_pointer. */
96#define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH)
97
98/* Extract the offset from a dsa_pointer. */
99#define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK)
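/*
 * For illustration: with DSA_OFFSET_WIDTH low-order bits reserved for the
 * offset, a pointer to byte offset 0x2000 within segment 3 round-trips as
 * follows.
 *
 *		dsa_pointer dp = DSA_MAKE_POINTER(3, 0x2000);
 *
 *		Assert(DSA_EXTRACT_SEGMENT_NUMBER(dp) == 3);
 *		Assert(DSA_EXTRACT_OFFSET(dp) == 0x2000);
 */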
100
101/* The type used for segment indexes (zero based). */
102typedef size_t dsa_segment_index;
103
104/* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */
105#define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index)0)
106
107/*
108 * How many bins of segments do we have? The bins are used to categorize
109 * segments by their largest contiguous run of free pages.
110 */
111#define DSA_NUM_SEGMENT_BINS 16
112
113/*
114 * What is the lowest bin that holds segments that *might* have n contiguous
115 * free pages? There is no point in looking in segments in lower bins; they
116 * definitely can't service a request for n free pages.
117 */
118static inline size_t
119contiguous_pages_to_segment_bin(size_t n)
120{
121 size_t bin;
122
123 if (n == 0)
124 bin = 0;
125 else
126 bin = pg_leftmost_one_pos_size_t(n) + 1;
127
128 return Min(bin, DSA_NUM_SEGMENT_BINS - 1);
129}
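/*
 * For example: contiguous_pages_to_segment_bin(0) == 0,
 * contiguous_pages_to_segment_bin(1) == 1, (4) == 3 and (1000) == 10, with
 * very large runs capped at DSA_NUM_SEGMENT_BINS - 1. A segment placed in
 * bin b > 0 therefore has a largest contiguous free run of at least
 * 1 << (b - 1) pages.
 */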
130
131/* Macros for access to locks. */
132#define DSA_AREA_LOCK(area) (&area->control->lock)
133#define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock)
134
135/*
136 * The header for an individual segment. This lives at the start of each DSM
137 * segment owned by a DSA area including the first segment (where it appears
138 * as part of the dsa_area_control struct).
139 */
140typedef struct
141{
142 /* Sanity check magic value. */
143 uint32 magic;
144 /* Total number of pages in this segment (excluding metadata area). */
145 size_t usable_pages;
146 /* Total size of this segment in bytes. */
147 size_t size;
148
149 /*
150 * Index of the segment that precedes this one in the same segment bin, or
151 * DSA_SEGMENT_INDEX_NONE if this is the first one.
152 */
153 dsa_segment_index prev;
154
155 /*
156 * Index of the segment that follows this one in the same segment bin, or
157 * DSA_SEGMENT_INDEX_NONE if this is the last one.
158 */
159 dsa_segment_index next;
160 /* The index of the bin that contains this segment. */
161 size_t bin;
162
163 /*
164 * A flag raised to indicate that this segment is being returned to the
165 * operating system and has been unpinned.
166 */
167 bool freed;
168} dsa_segment_header;
169
170/*
171 * Metadata for one superblock.
172 *
173 * For most blocks, span objects are stored out-of-line; that is, the span
174 * object is not stored within the block itself. But, as an exception, for a
175 * "span of spans", the span object is stored "inline". The allocation is
176 * always exactly one page, and the dsa_area_span object is located at
177 * the beginning of that page. The size class is DSA_SCLASS_BLOCK_OF_SPANS,
178 * and the remaining fields are used just as they would be in an ordinary
179 * block. We can't allocate spans out of ordinary superblocks because
180 * creating an ordinary superblock requires us to be able to allocate a span
181 * *first*. Doing it this way avoids that circularity.
182 */
183typedef struct
184{
185 dsa_pointer pool; /* Containing pool. */
186 dsa_pointer prevspan; /* Previous span. */
187 dsa_pointer nextspan; /* Next span. */
188 dsa_pointer start; /* Starting address. */
189 size_t npages; /* Length of span in pages. */
190 uint16 size_class; /* Size class. */
191 uint16 ninitialized; /* Maximum number of objects ever allocated. */
192 uint16 nallocatable; /* Number of objects currently allocatable. */
193 uint16 firstfree; /* First object on free list. */
194 uint16 nmax; /* Maximum number of objects ever possible. */
195 uint16 fclass; /* Current fullness class. */
196} dsa_area_span;
197
198/*
199 * Given a pointer to an object in a span, access the index of the next free
200 * object in the same span (ie in the span's freelist) as an L-value.
201 */
202#define NextFreeObjectIndex(object) (* (uint16 *) (object))
203
204/*
205 * Small allocations are handled by dividing a single block of memory into
206 * many small objects of equal size. The possible allocation sizes are
207 * defined by the following array. Larger size classes are spaced more widely
208 * than smaller size classes. We fudge the spacing for size classes >1kB to
209 * avoid space wastage: based on the knowledge that we plan to allocate 64kB
210 * blocks, we bump the maximum object size up to the largest multiple of
211 * 8 bytes that still lets us fit the same number of objects into one block.
212 *
213 * NB: Because of this fudging, if we were ever to use differently-sized blocks
214 * for small allocations, these size classes would need to be reworked to be
215 * optimal for the new size.
216 *
217 * NB: The optimal spacing for size classes, as well as the size of the blocks
218 * out of which small objects are allocated, is not a question that has one
219 * right answer. Some allocators (such as tcmalloc) use more closely-spaced
220 * size classes than we do here, while others (like aset.c) use more
221 * widely-spaced classes. Spacing the classes more closely avoids wasting
222 * memory within individual chunks, but also means a larger number of
223 * potentially-unfilled blocks.
224 */
225static const uint16 dsa_size_classes[] = {
226 sizeof(dsa_area_span), 0, /* special size classes */
227 8, 16, 24, 32, 40, 48, 56, 64, /* 8 classes separated by 8 bytes */
228 80, 96, 112, 128, /* 4 classes separated by 16 bytes */
229 160, 192, 224, 256, /* 4 classes separated by 32 bytes */
230 320, 384, 448, 512, /* 4 classes separated by 64 bytes */
231 640, 768, 896, 1024, /* 4 classes separated by 128 bytes */
232 1280, 1560, 1816, 2048, /* 4 classes separated by ~256 bytes */
233 2616, 3120, 3640, 4096, /* 4 classes separated by ~512 bytes */
234 5456, 6552, 7280, 8192 /* 4 classes separated by ~1024 bytes */
235};
236#define DSA_NUM_SIZE_CLASSES lengthof(dsa_size_classes)
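/*
 * Worked example of the >1kB fudging: a nominal 1536-byte class would fit
 * 65536 / 1536 = 42 objects in a 64kB superblock, so the class is bumped to
 * the largest multiple of 8 not exceeding 65536 / 42, which is 1560;
 * likewise a nominal 3072 becomes 3120. Those are the values that appear in
 * dsa_size_classes above.
 */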
237
238/* Special size classes. */
239#define DSA_SCLASS_BLOCK_OF_SPANS 0
240#define DSA_SCLASS_SPAN_LARGE 1
241
242/*
243 * The following lookup table is used to map the size of small objects
244 * (less than 1kB) onto the corresponding size class. To use this table,
245 * round the size of the object up to the next multiple of 8 bytes, and then
246 * index into this array.
247 */
248static const uint8 dsa_size_class_map[] = {
249 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13,
250 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17,
251 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
252 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
253 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
254 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
255 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
256 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
257};
258#define DSA_SIZE_CLASS_MAP_QUANTUM 8
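/*
 * Lookup example: for a 20-byte request, the index is
 * (20 + 8 - 1) / 8 - 1 = 2, dsa_size_class_map[2] == 4, and
 * dsa_size_classes[4] == 24, the smallest size class that can hold 20 bytes
 * (see dsa_allocate_extended below).
 */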
259
260/*
261 * Superblocks are binned by how full they are. Generally, each fullness
262 * class corresponds to one quartile, but the block being used for
263 * allocations is always at the head of the list for fullness class 1,
264 * regardless of how full it really is.
265 */
266#define DSA_FULLNESS_CLASSES 4
267
268/*
269 * A dsa_area_pool represents a set of objects of a given size class.
270 *
271 * Perhaps there should be multiple pools for the same size class for
272 * contention avoidance, but for now there is just one!
273 */
274typedef struct
275{
276 /* A lock protecting access to this pool. */
277 LWLock lock;
278 /* A set of linked lists of spans, arranged by fullness. */
279 dsa_pointer spans[DSA_FULLNESS_CLASSES];
280 /* Should we pad this out to a cacheline boundary? */
281} dsa_area_pool;
282
283/*
284 * The control block for an area. This lives in shared memory, at the start of
285 * the first DSM segment controlled by this area.
286 */
287typedef struct
288{
289 /* The segment header for the first segment. */
290 dsa_segment_header segment_header;
291 /* The handle for this area. */
292 dsa_handle handle;
293 /* The handles of the segments owned by this area. */
294 dsm_handle segment_handles[DSA_MAX_SEGMENTS];
295 /* Lists of segments, binned by maximum contiguous run of free pages. */
296 dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS];
297 /* The object pools for each size class. */
298 dsa_area_pool pools[DSA_NUM_SIZE_CLASSES];
299 /* initial allocation segment size */
300 size_t init_segment_size;
301 /* maximum allocation segment size */
302 size_t max_segment_size;
303 /* The total size of all active segments. */
304 size_t total_segment_size;
305 /* The maximum total size of backing storage we are allowed. */
306 size_t max_total_segment_size;
307 /* Highest used segment index in the history of this area. */
308 dsa_segment_index high_segment_index;
309 /* The reference count for this area. */
310 int refcnt;
311 /* A flag indicating that this area has been pinned. */
312 bool pinned;
313 /* The number of times that segments have been freed. */
314 size_t freed_segment_counter;
315 /* The LWLock tranche ID. */
316 int lwlock_tranche_id;
317 /* The general lock (protects everything except object pools). */
318 LWLock lock;
319} dsa_area_control;
320
321/* Given a pointer to a pool, find a dsa_pointer. */
322#define DsaAreaPoolToDsaPointer(area, p) \
323 DSA_MAKE_POINTER(0, (char *) p - (char *) area->control)
324
325/*
326 * A dsa_segment_map is stored within the backend-private memory of each
327 * individual backend. It holds the base address of the segment within that
328 * backend, plus the addresses of key objects within the segment. Those
329 * could instead be derived from the base address but it's handy to have them
330 * around.
331 */
332typedef struct
333{
334 dsm_segment *segment; /* DSM segment */
335 char *mapped_address; /* Address at which segment is mapped */
336 dsa_segment_header *header; /* Header (same as mapped_address) */
337 FreePageManager *fpm; /* Free page manager within segment. */
338 dsa_pointer *pagemap; /* Page map within segment. */
339} dsa_segment_map;
340
341/*
342 * Per-backend state for a storage area. Backends obtain one of these by
343 * creating an area or attaching to an existing one using a handle. Each
344 * process that needs to use an area uses its own object to track where the
345 * segments are mapped.
346 */
347struct dsa_area
348{
349 /* Pointer to the control object in shared memory. */
350 dsa_area_control *control;
351
352 /*
353 * All the mappings are owned by this. The dsa_area itself is not
354 * directly tracked by the ResourceOwner, but the effect is the same. NULL
355 * if the attachment has session lifespan, i.e. if dsa_pin_mapping() has
356 * been called.
357 */
358 ResourceOwner resowner;
359
360 /*
361 * This backend's array of segment maps, ordered by segment index
362 * corresponding to control->segment_handles. Some of the area's segments
363 * may not be mapped in this backend yet, and some slots may have been
364 * freed and need to be detached; these operations happen on demand.
365 */
366 dsa_segment_map segment_maps[DSA_MAX_SEGMENTS];
367
368 /* The highest segment index this backend has ever mapped. */
369 dsa_segment_index high_segment_index;
370
371 /* The last observed freed_segment_counter. */
372 size_t freed_segment_counter;
373};
374
375#define DSA_SPAN_NOTHING_FREE ((uint16) -1)
376#define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE)
377
378/* Given a pointer to a segment_map, obtain a segment index number. */
379#define get_segment_index(area, segment_map_ptr) \
380 (segment_map_ptr - &area->segment_maps[0])
381
382static void init_span(dsa_area *area, dsa_pointer span_pointer,
383 dsa_area_pool *pool, dsa_pointer start, size_t npages,
384 uint16 size_class);
385static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool,
386 int fromclass, int toclass);
387static inline dsa_pointer alloc_object(dsa_area *area, int size_class);
388static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
389 int size_class);
390static dsa_segment_map *get_segment_by_index(dsa_area *area,
391 dsa_segment_index index);
392static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer);
393static void unlink_span(dsa_area *area, dsa_area_span *span);
394static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
395 dsa_pointer span_pointer, int fclass);
396static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map);
397static dsa_segment_map *get_best_segment(dsa_area *area, size_t npages);
398static dsa_segment_map *make_new_segment(dsa_area *area, size_t requested_pages);
399static dsa_area *create_internal(void *place, size_t size,
400 int tranche_id,
401 dsm_handle control_handle,
402 dsm_segment *control_segment,
403 size_t init_segment_size,
404 size_t max_segment_size);
405static dsa_area *attach_internal(void *place, dsm_segment *segment,
406 dsa_handle handle);
407static void check_for_freed_segments(dsa_area *area);
408static void check_for_freed_segments_locked(dsa_area *area);
409static void rebin_segment(dsa_area *area, dsa_segment_map *segment_map);
410
411/*
412 * Create a new shared area in a new DSM segment. Further DSM segments will
413 * be allocated as required to extend the available space.
414 *
415 * We can't allocate a LWLock tranche_id within this function, because tranche
416 * IDs are a scarce resource; there are only 64k available, using low numbers
417 * when possible matters, and we have no provision for recycling them. So,
418 * we require the caller to provide one.
419 */
420dsa_area *
421dsa_create_ext(int tranche_id, size_t init_segment_size, size_t max_segment_size)
422{
423 dsm_segment *segment;
424 dsa_area *area;
425
426 /*
427 * Create the DSM segment that will hold the shared control object and the
428 * first segment of usable space.
429 */
430 segment = dsm_create(init_segment_size, 0);
431
432 /*
433 * All segments backing this area are pinned, so that DSA can explicitly
434 * control their lifetime (otherwise a newly created segment belonging to
435 * this area might be freed when the only backend that happens to have it
436 * mapped in ends, corrupting the area).
437 */
438 dsm_pin_segment(segment);
439
440 /* Create a new DSA area with the control object in this segment. */
441 area = create_internal(dsm_segment_address(segment),
442 init_segment_size,
443 tranche_id,
444 dsm_segment_handle(segment), segment,
445 init_segment_size, max_segment_size);
446
447 /* Clean up when the control segment detaches. */
448 on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
449 PointerGetDatum(dsm_segment_address(segment)));
450
451 return area;
452}
453
454/*
455 * Create a new shared area in an existing shared memory space, which may be
456 * either DSM or Postmaster-initialized memory. DSM segments will be
457 * allocated as required to extend the available space, though that can be
458 * prevented with dsa_set_size_limit(area, size) using the same size provided
459 * to dsa_create_in_place.
460 *
461 * Areas created in-place must eventually be released by the backend that
462 * created them and all backends that attach to them. This can be done
463 * explicitly with dsa_release_in_place, or, in the special case that 'place'
464 * happens to be in a pre-existing DSM segment, by passing in a pointer to the
465 * segment so that a detach hook can be registered with the containing DSM
466 * segment.
467 *
468 * See dsa_create() for a note about the tranche arguments.
469 */
470dsa_area *
471dsa_create_in_place_ext(void *place, size_t size,
472 int tranche_id, dsm_segment *segment,
473 size_t init_segment_size, size_t max_segment_size)
474{
475 dsa_area *area;
476
477 area = create_internal(place, size, tranche_id,
478 DSM_HANDLE_INVALID, NULL,
479 init_segment_size, max_segment_size);
480
481 /*
482 * Clean up when the control segment detaches, if a containing DSM segment
483 * was provided.
484 */
485 if (segment != NULL)
486 on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
487 PointerGetDatum(place));
488
489 return area;
490}
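/*
 * Illustrative sketch, assuming a caller-reserved region 'my_space' of
 * 'my_size' bytes (at least dsa_minimum_size()) in postmaster-initialized
 * shared memory; the size limit stops the area from creating extra DSM
 * segments.
 *
 *		dsa_area   *area;
 *
 *		area = dsa_create_in_place(my_space, my_size, my_tranche_id, NULL);
 *		dsa_set_size_limit(area, my_size);
 */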
491
492/*
493 * Obtain a handle that can be passed to other processes so that they can
494 * attach to the given area. Cannot be called for areas created with
495 * dsa_create_in_place.
496 */
497dsa_handle
498dsa_get_handle(dsa_area *area)
499{
500 Assert(area->control->handle != DSA_HANDLE_INVALID);
501 return area->control->handle;
502}
503
504/*
505 * Attach to an area given a handle generated (possibly in another process) by
506 * dsa_get_handle. The area must have been created with dsa_create (not
507 * dsa_create_in_place).
508 */
509dsa_area *
510dsa_attach(dsa_handle handle)
511{
512 dsm_segment *segment;
513 dsa_area *area;
514
515 /*
516 * An area handle is really a DSM segment handle for the first segment, so
517 * we go ahead and attach to that.
518 */
519 segment = dsm_attach(handle);
520 if (segment == NULL)
521 ereport(ERROR,
522 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
523 errmsg("could not attach to dynamic shared area")));
524
525 area = attach_internal(dsm_segment_address(segment), segment, handle);
526
527 /* Clean up when the control segment detaches. */
528 on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
529 PointerGetDatum(dsm_segment_address(segment)));
530
531 return area;
532}
533
534/*
535 * Returns whether the area with the given handle was already attached by the
536 * current process. The area must have been created with dsa_create (not
537 * dsa_create_in_place).
538 */
539bool
540dsa_is_attached(dsa_handle handle)
541{
542 /*
543 * An area handle is really a DSM segment handle for the first segment, so
544 * we can just search for that.
545 */
546 return dsm_find_mapping(handle) != NULL;
547}
548
549/*
550 * Attach to an area that was created with dsa_create_in_place. The caller
551 * must somehow know the location in memory that was used when the area was
552 * created, though it may be mapped at a different virtual address in this
553 * process.
554 *
555 * See dsa_create_in_place for note about releasing in-place areas, and the
556 * optional 'segment' argument which can be provided to allow automatic
557 * release if the containing memory happens to be a DSM segment.
558 */
559dsa_area *
560dsa_attach_in_place(void *place, dsm_segment *segment)
561{
562 dsa_area *area;
563
564 area = attach_internal(place, NULL, DSA_HANDLE_INVALID);
565
566 /*
567 * Clean up when the control segment detaches, if a containing DSM segment
568 * was provided.
569 */
570 if (segment != NULL)
571 on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
572 PointerGetDatum(place));
573
574 return area;
575}
576
577/*
578 * Release a DSA area that was produced by dsa_create_in_place or
579 * dsa_attach_in_place. The 'segment' argument is ignored but provides an
580 * interface suitable for on_dsm_detach, for the convenience of users who want
581 * to create a DSA segment inside an existing DSM segment and have it
582 * automatically released when the containing DSM segment is detached.
583 * 'place' should be the address of the place where the area was created.
584 *
585 * This callback is automatically registered for the DSM segment containing
586 * the control object of in-place areas when a segment is provided to
587 * dsa_create_in_place or dsa_attach_in_place, and also for all areas created
588 * with dsa_create.
589 */
590void
591dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place)
592{
593 dsa_release_in_place(DatumGetPointer(place));
594}
595
596/*
597 * Release a DSA area that was produced by dsa_create_in_place or
598 * dsa_attach_in_place. The 'code' argument is ignored but provides an
599 * interface suitable for on_shmem_exit or before_shmem_exit, for the
600 * convenience of users who want to create a DSA segment inside shared memory
601 * other than a DSM segment and have it automatically release at backend exit.
602 * 'place' should be the address of the place where the area was created.
603 */
604void
605dsa_on_shmem_exit_release_in_place(int code, Datum place)
606{
607 dsa_release_in_place(DatumGetPointer(place));
608}
609
610/*
611 * Release a DSA area that was produced by dsa_create_in_place or
612 * dsa_attach_in_place. It is preferable to use one of the 'dsa_on_XXX'
613 * callbacks so that this is managed automatically, because failure to release
614 * an area created in-place leaks its segments permanently.
615 *
616 * This is also called automatically for areas produced by dsa_create or
617 * dsa_attach as an implementation detail.
618 */
619void
620dsa_release_in_place(void *place)
621{
622 dsa_area_control *control = (dsa_area_control *) place;
623 int i;
624
625 LWLockAcquire(&control->lock, LW_EXCLUSIVE);
626 Assert(control->segment_header.magic ==
627 (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0));
628 Assert(control->refcnt > 0);
629 if (--control->refcnt == 0)
630 {
631 for (i = 0; i <= control->high_segment_index; ++i)
632 {
633 dsm_handle handle;
634
635 handle = control->segment_handles[i];
636 if (handle != DSM_HANDLE_INVALID)
637 dsm_unpin_segment(handle);
638 }
639 }
640 LWLockRelease(&control->lock);
641}
642
643/*
644 * Keep a DSA area attached until end of session or explicit detach.
645 *
646 * By default, areas are owned by the current resource owner, which means they
647 * are detached automatically when that scope ends.
648 */
649void
650dsa_pin_mapping(dsa_area *area)
651{
652 int i;
653
654 if (area->resowner != NULL)
655 {
656 area->resowner = NULL;
657
658 for (i = 0; i <= area->high_segment_index; ++i)
659 if (area->segment_maps[i].segment != NULL)
660 dsm_pin_mapping(area->segment_maps[i].segment);
661 }
662}
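/*
 * Typical use: an area cached for the whole session calls this right after
 * creating or attaching, so that the mapping is not torn down with the
 * current resource owner.
 *
 *		area = dsa_attach(handle);
 *		dsa_pin_mapping(area);
 */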
663
664/*
665 * Allocate memory in this storage area. The return value is a dsa_pointer
666 * that can be passed to other processes, and converted to a local pointer
667 * with dsa_get_address. 'flags' is a bitmap which should be constructed
668 * from the following values:
669 *
670 * DSA_ALLOC_HUGE allows allocations >= 1GB. Otherwise, such allocations
671 * will result in an ERROR.
672 *
673 * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when
674 * no memory is available or a size limit established by dsa_set_size_limit
675 * would be exceeded. Otherwise, such allocations will result in an ERROR.
676 *
677 * DSA_ALLOC_ZERO causes the allocated memory to be zeroed. Otherwise, the
678 * contents of newly-allocated memory are indeterminate.
679 *
680 * These flags correspond to similarly named flags used by
681 * MemoryContextAllocExtended(). See also the macros dsa_allocate and
682 * dsa_allocate0 which expand to a call to this function with commonly used
683 * flags.
684 */
684 */
685dsa_pointer
686dsa_allocate_extended(dsa_area *area, size_t size, int flags)
687{
688 uint16 size_class;
689 dsa_pointer start_pointer;
690 dsa_segment_map *segment_map;
691 dsa_pointer result;
692
693 Assert(size > 0);
694
695 /* Sanity check on huge individual allocation size. */
696 if (((flags & DSA_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
697 ((flags & DSA_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
698 elog(ERROR, "invalid DSA memory alloc request size %zu", size);
699
700 /*
701 * If bigger than the largest size class, just grab a run of pages from
702 * the free page manager, instead of allocating an object from a pool.
703 * There will still be a span, but it's a special class of span that
704 * manages this whole allocation and simply gives all pages back to the
705 * free page manager when dsa_free is called.
706 */
707 if (size > dsa_size_classes[lengthof(dsa_size_classes) - 1])
708 {
709 size_t npages = fpm_size_to_pages(size);
710 size_t first_page;
711 dsa_pointer span_pointer;
712 dsa_area_pool *pool = &area->control->pools[DSA_SCLASS_SPAN_LARGE];
713
714 /* Obtain a span object. */
715 span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
716 if (!DsaPointerIsValid(span_pointer))
717 {
718 /* Raise error unless asked not to. */
719 if ((flags & DSA_ALLOC_NO_OOM) == 0)
720 ereport(ERROR,
721 (errcode(ERRCODE_OUT_OF_MEMORY),
722 errmsg("out of memory"),
723 errdetail("Failed on DSA request of size %zu.",
724 size)));
725 return InvalidDsaPointer;
726 }
727
728 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
729
730 /* Find a segment from which to allocate. */
731 segment_map = get_best_segment(area, npages);
732 if (segment_map == NULL)
733 segment_map = make_new_segment(area, npages);
734 if (segment_map == NULL)
735 {
736 /* Can't make any more segments: game over. */
737 LWLockRelease(DSA_AREA_LOCK(area));
738 dsa_free(area, span_pointer);
739
740 /* Raise error unless asked not to. */
741 if ((flags & DSA_ALLOC_NO_OOM) == 0)
742 ereport(ERROR,
743 (errcode(ERRCODE_OUT_OF_MEMORY),
744 errmsg("out of memory"),
745 errdetail("Failed on DSA request of size %zu.",
746 size)));
747 return InvalidDsaPointer;
748 }
749
750 /*
751 * Ask the free page manager for a run of pages. This should always
752 * succeed, since both get_best_segment and make_new_segment should
753 * only return a non-NULL pointer if it actually contains enough
754 * contiguous freespace. If it does fail, something in our backend
755 * private state is out of whack, so use FATAL to kill the process.
756 */
757 if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
758 elog(FATAL,
759 "dsa_allocate could not find %zu free pages", npages);
760 LWLockRelease(DSA_AREA_LOCK(area));
761
762 start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map),
763 first_page * FPM_PAGE_SIZE);
764
765 /* Initialize span and pagemap. */
766 LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
767 LW_EXCLUSIVE);
768 init_span(area, span_pointer, pool, start_pointer, npages,
769 DSA_SCLASS_SPAN_LARGE);
770 segment_map->pagemap[first_page] = span_pointer;
771 LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
772
773 /* Zero-initialize the memory if requested. */
774 if ((flags & DSA_ALLOC_ZERO) != 0)
775 memset(dsa_get_address(area, start_pointer), 0, size);
776
777 return start_pointer;
778 }
779
780 /* Map allocation to a size class. */
781 if (size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM)
782 {
783 int mapidx;
784
785 /* For smaller sizes we have a lookup table... */
786 mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
787 DSA_SIZE_CLASS_MAP_QUANTUM) - 1;
788 size_class = dsa_size_class_map[mapidx];
789 }
790 else
791 {
792 uint16 min;
793 uint16 max;
794
795 /* ... and for the rest we search by binary chop. */
796 min = dsa_size_class_map[lengthof(dsa_size_class_map) - 1];
797 max = lengthof(dsa_size_classes) - 1;
798
799 while (min < max)
800 {
801 uint16 mid = (min + max) / 2;
802 uint16 class_size = dsa_size_classes[mid];
803
804 if (class_size < size)
805 min = mid + 1;
806 else
807 max = mid;
808 }
809
810 size_class = min;
811 }
812 Assert(size <= dsa_size_classes[size_class]);
813 Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]);
814
815 /* Attempt to allocate an object from the appropriate pool. */
816 result = alloc_object(area, size_class);
817
818 /* Check for failure to allocate. */
819 if (!DsaPointerIsValid(result))
820 {
821 /* Raise error unless asked not to. */
822 if ((flags & DSA_ALLOC_NO_OOM) == 0)
823 ereport(ERROR,
824 (errcode(ERRCODE_OUT_OF_MEMORY),
825 errmsg("out of memory"),
826 errdetail("Failed on DSA request of size %zu.", size)));
827 return InvalidDsaPointer;
828 }
829
830 /* Zero-initialize the memory if requested. */
831 if ((flags & DSA_ALLOC_ZERO) != 0)
832 memset(dsa_get_address(area, result), 0, size);
833
834 return result;
835}
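/*
 * Usage sketch for the flags: with DSA_ALLOC_NO_OOM the caller must check
 * for InvalidDsaPointer rather than relying on an ERROR being raised.
 *
 *		dsa_pointer dp;
 *
 *		dp = dsa_allocate_extended(area, size, DSA_ALLOC_NO_OOM | DSA_ALLOC_ZERO);
 *		if (!DsaPointerIsValid(dp))
 *			elog(ERROR, "out of shared memory");
 */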
836
837/*
838 * Free memory obtained with dsa_allocate.
839 */
840void
841dsa_free(dsa_area *area, dsa_pointer dp)
842{
843 dsa_segment_map *segment_map;
844 int pageno;
845 dsa_pointer span_pointer;
846 dsa_area_span *span;
847 char *superblock;
848 char *object;
849 size_t size;
850 int size_class;
851
852 /* Make sure we don't have a stale segment in the slot 'dp' refers to. */
853 check_for_freed_segments(area);
854
855 /* Locate the object, span and pool. */
856 segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp));
857 pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE;
858 span_pointer = segment_map->pagemap[pageno];
859 span = dsa_get_address(area, span_pointer);
860 superblock = dsa_get_address(area, span->start);
861 object = dsa_get_address(area, dp);
862 size_class = span->size_class;
863 size = dsa_size_classes[size_class];
864
865 /*
866 * Special case for large objects that live in a special span: we return
867 * those pages directly to the free page manager and free the span.
868 */
869 if (span->size_class == DSA_SCLASS_SPAN_LARGE)
870 {
871
872#ifdef CLOBBER_FREED_MEMORY
873 memset(object, 0x7f, span->npages * FPM_PAGE_SIZE);
874#endif
875
876 /* Give pages back to free page manager. */
877 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
878 FreePageManagerPut(segment_map->fpm,
879 DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
880 span->npages);
881
882 /* Move segment to appropriate bin if necessary. */
883 rebin_segment(area, segment_map);
884 LWLockRelease(DSA_AREA_LOCK(area));
885
886 /* Unlink span. */
887 LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
888 LW_EXCLUSIVE);
889 unlink_span(area, span);
890 LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));
891 /* Free the span object so it can be reused. */
892 dsa_free(area, span_pointer);
893 return;
894 }
895
896#ifdef CLOBBER_FREED_MEMORY
897 memset(object, 0x7f, size);
898#endif
899
900 LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
901
902 /* Put the object on the span's freelist. */
903 Assert(object >= superblock);
904 Assert(object < superblock + DSA_SUPERBLOCK_SIZE);
905 Assert((object - superblock) % size == 0);
906 NextFreeObjectIndex(object) = span->firstfree;
907 span->firstfree = (object - superblock) / size;
908 ++span->nallocatable;
909
910 /*
911 * See if the span needs to moved to a different fullness class, or be
912 * freed so its pages can be given back to the segment.
913 */
914 if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1)
915 {
916 /*
917 * The block was completely full and is located in the
918 * highest-numbered fullness class, which is never scanned for free
919 * chunks. We must move it to the next-lower fullness class.
920 */
921 unlink_span(area, span);
922 add_span_to_fullness_class(area, span, span_pointer,
923 DSA_FULLNESS_CLASSES - 2);
924
925 /*
926 * If this is the only span, and there is no active span, then we
927 * should probably move this span to fullness class 1. (Otherwise if
928 * you allocate exactly all the objects in the only span, it moves to
929 * class 3, then you free them all, it moves to 2, and then is given
930 * back, leaving no active span).
931 */
932 }
933 else if (span->nallocatable == span->nmax &&
934 (span->fclass != 1 || span->prevspan != InvalidDsaPointer))
935 {
936 /*
937 * This entire block is free, and it's not the active block for this
938 * size class. Return the memory to the free page manager. We don't
939 * do this for the active block to prevent hysteresis: if we
940 * repeatedly allocate and free the only chunk in the active block, it
941 * will be very inefficient if we deallocate and reallocate the block
942 * every time.
943 */
944 destroy_superblock(area, span_pointer);
945 }
946
947 LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
948}
949
950/*
951 * Obtain a backend-local address for a dsa_pointer. 'dp' must point to
952 * memory allocated by the given area (possibly in another process) that
953 * hasn't yet been freed. This may cause a segment to be mapped into the
954 * current process if required, and may cause freed segments to be unmapped.
955 */
956void *
957dsa_get_address(dsa_area *area, dsa_pointer dp)
958{
959 dsa_segment_index index;
960 size_t offset;
961
962 /* Convert InvalidDsaPointer to NULL. */
963 if (!DsaPointerIsValid(dp))
964 return NULL;
965
966 /* Process any requests to detach from freed segments. */
967 check_for_freed_segments(area);
968
969 /* Break the dsa_pointer into its components. */
970 index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
971 offset = DSA_EXTRACT_OFFSET(dp);
972 Assert(index < DSA_MAX_SEGMENTS);
973
974 /* Check if we need to cause this segment to be mapped in. */
975 if (unlikely(area->segment_maps[index].mapped_address == NULL))
976 {
977 /* Call for effect (we don't need the result). */
978 (void) get_segment_by_index(area, index);
979 }
980
981 return area->segment_maps[index].mapped_address + offset;
982}
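/*
 * Sketch of cross-process use, assuming a hypothetical shared struct field
 * 'shared->chunk' of type dsa_pointer: one backend stores the dsa_pointer,
 * and any attached backend converts it locally; the local addresses may
 * differ between processes but name the same bytes.
 *
 *		shared->chunk = dsa_allocate(area, len);		(in backend A)
 *		p = dsa_get_address(area, shared->chunk);		(in backend A or B)
 */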
983
984/*
985 * Pin this area, so that it will continue to exist even if all backends
986 * detach from it. In that case, the area can still be reattached to if a
987 * handle has been recorded somewhere.
988 */
989void
990dsa_pin(dsa_area *area)
991{
992 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
993 if (area->control->pinned)
994 {
995 LWLockRelease(DSA_AREA_LOCK(area));
996 elog(ERROR, "dsa_area already pinned");
997 }
998 area->control->pinned = true;
999 ++area->control->refcnt;
1000 LWLockRelease(DSA_AREA_LOCK(area));
1001}
1002
1003/*
1004 * Undo the effects of dsa_pin, so that the given area can be freed when no
1005 * backends are attached to it. May be called only if dsa_pin has been
1006 * called.
1007 */
1008void
1009dsa_unpin(dsa_area *area)
1010{
1011 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1012 Assert(area->control->refcnt > 1);
1013 if (!area->control->pinned)
1014 {
1015 LWLockRelease(DSA_AREA_LOCK(area));
1016 elog(ERROR, "dsa_area not pinned");
1017 }
1018 area->control->pinned = false;
1019 --area->control->refcnt;
1020 LWLockRelease(DSA_AREA_LOCK(area));
1021}
1022
1023/*
1024 * Set the total size limit for this area. This limit is checked whenever new
1025 * segments need to be allocated from the operating system. If the new size
1026 * limit is already exceeded, this has no immediate effect.
1027 *
1028 * Note that the total virtual memory usage may be temporarily larger than
1029 * this limit when segments have been freed, but not yet detached by all
1030 * backends that have attached to them.
1031 */
1032void
1033dsa_set_size_limit(dsa_area *area, size_t limit)
1034{
1035 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1036 area->control->max_total_segment_size = limit;
1037 LWLockRelease(DSA_AREA_LOCK(area));
1038}
1039
1040/* Return the total size of all active segments */
1041size_t
1042dsa_get_total_size(dsa_area *area)
1043{
1044 size_t size;
1045
1046 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1047 size = area->control->total_segment_size;
1048 LWLockRelease(DSA_AREA_LOCK(area));
1049
1050 return size;
1051}
1052
1053/*
1054 * Aggressively free all spare memory in the hope of returning DSM segments to
1055 * the operating system.
1056 */
1057void
1058dsa_trim(dsa_area *area)
1059{
1060 int size_class;
1061
1062 /*
1063 * Trim in reverse pool order so we get to the spans-of-spans last, just
1064 * in case any become entirely free while processing all the other pools.
1065 */
1066 for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class)
1067 {
1068 dsa_area_pool *pool = &area->control->pools[size_class];
1069 dsa_pointer span_pointer;
1070
1071 if (size_class == DSA_SCLASS_SPAN_LARGE)
1072 {
1073 /* Large object frees give back segments aggressively already. */
1074 continue;
1075 }
1076
1077 /*
1078 * Search fullness class 1 only. That is where we expect to find an
1079 * entirely empty superblock (entirely empty superblocks in other
1080 * fullness classes are returned to the free page map by dsa_free).
1081 */
1082 LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1083 span_pointer = pool->spans[1];
1084 while (DsaPointerIsValid(span_pointer))
1085 {
1086 dsa_area_span *span = dsa_get_address(area, span_pointer);
1087 dsa_pointer next = span->nextspan;
1088
1089 if (span->nallocatable == span->nmax)
1090 destroy_superblock(area, span_pointer);
1091
1092 span_pointer = next;
1093 }
1094 LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1095 }
1096}
1097
1098/*
1099 * Print out debugging information about the internal state of the shared
1100 * memory area.
1101 */
1102void
1103dsa_dump(dsa_area *area)
1104{
1105 size_t i,
1106 j;
1107
1108 /*
1109 * Note: This gives an inconsistent snapshot as it acquires and releases
1110 * individual locks as it goes...
1111 */
1112
1113 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1114 check_for_freed_segments_locked(area);
1115 fprintf(stderr, "dsa_area handle %x:\n", area->control->handle);
1116 fprintf(stderr, " max_total_segment_size: %zu\n",
1117 area->control->max_total_segment_size);
1118 fprintf(stderr, " total_segment_size: %zu\n",
1119 area->control->total_segment_size);
1120 fprintf(stderr, " refcnt: %d\n", area->control->refcnt);
1121 fprintf(stderr, " pinned: %c\n", area->control->pinned ? 't' : 'f');
1122 fprintf(stderr, " segment bins:\n");
1123 for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1124 {
1125 if (area->control->segment_bins[i] != DSA_SEGMENT_INDEX_NONE)
1126 {
1127 dsa_segment_index segment_index;
1128
1129 if (i == 0)
1130 fprintf(stderr,
1131 " segment bin %zu (no contiguous free pages):\n", i);
1132 else
1133 fprintf(stderr,
1134 " segment bin %zu (at least %d contiguous pages free):\n",
1135 i, 1 << (i - 1));
1136 segment_index = area->control->segment_bins[i];
1137 while (segment_index != DSA_SEGMENT_INDEX_NONE)
1138 {
1139 dsa_segment_map *segment_map;
1140
1141 segment_map =
1142 get_segment_by_index(area, segment_index);
1143
1144 fprintf(stderr,
1145 " segment index %zu, usable_pages = %zu, "
1146 "contiguous_pages = %zu, mapped at %p\n",
1147 segment_index,
1148 segment_map->header->usable_pages,
1149 fpm_largest(segment_map->fpm),
1150 segment_map->mapped_address);
1151 segment_index = segment_map->header->next;
1152 }
1153 }
1154 }
1155 LWLockRelease(DSA_AREA_LOCK(area));
1156
1157 fprintf(stderr, " pools:\n");
1158 for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1159 {
1160 bool found = false;
1161
1162 LWLockAcquire(DSA_SCLASS_LOCK(area, i), LW_EXCLUSIVE);
1163 for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1164 if (DsaPointerIsValid(area->control->pools[i].spans[j]))
1165 found = true;
1166 if (found)
1167 {
1168 if (i == DSA_SCLASS_BLOCK_OF_SPANS)
1169 fprintf(stderr, " pool for blocks of span objects:\n");
1170 else if (i == DSA_SCLASS_SPAN_LARGE)
1171 fprintf(stderr, " pool for large object spans:\n");
1172 else
1173 fprintf(stderr,
1174 " pool for size class %zu (object size %hu bytes):\n",
1175 i, dsa_size_classes[i]);
1176 for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
1177 {
1178 if (!DsaPointerIsValid(area->control->pools[i].spans[j]))
1179 fprintf(stderr, " fullness class %zu is empty\n", j);
1180 else
1181 {
1182 dsa_pointer span_pointer = area->control->pools[i].spans[j];
1183
1184 fprintf(stderr, " fullness class %zu:\n", j);
1185 while (DsaPointerIsValid(span_pointer))
1186 {
1187 dsa_area_span *span;
1188
1189 span = dsa_get_address(area, span_pointer);
1190 fprintf(stderr,
1191 " span descriptor at "
1192 DSA_POINTER_FORMAT ", superblock at "
1194 ", pages = %zu, objects free = %hu/%hu\n",
1195 span_pointer, span->start, span->npages,
1196 span->nallocatable, span->nmax);
1197 span_pointer = span->nextspan;
1198 }
1199 }
1200 }
1201 }
1202 LWLockRelease(DSA_SCLASS_LOCK(area, i));
1203 }
1204}
1205
1206/*
1207 * Return the smallest size that you can successfully provide to
1208 * dsa_create_in_place.
1209 */
1210size_t
1211dsa_minimum_size(void)
1212{
1213 size_t size;
1214 int pages = 0;
1215
1216 size = MAXALIGN(sizeof(dsa_area_control)) +
1217 MAXALIGN(sizeof(FreePageManager));
1218
1219 /* Figure out how many pages we need, including the page map... */
1220 while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages)
1221 {
1222 ++pages;
1223 size += sizeof(dsa_pointer);
1224 }
1225
1226 return pages * FPM_PAGE_SIZE;
1227}
1228
1229/*
1230 * Workhorse function for dsa_create and dsa_create_in_place.
1231 */
1232static dsa_area *
1233create_internal(void *place, size_t size,
1234 int tranche_id,
1235 dsm_handle control_handle,
1236 dsm_segment *control_segment,
1237 size_t init_segment_size, size_t max_segment_size)
1238{
1239 dsa_area_control *control;
1240 dsa_area *area;
1241 dsa_segment_map *segment_map;
1242 size_t usable_pages;
1243 size_t total_pages;
1244 size_t metadata_bytes;
1245 int i;
1246
1247 /* Check the initial and maximum block sizes */
1248 Assert(init_segment_size >= DSA_MIN_SEGMENT_SIZE);
1249 Assert(max_segment_size >= init_segment_size);
1250 Assert(max_segment_size <= DSA_MAX_SEGMENT_SIZE);
1251
1252 /* Sanity check on the space we have to work in. */
1253 if (size < dsa_minimum_size())
1254 elog(ERROR, "dsa_area space must be at least %zu, but %zu provided",
1255 dsa_minimum_size(), size);
1256
1257 /* Now figure out how much space is usable */
1258 total_pages = size / FPM_PAGE_SIZE;
1259 metadata_bytes =
1260 MAXALIGN(sizeof(dsa_area_control)) +
1261 MAXALIGN(sizeof(FreePageManager)) +
1262 total_pages * sizeof(dsa_pointer);
1263 /* Add padding up to next page boundary. */
1264 if (metadata_bytes % FPM_PAGE_SIZE != 0)
1265 metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
1266 Assert(metadata_bytes <= size);
1267 usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE;
1268
1269 /*
1270 * Initialize the dsa_area_control object located at the start of the
1271 * space.
1272 */
1273 control = (dsa_area_control *) place;
1274 memset(place, 0, sizeof(*control));
1275 control->segment_header.magic =
1276 DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
1277 control->segment_header.next = DSA_SEGMENT_INDEX_NONE;
1278 control->segment_header.prev = DSA_SEGMENT_INDEX_NONE;
1279 control->segment_header.usable_pages = usable_pages;
1280 control->segment_header.freed = false;
1281 control->segment_header.size = size;
1282 control->handle = control_handle;
1283 control->init_segment_size = init_segment_size;
1284 control->max_segment_size = max_segment_size;
1285 control->max_total_segment_size = (size_t) -1;
1286 control->total_segment_size = size;
1287 control->segment_handles[0] = control_handle;
1288 for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
1289 control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE;
1290 control->refcnt = 1;
1291 control->lwlock_tranche_id = tranche_id;
1292
1293 /*
1294 * Create the dsa_area object that this backend will use to access the
1295 * area. Other backends will need to obtain their own dsa_area object by
1296 * attaching.
1297 */
1298 area = palloc(sizeof(dsa_area));
1299 area->control = control;
1300 area->resowner = CurrentResourceOwner;
1301 memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1302 area->high_segment_index = 0;
1303 area->freed_segment_counter = 0;
1304 LWLockInitialize(&control->lock, control->lwlock_tranche_id);
1305 for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
1306 LWLockInitialize(DSA_SCLASS_LOCK(area, i),
1307 control->lwlock_tranche_id);
1308
1309 /* Set up the segment map for this process's mapping. */
1310 segment_map = &area->segment_maps[0];
1311 segment_map->segment = control_segment;
1312 segment_map->mapped_address = place;
1313 segment_map->header = (dsa_segment_header *) place;
1314 segment_map->fpm = (FreePageManager *)
1315 (segment_map->mapped_address +
1316 MAXALIGN(sizeof(dsa_area_control)));
1317 segment_map->pagemap = (dsa_pointer *)
1318 (segment_map->mapped_address +
1319 MAXALIGN(sizeof(dsa_area_control)) +
1320 MAXALIGN(sizeof(FreePageManager)));
1321
1322 /* Set up the free page map. */
1323 FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
1324 /* There can be 0 usable pages if size is dsa_minimum_size(). */
1325
1326 if (usable_pages > 0)
1327 FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
1328 usable_pages);
1329
1330 /* Put this segment into the appropriate bin. */
1331 control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0;
1332 segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
1333
1334 return area;
1335}
1336
1337/*
1338 * Workhorse function for dsa_attach and dsa_attach_in_place.
1339 */
1340static dsa_area *
1341attach_internal(void *place, dsm_segment *segment, dsa_handle handle)
1342{
1343 dsa_area_control *control;
1344 dsa_area *area;
1345 dsa_segment_map *segment_map;
1346
1347 control = (dsa_area_control *) place;
1348 Assert(control->handle == handle);
1349 Assert(control->segment_handles[0] == handle);
1350 Assert(control->segment_header.magic ==
1351 (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0));
1352
1353 /* Build the backend-local area object. */
1354 area = palloc(sizeof(dsa_area));
1355 area->control = control;
1356 area->resowner = CurrentResourceOwner;
1357 memset(&area->segment_maps[0], 0,
1358 sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
1359 area->high_segment_index = 0;
1360
1361 /* Set up the segment map for this process's mapping. */
1362 segment_map = &area->segment_maps[0];
1363 segment_map->segment = segment; /* NULL for in-place */
1364 segment_map->mapped_address = place;
1365 segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
1366 segment_map->fpm = (FreePageManager *)
1367 (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)));
1368 segment_map->pagemap = (dsa_pointer *)
1369 (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) +
1370 MAXALIGN(sizeof(FreePageManager)));
1371
1372 /* Bump the reference count. */
1373 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1374 if (control->refcnt == 0)
1375 {
1376 /* We can't attach to a DSA area that has already been destroyed. */
1377 ereport(ERROR,
1378 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1379 errmsg("could not attach to dynamic shared area")));
1380 }
1381 ++control->refcnt;
1382 area->freed_segment_counter = control->freed_segment_counter;
1383 LWLockRelease(DSA_AREA_LOCK(area));
1384
1385 return area;
1386}
1387
1388/*
1389 * Add a new span to fullness class 1 of the indicated pool.
1390 */
1391static void
1392init_span(dsa_area *area,
1393 dsa_pointer span_pointer,
1394 dsa_area_pool *pool, dsa_pointer start, size_t npages,
1395 uint16 size_class)
1396{
1397 dsa_area_span *span = dsa_get_address(area, span_pointer);
1398 size_t obsize = dsa_size_classes[size_class];
1399
1400 /*
1401 * The per-pool lock must be held because we manipulate the span list for
1402 * this pool.
1403 */
1404 Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1405
1406 /* Push this span onto the front of the span list for fullness class 1. */
1407 if (DsaPointerIsValid(pool->spans[1]))
1408 {
1409 dsa_area_span *head = (dsa_area_span *)
1410 dsa_get_address(area, pool->spans[1]);
1411
1412 head->prevspan = span_pointer;
1413 }
1414 span->pool = DsaAreaPoolToDsaPointer(area, pool);
1415 span->nextspan = pool->spans[1];
1416 span->prevspan = InvalidDsaPointer;
1417 pool->spans[1] = span_pointer;
1418
1419 span->start = start;
1420 span->npages = npages;
1421 span->size_class = size_class;
1422 span->ninitialized = 0;
1423 if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1424 {
1425 /*
1426 * A block-of-spans contains its own descriptor, so mark one object as
1427 * initialized and reduce the count of allocatable objects by one.
1428 * Doing this here has the side effect of also reducing nmax by one,
1429 * which is important to make sure we free this object at the correct
1430 * time.
1431 */
1432 span->ninitialized = 1;
1433 span->nallocatable = FPM_PAGE_SIZE / obsize - 1;
1434 }
1435 else if (size_class != DSA_SCLASS_SPAN_LARGE)
1436 span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize;
1437 span->firstfree = DSA_SPAN_NOTHING_FREE;
1438 span->nmax = span->nallocatable;
1439 span->fclass = 1;
1440}
1441
1442/*
1443 * Transfer the first span in one fullness class to the head of another
1444 * fullness class.
1445 */
1446static bool
1447transfer_first_span(dsa_area *area,
1448 dsa_area_pool *pool, int fromclass, int toclass)
1449{
1450 dsa_pointer span_pointer;
1451 dsa_area_span *span;
1452 dsa_area_span *nextspan;
1453
1454 /* Can't do it if source list is empty. */
1455 span_pointer = pool->spans[fromclass];
1456 if (!DsaPointerIsValid(span_pointer))
1457 return false;
1458
1459 /* Remove span from head of source list. */
1460 span = dsa_get_address(area, span_pointer);
1461 pool->spans[fromclass] = span->nextspan;
1462 if (DsaPointerIsValid(span->nextspan))
1463 {
1464 nextspan = (dsa_area_span *)
1465 dsa_get_address(area, span->nextspan);
1466 nextspan->prevspan = InvalidDsaPointer;
1467 }
1468
1469 /* Add span to head of target list. */
1470 span->nextspan = pool->spans[toclass];
1471 pool->spans[toclass] = span_pointer;
1472 if (DsaPointerIsValid(span->nextspan))
1473 {
1474 nextspan = (dsa_area_span *)
1475 dsa_get_address(area, span->nextspan);
1476 nextspan->prevspan = span_pointer;
1477 }
1478 span->fclass = toclass;
1479
1480 return true;
1481}
1482
1483/*
1484 * Allocate one object of the requested size class from the given area.
1485 */
1486static inline dsa_pointer
1487alloc_object(dsa_area *area, int size_class)
1488{
1489 dsa_area_pool *pool = &area->control->pools[size_class];
1490 dsa_area_span *span;
1491 dsa_pointer block;
1492 dsa_pointer result;
1493 char *object;
1494 size_t size;
1495
1496 /*
1497 * Even though ensure_active_superblock can in turn call alloc_object if
1498 * it needs to allocate a new span, that's always from a different pool,
1499 * and the order of lock acquisition is always the same, so it's OK that
1500 * we hold this lock for the duration of this function.
1501 */
1502 Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1503 LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
1504
1505 /*
1506 * If there's no active superblock, we must successfully obtain one or
1507 * fail the request.
1508 */
1509 if (!DsaPointerIsValid(pool->spans[1]) &&
1510 !ensure_active_superblock(area, pool, size_class))
1511 {
1512 result = InvalidDsaPointer;
1513 }
1514 else
1515 {
1516 /*
1517 * There should be a block in fullness class 1 at this point, and it
1518 * should never be completely full. Thus we can either pop an object
1519 * from the free list or, failing that, initialize a new object.
1520 */
1521 Assert(DsaPointerIsValid(pool->spans[1]));
1522 span = (dsa_area_span *)
1523 dsa_get_address(area, pool->spans[1]);
1524 Assert(span->nallocatable > 0);
1525 block = span->start;
1526 Assert(size_class < DSA_NUM_SIZE_CLASSES);
1527 size = dsa_size_classes[size_class];
1528 if (span->firstfree != DSA_SPAN_NOTHING_FREE)
1529 {
1530 result = block + span->firstfree * size;
1531 object = dsa_get_address(area, result);
1532 span->firstfree = NextFreeObjectIndex(object);
1533 }
1534 else
1535 {
1536 result = block + span->ninitialized * size;
1537 ++span->ninitialized;
1538 }
1539 --span->nallocatable;
1540
1541 /* If it's now full, move it to the highest-numbered fullness class. */
1542 if (span->nallocatable == 0)
1543 transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1);
1544 }
1545
1546 Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1547 LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
1548
1549 return result;
1550}
1551
1552/*
1553 * Ensure an active (i.e. fullness class 1) superblock, unless all existing
1554 * superblocks are completely full and no more can be allocated.
1555 *
1556 * Fullness classes K of 0..N are loosely intended to represent blocks whose
1557 * utilization percentage is at least K/N, but we only enforce this rigorously
1558 * for the highest-numbered fullness class, which always contains exactly
1559 * those blocks that are completely full. It's otherwise acceptable for a
1560 * block to be in a higher-numbered fullness class than the one to which it
1561 * logically belongs. In addition, the active block, which is always the
1562 * first block in fullness class 1, is permitted to have a higher allocation
1563 * percentage than would normally be allowable for that fullness class; we
1564 * don't move it until it's completely full, and then it goes to the
1565 * highest-numbered fullness class.
1566 *
1567 * It might seem odd that the active block is the head of fullness class 1
1568 * rather than fullness class 0, but experience with other allocators has
1569 * shown that it's usually better to allocate from a block that's moderately
1570 * full rather than one that's nearly empty. Insofar as is reasonably
1571 * possible, we want to avoid performing new allocations in a block that would
1572 * otherwise become empty soon.
1573 */
1574static bool
1575ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
1576 int size_class)
1577{
1578 dsa_pointer span_pointer;
1579 dsa_pointer start_pointer;
1580 size_t obsize = dsa_size_classes[size_class];
1581 size_t nmax;
1582 int fclass;
1583 size_t npages = 1;
1584 size_t first_page;
1585 size_t i;
1586 dsa_segment_map *segment_map;
1587
1588 Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1589
1590 /*
1591 * Compute the number of objects that will fit in a block of this size
1592 * class. Span-of-spans blocks are just a single page, and the first
1593 * object isn't available for use because it describes the block-of-spans
1594 * itself.
1595 */
1596 if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1597 nmax = FPM_PAGE_SIZE / obsize - 1;
1598 else
1599 nmax = DSA_SUPERBLOCK_SIZE / obsize;
1600
1601 /*
1602 * If fullness class 1 is empty, try to find a span to put in it by
1603 * scanning higher-numbered fullness classes (excluding the last one,
1604 * whose blocks are certain to all be completely full).
1605 */
1606 for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1607 {
1608 span_pointer = pool->spans[fclass];
1609
1610 while (DsaPointerIsValid(span_pointer))
1611 {
1612 int tfclass;
1613 dsa_area_span *span;
1614 dsa_area_span *nextspan;
1615 dsa_area_span *prevspan;
1616 dsa_pointer next_span_pointer;
1617
1618 span = (dsa_area_span *)
1619 dsa_get_address(area, span_pointer);
1620 next_span_pointer = span->nextspan;
1621
1622 /* Figure out what fullness class should contain this span. */
1623 tfclass = (nmax - span->nallocatable)
1624 * (DSA_FULLNESS_CLASSES - 1) / nmax;
1625
1626 /* Look up next span. */
1627 if (DsaPointerIsValid(span->nextspan))
1628 nextspan = (dsa_area_span *)
1629 dsa_get_address(area, span->nextspan);
1630 else
1631 nextspan = NULL;
1632
1633 /*
1634 * If utilization has dropped enough that this now belongs in some
1635 * other fullness class, move it there.
1636 */
1637 if (tfclass < fclass)
1638 {
1639 /* Remove from the current fullness class list. */
1640 if (pool->spans[fclass] == span_pointer)
1641 {
1642 /* It was the head; remove it. */
1644 pool->spans[fclass] = span->nextspan;
1645 if (nextspan != NULL)
1646 nextspan->prevspan = InvalidDsaPointer;
1647 }
1648 else
1649 {
1650 /* It was not the head. */
1652 prevspan = (dsa_area_span *)
1653 dsa_get_address(area, span->prevspan);
1654 prevspan->nextspan = span->nextspan;
1655 }
1656 if (nextspan != NULL)
1657 nextspan->prevspan = span->prevspan;
1658
1659 /* Push onto the head of the new fullness class list. */
1660 span->nextspan = pool->spans[tfclass];
1661 pool->spans[tfclass] = span_pointer;
1662 span->prevspan = InvalidDsaPointer;
1663 if (DsaPointerIsValid(span->nextspan))
1664 {
1665 nextspan = (dsa_area_span *)
1666 dsa_get_address(area, span->nextspan);
1667 nextspan->prevspan = span_pointer;
1668 }
1669 span->fclass = tfclass;
1670 }
1671
1672 /* Advance to next span on list. */
1673 span_pointer = next_span_pointer;
1674 }
1675
1676 /* Stop now if we found a suitable block. */
1677 if (DsaPointerIsValid(pool->spans[1]))
1678 return true;
1679 }
1680
1681 /*
1682 * If there are no blocks that properly belong in fullness class 1, pick
1683 * one from some other fullness class and move it there anyway, so that we
1684 * have an allocation target. Our last choice is to transfer a block
1685 * that's almost empty (and might become completely empty soon if left
1686 * alone), but even that is better than failing, which is what we must do
1687 * if there are no blocks at all with freespace.
1688 */
1689 Assert(!DsaPointerIsValid(pool->spans[1]));
1690 for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1691 if (transfer_first_span(area, pool, fclass, 1))
1692 return true;
1693 if (!DsaPointerIsValid(pool->spans[1]) &&
1694 transfer_first_span(area, pool, 0, 1))
1695 return true;
1696
1697 /*
1698 * We failed to find an existing span with free objects, so we need to
1699 * allocate a new superblock and construct a new span to manage it.
1700 *
1701 * First, get a dsa_area_span object to describe the new superblock block
1702 * ... unless this allocation is for a dsa_area_span object, in which case
1703 * that's surely not going to work. We handle that case by storing the
1704 * span describing a block-of-spans inline.
1705 */
1706 if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1707 {
1708 span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
1709 if (!DsaPointerIsValid(span_pointer))
1710 return false;
1711 npages = DSA_PAGES_PER_SUPERBLOCK;
1712 }
1713
1714 /* Find or create a segment and allocate the superblock. */
1715 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1716 segment_map = get_best_segment(area, npages);
1717 if (segment_map == NULL)
1718 {
1719 segment_map = make_new_segment(area, npages);
1720 if (segment_map == NULL)
1721 {
1722 LWLockRelease(DSA_AREA_LOCK(area));
1723 return false;
1724 }
1725 }
1726
1727 /*
1728 * This shouldn't happen: get_best_segment() or make_new_segment()
1729 * promised that we can successfully allocate npages.
1730 */
1731 if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
1732 elog(FATAL,
1733 "dsa_allocate could not find %zu free pages for superblock",
1734 npages);
1735 LWLockRelease(DSA_AREA_LOCK(area));
1736
1737 /* Compute the start of the superblock. */
1738 start_pointer =
1739 DSA_MAKE_POINTER(get_segment_index(area, segment_map),
1740 first_page * FPM_PAGE_SIZE);
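/*
 * Illustrative note (editorial, not part of dsa.c): the dsa_pointer built
 * here packs the segment index and the byte offset within that segment into
 * a single 64-bit value; DSA_EXTRACT_SEGMENT_NUMBER() and
 * DSA_EXTRACT_OFFSET() recover the two halves when the pointer is later
 * resolved back to a backend-local address.
 */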
1741
1742 /*
1743 * If this is a block-of-spans, carve the descriptor right out of the
1744 * allocated space.
1745 */
1746 if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1747 {
1748 /*
1749 * We have a pointer into the segment. We need to build a dsa_pointer
1750 * from the segment index and offset into the segment.
1751 */
1752 span_pointer = start_pointer;
1753 }
1754
1755 /* Initialize span and pagemap. */
1756 init_span(area, span_pointer, pool, start_pointer, npages, size_class);
1757 for (i = 0; i < npages; ++i)
1758 segment_map->pagemap[first_page + i] = span_pointer;
1759
1760 return true;
1761}
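For context, here is a minimal caller-side sketch of the allocation round trip that ends up in the machinery above. It is an editorial illustration, not part of dsa.c: example_small_allocation is a hypothetical helper, dsa_allocate() is the no-flags convenience wrapper around dsa_allocate_extended(), and error handling is omitted.

static void
example_small_allocation(dsa_area *area)
{
	/* Small request: served from a superblock of the matching size class. */
	dsa_pointer dp = dsa_allocate(area, sizeof(int));
	int		   *value = (int *) dsa_get_address(area, dp);

	*value = 42;				/* dp, not value, is what other backends can use */
	dsa_free(area, dp);
}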
1762
1763/*
1764 * Return the segment map corresponding to a given segment index, mapping the
1765 * segment in if necessary. For internal segment book-keeping, this is called
1766 * with the area lock held. It is also called by dsa_free and dsa_get_address
1767 * without any locking, relying on the fact that they have a known live segment
1768 * index and that they always call check_for_freed_segments to ensure that any
1769 * freed segment occupying the same slot is detached first.
1770 */
1771static dsa_segment_map *
1772get_segment_by_index(dsa_area *area, dsa_segment_index index)
1773 {
1774 if (unlikely(area->segment_maps[index].mapped_address == NULL))
1775 {
1776 dsm_handle handle;
1777 dsm_segment *segment;
1778 dsa_segment_map *segment_map;
1779 ResourceOwner oldowner;
1780
1781 /*
1782 * If we are reached by dsa_free or dsa_get_address, there must be at
1783 * least one object allocated in the referenced segment. Otherwise,
1784 * their caller has a double-free or access-after-free bug, which we
1785 * have no hope of detecting. So we know it's safe to access this
1786 * array slot without holding a lock; it won't change underneath us.
1787 * Furthermore, we know that we can see the latest contents of the
1788 * slot, as explained in check_for_freed_segments, which those
1789 * functions call before arriving here.
1790 */
1791 handle = area->control->segment_handles[index];
1792
1793 /* It's an error to try to access an unused slot. */
1794 if (handle == DSM_HANDLE_INVALID)
1795 elog(ERROR,
1796 "dsa_area could not attach to a segment that has been freed");
1797
1798 oldowner = CurrentResourceOwner;
1799 CurrentResourceOwner = area->resowner;
1800 segment = dsm_attach(handle);
1801 CurrentResourceOwner = oldowner;
1802 if (segment == NULL)
1803 elog(ERROR, "dsa_area could not attach to segment");
1804 segment_map = &area->segment_maps[index];
1805 segment_map->segment = segment;
1806 segment_map->mapped_address = dsm_segment_address(segment);
1807 segment_map->header =
1808 (dsa_segment_header *) segment_map->mapped_address;
1809 segment_map->fpm = (FreePageManager *)
1810 (segment_map->mapped_address +
1811 MAXALIGN(sizeof(dsa_segment_header)));
1812 segment_map->pagemap = (dsa_pointer *)
1813 (segment_map->mapped_address +
1814 MAXALIGN(sizeof(dsa_segment_header)) +
1815 MAXALIGN(sizeof(FreePageManager)));
1816
1817 /* Remember the highest index this backend has ever mapped. */
1818 if (area->high_segment_index < index)
1819 area->high_segment_index = index;
1820
1821 Assert(segment_map->header->magic ==
1822 DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index);
1823 }
1824
1825 /*
1826 * Callers of dsa_get_address() and dsa_free() don't hold the area lock,
1827 * but it's a bug in the calling code and undefined behavior if the
1828 * address is not live (ie if the segment might possibly have been freed,
1829 * they're trying to use a dangling pointer).
1830 *
1831 * For dsa.c code that holds the area lock to manipulate segment_bins
1832 * lists, it would be a bug if we ever reach a freed segment here. After
1833 * it's marked as freed, the only thing any backend should do with it is
1834 * unmap it, and it should always have done that in
1835 * check_for_freed_segments_locked() before arriving here to resolve an
1836 * index to a segment_map.
1837 *
1838 * Either way we can assert that we aren't returning a freed segment.
1839 */
1840 Assert(!area->segment_maps[index].header->freed);
1841
1842 return &area->segment_maps[index];
1843}
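To show how the pieces fit together, here is a rough sketch of what resolving a dsa_pointer involves. It is an editorial simplification, not the real dsa_get_address(), which additionally handles InvalidDsaPointer and asserts that the offset is sane; resolve_dsa_pointer is a hypothetical name.

static void *
resolve_dsa_pointer(dsa_area *area, dsa_pointer dp)
{
	dsa_segment_index index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
	size_t		offset = DSA_EXTRACT_OFFSET(dp);

	/* Detach any stale mapping occupying this slot before using it. */
	check_for_freed_segments(area);
	return get_segment_by_index(area, index)->mapped_address + offset;
}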
1844
1845/*
1846 * Return a superblock to the free page manager. If the underlying segment
1847 * has become entirely free, then return it to the operating system.
1848 *
1849 * The appropriate pool lock must be held.
1850 */
1851static void
1852destroy_superblock(dsa_area *area, dsa_pointer span_pointer)
1853 {
1854 dsa_area_span *span = dsa_get_address(area, span_pointer);
1855 int size_class = span->size_class;
1856 dsa_segment_map *segment_map;
1857
1858
1859 /* Remove it from its fullness class list. */
1860 unlink_span(area, span);
1861
1862 /*
1863 * Note: Here we acquire the area lock while we already hold a per-pool
1864 * lock. We never hold the area lock and then take a pool lock, or we
1865 * could deadlock.
1866 */
1867 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1868 check_for_freed_segments_locked(area);
1869 segment_map =
1870 get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start));
1871 FreePageManagerPut(segment_map->fpm,
1872 DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
1873 span->npages);
1874 /* Check if the segment is now entirely free. */
1875 if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages)
1876 {
1877 dsa_segment_index index = get_segment_index(area, segment_map);
1878
1879 /* If it's not the segment with extra control data, free it. */
1880 if (index != 0)
1881 {
1882 /*
1883 * Give it back to the OS, and allow other backends to detect that
1884 * they need to detach.
1885 */
1886 unlink_segment(area, segment_map);
1887 segment_map->header->freed = true;
1888 Assert(area->control->total_segment_size >=
1889 segment_map->header->size);
1890 area->control->total_segment_size -=
1891 segment_map->header->size;
1892 dsm_unpin_segment(dsm_segment_handle(segment_map->segment));
1893 dsm_detach(segment_map->segment);
1894 area->control->segment_handles[index] = DSM_HANDLE_INVALID;
1895 ++area->control->freed_segment_counter;
1896 segment_map->segment = NULL;
1897 segment_map->header = NULL;
1898 segment_map->mapped_address = NULL;
1899 }
1900 }
1901
1902 /* Move segment to appropriate bin if necessary. */
1903 if (segment_map->header != NULL)
1904 rebin_segment(area, segment_map);
1905
1906 LWLockRelease(DSA_AREA_LOCK(area));
1907
1908 /*
1909 * Span-of-spans blocks store the span which describes them within the
1910 * block itself, so freeing the storage implicitly frees the descriptor
1911 * also. If this is a block of any other type, we need to separately free
1912 * the span object also. This recursive call to dsa_free will acquire the
1913 * span pool's lock. We can't deadlock because the acquisition order is
1914 * always some other pool and then the span pool.
1915 */
1916 if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1917 dsa_free(area, span_pointer);
1918}
1919
1920static void
1921unlink_span(dsa_area *area, dsa_area_span *span)
1922 {
1923 if (DsaPointerIsValid(span->nextspan))
1924 {
1925 dsa_area_span *next = dsa_get_address(area, span->nextspan);
1926
1927 next->prevspan = span->prevspan;
1928 }
1929 if (DsaPointerIsValid(span->prevspan))
1930 {
1931 dsa_area_span *prev = dsa_get_address(area, span->prevspan);
1932
1933 prev->nextspan = span->nextspan;
1934 }
1935 else
1936 {
1937 dsa_area_pool *pool = dsa_get_address(area, span->pool);
1938
1939 pool->spans[span->fclass] = span->nextspan;
1940 }
1941}
1942
1943static void
1944add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
1945 dsa_pointer span_pointer,
1946 int fclass)
1947{
1948 dsa_area_pool *pool = dsa_get_address(area, span->pool);
1949
1950 if (DsaPointerIsValid(pool->spans[fclass]))
1951 {
1952 dsa_area_span *head = dsa_get_address(area,
1953 pool->spans[fclass]);
1954
1955 head->prevspan = span_pointer;
1956 }
1957 span->prevspan = InvalidDsaPointer;
1958 span->nextspan = pool->spans[fclass];
1959 pool->spans[fclass] = span_pointer;
1960 span->fclass = fclass;
1961}
1962
1963/*
1964 * Detach from an area that was either created or attached to by this process.
1965 */
1966void
1967dsa_detach(dsa_area *area)
1968 {
1969 int i;
1970
1971 /* Detach from all segments. */
1972 for (i = 0; i <= area->high_segment_index; ++i)
1973 if (area->segment_maps[i].segment != NULL)
1974 dsm_detach(area->segment_maps[i].segment);
1975
1976 /*
1977 * Note that 'detaching' (= detaching from DSM segments) doesn't include
1978 * 'releasing' (= adjusting the reference count). It would be nice to
1979 * combine these operations, but client code might never get around to
1980 * calling dsa_detach because of an error path, and a detach hook on any
1981 * particular segment is too late to detach other segments in the area
1982 * without risking a 'leak' warning in the non-error path.
1983 */
1984
1985 /* Free the backend-local area object. */
1986 pfree(area);
1987}
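For reference, here is a minimal sketch of the create/allocate/detach lifecycle these entry points support. It is editorial and makes assumptions: example_dsa_lifecycle and my_tranche_id are placeholders, the segment-size arguments are simply the documented minimum and maximum, and publishing the handle so that other backends can call dsa_attach() is left out.

static void
example_dsa_lifecycle(int my_tranche_id)
{
	dsa_area   *area = dsa_create_ext(my_tranche_id,
									  DSA_MIN_SEGMENT_SIZE,
									  DSA_MAX_SEGMENT_SIZE);
	dsa_handle	handle = dsa_get_handle(area);	/* publish to other backends */
	dsa_pointer dp = dsa_allocate(area, 1024);
	char	   *p = dsa_get_address(area, dp);

	memset(p, 0, 1024);
	dsa_free(area, dp);
	dsa_detach(area);			/* unmaps segments; releasing is separate */
	(void) handle;
}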
1988
1989/*
1990 * Unlink a segment from the bin that contains it.
1991 */
1992static void
1993unlink_segment(dsa_area *area, dsa_segment_map *segment_map)
1994 {
1995 if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE)
1996 {
1997 dsa_segment_map *prev;
1998
1999 prev = get_segment_by_index(area, segment_map->header->prev);
2000 prev->header->next = segment_map->header->next;
2001 }
2002 else
2003 {
2004 Assert(area->control->segment_bins[segment_map->header->bin] ==
2005 get_segment_index(area, segment_map));
2006 area->control->segment_bins[segment_map->header->bin] =
2007 segment_map->header->next;
2008 }
2009 if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2010 {
2011 dsa_segment_map *next;
2012
2013 next = get_segment_by_index(area, segment_map->header->next);
2014 next->header->prev = segment_map->header->prev;
2015 }
2016}
2017
2018/*
2019 * Find a segment that could satisfy a request for 'npages' of contiguous
2020 * memory, or return NULL if none can be found. This may involve attaching to
2021 * segments that weren't previously attached so that we can query their free
2022 * pages map.
2023 */
2024static dsa_segment_map *
2025get_best_segment(dsa_area *area, size_t npages)
2026{
2027 size_t bin;
2028
2029 Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2030 check_for_freed_segments_locked(area);
2031
2032 /*
2033 * Start searching from the first bin that *might* have enough contiguous
2034 * pages.
2035 */
2036 for (bin = contiguous_pages_to_segment_bin(npages);
2037 bin < DSA_NUM_SEGMENT_BINS;
2038 ++bin)
2039 {
2040 /*
2041 * The minimum contiguous size that any segment in this bin should
2042 * have. We'll re-bin if we see segments with fewer.
2043 */
2044 size_t threshold = (size_t) 1 << (bin - 1);
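/*
 * Illustrative example (editorial, not part of dsa.c): with the
 * power-of-two binning implied by this threshold, a request for
 * npages = 300 starts in the bin whose segments promise at least
 * 256 (= 1 << 8) contiguous free pages.  That promise is weaker than
 * the request, so fpm_largest() is still checked below for each
 * segment before it is used.
 */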
2045 dsa_segment_index segment_index;
2046
2047 /* Search this bin for a segment with enough contiguous space. */
2048 segment_index = area->control->segment_bins[bin];
2049 while (segment_index != DSA_SEGMENT_INDEX_NONE)
2050 {
2051 dsa_segment_map *segment_map;
2052 dsa_segment_index next_segment_index;
2053 size_t contiguous_pages;
2054
2055 segment_map = get_segment_by_index(area, segment_index);
2056 next_segment_index = segment_map->header->next;
2057 contiguous_pages = fpm_largest(segment_map->fpm);
2058
2059 /* Not enough for the request, still enough for this bin. */
2060 if (contiguous_pages >= threshold && contiguous_pages < npages)
2061 {
2062 segment_index = next_segment_index;
2063 continue;
2064 }
2065
2066 /* Re-bin it if it's no longer in the appropriate bin. */
2067 if (contiguous_pages < threshold)
2068 {
2069 rebin_segment(area, segment_map);
2070
2071 /*
2072 * But fall through to see if it's enough to satisfy this
2073 * request anyway....
2074 */
2075 }
2076
2077 /* Check if we are done. */
2078 if (contiguous_pages >= npages)
2079 return segment_map;
2080
2081 /* Continue searching the same bin. */
2082 segment_index = next_segment_index;
2083 }
2084 }
2085
2086 /* Not found. */
2087 return NULL;
2088}
2089
2090/*
2091 * Create a new segment that can handle at least requested_pages. Returns
2092 * NULL if the requested total size limit or maximum allowed number of
2093 * segments would be exceeded.
2094 */
2095static dsa_segment_map *
2096make_new_segment(dsa_area *area, size_t requested_pages)
2097{
2098 dsa_segment_index new_index;
2099 size_t metadata_bytes;
2100 size_t total_size;
2101 size_t total_pages;
2102 size_t usable_pages;
2103 dsa_segment_map *segment_map;
2104 dsm_segment *segment;
2105 ResourceOwner oldowner;
2106
2107 Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2108
2109 /* Find a segment slot that is not in use (linearly for now). */
2110 for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index)
2111 {
2112 if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID)
2113 break;
2114 }
2115 if (new_index == DSA_MAX_SEGMENTS)
2116 return NULL;
2117
2118 /*
2119 * If the total size limit is already exceeded, then we exit early and
2120 * avoid arithmetic wraparound in the unsigned expressions below.
2121 */
2122 if (area->control->total_segment_size >=
2123 area->control->max_total_segment_size)
2124 return NULL;
2125
2126 /*
2127 * The size should be at least as big as requested, and at least big
2128 * enough to follow a geometric series that approximately doubles the
2129 * total storage each time we create a new segment. We use geometric
2130 * growth because the underlying DSM system isn't designed for large
2131 * numbers of segments (otherwise we might even consider just using one
2132 * DSM segment for each large allocation and for each superblock, and then
2133 * we wouldn't need to use FreePageManager).
2134 *
2135 * We decide on a total segment size first, so that we produce tidy
2136 * power-of-two sized segments. This is a good property to have if we
2137 * move to huge pages in the future. Then we work back to the number of
2138 * pages we can fit.
2139 */
2140 total_size = area->control->init_segment_size *
2141 ((size_t) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));
2142 total_size = Min(total_size, area->control->max_segment_size);
2143 total_size = Min(total_size,
2144 area->control->max_total_segment_size -
2145 area->control->total_segment_size);
2146
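/*
 * Worked example (editorial, with assumed numbers): if the initial segment
 * size were 1MB and DSA_NUM_SEGMENTS_AT_EACH_SIZE were 2, the multiplier
 * above would yield new segment sizes of 1MB, 2MB, 2MB, 4MB, 4MB, ... as
 * new_index grows, each then clamped by max_segment_size and by whatever
 * remains of max_total_segment_size.
 */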
2147 total_pages = total_size / FPM_PAGE_SIZE;
2148 metadata_bytes =
2149 MAXALIGN(sizeof(dsa_segment_header)) +
2150 MAXALIGN(sizeof(FreePageManager)) +
2151 sizeof(dsa_pointer) * total_pages;
2152
2153 /* Add padding up to next page boundary. */
2154 if (metadata_bytes % FPM_PAGE_SIZE != 0)
2155 metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2156 if (total_size <= metadata_bytes)
2157 return NULL;
2158 usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE;
2159 Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size);
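/*
 * Rough worked example (editorial, with assumed numbers): for a 1MB segment
 * with 4KB pages, total_pages is 256, so the pagemap needs
 * 256 * sizeof(dsa_pointer) = 2048 bytes.  Assuming the segment header and
 * FreePageManager together stay well under the remaining 2KB, metadata_bytes
 * rounds up to a single 4096-byte page and usable_pages comes out as 255.
 */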
2160
2161 /* See if that is enough... */
2162 if (requested_pages > usable_pages)
2163 {
2164 /*
2165 * We'll make an odd-sized segment, working forward from the requested
2166 * number of pages.
2167 */
2168 usable_pages = requested_pages;
2169 metadata_bytes =
2170 MAXALIGN(sizeof(dsa_segment_header)) +
2171 MAXALIGN(sizeof(FreePageManager)) +
2172 usable_pages * sizeof(dsa_pointer);
2173
2174 /* Add padding up to next page boundary. */
2175 if (metadata_bytes % FPM_PAGE_SIZE != 0)
2176 metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2177 total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE;
2178
2179 /* Is that too large for dsa_pointer's addressing scheme? */
2180 if (total_size > DSA_MAX_SEGMENT_SIZE)
2181 return NULL;
2182
2183 /* Would that exceed the limit? */
2184 if (total_size > area->control->max_total_segment_size -
2185 area->control->total_segment_size)
2186 return NULL;
2187 }
2188
2189 /* Create the segment. */
2190 oldowner = CurrentResourceOwner;
2191 CurrentResourceOwner = area->resowner;
2192 segment = dsm_create(total_size, 0);
2193 CurrentResourceOwner = oldowner;
2194 if (segment == NULL)
2195 return NULL;
2196 dsm_pin_segment(segment);
2197
2198 /* Store the handle in shared memory to be found by index. */
2199 area->control->segment_handles[new_index] =
2200 dsm_segment_handle(segment);
2201 /* Track the highest segment index in the history of the area. */
2202 if (area->control->high_segment_index < new_index)
2203 area->control->high_segment_index = new_index;
2204 /* Track the highest segment index this backend has ever mapped. */
2205 if (area->high_segment_index < new_index)
2206 area->high_segment_index = new_index;
2207 /* Track total size of all segments. */
2208 area->control->total_segment_size += total_size;
2209 Assert(area->control->total_segment_size <=
2210 area->control->max_total_segment_size);
2211
2212 /* Build a segment map for this segment in this backend. */
2213 segment_map = &area->segment_maps[new_index];
2214 segment_map->segment = segment;
2215 segment_map->mapped_address = dsm_segment_address(segment);
2216 segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
2217 segment_map->fpm = (FreePageManager *)
2218 (segment_map->mapped_address +
2219 MAXALIGN(sizeof(dsa_segment_header)));
2220 segment_map->pagemap = (dsa_pointer *)
2221 (segment_map->mapped_address +
2222 MAXALIGN(sizeof(dsa_segment_header)) +
2223 MAXALIGN(sizeof(FreePageManager)));
2224
2225 /* Set up the free page map. */
2226 FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
2227 FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
2228 usable_pages);
2229
2230 /* Set up the segment header and put it in the appropriate bin. */
2231 segment_map->header->magic =
2232 DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index;
2233 segment_map->header->usable_pages = usable_pages;
2234 segment_map->header->size = total_size;
2235 segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
2236 segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2237 segment_map->header->next =
2238 area->control->segment_bins[segment_map->header->bin];
2239 segment_map->header->freed = false;
2240 area->control->segment_bins[segment_map->header->bin] = new_index;
2241 if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2242 {
2244 get_segment_by_index(area, segment_map->header->next);
2245
2246 Assert(next->header->bin == segment_map->header->bin);
2247 next->header->prev = new_index;
2248 }
2249
2250 return segment_map;
2251}
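Relatedly, the size checks above are what a caller-imposed limit feeds into. Below is a minimal sketch (editorial; example_size_limit is a hypothetical helper and the 64MB figure is arbitrary) of capping an area and handling allocation failure without raising an error.

static void
example_size_limit(dsa_area *area)
{
	dsa_pointer dp;

	/* Cap the area's total segment size at 64MB. */
	dsa_set_size_limit(area, (size_t) 64 * 1024 * 1024);

	/* With DSA_ALLOC_NO_OOM, failure is reported as InvalidDsaPointer. */
	dp = dsa_allocate_extended(area, 8192, DSA_ALLOC_NO_OOM);
	if (!DsaPointerIsValid(dp))
		elog(LOG, "dsa area has reached its size limit");
	else
		dsa_free(area, dp);
}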
2252
2253/*
2254 * Check if any segments have been freed by destroy_superblock, so we can
2255 * detach from them in this backend. This function is called by
2256 * dsa_get_address and dsa_free to make sure that a dsa_pointer they have
2257 * received can be resolved to the correct segment.
2258 *
2259 * The danger we want to defend against is that there could be an old segment
2260 * mapped into a given slot in this backend, and the dsa_pointer they have
2261 * might refer to some new segment in the same slot. So those functions must
2262 * be sure to process all instructions to detach from a freed segment that had
2263 * been generated by the time this process received the dsa_pointer, before
2264 * they call get_segment_by_index.
2265 */
2266static void
2267check_for_freed_segments(dsa_area *area)
2268 {
2269 size_t freed_segment_counter;
2270
2271 /*
2272 * Any other process that has freed a segment has incremented
2273 * freed_segment_counter while holding an LWLock, and that must precede
2274 * any backend creating a new segment in the same slot while holding an
2275 * LWLock, and that must precede the creation of any dsa_pointer pointing
2276 * into the new segment which might reach us here, and the caller must
2277 * have sent the dsa_pointer to this process using appropriate memory
2278 * synchronization (some kind of locking or atomic primitive or system
2279 * call). So all we need to do on the reading side is ask for the load of
2280 * freed_segment_counter to follow the caller's load of the dsa_pointer it
2281 * has, and we can be sure to detect any segments that had been freed as
2282 * of the time that the dsa_pointer reached this process.
2283 */
2284 pg_read_barrier();
2285 freed_segment_counter = area->control->freed_segment_counter;
2286 if (unlikely(area->freed_segment_counter != freed_segment_counter))
2287 {
2288 /* Check all currently mapped segments to find what's been freed. */
2289 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
2290 check_for_freed_segments_locked(area);
2291 LWLockRelease(DSA_AREA_LOCK(area));
2292 }
2293}
2294
2295/*
2296 * Workhorse for check_for_freed_segments(), and also used directly in path
2297 * where the area lock is already held. This should be called after acquiring
2298 * the lock but before looking up any segment by index number, to make sure we
2299 * unmap any stale segments that might have previously had the same index as a
2300 * current segment.
2301 */
2302static void
2303check_for_freed_segments_locked(dsa_area *area)
2304 {
2305 size_t freed_segment_counter;
2306 int i;
2307
2308 Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2309 freed_segment_counter = area->control->freed_segment_counter;
2310 if (unlikely(area->freed_segment_counter != freed_segment_counter))
2311 {
2312 for (i = 0; i <= area->high_segment_index; ++i)
2313 {
2314 if (area->segment_maps[i].header != NULL &&
2315 area->segment_maps[i].header->freed)
2316 {
2317 dsm_detach(area->segment_maps[i].segment);
2318 area->segment_maps[i].segment = NULL;
2319 area->segment_maps[i].header = NULL;
2320 area->segment_maps[i].mapped_address = NULL;
2321 }
2322 }
2323 area->freed_segment_counter = freed_segment_counter;
2324 }
2325}
2326
2327/*
2328 * Re-bin segment if it's no longer in the appropriate bin.
2329 */
2330static void
2331rebin_segment(dsa_area *area, dsa_segment_map *segment_map)
2332 {
2333 size_t new_bin;
2334 dsa_segment_index segment_index;
2335
2336 new_bin = contiguous_pages_to_segment_bin(fpm_largest(segment_map->fpm));
2337 if (segment_map->header->bin == new_bin)
2338 return;
2339
2340 /* Remove it from its current bin. */
2341 unlink_segment(area, segment_map);
2342
2343 /* Push it onto the front of its new bin. */
2344 segment_index = get_segment_index(area, segment_map);
2345 segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2346 segment_map->header->next = area->control->segment_bins[new_bin];
2347 segment_map->header->bin = new_bin;
2348 area->control->segment_bins[new_bin] = segment_index;
2349 if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2350 {
2351 dsa_segment_map *next;
2352
2353 next = get_segment_by_index(area, segment_map->header->next);
2354 Assert(next->header->bin == new_bin);
2355 next->header->prev = segment_index;
2356 }
2357}
#define pg_read_barrier()
Definition: atomics.h:154
static int32 next
Definition: blutils.c:224
#define Min(x, y)
Definition: c.h:1007
#define MAXALIGN(LEN)
Definition: c.h:814
uint8_t uint8
Definition: c.h:540
uint16_t uint16
Definition: c.h:541
#define unlikely(x)
Definition: c.h:406
uint32_t uint32
Definition: c.h:542
#define lengthof(array)
Definition: c.h:791
#define fprintf(file, fmt, msg)
Definition: cubescan.l:21
static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map)
Definition: dsa.c:1993
static void check_for_freed_segments(dsa_area *area)
Definition: dsa.c:2267
static const uint16 dsa_size_classes[]
Definition: dsa.c:225
#define DSA_EXTRACT_SEGMENT_NUMBER(dp)
Definition: dsa.c:96
#define DSA_AREA_LOCK(area)
Definition: dsa.c:132
#define DSA_NUM_SEGMENTS_AT_EACH_SIZE
Definition: dsa.c:69
static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span, dsa_pointer span_pointer, int fclass)
Definition: dsa.c:1944
static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool, int size_class)
Definition: dsa.c:1575
#define DSA_SEGMENT_INDEX_NONE
Definition: dsa.c:105
static dsa_area * create_internal(void *place, size_t size, int tranche_id, dsm_handle control_handle, dsm_segment *control_segment, size_t init_segment_size, size_t max_segment_size)
Definition: dsa.c:1233
dsa_area * dsa_attach(dsa_handle handle)
Definition: dsa.c:510
#define DSA_SEGMENT_HEADER_MAGIC
Definition: dsa.c:89
void dsa_trim(dsa_area *area)
Definition: dsa.c:1058
#define DSA_SPAN_NOTHING_FREE
Definition: dsa.c:375
#define DSA_MAKE_POINTER(segment_number, offset)
Definition: dsa.c:92
dsa_area * dsa_create_in_place_ext(void *place, size_t size, int tranche_id, dsm_segment *segment, size_t init_segment_size, size_t max_segment_size)
Definition: dsa.c:471
#define get_segment_index(area, segment_map_ptr)
Definition: dsa.c:379
dsa_area * dsa_attach_in_place(void *place, dsm_segment *segment)
Definition: dsa.c:560
void * dsa_get_address(dsa_area *area, dsa_pointer dp)
Definition: dsa.c:957
void dsa_on_shmem_exit_release_in_place(int code, Datum place)
Definition: dsa.c:605
void dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place)
Definition: dsa.c:591
static dsa_pointer alloc_object(dsa_area *area, int size_class)
Definition: dsa.c:1487
#define DSA_PAGES_PER_SUPERBLOCK
Definition: dsa.c:82
#define DSA_SIZE_CLASS_MAP_QUANTUM
Definition: dsa.c:258
static size_t contiguous_pages_to_segment_bin(size_t n)
Definition: dsa.c:119
static const uint8 dsa_size_class_map[]
Definition: dsa.c:248
dsa_pointer dsa_allocate_extended(dsa_area *area, size_t size, int flags)
Definition: dsa.c:686
static dsa_segment_map * make_new_segment(dsa_area *area, size_t requested_pages)
Definition: dsa.c:2096
size_t dsa_get_total_size(dsa_area *area)
Definition: dsa.c:1042
#define DSA_SUPERBLOCK_SIZE
Definition: dsa.c:376
#define DsaAreaPoolToDsaPointer(area, p)
Definition: dsa.c:322
static void check_for_freed_segments_locked(dsa_area *area)
Definition: dsa.c:2303
#define DSA_EXTRACT_OFFSET(dp)
Definition: dsa.c:99
dsa_area * dsa_create_ext(int tranche_id, size_t init_segment_size, size_t max_segment_size)
Definition: dsa.c:421
static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer)
Definition: dsa.c:1852
#define DSA_MAX_SEGMENTS
Definition: dsa.c:75
size_t dsa_segment_index
Definition: dsa.c:102
static void rebin_segment(dsa_area *area, dsa_segment_map *segment_map)
Definition: dsa.c:2331
#define DSA_SCLASS_LOCK(area, sclass)
Definition: dsa.c:133
void dsa_release_in_place(void *place)
Definition: dsa.c:620
static dsa_segment_map * get_segment_by_index(dsa_area *area, dsa_segment_index index)
Definition: dsa.c:1772
void dsa_set_size_limit(dsa_area *area, size_t limit)
Definition: dsa.c:1033
#define DSA_SCLASS_BLOCK_OF_SPANS
Definition: dsa.c:239
static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool, int fromclass, int toclass)
Definition: dsa.c:1447
static void unlink_span(dsa_area *area, dsa_area_span *span)
Definition: dsa.c:1921
#define DSA_SCLASS_SPAN_LARGE
Definition: dsa.c:240
#define DSA_NUM_SIZE_CLASSES
Definition: dsa.c:236
void dsa_unpin(dsa_area *area)
Definition: dsa.c:1009
void dsa_pin_mapping(dsa_area *area)
Definition: dsa.c:650
static dsa_area * attach_internal(void *place, dsm_segment *segment, dsa_handle handle)
Definition: dsa.c:1341
#define NextFreeObjectIndex(object)
Definition: dsa.c:202
void dsa_dump(dsa_area *area)
Definition: dsa.c:1103
dsa_handle dsa_get_handle(dsa_area *area)
Definition: dsa.c:498
static void init_span(dsa_area *area, dsa_pointer span_pointer, dsa_area_pool *pool, dsa_pointer start, size_t npages, uint16 size_class)
Definition: dsa.c:1392
bool dsa_is_attached(dsa_handle handle)
Definition: dsa.c:540
void dsa_detach(dsa_area *area)
Definition: dsa.c:1967
static dsa_segment_map * get_best_segment(dsa_area *area, size_t npages)
Definition: dsa.c:2025
#define DSA_FULLNESS_CLASSES
Definition: dsa.c:266
void dsa_free(dsa_area *area, dsa_pointer dp)
Definition: dsa.c:841
#define DSA_NUM_SEGMENT_BINS
Definition: dsa.c:111
size_t dsa_minimum_size(void)
Definition: dsa.c:1211
void dsa_pin(dsa_area *area)
Definition: dsa.c:990
uint64 dsa_pointer
Definition: dsa.h:62
#define DSA_POINTER_FORMAT
Definition: dsa.h:69
#define DSA_MIN_SEGMENT_SIZE
Definition: dsa.h:100
dsm_handle dsa_handle
Definition: dsa.h:136
#define InvalidDsaPointer
Definition: dsa.h:78
#define DSA_ALLOC_NO_OOM
Definition: dsa.h:74
#define DSA_HANDLE_INVALID
Definition: dsa.h:139
#define DsaPointerIsValid(x)
Definition: dsa.h:106
#define DSA_MAX_SEGMENT_SIZE
Definition: dsa.h:103
#define DSA_ALLOC_HUGE
Definition: dsa.h:73
#define DSA_ALLOC_ZERO
Definition: dsa.h:75
dsm_handle dsm_segment_handle(dsm_segment *seg)
Definition: dsm.c:1123
void dsm_detach(dsm_segment *seg)
Definition: dsm.c:803
void on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function, Datum arg)
Definition: dsm.c:1132
void dsm_pin_mapping(dsm_segment *seg)
Definition: dsm.c:915
void dsm_unpin_segment(dsm_handle handle)
Definition: dsm.c:988
void dsm_pin_segment(dsm_segment *seg)
Definition: dsm.c:955
void * dsm_segment_address(dsm_segment *seg)
Definition: dsm.c:1095
dsm_segment * dsm_create(Size size, int flags)
Definition: dsm.c:516
dsm_segment * dsm_attach(dsm_handle h)
Definition: dsm.c:665
dsm_segment * dsm_find_mapping(dsm_handle handle)
Definition: dsm.c:1076
uint32 dsm_handle
Definition: dsm_impl.h:55
#define DSM_HANDLE_INVALID
Definition: dsm_impl.h:58
int errdetail(const char *fmt,...)
Definition: elog.c:1216
int errcode(int sqlerrcode)
Definition: elog.c:863
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define FATAL
Definition: elog.h:41
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
bool FreePageManagerGet(FreePageManager *fpm, Size npages, Size *first_page)
Definition: freepage.c:210
void FreePageManagerPut(FreePageManager *fpm, Size first_page, Size npages)
Definition: freepage.c:379
void FreePageManagerInitialize(FreePageManager *fpm, char *base)
Definition: freepage.c:183
#define fpm_largest(fpm)
Definition: freepage.h:88
#define fpm_size_to_pages(sz)
Definition: freepage.h:74
#define FPM_PAGE_SIZE
Definition: freepage.h:30
Assert(PointerIsAligned(start, uint64))
return str start
int j
Definition: isn.c:78
int i
Definition: isn.c:77
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1977
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1174
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1894
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition: lwlock.c:698
@ LW_EXCLUSIVE
Definition: lwlock.h:112
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc(Size size)
Definition: mcxt.c:1365
#define AllocHugeSizeIsValid(size)
Definition: memutils.h:49
#define AllocSizeIsValid(size)
Definition: memutils.h:42
#define pg_leftmost_one_pos_size_t
Definition: pg_bitutils.h:440
static int64 total_size
Definition: pg_checksums.c:63
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:332
uint64_t Datum
Definition: postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:322
ResourceOwner CurrentResourceOwner
Definition: resowner.c:173
Definition: lwlock.h:42
dsa_segment_header segment_header
Definition: dsa.c:290
size_t init_segment_size
Definition: dsa.c:300
size_t total_segment_size
Definition: dsa.c:304
int lwlock_tranche_id
Definition: dsa.c:316
size_t max_segment_size
Definition: dsa.c:302
dsa_segment_index high_segment_index
Definition: dsa.c:308
bool pinned
Definition: dsa.c:312
size_t max_total_segment_size
Definition: dsa.c:306
dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS]
Definition: dsa.c:296
dsa_area_pool pools[DSA_NUM_SIZE_CLASSES]
Definition: dsa.c:298
size_t freed_segment_counter
Definition: dsa.c:314
int refcnt
Definition: dsa.c:310
LWLock lock
Definition: dsa.c:318
dsa_handle handle
Definition: dsa.c:292
dsm_handle segment_handles[DSA_MAX_SEGMENTS]
Definition: dsa.c:294
dsa_pointer spans[DSA_FULLNESS_CLASSES]
Definition: dsa.c:279
LWLock lock
Definition: dsa.c:277
dsa_pointer nextspan
Definition: dsa.c:187
uint16 fclass
Definition: dsa.c:195
dsa_pointer start
Definition: dsa.c:188
uint16 nallocatable
Definition: dsa.c:192
dsa_pointer prevspan
Definition: dsa.c:186
uint16 size_class
Definition: dsa.c:190
uint16 nmax
Definition: dsa.c:194
uint16 ninitialized
Definition: dsa.c:191
uint16 firstfree
Definition: dsa.c:193
dsa_pointer pool
Definition: dsa.c:185
size_t npages
Definition: dsa.c:189
Definition: dsa.c:348
dsa_segment_map segment_maps[DSA_MAX_SEGMENTS]
Definition: dsa.c:366
dsa_segment_index high_segment_index
Definition: dsa.c:369
size_t freed_segment_counter
Definition: dsa.c:372
dsa_area_control * control
Definition: dsa.c:350
ResourceOwner resowner
Definition: dsa.c:358
uint32 magic
Definition: dsa.c:143
size_t size
Definition: dsa.c:147
dsa_segment_index next
Definition: dsa.c:159
dsa_segment_index prev
Definition: dsa.c:153
size_t usable_pages
Definition: dsa.c:145
size_t bin
Definition: dsa.c:161
dsa_segment_header * header
Definition: dsa.c:336
FreePageManager * fpm
Definition: dsa.c:337
dsm_segment * segment
Definition: dsa.c:334
dsa_pointer * pagemap
Definition: dsa.c:338
char * mapped_address
Definition: dsa.c:335
Definition: type.h:96