/*-------------------------------------------------------------------------
 *
 * dsa.c
 *    Dynamic shared memory areas.
 *
 * This module provides dynamic shared memory areas which are built on top of
 * DSM segments.  While dsm.c allows segments of shared memory to be created
 * and shared between backends, it isn't designed to deal with small objects.
 * A DSA area is a shared memory heap usually backed by one or more DSM
 * segments which can allocate memory using dsa_allocate() and dsa_free().
 * Alternatively, it can be created in pre-existing shared memory, including a
 * DSM segment, and then create extra DSM segments as required.  Unlike the
 * regular system heap, it deals in pseudo-pointers which must be converted to
 * backend-local pointers before they are dereferenced.  These pseudo-pointers
 * can however be shared with other backends, and can be used to construct
 * shared data structures.
 *
 * Each DSA area manages a set of DSM segments, adding new segments as
 * required and detaching them when they are no longer needed.  Each segment
 * contains a number of 4KB pages, a free page manager for tracking
 * consecutive runs of free pages, and a page map for tracking the source of
 * objects allocated on each page.  Allocation requests above 8KB are handled
 * by choosing a segment and finding consecutive free pages in its free page
 * manager.  Allocation requests for smaller sizes are handled using pools of
 * objects of a selection of sizes.  Each pool consists of a number of 16 page
 * (64KB) superblocks allocated in the same way as large objects.  Allocation
 * of large objects and new superblocks is serialized by a single LWLock, but
 * allocation of small objects from pre-existing superblocks uses one LWLock
 * per pool.  Currently there is one pool, and therefore one lock, per size
 * class.  Per-core pools to increase concurrency and strategies for reducing
 * the resulting fragmentation are areas for future research.  Each superblock
 * is managed with a 'span', which tracks the superblock's freelist.  Free
 * requests are handled by looking in the page map to find which span an
 * address was allocated from, so that small objects can be returned to the
 * appropriate free list, and large object pages can be returned directly to
 * the free page map.  When allocating, simple heuristics for selecting
 * segments and superblocks try to encourage occupied memory to be
 * concentrated, increasing the likelihood that whole superblocks can become
 * empty and be returned to the free page manager, and whole segments can
 * become empty and be returned to the operating system.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/utils/mmgr/dsa.c
 *
 *-------------------------------------------------------------------------
 */
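
/*
 * A minimal usage sketch (illustrative only; see utils/dsa.h for the
 * authoritative interface).  One backend creates an area and allocates from
 * it; any backend that can obtain the resulting dsa_pointer can convert it
 * to a local address after attaching:
 *
 *     dsa_area   *area = dsa_create(tranche_id);
 *     dsa_pointer dp = dsa_allocate(area, sizeof(int));
 *     int        *p = (int *) dsa_get_address(area, dp);
 *
 *     *p = 42;               // visible to other backends attached to area
 *     dsa_free(area, dp);    // return the memory to the area
 *
 * Here 'tranche_id' is assumed to be an LWLock tranche ID obtained
 * elsewhere, e.g. via LWLockNewTrancheId().
 */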

#include "postgres.h"

#include "port/atomics.h"
#include "port/pg_bitutils.h"
#include "storage/dsm.h"
#include "storage/lwlock.h"
#include "utils/dsa.h"
#include "utils/freepage.h"
#include "utils/memutils.h"
#include "utils/resowner.h"

/*
 * How many segments to create before we double the segment size.  If this is
 * low, then there is likely to be a lot of wasted space in the largest
 * segment.  If it is high, then we risk running out of segment slots (see
 * dsm.c's limits on total number of segments), or limiting the total size
 * an area can manage when using small pointers.
 */
#define DSA_NUM_SEGMENTS_AT_EACH_SIZE 2

/*
 * The maximum number of DSM segments that an area can own, determined by
 * the number of bits remaining (but capped at 1024).
 */
#define DSA_MAX_SEGMENTS \
    Min(1024, (1 << ((SIZEOF_DSA_POINTER * 8) - DSA_OFFSET_WIDTH)))

/* The bitmask for extracting the offset from a dsa_pointer. */
#define DSA_OFFSET_BITMASK (((dsa_pointer) 1 << DSA_OFFSET_WIDTH) - 1)

/* Number of pages (see FPM_PAGE_SIZE) per regular superblock. */
#define DSA_PAGES_PER_SUPERBLOCK 16

/*
 * A magic number used as a sanity check for following DSM segments belonging
 * to a DSA area (this number will be XORed with the area handle and
 * the segment index).
 */
#define DSA_SEGMENT_HEADER_MAGIC 0x0ce26608

/* Build a dsa_pointer given a segment number and offset. */
#define DSA_MAKE_POINTER(segment_number, offset) \
    (((dsa_pointer) (segment_number) << DSA_OFFSET_WIDTH) | (offset))

/* Extract the segment number from a dsa_pointer. */
#define DSA_EXTRACT_SEGMENT_NUMBER(dp) ((dp) >> DSA_OFFSET_WIDTH)

/* Extract the offset from a dsa_pointer. */
#define DSA_EXTRACT_OFFSET(dp) ((dp) & DSA_OFFSET_BITMASK)

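/*
 * For illustration (not part of the original file): on a typical 64-bit
 * build DSA_OFFSET_WIDTH is 40, so DSA_MAKE_POINTER(2, 0x2000) stores
 * segment number 2 in the high bits and offset 0x2000 in the low 40 bits;
 * DSA_EXTRACT_SEGMENT_NUMBER() and DSA_EXTRACT_OFFSET() recover 2 and
 * 0x2000 respectively.  See utils/dsa.h for the platform-dependent width.
 */
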
/* The type used for DSA segment indexes (zero based). */
typedef size_t dsa_segment_index;

/* Sentinel value for dsa_segment_index indicating 'none' or 'end'. */
#define DSA_SEGMENT_INDEX_NONE (~(dsa_segment_index) 0)

/*
 * How many bins of segments do we have?  The bins are used to categorize
 * segments by their largest contiguous run of free pages.
 */
#define DSA_NUM_SEGMENT_BINS 16

/*
 * What is the lowest bin that holds segments that *might* have n contiguous
 * free pages?  There is no point in looking in segments in lower bins; they
 * definitely can't service a request for n free pages.
 */
static inline size_t
contiguous_pages_to_segment_bin(size_t n)
{
    size_t      bin;

    if (n == 0)
        bin = 0;
    else
        bin = pg_leftmost_one_pos_size_t(n) + 1;

    return Min(bin, DSA_NUM_SEGMENT_BINS - 1);
}
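
/*
 * Worked example (illustrative only): a request for 1 page maps to bin 1, a
 * request for 5 pages (highest set bit at position 2) maps to bin 3, and any
 * request of 2^14 pages or more saturates at bin DSA_NUM_SEGMENT_BINS - 1.
 * Segments in bins below the computed one are guaranteed to be too
 * fragmented to satisfy the request.
 */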

/* Macros for access to locks. */
#define DSA_AREA_LOCK(area) (&area->control->lock)
#define DSA_SCLASS_LOCK(area, sclass) (&area->control->pools[sclass].lock)

/*
 * The header for an individual segment.  This lives at the start of each DSM
 * segment owned by a DSA area including the first segment (where it appears
 * as part of the dsa_area_control struct).
 */
typedef struct
{
    /* Sanity check magic value. */
    uint32      magic;
    /* Total number of pages in this segment (excluding metadata area). */
    size_t      usable_pages;
    /* Total size of this segment in bytes. */
    size_t      size;

    /*
     * Index of the segment that precedes this one in the same segment bin,
     * or DSA_SEGMENT_INDEX_NONE if this is the first one.
     */
    dsa_segment_index prev;

    /*
     * Index of the segment that follows this one in the same segment bin,
     * or DSA_SEGMENT_INDEX_NONE if this is the last one.
     */
    dsa_segment_index next;

    /* The index of the bin that contains this segment. */
    size_t      bin;

    /*
     * A flag raised to indicate that this segment is being returned to the
     * operating system and has been unpinned.
     */
    bool        freed;
} dsa_segment_header;

/*
 * Metadata for one superblock.
 *
 * For most blocks, span objects are stored out-of-line; that is, the span
 * object is not stored within the block itself.  But, as an exception, for a
 * "span of spans", the span object is stored "inline".  The allocation is
 * always exactly one page, and the dsa_area_span object is located at
 * the beginning of that page.  The size class is DSA_SCLASS_BLOCK_OF_SPANS,
 * and the remaining fields are used just as they would be in an ordinary
 * block.  We can't allocate spans out of ordinary superblocks because
 * creating an ordinary superblock requires us to be able to allocate a span
 * *first*.  Doing it this way avoids that circularity.
 */
typedef struct
{
    dsa_pointer pool;           /* Containing pool. */
    dsa_pointer prevspan;       /* Previous span. */
    dsa_pointer nextspan;       /* Next span. */
    dsa_pointer start;          /* Starting address. */
    size_t      npages;         /* Length of span in pages. */
    uint16      size_class;     /* Size class. */
    uint16      ninitialized;   /* Maximum number of objects ever allocated. */
    uint16      nallocatable;   /* Number of objects currently allocatable. */
    uint16      firstfree;      /* First object on free list. */
    uint16      nmax;           /* Maximum number of objects ever possible. */
    uint16      fclass;         /* Current fullness class. */
} dsa_area_span;

/*
 * Given a pointer to an object in a span, access the index of the next free
 * object in the same span (ie in the span's freelist) as an L-value.
 */
#define NextFreeObjectIndex(object) (* (uint16 *) (object))

/*
 * Small allocations are handled by dividing a single block of memory into
 * many small objects of equal size.  The possible allocation sizes are
 * defined by the following array.  Larger size classes are spaced more widely
 * than smaller size classes.  We fudge the spacing for size classes >1kB to
 * avoid space wastage: based on the knowledge that we plan to allocate 64kB
 * blocks, we bump the maximum object size up to the largest multiple of
 * 8 bytes that still lets us fit the same number of objects into one block.
 *
 * NB: Because of this fudging, if we were ever to use differently-sized blocks
 * for small allocations, these size classes would need to be reworked to be
 * optimal for the new size.
 *
 * NB: The optimal spacing for size classes, as well as the size of the blocks
 * out of which small objects are allocated, is not a question that has one
 * right answer.  Some allocators (such as tcmalloc) use more closely-spaced
 * size classes than we do here, while others (like aset.c) use more
 * widely-spaced classes.  Spacing the classes more closely avoids wasting
 * memory within individual chunks, but also means a larger number of
 * potentially-unfilled blocks.
 */
static const uint16 dsa_size_classes[] = {
    sizeof(dsa_area_span), 0,       /* special size classes */
    8, 16, 24, 32, 40, 48, 56, 64,  /* 8 classes separated by 8 bytes */
    80, 96, 112, 128,               /* 4 classes separated by 16 bytes */
    160, 192, 224, 256,             /* 4 classes separated by 32 bytes */
    320, 384, 448, 512,             /* 4 classes separated by 64 bytes */
    640, 768, 896, 1024,            /* 4 classes separated by 128 bytes */
    1280, 1560, 1816, 2048,         /* 4 classes separated by ~256 bytes */
    2616, 3120, 3640, 4096,         /* 4 classes separated by ~512 bytes */
    5456, 6552, 7280, 8192          /* 4 classes separated by ~1024 bytes */
};
#define DSA_NUM_SIZE_CLASSES lengthof(dsa_size_classes)

/* Special size classes. */
#define DSA_SCLASS_BLOCK_OF_SPANS 0
#define DSA_SCLASS_SPAN_LARGE 1

/*
 * The following lookup table is used to map the size of small objects
 * (less than 1kB) onto the corresponding size class.  To use this table,
 * round the size of the object up to the next multiple of 8 bytes, and then
 * index into this array.
 */
static const uint8 dsa_size_class_map[] = {
    2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13,
    14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17,
    18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
    20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21,
    22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
    23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
    25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
};
#define DSA_SIZE_CLASS_MAP_QUANTUM 8
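
/*
 * Worked example (illustrative only): a request for 100 bytes rounds up to
 * 13 quanta of 8 bytes, giving map index 12 in dsa_allocate_extended();
 * dsa_size_class_map[12] is 12, and dsa_size_classes[12] is 112, the
 * smallest class that can hold 100 bytes.
 */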

/*
 * Superblocks are binned by how full they are.  Generally, each fullness
 * class corresponds to one quartile, but the block being used for
 * allocations is always at the head of the list for fullness class 1,
 * regardless of how full it really is.
 */
#define DSA_FULLNESS_CLASSES 4

/*
 * A dsa_area_pool represents a set of objects of a given size class.
 *
 * Perhaps there should be multiple pools for the same size class for
 * contention avoidance, but for now there is just one!
 */
typedef struct
{
    /* A lock protecting access to this pool. */
    LWLock      lock;
    /* A set of linked lists of spans, arranged by fullness. */
    dsa_pointer spans[DSA_FULLNESS_CLASSES];
    /* Should we pad this out to a cacheline boundary? */
} dsa_area_pool;

/*
 * The control block for an area.  This lives in shared memory, at the start of
 * the first DSM segment controlled by this area.
 */
typedef struct
{
    /* The segment header for the first segment. */
    dsa_segment_header segment_header;
    /* The handle for this area. */
    dsa_handle  handle;
    /* The handles of the segments owned by this area. */
    dsm_handle  segment_handles[DSA_MAX_SEGMENTS];
    /* Lists of segments, binned by maximum contiguous run of free pages. */
    dsa_segment_index segment_bins[DSA_NUM_SEGMENT_BINS];
    /* The object pools for each size class. */
    dsa_area_pool pools[DSA_NUM_SIZE_CLASSES];
    /* initial allocation segment size */
    size_t      init_segment_size;
    /* maximum allocation segment size */
    size_t      max_segment_size;
    /* The total size of all active segments. */
    size_t      total_segment_size;
    /* The maximum total size of backing storage we are allowed. */
    size_t      max_total_segment_size;
    /* Highest used segment index in the history of this area. */
    dsa_segment_index high_segment_index;
    /* The reference count for this area. */
    int         refcnt;
    /* A flag indicating that this area has been pinned. */
    bool        pinned;
    /* The number of times that segments have been freed. */
    size_t      freed_segment_counter;
    /* The LWLock tranche ID. */
    int         lwlock_tranche_id;
    /* The general lock (protects everything except object pools). */
    LWLock      lock;
} dsa_area_control;

/* Given a pointer to a pool, find a dsa_pointer. */
#define DsaAreaPoolToDsaPointer(area, p) \
    DSA_MAKE_POINTER(0, (char *) p - (char *) area->control)

/*
 * A dsa_segment_map is stored within the backend-private memory of each
 * individual backend.  It holds the base address of the segment within that
 * backend, plus the addresses of key objects within the segment.  Those
 * could instead be derived from the base address but it's handy to have them
 * around.
 */
typedef struct
{
    dsm_segment *segment;       /* DSM segment */
    char       *mapped_address; /* Address at which segment is mapped */
    dsa_segment_header *header; /* Header (same as mapped_address) */
    FreePageManager *fpm;       /* Free page manager within segment. */
    dsa_pointer *pagemap;       /* Page map within segment. */
} dsa_segment_map;

/*
 * Per-backend state for a storage area.  Backends obtain one of these by
 * creating an area or attaching to an existing one using a handle.  Each
 * process that needs to use an area uses its own object to track where the
 * segments are mapped.
 */
struct dsa_area
{
    /* Pointer to the control object in shared memory. */
    dsa_area_control *control;

    /*
     * All the mappings are owned by this.  The dsa_area itself is not
     * directly tracked by the ResourceOwner, but the effect is the same.
     * NULL if the attachment has session lifespan, i.e if dsa_pin_mapping()
     * has been called.
     */
    ResourceOwner resowner;

    /*
     * This backend's array of segment maps, ordered by segment index
     * corresponding to control->segment_handles.  Some of the area's
     * segments may not be mapped in this backend yet, and some slots may
     * have been freed and need to be detached; these operations happen on
     * demand.
     */
    dsa_segment_map segment_maps[DSA_MAX_SEGMENTS];

    /* The highest segment index this backend has ever mapped. */
    dsa_segment_index high_segment_index;

    /* The last observed freed_segment_counter. */
    size_t      freed_segment_counter;
};

#define DSA_SPAN_NOTHING_FREE ((uint16) -1)
#define DSA_SUPERBLOCK_SIZE (DSA_PAGES_PER_SUPERBLOCK * FPM_PAGE_SIZE)

/* Given a pointer to a segment_map, obtain a segment index number. */
#define get_segment_index(area, segment_map_ptr) \
    (segment_map_ptr - &area->segment_maps[0])

static void init_span(dsa_area *area, dsa_pointer span_pointer,
                      dsa_area_pool *pool, dsa_pointer start, size_t npages,
                      uint16 size_class);
static bool transfer_first_span(dsa_area *area, dsa_area_pool *pool,
                                int fromclass, int toclass);
static inline dsa_pointer alloc_object(dsa_area *area, int size_class);
static bool ensure_active_superblock(dsa_area *area, dsa_area_pool *pool,
                                     int size_class);
static dsa_segment_map *get_segment_by_index(dsa_area *area,
                                             dsa_segment_index index);
static void destroy_superblock(dsa_area *area, dsa_pointer span_pointer);
static void unlink_span(dsa_area *area, dsa_area_span *span);
static void add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
                                       dsa_pointer span_pointer, int fclass);
static void unlink_segment(dsa_area *area, dsa_segment_map *segment_map);
static dsa_segment_map *get_best_segment(dsa_area *area, size_t npages);
static dsa_segment_map *make_new_segment(dsa_area *area, size_t requested_pages);
static dsa_area *create_internal(void *place, size_t size,
                                 int tranche_id,
                                 dsm_handle control_handle,
                                 dsm_segment *control_segment,
                                 size_t init_segment_size,
                                 size_t max_segment_size);
static dsa_area *attach_internal(void *place, dsm_segment *segment,
                                 dsa_handle handle);
static void check_for_freed_segments(dsa_area *area);
static void check_for_freed_segments_locked(dsa_area *area);
static void rebin_segment(dsa_area *area, dsa_segment_map *segment_map);

/*
 * Create a new shared area in a new DSM segment.  Further DSM segments will
 * be allocated as required to extend the available space.
 *
 * We can't allocate a LWLock tranche_id within this function, because tranche
 * IDs are a scarce resource; there are only 64k available, using low numbers
 * when possible matters, and we have no provision for recycling them.  So,
 * we require the caller to provide one.
 */
dsa_area *
dsa_create_ext(int tranche_id, size_t init_segment_size, size_t max_segment_size)
{
    dsm_segment *segment;
    dsa_area   *area;

    /*
     * Create the DSM segment that will hold the shared control object and the
     * first segment of usable space.
     */
    segment = dsm_create(init_segment_size, 0);

    /*
     * All segments backing this area are pinned, so that DSA can explicitly
     * control their lifetime (otherwise a newly created segment belonging to
     * this area might be freed when the only backend that happens to have it
     * mapped in ends, corrupting the area).
     */
    dsm_pin_segment(segment);

    /* Create a new DSA area with the control object in this segment. */
    area = create_internal(dsm_segment_address(segment),
                           init_segment_size,
                           tranche_id,
                           dsm_segment_handle(segment), segment,
                           init_segment_size, max_segment_size);

    /* Clean up when the control segment detaches. */
    on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
                  PointerGetDatum(dsm_segment_address(segment)));

    return area;
}
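
/*
 * Illustrative only: the common case uses the dsa_create() wrapper from
 * utils/dsa.h, which supplies default initial and maximum segment sizes:
 *
 *     dsa_area   *area = dsa_create(tranche_id);
 *     dsa_handle  handle = dsa_get_handle(area);
 *
 * where tranche_id is an LWLock tranche ID the caller has registered.  The
 * handle can be stored where other processes can find it, so that they can
 * dsa_attach() to the same area.
 */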

/*
 * Create a new shared area in an existing shared memory space, which may be
 * either DSM or Postmaster-initialized memory.  DSM segments will be
 * allocated as required to extend the available space, though that can be
 * prevented with dsa_set_size_limit(area, size) using the same size provided
 * to dsa_create_in_place.
 *
 * Areas created in-place must eventually be released by the backend that
 * created them and all backends that attach to them.  This can be done
 * explicitly with dsa_release_in_place, or, in the special case that 'place'
 * happens to be in a pre-existing DSM segment, by passing in a pointer to the
 * segment so that a detach hook can be registered with the containing DSM
 * segment.
 *
 * See dsa_create() for a note about the tranche arguments.
 */
dsa_area *
dsa_create_in_place_ext(void *place, size_t size,
                        int tranche_id, dsm_segment *segment,
                        size_t init_segment_size, size_t max_segment_size)
{
    dsa_area   *area;

    area = create_internal(place, size, tranche_id,
                           DSM_HANDLE_INVALID, NULL,
                           init_segment_size, max_segment_size);

    /*
     * Clean up when the control segment detaches, if a containing DSM segment
     * was provided.
     */
    if (segment != NULL)
        on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
                      PointerGetDatum(place));

    return area;
}

/*
 * Obtain a handle that can be passed to other processes so that they can
 * attach to the given area.  Cannot be called for areas created with
 * dsa_create_in_place.
 */
dsa_handle
dsa_get_handle(dsa_area *area)
{
    Assert(area->control->handle != DSA_HANDLE_INVALID);
    return area->control->handle;
}

/*
 * Attach to an area given a handle generated (possibly in another process) by
 * dsa_get_handle.  The area must have been created with dsa_create (not
 * dsa_create_in_place).
 */
dsa_area *
dsa_attach(dsa_handle handle)
{
    dsm_segment *segment;
    dsa_area   *area;

    /*
     * An area handle is really a DSM segment handle for the first segment, so
     * we go ahead and attach to that.
     */
    segment = dsm_attach(handle);
    if (segment == NULL)
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("could not attach to dynamic shared area")));

    area = attach_internal(dsm_segment_address(segment), segment, handle);

    /* Clean up when the control segment detaches. */
    on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
                  PointerGetDatum(dsm_segment_address(segment)));

    return area;
}
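
/*
 * Illustrative only: a typical attach sequence in a cooperating backend,
 * assuming 'handle' was published by the creating process:
 *
 *     dsa_area   *area = dsa_attach(handle);
 *     ...
 *     void       *p = dsa_get_address(area, dp);   // dp shared by creator
 *     ...
 *     dsa_detach(area);
 */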

/*
 * Returns whether the area with the given handle was already attached by the
 * current process.  The area must have been created with dsa_create (not
 * dsa_create_in_place).
 */
bool
dsa_is_attached(dsa_handle handle)
{
    /*
     * An area handle is really a DSM segment handle for the first segment, so
     * we can just search for that.
     */
    return dsm_find_mapping(handle) != NULL;
}

/*
 * Attach to an area that was created with dsa_create_in_place.  The caller
 * must somehow know the location in memory that was used when the area was
 * created, though it may be mapped at a different virtual address in this
 * process.
 *
 * See dsa_create_in_place for note about releasing in-place areas, and the
 * optional 'segment' argument which can be provided to allow automatic
 * release if the containing memory happens to be a DSM segment.
 */
dsa_area *
dsa_attach_in_place(void *place, dsm_segment *segment)
{
    dsa_area   *area;

    area = attach_internal(place, NULL, DSA_HANDLE_INVALID);

    /*
     * Clean up when the control segment detaches, if a containing DSM segment
     * was provided.
     */
    if (segment != NULL)
        on_dsm_detach(segment, &dsa_on_dsm_detach_release_in_place,
                      PointerGetDatum(place));

    return area;
}

/*
 * Release a DSA area that was produced by dsa_create_in_place or
 * dsa_attach_in_place.  The 'segment' argument is ignored but provides an
 * interface suitable for on_dsm_detach, for the convenience of users who want
 * to create a DSA segment inside an existing DSM segment and have it
 * automatically released when the containing DSM segment is detached.
 * 'place' should be the address of the place where the area was created.
 *
 * This callback is automatically registered for the DSM segment containing
 * the control object of in-place areas when a segment is provided to
 * dsa_create_in_place or dsa_attach_in_place, and also for all areas created
 * with dsa_create.
 */
void
dsa_on_dsm_detach_release_in_place(dsm_segment *segment, Datum place)
{
    dsa_release_in_place(DatumGetPointer(place));
}

/*
 * Release a DSA area that was produced by dsa_create_in_place or
 * dsa_attach_in_place.  The 'code' argument is ignored but provides an
 * interface suitable for on_shmem_exit or before_shmem_exit, for the
 * convenience of users who want to create a DSA segment inside shared memory
 * other than a DSM segment and have it automatically release at backend exit.
 * 'place' should be the address of the place where the area was created.
 */
void
dsa_on_shmem_exit_release_in_place(int code, Datum place)
{
    dsa_release_in_place(DatumGetPointer(place));
}

/*
 * Release a DSA area that was produced by dsa_create_in_place or
 * dsa_attach_in_place.  It is preferable to use one of the 'dsa_on_XXX'
 * callbacks so that this is managed automatically, because failure to release
 * an area created in-place leaks its segments permanently.
 *
 * This is also called automatically for areas produced by dsa_create or
 * dsa_attach as an implementation detail.
 */
void
dsa_release_in_place(void *place)
{
    dsa_area_control *control = (dsa_area_control *) place;
    int         i;

    LWLockAcquire(&control->lock, LW_EXCLUSIVE);
    Assert(control->segment_header.magic ==
           (DSA_SEGMENT_HEADER_MAGIC ^ control->handle ^ 0));
    Assert(control->refcnt > 0);
    if (--control->refcnt == 0)
    {
        for (i = 0; i <= control->high_segment_index; ++i)
        {
            dsm_handle  handle;

            handle = control->segment_handles[i];
            if (handle != DSM_HANDLE_INVALID)
                dsm_unpin_segment(handle);
        }
    }
    LWLockRelease(&control->lock);
}

/*
 * Keep a DSA area attached until end of session or explicit detach.
 *
 * By default, areas are owned by the current resource owner, which means they
 * are detached automatically when that scope ends.
 */
void
dsa_pin_mapping(dsa_area *area)
{
    int         i;

    if (area->resowner != NULL)
    {
        area->resowner = NULL;

        for (i = 0; i <= area->high_segment_index; ++i)
            if (area->segment_maps[i].segment != NULL)
                dsm_pin_mapping(area->segment_maps[i].segment);
    }
}
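
/*
 * Illustrative only: a backend that wants its mapping to survive the current
 * resource owner scope (e.g. the current transaction) typically attaches and
 * then pins the mapping immediately:
 *
 *     dsa_area   *area = dsa_attach(handle);
 *     dsa_pin_mapping(area);       // now the mapping has session lifespan
 */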

/*
 * Allocate memory in this storage area.  The return value is a dsa_pointer
 * that can be passed to other processes, and converted to a local pointer
 * with dsa_get_address.  'flags' is a bitmap which should be constructed
 * from the following values:
 *
 * DSA_ALLOC_HUGE allows allocations >= 1GB.  Otherwise, such allocations
 * will result in an ERROR.
 *
 * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when
 * no memory is available or a size limit established by dsa_set_size_limit
 * would be exceeded.  Otherwise, such allocations will result in an ERROR.
 *
 * DSA_ALLOC_ZERO causes the allocated memory to be zeroed.  Otherwise, the
 * contents of newly-allocated memory are indeterminate.
 *
 * These flags correspond to similarly named flags used by
 * MemoryContextAllocExtended().  See also the macros dsa_allocate and
 * dsa_allocate0 which expand to a call to this function with commonly used
 * flags.
 */
dsa_pointer
dsa_allocate_extended(dsa_area *area, size_t size, int flags)
{
    uint16      size_class;
    dsa_pointer start_pointer;
    dsa_segment_map *segment_map;
    dsa_pointer result;

    Assert(size > 0);

    /* Sanity check on huge individual allocation size. */
    if (((flags & DSA_ALLOC_HUGE) != 0 && !AllocHugeSizeIsValid(size)) ||
        ((flags & DSA_ALLOC_HUGE) == 0 && !AllocSizeIsValid(size)))
        elog(ERROR, "invalid DSA memory alloc request size %zu", size);

    /*
     * If bigger than the largest size class, just grab a run of pages from
     * the free page manager, instead of allocating an object from a pool.
     * There will still be a span, but it's a special class of span that
     * manages this whole allocation and simply gives all pages back to the
     * free page manager when dsa_free is called.
     */
    if (unlikely(size > dsa_size_classes[lengthof(dsa_size_classes) - 1]))
    {
        size_t      npages = fpm_size_to_pages(size);
        size_t      first_page;
        dsa_pointer span_pointer;
        dsa_area_pool *pool = &area->control->pools[DSA_SCLASS_SPAN_LARGE];

        /* Obtain a span object. */
        span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
        if (!DsaPointerIsValid(span_pointer))
        {
            /* Raise error unless asked not to. */
            if ((flags & DSA_ALLOC_NO_OOM) == 0)
                ereport(ERROR,
                        (errcode(ERRCODE_OUT_OF_MEMORY),
                         errmsg("out of memory"),
                         errdetail("Failed on DSA request of size %zu.",
                                   size)));
            return InvalidDsaPointer;
        }

        LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);

        /* Find a segment from which to allocate. */
        segment_map = get_best_segment(area, npages);
        if (segment_map == NULL)
            segment_map = make_new_segment(area, npages);
        if (segment_map == NULL)
        {
            /* Can't make any more segments: game over. */
            LWLockRelease(DSA_AREA_LOCK(area));
            dsa_free(area, span_pointer);

            /* Raise error unless asked not to. */
            if ((flags & DSA_ALLOC_NO_OOM) == 0)
                ereport(ERROR,
                        (errcode(ERRCODE_OUT_OF_MEMORY),
                         errmsg("out of memory"),
                         errdetail("Failed on DSA request of size %zu.",
                                   size)));
            return InvalidDsaPointer;
        }

        /*
         * Ask the free page manager for a run of pages.  This should always
         * succeed, since both get_best_segment and make_new_segment should
         * only return a non-NULL pointer if it actually contains enough
         * contiguous freespace.  If it does fail, something in our backend
         * private state is out of whack, so use FATAL to kill the process.
         */
        if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
            elog(FATAL,
                 "dsa_allocate could not find %zu free pages", npages);
        LWLockRelease(DSA_AREA_LOCK(area));

        start_pointer = DSA_MAKE_POINTER(get_segment_index(area, segment_map),
                                         first_page * FPM_PAGE_SIZE);

        /* Initialize span and pagemap. */
        LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
                      LW_EXCLUSIVE);
        init_span(area, span_pointer, pool, start_pointer, npages,
                  DSA_SCLASS_SPAN_LARGE);
        segment_map->pagemap[first_page] = span_pointer;
        LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));

        /* Zero-initialize the memory if requested. */
        if ((flags & DSA_ALLOC_ZERO) != 0)
            memset(dsa_get_address(area, start_pointer), 0, size);

        return start_pointer;
    }

    /* Map allocation to a size class. */
    if (size < lengthof(dsa_size_class_map) * DSA_SIZE_CLASS_MAP_QUANTUM)
    {
        int         mapidx;

        /* For smaller sizes we have a lookup table... */
        mapidx = ((size + DSA_SIZE_CLASS_MAP_QUANTUM - 1) /
                  DSA_SIZE_CLASS_MAP_QUANTUM) - 1;
        size_class = dsa_size_class_map[mapidx];
    }
    else
    {
        uint16      min;
        uint16      max;

        /* ... and for the rest we search by binary chop. */
        min = dsa_size_class_map[lengthof(dsa_size_class_map) - 1];
        max = lengthof(dsa_size_classes) - 1;

        while (min < max)
        {
            uint16      mid = (min + max) / 2;
            uint16      class_size = dsa_size_classes[mid];

            if (class_size < size)
                min = mid + 1;
            else
                max = mid;
        }

        size_class = min;
    }
    Assert(size <= dsa_size_classes[size_class]);
    Assert(size_class == 0 || size > dsa_size_classes[size_class - 1]);

    /* Attempt to allocate an object from the appropriate pool. */
    result = alloc_object(area, size_class);

    /* Check for failure to allocate. */
    if (!DsaPointerIsValid(result))
    {
        /* Raise error unless asked not to. */
        if ((flags & DSA_ALLOC_NO_OOM) == 0)
            ereport(ERROR,
                    (errcode(ERRCODE_OUT_OF_MEMORY),
                     errmsg("out of memory"),
                     errdetail("Failed on DSA request of size %zu.", size)));
        return InvalidDsaPointer;
    }

    /* Zero-initialize the memory if requested. */
    if ((flags & DSA_ALLOC_ZERO) != 0)
        memset(dsa_get_address(area, result), 0, size);

    return result;
}
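
/*
 * Illustrative only: a caller that prefers a soft failure to an ERROR, and
 * wants zeroed memory, would write something like:
 *
 *     dsa_pointer dp = dsa_allocate_extended(area, request_size,
 *                                            DSA_ALLOC_NO_OOM | DSA_ALLOC_ZERO);
 *
 *     if (!DsaPointerIsValid(dp))
 *         ...                  // handle allocation failure gracefully
 */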

/*
 * Free memory obtained with dsa_allocate.
 */
void
dsa_free(dsa_area *area, dsa_pointer dp)
{
    dsa_segment_map *segment_map;
    int         pageno;
    dsa_pointer span_pointer;
    dsa_area_span *span;
    char       *superblock;
    char       *object;
    size_t      size;
    int         size_class;

    /* Make sure we don't have a stale segment in the slot 'dp' refers to. */
    check_for_freed_segments(area);

    /* Locate the object, span and pool. */
    segment_map = get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(dp));
    pageno = DSA_EXTRACT_OFFSET(dp) / FPM_PAGE_SIZE;
    span_pointer = segment_map->pagemap[pageno];
    span = dsa_get_address(area, span_pointer);
    superblock = dsa_get_address(area, span->start);
    object = dsa_get_address(area, dp);
    size_class = span->size_class;
    size = dsa_size_classes[size_class];

    /*
     * Special case for large objects that live in a special span: we return
     * those pages directly to the free page manager and free the span.
     */
    if (span->size_class == DSA_SCLASS_SPAN_LARGE)
    {

#ifdef CLOBBER_FREED_MEMORY
        memset(object, 0x7f, span->npages * FPM_PAGE_SIZE);
#endif

        /* Give pages back to free page manager. */
        LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
        FreePageManagerPut(segment_map->fpm,
                           DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
                           span->npages);

        /* Move segment to appropriate bin if necessary. */
        rebin_segment(area, segment_map);
        LWLockRelease(DSA_AREA_LOCK(area));

        /* Unlink span. */
        LWLockAcquire(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE),
                      LW_EXCLUSIVE);
        unlink_span(area, span);
        LWLockRelease(DSA_SCLASS_LOCK(area, DSA_SCLASS_SPAN_LARGE));

        /* Free the span object so it can be reused. */
        dsa_free(area, span_pointer);
        return;
    }

#ifdef CLOBBER_FREED_MEMORY
    memset(object, 0x7f, size);
#endif

    LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);

    /* Put the object on the span's freelist. */
    Assert(object >= superblock);
    Assert(object < superblock + DSA_SUPERBLOCK_SIZE);
    Assert((object - superblock) % size == 0);
    NextFreeObjectIndex(object) = span->firstfree;
    span->firstfree = (object - superblock) / size;
    ++span->nallocatable;

    /*
     * See if the span needs to be moved to a different fullness class, or be
     * freed so its pages can be given back to the segment.
     */
    if (span->nallocatable == 1 && span->fclass == DSA_FULLNESS_CLASSES - 1)
    {
        /*
         * The block was completely full and is located in the
         * highest-numbered fullness class, which is never scanned for free
         * chunks.  We must move it to the next-lower fullness class.
         */
        unlink_span(area, span);
        add_span_to_fullness_class(area, span, span_pointer,
                                   DSA_FULLNESS_CLASSES - 2);

        /*
         * If this is the only span, and there is no active span, then we
         * should probably move this span to fullness class 1.  (Otherwise if
         * you allocate exactly all the objects in the only span, it moves to
         * class 3, then you free them all, it moves to 2, and then is given
         * back, leaving no active span).
         */
    }
    else if (span->nallocatable == span->nmax &&
             (span->fclass != 1 || span->prevspan != InvalidDsaPointer))
    {
        /*
         * This entire block is free, and it's not the active block for this
         * size class.  Return the memory to the free page manager. We don't
         * do this for the active block to prevent hysteresis: if we
         * repeatedly allocate and free the only chunk in the active block, it
         * will be very inefficient if we deallocate and reallocate the block
         * every time.
         */
        destroy_superblock(area, span_pointer);
    }

    LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
}

/*
 * Obtain a backend-local address for a dsa_pointer.  'dp' must point to
 * memory allocated by the given area (possibly in another process) that
 * hasn't yet been freed.  This may cause a segment to be mapped into the
 * current process if required, and may cause freed segments to be unmapped.
 */
void *
dsa_get_address(dsa_area *area, dsa_pointer dp)
{
    dsa_segment_index index;
    size_t      offset;

    /* Convert InvalidDsaPointer to NULL. */
    if (!DsaPointerIsValid(dp))
        return NULL;

    /* Process any requests to detach from freed segments. */
    check_for_freed_segments(area);

    /* Break the dsa_pointer into its components. */
    index = DSA_EXTRACT_SEGMENT_NUMBER(dp);
    offset = DSA_EXTRACT_OFFSET(dp);
    Assert(index < DSA_MAX_SEGMENTS);

    /* Check if we need to cause this segment to be mapped in. */
    if (unlikely(area->segment_maps[index].mapped_address == NULL))
    {
        /* Call for effect (we don't need the result). */
        get_segment_by_index(area, index);
    }

    return area->segment_maps[index].mapped_address + offset;
}
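
/*
 * Illustrative only: because a dsa_pointer is stable across processes while
 * local addresses are not, shared data structures store dsa_pointers and
 * translate on use.  A hypothetical shared linked list might look like:
 *
 *     typedef struct shared_node
 *     {
 *         dsa_pointer next;    // always a dsa_pointer, never a raw pointer
 *         int         value;
 *     } shared_node;
 *
 *     shared_node *node = (shared_node *) dsa_get_address(area, head);
 *     while (DsaPointerIsValid(node->next))
 *         node = (shared_node *) dsa_get_address(area, node->next);
 */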

/*
 * Pin this area, so that it will continue to exist even if all backends
 * detach from it.  In that case, the area can still be reattached to if a
 * handle has been recorded somewhere.
 */
void
dsa_pin(dsa_area *area)
{
    LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
    if (area->control->pinned)
    {
        LWLockRelease(DSA_AREA_LOCK(area));
        elog(ERROR, "dsa_area already pinned");
    }
    area->control->pinned = true;
    ++area->control->refcnt;
    LWLockRelease(DSA_AREA_LOCK(area));
}
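
/*
 * Illustrative only: a creator that wants the area to outlive all currently
 * attached backends (until a later dsa_unpin) typically does:
 *
 *     dsa_area   *area = dsa_create(tranche_id);
 *     dsa_pin(area);
 *     handle = dsa_get_handle(area);   // publish for later reattachment
 */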

/*
 * Undo the effects of dsa_pin, so that the given area can be freed when no
 * backends are attached to it.  May be called only if dsa_pin has been
 * called.
 */
void
dsa_unpin(dsa_area *area)
{
    LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
    Assert(area->control->refcnt > 1);
    if (!area->control->pinned)
    {
        LWLockRelease(DSA_AREA_LOCK(area));
        elog(ERROR, "dsa_area not pinned");
    }
    area->control->pinned = false;
    --area->control->refcnt;
    LWLockRelease(DSA_AREA_LOCK(area));
}

/*
 * Set the total size limit for this area.  This limit is checked whenever new
 * segments need to be allocated from the operating system.  If the new size
 * limit is already exceeded, this has no immediate effect.
 *
 * Note that the total virtual memory usage may be temporarily larger than
 * this limit when segments have been freed, but not yet detached by all
 * backends that have attached to them.
 */
void
dsa_set_size_limit(dsa_area *area, size_t limit)
{
    LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
    area->control->max_total_segment_size = limit;
    LWLockRelease(DSA_AREA_LOCK(area));
}

/* Return the total size of all active segments */
size_t
dsa_get_total_size(dsa_area *area)
{
    size_t      size;

    LWLockAcquire(DSA_AREA_LOCK(area), LW_SHARED);
    size = area->control->total_segment_size;
    LWLockRelease(DSA_AREA_LOCK(area));

    return size;
}

/*
 * Same as dsa_get_total_size(), but accepts a DSA handle.  The area must have
 * been created with dsa_create (not dsa_create_in_place).
 */
size_t
dsa_get_total_size_from_handle(dsa_handle handle)
{
    size_t      size;
    bool        already_attached;
    dsm_segment *segment;
    dsa_area_control *control;

    already_attached = dsa_is_attached(handle);
    if (already_attached)
        segment = dsm_find_mapping(handle);
    else
        segment = dsm_attach(handle);

    if (segment == NULL)
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("could not attach to dynamic shared area")));

    control = (dsa_area_control *) dsm_segment_address(segment);

    LWLockAcquire(&control->lock, LW_SHARED);
    size = control->total_segment_size;
    LWLockRelease(&control->lock);

    if (!already_attached)
        dsm_detach(segment);

    return size;
}

/*
 * Aggressively free all spare memory in the hope of returning DSM segments to
 * the operating system.
 */
void
dsa_trim(dsa_area *area)
{
    int         size_class;

    /*
     * Trim in reverse pool order so we get to the spans-of-spans last, just
     * in case any become entirely free while processing all the other pools.
     */
    for (size_class = DSA_NUM_SIZE_CLASSES - 1; size_class >= 0; --size_class)
    {
        dsa_area_pool *pool = &area->control->pools[size_class];
        dsa_pointer span_pointer;

        if (size_class == DSA_SCLASS_SPAN_LARGE)
        {
            /* Large object frees give back segments aggressively already. */
            continue;
        }

        /*
         * Search fullness class 1 only.  That is where we expect to find an
         * entirely empty superblock (entirely empty superblocks in other
         * fullness classes are returned to the free page map by dsa_free).
         */
        LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
        span_pointer = pool->spans[1];
        while (DsaPointerIsValid(span_pointer))
        {
            dsa_area_span *span = dsa_get_address(area, span_pointer);
            dsa_pointer next = span->nextspan;

            if (span->nallocatable == span->nmax)
                destroy_superblock(area, span_pointer);

            span_pointer = next;
        }
        LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
    }
}

/*
 * Print out debugging information about the internal state of the shared
 * memory area.
 */
void
dsa_dump(dsa_area *area)
{
    size_t      i,
                j;

    /*
     * Note: This gives an inconsistent snapshot as it acquires and releases
     * individual locks as it goes...
     */

    LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
    check_for_freed_segments_locked(area);
    fprintf(stderr, "dsa_area handle %x:\n", area->control->handle);
    fprintf(stderr, "  max_total_segment_size: %zu\n",
            area->control->max_total_segment_size);
    fprintf(stderr, "  total_segment_size: %zu\n",
            area->control->total_segment_size);
    fprintf(stderr, "  refcnt: %d\n", area->control->refcnt);
    fprintf(stderr, "  pinned: %c\n", area->control->pinned ? 't' : 'f');
    fprintf(stderr, "  segment bins:\n");
    for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
    {
        if (area->control->segment_bins[i] != DSA_SEGMENT_INDEX_NONE)
        {
            dsa_segment_index segment_index;

            if (i == 0)
                fprintf(stderr,
                        "    segment bin %zu (no contiguous free pages):\n", i);
            else
                fprintf(stderr,
                        "    segment bin %zu (at least %d contiguous pages free):\n",
                        i, 1 << (i - 1));
            segment_index = area->control->segment_bins[i];
            while (segment_index != DSA_SEGMENT_INDEX_NONE)
            {
                dsa_segment_map *segment_map;

                segment_map =
                    get_segment_by_index(area, segment_index);

                fprintf(stderr,
                        "      segment index %zu, usable_pages = %zu, "
                        "contiguous_pages = %zu, mapped at %p\n",
                        segment_index,
                        segment_map->header->usable_pages,
                        fpm_largest(segment_map->fpm),
                        segment_map->mapped_address);
                segment_index = segment_map->header->next;
            }
        }
    }
    LWLockRelease(DSA_AREA_LOCK(area));

    fprintf(stderr, "  pools:\n");
    for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
    {
        bool        found = false;

        LWLockAcquire(DSA_SCLASS_LOCK(area, i), LW_EXCLUSIVE);
        for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
            if (DsaPointerIsValid(area->control->pools[i].spans[j]))
                found = true;
        if (found)
        {
            if (i == DSA_SCLASS_BLOCK_OF_SPANS)
                fprintf(stderr, "    pool for blocks of span objects:\n");
            else if (i == DSA_SCLASS_SPAN_LARGE)
                fprintf(stderr, "    pool for large object spans:\n");
            else
                fprintf(stderr,
                        "    pool for size class %zu (object size %hu bytes):\n",
                        i, dsa_size_classes[i]);
            for (j = 0; j < DSA_FULLNESS_CLASSES; ++j)
            {
                if (!DsaPointerIsValid(area->control->pools[i].spans[j]))
                    fprintf(stderr, "      fullness class %zu is empty\n", j);
                else
                {
                    dsa_pointer span_pointer = area->control->pools[i].spans[j];

                    fprintf(stderr, "      fullness class %zu:\n", j);
                    while (DsaPointerIsValid(span_pointer))
                    {
                        dsa_area_span *span;

                        span = dsa_get_address(area, span_pointer);
                        fprintf(stderr,
                                "        span descriptor at "
                                DSA_POINTER_FORMAT ", superblock at "
                                DSA_POINTER_FORMAT
                                ", pages = %zu, objects free = %hu/%hu\n",
                                span_pointer, span->start, span->npages,
                                span->nallocatable, span->nmax);
                        span_pointer = span->nextspan;
                    }
                }
            }
        }
        LWLockRelease(DSA_SCLASS_LOCK(area, i));
    }
}

/*
 * Return the smallest size that you can successfully provide to
 * dsa_create_in_place.
 */
size_t
dsa_minimum_size(void)
{
    size_t      size;
    int         pages = 0;

    size = MAXALIGN(sizeof(dsa_area_control)) +
        MAXALIGN(sizeof(FreePageManager));

    /* Figure out how many pages we need, including the page map... */
    while (((size + FPM_PAGE_SIZE - 1) / FPM_PAGE_SIZE) > pages)
    {
        ++pages;
        size += sizeof(dsa_pointer);
    }

    return pages * FPM_PAGE_SIZE;
}

/*
 * Workhorse function for dsa_create and dsa_create_in_place.
 */
static dsa_area *
create_internal(void *place, size_t size,
                int tranche_id,
                dsm_handle control_handle,
                dsm_segment *control_segment,
                size_t init_segment_size, size_t max_segment_size)
{
    dsa_area_control *control;
    dsa_area   *area;
    dsa_segment_map *segment_map;
    size_t      usable_pages;
    size_t      total_pages;
    size_t      metadata_bytes;
    int         i;

    /* Check the initial and maximum block sizes */
    Assert(init_segment_size >= DSA_MIN_SEGMENT_SIZE);
    Assert(max_segment_size >= init_segment_size);
    Assert(max_segment_size <= DSA_MAX_SEGMENT_SIZE);

    /* Sanity check on the space we have to work in. */
    if (size < dsa_minimum_size())
        elog(ERROR, "dsa_area space must be at least %zu, but %zu provided",
             dsa_minimum_size(), size);

    /* Now figure out how much space is usable */
    total_pages = size / FPM_PAGE_SIZE;
    metadata_bytes =
        MAXALIGN(sizeof(dsa_area_control)) +
        MAXALIGN(sizeof(FreePageManager)) +
        total_pages * sizeof(dsa_pointer);
    /* Add padding up to next page boundary. */
    if (metadata_bytes % FPM_PAGE_SIZE != 0)
        metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
    Assert(metadata_bytes <= size);
    usable_pages = (size - metadata_bytes) / FPM_PAGE_SIZE;

    /*
     * Initialize the dsa_area_control object located at the start of the
     * space.
     */
    control = (dsa_area_control *) place;
    memset(place, 0, sizeof(*control));
    control->segment_header.magic =
        DSA_SEGMENT_HEADER_MAGIC ^ control_handle ^ 0;
    control->segment_header.next = DSA_SEGMENT_INDEX_NONE;
    control->segment_header.prev = DSA_SEGMENT_INDEX_NONE;
    control->segment_header.usable_pages = usable_pages;
    control->segment_header.freed = false;
    control->segment_header.size = size;
    control->handle = control_handle;
    control->init_segment_size = init_segment_size;
    control->max_segment_size = max_segment_size;
    control->max_total_segment_size = (size_t) -1;
    control->total_segment_size = size;
    control->segment_handles[0] = control_handle;
    for (i = 0; i < DSA_NUM_SEGMENT_BINS; ++i)
        control->segment_bins[i] = DSA_SEGMENT_INDEX_NONE;
    control->refcnt = 1;
    control->lwlock_tranche_id = tranche_id;

    /*
     * Create the dsa_area object that this backend will use to access the
     * area.  Other backends will need to obtain their own dsa_area object by
     * attaching.
     */
    area = palloc(sizeof(dsa_area));
    area->control = control;
    area->resowner = CurrentResourceOwner;
    memset(area->segment_maps, 0, sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
    area->high_segment_index = 0;
    area->freed_segment_counter = 0;
    LWLockInitialize(&control->lock, control->lwlock_tranche_id);
    for (i = 0; i < DSA_NUM_SIZE_CLASSES; ++i)
        LWLockInitialize(DSA_SCLASS_LOCK(area, i),
                         control->lwlock_tranche_id);

    /* Set up the segment map for this process's mapping. */
    segment_map = &area->segment_maps[0];
    segment_map->segment = control_segment;
    segment_map->mapped_address = place;
    segment_map->header = (dsa_segment_header *) place;
    segment_map->fpm = (FreePageManager *)
        (segment_map->mapped_address +
         MAXALIGN(sizeof(dsa_area_control)));
    segment_map->pagemap = (dsa_pointer *)
        (segment_map->mapped_address +
         MAXALIGN(sizeof(dsa_area_control)) +
         MAXALIGN(sizeof(FreePageManager)));

    /* Set up the free page map. */
    FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
    /* There can be 0 usable pages if size is dsa_minimum_size(). */

    if (usable_pages > 0)
        FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
                           usable_pages);

    /* Put this segment into the appropriate bin. */
    control->segment_bins[contiguous_pages_to_segment_bin(usable_pages)] = 0;
    segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);

    return area;
}

/*
 * Workhorse function for dsa_attach and dsa_attach_in_place.
 */
static dsa_area *
attach_internal(void *place, dsm_segment *segment, dsa_handle handle)
{
    dsa_area_control *control;
    dsa_area   *area;
    dsa_segment_map *segment_map;

    control = (dsa_area_control *) place;
    Assert(control->handle == handle);
    Assert(control->segment_handles[0] == handle);
    Assert(control->segment_header.magic ==
           (DSA_SEGMENT_HEADER_MAGIC ^ handle ^ 0));

    /* Build the backend-local area object. */
    area = palloc(sizeof(dsa_area));
    area->control = control;
    area->resowner = CurrentResourceOwner;
    memset(&area->segment_maps[0], 0,
           sizeof(dsa_segment_map) * DSA_MAX_SEGMENTS);
    area->high_segment_index = 0;

    /* Set up the segment map for this process's mapping. */
    segment_map = &area->segment_maps[0];
    segment_map->segment = segment;     /* NULL for in-place */
    segment_map->mapped_address = place;
    segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
    segment_map->fpm = (FreePageManager *)
        (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)));
    segment_map->pagemap = (dsa_pointer *)
        (segment_map->mapped_address + MAXALIGN(sizeof(dsa_area_control)) +
         MAXALIGN(sizeof(FreePageManager)));

    /* Bump the reference count. */
    LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
    if (control->refcnt == 0)
    {
        /* We can't attach to a DSA area that has already been destroyed. */
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("could not attach to dynamic shared area")));
    }
    ++control->refcnt;
    area->freed_segment_counter = area->control->freed_segment_counter;
    LWLockRelease(DSA_AREA_LOCK(area));

    return area;
}

/*
 * Add a new span to fullness class 1 of the indicated pool.
 */
static void
init_span(dsa_area *area,
          dsa_pointer span_pointer,
          dsa_area_pool *pool, dsa_pointer start, size_t npages,
          uint16 size_class)
{
    dsa_area_span *span = dsa_get_address(area, span_pointer);
    size_t      obsize = dsa_size_classes[size_class];

    /*
     * The per-pool lock must be held because we manipulate the span list for
     * this pool.
     */
    Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));

    /* Push this span onto the front of the span list for fullness class 1. */
    if (DsaPointerIsValid(pool->spans[1]))
    {
        dsa_area_span *head = (dsa_area_span *)
            dsa_get_address(area, pool->spans[1]);

        head->prevspan = span_pointer;
    }
    span->pool = DsaAreaPoolToDsaPointer(area, pool);
    span->nextspan = pool->spans[1];
    span->prevspan = InvalidDsaPointer;
    pool->spans[1] = span_pointer;

    span->start = start;
    span->npages = npages;
    span->size_class = size_class;
    span->ninitialized = 0;
    if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
    {
        /*
         * A block-of-spans contains its own descriptor, so mark one object as
         * initialized and reduce the count of allocatable objects by one.
         * Doing this here has the side effect of also reducing nmax by one,
         * which is important to make sure we free this object at the correct
         * time.
         */
        span->ninitialized = 1;
        span->nallocatable = FPM_PAGE_SIZE / obsize - 1;
    }
    else if (size_class != DSA_SCLASS_SPAN_LARGE)
        span->nallocatable = DSA_SUPERBLOCK_SIZE / obsize;
    span->firstfree = DSA_SPAN_NOTHING_FREE;
    span->nmax = span->nallocatable;
    span->fclass = 1;
}

/*
 * Transfer the first span in one fullness class to the head of another
 * fullness class.
 */
static bool
transfer_first_span(dsa_area *area,
                    dsa_area_pool *pool, int fromclass, int toclass)
{
    dsa_pointer span_pointer;
    dsa_area_span *span;
    dsa_area_span *nextspan;

    /* Can't do it if source list is empty. */
    span_pointer = pool->spans[fromclass];
    if (!DsaPointerIsValid(span_pointer))
        return false;

    /* Remove span from head of source list. */
    span = dsa_get_address(area, span_pointer);
    pool->spans[fromclass] = span->nextspan;
    if (DsaPointerIsValid(span->nextspan))
    {
        nextspan = (dsa_area_span *)
            dsa_get_address(area, span->nextspan);
        nextspan->prevspan = InvalidDsaPointer;
    }

    /* Add span to head of target list. */
    span->nextspan = pool->spans[toclass];
    pool->spans[toclass] = span_pointer;
    if (DsaPointerIsValid(span->nextspan))
    {
        nextspan = (dsa_area_span *)
            dsa_get_address(area, span->nextspan);
        nextspan->prevspan = span_pointer;
    }
    span->fclass = toclass;

    return true;
}

/*
 * Allocate one object of the requested size class from the given area.
 */
static inline dsa_pointer
alloc_object(dsa_area *area, int size_class)
{
    dsa_area_pool *pool = &area->control->pools[size_class];
    dsa_area_span *span;
    dsa_pointer block;
    dsa_pointer result;
    char       *object;
    size_t      size;

    /*
     * Even though ensure_active_superblock can in turn call alloc_object if
     * it needs to allocate a new span, that's always from a different pool,
     * and the order of lock acquisition is always the same, so it's OK that
     * we hold this lock for the duration of this function.
     */
    Assert(!LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
    LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);

    /*
     * If there's no active superblock, we must successfully obtain one or
     * fail the request.
     */
    if (!DsaPointerIsValid(pool->spans[1]) &&
        !ensure_active_superblock(area, pool, size_class))
    {
        result = InvalidDsaPointer;
    }
    else
    {
        /*
         * There should be a block in fullness class 1 at this point, and it
         * should never be completely full.  Thus we can either pop an object
         * from the free list or, failing that, initialize a new object.
         */
        Assert(DsaPointerIsValid(pool->spans[1]));
        span = (dsa_area_span *)
            dsa_get_address(area, pool->spans[1]);
        Assert(span->nallocatable > 0);
        block = span->start;
        Assert(size_class < DSA_NUM_SIZE_CLASSES);
        size = dsa_size_classes[size_class];
        if (span->firstfree != DSA_SPAN_NOTHING_FREE)
        {
            result = block + span->firstfree * size;
            object = dsa_get_address(area, result);
            span->firstfree = NextFreeObjectIndex(object);
        }
        else
        {
            result = block + span->ninitialized * size;
            ++span->ninitialized;
        }
        --span->nallocatable;

        /* If it's now full, move it to the highest-numbered fullness class. */
        if (span->nallocatable == 0)
            transfer_first_span(area, pool, 1, DSA_FULLNESS_CLASSES - 1);
    }

    Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
    LWLockRelease(DSA_SCLASS_LOCK(area, size_class));

    return result;
}
1586
1587/*
1588 * Ensure an active (i.e. fullness class 1) superblock, unless all existing
1589 * superblocks are completely full and no more can be allocated.
1590 *
1591 * Fullness classes K of 0..N are loosely intended to represent blocks whose
1592 * utilization percentage is at least K/N, but we only enforce this rigorously
1593 * for the highest-numbered fullness class, which always contains exactly
1594 * those blocks that are completely full. It's otherwise acceptable for a
1595 * block to be in a higher-numbered fullness class than the one to which it
1596 * logically belongs. In addition, the active block, which is always the
1597 * first block in fullness class 1, is permitted to have a higher allocation
1598 * percentage than would normally be allowable for that fullness class; we
1599 * don't move it until it's completely full, and then it goes to the
1600 * highest-numbered fullness class.
1601 *
1602 * It might seem odd that the active block is the head of fullness class 1
1603 * rather than fullness class 0, but experience with other allocators has
1604 * shown that it's usually better to allocate from a block that's moderately
1605 * full rather than one that's nearly empty. Insofar as is reasonably
1606 * possible, we want to avoid performing new allocations in a block that would
1607 * otherwise become empty soon.
1608 */
1609static bool
1611 int size_class)
1612{
1613 dsa_pointer span_pointer;
1614 dsa_pointer start_pointer;
1615 size_t obsize = dsa_size_classes[size_class];
1616 size_t nmax;
1617 int fclass;
1618 size_t npages = 1;
1619 size_t first_page;
1620 size_t i;
1621 dsa_segment_map *segment_map;
1622
1623 Assert(LWLockHeldByMe(DSA_SCLASS_LOCK(area, size_class)));
1624
1625 /*
1626 * Compute the number of objects that will fit in a block of this size
1627 * class. Span-of-spans blocks are just a single page, and the first
1628 * object isn't available for use because it describes the block-of-spans
1629 * itself.
1630 */
1631 if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1632 nmax = FPM_PAGE_SIZE / obsize - 1;
1633 else
1634 nmax = DSA_SUPERBLOCK_SIZE / obsize;
1635
1636 /*
1637 * If fullness class 1 is empty, try to find a span to put in it by
1638 * scanning higher-numbered fullness classes (excluding the last one,
1639 * whose blocks are certain to all be completely full).
1640 */
1641 for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1642 {
1643 span_pointer = pool->spans[fclass];
1644
1645 while (DsaPointerIsValid(span_pointer))
1646 {
1647 int tfclass;
1648 dsa_area_span *span;
1649 dsa_area_span *nextspan;
1650 dsa_area_span *prevspan;
1651 dsa_pointer next_span_pointer;
1652
1653 span = (dsa_area_span *)
1654 dsa_get_address(area, span_pointer);
1655 next_span_pointer = span->nextspan;
1656
1657 /* Figure out what fullness class should contain this span. */
1658 tfclass = (nmax - span->nallocatable)
1659 * (DSA_FULLNESS_CLASSES - 1) / nmax;
1660
1661 /* Look up next span. */
1662 if (DsaPointerIsValid(span->nextspan))
1663 nextspan = (dsa_area_span *)
1664 dsa_get_address(area, span->nextspan);
1665 else
1666 nextspan = NULL;
1667
1668 /*
1669 * If utilization has dropped enough that this now belongs in some
1670 * other fullness class, move it there.
1671 */
1672 if (tfclass < fclass)
1673 {
1674 /* Remove from the current fullness class list. */
1675 if (pool->spans[fclass] == span_pointer)
1676 {
1677 /* It was the head; remove it. */
1679 pool->spans[fclass] = span->nextspan;
1680 if (nextspan != NULL)
1681 nextspan->prevspan = InvalidDsaPointer;
1682 }
1683 else
1684 {
1685 /* It was not the head. */
1687 prevspan = (dsa_area_span *)
1688 dsa_get_address(area, span->prevspan);
1689 prevspan->nextspan = span->nextspan;
1690 }
1691 if (nextspan != NULL)
1692 nextspan->prevspan = span->prevspan;
1693
1694 /* Push onto the head of the new fullness class list. */
1695 span->nextspan = pool->spans[tfclass];
1696 pool->spans[tfclass] = span_pointer;
1698 if (DsaPointerIsValid(span->nextspan))
1699 {
1700 nextspan = (dsa_area_span *)
1701 dsa_get_address(area, span->nextspan);
1702 nextspan->prevspan = span_pointer;
1703 }
1704 span->fclass = tfclass;
1705 }
1706
1707 /* Advance to next span on list. */
1708 span_pointer = next_span_pointer;
1709 }
1710
1711 /* Stop now if we found a suitable block. */
1712 if (DsaPointerIsValid(pool->spans[1]))
1713 return true;
1714 }
1715
1716 /*
1717 * If there are no blocks that properly belong in fullness class 1, pick
1718 * one from some other fullness class and move it there anyway, so that we
1719 * have an allocation target. Our last choice is to transfer a block
1720 * that's almost empty (and might become completely empty soon if left
1721 * alone), but even that is better than failing, which is what we must do
1722 * if there are no blocks at all with freespace.
1723 */
1724 Assert(!DsaPointerIsValid(pool->spans[1]));
1725 for (fclass = 2; fclass < DSA_FULLNESS_CLASSES - 1; ++fclass)
1726 if (transfer_first_span(area, pool, fclass, 1))
1727 return true;
1728 if (!DsaPointerIsValid(pool->spans[1]) &&
1729 transfer_first_span(area, pool, 0, 1))
1730 return true;
1731
1732 /*
1733 * We failed to find an existing span with free objects, so we need to
1734 * allocate a new superblock and construct a new span to manage it.
1735 *
1736 * First, get a dsa_area_span object to describe the new superblock
1737 * ... unless this allocation is for a dsa_area_span object, in which case
1738 * that's surely not going to work. We handle that case by storing the
1739 * span describing a block-of-spans inline.
1740 */
1741 if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1742 {
1743 span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS);
1744 if (!DsaPointerIsValid(span_pointer))
1745 return false;
1746 npages = DSA_PAGES_PER_SUPERBLOCK;
1747 }
1748
1749 /* Find or create a segment and allocate the superblock. */
1750 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1751 segment_map = get_best_segment(area, npages);
1752 if (segment_map == NULL)
1753 {
1754 segment_map = make_new_segment(area, npages);
1755 if (segment_map == NULL)
1756 {
1757 LWLockRelease(DSA_AREA_LOCK(area));
1758 return false;
1759 }
1760 }
1761
1762 /*
1763 * This shouldn't happen: get_best_segment() or make_new_segment()
1764 * promised that we can successfully allocate npages.
1765 */
1766 if (!FreePageManagerGet(segment_map->fpm, npages, &first_page))
1767 elog(FATAL,
1768 "dsa_allocate could not find %zu free pages for superblock",
1769 npages);
1770 LWLockRelease(DSA_AREA_LOCK(area));
1771
1772 /* Compute the start of the superblock. */
1773 start_pointer =
1774 DSA_MAKE_POINTER(get_segment_index(area, segment_map),
1775 first_page * FPM_PAGE_SIZE);
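/*
 * Editorial sketch: a dsa_pointer is a 64-bit value that packs the
 * segment number into its high bits and the byte offset into its low
 * bits.  E.g. for segment index 3 and first_page = 10 the encoded offset
 * is 10 * 4096 = 40960 bytes; DSA_EXTRACT_SEGMENT_NUMBER and
 * DSA_EXTRACT_OFFSET recover the two halves when the pointer is later
 * resolved by dsa_get_address() or dsa_free().
 */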
1776
1777 /*
1778 * If this is a block-of-spans, carve the descriptor right out of the
1779 * allocated space.
1780 */
1781 if (size_class == DSA_SCLASS_BLOCK_OF_SPANS)
1782 {
1783 /*
1784 * We have a pointer into the segment. We need to build a dsa_pointer
1785 * from the segment index and offset into the segment.
1786 */
1787 span_pointer = start_pointer;
1788 }
1789
1790 /* Initialize span and pagemap. */
1791 init_span(area, span_pointer, pool, start_pointer, npages, size_class);
1792 for (i = 0; i < npages; ++i)
1793 segment_map->pagemap[first_page + i] = span_pointer;
1794
1795 return true;
1796}
1797
1798/*
1799 * Return the segment map corresponding to a given segment index, mapping the
1800 * segment in if necessary. For internal segment book-keeping, this is called
1801 * with the area lock held. It is also called by dsa_free and dsa_get_address
1802 * without any locking, relying on the fact they have a known live segment
1803 * index and they always call check_for_freed_segments to ensure that any
1804 * freed segment occupying the same slot is detached first.
1805 */
1806static dsa_segment_map *
1807get_segment_by_index(dsa_area *area, dsa_segment_index index)
1808{
1809 if (unlikely(area->segment_maps[index].mapped_address == NULL))
1810 {
1811 dsm_handle handle;
1812 dsm_segment *segment;
1813 dsa_segment_map *segment_map;
1814 ResourceOwner oldowner;
1815
1816 /*
1817 * If we are reached by dsa_free or dsa_get_address, there must be at
1818 * least one object allocated in the referenced segment. Otherwise,
1819 * their caller has a double-free or access-after-free bug, which we
1820 * have no hope of detecting. So we know it's safe to access this
1821 * array slot without holding a lock; it won't change underneath us.
1822 * Furthermore, we know that we can see the latest contents of the
1823 * slot, as explained in check_for_freed_segments, which those
1824 * functions call before arriving here.
1825 */
1826 handle = area->control->segment_handles[index];
1827
1828 /* It's an error to try to access an unused slot. */
1829 if (handle == DSM_HANDLE_INVALID)
1830 elog(ERROR,
1831 "dsa_area could not attach to a segment that has been freed");
1832
1833 oldowner = CurrentResourceOwner;
1834 CurrentResourceOwner = area->resowner;
1835 segment = dsm_attach(handle);
1836 CurrentResourceOwner = oldowner;
1837 if (segment == NULL)
1838 elog(ERROR, "dsa_area could not attach to segment");
1839 segment_map = &area->segment_maps[index];
1840 segment_map->segment = segment;
1841 segment_map->mapped_address = dsm_segment_address(segment);
1842 segment_map->header =
1843 (dsa_segment_header *) segment_map->mapped_address;
1844 segment_map->fpm = (FreePageManager *)
1845 (segment_map->mapped_address +
1846 MAXALIGN(sizeof(dsa_segment_header)));
1847 segment_map->pagemap = (dsa_pointer *)
1848 (segment_map->mapped_address +
1849 MAXALIGN(sizeof(dsa_segment_header)) +
1850 MAXALIGN(sizeof(FreePageManager)));
1851
1852 /* Remember the highest index this backend has ever mapped. */
1853 if (area->high_segment_index < index)
1854 area->high_segment_index = index;
1855
1856 Assert(segment_map->header->magic ==
1857 (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index));
1858 }
1859
1860 /*
1861 * Callers of dsa_get_address() and dsa_free() don't hold the area lock,
1862 * but it's a bug in the calling code and undefined behavior if the
1863 * address is not live (ie if the segment might possibly have been freed,
1864 * they're trying to use a dangling pointer).
1865 *
1866 * For dsa.c code that holds the area lock to manipulate segment_bins
1867 * lists, it would be a bug if we ever reach a freed segment here. After
1868 * it's marked as freed, the only thing any backend should do with it is
1869 * unmap it, and it should always have done that in
1870 * check_for_freed_segments_locked() before arriving here to resolve an
1871 * index to a segment_map.
1872 *
1873 * Either way we can assert that we aren't returning a freed segment.
1874 */
1875 Assert(!area->segment_maps[index].header->freed);
1876
1877 return &area->segment_maps[index];
1878}
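/*
 * Editorial sketch of the per-segment layout implied by the pointer
 * arithmetic above:
 *
 *   mapped_address
 *   +--------------------------------------+
 *   | dsa_segment_header                   |
 *   +--------------------------------------+ + MAXALIGN(header)
 *   | FreePageManager                      |
 *   +--------------------------------------+ + MAXALIGN(fpm)
 *   | pagemap[] (one dsa_pointer per page) |
 *   +--------------------------------------+ padded to FPM_PAGE_SIZE
 *   | usable 4KB pages ...                 |
 *   +--------------------------------------+
 */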
1879
1880/*
1881 * Return a superblock to the free page manager. If the underlying segment
1882 * has become entirely free, then return it to the operating system.
1883 *
1884 * The appropriate pool lock must be held.
1885 */
1886static void
1887destroy_superblock(dsa_area *area, dsa_pointer span_pointer)
1888{
1889 dsa_area_span *span = dsa_get_address(area, span_pointer);
1890 int size_class = span->size_class;
1891 dsa_segment_map *segment_map;
1892
1893
1894 /* Remove it from its fullness class list. */
1895 unlink_span(area, span);
1896
1897 /*
1898 * Note: Here we acquire the area lock while we already hold a per-pool
1899 * lock. We never hold the area lock and then take a pool lock, or we
1900 * could deadlock.
1901 */
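/*
 * Editorial sketch of the ordering rule stated above (pool lock before
 * area lock, never the reverse):
 *
 *   LWLockAcquire(DSA_SCLASS_LOCK(area, size_class), LW_EXCLUSIVE);
 *   LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);    -- OK
 *   ...
 *   LWLockRelease(DSA_AREA_LOCK(area));
 *   LWLockRelease(DSA_SCLASS_LOCK(area, size_class));
 *
 * Any path that took the area lock first and then a pool lock could
 * deadlock against this one.
 */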
1902 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
1903 check_for_freed_segments_locked(area);
1904 segment_map =
1905 get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start));
1906 FreePageManagerPut(segment_map->fpm,
1907 DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE,
1908 span->npages);
1909 /* Check if the segment is now entirely free. */
1910 if (fpm_largest(segment_map->fpm) == segment_map->header->usable_pages)
1911 {
1912 dsa_segment_index index = get_segment_index(area, segment_map);
1913
1914 /* If it's not the segment with extra control data, free it. */
1915 if (index != 0)
1916 {
1917 /*
1918 * Give it back to the OS, and allow other backends to detect that
1919 * they need to detach.
1920 */
1921 unlink_segment(area, segment_map);
1922 segment_map->header->freed = true;
1923 Assert(area->control->total_segment_size >=
1924 segment_map->header->size);
1925 area->control->total_segment_size -=
1926 segment_map->header->size;
1927 dsm_unpin_segment(dsm_segment_handle(segment_map->segment));
1928 dsm_detach(segment_map->segment);
1929 area->control->segment_handles[index] = DSM_HANDLE_INVALID;
1930 ++area->control->freed_segment_counter;
1931 segment_map->segment = NULL;
1932 segment_map->header = NULL;
1933 segment_map->mapped_address = NULL;
1934 }
1935 }
1936
1937 /* Move segment to appropriate bin if necessary. */
1938 if (segment_map->header != NULL)
1939 rebin_segment(area, segment_map);
1940
1941 LWLockRelease(DSA_AREA_LOCK(area));
1942
1943 /*
1944 * Span-of-spans blocks store the span which describes them within the
1945 * block itself, so freeing the storage implicitly frees the descriptor
1946 * also. If this is a block of any other type, we need to separately free
1947 * the span object also. This recursive call to dsa_free will acquire the
1948 * span pool's lock. We can't deadlock because the acquisition order is
1949 * always some other pool and then the span pool.
1950 */
1951 if (size_class != DSA_SCLASS_BLOCK_OF_SPANS)
1952 dsa_free(area, span_pointer);
1953}
1954
1955static void
1956unlink_span(dsa_area *area, dsa_area_span *span)
1957{
1958 if (DsaPointerIsValid(span->nextspan))
1959 {
1960 dsa_area_span *next = dsa_get_address(area, span->nextspan);
1961
1962 next->prevspan = span->prevspan;
1963 }
1964 if (DsaPointerIsValid(span->prevspan))
1965 {
1966 dsa_area_span *prev = dsa_get_address(area, span->prevspan);
1967
1968 prev->nextspan = span->nextspan;
1969 }
1970 else
1971 {
1972 dsa_area_pool *pool = dsa_get_address(area, span->pool);
1973
1974 pool->spans[span->fclass] = span->nextspan;
1975 }
1976}
1977
1978static void
1979add_span_to_fullness_class(dsa_area *area, dsa_area_span *span,
1980 dsa_pointer span_pointer,
1981 int fclass)
1982{
1983 dsa_area_pool *pool = dsa_get_address(area, span->pool);
1984
1985 if (DsaPointerIsValid(pool->spans[fclass]))
1986 {
1987 dsa_area_span *head = dsa_get_address(area,
1988 pool->spans[fclass]);
1989
1990 head->prevspan = span_pointer;
1991 }
1992 span->prevspan = InvalidDsaPointer;
1993 span->nextspan = pool->spans[fclass];
1994 pool->spans[fclass] = span_pointer;
1995 span->fclass = fclass;
1996}
1997
1998/*
1999 * Detach from an area that was either created or attached to by this process.
2000 */
2001void
2002dsa_detach(dsa_area *area)
2003{
2004 int i;
2005
2006 /* Detach from all segments. */
2007 for (i = 0; i <= area->high_segment_index; ++i)
2008 if (area->segment_maps[i].segment != NULL)
2009 dsm_detach(area->segment_maps[i].segment);
2010
2011 /*
2012 * Note that 'detaching' (= detaching from DSM segments) doesn't include
2013 * 'releasing' (= adjusting the reference count). It would be nice to
2014 * combine these operations, but client code might never get around to
2015 * calling dsa_detach because of an error path, and a detach hook on any
2016 * particular segment is too late to detach other segments in the area
2017 * without risking a 'leak' warning in the non-error path.
2018 */
2019
2020 /* Free the backend-local area object. */
2021 pfree(area);
2022}
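/*
 * Editorial usage sketch (not part of dsa.c), assuming a hypothetical
 * caller-supplied tranche id; dsa_create() and dsa_allocate() are the
 * dsa.h convenience macros:
 */
#if 0
{
	dsa_area   *area = dsa_create(my_tranche_id);	/* hypothetical id */
	dsa_pointer dp = dsa_allocate(area, 42);
	char	   *p = dsa_get_address(area, dp);

	memset(p, 0, 42);
	dsa_free(area, dp);
	dsa_detach(area);			/* unmaps segments; 'releasing' the area's
								 * reference count is a separate step */
}
#endif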
2023
2024/*
2025 * Unlink a segment from the bin that contains it.
2026 */
2027static void
2028unlink_segment(dsa_area *area, dsa_segment_map *segment_map)
2029{
2030 if (segment_map->header->prev != DSA_SEGMENT_INDEX_NONE)
2031 {
2032 dsa_segment_map *prev;
2033
2034 prev = get_segment_by_index(area, segment_map->header->prev);
2035 prev->header->next = segment_map->header->next;
2036 }
2037 else
2038 {
2039 Assert(area->control->segment_bins[segment_map->header->bin] ==
2040 get_segment_index(area, segment_map));
2041 area->control->segment_bins[segment_map->header->bin] =
2042 segment_map->header->next;
2043 }
2044 if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2045 {
2046 dsa_segment_map *next;
2047
2048 next = get_segment_by_index(area, segment_map->header->next);
2049 next->header->prev = segment_map->header->prev;
2050 }
2051}
2052
2053/*
2054 * Find a segment that could satisfy a request for 'npages' of contiguous
2055 * memory, or return NULL if none can be found. This may involve attaching to
2056 * segments that weren't previously attached so that we can query their free
2057 * pages map.
2058 */
2059static dsa_segment_map *
2060get_best_segment(dsa_area *area, size_t npages)
2061{
2062 size_t bin;
2063
2064 Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2065 check_for_freed_segments_locked(area);
2066
2067 /*
2068 * Start searching from the first bin that *might* have enough contiguous
2069 * pages.
2070 */
2071 for (bin = contiguous_pages_to_segment_bin(npages);
2072 bin < DSA_NUM_SEGMENT_BINS;
2073 ++bin)
2074 {
2075 /*
2076 * The minimum contiguous size that any segment in this bin should
2077 * have. We'll re-bin if we see segments with fewer.
2078 */
2079 size_t threshold = (size_t) 1 << (bin - 1);
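/*
 * Editorial sketch: the bins partition segments by largest free run.  A
 * segment whose largest contiguous run is in [2^(bin-1), 2^bin) pages
 * lives in 'bin'; e.g. bin 4 holds runs of 8..15 pages, so its threshold
 * here is (size_t) 1 << 3 = 8.  Bin 0 is reserved for segments with no
 * free pages at all.
 */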
2080 dsa_segment_index segment_index;
2081
2082 /* Search this bin for a segment with enough contiguous space. */
2083 segment_index = area->control->segment_bins[bin];
2084 while (segment_index != DSA_SEGMENT_INDEX_NONE)
2085 {
2086 dsa_segment_map *segment_map;
2087 dsa_segment_index next_segment_index;
2088 size_t contiguous_pages;
2089
2090 segment_map = get_segment_by_index(area, segment_index);
2091 next_segment_index = segment_map->header->next;
2092 contiguous_pages = fpm_largest(segment_map->fpm);
2093
2094 /* Not enough for the request, still enough for this bin. */
2095 if (contiguous_pages >= threshold && contiguous_pages < npages)
2096 {
2097 segment_index = next_segment_index;
2098 continue;
2099 }
2100
2101 /* Re-bin it if it's no longer in the appropriate bin. */
2102 if (contiguous_pages < threshold)
2103 {
2104 rebin_segment(area, segment_map);
2105
2106 /*
2107 * But fall through to see if it's enough to satisfy this
2108 * request anyway....
2109 */
2110 }
2111
2112 /* Check if we are done. */
2113 if (contiguous_pages >= npages)
2114 return segment_map;
2115
2116 /* Continue searching the same bin. */
2117 segment_index = next_segment_index;
2118 }
2119 }
2120
2121 /* Not found. */
2122 return NULL;
2123}
2124
2125/*
2126 * Create a new segment that can handle at least requested_pages. Returns
2127 * NULL if the requested total size limit or maximum allowed number of
2128 * segments would be exceeded.
2129 */
2130static dsa_segment_map *
2131make_new_segment(dsa_area *area, size_t requested_pages)
2132{
2133 dsa_segment_index new_index;
2134 size_t metadata_bytes;
2135 size_t total_size;
2136 size_t total_pages;
2137 size_t usable_pages;
2138 dsa_segment_map *segment_map;
2139 dsm_segment *segment;
2140 ResourceOwner oldowner;
2141
2142 Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2143
2144 /* Find a segment slot that is not in use (linearly for now). */
2145 for (new_index = 1; new_index < DSA_MAX_SEGMENTS; ++new_index)
2146 {
2147 if (area->control->segment_handles[new_index] == DSM_HANDLE_INVALID)
2148 break;
2149 }
2150 if (new_index == DSA_MAX_SEGMENTS)
2151 return NULL;
2152
2153 /*
2154 * If the total size limit is already exceeded, then we exit early and
2155 * avoid arithmetic wraparound in the unsigned expressions below.
2156 */
2157 if (area->control->total_segment_size >=
2158 area->control->max_total_segment_size)
2159 return NULL;
2160
2161 /*
2162 * The size should be at least as big as requested, and at least big
2163 * enough to follow a geometric series that approximately doubles the
2164 * total storage each time we create a new segment. We use geometric
2165 * growth because the underlying DSM system isn't designed for large
2166 * numbers of segments (otherwise we might even consider just using one
2167 * DSM segment for each large allocation and for each superblock, and then
2168 * we wouldn't need to use FreePageManager).
2169 *
2170 * We decide on a total segment size first, so that we produce tidy
2171 * power-of-two sized segments. This is a good property to have if we
2172 * move to huge pages in the future. Then we work back to the number of
2173 * pages we can fit.
2174 */
2175 total_size = area->control->init_segment_size *
2176 ((size_t) 1 << (new_index / DSA_NUM_SEGMENTS_AT_EACH_SIZE));
2177 total_size = Min(total_size, area->control->max_segment_size);
2178 total_size = Min(total_size,
2179 area->control->max_total_segment_size -
2180 area->control->total_segment_size);
2181
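/*
 * Editorial sketch: assuming DSA_NUM_SEGMENTS_AT_EACH_SIZE = 2 and a 1MB
 * init_segment_size, new_index 1 gives 1MB << 0 = 1MB, indexes 2-3 give
 * 1MB << 1 = 2MB, indexes 4-5 give 4MB, and so on, before the Min()
 * clamps above cap the result at max_segment_size and at the remaining
 * total-size budget.
 */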
2182 total_pages = total_size / FPM_PAGE_SIZE;
2183 metadata_bytes =
2184 MAXALIGN(sizeof(dsa_segment_header)) +
2185 MAXALIGN(sizeof(FreePageManager)) +
2186 sizeof(dsa_pointer) * total_pages;
2187
2188 /* Add padding up to next page boundary. */
2189 if (metadata_bytes % FPM_PAGE_SIZE != 0)
2190 metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2191 if (total_size <= metadata_bytes)
2192 return NULL;
2193 usable_pages = (total_size - metadata_bytes) / FPM_PAGE_SIZE;
2194 Assert(metadata_bytes + usable_pages * FPM_PAGE_SIZE <= total_size);
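/*
 * Editorial sketch: for a hypothetical 1MB segment, total_pages =
 * 1048576 / 4096 = 256, so the pagemap alone needs 256 *
 * sizeof(dsa_pointer) = 2048 bytes; adding the aligned header and
 * FreePageManager and rounding up to a 4096-byte boundary typically
 * costs a page or two of metadata, and the remainder becomes
 * usable_pages.
 */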
2195
2196 /* See if that is enough... */
2197 if (requested_pages > usable_pages)
2198 {
2199 /*
2200 * We'll make an odd-sized segment, working forward from the requested
2201 * number of pages.
2202 */
2203 usable_pages = requested_pages;
2204 metadata_bytes =
2205 MAXALIGN(sizeof(dsa_segment_header)) +
2206 MAXALIGN(sizeof(FreePageManager)) +
2207 usable_pages * sizeof(dsa_pointer);
2208
2209 /* Add padding up to next page boundary. */
2210 if (metadata_bytes % FPM_PAGE_SIZE != 0)
2211 metadata_bytes += FPM_PAGE_SIZE - (metadata_bytes % FPM_PAGE_SIZE);
2212 total_size = metadata_bytes + usable_pages * FPM_PAGE_SIZE;
2213
2214 /* Is that too large for dsa_pointer's addressing scheme? */
2215 if (total_size > DSA_MAX_SEGMENT_SIZE)
2216 return NULL;
2217
2218 /* Would that exceed the limit? */
2219 if (total_size > area->control->max_total_segment_size -
2220 area->control->total_segment_size)
2221 return NULL;
2222 }
2223
2224 /* Create the segment. */
2225 oldowner = CurrentResourceOwner;
2226 CurrentResourceOwner = area->resowner;
2227 segment = dsm_create(total_size, 0);
2228 CurrentResourceOwner = oldowner;
2229 if (segment == NULL)
2230 return NULL;
2231 dsm_pin_segment(segment);
2232
2233 /* Store the handle in shared memory to be found by index. */
2234 area->control->segment_handles[new_index] =
2235 dsm_segment_handle(segment);
2236 /* Track the highest segment index in the history of the area. */
2237 if (area->control->high_segment_index < new_index)
2238 area->control->high_segment_index = new_index;
2239 /* Track the highest segment index this backend has ever mapped. */
2240 if (area->high_segment_index < new_index)
2241 area->high_segment_index = new_index;
2242 /* Track total size of all segments. */
2243 area->control->total_segment_size += total_size;
2244 Assert(area->control->total_segment_size <=
2245 area->control->max_total_segment_size);
2246
2247 /* Build a segment map for this segment in this backend. */
2248 segment_map = &area->segment_maps[new_index];
2249 segment_map->segment = segment;
2250 segment_map->mapped_address = dsm_segment_address(segment);
2251 segment_map->header = (dsa_segment_header *) segment_map->mapped_address;
2252 segment_map->fpm = (FreePageManager *)
2253 (segment_map->mapped_address +
2254 MAXALIGN(sizeof(dsa_segment_header)));
2255 segment_map->pagemap = (dsa_pointer *)
2256 (segment_map->mapped_address +
2257 MAXALIGN(sizeof(dsa_segment_header)) +
2258 MAXALIGN(sizeof(FreePageManager)));
2259
2260 /* Set up the free page map. */
2261 FreePageManagerInitialize(segment_map->fpm, segment_map->mapped_address);
2262 FreePageManagerPut(segment_map->fpm, metadata_bytes / FPM_PAGE_SIZE,
2263 usable_pages);
2264
2265 /* Set up the segment header and put it in the appropriate bin. */
2266 segment_map->header->magic =
2267 DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ new_index;
2268 segment_map->header->usable_pages = usable_pages;
2269 segment_map->header->size = total_size;
2270 segment_map->header->bin = contiguous_pages_to_segment_bin(usable_pages);
2271 segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2272 segment_map->header->next =
2273 area->control->segment_bins[segment_map->header->bin];
2274 segment_map->header->freed = false;
2275 area->control->segment_bins[segment_map->header->bin] = new_index;
2276 if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2277 {
2278 dsa_segment_map *next =
2279 get_segment_by_index(area, segment_map->header->next);
2280
2281 Assert(next->header->bin == segment_map->header->bin);
2282 next->header->prev = new_index;
2283 }
2284
2285 return segment_map;
2286}
2287
2288/*
2289 * Check if any segments have been freed by destroy_superblock, so we can
2290 * detach from them in this backend. This function is called by
2291 * dsa_get_address and dsa_free to make sure that a dsa_pointer they have
2292 * received can be resolved to the correct segment.
2293 *
2294 * The danger we want to defend against is that there could be an old segment
2295 * mapped into a given slot in this backend, and the dsa_pointer they have
2296 * might refer to some new segment in the same slot. So those functions must
2297 * be sure to process all instructions to detach from a freed segment that had
2298 * been generated by the time this process received the dsa_pointer, before
2299 * they call get_segment_by_index.
2300 */
2301static void
2302check_for_freed_segments(dsa_area *area)
2303{
2304 size_t freed_segment_counter;
2305
2306 /*
2307 * Any other process that has freed a segment has incremented
2308 * freed_segment_counter while holding an LWLock, and that must precede
2309 * any backend creating a new segment in the same slot while holding an
2310 * LWLock, and that must precede the creation of any dsa_pointer pointing
2311 * into the new segment which might reach us here, and the caller must
2312 * have sent the dsa_pointer to this process using appropriate memory
2313 * synchronization (some kind of locking or atomic primitive or system
2314 * call). So all we need to do on the reading side is ask for the load of
2315 * freed_segment_counter to follow the caller's load of the dsa_pointer it
2316 * has, and we can be sure to detect any segments that had been freed as
2317 * of the time that the dsa_pointer reached this process.
2318 */
2319 pg_read_barrier();
2320 freed_segment_counter = area->control->freed_segment_counter;
2321 if (unlikely(area->freed_segment_counter != freed_segment_counter))
2322 {
2323 /* Check all currently mapped segments to find what's been freed. */
2324 LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE);
2325 check_for_freed_segments_locked(area);
2326 LWLockRelease(DSA_AREA_LOCK(area));
2327 }
2328}
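/*
 * Editorial sketch of the pairing this relies on: the freeing side in
 * destroy_superblock effectively publishes
 *
 *   segment_map->header->freed = true;
 *   ++area->control->freed_segment_counter;    -- under the area LWLock
 *
 * and the reader's pg_read_barrier() above keeps the counter load from
 * being reordered before the caller's load of the dsa_pointer, so any
 * segment freed before that pointer was created is detected here.
 */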
2329
2330/*
2331 * Workhorse for check_for_freed_segments(), and also used directly in paths
2332 * where the area lock is already held. This should be called after acquiring
2333 * the lock but before looking up any segment by index number, to make sure we
2334 * unmap any stale segments that might have previously had the same index as a
2335 * current segment.
2336 */
2337static void
2338check_for_freed_segments_locked(dsa_area *area)
2339{
2340 size_t freed_segment_counter;
2341 int i;
2342
2343 Assert(LWLockHeldByMe(DSA_AREA_LOCK(area)));
2344 freed_segment_counter = area->control->freed_segment_counter;
2345 if (unlikely(area->freed_segment_counter != freed_segment_counter))
2346 {
2347 for (i = 0; i <= area->high_segment_index; ++i)
2348 {
2349 if (area->segment_maps[i].header != NULL &&
2350 area->segment_maps[i].header->freed)
2351 {
2352 dsm_detach(area->segment_maps[i].segment);
2353 area->segment_maps[i].segment = NULL;
2354 area->segment_maps[i].header = NULL;
2355 area->segment_maps[i].mapped_address = NULL;
2356 }
2357 }
2358 area->freed_segment_counter = freed_segment_counter;
2359 }
2360}
2361
2362/*
2363 * Re-bin segment if it's no longer in the appropriate bin.
2364 */
2365static void
2366rebin_segment(dsa_area *area, dsa_segment_map *segment_map)
2367{
2368 size_t new_bin;
2369 dsa_segment_index segment_index;
2370
2371 new_bin = contiguous_pages_to_segment_bin(fpm_largest(segment_map->fpm));
2372 if (segment_map->header->bin == new_bin)
2373 return;
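/*
 * Editorial sketch: a segment in bin 6 (largest free run 32..63 pages)
 * whose frees coalesce until fpm_largest() reports 100 contiguous pages
 * maps to contiguous_pages_to_segment_bin(100) = 7, so it is unlinked
 * from bin 6 and pushed onto the front of bin 7 below.
 */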
2374
2375 /* Remove it from its current bin. */
2376 unlink_segment(area, segment_map);
2377
2378 /* Push it onto the front of its new bin. */
2379 segment_index = get_segment_index(area, segment_map);
2380 segment_map->header->prev = DSA_SEGMENT_INDEX_NONE;
2381 segment_map->header->next = area->control->segment_bins[new_bin];
2382 segment_map->header->bin = new_bin;
2383 area->control->segment_bins[new_bin] = segment_index;
2384 if (segment_map->header->next != DSA_SEGMENT_INDEX_NONE)
2385 {
2386 dsa_segment_map *next;
2387
2388 next = get_segment_by_index(area, segment_map->header->next);
2389 Assert(next->header->bin == new_bin);
2390 next->header->prev = segment_index;
2391 }
2392}